Fast AVR integer square root assembly routines By Ruud v Gessel, dobbelbeker@chello.nl
Below you will see 4 AVR assembler routines
1) Sqrt16: 16 bits square
root. Returns floor integer. 19 code words, 90 – 96 cycles.
2) Sqrt32: 32 bits
square root. Returns floor integer. 32 code words, 264 – 306 cycles.
3) Sqrt16: 16 bits
square root. Returns rounded integer. 26 code words, 95 – 102 cycles.
4) Sqrt32: 32 bits
square root. Returns rounded integer. 43 code words, 271 - 316 cycles.
The routines preserve all registers except the passed argument (and of course the result registers). The
argument may consist of any registers (R0-R31) I used R5:R4:R3:R2 for 32 bits,
R3:R2 for 16 bits, the result must consist of high registers (R16-R31), I used
R17:R16.
The rounding routines can be shortened by adding an
extra register to the “rotation mask” and “developing sqrt”
variables, you can than simply decide how many times to iterate (9 times for 16
bits and 17 times for 32 bits), then almost no extra code is needed to
determine the last bit of the result. However after some tries it turned out
that though the code was a few % shorter, the number of execution cycles increased dramatically (the
number of instructions inside the loop increases with 5 and the number of
iterations increases with 2). For example, a rough draft of the 32 bits rounded
sqrt routine came out 7 code words shorter (without
preserving R0, R1 else it would only be 3 code word shorter) but took on
average 85 cycles more to execute (see routine at bottom of
doc). So I have abandoned this method (it will be useful for floating point
sqrt though).
Have Fun
Ruud
; Fast and short 16 bits AVR sqrt routine
;
; R17:R16 = sqrt(R3:R2),
rounded down to integer
;
; Registers:
; Destroys the argument in R3:R2
;
; Cycles incl call
& ret = 90 - 96
;
; Stack incl call = 2
Sqrt16: ldi
R17,0xc0 ; Rotation mask
register
ldi
R16,0x40 ; Developing sqrt
clc
; Enter loop with C=0
_sq16_1: brcs
_sq16_2 ; C --> Bit
is always 1
cp
R3,R16 ; Does
value fit?
brcs
_sq16_3 ; C -->
bit is 0
_sq16_2: sub
R3,R16 ;
Adjust argument for next bit
or
R16,R17 ; Set bit
to 1
_sq16_3: lsr
R17
; Shift right rotation mask
lsl R2
rol
R3
; Shift left argument, C --> Next sub is MUST
eor
R16,R17 ; Shift
right test bit in developing sqrt
andi R17,0xfe
; Becomes 0 for last bit
brne
_sq16_1 ; Develop 7
bits
brcs
_sq16_4 ; C-->
Last bit always 1
lsl
R2
; Need bit 7 in C for cpc
cpc
R16,R3 ;
After this C is last bit
_sq16_4: adc
R16,R17 ; Set last
bit if C (R17=0)
ret
; Fast and short 32 bits AVR sqrt routine
;
; R17:R16=SQRT(R5:R4:R3:R2) rounded down to
integer
;
; Registers:
; Destroys the argument in R5:R4:R3:R2
;
; Cycles incl call
& ret = 263 - 305
;
; Stack incl call = 4
Sqrt32: push R18
push R19
ldi R19,0xc0
clr
R18
; rotation mask in R19:R18
ldi R17,0x40
sub
R16,R16 ;
developing sqrt in R17:R16, C=0
_sq32_1: brcs
_sq32_2 ; C -->
Bit is always 1
cp R4,R16
cpc
R5,R17 ; Does
test value fit?
brcs
_sq32_3 ; C -->
nope, bit is 0
_sq32_2: sub R4,R16
sbc
R5,R17 ;
Adjust argument for next bit
or R16,R18
or
R17,R19 ; Set bit
to 1
_sq32_3: lsr
R19
ror
R18
; Shift right mask, C --> end loop
eor R17,R19
eor
R16,R18 ; Shift
right only test bit in result
rol
R2
; Bit 0 only set if end of loop
rol R3
rol R4
rol
R5
; Shift left remaining argument (C used at _sq32_1)
sbrs
R2,0
; Skip if 15 bits developed
rjmp
_sq32_1 ; Develop
15 bits of the sqrt
brcs
_sq32_4 ; C-->
Last bits always 1
lsl
R3
; Need bit 7 in C for cpc
cpc R16,R4
cpc
R17,R5 ;
After this C is last bit
_sq32_4: adc
R16,R19 ; Round up
if C (R19=0)
pop R19
pop R18
ret
; Fast and short 16 bits AVR sqrt routine
;
; R17:R16 = sqrt(R3:R2),
rounded to nearest integer (0.5 rounds up)
;
; Registers:
; Destroys the argument in R3:R2
;
; Cycles incl call
& ret = 95 - 102
;
; Stack incl call = 2
Sqrt16: ldi
R17,0xc0 ; Rotation mask
ldi
R16,0x40 ; Developing sqrt
clc
; Enter loop with C=0
_sq16_1: brcs
_sq16_2 ; C -->
Bit is always 1
cp R3,R16
; Does value fit?
brcs
_sq16_3 ; C -->
bit is 0
_sq16_2: sub
R3,R16 ;
Adjust argument for next bit
or
R16,R17 ; Set bit
to 1
_sq16_3: lsr
R17
; Shift right rotation mask
lsl R2
rol
R3
; Shift left argument, C --> Next sub is MUST
eor
R16,R17 ; Shift
right test bit in developing sqrt
andi
R17,0xfe ; Becomes 0 for
last bit
brne
_sq16_1 ; Develop 7
bits
brcs
_sq16_4 ; C-->
Last bits always 1
cp
R16,R3 ; Does
bit need update?
brcc
_sq16_5 ; NC -->
nope, bit still 0
_sq16_4: sbc
R2,R17 ;
Subtract C (any value from 1 to 0x7f will do)
sbc
R3,R16 ;
Update remainder
inc
R16
; Last bit = 1
_sq16_5: lsl
R2
; Only bit 7 matters
rol
R3
; *2 + C, new test value
brcs
_sq16_6 ; C -->
Always round up
cp
R16,R3 ; C
decides rounding
_sq16_6: adc
R16,R17 ; Round up
if C (R17=0)
ret
; Fast and short 32 bits AVR sqrt routine
;
; R17:R16=SQRT(R5:R4:R3:R2) rounded to the
nearest integer (0.5 rounds up)
;
; Registers:
; Destroys the argument in R5:R4:R3:R2
;
; Cycles incl call
& ret = 271 - 316
;
; Stack incl call = 4
Sqrt32: push R18
push R19
ldi R19,0xc0
clr
R18
; rotation mask in R19:R18
ldi R17,0x40
sub
R16,R16 ;
developing sqrt in R17:R16, C=0
_sq32_1: brcs
_sq32_2 ; C -->
Bit is always 1
cp R4,R16
cpc
R5,R17 ; Does
test value fit?
brcs
_sq32_3 ; C -->
nope, bit is 0
_sq32_2: sub R4,R16
sbc
R5,R17 ;
Adjust argument for next bit
or R16,R18
or
R17,R19 ; Set bit
to 1
_sq32_3: lsr
R19
ror
R18
; Shift right mask, C --> end loop
eor R17,R19
eor
R16,R18 ; Shift
right only test bit in result
rol
R2
; Bit 0 only set if end of loop
rol R3
rol R4
rol
R5
; Shift left remaining argument (C used at _sq32_1)
sbrs
R2,0
; Skip if 15 bits developed
rjmp
_sq32_1 ; Develop
15 bits of the sqrt
brcs
_sq32_4 ; C-->
Last bits always 1
cp R16,R4
cpc
R17,R5 ; Test
for last bit 1
brcc
_sq32_5 ; NC -->
bit is 0
_sq32_4: sbc
R3,R19 ;
Subtract C (any value from 1 to 0x7f will do)
sbc R4,R16
sbc
R5,R17 ;
Update argument for test
inc
R16
; Last bit is 1
_sq32_5: lsl
R3
; Only bit 7 matters
rol R4
rol R5 ; Remainder * 2 + C
brcs
_sq32_6 ; C -->
Always round up
cp R16,R4
cpc R17,R5
; C decides rounding
_sq32_6: adc
R16,R19
adc
R17,R19 ; Round up
if C (R19=0)
pop R19
pop R18
ret
; Illustration of 32 bits sqrt using extra registers
;
; R17:R16=SQRT(R5:R4:R3:R2) rounded to the
nearest integer (0.5 rounds up)
;
; Registers:
; Destroys the argument in R5:R4:R3:R2 and R0,
R1
;
; Cycles incl call
& ret = 336 -421
;
; Stack incl call = 4
Sqrt32: push R18
push R19
ldi R19,0xc0
clr
R18
; rotation mask in R19:R18:R0
ldi R17,0x40
clr R16
;
developing sqrt in R17:R16:R1
mul R16,R16 ; R0=R1=0, C=0
_sq32_1: brcs
_sq32_2 ; C -->
Bit is always 1
cp R3,R1 ; Extra instruction
cpc R4,R16
cpc
R5,R17 ; Does
test value fit?
brcs
_sq32_3 ; C -->
nope, bit is 0
_sq32_2: sub R3,R1
; Extra instruction
sbc R4,R16
sbc
R5,R17 ;
Adjust argument for next bit
or R1,R0 ; Extra instruction
or R16,R18
or
R17,R19 ; Set bit
to 1
_sq32_3: lsr
R19
ror
R18
; Shift right mask
ror R0 ; Extra instruction
eor R1,R0 ; Extra instruction
eor R17,R19
eor
R16,R18 ; Shift
right only test bit in result
lsl R2
rol R3
rol R4
rol
R5
; Shift left remaining argument (C used at _sq32_1)
sbrs
R0,5
; Skip if 17 bits developed
rjmp
_sq32_1 ; Develop
17 bits of the sqrt
lsl R1 ; C if round up
adc R16,R19
adc
R17,R19 ; Round up
if C (R19=0)
pop R19
pop R18
ret