Fast AVR integer square root assembly routines By Ruud v Gessel, dobbelbeker@chello.nl

Below you will see 4 AVR assembler routines

1)  Sqrt16: 16 bits square root. Returns floor integer. 19 code words, 90 – 96 cycles.

2)  Sqrt32: 32 bits square root. Returns floor integer. 32 code words, 264 – 306 cycles.

3)  Sqrt16: 16 bits square root. Returns rounded integer. 26 code words, 95 – 102 cycles.

4)  Sqrt32: 32 bits square root. Returns rounded integer. 43 code words, 271 - 316 cycles.

The routines preserve all registers except the passed argument (and of course the result registers). The argument may consist of any registers (R0-R31) I used R5:R4:R3:R2 for 32 bits, R3:R2 for 16 bits, the result must consist of high registers (R16-R31), I used R17:R16.

The rounding routines can be shortened by adding an extra register to the “rotation mask” and “developing sqrt” variables, you can than simply decide how many times to iterate (9 times for 16 bits and 17 times for 32 bits), then almost no extra code is needed to determine the last bit of the result. However after some tries it turned out that though the code was a few % shorter, the number of  execution cycles increased dramatically (the number of instructions inside the loop increases with 5 and the number of iterations increases with 2). For example, a rough draft of the 32 bits rounded sqrt routine came out 7 code words shorter (without preserving R0, R1 else it would only be 3 code word shorter) but took on average 85 cycles more to execute (see routine at bottom of doc). So I have abandoned this method (it will be useful for floating point sqrt though).

Have Fun

Ruud

;  Fast and short 16 bits AVR sqrt routine

;

;  R17:R16 = sqrt(R3:R2), rounded down to integer

;

;  Registers:

;  Destroys the argument in R3:R2

;

;  Cycles incl call & ret = 90 - 96

;

;  Stack incl call = 2

Sqrt16:     ldi   R17,0xc0          ; Rotation mask register

ldi   R16,0x40          ; Developing sqrt

clc                     ; Enter loop with C=0

_sq16_1:    brcs  _sq16_2           ; C --> Bit is always 1

cp    R3,R16            ; Does value fit?

brcs  _sq16_3           ; C --> bit is 0

_sq16_2:    sub   R3,R16            ; Adjust argument for next bit

or    R16,R17           ; Set bit to 1

_sq16_3:    lsr   R17               ; Shift right rotation mask

lsl   R2

rol   R3                ; Shift left argument, C --> Next sub is MUST

eor   R16,R17           ; Shift right test bit in developing sqrt

andi  R17,0xfe          ; Becomes 0 for last bit

brne  _sq16_1           ; Develop 7 bits

brcs  _sq16_4           ; C--> Last bit always 1

lsl   R2                ; Need bit 7 in C for cpc

cpc   R16,R3            ; After this C is last bit

_sq16_4:    adc   R16,R17           ; Set last bit if C (R17=0)

ret

;  Fast and short 32 bits AVR sqrt routine

;

;  R17:R16=SQRT(R5:R4:R3:R2) rounded down to integer

;

;  Registers:

;  Destroys the argument in R5:R4:R3:R2

;

;  Cycles incl call & ret = 263 - 305

;

;  Stack incl call = 4

Sqrt32:     push  R18

push  R19

ldi   R19,0xc0

clr   R18               ; rotation mask in R19:R18

ldi   R17,0x40

sub   R16,R16           ; developing sqrt in R17:R16, C=0

_sq32_1:    brcs  _sq32_2           ; C --> Bit is always 1

cp    R4,R16

cpc   R5,R17            ; Does test value fit?

brcs  _sq32_3           ; C --> nope, bit is 0

_sq32_2:    sub   R4,R16

sbc   R5,R17            ; Adjust argument for next bit

or    R16,R18

or    R17,R19           ; Set bit to 1

_sq32_3:    lsr   R19

ror   R18               ; Shift right mask, C --> end loop

eor   R17,R19

eor   R16,R18           ; Shift right only test bit in result

rol   R2                ; Bit 0 only set if end of loop

rol   R3

rol   R4

rol   R5                ; Shift left remaining argument (C used at _sq32_1)

sbrs  R2,0              ; Skip if 15 bits developed

rjmp  _sq32_1           ; Develop 15 bits of the sqrt

brcs  _sq32_4           ; C--> Last bits always 1

lsl   R3                ; Need bit 7 in C for cpc

cpc   R16,R4

cpc   R17,R5            ; After this C is last bit

_sq32_4:    adc   R16,R19           ; Round up if C (R19=0)

pop   R19

pop   R18

ret

;  Fast and short 16 bits AVR sqrt routine

;

;  R17:R16 = sqrt(R3:R2), rounded to nearest integer (0.5 rounds up)

;

;  Registers:

;  Destroys the argument in R3:R2

;

;  Cycles incl call & ret = 95 - 102

;

;  Stack incl call = 2

Sqrt16:     ldi   R17,0xc0          ; Rotation mask

ldi   R16,0x40          ; Developing sqrt

clc                     ; Enter loop with C=0

_sq16_1:    brcs  _sq16_2           ; C --> Bit is always 1

cp    R3,R16            ; Does value fit?

brcs  _sq16_3           ; C --> bit is 0

_sq16_2:    sub   R3,R16            ; Adjust argument for next bit

or    R16,R17           ; Set bit to 1

_sq16_3:    lsr   R17               ; Shift right rotation mask

lsl   R2

rol   R3                ; Shift left argument, C --> Next sub is MUST

eor   R16,R17           ; Shift right test bit in developing sqrt

andi  R17,0xfe          ; Becomes 0 for last bit

brne  _sq16_1           ; Develop 7 bits

brcs  _sq16_4           ; C--> Last bits always 1

cp    R16,R3            ; Does bit need update?

brcc  _sq16_5           ; NC --> nope, bit still 0

_sq16_4:    sbc   R2,R17            ; Subtract C (any value from 1 to 0x7f will do)

sbc   R3,R16            ; Update remainder

inc   R16               ; Last bit = 1

_sq16_5:    lsl   R2                ; Only bit 7 matters

rol   R3                ; *2 + C, new test value

brcs  _sq16_6           ; C --> Always round up

cp    R16,R3            ; C decides rounding

_sq16_6:    adc   R16,R17           ; Round up if C (R17=0)

ret

;  Fast and short 32 bits AVR sqrt routine

;

;  R17:R16=SQRT(R5:R4:R3:R2) rounded to the nearest integer (0.5 rounds up)

;

;  Registers:

;  Destroys the argument in R5:R4:R3:R2

;

;  Cycles incl call & ret = 271 - 316

;

;  Stack incl call = 4

Sqrt32:     push  R18

push  R19

ldi   R19,0xc0

clr   R18               ; rotation mask in R19:R18

ldi   R17,0x40

sub   R16,R16           ; developing sqrt in R17:R16, C=0

_sq32_1:    brcs  _sq32_2           ; C --> Bit is always 1

cp    R4,R16

cpc   R5,R17            ; Does test value fit?

brcs  _sq32_3           ; C --> nope, bit is 0

_sq32_2:    sub   R4,R16

sbc   R5,R17            ; Adjust argument for next bit

or    R16,R18

or    R17,R19           ; Set bit to 1

_sq32_3:    lsr   R19

ror   R18               ; Shift right mask, C --> end loop

eor   R17,R19

eor   R16,R18           ; Shift right only test bit in result

rol   R2                ; Bit 0 only set if end of loop

rol   R3

rol   R4

rol   R5                ; Shift left remaining argument (C used at _sq32_1)

sbrs  R2,0              ; Skip if 15 bits developed

rjmp  _sq32_1           ; Develop 15 bits of the sqrt

brcs  _sq32_4           ; C--> Last bits always 1

cp    R16,R4

cpc   R17,R5            ; Test for last bit 1

brcc  _sq32_5           ; NC --> bit is 0

_sq32_4:    sbc   R3,R19            ; Subtract C (any value from 1 to 0x7f will do)

sbc   R4,R16

sbc   R5,R17            ; Update argument for test

inc   R16               ; Last bit is 1

_sq32_5:    lsl   R3                ; Only bit 7 matters

rol   R4

rol   R5                ; Remainder * 2 + C

brcs  _sq32_6           ; C --> Always round up

cp    R16,R4

cpc   R17,R5            ; C decides rounding

adc   R17,R19           ; Round up if C (R19=0)

pop   R19

pop   R18

ret

;  Illustration of 32 bits sqrt using extra registers

;

;  R17:R16=SQRT(R5:R4:R3:R2) rounded to the nearest integer (0.5 rounds up)

;

;  Registers:

;  Destroys the argument in R5:R4:R3:R2 and R0, R1

;

;  Cycles incl call & ret = 336 -421

;

;  Stack incl call = 4

Sqrt32:     push  R18

push  R19

ldi   R19,0xc0

clr   R18               ; rotation mask in R19:R18:R0

ldi   R17,0x40

clr   R16               ; developing sqrt in R17:R16:R1

mul   R16,R16           ; R0=R1=0, C=0

_sq32_1:    brcs  _sq32_2           ; C --> Bit is always 1

cp    R3,R1             ; Extra instruction

cpc   R4,R16

cpc   R5,R17            ; Does test value fit?

brcs  _sq32_3           ; C --> nope, bit is 0

_sq32_2:    sub   R3,R1             ; Extra instruction

sbc   R4,R16

sbc   R5,R17            ; Adjust argument for next bit

or    R1,R0             ; Extra instruction

or    R16,R18

or    R17,R19           ; Set bit to 1

_sq32_3:    lsr   R19

ror   R18               ; Shift right mask

ror   R0                ; Extra instruction

eor   R1,R0             ; Extra instruction

eor   R17,R19

eor   R16,R18           ; Shift right only test bit in result

lsl   R2

rol   R3

rol   R4

rol   R5                ; Shift left remaining argument (C used at _sq32_1)

sbrs  R0,5              ; Skip if 17 bits developed

rjmp  _sq32_1           ; Develop 17 bits of the sqrt

lsl   R1                ; C if round up

adc   R17,R19           ; Round up if C (R19=0)

pop   R19

pop   R18

ret