Fast AVR integer square root assembly routines By Ruud v Gessel, dobbelbeker@chello.nl

 

 

Below you will see 4 AVR assembler routines

 

1)  Sqrt16: 16 bits square root. Returns floor integer. 19 code words, 90 – 96 cycles.

2)  Sqrt32: 32 bits square root. Returns floor integer. 32 code words, 264 – 306 cycles.

3)  Sqrt16: 16 bits square root. Returns rounded integer. 26 code words, 95 – 102 cycles.

4)  Sqrt32: 32 bits square root. Returns rounded integer. 43 code words, 271 - 316 cycles.

 

The routines preserve all registers except the passed argument (and of course the result registers). The argument may consist of any registers (R0-R31) I used R5:R4:R3:R2 for 32 bits, R3:R2 for 16 bits, the result must consist of high registers (R16-R31), I used R17:R16.

 

The rounding routines can be shortened by adding an extra register to the “rotation mask” and “developing sqrt” variables, you can than simply decide how many times to iterate (9 times for 16 bits and 17 times for 32 bits), then almost no extra code is needed to determine the last bit of the result. However after some tries it turned out that though the code was a few % shorter, the number of  execution cycles increased dramatically (the number of instructions inside the loop increases with 5 and the number of iterations increases with 2). For example, a rough draft of the 32 bits rounded sqrt routine came out 7 code words shorter (without preserving R0, R1 else it would only be 3 code word shorter) but took on average 85 cycles more to execute (see routine at bottom of doc). So I have abandoned this method (it will be useful for floating point sqrt though).

 

Have Fun

Ruud

 

;  Fast and short 16 bits AVR sqrt routine

;

;  R17:R16 = sqrt(R3:R2), rounded down to integer

;

;  Registers:

;  Destroys the argument in R3:R2

;

;  Cycles incl call & ret = 90 - 96

;

;  Stack incl call = 2

 

Sqrt16:     ldi   R17,0xc0          ; Rotation mask register

            ldi   R16,0x40          ; Developing sqrt

            clc                     ; Enter loop with C=0

 

_sq16_1:    brcs  _sq16_2           ; C --> Bit is always 1

            cp    R3,R16            ; Does value fit?

            brcs  _sq16_3           ; C --> bit is 0

_sq16_2:    sub   R3,R16            ; Adjust argument for next bit

            or    R16,R17           ; Set bit to 1

_sq16_3:    lsr   R17               ; Shift right rotation mask

            lsl   R2

            rol   R3                ; Shift left argument, C --> Next sub is MUST

            eor   R16,R17           ; Shift right test bit in developing sqrt

            andi  R17,0xfe          ; Becomes 0 for last bit

            brne  _sq16_1           ; Develop 7 bits

           

            brcs  _sq16_4           ; C--> Last bit always 1

            lsl   R2                ; Need bit 7 in C for cpc

            cpc   R16,R3            ; After this C is last bit

_sq16_4:    adc   R16,R17           ; Set last bit if C (R17=0)

            ret

 

 

;  Fast and short 32 bits AVR sqrt routine

;

;  R17:R16=SQRT(R5:R4:R3:R2) rounded down to integer

;

;  Registers:

;  Destroys the argument in R5:R4:R3:R2

;

;  Cycles incl call & ret = 263 - 305

;

;  Stack incl call = 4

 

Sqrt32:     push  R18

            push  R19

            ldi   R19,0xc0

            clr   R18               ; rotation mask in R19:R18

            ldi   R17,0x40

            sub   R16,R16           ; developing sqrt in R17:R16, C=0

 

_sq32_1:    brcs  _sq32_2           ; C --> Bit is always 1

            cp    R4,R16

            cpc   R5,R17            ; Does test value fit?

            brcs  _sq32_3           ; C --> nope, bit is 0

_sq32_2:    sub   R4,R16

            sbc   R5,R17            ; Adjust argument for next bit

            or    R16,R18

            or    R17,R19           ; Set bit to 1

_sq32_3:    lsr   R19

            ror   R18               ; Shift right mask, C --> end loop

            eor   R17,R19

            eor   R16,R18           ; Shift right only test bit in result

            rol   R2                ; Bit 0 only set if end of loop

            rol   R3

            rol   R4

            rol   R5                ; Shift left remaining argument (C used at _sq32_1)

            sbrs  R2,0              ; Skip if 15 bits developed

            rjmp  _sq32_1           ; Develop 15 bits of the sqrt

 

 

            brcs  _sq32_4           ; C--> Last bits always 1

            lsl   R3                ; Need bit 7 in C for cpc

            cpc   R16,R4

            cpc   R17,R5            ; After this C is last bit

_sq32_4:    adc   R16,R19           ; Round up if C (R19=0)

 

            pop   R19

            pop   R18

            ret

 

 

;  Fast and short 16 bits AVR sqrt routine

;

;  R17:R16 = sqrt(R3:R2), rounded to nearest integer (0.5 rounds up)

;

;  Registers:

;  Destroys the argument in R3:R2

;

;  Cycles incl call & ret = 95 - 102

;

;  Stack incl call = 2

 

Sqrt16:     ldi   R17,0xc0          ; Rotation mask

            ldi   R16,0x40          ; Developing sqrt

            clc                     ; Enter loop with C=0

 

_sq16_1:    brcs  _sq16_2           ; C --> Bit is always 1

            cp    R3,R16            ; Does value fit?

            brcs  _sq16_3           ; C --> bit is 0

_sq16_2:    sub   R3,R16            ; Adjust argument for next bit

            or    R16,R17           ; Set bit to 1

_sq16_3:    lsr   R17               ; Shift right rotation mask

            lsl   R2

            rol   R3                ; Shift left argument, C --> Next sub is MUST

            eor   R16,R17           ; Shift right test bit in developing sqrt

            andi  R17,0xfe          ; Becomes 0 for last bit

            brne  _sq16_1           ; Develop 7 bits

           

            brcs  _sq16_4           ; C--> Last bits always 1

            cp    R16,R3            ; Does bit need update?

            brcc  _sq16_5           ; NC --> nope, bit still 0

_sq16_4:    sbc   R2,R17            ; Subtract C (any value from 1 to 0x7f will do)

            sbc   R3,R16            ; Update remainder

            inc   R16               ; Last bit = 1

_sq16_5:    lsl   R2                ; Only bit 7 matters

            rol   R3                ; *2 + C, new test value

            brcs  _sq16_6           ; C --> Always round up

            cp    R16,R3            ; C decides rounding

_sq16_6:    adc   R16,R17           ; Round up if C (R17=0)

            ret

 

 

;  Fast and short 32 bits AVR sqrt routine

;

;  R17:R16=SQRT(R5:R4:R3:R2) rounded to the nearest integer (0.5 rounds up)

;

;  Registers:

;  Destroys the argument in R5:R4:R3:R2

;

;  Cycles incl call & ret = 271 - 316

;

;  Stack incl call = 4

 

Sqrt32:     push  R18

            push  R19

            ldi   R19,0xc0

            clr   R18               ; rotation mask in R19:R18

            ldi   R17,0x40

            sub   R16,R16           ; developing sqrt in R17:R16, C=0

 

_sq32_1:    brcs  _sq32_2           ; C --> Bit is always 1

            cp    R4,R16

            cpc   R5,R17            ; Does test value fit?

            brcs  _sq32_3           ; C --> nope, bit is 0

_sq32_2:    sub   R4,R16

            sbc   R5,R17            ; Adjust argument for next bit

            or    R16,R18

            or    R17,R19           ; Set bit to 1

_sq32_3:    lsr   R19

            ror   R18               ; Shift right mask, C --> end loop

            eor   R17,R19

            eor   R16,R18           ; Shift right only test bit in result

            rol   R2                ; Bit 0 only set if end of loop

            rol   R3

            rol   R4

            rol   R5                ; Shift left remaining argument (C used at _sq32_1)

            sbrs  R2,0              ; Skip if 15 bits developed

            rjmp  _sq32_1           ; Develop 15 bits of the sqrt

 

            brcs  _sq32_4           ; C--> Last bits always 1

            cp    R16,R4

            cpc   R17,R5            ; Test for last bit 1

            brcc  _sq32_5           ; NC --> bit is 0

 

_sq32_4:    sbc   R3,R19            ; Subtract C (any value from 1 to 0x7f will do)

            sbc   R4,R16

            sbc   R5,R17            ; Update argument for test

            inc   R16               ; Last bit is 1

 

_sq32_5:    lsl   R3                ; Only bit 7 matters

            rol   R4

            rol   R5                ; Remainder * 2 + C

            brcs  _sq32_6           ; C --> Always round up

            cp    R16,R4

            cpc   R17,R5            ; C decides rounding

_sq32_6:    adc   R16,R19

            adc   R17,R19           ; Round up if C (R19=0)

 

            pop   R19

            pop   R18

            ret

 

;  Illustration of 32 bits sqrt using extra registers

;

;  R17:R16=SQRT(R5:R4:R3:R2) rounded to the nearest integer (0.5 rounds up)

;

;  Registers:

;  Destroys the argument in R5:R4:R3:R2 and R0, R1

;

;  Cycles incl call & ret = 336 -421

;

;  Stack incl call = 4

 

Sqrt32:     push  R18

            push  R19

            ldi   R19,0xc0

            clr   R18               ; rotation mask in R19:R18:R0

            ldi   R17,0x40

            clr   R16               ; developing sqrt in R17:R16:R1

            mul   R16,R16           ; R0=R1=0, C=0

 

_sq32_1:    brcs  _sq32_2           ; C --> Bit is always 1

            cp    R3,R1             ; Extra instruction

            cpc   R4,R16

            cpc   R5,R17            ; Does test value fit?

            brcs  _sq32_3           ; C --> nope, bit is 0

_sq32_2:    sub   R3,R1             ; Extra instruction

            sbc   R4,R16

            sbc   R5,R17            ; Adjust argument for next bit

            or    R1,R0             ; Extra instruction

            or    R16,R18

            or    R17,R19           ; Set bit to 1

_sq32_3:    lsr   R19

            ror   R18               ; Shift right mask

            ror   R0                ; Extra instruction

            eor   R1,R0             ; Extra instruction

            eor   R17,R19

            eor   R16,R18           ; Shift right only test bit in result

            lsl   R2

            rol   R3

            rol   R4

            rol   R5                ; Shift left remaining argument (C used at _sq32_1)

            sbrs  R0,5              ; Skip if 17 bits developed

            rjmp  _sq32_1           ; Develop 17 bits of the sqrt

 

            lsl   R1                ; C if round up

            adc   R16,R19

            adc   R17,R19           ; Round up if C (R19=0)

 

            pop   R19

            pop   R18

            ret