Skip to main content
2 of 3
move possible oncoming changes to a comment
greybeard
  • 7.7k
  • 3
  • 21
  • 56

Have mercy on the maintenance programmer - It may be your older self.

Separate documentation tends to get
- separated, as in not always readily accessible
(Murphy: when direly needed)
- out of sync - well, that's a problem even with in-line comments
→ document, in the code (for every part created for a separate reason)
- what it is good for
- where it got inspired (there may be easily accessible explanations enlightening to someone unfamiliar with the problem at hand or the approach used: even name dropping may help find such)
- what has been the incentive to write it

Adopting good practices running counter to adverse customs isn't easy and fast.
Much material about assembly programming is pre-1980ies, when there was some reason to have short mnemonics for instructions and operands. (No matter pointing pen or finger at a (printed…if you were lucky) program listing: no pop-up. So better keep things all in one line…)
Please use telling names. Coding in assembly is no licence not to.
In a division implementation, I'd not imagine problems with R for remainder or Q for quotient. Resist any impulse to outsmart everyone with the likes of DVsor. N for numerator wouldn't be bad if talking about fractions, but if N2 and N1 in addition - all three in H and L flavours - weren't bad enough, along comes ;NOTE: N2 <=255 so NxH = 0, also P < 2^16 so we can discard upper byte of DH * NxL. P is mentioned in the ALGORITHM OVERVIEW.

In one comment, you switched from
sum = sum + term2
sum = sum + term2 + term3
to
sum = sum + term2
sum = term3 + sum + term2
(I'd prefer
sum += term2
sum += term3, anyway.)


I am looking for ways to reduce either
- the code size,
- lookup table size,
- or number of clock cycles

One source of inspiration on how to code integer arithmetic is libgcc:
A "non-performing" division would be slightly faster than a non-restoring one, but hardly faster than about 120 cycles.
Rather than trying to understand the algorithm you sketch in OVERVIEW and thinking up shortcuts myself, I scrutinised the code presented. Did you write it from scratch, or did you take some compiler output for inspiration?
Catching my eye:
- "register order" differs from the one implied by mul or the GCC calling convention, preventing the 1 cycle&word advantage each movw offers. As this is not included in The constraints, change either one.
- The critical ("normal"?) path turns out to be taken branches mostly. With AVR, non-taken branches are faster.
As it turns out, table access is on the critical path. While it would seem possible to save at least 127 bytes of R1H_TBL, it would cost speed.


With an eye on recognisability rather than mnemonic naming:

; not sure about the correct "movw syntax with defs", anyway)

idivu_16x8:                 ; 8 cycles less than idivu_16x16?
; stab at catching all the micro efficiencies:
; - AVR taken branches take longer: keep the critical path straight
; - "movw" takes one cycle where two "mov"s take two
; - where possible, arrange order of computation to render "tst" redundant
; to leverage movw, low and high registers are exchanged with respect to
;  <https://codereview.stackexchange.com/revisions/254077/4>
; digits starting an aligned in-line comment are cycle counts:
;  per instruction, cumulative in basic block,
;  and worst case cumulative from idivu_16x16

;code for D < 256   
    clr RZERO               ;1       3
;lookup low and high byte of reciprocal into P.
;where P = min(2^16 / D, 2^16-1)
    mov zl, DL              ;1       4
    ldi zh, high(R1L_TBL*2) ;1   1
    lpm PL, Z               ;3   2
    ldi zh, high(R1H_TBL*2) ;1   5
    lpm PH, Z               ;3   6  10
    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    ;calculate N2 = (P * N) >> 16
    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    ;     NH:NL
    ;  X  RH:RL
    ;------------------------------------------
    ;   N2H    |   N2L    |  N1H     | dropped
    ;----------+----------+----------+---------
    ; H(PH*NH) | L(PH*NH) | H(PL*NL) | L(PL*NL)
    ;          | H(PL*NH) | L(PL*NH) |
    ;          | H(PH*NL) | L(PH*NL) |
    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    mul NL , PL     ;2      13  PL*NL
    mov N1H, PRODH  ;1   2      N1H <= H(PL*NL)
    mul NH , PH     ;2   3      PH*NH
    movw N2H,PRODH  ;1   5      N2H <= H(PH*NH)
    ;mov N2L,PRODL  ;1   6      N2L <= L(PH*NH)
    mul NH , PL     ;2   6      PL*NH
    add N1H, PRODL  ;1   8      N1H <= H(PL*NL) + L(PL*NH) 
    adc N2L, PRODH  ;1   9      N2L <= L(PH*NH) + H(PL*NH)
    adc N2H, RZERO  ;1  10      propagate carry to N2H      
    mul NL , PH     ;2  11      PH*NL
    add N1H, PRODL  ;1  13      N1H <= H(PL*NL) + L(PL*NH) + L(PH*NL)
    adc N2L, PRODH  ;1  14      N2L <= L(PH*NH) + H(PL*NH) + H(PH*NL)
    adc N2H, RZERO  ;1  15      propagate carry to N2H  
    ;calculate P = N2 * DL, note DH=0
    mul N2L, DL     ;2  16
    movw P, PROD    ;1  18
    ;mov PH, PRODH  ;1  19
    mul N2H, DL     ;2  20
    add PH, PRODL   ;1  22
    rjmp idivu_16x16_adj_n  ;2  23  36

d1Heq:
    mov N2L, N1H            ;1      25
    clr N2H                 ;1   1
    rjmp idivu_16x16_checkn ;2   2  27

idivu_16x16:
    ;Check that DH is not zero
    tst DH                  ;1   0   0
    breq idivu_16x8         ;2   1

;code for D >= 256
;idivu_16x16_dhne:
    clr RZERO               ;1       2 *
;Lookup Rx = min(256 / DH, 255)
    mov zl, DH              ;1       3 *
    ldi zh, high(R1H_TBL*2) ;1   1
    lpm Rx, Z               ;3   2

    ;N1 = (N? * Rx) >> 8
    mul Rx , NH             ;2   5
    movw N1L,PRODL          ;1   7
    ;mov N1H,PRODH          ;1   8
    mul Rx , NL             ;2   8
    add N1L, PRODH          ;1  10
    adc N1H, RZERO          ;1  11

    ;D1 = (D * Rx) >> 8
    mul Rx , DH             ;2  12
    movw D1L,PRODL          ;1  14
    ;mov D1H,PRODH          ;1  15
    mul Rx , DL             ;2  15
    add D1L, PRODH          ;1  17
    adc D1H, RZERO          ;1  18

    ;if D1H = 0 then use Rx = 256, otherwise use table
    ;tst D1H                ;1  19
    brne d1Heq              ;2  19  22

idivu_16x16_dxhne:
    ;Lookup Rx = (2 ^ 16) \ (256 + D1H)
    mov zl, D1L             ;1      23 *
    ldi zh, high(R2_TBL*2)  ;1   1
    lpm Rx, Z               ;3   2
    ;N2 = (N1 * R2) >> 16
    mul Rx , N1H            ;2   5
    mov PL , PRODL          ;1   7
    mov N2L, PRODH          ;1   8
    mul Rx , N1L            ;2   9
    add PL , PRODH          ;1  11
    adc N2L, RZERO          ;1  12
    clr N2H                 ;1  13  36

idivu_16x16_checkn:
    ;Check result (it may be off by +/- 1)
    ;P = N2 * D
    ;NOTE:  N2 <=255 so NxH = 0,
    ;       also P < 2^16 so we can discard upper byte of DH * NxL
    mul DL, N2L             ;2      37 *
    movw PL, PRODL          ;1   2
    ;mov PH, PRODH          ;1   3
    mul DH, N2L             ;2   3
    add PH, PRODL           ;1   5
    ;brcc idivu_16x16_adj_n ;2   6  43
    brcs idivu_16x16_mofl   ;2   6  43

;Adjust result up or down by 1 if needed.
idivu_16x16_adj_n:
    ;Add -P to N, with result in P
    ;mov N1L, NL            ;1      44 *
    movw N1H, NH            ;1
    sub N1L, PL             ;1   1
    sbc N1H, PH             ;1   2
    ;brsh idivu_16x16_pltn  ;2   3  47
    brlo idivu_16x16_decn2  ;2   3  47

idivu_16x16_pltn:
    ;test remainder to D 
    cp  N1L, DL             ;1      49 *
    cpc N1H, DH             ;1   1
    ;if remainder < D then goto end
    brlo idivu_16x16_end    ;2   2  51

    ;if remainder >= D then increment quotient, reduce remainder
    subi N2L, 0xFF          ;1   3
    sbci N2H, 0xFF          ;1   4
    sub N1L, DL             ;1   5
    sbc N1H, DH             ;1   6  55
idivu_16x16_end:
    ret                     ;       56 **

idivu_16x16_decn2:
    ;if P > N then decrement quotient, add to remainder
    subi N2L, 1             ;1      49
    sbci N2H, 0             ;1   1
    add  N1L, DL            ;1   2
    adc  N1H, DH            ;1   3
    ret                     ;    4  53


idivu_16x16_mofl:
    ;if multiply overflowed then...
    ;decrement quotient
    ;calculate remainder as N - P + D
    subi N2L, 0x01          ;1      45
    sbci N2H, 0x00          ;1   1
    mov N1L, NL             ;1   2
    mov N1H, NH             ;1   3
    sub N1L, PL             ;1   4
    sbc N1H, PH             ;1   5
    add N1L, DL             ;1   6
    adc N1H, DH             ;1   7
    ret                     ;1   8  53
greybeard
  • 7.7k
  • 3
  • 21
  • 56