Have mercy on the maintenance programmer - It may be your older self.
Separate documentation tends to get
- separated, as in not always readily accessible
(Murphy: when direly needed)
- out of sync - well, that's a problem even with in-line comments
→ document, in the code (for every part created for a separate reason)
- what it is good for
- where it got inspired (there may be easily accessible explanations enlightening to someone unfamiliar with the problem at hand or the approach used: even name dropping may help find such)
- what has been the incentive to write it
Adopting good practices running counter to adverse customs isn't easy and fast.
Much material about assembly programming is pre-1980ies, when there was some reason to have short mnemonics for instructions and operands. (No matter pointing pen or finger at a (printed…if you were lucky) program listing: no pop-up. So better keep things all in one line…)
Please use telling names. Coding in assembly is no licence not to.
In a division implementation, I'd not imagine problems with R for remainder or Q for quotient. Resist any impulse to outsmart everyone with the likes of DVsor. N for numerator wouldn't be bad if talking about fractions, but if
N2 and N1 in addition - all three in H and L flavours - weren't bad enough, along comes ;NOTE: N2 <=255 so NxH = 0, also P < 2^16 so we can discard upper byte of DH * NxL. P is mentioned in the ALGORITHM OVERVIEW.
In one comment, you switched from
sum = sum + term2
sum = sum + term2 + term3
to
sum = sum + term2
sum = term3 + sum + term2
(I'd prefer
sum += term2
sum += term3, anyway.)
I am looking for ways to reduce either
- the code size,
- lookup table size,
- or number of clock cycles
One source of inspiration on how to code integer arithmetic is libgcc:
A "non-performing" division would be slightly faster than a non-restoring one, but hardly faster than about 120 cycles.
Rather than trying to understand the algorithm you sketch in OVERVIEW and thinking up shortcuts myself, I scrutinised the code presented. Did you write it from scratch, or did you take some compiler output for inspiration?
Catching my eye:
- "register order" differs from the one implied by mul or the GCC calling convention, preventing the 1 cycle&word advantage each movw offers. As this is not included in The constraints, change either one.
- The critical ("normal"?) path turns out to be taken branches mostly. With AVR, non-taken branches are faster.
As it turns out, table access is on the critical path. While it would seem possible to save at least 127 bytes of R1H_TBL, it would cost speed.
With an eye on recognisability rather than mnemonic naming:
; not sure about the correct "movw syntax with defs", anyway)
idivu_16x8: ; 8 cycles less than idivu_16x16?
; stab at catching all the micro efficiencies:
; - AVR taken branches take longer: keep the critical path straight
; - "movw" takes one cycle where two "mov"s take two
; - where possible, arrange order of computation to render "tst" redundant
; to leverage movw, low and high registers are exchanged with respect to
; <https://codereview.stackexchange.com/revisions/254077/4>
; digits starting an aligned in-line comment are cycle counts:
; per instruction, cumulative in basic block,
; and worst case cumulative from idivu_16x16
;code for D < 256
clr RZERO ;1 3
;lookup low and high byte of reciprocal into P.
;where P = min(2^16 / D, 2^16-1)
mov zl, DL ;1 4
ldi zh, high(R1L_TBL*2) ;1 1
lpm PL, Z ;3 2
ldi zh, high(R1H_TBL*2) ;1 5
lpm PH, Z ;3 6 10
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;calculate N2 = (P * N) >> 16
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; NH:NL
; X RH:RL
;------------------------------------------
; N2H | N2L | N1H | dropped
;----------+----------+----------+---------
; H(PH*NH) | L(PH*NH) | H(PL*NL) | L(PL*NL)
; | H(PL*NH) | L(PL*NH) |
; | H(PH*NL) | L(PH*NL) |
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
mul NL , PL ;2 13 PL*NL
mov N1H, PRODH ;1 2 N1H <= H(PL*NL)
mul NH , PH ;2 3 PH*NH
movw N2H,PRODH ;1 5 N2H <= H(PH*NH)
;mov N2L,PRODL ;1 6 N2L <= L(PH*NH)
mul NH , PL ;2 6 PL*NH
add N1H, PRODL ;1 8 N1H <= H(PL*NL) + L(PL*NH)
adc N2L, PRODH ;1 9 N2L <= L(PH*NH) + H(PL*NH)
adc N2H, RZERO ;1 10 propagate carry to N2H
mul NL , PH ;2 11 PH*NL
add N1H, PRODL ;1 13 N1H <= H(PL*NL) + L(PL*NH) + L(PH*NL)
adc N2L, PRODH ;1 14 N2L <= L(PH*NH) + H(PL*NH) + H(PH*NL)
adc N2H, RZERO ;1 15 propagate carry to N2H
;calculate P = N2 * DL, note DH=0
mul N2L, DL ;2 16
movw P, PROD ;1 18
;mov PH, PRODH ;1 19
mul N2H, DL ;2 20
add PH, PRODL ;1 22
rjmp idivu_16x16_adj_n ;2 23 36
d1Heq:
mov N2L, N1H ;1 25
clr N2H ;1 1
rjmp idivu_16x16_checkn ;2 2 27
idivu_16x16:
;Check that DH is not zero
tst DH ;1 0 0
breq idivu_16x8 ;2 1
;code for D >= 256
;idivu_16x16_dhne:
clr RZERO ;1 2 *
;Lookup Rx = min(256 / DH, 255)
mov zl, DH ;1 3 *
ldi zh, high(R1H_TBL*2) ;1 1
lpm Rx, Z ;3 2
;N1 = (N? * Rx) >> 8
mul Rx , NH ;2 5
movw N1L,PRODL ;1 7
;mov N1H,PRODH ;1 8
mul Rx , NL ;2 8
add N1L, PRODH ;1 10
adc N1H, RZERO ;1 11
;D1 = (D * Rx) >> 8
mul Rx , DH ;2 12
movw D1L,PRODL ;1 14
;mov D1H,PRODH ;1 15
mul Rx , DL ;2 15
add D1L, PRODH ;1 17
adc D1H, RZERO ;1 18
;if D1H = 0 then use Rx = 256, otherwise use table
;tst D1H ;1 19
brne d1Heq ;2 19 22
idivu_16x16_dxhne:
;Lookup Rx = (2 ^ 16) \ (256 + D1H)
mov zl, D1L ;1 23 *
ldi zh, high(R2_TBL*2) ;1 1
lpm Rx, Z ;3 2
;N2 = (N1 * R2) >> 16
mul Rx , N1H ;2 5
mov PL , PRODL ;1 7
mov N2L, PRODH ;1 8
mul Rx , N1L ;2 9
add PL , PRODH ;1 11
adc N2L, RZERO ;1 12
clr N2H ;1 13 36
idivu_16x16_checkn:
;Check result (it may be off by +/- 1)
;P = N2 * D
;NOTE: N2 <=255 so NxH = 0,
; also P < 2^16 so we can discard upper byte of DH * NxL
mul DL, N2L ;2 37 *
movw PL, PRODL ;1 2
;mov PH, PRODH ;1 3
mul DH, N2L ;2 3
add PH, PRODL ;1 5
;brcc idivu_16x16_adj_n ;2 6 43
brcs idivu_16x16_mofl ;2 6 43
;Adjust result up or down by 1 if needed.
idivu_16x16_adj_n:
;Add -P to N, with result in P
;mov N1L, NL ;1 44 *
movw N1H, NH ;1
sub N1L, PL ;1 1
sbc N1H, PH ;1 2
;brsh idivu_16x16_pltn ;2 3 47
brlo idivu_16x16_decn2 ;2 3 47
idivu_16x16_pltn:
;test remainder to D
cp N1L, DL ;1 49 *
cpc N1H, DH ;1 1
;if remainder < D then goto end
brlo idivu_16x16_end ;2 2 51
;if remainder >= D then increment quotient, reduce remainder
subi N2L, 0xFF ;1 3
sbci N2H, 0xFF ;1 4
sub N1L, DL ;1 5
sbc N1H, DH ;1 6 55
idivu_16x16_end:
ret ; 56 **
idivu_16x16_decn2:
;if P > N then decrement quotient, add to remainder
subi N2L, 1 ;1 49
sbci N2H, 0 ;1 1
add N1L, DL ;1 2
adc N1H, DH ;1 3
ret ; 4 53
idivu_16x16_mofl:
;if multiply overflowed then...
;decrement quotient
;calculate remainder as N - P + D
subi N2L, 0x01 ;1 45
sbci N2H, 0x00 ;1 1
mov N1L, NL ;1 2
mov N1H, NH ;1 3
sub N1L, PL ;1 4
sbc N1H, PH ;1 5
add N1L, DL ;1 6
adc N1H, DH ;1 7
ret ;1 8 53