papers/20070526_cpu08_divu16_by_u16/div16e16x16.s

;$Header: /home/dashley/cvsrep/e3ft_gpl01/e3ft_gpl01/dtaipubs/cron/2007/cpu08div16by16a/div16e16x16.s,v 1.4 2007/05/26 17:48:46 dashley Exp $:
;------------------------------------------------------------------------------
        xdef        c_udiv, c_umod
        xref.b        c_reg
        .dcall        "9,0,c_umod"
        .dcall        "7,0,c_udiv"
;
;Function Entry Conditions:
;  NH in C_REG (a zero-page, i.e. direct addressing variable)
;  NL in A
;  DH in H
;  DL in X
;
;Function Exit Conditions
;  QH in C_REG
;  QL in A
;  RH in H
;  RL in X
;  Stack pointer (naturally) restored.
;  No restrictions on condition codes (can be arbitrarily changed).
;  No restrictions on other CPU registers (can be arbitrarily changed).
;
c_udiv:
        psha                        ; save NL
        pshh                        ; DH on the stack
        tst        1,sp                ; test zero
        bne        full                ; full division
;
;Short divisor division.
;Divisor <= 255 (and may also be zero).
;
;Note to MJB:  your staff did a better job on this than
;me.  I had 48 clocks, this path seems to be 45.
;        
        pulh            ; clean stack
        lda    c_reg    ; NH in A
        clrh            ; 1st divide 8 X 8
        div             ; divide (H:A)/(X) -> A with remainder H
        sta    c_reg    ; QH in place
        pula            ; complete dividend with NL
        div             ; divide
        pshh            ; move RL
        pulx            ; in X
        clrh            ; RH is zero
        rts             ; and return
;
;Long Divisor Division
;
;Mathematical Assertions
;   Divisor >= 256 (and is definitely not zero).
;   Numerator can be any value.
;   Quotient <= 255.
;   Remainder may be two bytes.
;
;Entry Conditions
;  Stack frame already grown by 2 (and could use 1,SP and 2,SP
;  for intermediate results).
;  NH in C_REG (a zero-page, i.e. direct addressing variable)
;  NL in A and 2,SP
;  DH in H and 1,SP
;  DL in X
;
full:
        pshx            ; save DL
        psha            ; start with
        lda    c_reg    ; dividend
        psha
        ;
        ;Stack condition
        ;  1,SP NH
        ;  2,SP NL
        ;  3,SP DL
        ;  4,SP DH
        ;  5,SP NL
        ;Due to redundancy, can probably re-use 5,SP for
        ;something else.
        ;
        lda    4,sp     ; DH
        ;
        ;Shift the numerator and denominator right until the denominator
        ;just fits in one byte.  At least one shift will be required,
        ;because the denominator is known to be >= 256, hence the
        ;do-while() rather than while() loop.
        ;
sbcl:
        lsr    1,sp
        ror    2,sp
        lsra
        rorx
        tsta
        bne    sbcl
;
;DTA note:  the loop above looks excellent.  No real fat.  The only
;optimizations that seems possible are (a) to use c_reg to shift one byte
;of the numerator in so that a 4-cycle rather than a 5-cycle instruction
;is used, or (b) to use X rather than SP so that have 2 4-cycle rather than
;2 5-cycle instructions.
;
;Do the initial digit estimation division.  Knuth calls out the possibility of
;an overflow, and that an overflow should be replaced by a quotient of 255.
;Not sure if an overflow is possible.  0 <= N <= 65535, and 256 <= D <= 65535.
;Assuming max N and min D after one shift gives 32767/128 = 255, still safe.
;Need to be more rigorous about the math to be sure that overflow not possible
;on the division.  Note also that Knuth's algorithm calls for left-shifting to
;normalize and for the creation of an extra digit (in our case, extra byte).
;Because we have bounds on the divisor going in, right-shifting gives the same
;effect for us.  There may be something about the setup that changes Knuth's
;assumptions that overflow is possible.  Will need to revisit this and prove
;it rigorously.
;
        txa
        pulh
        pulx
        div
;
;Let:
;  N be the numerator in to the function.
;  D be the denominator in to the function.
;  Q be the actual quotient.
;  R be the actual remainder.
;  Z(0) be the first trial quotient we calculated.
;  Z(-1) be the trial quotient reduced by one.
;  Z(-2) be the trial quotient reduced by two.
;  Z() be the trial quotient we are working with at some
;     point in time.
;
;The original MJB approach was to reconstruct Z * D, then
;subtract D up to twice to get Z(-1) * D and Z(-2) * D.  That seemed to
;bloat the code.  Let's try instead (to save FLASH) to loop using
;multiplication until N - (Z() * D) < D, or equivalently
;that Z() * D > N - D.  We've gotta use the first expression, however,
;because the second would require either an exception case or 2's
;complement arithmetic and a third byte if N < D.
;
;
;                     +-------------------------------------+
;                     | Entry Condition: have Z(0)          |
;                     | calculated.                         |
;                     +-------------------------------------+ 
;                                       | 
;                                       |
;                                      \|/
;                     +-------------------------------------+/
;                     | Calculate Z() * D                   |------+
;                     +-------------------------------------+\     |
;                                       |                          |
;                                       |                          |
;                                      \|/                         |
;                   Y +-------------------------------------+      |
;               +-----| Z() * D > 65535 ?                   |      |
;               |     +-------------------------------------+      |
;               |                       | N                        |
;               |                       |                          |
;               |                      \|/                         |
;               |     +-------------------------------------+      |
;               |     | Calculate N - Z() * D               |      |
;               |     +-------------------------------------+      |
;               |                       |                          |
;               |                       |                          |
;               |                      \|/                         |
;               |     +-------------------------------------+      |
;               |     | N - Z() * D < D ?                   |------------+
;               |     +-------------------------------------+      |     |
;               |                       |                          |     |
;               |                       |                          |     |
;               |                      \|/                         |     |
;               |    \+-------------------------------------+      |     |
;               +-----| Z() --                              |------+     |
;                    /+-------------------------------------+            |
;                                                                        |
;                                       +--------------------------------+
;                                       |
;                                      \|/
;                                     Done
;
;Stack Frame Setup
;  Keeping track of the stack frame is a lot of work before everything is
;  refined, so will just use the instructions and pretend the stack frame
;  has been enlarged via AIS.  (Note that I've swapped DH and DL below--they
;  seemed to be out of order from before.)
;
;  Imaginary Stack Setup
;    1,SP NH
;    2,SP NL
;    3,SP DH
;    4,SP DL
;    5,SP Z(0)
;    6,SP Z() * D, High Byte
;    7,SP Z() * D, Low Byte
;    8,SP N - Z() * D, High Byte
;    9,SP N - Z() * D, Low Byte
;    
;
        sta    5,sp     ;Save Z(0).
        ;
nextz:        
        ;-------------------
        ;Calculate Z() * D
        ;-------------------
        ldx    4,sp
        mul             ;Lower product in X:A.
        sta    7,sp
        stx    8,sp
        lda    5,sp
        ldx    3,sp
        mul             ;Upper product in X:A.
        tstx
        bne    p2big    ;If the MSB of the upper product (byte 3) isn't zero,
                        ;this automatically implies that Z() is too big.  It may
                        ;be possible to shave out this test ... requires more
                        ;analysis.
        add    6,sp
        bcs    p2big    ;If there is a carry out of adding the MSB in, this is
                        ;also an automatic cue that Z() is too big.  It may
                        ;be possible to shave out this test ... requires more
                        ;analysis.
        sta    6,sp     ;Put result back.  (6,sp):(7,sp) now contains product,
                        ;and it definitely fits in 16 bits.
        ;
        ;-----------------------
        ;Calculate N - Z() * D
        ;-----------------------
        lda    2,sp
        sub    7,sp
        sta    9,sp
        lda    1,sp
        sub    6,sp
        sta    8,sp
        bcs    p2big    ;If N-Z()*D is negative, Z() is too big.
        
        ;-------------------------------
        ;Compare N - Z() * D against D
        ;-------------------------------
        ;Branches below are imaginary -- too lazy to look up the
        ;instructions right now.
        ;-------------------------------
        lda    3,sp
        cmp    8,sp
        bcc    zwrong   ;Need to look up right branches.
        lda    4,sp
        cmp    9,sp
        bcc    zwrong   ;Need to look up right branches.
        bra    zright
        
        ;-------------------------------
        ;Z() --
        ;-------------------------------
p2big:        
zwrong:
        dec    5,sp
        lda    5,sp
        bra    nextz                                
                        
;At this point, the remainder is definitely is (8,sp):(9,sp), and
;the quotient is in 5,sp.  Set up and return.
zright:
        lda    8,sp
        psha
        pulh            ;MSB of remainder in H.
        ldx    9,sp     ;LSB of remainder in X.
        
        clr    c_reg    ;MSB of quotient zero.
        lda    5,sp     ;LSB of quotient in A.
                
        ais     #8      ;Adjust stack.  Imaginary offset.
        
        rts             ;And return
;
        end
;------------------------------------------------------------------------------
;$Log: div16e16x16.s,v $
;Revision 1.4  2007/05/26 17:48:46  dashley
;Preliminary alternate solution in place.
;
;Revision 1.3  2007/05/26 16:13:10  dashley
;Safety checkin after substantial edits.
;
;Revision 1.2  2007/05/25 23:34:31  dashley
;Proposed Knuth solution integrated into version-control system.
1	dashley	10	;$Header: /home/dashley/cvsrep/e3ft_gpl01/e3ft_gpl01/dtaipubs/cron/2007/cpu08div16by16a/div16e16x16.s,v 1.4 2007/05/26 17:48:46 dashley Exp $:
2			;------------------------------------------------------------------------------
3			xdef c_udiv, c_umod
4			xref.b c_reg
5			.dcall "9,0,c_umod"
6			.dcall "7,0,c_udiv"
7			;
8			;Function Entry Conditions:
9			; NH in C_REG (a zero-page, i.e. direct addressing variable)
10			; NL in A
11			; DH in H
12			; DL in X
13			;
14			;Function Exit Conditions
15			; QH in C_REG
16			; QL in A
17			; RH in H
18			; RL in X
19			; Stack pointer (naturally) restored.
20			; No restrictions on condition codes (can be arbitrarily changed).
21			; No restrictions on other CPU registers (can be arbitrarily changed).
22			;
23			c_udiv:
24			psha ; save NL
25			pshh ; DH on the stack
26			tst 1,sp ; test zero
27			bne full ; full division
28			;
29			;Short divisor division.
30			;Divisor <= 255 (and may also be zero).
31			;
32			;Note to MJB: your staff did a better job on this than
33			;me. I had 48 clocks, this path seems to be 45.
34			;
35			pulh ; clean stack
36			lda c_reg ; NH in A
37			clrh ; 1st divide 8 X 8
38			div ; divide (H:A)/(X) -> A with remainder H
39			sta c_reg ; QH in place
40			pula ; complete dividend with NL
41			div ; divide
42			pshh ; move RL
43			pulx ; in X
44			clrh ; RH is zero
45			rts ; and return
46			;
47			;Long Divisor Division
48			;
49			;Mathematical Assertions
50			; Divisor >= 256 (and is definitely not zero).
51			; Numerator can be any value.
52			; Quotient <= 255.
53			; Remainder may be two bytes.
54			;
55			;Entry Conditions
56			; Stack frame already grown by 2 (and could use 1,SP and 2,SP
57			; for intermediate results).
58			; NH in C_REG (a zero-page, i.e. direct addressing variable)
59			; NL in A and 2,SP
60			; DH in H and 1,SP
61			; DL in X
62			;
63			full:
64			pshx ; save DL
65			psha ; start with
66			lda c_reg ; dividend
67			psha
68			;
69			;Stack condition
70			; 1,SP NH
71			; 2,SP NL
72			; 3,SP DL
73			; 4,SP DH
74			; 5,SP NL
75			;Due to redundancy, can probably re-use 5,SP for
76			;something else.
77			;
78			lda 4,sp ; DH
79			;
80			;Shift the numerator and denominator right until the denominator
81			;just fits in one byte. At least one shift will be required,
82			;because the denominator is known to be >= 256, hence the
83			;do-while() rather than while() loop.
84			;
85			sbcl:
86			lsr 1,sp
87			ror 2,sp
88			lsra
89			rorx
90			tsta
91			bne sbcl
92			;
93			;DTA note: the loop above looks excellent. No real fat. The only
94			;optimizations that seems possible are (a) to use c_reg to shift one byte
95			;of the numerator in so that a 4-cycle rather than a 5-cycle instruction
96			;is used, or (b) to use X rather than SP so that have 2 4-cycle rather than
97			;2 5-cycle instructions.
98			;
99			;Do the initial digit estimation division. Knuth calls out the possibility of
100			;an overflow, and that an overflow should be replaced by a quotient of 255.
101			;Not sure if an overflow is possible. 0 <= N <= 65535, and 256 <= D <= 65535.
102			;Assuming max N and min D after one shift gives 32767/128 = 255, still safe.
103			;Need to be more rigorous about the math to be sure that overflow not possible
104			;on the division. Note also that Knuth's algorithm calls for left-shifting to
105			;normalize and for the creation of an extra digit (in our case, extra byte).
106			;Because we have bounds on the divisor going in, right-shifting gives the same
107			;effect for us. There may be something about the setup that changes Knuth's
108			;assumptions that overflow is possible. Will need to revisit this and prove
109			;it rigorously.
110			;
111			txa
112			pulh
113			pulx
114			div
115			;
116			;Let:
117			; N be the numerator in to the function.
118			; D be the denominator in to the function.
119			; Q be the actual quotient.
120			; R be the actual remainder.
121			; Z(0) be the first trial quotient we calculated.
122			; Z(-1) be the trial quotient reduced by one.
123			; Z(-2) be the trial quotient reduced by two.
124			; Z() be the trial quotient we are working with at some
125			; point in time.
126			;
127			;The original MJB approach was to reconstruct Z * D, then
128			;subtract D up to twice to get Z(-1) * D and Z(-2) * D. That seemed to
129			;bloat the code. Let's try instead (to save FLASH) to loop using
130			;multiplication until N - (Z() * D) < D, or equivalently
131			;that Z() * D > N - D. We've gotta use the first expression, however,
132			;because the second would require either an exception case or 2's
133			;complement arithmetic and a third byte if N < D.
134			;
135			;
136			; +-------------------------------------+
137			; \| Entry Condition: have Z(0) \|
138			; \| calculated. \|
139			; +-------------------------------------+
140			; \|
141			; \|
142			; \\|/
143			; +-------------------------------------+/
144			; \| Calculate Z() * D \|------+
145			; +-------------------------------------+\ \|
146			; \| \|
147			; \| \|
148			; \\|/ \|
149			; Y +-------------------------------------+ \|
150			; +-----\| Z() * D > 65535 ? \| \|
151			; \| +-------------------------------------+ \|
152			; \| \| N \|
153			; \| \| \|
154			; \| \\|/ \|
155			; \| +-------------------------------------+ \|
156			; \| \| Calculate N - Z() * D \| \|
157			; \| +-------------------------------------+ \|
158			; \| \| \|
159			; \| \| \|
160			; \| \\|/ \|
161			; \| +-------------------------------------+ \|
162			; \| \| N - Z() * D < D ? \|------------+
163			; \| +-------------------------------------+ \| \|
164			; \| \| \| \|
165			; \| \| \| \|
166			; \| \\|/ \| \|
167			; \| \+-------------------------------------+ \| \|
168			; +-----\| Z() -- \|------+ \|
169			; /+-------------------------------------+ \|
170			; \|
171			; +--------------------------------+
172			; \|
173			; \\|/
174			; Done
175			;
176			;Stack Frame Setup
177			; Keeping track of the stack frame is a lot of work before everything is
178			; refined, so will just use the instructions and pretend the stack frame
179			; has been enlarged via AIS. (Note that I've swapped DH and DL below--they
180			; seemed to be out of order from before.)
181			;
182			; Imaginary Stack Setup
183			; 1,SP NH
184			; 2,SP NL
185			; 3,SP DH
186			; 4,SP DL
187			; 5,SP Z(0)
188			; 6,SP Z() * D, High Byte
189			; 7,SP Z() * D, Low Byte
190			; 8,SP N - Z() * D, High Byte
191			; 9,SP N - Z() * D, Low Byte
192			;
193			;
194			sta 5,sp ;Save Z(0).
195			;
196			nextz:
197			;-------------------
198			;Calculate Z() * D
199			;-------------------
200			ldx 4,sp
201			mul ;Lower product in X:A.
202			sta 7,sp
203			stx 8,sp
204			lda 5,sp
205			ldx 3,sp
206			mul ;Upper product in X:A.
207			tstx
208			bne p2big ;If the MSB of the upper product (byte 3) isn't zero,
209			;this automatically implies that Z() is too big. It may
210			;be possible to shave out this test ... requires more
211			;analysis.
212			add 6,sp
213			bcs p2big ;If there is a carry out of adding the MSB in, this is
214			;also an automatic cue that Z() is too big. It may
215			;be possible to shave out this test ... requires more
216			;analysis.
217			sta 6,sp ;Put result back. (6,sp):(7,sp) now contains product,
218			;and it definitely fits in 16 bits.
219			;
220			;-----------------------
221			;Calculate N - Z() * D
222			;-----------------------
223			lda 2,sp
224			sub 7,sp
225			sta 9,sp
226			lda 1,sp
227			sub 6,sp
228			sta 8,sp
229			bcs p2big ;If N-Z()*D is negative, Z() is too big.
230
231			;-------------------------------
232			;Compare N - Z() * D against D
233			;-------------------------------
234			;Branches below are imaginary -- too lazy to look up the
235			;instructions right now.
236			;-------------------------------
237			lda 3,sp
238			cmp 8,sp
239			bcc zwrong ;Need to look up right branches.
240			lda 4,sp
241			cmp 9,sp
242			bcc zwrong ;Need to look up right branches.
243			bra zright
244
245			;-------------------------------
246			;Z() --
247			;-------------------------------
248			p2big:
249			zwrong:
250			dec 5,sp
251			lda 5,sp
252			bra nextz
253
254			;At this point, the remainder is definitely is (8,sp):(9,sp), and
255			;the quotient is in 5,sp. Set up and return.
256			zright:
257			lda 8,sp
258			psha
259			pulh ;MSB of remainder in H.
260			ldx 9,sp ;LSB of remainder in X.
261
262			clr c_reg ;MSB of quotient zero.
263			lda 5,sp ;LSB of quotient in A.
264
265			ais #8 ;Adjust stack. Imaginary offset.
266
267			rts ;And return
268			;
269			end
270			;------------------------------------------------------------------------------
271			;$Log: div16e16x16.s,v $
272			;Revision 1.4 2007/05/26 17:48:46 dashley
273			;Preliminary alternate solution in place.
274			;
275			;Revision 1.3 2007/05/26 16:13:10 dashley
276			;Safety checkin after substantial edits.
277			;
278			;Revision 1.2 2007/05/25 23:34:31 dashley
279			;Proposed Knuth solution integrated into version-control system.