1 |
;$Header: /home/dashley/cvsrep/e3ft_gpl01/e3ft_gpl01/dtaipubs/cron/2007/cpu08div16by16a/div16e16x16.s,v 1.4 2007/05/26 17:48:46 dashley Exp $:
|
2 |
;------------------------------------------------------------------------------
|
3 |
xdef c_udiv, c_umod
|
4 |
xref.b c_reg
|
5 |
.dcall "9,0,c_umod"
|
6 |
.dcall "7,0,c_udiv"
|
7 |
;
|
8 |
;Function Entry Conditions:
|
9 |
; NH in C_REG (a zero-page, i.e. direct addressing variable)
|
10 |
; NL in A
|
11 |
; DH in H
|
12 |
; DL in X
|
13 |
;
|
14 |
;Function Exit Conditions
|
15 |
; QH in C_REG
|
16 |
; QL in A
|
17 |
; RH in H
|
18 |
; RL in X
|
19 |
; Stack pointer (naturally) restored.
|
20 |
; No restrictions on condition codes (can be arbitrarily changed).
|
21 |
; No restrictions on other CPU registers (can be arbitrarily changed).
|
22 |
;
|
23 |
c_udiv:
|
24 |
psha ; save NL
|
25 |
pshh ; DH on the stack
|
26 |
tst 1,sp ; test zero
|
27 |
bne full ; full division
|
28 |
;
|
29 |
;Short divisor division.
|
30 |
;Divisor <= 255 (and may also be zero).
|
31 |
;
|
32 |
;Note to MJB: your staff did a better job on this than
|
33 |
;me. I had 48 clocks, this path seems to be 45.
|
34 |
;
|
35 |
pulh ; clean stack
|
36 |
lda c_reg ; NH in A
|
37 |
clrh ; 1st divide 8 X 8
|
38 |
div ; divide (H:A)/(X) -> A with remainder H
|
39 |
sta c_reg ; QH in place
|
40 |
pula ; complete dividend with NL
|
41 |
div ; divide
|
42 |
pshh ; move RL
|
43 |
pulx ; in X
|
44 |
clrh ; RH is zero
|
45 |
rts ; and return
|
46 |
;
|
47 |
;Long Divisor Division
|
48 |
;
|
49 |
;Mathematical Assertions
|
50 |
; Divisor >= 256 (and is definitely not zero).
|
51 |
; Numerator can be any value.
|
52 |
; Quotient <= 255.
|
53 |
; Remainder may be two bytes.
|
54 |
;
|
55 |
;Entry Conditions
|
56 |
; Stack frame already grown by 2 (and could use 1,SP and 2,SP
|
57 |
; for intermediate results).
|
58 |
; NH in C_REG (a zero-page, i.e. direct addressing variable)
|
59 |
; NL in A and 2,SP
|
60 |
; DH in H and 1,SP
|
61 |
; DL in X
|
62 |
;
|
63 |
full:
|
64 |
pshx ; save DL
|
65 |
psha ; start with
|
66 |
lda c_reg ; dividend
|
67 |
psha
|
68 |
;
|
69 |
;Stack condition
|
70 |
; 1,SP NH
|
71 |
; 2,SP NL
|
72 |
; 3,SP DL
|
73 |
; 4,SP DH
|
74 |
; 5,SP NL
|
75 |
;Due to redundancy, can probably re-use 5,SP for
|
76 |
;something else.
|
77 |
;
|
78 |
lda 4,sp ; DH
|
79 |
;
|
80 |
;Shift the numerator and denominator right until the denominator
|
81 |
;just fits in one byte. At least one shift will be required,
|
82 |
;because the denominator is known to be >= 256, hence the
|
83 |
;do-while() rather than while() loop.
|
84 |
;
|
85 |
sbcl:
|
86 |
lsr 1,sp
|
87 |
ror 2,sp
|
88 |
lsra
|
89 |
rorx
|
90 |
tsta
|
91 |
bne sbcl
|
92 |
;
|
93 |
;DTA note: the loop above looks excellent. No real fat. The only
|
94 |
;optimizations that seems possible are (a) to use c_reg to shift one byte
|
95 |
;of the numerator in so that a 4-cycle rather than a 5-cycle instruction
|
96 |
;is used, or (b) to use X rather than SP so that have 2 4-cycle rather than
|
97 |
;2 5-cycle instructions.
|
98 |
;
|
99 |
;Do the initial digit estimation division. Knuth calls out the possibility of
|
100 |
;an overflow, and that an overflow should be replaced by a quotient of 255.
|
101 |
;Not sure if an overflow is possible. 0 <= N <= 65535, and 256 <= D <= 65535.
|
102 |
;Assuming max N and min D after one shift gives 32767/128 = 255, still safe.
|
103 |
;Need to be more rigorous about the math to be sure that overflow not possible
|
104 |
;on the division. Note also that Knuth's algorithm calls for left-shifting to
|
105 |
;normalize and for the creation of an extra digit (in our case, extra byte).
|
106 |
;Because we have bounds on the divisor going in, right-shifting gives the same
|
107 |
;effect for us. There may be something about the setup that changes Knuth's
|
108 |
;assumptions that overflow is possible. Will need to revisit this and prove
|
109 |
;it rigorously.
|
110 |
;
|
111 |
txa
|
112 |
pulh
|
113 |
pulx
|
114 |
div
|
115 |
;
|
116 |
;Let:
|
117 |
; N be the numerator in to the function.
|
118 |
; D be the denominator in to the function.
|
119 |
; Q be the actual quotient.
|
120 |
; R be the actual remainder.
|
121 |
; Z(0) be the first trial quotient we calculated.
|
122 |
; Z(-1) be the trial quotient reduced by one.
|
123 |
; Z(-2) be the trial quotient reduced by two.
|
124 |
; Z() be the trial quotient we are working with at some
|
125 |
; point in time.
|
126 |
;
|
127 |
;The original MJB approach was to reconstruct Z * D, then
|
128 |
;subtract D up to twice to get Z(-1) * D and Z(-2) * D. That seemed to
|
129 |
;bloat the code. Let's try instead (to save FLASH) to loop using
|
130 |
;multiplication until N - (Z() * D) < D, or equivalently
|
131 |
;that Z() * D > N - D. We've gotta use the first expression, however,
|
132 |
;because the second would require either an exception case or 2's
|
133 |
;complement arithmetic and a third byte if N < D.
|
134 |
;
|
135 |
;
|
136 |
; +-------------------------------------+
|
137 |
; | Entry Condition: have Z(0) |
|
138 |
; | calculated. |
|
139 |
; +-------------------------------------+
|
140 |
; |
|
141 |
; |
|
142 |
; \|/
|
143 |
; +-------------------------------------+/
|
144 |
; | Calculate Z() * D |------+
|
145 |
; +-------------------------------------+\ |
|
146 |
; | |
|
147 |
; | |
|
148 |
; \|/ |
|
149 |
; Y +-------------------------------------+ |
|
150 |
; +-----| Z() * D > 65535 ? | |
|
151 |
; | +-------------------------------------+ |
|
152 |
; | | N |
|
153 |
; | | |
|
154 |
; | \|/ |
|
155 |
; | +-------------------------------------+ |
|
156 |
; | | Calculate N - Z() * D | |
|
157 |
; | +-------------------------------------+ |
|
158 |
; | | |
|
159 |
; | | |
|
160 |
; | \|/ |
|
161 |
; | +-------------------------------------+ |
|
162 |
; | | N - Z() * D < D ? |------------+
|
163 |
; | +-------------------------------------+ | |
|
164 |
; | | | |
|
165 |
; | | | |
|
166 |
; | \|/ | |
|
167 |
; | \+-------------------------------------+ | |
|
168 |
; +-----| Z() -- |------+ |
|
169 |
; /+-------------------------------------+ |
|
170 |
; |
|
171 |
; +--------------------------------+
|
172 |
; |
|
173 |
; \|/
|
174 |
; Done
|
175 |
;
|
176 |
;Stack Frame Setup
|
177 |
; Keeping track of the stack frame is a lot of work before everything is
|
178 |
; refined, so will just use the instructions and pretend the stack frame
|
179 |
; has been enlarged via AIS. (Note that I've swapped DH and DL below--they
|
180 |
; seemed to be out of order from before.)
|
181 |
;
|
182 |
; Imaginary Stack Setup
|
183 |
; 1,SP NH
|
184 |
; 2,SP NL
|
185 |
; 3,SP DH
|
186 |
; 4,SP DL
|
187 |
; 5,SP Z(0)
|
188 |
; 6,SP Z() * D, High Byte
|
189 |
; 7,SP Z() * D, Low Byte
|
190 |
; 8,SP N - Z() * D, High Byte
|
191 |
; 9,SP N - Z() * D, Low Byte
|
192 |
;
|
193 |
;
|
194 |
sta 5,sp ;Save Z(0).
|
195 |
;
|
196 |
nextz:
|
197 |
;-------------------
|
198 |
;Calculate Z() * D
|
199 |
;-------------------
|
200 |
ldx 4,sp
|
201 |
mul ;Lower product in X:A.
|
202 |
sta 7,sp
|
203 |
stx 8,sp
|
204 |
lda 5,sp
|
205 |
ldx 3,sp
|
206 |
mul ;Upper product in X:A.
|
207 |
tstx
|
208 |
bne p2big ;If the MSB of the upper product (byte 3) isn't zero,
|
209 |
;this automatically implies that Z() is too big. It may
|
210 |
;be possible to shave out this test ... requires more
|
211 |
;analysis.
|
212 |
add 6,sp
|
213 |
bcs p2big ;If there is a carry out of adding the MSB in, this is
|
214 |
;also an automatic cue that Z() is too big. It may
|
215 |
;be possible to shave out this test ... requires more
|
216 |
;analysis.
|
217 |
sta 6,sp ;Put result back. (6,sp):(7,sp) now contains product,
|
218 |
;and it definitely fits in 16 bits.
|
219 |
;
|
220 |
;-----------------------
|
221 |
;Calculate N - Z() * D
|
222 |
;-----------------------
|
223 |
lda 2,sp
|
224 |
sub 7,sp
|
225 |
sta 9,sp
|
226 |
lda 1,sp
|
227 |
sub 6,sp
|
228 |
sta 8,sp
|
229 |
bcs p2big ;If N-Z()*D is negative, Z() is too big.
|
230 |
|
231 |
;-------------------------------
|
232 |
;Compare N - Z() * D against D
|
233 |
;-------------------------------
|
234 |
;Branches below are imaginary -- too lazy to look up the
|
235 |
;instructions right now.
|
236 |
;-------------------------------
|
237 |
lda 3,sp
|
238 |
cmp 8,sp
|
239 |
bcc zwrong ;Need to look up right branches.
|
240 |
lda 4,sp
|
241 |
cmp 9,sp
|
242 |
bcc zwrong ;Need to look up right branches.
|
243 |
bra zright
|
244 |
|
245 |
;-------------------------------
|
246 |
;Z() --
|
247 |
;-------------------------------
|
248 |
p2big:
|
249 |
zwrong:
|
250 |
dec 5,sp
|
251 |
lda 5,sp
|
252 |
bra nextz
|
253 |
|
254 |
;At this point, the remainder is definitely is (8,sp):(9,sp), and
|
255 |
;the quotient is in 5,sp. Set up and return.
|
256 |
zright:
|
257 |
lda 8,sp
|
258 |
psha
|
259 |
pulh ;MSB of remainder in H.
|
260 |
ldx 9,sp ;LSB of remainder in X.
|
261 |
|
262 |
clr c_reg ;MSB of quotient zero.
|
263 |
lda 5,sp ;LSB of quotient in A.
|
264 |
|
265 |
ais #8 ;Adjust stack. Imaginary offset.
|
266 |
|
267 |
rts ;And return
|
268 |
;
|
269 |
end
|
270 |
;------------------------------------------------------------------------------
|
271 |
;$Log: div16e16x16.s,v $
|
272 |
;Revision 1.4 2007/05/26 17:48:46 dashley
|
273 |
;Preliminary alternate solution in place.
|
274 |
;
|
275 |
;Revision 1.3 2007/05/26 16:13:10 dashley
|
276 |
;Safety checkin after substantial edits.
|
277 |
;
|
278 |
;Revision 1.2 2007/05/25 23:34:31 dashley
|
279 |
;Proposed Knuth solution integrated into version-control system.
|