1 |
dashley |
10 |
;$Header: /home/dashley/cvsrep/e3ft_gpl01/e3ft_gpl01/dtaipubs/cron/2007/cpu08div16by16a/div16e16x16.s,v 1.4 2007/05/26 17:48:46 dashley Exp $:
|
2 |
|
|
;------------------------------------------------------------------------------
|
3 |
|
|
xdef c_udiv, c_umod
|
4 |
|
|
xref.b c_reg
|
5 |
|
|
.dcall "9,0,c_umod"
|
6 |
|
|
.dcall "7,0,c_udiv"
|
7 |
|
|
;
|
8 |
|
|
;Function Entry Conditions:
|
9 |
|
|
; NH in C_REG (a zero-page, i.e. direct addressing variable)
|
10 |
|
|
; NL in A
|
11 |
|
|
; DH in H
|
12 |
|
|
; DL in X
|
13 |
|
|
;
|
14 |
|
|
;Function Exit Conditions
|
15 |
|
|
; QH in C_REG
|
16 |
|
|
; QL in A
|
17 |
|
|
; RH in H
|
18 |
|
|
; RL in X
|
19 |
|
|
; Stack pointer (naturally) restored.
|
20 |
|
|
; No restrictions on condition codes (can be arbitrarily changed).
|
21 |
|
|
; No restrictions on other CPU registers (can be arbitrarily changed).
|
22 |
|
|
;
|
23 |
|
|
c_udiv:
|
24 |
|
|
psha ; save NL
|
25 |
|
|
pshh ; DH on the stack
|
26 |
|
|
tst 1,sp ; test zero
|
27 |
|
|
bne full ; full division
|
28 |
|
|
;
|
29 |
|
|
;Short divisor division.
|
30 |
|
|
;Divisor <= 255 (and may also be zero).
|
31 |
|
|
;
|
32 |
|
|
;Note to MJB: your staff did a better job on this than
|
33 |
|
|
;me. I had 48 clocks, this path seems to be 45.
|
34 |
|
|
;
|
35 |
|
|
pulh ; clean stack
|
36 |
|
|
lda c_reg ; NH in A
|
37 |
|
|
clrh ; 1st divide 8 X 8
|
38 |
|
|
div ; divide (H:A)/(X) -> A with remainder H
|
39 |
|
|
sta c_reg ; QH in place
|
40 |
|
|
pula ; complete dividend with NL
|
41 |
|
|
div ; divide
|
42 |
|
|
pshh ; move RL
|
43 |
|
|
pulx ; in X
|
44 |
|
|
clrh ; RH is zero
|
45 |
|
|
rts ; and return
|
46 |
|
|
;
|
47 |
|
|
;Long Divisor Division
|
48 |
|
|
;
|
49 |
|
|
;Mathematical Assertions
|
50 |
|
|
; Divisor >= 256 (and is definitely not zero).
|
51 |
|
|
; Numerator can be any value.
|
52 |
|
|
; Quotient <= 255.
|
53 |
|
|
; Remainder may be two bytes.
|
54 |
|
|
;
|
55 |
|
|
;Entry Conditions
|
56 |
|
|
; Stack frame already grown by 2 (and could use 1,SP and 2,SP
|
57 |
|
|
; for intermediate results).
|
58 |
|
|
; NH in C_REG (a zero-page, i.e. direct addressing variable)
|
59 |
|
|
; NL in A and 2,SP
|
60 |
|
|
; DH in H and 1,SP
|
61 |
|
|
; DL in X
|
62 |
|
|
;
|
63 |
|
|
full:
|
64 |
|
|
pshx ; save DL
|
65 |
|
|
psha ; start with
|
66 |
|
|
lda c_reg ; dividend
|
67 |
|
|
psha
|
68 |
|
|
;
|
69 |
|
|
;Stack condition
|
70 |
|
|
; 1,SP NH
|
71 |
|
|
; 2,SP NL
|
72 |
|
|
; 3,SP DL
|
73 |
|
|
; 4,SP DH
|
74 |
|
|
; 5,SP NL
|
75 |
|
|
;Due to redundancy, can probably re-use 5,SP for
|
76 |
|
|
;something else.
|
77 |
|
|
;
|
78 |
|
|
lda 4,sp ; DH
|
79 |
|
|
;
|
80 |
|
|
;Shift the numerator and denominator right until the denominator
|
81 |
|
|
;just fits in one byte. At least one shift will be required,
|
82 |
|
|
;because the denominator is known to be >= 256, hence the
|
83 |
|
|
;do-while() rather than while() loop.
|
84 |
|
|
;
|
85 |
|
|
sbcl:
|
86 |
|
|
lsr 1,sp
|
87 |
|
|
ror 2,sp
|
88 |
|
|
lsra
|
89 |
|
|
rorx
|
90 |
|
|
tsta
|
91 |
|
|
bne sbcl
|
92 |
|
|
;
|
93 |
|
|
;DTA note: the loop above looks excellent. No real fat. The only
|
94 |
|
|
;optimizations that seems possible are (a) to use c_reg to shift one byte
|
95 |
|
|
;of the numerator in so that a 4-cycle rather than a 5-cycle instruction
|
96 |
|
|
;is used, or (b) to use X rather than SP so that have 2 4-cycle rather than
|
97 |
|
|
;2 5-cycle instructions.
|
98 |
|
|
;
|
99 |
|
|
;Do the initial digit estimation division. Knuth calls out the possibility of
|
100 |
|
|
;an overflow, and that an overflow should be replaced by a quotient of 255.
|
101 |
|
|
;Not sure if an overflow is possible. 0 <= N <= 65535, and 256 <= D <= 65535.
|
102 |
|
|
;Assuming max N and min D after one shift gives 32767/128 = 255, still safe.
|
103 |
|
|
;Need to be more rigorous about the math to be sure that overflow not possible
|
104 |
|
|
;on the division. Note also that Knuth's algorithm calls for left-shifting to
|
105 |
|
|
;normalize and for the creation of an extra digit (in our case, extra byte).
|
106 |
|
|
;Because we have bounds on the divisor going in, right-shifting gives the same
|
107 |
|
|
;effect for us. There may be something about the setup that changes Knuth's
|
108 |
|
|
;assumptions that overflow is possible. Will need to revisit this and prove
|
109 |
|
|
;it rigorously.
|
110 |
|
|
;
|
111 |
|
|
txa
|
112 |
|
|
pulh
|
113 |
|
|
pulx
|
114 |
|
|
div
|
115 |
|
|
;
|
116 |
|
|
;Let:
|
117 |
|
|
; N be the numerator in to the function.
|
118 |
|
|
; D be the denominator in to the function.
|
119 |
|
|
; Q be the actual quotient.
|
120 |
|
|
; R be the actual remainder.
|
121 |
|
|
; Z(0) be the first trial quotient we calculated.
|
122 |
|
|
; Z(-1) be the trial quotient reduced by one.
|
123 |
|
|
; Z(-2) be the trial quotient reduced by two.
|
124 |
|
|
; Z() be the trial quotient we are working with at some
|
125 |
|
|
; point in time.
|
126 |
|
|
;
|
127 |
|
|
;The original MJB approach was to reconstruct Z * D, then
|
128 |
|
|
;subtract D up to twice to get Z(-1) * D and Z(-2) * D. That seemed to
|
129 |
|
|
;bloat the code. Let's try instead (to save FLASH) to loop using
|
130 |
|
|
;multiplication until N - (Z() * D) < D, or equivalently
|
131 |
|
|
;that Z() * D > N - D. We've gotta use the first expression, however,
|
132 |
|
|
;because the second would require either an exception case or 2's
|
133 |
|
|
;complement arithmetic and a third byte if N < D.
|
134 |
|
|
;
|
135 |
|
|
;
|
136 |
|
|
; +-------------------------------------+
|
137 |
|
|
; | Entry Condition: have Z(0) |
|
138 |
|
|
; | calculated. |
|
139 |
|
|
; +-------------------------------------+
|
140 |
|
|
; |
|
141 |
|
|
; |
|
142 |
|
|
; \|/
|
143 |
|
|
; +-------------------------------------+/
|
144 |
|
|
; | Calculate Z() * D |------+
|
145 |
|
|
; +-------------------------------------+\ |
|
146 |
|
|
; | |
|
147 |
|
|
; | |
|
148 |
|
|
; \|/ |
|
149 |
|
|
; Y +-------------------------------------+ |
|
150 |
|
|
; +-----| Z() * D > 65535 ? | |
|
151 |
|
|
; | +-------------------------------------+ |
|
152 |
|
|
; | | N |
|
153 |
|
|
; | | |
|
154 |
|
|
; | \|/ |
|
155 |
|
|
; | +-------------------------------------+ |
|
156 |
|
|
; | | Calculate N - Z() * D | |
|
157 |
|
|
; | +-------------------------------------+ |
|
158 |
|
|
; | | |
|
159 |
|
|
; | | |
|
160 |
|
|
; | \|/ |
|
161 |
|
|
; | +-------------------------------------+ |
|
162 |
|
|
; | | N - Z() * D < D ? |------------+
|
163 |
|
|
; | +-------------------------------------+ | |
|
164 |
|
|
; | | | |
|
165 |
|
|
; | | | |
|
166 |
|
|
; | \|/ | |
|
167 |
|
|
; | \+-------------------------------------+ | |
|
168 |
|
|
; +-----| Z() -- |------+ |
|
169 |
|
|
; /+-------------------------------------+ |
|
170 |
|
|
; |
|
171 |
|
|
; +--------------------------------+
|
172 |
|
|
; |
|
173 |
|
|
; \|/
|
174 |
|
|
; Done
|
175 |
|
|
;
|
176 |
|
|
;Stack Frame Setup
|
177 |
|
|
; Keeping track of the stack frame is a lot of work before everything is
|
178 |
|
|
; refined, so will just use the instructions and pretend the stack frame
|
179 |
|
|
; has been enlarged via AIS. (Note that I've swapped DH and DL below--they
|
180 |
|
|
; seemed to be out of order from before.)
|
181 |
|
|
;
|
182 |
|
|
; Imaginary Stack Setup
|
183 |
|
|
; 1,SP NH
|
184 |
|
|
; 2,SP NL
|
185 |
|
|
; 3,SP DH
|
186 |
|
|
; 4,SP DL
|
187 |
|
|
; 5,SP Z(0)
|
188 |
|
|
; 6,SP Z() * D, High Byte
|
189 |
|
|
; 7,SP Z() * D, Low Byte
|
190 |
|
|
; 8,SP N - Z() * D, High Byte
|
191 |
|
|
; 9,SP N - Z() * D, Low Byte
|
192 |
|
|
;
|
193 |
|
|
;
|
194 |
|
|
sta 5,sp ;Save Z(0).
|
195 |
|
|
;
|
196 |
|
|
nextz:
|
197 |
|
|
;-------------------
|
198 |
|
|
;Calculate Z() * D
|
199 |
|
|
;-------------------
|
200 |
|
|
ldx 4,sp
|
201 |
|
|
mul ;Lower product in X:A.
|
202 |
|
|
sta 7,sp
|
203 |
|
|
stx 8,sp
|
204 |
|
|
lda 5,sp
|
205 |
|
|
ldx 3,sp
|
206 |
|
|
mul ;Upper product in X:A.
|
207 |
|
|
tstx
|
208 |
|
|
bne p2big ;If the MSB of the upper product (byte 3) isn't zero,
|
209 |
|
|
;this automatically implies that Z() is too big. It may
|
210 |
|
|
;be possible to shave out this test ... requires more
|
211 |
|
|
;analysis.
|
212 |
|
|
add 6,sp
|
213 |
|
|
bcs p2big ;If there is a carry out of adding the MSB in, this is
|
214 |
|
|
;also an automatic cue that Z() is too big. It may
|
215 |
|
|
;be possible to shave out this test ... requires more
|
216 |
|
|
;analysis.
|
217 |
|
|
sta 6,sp ;Put result back. (6,sp):(7,sp) now contains product,
|
218 |
|
|
;and it definitely fits in 16 bits.
|
219 |
|
|
;
|
220 |
|
|
;-----------------------
|
221 |
|
|
;Calculate N - Z() * D
|
222 |
|
|
;-----------------------
|
223 |
|
|
lda 2,sp
|
224 |
|
|
sub 7,sp
|
225 |
|
|
sta 9,sp
|
226 |
|
|
lda 1,sp
|
227 |
|
|
sub 6,sp
|
228 |
|
|
sta 8,sp
|
229 |
|
|
bcs p2big ;If N-Z()*D is negative, Z() is too big.
|
230 |
|
|
|
231 |
|
|
;-------------------------------
|
232 |
|
|
;Compare N - Z() * D against D
|
233 |
|
|
;-------------------------------
|
234 |
|
|
;Branches below are imaginary -- too lazy to look up the
|
235 |
|
|
;instructions right now.
|
236 |
|
|
;-------------------------------
|
237 |
|
|
lda 3,sp
|
238 |
|
|
cmp 8,sp
|
239 |
|
|
bcc zwrong ;Need to look up right branches.
|
240 |
|
|
lda 4,sp
|
241 |
|
|
cmp 9,sp
|
242 |
|
|
bcc zwrong ;Need to look up right branches.
|
243 |
|
|
bra zright
|
244 |
|
|
|
245 |
|
|
;-------------------------------
|
246 |
|
|
;Z() --
|
247 |
|
|
;-------------------------------
|
248 |
|
|
p2big:
|
249 |
|
|
zwrong:
|
250 |
|
|
dec 5,sp
|
251 |
|
|
lda 5,sp
|
252 |
|
|
bra nextz
|
253 |
|
|
|
254 |
|
|
;At this point, the remainder is definitely is (8,sp):(9,sp), and
|
255 |
|
|
;the quotient is in 5,sp. Set up and return.
|
256 |
|
|
zright:
|
257 |
|
|
lda 8,sp
|
258 |
|
|
psha
|
259 |
|
|
pulh ;MSB of remainder in H.
|
260 |
|
|
ldx 9,sp ;LSB of remainder in X.
|
261 |
|
|
|
262 |
|
|
clr c_reg ;MSB of quotient zero.
|
263 |
|
|
lda 5,sp ;LSB of quotient in A.
|
264 |
|
|
|
265 |
|
|
ais #8 ;Adjust stack. Imaginary offset.
|
266 |
|
|
|
267 |
|
|
rts ;And return
|
268 |
|
|
;
|
269 |
|
|
end
|
270 |
|
|
;------------------------------------------------------------------------------
|
271 |
|
|
;$Log: div16e16x16.s,v $
|
272 |
|
|
;Revision 1.4 2007/05/26 17:48:46 dashley
|
273 |
|
|
;Preliminary alternate solution in place.
|
274 |
|
|
;
|
275 |
|
|
;Revision 1.3 2007/05/26 16:13:10 dashley
|
276 |
|
|
;Safety checkin after substantial edits.
|
277 |
|
|
;
|
278 |
|
|
;Revision 1.2 2007/05/25 23:34:31 dashley
|
279 |
|
|
;Proposed Knuth solution integrated into version-control system.
|