; Extracted from a gitweb blob view (page residue: "convert line ends").
; [canaan.git] / prj / tech / libsrc / r3d / projfast.asm
; blob d4635d84263ad28c34eb2ef6a619c59d71a4ec03
; $Source: x:/prj/tech/libsrc/r3d/RCS/projfast.asm $
; $Revision: 1.8 $
; $Author: dc $
; $Date: 1997/01/05 22:51:04 $
;
; Project-space transformation/clip coding
8 .486
9 include type.inc
10 include cseg.inc
11 include thunks.inc
12 include r3spoint.inc
13 include ctxts.inc
15 assume ds:_DATA
17 _DATA segment para public USE32 'DATA'
19 align 8
21 extern _r3d_x_off:dword
22 extern _r3d_y_off:dword
23 extern _r3d_x_off_24_8:dword
24 extern _r3d_y_off_24_8:dword
25 extern _r3d_c_w:dword
26 extern _r3d_c_h:dword
27 extern _r3d_c_w_24_8:dword
28 extern _r3d_c_h_24_8:dword
29 extern _r3d_x_clip:dword
30 extern _r3d_y_clip:dword
31 extern _r3d_near:dword
32 extern _r3d_fast_z:dword
33 extern _r3d_ccodes_or:dword
34 extern _r3d_glob:dword
36 x_off dq 0
37 y_off dq 0
38 tempbuf1 dq 0
39 tempbuf2 dq 0
41 two_to_52_power dd 059900000h
42 ; we want to multiply by 256
43 ; this means adding 8 to the exponent
44 ; the number is:
45 ; 0101 1001 1001 0000 0000 0000 0000 0000
46 ; sEEE EEEE EMMM MMMM MMMM MMMM MMMM MMMM
47 ; exponent = 10110011
48 ; = 128 + 48 + 2 + 1
50 two_to_60_power dd 05d900000h
52 _DATA ends
54 _TEXT segment
56 _EXTFUNC mx_trans_mul_vec
; ---------------------------------------------------------------------
; r3_transform_block_proj_noclip
;
; Do projectspace point transformation/projection for a block of
; points, without generating clip codes.
;
; C code snippet:
;     for (i=0; i < n; ++i) {
;        double w;
;        r3_trans_mul_vec(&dst->p, X2TRANS(&cx.o2c), src);
;        w = 1.0/dst->p.z;           // w is 1/z;
;        dst->grp.sx = (int)(dst->p.x * w) + r3d_glob.x_off;
;        dst->grp.sy = (int)(dst->p.y * w) + r3d_glob.y_off;
;        dst->grp.w  = w;
;        ++dst;
;        ++src;
;     }
;
; In:     eax = count, edx = dest, ebx = src, ecx = X2TRANS...
; Out:    nothing in registers; each dest point gets W, SX and SY
;         written (its X/Y/Z are read after mx_trans_mul_vec runs).
; Saved:  esi, edi (pushed/popped here).
; Alters: eax, ecx, edx, ebx, flags, x87 stack (left balanced), and the
;         module statics x_off, y_off, tempbuf1, tempbuf2 -- so the
;         routine is not reentrant.
;
; Source points are stepped by 12 bytes; destination points are stepped
; by _r3d_glob[R3S_GLOBAL_CONTEXT_CUR_STRIDE].
;
; A count <= 0 returns immediately, matching the C loop above.  (The
; loop below is bottom-tested, so without the guard a zero count would
; wrap ecx and run ~2^32 iterations.)
; ---------------------------------------------------------------------
_FUNCDEF r3_transform_block_proj_noclip, 4

        push    esi
        push    edi

        ; swap around registers preparing for inner loop:
        ;   eax = dest, edx = transform, ecx = count, ebx = src
        push    eax
        mov     eax, edx
        mov     edx, ecx
        pop     ecx

        ; empty block: nothing to transform
        test    ecx, ecx
        jle     r3_tbpn_done

        ; We use the add-a-big-number-instead-of-fist hack: adding a
        ; large bias (two_to_52_power, 1.125 * 2^52) and storing as a
        ; double leaves the integer result in the low dword of the
        ; stored value.  The bias is folded into the screen offsets so
        ; projection needs only one add per coordinate.
        ; NOTE(review): this presumes the FPU precision control keeps
        ; at least a 53-bit significand -- confirm against the caller.
        fld     two_to_52_power
        fild    _r3d_x_off
        fadd                            ; st(0) = bias + x offset
        fstp    x_off
        fld     two_to_52_power
        fild    _r3d_y_off
        fadd                            ; st(0) = bias + y offset
        fstp    y_off

r3_tbpn_top:
        ; We rely on the fact that the following function doesn't
        ; change any registers; this is only true of the asm
        ; implementation.
        _CALLFUNC mx_trans_mul_vec, 3

        ; w = 1.0 / dst->p.z;
        fld1
        fld     dword ptr [eax+R3S_POINT_Z]
        fdiv                            ; st(0) = 1/z

        ; During the divide, touch the rest of this destination point
        ; so it is in cache when we store to it.  The loaded value is
        ; discarded.
        ; The original also read [ebx+12] (the next source vector) and
        ; [eax+40] here.  Both are gone: on the last iteration the
        ; source read lands past the end of the source array, and the
        ; [eax+40] read was already removed for faulting under the MSVC
        ; memory layout.
        mov     esi, [eax+20]

        ; dst->grp.w = w;
        fst     dword ptr [eax+R3S_POINT_W]

        fld     st(0)                   ; st(0) = w, st(1) = w
        fmul    dword ptr [eax+R3S_POINT_X]     ; st(0) = x*w
        fxch    st(1)
        fmul    dword ptr [eax+R3S_POINT_Y]     ; st(0) = y*w, st(1) = x*w
        fxch    st(1)

        ; add bias + r3d_x_off
        fadd    x_off                   ; st(0) = biased sx
        fxch    st(1)

        ; ditto for y
        fadd    y_off                   ; st(0) = biased sy, st(1) = biased sx

        ; advance to the next source vector while the adds complete
        add     ebx, 12

        fxch    st(1)
        fstp    tempbuf1                ; biased sx as a double
        fstp    tempbuf2                ; biased sy as a double

        ; the low dword of each stored double is the integer result
        mov     esi, dword ptr tempbuf1
        mov     edi, dword ptr tempbuf2

        mov     [eax+R3S_POINT_SX], esi
        mov     [eax+R3S_POINT_SY], edi

        add     eax, _r3d_glob[R3S_GLOBAL_CONTEXT_CUR_STRIDE]
        dec     ecx                     ; decrement loop count
        jnz     r3_tbpn_top

r3_tbpn_done:
        pop     edi
        pop     esi
        ret
; ---------------------------------------------------------------------
; r3_transform_block_proj_clip
;
; Do projectspace point transformation/projection & clip coding for a
; block of points.
;
; In:     eax = count, edx = dest, ebx = src, ecx = X2TRANS...
; Out:    nothing in registers; each dest point gets W, SX, SY and
;         CCODES written, and every point's clip code is OR'ed into
;         _r3d_ccodes_or.
; Saved:  ebp, esi, edi (pushed/popped here).
; Alters: eax, ecx, edx, ebx, flags, x87 stack (left balanced), and the
;         module statics x_off, y_off, tempbuf1, tempbuf2 -- so the
;         routine is not reentrant.
;
; Per point there are two clip-coding paths:
;   * z >  _r3d_fast_z : branch-free 2d clip coding on the projected
;                        integer coordinates;
;   * z <= _r3d_fast_z : "slow" clip coding with FP compares of x/y
;                        against +-z*x_clip / +-z*y_clip, plus
;                        R3C_BEHIND when z <= _r3d_near.
;
; Source points are stepped by 12 bytes; destination points are stepped
; by _r3d_glob[R3S_GLOBAL_CONTEXT_CUR_STRIDE].
;
; A count <= 0 returns immediately.  (The loop is bottom-tested, so
; without the guard a zero count would wrap ecx and run ~2^32
; iterations.)
; ---------------------------------------------------------------------
_FUNCDEF r3_transform_block_proj_clip, 4

        push    ebp
        push    esi
        push    edi

        ; swap around registers preparing for inner loop:
        ;   eax = dest, edx = transform, ecx = count, ebx = src
        push    eax
        mov     eax, edx
        mov     edx, ecx
        pop     ecx

        ; empty block: nothing to transform
        test    ecx, ecx
        jle     r3_tbpc_done

        ; Fold the float->int rounding bias into the screen offsets.
        ; This routine uses two_to_60_power, so the integers read back
        ; from tempbuf1/2 are 256 times coarser than in the noclip
        ; routine; the code below treats them as 24.8 fixed point.
        ; NOTE(review): this presumes the FPU precision control keeps
        ; at least a 53-bit significand -- confirm against the caller.
        fld     two_to_60_power
        fild    _r3d_x_off
        fadd                            ; st(0) = bias + x offset
        fstp    x_off
        fld     two_to_60_power
        fild    _r3d_y_off
        fadd                            ; st(0) = bias + y offset
        fstp    y_off

r3_tbpc_top:
        ; We rely on the fact that the following function doesn't
        ; change any registers; this is only true of the asm
        ; implementation.
        _CALLFUNC mx_trans_mul_vec, 3

        ; w = 1.0 / dst->p.z;
        fld1
        fld     dword ptr [eax+R3S_POINT_Z]
        fdiv                            ; st(0) = 1/z

        ; During the divide, touch the rest of this destination point
        ; so it is in cache when we store to it.  The loaded value is
        ; discarded.
        ; The original also read [ebx+12] (the next source vector) and
        ; [eax+40] here.  Both are gone: on the last iteration the
        ; source read lands past the end of the source array, and the
        ; [eax+40] read was already removed for faulting under the MSVC
        ; memory layout.
        mov     esi, [eax+20]

        ; dst->grp.w = w;
        fst     dword ptr [eax+R3S_POINT_W]

        fld     st(0)                   ; st(0) = w, st(1) = w

        ; start computing sx,sy
        fmul    dword ptr [eax+R3S_POINT_X]     ; st(0) = x*w
        fxch    st(1)
        fmul    dword ptr [eax+R3S_POINT_Y]     ; st(0) = y*w, st(1) = x*w
        fxch    st(1)
        fadd    x_off                   ; st(0) = biased sx
        fxch    st(1)
        fadd    y_off                   ; st(0) = biased sy, st(1) = biased sx

        ; Load z into an integer register for a fast compare against
        ; "fast_z", a positive FP number representing whether it's safe
        ; to use fast (2d) clip coding.
        mov     esi, [eax+R3S_POINT_Z]
        mov     edi, _r3d_fast_z

        ; we can integer compare the raw float bits because one of them
        ; is definitely positive
        cmp     esi, edi
        jle     clip_slow

        ; ---------------- fast (2d) clip coding ----------------------
        push    ecx                     ; ecx is needed as scratch below

        fstp    tempbuf2                ; biased sy
        fstp    tempbuf1                ; biased sx

        mov     esi, dword ptr tempbuf1 ; ESI = sx (24.8)
        mov     edi, dword ptr tempbuf2 ; EDI = sy (24.8)

        ; The following code uses some pretty gory algorithms to
        ; generate clip codes without branching.
        ;
        ; There are two central concepts.  First, if a value is
        ; negative, then the highest bit is set.  Thus, if we just
        ; shift it right, we have a clip code for it being negative.
        ;
        ; Second, if we compare against a max value, we generate a
        ; bunch of flags.  The easiest one to set a bit from is the
        ; carry flag, via sbb reg,reg.  However, this sets a bit if we
        ; borrowed in an _unsigned_ subtraction.  Thus it is set if
        ; (unsigned) x > max_x, which means it's set if
        ; (signed) x > max_x || x < 0.
        ;
        ; So we use the latter approach to set our flags, knowing that
        ; it will incorrectly set an "off_right" code when it's
        ; actually "off_left".  We fix this up by making "off_left"
        ; also set "off_right", and xoring.
        ;
        ; NOTE(review): the shift counts and the literal 12 below
        ; presume R3C_OFF_LEFT|R3C_OFF_RIGHT == 3 and
        ; R3C_OFF_TOP|R3C_OFF_BOTTOM == 12, and that bits 31 and 30 of
        ; each coordinate agree (i.e. coordinates stay well inside the
        ; 32-bit range, presumably guaranteed by the fast_z test) --
        ; confirm against the R3C_* definitions.

        shl     esi, 8                  ; go from 24.8 to 16.16
        mov     ebp, _r3d_c_w_24_8      ; canvas width in 24.8

        shl     edi, 8
        mov     [eax+R3S_POINT_SX], esi

        mov     [eax+R3S_POINT_SY], edi
        mov     esi, dword ptr tempbuf1 ; reload 24.8 sx for clip test

        mov     edi, dword ptr tempbuf2 ; reload 24.8 sy for clip test
        add     esi, 128                ; offset location by half a pixel

        ; (the half-pixel offset affects only the clip test; the
        ; stored SX/SY above are not offset)
        add     edi, 128                ; offset location by half a pixel
        cmp     ebp, esi

        sbb     ebp, ebp                ; ebp = -1 if (esi > ebp or esi is negative)
        mov     ecx, _r3d_c_h_24_8      ; canvas height in 24.8

        shr     esi, 30                 ; if sign bit was set in esi, now esi = 3
        and     ebp, R3C_OFF_RIGHT

        ; at this point, esi = (R3C_OFF_LEFT | R3C_OFF_RIGHT) if it's
        ; off left, and ebp = R3C_OFF_RIGHT if it's off left or off
        ; right
        xor     esi, ebp
        cmp     ecx, edi

        sbb     ecx, ecx                ; ecx = -1 if (edi > ecx or edi is negative)

        shr     edi, 28                 ; bring bits 31:30 of sy down to bits 3:2
        and     ecx, R3C_OFF_BOTTOM

        and     edi, 12
        ; at this point, edi = (R3C_OFF_TOP | R3C_OFF_BOTTOM) if it's
        ; off top, and ecx = R3C_OFF_BOTTOM if it's off top or off
        ; bottom
        xor     esi, ecx

        xor     esi, edi
        pop     ecx                     ; restore loop count

        ; now write it out, and update r3d_ccodes_or
        mov     [eax+R3S_POINT_CCODES], esi
        mov     edi, _r3d_ccodes_or

        or      edi, esi

        mov     _r3d_ccodes_or, edi
        jmp     point_done

        ; straight-ahead version of above:
        ;     xor   ecx,ecx
        ;     mov   ebp,r3_c_w
        ;     cmp   esi,0
        ;     jge   not_left
        ;     mov   ecx,R3C_OFF_LEFT
        ;   not_left:
        ;     cmp   esi,ebp
        ;     jle   not_right
        ;     or    ecx,R3C_OFF_RIGHT
        ;   not_right:
        ;     mov   ebp,r3_c_h
        ;     cmp   edi,0
        ;     jge   not_top
        ;     or    ecx,R3C_OFF_TOP
        ;   not_top:
        ;     cmp   edi,ebp
        ;     jle   not_bottom
        ;     or    ecx,R3C_OFF_BOTTOM
        ;   not_bottom:
        ;     mov   [eax+R3S_POINT_CCODES],ecx
        ;     pop   ecx
        ;
        ; this will often branch mispredict, and since it has branches
        ; to branches can do really horrible and wacky things.
        ; the no-branch version is always 9 cycles; the above code is
        ; at best 7 cycles assuming all branches taken and perfect
        ; prediction

        ; ---------------- slow clip coding ---------------------------
        ;   } else {
        ;      // slow clipping (need two multiplies)
        ;      mxs_real iz = z * r3d_glob.x_clip;
        ;      mxs_real jz = z * r3d_glob.y_clip;
        ;      if (p->p.x < -iz) code = R3C_OFF_LEFT | R3C_BEHIND; else code = R3C_BEHIND;
        ;      if (p->p.x >  iz) code |= R3C_OFF_RIGHT;
        ;      if (p->p.y < -jz) code |= R3C_OFF_TOP;
        ;      if (p->p.y >  jz) code |= R3C_OFF_BOTTOM;
        ;
        ; Unlike the snippet, R3C_BEHIND is only set here when
        ; z <= _r3d_near.

clip_slow:
        ; esi still holds the raw bits of z; edi = initial clip code
        cmp     esi, _r3d_near
        mov     edi, R3C_BEHIND         ; mov leaves the compare flags intact
        jle     post_init_clipcode      ; z <= near: keep R3C_BEHIND
        xor     edi, edi                ; z >  near: start with no codes

post_init_clipcode:
        mov     esi, eax                ; esi = dest; eax is needed for fnstsw ax

        fstp    tempbuf2                ; biased sy
        fstp    tempbuf1                ; biased sx

        mov     ebp, dword ptr tempbuf1
        mov     eax, dword ptr tempbuf2

        shl     ebp, 8                  ; 24.8 -> 16.16
        shl     eax, 8
        mov     [esi+R3S_POINT_SX], ebp
        mov     [esi+R3S_POINT_SY], eax

        ; compute "iz" & "jz"
        fld     dword ptr [esi+R3S_POINT_Z]
        fmul    _r3d_x_clip             ; st(0) = iz
        fld     dword ptr [esi+R3S_POINT_Z]
        fmul    _r3d_y_clip             ; st(0) = jz, st(1) = iz
        fxch    st(1)                   ; st(0) = iz, st(1) = jz

        ; start comparing
        fcom    dword ptr [esi+R3S_POINT_X]     ; compare iz ?? x
        fnstsw  ax
        shr     ah, 1                   ; carry set if iz < x
        sbb     ebp, ebp
        and     ebp, R3C_OFF_RIGHT
        or      edi, ebp
        fldz
        fsubr                           ; st(0) = -iz, st(1) = jz
        fxch    st(1)                   ; st(0) = jz,  st(1) = -iz
        fcom    dword ptr [esi+R3S_POINT_Y]     ; compare jz ?? y
        fnstsw  ax
        shr     ah, 1                   ; carry set if jz < y
        sbb     ebp, ebp
        and     ebp, R3C_OFF_BOTTOM
        or      edi, ebp
        fldz
        fsubr                           ; st(0) = -jz, st(1) = -iz
        fxch    st(1)                   ; st(0) = -iz, st(1) = -jz

        ; now we have -iz and -jz on the stack
        fcomp   dword ptr [esi+R3S_POINT_X]     ; compare -iz ?? x, pop
        fnstsw  ax
        and     ah, 65                  ; now if ah=0, -iz > x
        cmp     ah, 1                   ; sets carry if ah = 0
        sbb     ebp, ebp
        and     ebp, R3C_OFF_LEFT
        or      edi, ebp
        fcomp   dword ptr [esi+R3S_POINT_Y]     ; compare -jz ?? y, pop
        fnstsw  ax
        and     ah, 65                  ; now if ah=0, -jz > y
        cmp     ah, 1                   ; sets carry if ah = 0
        sbb     ebp, ebp
        and     ebp, R3C_OFF_TOP
        or      edi, ebp

        ; write the code out, and update r3d_ccodes_or
        mov     ebp, _r3d_ccodes_or
        mov     [esi+R3S_POINT_CCODES], edi
        or      ebp, edi
        mov     _r3d_ccodes_or, ebp
        mov     eax, esi                ; eax = dest again

point_done:
        add     eax, _r3d_glob[R3S_GLOBAL_CONTEXT_CUR_STRIDE]
        add     ebx, 12                 ; next source vector

        dec     ecx                     ; decrement loop count
        jnz     r3_tbpc_top

r3_tbpc_done:
        pop     edi
        pop     esi
        pop     ebp
        ret

_TEXT   ends