1 ; $Source: x:/prj/tech/libsrc/r3d/RCS/projfast.asm $
4 ; $Date: 1997/01/05 22:51:04 $
6 ; Project-space transformation/clip coding
; Data segment declarations: external dword symbols (defined outside the
; visible portion of this file) followed by two locally defined
; single-precision constants stored as raw bit patterns.
17 _DATA
segment para
public USE32
'DATA'
21 extern
_r3d_x_off:dword
22 extern
_r3d_y_off:dword
23 extern
_r3d_x_off_24_8:dword
24 extern
_r3d_y_off_24_8:dword
; canvas width / height; the clip-coding code comments these as 24.8 values
27 extern
_r3d_c_w_24_8:dword
28 extern
_r3d_c_h_24_8:dword
29 extern
_r3d_x_clip:dword
30 extern
_r3d_y_clip:dword
31 extern
_r3d_near:dword
32 extern
_r3d_fast_z:dword
; running value that the clip-coding routine reads and writes back
33 extern
_r3d_ccodes_or:dword
; global context; indexed with R3S_GLOBAL_CONTEXT_CUR_STRIDE to step points
34 extern
_r3d_glob:dword
; IEEE-754 single bit pattern 059900000h:
;   sign 0, exponent field 179 (unbiased 52), mantissa 0.125
;   => 1.125 * 2^52 (= 2^52 + 2^49)
; NOTE(review): the "add 2^52+2^51 + r3d_x_off" comment in the noclip
; routine does not match this value; confirm which constant that add uses.
41 two_to_52_power
dd 059900000h
42 ; we want to multiply by 256
43 ; this means adding 8 to the exponent
; (the bit pattern shown next is 059900000h, i.e. two_to_52_power)
45 ; 0101 1001 1001 0000 0000 0000 0000 0000
46 ; sEEE EEEE EMMM MMMM MMMM MMMM MMMM MMMM
; Same mantissa with the exponent field raised by 8 (187, unbiased 60)
;   => 1.125 * 2^60
50 two_to_60_power
dd 05d900000h
; external routine invoked through _CALLFUNC by both transform loops
56 _EXTFUNC mx_trans_mul_vec
59 ; r3_transform_block_proj_noclip
61 ; do projectspace point transformation/projection
; (no clip codes are written by the visible part of this routine)
; Reference pseudo-code for one iteration:
64 ; for (i=0; i < n; ++i) {
66 ; r3_trans_mul_vec(&dst->p, X2TRANS(&cx.o2c), src);
67 ; w = 1.0/dst->p.z; // w is 1/z;
68 ; dst->grp.sx = (int)(dst->p.x * w) + r3d_glob.x_off;
69 ; dst->grp.sy = (int)(dst->p.y * w) + r3d_glob.y_off;
; Entry registers:
74 ; takes eax = count, edx = dest, ebx = src, ecx = X2TRANS...
; Inside the loop body shown here the roles differ from the entry
; assignment: eax addresses the current destination point
; ([eax+R3S_POINT_*]) and ecx is the loop counter (dec ecx).
75 _FUNCDEF r3_transform_block_proj_noclip
, 4
80 ; swap around registers preparing for inner loop
86 ; we're going to use the add-a-big-number-instead-of-fist
87 ; hack. But we can fold them in to the adds we do as part
99 ; we rely on the fact that the following
100 ; function doesn't change any registers;
101 ; this is only true of the asm implementation
102 _CALLFUNC mx_trans_mul_vec
, 3
104 ; w = 1.0 / dst->p.z;
; push p.z onto the FPU stack (the reciprocal step itself is not visible here)
107 fld dword ptr [eax+R3S_POINT_Z
]
111 ; during the divide, preload
112 ; our next vector and the rest
117 ; removed by Doug since it faults MSVC memory layout
118 ; really, this should be Stride Aware and should be Stride-4, probably?
120 ; really, the ebx one above probably should go too
; store the top of the FPU stack into the point's W field; fst does not pop,
; so the value stays available for the multiplies
124 fst dword ptr [eax+R3S_POINT_W
]
; scale by p.x, then by p.y (any stack duplication/exchange between these
; two multiplies is not visible here)
128 fmul dword ptr [eax+R3S_POINT_X
]
133 fmul dword ptr [eax+R3S_POINT_Y
]
136 ; add 2^52+2^51 + r3d_x_off
142 ; no fxch because next instruction isn't FP
146 ; NB we set flags on the previous instruction,
147 ; don't stomp them before the jnz below!
155 ; load fixed point sx,sy
; tempbuf1/tempbuf2 are not declared in the visible portion of the file;
; presumably scratch dwords filled by FP stores not shown here - TODO confirm
156 mov esi,dword ptr tempbuf1
157 mov edi,dword ptr tempbuf2
; write the integer results into the point's SX / SY fields
159 mov [eax+R3S_POINT_SX
],esi
160 mov [eax+R3S_POINT_SY
],edi
; step eax to the next destination point by the current stride read from _r3d_glob
162 add eax, _r3d_glob
[R3S_GLOBAL_CONTEXT_CUR_STRIDE
]
163 dec ecx ; decrement loop count
173 ; r3_transform_block_proj_clip
175 ; do projectspace point transformation/projection & clip coding
; Two clip-coding strategies appear below: a branchless 2D path working on
; the projected fixed-point sx/sy, and a "slow" path comparing p.x / p.y
; against +-iz / +-jz on the FPU.  The test that selects between them is
; not visible in this listing.
177 ; takes eax = count, edx = dest, ebx = src, ecx = X2TRANS...
178 _FUNCDEF r3_transform_block_proj_clip
, 4
199 ; we rely on the fact that the following
200 ; function doesn't change any registers
201 ; this is only true of the asm implementation
202 _CALLFUNC mx_trans_mul_vec
, 3
204 ; w = 1.0 / dst->p.z;
; push p.z onto the FPU stack (the reciprocal step itself is not visible here)
207 fld dword ptr [eax+R3S_POINT_Z
]
211 ; during the divide, preload
212 ; our next vector and the rest
217 ; removed by Doug since it faults MSVC memory layout
218 ; really, this should be Stride Aware and should be Stride-4, probably?
220 ; really, the ebx one above probably should go too
; store the top of the FPU stack into the point's W field; fst does not pop
223 fst dword ptr [eax+R3S_POINT_W
]
227 ; start computing sx,sy
228 fmul dword ptr [eax+R3S_POINT_X
]
231 fmul dword ptr [eax+R3S_POINT_Y
]
239 ; load z into integer for fast compare
240 ; and load "fast_z", which is a positive FP
241 ; number representing whether it's safe to
242 ; use fast (2d) clip coding
; esi receives the raw 32-bit pattern of the float p.z
243 mov esi,[eax+R3S_POINT_Z
]
246 ; we can integer compare them because one of
247 ; them is definitely positive
; ---- fast (2D) path: eax = point pointer, esi/edi = sx/sy ----
; tempbuf1/tempbuf2 are not declared in the visible portion of the file;
; presumably scratch dwords filled by FP stores not shown here - TODO confirm
257 mov esi,dword ptr tempbuf1
258 mov edi,dword ptr tempbuf2
263 ; TODO: should we offset these by half a pixel
264 ; to fix the clip coding? I think so. Would only
265 ; take one cycle to do.
; NOTE(review): the "add esi,128" / "add edi,128" further down already apply
; a half-pixel offset (128 = 0.5 in 24.8) before the clip compares, so this
; TODO looks stale - confirm and remove.
267 ; The following code uses some pretty gory algorithms
268 ; to generate clip codes without branching.
270 ; There are two central concepts. First, if a value is
271 ; negative, then the highest bit is set. Thus, if we
272 ; just shift it right by 31, then we have a clip code for
275 ; Second, if we compare against a max value, we generate
276 ; a bunch of flags. The easiest one to set a bit from is
277 ; the carry flag, via sbb eax,eax. However, this sets a
278 ; bit if we borrowed in an _unsigned_ subtraction. Thus,
279 ; it sets a bit if (unsigned) x > max_x, which means it's
280 ; set if (signed) x > max_x || x < 0.
282 ; So we use the latter approach to set our flags, knowing
283 ; that it will incorrectly set an "off_right" code when it's
284 ; actually "off_left". We fix this up by making "off_left"
285 ; also set "off_right", and xoring.
287 shl esi,8 ; go from 24.8 to 16.16
288 mov ebp,_r3d_c_w_24_8
; canvas width in 24.8
; store the screen coordinates into the point
291 mov [eax+R3S_POINT_SX
],esi
293 mov [eax+R3S_POINT_SY
],edi
; reload the unshifted 24.8 values for the clip tests
294 mov esi,dword ptr tempbuf1
296 mov edi,dword ptr tempbuf2
297 add esi,128 ; offset location by half a pixel
299 add edi,128 ; offset location by half a pixel
; sbb r,r yields -1 when carry is set and 0 otherwise; the compare that
; produces the carry is not visible in this listing
302 sbb ebp,ebp ; ebp = -1 if (esi > ebp or esi is negative)
303 mov ecx,_r3d_c_h_24_8
305 shr esi,30 ; if sign bit was set in esi, now esi = 3
306 and ebp,R3C_OFF_RIGHT
308 ; at this point, esi = (R3C_OFF_LEFT | R3C_OFF_RIGHT) if it's off left
309 ; and ebp = R3C_OFF_RIGHT if it's off left or off right
314 sbb ecx,ecx ; ecx = -1 if (edi > ecx or edi is negative)
316 shr edi,28 ; edi & 4 if it was off top
317 and ecx,R3C_OFF_BOTTOM
320 ; at this point, edi = (R3C_OFF_TOP | R3C_OFF_BOTTOM) if it's off top
321 ; and ecx = R3C_OFF_BOTTOM if it's off top or off bottom
327 ; now write it out, and update r3d_ccodes_or
328 mov [eax+R3S_POINT_CCODES
],esi
; read-modify-write of the global _r3d_ccodes_or (the combining
; instruction between the load and the store is not visible here)
329 mov edi,_r3d_ccodes_or
333 mov _r3d_ccodes_or
,edi
336 ; straight-ahead version of above:
341 ; mov ecx,R3C_OFF_LEFT
345 ; or ecx,R3C_OFF_RIGHT
354 ; or ecx,R3C_OFF_BOTTOM
356 ; mov [eax+R3S_POINT_CCODES],ecx
359 ; this will often branch mispredict,
360 ; and since it has branches to branches
361 ; can do really horrible and wacky things
362 ; the no-branch version is always 9 cycles;
363 ; the above code is at best 7 cycles assuming
364 ; all branches taken and perfect prediction
; ---- slow path: reference pseudo-code ----
367 ; // slow clipping (need two multiplies)
368 ; mxs_real iz = z * r3d_glob.x_clip;
369 ; mxs_real jz = z * r3d_glob.y_clip;
370 ; if (p->p.x < -iz) code = R3C_OFF_LEFT | R3C_BEHIND; else code = R3C_BEHIND;
371 ; if (p->p.x > iz) code |= R3C_OFF_RIGHT;
372 ; if (p->p.y < -jz) code |= R3C_OFF_TOP;
373 ; if (p->p.y > jz) code |= R3C_OFF_BOTTOM;
; post_init_clipcode is a label defined outside the visible lines
383 jmp post_init_clipcode
; From here on the point is addressed through esi, and eax is used as
; scratch (it is loaded from tempbuf2 and its ah byte is tested below).
396 mov ebp,dword ptr tempbuf1
397 mov eax,dword ptr tempbuf2
401 mov [esi+R3S_POINT_SX
],ebp
402 mov [esi+R3S_POINT_SY
],eax
404 ; compute "iz" & "jz"
; p.z is pushed twice; per the pseudo-code each copy is presumably scaled
; by x_clip / y_clip with multiplies not shown here
405 fld dword ptr [esi+R3S_POINT_Z
]
409 fld dword ptr [esi+R3S_POINT_Z
]
414 fcom dword ptr [esi+R3S_POINT_X
] ; compare iz ?? x
; assumes ah holds the high byte of the FPU status word (the fnstsw is not
; visible here): bit 0 of ah is then C0, which fcom sets when ST < operand
416 shr ah,1 ; carry set if iz < x
418 and ebp,R3C_OFF_RIGHT
423 fcom dword ptr [esi+R3S_POINT_Y
]
425 shr ah,1 ; carry set if jz < y
427 and ebp,R3C_OFF_BOTTOM
432 ; now we have -iz and -jz on the stack
; fcomp compares and pops
433 fcomp dword ptr [esi+R3S_POINT_X
]
; 65 = 41h keeps bits 0 and 6 of ah (C0 and C3 under the assumption above);
; both clear means ST > operand
435 and ah,65 ; now if ah=0, -iz > x
436 cmp ah,1 ; sets carry if ah = 0
440 fcomp dword ptr [esi+R3S_POINT_Y
]
; write the point's clip codes and update the global _r3d_ccodes_or
; (the combining instruction is not visible here)
447 mov ebp,_r3d_ccodes_or
448 mov [esi+R3S_POINT_CCODES
],edi
450 mov _r3d_ccodes_or
,ebp
; step eax by the current stride read from _r3d_glob
; (presumably eax has been restored to the point pointer by an
; instruction not shown - TODO confirm)
454 add eax, _r3d_glob
[R3S_GLOBAL_CONTEXT_CUR_STRIDE
]