prj/tech/libsrc/r3d/projfast.asm

   1 ; $Source: x:/prj/tech/libsrc/r3d/RCS/projfast.asm $
   2 ; $Revision: 1.8 $
   3 ; $Author: dc $
   4 ; $Date: 1997/01/05 22:51:04 $
   5 ;
   6 ; Project-space transformation/clip coding
   7
   8 .486
   9 include type.inc
  10 include cseg.inc
  11 include thunks.inc
  12 include r3spoint.inc
  13 include ctxts.inc
  14
  15         assume  ds:_DATA
  16
  17 _DATA   segment para public USE32 'DATA'
  18
  19         align   8
  20 ; renderer state defined outside this file (not all of it is referenced below)
  21 extern _r3d_x_off:dword
  22 extern _r3d_y_off:dword
  23 extern _r3d_x_off_24_8:dword
  24 extern _r3d_y_off_24_8:dword
  25 extern _r3d_c_w:dword
  26 extern _r3d_c_h:dword
  27 extern _r3d_c_w_24_8:dword
  28 extern _r3d_c_h_24_8:dword
  29 extern _r3d_x_clip:dword
  30 extern _r3d_y_clip:dword
  31 extern _r3d_near:dword
  32 extern _r3d_fast_z:dword
  33 extern _r3d_ccodes_or:dword
  34 extern _r3d_glob:dword
  35 ; qword scratch shared by both routines below
  36 x_off dq 0          ; magic constant + _r3d_x_off, rebuilt at the start of each call
  37 y_off dq 0          ; magic constant + _r3d_y_off, rebuilt at the start of each call
  38 tempbuf1 dq 0       ; biased sx is stored here as a qword, its low dword is read back
  39 tempbuf2 dq 0       ; same for sy
  40 ; dword float constants for the add-a-big-number-instead-of-fist hack
  41 two_to_52_power dd 059900000h   ; = 1.125 * 2^52 (2^52 + 2^49), decoded below
  42 ; for the second constant we want to multiply by 256;
  43 ; this means adding 8 to the exponent field.
  44 ; two_to_52_power is:
  45 ;    0101 1001 1001 0000 0000 0000 0000 0000
  46 ;    sEEE EEEE EMMM MMMM MMMM MMMM MMMM MMMM
  47 ; exponent = 10110011
  48 ;          = 128 + 48 + 2 + 1 = 179 = 127 + 52;  mantissa 001... = 1.125
  49 ; NOTE(review): a bigger bias makes the low mantissa bits coarser, not finer - confirm the 24.8 scale
  50 two_to_60_power dd 05d900000h   ; exponent 10111011 = 187 = 127 + 60  ->  1.125 * 2^60
  51
  52 _DATA ends
  53
  54 _TEXT segment
  55 ; vector transform routine, called once per point by both routines below
  56 _EXTFUNC mx_trans_mul_vec
  57
  58 ;
  59 ;  r3_transform_block_proj_noclip
  60 ;
  61 ; do projectspace point transformation/projection (no clip coding)
  62 ; per point: transform src into dst->p, then write w = 1/z and integer sx, sy
  63 ; C code snippet:
  64 ;  for (i=0; i < n; ++i) {
  65 ;     double w;
  66 ;     r3_trans_mul_vec(&dst->p, X2TRANS(&cx.o2c), src);
  67 ;     w = 1.0/dst->p.z; // w is 1/z;
  68 ;     dst->grp.sx = (int)(dst->p.x * w) + r3d_glob.x_off;
  69 ;     dst->grp.sy = (int)(dst->p.y * w) + r3d_glob.y_off;
  70 ;     dst->grp.w = w;
  71 ;     ++dst;       (asm: eax += current stride taken from _r3d_glob)
  72 ;     ++src;       (asm: ebx += 12)
  73 ;  }
  74 ; takes  eax = count,  edx = dest,  ebx = src, ecx = X2TRANS...
  75 _FUNCDEF r3_transform_block_proj_noclip, 4
  76         test   eax,eax         ; count == 0?  (the pushes below leave the flags alone)
  77         push   esi
  78         push   edi
  79         jz     r3_tbpn_exit    ; empty block: like the C loop, do nothing
  80     ; swap around registers preparing for inner loop
  81         push   eax
  82         mov    eax,edx         ; eax = dest
  83         mov    edx,ecx         ; edx = X2TRANS
  84         pop    ecx             ; ecx = count
  85
  86     ; we're going to use the add-a-big-number-instead-of-fist
  87     ; hack.  But we can fold them in to the adds we do as part
  88     ; of projection
  89         fld    two_to_52_power
  90         fild   _r3d_x_off
  91         fadd                   ; magic constant + integer x offset
  92         fstp   x_off           ; kept as a qword for the loop
  93         fld    two_to_52_power
  94         fild   _r3d_y_off
  95         fadd                   ; magic constant + integer y offset
  96         fstp   y_off
  97
  98 r3_tbpn_top:
  99         ; we rely on the fact that the following
 100         ; function doesn't change any registers;
 101         ; this is only true of the asm implementation
 102         _CALLFUNC   mx_trans_mul_vec, 3
 103
 104         ; w = 1.0 / dst->p.z;
 105         fld1
 106
 107         fld    dword ptr [eax+R3S_POINT_Z]
 108
 109         fdiv
 110
 111         ; during the divide, preload
 112         ; the rest of this point
 113         ; (the value loaded into esi is never used)
 114
 115         mov    esi,[eax+20]
 116 ; removed: mov esi,[ebx+12] -- touching the next source vector reads
 117 ; one dword past the end of src on the last point
 118 ; removed by Doug since it faults MSVC memory layout
 119 ; (really, this should be Stride Aware and should be Stride-4, probably?):
 120 ;        mov    esi,[eax+40]
 121
 122
 123         ; dst->grp.w = w;
 124         fst    dword ptr [eax+R3S_POINT_W]
 125
 126         fld    st(0)           ; second copy of w, one each for x and y
 127
 128         fmul   dword ptr [eax+R3S_POINT_X]
 129         fxch   st(1)
 130
 131         ; stall
 132
 133         fmul   dword ptr [eax+R3S_POINT_Y]
 134         fxch   st(1)
 135
 136         ; add (1.125 * 2^52 + r3d_x_off), precomputed in x_off above
 137         fadd   x_off
 138         fxch   st(1)
 139
 140         ; ditto for y
 141         fadd   y_off
 142         ; no fxch because next instruction isn't FP
 143
 144         add    ebx,12          ; src advances to the next 12-byte vector
 145
 146         ; NB  the jnz below tests the flags of the dec ecx
 147         ; right before it, not the flags of this add
 148
 149         fxch   st(1)           ; st0 = biased sx, st1 = biased sy
 150
 151         fstp   tempbuf1
 152
 153         fstp   tempbuf2
 154
 155         ; load integer sx,sy from the low dwords of the stored qwords
 156         mov    esi,dword ptr tempbuf1
 157         mov    edi,dword ptr tempbuf2
 158
 159         mov    [eax+R3S_POINT_SX],esi
 160         mov    [eax+R3S_POINT_SY],edi
 161
 162         add    eax, _r3d_glob[R3S_GLOBAL_CONTEXT_CUR_STRIDE]
 163         dec    ecx        ; decrement loop count
 164
 165         jnz    r3_tbpn_top
 166 r3_tbpn_exit:
 167         pop    edi
 168         pop    esi
 169
 170         ret
 171
 172 ;
 173 ;  r3_transform_block_proj_clip
 174 ;  (count == 0 returns at once; ebp, esi, edi are preserved)
 175 ; do projectspace point transformation/projection & clip coding:
 176 ; per point writes w, sx, sy and ccodes, and ors the codes into _r3d_ccodes_or
 177 ; takes  eax = count,  edx = dest,  ebx = src, ecx = X2TRANS...
 178 _FUNCDEF r3_transform_block_proj_clip, 4
 179         test   eax,eax         ; count == 0?  (the pushes below leave the flags alone)
 180         push   ebp
 181         push   esi
 182         push   edi
 183         jz     r3_tbpc_exit    ; empty block: nothing to transform
 184         push   eax
 185         mov    eax,edx         ; eax = dest
 186         mov    edx,ecx         ; edx = X2TRANS
 187         pop    ecx             ; ecx = count
 188     ; x_off/y_off = two_to_60_power + integer offset, folded into the projection adds
 189         fld    two_to_60_power
 190         fild   _r3d_x_off
 191         fadd
 192         fstp   x_off
 193         fld    two_to_60_power
 194         fild   _r3d_y_off
 195         fadd
 196         fstp   y_off
 197 ; NOTE(review): code below treats the tempbuf values as 24.8; confirm two_to_60_power gives that scale
 198 r3_tbpc_top:
 199         ; we rely on the fact that the following
 200         ; function doesn't change any registers
 201         ; this is only true of the asm implementation
 202         _CALLFUNC   mx_trans_mul_vec, 3
 203
 204         ; w = 1.0 / dst->p.z;
 205         fld1
 206
 207         fld    dword ptr [eax+R3S_POINT_Z]
 208
 209         fdiv
 210
 211         ; during the divide, preload
 212         ; the rest of this point
 213         ; (the value loaded into esi is never used)
 214
 215         mov    esi,[eax+20]
 216 ; removed: mov esi,[ebx+12] -- touching the next source vector reads
 217 ; one dword past the end of src on the last point
 218 ; removed by Doug since it faults MSVC memory layout
 219 ; (really, this should be Stride Aware and should be Stride-4, probably?):
 220 ;        mov    esi,[eax+40]
 221
 222         ; dst->grp.w = w;
 223         fst    dword ptr [eax+R3S_POINT_W]
 224
 225         fld    st(0)           ; second copy of w, one each for x and y
 226
 227         ; start computing sx,sy
 228         fmul   dword ptr [eax+R3S_POINT_X]
 229         fxch   st(1)
 230
 231         fmul   dword ptr [eax+R3S_POINT_Y]
 232         fxch   st(1)
 233
 234         fadd   x_off
 235         fxch   st(1)
 236
 237         fadd   y_off           ; now st0 = biased sy, st1 = biased sx
 238
 239         ; load z into integer for fast compare
 240         ; and load "fast_z", which is a positive FP
 241         ; number representing whether it's safe to
 242         ; use fast (2d) clip coding
 243         mov    esi,[eax+R3S_POINT_Z]
 244         mov    edi,_r3d_fast_z
 245
 246         ; we can integer compare them because one of
 247         ; them is definitely positive
 248         cmp    esi,edi
 249         jle    clip_slow       ; z <= fast_z: take the 3d clip path (sx,sy still on the FPU stack)
 250
 251         push   ecx             ; ecx is borrowed as scratch until the pop below
 252
 253         fstp   tempbuf2
 254
 255         fstp   tempbuf1
 256
 257         mov    esi,dword ptr tempbuf1
 258         mov    edi,dword ptr tempbuf2
 259
 260
 261         ; ESI = sx, EDI = sy
 262
 263         ; The clip test below is done on copies of sx,sy that are
 264         ; offset by half a pixel (add 128); the sx,sy written to
 265         ; the point are not offset.
 266
 267         ; The following code uses some pretty gory algorithms
 268         ; to generate clip codes without branching.
 269
 270         ; There are two central concepts.  First, if a value is
 271         ; negative, then the highest bit is set.  Thus, if we
 272         ; just shift it right by 31, then we have a clip code for
 273         ; it being negative.
 274
 275         ; Second, if we compare against a max value, we generate
 276         ; a bunch of flags.  The easiest one to set a bit from is
 277         ; the carry flag, via sbb eax,eax.  However, this sets a
 278         ; bit if we borrowed in an _unsigned_ subtraction.  Thus,
 279         ; it sets a bit if (unsigned) x > max_x, which means it's
 280         ; set if (signed) x > max_x || x < 0.
 281
 282         ; So we use the latter approach to set our flags, knowing
 283         ; that it will incorrectly set an "off_right" code when it's
 284         ; actually "off_left".  We fix this up by making "off_left"
 285         ; also set "off_right", and xoring.
 286
 287         shl    esi,8            ; go from 24.8 to 16.16
 288         mov    ebp,_r3d_c_w_24_8  ; canvas width in 24.8
 289
 290         shl    edi,8
 291         mov    [eax+R3S_POINT_SX],esi
 292
 293         mov    [eax+R3S_POINT_SY],edi
 294         mov    esi,dword ptr tempbuf1   ; reload the unshifted values for the clip test
 295
 296         mov    edi,dword ptr tempbuf2
 297         add    esi,128       ; offset location by half a pixel
 298
 299         add    edi,128       ; offset location by half a pixel
 300         cmp    ebp,esi
 301
 302         sbb    ebp,ebp        ; ebp = -1 if (esi > ebp or esi is negative)
 303         mov    ecx,_r3d_c_h_24_8
 304
 305         shr    esi,30         ; keep the top two bits of sx; a small negative sx gives 3
 306         and    ebp,R3C_OFF_RIGHT
 307
 308         ; at this point, esi = (R3C_OFF_LEFT | R3C_OFF_RIGHT) if it's off left
 309         ; and ebp = R3C_OFF_RIGHT if it's off left or off right
 310
 311         xor    esi,ebp
 312         cmp    ecx,edi
 313
 314         sbb    ecx,ecx        ; ecx = -1 if (edi > ecx or edi is negative)
 315
 316         shr    edi,28         ; top two bits of sy land in bits 3 and 2
 317         and    ecx,R3C_OFF_BOTTOM
 318
 319         and    edi,12         ; drop the two lower bits that came along
 320         ; at this point, edi = (R3C_OFF_TOP | R3C_OFF_BOTTOM) if it's off top
 321         ; and ecx = R3C_OFF_BOTTOM if it's off top or off bottom
 322         xor    esi,ecx
 323
 324         xor    esi,edi
 325         pop    ecx            ; loop count is back
 326
 327         ; now write it out, and update r3d_ccodes_or
 328         mov    [eax+R3S_POINT_CCODES],esi
 329         mov    edi,_r3d_ccodes_or
 330
 331         or     edi,esi
 332
 333         mov    _r3d_ccodes_or,edi
 334         jmp    point_done
 335
 336 ; straight-ahead version of above:
 337 ;       xor    ecx,ecx
 338 ;       mov    ebp,r3_c_w
 339 ;       cmp    esi,0
 340 ;       jge    not_left
 341 ;       mov    ecx,R3C_OFF_LEFT
 342 ; not_left:
 343 ;       cmp    esi,ebp
 344 ;       jle    not_right
 345 ;       or     ecx,R3C_OFF_RIGHT
 346 ; not_right:
 347 ;       mov    ebp,r3_c_h
 348 ;       cmp    edi,0
 349 ;       jge    not_top
 350 ;       or     ecx,R3C_OFF_TOP
 351 ; not_top:
 352 ;       cmp    edi,ebp
 353 ;       jle    not_bottom
 354 ;       or     ecx,R3C_OFF_BOTTOM
 355 ; not_bottom:
 356 ;       mov    [eax+R3S_POINT_CCODES],ecx
 357 ;       pop    ecx
 358 ;
 359 ;  this will often branch mispredict,
 360 ;  and since it has branches to branches
 361 ;  can do really horrible and wacky things
 362 ;  the no-branch version is always 9 cycles;
 363 ;  the above code is at best 7 cycles assuming
 364 ;  all branches taken and perfect prediction
 365
 366 ;     } else {
 367 ;        // slow clipping (need two multiplies)
 368 ;        mxs_real iz = z * r3d_glob.x_clip;
 369 ;        mxs_real jz = z * r3d_glob.y_clip;
 370 ;        if (p->p.x < -iz) code = R3C_OFF_LEFT | R3C_BEHIND; else code = R3C_BEHIND;
 371 ;        if (p->p.x >  iz) code |= R3C_OFF_RIGHT;
 372 ;        if (p->p.y < -jz) code |= R3C_OFF_TOP;
 373 ;        if (p->p.y >  jz) code |= R3C_OFF_BOTTOM;
 374 ;     }
 375 ; note: unlike the snippet above, the asm sets R3C_BEHIND only when z <= _r3d_near
 376 clip_slow:
 377         mov    edi,_r3d_near
 378
 379         cmp    esi,edi         ; esi still holds the bits of z; integer compare again
 380         jle    point_behind
 381
 382         xor    edi,edi         ; edi accumulates the clip code, starting from 0
 383         jmp    post_init_clipcode
 384
 385 point_behind:
 386         mov    edi,R3C_BEHIND
 387         nop
 388
 389 post_init_clipcode:
 390         mov    esi,eax         ; keep dest in esi: ax is needed for fnstsw below
 391
 392         fstp   tempbuf2
 393
 394         fstp   tempbuf1
 395
 396         mov    ebp,dword ptr tempbuf1
 397         mov    eax,dword ptr tempbuf2
 398
 399         shl    ebp,8           ; same shift as the fast path
 400         shl    eax,8
 401         mov    [esi+R3S_POINT_SX],ebp
 402         mov    [esi+R3S_POINT_SY],eax
 403
 404         ; compute "iz" & "jz"
 405         fld   dword ptr [esi+R3S_POINT_Z]
 406         ;fld    st(0)
 407         fmul  _r3d_x_clip
 408         ;fxch   st(1)
 409         fld   dword ptr [esi+R3S_POINT_Z]
 410         fmul  _r3d_y_clip
 411         fxch  st(1)            ; st0 = iz, st1 = jz
 412
 413         ; start comparing
 414         fcom   dword ptr [esi+R3S_POINT_X]    ; compare  iz ?? x
 415         fnstsw ax
 416         shr    ah,1      ; carry set if iz < x
 417         sbb    ebp,ebp
 418         and    ebp,R3C_OFF_RIGHT
 419         or     edi,ebp
 420         fldz
 421         fsubr            ; iz becomes -iz
 422         fxch   st(1)
 423         fcom   dword ptr [esi+R3S_POINT_Y]
 424         fnstsw ax
 425         shr    ah,1      ; carry set if jz < y
 426         sbb    ebp,ebp
 427         and    ebp,R3C_OFF_BOTTOM
 428         or     edi,ebp
 429         fldz
 430         fsubr            ; jz becomes -jz
 431         fxch   st(1)
 432         ; now we have -iz and -jz on the stack
 433         fcomp  dword ptr [esi+R3S_POINT_X]
 434         fnstsw ax
 435         and    ah,65     ; now if ah=0, -iz > x
 436         cmp    ah,1      ; sets carry if ah = 0
 437         sbb    ebp,ebp
 438         and    ebp,R3C_OFF_LEFT
 439         or     edi,ebp
 440         fcomp  dword ptr [esi+R3S_POINT_Y]
 441         fnstsw ax
 442         and    ah,65     ; now if ah=0, -jz > y
 443         cmp    ah,1      ; sets carry if ah = 0
 444         sbb    ebp,ebp
 445         and    ebp,R3C_OFF_TOP
 446         or     edi,ebp
 447         mov    ebp,_r3d_ccodes_or
 448         mov    [esi+R3S_POINT_CCODES],edi
 449         or     ebp,edi
 450         mov    _r3d_ccodes_or,ebp
 451         mov    eax,esi         ; dest pointer back in eax for the loop
 452
 453 point_done:
 454         add    eax, _r3d_glob[R3S_GLOBAL_CONTEXT_CUR_STRIDE]
 455         add    ebx,12          ; src advances to the next 12-byte vector
 456
 457         dec    ecx
 458         jnz    r3_tbpc_top
 459 r3_tbpc_exit:
 460         pop    edi
 461         pop    esi
 462         pop    ebp
 463
 464         ret
 465 _TEXT ends
 466 end