hvirtual/mpeg2enc/fdct_mmx.s

   1 ; //////////////////////////////////////////////////////////////////////////////
   2 ; //
   3 ; //  fdctam32.c - AP922 MMX(3D-Now) forward-DCT
   4 ; //  ----------
   5 ; //  Intel Application Note AP-922 - fast, precise implementation of DCT
   6 ; //        http://developer.intel.com/vtune/cbts/appnotes.htm
   7 ; //  ----------
   8 ; //
   9 ; //       This routine can use a 3D-Now/MMX enhancement to increase the
  10 ; //  accuracy of the fdct_col_4 macro.  The dct_col function uses 3D-Now's
  11 ; //  PMULHRW instead of MMX's PMULHW (and POR).  The substitution improves
  12 ; //  accuracy very slightly, at a performance penalty.  If the target CPU
  13 ; //  does not support 3D-Now, then this function cannot be executed.
  14 ; //
  15 ; //  For a fast, precise MMX implementation of inverse-DCT
  16 ; //              visit http://www.elecard.com/peter
  17 ; //
  18 ; //  v1.0 07/22/2000 (initial release)
  19 ; //
  20 ; //  liaor@iname.com  http://members.tripod.com/~liaor
  21 ; //////////////////////////////////////////////////////////////////////////////
  22
  23 ;;;
  24 ;;; A.Stevens Jul 2000:  ported to nasm syntax and disentangled from
  25 ;;; Win**** compiler specific stuff.
  26 ;;; All the real work was done above though.
  27 ;;; See above for how to optimise quality on 3DNow! CPU's
  28
  29                 ;;
  30                 ;;              Macros for code-readability...
  31                 ;;
  32 %define INP eax         ;        pointer to input rows (short *blk)
  33 %define OUT ecx         ;        pointer to output rows (fdct_mmx sets OUT = INP, so the work is done in place)
  34 %define TABLE ebx       ; pointer to tab_frw_01234567[] (row pass)
  35 %define TABLEF ebx  ; pointer to fdct_tg_all_16 (column pass); same register as TABLE
  36 %define round_frw_row edx ; pointer to fdct_r_row (rounder added in the row pass)
  37 ;; xN / yN: address of row N relative to INP / OUT; consecutive rows are 16 bytes apart.
  38 ;; They are written inside brackets at the point of use, e.g. movq mm0, [x1].
  39 %define x0 INP + 0*16
  40 %define x1 INP + 1*16
  41 %define x2 INP + 2*16
  42 %define x3 INP + 3*16
  43 %define x4 INP + 4*16
  44 %define x5 INP + 5*16
  45 %define x6 INP + 6*16
  46 %define x7 INP + 7*16
  47 %define y0 OUT + 0*16
  48 %define y1 OUT + 1*16
  49 %define y2 OUT + 2*16
  50 %define y3 OUT + 3*16
  51 %define y4 OUT + 4*16
  52 %define y5 OUT + 5*16
  53 %define y6 OUT + 6*16
  54 %define y7 OUT + 7*16
  55
  56                 ;;
  57                 ;; Fixed-point shift and rounding constants for the DCT
  58                 ;;
  59 %define BITS_FRW_ACC    3 ; 2 or 3 for accuracy
  60 %define SHIFT_FRW_COL   BITS_FRW_ACC ; base left shift (psllw) in the column pass; some terms use SHIFT_FRW_COL+1
  61 %define SHIFT_FRW_ROW   (BITS_FRW_ACC + 17) ; right shift (psrad) applied to the row-pass sums
  62 %define RND_FRW_ROW             (1 << (SHIFT_FRW_ROW-1)) ; half of 1 << SHIFT_FRW_ROW; not referenced by the code shown in this file
  63 %define RND_FRW_COL             (1 << (SHIFT_FRW_COL-1)) ; half of 1 << SHIFT_FRW_COL; not referenced by the code shown in this file
  64
  65 extern fdct_one_corr                            ;  OR-ed (por) into column-pass results as the "+0.5" correction
  66 extern fdct_r_row                               ;  Defined in C for convenience; added (paddd) to row-pass sums before the shift
  67                 ;;
  68                 ;; Concatenated table of forward dct transformation coeffs.
  69                 ;;
  70 extern  fdct_tg_all_16                  ; Defined in C for convenience
  71                 ;; Offsets into table (8 bytes per entry); TABLEF must hold the table address.
  72
  73 %define tg_1_16 (TABLEF + 0)
  74 %define tg_2_16 (TABLEF + 8)
  75 %define tg_3_16 (TABLEF + 16)
  76 %define cos_4_16 (TABLEF + 24) ; not referenced by the code shown in this file
  77 %define ocos_4_16 (TABLEF + 32)
  78
  79                 ;;
  80                 ;; Concatenated table of forward dct coefficients
  81                 ;; (row pass: 64 bytes are read per row, then TABLE is advanced by 64)
  82 extern tab_frw_01234567         ; Defined in C for convenience
  83
  84                 ;; Offsets into table..
  85 SECTION .text
  86
  87 global fdct_mmx
  88
  89 ;;;
  90 ;;; void fdct_mmx( short *blk )
  91 ;;;
  92 ;;; In:   blk = first stack argument, read from [ebp+8]; 8 rows of 16 bytes are processed.
  93 ;;; Out:  blk[] is overwritten in place (column pass first, then the row pass).
  94 ;;; Regs: ebx, ecx, edx, edi, ebp are saved and restored; eax and mm0-mm7 are modified; emms runs before ret.
  95 ;     ////////////////////////////////////////////////////////////////////////
  96 ;     //
  97 ;     // The high-level pseudocode for the fdct_am32() routine :
  98 ;     //
  99 ;     // fdct_am32()
 100 ;     // {
 101 ;     //    forward_dct_col03(); // dct_column transform on cols 0-3
 102 ;     //    forward_dct_col47(); // dct_column transform on cols 4-7
 103 ;     //    for ( j = 0; j < 8; j=j+1 )
 104 ;     //      forward_dct_row1(j); // dct_row transform on row #j
 105 ;     // }
 106 ;     //
 107 ;
 108
 109 align 32
 110 fdct_mmx:
 111         push ebp                        ; save caller's frame pointer
 112         mov ebp, esp            ; set up our frame pointer
 113         ; save the integer registers used below (eax is not saved)
 114         push ebx
 115         push ecx
 116         push edx
 117         push edi
 118
 119         mov INP, [ebp+8];               ; input data is row 0 of blk[]
 120     ;// transform the left half of the matrix (4 columns)
 121
 122     lea TABLEF,  [fdct_tg_all_16]; TABLEF -> column-pass constants (tg_*_16, ocos_4_16)
 123     mov OUT, INP; results overwrite the input rows (in place)
 124
 125 ;       lea round_frw_col,  [r_frw_col]
 126     ; for ( i = 0; i < 2; i = i + 1)
 127     ; the for-loop is executed twice.  We are better off unrolling the
 128     ; loop to avoid branch misprediction.
 129 mmx32_fdct_col03:
 130     movq mm0, [x1] ; 0 ; x1
 131      ;;
 132
 133     movq mm1, [x6] ; 1 ; x6
 134     movq mm2, mm0 ; 2 ; x1
 135
 136     movq mm3, [x2] ; 3 ; x2
 137     paddsw mm0, mm1 ; t1 = x[1] + x[6]
 138
 139     movq mm4, [x5] ; 4 ; x5
 140     psllw mm0, SHIFT_FRW_COL ; t1
 141
 142     movq mm5, [x0] ; 5 ; x0
 143     paddsw mm4, mm3 ; t2 = x[2] + x[5]
 144
 145     paddsw mm5, [x7] ; t0 = x[0] + x[7]
 146     psllw mm4, SHIFT_FRW_COL ; t2
 147
 148     movq mm6, mm0 ; 6 ; t1
 149     psubsw mm2, mm1 ; 1 ; t6 = x[1] - x[6]
 150
 151     movq mm1,  [tg_2_16] ; 1 ; tg_2_16
 152     psubsw mm0, mm4 ; tm12 = t1 - t2
 153
 154     movq mm7, [x3] ; 7 ; x3
 155     pmulhw mm1, mm0 ; tm12*tg_2_16
 156
 157     paddsw mm7, [x4] ; t3 = x[3] + x[4]
 158     psllw mm5, SHIFT_FRW_COL ; t0
 159
 160     paddsw mm6, mm4 ; 4 ; tp12 = t1 + t2
 161     psllw mm7, SHIFT_FRW_COL ; t3
 162
 163     movq mm4, mm5 ; 4 ; t0
 164     psubsw mm5, mm7 ; tm03 = t0 - t3
 165
 166     paddsw mm1, mm5 ; y2 = tm03 + tm12*tg_2_16
 167     paddsw mm4, mm7 ; 7 ; tp03 = t0 + t3
 168
 169     por mm1,  [fdct_one_corr] ; correction y2 +0.5
 170     psllw mm2, SHIFT_FRW_COL+1 ; t6
 171
 172     pmulhw mm5,  [tg_2_16] ; tm03*tg_2_16
 173     movq mm7, mm4 ; 7 ; tp03
 174
 175     psubsw mm3, [x5] ; t5 = x[2] - x[5]
 176     psubsw mm4, mm6 ; y4 = tp03 - tp12
 177
 178     movq [y2], mm1 ; 1 ; save y2
 179     paddsw mm7, mm6 ; 6 ; y0 = tp03 + tp12
 180
 181     movq mm1, [x3] ; 1 ; x3
 182     psllw mm3, SHIFT_FRW_COL+1 ; t5
 183
 184     psubsw mm1, [x4] ; t4 = x[3] - x[4]
 185     movq mm6, mm2 ; 6 ; t6
 186
 187     movq [y4], mm4 ; 4 ; save y4
 188     paddsw mm2, mm3 ; t6 + t5
 189
 190     pmulhw mm2,  [ocos_4_16] ; tp65 = (t6 + t5)*cos_4_16
 191     psubsw mm6, mm3 ; 3 ; t6 - t5
 192
 193     pmulhw mm6,  [ocos_4_16] ; tm65 = (t6 - t5)*cos_4_16
 194     psubsw mm5, mm0 ; 0 ; y6 = tm03*tg_2_16 - tm12
 195
 196     por mm5,  [fdct_one_corr] ; correction y6 +0.5
 197     psllw mm1, SHIFT_FRW_COL ; t4
 198
 199     por mm2,  [fdct_one_corr] ; correction tp65 +0.5
 200     movq mm4, mm1 ; 4 ; t4
 201
 202     movq mm3, [x0] ; 3 ; x0
 203     paddsw mm1, mm6 ; tp465 = t4 + tm65
 204
 205     psubsw mm3, [x7] ; t7 = x[0] - x[7]
 206     psubsw mm4, mm6 ; 6 ; tm465 = t4 - tm65
 207
 208     movq mm0,  [tg_1_16] ; 0 ; tg_1_16
 209     psllw mm3, SHIFT_FRW_COL ; t7
 210
 211     movq mm6,  [tg_3_16] ; 6 ; tg_3_16
 212     pmulhw mm0, mm1 ; tp465*tg_1_16
 213
 214     movq [y0], mm7 ; 7 ; save y0
 215     pmulhw mm6, mm4 ; tm465*tg_3_16
 216
 217     movq [y6], mm5 ; 5 ; save y6
 218     movq mm7, mm3 ; 7 ; t7
 219
 220     movq mm5,  [tg_3_16] ; 5 ; tg_3_16
 221     psubsw mm7, mm2 ; tm765 = t7 - tp65
 222
 223     paddsw mm3, mm2 ; 2 ; tp765 = t7 + tp65
 224     pmulhw mm5, mm7 ; tm765*tg_3_16
 225
 226     paddsw mm0, mm3 ; y1 = tp765 + tp465*tg_1_16
 227     paddsw mm6, mm4 ; tm465*tg_3_16
 228
 229     pmulhw mm3,  [tg_1_16] ; tp765*tg_1_16
 230     ;;
 231
 232     por mm0,  [fdct_one_corr] ; correction y1 +0.5
 233     paddsw mm5, mm7 ; tm765*tg_3_16
 234
 235     psubsw mm7, mm6 ; 6 ; y3 = tm765 - tm465*tg_3_16
 236     add INP, 0x08   ; ; advance INP by 8 bytes to columns 4-7 (OUT is not advanced; the col47 stores use +8)
 237
 238     movq [y1], mm0 ; 0 ; save y1
 239     paddsw mm5, mm4 ; 4 ; y5 = tm765*tg_3_16 + tm465
 240
 241     movq [y3], mm7 ; 7 ; save y3
 242     psubsw mm3, mm1 ; 1 ; y7 = tp765*tg_1_16 - tp465
 243
 244     movq [y5], mm5 ; 5 ; save y5
 245
 246
 247 mmx32_fdct_col47: ; begin processing last four columns
 248     movq mm0, [x1] ; 0 ; x1
 249     ;;
 250     movq [y7], mm3 ; 3 ; save y7 (columns 0-3)
 251     ;;
 252
 253     movq mm1, [x6] ; 1 ; x6
 254     movq mm2, mm0 ; 2 ; x1
 255
 256     movq mm3, [x2] ; 3 ; x2
 257     paddsw mm0, mm1 ; t1 = x[1] + x[6]
 258
 259     movq mm4, [x5] ; 4 ; x5
 260     psllw mm0, SHIFT_FRW_COL ; t1
 261
 262     movq mm5, [x0] ; 5 ; x0
 263     paddsw mm4, mm3 ; t2 = x[2] + x[5]
 264
 265     paddsw mm5, [x7] ; t0 = x[0] + x[7]
 266     psllw mm4, SHIFT_FRW_COL ; t2
 267
 268     movq mm6, mm0 ; 6 ; t1
 269     psubsw mm2, mm1 ; 1 ; t6 = x[1] - x[6]
 270
 271     movq mm1,  [tg_2_16] ; 1 ; tg_2_16
 272     psubsw mm0, mm4 ; tm12 = t1 - t2
 273
 274     movq mm7, [x3] ; 7 ; x3
 275     pmulhw mm1, mm0 ; tm12*tg_2_16
 276
 277     paddsw mm7, [x4] ; t3 = x[3] + x[4]
 278     psllw mm5, SHIFT_FRW_COL ; t0
 279
 280     paddsw mm6, mm4 ; 4 ; tp12 = t1 + t2
 281     psllw mm7, SHIFT_FRW_COL ; t3
 282
 283     movq mm4, mm5 ; 4 ; t0
 284     psubsw mm5, mm7 ; tm03 = t0 - t3
 285
 286     paddsw mm1, mm5 ; y2 = tm03 + tm12*tg_2_16
 287     paddsw mm4, mm7 ; 7 ; tp03 = t0 + t3
 288
 289     por mm1,  [fdct_one_corr] ; correction y2 +0.5
 290     psllw mm2, SHIFT_FRW_COL+1 ; t6
 291
 292     pmulhw mm5,  [tg_2_16] ; tm03*tg_2_16
 293     movq mm7, mm4 ; 7 ; tp03
 294
 295     psubsw mm3, [x5] ; t5 = x[2] - x[5]
 296     psubsw mm4, mm6 ; y4 = tp03 - tp12
 297
 298     movq [y2+8], mm1 ; 1 ; save y2
 299     paddsw mm7, mm6 ; 6 ; y0 = tp03 + tp12
 300
 301     movq mm1, [x3] ; 1 ; x3
 302     psllw mm3, SHIFT_FRW_COL+1 ; t5
 303
 304     psubsw mm1, [x4] ; t4 = x[3] - x[4]
 305     movq mm6, mm2 ; 6 ; t6
 306
 307     movq [y4+8], mm4 ; 4 ; save y4
 308     paddsw mm2, mm3 ; t6 + t5
 309
 310     pmulhw mm2,  [ocos_4_16] ; tp65 = (t6 + t5)*cos_4_16
 311     psubsw mm6, mm3 ; 3 ; t6 - t5
 312
 313     pmulhw mm6,  [ocos_4_16] ; tm65 = (t6 - t5)*cos_4_16
 314     psubsw mm5, mm0 ; 0 ; y6 = tm03*tg_2_16 - tm12
 315
 316     por mm5,  [fdct_one_corr] ; correction y6 +0.5
 317     psllw mm1, SHIFT_FRW_COL ; t4
 318
 319     por mm2,  [fdct_one_corr] ; correction tp65 +0.5
 320     movq mm4, mm1 ; 4 ; t4
 321
 322     movq mm3, [x0] ; 3 ; x0
 323     paddsw mm1, mm6 ; tp465 = t4 + tm65
 324
 325     psubsw mm3, [x7] ; t7 = x[0] - x[7]
 326     psubsw mm4, mm6 ; 6 ; tm465 = t4 - tm65
 327
 328     movq mm0,  [tg_1_16] ; 0 ; tg_1_16
 329     psllw mm3, SHIFT_FRW_COL ; t7
 330
 331     movq mm6,  [tg_3_16] ; 6 ; tg_3_16
 332     pmulhw mm0, mm1 ; tp465*tg_1_16
 333
 334     movq [y0+8], mm7 ; 7 ; save y0
 335     pmulhw mm6, mm4 ; tm465*tg_3_16
 336
 337     movq [y6+8], mm5 ; 5 ; save y6
 338     movq mm7, mm3 ; 7 ; t7
 339
 340     movq mm5,  [tg_3_16] ; 5 ; tg_3_16
 341     psubsw mm7, mm2 ; tm765 = t7 - tp65
 342
 343     paddsw mm3, mm2 ; 2 ; tp765 = t7 + tp65
 344     pmulhw mm5, mm7 ; tm765*tg_3_16
 345
 346     paddsw mm0, mm3 ; y1 = tp765 + tp465*tg_1_16
 347     paddsw mm6, mm4 ; tm465*tg_3_16
 348
 349     pmulhw mm3,  [tg_1_16] ; tp765*tg_1_16
 350     ;;
 351
 352     por mm0, [fdct_one_corr] ; correction y1 +0.5
 353     paddsw mm5, mm7 ; tm765*tg_3_16
 354
 355     psubsw mm7, mm6 ; 6 ; y3 = tm765 - tm465*tg_3_16
 356     ;;
 357
 358     movq [y1+8], mm0 ; 0 ; save y1
 359     paddsw mm5, mm4 ; 4 ; y5 = tm765*tg_3_16 + tm465
 360
 361     movq [y3+8], mm7 ; 7 ; save y3
 362     psubsw mm3, mm1 ; 1 ; y7 = tp765*tg_1_16 - tp465
 363
 364     movq [y5+8], mm5 ; 5 ; save y5
 365
 366     movq [y7+8], mm3 ; 3 ; save y7
 367
 368 ;    emms;
 369 ;    }   ; end of forward_dct_col07()
 370     ;  done with the column transform (both 4-column halves)
 371   ; NOTE(review): the description below is inherited text.  What follows is the row pass: 8 iterations,
 372   ; each reading one 16-byte row and storing it back to the same row - confirm the "transposed" remark.
 373   ; fdct_mmx32_cols() --
 374   ; the following subroutine repeats the row-transform operation,
 375   ; except with different shift&round constants.  This version
 376   ; does NOT transpose the output again.  Thus the final output
 377   ; is transposed with respect to the source.
 378   ;
 379   ;  The output is stored into blk[], which destroys the original
 380   ;  input data.
 381         mov INP,  [ebp+8];              ;; row 0
 382          mov edi, 0x08; ;x = 8
 383
 384         lea TABLE,  [tab_frw_01234567]; ; row 0
 385          mov OUT, INP; row results overwrite the row they were read from
 386
 387         lea round_frw_row,  [fdct_r_row];
 388         ; for ( x = 8; x > 0; --x )  ; transform one row per iteration
 389
 390 ; ---------- loop begin
 391   lp_mmx_fdct_row1:
 392     movd mm5,  [INP+12]; ; mm5 = 7 6
 393
 394     punpcklwd mm5,  [INP+8] ; mm5 =  5 7 4 6
 395
 396     movq mm2, mm5;     ; mm2 = 5 7 4 6
 397     psrlq mm5, 32;     ; mm5 = _ _ 5 7
 398
 399     movq mm0,  [INP]; ; mm0 = 3 2 1 0
 400     punpcklwd mm5, mm2;; mm5 = 4 5 6 7
 401
 402     movq mm1, mm0;     ; mm1 = 3 2 1 0
 403     paddsw mm0, mm5;   ; mm0 = [3+4, 2+5, 1+6, 0+7] (xt3, xt2, xt1, xt0)
 404
 405     psubsw mm1, mm5;   ; mm1 = [3-4, 2-5, 1-6, 0-7] (xt7, xt6, xt5, xt4)
 406     movq mm2, mm0;     ; mm2 = [ xt3 xt2 xt1 xt0 ]
 407
 408     ;movq [ xt3xt2xt1xt0 ], mm0;
 409     ;movq [ xt7xt6xt5xt4 ], mm1;
 410
 411     punpcklwd mm0, mm1;; mm0 = [ xt5 xt1 xt4 xt0 ]
 412
 413     punpckhwd mm2, mm1;; mm2 = [ xt7 xt3 xt6 xt2 ]
 414     movq mm1, mm2;     ; mm1 = [ xt7 xt3 xt6 xt2 ]
 415
 416     ;; shuffle words around.  The x-names in the comments below come from the original code;
 417     ;; the bracketed [ xt.. ] lists give the actual register contents (high word first).
 418 ;  movq mm0,  [INP] ; 0 ; x3 x2 x1 x0
 419
 420 ;  movq mm1,  [INP+8] ; 1 ; x7 x6 x5 x4
 421     movq mm2, mm0 ; 2 ; x3 x2 x1 x0  [ xt5 xt1 xt4 xt0 ]
 422
 423     movq mm3,  [TABLE] ; 3 ; w06 w04 w02 w00
 424     punpcklwd mm0, mm1 ; x5 x1 x4 x0  [ xt6 xt4 xt2 xt0 ]
 425
 426     movq mm5, mm0 ; 5 ; x5 x1 x4 x0  [ xt6 xt4 xt2 xt0 ]
 427     punpckldq mm0, mm0 ; x4 x0 x4 x0  [ xt2 xt0 xt2 xt0 ]
 428
 429     movq mm4,  [TABLE+8] ; 4 ; w07 w05 w03 w01
 430     punpckhwd mm2, mm1 ; 1 ; x7 x3 x6 x2  [ xt7 xt5 xt3 xt1 ]
 431
 432     pmaddwd mm3, mm0 ; x4*w06+x0*w04 x4*w02+x0*w00
 433     movq mm6, mm2 ; 6 ; x7 x3 x6 x2  [ xt7 xt5 xt3 xt1 ]
 434
 435     movq mm1,  [TABLE+32] ; 1 ; w22 w20 w18 w16
 436     punpckldq mm2, mm2 ; x6 x2 x6 x2  [ xt3 xt1 xt3 xt1 ]
 437
 438     pmaddwd mm4, mm2 ; x6*w07+x2*w05 x6*w03+x2*w01
 439     punpckhdq mm5, mm5 ; x5 x1 x5 x1  [ xt6 xt4 xt6 xt4 ]
 440
 441     pmaddwd mm0,  [TABLE+16] ; x4*w14+x0*w12 x4*w10+x0*w08
 442     punpckhdq mm6, mm6 ; x7 x3 x7 x3  [ xt7 xt5 xt7 xt5 ]
 443
 444     movq mm7,  [TABLE+40] ; 7 ; w23 w21 w19 w17
 445     pmaddwd mm1, mm5 ; x5*w22+x1*w20 x5*w18+x1*w16
 446 ;mm3 = a1, a0 (y2,y0)
 447 ;mm1 = b1, b0 (y3,y1)
 448 ;mm0 = a3,a2  (y6,y4)
 449 ;mm5 = b3,b2  (y7,y5)
 450
 451     paddd mm3,  [round_frw_row] ; +rounder (y2,y0)
 452     pmaddwd mm7, mm6 ; x7*w23+x3*w21 x7*w19+x3*w17
 453
 454     pmaddwd mm2,  [TABLE+24] ; x6*w15+x2*w13 x6*w11+x2*w09
 455     paddd mm3, mm4 ; 4 ; a1=sum(even1) a0=sum(even0) ; now ( y2, y0)
 456
 457     pmaddwd mm5,  [TABLE+48] ; x5*w30+x1*w28 x5*w26+x1*w24
 458     ;;
 459
 460     pmaddwd mm6,  [TABLE+56] ; x7*w31+x3*w29 x7*w27+x3*w25
 461     paddd mm1, mm7 ; 7 ; b1=sum(odd1) b0=sum(odd0) ; now ( y3, y1)
 462
 463     paddd mm0,  [round_frw_row] ; +rounder (y6,y4)
 464     psrad mm3, SHIFT_FRW_ROW ; (y2, y0)
 465
 466     paddd mm1,  [round_frw_row] ; +rounder (y3,y1)
 467     paddd mm0, mm2 ; 2 ; a3=sum(even3) a2=sum(even2) ; now (y6, y4)
 468
 469     paddd mm5,  [round_frw_row] ; +rounder (y7,y5)
 470     psrad mm1, SHIFT_FRW_ROW ; (y3, y1)
 471
 472     paddd mm5, mm6 ; 6 ; b3=sum(odd3) b2=sum(odd2) ; now ( y7, y5)
 473     psrad mm0, SHIFT_FRW_ROW ; (y6, y4)
 474
 475     add OUT, 16;  ; increment row-output address by 1 row
 476     psrad mm5, SHIFT_FRW_ROW ; (y7, y5)
 477
 478     add INP, 16;  ; increment row-address by 1 row
 479     packssdw mm3, mm0 ; 0 ; y6 y4 y2 y0
 480
 481     packssdw mm1, mm5 ; 3 ; y7 y5 y3 y1
 482     movq mm6, mm3;    ; mm6 = y6 y4 y2 y0
 483
 484     punpcklwd mm3, mm1; ; y3 y2 y1 y0
 485     sub edi, 0x01;   ; i = i - 1
 486
 487     punpckhwd mm6, mm1; ; y7 y6 y5 y4
 488     add TABLE,64;  ; increment to next table
 489
 490     movq  [OUT-16], mm3 ; 1 ; save y3 y2 y1 y0 (OUT was already advanced by 16 above)
 491
 492     movq  [OUT-8], mm6 ; 7 ; save y7 y6 y5 y4
 493
 494     cmp edi, 0x00; re-test the counter: add TABLE,64 overwrote the flags set by the sub
 495     jg near lp_mmx_fdct_row1;  ; begin fdct processing on next row
 496                 ;;
 497                 ;; Tidy up and return
 498                 ;;
 499         pop edi
 500         pop edx
 501         pop ecx
 502         pop ebx
 503
 504         pop ebp                 ; restore caller's frame pointer
 505         emms                    ; clear the MMX state before returning
 506         ret
 507