mpeg2enc/quant_mmx.s

   1 ;
   2 ;  Copyright (C) 2000 Andrew Stevens <as@comlab.ox.ac.uk>
   3
   4 ;
   5 ;  This program is free software; you can redistribute it and/or
   6 ;  modify it under the terms of the GNU General Public License
   7 ;  as published by the Free Software Foundation; either version 2
   8 ;  of the License, or (at your option) any later version.
   9 ;
  10 ;  This program is distributed in the hope that it will be useful,
  11 ;  but WITHOUT ANY WARRANTY; without even the implied warranty of
  12 ;  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13 ;  GNU General Public License for more details.
  14 ;
  15 ;  You should have received a copy of the GNU General Public License
  16 ;  along with this program; if not, write to the Free Software
  17 ;  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
  18 ;
  19 ;
  20 ;
  21 ;  quantize_ni_mmx.s:  MMX optimized coefficient quantization sub-routine
  22
  23
  24 global quantize_ni_mmx
  25 ; int quantize_ni_mmx(short *dst, short *src,
  26 ;                             short *quant_mat, short *i_quant_mat,
  27 ;                     int imquant, int mquant, int sat_limit)
  28
  29 ;  See quantize.c: quant_non_intra_hv_inv()  for reference implementation in C...
  30                 ;;  mquant is not currently used.
  31 ; eax = quad counter (16 groups of 4 coefficients)
  32 ; ebx = pqm
  33 ; ecx = piqm  ; Matrix of quads first (2^16/quant)
  34                           ; then (2^16/quant)*(2^16%quant) the second part is for rounding
  35 ; edx = temp
  36 ; edi = pdst
  37 ; esi = psrc
  38
  39 ; mm0 = [imquant|0..3]W
  40 ; mm1 = [sat_limit|0..3]W
  41 ; mm2 = *psrc -> src
  42 ; mm3 = rounding corrections... / temp
  43 ; mm4 = sign
  44 ; mm5 = nzflag accumulators
  45 ; mm6 = overflow limit
  46 ; mm7 = temp
  47
  48                 ;;
  49                 ;;  private constants needed
  50                 ;;
  51
  52 SECTION .data
  53 align 16
  54 overflim:                               ; 4 x 1023, loaded whole into mm6 by quantize_ni_mmx
  55                         dw      1024-1  ; any abs(coefficient) above this makes quantize_ni_mmx
  56                         dw      1024-1  ; bail out (return 0x00ff) before it scales the value
  57                         dw      1024-1  ; by 32 (psllw 5) in 16-bit lanes
  58                         dw      1024-1
  59
  60                         ;; BUFFER NO LONGER USED DUE TO IMPROVED MAIN ROUTINE...
  61 SECTION .bss
  62 align 32
  63 quant_buf:      resw 64 ; 64 words; not referenced by any routine visible in this file
  64
  65 SECTION .text
  66 ;; quantize_ni_mmx: quantise 16 quads (64 words) from src into dst; args are read from [ebp+8]..[ebp+32].
  67 ;; Returns eax: high word != 0 iff some result was non-zero (low word 0); 0x00ff = input > overflim; 0xff00 = result > sat_limit.
  68 align 32
  69 quantize_ni_mmx:
  70         push ebp                                ; save frame pointer
  71         mov ebp, esp            ; link
  72         push ebx
  73         push ecx
  74         push edx
  75         push esi
  76         push edi
  77
  78         mov edi, [ebp+8]    ; edi = dst
  79         mov esi, [ebp+12]       ; esi = psrc
  80         mov ebx, [ebp+16]       ; ebx = pqm
  81         mov ecx,  [ebp+20]  ; ecx = piqm
  82         movd mm0, [ebp+24]  ; get imquant (2^16 / mquant )
  83         movq mm1, mm0
  84         punpcklwd mm0, mm1    ; low word duplicated into lanes 0,1
  85         punpcklwd mm0, mm0    ; mm0 = [imquant|0..3]W
  86
  87         movq  mm6, [overflim]; overflow limit
  88
  89         movd mm1, [ebp+32]  ; sat_limit
  90         movq mm2, mm1
  91         punpcklwd mm1, mm2   ; low word duplicated into lanes 0,1
  92         punpcklwd mm1, mm1   ; mm1 = [sat_limit|0..3]W
  93
  94         pxor      mm5, mm5  ; Non-zero flag accumulator
  95         mov eax,  16            ; 16 quads to do
  96         jmp nextquadniq         ; hop over the alignment padding
  97
  98 align 32
  99 nextquadniq:
 100         movq mm2, [esi]                         ; mm2 = *psrc
 101
 102         pxor    mm4, mm4
 103         pcmpgtw mm4, mm2       ; mm4 = *psrc < 0
 104         movq    mm7, mm2       ; mm7 = *psrc
 105         psllw   mm7, 1         ; mm7 = 2*(*psrc)
 106         pand    mm7, mm4       ; mm7 = 2*(*psrc)*(*psrc < 0)
 107         psubw   mm2, mm7       ; mm2 = abs(*psrc)
 108
 109         ;;
 110         ;;  Check whether we'll saturate intermediate results
 111         ;;  Eventually flag is low 8 bits of result
 112         ;;
 113
 114         movq    mm7, mm2
 115         pcmpgtw mm7, mm6    ; Too big for 16 bit arithmetic :-( (should be *very* rare)
 116         movq    mm3, mm7
 117         psrlq   mm3, 32     ; mm3 = masks of lanes 2,3 moved down to the low dword
 118         por     mm7, mm3    ; low dword of mm7 now covers all four lanes
 119         movd    edx, mm7
 120         test            edx, edx
 121         jnz             near out_of_range
 122
 123         ;;
 124         ;; Carry on with the arithmetic...
 125         psllw   mm2, 5         ; mm2 = 32*abs(*psrc)
 126         movq    mm7, [ebx]     ; mm7 = *pqm
 127         psrlw   mm7, 1         ; mm7 = *pqm>>1
 128         paddw   mm2, mm7       ; mm2 = 32*abs(*psrc)+((*pqm)/2) = "p"
 129
 130
 131         ;;
 132         ;; Do the first multiplication.  Cunningly we've set things up so
 133         ;; it is exactly the top 16 bits we're interested in...
 134         ;;
 135         ;; We need the low word results for a rounding correction.
 136         ;; This is *not* exact (the actual correction is
 137     ;; the product abs(*psrc)*(*pqm)*(2^16%*qm) >> 16).
 138     ;;  However we get very very few wrong and none too low (the most
 139     ;; important) and no errors for small coefficients (also important)
 140         ;;      if we simply add abs(*psrc)
 141
 142
 143         movq    mm3, mm2
 144         pmullw  mm3, [ecx]
 145         movq    mm7, mm2
 146         psrlw   mm7, 1            ; Want to see if adding p would carry into upper 16 bits
 147         psrlw   mm3, 1
 148         paddw  mm3, mm7
 149         psrlw   mm3, 15           ; High bit in lsb rest 0's
 150         pmulhw  mm2, [ecx]        ; mm2 = (p*iqm+p) >> IQUANT_SCALE_POW2 ~= p/*qm
 151
 152
 153
 154         ;;
 155         ;; To hide the latency lets update some pointers...
 156         add   esi, 8                                    ; 4 word's
 157         add   ecx, 8                                    ; 4 word's
 158         sub   eax, 1                                    ; one quad fewer to go (tested at loop end)
 159
 160         ;; Add rounding correction....
 161         paddw   mm2, mm3
 162
 163
 164         ;;
 165         ;; Do the second multiplication, again we need to make a rounding adjustment
 166         ;; EXPERIMENT:   see comments in quantize.c:quant_non_intra_hv don't adjust...
 167 ;       movq    mm3, mm2
 168 ;       pmullw  mm3, mm0
 169 ;       movq    mm7, mm2
 170 ;       psrlw   mm7, 1            ; Want to see if adding p would carry into upper 16 bits
 171 ;       psrlw   mm3, 1
 172 ;       paddw mm3, mm7
 173 ;       psrlw   mm3, 15           ; High bit in lsb rest 0's
 174
 175         pmulhw  mm2, mm0     ; mm2 ~= (p/(qm*mquant))
 176
 177         ;;
 178         ;; To hide the latency lets update some more pointers...
 179         add   edi, 8
 180         add   ebx, 8
 181
 182         ;; Correct rounding and the factor of two (we want p/(qm*2*mquant)
 183 ;       paddw mm2, mm3
 184         psrlw mm2, 1
 185
 186
 187         ;;
 188         ;; Check for saturation
 189         ;;
 190         movq mm7, mm2
 191         pcmpgtw mm7, mm1        ; per-lane mask: result > sat_limit
 192         movq    mm3, mm7
 193         psrlq   mm3, 32         ; mm3 = masks of lanes 2,3 moved down to the low dword
 194         ;; mm3 must stay shifted here: movd below only reads the low dword (lanes 0,1)
 195         por             mm7, mm3        ; low dword of mm7 now covers all four lanes
 196         movd    edx, mm7
 197         test            edx, edx
 198         jnz             saturated
 199
 200         ;;
 201         ;;  Accumulate non-zero flags
 202         por     mm5, mm2
 203
 204         ;;
 205         ;; Now correct the sign mm4 = *psrc < 0
 206         ;;
 207
 208         pxor mm7, mm7        ; mm7 = -2*mm2
 209         psubw mm7, mm2
 210         psllw mm7, 1
 211         pand  mm7, mm4       ; mm7 = -2*mm2 * (*psrc < 0)
 212         paddw mm2, mm7       ; mm2 = samesign(*psrc, mm2 )
 213
 214                 ;;
 215                 ;;  Store the quantised words....
 216                 ;;
 217
 218         movq [edi-8], mm2    ; edi was already advanced above, hence -8
 219         test eax, eax
 220
 221         jnz near nextquadniq
 222
 223         ;; Return saturation in low word and nzflag in high word of result dword
 224
 225
 226         movq  mm0, mm5
 227         psrlq mm0, 32
 228         por   mm5, mm0       ; fold the four accumulated lanes into the low dword
 229         movd  edx, mm5
 230         mov   ebx, edx
 231         shl   ebx, 16
 232         or    edx, ebx       ; high word of edx = lane0|lane1|lane2|lane3
 233     and   edx, 0xffff0000  ;; high word of edx is nzflag, low word (saturation) = 0
 234         mov   eax, edx
 235
 236 return:
 237         pop edi
 238         pop esi
 239         pop edx
 240         pop ecx
 241         pop ebx
 242
 243         pop ebp                 ; restore frame pointer
 244
 245         emms                    ; clear mmx registers
 246         ret
 247
 248 out_of_range:
 249         mov     eax,    0x00ff
 250         jmp     return          ; unconditional: must never fall through into "saturated"
 251 saturated:
 252
 253         mov eax,    0xff00
 254         jmp return              ; unconditional: must never run off the end of the routine
 255
 256
 257
 258
 259 ;;;
 260 ;;;  void iquant_non_intra_m1_{sse,mmx}(int16_t *src, int16_t *dst, uint16_t
 261 ;;;                               *quant_mat)
 262 ;;; mmx/sse Inverse mpeg-1 quantisation routine.  This is the variant using pmaxsw/pminsw.
 263 ;;; Per word: res = ((2*abs(src)+1)*quant_mat) >> 5 in 16-bit lanes; even non-zero res -= 1; min(res,2047); sign of src; 0 if src == 0.
 264 ;;; eax - coefficients left (starts at 64, drops by 4 per pass; saved/restored)
 265 ;;; edi - src
 266 ;;; esi - dst
 267 ;;; edx - quant_mat
 268
 269                 ;; MMX Register usage
 270                 ;; mm7 = [1|0..3]W
 271                 ;; mm6 = [2047|0..3]W
 272                 ;; mm5 = 0
 273
 274
 275 global iquant_non_intra_m1_sse
 276 align 32
 277 iquant_non_intra_m1_sse:
 278
 279                 push ebp                                ; save frame pointer
 280                 mov ebp, esp            ; link
 281
 282                 push eax
 283                 push esi
 284                 push edi
 285                 push edx
 286
 287                 mov             edi, [ebp+8]                    ; get psrc
 288                 mov             esi, [ebp+12]                   ; get pdst
 289                 mov             edx, [ebp+16]                   ; get quant table
 290                 mov             eax,1
 291                 movd    mm7, eax
 292                 punpcklwd       mm7, mm7
 293                 punpckldq       mm7, mm7        ; mm7 = [1|0..3]W
 294
 295                 mov     eax, 2047
 296                 movd    mm6, eax
 297                 punpcklwd               mm6, mm6
 298                 punpckldq               mm6, mm6        ; mm6 = [2047|0..3]W
 299
 300                 mov             eax, 64                 ; 64 coeffs in a DCT block
 301                 pxor    mm5, mm5
 302
 303 iquant_loop_sse:
 304                 movq    mm0, [edi]      ; mm0 = *psrc
 305                 add             edi,8
 306                 pxor    mm1,mm1
 307                 movq    mm2, mm0
 308                 pcmpeqw mm2, mm1                ; mm2 = 1's where *psrc == 0
 309                 pcmpeqw mm2, mm1                ; inverted: mm2 = 1's for non-zero in mm0
 310
 311                 ;; Work with absolute value for convenience...
 312                 psubw   mm1, mm0        ; mm1 = -*psrc
 313                 pmaxsw  mm1, mm0        ; mm1 = val = max(*psrc,-*psrc) = abs(*psrc)
 314                 paddw   mm1, mm1                ; mm1 *= 2;
 315                 paddw   mm1, mm7                ; mm1 += 1
 316                 pmullw  mm1, [edx]              ; mm1 = (val*2+1) * *quant_mat (low 16 bits only)
 317                 add             edx, 8
 318                 psraw   mm1, 5                  ; mm1 = ((val*2+1) * *quant_mat)/32
 319
 320                 ;; Now that nasty mis-match control
 321
 322                 movq    mm3, mm1
 323                 pand    mm3, mm7
 324                 pxor    mm3, mm7                ; mm3 = ~(val&1) (in the low bits, others 0)
 325                 movq    mm4, mm1
 326                 pcmpeqw mm4, mm5                ; mm4 = (val == 0)
 327                 pxor    mm4, mm7                ;  Low bits now (val != 0)
 328                 pand    mm3, mm4                ; mm3 =  (~(val&1))&(val!=0)
 329
 330                 psubw   mm1, mm3                ; mm1 -= (~(val&1))&(val!=0)
 331                 pminsw  mm1, mm6                ; mm1 = saturated(res), i.e. signed min(res, 2047)
 332
 333                 ;; Handle zero case and restoring sign
 334                 pand    mm1, mm2                ; Zero in the zero case
 335                 pxor    mm3, mm3
 336                 psubw   mm3, mm1                ;  mm3 = - res
 337                 paddw   mm3, mm3                ;  mm3 = - 2*res
 338                 pcmpgtw mm0, mm5                ;  mm0 = *psrc > 0
 339                 pcmpeqw mm0, mm5                ;  inverted: mm0 = *psrc <= 0
 340                 pand    mm3, mm0                ;  mm3 = *psrc <= 0 ? -2 * res :         0
 341                 paddw   mm1, mm3                ;  mm1 = samesign(*psrc,res)
 342                 movq    [esi], mm1
 343                 add             esi,8
 344
 345                 sub             eax, 4                  ; four coefficients done per pass
 346                 jnz             iquant_loop_sse
 347
 348                 pop     edx
 349                 pop edi
 350                 pop esi
 351                 pop eax
 352
 353                 pop ebp                 ; restore frame pointer
 354
 355                 emms                    ; clear mmx registers
 356                 ret
 357
 358
 359 ;;;
 360 ;;;  void iquant_non_intra_m1_mmx(int16_t *src, int16_t *dst, uint16_t
 361 ;;;                               *quant_mat)
 362 ;;; eax - coefficients left (starts at 64, drops by 4 per pass; saved/restored)
 363 ;;; edi - src
 364 ;;; esi - dst
 365 ;;; edx - quant_mat
 366 ;;; Per word: res = ((2*abs(src)+1)*quant_mat) >> 5 in 16-bit lanes; even non-zero res -= 1; clamp at 2047; sign of src; 0 if src == 0.
 367                 ;; MMX Register usage
 368                 ;; mm7 = [1|0..3]W
 369                 ;; mm6 = [MAX_INT16-2047|0..3]W   (bias for the signed-saturation clamp)
 370                 ;; mm5 = 0
 371
 372
 373 global iquant_non_intra_m1_mmx
 374 align 32
 375 iquant_non_intra_m1_mmx:
 376
 377                 push ebp                                ; save frame pointer
 378                 mov ebp, esp            ; link
 379
 380                 push eax
 381                 push esi
 382                 push edi
 383                 push edx
 384
 385                 mov             edi, [ebp+8]                    ; get psrc
 386                 mov             esi, [ebp+12]                   ; get pdst
 387                 mov             edx, [ebp+16]                   ; get quant table
 388                 mov             eax,1
 389                 movd    mm7, eax
 390                 punpcklwd       mm7, mm7
 391                 punpckldq       mm7, mm7        ; mm7 = [1|0..3]W
 392
 393                 mov     eax, (0x7fff-2047)      ; paddsw saturates at 0x7fff, so the bias is measured from there
 394                 movd    mm6, eax
 395                 punpcklwd               mm6, mm6
 396                 punpckldq               mm6, mm6        ; mm6 = [0x7fff-2047|0..3]W
 397
 398                 mov             eax, 64                 ; 64 coeffs in a DCT block
 399                 pxor    mm5, mm5
 400
 401 iquant_loop:
 402                 movq    mm0, [edi]      ; mm0 = *psrc
 403                 add             edi,8
 404                 pxor    mm1, mm1
 405                 movq    mm2, mm0
 406                 pcmpeqw mm2, mm5                ; mm2 = 1's where *psrc == 0
 407                 pcmpeqw mm2, mm5                ; inverted: mm2 = 1's for non-zero in mm0
 408
 409                 ;; Work with absolute value for convenience...
 410
 411                 psubw   mm1, mm0        ; mm1 = -*psrc
 412                 psllw   mm1, 1                  ; mm1 = -2*psrc
 413                 movq    mm3, mm0                ; mm3 = *psrc
 414                 pcmpgtw mm3, mm5                ; mm3 = *psrc > 0
 415                 pcmpeqw mm3, mm5        ; mm3 = *psrc <= 0
 416                 pand    mm3, mm1                ; mm3 = (*psrc <= 0)*-2* *psrc
 417                 movq    mm1, mm0        ; mm1 = *psrc
 418                 paddw   mm1, mm3        ; mm1 = (*psrc <= 0)*-2* *psrc + *psrc = abs(*psrc)
 419
 420
 421                 paddw   mm1, mm1                ; mm1 *= 2;
 422                 paddw   mm1, mm7                ; mm1 += 1
 423                 pmullw  mm1, [edx]              ; mm1 = (val*2+1) * *quant_mat (low 16 bits only)
 424                 add             edx, 8
 425                 psraw   mm1, 5                  ; mm1 = ((val*2+1) * *quant_mat)/32
 426
 427                 ;; Now that nasty mis-match control
 428
 429                 movq    mm3, mm1
 430                 pand    mm3, mm7
 431                 pxor    mm3, mm7                ; mm3 = ~(val&1) (in the low bits, others 0)
 432                 movq    mm4, mm1
 433                 pcmpeqw mm4, mm5                ; mm4 = (val == 0)
 434                 pxor    mm4, mm7                ;  Low bits now (val != 0)
 435                 pand    mm3, mm4                ; mm3 =  (~(val&1))&(val!=0)
 436
 437                 psubw   mm1, mm3                ; mm1 -= (~(val&1))&(val!=0)
 438                 ;; Clamp at 2047 without pminsw: res + (0x7fff-2047) saturates to 0x7fff iff res > 2047
 439                 paddsw  mm1, mm6                ; Will saturate if > 2047
 440                 psubw   mm1, mm6                ; 2047 if saturated... unchanged otherwise
 441
 442                 ;; Handle zero case and restoring sign
 443                 pand    mm1, mm2                ; Zero in the zero case
 444                 pxor    mm3, mm3
 445                 psubw   mm3, mm1                ;  mm3 = - res
 446                 paddw   mm3, mm3                ;  mm3 = - 2*res
 447                 pcmpgtw mm0, mm5                ;  mm0 = *psrc > 0
 448                 pcmpeqw mm0, mm5                ;  inverted: mm0 = *psrc <= 0
 449                 pand    mm3, mm0                ;  mm3 = *psrc <= 0 ? -2 * res :         0
 450                 paddw   mm1, mm3                ;  mm1 = samesign(*psrc,res)
 451                 movq    [esi], mm1
 452                 add             esi,8
 453
 454                 sub             eax, 4                  ; four coefficients done per pass
 455                 jnz             near iquant_loop
 456
 457                 pop     edx
 458                 pop edi
 459                 pop esi
 460                 pop eax
 461
 462                 pop ebp                 ; restore frame pointer
 463
 464                 emms                    ; clear mmx registers
 465                 ret
 466
 467
 468
 469 ;;;  int32_t quant_weight_coeff_sum_mmx(int16_t *src, int16_t *i_quant_mat)
 470 ;;; Simply add up the sum of coefficients weighted
 471 ;;; by their quantisation coefficients:
 472 ;;; eax = sum over 64 words of abs(src[i]) * i_quant_mat[i] (signed 16x16->32 products, 32-bit adds)
 473 ;;; ecx - quad counter (16 quads of 4 words; saved/restored)
 474 ;;; edi - src
 475 ;;; esi - i_quant_mat
 476 ;;; eax - result
 477
 478                 ;; MMX Register usage
 479                 ;; mm7 = 0
 480                 ;; mm6 = accumulator: two 32-bit partial sums
 481                 ;; mm0..mm3 = per-quad temporaries
 482
 483 global quant_weight_coeff_sum_mmx
 484 align 32
 485 quant_weight_coeff_sum_mmx:
 486         push ebp                                ; save frame pointer
 487         mov ebp, esp            ; link
 488
 489         push ecx
 490         push esi
 491         push edi
 492
 493         mov edi, [ebp+8]        ; get psrc
 494         mov esi, [ebp+12]       ; get piqm
 495
 496         mov ecx, 16                     ; 16 coefficient / quantiser quads to process...
 497         pxor mm6, mm6           ; Accumulator
 498         pxor mm7, mm7           ; Zero
 499 quantsum:
 500         movq    mm0, [edi]         ; mm0 = four coefficients
 501         movq    mm2, [esi]         ; mm2 = four matching i_quant_mat words
 502
 503         ;;
 504         ;;      Compute absolute value of coefficients...
 505         ;;
 506         movq    mm1, mm7
 507         pcmpgtw mm1, mm0   ; (mm0 < 0 )
 508         movq    mm3, mm0
 509         psllw   mm3, 1     ; 2*mm0
 510         pand    mm3, mm1   ; 2*mm0 * (mm0 < 0)
 511         psubw   mm0, mm3   ; mm0 = abs(mm0)
 512
 513
 514         ;;
 515         ;; Compute the low and high words of the result....
 516         ;;
 517         movq    mm1, mm0
 518         pmullw  mm0, mm2           ; mm0 = low 16 bits of each product
 519         add             edi, 8
 520         add             esi, 8
 521         pmulhw  mm1, mm2           ; mm1 = high 16 bits of each (signed) product
 522
 523         movq      mm3, mm0
 524         punpcklwd  mm3, mm1        ; mm3 = 32-bit products of lanes 0,1
 525         punpckhwd  mm0, mm1        ; mm0 = 32-bit products of lanes 2,3
 526         paddd      mm6, mm3
 527         paddd      mm6, mm0
 528
 529
 530         sub ecx,        1
 531         jnz   quantsum
 532
 533         movd   eax, mm6            ; low partial sum
 534         psrlq  mm6, 32
 535         movd   ecx, mm6            ; high partial sum
 536         add    eax, ecx            ; eax = total
 537
 538         pop edi
 539         pop esi
 540         pop ecx
 541
 542         pop ebp                 ; restore frame pointer
 543
 544         emms                    ; clear mmx registers
 545         ret
 546
 547