vp8/common/x86/postproc_mmx.c

   1 /*
   2  *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
   3  *
   4  *  Use of this source code is governed by a BSD-style license
   5  *  that can be found in the LICENSE file in the root of the source
   6  *  tree. An additional intellectual property rights grant can be found
   7  *  in the file PATENTS.  All contributing project authors may
   8  *  be found in the AUTHORS file in the root of the source tree.
   9  */
  10
  11
  12 #include <math.h>
  13 #include <stdlib.h>
  14 #include "vpx_scale/yv12config.h"
  15 #include "pragmas.h"
  16
  17 #define VP8_FILTER_WEIGHT 128
  18 #define VP8_FILTER_SHIFT  7
  19
  20
  21
  22 /* static constants */
  23 __declspec(align(16))
  24 const static short  Blur[48] =
  25 {
  26
  27     16, 16, 16, 16, 16, 16, 16, 16,
  28     16, 16, 16, 16, 16, 16, 16, 16,
  29     64, 64, 64, 64, 64, 64, 64, 64,
  30     16, 16, 16, 16, 16, 16, 16, 16,
  31     16, 16, 16, 16, 16, 16, 16, 16,
  32     0,  0,  0,  0,  0,  0,  0,  0,
  33
  34 };
  35 #define RD  __declspec(align(16)) __int64 rd  = 0x0040004000400040;
  36 #define R4D2 __declspec(align(16)) __int64 rd42[2] = {0x0004000400040004,0x0004000400040004};
  37
  38 #ifndef RELOCATEABLE
  39 const static RD;
  40 const static R4D2;
  41 #endif
  42
  43
  44 /* external references */
  45 extern double vp8_gaussian(double sigma, double mu, double x);
  46 extern short vp8_rv[];
  47 extern int vp8_q2mbl(int x) ;
  48
  49
  50
  51 void vp8_post_proc_down_and_across_mmx
  52 (
  53     unsigned char *src_ptr,
  54     unsigned char *dst_ptr,
  55     int src_pixels_per_line,
  56     int dst_pixels_per_line,
  57     int rows,
  58     int cols,
  59     int flimit
  60 )
  61 {
  62 #ifdef RELOCATEABLE
  63     RD
  64     R4D2
  65 #endif
  66
  67     __asm
  68     {
  69         push        ebx
  70         lea         ebx, Blur
  71         movd        mm2, flimit
  72         punpcklwd   mm2, mm2
  73         punpckldq   mm2, mm2
  74
  75         mov         esi,        src_ptr
  76         mov         edi,        dst_ptr
  77
  78         mov         ecx, DWORD PTR rows
  79         mov         eax, src_pixels_per_line ;
  80         destination pitch?
  81         pxor        mm0, mm0              ;
  82         mm0 = 00000000
  83
  84         nextrow:
  85
  86         xor         edx,        edx       ;
  87
  88         clear out edx for use as loop counter
  89         nextcol:
  90
  91         pxor        mm7, mm7              ;
  92
  93     mm7 = 00000000
  94     movq        mm6, [ebx + 32 ]      ;
  95         mm6 = kernel 2 taps
  96         movq        mm3, [esi]            ;
  97         mm4 = r0 p0..p7
  98         punpcklbw   mm3, mm0              ;
  99         mm3 = p0..p3
 100         movq        mm1, mm3              ;
 101         mm1 = p0..p3
 102         pmullw      mm3, mm6              ;
 103         mm3 *= kernel 2 modifiers
 104
 105         movq        mm6, [ebx + 48]       ;
 106         mm6 = kernel 3 taps
 107         movq        mm5, [esi + eax]      ;
 108         mm4 = r1 p0..p7
 109         punpcklbw   mm5, mm0              ;
 110         mm5 = r1 p0..p3
 111         pmullw      mm6, mm5              ;
 112         mm6 *= p0..p3 * kernel 3 modifiers
 113         paddusw     mm3, mm6              ;
 114         mm3 += mm6
 115
 116         ;
 117         thresholding
 118         movq        mm7, mm1              ;
 119         mm7 = r0 p0..p3
 120         psubusw     mm7, mm5              ;
 121         mm7 = r0 p0..p3 - r1 p0..p3
 122         psubusw     mm5, mm1              ;
 123         mm5 = r1 p0..p3 - r0 p0..p3
 124         paddusw     mm7, mm5              ;
 125         mm7 = abs(r0 p0..p3 - r1 p0..p3)
 126         pcmpgtw     mm7, mm2
 127
 128         movq        mm6, [ebx + 64 ]      ;
 129         mm6 = kernel 4 modifiers
 130         movq        mm5, [esi + 2*eax]    ;
 131         mm4 = r2 p0..p7
 132         punpcklbw   mm5, mm0              ;
 133         mm5 = r2 p0..p3
 134         pmullw      mm6, mm5              ;
 135         mm5 *= kernel 4 modifiers
 136         paddusw     mm3, mm6              ;
 137         mm3 += mm5
 138
 139         ;
 140         thresholding
 141         movq        mm6, mm1              ;
 142         mm6 = r0 p0..p3
 143         psubusw     mm6, mm5              ;
 144         mm6 = r0 p0..p3 - r2 p0..p3
 145         psubusw     mm5, mm1              ;
 146         mm5 = r2 p0..p3 - r2 p0..p3
 147         paddusw     mm6, mm5              ;
 148         mm6 = abs(r0 p0..p3 - r2 p0..p3)
 149         pcmpgtw     mm6, mm2
 150         por         mm7, mm6              ;
 151         accumulate thresholds
 152
 153
 154         neg         eax
 155         movq        mm6, [ebx ]           ;
 156         kernel 0 taps
 157         movq        mm5, [esi+2*eax]      ;
 158         mm4 = r-2 p0..p7
 159         punpcklbw   mm5, mm0              ;
 160         mm5 = r-2 p0..p3
 161         pmullw      mm6, mm5              ;
 162         mm5 *= kernel 0 modifiers
 163         paddusw     mm3, mm6              ;
 164         mm3 += mm5
 165
 166         ;
 167         thresholding
 168         movq        mm6, mm1              ;
 169         mm6 = r0 p0..p3
 170         psubusw     mm6, mm5              ;
 171         mm6 = p0..p3 - r-2 p0..p3
 172         psubusw     mm5, mm1              ;
 173         mm5 = r-2 p0..p3 - p0..p3
 174         paddusw     mm6, mm5              ;
 175         mm6 = abs(r0 p0..p3 - r-2 p0..p3)
 176         pcmpgtw     mm6, mm2
 177         por         mm7, mm6              ;
 178         accumulate thresholds
 179
 180         movq        mm6, [ebx + 16]       ;
 181         kernel 1 taps
 182         movq        mm4, [esi+eax]        ;
 183         mm4 = r-1 p0..p7
 184         punpcklbw   mm4, mm0              ;
 185         mm4 = r-1 p0..p3
 186         pmullw      mm6, mm4              ;
 187         mm4 *= kernel 1 modifiers.
 188         paddusw     mm3, mm6              ;
 189         mm3 += mm5
 190
 191         ;
 192         thresholding
 193         movq        mm6, mm1              ;
 194         mm6 = r0 p0..p3
 195         psubusw     mm6, mm4              ;
 196         mm6 = p0..p3 - r-2 p0..p3
 197         psubusw     mm4, mm1              ;
 198         mm5 = r-1 p0..p3 - p0..p3
 199         paddusw     mm6, mm4              ;
 200         mm6 = abs(r0 p0..p3 - r-1 p0..p3)
 201         pcmpgtw     mm6, mm2
 202         por         mm7, mm6              ;
 203         accumulate thresholds
 204
 205
 206         paddusw     mm3, rd               ;
 207         mm3 += round value
 208         psraw       mm3, VP8_FILTER_SHIFT     ;
 209         mm3 /= 128
 210
 211         pand        mm1, mm7              ;
 212         mm1 select vals > thresh from source
 213         pandn       mm7, mm3              ;
 214         mm7 select vals < thresh from blurred result
 215         paddusw     mm1, mm7              ;
 216         combination
 217
 218         packuswb    mm1, mm0              ;
 219         pack to bytes
 220
 221         movd        [edi], mm1            ;
 222         neg         eax                   ;
 223         pitch is positive
 224
 225
 226         add         esi, 4
 227         add         edi, 4
 228         add         edx, 4
 229
 230         cmp         edx, cols
 231         jl          nextcol
 232         // done with the all cols, start the across filtering in place
 233         sub         esi, edx
 234         sub         edi, edx
 235
 236
 237         push        eax
 238         xor         edx,    edx
 239         mov         eax,    [edi-4];
 240
 241         acrossnextcol:
 242         pxor        mm7, mm7              ;
 243         mm7 = 00000000
 244         movq        mm6, [ebx + 32 ]      ;
 245         movq        mm4, [edi+edx]        ;
 246         mm4 = p0..p7
 247         movq        mm3, mm4              ;
 248         mm3 = p0..p7
 249         punpcklbw   mm3, mm0              ;
 250         mm3 = p0..p3
 251         movq        mm1, mm3              ;
 252         mm1 = p0..p3
 253         pmullw      mm3, mm6              ;
 254         mm3 *= kernel 2 modifiers
 255
 256         movq        mm6, [ebx + 48]
 257         psrlq       mm4, 8                ;
 258         mm4 = p1..p7
 259         movq        mm5, mm4              ;
 260         mm5 = p1..p7
 261         punpcklbw   mm5, mm0              ;
 262         mm5 = p1..p4
 263         pmullw      mm6, mm5              ;
 264         mm6 *= p1..p4 * kernel 3 modifiers
 265         paddusw     mm3, mm6              ;
 266         mm3 += mm6
 267
 268         ;
 269         thresholding
 270         movq        mm7, mm1              ;
 271         mm7 = p0..p3
 272         psubusw     mm7, mm5              ;
 273         mm7 = p0..p3 - p1..p4
 274         psubusw     mm5, mm1              ;
 275         mm5 = p1..p4 - p0..p3
 276         paddusw     mm7, mm5              ;
 277         mm7 = abs(p0..p3 - p1..p4)
 278         pcmpgtw     mm7, mm2
 279
 280         movq        mm6, [ebx + 64 ]
 281         psrlq       mm4, 8                ;
 282         mm4 = p2..p7
 283         movq        mm5, mm4              ;
 284         mm5 = p2..p7
 285         punpcklbw   mm5, mm0              ;
 286         mm5 = p2..p5
 287         pmullw      mm6, mm5              ;
 288         mm5 *= kernel 4 modifiers
 289         paddusw     mm3, mm6              ;
 290         mm3 += mm5
 291
 292         ;
 293         thresholding
 294         movq        mm6, mm1              ;
 295         mm6 = p0..p3
 296         psubusw     mm6, mm5              ;
 297         mm6 = p0..p3 - p1..p4
 298         psubusw     mm5, mm1              ;
 299         mm5 = p1..p4 - p0..p3
 300         paddusw     mm6, mm5              ;
 301         mm6 = abs(p0..p3 - p1..p4)
 302         pcmpgtw     mm6, mm2
 303         por         mm7, mm6              ;
 304         accumulate thresholds
 305
 306
 307         movq        mm6, [ebx ]
 308         movq        mm4, [edi+edx-2]      ;
 309         mm4 = p-2..p5
 310         movq        mm5, mm4              ;
 311         mm5 = p-2..p5
 312         punpcklbw   mm5, mm0              ;
 313         mm5 = p-2..p1
 314         pmullw      mm6, mm5              ;
 315         mm5 *= kernel 0 modifiers
 316         paddusw     mm3, mm6              ;
 317         mm3 += mm5
 318
 319         ;
 320         thresholding
 321         movq        mm6, mm1              ;
 322         mm6 = p0..p3
 323         psubusw     mm6, mm5              ;
 324         mm6 = p0..p3 - p1..p4
 325         psubusw     mm5, mm1              ;
 326         mm5 = p1..p4 - p0..p3
 327         paddusw     mm6, mm5              ;
 328         mm6 = abs(p0..p3 - p1..p4)
 329         pcmpgtw     mm6, mm2
 330         por         mm7, mm6              ;
 331         accumulate thresholds
 332
 333         movq        mm6, [ebx + 16]
 334         psrlq       mm4, 8                ;
 335         mm4 = p-1..p5
 336         punpcklbw   mm4, mm0              ;
 337         mm4 = p-1..p2
 338         pmullw      mm6, mm4              ;
 339         mm4 *= kernel 1 modifiers.
 340         paddusw     mm3, mm6              ;
 341         mm3 += mm5
 342
 343         ;
 344         thresholding
 345         movq        mm6, mm1              ;
 346         mm6 = p0..p3
 347         psubusw     mm6, mm4              ;
 348         mm6 = p0..p3 - p1..p4
 349         psubusw     mm4, mm1              ;
 350         mm5 = p1..p4 - p0..p3
 351         paddusw     mm6, mm4              ;
 352         mm6 = abs(p0..p3 - p1..p4)
 353         pcmpgtw     mm6, mm2
 354         por         mm7, mm6              ;
 355         accumulate thresholds
 356
 357         paddusw     mm3, rd               ;
 358         mm3 += round value
 359         psraw       mm3, VP8_FILTER_SHIFT     ;
 360         mm3 /= 128
 361
 362         pand        mm1, mm7              ;
 363         mm1 select vals > thresh from source
 364         pandn       mm7, mm3              ;
 365         mm7 select vals < thresh from blurred result
 366         paddusw     mm1, mm7              ;
 367         combination
 368
 369         packuswb    mm1, mm0              ;
 370         pack to bytes
 371         mov         DWORD PTR [edi+edx-4],  eax   ;
 372         store previous four bytes
 373         movd        eax,    mm1
 374
 375         add         edx, 4
 376         cmp         edx, cols
 377         jl          acrossnextcol;
 378
 379         mov         DWORD PTR [edi+edx-4],  eax
 380         pop         eax
 381
 382         // done with this rwo
 383         add         esi, eax               ;
 384         next line
 385         mov         eax, dst_pixels_per_line ;
 386         destination pitch?
 387         add         edi, eax               ;
 388         next destination
 389         mov         eax, src_pixels_per_line ;
 390         destination pitch?
 391
 392         dec         ecx                   ;
 393         decrement count
 394         jnz         nextrow               ;
 395         next row
 396         pop         ebx
 397
 398     }
 399 }
 400
 401
 402
 403 void vp8_post_proc_down_and_across_xmm
 404 (
 405     unsigned char *src_ptr,
 406     unsigned char *dst_ptr,
 407     int src_pixels_per_line,
 408     int dst_pixels_per_line,
 409     int rows,
 410     int cols,
 411     int flimit
 412 )
 413 {
 414 #ifdef RELOCATEABLE
 415     R4D2
 416 #endif
 417
 418     __asm
 419     {
 420         movd        xmm2,       flimit
 421         punpcklwd   xmm2,       xmm2
 422         punpckldq   xmm2,       xmm2
 423         punpcklqdq  xmm2,       xmm2
 424
 425         mov         esi,        src_ptr
 426         mov         edi,        dst_ptr
 427
 428         mov         ecx,        DWORD PTR rows
 429         mov         eax,        src_pixels_per_line ;
 430         destination pitch?
 431         pxor        xmm0,       xmm0              ;
 432         mm0 = 00000000
 433
 434         nextrow:
 435
 436         xor         edx,        edx       ;
 437
 438         clear out edx for use as loop counter
 439         nextcol:
 440         movq        xmm3,       QWORD PTR [esi]         ;
 441
 442         mm4 = r0 p0..p7
 443         punpcklbw   xmm3,       xmm0                    ;
 444         mm3 = p0..p3
 445         movdqa      xmm1,       xmm3                    ;
 446         mm1 = p0..p3
 447         psllw       xmm3,       2                       ;
 448
 449         movq        xmm5,       QWORD PTR [esi + eax]   ;
 450         mm4 = r1 p0..p7
 451         punpcklbw   xmm5,       xmm0                    ;
 452         mm5 = r1 p0..p3
 453         paddusw     xmm3,       xmm5                    ;
 454         mm3 += mm6
 455
 456         ;
 457         thresholding
 458         movdqa      xmm7,       xmm1                    ;
 459         mm7 = r0 p0..p3
 460         psubusw     xmm7,       xmm5                    ;
 461         mm7 = r0 p0..p3 - r1 p0..p3
 462         psubusw     xmm5,       xmm1                    ;
 463         mm5 = r1 p0..p3 - r0 p0..p3
 464         paddusw     xmm7,       xmm5                    ;
 465         mm7 = abs(r0 p0..p3 - r1 p0..p3)
 466         pcmpgtw     xmm7,       xmm2
 467
 468         movq        xmm5,       QWORD PTR [esi + 2*eax] ;
 469         mm4 = r2 p0..p7
 470         punpcklbw   xmm5,       xmm0                    ;
 471         mm5 = r2 p0..p3
 472         paddusw     xmm3,       xmm5                    ;
 473         mm3 += mm5
 474
 475         ;
 476         thresholding
 477         movdqa      xmm6,       xmm1                    ;
 478         mm6 = r0 p0..p3
 479         psubusw     xmm6,       xmm5                    ;
 480         mm6 = r0 p0..p3 - r2 p0..p3
 481         psubusw     xmm5,       xmm1                    ;
 482         mm5 = r2 p0..p3 - r2 p0..p3
 483         paddusw     xmm6,       xmm5                    ;
 484         mm6 = abs(r0 p0..p3 - r2 p0..p3)
 485         pcmpgtw     xmm6,       xmm2
 486         por         xmm7,       xmm6                    ;
 487         accumulate thresholds
 488
 489
 490         neg         eax
 491         movq        xmm5,       QWORD PTR [esi+2*eax]   ;
 492         mm4 = r-2 p0..p7
 493         punpcklbw   xmm5,       xmm0                    ;
 494         mm5 = r-2 p0..p3
 495         paddusw     xmm3,       xmm5                    ;
 496         mm3 += mm5
 497
 498         ;
 499         thresholding
 500         movdqa      xmm6,       xmm1                    ;
 501         mm6 = r0 p0..p3
 502         psubusw     xmm6,       xmm5                    ;
 503         mm6 = p0..p3 - r-2 p0..p3
 504         psubusw     xmm5,       xmm1                    ;
 505         mm5 = r-2 p0..p3 - p0..p3
 506         paddusw     xmm6,       xmm5                    ;
 507         mm6 = abs(r0 p0..p3 - r-2 p0..p3)
 508         pcmpgtw     xmm6,       xmm2
 509         por         xmm7,       xmm6                    ;
 510         accumulate thresholds
 511
 512         movq        xmm4,       QWORD PTR [esi+eax]     ;
 513         mm4 = r-1 p0..p7
 514         punpcklbw   xmm4,       xmm0                    ;
 515         mm4 = r-1 p0..p3
 516         paddusw     xmm3,       xmm4                    ;
 517         mm3 += mm5
 518
 519         ;
 520         thresholding
 521         movdqa      xmm6,       xmm1                    ;
 522         mm6 = r0 p0..p3
 523         psubusw     xmm6,       xmm4                    ;
 524         mm6 = p0..p3 - r-2 p0..p3
 525         psubusw     xmm4,       xmm1                    ;
 526         mm5 = r-1 p0..p3 - p0..p3
 527         paddusw     xmm6,       xmm4                    ;
 528         mm6 = abs(r0 p0..p3 - r-1 p0..p3)
 529         pcmpgtw     xmm6,       xmm2
 530         por         xmm7,       xmm6                    ;
 531         accumulate thresholds
 532
 533
 534         paddusw     xmm3,       rd42                    ;
 535         mm3 += round value
 536         psraw       xmm3,       3                       ;
 537         mm3 /= 8
 538
 539         pand        xmm1,       xmm7                    ;
 540         mm1 select vals > thresh from source
 541         pandn       xmm7,       xmm3                    ;
 542         mm7 select vals < thresh from blurred result
 543         paddusw     xmm1,       xmm7                    ;
 544         combination
 545
 546         packuswb    xmm1,       xmm0                    ;
 547         pack to bytes
 548         movq        QWORD PTR [edi], xmm1             ;
 549
 550         neg         eax                   ;
 551         pitch is positive
 552         add         esi,        8
 553         add         edi,        8
 554
 555         add         edx,        8
 556         cmp         edx,        cols
 557
 558         jl          nextcol
 559
 560         // done with the all cols, start the across filtering in place
 561         sub         esi,        edx
 562         sub         edi,        edx
 563
 564         xor         edx,        edx
 565         movq        mm0,        QWORD PTR [edi-8];
 566
 567         acrossnextcol:
 568         movq        xmm7,       QWORD PTR [edi +edx -2]
 569         movd        xmm4,       DWORD PTR [edi +edx +6]
 570
 571         pslldq      xmm4,       8
 572         por         xmm4,       xmm7
 573
 574         movdqa      xmm3,       xmm4
 575         psrldq      xmm3,       2
 576         punpcklbw   xmm3,       xmm0              ;
 577         mm3 = p0..p3
 578         movdqa      xmm1,       xmm3              ;
 579         mm1 = p0..p3
 580         psllw       xmm3,       2
 581
 582
 583         movdqa      xmm5,       xmm4
 584         psrldq      xmm5,       3
 585         punpcklbw   xmm5,       xmm0              ;
 586         mm5 = p1..p4
 587         paddusw     xmm3,       xmm5              ;
 588         mm3 += mm6
 589
 590         ;
 591         thresholding
 592         movdqa      xmm7,       xmm1              ;
 593         mm7 = p0..p3
 594         psubusw     xmm7,       xmm5              ;
 595         mm7 = p0..p3 - p1..p4
 596         psubusw     xmm5,       xmm1              ;
 597         mm5 = p1..p4 - p0..p3
 598         paddusw     xmm7,       xmm5              ;
 599         mm7 = abs(p0..p3 - p1..p4)
 600         pcmpgtw     xmm7,       xmm2
 601
 602         movdqa      xmm5,       xmm4
 603         psrldq      xmm5,       4
 604         punpcklbw   xmm5,       xmm0              ;
 605         mm5 = p2..p5
 606         paddusw     xmm3,       xmm5              ;
 607         mm3 += mm5
 608
 609         ;
 610         thresholding
 611         movdqa      xmm6,       xmm1              ;
 612         mm6 = p0..p3
 613         psubusw     xmm6,       xmm5              ;
 614         mm6 = p0..p3 - p1..p4
 615         psubusw     xmm5,       xmm1              ;
 616         mm5 = p1..p4 - p0..p3
 617         paddusw     xmm6,       xmm5              ;
 618         mm6 = abs(p0..p3 - p1..p4)
 619         pcmpgtw     xmm6,       xmm2
 620         por         xmm7,       xmm6              ;
 621         accumulate thresholds
 622
 623
 624         movdqa      xmm5,       xmm4              ;
 625         mm5 = p-2..p5
 626         punpcklbw   xmm5,       xmm0              ;
 627         mm5 = p-2..p1
 628         paddusw     xmm3,       xmm5              ;
 629         mm3 += mm5
 630
 631         ;
 632         thresholding
 633         movdqa      xmm6,       xmm1              ;
 634         mm6 = p0..p3
 635         psubusw     xmm6,       xmm5              ;
 636         mm6 = p0..p3 - p1..p4
 637         psubusw     xmm5,       xmm1              ;
 638         mm5 = p1..p4 - p0..p3
 639         paddusw     xmm6,       xmm5              ;
 640         mm6 = abs(p0..p3 - p1..p4)
 641         pcmpgtw     xmm6,       xmm2
 642         por         xmm7,       xmm6              ;
 643         accumulate thresholds
 644
 645         psrldq      xmm4,       1                   ;
 646         mm4 = p-1..p5
 647         punpcklbw   xmm4,       xmm0              ;
 648         mm4 = p-1..p2
 649         paddusw     xmm3,       xmm4              ;
 650         mm3 += mm5
 651
 652         ;
 653         thresholding
 654         movdqa      xmm6,       xmm1              ;
 655         mm6 = p0..p3
 656         psubusw     xmm6,       xmm4              ;
 657         mm6 = p0..p3 - p1..p4
 658         psubusw     xmm4,       xmm1              ;
 659         mm5 = p1..p4 - p0..p3
 660         paddusw     xmm6,       xmm4              ;
 661         mm6 = abs(p0..p3 - p1..p4)
 662         pcmpgtw     xmm6,       xmm2
 663         por         xmm7,       xmm6              ;
 664         accumulate thresholds
 665
 666         paddusw     xmm3,       rd42              ;
 667         mm3 += round value
 668         psraw       xmm3,       3                 ;
 669         mm3 /= 8
 670
 671         pand        xmm1,       xmm7              ;
 672         mm1 select vals > thresh from source
 673         pandn       xmm7,       xmm3              ;
 674         mm7 select vals < thresh from blurred result
 675         paddusw     xmm1,       xmm7              ;
 676         combination
 677
 678         packuswb    xmm1,       xmm0              ;
 679         pack to bytes
 680         movq        QWORD PTR [edi+edx-8],  mm0   ;
 681         store previous four bytes
 682         movdq2q     mm0,        xmm1
 683
 684         add         edx,        8
 685         cmp         edx,        cols
 686         jl          acrossnextcol;
 687
 688         // last 8 pixels
 689         movq        QWORD PTR [edi+edx-8],  mm0
 690
 691         // done with this rwo
 692         add         esi, eax               ;
 693         next line
 694         mov         eax, dst_pixels_per_line ;
 695         destination pitch?
 696         add         edi, eax               ;
 697         next destination
 698         mov         eax, src_pixels_per_line ;
 699         destination pitch?
 700
 701         dec         ecx                   ;
 702         decrement count
 703         jnz         nextrow               ;
 704         next row
 705     }
 706 }
 707
 708
 709 void vp8_mbpost_proc_down_mmx(unsigned char *dst, int pitch, int rows, int cols, int flimit)
 710 {
 711     int c, i;
 712     __declspec(align(16))
 713     int flimit2[2];
 714     __declspec(align(16))
 715     unsigned char d[16][8];
 716
 717     flimit = vp8_q2mbl(flimit);
 718
 719     for (i = 0; i < 2; i++)
 720         flimit2[i] = flimit;
 721
 722     rows += 8;
 723
 724     for (c = 0; c < cols; c += 4)
 725     {
 726         unsigned char *s = &dst[c];
 727
 728         __asm
 729         {
 730             mov         esi,        s           ;
 731             pxor        mm0,        mm0     ;
 732
 733             mov         eax,        pitch       ;
 734             neg         eax                                     // eax = -pitch
 735
 736             lea         esi,        [esi + eax*8];              // edi = s[-pitch*8]
 737             neg         eax
 738
 739
 740             pxor        mm5,        mm5
 741             pxor        mm6,        mm6     ;
 742
 743             pxor        mm7,        mm7     ;
 744             mov         edi,        esi
 745
 746             mov         ecx,        15          ;
 747
 748             loop_initvar:
 749             movd        mm1,        DWORD PTR [edi];
 750             punpcklbw   mm1,        mm0     ;
 751
 752             paddw       mm5,        mm1     ;
 753             pmullw      mm1,        mm1     ;
 754
 755             movq        mm2,        mm1     ;
 756             punpcklwd   mm1,        mm0     ;
 757
 758             punpckhwd   mm2,        mm0     ;
 759             paddd       mm6,        mm1     ;
 760
 761             paddd       mm7,        mm2     ;
 762             lea         edi,        [edi+eax]   ;
 763
 764             dec         ecx
 765             jne         loop_initvar
 766             //save the var and sum
 767             xor         edx,        edx
 768             loop_row:
 769             movd        mm1,        DWORD PTR [esi]     // [s-pitch*8]
 770             movd        mm2,        DWORD PTR [edi]     // [s+pitch*7]
 771
 772             punpcklbw   mm1,        mm0
 773             punpcklbw   mm2,        mm0
 774
 775             paddw       mm5,        mm2
 776             psubw       mm5,        mm1
 777
 778             pmullw      mm2,        mm2
 779             movq        mm4,        mm2
 780
 781             punpcklwd   mm2,        mm0
 782             punpckhwd   mm4,        mm0
 783
 784             paddd       mm6,        mm2
 785             paddd       mm7,        mm4
 786
 787             pmullw      mm1,        mm1
 788             movq        mm2,        mm1
 789
 790             punpcklwd   mm1,        mm0
 791             psubd       mm6,        mm1
 792
 793             punpckhwd   mm2,        mm0
 794             psubd       mm7,        mm2
 795
 796
 797             movq        mm3,        mm6
 798             pslld       mm3,        4
 799
 800             psubd       mm3,        mm6
 801             movq        mm1,        mm5
 802
 803             movq        mm4,        mm5
 804             pmullw      mm1,        mm1
 805
 806             pmulhw      mm4,        mm4
 807             movq        mm2,        mm1
 808
 809             punpcklwd   mm1,        mm4
 810             punpckhwd   mm2,        mm4
 811
 812             movq        mm4,        mm7
 813             pslld       mm4,        4
 814
 815             psubd       mm4,        mm7
 816
 817             psubd       mm3,        mm1
 818             psubd       mm4,        mm2
 819
 820             psubd       mm3,        flimit2
 821             psubd       mm4,        flimit2
 822
 823             psrad       mm3,        31
 824             psrad       mm4,        31
 825
 826             packssdw    mm3,        mm4
 827             packsswb    mm3,        mm0
 828
 829             movd        mm1,        DWORD PTR [esi+eax*8]
 830
 831             movq        mm2,        mm1
 832             punpcklbw   mm1,        mm0
 833
 834             paddw       mm1,        mm5
 835             mov         ecx,        edx
 836
 837             and         ecx,        127
 838             movq        mm4,        vp8_rv[ecx*2]
 839
 840             paddw       mm1,        mm4
 841             //paddw     xmm1,       eight8s
 842             psraw       mm1,        4
 843
 844             packuswb    mm1,        mm0
 845             pand        mm1,        mm3
 846
 847             pandn       mm3,        mm2
 848             por         mm1,        mm3
 849
 850             and         ecx,        15
 851             movd        DWORD PTR  d[ecx*4], mm1
 852
 853             mov         ecx,        edx
 854             sub         ecx,        8
 855
 856             and         ecx,        15
 857             movd        mm1,        DWORD PTR d[ecx*4]
 858
 859             movd        [esi],      mm1
 860             lea         esi,        [esi+eax]
 861
 862             lea         edi,        [edi+eax]
 863             add         edx,        1
 864
 865             cmp         edx,        rows
 866             jl          loop_row
 867
 868         }
 869
 870     }
 871 }
 872
 873 void vp8_mbpost_proc_down_xmm(unsigned char *dst, int pitch, int rows, int cols, int flimit)
 874 {
 875     int c, i;
 876     __declspec(align(16))
 877     int flimit4[4];
 878     __declspec(align(16))
 879     unsigned char d[16][8];
 880
 881     flimit = vp8_q2mbl(flimit);
 882
 883     for (i = 0; i < 4; i++)
 884         flimit4[i] = flimit;
 885
 886     rows += 8;
 887
 888     for (c = 0; c < cols; c += 8)
 889     {
 890         unsigned char *s = &dst[c];
 891
 892         __asm
 893         {
 894             mov         esi,        s           ;
 895             pxor        xmm0,       xmm0        ;
 896
 897             mov         eax,        pitch       ;
 898             neg         eax                                     // eax = -pitch
 899
 900             lea         esi,        [esi + eax*8];              // edi = s[-pitch*8]
 901             neg         eax
 902
 903
 904             pxor        xmm5,       xmm5
 905             pxor        xmm6,       xmm6        ;
 906
 907             pxor        xmm7,       xmm7        ;
 908             mov         edi,        esi
 909
 910             mov         ecx,        15          ;
 911
 912             loop_initvar:
 913             movq        xmm1,       QWORD PTR [edi];
 914             punpcklbw   xmm1,       xmm0        ;
 915
 916             paddw       xmm5,       xmm1        ;
 917             pmullw      xmm1,       xmm1        ;
 918
 919             movdqa      xmm2,       xmm1        ;
 920             punpcklwd   xmm1,       xmm0        ;
 921
 922             punpckhwd   xmm2,       xmm0        ;
 923             paddd       xmm6,       xmm1        ;
 924
 925             paddd       xmm7,       xmm2        ;
 926             lea         edi,        [edi+eax]   ;
 927
 928             dec         ecx
 929             jne         loop_initvar
 930             //save the var and sum
 931             xor         edx,        edx
 932             loop_row:
 933             movq        xmm1,       QWORD PTR [esi]     // [s-pitch*8]
 934             movq        xmm2,       QWORD PTR [edi]     // [s+pitch*7]
 935
 936             punpcklbw   xmm1,       xmm0
 937             punpcklbw   xmm2,       xmm0
 938
 939             paddw       xmm5,       xmm2
 940             psubw       xmm5,       xmm1
 941
 942             pmullw      xmm2,       xmm2
 943             movdqa      xmm4,       xmm2
 944
 945             punpcklwd   xmm2,       xmm0
 946             punpckhwd   xmm4,       xmm0
 947
 948             paddd       xmm6,       xmm2
 949             paddd       xmm7,       xmm4
 950
 951             pmullw      xmm1,       xmm1
 952             movdqa      xmm2,       xmm1
 953
 954             punpcklwd   xmm1,       xmm0
 955             psubd       xmm6,       xmm1
 956
 957             punpckhwd   xmm2,       xmm0
 958             psubd       xmm7,       xmm2
 959
 960
 961             movdqa      xmm3,       xmm6
 962             pslld       xmm3,       4
 963
 964             psubd       xmm3,       xmm6
 965             movdqa      xmm1,       xmm5
 966
 967             movdqa      xmm4,       xmm5
 968             pmullw      xmm1,       xmm1
 969
 970             pmulhw      xmm4,       xmm4
 971             movdqa      xmm2,       xmm1
 972
 973             punpcklwd   xmm1,       xmm4
 974             punpckhwd   xmm2,       xmm4
 975
 976             movdqa      xmm4,       xmm7
 977             pslld       xmm4,       4
 978
 979             psubd       xmm4,       xmm7
 980
 981             psubd       xmm3,       xmm1
 982             psubd       xmm4,       xmm2
 983
 984             psubd       xmm3,       flimit4
 985             psubd       xmm4,       flimit4
 986
 987             psrad       xmm3,       31
 988             psrad       xmm4,       31
 989
 990             packssdw    xmm3,       xmm4
 991             packsswb    xmm3,       xmm0
 992
 993             movq        xmm1,       QWORD PTR [esi+eax*8]
 994
 995             movq        xmm2,       xmm1
 996             punpcklbw   xmm1,       xmm0
 997
 998             paddw       xmm1,       xmm5
 999             mov         ecx,        edx
1000
1001             and         ecx,        127
1002             movdqu      xmm4,       vp8_rv[ecx*2]
1003
1004             paddw       xmm1,       xmm4
1005             //paddw     xmm1,       eight8s
1006             psraw       xmm1,       4
1007
1008             packuswb    xmm1,       xmm0
1009             pand        xmm1,       xmm3
1010
1011             pandn       xmm3,       xmm2
1012             por         xmm1,       xmm3
1013
1014             and         ecx,        15
1015             movq        QWORD PTR  d[ecx*8], xmm1
1016
1017             mov         ecx,        edx
1018             sub         ecx,        8
1019
1020             and         ecx,        15
1021             movq        mm0,        d[ecx*8]
1022
1023             movq        [esi],      mm0
1024             lea         esi,        [esi+eax]
1025
1026             lea         edi,        [edi+eax]
1027             add         edx,        1
1028
1029             cmp         edx,        rows
1030             jl          loop_row
1031
1032         }
1033
1034     }
1035 }
1036 #if 0
1037 /****************************************************************************
1038  *
1039  *  ROUTINE       : vp8_plane_add_noise_wmt (this copy is compiled out by the
      *                  enclosing #if 0)
1040  *
1041  *  INPUTS        : unsigned char *Start    starting address of buffer to add gaussian
1042  *                                  noise to
1043  *                  unsigned int Width    width of plane
1044  *                  unsigned int Height   height of plane
1045  *                  int  Pitch    distance between subsequent lines of frame
1046  *                  int  q        quantizer used to determine amount of noise
1047  *                                  to add
      *                  int  a        added to sigma: sigma = a + .5 + .6*(63-q)/63
1048  *
1049  *  OUTPUTS       : None (the pixels at Start are modified in place).
1050  *
1051  *  RETURNS       : void.
1052  *
1053  *  FUNCTION      : adds gaussian noise to a plane of pixels
1054  *
1055  *  SPECIAL NOTES : Each row is processed in 16-byte groups and the loop body
      *                  always runs at least once, so whole 16-byte groups are
      *                  read and written even when Width is not a multiple of 16.
1056  *
1057  ****************************************************************************/
1058 void vp8_plane_add_noise_wmt(unsigned char *Start, unsigned int Width, unsigned int Height, int Pitch, int q, int a)
1059 {
1060     unsigned int i;
1061
1062     __declspec(align(16)) unsigned char blackclamp[16];    // 16 copies of k = -char_dist[0]
1063     __declspec(align(16)) unsigned char whiteclamp[16];    // 16 copies of k
1064     __declspec(align(16)) unsigned char bothclamp[16];     // 16 copies of 2*k
1065     char char_dist[300];    // noise values; entries 0..255 are sampled below
1066     char Rand[2048];        // random draws taken from char_dist
1067     double sigma;
1068 //    return;
1069     __asm emms              // empty the MMX state before the floating-point code that follows
1070     sigma = a + .5 + .6 * (63 - q) / 63.0;
1071
1072     // set up a lookup table of 256 entries that matches
1073     // a gaussian distribution with sigma determined by q and a.
1074     //
1075     {
1076         double i;           // shadows the outer unsigned i inside this block
1077         int next, j;
1078
1079         next = 0;
1080
1081         for (i = -32; i < 32; i++)
1082         {
1083             double g = 256 * vp8_gaussian(sigma, 0, 1.0 * i);
1084             int a = (int)(g + .5);      // rounded number of table entries given the value i (shadows parameter a)
1085
1086             if (a)
1087             {
                     // NOTE(review): next+j is not checked against the 300-entry
                     // array; this relies on the rounded counts totalling roughly 256.
1088                 for (j = 0; j < a; j++)
1089                 {
1090                     char_dist[next+j] = (char) i;
1091                 }
1092
1093                 next = next + j;
1094             }
1095
1096         }
1097
1098         for (next = next; next < 256; next++)    // pad whatever is left of the first 256 entries with zero noise
1099             char_dist[next] = 0;
1100
1101     }
1102
1103     for (i = 0; i < 2048; i++)
1104     {
1105         Rand[i] = char_dist[rand() & 0xff];     // draw one of the 256 table entries at random
1106     }
1107
         // char_dist[0] is the first value stored above, i.e. the lowest i that
         // received a nonzero count (or 0 if every count was zero).
1108     for (i = 0; i < 16; i++)
1109     {
1110         blackclamp[i] = -char_dist[0];
1111         whiteclamp[i] = -char_dist[0];
1112         bothclamp[i] = -2 * char_dist[0];
1113     }
1114
1115     for (i = 0; i < Height; i++)
1116     {
1117         unsigned char *Pos = Start + i * Pitch;     // start of row i
1118         char  *Ref = Rand + (rand() & 0xff);        // noise source for this row, at a random offset of 0..255
1119
             // NOTE(review): the loop reads 16 noise bytes per group starting at
             // Ref, so Rand (2048 bytes) must cover 255 + Width rounded up to 16;
             // nothing here enforces that.
1120         __asm
1121         {
1122             mov ecx, [Width]
1123             mov esi, Pos
1124             mov edi, Ref
1125             xor         eax, eax                   // eax = byte offset within the row
1126
1127             nextset:
1128             movdqu      xmm1, [esi+eax]        // get the source
1129
1130             psubusb     xmm1, blackclamp       // the three saturating ops leave each byte in [k, 255-k] (for 0 <= 2k <= 255)
1131             paddusb     xmm1, bothclamp
1132             psubusb     xmm1, whiteclamp
1133
1134             movdqu      xmm2, [edi+eax]        // get the noise for this line
1135             paddb       xmm1, xmm2             // add it in (wrapping byte add)
1136             movdqu      [esi+eax], xmm1        // store the result
1137
1138             add         eax, 16                // advance to the next 16 bytes of this row
1139
1140             cmp         eax, ecx               // note: jl is a signed compare, Width is unsigned
1141             jl          nextset
1142
1143
1144         }
1145
1146     }
1147 }
1148 #endif
1149 __declspec(align(16))
1150 static const int four8s[4] = { 8, 8, 8, 8};    // +8 in each dword lane: rounding term for the >> 4 in the filter below
     /*
      * vp8_mbpost_proc_across_ip_xmm
      *
      * Horizontal, in-place filter over `rows` rows of pixels, written as MSVC
      * inline SSE2 with the MMX registers mm0/mm1 used as a two-stage store queue.
      *
      * For each column c of a row the code maintains
      *     sum   = s[c-7] + ... + s[c+7]              (15 pixels)
      *     sumsq = s[c-7]^2 + ... + s[c+7]^2
      * and, where 15*sumsq - sum*sum < limit, replaces s[c] with
      * (s[c] + sum + 8) >> 4; elsewhere the pixel keeps its value.
      * `limit` is vp8_q2mbl(flimit).
      *
      *   src    - first pixel of the first row.
      *   pitch  - added to the row pointer after each row.
      *   rows   - number of rows processed.
      *   cols   - row width; the inner loop advances 4 columns per pass and
      *            runs while the column offset is below cols + 8.
      *   flimit - converted once with vp8_q2mbl(); the result is the threshold.
      *
      * Memory touched per row, as written:
      *   - reads start at s[-8] and run past the end of the row (each pass loads
      *     4 bytes from s[offset + 7]), so every row needs readable bytes on
      *     both sides;
      *   - the first two passes store the still-zero mm0 to s[-8..-1];
      *   - when cols is not a multiple of 4 the last store covers up to 3
      *     columns beyond cols.
      * NOTE(review): mm0/mm1 are used and no emms is issued in this function;
      * presumably callers clear the MMX state before running x87 code - confirm.
      */
1151 void vp8_mbpost_proc_across_ip_xmm(unsigned char *src, int pitch, int rows, int cols, int flimit)
1152 {
1153     int r, i;
1154     __declspec(align(16))
1155     int flimit4[4];             // threshold replicated into 4 dword lanes for psubd
1156     unsigned char *s = src;     // start of the current row
1157     int sumsq;
1158     int sum;
1159
1160
1161     flimit = vp8_q2mbl(flimit);
1162     flimit4[0] =
1163         flimit4[1] =
1164             flimit4[2] =
1165                 flimit4[3] = flimit;
1166
1167     for (r = 0; r < rows; r++)
1168     {
1169
1170
1171         sumsq = 0;
1172         sum = 0;
1173
             // Prime the window with s[-8..6]; the first pass of the asm loop
             // drops s[-8] and adds s[7], giving the window centred on column 0.
1174         for (i = -8; i <= 6; i++)
1175         {
1176             sumsq += s[i] * s[i];
1177             sum   += s[i];
1178         }
1179
1180         __asm
1181         {
1182             mov         eax,    sumsq
1183             movd        xmm7,   eax                     // xmm7 lane 0 = sumsq, upper lanes 0
1184
1185             mov         eax,    sum
1186             movd        xmm6,   eax                     // xmm6 lane 0 = sum, upper lanes 0
1187
1188             mov         esi,    s                       // esi = start of this row
1189             xor         ecx,    ecx                     // ecx = column offset
1190
1191             mov         edx,    cols
1192             add         edx,    8                       // edx = cols + 8, the loop bound
1193             pxor        mm0,    mm0                     // mm0/mm1 hold results waiting to be stored;
1194             pxor        mm1,    mm1                     // both start out as zero
1195
1196             pxor        xmm0,   xmm0                    // xmm0 = 0, used for unpacking and packing
1197             nextcol4:
1198
1199             movd        xmm1,   DWORD PTR [esi+ecx-8]   // -8 -7 -6 -5  (pixels leaving the window)
1200             movd        xmm2,   DWORD PTR [esi+ecx+7]   // +7 +8 +9 +10 (pixels entering the window)
1201
1202             punpcklbw   xmm1,   xmm0                    // expanding
1203             punpcklbw   xmm2,   xmm0                    // expanding
1204
1205             punpcklwd   xmm1,   xmm0                    // expanding to dwords
1206             punpcklwd   xmm2,   xmm0                    // expanding to dwords
1207
1208             psubd       xmm2,   xmm1                    // 7--8   8--7   9--6 10--5  (new - old: change in sum)
1209             paddd       xmm1,   xmm1                    // -8*2   -7*2   -6*2 -5*2
1210
1211             paddd       xmm1,   xmm2                    // 7+-8   8+-7   9+-6 10+-5  (new + old)
1212             pmaddwd     xmm1,   xmm2                    // (new+old)*(new-old) = new^2 - old^2 per lane: change in sumsq
1213
1214             paddd       xmm6,   xmm2                    // lane 0 = sum for column ecx
1215             paddd       xmm7,   xmm1                    // lane 0 = sumsq for column ecx
1216
1217             pshufd      xmm6,   xmm6,   0               // broadcast lane 0 to all four lanes
1218             pshufd      xmm7,   xmm7,   0               // broadcast lane 0 to all four lanes
1219
1220             psrldq      xmm1,       4                   // sumsq changes, lanes low to high: 8--7   9--6 10--5  0000
1221             psrldq      xmm2,       4                   // sum changes,   lanes low to high: 8--7   9--6 10--5  0000
1222
                 // The three pshufd/paddd groups below turn the broadcast value
                 // into running totals: lane k ends up holding the value for
                 // column ecx + k.
1223             pshufd      xmm3,   xmm1,   3               // 0000  8--7   8--7   8--7 (sumsq change)
1224             pshufd      xmm4,   xmm2,   3               // 0000  8--7   8--7   8--7 (sum change)
1225
1226             paddd       xmm6,   xmm4
1227             paddd       xmm7,   xmm3
1228
1229             pshufd      xmm3,   xmm1,   01011111b       // 0000  0000   9--6   9--6 (sumsq change)
1230             pshufd      xmm4,   xmm2,   01011111b       // 0000  0000   9--6   9--6 (sum change)
1231
1232             paddd       xmm7,   xmm3
1233             paddd       xmm6,   xmm4
1234
1235             pshufd      xmm3,   xmm1,   10111111b       // 0000  0000   0000  10--5 (sumsq change)
1236             pshufd      xmm4,   xmm2,   10111111b       // 0000  0000   0000  10--5 (sum change)
1237
1238             paddd       xmm7,   xmm3
1239             paddd       xmm6,   xmm4
1240
1241             movdqa      xmm3,   xmm6
1242             pmaddwd     xmm3,   xmm3                    // sum*sum per lane
1243
1244             movdqa      xmm5,   xmm7
1245             pslld       xmm5,   4                       // 16*sumsq
1246
1247             psubd       xmm5,   xmm7                    // 15*sumsq
1248             psubd       xmm5,   xmm3                    // 15*sumsq - sum*sum
1249
1250             psubd       xmm5,   flimit4
1251             psrad       xmm5,   31                      // all ones where 15*sumsq - sum*sum < limit, else 0
1252
1253             packssdw    xmm5,   xmm0
1254             packsswb    xmm5,   xmm0                    // mask narrowed to one byte per pixel (low 4 bytes)
1255
1256             movd        xmm1,   DWORD PTR [esi+ecx]     // the 4 pixels being filtered
1257             movq        xmm2,   xmm1                    // keep an untouched copy
1258
1259             punpcklbw   xmm1,   xmm0
1260             punpcklwd   xmm1,   xmm0
1261
1262             paddd       xmm1,   xmm6                    // pixel + sum
1263             paddd       xmm1,   four8s                  // + 8 for rounding
1264
1265             psrad       xmm1,   4                       // (pixel + sum + 8) >> 4
1266             packssdw    xmm1,   xmm0
1267
1268             packuswb    xmm1,   xmm0                    // back to bytes
1269             pand        xmm1,   xmm5                    // filtered value where the mask is set
1270
1271             pandn       xmm5,   xmm2                    // original value where the mask is clear
1272             por         xmm5,   xmm1                    // merged 4 output bytes
1273
                 // Stores lag the computation by two passes (8 columns): the
                 // bytes at [esi+ecx-8] were already loaded at the top of this
                 // pass, so overwriting them here cannot feed filtered pixels
                 // back into the window sums.
1274             movd        [esi+ecx-8],  mm0               // write the results computed two passes ago
1275             movq        mm0,    mm1                     // advance the queue
1276
1277             movdq2q     mm1,    xmm5                    // queue this pass's 4 output bytes
1278             psrldq      xmm7,   12                      // lane 3 (latest sumsq) becomes lane 0, upper lanes 0
1279
1280             psrldq      xmm6,   12                      // lane 3 (latest sum) becomes lane 0, upper lanes 0
1281             add         ecx,    4
1282
1283             cmp         ecx,    edx
1284             jl          nextcol4
1285
1286         }
1287         s += pitch;     // next row
1288     }
1289 }
1290
1291 #if 0
1292
1293 /****************************************************************************
1294  *
1295  *  ROUTINE       : vp8_plane_add_noise_mmx (this copy is compiled out by the
      *                  enclosing #if 0; the #else branch holds the other one)
1296  *
1297  *  INPUTS        : unsigned char *Start    starting address of buffer to add gaussian
1298  *                                  noise to
1299  *                  unsigned int Width    width of plane
1300  *                  unsigned int Height   height of plane
1301  *                  int  Pitch    distance between subsequent lines of frame
1302  *                  int  q        quantizer used to determine amount of noise
1303  *                                  to add
      *                  int  a        added to sigma: sigma = a + .5 + .6*(63-q)/63
1304  *
1305  *  OUTPUTS       : None (the pixels at Start are modified in place).
1306  *
1307  *  RETURNS       : void.
1308  *
1309  *  FUNCTION      : adds gaussian noise to a plane of pixels
1310  *
1311  *  SPECIAL NOTES : Each row is processed in 8-byte groups and the loop body
      *                  always runs at least once, so whole 8-byte groups are
      *                  read and written even when Width is not a multiple of 8.
      *                  MMX registers are used in the row loop and no emms
      *                  follows it.
1312  *
1313  ****************************************************************************/
1314 void vp8_plane_add_noise_mmx(unsigned char *Start, unsigned int Width, unsigned int Height, int Pitch, int q, int a)
1315 {
1316     unsigned int i;
1317     int Pitch4 = Pitch * 4;                             // not referenced again in this function
1318     const int noise_amount = 2;                         // only used to derive noise_adder
1319     const int noise_adder = 2 * noise_amount + 1;       // not referenced again in this function
1320
1321     __declspec(align(16)) unsigned char blackclamp[16];    // copies of k = -char_dist[0]
1322     __declspec(align(16)) unsigned char whiteclamp[16];    // copies of k
1323     __declspec(align(16)) unsigned char bothclamp[16];     // copies of 2*k
1324
1325     char char_dist[300];    // noise values; entries 0..255 are sampled below
1326     char Rand[2048];        // random draws taken from char_dist
1327
1328     double sigma;
1329     __asm emms              // empty the MMX state before the floating-point code that follows
1330     sigma = a + .5 + .6 * (63 - q) / 63.0;
1331
1332     // set up a lookup table of 256 entries that matches
1333     // a gaussian distribution with sigma determined by q and a.
1334     //
1335     {
1336         double i, sum = 0;  // i shadows the outer unsigned i; sum is never used
1337         int next, j;
1338
1339         next = 0;
1340
1341         for (i = -32; i < 32; i++)
1342         {
1343             int a = (int)(.5 + 256 * vp8_gaussian(sigma, 0, i));    // rounded number of table entries given the value i (shadows parameter a)
1344
1345             if (a)
1346             {
                     // NOTE(review): next+j is not checked against the 300-entry
                     // array; this relies on the rounded counts totalling roughly 256.
1347                 for (j = 0; j < a; j++)
1348                 {
1349                     char_dist[next+j] = (char) i;
1350                 }
1351
1352                 next = next + j;
1353             }
1354
1355         }
1356
1357         for (next = next; next < 256; next++)    // pad whatever is left of the first 256 entries with zero noise
1358             char_dist[next] = 0;
1359
1360     }
1361
1362     for (i = 0; i < 2048; i++)
1363     {
1364         Rand[i] = char_dist[rand() & 0xff];     // draw one of the 256 table entries at random
1365     }
1366
         // char_dist[0] is the first value stored above, i.e. the lowest i that
         // received a nonzero count (or 0 if every count was zero).
1367     for (i = 0; i < 16; i++)
1368     {
1369         blackclamp[i] = -char_dist[0];
1370         whiteclamp[i] = -char_dist[0];
1371         bothclamp[i] = -2 * char_dist[0];
1372     }
1373
1374     for (i = 0; i < Height; i++)
1375     {
1376         unsigned char *Pos = Start + i * Pitch;     // start of row i
1377         char  *Ref = Rand + (rand() & 0xff);        // noise source for this row, at a random offset of 0..255
1378
             // NOTE(review): the loop reads 8 noise bytes per group starting at
             // Ref, so Rand (2048 bytes) must cover 255 + Width rounded up to 8;
             // nothing here enforces that.
1379         __asm
1380         {
1381             mov ecx, [Width]
1382             mov esi, Pos
1383             mov edi, Ref
1384             xor         eax, eax                  // eax = byte offset within the row
1385
1386             nextset:
1387             movq        mm1, [esi+eax]        // get the source
1388
1389             psubusb     mm1, blackclamp       // the three saturating ops leave each byte in [k, 255-k] (for 0 <= 2k <= 255)
1390             paddusb     mm1, bothclamp
1391             psubusb     mm1, whiteclamp
1392
1393             movq        mm2, [edi+eax]        // get the noise for this line
1394             paddb       mm1, mm2             // add it in (wrapping byte add)
1395             movq        [esi+eax], mm1        // store the result
1396
1397             add         eax, 8                // advance to the next 8 bytes of this row
1398
1399             cmp         eax, ecx              // note: jl is a signed compare, Width is unsigned
1400             jl          nextset
1401
1402
1403         }
1404
1405     }
1406 }
1407 #else
1408 extern char an[8][64][3072];    // noise bytes indexed [a][q][offset]; defined outside this file section
1409 extern int cd[8][64];           // per-[a][q] value; its negation is the clamp margin used below
1410
     /*
      * vp8_plane_add_noise_mmx
      *
      * Adds table-driven noise, in place, to Height rows of pixels using MMX.
      *
      *   Start  - first pixel of the first row.
      *   Width  - row width in bytes; rows are handled in 8-byte groups and the
      *            loop body always runs at least once, so whole groups are read
      *            and written even when Width is not a multiple of 8.
      *   Height - number of rows.
      *   Pitch  - byte distance from one row to the next.
      *   q, a   - select the table row: cd[a][q] and an[a][q]. They are used
      *            as array indices without a range check (a < 8, q < 64 by the
      *            declared dimensions).
      *
      * Each pixel is first limited to [k, 255-k] with k = -cd[a][q], then the
      * noise byte is added with a wrapping byte add.
      * NOTE(review): presumably cd[a][q] is the most negative noise value in
      * an[a][q], which is what makes the wrapping add safe - confirm against
      * the code that fills the tables.
      * NOTE(review): MMX registers are used in the row loop and no emms follows
      * it; presumably callers clear the state - confirm.
      */
1411 void vp8_plane_add_noise_mmx(unsigned char *Start, unsigned int Width, unsigned int Height, int Pitch, int q, int a)
1412 {
1413     unsigned int i;
         // Only the first 8 bytes of each clamp array are read by the 8-byte
         // MMX operations below.
1414     __declspec(align(16)) unsigned char blackclamp[16];    // copies of k
1415     __declspec(align(16)) unsigned char whiteclamp[16];    // copies of k
1416     __declspec(align(16)) unsigned char bothclamp[16];     // copies of 2*k
1417
1418
1419     __asm emms
1420
1421     for (i = 0; i < 16; i++)
1422     {
1423         blackclamp[i] = -cd[a][q];
1424         whiteclamp[i] = -cd[a][q];
1425         bothclamp[i] = -2 * cd[a][q];
1426     }
1427
1428     for (i = 0; i < Height; i++)
1429     {
1430         unsigned char *Pos = Start + i * Pitch;         // start of row i
1431         char  *Ref = an[a][q] + (rand() & 0xff);        // noise source for this row, at a random offset of 0..255
1432
             // NOTE(review): the loop reads 8 noise bytes per group starting at
             // Ref, so the 3072-byte table row must cover 255 + Width rounded
             // up to 8; nothing here enforces that.
1433         __asm
1434         {
1435             mov ecx, [Width]
1436             mov esi, Pos
1437             mov edi, Ref
1438             xor         eax, eax                  // eax = byte offset within the row
1439
1440             nextset:
1441             movq        mm1, [esi+eax]        // get the source
1442
1443             psubusb     mm1, blackclamp       // the three saturating ops leave each byte in [k, 255-k] (for 0 <= 2k <= 255)
1444             paddusb     mm1, bothclamp
1445             psubusb     mm1, whiteclamp
1446
1447             movq        mm2, [edi+eax]        // get the noise for this line
1448             paddb       mm1, mm2             // add it in (wrapping byte add)
1449             movq        [esi+eax], mm1        // store the result
1450
1451             add         eax, 8                // advance to the next 8 bytes of this row
1452
1453             cmp         eax, ecx              // note: jl is a signed compare, Width is unsigned
1454             jl          nextset
1455         }
1456     }
1457 }
1458
1459 /*
      * vp8_plane_add_noise_wmt
      *
      * Adds table-driven noise, in place, to Height rows of pixels using SSE2.
      *
      *   Start  - first pixel of the first row.
      *   Width  - row width in bytes; rows are handled in 16-byte groups and
      *            the loop body always runs at least once, so whole groups are
      *            read and written even when Width is not a multiple of 16.
      *   Height - number of rows.
      *   Pitch  - byte distance from one row to the next.
      *   q, a   - select the table row: cd[a][q] and an[a][q]. They are used
      *            as array indices without a range check (a < 8, q < 64 by the
      *            declared dimensions).
      *
      * Each pixel is first limited to [k, 255-k] with k = -cd[a][q], then the
      * noise byte is added with a wrapping byte add. Pixel and noise data are
      * accessed with unaligned loads/stores (movdqu); the clamp arrays are used
      * directly as memory operands and are declared 16-byte aligned.
      * NOTE(review): presumably cd[a][q] is the most negative noise value in
      * an[a][q], which is what makes the wrapping add safe - confirm against
      * the code that fills the tables.
      */
1460 void vp8_plane_add_noise_wmt(unsigned char *Start, unsigned int Width, unsigned int Height, int Pitch, int q, int a)
1461 {
1462     unsigned int i;
1463
1464     __declspec(align(16)) unsigned char blackclamp[16];    // 16 copies of k
1465     __declspec(align(16)) unsigned char whiteclamp[16];    // 16 copies of k
1466     __declspec(align(16)) unsigned char bothclamp[16];     // 16 copies of 2*k
1467
1468     __asm emms
1469
1470     for (i = 0; i < 16; i++)
1471     {
1472         blackclamp[i] = -cd[a][q];
1473         whiteclamp[i] = -cd[a][q];
1474         bothclamp[i] = -2 * cd[a][q];
1475     }
1476
1477     for (i = 0; i < Height; i++)
1478     {
1479         unsigned char *Pos = Start + i * Pitch;        // start of row i
1480         char *Ref = an[a][q] + (rand() & 0xff);        // noise source for this row, at a random offset of 0..255
1481
             // NOTE(review): the loop reads 16 noise bytes per group starting at
             // Ref, so the 3072-byte table row must cover 255 + Width rounded
             // up to 16; nothing here enforces that.
1482         __asm
1483         {
1484             mov ecx,    [Width]
1485             mov esi,    Pos
1486             mov edi,    Ref
1487             xor         eax, eax                   // eax = byte offset within the row
1488
1489             nextset:
1490             movdqu      xmm1, [esi+eax]        // get the source
1491
1492             psubusb     xmm1, blackclamp       // the three saturating ops leave each byte in [k, 255-k] (for 0 <= 2k <= 255)
1493             paddusb     xmm1, bothclamp
1494             psubusb     xmm1, whiteclamp
1495
1496             movdqu      xmm2, [edi+eax]        // get the noise for this line
1497             paddb       xmm1, xmm2             // add it in (wrapping byte add)
1498             movdqu      [esi+eax], xmm1        // store the result
1499
1500             add         eax, 16                // advance to the next 16 bytes of this row
1501
1502             cmp         eax, ecx               // note: jl is a signed compare, Width is unsigned
1503             jl          nextset
1504         }
1505     }
1506 }
1507
1508 #endif