vpx_scale/win32/scaleopt.c

   1 /*
   2  *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
   3  *
   4  *  Use of this source code is governed by a BSD-style license
   5  *  that can be found in the LICENSE file in the root of the source
   6  *  tree. An additional intellectual property rights grant can be found
   7  *  in the file PATENTS.  All contributing project authors may
   8  *  be found in the AUTHORS file in the root of the source tree.
   9  */
  10
  11
  12 /****************************************************************************
  13 *
  14 *   Module Title :     scaleopt.c
  15 *
  16 *   Description  :     Optimized scaling functions
  17 *
  18 ****************************************************************************/
  19 #include "pragmas.h"
  20
  21
  22
  23 /****************************************************************************
  24 *  Module Statics
  25 ****************************************************************************/
  26 __declspec(align(16)) const static unsigned short one_fifth[]  = { 51, 51, 51, 51 };          // 51/256  ~= 1/5 in each word; products are shifted right by 8 after use
  27 __declspec(align(16)) const static unsigned short two_fifths[] = { 102, 102, 102, 102 };      // 102/256 ~= 2/5 in each word
  28 __declspec(align(16)) const static unsigned short three_fifths[] = { 154, 154, 154, 154 };    // 154/256 ~= 3/5 in each word
  29 __declspec(align(16)) const static unsigned short four_fifths[] = { 205, 205, 205, 205 };     // 205/256 ~= 4/5 in each word
  30 __declspec(align(16)) const static unsigned short round_values[] = { 128, 128, 128, 128 };    // 128 = half of 256, added before the >> 8 so results round to nearest
  31 __declspec(align(16)) const static unsigned short four_ones[] = { 1, 1, 1, 1};                // four words of 1
  32 __declspec(align(16)) const static unsigned short const45_2[] = {205, 154, 102,  51 };        // per-lane weights 4/5 3/5 2/5 1/5 (second tap in horizontal_line_4_5_scale_mmx)
  33 __declspec(align(16)) const static unsigned short const45_1[] = { 51, 102, 154, 205 };        // per-lane weights 1/5 2/5 3/5 4/5 (first tap in horizontal_line_4_5_scale_mmx)
  34 __declspec(align(16)) const static unsigned char  mask45[] = { 0, 0, 0, 0, 0, 0, 255, 0};     // keeps only byte 6 of a quadword (tail of horizontal_line_4_5_scale_mmx)
  35 __declspec(align(16)) const static unsigned short const35_2[] = { 154,  51, 205, 102 };       // per-lane weights 3/5 1/5 4/5 2/5 (second tap in horizontal_line_3_5_scale_mmx)
  36 __declspec(align(16)) const static unsigned short const35_1[] = { 102, 205,  51, 154 };       // per-lane weights 2/5 4/5 1/5 3/5 (first tap in horizontal_line_3_5_scale_mmx)
  37
  38
  39
  40 #include "vpx_scale/vpxscale.h"
  41 #include "vpx_mem/vpx_mem.h"
  42
  43 /****************************************************************************
  44  *
  45  *  ROUTINE       : horizontal_line_3_5_scale_mmx
  46  *
  47  *  INPUTS        : const unsigned char *source : Start of the input line.
  48  *                  unsigned int source_width    : Length of the input line in bytes; the final group starts at source + source_width - 3.
  49  *                  unsigned char *dest         : Start of the output line.
  50  *                  unsigned int dest_width      : Unused (cast to void).
  51  *
  52  *  OUTPUTS       : 5 bytes are stored at dest for every 3 bytes consumed from source.
  53  *
  54  *  RETURNS       : void
  55  *
  56  *  FUNCTION      : 3 to 5 up-scaling of a horizontal line of pixels.
  57  *                  Output 0 of each group is a copy of pixel 0; outputs 1-4 are (w1 * left + w2 * right + 128) >> 8 with the weights from const35_1 / const35_2.
  58  *  SPECIAL NOTES : The loop is bottom-tested and the final group is handled after it, so at least two groups (6 source bytes) are always processed.
  59  *                  Each group is fetched as a DWORD, i.e. one byte beyond its 3 pixels. NOTE(review): no emms is issued here; presumably the caller resets MMX state - confirm.
  60  ****************************************************************************/
  61 static
  62 void horizontal_line_3_5_scale_mmx
  63 (
  64     const unsigned char *source,
  65     unsigned int source_width,
  66     unsigned char *dest,
  67     unsigned int dest_width
  68 )
  69 {
  70     (void) dest_width;
  71
  72     __asm
  73     {
  74         // Byte diagrams in the comments below list the lowest-addressed byte first.
  75         push ebx
  76
  77         mov         esi,    source
  78         mov         edi,    dest
  79
  80         mov         ecx,    source_width
  81         lea         edx,    [esi+ecx-3];        // edx = address of the final 3-pixel group
  82
  83         movq        mm5,    const35_1       // mm5 = 66 xx cd xx 33 xx 9a xx (weights applied to mm0)
  84         movq        mm6,    const35_2       // mm6 = 9a xx 33 xx cd xx 66 xx (weights applied to mm1)
  85
  86         movq        mm4,    round_values     // mm4 = 80 xx 80 xx 80 xx 80 xx (rounding term)
  87         pxor        mm7,    mm7             // mm7 = 0, used to zero-extend bytes to words
  88
  89         horiz_line_3_5_loop:
  90
  91         mov        eax,    DWORD PTR [esi] // eax = 00 01 02 03 (03 is the first pixel of the next group)
  92         mov        ebx,    eax
  93
  94         and         ebx,    0xffff00        // ebx = xx 01 02 xx
  95         mov         ecx,    eax             // ecx = 00 01 02 03
  96
  97         and         eax,    0xffff0000      // eax = xx xx 02 03
  98         xor         ecx,    eax             // ecx = 00 01 xx xx
  99
 100         shr         ebx,    8               // ebx = 01 02 xx xx
 101         or          eax,    ebx             // eax = 01 02 02 03 (right-hand taps)
 102
 103         shl         ebx,    16              // ebx = xx xx 01 02
 104         movd        mm1,    eax             // mm1 = 01 02 02 03 xx xx xx xx
 105
 106         or          ebx,    ecx             // ebx = 00 01 01 02 (left-hand taps)
 107         punpcklbw   mm1,    mm7             // mm1 = 01 xx 02 xx 02 xx 03 xx
 108
 109         movd        mm0,    ebx             // mm0 = 00 01 01 02
 110         pmullw      mm1,    mm6             // right taps * their weights
 111
 112         punpcklbw   mm0,    mm7             // mm0 = 00 xx 01 xx 01 xx 02 xx
 113         pmullw      mm0,    mm5             // left taps * their weights
 114
 115         mov         [edi],  ebx             // output 0 = pixel 00; bytes 1-3 are overwritten by the movd below
 116         add         esi,    3
 117
 118         add         edi,    5
 119         paddw       mm0,    mm1             // left + right weighted taps
 120
 121         paddw       mm0,    mm4             // + 128
 122         psrlw       mm0,    8
 123
 124         cmp         esi,    edx             // reached the final group?
 125         packuswb    mm0,    mm7
 126
 127         movd        DWORD Ptr [edi-4], mm0  // edi was already advanced by 5: stores outputs 1-4 of this group
 128         jl          horiz_line_3_5_loop     // uses the flags of cmp above. NOTE(review): jl is a signed compare applied to addresses
 129
 130 //Exit: final group; the byte after pixel 02 is read but replaced by a copy of 02
 131         mov         eax,    DWORD PTR [esi] // eax = 00 01 02 03
 132         mov         ebx,    eax
 133
 134         and         ebx,    0xffff00        // ebx = xx 01 02 xx
 135         mov         ecx,    eax             // ecx = 00 01 02 03
 136
 137         and         eax,    0xffff0000      // eax = xx xx 02 03
 138         xor         ecx,    eax             // ecx = 00 01 xx xx
 139
 140         shr         ebx,    8               // ebx = 01 02 xx xx
 141         or          eax,    ebx             // eax = 01 02 02 03
 142
 143         shl         eax,    8               // eax = xx 01 02 02
 144         and         eax,    0xffff0000      // eax = xx xx 02 02
 145
 146         or          eax,    ebx             // eax = 01 02 02 02
 147
 148         shl         ebx,    16              // ebx = xx xx 01 02
 149         movd        mm1,    eax             // mm1 = 01 02 02 02 xx xx xx xx
 150
 151         or          ebx,    ecx             // ebx = 00 01 01 02
 152         punpcklbw   mm1,    mm7             // mm1 = 01 xx 02 xx 02 xx 02 xx
 153
 154         movd        mm0,    ebx             // mm0 = 00 01 01 02
 155         pmullw      mm1,    mm6             // right taps * their weights
 156
 157         punpcklbw   mm0,    mm7             // mm0 = 00 xx 01 xx 01 xx 02 xx
 158         pmullw      mm0,    mm5             // left taps * their weights
 159
 160         mov         [edi],  ebx             // output 0 = pixel 00; bytes 1-3 are overwritten by the movd below
 161         paddw       mm0,    mm1
 162
 163         paddw       mm0,    mm4             // + 128
 164         psrlw       mm0,    8
 165
 166         packuswb    mm0,    mm7
 167         movd        DWORD Ptr [edi+1], mm0  // outputs 1-4 of the final group
 168
 169         pop ebx
 170
 171     }
 172
 173 }
 174
 175
 176 /****************************************************************************
 177  *
 178  *  ROUTINE       : horizontal_line_4_5_scale_mmx
 179  *
 180  *  INPUTS        : const unsigned char *source : Start of the input line.
 181  *                  unsigned int source_width    : Length of the input line in bytes; the final group starts at source + source_width - 8.
 182  *                  unsigned char *dest         : Start of the output line.
 183  *                  unsigned int dest_width      : Unused (cast to void).
 184  *
 185  *  OUTPUTS       : 10 bytes are stored at dest for every 8 bytes consumed from source.
 186  *
 187  *  RETURNS       : void
 188  *
 189  *  FUNCTION      : 4 to 5 up-scaling of a horizontal line of pixels.
 190  *                  Outputs 0 and 5 of each group copy pixels 00 and 04; the others are (w1 * left + w2 * right + 128) >> 8 with the weights from const45_1 / const45_2.
 191  *  SPECIAL NOTES : The loop is bottom-tested and the final 8 pixels are handled after it, so at least 16 source bytes are always processed.
 192  *                  Inside the loop 9 bytes are read per group (pixel 08 belongs to the next group). NOTE(review): no emms is issued here; presumably the caller resets MMX state - confirm.
 193  ****************************************************************************/
 194 static
 195 void horizontal_line_4_5_scale_mmx
 196 (
 197     const unsigned char *source,
 198     unsigned int source_width,
 199     unsigned char *dest,
 200     unsigned int dest_width
 201 )
 202 {
 203     (void)dest_width;
 204
 205     __asm
 206     {
 207         // Byte diagrams in the comments below list the lowest-addressed byte first.
 208         mov         esi,    source
 209         mov         edi,    dest
 210
 211         mov         ecx,    source_width
 212         lea         edx,    [esi+ecx-8];        // edx = address of the final 8-pixel group
 213
 214         movq        mm5,    const45_1       // mm5 = 33 xx 66 xx 9a xx cd xx
 215         movq        mm6,    const45_2       // mm6 = cd xx 9a xx 66 xx 33 xx
 216
 217         movq        mm4,    round_values     // mm4 = 80 xx 80 xx 80 xx 80 xx
 218         pxor        mm7,    mm7             // mm7 = 0, used to zero-extend bytes to words
 219
 220         horiz_line_4_5_loop:
 221
 222         movq        mm0,    QWORD PTR [esi]           // mm0 = 00 01 02 03 04 05 06 07
 223         movq        mm1,    QWORD PTR [esi+1];        // mm1 = 01 02 03 04 05 06 07 08
 224
 225         movq        mm2,    mm0             // mm2 = 00 01 02 03 04 05 06 07
 226         movq        mm3,    mm1             // mm3 = 01 02 03 04 05 06 07 08
 227
 228         movd        DWORD PTR [edi],  mm0             // output 0 = pixel 00; bytes 1-3 are overwritten below
 229         punpcklbw   mm0,    mm7             // mm0 = 00 xx 01 xx 02 xx 03 xx
 230
 231         punpcklbw   mm1,    mm7             // mm1 = 01 xx 02 xx 03 xx 04 xx
 232         pmullw      mm0,    mm5             // 00* 51 01*102 02*154 03*205
 233
 234         pmullw      mm1,    mm6             // 01*205 02*154 03*102 04* 51
 235         punpckhbw   mm2,    mm7             // mm2 = 04 xx 05 xx 06 xx 07 xx
 236
 237         movd        DWORD PTR [edi+5], mm2            // output 5 = pixel 04; bytes 6-8 are overwritten below
 238         pmullw      mm2,    mm5             // 04* 51 05*102 06*154 07*205
 239
 240         punpckhbw   mm3,    mm7             // mm3 = 05 xx 06 xx 07 xx 08 xx
 241         pmullw      mm3,    mm6             // 05*205 06*154 07*102 08* 51
 242
 243         paddw       mm0,    mm1             // left + right weighted taps
 244         paddw       mm0,    mm4             // added round values
 245
 246         psrlw       mm0,    8               // output: 01 xx 02 xx 03 xx 04 xx
 247         packuswb    mm0,    mm7
 248
 249         movd        DWORD PTR [edi+1], mm0  // write output 01 02 03 04
 250         add         edi,    10
 251
 252         add         esi,    8
 253         paddw       mm2,    mm3             // left + right weighted taps
 254
 255         paddw       mm2,    mm4             // added round values
 256         cmp         esi,    edx             // reached the final group?
 257
 258         psrlw       mm2,    8
 259         packuswb    mm2,    mm7
 260
 261         movd        DWORD PTR [edi-4], mm2 // edi was already advanced by 10: write output 06 07 08 09
 262         jl         horiz_line_4_5_loop     // uses the flags of cmp above. NOTE(review): jl is a signed compare applied to addresses
 263
 264 //Exit: final group; only 8 bytes are read and pixel 07 stands in for the missing pixel 08
 265         movq        mm0,    [esi]           // mm0 = 00 01 02 03 04 05 06 07
 266         movq        mm1,    mm0             // mm1 = 00 01 02 03 04 05 06 07
 267
 268         movq        mm2,    mm0             // mm2 = 00 01 02 03 04 05 06 07
 269         psrlq       mm1,    8               // mm1 = 01 02 03 04 05 06 07 00
 270
 271         movq        mm3,    mask45          // mm3 = 00 00 00 00 00 00 ff 00
 272         pand        mm3,    mm1             // mm3 = 00 00 00 00 00 00 07 00
 273
 274         psllq       mm3,    8               // mm3 = 00 00 00 00 00 00 00 07
 275         por         mm1,    mm3             // mm1 = 01 02 03 04 05 06 07 07
 276
 277         movq        mm3,    mm1             // mm3 = 01 02 03 04 05 06 07 07
 278
 279         movd        DWORD PTR [edi],  mm0   // output 0 = pixel 00; bytes 1-3 are overwritten below
 280         punpcklbw   mm0,    mm7             // mm0 = 00 xx 01 xx 02 xx 03 xx
 281
 282         punpcklbw   mm1,    mm7             // mm1 = 01 xx 02 xx 03 xx 04 xx
 283         pmullw      mm0,    mm5             // 00* 51 01*102 02*154 03*205
 284
 285         pmullw      mm1,    mm6             // 01*205 02*154 03*102 04* 51
 286         punpckhbw   mm2,    mm7             // mm2 = 04 xx 05 xx 06 xx 07 xx
 287
 288         movd        DWORD PTR [edi+5], mm2  // output 5 = pixel 04; bytes 6-8 are overwritten below
 289         pmullw      mm2,    mm5             // 04* 51 05*102 06*154 07*205
 290
 291         punpckhbw   mm3,    mm7             // mm3 = 05 xx 06 xx 07 xx 07 xx
 292         pmullw      mm3,    mm6             // 05*205 06*154 07*102 07* 51
 293
 294         paddw       mm0,    mm1             // left + right weighted taps
 295         paddw       mm0,    mm4             // added round values
 296
 297         psrlw       mm0,    8               // output: 01 xx 02 xx 03 xx 04 xx
 298         packuswb    mm0,    mm7             // 01 02 03 04 xx xx xx xx
 299
 300         movd        DWORD PTR [edi+1], mm0  // write output 01 02 03 04
 301         paddw       mm2,    mm3             // left + right weighted taps
 302
 303         paddw       mm2,    mm4             // added round values
 304         psrlw       mm2,    8
 305
 306         packuswb    mm2,    mm7
 307         movd        DWORD PTR [edi+6], mm2  // write output 06 07 08 09
 308
 309
 310     }
 311 }
 312
 313 /****************************************************************************
 314  *
 315  *  ROUTINE       : vertical_band_4_5_scale_mmx
 316  *
 317  *  INPUTS        : unsigned char *dest    : Top line of the band (src[0]); the band is scaled in place.
 318  *                  unsigned int dest_pitch : Byte distance between consecutive lines.
 319  *                  unsigned int dest_width : Number of columns to process; consumed 8 at a time.
 320  *
 321  *  OUTPUTS       : Lines 1 to 4 below dest are rewritten; line 0 is left unchanged.
 322  *
 323  *  RETURNS       : void
 324  *
 325  *  FUNCTION      : 4 to 5 up-scaling of a 4 pixel high band of pixels.
 326  *                  des[1] = 1/5 a + 4/5 b, des[2] = 2/5 b + 3/5 c, des[3] = 3/5 c + 2/5 d, des[4] = 4/5 d + 1/5 an, each rounded with + 128 before >> 8.
 327  *  SPECIAL NOTES : The routine uses the first line of the band below
 328  *                  the current band, read from dest + 5 * dest_pitch. The function also has a "C" only
 329  *                  version. The loop is bottom-tested and moves whole quadwords, so a width that is not a multiple of 8 is rounded up.
 330  *
 331  ****************************************************************************/
 332 static
 333 void vertical_band_4_5_scale_mmx
 334 (
 335     unsigned char *dest,
 336     unsigned int dest_pitch,
 337     unsigned int dest_width
 338 )
 339 {
 340     __asm
 341     {
 342
 343         mov         esi,    dest                    // Get the source and destination pointer
 344         mov         ecx,    dest_pitch               // Get the pitch size
 345
 346         lea         edi,    [esi+ecx*2]             // two lines below
 347         add         edi,    ecx                     // edi = three lines below (src[3])
 348
 349         pxor        mm7,    mm7                     // clear out mm7
 350         mov         edx,    dest_width               // Loop counter: columns remaining
 351
 352         vs_4_5_loop:
 353
 354         movq        mm0,    QWORD ptr [esi]         // src[0];
 355         movq        mm1,    QWORD ptr [esi+ecx]     // src[1];
 356
 357         movq        mm2,    mm0                     // Make a copy
 358         punpcklbw   mm0,    mm7                     // unpack low to word
 359
 360         movq        mm5,    one_fifth                // mm5 = 1/5
 361         punpckhbw   mm2,    mm7                     // unpack high to word
 362
 363         pmullw      mm0,    mm5                     // a * 1/5
 364
 365         movq        mm3,    mm1                     // make a copy
 366         punpcklbw   mm1,    mm7                     // unpack low to word
 367
 368         pmullw      mm2,    mm5                     // a * 1/5
 369         movq        mm6,    four_fifths               // mm6 = 4/5
 370
 371         movq        mm4,    mm1                     // copy of low b
 372         pmullw      mm4,    mm6                     // b * 4/5
 373
 374         punpckhbw   mm3,    mm7                     // unpack high to word
 375         movq        mm5,    mm3                     // copy of high b
 376
 377         pmullw      mm5,    mm6                     // b * 4/5
 378         paddw       mm0,    mm4                     // a * 1/5 + b * 4/5
 379
 380         paddw       mm2,    mm5                     // a * 1/5 + b * 4/5
 381         paddw       mm0,    round_values             // + 128
 382
 383         paddw       mm2,    round_values             // + 128
 384         psrlw       mm0,    8
 385
 386         psrlw       mm2,    8
 387         packuswb    mm0,    mm2                     // des [1]
 388
 389         movq        QWORD ptr [esi+ecx], mm0        // write des[1]
 390         movq        mm0,    [esi+ecx*2]             // mm0 = src[2]
 391
 392         // mm1, mm3 --- Src[1]
 393         // mm0 --- Src[2]
 394         // mm7 for unpacking
 395
 396         movq        mm5,    two_fifths               // mm5 = 2/5
 397         movq        mm2,    mm0                     // make a copy
 398
 399         pmullw      mm1,    mm5                     // b * 2/5
 400         movq        mm6,    three_fifths             // mm6 = 3/5
 401
 402
 403         punpcklbw   mm0,    mm7                     // unpack low to word
 404         pmullw      mm3,    mm5                     // b * 2/5
 405
 406         movq        mm4,    mm0                     // make copy of c
 407         punpckhbw   mm2,    mm7                     // unpack high to word
 408
 409         pmullw      mm4,    mm6                     // c * 3/5
 410         movq        mm5,    mm2
 411
 412         pmullw      mm5,    mm6                     // c * 3/5
 413         paddw       mm1,    mm4                     // b * 2/5 + c * 3/5
 414
 415         paddw       mm3,    mm5                     // b * 2/5 + c * 3/5
 416         paddw       mm1,    round_values             // + 128
 417
 418         paddw       mm3,    round_values             // + 128
 419         psrlw       mm1,    8
 420
 421         psrlw       mm3,    8
 422         packuswb    mm1,    mm3                     // des[2]
 423
 424         movq        QWORD ptr [esi+ecx*2], mm1      // write des[2]
 425         movq        mm1,    [edi]                   // mm1=Src[3];
 426
 427         // mm0, mm2 --- Src[2]
 428         // mm1 --- Src[3]
 429         // mm6 --- 3/5
 430         // mm7 for unpacking
 431
 432         pmullw      mm0,    mm6                     // c * 3/5
 433         movq        mm5,    two_fifths               // mm5 = 2/5
 434
 435         movq        mm3,    mm1                     // make a copy
 436         pmullw      mm2,    mm6                     // c * 3/5
 437
 438         punpcklbw   mm1,    mm7                     // unpack low
 439         movq        mm4,    mm1                     // make a copy
 440
 441         punpckhbw   mm3,    mm7                     // unpack high
 442         pmullw      mm4,    mm5                     // d * 2/5
 443
 444         movq        mm6,    mm3                     // make a copy
 445         pmullw      mm6,    mm5                     // d * 2/5
 446
 447         paddw       mm0,    mm4                     // c * 3/5 + d * 2/5
 448         paddw       mm2,    mm6                     // c * 3/5 + d * 2/5
 449
 450         paddw       mm0,    round_values             // + 128
 451         paddw       mm2,    round_values             // + 128
 452
 453         psrlw       mm0,    8
 454         psrlw       mm2,    8
 455
 456         packuswb    mm0,    mm2                     // des[3]
 457         movq        QWORD ptr [edi], mm0            // write des[3]
 458
 459         //  mm1, mm3 --- Src[3]
 460         //  mm7 -- cleared for unpacking
 461
 462         movq        mm0,    [edi+ecx*2]             // mm0 = Src[0] of the next group, five lines below dest
 463
 464         movq        mm5,    four_fifths              // mm5 = 4/5
 465         pmullw      mm1,    mm5                     // d * 4/5
 466
 467         movq        mm6,    one_fifth                // mm6 = 1/5
 468         movq        mm2,    mm0                     // make a copy
 469
 470         pmullw      mm3,    mm5                     // d * 4/5
 471         punpcklbw   mm0,    mm7                     // unpack low
 472
 473         pmullw      mm0,    mm6                     // an * 1/5
 474         punpckhbw   mm2,    mm7                     // unpack high
 475
 476         paddw       mm1,    mm0                     // d * 4/5 + an * 1/5
 477         pmullw      mm2,    mm6                     // an * 1/5
 478
 479         paddw       mm3,    mm2                     // d * 4/5 + an * 1/5
 480         paddw       mm1,    round_values             // + 128
 481
 482         paddw       mm3,    round_values             // + 128
 483         psrlw       mm1,    8
 484
 485         psrlw       mm3,    8
 486         packuswb    mm1,    mm3                     // des[4]
 487
 488         movq        QWORD ptr [edi+ecx], mm1        // write des[4], four lines below dest
 489
 490         add         edi,    8                       // advance both line pointers by 8 columns
 491         add         esi,    8
 492
 493         sub         edx,    8                       // 8 columns done
 494         jg         vs_4_5_loop                      // repeat while columns remain
 495     }
 496 }
 497
 498 /****************************************************************************
 499  *
 500  *  ROUTINE       : last_vertical_band_4_5_scale_mmx
 501  *
 502  *  INPUTS        : unsigned char *dest    : Top line of the band (src[0]); the band is scaled in place.
 503  *                  unsigned int dest_pitch : Byte distance between consecutive lines.
 504  *                  unsigned int dest_width : Number of columns to process; consumed 8 at a time.
 505  *
 506  *  OUTPUTS       : Lines 1 to 4 below dest are rewritten; line 0 is left unchanged.
 507  *
 508  *  RETURNS       : None
 509  *
 510  *  FUNCTION      : 4 to 5 up-scaling of the last 4-pixel high band in an image.
 511  *                  des[1] = 1/5 a + 4/5 b, des[2] = 2/5 b + 3/5 c, des[3] = 3/5 c + 2/5 d (each + 128, >> 8), des[4] = d.
 512  *  SPECIAL NOTES : No line below the band is read here; des[4] is an
 513  *                  unmodified copy of src[3]. The function also has an "C" only
 514  *                  version. The loop is bottom-tested and moves whole quadwords, so a width that is not a multiple of 8 is rounded up.
 515  *
 516  ****************************************************************************/
 517 static
 518 void last_vertical_band_4_5_scale_mmx
 519 (
 520     unsigned char *dest,
 521     unsigned int dest_pitch,
 522     unsigned int dest_width
 523 )
 524 {
 525     __asm
 526     {
 527         mov         esi,    dest                    // Get the source and destination pointer
 528         mov         ecx,    dest_pitch               // Get the pitch size
 529
 530         lea         edi,    [esi+ecx*2]             // two lines below
 531         add         edi,    ecx                     // edi = three lines below (src[3])
 532
 533         pxor        mm7,    mm7                     // clear out mm7
 534         mov         edx,    dest_width               // Loop counter: columns remaining
 535
 536         last_vs_4_5_loop:
 537
 538         movq        mm0,    QWORD ptr [esi]         // src[0];
 539         movq        mm1,    QWORD ptr [esi+ecx]     // src[1];
 540
 541         movq        mm2,    mm0                     // Make a copy
 542         punpcklbw   mm0,    mm7                     // unpack low to word
 543
 544         movq        mm5,    one_fifth                // mm5 = 1/5
 545         punpckhbw   mm2,    mm7                     // unpack high to word
 546
 547         pmullw      mm0,    mm5                     // a * 1/5
 548
 549         movq        mm3,    mm1                     // make a copy
 550         punpcklbw   mm1,    mm7                     // unpack low to word
 551
 552         pmullw      mm2,    mm5                     // a * 1/5
 553         movq        mm6,    four_fifths               // mm6 = 4/5
 554
 555         movq        mm4,    mm1                     // copy of low b
 556         pmullw      mm4,    mm6                     // b * 4/5
 557
 558         punpckhbw   mm3,    mm7                     // unpack high to word
 559         movq        mm5,    mm3                     // copy of high b
 560
 561         pmullw      mm5,    mm6                     // b * 4/5
 562         paddw       mm0,    mm4                     // a * 1/5 + b * 4/5
 563
 564         paddw       mm2,    mm5                     // a * 1/5 + b * 4/5
 565         paddw       mm0,    round_values             // + 128
 566
 567         paddw       mm2,    round_values             // + 128
 568         psrlw       mm0,    8
 569
 570         psrlw       mm2,    8
 571         packuswb    mm0,    mm2                     // des [1]
 572
 573         movq        QWORD ptr [esi+ecx], mm0        // write des[1]
 574         movq        mm0,    [esi+ecx*2]             // mm0 = src[2]
 575
 576         // mm1, mm3 --- Src[1]
 577         // mm0 --- Src[2]
 578         // mm7 for unpacking
 579
 580         movq        mm5,    two_fifths               // mm5 = 2/5
 581         movq        mm2,    mm0                     // make a copy
 582
 583         pmullw      mm1,    mm5                     // b * 2/5
 584         movq        mm6,    three_fifths             // mm6 = 3/5
 585
 586
 587         punpcklbw   mm0,    mm7                     // unpack low to word
 588         pmullw      mm3,    mm5                     // b * 2/5
 589
 590         movq        mm4,    mm0                     // make copy of c
 591         punpckhbw   mm2,    mm7                     // unpack high to word
 592
 593         pmullw      mm4,    mm6                     // c * 3/5
 594         movq        mm5,    mm2
 595
 596         pmullw      mm5,    mm6                     // c * 3/5
 597         paddw       mm1,    mm4                     // b * 2/5 + c * 3/5
 598
 599         paddw       mm3,    mm5                     // b * 2/5 + c * 3/5
 600         paddw       mm1,    round_values             // + 128
 601
 602         paddw       mm3,    round_values             // + 128
 603         psrlw       mm1,    8
 604
 605         psrlw       mm3,    8
 606         packuswb    mm1,    mm3                     // des[2]
 607
 608         movq        QWORD ptr [esi+ecx*2], mm1      // write des[2]
 609         movq        mm1,    [edi]                   // mm1=Src[3];
 610
 611         movq        QWORD ptr [edi+ecx], mm1        // write des[4]: unmodified copy of src[3], taken before des[3] overwrites that line
 612
 613         // mm0, mm2 --- Src[2]
 614         // mm1 --- Src[3]
 615         // mm6 --- 3/5
 616         // mm7 for unpacking
 617
 618         pmullw      mm0,    mm6                     // c * 3/5
 619         movq        mm5,    two_fifths               // mm5 = 2/5
 620
 621         movq        mm3,    mm1                     // make a copy
 622         pmullw      mm2,    mm6                     // c * 3/5
 623
 624         punpcklbw   mm1,    mm7                     // unpack low
 625         movq        mm4,    mm1                     // make a copy
 626
 627         punpckhbw   mm3,    mm7                     // unpack high
 628         pmullw      mm4,    mm5                     // d * 2/5
 629
 630         movq        mm6,    mm3                     // make a copy
 631         pmullw      mm6,    mm5                     // d * 2/5
 632
 633         paddw       mm0,    mm4                     // c * 3/5 + d * 2/5
 634         paddw       mm2,    mm6                     // c * 3/5 + d * 2/5
 635
 636         paddw       mm0,    round_values             // + 128
 637         paddw       mm2,    round_values             // + 128
 638
 639         psrlw       mm0,    8
 640         psrlw       mm2,    8
 641
 642         packuswb    mm0,    mm2                     // des[3]
 643         movq        QWORD ptr [edi], mm0            // write des[3]
 644
 645         //  mm1, mm3 --- Src[3] (no longer needed: des[4] was stored above)
 646         //  mm7 -- still cleared for the next iteration
 647         add         edi,    8                       // advance both line pointers by 8 columns
 648         add         esi,    8
 649
 650         sub         edx,    8                       // 8 columns done
 651         jg          last_vs_4_5_loop                // repeat while columns remain
 652     }
 653 }
 654
 655 /****************************************************************************
 656  *
 657  *  ROUTINE       : vertical_band_3_5_scale_mmx
 658  *
 659  *  INPUTS        : unsigned char *dest    : Top line of the band (src[0]); the band is scaled in place.
 660  *                  unsigned int dest_pitch : Byte distance between consecutive lines.
 661  *                  unsigned int dest_width : Number of columns to process; consumed 8 at a time.
 662  *
 663  *  OUTPUTS       : Lines 1 to 4 below dest are rewritten; line 0 is left unchanged.
 664  *
 665  *  RETURNS       : void
 666  *
 667  *  FUNCTION      : 3 to 5 up-scaling of a 3-pixel high band of pixels.
 668  *                  des[1] = 2/5 a + 3/5 b, des[2] = 4/5 b + 1/5 c, des[3] = 1/5 b + 4/5 c, des[4] = 3/5 c + 2/5 an, each rounded with + 128 before >> 8.
 669  *  SPECIAL NOTES : The routine uses the first line of the band below
 670  *                  the current band, read from dest + 5 * dest_pitch. The function also has an "C" only
 671  *                  version. The loop is bottom-tested and moves whole quadwords, so a width that is not a multiple of 8 is rounded up.
 672  *
 673  ****************************************************************************/
 674 static
 675 void vertical_band_3_5_scale_mmx
 676 (
 677     unsigned char *dest,
 678     unsigned int dest_pitch,
 679     unsigned int dest_width
 680 )
 681 {
 682     __asm
 683     {
 684         mov         esi,    dest                    // Get the source and destination pointer
 685         mov         ecx,    dest_pitch               // Get the pitch size
 686
 687         lea         edi,    [esi+ecx*2]             // two lines below
 688         add         edi,    ecx                     // edi = three lines below (des[3])
 689
 690         pxor        mm7,    mm7                     // clear out mm7
 691         mov         edx,    dest_width               // Loop counter: columns remaining
 692
 693         vs_3_5_loop:
 694
 695         movq        mm0,    QWORD ptr [esi]         // src[0];
 696         movq        mm1,    QWORD ptr [esi+ecx]     // src[1];
 697
 698         movq        mm2,    mm0                     // Make a copy
 699         punpcklbw   mm0,    mm7                     // unpack low to word
 700
 701         movq        mm5,    two_fifths               // mm5 = 2/5
 702         punpckhbw   mm2,    mm7                     // unpack high to word
 703
 704         pmullw      mm0,    mm5                     // a * 2/5
 705
 706         movq        mm3,    mm1                     // make a copy
 707         punpcklbw   mm1,    mm7                     // unpack low to word
 708
 709         pmullw      mm2,    mm5                     // a * 2/5
 710         movq        mm6,    three_fifths             // mm6 = 3/5
 711
 712         movq        mm4,    mm1                     // copy of low b
 713         pmullw      mm4,    mm6                     // b * 3/5
 714
 715         punpckhbw   mm3,    mm7                     // unpack high to word
 716         movq        mm5,    mm3                     // copy of high b
 717
 718         pmullw      mm5,    mm6                     // b * 3/5
 719         paddw       mm0,    mm4                     // a * 2/5 + b * 3/5
 720
 721         paddw       mm2,    mm5                     // a * 2/5 + b * 3/5
 722         paddw       mm0,    round_values             // + 128
 723
 724         paddw       mm2,    round_values             // + 128
 725         psrlw       mm0,    8
 726
 727         psrlw       mm2,    8
 728         packuswb    mm0,    mm2                     // des [1]
 729
 730         movq        QWORD ptr [esi+ecx], mm0        // write des[1]
 731         movq        mm0,    [esi+ecx*2]             // mm0 = src[2]
 732
 733         // mm1, mm3 --- Src[1]
 734         // mm0 --- Src[2]
 735         // mm7 for unpacking
 736
 737         movq        mm4,    mm1                     // b low
 738         pmullw      mm1,    four_fifths              // b * 4/5 low
 739
 740         movq        mm5,    mm3                     // b high
 741         pmullw      mm3,    four_fifths              // b * 4/5 high
 742
 743         movq        mm2,    mm0                     // c
 744         pmullw      mm4,    one_fifth                // b * 1/5
 745
 746         punpcklbw   mm0,    mm7                     // c low
 747         pmullw      mm5,    one_fifth                // b * 1/5
 748
 749         movq        mm6,    mm0                     // make copy of c low
 750         punpckhbw   mm2,    mm7                     // c high
 751
 752         pmullw      mm6,    one_fifth                // c * 1/5 low
 753         movq        mm7,    mm2                     // make copy of c high (mm7 is scratch from here until it is cleared again below)
 754
 755         pmullw      mm7,    one_fifth                // c * 1/5 high
 756         paddw       mm1,    mm6                     // b * 4/5 + c * 1/5 low
 757
 758         paddw       mm3,    mm7                     // b * 4/5 + c * 1/5 high
 759         movq        mm6,    mm0                     // make copy of c low
 760
 761         pmullw      mm6,    four_fifths              // c * 4/5 low
 762         movq        mm7,    mm2                     // make copy of c high
 763
 764         pmullw      mm7,    four_fifths              // c * 4/5 high
 765
 766         paddw       mm4,    mm6                     // b * 1/5 + c * 4/5 low
 767         paddw       mm5,    mm7                     // b * 1/5 + c * 4/5 high
 768
 769         paddw       mm1,    round_values             // + 128
 770         paddw       mm3,    round_values             // + 128
 771
 772         psrlw       mm1,    8
 773         psrlw       mm3,    8
 774
 775         packuswb    mm1,    mm3                     // des[2]
 776         movq        QWORD ptr [esi+ecx*2], mm1      // write des[2]
 777
 778         paddw       mm4,    round_values             // + 128
 779         paddw       mm5,    round_values             // + 128
 780
 781         psrlw       mm4,    8
 782         psrlw       mm5,    8
 783
 784         packuswb    mm4,    mm5                     // des[3]
 785         movq        QWORD ptr [edi], mm4            // write des[3]
 786
 787         //  mm0, mm2 --- Src[2] (c), still unpacked to words
 788
 789         pxor        mm7,    mm7                     // clear mm7 for unpacking (it was used as scratch above)
 790         movq        mm1,    [edi+ecx*2]             // mm1 = Src[0] of the next group, five lines below dest
 791
 792         movq        mm5,    three_fifths             // mm5 = 3/5
 793         pmullw      mm0,    mm5                     // c * 3/5
 794
 795         movq        mm6,    two_fifths                // mm6 = 2/5
 796         movq        mm3,    mm1                     // make a copy
 797
 798         pmullw      mm2,    mm5                     // c * 3/5
 799         punpcklbw   mm1,    mm7                     // unpack low
 800
 801         pmullw      mm1,    mm6                     // an * 2/5
 802         punpckhbw   mm3,    mm7                     // unpack high
 803
 804         paddw       mm0,    mm1                     // c * 3/5 + an * 2/5
 805         pmullw      mm3,    mm6                     // an * 2/5
 806
 807         paddw       mm2,    mm3                     // c * 3/5 + an * 2/5
 808         paddw       mm0,    round_values             // + 128
 809
 810         paddw       mm2,    round_values             // + 128
 811         psrlw       mm0,    8
 812
 813         psrlw       mm2,    8
 814         packuswb    mm0,    mm2                     // des[4]
 815
 816         movq        QWORD ptr [edi+ecx], mm0        // write des[4], four lines below dest
 817
 818         add         edi,    8                       // advance both line pointers by 8 columns
 819         add         esi,    8
 820
 821         sub         edx,    8                       // 8 columns done
 822         jg          vs_3_5_loop                     // repeat while columns remain; mm7 is zero again at this point
 823     }
 824 }
 825
 826 /****************************************************************************
 827  *
 828  *  ROUTINE       : last_vertical_band_3_5_scale_mmx
 829  *
 830  *  INPUTS        : unsigned char *dest    : Top row of the band; rows 0..2 hold the source lines.
 831  *                  unsigned int dest_pitch : Byte offset from one row to the next.
 832  *                  unsigned int dest_width : Bytes per row to process (8 per loop pass).
 833  *
 834  *  OUTPUTS       : Rows 1..4 of the band are rewritten in place.
 835  *
 836  *  RETURNS       : void
 837  *
 838  *  FUNCTION      : 3 to 5 up-scaling of a 3-pixel high band of pixels.
 839  *                  With a, b, c = source rows 0, 1, 2 (each sum gets +128 before the shift):
 840  *                  des[1] = (102a + 154b) >> 8, des[2] = (205b + 51c) >> 8,
 841  *                  des[3] = (51b + 205c) >> 8,  des[4] = c (plain copy).
 842  *  SPECIAL NOTES : No row below the band is read. mm7 is cleared at the top of every
 843  *                  pass because the blend code reuses it as scratch. A "C" only version also exists.
 844  ****************************************************************************/
 845 static
 846 void last_vertical_band_3_5_scale_mmx
 847 (
 848     unsigned char *dest,
 849     unsigned int dest_pitch,
 850     unsigned int dest_width
 851 )
 852 {
 853     __asm
 854     {
 855         mov         esi,    dest                    // band base: source rows are expanded in place
 856         mov         ecx,    dest_pitch               // row pitch in bytes
 857
 858         lea         edi,    [esi+ecx*2]             // two lines below
 859         add         edi,    ecx                     // three lines below
 860
 861
 862         mov         edx,    dest_width               // Loop counter (bytes left in the row)
 863
 864
 865         last_vs_3_5_loop:
 866         pxor        mm7,    mm7                     // zero for unpacking; must be redone each pass, mm7 is clobbered below
 867         movq        mm0,    QWORD ptr [esi]         // src[0];
 868         movq        mm1,    QWORD ptr [esi+ecx]     // src[1];
 869
 870         movq        mm2,    mm0                     // Make a copy
 871         punpcklbw   mm0,    mm7                     // unpack low to word
 872
 873         movq        mm5,    two_fifths               // mm5 = 2/5
 874         punpckhbw   mm2,    mm7                     // unpack high to word
 875
 876         pmullw      mm0,    mm5                     // a * 2/5
 877
 878         movq        mm3,    mm1                     // make a copy
 879         punpcklbw   mm1,    mm7                     // unpack low to word
 880
 881         pmullw      mm2,    mm5                     // a * 2/5
 882         movq        mm6,    three_fifths             // mm6 = 3/5
 883
 884         movq        mm4,    mm1                     // copy of low b
 885         pmullw      mm4,    mm6                     // b * 3/5
 886
 887         punpckhbw   mm3,    mm7                     // unpack high to word
 888         movq        mm5,    mm3                     // copy of high b
 889
 890         pmullw      mm5,    mm6                     // b * 3/5
 891         paddw       mm0,    mm4                     // a * 2/5 + b * 3/5
 892
 893         paddw       mm2,    mm5                     // a * 2/5 + b * 3/5
 894         paddw       mm0,    round_values             // + 128
 895
 896         paddw       mm2,    round_values             // + 128
 897         psrlw       mm0,    8
 898
 899         psrlw       mm2,    8
 900         packuswb    mm0,    mm2                     // des [1]
 901
 902         movq        QWORD ptr [esi+ecx], mm0        // write des[1]
 903         movq        mm0,    [esi+ecx*2]             // mm0 = src[2]
 904
 905
 906
 907         // mm1, mm3 --- Src[1] (already unpacked, read before row 1 was overwritten)
 908         // mm0 --- Src[2]
 909         // mm7 for unpacking (still zero here)
 910
 911         movq        mm4,    mm1                     // b low
 912         pmullw      mm1,    four_fifths              // b * 4/5 low
 913
 914         movq        QWORD ptr [edi+ecx], mm0        // write des[4] = plain copy of src[2]
 915
 916         movq        mm5,    mm3                     // b high
 917         pmullw      mm3,    four_fifths              // b * 4/5 high
 918
 919         movq        mm2,    mm0                     // c
 920         pmullw      mm4,    one_fifth                // b * 1/5
 921
 922         punpcklbw   mm0,    mm7                     // c low
 923         pmullw      mm5,    one_fifth                // b * 1/5
 924
 925         movq        mm6,    mm0                     // make copy of c low
 926         punpckhbw   mm2,    mm7                     // c high (last use of mm7 as zero)
 927
 928         pmullw      mm6,    one_fifth                // c * 1/5 low
 929         movq        mm7,    mm2                     // make copy of c high (mm7 becomes scratch)
 930
 931         pmullw      mm7,    one_fifth                // c * 1/5 high
 932         paddw       mm1,    mm6                     // b * 4/5 + c * 1/5 low
 933
 934         paddw       mm3,    mm7                     // b * 4/5 + c * 1/5 high
 935         movq        mm6,    mm0                     // make copy of c low
 936
 937         pmullw      mm6,    four_fifths              // c * 4/5 low
 938         movq        mm7,    mm2                     // make copy of c high
 939
 940         pmullw      mm7,    four_fifths              // c * 4/5 high
 941
 942         paddw       mm4,    mm6                     // b * 1/5 + c * 4/5 low
 943         paddw       mm5,    mm7                     // b * 1/5 + c * 4/5 high
 944
 945         paddw       mm1,    round_values             // + 128
 946         paddw       mm3,    round_values             // + 128
 947
 948         psrlw       mm1,    8
 949         psrlw       mm3,    8
 950
 951         packuswb    mm1,    mm3                     // des[2]
 952         movq        QWORD ptr [esi+ecx*2], mm1      // write des[2]
 953
 954         paddw       mm4,    round_values             // + 128
 955         paddw       mm5,    round_values             // + 128
 956
 957         psrlw       mm4,    8
 958         psrlw       mm5,    8
 959
 960         packuswb    mm4,    mm5                     // des[3]
 961         movq        QWORD ptr [edi], mm4            // write des[3]
 962
 963         // mm7 holds scratch data from here on; the pxor at the loop top restores it
 964
 965         add         edi,    8
 966         add         esi,    8
 967
 968         sub         edx,    8
 969         jg          last_vs_3_5_loop
 970     }
 971 }
 972
 973 /****************************************************************************
 974  *
 975  *  ROUTINE       : vertical_band_1_2_scale_mmx
 976  *
 977  *  INPUTS        : unsigned char *dest    : Upper source row of the band.
 978  *                  unsigned int dest_pitch : Byte offset from one row to the next.
 979  *                  unsigned int dest_width : Bytes per row to process (8 per loop pass).
 980  *
 981  *  OUTPUTS       : The row at dest + dest_pitch is overwritten with the result.
 982  *
 983  *  RETURNS       : void
 984  *
 985  *  FUNCTION      : 1 to 2 up-scaling of a band of pixels: the row between two
 986  *                  source rows is filled with their rounded average, (a + b + 1) >> 1.
 987  *  SPECIAL NOTES : The routine uses the first line of the band below
 988  *                  the current band (read at dest + 2 * dest_pitch). The function
 989  *                  also has an "C" only version.
 990  *
 991  ****************************************************************************/
 992 static
 993 void vertical_band_1_2_scale_mmx
 994 (
 995     unsigned char *dest,
 996     unsigned int dest_pitch,
 997     unsigned int dest_width
 998 )
 999 {
1000     __asm
1001     {
1002
1003         mov         esi,    dest                    // esi = upper source row
1004         mov         ecx,    dest_pitch               // ecx = row pitch in bytes
1005
1006         pxor        mm7,    mm7                     // mm7 = 0, used to widen bytes to words
1007         mov         edx,    dest_width               // edx = bytes left in the row
1008
1009         vs_1_2_loop:
1010
1011         movq        mm0,    [esi]                   // a = 8 bytes of the upper source row
1012         movq        mm1,    [esi + ecx * 2]         // b = 8 bytes of the lower source row (two pitches down)
1013
1014         movq        mm2,    mm0                     // copy a for the high half
1015         movq        mm3,    mm1                     // copy b for the high half
1016
1017         punpcklbw   mm0,    mm7                     // low a as words
1018         movq        mm6,    four_ones                // mm6 = 1, 1, 1, 1 (rounding term, reloaded every pass)
1019
1020         punpcklbw   mm1,    mm7                     // low b as words
1021         paddw       mm0,    mm1                     // low (a + b)
1022
1023         punpckhbw   mm2,    mm7                     // high a as words
1024         paddw       mm0,    mm6                     // low (a + b + 1)
1025
1026         punpckhbw   mm3,    mm7                     // high b as words
1027         paddw       mm2,    mm3                     // high (a + b )
1028
1029         psraw       mm0,    1                       // low (a + b +1 )/2
1030         paddw       mm2,    mm6                     // high (a + b + 1)
1031
1032         psraw       mm2,    1                       // high (a + b + 1)/2
1033         packuswb    mm0,    mm2                     // pack the 8 averages back to bytes
1034
1035         movq        [esi+ecx], mm0                  // store into the row in between
1036         add         esi,    8                       // next 8 columns
1037
1038         sub         edx,    8
1039         jg          vs_1_2_loop                     // bottom-tested: the body always runs at least once
1040     }
1041
1042 }
1043
1044 /****************************************************************************
1045  *
1046  *  ROUTINE       : last_vertical_band_1_2_scale_mmx
1047  *
1048  *  INPUTS        : unsigned char *dest    : Source row (the last row of the image band).
1049  *                  unsigned int dest_pitch : Byte offset from one row to the next.
1050  *                  unsigned int dest_width : Bytes per row to process (8 per loop pass).
1051  *
1052  *  OUTPUTS       : The row at dest + dest_pitch becomes a copy of the row at dest.
1053  *
1054  *  RETURNS       : void
1055  *
1056  *  FUNCTION      : 1 to 2 up-scaling of band of pixels.
1057  *
1058  *  SPECIAL NOTES : Unlike vertical_band_1_2_scale_mmx this routine reads no
1059  *                  line below the band; it only duplicates the source row.
1060  *                  The function also has an "C" only version.
1061  *
1062  ****************************************************************************/
1063 static
1064 void last_vertical_band_1_2_scale_mmx
1065 (
1066     unsigned char *dest,
1067     unsigned int dest_pitch,
1068     unsigned int dest_width
1069 )
1070 {
1071     __asm
1072     {
1073         mov         esi,    dest                    // esi = source row
1074         mov         ecx,    dest_pitch               // ecx = row pitch in bytes
1075
1076         mov         edx,    dest_width               // edx = bytes left in the row
1077
1078         last_vs_1_2_loop:
1079
1080         movq        mm0,    [esi]                   // load 8 source bytes
1081         movq        [esi+ecx], mm0                  // copy them to the row directly below
1082
1083         add         esi,    8
1084         sub         edx,    8
1085
1086         jg         last_vs_1_2_loop                 // bottom-tested: at least 8 bytes are always copied
1087     }
1088 }
1089
1090 /****************************************************************************
1091  *
1092  *  ROUTINE       : horizontal_line_1_2_scale_mmx
1093  *
1094  *  INPUTS        : const unsigned char *source : Source line.
1095  *                  unsigned int source_width    : Number of source pixels; consumed 8 at a time.
1096  *                  unsigned char *dest         : Destination line; 16 bytes are written per 8 source pixels.
1097  *                  unsigned int dest_width      : NOT USED.
1098  *
1099  *  OUTPUTS       : None.
1100  *
1101  *  RETURNS       : void
1102  *
1103  *  FUNCTION      : 1 to 2 up-scaling of a horizontal line of pixels. Output is
1104  *                  s0, (s0+s1+1)>>1, s1, (s1+s2+1)>>1, ...; the final pixel is paired with itself.
1105  *  SPECIAL NOTES : The main loop is bottom-tested and the last-8-pixel code always runs, so at
1106  *                  least 16 source pixels are processed. NOTE(review): callers presumably pass a width that is a multiple of 8 and >= 16 - confirm.
1107  ****************************************************************************/
1108 static
1109 void horizontal_line_1_2_scale_mmx
1110 (
1111     const unsigned char *source,
1112     unsigned int source_width,
1113     unsigned char *dest,
1114     unsigned int dest_width
1115 )
1116 {
1117     (void) dest_width;
1118
1119     __asm
1120     {
1121         mov         esi,    source
1122         mov         edi,    dest
1123
1124         pxor        mm7,    mm7                     // mm7 = 0, used to widen bytes to words
1125         movq        mm6,    four_ones               // mm6 = 1, 1, 1, 1 (rounding term)
1126
1127         mov         ecx,    source_width            // ecx = source pixels left
1128
1129         hs_1_2_loop:
1130
1131         movq        mm0,    [esi]                   // s[0..7]
1132         movq        mm1,    [esi+1]                 // s[1..8], the right-hand neighbours
1133
1134         movq        mm2,    mm0                     // copy for the high half
1135         movq        mm3,    mm1                     // copy for the high half
1136
1137         movq        mm4,    mm0                     // keep the original bytes for interleaving
1138         punpcklbw   mm0,    mm7                     // low s as words
1139
1140         punpcklbw   mm1,    mm7                     // low neighbours as words
1141         paddw       mm0,    mm1                     // low (a + b)
1142
1143         paddw       mm0,    mm6                     // low (a + b + 1)
1144         punpckhbw   mm2,    mm7                     // high s as words
1145
1146         punpckhbw   mm3,    mm7                     // high neighbours as words
1147         paddw       mm2,    mm3                     // high (a + b)
1148
1149         paddw       mm2,    mm6                     // high (a + b + 1)
1150         psraw       mm0,    1                       // low average
1151
1152         psraw       mm2,    1                       // high average
1153         packuswb    mm0,    mm2                     // 8 averages as bytes
1154
1155         movq        mm2,    mm4                     // original bytes again
1156         punpcklbw   mm2,    mm0                     // interleave: s0, avg0, s1, avg1, s2, avg2, s3, avg3
1157
1158         movq        [edi],  mm2                     // first 8 output bytes
1159         punpckhbw   mm4,    mm0                     // interleave: s4, avg4, ... s7, avg7
1160
1161         movq        [edi+8], mm4                    // next 8 output bytes
1162         add         esi,    8
1163
1164         add         edi,    16
1165         sub         ecx,    8
1166
1167         cmp         ecx,    8
1168         jg          hs_1_2_loop                     // repeat while more than 8 pixels remain
1169
1170 // last eight pixels: no byte past the group is read; s7 is paired with itself
1171
1172         movq        mm0,    [esi]                   // s[0..7]
1173         movq        mm1,    mm0
1174
1175         movq        mm2,    mm0                     // copy for the high half
1176         movq        mm3,    mm1
1177
1178         psrlq       mm1,    8                       // s1..s7 moved down one byte, top byte = 0
1179         psrlq       mm3,    56                      // isolate s7 ...
1180
1181         psllq       mm3,    56                      // ... and put it back in the top byte
1182         por         mm1,    mm3                     // neighbours = s1..s7, s7
1183
1184         movq        mm3,    mm1                     // copy for the high half
1185         movq        mm4,    mm0                     // keep the original bytes for interleaving
1186
1187         punpcklbw   mm0,    mm7
1188         punpcklbw   mm1,    mm7
1189
1190         paddw       mm0,    mm1                     // low (a + b)
1191         paddw       mm0,    mm6                     // low (a + b + 1)
1192
1193         punpckhbw   mm2,    mm7
1194         punpckhbw   mm3,    mm7
1195
1196         paddw       mm2,    mm3                     // high (a + b)
1197         paddw       mm2,    mm6                     // high (a + b + 1)
1198
1199         psraw       mm0,    1
1200         psraw       mm2,    1
1201
1202         packuswb    mm0,    mm2                     // 8 averages as bytes
1203         movq        mm2,    mm4
1204
1205         punpcklbw   mm2,    mm0                     // interleave originals with averages (low half)
1206         movq        [edi],  mm2
1207
1208         punpckhbw   mm4,    mm0                     // interleave originals with averages (high half)
1209         movq        [edi+8], mm4
1210     }
1211 }
1212
1213
1214
1215
1216
1217 __declspec(align(16)) const static unsigned short const54_2[] = {  0,  64, 128, 192 };  /* 5:4 horizontal weights (x/256) applied to source pixels 1..4 */
1218 __declspec(align(16)) const static unsigned short const54_1[] = {256, 192, 128,  64 };  /* 5:4 horizontal weights (x/256) applied to source pixels 0..3 */
1219
1220
1221 /****************************************************************************
1222  *
1223  *  ROUTINE       : horizontal_line_5_4_scale_mmx
1224  *
1225  *  INPUTS        : const unsigned char *source : Pointer to source data.
1226  *                  unsigned int source_width    : Number of source pixels in the line (consumed 5 at a time).
1227  *                  unsigned char *dest         : Pointer to destination data (4 bytes written per 5 source pixels).
1228  *                  unsigned int dest_width      : Width of destination (NOT USED).
1229  *
1230  *  OUTPUTS       : None.
1231  *
1232  *  RETURNS       : void
1233  *
1234  *  FUNCTION      : Copies horizontal line of pixels from source to
1235  *                  destination scaling down from 5 to 4.
1236  *
1237  *  SPECIAL NOTES : Each pass loads 8 bytes but only uses the first 5, so up to 3 bytes past
1238  *                  source + source_width are read. The loop is bottom-tested (runs at least once).
1239  ****************************************************************************/
1240 static
1241 void horizontal_line_5_4_scale_mmx
1242 (
1243     const unsigned char *source,
1244     unsigned int source_width,
1245     unsigned char *dest,
1246     unsigned int dest_width
1247 )
1248 {
1249     /* Reference C version of the MMX loop below:
1250     unsigned i;
1251     unsigned int a, b, c, d, e;
1252     unsigned char *des = dest;
1253     const unsigned char *src = source;
1254
1255     (void) dest_width;
1256
1257     for ( i=0; i<source_width; i+=5 )
1258     {
1259         a = src[0];
1260         b = src[1];
1261         c = src[2];
1262         d = src[3];
1263         e = src[4];
1264
1265         des[0] = a;
1266         des[1] = ((b*192 + c* 64 + 128)>>8);
1267         des[2] = ((c*128 + d*128 + 128)>>8);
1268         des[3] = ((d* 64 + e*192 + 128)>>8);
1269
1270         src += 5;
1271         des += 4;
1272     }
1273     */
1274     (void) dest_width;
1275
1276     __asm
1277     {
1278
1279         mov         esi,        source              // esi = read pointer
1280         mov         edi,        dest                // edi = write pointer
1281
1282         mov         ecx,        source_width         // ecx = source pixels in the line
1283         movq        mm5,        const54_1           // weights for pixels 0..3
1284
1285         pxor        mm7,        mm7                 // mm7 = 0, used to widen bytes to words
1286         movq        mm6,        const54_2           // weights for pixels 1..4
1287
1288         movq        mm4,        round_values         // +128 rounding term
1289         lea         edx,        [esi+ecx]           // edx = end of the source line
1290         horizontal_line_5_4_loop:
1291
1292         movq        mm0,        QWORD PTR  [esi]    // load 8 source bytes (only 0..4 are used)
1293         // mm0: 00 01 02 03 04 05 06 07
1294         movq        mm1,        mm0                 // second copy for the unshifted taps
1295         // mm1: 00 01 02 03 04 05 06 07
1296
1297         psrlq       mm0,        8                   // drop byte 0 so the taps start at pixel 1
1298         // mm0: 01 02 03 04 05 06 07 xx
1299         punpcklbw   mm1,        mm7                 // pixels 0..3 as words
1300         // mm1: xx 00 xx 01 xx 02 xx 03
1301
1302         punpcklbw   mm0,        mm7                 // pixels 1..4 as words
1303         // mm0: xx 01 xx 02 xx 03 xx 04
1304         pmullw      mm1,        mm5                 // pixels 0..3 * {256, 192, 128, 64}
1305
1306         pmullw      mm0,        mm6                 // pixels 1..4 * {0, 64, 128, 192}
1307         add         esi,        5                   // consumed 5 source pixels
1308
1309         add         edi,        4                   // produced 4 destination pixels
1310         paddw       mm1,        mm0                 // weighted sums
1311
1312         paddw       mm1,        mm4                 // + 128
1313         psrlw       mm1,        8                   // / 256
1314
1315         cmp         esi,        edx                 // flags survive the MMX instructions below
1316         packuswb    mm1,        mm7                 // 4 results as bytes
1317
1318         movd        DWORD PTR [edi-4], mm1          // edi was already advanced, hence -4
1319
1320         jl          horizontal_line_5_4_loop        // loop while esi is below the end of the line
1321
1322     }
1323
1324 }
1325 __declspec(align(16)) const static unsigned short one_fourths[]   = {  64,  64,  64, 64  };  /* 1/4 as x/256, one copy per word lane */
1326 __declspec(align(16)) const static unsigned short two_fourths[]   = { 128, 128, 128, 128 };  /* 2/4 as x/256 */
1327 __declspec(align(16)) const static unsigned short three_fourths[] = { 192, 192, 192, 192 };  /* 3/4 as x/256 */
1328
/****************************************************************************
 *
 *  ROUTINE       : vertical_band_5_4_scale_mmx
 *
 *  INPUTS        : unsigned char *source  : First of five consecutive source rows.
 *                  unsigned int src_pitch  : Byte offset between source rows.
 *                  unsigned char *dest    : First of four destination rows.
 *                  unsigned int dest_pitch : Byte offset between destination rows.
 *                  unsigned int dest_width : Pixels per row to process (4 per loop pass).
 *
 *  RETURNS       : void
 *
 *  FUNCTION      : 5 to 4 vertical down-scaling. With a..e = source rows 0..4:
 *                  des[0] = a
 *                  des[1] = (b*192 + c* 64 + 128) >> 8
 *                  des[2] = (c*128 + d*128 + 128) >> 8
 *                  des[3] = (d* 64 + e*192 + 128) >> 8
 *
 *  SPECIAL NOTES : ebx is saved and restored by hand; eax is used as a scratch
 *                  pointer. The loop is bottom-tested (runs at least once).
 *
 ****************************************************************************/
1329 static
1330 void vertical_band_5_4_scale_mmx(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
1331 {
1332
1333     __asm
1334     {
1335         push        ebx
1336
1337         mov         esi,    source                    // esi = source row 0
1338         mov         ecx,    src_pitch               // ecx = source pitch
1339
1340         mov         edi,    dest                    // edi = destination row 0
1341         pxor        mm7,    mm7                     // mm7 = 0, used to widen bytes to words
1342
1343         mov         edx,    dest_pitch               // edx = destination pitch
1344         mov         ebx,    dest_width               // ebx = pixels left (loop counter)
1345
1346         vs_5_4_loop:
1347
1348         movd        mm0,    DWORD ptr [esi]         // a: 4 pixels of source row 0 (copied through unchanged)
1349         movd        mm1,    DWORD ptr [esi+ecx]     // b: source row 1
1350
1351         movd        mm2,    DWORD ptr [esi+ecx*2]   // c: source row 2
1352         lea         eax,    [esi+ecx*2]             // eax = address of row 2, base for rows 3 and 4
1353
1354         punpcklbw   mm1,    mm7                     // b as words
1355         punpcklbw   mm2,    mm7                     // c as words
1356
1357         movq        mm3,    mm2                     // second copy of c
1358         pmullw      mm1,    three_fourths           // b * 3/4
1359
1360         pmullw      mm2,    one_fourths             // c * 1/4
1361         movd        mm4,    [eax+ecx]               // d: source row 3
1362
1363         pmullw      mm3,    two_fourths             // c * 2/4
1364         punpcklbw   mm4,    mm7                     // d as words
1365
1366         movq        mm5,    mm4                     // second copy of d
1367         pmullw      mm4,    two_fourths             // d * 2/4
1368
1369         paddw       mm1,    mm2                     // b * 3/4 + c * 1/4
1370         movd        mm6,    [eax+ecx*2]             // e: source row 4
1371
1372         pmullw      mm5,    one_fourths             // d * 1/4
1373         paddw       mm1,    round_values;
1374
1375         paddw       mm3,    mm4                     // c * 2/4 + d * 2/4
1376         psrlw       mm1,    8                       // des[1] as words
1377
1378         punpcklbw   mm6,    mm7                     // e as words
1379         paddw       mm3,    round_values            // + 128
1380
1381         pmullw      mm6,    three_fourths           // e * 3/4
1382         psrlw       mm3,    8                       // des[2] as words
1383
1384         packuswb    mm1,    mm7                     // des[1] as bytes
1385         packuswb    mm3,    mm7                     // des[2] as bytes
1386
1387         movd        DWORD PTR [edi], mm0            // write des[0] = a
1388         movd        DWORD PTR [edi+edx], mm1        // write des[1]
1389
1390
1391         paddw       mm5,    mm6                     // d * 1/4 + e * 3/4
1392         movd        DWORD PTR [edi+edx*2], mm3      // write des[2]
1393
1394         lea         eax,    [edi+edx*2]             // eax = destination row 2, taken before edi advances
1395         paddw       mm5,    round_values            // + 128
1396
1397         psrlw       mm5,    8                       // des[3] as words
1398         add         edi,    4                       // next 4 destination columns
1399
1400         packuswb    mm5,    mm7                     // des[3] as bytes
1401         movd        DWORD PTR [eax+edx], mm5        // write des[3] (row 2 + one pitch)
1402
1403         add         esi,    4                       // next 4 source columns
1404         sub         ebx,    4
1405
1406         jg         vs_5_4_loop
1407
1408         pop         ebx
1409     }
1410 }
1411
1412
1413 __declspec(align(16)) const static unsigned short const53_1[] = {  0,  85, 171, 0 };  /* 5:3 horizontal weights (x/256) applied to the lanes 0, s1, s3, s5 */
1414 __declspec(align(16)) const static unsigned short const53_2[] = {256, 171,  85, 0 };  /* 5:3 horizontal weights (x/256) applied to the lanes s0, s2, s4, s6 */
1415
1416
/****************************************************************************
 *
 *  ROUTINE       : horizontal_line_5_3_scale_mmx
 *
 *  INPUTS        : const unsigned char *source : Pointer to source data.
 *                  unsigned int source_width    : Number of source pixels in the line (consumed 5 at a time).
 *                  unsigned char *dest         : Pointer to destination data (3 bytes produced per 5 source pixels).
 *                  unsigned int dest_width      : NOT USED.
 *
 *  RETURNS       : void
 *
 *  FUNCTION      : 5 to 3 horizontal down-scaling. For source pixels s0..s4:
 *                  des[0] = s0
 *                  des[1] = (s1* 85 + s2*171 + 128) >> 8
 *                  des[2] = (s3*171 + s4* 85 + 128) >> 8
 *
 *  SPECIAL NOTES : The loop stores 4 bytes per group (the 4th is always 0 and is
 *                  overwritten by the next group); the final group is handled
 *                  separately and stores exactly 3 bytes. Every group loads 8
 *                  source bytes, so up to 3 bytes past the line are read. The loop
 *                  is bottom-tested, so at least two groups are always processed.
 *                  NOTE(review): callers presumably pass source_width >= 10 and a
 *                  multiple of 5 - confirm.
 *
 ****************************************************************************/
1417 static
1418 void horizontal_line_5_3_scale_mmx
1419 (
1420     const unsigned char *source,
1421     unsigned int source_width,
1422     unsigned char *dest,
1423     unsigned int dest_width
1424 )
1425 {
1426
1427     (void) dest_width;
1428     __asm
1429     {
1430
1431         mov         esi,        source              // esi = read pointer
1432         mov         edi,        dest                // edi = write pointer
1433
1434         mov         ecx,        source_width         // ecx = source pixels in the line
1435         movq        mm5,        const53_1           // weights for the odd-indexed pixels
1436
1437         pxor        mm7,        mm7                 // mm7 = 0, used when packing
1438         movq        mm6,        const53_2           // weights for the even-indexed pixels
1439
1440         movq        mm4,        round_values         // +128 rounding term
1441         lea         edx,        [esi+ecx-5]         // edx = start of the last 5-pixel group
1442         horizontal_line_5_3_loop:
1443
1444         movq        mm0,        QWORD PTR  [esi]    // load 8 source bytes (only 0..4 contribute)
1445         // mm0: 00 01 02 03 04 05 06 07
1446         movq        mm1,        mm0                 // second copy for the odd-indexed pixels
1447         // mm1: 00 01 02 03 04 05 06 07
1448
1449         psllw       mm0,        8                   // discard the odd bytes of each word
1450         // mm0: xx 00 xx 02 xx 04 xx 06
1451         psrlw       mm1,        8                   // odd-indexed pixels as words
1452         // mm1: 01 xx 03 xx 05 xx 07 xx
1453
1454         psrlw       mm0,        8                   // even-indexed pixels as words
1455         // mm0: 00 xx 02 xx 04 xx 06 xx
1456         psllq       mm1,        16                  // shift up one word lane: 0, s1, s3, s5
1457         // mm1: xx xx 01 xx 03 xx 05 xx
1458
1459         pmullw      mm0,        mm6                 // {s0, s2, s4, s6} * {256, 171, 85, 0}
1460
1461         pmullw      mm1,        mm5                 // {0, s1, s3, s5} * {0, 85, 171, 0}
1462         add         esi,        5                   // consumed 5 source pixels
1463
1464         add         edi,        3                   // produced 3 destination pixels
1465         paddw       mm1,        mm0                 // weighted sums
1466
1467         paddw       mm1,        mm4                 // + 128
1468         psrlw       mm1,        8                   // / 256
1469
1470         cmp         esi,        edx                 // flags survive the MMX instructions below
1471         packuswb    mm1,        mm7                 // results as bytes (4th byte is 0)
1472
1473         movd        DWORD PTR [edi-3], mm1          // 4-byte store; the extra byte is overwritten by the next group
1474         jl          horizontal_line_5_3_loop        // loop while esi is before the last group
1475
1476 // last group: same arithmetic, but store exactly 3 bytes
1477         movq        mm0,        QWORD PTR  [esi]    // load 8 source bytes
1478         // mm0: 00 01 02 03 04 05 06 07
1479         movq        mm1,        mm0                 // second copy for the odd-indexed pixels
1480         // mm1: 00 01 02 03 04 05 06 07
1481
1482         psllw       mm0,        8                   // discard the odd bytes of each word
1483         // mm0: xx 00 xx 02 xx 04 xx 06
1484         psrlw       mm1,        8                   // odd-indexed pixels as words
1485         // mm1: 01 xx 03 xx 05 xx 07 xx
1486
1487         psrlw       mm0,        8                   // even-indexed pixels as words
1488         // mm0: 00 xx 02 xx 04 xx 06 xx
1489         psllq       mm1,        16                  // shift up one word lane: 0, s1, s3, s5
1490         // mm1: xx xx 01 xx 03 xx 05 xx
1491
1492         pmullw      mm0,        mm6
1493
1494         pmullw      mm1,        mm5
1495         paddw       mm1,        mm0
1496
1497         paddw       mm1,        mm4
1498         psrlw       mm1,        8
1499
1500         packuswb    mm1,        mm7
1501         movd        eax,        mm1                 // eax = des[0] | des[1] << 8 | des[2] << 16
1502
1503         mov         edx,        eax
1504         shr         edx,        16                  // dl = des[2]
1505
1506         mov         WORD PTR[edi],   ax             // write des[0], des[1]
1507         mov         BYTE PTR[edi+2], dl             // write des[2]
1508
1509     }
1510
1511 }
1512
1513 __declspec(align(16)) const static unsigned short one_thirds[] = {  85,  85,  85,  85 };  /* roughly 1/3 as x/256, one copy per word lane */
1514 __declspec(align(16)) const static unsigned short two_thirds[] = { 171, 171, 171, 171 };  /* roughly 2/3 as x/256; 85 + 171 = 256 */
1515
/****************************************************************************
 *
 *  ROUTINE       : vertical_band_5_3_scale_mmx
 *
 *  INPUTS        : unsigned char *source  : First of five consecutive source rows.
 *                  unsigned int src_pitch  : Byte offset between source rows.
 *                  unsigned char *dest    : First of three destination rows.
 *                  unsigned int dest_pitch : Byte offset between destination rows.
 *                  unsigned int dest_width : Pixels per row to process (4 per loop pass).
 *
 *  RETURNS       : void
 *
 *  FUNCTION      : 5 to 3 vertical down-scaling. With a..e = source rows 0..4:
 *                  des[0] = a
 *                  des[1] = (b* 85 + c*171 + 128) >> 8
 *                  des[2] = (d*171 + e* 85 + 128) >> 8
 *
 *  SPECIAL NOTES : ebx is saved and restored by hand; eax is used as a scratch
 *                  pointer. The loop is bottom-tested (runs at least once).
 *
 ****************************************************************************/
1516 static
1517 void vertical_band_5_3_scale_mmx(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
1518 {
1519
1520     __asm
1521     {
1522         push        ebx
1523
1524         mov         esi,    source                    // esi = source row 0
1525         mov         ecx,    src_pitch               // ecx = source pitch
1526
1527         mov         edi,    dest                    // edi = destination row 0
1528         pxor        mm7,    mm7                     // mm7 = 0, used to widen bytes to words
1529
1530         mov         edx,    dest_pitch               // edx = destination pitch
1531         movq        mm5,    one_thirds              // mm5 = 85 in each word lane
1532
1533         movq        mm6,    two_thirds              // mm6 = 171 in each word lane
1534         mov         ebx,    dest_width;
1535
1536         vs_5_3_loop:
1537
1538         movd        mm0,    DWORD ptr [esi]         // a: 4 pixels of source row 0 (copied through unchanged)
1539         movd        mm1,    DWORD ptr [esi+ecx]     // b: source row 1
1540
1541         movd        mm2,    DWORD ptr [esi+ecx*2]   // c: source row 2
1542         lea         eax,    [esi+ecx*2]             // eax = address of row 2, base for rows 3 and 4
1543
1544         punpcklbw   mm1,    mm7                     // b as words
1545         punpcklbw   mm2,    mm7                     // c as words
1546
1547         pmullw      mm1,    mm5                     // b * 1/3
1548         pmullw      mm2,    mm6                     // c * 2/3
1549
1550         movd        mm3,    DWORD ptr [eax+ecx]     // d: source row 3
1551         movd        mm4,    DWORD ptr [eax+ecx*2]   // e: source row 4
1552
1553         punpcklbw   mm3,    mm7                     // d as words
1554         punpcklbw   mm4,    mm7                     // e as words
1555
1556         pmullw      mm3,    mm6                     // d * 2/3
1557         pmullw      mm4,    mm5                     // e * 1/3
1558
1559
1560         movd        DWORD PTR [edi], mm0            // write des[0] = a
1561         paddw       mm1,    mm2                     // b * 1/3 + c * 2/3
1562
1563         paddw       mm1,    round_values            // + 128
1564         psrlw       mm1,    8                       // des[1] as words
1565
1566         packuswb    mm1,    mm7                     // des[1] as bytes
1567         paddw       mm3,    mm4                     // d * 2/3 + e * 1/3
1568
1569         paddw       mm3,    round_values            // + 128
1570         movd        DWORD PTR [edi+edx], mm1        // write des[1]
1571
1572         psrlw       mm3,    8                       // des[2] as words
1573         packuswb    mm3,    mm7                     // des[2] as bytes
1574
1575         movd        DWORD PTR [edi+edx*2], mm3      // write des[2]
1576
1577
1578         add         edi,    4                       // next 4 destination columns
1579         add         esi,    4                       // next 4 source columns
1580
1581         sub         ebx,    4
1582         jg          vs_5_3_loop
1583
1584         pop         ebx
1585     }
1586 }
1587
1588
1589
1590
1591 /****************************************************************************
1592  *
1593  *  ROUTINE       : horizontal_line_2_1_scale_mmx
1594  *
1595  *  INPUTS        : const unsigned char *source : Source line; 8 bytes are read per loop pass.
1596  *                  unsigned int source_width    : NOT USED.
1597  *                  unsigned char *dest         : Destination line; 4 bytes are written per loop pass.
1598  *                  unsigned int dest_width      : Number of destination pixels (this is the loop bound).
1599  *
1600  *  OUTPUTS       : None.
1601  *
1602  *  RETURNS       : void
1603  *
1604  *  FUNCTION      : 2 to 1 down-scaling of a horizontal line of pixels: every
1605  *                  even-indexed source byte is kept, the odd ones are dropped (no filtering).
1606  *  SPECIAL NOTES : dest_width is cast to void below but is in fact read by the asm block.
1607  *
1608  ****************************************************************************/
1609 static
1610 void horizontal_line_2_1_scale_mmx
1611 (
1612     const unsigned char *source,
1613     unsigned int source_width,
1614     unsigned char *dest,
1615     unsigned int dest_width
1616 )
1617 {
1618     (void) dest_width;
1619     (void) source_width;
1620     __asm
1621     {
1622         mov         esi,    source
1623         mov         edi,    dest
1624
1625         pxor        mm7,    mm7                     // mm7 = 0, upper half when packing
1626         mov         ecx,    dest_width              // ecx = number of output pixels
1627
1628         xor         edx,    edx                     // edx = output index, starts at 0
1629         hs_2_1_loop:
1630
1631         movq        mm0,    [esi+edx*2]             // 8 source bytes at twice the output index
1632         psllw       mm0,    8                       // push the odd byte out of each word ...
1633
1634         psrlw       mm0,    8                       // ... leaving s0, s2, s4, s6 as words
1635         packuswb    mm0,    mm7                     // 4 kept pixels as bytes
1636
1637         movd        DWORD Ptr [edi+edx], mm0;
1638         add         edx,    4                       // 4 output pixels done
1639
1640         cmp         edx,    ecx
1641         jl          hs_2_1_loop                     // bottom-tested, signed compare against dest_width
1642
1643     }
1644 }
1645
1646
1647
/*
 * vertical_band_2_1_scale_mmx: non-interpolating 2:1 vertical scale step.
 * Copies dest_width bytes of one source line straight to dest; neither pitch
 * is needed for that, so both are ignored.
 */
1648 static
1649 void vertical_band_2_1_scale_mmx(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
1650 {
1651     /* Only a single line is touched: silence the unused-parameter warnings. */
1652     (void) src_pitch, (void) dest_pitch;
1653     vpx_memcpy(dest, source, dest_width);
1654 }
1655
1656
1657 __declspec(align(16)) const static unsigned short three_sixteenths[] = {  48,  48,  48,  48 };  /* 3/16 as x/256: weight of the lines above and below */
1658 __declspec(align(16)) const static unsigned short ten_sixteenths[]   = { 160, 160, 160, 160 };  /* 10/16 as x/256: weight of the centre line; 48 + 160 + 48 = 256 */
1659
/****************************************************************************
 *
 *  ROUTINE       : vertical_band_2_1_scale_i_mmx
 *
 *  INPUTS        : unsigned char *source  : Centre source line.
 *                  unsigned int src_pitch  : Byte offset between source lines.
 *                  unsigned char *dest    : Destination line.
 *                  unsigned int dest_pitch : NOT USED.
 *                  unsigned int dest_width : Pixels to produce (4 per loop pass).
 *
 *  RETURNS       : void
 *
 *  FUNCTION      : Filtered 2:1 vertical scale step. Each output pixel is
 *                  (above*48 + centre*160 + below*48 + 128) >> 8.
 *
 *  SPECIAL NOTES : The line at source - src_pitch is read, so a line above
 *                  source must exist. NOTE(review): presumably the caller
 *                  guarantees this (e.g. via a border) - confirm.
 *                  mm6 is loaded with round_values but the loop adds
 *                  round_values from memory instead.
 *
 ****************************************************************************/
1660 static
1661 void vertical_band_2_1_scale_i_mmx(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
1662 {
1663
1664     (void) dest_pitch;
1665     __asm
1666     {
1667         mov         esi,        source
1668         mov         edi,        dest
1669
1670         mov         eax,        src_pitch
1671         mov         edx,        dest_width
1672
1673         pxor        mm7,        mm7             // mm7 = 0, used to widen bytes to words
1674         sub         esi,        eax             // back one line: esi now points at the line above source
1675
1676
1677         lea         ecx,        [esi+edx];      // ecx = end of the line above (loop bound)
1678         movq        mm6,        round_values;   // loaded but not used by the loop below
1679
1680         movq        mm5,        three_sixteenths;   // weight of the outer lines
1681         movq        mm4,        ten_sixteenths;     // weight of the centre line
1682
1683         vs_2_1_i_loop:
1684         movd        mm0,        [esi]           // 4 pixels of the line above
1685         movd        mm1,        [esi+eax]       // 4 pixels of the centre line
1686
1687         movd        mm2,        [esi+eax*2]     // 4 pixels of the line below
1688         punpcklbw   mm0,        mm7
1689
1690         pmullw      mm0,        mm5             // above * 3/16
1691         punpcklbw   mm1,        mm7
1692
1693         pmullw      mm1,        mm4             // centre * 10/16
1694         punpcklbw   mm2,        mm7
1695
1696         pmullw      mm2,        mm5             // below * 3/16
1697         paddw       mm0,        round_values    // + 128
1698
1699         paddw       mm1,        mm2
1700         paddw       mm0,        mm1             // sum of the three weighted lines
1701
1702         psrlw       mm0,        8               // / 256
1703         packuswb    mm0,        mm7             // 4 results as bytes
1704
1705         movd        DWORD PTR [edi],        mm0
1706         add         esi,        4
1707
1708         add         edi,        4;
1709         cmp         esi,        ecx
1710         jl          vs_2_1_i_loop               // bottom-tested, signed compare on the pointers
1711
1712     }
1713 }
1714
1715
1716
/*
 * register_mmxscalers: points the vp8_* scaler function pointers at the MMX
 * routines of this file, and at the vp8cx_*_c routines for the 3:4 and 2:3
 * ratios, which have no MMX version here.
 */
1717 void
1718 register_mmxscalers(void)
1719 {
1720     /* horizontal (per line) scalers with an MMX implementation */
1721     vp8_horizontal_line_1_2_scale = horizontal_line_1_2_scale_mmx;
1722     vp8_horizontal_line_3_5_scale = horizontal_line_3_5_scale_mmx;
1723     vp8_horizontal_line_4_5_scale = horizontal_line_4_5_scale_mmx;
1724     vp8_horizontal_line_2_1_scale = horizontal_line_2_1_scale_mmx;
1725     vp8_horizontal_line_5_3_scale = horizontal_line_5_3_scale_mmx;
1726     vp8_horizontal_line_5_4_scale = horizontal_line_5_4_scale_mmx;
1727
1728     /* vertical (per band) scalers with an MMX implementation */
1729     vp8_vertical_band_1_2_scale = vertical_band_1_2_scale_mmx;
1730     vp8_last_vertical_band_1_2_scale = last_vertical_band_1_2_scale_mmx;
1731     vp8_vertical_band_3_5_scale = vertical_band_3_5_scale_mmx;
1732     vp8_last_vertical_band_3_5_scale = last_vertical_band_3_5_scale_mmx;
1733     vp8_vertical_band_4_5_scale = vertical_band_4_5_scale_mmx;
1734     vp8_last_vertical_band_4_5_scale = last_vertical_band_4_5_scale_mmx;
1735     vp8_vertical_band_5_4_scale = vertical_band_5_4_scale_mmx;
1736     vp8_vertical_band_5_3_scale = vertical_band_5_3_scale_mmx;
1737     vp8_vertical_band_2_1_scale = vertical_band_2_1_scale_mmx;
1738     vp8_vertical_band_2_1_scale_i = vertical_band_2_1_scale_i_mmx;
1739
1740     /* 3:4 and 2:3 are served by the plain C routines */
1741     vp8_horizontal_line_3_4_scale = vp8cx_horizontal_line_3_4_scale_c;
1742     vp8_vertical_band_3_4_scale = vp8cx_vertical_band_3_4_scale_c;
1743     vp8_last_vertical_band_3_4_scale = vp8cx_last_vertical_band_3_4_scale_c;
1744     vp8_horizontal_line_2_3_scale = vp8cx_horizontal_line_2_3_scale_c;
1745     vp8_vertical_band_2_3_scale = vp8cx_vertical_band_2_3_scale_c;
1746     vp8_last_vertical_band_2_3_scale = vp8cx_last_vertical_band_2_3_scale_c;
1747
1748
1749
1750 }