vpx_scale/x86_64/scaleopt.c

   1 /*
   2  *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
   3  *
   4  *  Use of this source code is governed by a BSD-style license and patent
   5  *  grant that can be found in the LICENSE file in the root of the source
   6  *  tree. All contributing project authors may be found in the AUTHORS
   7  *  file in the root of the source tree.
   8  */
   9
  10
  11 /****************************************************************************
  12 *
  13 *   Module Title :     scaleopt.c
  14 *
  15 *   Description  :     Optimized scaling functions
  16 *
  17 ****************************************************************************/
  18 #include "pragmas.h"
  19
  20
  21
  22 /****************************************************************************
  23 *  Module Statics
     *
     *  Weights are fractions of 256 held in four 16-bit lanes: 51 ~ 1/5,
     *  102 ~ 2/5, 154 ~ 3/5, 205 ~ 4/5.  The scaling routines multiply pixels
     *  by two weights that sum to 256, add round_values and shift right by 8.
  24 ****************************************************************************/
  25 __declspec(align(16)) const static unsigned short one_fifth[]  = { 51, 51, 51, 51 };  // 51/256 ~ 1/5 in every lane
  26 __declspec(align(16)) const static unsigned short two_fifths[] = { 102, 102, 102, 102 };  // 102/256 ~ 2/5
  27 __declspec(align(16)) const static unsigned short three_fifths[] = { 154, 154, 154, 154 };  // 154/256 ~ 3/5
  28 __declspec(align(16)) const static unsigned short four_fifths[] = { 205, 205, 205, 205 };  // 205/256 ~ 4/5
  29 __declspec(align(16)) const static unsigned short round_values[] = { 128, 128, 128, 128 };  // half of 256, added before the shift by 8
  30 __declspec(align(16)) const static unsigned short four_ones[] = { 1, 1, 1, 1};  // 1 in every lane
  31 __declspec(align(16)) const static unsigned short const45_2[] = {205, 154, 102,  51 };  // 4 to 5 horizontal: weights for source pixels 1,2,3,4
  32 __declspec(align(16)) const static unsigned short const45_1[] = { 51, 102, 154, 205 };  // 4 to 5 horizontal: weights for source pixels 0,1,2,3
  33 __declspec(align(16)) const static unsigned char  mask45[] = { 0, 0, 0, 0, 0, 0, 255, 0};  // keeps byte 6 only; used to repeat the last pixel of a line
  34 __declspec(align(16)) const static unsigned short const35_2[] = { 154,  51, 205, 102 };  // 3 to 5 horizontal: weights for source pixels 1,2,2,3
  35 __declspec(align(16)) const static unsigned short const35_1[] = { 102, 205,  51, 154 };  // 3 to 5 horizontal: weights for source pixels 0,1,1,2
  36
  37
  38
  39 #include "vpx_scale/vpxscale.h"
  40 #include "vpx_mem/vpx_mem.h"
  41
  42 /****************************************************************************
  43 *
  44 *  ROUTINE       : horizontal_line_3_5_scale_mmx
  45 *
  46 *  INPUTS        : const unsigned char *source : First byte of the line to scale.
  47 *                  unsigned int source_width    : Length of the source line in bytes
     *                                                 (presumably a multiple of 3 -
     *                                                 confirm with callers).
  48 *                  unsigned char *dest         : First byte of the scaled line.
  49 *                  unsigned int dest_width      : Not used.
  50 *
  51 *  OUTPUTS       : Five bytes are stored at dest for every three source bytes.
  52 *
  53 *  RETURNS       : void
  54 *
  55 *  FUNCTION      : 3 to 5 up-scaling of a horizontal line of pixels.
  56 *
  57 *  SPECIAL NOTES : With s0 s1 s2 the current group and s3 the first byte of
     *                  the next group, the five outputs are
     *                      s0
     *                      (102*s0 + 154*s1 + 128) >> 8
     *                      (205*s1 +  51*s2 + 128) >> 8
     *                      ( 51*s1 + 205*s2 + 128) >> 8
     *                      (154*s2 + 102*s3 + 128) >> 8
     *                  The group handled after the loop uses s2 in place of s3,
     *                  although it is still fetched with a 4-byte load.
     *                  The loop body runs before its exit test and one more
     *                  group always follows the loop, so at least 6 source
     *                  bytes are consumed.  No emms is issued by this routine.
  58 *
  59 ****************************************************************************/
  60 static
  61 void horizontal_line_3_5_scale_mmx
  62 (
  63     const unsigned char *source,
  64     unsigned int source_width,
  65     unsigned char *dest,
  66     unsigned int dest_width
  67 )
  68 {
  69     (void) dest_width;                     // parameter is intentionally ignored
  70
  71     __asm
  72     {
  73
  74         push        rbx                         // ebx is used as scratch below
  75
  76         mov         rsi,    source          // rsi = read pointer
  77         mov         rdi,    dest            // rdi = write pointer
  78
  79         mov         ecx,    source_width
  80         lea         rdx,    [rsi+rcx-3];    // rdx = source + source_width - 3 (loop bound)
  81
  82         movq        mm5,    const35_1       // mm5 = 66 xx cd xx 33 xx 9a xx
  83         movq        mm6,    const35_2       // mm6 = 9a xx 33 xx cd xx 66 xx
  84
  85         movq        mm4,    round_values     // mm4 = 80 xx 80 xx 80 xx 80 xx
  86         pxor        mm7,    mm7             // clear mm7
  87
  88         horiz_line_3_5_loop:
  89
  90         mov         eax,    DWORD PTR [rsi] // eax = 00 01 02 03
  91         mov         ebx,    eax
  92
  93         and         ebx,    0xffff00        // ebx = xx 01 02 xx
  94         mov         ecx,    eax             // ecx = 00 01 02 03
  95
  96         and         eax,    0xffff0000      // eax = xx xx 02 03
  97         xor         ecx,    eax             // ecx = 00 01 xx xx
  98
  99         shr         ebx,    8               // ebx = 01 02 xx xx
 100         or          eax,    ebx             // eax = 01 02 02 03
 101
 102         shl         ebx,    16              // ebx = xx xx 01 02
 103         movd        mm1,    eax             // mm1 = 01 02 02 03 xx xx xx xx
 104
 105         or          ebx,    ecx             // ebx = 00 01 01 02
 106         punpcklbw   mm1,    mm7             // mm1 = 01 xx 02 xx 02 xx 03 xx
 107
 108         movd        mm0,    ebx             // mm0 = 00 01 01 02
 109         pmullw      mm1,    mm6             // weight 01 02 02 03 by const35_2
 110
 111         punpcklbw   mm0,    mm7             // mm0 = 00 xx 01 xx 01 xx 02 xx
 112         pmullw      mm0,    mm5             // weight 00 01 01 02 by const35_1
 113
 114         mov         [rdi],  ebx             // write output 00; the 3 bytes after it are overwritten below
 115         add         rsi,    3               // next source group
 116
 117         add         rdi,    5               // next output group
 118         paddw       mm0,    mm1             // sum of the two weighted terms
 119
 120         paddw       mm0,    mm4             // + 128
 121         psrlw       mm0,    8               // >> 8
 122
 123         cmp         rsi,    rdx             // compared here, branched on after the store
 124         packuswb    mm0,    mm7             // words back to bytes
 125
 126         movd        DWORD Ptr [rdi-4], mm0  // outputs 1..4 of this group (rdi was already advanced by 5)
 127         jl          horiz_line_3_5_loop     // repeat while rsi < rdx (signed compare)
 128
 129 //Exit: one more group, with pixel 02 repeated in place of 03
 130         mov         eax,    DWORD PTR [rsi] // eax = 00 01 02 03
 131         mov         ebx,    eax
 132
 133         and         ebx,    0xffff00        // ebx = xx 01 02 xx
 134         mov         ecx,    eax             // ecx = 00 01 02 03
 135
 136         and         eax,    0xffff0000      // eax = xx xx 02 03
 137         xor         ecx,    eax             // ecx = 00 01 xx xx
 138
 139         shr         ebx,    8               // ebx = 01 02 xx xx
 140         or          eax,    ebx             // eax = 01 02 02 03
 141
 142         shl         eax,    8               // eax = xx 01 02 02
 143         and         eax,    0xffff0000      // eax = xx xx 02 02
 144
 145         or          eax,    ebx             // eax = 01 02 02 02
 146
 147         shl         ebx,    16              // ebx = xx xx 01 02
 148         movd        mm1,    eax             // mm1 = 01 02 02 02 xx xx xx xx
 149
 150         or          ebx,    ecx             // ebx = 00 01 01 02
 151         punpcklbw   mm1,    mm7             // mm1 = 01 xx 02 xx 02 xx 02 xx
 152
 153         movd        mm0,    ebx             // mm0 = 00 01 01 02
 154         pmullw      mm1,    mm6             // weight 01 02 02 02 by const35_2
 155
 156         punpcklbw   mm0,    mm7             // mm0 = 00 xx 01 xx 01 xx 02 xx
 157         pmullw      mm0,    mm5             // weight 00 01 01 02 by const35_1
 158
 159         mov         [rdi],  ebx             // write output 00; the 3 bytes after it are overwritten below
 160         paddw       mm0,    mm1             // sum of the two weighted terms
 161
 162         paddw       mm0,    mm4             // + 128
 163         psrlw       mm0,    8               // >> 8
 164
 165         packuswb    mm0,    mm7             // words back to bytes
 166         movd        DWORD Ptr [rdi+1], mm0  // outputs 1..4 of the last group
 167
 168         pop rbx                             // restore the register saved on entry
 169
 170     }
 171
 172 }
 173
 174
 175 /****************************************************************************
 176 *
 177 *  ROUTINE       : horizontal_line_4_5_scale_mmx
 178 *
 179 *  INPUTS        : const unsigned char *source : First byte of the line to scale.
 180 *                  unsigned int source_width    : Length of the source line in bytes
     *                                                 (presumably a multiple of 8 -
     *                                                 confirm with callers).
 181 *                  unsigned char *dest         : First byte of the scaled line.
 182 *                  unsigned int dest_width      : Not used.
 183 *
 184 *  OUTPUTS       : Ten bytes are stored at dest for every eight source bytes.
 185 *
 186 *  RETURNS       : void
 187 *
 188 *  FUNCTION      : 4 to 5 up-scaling of a horizontal line of pixels.
 189 *
 190 *  SPECIAL NOTES : Eight source bytes (two groups of four) are handled per
     *                  pass.  With s0..s3 one group and s4 the byte after it,
     *                  the five outputs of the group are
     *                      s0
     *                      ( 51*s0 + 205*s1 + 128) >> 8
     *                      (102*s1 + 154*s2 + 128) >> 8
     *                      (154*s2 + 102*s3 + 128) >> 8
     *                      (205*s3 +  51*s4 + 128) >> 8
     *                  Inside the loop 9 bytes are read per pass ([rsi] and
     *                  [rsi+1]).  The chunk handled after the loop reads only 8
     *                  bytes and repeats byte 07 in place of byte 08.
     *                  The loop body runs before its exit test and one more
     *                  chunk always follows the loop, so at least 16 source
     *                  bytes are consumed.  No emms is issued by this routine.
 191 *
 192 ****************************************************************************/
 193 static
 194 void horizontal_line_4_5_scale_mmx
 195 (
 196     const unsigned char *source,
 197     unsigned int source_width,
 198     unsigned char *dest,
 199     unsigned int dest_width
 200 )
 201 {
 202     (void)dest_width;                      // parameter is intentionally ignored
 203
 204     __asm
 205     {
 206
 207         mov         rsi,    source          // rsi = read pointer
 208         mov         rdi,    dest            // rdi = write pointer
 209
 210         mov         ecx,    source_width
 211         lea         rdx,    [rsi+rcx-8];    // rdx = source + source_width - 8 (loop bound)
 212
 213         movq        mm5,    const45_1       // mm5 = 33 xx 66 xx 9a xx cd xx
 214         movq        mm6,    const45_2       // mm6 = cd xx 9a xx 66 xx 33 xx
 215
 216         movq        mm4,    round_values     // mm4 = 80 xx 80 xx 80 xx 80 xx
 217         pxor        mm7,    mm7             // clear mm7
 218
 219         horiz_line_4_5_loop:
 220
 221         movq        mm0,    QWORD PTR [rsi]           // mm0 = 00 01 02 03 04 05 06 07
 222         movq        mm1,    QWORD PTR [rsi+1];        // mm1 = 01 02 03 04 05 06 07 08
 223
 224         movq        mm2,    mm0             // mm2 = 00 01 02 03 04 05 06 07
 225         movq        mm3,    mm1             // mm3 = 01 02 03 04 05 06 07 08
 226
 227         movd        DWORD PTR [rdi],  mm0             // write output 0 (= source 00); next 3 bytes are overwritten below
 228         punpcklbw   mm0,    mm7             // mm0 = 00 xx 01 xx 02 xx 03 xx
 229
 230         punpcklbw   mm1,    mm7             // mm1 = 01 xx 02 xx 03 xx 04 xx
 231         pmullw      mm0,    mm5             // 00* 51 01*102 02*154 03*205
 232
 233         pmullw      mm1,    mm6             // 01*205 02*154 03*102 04* 51
 234         punpckhbw   mm2,    mm7             // mm2 = 04 xx 05 xx 06 xx 07 xx
 235
 236         movd        DWORD PTR [rdi+5], mm2            // write output 5 (= source 04); next 3 bytes are overwritten below
 237         pmullw      mm2,    mm5             // 04* 51 05*102 06*154 07*205
 238
 239         punpckhbw   mm3,    mm7             // mm3 = 05 xx 06 xx 07 xx 08 xx
 240         pmullw      mm3,    mm6             // 05*205 06*154 07*102 08* 51
 241
 242         paddw       mm0,    mm1             // sum of the two weighted terms
 243         paddw       mm0,    mm4             // added round values
 244
 245         psrlw       mm0,    8               // output: 01 xx 02 xx 03 xx 04 xx
 246         packuswb    mm0,    mm7             // words back to bytes
 247
 248         movd        DWORD PTR [rdi+1], mm0  // write output 01 02 03 04
 249         add         rdi,    10              // next output chunk
 250
 251         add         rsi,    8               // next source chunk
 252         paddw       mm2,    mm3             // sum of the two weighted terms
 253
 254         paddw       mm2,    mm4             // added round values
 255         cmp         rsi,    rdx             // compared here, branched on after the store
 256
 257         psrlw       mm2,    8               // output: 06 xx 07 xx 08 xx 09 xx
 258         packuswb    mm2,    mm7             // words back to bytes
 259
 260         movd        DWORD PTR [rdi-4], mm2 // write output 06 07 08 09 (rdi was already advanced by 10)
 261         jl         horiz_line_4_5_loop     // repeat while rsi < rdx (signed compare)
 262
 263 //Exit: one more 8-byte chunk, with byte 07 repeated in place of byte 08
 264         movq        mm0,    [rsi]           // mm0 = 00 01 02 03 04 05 06 07
 265         movq        mm1,    mm0             // mm1 = 00 01 02 03 04 05 06 07
 266
 267         movq        mm2,    mm0             // mm2 = 00 01 02 03 04 05 06 07
 268         psrlq       mm1,    8               // mm1 = 01 02 03 04 05 06 07 00
 269
 270         movq        mm3,    mask45          // mm3 = 00 00 00 00 00 00 ff 00
 271         pand        mm3,    mm1             // mm3 = 00 00 00 00 00 00 07 00
 272
 273         psllq       mm3,    8               // mm3 = 00 00 00 00 00 00 00 07
 274         por         mm1,    mm3             // mm1 = 01 02 03 04 05 06 07 07
 275
 276         movq        mm3,    mm1             // mm3 = 01 02 03 04 05 06 07 07
 277
 278         movd        DWORD PTR [rdi],  mm0   // write output 0 (= source 00); next 3 bytes are overwritten below
 279         punpcklbw   mm0,    mm7             // mm0 = 00 xx 01 xx 02 xx 03 xx
 280
 281         punpcklbw   mm1,    mm7             // mm1 = 01 xx 02 xx 03 xx 04 xx
 282         pmullw      mm0,    mm5             // 00* 51 01*102 02*154 03*205
 283
 284         pmullw      mm1,    mm6             // 01*205 02*154 03*102 04* 51
 285         punpckhbw   mm2,    mm7             // mm2 = 04 xx 05 xx 06 xx 07 xx
 286
 287         movd        DWORD PTR [rdi+5], mm2  // write output 5 (= source 04); next 3 bytes are overwritten below
 288         pmullw      mm2,    mm5             // 04* 51 05*102 06*154 07*205
 289
 290         punpckhbw   mm3,    mm7             // mm3 = 05 xx 06 xx 07 xx 07 xx
 291         pmullw      mm3,    mm6             // 05*205 06*154 07*102 07* 51
 292
 293         paddw       mm0,    mm1             // sum of the two weighted terms
 294         paddw       mm0,    mm4             // added round values
 295
 296         psrlw       mm0,    8               // output: 01 xx 02 xx 03 xx 04 xx
 297         packuswb    mm0,    mm7             // 01 02 03 04 xx xx xx xx
 298
 299         movd        DWORD PTR [rdi+1], mm0  // write output 01 02 03 04
 300         paddw       mm2,    mm3             // sum of the two weighted terms
 301
 302         paddw       mm2,    mm4             // added round values
 303         psrlw       mm2,    8               // output: 06 xx 07 xx 08 xx 09 xx
 304
 305         packuswb    mm2,    mm7             // words back to bytes
 306         movd        DWORD PTR [rdi+6], mm2  // write output 06 07 08 09
 307
 308
 309     }
 310 }
 311
 312 /****************************************************************************
 313 *
 314 *  ROUTINE       : vertical_band_4_5_scale_mmx
 315 *
 316 *  INPUTS        : unsigned char *dest    : First line (row 0) of the band; the band is scaled in place.
 317 *                  unsigned int dest_pitch : Byte offset from one line to the next.
 318 *                  unsigned int dest_width : Number of columns to process, taken 8 at a time.
 319 *
 320 *  OUTPUTS       : Rows 1 to 4 of the band are written; row 0 is only read.
 321 *
 322 *  RETURNS       : void
 323 *
 324 *  FUNCTION      : 4 to 5 up-scaling of a 4 pixel high band of pixels.
 325 *
 326 *  SPECIAL NOTES : The routine uses the first line of the band below
 327 *                  the current band. The function also has a "C" only
 328 *                  version.
     *                  On entry rows 0..3 hold source lines a, b, c, d and
     *                  row 5 holds an, the first line of the next band.
     *                  Per column:
     *                      row 1 = ( 51*a + 205*b  + 128) >> 8
     *                      row 2 = (102*b + 154*c  + 128) >> 8
     *                      row 3 = (154*c + 102*d  + 128) >> 8
     *                      row 4 = (205*d +  51*an + 128) >> 8
     *                  The loop body runs before the counter test and the
     *                  counter drops by 8 per pass, so the width is in effect
     *                  rounded up to a multiple of 8 (at least 8 columns).
     *                  No emms is issued by this routine.
 329 *
 330 ****************************************************************************/
 331 static
 332 void vertical_band_4_5_scale_mmx
 333 (
 334     unsigned char *dest,
 335     unsigned int dest_pitch,
 336     unsigned int dest_width
 337 )
 338 {
 339     __asm
 340     {
 341
 342         mov         rsi,    dest                    // rsi = row 0 (source and destination share the buffer)
 343         mov         ecx,    dest_pitch               // Get the pitch size
 344
 345         lea         rdi,    [rsi+rcx*2]             // two lines below
 346         add         rdi,    rcx                     // three lines below: rdi = row 3
 347
 348         pxor        mm7,    mm7                     // clear out mm7
 349         mov         edx,    dest_width               // Loop counter
 350
 351         vs_4_5_loop:
 352
 353         movq        mm0,    QWORD ptr [rsi]         // src[0];
 354         movq        mm1,    QWORD ptr [rsi+rcx]     // src[1];
 355
 356         movq        mm2,    mm0                     // Make a copy
 357         punpcklbw   mm0,    mm7                     // unpack low to word
 358
 359         movq        mm5,    one_fifth                // mm5 = 1/5
 360         punpckhbw   mm2,    mm7                     // unpack high to word
 361
 362         pmullw      mm0,    mm5                     // a * 1/5
 363
 364         movq        mm3,    mm1                     // make a copy
 365         punpcklbw   mm1,    mm7                     // unpack low to word
 366
 367         pmullw      mm2,    mm5                     // a * 1/5
 368         movq        mm6,    four_fifths               // mm6 = 4/5
 369
 370         movq        mm4,    mm1                     // copy of low b
 371         pmullw      mm4,    mm6                     // b * 4/5
 372
 373         punpckhbw   mm3,    mm7                     // unpack high to word
 374         movq        mm5,    mm3                     // copy of high b
 375
 376         pmullw      mm5,    mm6                     // b * 4/5
 377         paddw       mm0,    mm4                     // a * 1/5 + b * 4/5
 378
 379         paddw       mm2,    mm5                     // a * 1/5 + b * 4/5
 380         paddw       mm0,    round_values             // + 128
 381
 382         paddw       mm2,    round_values             // + 128
 383         psrlw       mm0,    8                       // >> 8
 384
 385         psrlw       mm2,    8                       // >> 8
 386         packuswb    mm0,    mm2                     // des [1]
 387
 388         movq        QWORD ptr [rsi+rcx], mm0        // write des[1]; unpacked b is still held in mm1/mm3
 389         movq        mm0,    [rsi+rcx*2]             // mm0 = src[2]
 390
 391         // mm1, mm3 --- Src[1]
 392         // mm0 --- Src[2]
 393         // mm7 for unpacking
 394
 395         movq        mm5,    two_fifths               // mm5 = 2/5
 396         movq        mm2,    mm0                     // make a copy
 397
 398         pmullw      mm1,    mm5                     // b * 2/5
 399         movq        mm6,    three_fifths             // mm6 = 3/5
 400
 401
 402         punpcklbw   mm0,    mm7                     // unpack low to word
 403         pmullw      mm3,    mm5                     // b * 2/5
 404
 405         movq        mm4,    mm0                     // make copy of c
 406         punpckhbw   mm2,    mm7                     // unpack high to word
 407
 408         pmullw      mm4,    mm6                     // c * 3/5
 409         movq        mm5,    mm2                     // copy of high c
 410
 411         pmullw      mm5,    mm6                     // c * 3/5
 412         paddw       mm1,    mm4                     // b * 2/5 + c * 3/5
 413
 414         paddw       mm3,    mm5                     // b * 2/5 + c * 3/5
 415         paddw       mm1,    round_values             // + 128
 416
 417         paddw       mm3,    round_values             // + 128
 418         psrlw       mm1,    8                       // >> 8
 419
 420         psrlw       mm3,    8                       // >> 8
 421         packuswb    mm1,    mm3                     // des[2]
 422
 423         movq        QWORD ptr [rsi+rcx*2], mm1      // write des[2]; unpacked c is still held in mm0/mm2
 424         movq        mm1,    [rdi]                   // mm1=Src[3];
 425
 426         // mm0, mm2 --- Src[2]
 427         // mm1 --- Src[3]
 428         // mm6 --- 3/5
 429         // mm7 for unpacking
 430
 431         pmullw      mm0,    mm6                     // c * 3/5
 432         movq        mm5,    two_fifths               // mm5 = 2/5
 433
 434         movq        mm3,    mm1                     // make a copy
 435         pmullw      mm2,    mm6                     // c * 3/5
 436
 437         punpcklbw   mm1,    mm7                     // unpack low
 438         movq        mm4,    mm1                     // make a copy
 439
 440         punpckhbw   mm3,    mm7                     // unpack high
 441         pmullw      mm4,    mm5                     // d * 2/5
 442
 443         movq        mm6,    mm3                     // make a copy
 444         pmullw      mm6,    mm5                     // d * 2/5
 445
 446         paddw       mm0,    mm4                     // c * 3/5 + d * 2/5
 447         paddw       mm2,    mm6                     // c * 3/5 + d * 2/5
 448
 449         paddw       mm0,    round_values             // + 128
 450         paddw       mm2,    round_values             // + 128
 451
 452         psrlw       mm0,    8                       // >> 8
 453         psrlw       mm2,    8                       // >> 8
 454
 455         packuswb    mm0,    mm2                     // des[3]
 456         movq        QWORD ptr [rdi], mm0            // write des[3]; unpacked d is still held in mm1/mm3
 457
 458         //  mm1, mm3 --- Src[3]
 459         //  mm7 -- cleared for unpacking
 460
 461         movq        mm0,    [rdi+rcx*2]             // mm0, Src[0] of the next group (row 5)
 462
 463         movq        mm5,    four_fifths              // mm5 = 4/5
 464         pmullw      mm1,    mm5                     // d * 4/5
 465
 466         movq        mm6,    one_fifth                // mm6 = 1/5
 467         movq        mm2,    mm0                     // make a copy
 468
 469         pmullw      mm3,    mm5                     // d * 4/5
 470         punpcklbw   mm0,    mm7                     // unpack low
 471
 472         pmullw      mm0,    mm6                     // an * 1/5
 473         punpckhbw   mm2,    mm7                     // unpack high
 474
 475         paddw       mm1,    mm0                     // d * 4/5 + an * 1/5
 476         pmullw      mm2,    mm6                     // an * 1/5
 477
 478         paddw       mm3,    mm2                     // d * 4/5 + an * 1/5
 479         paddw       mm1,    round_values             // + 128
 480
 481         paddw       mm3,    round_values             // + 128
 482         psrlw       mm1,    8                       // >> 8
 483
 484         psrlw       mm3,    8                       // >> 8
 485         packuswb    mm1,    mm3                     // des[4]
 486
 487         movq        QWORD ptr [rdi+rcx], mm1        // write des[4] (row 4)
 488
 489         add         rdi,    8                       // next 8 columns
 490         add         rsi,    8
 491
 492         sub         rdx,    8                       // 8 columns done
 493         jg          vs_4_5_loop                     // repeat while columns remain
 494     }
 495 }
 496
 497 /****************************************************************************
 498 *
 499 *  ROUTINE       : last_vertical_band_4_5_scale_mmx
 500 *
 501 *  INPUTS        : unsigned char *dest    : First line (row 0) of the band; the band is scaled in place.
 502 *                  unsigned int dest_pitch : Byte offset from one line to the next.
 503 *                  unsigned int dest_width : Number of columns to process, taken 8 at a time.
 504 *
 505 *  OUTPUTS       : Rows 1 to 4 of the band are written; row 0 is only read.
 506 *
 507 *  RETURNS       : None
 508 *
 509 *  FUNCTION      : 4 to 5 up-scaling of the last 4-pixel high band in an image.
 510 *
 511 *  SPECIAL NOTES : No line below the band is read by this routine: row 4 is
 512 *                  a plain copy of source row 3. The function also has a
 513 *                  "C" only version.
     *                  On entry rows 0..3 hold source lines a, b, c, d.
     *                  Per column:
     *                      row 1 = ( 51*a + 205*b + 128) >> 8
     *                      row 2 = (102*b + 154*c + 128) >> 8
     *                      row 3 = (154*c + 102*d + 128) >> 8
     *                      row 4 = d
     *                  The loop body runs before the counter test and the
     *                  counter drops by 8 per pass, so the width is in effect
     *                  rounded up to a multiple of 8 (at least 8 columns).
     *                  No emms is issued by this routine.
 514 *
 515 ****************************************************************************/
 516 static
 517 void last_vertical_band_4_5_scale_mmx
 518 (
 519     unsigned char *dest,
 520     unsigned int dest_pitch,
 521     unsigned int dest_width
 522 )
 523 {
 524     __asm
 525     {
 526         mov         rsi,    dest                    // rsi = row 0 (source and destination share the buffer)
 527         mov         ecx,    dest_pitch               // Get the pitch size
 528
 529         lea         rdi,    [rsi+rcx*2]             // two lines below
 530         add         rdi,    rcx                     // three lines below: rdi = row 3
 531
 532         pxor        mm7,    mm7                     // clear out mm7
 533         mov         edx,    dest_width               // Loop counter
 534
 535         last_vs_4_5_loop:
 536
 537         movq        mm0,    QWORD ptr [rsi]         // src[0];
 538         movq        mm1,    QWORD ptr [rsi+rcx]     // src[1];
 539
 540         movq        mm2,    mm0                     // Make a copy
 541         punpcklbw   mm0,    mm7                     // unpack low to word
 542
 543         movq        mm5,    one_fifth                // mm5 = 1/5
 544         punpckhbw   mm2,    mm7                     // unpack high to word
 545
 546         pmullw      mm0,    mm5                     // a * 1/5
 547
 548         movq        mm3,    mm1                     // make a copy
 549         punpcklbw   mm1,    mm7                     // unpack low to word
 550
 551         pmullw      mm2,    mm5                     // a * 1/5
 552         movq        mm6,    four_fifths               // mm6 = 4/5
 553
 554         movq        mm4,    mm1                     // copy of low b
 555         pmullw      mm4,    mm6                     // b * 4/5
 556
 557         punpckhbw   mm3,    mm7                     // unpack high to word
 558         movq        mm5,    mm3                     // copy of high b
 559
 560         pmullw      mm5,    mm6                     // b * 4/5
 561         paddw       mm0,    mm4                     // a * 1/5 + b * 4/5
 562
 563         paddw       mm2,    mm5                     // a * 1/5 + b * 4/5
 564         paddw       mm0,    round_values             // + 128
 565
 566         paddw       mm2,    round_values             // + 128
 567         psrlw       mm0,    8                       // >> 8
 568
 569         psrlw       mm2,    8                       // >> 8
 570         packuswb    mm0,    mm2                     // des [1]
 571
 572         movq        QWORD ptr [rsi+rcx], mm0        // write des[1]; unpacked b is still held in mm1/mm3
 573         movq        mm0,    [rsi+rcx*2]             // mm0 = src[2]
 574
 575         // mm1, mm3 --- Src[1]
 576         // mm0 --- Src[2]
 577         // mm7 for unpacking
 578
 579         movq        mm5,    two_fifths               // mm5 = 2/5
 580         movq        mm2,    mm0                     // make a copy
 581
 582         pmullw      mm1,    mm5                     // b * 2/5
 583         movq        mm6,    three_fifths             // mm6 = 3/5
 584
 585
 586         punpcklbw   mm0,    mm7                     // unpack low to word
 587         pmullw      mm3,    mm5                     // b * 2/5
 588
 589         movq        mm4,    mm0                     // make copy of c
 590         punpckhbw   mm2,    mm7                     // unpack high to word
 591
 592         pmullw      mm4,    mm6                     // c * 3/5
 593         movq        mm5,    mm2                     // copy of high c
 594
 595         pmullw      mm5,    mm6                     // c * 3/5
 596         paddw       mm1,    mm4                     // b * 2/5 + c * 3/5
 597
 598         paddw       mm3,    mm5                     // b * 2/5 + c * 3/5
 599         paddw       mm1,    round_values             // + 128
 600
 601         paddw       mm3,    round_values             // + 128
 602         psrlw       mm1,    8                       // >> 8
 603
 604         psrlw       mm3,    8                       // >> 8
 605         packuswb    mm1,    mm3                     // des[2]
 606
 607         movq        QWORD ptr [rsi+rcx*2], mm1      // write des[2]; unpacked c is still held in mm0/mm2
 608         movq        mm1,    [rdi]                   // mm1=Src[3];
 609
 610         movq        QWORD ptr [rdi+rcx], mm1        // write des[4]: row 4 is an unmodified copy of Src[3]
 611
 612         // mm0, mm2 --- Src[2]
 613         // mm1 --- Src[3]
 614         // mm6 --- 3/5
 615         // mm7 for unpacking
 616
 617         pmullw      mm0,    mm6                     // c * 3/5
 618         movq        mm5,    two_fifths               // mm5 = 2/5
 619
 620         movq        mm3,    mm1                     // make a copy
 621         pmullw      mm2,    mm6                     // c * 3/5
 622
 623         punpcklbw   mm1,    mm7                     // unpack low
 624         movq        mm4,    mm1                     // make a copy
 625
 626         punpckhbw   mm3,    mm7                     // unpack high
 627         pmullw      mm4,    mm5                     // d * 2/5
 628
 629         movq        mm6,    mm3                     // make a copy
 630         pmullw      mm6,    mm5                     // d * 2/5
 631
 632         paddw       mm0,    mm4                     // c * 3/5 + d * 2/5
 633         paddw       mm2,    mm6                     // c * 3/5 + d * 2/5
 634
 635         paddw       mm0,    round_values             // + 128
 636         paddw       mm2,    round_values             // + 128
 637
 638         psrlw       mm0,    8                       // >> 8
 639         psrlw       mm2,    8                       // >> 8
 640
 641         packuswb    mm0,    mm2                     // des[3]
 642         movq        QWORD ptr [rdi], mm0            // write des[3]
 643
 644         //  mm1, mm3 --- Src[3]
 645         //  mm7 -- cleared for unpacking
 646         add         rdi,    8                       // next 8 columns
 647         add         rsi,    8
 648
 649         sub         rdx,    8                       // 8 columns done
 650         jg          last_vs_4_5_loop                // repeat while columns remain
 651     }
 652 }
 653
 654 /****************************************************************************
 655 *
 656 *  ROUTINE       : vertical_band_3_5_scale_mmx
 657 *
 658 *  INPUTS        : unsigned char *dest    : First line (row 0) of the band; the band is scaled in place.
 659 *                  unsigned int dest_pitch : Byte offset from one line to the next.
 660 *                  unsigned int dest_width : Number of columns to process, taken 8 at a time.
 661 *
 662 *  OUTPUTS       : Rows 1 to 4 of the band are written; row 0 is only read.
 663 *
 664 *  RETURNS       : void
 665 *
 666 *  FUNCTION      : 3 to 5 up-scaling of a 3-pixel high band of pixels.
 667 *
 668 *  SPECIAL NOTES : The routine uses the first line of the band below
 669 *                  the current band. The function also has a "C" only
 670 *                  version.
     *                  On entry rows 0..2 hold source lines a, b, c and row 5
     *                  holds an, the first line of the next band.  Rows 3 and
     *                  4 are written without being read.  Per column:
     *                      row 1 = (102*a + 154*b  + 128) >> 8
     *                      row 2 = (205*b +  51*c  + 128) >> 8
     *                      row 3 = ( 51*b + 205*c  + 128) >> 8
     *                      row 4 = (154*c + 102*an + 128) >> 8
     *                  The loop body runs before the counter test and the
     *                  counter drops by 8 per pass, so the width is in effect
     *                  rounded up to a multiple of 8 (at least 8 columns).
     *                  No emms is issued by this routine.
 671 *
 672 ****************************************************************************/
 673 static
 674 void vertical_band_3_5_scale_mmx
 675 (
 676     unsigned char *dest,
 677     unsigned int dest_pitch,
 678     unsigned int dest_width
 679 )
 680 {
 681     __asm
 682     {
 683         mov         rsi,    dest                    // rsi = row 0 (source and destination share the buffer)
 684         mov         ecx,    dest_pitch               // Get the pitch size
 685
 686         lea         rdi,    [rsi+rcx*2]             // two lines below
 687         add         rdi,    rcx                     // three lines below: rdi = row 3
 688
 689         pxor        mm7,    mm7                     // clear out mm7
 690         mov         edx,    dest_width               // Loop counter
 691
 692         vs_3_5_loop:
 693
 694         movq        mm0,    QWORD ptr [rsi]         // src[0];
 695         movq        mm1,    QWORD ptr [rsi+rcx]     // src[1];
 696
 697         movq        mm2,    mm0                     // Make a copy
 698         punpcklbw   mm0,    mm7                     // unpack low to word
 699
 700         movq        mm5,    two_fifths               // mm5 = 2/5
 701         punpckhbw   mm2,    mm7                     // unpack high to word
 702
 703         pmullw      mm0,    mm5                     // a * 2/5
 704
 705         movq        mm3,    mm1                     // make a copy
 706         punpcklbw   mm1,    mm7                     // unpack low to word
 707
 708         pmullw      mm2,    mm5                     // a * 2/5
 709         movq        mm6,    three_fifths             // mm6 = 3/5
 710
 711         movq        mm4,    mm1                     // copy of low b
 712         pmullw      mm4,    mm6                     // b * 3/5
 713
 714         punpckhbw   mm3,    mm7                     // unpack high to word
 715         movq        mm5,    mm3                     // copy of high b
 716
 717         pmullw      mm5,    mm6                     // b * 3/5
 718         paddw       mm0,    mm4                     // a * 2/5 + b * 3/5
 719
 720         paddw       mm2,    mm5                     // a * 2/5 + b * 3/5
 721         paddw       mm0,    round_values             // + 128
 722
 723         paddw       mm2,    round_values             // + 128
 724         psrlw       mm0,    8                       // >> 8
 725
 726         psrlw       mm2,    8                       // >> 8
 727         packuswb    mm0,    mm2                     // des [1]
 728
 729         movq        QWORD ptr [rsi+rcx], mm0        // write des[1]; unpacked b is still held in mm1/mm3
 730         movq        mm0,    [rsi+rcx*2]             // mm0 = src[2]
 731
 732         // mm1, mm3 --- Src[1]
 733         // mm0 --- Src[2]
 734         // mm7 for unpacking
 735
 736         movq        mm4,    mm1                     // b low
 737         pmullw      mm1,    four_fifths              // b * 4/5 low
 738
 739         movq        mm5,    mm3                     // b high
 740         pmullw      mm3,    four_fifths              // b * 4/5 high
 741
 742         movq        mm2,    mm0                     // c
 743         pmullw      mm4,    one_fifth                // b * 1/5
 744
 745         punpcklbw   mm0,    mm7                     // c low
 746         pmullw      mm5,    one_fifth                // b * 1/5
 747
 748         movq        mm6,    mm0                     // make copy of c low
 749         punpckhbw   mm2,    mm7                     // c high
 750
 751         pmullw      mm6,    one_fifth                // c * 1/5 low
 752         movq        mm7,    mm2                     // make copy of c high (mm7 is borrowed as scratch until re-cleared below)
 753
 754         pmullw      mm7,    one_fifth                // c * 1/5 high
 755         paddw       mm1,    mm6                     // b * 4/5 + c * 1/5 low
 756
 757         paddw       mm3,    mm7                     // b * 4/5 + c * 1/5 high
 758         movq        mm6,    mm0                     // make copy of c low
 759
 760         pmullw      mm6,    four_fifths              // c * 4/5 low
 761         movq        mm7,    mm2                     // make copy of c high
 762
 763         pmullw      mm7,    four_fifths              // c * 4/5 high
 764
 765         paddw       mm4,    mm6                     // b * 1/5 + c * 4/5 low
 766         paddw       mm5,    mm7                     // b * 1/5 + c * 4/5 high
 767
 768         paddw       mm1,    round_values             // + 128
 769         paddw       mm3,    round_values             // + 128
 770
 771         psrlw       mm1,    8                       // >> 8
 772         psrlw       mm3,    8                       // >> 8
 773
 774         packuswb    mm1,    mm3                     // des[2]
 775         movq        QWORD ptr [rsi+rcx*2], mm1      // write des[2]
 776
 777         paddw       mm4,    round_values             // + 128
 778         paddw       mm5,    round_values             // + 128
 779
 780         psrlw       mm4,    8                       // >> 8
 781         psrlw       mm5,    8                       // >> 8
 782
 783         packuswb    mm4,    mm5                     // des[3]
 784         movq        QWORD ptr [rdi], mm4            // write des[3]
 785
 786         //  mm0, mm2 --- Src[2] (c), unpacked low / high
 787
 788         pxor        mm7,    mm7                     // clear mm7 for unpacking
 789         movq        mm1,    [rdi+rcx*2]             // mm1 = Src[0] of the next group (row 5)
 790
 791         movq        mm5,    three_fifths             // mm5 = 3/5
 792         pmullw      mm0,    mm5                     // c * 3/5
 793
 794         movq        mm6,    two_fifths                // mm6 = 2/5
 795         movq        mm3,    mm1                     // make a copy
 796
 797         pmullw      mm2,    mm5                     // c * 3/5
 798         punpcklbw   mm1,    mm7                     // unpack low
 799
 800         pmullw      mm1,    mm6                     // an * 2/5
 801         punpckhbw   mm3,    mm7                     // unpack high
 802
 803         paddw       mm0,    mm1                     // c * 3/5 + an * 2/5
 804         pmullw      mm3,    mm6                     // an * 2/5
 805
 806         paddw       mm2,    mm3                     // c * 3/5 + an * 2/5
 807         paddw       mm0,    round_values             // + 128
 808
 809         paddw       mm2,    round_values             // + 128
 810         psrlw       mm0,    8                       // >> 8
 811
 812         psrlw       mm2,    8                       // >> 8
 813         packuswb    mm0,    mm2                     // des[4]
 814
 815         movq        QWORD ptr [rdi+rcx], mm0        // write des[4] (row 4)
 816
 817         add         rdi,    8                       // next 8 columns
 818         add         rsi,    8
 819
 820         sub         rdx,    8                       // 8 columns done
 821         jg          vs_3_5_loop                     // repeat while columns remain
 822     }
 823 }
 824
 825 /****************************************************************************
 826 *
 827 *  ROUTINE       : last_vertical_band_3_5_scale_mmx
 828 *
 829 *  INPUTS        : unsigned char *dest    : First row of the band; scaled in place.
 830 *                  unsigned int dest_pitch : Byte offset between consecutive rows.
 831 *                  unsigned int dest_width : Bytes per row; processed 8 at a time.
 832 *
 833 *  OUTPUTS       : Rows 1 to 4 of the band are overwritten; row 0 is kept.
 834 *
 835 *  RETURNS       : void
 836 *
 837 *  FUNCTION      : 3 to 5 up-scaling of a 3-pixel high band of pixels.
 838 *
 839 *  SPECIAL NOTES : Unlike vertical_band_3_5_scale_mmx, no line of the band
 840 *                  below is read: output row 4 is a plain copy of source
 841 *                  row 2. The function also has a "C" only version.
 842 *
 843 ****************************************************************************/
 844 static
 845 void last_vertical_band_3_5_scale_mmx
 846 (
 847     unsigned char *dest,
 848     unsigned int dest_pitch,
 849     unsigned int dest_width
 850 )
 851 {
 852     __asm
 853     {
 854         mov         rsi,    dest                    // Get the source and destination pointer
 855         mov         ecx,    dest_pitch               // Get the pitch size
 856
 857         lea         rdi,    [rsi+rcx*2]             // two lines below
 858         add         rdi,    rcx                     // three lines below
 859
 860         pxor        mm7,    mm7                     // clear out mm7
 861         mov         edx,    dest_width               // Loop counter
 862
 863
 864         last_vs_3_5_loop:
 865
 866         movq        mm0,    QWORD ptr [rsi]         // src[0];
 867         movq        mm1,    QWORD ptr [rsi+rcx]     // src[1];
 868
 869         movq        mm2,    mm0                     // Make a copy
 870         punpcklbw   mm0,    mm7                     // unpack low to word
 871
 872         movq        mm5,    two_fifths               // mm5 = 2/5
 873         punpckhbw   mm2,    mm7                     // unpack high to word
 874
 875         pmullw      mm0,    mm5                     // a * 2/5
 876
 877         movq        mm3,    mm1                     // make a copy
 878         punpcklbw   mm1,    mm7                     // unpack low to word
 879
 880         pmullw      mm2,    mm5                     // a * 2/5
 881         movq        mm6,    three_fifths             // mm6 = 3/5
 882
 883         movq        mm4,    mm1                     // copy of low b
 884         pmullw      mm4,    mm6                     // b * 3/5
 885
 886         punpckhbw   mm3,    mm7                     // unpack high to word
 887         movq        mm5,    mm3                     // copy of high b
 888
 889         pmullw      mm5,    mm6                     // b * 3/5
 890         paddw       mm0,    mm4                     // a * 2/5 + b * 3/5
 891
 892         paddw       mm2,    mm5                     // a * 2/5 + b * 3/5
 893         paddw       mm0,    round_values             // + 128
 894
 895         paddw       mm2,    round_values             // + 128
 896         psrlw       mm0,    8                       // >> 8, low half of des[1]
 897
 898         psrlw       mm2,    8                       // >> 8, high half of des[1]
 899         packuswb    mm0,    mm2                     // des [1]
 900
 901         movq        QWORD ptr [rsi+rcx], mm0        // write des[1]
 902         movq        mm0,    [rsi+rcx*2]             // mm0 = src[2]
 903
 904
 905
 906         // mm1, mm3 --- Src[1] (already unpacked to words)
 907         // mm0 --- Src[2]
 908         // mm7 for unpacking
 909
 910         movq        mm4,    mm1                     // b low
 911         pmullw      mm1,    four_fifths              // b * 4/5 low
 912
 913         movq        QWORD ptr [rdi+rcx], mm0        // write des[4] = unfiltered copy of src[2]
 914
 915         movq        mm5,    mm3                     // b high
 916         pmullw      mm3,    four_fifths              // b * 4/5 high
 917
 918         movq        mm2,    mm0                     // c
 919         pmullw      mm4,    one_fifth                // b * 1/5
 920
 921         punpcklbw   mm0,    mm7                     // c low
 922         pmullw      mm5,    one_fifth                // b * 1/5
 923
 924         movq        mm6,    mm0                     // make copy of c low
 925         punpckhbw   mm2,    mm7                     // c high
 926
 927         pmullw      mm6,    one_fifth                // c * 1/5 low
 928         movq        mm7,    mm2                     // make copy of c high (mm7 is no longer zero)
 929
 930         pmullw      mm7,    one_fifth                // c * 1/5 high
 931         paddw       mm1,    mm6                     // b * 4/5 + c * 1/5 low
 932
 933         paddw       mm3,    mm7                     // b * 4/5 + c * 1/5 high
 934         movq        mm6,    mm0                     // make copy of c low
 935
 936         pmullw      mm6,    four_fifths              // c * 4/5 low
 937         movq        mm7,    mm2                     // make copy of c high
 938
 939         pmullw      mm7,    four_fifths              // c * 4/5 high
 940
 941         paddw       mm4,    mm6                     // b * 1/5 + c * 4/5 low
 942         paddw       mm5,    mm7                     // b * 1/5 + c * 4/5 high
 943
 944         paddw       mm1,    round_values             // + 128
 945         paddw       mm3,    round_values             // + 128
 946
 947         psrlw       mm1,    8
 948         psrlw       mm3,    8
 949
 950         packuswb    mm1,    mm3                     // des[2]
 951         movq        QWORD ptr [rsi+rcx*2], mm1      // write des[2]
 952
 953         paddw       mm4,    round_values             // + 128
 954         paddw       mm5,    round_values             // + 128
 955
 956         psrlw       mm4,    8
 957         psrlw       mm5,    8
 958
 959         packuswb    mm4,    mm5                     // des[3]
 960         movq        QWORD ptr [rdi], mm4            // write des[3]
 961
 962         // mm7 was used as scratch above; the unpacks at the top of the loop need it zero
 963         pxor        mm7,    mm7                     // clear mm7 for unpacking
 964         add         rdi,    8
 965         add         rsi,    8
 966
 967         sub         rdx,    8
 968         jg          last_vs_3_5_loop
 969     }
 970 }
 971
 972 /****************************************************************************
 973 *
 974 *  ROUTINE       : vertical_band_1_2_scale_mmx
 975 *
 976 *  INPUTS        : unsigned char *dest    : Row above the one to be generated.
 977 *                  unsigned int dest_pitch : Byte offset between consecutive rows.
 978 *                  unsigned int dest_width : Bytes per row; processed 8 at a time.
 979 *
 980 *  OUTPUTS       : The row at dest + dest_pitch is overwritten.
 981 *
 982 *  RETURNS       : void
 983 *
 984 *  FUNCTION      : 1 to 2 up-scaling of a band of pixels.
 985 *
 986 *  SPECIAL NOTES : The row at dest + dest_pitch is set to (a + b + 1) >> 1 of
 987 *                  the rows at dest and dest + 2 * dest_pitch, the latter being
 988 *                  the first line of the band below. A "C" only version exists.
 989 *
 990 ****************************************************************************/
 991 static
 992 void vertical_band_1_2_scale_mmx
 993 (
 994     unsigned char *dest,
 995     unsigned int dest_pitch,
 996     unsigned int dest_width
 997 )
 998 {
 999     __asm
1000     {
1001
1002         mov         rsi,    dest                    // Get the source and destination pointer
1003         mov         ecx,    dest_pitch               // Get the pitch size
1004
1005         pxor        mm7,    mm7                     // clear out mm7
1006         mov         edx,    dest_width               // Loop counter
1007
1008         vs_1_2_loop:
1009
1010         movq        mm0,    [rsi]                   // get Src[0]
1011         movq        mm1,    [rsi + rcx * 2]         // get Src[1] (two rows down)
1012
1013         movq        mm2,    mm0                     // make copy before unpack
1014         movq        mm3,    mm1                     // make copy before unpack
1015
1016         punpcklbw   mm0,    mm7                     // low Src[0]
1017         movq        mm6,    four_ones                // mm6= 1, 1, 1, 1
1018
1019         punpcklbw   mm1,    mm7                     // low Src[1]
1020         paddw       mm0,    mm1                     // low (a + b)
1021
1022         punpckhbw   mm2,    mm7                     // high Src[0]
1023         paddw       mm0,    mm6                     // low (a + b + 1)
1024
1025         punpckhbw   mm3,    mm7                     // high Src[1]
1026         paddw       mm2,    mm3                     // high (a + b )
1027
1028         psraw       mm0,    1                       // low (a + b +1 )/2
1029         paddw       mm2,    mm6                     // high (a + b + 1)
1030
1031         psraw       mm2,    1                       // high (a + b + 1)/2
1032         packuswb    mm0,    mm2                     // pack results
1033
1034         movq        [rsi+rcx], mm0                  // write the averaged eight bytes one row down
1035         add         rsi,    8
1036
1037         sub         rdx,    8                       // eight columns done
1038         jg          vs_1_2_loop
1039     }
1040
1041 }
1042
1043 /****************************************************************************
1044 *
1045 *  ROUTINE       : last_vertical_band_1_2_scale_mmx
1046 *
1047 *  INPUTS        : unsigned char *dest    : Row to be duplicated.
1048 *                  unsigned int dest_pitch : Byte offset between consecutive rows.
1049 *                  unsigned int dest_width : Bytes per row; processed 8 at a time.
1050 *
1051 *  OUTPUTS       : The row at dest + dest_pitch is overwritten.
1052 *
1053 *  RETURNS       : void
1054 *
1055 *  FUNCTION      : 1 to 2 up-scaling of band of pixels by row duplication.
1056 *
1057 *  SPECIAL NOTES : No line below the band is read: the row at dest is simply
1058 *                  copied to dest + dest_pitch. The function also has an "C"
1059 *                  only version.
1060 *
1061 ****************************************************************************/
1062 static
1063 void last_vertical_band_1_2_scale_mmx
1064 (
1065     unsigned char *dest,
1066     unsigned int dest_pitch,
1067     unsigned int dest_width
1068 )
1069 {
1070     __asm
1071     {
1072         mov         rsi,    dest                    // Get the source and destination pointer
1073         mov         ecx,    dest_pitch               // Get the pitch size
1074
1075         mov         edx,    dest_width               // Loop counter
1076
1077         last_vs_1_2_loop:
1078
1079         movq        mm0,    [rsi]                   // get Src[0]
1080         movq        [rsi+rcx], mm0                  // write the same eight bytes one row down
1081
1082         add         rsi,    8
1083         sub         rdx,    8                       // eight columns done
1084
1085         jg          last_vs_1_2_loop
1086     }
1087 }
1088
1089 /****************************************************************************
1090 *
1091 *  ROUTINE       : horizontal_line_1_2_scale_mmx
1092 *
1093 *  INPUTS        : const unsigned char *source : Line of pixels to be scaled.
1094 *                  unsigned int source_width    : Number of source pixels.
1095 *                  unsigned char *dest         : Receives two bytes per source pixel.
1096 *                  unsigned int dest_width      : Not used.
1097 *
1098 *  OUTPUTS       : None.
1099 *
1100 *  RETURNS       : void
1101 *
1102 *  FUNCTION      : 1 to 2 up-scaling of a horizontal line of pixels.
1103 *
1104 *  SPECIAL NOTES : Output alternates each source pixel with (a + b + 1) >> 1 of
1105 *                  it and its right neighbour; the last pixel pairs with itself.
1106 ****************************************************************************/
1107 static
1108 void horizontal_line_1_2_scale_mmx
1109 (
1110     const unsigned char *source,
1111     unsigned int source_width,
1112     unsigned char *dest,
1113     unsigned int dest_width
1114 )
1115 {
1116     (void) dest_width;
1117
1118     __asm
1119     {
1120         mov         rsi,    source                  // read pointer
1121         mov         rdi,    dest                    // write pointer
1122
1123         pxor        mm7,    mm7                     // zero, for unpacking bytes to words
1124         movq        mm6,    four_ones               // rounding term for the average
1125
1126         mov         ecx,    source_width            // source pixels still to do
1127
1128         hs_1_2_loop:
1129
1130         movq        mm0,    [rsi]                   // eight pixels
1131         movq        mm1,    [rsi+1]                 // their right-hand neighbours
1132
1133         movq        mm2,    mm0                     // copy for the high half
1134         movq        mm3,    mm1                     // copy for the high half
1135
1136         movq        mm4,    mm0                     // keep the original pixels for interleaving
1137         punpcklbw   mm0,    mm7                     // low pixels as words
1138
1139         punpcklbw   mm1,    mm7                     // low neighbours as words
1140         paddw       mm0,    mm1                     // low (a + b)
1141
1142         paddw       mm0,    mm6                     // low (a + b + 1)
1143         punpckhbw   mm2,    mm7                     // high pixels as words
1144
1145         punpckhbw   mm3,    mm7                     // high neighbours as words
1146         paddw       mm2,    mm3                     // high (a + b)
1147
1148         paddw       mm2,    mm6                     // high (a + b + 1)
1149         psraw       mm0,    1                       // low (a + b + 1)/2
1150
1151         psraw       mm2,    1                       // high (a + b + 1)/2
1152         packuswb    mm0,    mm2                     // eight averaged bytes
1153
1154         movq        mm2,    mm4                     // original pixels
1155         punpcklbw   mm2,    mm0                     // interleave pixels 0-3 with their averages
1156
1157         movq        [rdi],  mm2                     // first eight output bytes
1158         punpckhbw   mm4,    mm0                     // interleave pixels 4-7 with their averages
1159
1160         movq        [rdi+8], mm4                    // second eight output bytes
1161         add         rsi,    8
1162
1163         add         rdi,    16
1164         sub         rcx,    8
1165
1166         cmp         rcx,    8                       // loop while more than eight pixels remain
1167         jg          hs_1_2_loop
1168
1169 // last eight pixel: [rsi+1] is not loaded here; the neighbours are built from mm0 instead
1170
1171         movq        mm0,    [rsi]                   // final eight pixels
1172         movq        mm1,    mm0
1173
1174         movq        mm2,    mm0                     // copy for the high half
1175         movq        mm3,    mm1
1176
1177         psrlq       mm1,    8                       // drop byte 0: bytes 1-7 move down one place
1178         psrlq       mm3,    56                      // isolate byte 7 ...
1179
1180         psllq       mm3,    56                      // ... and put it back in the top byte
1181         por         mm1,    mm3                     // neighbours = bytes 1-7 followed by byte 7 again
1182
1183         movq        mm3,    mm1                     // copy for the high half
1184         movq        mm4,    mm0                     // keep the original pixels for interleaving
1185
1186         punpcklbw   mm0,    mm7                     // low pixels as words
1187         punpcklbw   mm1,    mm7                     // low neighbours as words
1188
1189         paddw       mm0,    mm1                     // low (a + b)
1190         paddw       mm0,    mm6                     // low (a + b + 1)
1191
1192         punpckhbw   mm2,    mm7                     // high pixels as words
1193         punpckhbw   mm3,    mm7                     // high neighbours as words
1194
1195         paddw       mm2,    mm3                     // high (a + b)
1196         paddw       mm2,    mm6                     // high (a + b + 1)
1197
1198         psraw       mm0,    1                       // low (a + b + 1)/2
1199         psraw       mm2,    1                       // high (a + b + 1)/2
1200
1201         packuswb    mm0,    mm2                     // eight averaged bytes
1202         movq        mm2,    mm4                     // original pixels
1203
1204         punpcklbw   mm2,    mm0                     // interleave pixels 0-3 with their averages
1205         movq        [rdi],  mm2
1206
1207         punpckhbw   mm4,    mm0                     // interleave pixels 4-7 with their averages
1208         movq        [rdi+8], mm4
1209     }
1210 }
1211
1212
1213
1214 // Per-word weights (out of 256) used by horizontal_line_5_4_scale_mmx:
1215 // const54_1 multiplies src[0..3] and const54_2 multiplies src[1..4]; each lane pair sums to 256.
1216 __declspec(align(16)) const static unsigned short const54_2[] = {  0,  64, 128, 192 };
1217 __declspec(align(16)) const static unsigned short const54_1[] = {256, 192, 128,  64 };
1218
1219
1220 /****************************************************************************
1221 *
1222 *  ROUTINE       : horizontal_line_5_4_scale_mmx
1223 *
1224 *  INPUTS        : const unsigned char *source : Pointer to source data.
1225 *                  unsigned int source_width    : Stride of source.
1226 *                  unsigned char *dest         : Pointer to destination data.
1227 *                  unsigned int dest_width      : Stride of destination (NOT USED).
1228 *
1229 *  OUTPUTS       : None.
1230 *
1231 *  RETURNS       : void
1232 *
1233 *  FUNCTION      : Copies horizontal line of pixels from source to
1234 *                  destination scaling up by 4 to 5.
1235 *
1236 *  SPECIAL NOTES : None.
1237 *
1238 ****************************************************************************/
1239 static
1240 void horizontal_line_5_4_scale_mmx
1241 (
1242     const unsigned char *source,
1243     unsigned int source_width,
1244     unsigned char *dest,
1245     unsigned int dest_width
1246 )
1247 {
1248     /*
1249     unsigned i;
1250     unsigned int a, b, c, d, e;
1251     unsigned char *des = dest;
1252     const unsigned char *src = source;
1253
1254     (void) dest_width;
1255
1256     for ( i=0; i<source_width; i+=5 )
1257     {
1258         a = src[0];
1259         b = src[1];
1260         c = src[2];
1261         d = src[3];
1262         e = src[4];
1263
1264         des[0] = a;
1265         des[1] = ((b*192 + c* 64 + 128)>>8);
1266         des[2] = ((c*128 + d*128 + 128)>>8);
1267         des[3] = ((d* 64 + e*192 + 128)>>8);
1268
1269         src += 5;
1270         des += 4;
1271     }
1272     */
1273     __asm
1274     {
1275
1276         mov         rsi,        source              ;
1277         mov         rdi,        dest                ;
1278
1279         mov         ecx,        source_width         ;
1280         movq        mm5,        const54_1           ;
1281
1282         pxor        mm7,        mm7                 ;
1283         movq        mm6,        const54_2           ;
1284
1285         movq        mm4,        round_values         ;
1286         lea         rdx,        [rsi+rcx]           ;
1287         horizontal_line_5_4_loop:
1288
1289         movq        mm0,        QWORD PTR  [rsi]    ;
1290         00 01 02 03 04 05 06 07
1291         movq        mm1,        mm0                 ;
1292         00 01 02 03 04 05 06 07
1293
1294         psrlq       mm0,        8                   ;
1295         01 02 03 04 05 06 07 xx
1296         punpcklbw   mm1,        mm7                 ;
1297         xx 00 xx 01 xx 02 xx 03
1298
1299         punpcklbw   mm0,        mm7                 ;
1300         xx 01 xx 02 xx 03 xx 04
1301         pmullw      mm1,        mm5
1302
1303         pmullw      mm0,        mm6
1304         add         rsi,        5
1305
1306         add         rdi,        4
1307         paddw       mm1,        mm0
1308
1309         paddw       mm1,        mm4
1310         psrlw       mm1,        8
1311
1312         cmp         rsi,        rdx
1313         packuswb    mm1,        mm7
1314
1315         movd        DWORD PTR [rdi-4], mm1
1316
1317         jl          horizontal_line_5_4_loop
1318
1319     }
1320
1321 }
1322 __declspec(align(16)) const static unsigned short one_fourths[]   = {  64,  64,  64, 64  };  // 1/4 as n/256
1323 __declspec(align(16)) const static unsigned short two_fourths[]   = { 128, 128, 128, 128 };  // 2/4 as n/256
1324 __declspec(align(16)) const static unsigned short three_fourths[] = { 192, 192, 192, 192 };  // 3/4 as n/256
1325 // Blends five source rows into four destination rows, four pixels per loop pass.
1326 static
1327 void vertical_band_5_4_scale_mmx
1328 (
1329     unsigned char *source,
1330     unsigned int src_pitch,
1331     unsigned char *dest,
1332     unsigned int dest_pitch,
1333     unsigned int dest_width
1334 )
1335 {
1336     // dest row 0 = src row 0; dest rows 1-3 mix src rows (1,2), (2,3), (3,4) as 3/4:1/4, 2/4:2/4, 1/4:3/4
1337     __asm
1338     {
1339
1340         mov         rsi,    source                    // Get the source pointer
1341         mov         ecx,    src_pitch               // Get the source pitch
1342
1343         mov         rdi,    dest                    // Get the destination pointer
1344         pxor        mm7,    mm7                     // clear out mm7
1345
1346         mov         edx,    dest_pitch               // Get the destination pitch
1347         mov         ebx,    dest_width              // Loop counter
1348
1349         vs_5_4_loop:
1350
1351         movd        mm0,    DWORD ptr [rsi]         // src[0];
1352         movd        mm1,    DWORD ptr [rsi+rcx]     // src[1];
1353
1354         movd        mm2,    DWORD ptr [rsi+rcx*2]   // src[2];
1355         lea         rax,    [rsi+rcx*2]             // rax = address of src[2]
1356
1357         punpcklbw   mm1,    mm7                     // src[1] as words
1358         punpcklbw   mm2,    mm7                     // src[2] as words
1359
1360         movq        mm3,    mm2                     // second copy of src[2]
1361         pmullw      mm1,    three_fourths           // src[1] * 3/4
1362
1363         pmullw      mm2,    one_fourths             // src[2] * 1/4
1364         movd        mm4,    [rax+rcx]               // src[3]
1365
1366         pmullw      mm3,    two_fourths             // src[2] * 2/4
1367         punpcklbw   mm4,    mm7                     // src[3] as words
1368
1369         movq        mm5,    mm4                     // second copy of src[3]
1370         pmullw      mm4,    two_fourths             // src[3] * 2/4
1371
1372         paddw       mm1,    mm2                     // src[1] * 3/4 + src[2] * 1/4
1373         movd        mm6,    [rax+rcx*2]             // src[4]
1374
1375         pmullw      mm5,    one_fourths             // src[3] * 1/4
1376         paddw       mm1,    round_values;           // + 128
1377
1378         paddw       mm3,    mm4                     // src[2] * 2/4 + src[3] * 2/4
1379         psrlw       mm1,    8                       // des[1] as words
1380
1381         punpcklbw   mm6,    mm7                     // src[4] as words
1382         paddw       mm3,    round_values            // + 128
1383
1384         pmullw      mm6,    three_fourths           // src[4] * 3/4
1385         psrlw       mm3,    8                       // des[2] as words
1386
1387         packuswb    mm1,    mm7                     // des[1]
1388         packuswb    mm3,    mm7                     // des[2]
1389
1390         movd        DWORD PTR [rdi], mm0            // write des[0] = src[0] unchanged
1391         movd        DWORD PTR [rdi+rdx], mm1        // write des[1]
1392
1393
1394         paddw       mm5,    mm6                     // src[3] * 1/4 + src[4] * 3/4
1395         movd        DWORD PTR [rdi+rdx*2], mm3      // write des[2]
1396
1397         lea         rax,    [rdi+rdx*2]             // rax = address of des[2], taken before rdi advances
1398         paddw       mm5,    round_values            // + 128
1399
1400         psrlw       mm5,    8                       // des[3] as words
1401         add         rdi,    4
1402
1403         packuswb    mm5,    mm7                     // des[3]
1404         movd        DWORD PTR [rax+rdx], mm5        // write des[3]
1405
1406         add         rsi,    4
1407         sub         rbx,    4                       // four columns done
1408
1409         jg         vs_5_4_loop
1410     }
1411 }
1412 // Per-word weights (out of 256) used by horizontal_line_5_3_scale_mmx:
1413 // const53_2 multiplies src[0], src[2], src[4]; const53_1 multiplies 0, src[1], src[3]. The fourth lane is unused.
1414 __declspec(align(16)) const static unsigned short const53_1[] = {  0,  85, 171, 0 };
1415 __declspec(align(16)) const static unsigned short const53_2[] = {256, 171,  85, 0 };
1416 // Scales a horizontal line down by 5 to 3: des[0] = src[0], des[1] = (85*src[1] + 171*src[2] + 128) >> 8,
1417 // des[2] = (171*src[3] + 85*src[4] + 128) >> 8. source_width is in source bytes; dest_width is not referenced.
1418 static
1419 void horizontal_line_5_3_scale_mmx
1420 (
1421     const unsigned char *source,
1422     unsigned int source_width,
1423     unsigned char *dest,
1424     unsigned int dest_width
1425 )
1426 {
1427     __asm
1428     {
1429
1430         mov         rsi,        source              ; source read pointer
1431         mov         rdi,        dest                ; destination write pointer
1432
1433         mov         ecx,        source_width         ; number of source bytes
1434         movq        mm5,        const53_1           ; weights for the odd source bytes
1435
1436         pxor        mm7,        mm7                 ; zero, upper half when packing
1437         movq        mm6,        const53_2           ; weights for the even source bytes
1438
1439         movq        mm4,        round_values         ; rounding term, 128 per word
1440         lea         rdx,        [rsi+rcx-5]         ; start of the final group of five
1441         horizontal_line_5_3_loop:
1442
1443         movq        mm0,        QWORD PTR  [rsi]    ;
1444         // 00 01 02 03 04 05 06 07
1445         movq        mm1,        mm0                 ;
1446         // 00 01 02 03 04 05 06 07
1447
1448         psllw       mm0,        8                   ;
1449         // xx 00 xx 02 xx 04 xx 06
1450         psrlw       mm1,        8                   ;
1451         // 01 xx 03 xx 05 xx 07 xx
1452
1453         psrlw       mm0,        8                   ;
1454         // 00 xx 02 xx 04 xx 06 xx
1455         psllq       mm1,        16                  ;
1456         // xx xx 01 xx 03 xx 05 xx
1457
1458         pmullw      mm0,        mm6                 // even bytes * const53_2
1459
1460         pmullw      mm1,        mm5                 // odd bytes * const53_1
1461         add         rsi,        5                   // five source pixels consumed
1462
1463         add         rdi,        3                   // three destination pixels produced
1464         paddw       mm1,        mm0                 // weighted sums
1465
1466         paddw       mm1,        mm4                 // + 128
1467         psrlw       mm1,        8                   // >> 8
1468
1469         cmp         rsi,        rdx                 // flags consumed by the jl below
1470         packuswb    mm1,        mm7                 // three result bytes plus one zero byte
1471
1472         movd        DWORD PTR [rdi-3], mm1          // stores 4 bytes; the 4th is rewritten by the next group
1473         jl          horizontal_line_5_3_loop
1474
1475 //exit condition: final group, stored as exactly three bytes
1476         movq        mm0,        QWORD PTR  [rsi]    ;
1477         // 00 01 02 03 04 05 06 07
1478         movq        mm1,        mm0                 ;
1479         // 00 01 02 03 04 05 06 07
1480
1481         psllw       mm0,        8                   ;
1482         // xx 00 xx 02 xx 04 xx 06
1483         psrlw       mm1,        8                   ;
1484         // 01 xx 03 xx 05 xx 07 xx
1485
1486         psrlw       mm0,        8                   ;
1487         // 00 xx 02 xx 04 xx 06 xx
1488         psllq       mm1,        16                  ;
1489         // xx xx 01 xx 03 xx 05 xx
1490
1491         pmullw      mm0,        mm6                 // even bytes * const53_2
1492
1493         pmullw      mm1,        mm5                 // odd bytes * const53_1
1494         paddw       mm1,        mm0                 // weighted sums
1495
1496         paddw       mm1,        mm4                 // + 128
1497         psrlw       mm1,        8                   // >> 8
1498
1499         packuswb    mm1,        mm7                 // three result bytes plus one zero byte
1500         movd        rax,        mm1                 // results to a general register
1501
1502         mov         rdx,        rax
1503         shr         rdx,        16                  // dl = third result byte
1504
1505         mov         WORD PTR[rdi],   ax             // first two result bytes
1506         mov         BYTE PTR[rdi+2], dl             // third result byte
1507
1508     }
1509
1510 }
1511 // Row weights used by vertical_band_5_3_scale_mmx, expressed as n/256 (85 + 171 = 256).
1512 __declspec(align(16)) const static unsigned short one_thirds[] = {  85,  85,  85,  85 };
1513 __declspec(align(16)) const static unsigned short two_thirds[] = { 171, 171, 171, 171 };
1514 // Blends five source rows into three destination rows, four pixels per loop pass.
1515 static
1516 void vertical_band_5_3_scale_mmx
1517 (
1518     unsigned char *source,
1519     unsigned int src_pitch,
1520     unsigned char *dest,
1521     unsigned int dest_pitch,
1522     unsigned int dest_width
1523 )
1524 {
1525     // dest row 0 = src row 0; dest row 1 = 1/3 src row 1 + 2/3 src row 2; dest row 2 = 2/3 src row 3 + 1/3 src row 4
1526     __asm
1527     {
1528
1529         mov         rsi,    source                    // Get the source pointer
1530         mov         ecx,    src_pitch               // Get the source pitch
1531
1532         mov         rdi,    dest                    // Get the destination pointer
1533         pxor        mm7,    mm7                     // clear out mm7
1534
1535         mov         edx,    dest_pitch               // Get the destination pitch
1536         movq        mm5,    one_thirds              // mm5 = 1/3
1537
1538         movq        mm6,    two_thirds              // mm6 = 2/3
1539         mov         ebx,    dest_width;             // Loop counter
1540
1541         vs_5_3_loop:
1542
1543         movd        mm0,    DWORD ptr [rsi]         // src[0];
1544         movd        mm1,    DWORD ptr [rsi+rcx]     // src[1];
1545
1546         movd        mm2,    DWORD ptr [rsi+rcx*2]   // src[2];
1547         lea         rax,    [rsi+rcx*2]             // rax = address of src[2]
1548
1549         punpcklbw   mm1,    mm7                     // src[1] as words
1550         punpcklbw   mm2,    mm7                     // src[2] as words
1551
1552         pmullw      mm1,    mm5                     // src[1] * 1/3
1553         pmullw      mm2,    mm6                     // src[2] * 2/3
1554
1555         movd        mm3,    DWORD ptr [rax+rcx]     // src[3];
1556         movd        mm4,    DWORD ptr [rax+rcx*2]   // src[4];
1557
1558         punpcklbw   mm3,    mm7                     // src[3] as words
1559         punpcklbw   mm4,    mm7                     // src[4] as words
1560
1561         pmullw      mm3,    mm6                     // src[3] * 2/3
1562         pmullw      mm4,    mm5                     // src[4] * 1/3
1563
1564
1565         movd        DWORD PTR [rdi], mm0            // write des[0] = src[0] unchanged
1566         paddw       mm1,    mm2                     // src[1] * 1/3 + src[2] * 2/3
1567
1568         paddw       mm1,    round_values            // + 128
1569         psrlw       mm1,    8                       // des[1] as words
1570
1571         packuswb    mm1,    mm7                     // des[1]
1572         paddw       mm3,    mm4                     // src[3] * 2/3 + src[4] * 1/3
1573
1574         paddw       mm3,    round_values            // + 128
1575         movd        DWORD PTR [rdi+rdx], mm1        // write des[1]
1576
1577         psrlw       mm3,    8                       // des[2] as words
1578         packuswb    mm3,    mm7                     // des[2]
1579
1580         movd        DWORD PTR [rdi+rdx*2], mm3      // write des[2]
1581
1582
1583         add         rdi,    4
1584         add         rsi,    4
1585
1586         sub         rbx,    4                       // four columns done
1587         jg          vs_5_3_loop
1588     }
1589 }
1590
1591
1592
1593
1594 /****************************************************************************
1595 *
1596 *  ROUTINE       : horizontal_line_2_1_scale_mmx
1597 *
1598 *  INPUTS        : const unsigned char *source : Line of pixels to be scaled.
1599 *                  unsigned int source_width    : Not referenced by the code.
1600 *                  unsigned char *dest         : Receives the scaled line.
1601 *                  unsigned int dest_width      : Output pixels wanted; 4 per pass.
1602 *
1603 *  OUTPUTS       : None.
1604 *
1605 *  RETURNS       : void
1606 *
1607 *  FUNCTION      : 2 to 1 down-scaling of a horizontal line of pixels.
1608 *
1609 *  SPECIAL NOTES : No filtering: dest[i] = source[2 * i].
1610 *
1611 ****************************************************************************/
1612 static
1613 void horizontal_line_2_1_scale_mmx
1614 (
1615     const unsigned char *source,
1616     unsigned int source_width,
1617     unsigned char *dest,
1618     unsigned int dest_width
1619 )
1620 {
1621     (void) dest_width;  // NOTE(review): dest_width is read below; source_width is the unused parameter
1622
1623     __asm
1624     {
1625         mov         rsi,    source                  // read pointer
1626         mov         rdi,    dest                    // write pointer
1627
1628         pxor        mm7,    mm7                     // zero, upper half when packing
1629         mov         ecx,    dest_width              // loop bound
1630
1631         xor         rdx,    rdx                     // output index
1632         hs_2_1_loop:
1633
1634         movq        mm0,    [rsi+rdx*2]             // eight source bytes for four outputs
1635         psllw       mm0,    8                       // shift the odd byte out of every word ...
1636
1637         psrlw       mm0,    8                       // ... leaving the even byte zero-extended
1638         packuswb    mm0,    mm7                     // four kept bytes in the low dword
1639
1640         movd        DWORD Ptr [rdi+rdx], mm0;
1641         add         rdx,    4
1642
1643         cmp         rdx,    rcx                     // loop while output index < dest_width
1644         jl          hs_2_1_loop
1645
1646     }
1647 }
1648 // Row handler installed as vp8_vertical_band_2_1_scale by register_mmxscalers:
1649 // copies dest_width bytes from source to dest with vpx_memcpy, no filtering.
1650 // src_pitch and dest_pitch are not referenced.
1651 static
1652 void vertical_band_2_1_scale_mmx
1653 (
1654     unsigned char *source,
1655     unsigned int src_pitch,
1656     unsigned char *dest,
1657     unsigned int dest_pitch,
1658     unsigned int dest_width)
1659 {
1660     vpx_memcpy(dest, source, dest_width);
1661 }
1662
1663 // Row weights used by vertical_band_2_1_scale_i_mmx, expressed as n/256 (48 + 160 + 48 = 256).
1664 __declspec(align(16)) const static unsigned short three_sixteenths[] = {  48,  48,  48,  48 };
1665 __declspec(align(16)) const static unsigned short ten_sixteenths[]   = { 160, 160, 160, 160 };
1666 // Writes one dest row as 3/16 * row above + 10/16 * row at source + 3/16 * row below; dest_pitch is not referenced.
1667 static
1668 void vertical_band_2_1_scale_i_mmx
1669 (
1670     unsigned char *source,
1671     unsigned int src_pitch,
1672     unsigned char *dest,
1673     unsigned int dest_pitch,
1674     unsigned int dest_width
1675 )
1676 {
1677     __asm
1678     {
1679         mov         rsi,        source
1680         mov         rdi,        dest
1681
1682         mov         eax,        src_pitch
1683         mov         edx,        dest_width
1684
1685         pxor        mm7,        mm7             // zero, for unpacking bytes to words
1686         sub         rsi,        rax             //back one line
1687
1688
1689         lea         rcx,        [rsi+rdx];      // end of the row above, used as loop bound
1690         movq        mm6,        round_values;   // NOTE(review): mm6 is not read again in this block
1691
1692         movq        mm5,        three_sixteenths;
1693         movq        mm4,        ten_sixteenths;
1694
1695         vs_2_1_i_loop:
1696         movd        mm0,        [rsi]           // row above
1697         movd        mm1,        [rsi+rax]       // row at source
1698
1699         movd        mm2,        [rsi+rax*2]     // row below
1700         punpcklbw   mm0,        mm7
1701
1702         pmullw      mm0,        mm5             // above * 3/16
1703         punpcklbw   mm1,        mm7
1704
1705         pmullw      mm1,        mm4             // centre * 10/16
1706         punpcklbw   mm2,        mm7
1707
1708         pmullw      mm2,        mm5             // below * 3/16
1709         paddw       mm0,        round_values    // + 128
1710
1711         paddw       mm1,        mm2
1712         paddw       mm0,        mm1             // sum of the three weighted rows + 128
1713
1714         psrlw       mm0,        8               // >> 8
1715         packuswb    mm0,        mm7             // four result bytes in the low dword
1716
1717         movd        DWORD PTR [rdi],        mm0
1718         add         rsi,        4
1719
1720         add         rdi,        4;
1721         cmp         rsi,        rcx             // loop while inside the row
1722         jl          vs_2_1_i_loop
1723
1724     }
1725 }
1726
1727 // Points every vp8_* scaler hook at its *_mmx implementation.
1728 // The assignments are independent; they are listed horizontal first, then vertical, by ratio.
1729 void
1730 register_mmxscalers(void)
1731 {
1732     vp8_horizontal_line_1_2_scale = horizontal_line_1_2_scale_mmx;
1733     vp8_horizontal_line_2_1_scale = horizontal_line_2_1_scale_mmx;
1734     vp8_horizontal_line_3_5_scale = horizontal_line_3_5_scale_mmx;
1735     vp8_horizontal_line_4_5_scale = horizontal_line_4_5_scale_mmx;
1736     vp8_horizontal_line_5_3_scale = horizontal_line_5_3_scale_mmx;
1737     vp8_horizontal_line_5_4_scale = horizontal_line_5_4_scale_mmx;
1738
1739     vp8_vertical_band_1_2_scale = vertical_band_1_2_scale_mmx;
1740     vp8_vertical_band_2_1_scale = vertical_band_2_1_scale_mmx;
1741     vp8_vertical_band_2_1_scale_i = vertical_band_2_1_scale_i_mmx;
1742     vp8_vertical_band_3_5_scale = vertical_band_3_5_scale_mmx;
1743     vp8_vertical_band_4_5_scale = vertical_band_4_5_scale_mmx;
1744     vp8_vertical_band_5_3_scale = vertical_band_5_3_scale_mmx;
1745     vp8_vertical_band_5_4_scale = vertical_band_5_4_scale_mmx;
1746     vp8_last_vertical_band_1_2_scale = last_vertical_band_1_2_scale_mmx;
1747     vp8_last_vertical_band_3_5_scale = last_vertical_band_3_5_scale_mmx;
1748     vp8_last_vertical_band_4_5_scale = last_vertical_band_4_5_scale_mmx;
1749 }