Merge "ARMv6 optimized half pixel variance calculations"
[libvpx.git] / vpx_scale / win32 / scaleopt.c
blob3711fe5eb0b7637f2f4388cf27d5232c69d8b9a5
1 /*
2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
12 /****************************************************************************
14 * Module Title : scaleopt.c
16 * Description : Optimized scaling functions
18 ****************************************************************************/
19 #include "pragmas.h"
23 /****************************************************************************
24 * Module Statics
25 ****************************************************************************/
// Blend weights scaled by 256, one 16-bit value per MMX word lane. The routines
// below multiply pixels by these, add round_values and shift right by 8.
// 51, 102, 154 and 205 are 1/5, 2/5, 3/5 and 4/5 of 256 rounded to integers.
26 __declspec(align(16)) const static unsigned short one_fifth[] = { 51, 51, 51, 51 };
27 __declspec(align(16)) const static unsigned short two_fifths[] = { 102, 102, 102, 102 };
28 __declspec(align(16)) const static unsigned short three_fifths[] = { 154, 154, 154, 154 };
29 __declspec(align(16)) const static unsigned short four_fifths[] = { 205, 205, 205, 205 };
30 __declspec(align(16)) const static unsigned short round_values[] = { 128, 128, 128, 128 }; // one half in the same scale
31 __declspec(align(16)) const static unsigned short four_ones[] = { 1, 1, 1, 1}; // +1 term of the (a + b + 1) >> 1 averages
32 __declspec(align(16)) const static unsigned short const45_2[] = {205, 154, 102, 51 }; // 4->5 horizontal: weights for pixels 1..4 of a group
33 __declspec(align(16)) const static unsigned short const45_1[] = { 51, 102, 154, 205 }; // 4->5 horizontal: weights for pixels 0..3 of a group
34 __declspec(align(16)) const static unsigned char mask45[] = { 0, 0, 0, 0, 0, 0, 255, 0}; // keeps byte 6 only; used to repeat the last pixel of a line
35 __declspec(align(16)) const static unsigned short const35_2[] = { 154, 51, 205, 102 }; // 3->5 horizontal: weights for lanes p1 p2 p2 p3
36 __declspec(align(16)) const static unsigned short const35_1[] = { 102, 205, 51, 154 }; // 3->5 horizontal: weights for lanes p0 p1 p1 p2
40 #include "vpx_scale/vpxscale.h"
41 #include "vpx_mem/vpx_mem.h"
43 /****************************************************************************
45 * ROUTINE : horizontal_line_3_5_scale_mmx
47 * INPUTS : const unsigned char *source :
48 * unsigned int source_width :
49 * unsigned char *dest :
50 * unsigned int dest_width :
52 * OUTPUTS : None.
54 * RETURNS : void
56 * FUNCTION : 3 to 5 up-scaling of a horizontal line of pixels.
58 * SPECIAL NOTES : None.
60 ****************************************************************************/
61 static
62 void horizontal_line_3_5_scale_mmx
64 const unsigned char *source,
65 unsigned int source_width,
66 unsigned char *dest,
67 unsigned int dest_width
70 (void) dest_width; // dest_width is not used by this routine
// Each group of 3 source pixels p0 p1 p2 (plus p3, the first pixel of the
// next group) produces 5 output pixels:
//   out0 = p0
//   out1 = (102*p0 + 154*p1 + 128) >> 8
//   out2 = (205*p1 +  51*p2 + 128) >> 8
//   out3 = ( 51*p1 + 205*p2 + 128) >> 8
//   out4 = (154*p2 + 102*p3 + 128) >> 8
// The loop body runs once before its end test and the code after the loop
// always handles one more group, so at least two groups are processed.
72 __asm
75 push ebx // ebx is used as scratch below; restored by the pop at the end
77 mov esi, source
78 mov edi, dest
80 mov ecx, source_width
81 lea edx, [esi+ecx-3]; // edx = source + source_width - 3, the loop end marker
83 movq mm5, const35_1 // mm5 = 66 xx cd xx 33 xx 9a xx
84 movq mm6, const35_2 // mm6 = 9a xx 33 xx cd xx 66 xx
86 movq mm4, round_values // mm4 = 80 xx 80 xx 80 xx 80 xx
87 pxor mm7, mm7 // clear mm7
89 horiz_line_3_5_loop:
// Byte diagrams below list bytes in memory order (lowest address first).
91 mov eax, DWORD PTR [esi] // eax = 00 01 02 03
92 mov ebx, eax
94 and ebx, 0xffff00 // ebx = xx 01 02 xx
95 mov ecx, eax // ecx = 00 01 02 03
97 and eax, 0xffff0000 // eax = xx xx 02 03
98 xor ecx, eax // ecx = 00 01 xx xx
100 shr ebx, 8 // ebx = 01 02 xx xx
101 or eax, ebx // eax = 01 02 02 03
103 shl ebx, 16 // ebx = xx xx 01 02
104 movd mm1, eax // mm1 = 01 02 02 03 xx xx xx xx
106 or ebx, ecx // ebx = 00 01 01 02
107 punpcklbw mm1, mm7 // mm1 = 01 xx 02 xx 02 xx 03 xx
109 movd mm0, ebx // mm0 = 00 01 01 02
110 pmullw mm1, mm6 // right-hand pixels * const35_2
112 punpcklbw mm0, mm7 // mm0 = 00 xx 01 xx 01 xx 02 xx
113 pmullw mm0, mm5 // left-hand pixels * const35_1
115 mov [edi], ebx // write out0 (= p0); the other 3 bytes are overwritten by the movd below
116 add esi, 3
118 add edi, 5
119 paddw mm0, mm1 // weighted sums
121 paddw mm0, mm4 // + 128
122 psrlw mm0, 8
124 cmp esi, edx
125 packuswb mm0, mm7
127 movd DWORD Ptr [edi-4], mm0 // write out1..out4 (edi was already advanced by 5)
128 jl horiz_line_3_5_loop // signed compare of esi against the end marker
130 // Last group: there is no p3, so p2 is used in its place.
131 mov eax, DWORD PTR [esi] // eax = 00 01 02 03
132 mov ebx, eax
134 and ebx, 0xffff00 // ebx = xx 01 02 xx
135 mov ecx, eax // ecx = 00 01 02 03
137 and eax, 0xffff0000 // eax = xx xx 02 03
138 xor ecx, eax // ecx = 00 01 xx xx
140 shr ebx, 8 // ebx = 01 02 xx xx
141 or eax, ebx // eax = 01 02 02 03
143 shl eax, 8 // eax = xx 01 02 02
144 and eax, 0xffff0000 // eax = xx xx 02 02
146 or eax, ebx // eax = 01 02 02 02
148 shl ebx, 16 // ebx = xx xx 01 02
149 movd mm1, eax // mm1 = 01 02 02 02 xx xx xx xx
151 or ebx, ecx // ebx = 00 01 01 02
152 punpcklbw mm1, mm7 // mm1 = 01 xx 02 xx 02 xx 02 xx
154 movd mm0, ebx // mm0 = 00 01 01 02
155 pmullw mm1, mm6 // right-hand pixels * const35_2
157 punpcklbw mm0, mm7 // mm0 = 00 xx 01 xx 01 xx 02 xx
158 pmullw mm0, mm5 // left-hand pixels * const35_1
160 mov [edi], ebx // write out0 (= p0); the other 3 bytes are overwritten below
161 paddw mm0, mm1 // weighted sums
163 paddw mm0, mm4 // + 128
164 psrlw mm0, 8
166 packuswb mm0, mm7
167 movd DWORD Ptr [edi+1], mm0 // write out1..out4
169 pop ebx
176 /****************************************************************************
178 * ROUTINE : horizontal_line_4_5_scale_mmx
180 * INPUTS : const unsigned char *source :
181 * unsigned int source_width :
182 * unsigned char *dest :
183 * unsigned int dest_width :
185 * OUTPUTS : None.
187 * RETURNS : void
189 * FUNCTION : 4 to 5 up-scaling of a horizontal line of pixels.
191 * SPECIAL NOTES : None.
193 ****************************************************************************/
194 static
195 void horizontal_line_4_5_scale_mmx
197 const unsigned char *source,
198 unsigned int source_width,
199 unsigned char *dest,
200 unsigned int dest_width
203 (void)dest_width; // dest_width is not used by this routine
// Handles 8 source pixels (two groups of 4) per step and writes 10 output
// pixels. For a group p0..p3 followed by p4:
//   out0 = p0
//   out1 = ( 51*p0 + 205*p1 + 128) >> 8
//   out2 = (102*p1 + 154*p2 + 128) >> 8
//   out3 = (154*p2 + 102*p3 + 128) >> 8
//   out4 = (205*p3 +  51*p4 + 128) >> 8
// The loop body runs once before its end test and the code after the loop
// always handles 8 more pixels, so at least 16 source pixels are processed.
205 __asm
208 mov esi, source
209 mov edi, dest
211 mov ecx, source_width
212 lea edx, [esi+ecx-8]; // edx = source + source_width - 8, the loop end marker
214 movq mm5, const45_1 // mm5 = 33 xx 66 xx 9a xx cd xx
215 movq mm6, const45_2 // mm6 = cd xx 9a xx 66 xx 33 xx
217 movq mm4, round_values // mm4 = 80 xx 80 xx 80 xx 80 xx
218 pxor mm7, mm7 // clear mm7
220 horiz_line_4_5_loop:
222 movq mm0, QWORD PTR [esi] // mm0 = 00 01 02 03 04 05 06 07
223 movq mm1, QWORD PTR [esi+1]; // mm1 = 01 02 03 04 05 06 07 08
225 movq mm2, mm0 // mm2 = 00 01 02 03 04 05 06 07
226 movq mm3, mm1 // mm3 = 01 02 03 04 05 06 07 08
228 movd DWORD PTR [edi], mm0 // write output 00 xx xx xx (bytes 1..3 are overwritten below)
229 punpcklbw mm0, mm7 // mm0 = 00 xx 01 xx 02 xx 03 xx
231 punpcklbw mm1, mm7 // mm1 = 01 xx 02 xx 03 xx 04 xx
232 pmullw mm0, mm5 // 00* 51 01*102 02*154 03*205
234 pmullw mm1, mm6 // 01*205 02*154 03*102 04* 51
235 punpckhbw mm2, mm7 // mm2 = 04 xx 05 xx 06 xx 07 xx
237 movd DWORD PTR [edi+5], mm2 // write output byte 5 (= source 04); bytes 6..8 are overwritten below
238 pmullw mm2, mm5 // 04* 51 05*102 06*154 07*205
240 punpckhbw mm3, mm7 // mm3 = 05 xx 06 xx 07 xx 08 xx
241 pmullw mm3, mm6 // 05*205 06*154 07*102 08* 51
243 paddw mm0, mm1 // weighted sums, low half
244 paddw mm0, mm4 // added round values
246 psrlw mm0, 8 // output: 01 xx 02 xx 03 xx 04 xx
247 packuswb mm0, mm7
249 movd DWORD PTR [edi+1], mm0 // write output 01 02 03 04
250 add edi, 10
252 add esi, 8
253 paddw mm2, mm3 // weighted sums, high half
255 paddw mm2, mm4 // added round values
256 cmp esi, edx
258 psrlw mm2, 8
259 packuswb mm2, mm7
261 movd DWORD PTR [edi-4], mm2 // write output 06 07 08 09 (edi was already advanced by 10)
262 jl horiz_line_4_5_loop // signed compare of esi against the end marker
264 // Last 8 pixels: there is no pixel 08, so pixel 07 is repeated.
265 movq mm0, [esi] // mm0 = 00 01 02 03 04 05 06 07
266 movq mm1, mm0 // mm1 = 00 01 02 03 04 05 06 07
268 movq mm2, mm0 // mm2 = 00 01 02 03 04 05 06 07
269 psrlq mm1, 8 // mm1 = 01 02 03 04 05 06 07 00
271 movq mm3, mask45 // mm3 = 00 00 00 00 00 00 ff 00
272 pand mm3, mm1 // mm3 = 00 00 00 00 00 00 07 00
274 psllq mm3, 8 // mm3 = 00 00 00 00 00 00 00 07
275 por mm1, mm3 // mm1 = 01 02 03 04 05 06 07 07
277 movq mm3, mm1
279 movd DWORD PTR [edi], mm0 // write output 00 xx xx xx (bytes 1..3 are overwritten below)
280 punpcklbw mm0, mm7 // mm0 = 00 xx 01 xx 02 xx 03 xx
282 punpcklbw mm1, mm7 // mm1 = 01 xx 02 xx 03 xx 04 xx
283 pmullw mm0, mm5 // 00* 51 01*102 02*154 03*205
285 pmullw mm1, mm6 // 01*205 02*154 03*102 04* 51
286 punpckhbw mm2, mm7 // mm2 = 04 xx 05 xx 06 xx 07 xx
288 movd DWORD PTR [edi+5], mm2 // write output byte 5 (= source 04); bytes 6..8 are overwritten below
289 pmullw mm2, mm5 // 04* 51 05*102 06*154 07*205
291 punpckhbw mm3, mm7 // mm3 = 05 xx 06 xx 07 xx 07 xx
292 pmullw mm3, mm6 // 05*205 06*154 07*102 07* 51
294 paddw mm0, mm1 // weighted sums, low half
295 paddw mm0, mm4 // added round values
297 psrlw mm0, 8 // output: 01 xx 02 xx 03 xx 04 xx
298 packuswb mm0, mm7 // 01 02 03 04 xx xx xx xx
300 movd DWORD PTR [edi+1], mm0 // write output 01 02 03 04
301 paddw mm2, mm3 // weighted sums, high half
303 paddw mm2, mm4 // added round values
304 psrlw mm2, 8
306 packuswb mm2, mm7
307 movd DWORD PTR [edi+6], mm2 // write output 06 07 08 09
313 /****************************************************************************
315 * ROUTINE : vertical_band_4_5_scale_mmx
317 * INPUTS : unsigned char *dest :
318 * unsigned int dest_pitch :
319 * unsigned int dest_width :
321 * OUTPUTS : None.
323 * RETURNS : void
325 * FUNCTION : 4 to 5 up-scaling of a 4 pixel high band of pixels.
327 * SPECIAL NOTES : The routine uses the first line of the band below
328 * the current band. The function also has a "C" only
329 * version.
331 ****************************************************************************/
332 static
333 void vertical_band_4_5_scale_mmx
335 unsigned char *dest,
336 unsigned int dest_pitch,
337 unsigned int dest_width
// In-place vertical 4->5 interpolation, 8 columns per loop iteration.
// With a, b, c, d the lines at dest + 0..3 * dest_pitch and an the line at
// dest + 5 * dest_pitch (Src[0] of the next group):
//   line 1 = ( 51*a + 205*b  + 128) >> 8
//   line 2 = (102*b + 154*c  + 128) >> 8
//   line 3 = (154*c + 102*d  + 128) >> 8
//   line 4 = (205*d +  51*an + 128) >> 8
// Line 0 is not written. Each source line is loaded before it is overwritten.
// The counter drops by 8 per pass, so a dest_width that is not a multiple of 8
// still gets a full 8-byte pass for the remainder.
340 __asm
343 mov esi, dest // Get the source and destination pointer
344 mov ecx, dest_pitch // Get the pitch size
346 lea edi, [esi+ecx*2] // two lines below
347 add edi, ecx // three lines below
349 pxor mm7, mm7 // clear out mm7
350 mov edx, dest_width // Loop counter
352 vs_4_5_loop:
354 movq mm0, QWORD ptr [esi] // src[0];
355 movq mm1, QWORD ptr [esi+ecx] // src[1];
357 movq mm2, mm0 // Make a copy
358 punpcklbw mm0, mm7 // unpack low to word
360 movq mm5, one_fifth
361 punpckhbw mm2, mm7 // unpack high to word
363 pmullw mm0, mm5 // a * 1/5
365 movq mm3, mm1 // make a copy
366 punpcklbw mm1, mm7 // unpack low to word
368 pmullw mm2, mm5 // a * 1/5
369 movq mm6, four_fifths // mm6 = 4/5
371 movq mm4, mm1 // copy of low b
372 pmullw mm4, mm6 // b * 4/5
374 punpckhbw mm3, mm7 // unpack high to word
375 movq mm5, mm3 // copy of high b
377 pmullw mm5, mm6 // b * 4/5
378 paddw mm0, mm4 // a * 1/5 + b * 4/5
380 paddw mm2, mm5 // a * 1/5 + b * 4/5
381 paddw mm0, round_values // + 128
383 paddw mm2, round_values // + 128
384 psrlw mm0, 8
386 psrlw mm2, 8
387 packuswb mm0, mm2 // des [1]
389 movq QWORD ptr [esi+ecx], mm0 // write des[1]
390 movq mm0, [esi+ecx*2] // mm0 = src[2]
392 // mm1, mm3 --- Src[1]
393 // mm0 --- Src[2]
394 // mm7 for unpacking
396 movq mm5, two_fifths
397 movq mm2, mm0 // make a copy
399 pmullw mm1, mm5 // b * 2/5
400 movq mm6, three_fifths
403 punpcklbw mm0, mm7 // unpack low to word
404 pmullw mm3, mm5 // b * 2/5
406 movq mm4, mm0 // make copy of c
407 punpckhbw mm2, mm7 // unpack high to word
409 pmullw mm4, mm6 // c * 3/5
410 movq mm5, mm2
412 pmullw mm5, mm6 // c * 3/5
413 paddw mm1, mm4 // b * 2/5 + c * 3/5
415 paddw mm3, mm5 // b * 2/5 + c * 3/5
416 paddw mm1, round_values // + 128
418 paddw mm3, round_values // + 128
419 psrlw mm1, 8
421 psrlw mm3, 8
422 packuswb mm1, mm3 // des[2]
424 movq QWORD ptr [esi+ecx*2], mm1 // write des[2]
425 movq mm1, [edi] // mm1=Src[3];
427 // mm0, mm2 --- Src[2]
428 // mm1 --- Src[3]
429 // mm6 --- 3/5
430 // mm7 for unpacking
432 pmullw mm0, mm6 // c * 3/5
433 movq mm5, two_fifths // mm5 = 2/5
435 movq mm3, mm1 // make a copy
436 pmullw mm2, mm6 // c * 3/5
438 punpcklbw mm1, mm7 // unpack low
439 movq mm4, mm1 // make a copy
441 punpckhbw mm3, mm7 // unpack high
442 pmullw mm4, mm5 // d * 2/5
444 movq mm6, mm3 // make a copy
445 pmullw mm6, mm5 // d * 2/5
447 paddw mm0, mm4 // c * 3/5 + d * 2/5
448 paddw mm2, mm6 // c * 3/5 + d * 2/5
450 paddw mm0, round_values // + 128
451 paddw mm2, round_values // + 128
453 psrlw mm0, 8
454 psrlw mm2, 8
456 packuswb mm0, mm2 // des[3]
457 movq QWORD ptr [edi], mm0 // write des[3]
459 // mm1, mm3 --- Src[3]
460 // mm7 -- cleared for unpacking
462 movq mm0, [edi+ecx*2] // mm0, Src[0] of the next group
464 movq mm5, four_fifths // mm5 = 4/5
465 pmullw mm1, mm5 // d * 4/5
467 movq mm6, one_fifth // mm6 = 1/5
468 movq mm2, mm0 // make a copy
470 pmullw mm3, mm5 // d * 4/5
471 punpcklbw mm0, mm7 // unpack low
473 pmullw mm0, mm6 // an * 1/5
474 punpckhbw mm2, mm7 // unpack high
476 paddw mm1, mm0 // d * 4/5 + an * 1/5
477 pmullw mm2, mm6 // an * 1/5
479 paddw mm3, mm2 // d * 4/5 + an * 1/5
480 paddw mm1, round_values // + 128
482 paddw mm3, round_values // + 128
483 psrlw mm1, 8
485 psrlw mm3, 8
486 packuswb mm1, mm3 // des[4]
488 movq QWORD ptr [edi+ecx], mm1 // write des[4]
490 add edi, 8
491 add esi, 8
493 sub edx, 8
494 jg vs_4_5_loop
498 /****************************************************************************
500 * ROUTINE : last_vertical_band_4_5_scale_mmx
502 * INPUTS : unsigned char *dest :
503 * unsigned int dest_pitch :
504 * unsigned int dest_width :
506 * OUTPUTS : None.
508 * RETURNS : None
510 * FUNCTION : 4 to 5 up-scaling of the last 4-pixel high band in an image.
512 * SPECIAL NOTES : The last band has no band below it, so output line 4
513 * is a copy of source line 3. The function also has a
514 * "C" only version.
516 ****************************************************************************/
517 static
518 void last_vertical_band_4_5_scale_mmx
520 unsigned char *dest,
521 unsigned int dest_pitch,
522 unsigned int dest_width
// In-place vertical 4->5 interpolation, 8 columns per loop iteration.
// With a, b, c, d the lines at dest + 0..3 * dest_pitch:
//   line 1 = ( 51*a + 205*b + 128) >> 8
//   line 2 = (102*b + 154*c + 128) >> 8
//   line 3 = (154*c + 102*d + 128) >> 8
//   line 4 = d (copied before line 3 is overwritten)
// No line beyond dest + 3 * dest_pitch is read. Line 0 is not written.
525 __asm
527 mov esi, dest // Get the source and destination pointer
528 mov ecx, dest_pitch // Get the pitch size
530 lea edi, [esi+ecx*2] // two lines below
531 add edi, ecx // three lines below
533 pxor mm7, mm7 // clear out mm7
534 mov edx, dest_width // Loop counter
536 last_vs_4_5_loop:
538 movq mm0, QWORD ptr [esi] // src[0];
539 movq mm1, QWORD ptr [esi+ecx] // src[1];
541 movq mm2, mm0 // Make a copy
542 punpcklbw mm0, mm7 // unpack low to word
544 movq mm5, one_fifth
545 punpckhbw mm2, mm7 // unpack high to word
547 pmullw mm0, mm5 // a * 1/5
549 movq mm3, mm1 // make a copy
550 punpcklbw mm1, mm7 // unpack low to word
552 pmullw mm2, mm5 // a * 1/5
553 movq mm6, four_fifths // mm6 = 4/5
555 movq mm4, mm1 // copy of low b
556 pmullw mm4, mm6 // b * 4/5
558 punpckhbw mm3, mm7 // unpack high to word
559 movq mm5, mm3 // copy of high b
561 pmullw mm5, mm6 // b * 4/5
562 paddw mm0, mm4 // a * 1/5 + b * 4/5
564 paddw mm2, mm5 // a * 1/5 + b * 4/5
565 paddw mm0, round_values // + 128
567 paddw mm2, round_values // + 128
568 psrlw mm0, 8
570 psrlw mm2, 8
571 packuswb mm0, mm2 // des [1]
573 movq QWORD ptr [esi+ecx], mm0 // write des[1]
574 movq mm0, [esi+ecx*2] // mm0 = src[2]
576 // mm1, mm3 --- Src[1]
577 // mm0 --- Src[2]
578 // mm7 for unpacking
580 movq mm5, two_fifths
581 movq mm2, mm0 // make a copy
583 pmullw mm1, mm5 // b * 2/5
584 movq mm6, three_fifths
587 punpcklbw mm0, mm7 // unpack low to word
588 pmullw mm3, mm5 // b * 2/5
590 movq mm4, mm0 // make copy of c
591 punpckhbw mm2, mm7 // unpack high to word
593 pmullw mm4, mm6 // c * 3/5
594 movq mm5, mm2
596 pmullw mm5, mm6 // c * 3/5
597 paddw mm1, mm4 // b * 2/5 + c * 3/5
599 paddw mm3, mm5 // b * 2/5 + c * 3/5
600 paddw mm1, round_values // + 128
602 paddw mm3, round_values // + 128
603 psrlw mm1, 8
605 psrlw mm3, 8
606 packuswb mm1, mm3 // des[2]
608 movq QWORD ptr [esi+ecx*2], mm1 // write des[2]
609 movq mm1, [edi] // mm1=Src[3];
611 movq QWORD ptr [edi+ecx], mm1 // des[4] = unmodified copy of Src[3]
613 // mm0, mm2 --- Src[2]
614 // mm1 --- Src[3]
615 // mm6 --- 3/5
616 // mm7 for unpacking
618 pmullw mm0, mm6 // c * 3/5
619 movq mm5, two_fifths // mm5 = 2/5
621 movq mm3, mm1 // make a copy
622 pmullw mm2, mm6 // c * 3/5
624 punpcklbw mm1, mm7 // unpack low
625 movq mm4, mm1 // make a copy
627 punpckhbw mm3, mm7 // unpack high
628 pmullw mm4, mm5 // d * 2/5
630 movq mm6, mm3 // make a copy
631 pmullw mm6, mm5 // d * 2/5
633 paddw mm0, mm4 // c * 3/5 + d * 2/5
634 paddw mm2, mm6 // c * 3/5 + d * 2/5
636 paddw mm0, round_values // + 128
637 paddw mm2, round_values // + 128
639 psrlw mm0, 8
640 psrlw mm2, 8
642 packuswb mm0, mm2 // des[3]
643 movq QWORD ptr [edi], mm0 // write des[3]
645 // mm1, mm3 --- Src[3]
646 // mm7 -- cleared for unpacking
647 add edi, 8
648 add esi, 8
650 sub edx, 8
651 jg last_vs_4_5_loop
655 /****************************************************************************
657 * ROUTINE : vertical_band_3_5_scale_mmx
659 * INPUTS : unsigned char *dest :
660 * unsigned int dest_pitch :
661 * unsigned int dest_width :
663 * OUTPUTS : None.
665 * RETURNS : void
667 * FUNCTION : 3 to 5 up-scaling of a 3-pixel high band of pixels.
669 * SPECIAL NOTES : The routine uses the first line of the band below
670 * the current band. The function also has an "C" only
671 * version.
673 ****************************************************************************/
674 static
675 void vertical_band_3_5_scale_mmx
677 unsigned char *dest,
678 unsigned int dest_pitch,
679 unsigned int dest_width
// In-place vertical 3->5 interpolation, 8 columns per loop iteration.
// With a, b, c the lines at dest + 0..2 * dest_pitch and an the line at
// dest + 5 * dest_pitch (Src[0] of the next group):
//   line 1 = (102*a + 154*b  + 128) >> 8
//   line 2 = (205*b +  51*c  + 128) >> 8
//   line 3 = ( 51*b + 205*c  + 128) >> 8
//   line 4 = (154*c + 102*an + 128) >> 8
// Line 0 is not written; lines 3 and 4 are written without being read.
// mm7 doubles as a scratch register in the middle of the loop and is cleared
// again before it is next used for unpacking.
682 __asm
684 mov esi, dest // Get the source and destination pointer
685 mov ecx, dest_pitch // Get the pitch size
687 lea edi, [esi+ecx*2] // two lines below
688 add edi, ecx // three lines below
690 pxor mm7, mm7 // clear out mm7
691 mov edx, dest_width // Loop counter
693 vs_3_5_loop:
695 movq mm0, QWORD ptr [esi] // src[0];
696 movq mm1, QWORD ptr [esi+ecx] // src[1];
698 movq mm2, mm0 // Make a copy
699 punpcklbw mm0, mm7 // unpack low to word
701 movq mm5, two_fifths // mm5 = 2/5
702 punpckhbw mm2, mm7 // unpack high to word
704 pmullw mm0, mm5 // a * 2/5
706 movq mm3, mm1 // make a copy
707 punpcklbw mm1, mm7 // unpack low to word
709 pmullw mm2, mm5 // a * 2/5
710 movq mm6, three_fifths // mm6 = 3/5
712 movq mm4, mm1 // copy of low b
713 pmullw mm4, mm6 // b * 3/5
715 punpckhbw mm3, mm7 // unpack high to word
716 movq mm5, mm3 // copy of high b
718 pmullw mm5, mm6 // b * 3/5
719 paddw mm0, mm4 // a * 2/5 + b * 3/5
721 paddw mm2, mm5 // a * 2/5 + b * 3/5
722 paddw mm0, round_values // + 128
724 paddw mm2, round_values // + 128
725 psrlw mm0, 8
727 psrlw mm2, 8
728 packuswb mm0, mm2 // des [1]
730 movq QWORD ptr [esi+ecx], mm0 // write des[1]
731 movq mm0, [esi+ecx*2] // mm0 = src[2]
733 // mm1, mm3 --- Src[1]
734 // mm0 --- Src[2]
735 // mm7 for unpacking
737 movq mm4, mm1 // b low
738 pmullw mm1, four_fifths // b * 4/5 low
740 movq mm5, mm3 // b high
741 pmullw mm3, four_fifths // b * 4/5 high
743 movq mm2, mm0 // c
744 pmullw mm4, one_fifth // b * 1/5
746 punpcklbw mm0, mm7 // c low
747 pmullw mm5, one_fifth // b * 1/5
749 movq mm6, mm0 // make copy of c low
750 punpckhbw mm2, mm7 // c high
752 pmullw mm6, one_fifth // c * 1/5 low
753 movq mm7, mm2 // make copy of c high (mm7 is no longer zero from here)
755 pmullw mm7, one_fifth // c * 1/5 high
756 paddw mm1, mm6 // b * 4/5 + c * 1/5 low
758 paddw mm3, mm7 // b * 4/5 + c * 1/5 high
759 movq mm6, mm0 // make copy of c low
761 pmullw mm6, four_fifths // c * 4/5 low
762 movq mm7, mm2 // make copy of c high
764 pmullw mm7, four_fifths // c * 4/5 high
766 paddw mm4, mm6 // b * 1/5 + c * 4/5 low
767 paddw mm5, mm7 // b * 1/5 + c * 4/5 high
769 paddw mm1, round_values // + 128
770 paddw mm3, round_values // + 128
772 psrlw mm1, 8
773 psrlw mm3, 8
775 packuswb mm1, mm3 // des[2]
776 movq QWORD ptr [esi+ecx*2], mm1 // write des[2]
778 paddw mm4, round_values // + 128
779 paddw mm5, round_values // + 128
781 psrlw mm4, 8
782 psrlw mm5, 8
784 packuswb mm4, mm5 // des[3]
785 movq QWORD ptr [edi], mm4 // write des[3]
787 // mm0, mm2 --- Src[2] (c), still unpacked to words
789 pxor mm7, mm7 // clear mm7 for unpacking
790 movq mm1, [edi+ecx*2] // mm1 = Src[0] of the next group
792 movq mm5, three_fifths // mm5 = 3/5
793 pmullw mm0, mm5 // c * 3/5
795 movq mm6, two_fifths // mm6 = 2/5
796 movq mm3, mm1 // make a copy
798 pmullw mm2, mm5 // c * 3/5
799 punpcklbw mm1, mm7 // unpack low
801 pmullw mm1, mm6 // an * 2/5
802 punpckhbw mm3, mm7 // unpack high
804 paddw mm0, mm1 // c * 3/5 + an * 2/5
805 pmullw mm3, mm6 // an * 2/5
807 paddw mm2, mm3 // c * 3/5 + an * 2/5
808 paddw mm0, round_values // + 128
810 paddw mm2, round_values // + 128
811 psrlw mm0, 8
813 psrlw mm2, 8
814 packuswb mm0, mm2 // des[4]
816 movq QWORD ptr [edi+ecx], mm0 // write des[4]
818 add edi, 8
819 add esi, 8
821 sub edx, 8
822 jg vs_3_5_loop
826 /****************************************************************************
828 * ROUTINE : last_vertical_band_3_5_scale_mmx
830 * INPUTS : unsigned char *dest :
831 * unsigned int dest_pitch :
832 * unsigned int dest_width :
834 * OUTPUTS : None.
836 * RETURNS : void
838 * FUNCTION : 3 to 5 up-scaling of the last 3-pixel high band in an image.
840 * SPECIAL NOTES : The last band has no band below it, so output line 4
841 * is a copy of source line 2. The function also has a
842 * "C" only version.
844 ****************************************************************************/
845 static
846 void last_vertical_band_3_5_scale_mmx
848 unsigned char *dest,
849 unsigned int dest_pitch,
850 unsigned int dest_width
853 __asm
855 mov esi, dest // Get the source and destination pointer
856 mov ecx, dest_pitch // Get the pitch size
858 lea edi, [esi+ecx*2] // tow lines below
859 add edi, ecx // three lines below
861 pxor mm7, mm7 // clear out mm7
862 mov edx, dest_width // Loop counter
865 last_vs_3_5_loop:
867 movq mm0, QWORD ptr [esi] // src[0];
868 movq mm1, QWORD ptr [esi+ecx] // src[1];
870 movq mm2, mm0 // Make a copy
871 punpcklbw mm0, mm7 // unpack low to word
873 movq mm5, two_fifths // mm5 = 2/5
874 punpckhbw mm2, mm7 // unpack high to word
876 pmullw mm0, mm5 // a * 2/5
878 movq mm3, mm1 // make a copy
879 punpcklbw mm1, mm7 // unpack low to word
881 pmullw mm2, mm5 // a * 2/5
882 movq mm6, three_fifths // mm6 = 3/5
884 movq mm4, mm1 // copy of low b
885 pmullw mm4, mm6 // b * 3/5
887 punpckhbw mm3, mm7 // unpack high to word
888 movq mm5, mm3 // copy of high b
890 pmullw mm5, mm6 // b * 3/5
891 paddw mm0, mm4 // a * 2/5 + b * 3/5
893 paddw mm2, mm5 // a * 2/5 + b * 3/5
894 paddw mm0, round_values // + 128
896 paddw mm2, round_values // + 128
897 psrlw mm0, 8
899 psrlw mm2, 8
900 packuswb mm0, mm2 // des [1]
902 movq QWORD ptr [esi+ecx], mm0 // write des[1]
903 movq mm0, [esi+ecx*2] // mm0 = src[2]
907 // mm1, mm3 --- Src[1]
908 // mm0 --- Src[2]
909 // mm7 for unpacking
911 movq mm4, mm1 // b low
912 pmullw mm1, four_fifths // b * 4/5 low
914 movq QWORD ptr [edi+ecx], mm0 // write des[4]
916 movq mm5, mm3 // b high
917 pmullw mm3, four_fifths // b * 4/5 high
919 movq mm2, mm0 // c
920 pmullw mm4, one_fifth // b * 1/5
922 punpcklbw mm0, mm7 // c low
923 pmullw mm5, one_fifth // b * 1/5
925 movq mm6, mm0 // make copy of c low
926 punpckhbw mm2, mm7 // c high
928 pmullw mm6, one_fifth // c * 1/5 low
929 movq mm7, mm2 // make copy of c high
931 pmullw mm7, one_fifth // c * 1/5 high
932 paddw mm1, mm6 // b * 4/5 + c * 1/5 low
934 paddw mm3, mm7 // b * 4/5 + c * 1/5 high
935 movq mm6, mm0 // make copy of c low
937 pmullw mm6, four_fifths // c * 4/5 low
938 movq mm7, mm2 // make copy of c high
940 pmullw mm7, four_fifths // c * 4/5 high
942 paddw mm4, mm6 // b * 1/5 + c * 4/5 low
943 paddw mm5, mm7 // b * 1/5 + c * 4/5 high
945 paddw mm1, round_values // + 128
946 paddw mm3, round_values // + 128
948 psrlw mm1, 8
949 psrlw mm3, 8
951 packuswb mm1, mm3 // des[2]
952 movq QWORD ptr [esi+ecx*2], mm1 // write des[2]
954 paddw mm4, round_values // + 128
955 paddw mm5, round_values // + 128
957 psrlw mm4, 8
958 psrlw mm5, 8
960 packuswb mm4, mm5 // des[3]
961 movq QWORD ptr [edi], mm4 // write des[3]
963 // mm0, mm2 --- Src[3]
965 add edi, 8
966 add esi, 8
968 sub edx, 8
969 jg last_vs_3_5_loop
973 /****************************************************************************
975 * ROUTINE : vertical_band_1_2_scale_mmx
977 * INPUTS : unsigned char *dest :
978 * unsigned int dest_pitch :
979 * unsigned int dest_width :
981 * OUTPUTS : None.
983 * RETURNS : void
985 * FUNCTION : 1 to 2 up-scaling of a band of pixels.
987 * SPECIAL NOTES : The routine uses the first line of the band below
988 * the current band. The function also has an "C" only
989 * version.
991 ****************************************************************************/
992 static
993 void vertical_band_1_2_scale_mmx
995 unsigned char *dest,
996 unsigned int dest_pitch,
997 unsigned int dest_width
// Fills the line at dest + dest_pitch with the rounded average of the lines
// at dest and dest + 2 * dest_pitch: (a + b + 1) >> 1, 8 columns per pass.
1000 __asm
1003 mov esi, dest // Get the source and destination pointer
1004 mov ecx, dest_pitch // Get the pitch size
1006 pxor mm7, mm7 // clear out mm7
1007 mov edx, dest_width // Loop counter
1009 vs_1_2_loop:
1011 movq mm0, [esi] // get Src[0]
1012 movq mm1, [esi + ecx * 2] // get Src[1] (two lines down)
1014 movq mm2, mm0 // make copy before unpack
1015 movq mm3, mm1 // make copy before unpack
1017 punpcklbw mm0, mm7 // low Src[0]
1018 movq mm6, four_ones // mm6= 1, 1, 1, 1
1020 punpcklbw mm1, mm7 // low Src[1]
1021 paddw mm0, mm1 // low (a + b)
1023 punpckhbw mm2, mm7 // high Src[0]
1024 paddw mm0, mm6 // low (a + b + 1)
1026 punpckhbw mm3, mm7 // high Src[1]
1027 paddw mm2, mm3 // high (a + b )
1029 psraw mm0, 1 // low (a + b +1 )/2
1030 paddw mm2, mm6 // high (a + b + 1)
1032 psraw mm2, 1 // high (a + b + 1)/2
1033 packuswb mm0, mm2 // pack results
1035 movq [esi+ecx], mm0 // write out eight bytes to the line in between
1036 add esi, 8
1038 sub edx, 8
1039 jg vs_1_2_loop
1044 /****************************************************************************
1046 * ROUTINE : last_vertical_band_1_2_scale_mmx
1048 * INPUTS : unsigned char *dest :
1049 * unsigned int dest_pitch :
1050 * unsigned int dest_width :
1052 * OUTPUTS : None.
1054 * RETURNS : void
1056 * FUNCTION : 1 to 2 up-scaling of the last band of pixels.
1058 * SPECIAL NOTES : No band below the current one is read; the extra
1059 * line is a copy of the band's own line. The function
1060 * also has a "C" only version.
1062 ****************************************************************************/
1063 static
1064 void last_vertical_band_1_2_scale_mmx
1066 unsigned char *dest,
1067 unsigned int dest_pitch,
1068 unsigned int dest_width
// Copies the line at dest to the line at dest + dest_pitch, 8 bytes per pass.
1071 __asm
1073 mov esi, dest // Get the source and destination pointer
1074 mov ecx, dest_pitch // Get the pitch size
1076 mov edx, dest_width // Loop counter
1078 last_vs_1_2_loop:
1080 movq mm0, [esi] // get Src[0]
1081 movq [esi+ecx], mm0 // write out eight bytes one line below
1083 add esi, 8
1084 sub edx, 8
1086 jg last_vs_1_2_loop
1090 /****************************************************************************
1092 * ROUTINE : horizontal_line_1_2_scale_mmx
1094 * INPUTS : const unsigned char *source :
1095 * unsigned int source_width :
1096 * unsigned char *dest :
1097 * unsigned int dest_width :
1099 * OUTPUTS : None.
1101 * RETURNS : void
1103 * FUNCTION : 1 to 2 up-scaling of a horizontal line of pixels.
1105 * SPECIAL NOTES : None.
1107 ****************************************************************************/
1108 static
1109 void horizontal_line_1_2_scale_mmx
1111 const unsigned char *source,
1112 unsigned int source_width,
1113 unsigned char *dest,
1114 unsigned int dest_width
1117 (void) dest_width; // dest_width is not used by this routine
// Doubles a line: for source pixels p[i], writes p[i] followed by
// (p[i] + p[i+1] + 1) >> 1. Works on 8 source pixels (16 output bytes) at a
// time. The loop body runs once before the remaining count is tested and the
// code after the loop always handles 8 more pixels, so at least 16 source
// pixels are processed.
1119 __asm
1121 mov esi, source
1122 mov edi, dest
1124 pxor mm7, mm7 // zero for unpacking
1125 movq mm6, four_ones // rounding term
1127 mov ecx, source_width // remaining source pixels
1129 hs_1_2_loop:
1131 movq mm0, [esi] // p0..p7
1132 movq mm1, [esi+1] // p1..p8 (right-hand neighbours)
1134 movq mm2, mm0
1135 movq mm3, mm1
1137 movq mm4, mm0 // keep the original pixels for interleaving
1138 punpcklbw mm0, mm7
1140 punpcklbw mm1, mm7
1141 paddw mm0, mm1 // low a + b
1143 paddw mm0, mm6 // low a + b + 1
1144 punpckhbw mm2, mm7
1146 punpckhbw mm3, mm7
1147 paddw mm2, mm3 // high a + b
1149 paddw mm2, mm6 // high a + b + 1
1150 psraw mm0, 1
1152 psraw mm2, 1
1153 packuswb mm0, mm2 // mm0 = eight averages
1155 movq mm2, mm4
1156 punpcklbw mm2, mm0 // p0 avg0 p1 avg1 p2 avg2 p3 avg3
1158 movq [edi], mm2
1159 punpckhbw mm4, mm0 // p4 avg4 p5 avg5 p6 avg6 p7 avg7
1161 movq [edi+8], mm4
1162 add esi, 8
1164 add edi, 16
1165 sub ecx, 8
1167 cmp ecx, 8
1168 jg hs_1_2_loop // keep looping while more than 8 pixels remain
1170 // last eight pixels: there is no p8, so p7 is repeated
1172 movq mm0, [esi]
1173 movq mm1, mm0
1175 movq mm2, mm0
1176 movq mm3, mm1
1178 psrlq mm1, 8 // p1..p7, 00
1179 psrlq mm3, 56
1181 psllq mm3, 56 // mm3 = p7 alone in the top byte
1182 por mm1, mm3 // p1..p7, p7
1184 movq mm3, mm1
1185 movq mm4, mm0 // keep the original pixels for interleaving
1187 punpcklbw mm0, mm7
1188 punpcklbw mm1, mm7
1190 paddw mm0, mm1
1191 paddw mm0, mm6
1193 punpckhbw mm2, mm7
1194 punpckhbw mm3, mm7
1196 paddw mm2, mm3
1197 paddw mm2, mm6
1199 psraw mm0, 1
1200 psraw mm2, 1
1202 packuswb mm0, mm2 // mm0 = eight averages
1203 movq mm2, mm4
1205 punpcklbw mm2, mm0
1206 movq [edi], mm2
1208 punpckhbw mm4, mm0
1209 movq [edi+8], mm4
// Per-lane weights (scaled by 256) for the 5->4 horizontal filter:
// const54_1 multiplies pixels 0..3 of a group, const54_2 multiplies pixels 1..4.
// Each lane pair sums to 256.
1217 __declspec(align(16)) const static unsigned short const54_2[] = { 0, 64, 128, 192 };
1218 __declspec(align(16)) const static unsigned short const54_1[] = {256, 192, 128, 64 };
1221 /****************************************************************************
1223 * ROUTINE : horizontal_line_5_4_scale_mmx
1225 * INPUTS : const unsigned char *source : Pointer to source data.
1226 * unsigned int source_width : Number of pixels in the source line.
1227 * unsigned char *dest : Pointer to destination data.
1228 * unsigned int dest_width : Stride of destination (NOT USED).
1230 * OUTPUTS : None.
1232 * RETURNS : void
1234 * FUNCTION : Copies horizontal line of pixels from source to
1235 * destination scaling down by 5 to 4.
1237 * SPECIAL NOTES : None.
1239 ****************************************************************************/
1240 static
1241 void horizontal_line_5_4_scale_mmx
1243 const unsigned char *source,
1244 unsigned int source_width,
1245 unsigned char *dest,
1246 unsigned int dest_width
// Each group of 5 source pixels a b c d e produces 4 output pixels:
//   out0 = a
//   out1 = (192*b +  64*c + 128) >> 8
//   out2 = (128*c + 128*d + 128) >> 8
//   out3 = ( 64*d + 192*e + 128) >> 8
// NOTE(review): the C loop and the asm below compute the same outputs, and
// (void) dest_width appears twice. Presumably the C loop is reference code
// that is meant to be disabled - confirm against the upstream file.
1250 unsigned i;
1251 unsigned int a, b, c, d, e;
1252 unsigned char *des = dest;
1253 const unsigned char *src = source;
1255 (void) dest_width;
1257 for ( i=0; i<source_width; i+=5 )
1259 a = src[0];
1260 b = src[1];
1261 c = src[2];
1262 d = src[3];
1263 e = src[4];
1265 des[0] = a;
1266 des[1] = ((b*192 + c* 64 + 128)>>8);
1267 des[2] = ((c*128 + d*128 + 128)>>8);
1268 des[3] = ((d* 64 + e*192 + 128)>>8);
1270 src += 5;
1271 des += 4;
1274 (void) dest_width;
1276 __asm
1279 mov esi, source ;
1280 mov edi, dest ;
1282 mov ecx, source_width ;
1283 movq mm5, const54_1 ;
1285 pxor mm7, mm7 ;
1286 movq mm6, const54_2 ;
1288 movq mm4, round_values ;
1289 lea edx, [esi+ecx] ; // edx = source + source_width, the loop end marker
1290 horizontal_line_5_4_loop:
// NOTE(review): the bare lines of byte indices below read like byte-layout
// comments that lost their comment marker - confirm against upstream.
// Each pass loads 8 bytes at esi although only 5 belong to the group.
1292 movq mm0, QWORD PTR [esi] ;
1293 00 01 02 03 04 05 06 07
1294 movq mm1, mm0 ;
1295 00 01 02 03 04 05 06 07
1297 psrlq mm0, 8 ;
1298 01 02 03 04 05 06 07 xx
1299 punpcklbw mm1, mm7 ;
1300 xx 00 xx 01 xx 02 xx 03
1302 punpcklbw mm0, mm7 ;
1303 xx 01 xx 02 xx 03 xx 04
1304 pmullw mm1, mm5 // pixels 0..3 * const54_1
1306 pmullw mm0, mm6 // pixels 1..4 * const54_2
1307 add esi, 5
1309 add edi, 4
1310 paddw mm1, mm0 // weighted sums
1312 paddw mm1, mm4 // + 128
1313 psrlw mm1, 8
1315 cmp esi, edx
1316 packuswb mm1, mm7
1318 movd DWORD PTR [edi-4], mm1 // write 4 output pixels (edi was already advanced by 4)
1320 jl horizontal_line_5_4_loop // signed compare of esi against the end marker
// 1/4, 2/4 and 3/4 scaled by 256, one value per MMX word lane.
1325 __declspec(align(16)) const static unsigned short one_fourths[] = { 64, 64, 64, 64 };
1326 __declspec(align(16)) const static unsigned short two_fourths[] = { 128, 128, 128, 128 };
1327 __declspec(align(16)) const static unsigned short three_fourths[] = { 192, 192, 192, 192 };
// Vertical 5->4 reduction, 4 columns per loop iteration. With r0..r4 the five
// lines at source + 0..4 * src_pitch, the four lines at dest + 0..3 * dest_pitch
// receive:
//   line 0 = r0
//   line 1 = (192*r1 +  64*r2 + 128) >> 8
//   line 2 = (128*r2 + 128*r3 + 128) >> 8
//   line 3 = ( 64*r3 + 192*r4 + 128) >> 8
1329 static
1330 void vertical_band_5_4_scale_mmx(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
1333 __asm
1335 push ebx // ebx holds the column counter; restored by the pop at the end
1337 mov esi, source // Get the source pointer
1338 mov ecx, src_pitch // Get the source pitch
1340 mov edi, dest // Get the destination pointer
1341 pxor mm7, mm7 // clear out mm7
1343 mov edx, dest_pitch // Get the destination pitch
1344 mov ebx, dest_width // Loop counter
1346 vs_5_4_loop:
1348 movd mm0, DWORD ptr [esi] // src[0];
1349 movd mm1, DWORD ptr [esi+ecx] // src[1];
1351 movd mm2, DWORD ptr [esi+ecx*2] // src[2]
1352 lea eax, [esi+ecx*2] // eax = address of src[2]
1354 punpcklbw mm1, mm7
1355 punpcklbw mm2, mm7
1357 movq mm3, mm2 // second copy of src[2]
1358 pmullw mm1, three_fourths // src[1] * 3/4
1360 pmullw mm2, one_fourths // src[2] * 1/4
1361 movd mm4, [eax+ecx] // src[3]
1363 pmullw mm3, two_fourths // src[2] * 2/4
1364 punpcklbw mm4, mm7
1366 movq mm5, mm4 // second copy of src[3]
1367 pmullw mm4, two_fourths // src[3] * 2/4
1369 paddw mm1, mm2 // src[1] * 3/4 + src[2] * 1/4
1370 movd mm6, [eax+ecx*2] // src[4]
1372 pmullw mm5, one_fourths // src[3] * 1/4
1373 paddw mm1, round_values; // + 128
1375 paddw mm3, mm4 // src[2] * 2/4 + src[3] * 2/4
1376 psrlw mm1, 8
1378 punpcklbw mm6, mm7
1379 paddw mm3, round_values // + 128
1381 pmullw mm6, three_fourths // src[4] * 3/4
1382 psrlw mm3, 8
1384 packuswb mm1, mm7
1385 packuswb mm3, mm7
1387 movd DWORD PTR [edi], mm0 // des[0] = src[0], copied unchanged
1388 movd DWORD PTR [edi+edx], mm1 // write des[1]
1391 paddw mm5, mm6 // src[3] * 1/4 + src[4] * 3/4
1392 movd DWORD PTR [edi+edx*2], mm3 // write des[2]
1394 lea eax, [edi+edx*2] // eax = address of des[2], taken before edi advances
1395 paddw mm5, round_values // + 128
1397 psrlw mm5, 8
1398 add edi, 4
1400 packuswb mm5, mm7
1401 movd DWORD PTR [eax+edx], mm5 // write des[3]
1403 add esi, 4
1404 sub ebx, 4
1406 jg vs_5_4_loop
1408 pop ebx
// Per-lane weights (scaled by 256) for the 5->3 horizontal filter. 85 and 171
// are 1/3 and 2/3 of 256 rounded to integers. const53_2 multiplies the
// even-indexed pixels of a group, const53_1 the odd-indexed ones; the fourth
// lane of both is zero.
1413 __declspec(align(16)) const static unsigned short const53_1[] = { 0, 85, 171, 0 };
1414 __declspec(align(16)) const static unsigned short const53_2[] = {256, 171, 85, 0 };
1417 static
1418 void horizontal_line_5_3_scale_mmx
1420 const unsigned char *source,
1421 unsigned int source_width,
1422 unsigned char *dest,
1423 unsigned int dest_width
1427 (void) dest_width;
1428 __asm
1431 mov esi, source ;
1432 mov edi, dest ;
1434 mov ecx, source_width ;
1435 movq mm5, const53_1 ;
1437 pxor mm7, mm7 ;
1438 movq mm6, const53_2 ;
1440 movq mm4, round_values ;
1441 lea edx, [esi+ecx-5] ;
1442 horizontal_line_5_3_loop:
1444 movq mm0, QWORD PTR [esi] ;
1445 00 01 02 03 04 05 06 07
1446 movq mm1, mm0 ;
1447 00 01 02 03 04 05 06 07
1449 psllw mm0, 8 ;
1450 xx 00 xx 02 xx 04 xx 06
1451 psrlw mm1, 8 ;
1452 01 xx 03 xx 05 xx 07 xx
1454 psrlw mm0, 8 ;
1455 00 xx 02 xx 04 xx 06 xx
1456 psllq mm1, 16 ;
1457 xx xx 01 xx 03 xx 05 xx
1459 pmullw mm0, mm6
1461 pmullw mm1, mm5
1462 add esi, 5
1464 add edi, 3
1465 paddw mm1, mm0
1467 paddw mm1, mm4
1468 psrlw mm1, 8
1470 cmp esi, edx
1471 packuswb mm1, mm7
1473 movd DWORD PTR [edi-3], mm1
1474 jl horizontal_line_5_3_loop
1476 //exit condition
1477 movq mm0, QWORD PTR [esi] ;
1478 00 01 02 03 04 05 06 07
1479 movq mm1, mm0 ;
1480 00 01 02 03 04 05 06 07
1482 psllw mm0, 8 ;
1483 xx 00 xx 02 xx 04 xx 06
1484 psrlw mm1, 8 ;
1485 01 xx 03 xx 05 xx 07 xx
1487 psrlw mm0, 8 ;
1488 00 xx 02 xx 04 xx 06 xx
1489 psllq mm1, 16 ;
1490 xx xx 01 xx 03 xx 05 xx
1492 pmullw mm0, mm6
1494 pmullw mm1, mm5
1495 paddw mm1, mm0
1497 paddw mm1, mm4
1498 psrlw mm1, 8
1500 packuswb mm1, mm7
1501 movd eax, mm1
1503 mov edx, eax
1504 shr edx, 16
1506 mov WORD PTR[edi], ax
1507 mov BYTE PTR[edi+2], dl
1513 __declspec(align(16)) const static unsigned short one_thirds[] = { 85, 85, 85, 85 };
1514 __declspec(align(16)) const static unsigned short two_thirds[] = { 171, 171, 171, 171 };
1516 static
1517 void vertical_band_5_3_scale_mmx(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
1520 __asm
1522 push ebx
1524 mov esi, source // Get the source and destination pointer
1525 mov ecx, src_pitch // Get the pitch size
1527 mov edi, dest // tow lines below
1528 pxor mm7, mm7 // clear out mm7
1530 mov edx, dest_pitch // Loop counter
1531 movq mm5, one_thirds
1533 movq mm6, two_thirds
1534 mov ebx, dest_width;
1536 vs_5_3_loop:
1538 movd mm0, DWORD ptr [esi] // src[0];
1539 movd mm1, DWORD ptr [esi+ecx] // src[1];
1541 movd mm2, DWORD ptr [esi+ecx*2]
1542 lea eax, [esi+ecx*2] //
1544 punpcklbw mm1, mm7
1545 punpcklbw mm2, mm7
1547 pmullw mm1, mm5
1548 pmullw mm2, mm6
1550 movd mm3, DWORD ptr [eax+ecx]
1551 movd mm4, DWORD ptr [eax+ecx*2]
1553 punpcklbw mm3, mm7
1554 punpcklbw mm4, mm7
1556 pmullw mm3, mm6
1557 pmullw mm4, mm5
1560 movd DWORD PTR [edi], mm0
1561 paddw mm1, mm2
1563 paddw mm1, round_values
1564 psrlw mm1, 8
1566 packuswb mm1, mm7
1567 paddw mm3, mm4
1569 paddw mm3, round_values
1570 movd DWORD PTR [edi+edx], mm1
1572 psrlw mm3, 8
1573 packuswb mm3, mm7
1575 movd DWORD PTR [edi+edx*2], mm3
1578 add edi, 4
1579 add esi, 4
1581 sub ebx, 4
1582 jg vs_5_3_loop
1584 pop ebx
1591 /****************************************************************************
1593 * ROUTINE : horizontal_line_2_1_scale
1595 * INPUTS : const unsigned char *source :
1596 * unsigned int source_width :
1597 * unsigned char *dest :
1598 * unsigned int dest_width :
1600 * OUTPUTS : None.
1602 * RETURNS : void
1604 * FUNCTION : 1 to 2 up-scaling of a horizontal line of pixels.
1606 * SPECIAL NOTES : None.
1608 ****************************************************************************/
1609 static
1610 void horizontal_line_2_1_scale_mmx
1612 const unsigned char *source,
1613 unsigned int source_width,
1614 unsigned char *dest,
1615 unsigned int dest_width
1618 (void) dest_width;
1619 (void) source_width;
1620 __asm
1622 mov esi, source
1623 mov edi, dest
1625 pxor mm7, mm7
1626 mov ecx, dest_width
1628 xor edx, edx
1629 hs_2_1_loop:
1631 movq mm0, [esi+edx*2]
1632 psllw mm0, 8
1634 psrlw mm0, 8
1635 packuswb mm0, mm7
1637 movd DWORD Ptr [edi+edx], mm0;
1638 add edx, 4
1640 cmp edx, ecx
1641 jl hs_2_1_loop
/*
 * vertical_band_2_1_scale_mmx
 *
 * Unfiltered 2-to-1 vertical step: copies dest_width bytes from the row at
 * source to the row at dest.  Neither pitch is needed for a single-row copy.
 */
static
void vertical_band_2_1_scale_mmx(unsigned char *source,
                                 unsigned int src_pitch,
                                 unsigned char *dest,
                                 unsigned int dest_pitch,
                                 unsigned int dest_width)
{
    /* Only one row is touched, so the row strides are irrelevant. */
    (void) src_pitch;
    (void) dest_pitch;

    vpx_memcpy(dest, source, dest_width);
}
1657 __declspec(align(16)) const static unsigned short three_sixteenths[] = { 48, 48, 48, 48 };
1658 __declspec(align(16)) const static unsigned short ten_sixteenths[] = { 160, 160, 160, 160 };
1660 static
1661 void vertical_band_2_1_scale_i_mmx(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
1664 (void) dest_pitch;
1665 __asm
1667 mov esi, source
1668 mov edi, dest
1670 mov eax, src_pitch
1671 mov edx, dest_width
1673 pxor mm7, mm7
1674 sub esi, eax //back one line
1677 lea ecx, [esi+edx];
1678 movq mm6, round_values;
1680 movq mm5, three_sixteenths;
1681 movq mm4, ten_sixteenths;
1683 vs_2_1_i_loop:
1684 movd mm0, [esi] //
1685 movd mm1, [esi+eax] //
1687 movd mm2, [esi+eax*2] //
1688 punpcklbw mm0, mm7
1690 pmullw mm0, mm5
1691 punpcklbw mm1, mm7
1693 pmullw mm1, mm4
1694 punpcklbw mm2, mm7
1696 pmullw mm2, mm5
1697 paddw mm0, round_values
1699 paddw mm1, mm2
1700 paddw mm0, mm1
1702 psrlw mm0, 8
1703 packuswb mm0, mm7
1705 movd DWORD PTR [edi], mm0
1706 add esi, 4
1708 add edi, 4;
1709 cmp esi, ecx
1710 jl vs_2_1_i_loop
1717 void
1718 register_mmxscalers(void)
1720 vp8_horizontal_line_1_2_scale = horizontal_line_1_2_scale_mmx;
1721 vp8_vertical_band_1_2_scale = vertical_band_1_2_scale_mmx;
1722 vp8_last_vertical_band_1_2_scale = last_vertical_band_1_2_scale_mmx;
1723 vp8_horizontal_line_3_5_scale = horizontal_line_3_5_scale_mmx;
1724 vp8_vertical_band_3_5_scale = vertical_band_3_5_scale_mmx;
1725 vp8_last_vertical_band_3_5_scale = last_vertical_band_3_5_scale_mmx;
1726 vp8_horizontal_line_4_5_scale = horizontal_line_4_5_scale_mmx;
1727 vp8_vertical_band_4_5_scale = vertical_band_4_5_scale_mmx;
1728 vp8_last_vertical_band_4_5_scale = last_vertical_band_4_5_scale_mmx;
1730 vp8_horizontal_line_3_4_scale = vp8cx_horizontal_line_3_4_scale_c;
1731 vp8_vertical_band_3_4_scale = vp8cx_vertical_band_3_4_scale_c;
1732 vp8_last_vertical_band_3_4_scale = vp8cx_last_vertical_band_3_4_scale_c;
1733 vp8_horizontal_line_2_3_scale = vp8cx_horizontal_line_2_3_scale_c;
1734 vp8_vertical_band_2_3_scale = vp8cx_vertical_band_2_3_scale_c;
1735 vp8_last_vertical_band_2_3_scale = vp8cx_last_vertical_band_2_3_scale_c;
1739 vp8_vertical_band_5_4_scale = vertical_band_5_4_scale_mmx;
1740 vp8_vertical_band_5_3_scale = vertical_band_5_3_scale_mmx;
1741 vp8_vertical_band_2_1_scale = vertical_band_2_1_scale_mmx;
1742 vp8_vertical_band_2_1_scale_i = vertical_band_2_1_scale_i_mmx;
1743 vp8_horizontal_line_2_1_scale = horizontal_line_2_1_scale_mmx;
1744 vp8_horizontal_line_5_3_scale = horizontal_line_5_3_scale_mmx;
1745 vp8_horizontal_line_5_4_scale = horizontal_line_5_4_scale_mmx;