2 * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
4 * Use of this source code is governed by a BSD-style license and patent
5 * grant that can be found in the LICENSE file in the root of the source
6 * tree. All contributing project authors may be found in the AUTHORS
7 * file in the root of the source tree.
11 /****************************************************************************
13 * Module Title : scaleopt.cpp
15 * Description : Optimized scaling functions
17 ****************************************************************************/
22 /****************************************************************************
24 ****************************************************************************/
// Fixed-point blend weights for the MMX scalers in this file.  Each weight is
// a fraction scaled by 256: 51 ~ 256/5, 102 ~ 2*256/5, 154 ~ 3*256/5 and
// 205 ~ 4*256/5.  round_values (128 = 256/2) is the rounding term the scalers
// add to a weighted sum; four_ones is the +1 rounding term of the 1:2 average.
// const45_* and const35_* hold per-lane weight patterns loaded by the
// horizontal 4:5 and 3:5 scalers (lane-wise, *_1 + *_2 == 256).
// mask45 has only byte 6 set and is used to isolate that byte of a quadword.
25 __declspec(align(16)) const static unsigned short one_fifth
[] = { 51, 51, 51, 51 };
26 __declspec(align(16)) const static unsigned short two_fifths
[] = { 102, 102, 102, 102 };
27 __declspec(align(16)) const static unsigned short three_fifths
[] = { 154, 154, 154, 154 };
28 __declspec(align(16)) const static unsigned short four_fifths
[] = { 205, 205, 205, 205 };
29 __declspec(align(16)) const static unsigned short round_values
[] = { 128, 128, 128, 128 };
30 __declspec(align(16)) const static unsigned short four_ones
[] = { 1, 1, 1, 1};
31 __declspec(align(16)) const static unsigned short const45_2
[] = {205, 154, 102, 51 };
32 __declspec(align(16)) const static unsigned short const45_1
[] = { 51, 102, 154, 205 };
33 __declspec(align(16)) const static unsigned char mask45
[] = { 0, 0, 0, 0, 0, 0, 255, 0};
34 __declspec(align(16)) const static unsigned short const35_2
[] = { 154, 51, 205, 102 };
35 __declspec(align(16)) const static unsigned short const35_1
[] = { 102, 205, 51, 154 };
39 #include "vpx_scale/vpxscale.h"
40 #include "vpx_mem/vpx_mem.h"
42 /****************************************************************************
44 * ROUTINE : horizontal_line_3_5_scale_mmx
46 * INPUTS : const unsigned char *source :
47 * unsigned int source_width :
48 * unsigned char *dest :
49 * unsigned int dest_width :
55 * FUNCTION : 3 to 5 up-scaling of a horizontal line of pixels.
57 * SPECIAL NOTES : None.
59 ****************************************************************************/
// MMX 3:5 horizontal scaler.  The register comments below label source
// bytes 00..03 in memory order; mm5/mm6 hold the per-lane const35 weights
// and mm4 the rounding term.
// NOTE(review): ebx is used as scratch here; rbx is callee-saved in the
// common x86-64 ABIs -- confirm it is preserved around this asm block.
61 void horizontal_line_3_5_scale_mmx
63 const unsigned char *source
,
64 unsigned int source_width
,
66 unsigned int dest_width
82 movq mm5
, const35_1
// mm5 = 66 xx cd xx 33 xx 9a xx
83 movq mm6
, const35_2
// mm6 = 9a xx 33 xx cd xx 66 xx
85 movq mm4
, round_values
// mm4 = 80 xx 80 xx 80 xx 80 xx
86 pxor mm7
, mm7
// clear mm7 (zero source for byte->word unpacking)
90 mov eax
, DWORD PTR
[rsi
] // eax = 00 01 02 03
93 and ebx
, 0xffff00 // ebx = xx 01 02 xx
94 mov ecx
, eax
// ecx = 00 01 02 03
96 and eax
, 0xffff0000 // eax = xx xx 02 03
97 xor ecx
, eax
// ecx = 00 01 xx xx
99 shr ebx
, 8 // ebx = 01 02 xx xx
100 or eax
, ebx
// eax = 01 02 02 03
102 shl ebx
, 16 // ebx = xx xx 01 02
103 movd mm1
, eax
// mm1 = 01 02 02 03 xx xx xx xx
105 or ebx
, ecx
// ebx = 00 01 01 02
106 punpcklbw mm1
, mm7
// mm1 = 01 xx 02 xx 02 xx 03 xx
108 movd mm0
, ebx
// mm0 = 00 01 01 02
111 punpcklbw mm0
, mm7
// mm0 = 00 xx 01 xx 01 xx 02 xx
114 mov
[rdi
], ebx
// write output 00 xx xx xx
126 movd DWORD Ptr
[rdi
-4], mm0
127 jl horiz_line_3_5_loop // NOTE(review): jl is a signed condition; if the elided compare is on pointers, jb is the unsigned form -- confirm
// Code after the loop branch: the same byte shuffle, except that pixel 02 is
// replicated where the loop body uses pixel 03 (see the eax comments below).
130 mov eax
, DWORD PTR
[rsi
] // eax = 00 01 02 03
133 and ebx
, 0xffff00 // ebx = xx 01 02 xx
134 mov ecx
, eax
// ecx = 00 01 02 03
136 and eax
, 0xffff0000 // eax = xx xx 02 03
137 xor ecx
, eax
// ecx = 00 01 xx xx
139 shr ebx
, 8 // ebx = 01 02 xx xx
140 or eax
, ebx
// eax = 01 02 02 03
142 shl eax
, 8 // eax = xx 01 02 02
143 and eax
, 0xffff0000 // eax = xx xx 02 02
145 or eax
, ebx
// eax = 01 02 02 02
147 shl ebx
, 16 // ebx = xx xx 01 02
148 movd mm1
, eax
// mm1 = 01 02 02 02 xx xx xx xx
150 or ebx
, ecx
// ebx = 00 01 01 02
151 punpcklbw mm1
, mm7
// mm1 = 01 xx 02 xx 02 xx 02 xx
153 movd mm0
, ebx
// mm0 = 00 01 01 02
156 punpcklbw mm0
, mm7
// mm0 = 00 xx 01 xx 01 xx 02 xx
159 mov
[rdi
], ebx
// write output 00 xx xx xx
166 movd DWORD Ptr
[rdi
+1], mm0
175 /****************************************************************************
177 * ROUTINE : horizontal_line_4_5_scale_mmx
179 * INPUTS : const unsigned char *source :
180 * unsigned int source_width :
181 * unsigned char *dest :
182 * unsigned int dest_width :
188 * FUNCTION : 4 to 5 up-scaling of a horizontal line of pixels.
190 * SPECIAL NOTES : None.
192 ****************************************************************************/
// MMX 4:5 horizontal scaler.  Eight source bytes (00..07) and the same run
// shifted by one byte (01..08) are widened to words, multiplied lane-wise by
// const45_1 / const45_2, summed, rounded and shifted right by 8.
194 void horizontal_line_4_5_scale_mmx
196 const unsigned char *source
,
197 unsigned int source_width
,
199 unsigned int dest_width
210 mov ecx
, source_width
211 lea rdx
, [rsi
+rcx
-8]; // rdx = rsi + source_width - 8
213 movq mm5
, const45_1
// mm5 = 33 xx 66 xx 9a xx cd xx
214 movq mm6
, const45_2
// mm6 = cd xx 9a xx 66 xx 33 xx
216 movq mm4
, round_values
// mm4 = 80 xx 80 xx 80 xx 80 xx
217 pxor mm7
, mm7
// clear mm7 (zero source for byte->word unpacking)
221 movq mm0
, QWORD PTR
[rsi
] // mm0 = 00 01 02 03 04 05 06 07
222 movq mm1
, QWORD PTR
[rsi
+1]; // mm1 = 01 02 03 04 05 06 07 08
224 movq mm2
, mm0
// mm2 = 00 01 02 03 04 05 06 07
225 movq mm3
, mm1
// mm3 = 01 02 03 04 05 06 07 08
227 movd DWORD PTR
[rdi
], mm0
// write output 00 xx xx xx
228 punpcklbw mm0
, mm7
// mm0 = 00 xx 01 xx 02 xx 03 xx
230 punpcklbw mm1
, mm7
// mm1 = 01 xx 02 xx 03 xx 04 xx
231 pmullw mm0
, mm5
// 00* 51 01*102 02*154 03*205
233 pmullw mm1
, mm6
// 01*205 02*154 03*102 04* 51
234 punpckhbw mm2
, mm7
// mm2 = 04 xx 05 xx 06 xx 07 xx
236 movd DWORD PTR
[rdi
+5], mm2
// write output 05 xx xx xx
237 pmullw mm2
, mm5
// 04* 51 05*102 06*154 07*205
239 punpckhbw mm3
, mm7
// mm3 = 05 xx 06 xx 07 xx 08 xx
240 pmullw mm3
, mm6
// 05*205 06*154 07*102 08* 51
242 paddw mm0
, mm1
// mm0 += mm1: sum of the two weighted products
245 psrlw mm0
, 8 // output: 01 xx 02 xx 03 xx 04 xx
248 movd DWORD PTR
[rdi
+1], mm0
// write output 01 02 03 04
254 paddw mm2
, mm4
// added round values
260 movd DWORD PTR
[rdi
-4], mm2
// write output 06 07 08 09
261 jl horiz_line_4_5_loop // NOTE(review): jl is a signed condition; if the elided compare is on pointers, jb is the unsigned form -- confirm
// Code after the loop branch: only eight source bytes are loaded, and the
// "shifted by one" operand is built in-register with pixel 07 duplicated in
// the last byte (psrlq / mask45 / psllq / por below).
264 movq mm0
, [rsi
] // mm0 = 00 01 02 03 04 05 06 07
265 movq mm1
, mm0
// mm1 = 00 01 02 03 04 05 06 07
267 movq mm2
, mm0
// mm2 = 00 01 02 03 04 05 06 07
268 psrlq mm1
, 8 // mm1 = 01 02 03 04 05 06 07 00
270 movq mm3
, mask45
// mm3 = 00 00 00 00 00 00 ff 00
271 pand mm3
, mm1
// mm3 = 00 00 00 00 00 00 07 00
273 psllq mm3
, 8 // mm3 = 00 00 00 00 00 00 00 07
274 por mm1
, mm3
// mm1 = 01 02 03 04 05 06 07 07
278 movd DWORD PTR
[rdi
], mm0
// write output 00 xx xx xx
279 punpcklbw mm0
, mm7
// mm0 = 00 xx 01 xx 02 xx 03 xx
281 punpcklbw mm1
, mm7
// mm1 = 01 xx 02 xx 03 xx 04 xx
282 pmullw mm0
, mm5
// 00* 51 01*102 02*154 03*205
284 pmullw mm1
, mm6
// 01*205 02*154 03*102 04* 51
285 punpckhbw mm2
, mm7
// mm2 = 04 xx 05 xx 06 xx 07 xx
287 movd DWORD PTR
[rdi
+5], mm2
// write output 05 xx xx xx
288 pmullw mm2
, mm5
// 04* 51 05*102 06*154 07*205
290 punpckhbw mm3
, mm7
// mm3 = 05 xx 06 xx 07 xx 08 xx
291 pmullw mm3
, mm6
// 05*205 06*154 07*102 07* 51
293 paddw mm0
, mm1
// mm0 += mm1: sum of the two weighted products
296 psrlw mm0
, 8 // output: 01 xx 02 xx 03 xx 04 xx
297 packuswb mm0
, mm7
// 01 02 03 04 xx xx xx xx
299 movd DWORD PTR
[rdi
+1], mm0
// write output 01 02 03 04
302 paddw mm2
, mm4
// added round values
306 movd DWORD PTR
[rdi
+6], mm2
// write output 06 07 08 09
312 /****************************************************************************
314 * ROUTINE : vertical_band_4_5_scale_mmx
316 * INPUTS : unsigned char *dest :
317 * unsigned int dest_pitch :
318 * unsigned int dest_width :
324 * FUNCTION : 4 to 5 up-scaling of a 4 pixel high band of pixels.
326 * SPECIAL NOTES : The routine uses the first line of the band below
327 * the current band. The function also has a "C" only
330 ****************************************************************************/
// In-place vertical 4:5 scaler: rows are read from and written back to the
// buffer at `dest`, dest_pitch bytes apart, eight pixels per movq.
332 void vertical_band_4_5_scale_mmx
335 unsigned int dest_pitch
,
336 unsigned int dest_width
342 mov rsi
, dest
// rsi = dest: the band is scaled in place, so this is both source and destination
343 mov ecx
, dest_pitch
// ecx = pitch (bytes between rows)
345 lea rdi
, [rsi
+rcx
*2] // two lines below
346 add rdi
, rcx
// three lines below
348 pxor mm7
, mm7
// clear out mm7 (zero source for byte->word unpacking)
349 mov edx
, dest_width
// Loop counter
353 movq mm0
, QWORD ptr
[rsi
] // src[0];
354 movq mm1
, QWORD ptr
[rsi
+rcx
] // src[1];
356 movq mm2
, mm0
// Make a copy
357 punpcklbw mm0
, mm7
// unpack low to word
360 punpckhbw mm2
, mm7
// unpack high to word
362 pmullw mm0
, mm5
// a * 1/5
364 movq mm3
, mm1
// make a copy
365 punpcklbw mm1
, mm7
// unpack low to word
367 pmullw mm2
, mm5
// a * 1/5
368 movq mm6
, four_fifths
// mm6 = 4/5 weight constant
370 movq mm4
, mm1
// copy of low b
371 pmullw mm4
, mm6
// b * 4/5
373 punpckhbw mm3
, mm7
// unpack high to word
374 movq mm5
, mm3
// copy of high b
376 pmullw mm5
, mm6
// b * 4/5
377 paddw mm0
, mm4
// a * 1/5 + b * 4/5
379 paddw mm2
, mm5
// a * 1/5 + b * 4/5
380 paddw mm0
, round_values
// + 128
382 paddw mm2
, round_values
// + 128
386 packuswb mm0
, mm2
// des [1]
388 movq QWORD ptr
[rsi
+rcx
], mm0
// write des[1]
389 movq mm0
, [rsi
+rcx
*2] // mm0 = src[2]
391 // mm1, mm3 --- Src[1]
396 movq mm2
, mm0
// make a copy
398 pmullw mm1
, mm5
// b * 2/5
399 movq mm6
, three_fifths
402 punpcklbw mm0
, mm7
// unpack low to word
403 pmullw mm3
, mm5
// b * 2/5
405 movq mm4
, mm0
// make copy of c
406 punpckhbw mm2
, mm7
// unpack high to word
408 pmullw mm4
, mm6
// c * 3/5
411 pmullw mm5
, mm6
// c * 3/5
412 paddw mm1
, mm4
// b * 2/5 + c * 3/5
414 paddw mm3
, mm5
// b * 2/5 + c * 3/5
415 paddw mm1
, round_values
// + 128
417 paddw mm3
, round_values
// + 128
421 packuswb mm1
, mm3
// des[2]
423 movq QWORD ptr
[rsi
+rcx
*2], mm1
// write des[2]
424 movq mm1
, [rdi
] // mm1=Src[3];
426 // mm0, mm2 --- Src[2]
431 pmullw mm0
, mm6
// c * 3/5
432 movq mm5
, two_fifths
// mm5 = 2/5
434 movq mm3
, mm1
// make a copy
435 pmullw mm2
, mm6
// c * 3/5
437 punpcklbw mm1
, mm7
// unpack low
438 movq mm4
, mm1
// make a copy
440 punpckhbw mm3
, mm7
// unpack high
441 pmullw mm4
, mm5
// d * 2/5
443 movq mm6
, mm3
// make a copy
444 pmullw mm6
, mm5
// d * 2/5
446 paddw mm0
, mm4
// c * 3/5 + d * 2/5
447 paddw mm2
, mm6
// c * 3/5 + d * 2/5
449 paddw mm0
, round_values
// + 128
450 paddw mm2
, round_values
// + 128
455 packuswb mm0
, mm2
// des[3]
456 movq QWORD ptr
[rdi
], mm0
// write des[3]
458 // mm1, mm3 --- Src[3]
459 // mm7 -- cleared for unpacking
461 movq mm0
, [rdi
+rcx
*2] // mm0, Src[0] of the next group
463 movq mm5
, four_fifths
// mm5 = 4/5
464 pmullw mm1
, mm5
// d * 4/5
466 movq mm6
, one_fifth
// mm6 = 1/5
467 movq mm2
, mm0
// make a copy
469 pmullw mm3
, mm5
// d * 4/5
470 punpcklbw mm0
, mm7
// unpack low
472 pmullw mm0
, mm6
// an * 1/5
473 punpckhbw mm2
, mm7
// unpack high
475 paddw mm1
, mm0
// d * 4/5 + an * 1/5
476 pmullw mm2
, mm6
// an * 1/5
478 paddw mm3
, mm2
// d * 4/5 + an * 1/5
479 paddw mm1
, round_values
// + 128
481 paddw mm3
, round_values
// + 128
485 packuswb mm1
, mm3
// des[4]
487 movq QWORD ptr
[rdi
+rcx
], mm1
// write des[4]
497 /****************************************************************************
499 * ROUTINE : last_vertical_band_4_5_scale_mmx
501 * INPUTS : unsigned char *dest :
502 * unsigned int dest_pitch :
503 * unsigned int dest_width :
509 * FUNCTION : 4 to 5 up-scaling of the last 4-pixel high band in an image.
511 * SPECIAL NOTES : The routine uses the first line of the band below
512 * the current band. The function also has an "C" only
515 ****************************************************************************/
// In-place vertical 4:5 scaler for the last band: rows are read from and
// written back to the buffer at `dest`, dest_pitch bytes apart, eight pixels
// per movq.
517 void last_vertical_band_4_5_scale_mmx
520 unsigned int dest_pitch
,
521 unsigned int dest_width
526 mov rsi
, dest
// rsi = dest: the band is scaled in place, so this is both source and destination
527 mov ecx
, dest_pitch
// ecx = pitch (bytes between rows)
529 lea rdi
, [rsi
+rcx
*2] // two lines below
530 add rdi
, rcx
// three lines below
532 pxor mm7
, mm7
// clear out mm7 (zero source for byte->word unpacking)
533 mov edx
, dest_width
// Loop counter
537 movq mm0
, QWORD ptr
[rsi
] // src[0];
538 movq mm1
, QWORD ptr
[rsi
+rcx
] // src[1];
540 movq mm2
, mm0
// Make a copy
541 punpcklbw mm0
, mm7
// unpack low to word
544 punpckhbw mm2
, mm7
// unpack high to word
546 pmullw mm0
, mm5
// a * 1/5
548 movq mm3
, mm1
// make a copy
549 punpcklbw mm1
, mm7
// unpack low to word
551 pmullw mm2
, mm5
// a * 1/5
552 movq mm6
, four_fifths
// mm6 = 4/5 weight constant
554 movq mm4
, mm1
// copy of low b
555 pmullw mm4
, mm6
// b * 4/5
557 punpckhbw mm3
, mm7
// unpack high to word
558 movq mm5
, mm3
// copy of high b
560 pmullw mm5
, mm6
// b * 4/5
561 paddw mm0
, mm4
// a * 1/5 + b * 4/5
563 paddw mm2
, mm5
// a * 1/5 + b * 4/5
564 paddw mm0
, round_values
// + 128
566 paddw mm2
, round_values
// + 128
570 packuswb mm0
, mm2
// des [1]
572 movq QWORD ptr
[rsi
+rcx
], mm0
// write des[1]
573 movq mm0
, [rsi
+rcx
*2] // mm0 = src[2]
575 // mm1, mm3 --- Src[1]
580 movq mm2
, mm0
// make a copy
582 pmullw mm1
, mm5
// b * 2/5
583 movq mm6
, three_fifths
586 punpcklbw mm0
, mm7
// unpack low to word
587 pmullw mm3
, mm5
// b * 2/5
589 movq mm4
, mm0
// make copy of c
590 punpckhbw mm2
, mm7
// unpack high to word
592 pmullw mm4
, mm6
// c * 3/5
595 pmullw mm5
, mm6
// c * 3/5
596 paddw mm1
, mm4
// b * 2/5 + c * 3/5
598 paddw mm3
, mm5
// b * 2/5 + c * 3/5
599 paddw mm1
, round_values
// + 128
601 paddw mm3
, round_values
// + 128
605 packuswb mm1
, mm3
// des[2]
607 movq QWORD ptr
[rsi
+rcx
*2], mm1
// write des[2]
608 movq mm1
, [rdi
] // mm1=Src[3];
610 movq QWORD ptr
[rdi
+rcx
], mm1
// write des[4];
612 // mm0, mm2 --- Src[2]
617 pmullw mm0
, mm6
// c * 3/5
618 movq mm5
, two_fifths
// mm5 = 2/5
620 movq mm3
, mm1
// make a copy
621 pmullw mm2
, mm6
// c * 3/5
623 punpcklbw mm1
, mm7
// unpack low
624 movq mm4
, mm1
// make a copy
626 punpckhbw mm3
, mm7
// unpack high
627 pmullw mm4
, mm5
// d * 2/5
629 movq mm6
, mm3
// make a copy
630 pmullw mm6
, mm5
// d * 2/5
632 paddw mm0
, mm4
// c * 3/5 + d * 2/5
633 paddw mm2
, mm6
// c * 3/5 + d * 2/5
635 paddw mm0
, round_values
// + 128
636 paddw mm2
, round_values
// + 128
641 packuswb mm0
, mm2
// des[3]
642 movq QWORD ptr
[rdi
], mm0
// write des[3]
644 // mm1, mm3 --- Src[3]
645 // mm7 -- cleared for unpacking
654 /****************************************************************************
656 * ROUTINE : vertical_band_3_5_scale_mmx
658 * INPUTS : unsigned char *dest :
659 * unsigned int dest_pitch :
660 * unsigned int dest_width :
666 * FUNCTION : 3 to 5 up-scaling of a 3-pixel high band of pixels.
668 * SPECIAL NOTES : The routine uses the first line of the band below
669 * the current band. The function also has an "C" only
672 ****************************************************************************/
674 void vertical_band_3_5_scale_mmx
677 unsigned int dest_pitch
,
678 unsigned int dest_width
683 mov rsi
, dest
// Get the source and destination pointer
684 mov ecx
, dest_pitch
// Get the pitch size
686 lea rdi
, [rsi
+rcx
*2] // two lines below
687 add rdi
, rcx
// three lines below
689 pxor mm7
, mm7
// clear out mm7
690 mov edx
, dest_width
// Loop counter
694 movq mm0
, QWORD ptr
[rsi
] // src[0];
695 movq mm1
, QWORD ptr
[rsi
+rcx
] // src[1];
697 movq mm2
, mm0
// Make a copy
698 punpcklbw mm0
, mm7
// unpack low to word
700 movq mm5
, two_fifths
// mm5 = 2/5
701 punpckhbw mm2
, mm7
// unpack high to word
703 pmullw mm0
, mm5
// a * 2/5
705 movq mm3
, mm1
// make a copy
706 punpcklbw mm1
, mm7
// unpack low to word
708 pmullw mm2
, mm5
// a * 2/5
709 movq mm6
, three_fifths
// mm6 = 3/5
711 movq mm4
, mm1
// copy of low b
712 pmullw mm4
, mm6
// b * 3/5
714 punpckhbw mm3
, mm7
// unpack high to word
715 movq mm5
, mm3
// copy of high b
717 pmullw mm5
, mm6
// b * 3/5
718 paddw mm0
, mm4
// a * 2/5 + b * 3/5
720 paddw mm2
, mm5
// a * 2/5 + b * 3/5
721 paddw mm0
, round_values
// + 128
723 paddw mm2
, round_values
// + 128
727 packuswb mm0
, mm2
// des [1]
729 movq QWORD ptr
[rsi
+rcx
], mm0
// write des[1]
730 movq mm0
, [rsi
+rcx
*2] // mm0 = src[2]
732 // mm1, mm3 --- Src[1]
736 movq mm4
, mm1
// b low
737 pmullw mm1
, four_fifths
// b * 4/5 low
739 movq mm5
, mm3
// b high
740 pmullw mm3
, four_fifths
// b * 4/5 high
743 pmullw mm4
, one_fifth
// b * 1/5
745 punpcklbw mm0
, mm7
// c low
746 pmullw mm5
, one_fifth
// b * 1/5
748 movq mm6
, mm0
// make copy of c low
749 punpckhbw mm2
, mm7
// c high
751 pmullw mm6
, one_fifth
// c * 1/5 low
752 movq mm7
, mm2
// make copy of c high
754 pmullw mm7
, one_fifth
// c * 1/5 high
755 paddw mm1
, mm6
// b * 4/5 + c * 1/5 low
757 paddw mm3
, mm7
// b * 4/5 + c * 1/5 high
758 movq mm6
, mm0
// make copy of c low
760 pmullw mm6
, four_fifths
// c * 4/5 low
761 movq mm7
, mm2
// make copy of c high
763 pmullw mm7
, four_fifths
// c * 4/5 high
765 paddw mm4
, mm6
// b * 1/5 + c * 4/5 low
766 paddw mm5
, mm7
// b * 1/5 + c * 4/5 high
768 paddw mm1
, round_values
// + 128
769 paddw mm3
, round_values
// + 128
774 packuswb mm1
, mm3
// des[2]
775 movq QWORD ptr
[rsi
+rcx
*2], mm1
// write des[2]
777 paddw mm4
, round_values
// + 128
778 paddw mm5
, round_values
// + 128
783 packuswb mm4
, mm5
// des[3]
784 movq QWORD ptr
[rdi
], mm4
// write des[3]
786 // mm0, mm2 --- Src[3]
788 pxor mm7
, mm7
// clear mm7 for unpacking
789 movq mm1
, [rdi
+rcx
*2] // mm1 = Src[0] of the next group
791 movq mm5
, three_fifths
// mm5 = 3/5
792 pmullw mm0
, mm5
// d * 3/5
794 movq mm6
, two_fifths
// mm6 = 2/5
795 movq mm3
, mm1
// make a copy
797 pmullw mm2
, mm5
// d * 3/5
798 punpcklbw mm1
, mm7
// unpack low
800 pmullw mm1
, mm6
// an * 2/5
801 punpckhbw mm3
, mm7
// unpack high
803 paddw mm0
, mm1
// d * 3/5 + an * 2/5
804 pmullw mm3
, mm6
// an * 2/5
806 paddw mm2
, mm3
// d * 3/5 + an * 2/5
807 paddw mm0
, round_values
// + 128
809 paddw mm2
, round_values
// + 128
813 packuswb mm0
, mm2
// des[4]
815 movq QWORD ptr
[rdi
+rcx
], mm0
// write des[4]
825 /****************************************************************************
827 * ROUTINE : last_vertical_band_3_5_scale_mmx
829 * INPUTS : unsigned char *dest :
830 * unsigned int dest_pitch :
831 * unsigned int dest_width :
837 * FUNCTION : 3 to 5 up-scaling of the last 3-pixel high band in an image.
839 * SPECIAL NOTES : The routine uses the first line of the band below
840 * the current band. The function also has an "C" only
843 ****************************************************************************/
845 void last_vertical_band_3_5_scale_mmx
848 unsigned int dest_pitch
,
849 unsigned int dest_width
854 mov rsi
, dest
// rsi = dest: the band is scaled in place, so this is both source and destination
855 mov ecx
, dest_pitch
// ecx = pitch (bytes between rows)
857 lea rdi
, [rsi
+rcx
*2] // two lines below
858 add rdi
, rcx
// three lines below
860 pxor mm7
, mm7
// clear out mm7 (zero source for byte->word unpacking)
861 mov edx
, dest_width
// Loop counter
866 movq mm0
, QWORD ptr
[rsi
] // src[0];
867 movq mm1
, QWORD ptr
[rsi
+rcx
] // src[1];
869 movq mm2
, mm0
// Make a copy
870 punpcklbw mm0
, mm7
// unpack low to word
872 movq mm5
, two_fifths
// mm5 = 2/5
873 punpckhbw mm2
, mm7
// unpack high to word
875 pmullw mm0
, mm5
// a * 2/5
877 movq mm3
, mm1
// make a copy
878 punpcklbw mm1
, mm7
// unpack low to word
880 pmullw mm2
, mm5
// a * 2/5
881 movq mm6
, three_fifths
// mm6 = 3/5
883 movq mm4
, mm1
// copy of low b
884 pmullw mm4
, mm6
// b * 3/5
886 punpckhbw mm3
, mm7
// unpack high to word
887 movq mm5
, mm3
// copy of high b
889 pmullw mm5
, mm6
// b * 3/5
890 paddw mm0
, mm4
// a * 2/5 + b * 3/5
892 paddw mm2
, mm5
// a * 2/5 + b * 3/5
893 paddw mm0
, round_values
// + 128
895 paddw mm2
, round_values
// + 128
899 packuswb mm0
, mm2
// des [1]
901 movq QWORD ptr
[rsi
+rcx
], mm0
// write des[1]
902 movq mm0
, [rsi
+rcx
*2] // mm0 = src[2]
906 // mm1, mm3 --- Src[1]
910 movq mm4
, mm1
// b low
911 pmullw mm1
, four_fifths
// b * 4/5 low
913 movq QWORD ptr
[rdi
+rcx
], mm0
// write des[4]
915 movq mm5
, mm3
// b high
916 pmullw mm3
, four_fifths
// b * 4/5 high
919 pmullw mm4
, one_fifth
// b * 1/5
921 punpcklbw mm0
, mm7
// c low
922 pmullw mm5
, one_fifth
// b * 1/5
924 movq mm6
, mm0
// make copy of c low
925 punpckhbw mm2
, mm7
// c high
927 pmullw mm6
, one_fifth
// c * 1/5 low
928 movq mm7
, mm2
// make copy of c high
930 pmullw mm7
, one_fifth
// c * 1/5 high
931 paddw mm1
, mm6
// b * 4/5 + c * 1/5 low
933 paddw mm3
, mm7
// b * 4/5 + c * 1/5 high
934 movq mm6
, mm0
// make copy of c low
936 pmullw mm6
, four_fifths
// c * 4/5 low
937 movq mm7
, mm2
// make copy of c high
939 pmullw mm7
, four_fifths
// c * 4/5 high
941 paddw mm4
, mm6
// b * 1/5 + c * 4/5 low
942 paddw mm5
, mm7
// b * 1/5 + c * 4/5 high
944 paddw mm1
, round_values
// + 128
945 paddw mm3
, round_values
// + 128
950 packuswb mm1
, mm3
// des[2]
951 movq QWORD ptr
[rsi
+rcx
*2], mm1
// write des[2]
953 paddw mm4
, round_values
// + 128
954 paddw mm5
, round_values
// + 128
959 packuswb mm4
, mm5
// des[3]
960 movq QWORD ptr
[rdi
], mm4
// write des[3]
962 // mm0, mm2 --- Src[3]
972 /****************************************************************************
974 * ROUTINE : vertical_band_1_2_scale_mmx
976 * INPUTS : unsigned char *dest :
977 * unsigned int dest_pitch :
978 * unsigned int dest_width :
984 * FUNCTION : 1 to 2 up-scaling of a band of pixels.
986 * SPECIAL NOTES : The routine uses the first line of the band below
987 * the current band. The function also has an "C" only
990 ****************************************************************************/
992 void vertical_band_1_2_scale_mmx
995 unsigned int dest_pitch
,
996 unsigned int dest_width
1002 mov rsi
, dest
// Get the source and destination pointer
1003 mov ecx
, dest_pitch
// Get the pitch size
1005 pxor mm7
, mm7
// clear out mm7
1006 mov edx
, dest_width
// Loop counter
1010 movq mm0
, [rsi
] // get Src[0]
1011 movq mm1
, [rsi
+ rcx
* 2] // get Src[1]
1013 movq mm2
, mm0
// make copy before unpack
1014 movq mm3
, mm1
// make copy before unpack
1016 punpcklbw mm0
, mm7
// low Src[0]
1017 movq mm6
, four_ones
// mm6= 1, 1, 1, 1
1019 punpcklbw mm1
, mm7
// low Src[1]
1020 paddw mm0
, mm1
// low (a + b)
1022 punpckhbw mm2
, mm7
// high Src[0]
1023 paddw mm0
, mm6
// low (a + b + 1)
1026 paddw mm2
, mm3
// high (a + b )
1028 psraw mm0
, 1 // low (a + b +1 )/2
1029 paddw mm2
, mm6
// high (a + b + 1)
1031 psraw mm2
, 1 // high (a + b + 1)/2
1032 packuswb mm0
, mm2
// pack results
1034 movq
[rsi
+rcx
], mm0
// write out eight bytes
1043 /****************************************************************************
1045 * ROUTINE : last_vertical_band_1_2_scale_mmx
1047 * INPUTS : unsigned char *dest :
1048 * unsigned int dest_pitch :
1049 * unsigned int dest_width :
1055 * FUNCTION : 1 to 2 up-scaling of band of pixels.
1057 * SPECIAL NOTES : The routine uses the first line of the band below
1058 * the current band. The function also has an "C" only
1061 ****************************************************************************/
1063 void last_vertical_band_1_2_scale_mmx
1065 unsigned char *dest
,
1066 unsigned int dest_pitch
,
1067 unsigned int dest_width
1072 mov rsi
, dest
// Get the source and destination pointer
1073 mov ecx
, dest_pitch
// Get the pitch size
1075 mov edx
, dest_width
// Loop counter
1079 movq mm0
, [rsi
] // get Src[0]
1080 movq
[rsi
+rcx
], mm0
// write out eight bytes
1089 /****************************************************************************
1091 * ROUTINE : horizontal_line_1_2_scale_mmx
1093 * INPUTS : const unsigned char *source :
1094 * unsigned int source_width :
1095 * unsigned char *dest :
1096 * unsigned int dest_width :
1102 * FUNCTION : 1 to 2 up-scaling of a horizontal line of pixels.
1104 * SPECIAL NOTES : None.
1106 ****************************************************************************/
1108 void horizontal_line_1_2_scale_mmx
1110 const unsigned char *source
,
1111 unsigned int source_width
,
1112 unsigned char *dest
,
1113 unsigned int dest_width
1126 mov ecx
, source_width
// Per-lane weights (scaled by 256) loaded by horizontal_line_5_4_scale_mmx;
// lane-wise, const54_1 + const54_2 == 256.
1216 __declspec(align(16)) const static unsigned short const54_2
[] = { 0, 64, 128, 192 };
1217 __declspec(align(16)) const static unsigned short const54_1
[] = {256, 192, 128, 64 };
1220 /****************************************************************************
1222 * ROUTINE : horizontal_line_5_4_scale_mmx
1224 * INPUTS : const unsigned char *source : Pointer to source data.
1225 * unsigned int source_width : Stride of source.
1226 * unsigned char *dest : Pointer to destination data.
1227 * unsigned int dest_width : Stride of destination (NOT USED).
1233 * FUNCTION : Copies horizontal line of pixels from source to
1234 * destination, scaling down by 5 to 4.
1236 * SPECIAL NOTES : None.
1238 ****************************************************************************/
1240 void horizontal_line_5_4_scale_mmx
1242 const unsigned char *source
,
1243 unsigned int source_width
,
1244 unsigned char *dest
,
1245 unsigned int dest_width
1250 unsigned int a, b, c, d, e;
1251 unsigned char *des = dest;
1252 const unsigned char *src = source;
1256 for ( i=0; i<source_width; i+=5 )
1265 des[1] = ((b*192 + c* 64 + 128)>>8);
1266 des[2] = ((c*128 + d*128 + 128)>>8);
1267 des[3] = ((d* 64 + e*192 + 128)>>8);
1279 mov ecx
, source_width
;
1280 movq mm5
, const54_1
;
1283 movq mm6
, const54_2
;
1285 movq mm4
, round_values
;
1286 lea rdx
, [rsi
+rcx
] ;
1287 horizontal_line_5_4_loop
:
1289 movq mm0
, QWORD PTR
[rsi
] ;
1290 00 01 02 03 04 05 06 07
1292 00 01 02 03 04 05 06 07
1295 01 02 03 04 05 06 07 xx
1296 punpcklbw mm1
, mm7
;
1297 xx
00 xx
01 xx
02 xx
03
1299 punpcklbw mm0
, mm7
;
1300 xx
01 xx
02 xx
03 xx
04
1315 movd DWORD PTR
[rdi
-4], mm1
1317 jl horizontal_line_5_4_loop
// Quarter weights scaled by 256 (64 = 256/4, 128 = 2*256/4, 192 = 3*256/4),
// used as pmullw operands by vertical_band_5_4_scale_mmx.
1322 __declspec(align(16)) const static unsigned short one_fourths
[] = { 64, 64, 64, 64 };
1323 __declspec(align(16)) const static unsigned short two_fourths
[] = { 128, 128, 128, 128 };
1324 __declspec(align(16)) const static unsigned short three_fourths
[] = { 192, 192, 192, 192 };
// Vertical 5:4 scaler (MMX).  Loads 4-byte groups from source rows that are
// src_pitch bytes apart, weights them with the *_fourths tables, adds
// round_values, and stores 4-byte groups to dest rows dest_pitch bytes apart.
// Part of this routine is not visible in this listing, so the per-row blend
// formulas are not restated here.
1327 void vertical_band_5_4_scale_mmx
1329 unsigned char *source
,
1330 unsigned int src_pitch
,
1331 unsigned char *dest
,
1332 unsigned int dest_pitch
,
1333 unsigned int dest_width
1340 mov rsi
, source
// rsi = source pointer
1341 mov ecx
, src_pitch
// ecx = source pitch (bytes between source rows)
1343 mov rdi
, dest
// rdi = destination pointer
1344 pxor mm7
, mm7
// clear out mm7
1346 mov edx
, dest_pitch
// edx = destination pitch (bytes between dest rows)
1351 movd mm0
, DWORD ptr
[rsi
] // src[0];
1352 movd mm1
, DWORD ptr
[rsi
+rcx
] // src[1];
1354 movd mm2
, DWORD ptr
[rsi
+rcx
*2] // src[2]
1355 lea rax
, [rsi
+rcx
*2] // rax = address of source row 2
1361 pmullw mm1
, three_fourths
1363 pmullw mm2
, one_fourths
1366 pmullw mm3
, two_fourths
1370 pmullw mm4
, two_fourths
1373 movd mm6
, [rax
+rcx
*2] // presumably src[4], if rax still addresses row 2 -- confirm
1375 pmullw mm5
, one_fourths
1376 paddw mm1
, round_values
;
1382 paddw mm3
, round_values
1384 pmullw mm6
, three_fourths
1390 movd DWORD PTR
[rdi
], mm0
// store 4 bytes to dest row 0
1391 movd DWORD PTR
[rdi
+rdx
], mm1
// store 4 bytes to dest row 1
1395 movd DWORD PTR
[rdi
+rdx
*2], mm3
// store 4 bytes to dest row 2
1397 lea rax
, [rdi
+rdx
*2] // rax = address of dest row 2
1398 paddw mm5
, round_values
1404 movd DWORD PTR
[rax
+rdx
], mm5
// store 4 bytes one row below rax (dest row 3 if rax is unchanged since the lea)
// Per-lane weights (scaled by 256; 85 ~ 256/3, 171 ~ 2*256/3) loaded by
// horizontal_line_5_3_scale_mmx.  The first three lanes of const53_1 and
// const53_2 sum to 256; the fourth lane is zero in both tables.
1414 __declspec(align(16)) const static unsigned short const53_1
[] = { 0, 85, 171, 0 };
1415 __declspec(align(16)) const static unsigned short const53_2
[] = {256, 171, 85, 0 };
1419 void horizontal_line_5_3_scale_mmx
1421 const unsigned char *source
,
1422 unsigned int source_width
,
1423 unsigned char *dest
,
1424 unsigned int dest_width
1433 mov ecx
, source_width
;
1434 movq mm5
, const53_1
;
1437 movq mm6
, const53_2
;
1439 movq mm4
, round_values
;
1440 lea rdx
, [rsi
+rcx
-5] ;
1441 horizontal_line_5_3_loop
:
1443 movq mm0
, QWORD PTR
[rsi
] ;
1444 00 01 02 03 04 05 06 07
1446 00 01 02 03 04 05 06 07
1449 xx
00 xx
02 xx
04 xx
06
1451 01 xx
03 xx
05 xx
07 xx
1454 00 xx
02 xx
04 xx
06 xx
1456 xx xx
01 xx
03 xx
05 xx
1472 movd DWORD PTR
[rdi
-3], mm1
1473 jl horizontal_line_5_3_loop
1476 movq mm0
, QWORD PTR
[rsi
] ;
1477 00 01 02 03 04 05 06 07
1479 00 01 02 03 04 05 06 07
1482 xx
00 xx
02 xx
04 xx
06
1484 01 xx
03 xx
05 xx
07 xx
1487 00 xx
02 xx
04 xx
06 xx
1489 xx xx
01 xx
03 xx
05 xx
1505 mov WORD PTR
[rdi
], ax
1506 mov BYTE PTR
[rdi
+2], dl
// Third weights scaled by 256 (85 ~ 256/3, 171 ~ 2*256/3; 85 + 171 == 256),
// loaded into mm5 / mm6 by vertical_band_5_3_scale_mmx.
1512 __declspec(align(16)) const static unsigned short one_thirds
[] = { 85, 85, 85, 85 };
1513 __declspec(align(16)) const static unsigned short two_thirds
[] = { 171, 171, 171, 171 };
// Vertical 5:3 scaler (MMX).  Loads 4-byte groups from five source rows
// (src_pitch bytes apart) and stores 4-byte groups to three dest rows
// (dest_pitch bytes apart); mm5 / mm6 hold the 1/3 and 2/3 weights.
// Part of this routine is not visible in this listing, so the per-row blend
// formulas are not restated here.
// NOTE(review): ebx is written below; rbx is callee-saved in the common
// x86-64 ABIs -- confirm it is preserved around this asm block.
1516 void vertical_band_5_3_scale_mmx
1518 unsigned char *source
,
1519 unsigned int src_pitch
,
1520 unsigned char *dest
,
1521 unsigned int dest_pitch
,
1522 unsigned int dest_width
1529 mov rsi
, source
// rsi = source pointer
1530 mov ecx
, src_pitch
// ecx = source pitch (bytes between source rows)
1532 mov rdi
, dest
// rdi = destination pointer
1533 pxor mm7
, mm7
// clear out mm7
1535 mov edx
, dest_pitch
// edx = destination pitch (bytes between dest rows)
1536 movq mm5
, one_thirds
1538 movq mm6
, two_thirds
1539 mov ebx
, dest_width
;
1543 movd mm0
, DWORD ptr
[rsi
] // src[0];
1544 movd mm1
, DWORD ptr
[rsi
+rcx
] // src[1];
1546 movd mm2
, DWORD ptr
[rsi
+rcx
*2] // src[2]
1547 lea rax
, [rsi
+rcx
*2] // rax = address of source row 2
1555 movd mm3
, DWORD ptr
[rax
+rcx
] // one row below rax (src[3] if rax still addresses row 2)
1556 movd mm4
, DWORD ptr
[rax
+rcx
*2] // two rows below rax (src[4] if rax still addresses row 2)
1565 movd DWORD PTR
[rdi
], mm0
// store 4 bytes to dest row 0
1568 paddw mm1
, round_values
1574 paddw mm3
, round_values
1575 movd DWORD PTR
[rdi
+rdx
], mm1
// store 4 bytes to dest row 1
1580 movd DWORD PTR
[rdi
+rdx
*2], mm3
// store 4 bytes to dest row 2
1594 /****************************************************************************
1596 * ROUTINE : horizontal_line_2_1_scale_mmx
1598 * INPUTS : const unsigned char *source :
1599 * unsigned int source_width :
1600 * unsigned char *dest :
1601 * unsigned int dest_width :
1607 * FUNCTION : 2 to 1 down-scaling of a horizontal line of pixels.
1609 * SPECIAL NOTES : None.
1611 ****************************************************************************/
1613 void horizontal_line_2_1_scale_mmx
1615 const unsigned char *source
,
1616 unsigned int source_width
,
1617 unsigned char *dest
,
1618 unsigned int dest_width
1634 movq mm0
, [rsi
+rdx
*2]
1640 movd DWORD Ptr
[rdi
+rdx
], mm0
;
1652 void vertical_band_2_1_scale_mmx
1654 unsigned char *source
,
1655 unsigned int src_pitch
,
1656 unsigned char *dest
,
1657 unsigned int dest_pitch
,
1658 unsigned int dest_width
)
1660 vpx_memcpy(dest
, source
, dest_width
);
// Sixteenth weights scaled by 256 (48 = 3*256/16, 160 = 10*256/16), loaded
// into mm5 / mm4 by vertical_band_2_1_scale_i_mmx.  Note 48 + 160 + 48 == 256.
1664 __declspec(align(16)) const static unsigned short three_sixteenths
[] = { 48, 48, 48, 48 };
1665 __declspec(align(16)) const static unsigned short ten_sixteenths
[] = { 160, 160, 160, 160 };
1668 void vertical_band_2_1_scale_i_mmx
1670 unsigned char *source
,
1671 unsigned int src_pitch
,
1672 unsigned char *dest
,
1673 unsigned int dest_pitch
,
1674 unsigned int dest_width
1686 sub rsi
, rax
//back one line
1690 movq mm6
, round_values
;
1692 movq mm5
, three_sixteenths
;
1693 movq mm4
, ten_sixteenths
;
1697 movd mm1
, [rsi
+rax
] //
1699 movd mm2
, [rsi
+rax
*2] //
1709 paddw mm0
, round_values
1717 movd DWORD PTR
[rdi
], mm0
// Points each vp8_* scaler hook at the corresponding *_mmx implementation
// defined in this file.  Takes no arguments.
1730 register_mmxscalers(void)
1732 vp8_horizontal_line_1_2_scale
= horizontal_line_1_2_scale_mmx
;
1733 vp8_horizontal_line_3_5_scale
= horizontal_line_3_5_scale_mmx
;
1734 vp8_horizontal_line_4_5_scale
= horizontal_line_4_5_scale_mmx
;
1735 vp8_vertical_band_1_2_scale
= vertical_band_1_2_scale_mmx
;
1736 vp8_last_vertical_band_1_2_scale
= last_vertical_band_1_2_scale_mmx
;
1737 vp8_vertical_band_3_5_scale
= vertical_band_3_5_scale_mmx
;
1738 vp8_last_vertical_band_3_5_scale
= last_vertical_band_3_5_scale_mmx
;
1739 vp8_vertical_band_4_5_scale
= vertical_band_4_5_scale_mmx
;
1740 vp8_last_vertical_band_4_5_scale
= last_vertical_band_4_5_scale_mmx
;
1742 vp8_vertical_band_5_4_scale
= vertical_band_5_4_scale_mmx
;
1743 vp8_vertical_band_5_3_scale
= vertical_band_5_3_scale_mmx
;
1744 vp8_vertical_band_2_1_scale
= vertical_band_2_1_scale_mmx
;
1745 vp8_vertical_band_2_1_scale_i
= vertical_band_2_1_scale_i_mmx
;
1746 vp8_horizontal_line_2_1_scale
= horizontal_line_2_1_scale_mmx
;
1747 vp8_horizontal_line_5_3_scale
= horizontal_line_5_3_scale_mmx
;
1748 vp8_horizontal_line_5_4_scale
= horizontal_line_5_4_scale_mmx
;