2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
12 /****************************************************************************
14 * Module Title : scaleopt.cpp
16 * Description : Optimized scaling functions
18 ****************************************************************************/
23 /****************************************************************************
25 ****************************************************************************/
26 __declspec(align(16)) const static unsigned short one_fifth
[] = { 51, 51, 51, 51 };
27 __declspec(align(16)) const static unsigned short two_fifths
[] = { 102, 102, 102, 102 };
28 __declspec(align(16)) const static unsigned short three_fifths
[] = { 154, 154, 154, 154 };
29 __declspec(align(16)) const static unsigned short four_fifths
[] = { 205, 205, 205, 205 };
30 __declspec(align(16)) const static unsigned short round_values
[] = { 128, 128, 128, 128 };
31 __declspec(align(16)) const static unsigned short four_ones
[] = { 1, 1, 1, 1};
32 __declspec(align(16)) const static unsigned short const45_2
[] = {205, 154, 102, 51 };
33 __declspec(align(16)) const static unsigned short const45_1
[] = { 51, 102, 154, 205 };
34 __declspec(align(16)) const static unsigned char mask45
[] = { 0, 0, 0, 0, 0, 0, 255, 0};
35 __declspec(align(16)) const static unsigned short const35_2
[] = { 154, 51, 205, 102 };
36 __declspec(align(16)) const static unsigned short const35_1
[] = { 102, 205, 51, 154 };
40 #include "vpx_scale/vpxscale.h"
41 #include "vpx_mem/vpx_mem.h"
43 /****************************************************************************
45 * ROUTINE : horizontal_line_3_5_scale_mmx
47 * INPUTS : const unsigned char *source :
48 * unsigned int source_width :
49 * unsigned char *dest :
50 * unsigned int dest_width :
56 * FUNCTION : 3 to 5 up-scaling of a horizontal line of pixels.
58 * SPECIAL NOTES : None.
60 ****************************************************************************/
62 void horizontal_line_3_5_scale_mmx
64 const unsigned char *source
,
65 unsigned int source_width
,
67 unsigned int dest_width
83 movq mm5
, const35_1
// mm5 = 66 xx cd xx 33 xx 9a xx
84 movq mm6
, const35_2
// mm6 = 9a xx 33 xx cd xx 66 xx
86 movq mm4
, round_values
// mm4 = 80 xx 80 xx 80 xx 80 xx
87 pxor mm7
, mm7
// clear mm7
91 mov eax
, DWORD PTR
[esi
] // eax = 00 01 02 03
94 and ebx
, 0xffff00 // ebx = xx 01 02 xx
95 mov ecx
, eax
// ecx = 00 01 02 03
97 and eax
, 0xffff0000 // eax = xx xx 02 03
98 xor ecx
, eax
// ecx = 00 01 xx xx
100 shr ebx
, 8 // ebx = 01 02 xx xx
101 or eax
, ebx
// eax = 01 02 02 03
103 shl ebx
, 16 // ebx = xx xx 01 02
104 movd mm1
, eax
// mm1 = 01 02 02 03 xx xx xx xx
106 or ebx
, ecx
// ebx = 00 01 01 02
107 punpcklbw mm1
, mm7
// mm1 = 01 xx 02 xx 02 xx 03 xx
109 movd mm0
, ebx
// mm0 = 00 01 01 02
112 punpcklbw mm0
, mm7
// mm0 = 00 xx 01 xx 01 xx 02 xx
115 mov
[edi
], ebx
// write output 00 xx xx xx
127 movd DWORD Ptr
[edi
-4], mm0
128 jl horiz_line_3_5_loop
131 mov eax
, DWORD PTR
[esi
] // eax = 00 01 02 03
134 and ebx
, 0xffff00 // ebx = xx 01 02 xx
135 mov ecx
, eax
// ecx = 00 01 02 03
137 and eax
, 0xffff0000 // eax = xx xx 02 03
138 xor ecx
, eax
// ecx = 00 01 xx xx
140 shr ebx
, 8 // ebx = 01 02 xx xx
141 or eax
, ebx
// eax = 01 02 02 03
143 shl eax
, 8 // eax = xx 01 02 02
144 and eax
, 0xffff0000 // eax = xx xx 02 02
146 or eax
, ebx
// eax = 01 02 02 02
148 shl ebx
, 16 // ebx = xx xx 01 02
149 movd mm1
, eax
// mm1 = 01 02 02 02 xx xx xx xx
151 or ebx
, ecx
// ebx = 00 01 01 02
152 punpcklbw mm1
, mm7
// mm1 = 01 xx 02 xx 02 xx 02 xx
154 movd mm0
, ebx
// mm0 = 00 01 01 02
157 punpcklbw mm0
, mm7
// mm0 = 00 xx 01 xx 01 xx 02 xx
160 mov
[edi
], ebx
// write output 00 xx xx xx
167 movd DWORD Ptr
[edi
+1], mm0
176 /****************************************************************************
178 * ROUTINE : horizontal_line_4_5_scale_mmx
180 * INPUTS : const unsigned char *source :
181 * unsigned int source_width :
182 * unsigned char *dest :
183 * unsigned int dest_width :
189 * FUNCTION : 4 to 5 up-scaling of a horizontal line of pixels.
191 * SPECIAL NOTES : None.
193 ****************************************************************************/
195 void horizontal_line_4_5_scale_mmx
197 const unsigned char *source
,
198 unsigned int source_width
,
200 unsigned int dest_width
211 mov ecx
, source_width
212 lea edx
, [esi
+ecx
-8];
214 movq mm5
, const45_1
// mm5 = 33 xx 66 xx 9a xx cd xx
215 movq mm6
, const45_2
// mm6 = cd xx 9a xx 66 xx 33 xx
217 movq mm4
, round_values
// mm4 = 80 xx 80 xx 80 xx 80 xx
218 pxor mm7
, mm7
// clear mm7
222 movq mm0
, QWORD PTR
[esi
] // mm0 = 00 01 02 03 04 05 06 07
223 movq mm1
, QWORD PTR
[esi
+1]; // mm1 = 01 02 03 04 05 06 07 08
225 movq mm2
, mm0
// mm2 = 00 01 02 03 04 05 06 07
226 movq mm3
, mm1
// mm3 = 01 02 03 04 05 06 07 08
228 movd DWORD PTR
[edi
], mm0
// write output 00 xx xx xx
229 punpcklbw mm0
, mm7
// mm0 = 00 xx 01 xx 02 xx 03 xx
231 punpcklbw mm1
, mm7
// mm1 = 01 xx 02 xx 03 xx 04 xx
232 pmullw mm0
, mm5
// 00* 51 01*102 02*154 03*205
234 pmullw mm1
, mm6
// 01*205 02*154 03*102 04* 51
235 punpckhbw mm2
, mm7
// mm2 = 04 xx 05 xx 06 xx 07 xx
237 movd DWORD PTR
[edi
+5], mm2
// write output 05 xx xx xx
238 pmullw mm2
, mm5
// 04* 51 05*102 06*154 07*205
240 punpckhbw mm3
, mm7
// mm3 = 05 xx 06 xx 07 xx 08 xx
241 pmullw mm3
, mm6
// 05*205 06*154 07*102 08* 51
243 paddw mm0
, mm1
// added round values
246 psrlw mm0
, 8 // output: 01 xx 02 xx 03 xx 04 xx
249 movd DWORD PTR
[edi
+1], mm0
// write output 01 02 03 04
255 paddw mm2
, mm4
// added round values
261 movd DWORD PTR
[edi
-4], mm2
// write output 06 07 08 09
262 jl horiz_line_4_5_loop
265 movq mm0
, [esi
] // mm0 = 00 01 02 03 04 05 06 07
266 movq mm1
, mm0
// mm1 = 00 01 02 03 04 05 06 07
268 movq mm2
, mm0
// mm2 = 00 01 02 03 04 05 06 07
269 psrlq mm1
, 8 // mm1 = 01 02 03 04 05 06 07 00
271 movq mm3
, mask45
// mm3 = 00 00 00 00 00 00 ff 00
272 pand mm3
, mm1
// mm3 = 00 00 00 00 00 00 07 00
274 psllq mm3
, 8 // mm3 = 00 00 00 00 00 00 00 07
275 por mm1
, mm3
// mm1 = 01 02 03 04 05 06 07 07
279 movd DWORD PTR
[edi
], mm0
// write output 00 xx xx xx
280 punpcklbw mm0
, mm7
// mm0 = 00 xx 01 xx 02 xx 03 xx
282 punpcklbw mm1
, mm7
// mm1 = 01 xx 02 xx 03 xx 04 xx
283 pmullw mm0
, mm5
// 00* 51 01*102 02*154 03*205
285 pmullw mm1
, mm6
// 01*205 02*154 03*102 04* 51
286 punpckhbw mm2
, mm7
// mm2 = 04 xx 05 xx 06 xx 07 xx
288 movd DWORD PTR
[edi
+5], mm2
// write output 05 xx xx xx
289 pmullw mm2
, mm5
// 04* 51 05*102 06*154 07*205
291 punpckhbw mm3
, mm7
// mm3 = 05 xx 06 xx 07 xx 08 xx
292 pmullw mm3
, mm6
// 05*205 06*154 07*102 07* 51
294 paddw mm0
, mm1
// added round values
297 psrlw mm0
, 8 // output: 01 xx 02 xx 03 xx 04 xx
298 packuswb mm0
, mm7
// 01 02 03 04 xx xx xx xx
300 movd DWORD PTR
[edi
+1], mm0
// write output 01 02 03 04
303 paddw mm2
, mm4
// added round values
307 movd DWORD PTR
[edi
+6], mm2
// write output 06 07 08 09
313 /****************************************************************************
315 * ROUTINE : vertical_band_4_5_scale_mmx
317 * INPUTS : unsigned char *dest :
318 * unsigned int dest_pitch :
319 * unsigned int dest_width :
325 * FUNCTION : 4 to 5 up-scaling of a 4 pixel high band of pixels.
327 * SPECIAL NOTES : The routine uses the first line of the band below
328 * the current band. The function also has a "C" only
331 ****************************************************************************/
333 void vertical_band_4_5_scale_mmx
336 unsigned int dest_pitch
,
337 unsigned int dest_width
343 mov esi
, dest
// Get the source and destination pointer
344 mov ecx
, dest_pitch
// Get the pitch size
346 lea edi
, [esi
+ecx
*2] // two lines below
347 add edi
, ecx
// three lines below
349 pxor mm7
, mm7
// clear out mm7
350 mov edx
, dest_width
// Loop counter
354 movq mm0
, QWORD ptr
[esi
] // src[0];
355 movq mm1
, QWORD ptr
[esi
+ecx
] // src[1];
357 movq mm2
, mm0
// Make a copy
358 punpcklbw mm0
, mm7
// unpack low to word
361 punpckhbw mm2
, mm7
// unpack high to word
363 pmullw mm0
, mm5
// a * 1/5
365 movq mm3
, mm1
// make a copy
366 punpcklbw mm1
, mm7
// unpack low to word
368 pmullw mm2
, mm5
// a * 1/5
369 movq mm6
, four_fifths
// mm6 = 4/5
371 movq mm4
, mm1
// copy of low b
372 pmullw mm4
, mm6
// b * 4/5
374 punpckhbw mm3
, mm7
// unpack high to word
375 movq mm5
, mm3
// copy of high b
377 pmullw mm5
, mm6
// b * 4/5
378 paddw mm0
, mm4
// a * 1/5 + b * 4/5
380 paddw mm2
, mm5
// a * 1/5 + b * 4/5
381 paddw mm0
, round_values
// + 128
383 paddw mm2
, round_values
// + 128
387 packuswb mm0
, mm2
// des [1]
389 movq QWORD ptr
[esi
+ecx
], mm0
// write des[1]
390 movq mm0
, [esi
+ecx
*2] // mm0 = src[2]
392 // mm1, mm3 --- Src[1]
397 movq mm2
, mm0
// make a copy
399 pmullw mm1
, mm5
// b * 2/5
400 movq mm6
, three_fifths
403 punpcklbw mm0
, mm7
// unpack low to word
404 pmullw mm3
, mm5
// b * 2/5
406 movq mm4
, mm0
// make copy of c
407 punpckhbw mm2
, mm7
// unpack high to word
409 pmullw mm4
, mm6
// c * 3/5
412 pmullw mm5
, mm6
// c * 3/5
413 paddw mm1
, mm4
// b * 2/5 + c * 3/5
415 paddw mm3
, mm5
// b * 2/5 + c * 3/5
416 paddw mm1
, round_values
// + 128
418 paddw mm3
, round_values
// + 128
422 packuswb mm1
, mm3
// des[2]
424 movq QWORD ptr
[esi
+ecx
*2], mm1
// write des[2]
425 movq mm1
, [edi
] // mm1=Src[3];
427 // mm0, mm2 --- Src[2]
432 pmullw mm0
, mm6
// c * 3/5
433 movq mm5
, two_fifths
// mm5 = 2/5
435 movq mm3
, mm1
// make a copy
436 pmullw mm2
, mm6
// c * 3/5
438 punpcklbw mm1
, mm7
// unpack low
439 movq mm4
, mm1
// make a copy
441 punpckhbw mm3
, mm7
// unpack high
442 pmullw mm4
, mm5
// d * 2/5
444 movq mm6
, mm3
// make a copy
445 pmullw mm6
, mm5
// d * 2/5
447 paddw mm0
, mm4
// c * 3/5 + d * 2/5
448 paddw mm2
, mm6
// c * 3/5 + d * 2/5
450 paddw mm0
, round_values
// + 128
451 paddw mm2
, round_values
// + 128
456 packuswb mm0
, mm2
// des[3]
457 movq QWORD ptr
[edi
], mm0
// write des[3]
459 // mm1, mm3 --- Src[3]
460 // mm7 -- cleared for unpacking
462 movq mm0
, [edi
+ecx
*2] // mm0, Src[0] of the next group
464 movq mm5
, four_fifths
// mm5 = 4/5
465 pmullw mm1
, mm5
// d * 4/5
467 movq mm6
, one_fifth
// mm6 = 1/5
468 movq mm2
, mm0
// make a copy
470 pmullw mm3
, mm5
// d * 4/5
471 punpcklbw mm0
, mm7
// unpack low
473 pmullw mm0
, mm6
// an * 1/5
474 punpckhbw mm2
, mm7
// unpack high
476 paddw mm1
, mm0
// d * 4/5 + an * 1/5
477 pmullw mm2
, mm6
// an * 1/5
479 paddw mm3
, mm2
// d * 4/5 + an * 1/5
480 paddw mm1
, round_values
// + 128
482 paddw mm3
, round_values
// + 128
486 packuswb mm1
, mm3
// des[4]
488 movq QWORD ptr
[edi
+ecx
], mm1
// write des[4]
498 /****************************************************************************
500 * ROUTINE : last_vertical_band_4_5_scale_mmx
502 * INPUTS : unsigned char *dest :
503 * unsigned int dest_pitch :
504 * unsigned int dest_width :
510 * FUNCTION : 4 to 5 up-scaling of the last 4-pixel high band in an image.
512 * SPECIAL NOTES : The routine uses the first line of the band below
513 * the current band. The function also has an "C" only
516 ****************************************************************************/
518 void last_vertical_band_4_5_scale_mmx
521 unsigned int dest_pitch
,
522 unsigned int dest_width
527 mov esi
, dest
// Get the source and destination pointer
528 mov ecx
, dest_pitch
// Get the pitch size
530 lea edi
, [esi
+ecx
*2] // two lines below
531 add edi
, ecx
// three lines below
533 pxor mm7
, mm7
// clear out mm7
534 mov edx
, dest_width
// Loop counter
538 movq mm0
, QWORD ptr
[esi
] // src[0];
539 movq mm1
, QWORD ptr
[esi
+ecx
] // src[1];
541 movq mm2
, mm0
// Make a copy
542 punpcklbw mm0
, mm7
// unpack low to word
545 punpckhbw mm2
, mm7
// unpack high to word
547 pmullw mm0
, mm5
// a * 1/5
549 movq mm3
, mm1
// make a copy
550 punpcklbw mm1
, mm7
// unpack low to word
552 pmullw mm2
, mm5
// a * 1/5
553 movq mm6
, four_fifths
// mm6 = 4/5
555 movq mm4
, mm1
// copy of low b
556 pmullw mm4
, mm6
// b * 4/5
558 punpckhbw mm3
, mm7
// unpack high to word
559 movq mm5
, mm3
// copy of high b
561 pmullw mm5
, mm6
// b * 4/5
562 paddw mm0
, mm4
// a * 1/5 + b * 4/5
564 paddw mm2
, mm5
// a * 1/5 + b * 4/5
565 paddw mm0
, round_values
// + 128
567 paddw mm2
, round_values
// + 128
571 packuswb mm0
, mm2
// des [1]
573 movq QWORD ptr
[esi
+ecx
], mm0
// write des[1]
574 movq mm0
, [esi
+ecx
*2] // mm0 = src[2]
576 // mm1, mm3 --- Src[1]
581 movq mm2
, mm0
// make a copy
583 pmullw mm1
, mm5
// b * 2/5
584 movq mm6
, three_fifths
587 punpcklbw mm0
, mm7
// unpack low to word
588 pmullw mm3
, mm5
// b * 2/5
590 movq mm4
, mm0
// make copy of c
591 punpckhbw mm2
, mm7
// unpack high to word
593 pmullw mm4
, mm6
// c * 3/5
596 pmullw mm5
, mm6
// c * 3/5
597 paddw mm1
, mm4
// b * 2/5 + c * 3/5
599 paddw mm3
, mm5
// b * 2/5 + c * 3/5
600 paddw mm1
, round_values
// + 128
602 paddw mm3
, round_values
// + 128
606 packuswb mm1
, mm3
// des[2]
608 movq QWORD ptr
[esi
+ecx
*2], mm1
// write des[2]
609 movq mm1
, [edi
] // mm1=Src[3];
611 movq QWORD ptr
[edi
+ecx
], mm1
// write des[4];
613 // mm0, mm2 --- Src[2]
618 pmullw mm0
, mm6
// c * 3/5
619 movq mm5
, two_fifths
// mm5 = 2/5
621 movq mm3
, mm1
// make a copy
622 pmullw mm2
, mm6
// c * 3/5
624 punpcklbw mm1
, mm7
// unpack low
625 movq mm4
, mm1
// make a copy
627 punpckhbw mm3
, mm7
// unpack high
628 pmullw mm4
, mm5
// d * 2/5
630 movq mm6
, mm3
// make a copy
631 pmullw mm6
, mm5
// d * 2/5
633 paddw mm0
, mm4
// c * 3/5 + d * 2/5
634 paddw mm2
, mm6
// c * 3/5 + d * 2/5
636 paddw mm0
, round_values
// + 128
637 paddw mm2
, round_values
// + 128
642 packuswb mm0
, mm2
// des[3]
643 movq QWORD ptr
[edi
], mm0
// write des[3]
645 // mm1, mm3 --- Src[3]
646 // mm7 -- cleared for unpacking
655 /****************************************************************************
657 * ROUTINE : vertical_band_3_5_scale_mmx
659 * INPUTS : unsigned char *dest :
660 * unsigned int dest_pitch :
661 * unsigned int dest_width :
667 * FUNCTION : 3 to 5 up-scaling of a 3-pixel high band of pixels.
669 * SPECIAL NOTES : The routine uses the first line of the band below
670 * the current band. The function also has an "C" only
673 ****************************************************************************/
675 void vertical_band_3_5_scale_mmx
678 unsigned int dest_pitch
,
679 unsigned int dest_width
684 mov esi
, dest
// Get the source and destination pointer
685 mov ecx
, dest_pitch
// Get the pitch size
687 lea edi
, [esi
+ecx
*2] // two lines below
688 add edi
, ecx
// three lines below
690 pxor mm7
, mm7
// clear out mm7
691 mov edx
, dest_width
// Loop counter
695 movq mm0
, QWORD ptr
[esi
] // src[0];
696 movq mm1
, QWORD ptr
[esi
+ecx
] // src[1];
698 movq mm2
, mm0
// Make a copy
699 punpcklbw mm0
, mm7
// unpack low to word
701 movq mm5
, two_fifths
// mm5 = 2/5
702 punpckhbw mm2
, mm7
// unpack high to word
704 pmullw mm0
, mm5
// a * 2/5
706 movq mm3
, mm1
// make a copy
707 punpcklbw mm1
, mm7
// unpack low to word
709 pmullw mm2
, mm5
// a * 2/5
710 movq mm6
, three_fifths
// mm6 = 3/5
712 movq mm4
, mm1
// copy of low b
713 pmullw mm4
, mm6
// b * 3/5
715 punpckhbw mm3
, mm7
// unpack high to word
716 movq mm5
, mm3
// copy of high b
718 pmullw mm5
, mm6
// b * 3/5
719 paddw mm0
, mm4
// a * 2/5 + b * 3/5
721 paddw mm2
, mm5
// a * 2/5 + b * 3/5
722 paddw mm0
, round_values
// + 128
724 paddw mm2
, round_values
// + 128
728 packuswb mm0
, mm2
// des [1]
730 movq QWORD ptr
[esi
+ecx
], mm0
// write des[1]
731 movq mm0
, [esi
+ecx
*2] // mm0 = src[2]
733 // mm1, mm3 --- Src[1]
737 movq mm4
, mm1
// b low
738 pmullw mm1
, four_fifths
// b * 4/5 low
740 movq mm5
, mm3
// b high
741 pmullw mm3
, four_fifths
// b * 4/5 high
744 pmullw mm4
, one_fifth
// b * 1/5
746 punpcklbw mm0
, mm7
// c low
747 pmullw mm5
, one_fifth
// b * 1/5
749 movq mm6
, mm0
// make copy of c low
750 punpckhbw mm2
, mm7
// c high
752 pmullw mm6
, one_fifth
// c * 1/5 low
753 movq mm7
, mm2
// make copy of c high
755 pmullw mm7
, one_fifth
// c * 1/5 high
756 paddw mm1
, mm6
// b * 4/5 + c * 1/5 low
758 paddw mm3
, mm7
// b * 4/5 + c * 1/5 high
759 movq mm6
, mm0
// make copy of c low
761 pmullw mm6
, four_fifths
// c * 4/5 low
762 movq mm7
, mm2
// make copy of c high
764 pmullw mm7
, four_fifths
// c * 4/5 high
766 paddw mm4
, mm6
// b * 1/5 + c * 4/5 low
767 paddw mm5
, mm7
// b * 1/5 + c * 4/5 high
769 paddw mm1
, round_values
// + 128
770 paddw mm3
, round_values
// + 128
775 packuswb mm1
, mm3
// des[2]
776 movq QWORD ptr
[esi
+ecx
*2], mm1
// write des[2]
778 paddw mm4
, round_values
// + 128
779 paddw mm5
, round_values
// + 128
784 packuswb mm4
, mm5
// des[3]
785 movq QWORD ptr
[edi
], mm4
// write des[3]
787 // mm0, mm2 --- Src[3]
789 pxor mm7
, mm7
// clear mm7 for unpacking
790 movq mm1
, [edi
+ecx
*2] // mm1 = Src[0] of the next group
792 movq mm5
, three_fifths
// mm5 = 3/5
793 pmullw mm0
, mm5
// d * 3/5
795 movq mm6
, two_fifths
// mm6 = 2/5
796 movq mm3
, mm1
// make a copy
798 pmullw mm2
, mm5
// d * 3/5
799 punpcklbw mm1
, mm7
// unpack low
801 pmullw mm1
, mm6
// an * 2/5
802 punpckhbw mm3
, mm7
// unpack high
804 paddw mm0
, mm1
// d * 3/5 + an * 2/5
805 pmullw mm3
, mm6
// an * 2/5
807 paddw mm2
, mm3
// d * 3/5 + an * 2/5
808 paddw mm0
, round_values
// + 128
810 paddw mm2
, round_values
// + 128
814 packuswb mm0
, mm2
// des[4]
816 movq QWORD ptr
[edi
+ecx
], mm0
// write des[4]
826 /****************************************************************************
828 * ROUTINE : last_vertical_band_3_5_scale_mmx
830 * INPUTS : unsigned char *dest :
831 * unsigned int dest_pitch :
832 * unsigned int dest_width :
838 * FUNCTION : 3 to 5 up-scaling of the last 3-pixel high band in an image.
840 * SPECIAL NOTES : The routine uses the first line of the band below
841 * the current band. The function also has an "C" only
844 ****************************************************************************/
846 void last_vertical_band_3_5_scale_mmx
849 unsigned int dest_pitch
,
850 unsigned int dest_width
855 mov esi
, dest
// Get the source and destination pointer
856 mov ecx
, dest_pitch
// Get the pitch size
858 lea edi
, [esi
+ecx
*2] // two lines below
859 add edi
, ecx
// three lines below
861 pxor mm7
, mm7
// clear out mm7
862 mov edx
, dest_width
// Loop counter
867 movq mm0
, QWORD ptr
[esi
] // src[0];
868 movq mm1
, QWORD ptr
[esi
+ecx
] // src[1];
870 movq mm2
, mm0
// Make a copy
871 punpcklbw mm0
, mm7
// unpack low to word
873 movq mm5
, two_fifths
// mm5 = 2/5
874 punpckhbw mm2
, mm7
// unpack high to word
876 pmullw mm0
, mm5
// a * 2/5
878 movq mm3
, mm1
// make a copy
879 punpcklbw mm1
, mm7
// unpack low to word
881 pmullw mm2
, mm5
// a * 2/5
882 movq mm6
, three_fifths
// mm6 = 3/5
884 movq mm4
, mm1
// copy of low b
885 pmullw mm4
, mm6
// b * 3/5
887 punpckhbw mm3
, mm7
// unpack high to word
888 movq mm5
, mm3
// copy of high b
890 pmullw mm5
, mm6
// b * 3/5
891 paddw mm0
, mm4
// a * 2/5 + b * 3/5
893 paddw mm2
, mm5
// a * 2/5 + b * 3/5
894 paddw mm0
, round_values
// + 128
896 paddw mm2
, round_values
// + 128
900 packuswb mm0
, mm2
// des [1]
902 movq QWORD ptr
[esi
+ecx
], mm0
// write des[1]
903 movq mm0
, [esi
+ecx
*2] // mm0 = src[2]
907 // mm1, mm3 --- Src[1]
911 movq mm4
, mm1
// b low
912 pmullw mm1
, four_fifths
// b * 4/5 low
914 movq QWORD ptr
[edi
+ecx
], mm0
// write des[4]
916 movq mm5
, mm3
// b high
917 pmullw mm3
, four_fifths
// b * 4/5 high
920 pmullw mm4
, one_fifth
// b * 1/5
922 punpcklbw mm0
, mm7
// c low
923 pmullw mm5
, one_fifth
// b * 1/5
925 movq mm6
, mm0
// make copy of c low
926 punpckhbw mm2
, mm7
// c high
928 pmullw mm6
, one_fifth
// c * 1/5 low
929 movq mm7
, mm2
// make copy of c high
931 pmullw mm7
, one_fifth
// c * 1/5 high
932 paddw mm1
, mm6
// b * 4/5 + c * 1/5 low
934 paddw mm3
, mm7
// b * 4/5 + c * 1/5 high
935 movq mm6
, mm0
// make copy of c low
937 pmullw mm6
, four_fifths
// c * 4/5 low
938 movq mm7
, mm2
// make copy of c high
940 pmullw mm7
, four_fifths
// c * 4/5 high
942 paddw mm4
, mm6
// b * 1/5 + c * 4/5 low
943 paddw mm5
, mm7
// b * 1/5 + c * 4/5 high
945 paddw mm1
, round_values
// + 128
946 paddw mm3
, round_values
// + 128
951 packuswb mm1
, mm3
// des[2]
952 movq QWORD ptr
[esi
+ecx
*2], mm1
// write des[2]
954 paddw mm4
, round_values
// + 128
955 paddw mm5
, round_values
// + 128
960 packuswb mm4
, mm5
// des[3]
961 movq QWORD ptr
[edi
], mm4
// write des[3]
963 // mm0, mm2 --- Src[3]
973 /****************************************************************************
975 * ROUTINE : vertical_band_1_2_scale_mmx
977 * INPUTS : unsigned char *dest :
978 * unsigned int dest_pitch :
979 * unsigned int dest_width :
985 * FUNCTION : 1 to 2 up-scaling of a band of pixels.
987 * SPECIAL NOTES : The routine uses the first line of the band below
988 * the current band. The function also has an "C" only
991 ****************************************************************************/
993 void vertical_band_1_2_scale_mmx
996 unsigned int dest_pitch
,
997 unsigned int dest_width
1003 mov esi
, dest
// Get the source and destination pointer
1004 mov ecx
, dest_pitch
// Get the pitch size
1006 pxor mm7
, mm7
// clear out mm7
1007 mov edx
, dest_width
// Loop counter
1011 movq mm0
, [esi
] // get Src[0]
1012 movq mm1
, [esi
+ ecx
* 2] // get Src[1]
1014 movq mm2
, mm0
// make copy before unpack
1015 movq mm3
, mm1
// make copy before unpack
1017 punpcklbw mm0
, mm7
// low Src[0]
1018 movq mm6
, four_ones
// mm6= 1, 1, 1, 1
1020 punpcklbw mm1
, mm7
// low Src[1]
1021 paddw mm0
, mm1
// low (a + b)
1023 punpckhbw mm2
, mm7
// high Src[0]
1024 paddw mm0
, mm6
// low (a + b + 1)
1027 paddw mm2
, mm3
// high (a + b )
1029 psraw mm0
, 1 // low (a + b +1 )/2
1030 paddw mm2
, mm6
// high (a + b + 1)
1032 psraw mm2
, 1 // high (a + b + 1)/2
1033 packuswb mm0
, mm2
// pack results
1035 movq
[esi
+ecx
], mm0
// write out eight bytes
1044 /****************************************************************************
1046 * ROUTINE : last_vertical_band_1_2_scale_mmx
1048 * INPUTS : unsigned char *dest :
1049 * unsigned int dest_pitch :
1050 * unsigned int dest_width :
1056 * FUNCTION : 1 to 2 up-scaling of the last band of pixels.
1058 * SPECIAL NOTES : The routine uses the first line of the band below
1059 * the current band. The function also has an "C" only
1062 ****************************************************************************/
1064 void last_vertical_band_1_2_scale_mmx
1066 unsigned char *dest
,
1067 unsigned int dest_pitch
,
1068 unsigned int dest_width
1073 mov esi
, dest
// Get the source and destination pointer
1074 mov ecx
, dest_pitch
// Get the pitch size
1076 mov edx
, dest_width
// Loop counter
1080 movq mm0
, [esi
] // get Src[0]
1081 movq
[esi
+ecx
], mm0
// write out eight bytes
1090 /****************************************************************************
1092 * ROUTINE : horizontal_line_1_2_scale_mmx
1094 * INPUTS : const unsigned char *source :
1095 * unsigned int source_width :
1096 * unsigned char *dest :
1097 * unsigned int dest_width :
1103 * FUNCTION : 1 to 2 up-scaling of a horizontal line of pixels.
1105 * SPECIAL NOTES : None.
1107 ****************************************************************************/
1109 void horizontal_line_1_2_scale_mmx
1111 const unsigned char *source
,
1112 unsigned int source_width
,
1113 unsigned char *dest
,
1114 unsigned int dest_width
1127 mov ecx
, source_width
1217 __declspec(align(16)) const static unsigned short const54_2
[] = { 0, 64, 128, 192 };
1218 __declspec(align(16)) const static unsigned short const54_1
[] = {256, 192, 128, 64 };
1221 /****************************************************************************
1223 * ROUTINE : horizontal_line_5_4_scale_mmx
1225 * INPUTS : const unsigned char *source : Pointer to source data.
1226 * unsigned int source_width : Stride of source.
1227 * unsigned char *dest : Pointer to destination data.
1228 * unsigned int dest_width : Stride of destination (NOT USED).
1234 * FUNCTION : Copies horizontal line of pixels from source to
1235 * destination scaling down by 5 to 4.
1237 * SPECIAL NOTES : None.
1239 ****************************************************************************/
1241 void horizontal_line_5_4_scale_mmx
1243 const unsigned char *source
,
1244 unsigned int source_width
,
1245 unsigned char *dest
,
1246 unsigned int dest_width
1251 unsigned int a, b, c, d, e;
1252 unsigned char *des = dest;
1253 const unsigned char *src = source;
1257 for ( i=0; i<source_width; i+=5 )
1266 des[1] = ((b*192 + c* 64 + 128)>>8);
1267 des[2] = ((c*128 + d*128 + 128)>>8);
1268 des[3] = ((d* 64 + e*192 + 128)>>8);
1282 mov ecx
, source_width
;
1283 movq mm5
, const54_1
;
1286 movq mm6
, const54_2
;
1288 movq mm4
, round_values
;
1289 lea edx
, [esi
+ecx
] ;
1290 horizontal_line_5_4_loop
:
1292 movq mm0
, QWORD PTR
[esi
] ;
1293 00 01 02 03 04 05 06 07
1295 00 01 02 03 04 05 06 07
1298 01 02 03 04 05 06 07 xx
1299 punpcklbw mm1
, mm7
;
1300 xx
00 xx
01 xx
02 xx
03
1302 punpcklbw mm0
, mm7
;
1303 xx
01 xx
02 xx
03 xx
04
1318 movd DWORD PTR
[edi
-4], mm1
1320 jl horizontal_line_5_4_loop
1325 __declspec(align(16)) const static unsigned short one_fourths
[] = { 64, 64, 64, 64 };
1326 __declspec(align(16)) const static unsigned short two_fourths
[] = { 128, 128, 128, 128 };
1327 __declspec(align(16)) const static unsigned short three_fourths
[] = { 192, 192, 192, 192 };
1330 void vertical_band_5_4_scale_mmx(unsigned char *source
, unsigned int src_pitch
, unsigned char *dest
, unsigned int dest_pitch
, unsigned int dest_width
)
1337 mov esi
, source
// Get the source and destination pointer
1338 mov ecx
, src_pitch
// Get the pitch size
1340 mov edi
, dest
// Get the destination pointer
1341 pxor mm7
, mm7
// clear out mm7
1343 mov edx
, dest_pitch
// Get the destination pitch
1348 movd mm0
, DWORD ptr
[esi
] // src[0];
1349 movd mm1
, DWORD ptr
[esi
+ecx
] // src[1];
1351 movd mm2
, DWORD ptr
[esi
+ecx
*2]
1352 lea eax
, [esi
+ecx
*2] //
1358 pmullw mm1
, three_fourths
1360 pmullw mm2
, one_fourths
1363 pmullw mm3
, two_fourths
1367 pmullw mm4
, two_fourths
1370 movd mm6
, [eax
+ecx
*2]
1372 pmullw mm5
, one_fourths
1373 paddw mm1
, round_values
;
1379 paddw mm3
, round_values
1381 pmullw mm6
, three_fourths
1387 movd DWORD PTR
[edi
], mm0
1388 movd DWORD PTR
[edi
+edx
], mm1
1392 movd DWORD PTR
[edi
+edx
*2], mm3
1394 lea eax
, [edi
+edx
*2]
1395 paddw mm5
, round_values
1401 movd DWORD PTR
[eax
+edx
], mm5
1413 __declspec(align(16)) const static unsigned short const53_1
[] = { 0, 85, 171, 0 };
1414 __declspec(align(16)) const static unsigned short const53_2
[] = {256, 171, 85, 0 };
1418 void horizontal_line_5_3_scale_mmx
1420 const unsigned char *source
,
1421 unsigned int source_width
,
1422 unsigned char *dest
,
1423 unsigned int dest_width
1434 mov ecx
, source_width
;
1435 movq mm5
, const53_1
;
1438 movq mm6
, const53_2
;
1440 movq mm4
, round_values
;
1441 lea edx
, [esi
+ecx
-5] ;
1442 horizontal_line_5_3_loop
:
1444 movq mm0
, QWORD PTR
[esi
] ;
1445 00 01 02 03 04 05 06 07
1447 00 01 02 03 04 05 06 07
1450 xx
00 xx
02 xx
04 xx
06
1452 01 xx
03 xx
05 xx
07 xx
1455 00 xx
02 xx
04 xx
06 xx
1457 xx xx
01 xx
03 xx
05 xx
1473 movd DWORD PTR
[edi
-3], mm1
1474 jl horizontal_line_5_3_loop
1477 movq mm0
, QWORD PTR
[esi
] ;
1478 00 01 02 03 04 05 06 07
1480 00 01 02 03 04 05 06 07
1483 xx
00 xx
02 xx
04 xx
06
1485 01 xx
03 xx
05 xx
07 xx
1488 00 xx
02 xx
04 xx
06 xx
1490 xx xx
01 xx
03 xx
05 xx
1506 mov WORD PTR
[edi
], ax
1507 mov BYTE PTR
[edi
+2], dl
1513 __declspec(align(16)) const static unsigned short one_thirds
[] = { 85, 85, 85, 85 };
1514 __declspec(align(16)) const static unsigned short two_thirds
[] = { 171, 171, 171, 171 };
1517 void vertical_band_5_3_scale_mmx(unsigned char *source
, unsigned int src_pitch
, unsigned char *dest
, unsigned int dest_pitch
, unsigned int dest_width
)
1524 mov esi
, source
// Get the source and destination pointer
1525 mov ecx
, src_pitch
// Get the pitch size
1527 mov edi
, dest
// Get the destination pointer
1528 pxor mm7
, mm7
// clear out mm7
1530 mov edx
, dest_pitch
// Get the destination pitch
1531 movq mm5
, one_thirds
1533 movq mm6
, two_thirds
1534 mov ebx
, dest_width
;
1538 movd mm0
, DWORD ptr
[esi
] // src[0];
1539 movd mm1
, DWORD ptr
[esi
+ecx
] // src[1];
1541 movd mm2
, DWORD ptr
[esi
+ecx
*2]
1542 lea eax
, [esi
+ecx
*2] //
1550 movd mm3
, DWORD ptr
[eax
+ecx
]
1551 movd mm4
, DWORD ptr
[eax
+ecx
*2]
1560 movd DWORD PTR
[edi
], mm0
1563 paddw mm1
, round_values
1569 paddw mm3
, round_values
1570 movd DWORD PTR
[edi
+edx
], mm1
1575 movd DWORD PTR
[edi
+edx
*2], mm3
1591 /****************************************************************************
1593 * ROUTINE : horizontal_line_2_1_scale_mmx
1595 * INPUTS : const unsigned char *source :
1596 * unsigned int source_width :
1597 * unsigned char *dest :
1598 * unsigned int dest_width :
1604 * FUNCTION : 2 to 1 down-scaling of a horizontal line of pixels.
1606 * SPECIAL NOTES : None.
1608 ****************************************************************************/
1610 void horizontal_line_2_1_scale_mmx
1612 const unsigned char *source
,
1613 unsigned int source_width
,
1614 unsigned char *dest
,
1615 unsigned int dest_width
1619 (void) source_width
;
1631 movq mm0
, [esi
+edx
*2]
1637 movd DWORD Ptr
[edi
+edx
], mm0
;
1649 void vertical_band_2_1_scale_mmx(unsigned char *source
, unsigned int src_pitch
, unsigned char *dest
, unsigned int dest_pitch
, unsigned int dest_width
)
1653 vpx_memcpy(dest
, source
, dest_width
);
1657 __declspec(align(16)) const static unsigned short three_sixteenths
[] = { 48, 48, 48, 48 };
1658 __declspec(align(16)) const static unsigned short ten_sixteenths
[] = { 160, 160, 160, 160 };
1661 void vertical_band_2_1_scale_i_mmx(unsigned char *source
, unsigned int src_pitch
, unsigned char *dest
, unsigned int dest_pitch
, unsigned int dest_width
)
1674 sub esi
, eax
//back one line
1678 movq mm6
, round_values
;
1680 movq mm5
, three_sixteenths
;
1681 movq mm4
, ten_sixteenths
;
1685 movd mm1
, [esi
+eax
] //
1687 movd mm2
, [esi
+eax
*2] //
1697 paddw mm0
, round_values
1705 movd DWORD PTR
[edi
], mm0
1718 register_mmxscalers(void)
1720 vp8_horizontal_line_1_2_scale
= horizontal_line_1_2_scale_mmx
;
1721 vp8_vertical_band_1_2_scale
= vertical_band_1_2_scale_mmx
;
1722 vp8_last_vertical_band_1_2_scale
= last_vertical_band_1_2_scale_mmx
;
1723 vp8_horizontal_line_3_5_scale
= horizontal_line_3_5_scale_mmx
;
1724 vp8_vertical_band_3_5_scale
= vertical_band_3_5_scale_mmx
;
1725 vp8_last_vertical_band_3_5_scale
= last_vertical_band_3_5_scale_mmx
;
1726 vp8_horizontal_line_4_5_scale
= horizontal_line_4_5_scale_mmx
;
1727 vp8_vertical_band_4_5_scale
= vertical_band_4_5_scale_mmx
;
1728 vp8_last_vertical_band_4_5_scale
= last_vertical_band_4_5_scale_mmx
;
1730 vp8_horizontal_line_3_4_scale
= vp8cx_horizontal_line_3_4_scale_c
;
1731 vp8_vertical_band_3_4_scale
= vp8cx_vertical_band_3_4_scale_c
;
1732 vp8_last_vertical_band_3_4_scale
= vp8cx_last_vertical_band_3_4_scale_c
;
1733 vp8_horizontal_line_2_3_scale
= vp8cx_horizontal_line_2_3_scale_c
;
1734 vp8_vertical_band_2_3_scale
= vp8cx_vertical_band_2_3_scale_c
;
1735 vp8_last_vertical_band_2_3_scale
= vp8cx_last_vertical_band_2_3_scale_c
;
1739 vp8_vertical_band_5_4_scale
= vertical_band_5_4_scale_mmx
;
1740 vp8_vertical_band_5_3_scale
= vertical_band_5_3_scale_mmx
;
1741 vp8_vertical_band_2_1_scale
= vertical_band_2_1_scale_mmx
;
1742 vp8_vertical_band_2_1_scale_i
= vertical_band_2_1_scale_i_mmx
;
1743 vp8_horizontal_line_2_1_scale
= horizontal_line_2_1_scale_mmx
;
1744 vp8_horizontal_line_5_3_scale
= horizontal_line_5_3_scale_mmx
;
1745 vp8_horizontal_line_5_4_scale
= horizontal_line_5_4_scale_mmx
;