Initial WebM release
[libvpx.git] / vpx_scale / x86_64 / scaleopt.c
blob3d2d5f237b00c0012c48d1bb6f8676f38529ea6e
1 /*
2 * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
4 * Use of this source code is governed by a BSD-style license and patent
5 * grant that can be found in the LICENSE file in the root of the source
6 * tree. All contributing project authors may be found in the AUTHORS
7 * file in the root of the source tree.
8 */
11 /****************************************************************************
13 * Module Title : scaleopt.c
15 * Description : Optimized scaling functions
17 ****************************************************************************/
18 #include "pragmas.h"
22 /****************************************************************************
23 * Module Statics
24 ****************************************************************************/
/*
 * Blend weights in 8-bit fixed point (value/256). Each table holds four
 * 16-bit lanes (8 bytes) so that a single movq fills an MMX register for
 * pmullw; the scaling routines add round_values (+128) to the summed
 * products and shift right by 8.
 *   51 ~ 1/5, 102 ~ 2/5, 154 ~ 3/5, 205 ~ 4/5
 * The complementary pairs (51+205, 102+154) each sum to 256.
 */
25 __declspec(align(16)) const static unsigned short one_fifth[] = { 51, 51, 51, 51 };
26 __declspec(align(16)) const static unsigned short two_fifths[] = { 102, 102, 102, 102 };
27 __declspec(align(16)) const static unsigned short three_fifths[] = { 154, 154, 154, 154 };
28 __declspec(align(16)) const static unsigned short four_fifths[] = { 205, 205, 205, 205 };
/* Rounding term (half of 256) added before the >> 8. */
29 __declspec(align(16)) const static unsigned short round_values[] = { 128, 128, 128, 128 };
/* Rounding term for the (a + b + 1) >> 1 averages of the 1:2 scalers. */
30 __declspec(align(16)) const static unsigned short four_ones[] = { 1, 1, 1, 1};
/*
 * Per-lane weights for the 4:5 horizontal scaler: const45_1 multiplies
 * pixels p0..p3 and const45_2 multiplies p1..p4.
 */
31 __declspec(align(16)) const static unsigned short const45_2[] = {205, 154, 102, 51 };
32 __declspec(align(16)) const static unsigned short const45_1[] = { 51, 102, 154, 205 };
/* Byte mask selecting byte 6 only; the 4:5 tail uses it to replicate the last pixel. */
33 __declspec(align(16)) const static unsigned char mask45[] = { 0, 0, 0, 0, 0, 0, 255, 0};
/*
 * Per-lane weights for the 3:5 horizontal scaler: const35_1 multiplies
 * p0 p1 p1 p2 and const35_2 multiplies p1 p2 p2 p3.
 */
34 __declspec(align(16)) const static unsigned short const35_2[] = { 154, 51, 205, 102 };
35 __declspec(align(16)) const static unsigned short const35_1[] = { 102, 205, 51, 154 };
39 #include "vpx_scale/vpxscale.h"
40 #include "vpx_mem/vpx_mem.h"
42 /****************************************************************************
44 * ROUTINE : horizontal_line_3_5_scale_mmx
46 * INPUTS : const unsigned char *source :
47 * unsigned int source_width :
48 * unsigned char *dest :
49 * unsigned int dest_width :
51 * OUTPUTS : None.
53 * RETURNS : void
55 * FUNCTION : 3 to 5 up-scaling of a horizontal line of pixels.
57 * SPECIAL NOTES : None.
59 ****************************************************************************/
/*
 * Expands one row of pixels horizontally by 3:5 with MMX arithmetic.
 *
 * For each group of three source pixels p0 p1 p2 (p3 is the first pixel of
 * the following group) five bytes are produced:
 *   out0 = p0
 *   out1 = (102*p0 + 154*p1 + 128) >> 8
 *   out2 = (205*p1 +  51*p2 + 128) >> 8
 *   out3 = ( 51*p1 + 205*p2 + 128) >> 8
 *   out4 = (154*p2 + 102*p3 + 128) >> 8
 * The final group is handled after the loop with p2 standing in for p3.
 *
 * source       - first pixel of the input row
 * source_width - number of input pixels; the loop body runs once before its
 *                bound is tested and the tail group always runs, so at
 *                least two groups are processed
 * dest         - output row, 5 bytes written per group
 * dest_width   - unused
 *
 * Every group is fetched with a 4-byte load, so the tail load covers one
 * byte beyond its three pixels; that byte is discarded.
 */
60 static
61 void horizontal_line_3_5_scale_mmx
63 const unsigned char *source,
64 unsigned int source_width,
65 unsigned char *dest,
66 unsigned int dest_width
69 (void) dest_width;
71 __asm
// rbx is used as scratch below and restored before leaving the block.
74 push rbx
76 mov rsi, source
77 mov rdi, dest
79 mov ecx, source_width
// rdx = source + source_width - 3: loop bound (the main loop stops once rsi reaches it).
80 lea rdx, [rsi+rcx-3];
82 movq mm5, const35_1 // mm5 = 66 xx cd xx 33 xx 9a xx
83 movq mm6, const35_2 // mm6 = 9a xx 33 xx cd xx 66 xx
85 movq mm4, round_values // mm4 = 80 xx 80 xx 80 xx 80 xx
86 pxor mm7, mm7 // clear mm7
88 horiz_line_3_5_loop:
// Build the two 4-pixel operands (p0 p1 p1 p2) and (p1 p2 p2 p3) in ebx/eax.
90 mov eax, DWORD PTR [rsi] // eax = 00 01 02 03
91 mov ebx, eax
93 and ebx, 0xffff00 // ebx = xx 01 02 xx
94 mov ecx, eax // ecx = 00 01 02 03
96 and eax, 0xffff0000 // eax = xx xx 02 03
97 xor ecx, eax // ecx = 00 01 xx xx
99 shr ebx, 8 // ebx = 01 02 xx xx
100 or eax, ebx // eax = 01 02 02 03
102 shl ebx, 16 // ebx = xx xx 01 02
103 movd mm1, eax // mm1 = 01 02 02 03 xx xx xx xx
105 or ebx, ecx // ebx = 00 01 01 02
106 punpcklbw mm1, mm7 // mm1 = 01 xx 02 xx 02 xx 03 xx
108 movd mm0, ebx // mm0 = 00 01 01 02
109 pmullw mm1, mm6 //
111 punpcklbw mm0, mm7 // mm0 = 00 xx 01 xx 01 xx 02 xx
112 pmullw mm0, mm5 //
// Stores 4 bytes; only the first (p0) survives, the rest are overwritten below.
114 mov [rdi], ebx // writeoutput 00 xx xx xx
115 add rsi, 3
117 add rdi, 5
118 paddw mm0, mm1
120 paddw mm0, mm4
121 psrlw mm0, 8
123 cmp rsi, rdx
124 packuswb mm0, mm7
// rdi was already advanced by 5, so this lands on outputs 1..4 of the group.
126 movd DWORD Ptr [rdi-4], mm0
127 jl horiz_line_3_5_loop
129 //Exit:
// Last group: same arithmetic, but p3 is replaced by a copy of p2.
130 mov eax, DWORD PTR [rsi] // eax = 00 01 02 03
131 mov ebx, eax
133 and ebx, 0xffff00 // ebx = xx 01 02 xx
134 mov ecx, eax // ecx = 00 01 02 03
136 and eax, 0xffff0000 // eax = xx xx 02 03
137 xor ecx, eax // ecx = 00 01 xx xx
139 shr ebx, 8 // ebx = 01 02 xx xx
140 or eax, ebx // eax = 01 02 02 03
142 shl eax, 8 // eax = xx 01 02 02
143 and eax, 0xffff0000 // eax = xx xx 02 02
145 or eax, ebx // eax = 01 02 02 02
147 shl ebx, 16 // ebx = xx xx 01 02
148 movd mm1, eax // mm1 = 01 02 02 02 xx xx xx xx
150 or ebx, ecx // ebx = 00 01 01 02
151 punpcklbw mm1, mm7 // mm1 = 01 xx 02 xx 02 xx 02 xx
153 movd mm0, ebx // mm0 = 00 01 01 02
154 pmullw mm1, mm6 //
156 punpcklbw mm0, mm7 // mm0 = 00 xx 01 xx 01 xx 02 xx
157 pmullw mm0, mm5 //
159 mov [rdi], ebx // writeoutput 00 xx xx xx
160 paddw mm0, mm1
162 paddw mm0, mm4
163 psrlw mm0, 8
165 packuswb mm0, mm7
// rdi is not advanced in the tail, hence the +1 offset for outputs 1..4.
166 movd DWORD Ptr [rdi+1], mm0
168 pop rbx
175 /****************************************************************************
177 * ROUTINE : horizontal_line_4_5_scale_mmx
179 * INPUTS : const unsigned char *source :
180 * unsigned int source_width :
181 * unsigned char *dest :
182 * unsigned int dest_width :
184 * OUTPUTS : None.
186 * RETURNS : void
188 * FUNCTION : 4 to 5 up-scaling of a horizontal line of pixels.
190 * SPECIAL NOTES : None.
192 ****************************************************************************/
/*
 * Expands one row of pixels horizontally by 4:5 with MMX arithmetic.
 *
 * For each group of four source pixels p0..p3 (p4 is the first pixel of the
 * following group) five bytes are produced:
 *   out0 = p0
 *   out1 = ( 51*p0 + 205*p1 + 128) >> 8
 *   out2 = (102*p1 + 154*p2 + 128) >> 8
 *   out3 = (154*p2 + 102*p3 + 128) >> 8
 *   out4 = (205*p3 +  51*p4 + 128) >> 8
 * Two groups (8 source pixels, 10 output bytes) are handled per pass. The
 * final 8 pixels are handled after the loop with the last pixel replicated
 * in place of the missing neighbour.
 *
 * source       - first pixel of the input row
 * source_width - number of input pixels; the loop body runs once before its
 *                bound is tested and the tail always runs, so at least
 *                16 pixels are processed
 * dest         - output row, 10 bytes written per pass
 * dest_width   - unused
 */
193 static
194 void horizontal_line_4_5_scale_mmx
196 const unsigned char *source,
197 unsigned int source_width,
198 unsigned char *dest,
199 unsigned int dest_width
202 (void)dest_width;
204 __asm
207 mov rsi, source
208 mov rdi, dest
210 mov ecx, source_width
// rdx = source + source_width - 8: loop bound (the main loop stops once rsi reaches it).
211 lea rdx, [rsi+rcx-8];
213 movq mm5, const45_1 // mm5 = 33 xx 66 xx 9a xx cd xx
214 movq mm6, const45_2 // mm6 = cd xx 9a xx 66 xx 33 xx
216 movq mm4, round_values // mm4 = 80 xx 80 xx 80 xx 80 xx
217 pxor mm7, mm7 // clear mm7
219 horiz_line_4_5_loop:
// The second load is offset by one pixel, so 9 source bytes are read per pass.
221 movq mm0, QWORD PTR [rsi] // mm0 = 00 01 02 03 04 05 06 07
222 movq mm1, QWORD PTR [rsi+1]; // mm1 = 01 02 03 04 05 06 07 08
224 movq mm2, mm0 // mm2 = 00 01 02 03 04 05 06 07
225 movq mm3, mm1 // mm3 = 01 02 03 04 05 06 07 08
// 4-byte store; only byte 0 (p0) survives the later store at [rdi+1].
227 movd DWORD PTR [rdi], mm0 // write output 00 xx xx xx
228 punpcklbw mm0, mm7 // mm0 = 00 xx 01 xx 02 xx 03 xx
230 punpcklbw mm1, mm7 // mm1 = 01 xx 02 xx 03 xx 04 xx
231 pmullw mm0, mm5 // 00* 51 01*102 02*154 03*205
233 pmullw mm1, mm6 // 01*205 02*154 03*102 04* 51
234 punpckhbw mm2, mm7 // mm2 = 04 xx 05 xx 06 xx 07 xx
// Output byte 5 is source pixel 4 unchanged; bytes 6..8 are overwritten later.
236 movd DWORD PTR [rdi+5], mm2 // write output 05 xx xx xx
237 pmullw mm2, mm5 // 04* 51 05*102 06*154 07*205
239 punpckhbw mm3, mm7 // mm3 = 05 xx 06 xx 07 xx 08 xx
240 pmullw mm3, mm6 // 05*205 06*154 07*102 08* 51
242 paddw mm0, mm1 // added round values
243 paddw mm0, mm4
245 psrlw mm0, 8 // output: 01 xx 02 xx 03 xx 04 xx
246 packuswb mm0, mm7
248 movd DWORD PTR [rdi+1], mm0 // write output 01 02 03 04
249 add rdi, 10
251 add rsi, 8
252 paddw mm2, mm3 //
254 paddw mm2, mm4 // added round values
255 cmp rsi, rdx
257 psrlw mm2, 8
258 packuswb mm2, mm7
// rdi was already advanced by 10, so [rdi-4] is output bytes 6..9 of this pass.
260 movd DWORD PTR [rdi-4], mm2 // writeoutput 06 07 08 09
261 jl horiz_line_4_5_loop
263 //Exit:
// Last 8 pixels: a single load is used and the shifted copy gets pixel 7
// duplicated into its top byte instead of reading a ninth pixel.
264 movq mm0, [rsi] // mm0 = 00 01 02 03 04 05 06 07
265 movq mm1, mm0 // mm1 = 00 01 02 03 04 05 06 07
267 movq mm2, mm0 // mm2 = 00 01 02 03 04 05 06 07
268 psrlq mm1, 8 // mm1 = 01 02 03 04 05 06 07 00
270 movq mm3, mask45 // mm3 = 00 00 00 00 00 00 ff 00
271 pand mm3, mm1 // mm3 = 00 00 00 00 00 00 07 00
273 psllq mm3, 8 // mm3 = 00 00 00 00 00 00 00 07
274 por mm1, mm3 // mm1 = 01 02 03 04 05 06 07 07
276 movq mm3, mm1
278 movd DWORD PTR [rdi], mm0 // write output 00 xx xx xx
279 punpcklbw mm0, mm7 // mm0 = 00 xx 01 xx 02 xx 03 xx
281 punpcklbw mm1, mm7 // mm1 = 01 xx 02 xx 03 xx 04 xx
282 pmullw mm0, mm5 // 00* 51 01*102 02*154 03*205
284 pmullw mm1, mm6 // 01*205 02*154 03*102 04* 51
285 punpckhbw mm2, mm7 // mm2 = 04 xx 05 xx 06 xx 07 xx
287 movd DWORD PTR [rdi+5], mm2 // write output 05 xx xx xx
288 pmullw mm2, mm5 // 04* 51 05*102 06*154 07*205
290 punpckhbw mm3, mm7 // mm3 = 05 xx 06 xx 07 xx 07 xx
291 pmullw mm3, mm6 // 05*205 06*154 07*102 07* 51
293 paddw mm0, mm1 // added round values
294 paddw mm0, mm4
296 psrlw mm0, 8 // output: 01 xx 02 xx 03 xx 04 xx
297 packuswb mm0, mm7 // 01 02 03 04 xx xx xx xx
299 movd DWORD PTR [rdi+1], mm0 // write output 01 02 03 04
300 paddw mm2, mm3 //
302 paddw mm2, mm4 // added round values
303 psrlw mm2, 8
305 packuswb mm2, mm7
// rdi is not advanced in the tail, hence +6 rather than -4.
306 movd DWORD PTR [rdi+6], mm2 // writeoutput 06 07 08 09
312 /****************************************************************************
314 * ROUTINE : vertical_band_4_5_scale_mmx
316 * INPUTS : unsigned char *dest :
317 * unsigned int dest_pitch :
318 * unsigned int dest_width :
320 * OUTPUTS : None.
322 * RETURNS : void
324 * FUNCTION : 4 to 5 up-scaling of a 4 pixel high band of pixels.
326 * SPECIAL NOTES : The routine uses the first line of the band below
327 * the current band. The function also has a "C" only
328 * version.
330 ****************************************************************************/
/*
 * Expands a 4-line band to 5 lines in place, 8 columns per pass.
 *
 * With a, b, c, d the lines at dest + 0..3 * dest_pitch and an the line at
 * dest + 5 * dest_pitch, the routine writes:
 *   line 1 = ( 51*a + 205*b  + 128) >> 8
 *   line 2 = (102*b + 154*c  + 128) >> 8
 *   line 3 = (154*c + 102*d  + 128) >> 8
 *   line 4 = (205*d +  51*an + 128) >> 8
 * Line 0 is left untouched. Each source line is held unpacked in registers
 * before its slot is overwritten, so the original values are the ones blended.
 *
 * dest       - first line of the band
 * dest_pitch - byte distance between consecutive lines
 * dest_width - columns to process; consumed in steps of 8 with the test at
 *              the bottom of the loop, so at least 8 columns are handled and
 *              a width that is not a multiple of 8 is rounded up
 */
331 static
332 void vertical_band_4_5_scale_mmx
334 unsigned char *dest,
335 unsigned int dest_pitch,
336 unsigned int dest_width
339 __asm
342 mov rsi, dest // Get the source and destination pointer
343 mov ecx, dest_pitch // Get the pitch size
345 lea rdi, [rsi+rcx*2] // two lines below
346 add rdi, rcx // three lines below
348 pxor mm7, mm7 // clear out mm7
349 mov edx, dest_width // Loop counter
351 vs_4_5_loop:
353 movq mm0, QWORD ptr [rsi] // src[0];
354 movq mm1, QWORD ptr [rsi+rcx] // src[1];
356 movq mm2, mm0 // Make a copy
357 punpcklbw mm0, mm7 // unpack low to word
359 movq mm5, one_fifth
360 punpckhbw mm2, mm7 // unpack high to word
362 pmullw mm0, mm5 // a * 1/5
364 movq mm3, mm1 // make a copy
365 punpcklbw mm1, mm7 // unpack low to word
367 pmullw mm2, mm5 // a * 1/5
368 movq mm6, four_fifths // constant
370 movq mm4, mm1 // copy of low b
371 pmullw mm4, mm6 // b * 4/5
373 punpckhbw mm3, mm7 // unpack high to word
374 movq mm5, mm3 // copy of high b
376 pmullw mm5, mm6 // b * 4/5
377 paddw mm0, mm4 // a * 1/5 + b * 4/5
379 paddw mm2, mm5 // a * 1/5 + b * 4/5
380 paddw mm0, round_values // + 128
382 paddw mm2, round_values // + 128
383 psrlw mm0, 8
385 psrlw mm2, 8
386 packuswb mm0, mm2 // des [1]
388 movq QWORD ptr [rsi+rcx], mm0 // write des[1]
389 movq mm0, [rsi+rcx*2] // mm0 = src[2]
391 // mm1, mm3 --- Src[1]
392 // mm0 --- Src[2]
393 // mm7 for unpacking
395 movq mm5, two_fifths
396 movq mm2, mm0 // make a copy
398 pmullw mm1, mm5 // b * 2/5
399 movq mm6, three_fifths
402 punpcklbw mm0, mm7 // unpack low to word
403 pmullw mm3, mm5 // b * 2/5
405 movq mm4, mm0 // make copy of c
406 punpckhbw mm2, mm7 // unpack high to word
408 pmullw mm4, mm6 // c * 3/5
409 movq mm5, mm2
411 pmullw mm5, mm6 // c * 3/5
412 paddw mm1, mm4 // b * 2/5 + c * 3/5
414 paddw mm3, mm5 // b * 2/5 + c * 3/5
415 paddw mm1, round_values // + 128
417 paddw mm3, round_values // + 128
418 psrlw mm1, 8
420 psrlw mm3, 8
421 packuswb mm1, mm3 // des[2]
423 movq QWORD ptr [rsi+rcx*2], mm1 // write des[2]
424 movq mm1, [rdi] // mm1=Src[3];
426 // mm0, mm2 --- Src[2]
427 // mm1 --- Src[3]
428 // mm6 --- 3/5
429 // mm7 for unpacking
431 pmullw mm0, mm6 // c * 3/5
432 movq mm5, two_fifths // mm5 = 2/5
434 movq mm3, mm1 // make a copy
435 pmullw mm2, mm6 // c * 3/5
437 punpcklbw mm1, mm7 // unpack low
438 movq mm4, mm1 // make a copy
440 punpckhbw mm3, mm7 // unpack high
441 pmullw mm4, mm5 // d * 2/5
443 movq mm6, mm3 // make a copy
444 pmullw mm6, mm5 // d * 2/5
446 paddw mm0, mm4 // c * 3/5 + d * 2/5
447 paddw mm2, mm6 // c * 3/5 + d * 2/5
449 paddw mm0, round_values // + 128
450 paddw mm2, round_values // + 128
452 psrlw mm0, 8
453 psrlw mm2, 8
455 packuswb mm0, mm2 // des[3]
456 movq QWORD ptr [rdi], mm0 // write des[3]
458 // mm1, mm3 --- Src[3]
459 // mm7 -- cleared for unpacking
// rdi is line 3, so [rdi+rcx*2] is line 5: the first line of the next band.
461 movq mm0, [rdi+rcx*2] // mm0, Src[0] of the next group
463 movq mm5, four_fifths // mm5 = 4/5
464 pmullw mm1, mm5 // d * 4/5
466 movq mm6, one_fifth // mm6 = 1/5
467 movq mm2, mm0 // make a copy
469 pmullw mm3, mm5 // d * 4/5
470 punpcklbw mm0, mm7 // unpack low
472 pmullw mm0, mm6 // an * 1/5
473 punpckhbw mm2, mm7 // unpack high
475 paddw mm1, mm0 // d * 4/5 + an * 1/5
476 pmullw mm2, mm6 // an * 1/5
478 paddw mm3, mm2 // d * 4/5 + an * 1/5
479 paddw mm1, round_values // + 128
481 paddw mm3, round_values // + 128
482 psrlw mm1, 8
484 psrlw mm3, 8
485 packuswb mm1, mm3 // des[4]
487 movq QWORD ptr [rdi+rcx], mm1 // write des[4]
// Advance both row pointers to the next 8 columns.
489 add rdi, 8
490 add rsi, 8
492 sub rdx, 8
493 jg vs_4_5_loop
497 /****************************************************************************
499 * ROUTINE : last_vertical_band_4_5_scale_mmx
501 * INPUTS : unsigned char *dest :
502 * unsigned int dest_pitch :
503 * unsigned int dest_width :
505 * OUTPUTS : None.
507 * RETURNS : None
509 * FUNCTION : 4 to 5 up-scaling of the last 4-pixel high band in an image.
511 * SPECIAL NOTES : Unlike vertical_band_4_5_scale_mmx, no line from a band
512 * below is read; the last output line is a copy of the fourth
513 * source line. The function also has a "C" only version.
515 ****************************************************************************/
/*
 * Expands the final 4-line band to 5 lines in place, 8 columns per pass.
 *
 * With a, b, c, d the lines at dest + 0..3 * dest_pitch, the routine writes:
 *   line 1 = ( 51*a + 205*b + 128) >> 8
 *   line 2 = (102*b + 154*c + 128) >> 8
 *   line 3 = (154*c + 102*d + 128) >> 8
 *   line 4 = d (copied before line 3 is overwritten)
 * Line 0 is left untouched and nothing below line 4 is read.
 *
 * dest       - first line of the band
 * dest_pitch - byte distance between consecutive lines
 * dest_width - columns to process; consumed in steps of 8 with the test at
 *              the bottom of the loop, so at least 8 columns are handled and
 *              a width that is not a multiple of 8 is rounded up
 */
516 static
517 void last_vertical_band_4_5_scale_mmx
519 unsigned char *dest,
520 unsigned int dest_pitch,
521 unsigned int dest_width
524 __asm
526 mov rsi, dest // Get the source and destination pointer
527 mov ecx, dest_pitch // Get the pitch size
529 lea rdi, [rsi+rcx*2] // two lines below
530 add rdi, rcx // three lines below
532 pxor mm7, mm7 // clear out mm7
533 mov edx, dest_width // Loop counter
535 last_vs_4_5_loop:
537 movq mm0, QWORD ptr [rsi] // src[0];
538 movq mm1, QWORD ptr [rsi+rcx] // src[1];
540 movq mm2, mm0 // Make a copy
541 punpcklbw mm0, mm7 // unpack low to word
543 movq mm5, one_fifth
544 punpckhbw mm2, mm7 // unpack high to word
546 pmullw mm0, mm5 // a * 1/5
548 movq mm3, mm1 // make a copy
549 punpcklbw mm1, mm7 // unpack low to word
551 pmullw mm2, mm5 // a * 1/5
552 movq mm6, four_fifths // constant
554 movq mm4, mm1 // copy of low b
555 pmullw mm4, mm6 // b * 4/5
557 punpckhbw mm3, mm7 // unpack high to word
558 movq mm5, mm3 // copy of high b
560 pmullw mm5, mm6 // b * 4/5
561 paddw mm0, mm4 // a * 1/5 + b * 4/5
563 paddw mm2, mm5 // a * 1/5 + b * 4/5
564 paddw mm0, round_values // + 128
566 paddw mm2, round_values // + 128
567 psrlw mm0, 8
569 psrlw mm2, 8
570 packuswb mm0, mm2 // des [1]
572 movq QWORD ptr [rsi+rcx], mm0 // write des[1]
573 movq mm0, [rsi+rcx*2] // mm0 = src[2]
575 // mm1, mm3 --- Src[1]
576 // mm0 --- Src[2]
577 // mm7 for unpacking
579 movq mm5, two_fifths
580 movq mm2, mm0 // make a copy
582 pmullw mm1, mm5 // b * 2/5
583 movq mm6, three_fifths
586 punpcklbw mm0, mm7 // unpack low to word
587 pmullw mm3, mm5 // b * 2/5
589 movq mm4, mm0 // make copy of c
590 punpckhbw mm2, mm7 // unpack high to word
592 pmullw mm4, mm6 // c * 3/5
593 movq mm5, mm2
595 pmullw mm5, mm6 // c * 3/5
596 paddw mm1, mm4 // b * 2/5 + c * 3/5
598 paddw mm3, mm5 // b * 2/5 + c * 3/5
599 paddw mm1, round_values // + 128
601 paddw mm3, round_values // + 128
602 psrlw mm1, 8
604 psrlw mm3, 8
605 packuswb mm1, mm3 // des[2]
607 movq QWORD ptr [rsi+rcx*2], mm1 // write des[2]
608 movq mm1, [rdi] // mm1=Src[3];
// No following band exists, so line 4 is a straight copy of line 3's source data.
610 movq QWORD ptr [rdi+rcx], mm1 // write des[4];
612 // mm0, mm2 --- Src[2]
613 // mm1 --- Src[3]
614 // mm6 --- 3/5
615 // mm7 for unpacking
617 pmullw mm0, mm6 // c * 3/5
618 movq mm5, two_fifths // mm5 = 2/5
620 movq mm3, mm1 // make a copy
621 pmullw mm2, mm6 // c * 3/5
623 punpcklbw mm1, mm7 // unpack low
624 movq mm4, mm1 // make a copy
626 punpckhbw mm3, mm7 // unpack high
627 pmullw mm4, mm5 // d * 2/5
629 movq mm6, mm3 // make a copy
630 pmullw mm6, mm5 // d * 2/5
632 paddw mm0, mm4 // c * 3/5 + d * 2/5
633 paddw mm2, mm6 // c * 3/5 + d * 2/5
635 paddw mm0, round_values // + 128
636 paddw mm2, round_values // + 128
638 psrlw mm0, 8
639 psrlw mm2, 8
641 packuswb mm0, mm2 // des[3]
642 movq QWORD ptr [rdi], mm0 // write des[3]
644 // mm1, mm3 --- Src[3]
645 // mm7 -- cleared for unpacking
// Advance both row pointers to the next 8 columns.
646 add rdi, 8
647 add rsi, 8
649 sub rdx, 8
650 jg last_vs_4_5_loop
654 /****************************************************************************
656 * ROUTINE : vertical_band_3_5_scale_mmx
658 * INPUTS : unsigned char *dest :
659 * unsigned int dest_pitch :
660 * unsigned int dest_width :
662 * OUTPUTS : None.
664 * RETURNS : void
666 * FUNCTION : 3 to 5 up-scaling of a 3-pixel high band of pixels.
668 * SPECIAL NOTES : The routine uses the first line of the band below
669 * the current band. The function also has an "C" only
670 * version.
672 ****************************************************************************/
/*
 * Expands a 3-line band to 5 lines in place, 8 columns per pass.
 *
 * With a, b, c the lines at dest + 0..2 * dest_pitch and an the line at
 * dest + 5 * dest_pitch, the routine writes:
 *   line 1 = (102*a + 154*b  + 128) >> 8
 *   line 2 = (205*b +  51*c  + 128) >> 8
 *   line 3 = ( 51*b + 205*c  + 128) >> 8
 *   line 4 = (154*c + 102*an + 128) >> 8
 * Line 0 is left untouched.
 *
 * dest       - first line of the band
 * dest_pitch - byte distance between consecutive lines
 * dest_width - columns to process; consumed in steps of 8 with the test at
 *              the bottom of the loop, so at least 8 columns are handled and
 *              a width that is not a multiple of 8 is rounded up
 *
 * mm7 doubles as a scratch register in the middle of the loop and is
 * re-zeroed before the next unpack.
 */
673 static
674 void vertical_band_3_5_scale_mmx
676 unsigned char *dest,
677 unsigned int dest_pitch,
678 unsigned int dest_width
681 __asm
683 mov rsi, dest // Get the source and destination pointer
684 mov ecx, dest_pitch // Get the pitch size
686 lea rdi, [rsi+rcx*2] // two lines below
687 add rdi, rcx // three lines below
689 pxor mm7, mm7 // clear out mm7
690 mov edx, dest_width // Loop counter
692 vs_3_5_loop:
694 movq mm0, QWORD ptr [rsi] // src[0];
695 movq mm1, QWORD ptr [rsi+rcx] // src[1];
697 movq mm2, mm0 // Make a copy
698 punpcklbw mm0, mm7 // unpack low to word
700 movq mm5, two_fifths // mm5 = 2/5
701 punpckhbw mm2, mm7 // unpack high to word
703 pmullw mm0, mm5 // a * 2/5
705 movq mm3, mm1 // make a copy
706 punpcklbw mm1, mm7 // unpack low to word
708 pmullw mm2, mm5 // a * 2/5
709 movq mm6, three_fifths // mm6 = 3/5
711 movq mm4, mm1 // copy of low b
712 pmullw mm4, mm6 // b * 3/5
714 punpckhbw mm3, mm7 // unpack high to word
715 movq mm5, mm3 // copy of high b
717 pmullw mm5, mm6 // b * 3/5
718 paddw mm0, mm4 // a * 2/5 + b * 3/5
720 paddw mm2, mm5 // a * 2/5 + b * 3/5
721 paddw mm0, round_values // + 128
723 paddw mm2, round_values // + 128
724 psrlw mm0, 8
726 psrlw mm2, 8
727 packuswb mm0, mm2 // des [1]
729 movq QWORD ptr [rsi+rcx], mm0 // write des[1]
730 movq mm0, [rsi+rcx*2] // mm0 = src[2]
732 // mm1, mm3 --- Src[1]
733 // mm0 --- Src[2]
734 // mm7 for unpacking
736 movq mm4, mm1 // b low
737 pmullw mm1, four_fifths // b * 4/5 low
739 movq mm5, mm3 // b high
740 pmullw mm3, four_fifths // b * 4/5 high
742 movq mm2, mm0 // c
743 pmullw mm4, one_fifth // b * 1/5
745 punpcklbw mm0, mm7 // c low
746 pmullw mm5, one_fifth // b * 1/5
748 movq mm6, mm0 // make copy of c low
749 punpckhbw mm2, mm7 // c high
751 pmullw mm6, one_fifth // c * 1/5 low
// From here mm7 is reused as scratch and no longer holds zero.
752 movq mm7, mm2 // make copy of c high
754 pmullw mm7, one_fifth // c * 1/5 high
755 paddw mm1, mm6 // b * 4/5 + c * 1/5 low
757 paddw mm3, mm7 // b * 4/5 + c * 1/5 high
758 movq mm6, mm0 // make copy of c low
760 pmullw mm6, four_fifths // c * 4/5 low
761 movq mm7, mm2 // make copy of c high
763 pmullw mm7, four_fifths // c * 4/5 high
765 paddw mm4, mm6 // b * 1/5 + c * 4/5 low
766 paddw mm5, mm7 // b * 1/5 + c * 4/5 high
768 paddw mm1, round_values // + 128
769 paddw mm3, round_values // + 128
771 psrlw mm1, 8
772 psrlw mm3, 8
774 packuswb mm1, mm3 // des[2]
775 movq QWORD ptr [rsi+rcx*2], mm1 // write des[2]
777 paddw mm4, round_values // + 128
778 paddw mm5, round_values // + 128
780 psrlw mm4, 8
781 psrlw mm5, 8
783 packuswb mm4, mm5 // des[3]
784 movq QWORD ptr [rdi], mm4 // write des[3]
// mm0/mm2 still hold the unpacked line that was loaded from [rsi+rcx*2];
// the "d" in the comments below refers to that line.
786 // mm0, mm2 --- Src[3]
// Restores the zero needed by the unpacks below and at the top of the loop.
788 pxor mm7, mm7 // clear mm7 for unpacking
789 movq mm1, [rdi+rcx*2] // mm1 = Src[0] of the next group
791 movq mm5, three_fifths // mm5 = 3/5
792 pmullw mm0, mm5 // d * 3/5
794 movq mm6, two_fifths // mm6 = 2/5
795 movq mm3, mm1 // make a copy
797 pmullw mm2, mm5 // d * 3/5
798 punpcklbw mm1, mm7 // unpack low
800 pmullw mm1, mm6 // an * 2/5
801 punpckhbw mm3, mm7 // unpack high
803 paddw mm0, mm1 // d * 3/5 + an * 2/5
804 pmullw mm3, mm6 // an * 2/5
806 paddw mm2, mm3 // d * 3/5 + an * 2/5
807 paddw mm0, round_values // + 128
809 paddw mm2, round_values // + 128
810 psrlw mm0, 8
812 psrlw mm2, 8
813 packuswb mm0, mm2 // des[4]
815 movq QWORD ptr [rdi+rcx], mm0 // write des[4]
// Advance both row pointers to the next 8 columns.
817 add rdi, 8
818 add rsi, 8
820 sub rdx, 8
821 jg vs_3_5_loop
825 /****************************************************************************
827 * ROUTINE : last_vertical_band_3_5_scale_mmx
829 * INPUTS : unsigned char *dest :
830 * unsigned int dest_pitch :
831 * unsigned int dest_width :
833 * OUTPUTS : None.
835 * RETURNS : void
837 * FUNCTION : 3 to 5 up-scaling of the last 3-pixel high band in an image.
839 * SPECIAL NOTES : Unlike vertical_band_3_5_scale_mmx, no line from a band
840 * below is read; the last output line is a copy of the third
841 * source line. The function also has a "C" only version.
843 ****************************************************************************/
/*
 * Expands the final 3-line band to 5 lines in place, 8 columns per pass.
 *
 * With a, b, c the lines at dest + 0..2 * dest_pitch, the routine writes:
 *   line 1 = (102*a + 154*b + 128) >> 8
 *   line 2 = (205*b +  51*c + 128) >> 8
 *   line 3 = ( 51*b + 205*c + 128) >> 8
 *   line 4 = c (copied before line 2 is overwritten)
 * Line 0 is left untouched and nothing below line 4 is read.
 *
 * dest       - first line of the band
 * dest_pitch - byte distance between consecutive lines
 * dest_width - columns to process; consumed in steps of 8 with the test at
 *              the bottom of the loop, so at least 8 columns are handled and
 *              a width that is not a multiple of 8 is rounded up
 *
 * mm7 doubles as a scratch register in the middle of the loop, so it is
 * re-zeroed at the bottom of each pass; without that, every pass after the
 * first would unpack bytes against stale data instead of zero.
 */
844 static
845 void last_vertical_band_3_5_scale_mmx
847 unsigned char *dest,
848 unsigned int dest_pitch,
849 unsigned int dest_width
852 __asm
854 mov rsi, dest // Get the source and destination pointer
855 mov ecx, dest_pitch // Get the pitch size
857 lea rdi, [rsi+rcx*2] // two lines below
858 add rdi, rcx // three lines below
860 pxor mm7, mm7 // clear out mm7
861 mov edx, dest_width // Loop counter
864 last_vs_3_5_loop:
866 movq mm0, QWORD ptr [rsi] // src[0];
867 movq mm1, QWORD ptr [rsi+rcx] // src[1];
869 movq mm2, mm0 // Make a copy
870 punpcklbw mm0, mm7 // unpack low to word
872 movq mm5, two_fifths // mm5 = 2/5
873 punpckhbw mm2, mm7 // unpack high to word
875 pmullw mm0, mm5 // a * 2/5
877 movq mm3, mm1 // make a copy
878 punpcklbw mm1, mm7 // unpack low to word
880 pmullw mm2, mm5 // a * 2/5
881 movq mm6, three_fifths // mm6 = 3/5
883 movq mm4, mm1 // copy of low b
884 pmullw mm4, mm6 // b * 3/5
886 punpckhbw mm3, mm7 // unpack high to word
887 movq mm5, mm3 // copy of high b
889 pmullw mm5, mm6 // b * 3/5
890 paddw mm0, mm4 // a * 2/5 + b * 3/5
892 paddw mm2, mm5 // a * 2/5 + b * 3/5
893 paddw mm0, round_values // + 128
895 paddw mm2, round_values // + 128
896 psrlw mm0, 8
898 psrlw mm2, 8
899 packuswb mm0, mm2 // des [1]
901 movq QWORD ptr [rsi+rcx], mm0 // write des[1]
902 movq mm0, [rsi+rcx*2] // mm0 = src[2]
906 // mm1, mm3 --- Src[1]
907 // mm0 --- Src[2]
908 // mm7 for unpacking
910 movq mm4, mm1 // b low
911 pmullw mm1, four_fifths // b * 4/5 low
// No following band exists, so line 4 is a straight copy of line 2's source data.
913 movq QWORD ptr [rdi+rcx], mm0 // write des[4]
915 movq mm5, mm3 // b high
916 pmullw mm3, four_fifths // b * 4/5 high
918 movq mm2, mm0 // c
919 pmullw mm4, one_fifth // b * 1/5
921 punpcklbw mm0, mm7 // c low
922 pmullw mm5, one_fifth // b * 1/5
924 movq mm6, mm0 // make copy of c low
925 punpckhbw mm2, mm7 // c high
927 pmullw mm6, one_fifth // c * 1/5 low
// From here mm7 is reused as scratch and no longer holds zero.
928 movq mm7, mm2 // make copy of c high
930 pmullw mm7, one_fifth // c * 1/5 high
931 paddw mm1, mm6 // b * 4/5 + c * 1/5 low
933 paddw mm3, mm7 // b * 4/5 + c * 1/5 high
934 movq mm6, mm0 // make copy of c low
936 pmullw mm6, four_fifths // c * 4/5 low
937 movq mm7, mm2 // make copy of c high
939 pmullw mm7, four_fifths // c * 4/5 high
941 paddw mm4, mm6 // b * 1/5 + c * 4/5 low
942 paddw mm5, mm7 // b * 1/5 + c * 4/5 high
944 paddw mm1, round_values // + 128
945 paddw mm3, round_values // + 128
947 psrlw mm1, 8
948 psrlw mm3, 8
950 packuswb mm1, mm3 // des[2]
951 movq QWORD ptr [rsi+rcx*2], mm1 // write des[2]
953 paddw mm4, round_values // + 128
954 paddw mm5, round_values // + 128
956 psrlw mm4, 8
957 psrlw mm5, 8
959 packuswb mm4, mm5 // des[3]
960 movq QWORD ptr [rdi], mm4 // write des[3]
962 // mm0, mm2 --- Src[3]
// FIX: mm7 was clobbered above; restore the zero that the unpack
// instructions at the top of the loop rely on (vertical_band_3_5_scale_mmx
// does the same before its final blend).
pxor mm7, mm7 // clear mm7 for unpacking
964 add rdi, 8
965 add rsi, 8
967 sub rdx, 8
968 jg last_vs_3_5_loop
972 /****************************************************************************
974 * ROUTINE : vertical_band_1_2_scale_mmx
976 * INPUTS : unsigned char *dest :
977 * unsigned int dest_pitch :
978 * unsigned int dest_width :
980 * OUTPUTS : None.
982 * RETURNS : void
984 * FUNCTION : 1 to 2 up-scaling of a band of pixels.
986 * SPECIAL NOTES : The routine uses the first line of the band below
987 * the current band. The function also has an "C" only
988 * version.
990 ****************************************************************************/
/*
 * Fills the line between two existing lines with their rounded average,
 * 8 columns per pass.
 *
 * Reads the lines at dest and dest + 2 * dest_pitch and writes
 *   line 1 = (line0 + line2 + 1) >> 1
 * to dest + dest_pitch. The two lines read are not modified.
 *
 * dest       - upper of the two lines being averaged
 * dest_pitch - byte distance between consecutive lines
 * dest_width - columns to process; consumed in steps of 8 with the test at
 *              the bottom of the loop, so at least 8 columns are handled and
 *              a width that is not a multiple of 8 is rounded up
 */
991 static
992 void vertical_band_1_2_scale_mmx
994 unsigned char *dest,
995 unsigned int dest_pitch,
996 unsigned int dest_width
999 __asm
1002 mov rsi, dest // Get the source and destination pointer
1003 mov ecx, dest_pitch // Get the pitch size
1005 pxor mm7, mm7 // clear out mm7
1006 mov edx, dest_width // Loop counter
1008 vs_1_2_loop:
1010 movq mm0, [rsi] // get Src[0]
// "Src[1]" is the next source line, which sits two pitches down.
1011 movq mm1, [rsi + rcx * 2] // get Src[1]
1013 movq mm2, mm0 // make copy before unpack
1014 movq mm3, mm1 // make copy before unpack
1016 punpcklbw mm0, mm7 // low Src[0]
1017 movq mm6, four_ones // mm6= 1, 1, 1, 1
1019 punpcklbw mm1, mm7 // low Src[1]
1020 paddw mm0, mm1 // low (a + b)
1022 punpckhbw mm2, mm7 // high Src[0]
1023 paddw mm0, mm6 // low (a + b + 1)
1025 punpckhbw mm3, mm7
1026 paddw mm2, mm3 // high (a + b )
1028 psraw mm0, 1 // low (a + b +1 )/2
1029 paddw mm2, mm6 // high (a + b + 1)
1031 psraw mm2, 1 // high (a + b + 1)/2
1032 packuswb mm0, mm2 // pack results
// The averaged line goes into the slot between the two lines read above.
1034 movq [rsi+rcx], mm0 // write out eight bytes
1035 add rsi, 8
1037 sub rdx, 8
1038 jg vs_1_2_loop
1043 /****************************************************************************
1045 * ROUTINE : last_vertical_band_1_2_scale_mmx
1047 * INPUTS : unsigned char *dest :
1048 * unsigned int dest_pitch :
1049 * unsigned int dest_width :
1051 * OUTPUTS : None.
1053 * RETURNS : void
1055 * FUNCTION : 1 to 2 up-scaling of the last band of pixels.
1057 * SPECIAL NOTES : No line from a band below is read; the line at dest is
1058 * simply duplicated into the line beneath it. The function
1059 * also has a "C" only version.
1061 ****************************************************************************/
/*
 * Duplicates the line at dest into the line at dest + dest_pitch,
 * 8 bytes per pass.
 *
 * dest       - line to copy from
 * dest_pitch - byte distance to the line that receives the copy
 * dest_width - bytes to copy; consumed in steps of 8 with the test at the
 *              bottom of the loop, so at least 8 bytes are copied and a
 *              width that is not a multiple of 8 is rounded up
 */
1062 static
1063 void last_vertical_band_1_2_scale_mmx
1065 unsigned char *dest,
1066 unsigned int dest_pitch,
1067 unsigned int dest_width
1070 __asm
1072 mov rsi, dest // Get the source and destination pointer
1073 mov ecx, dest_pitch // Get the pitch size
1075 mov edx, dest_width // Loop counter
1077 last_vs_1_2_loop:
1079 movq mm0, [rsi] // get Src[0]
1080 movq [rsi+rcx], mm0 // write out eight bytes
1082 add rsi, 8
1083 sub rdx, 8
1085 jg last_vs_1_2_loop
1089 /****************************************************************************
1091 * ROUTINE : horizontal_line_1_2_scale_mmx
1093 * INPUTS : const unsigned char *source :
1094 * unsigned int source_width :
1095 * unsigned char *dest :
1096 * unsigned int dest_width :
1098 * OUTPUTS : None.
1100 * RETURNS : void
1102 * FUNCTION : 1 to 2 up-scaling of a horizontal line of pixels.
1104 * SPECIAL NOTES : None.
1106 ****************************************************************************/
/*
 * Expands one row of pixels horizontally by 1:2 with MMX arithmetic.
 *
 * For every source pixel p[i] two bytes are produced:
 *   out[2i]   = p[i]
 *   out[2i+1] = (p[i] + p[i+1] + 1) >> 1
 * Eight source pixels (16 output bytes) are handled per pass. The final
 * eight pixels are handled after the loop, with the last pixel duplicated
 * in place of the missing right-hand neighbour.
 *
 * source       - first pixel of the input row
 * source_width - number of input pixels; the loop body runs once before the
 *                remaining count is tested and the tail always runs, so at
 *                least 16 pixels are processed
 * dest         - output row, 16 bytes written per pass
 * dest_width   - unused
 */
1107 static
1108 void horizontal_line_1_2_scale_mmx
1110 const unsigned char *source,
1111 unsigned int source_width,
1112 unsigned char *dest,
1113 unsigned int dest_width
1116 (void) dest_width;
1118 __asm
1120 mov rsi, source
1121 mov rdi, dest
// mm7 = 0 for unpacking, mm6 = rounding term for the averages.
1123 pxor mm7, mm7
1124 movq mm6, four_ones
1126 mov ecx, source_width
1128 hs_1_2_loop:
// mm0 = p[0..7], mm1 = p[1..8] (9 source bytes are read per pass).
1130 movq mm0, [rsi]
1131 movq mm1, [rsi+1]
1133 movq mm2, mm0
1134 movq mm3, mm1
// mm4 keeps the untouched pixels for the interleave below.
1136 movq mm4, mm0
1137 punpcklbw mm0, mm7
1139 punpcklbw mm1, mm7
1140 paddw mm0, mm1
1142 paddw mm0, mm6
1143 punpckhbw mm2, mm7
1145 punpckhbw mm3, mm7
1146 paddw mm2, mm3
1148 paddw mm2, mm6
1149 psraw mm0, 1
1151 psraw mm2, 1
// mm0 = the eight rounded averages, packed back to bytes.
1152 packuswb mm0, mm2
// Interleave original pixels with their averages: low half, then high half.
1154 movq mm2, mm4
1155 punpcklbw mm2, mm0
1157 movq [rdi], mm2
1158 punpckhbw mm4, mm0
1160 movq [rdi+8], mm4
1161 add rsi, 8
1163 add rdi, 16
1164 sub rcx, 8
// Continue only while more than eight pixels remain; the last eight are done below.
1166 cmp rcx, 8
1167 jg hs_1_2_loop
1169 // last eight pixel
1171 movq mm0, [rsi]
1172 movq mm1, mm0
1174 movq mm2, mm0
1175 movq mm3, mm1
// Build p[1..7],p[7]: shift the row down one byte and put the isolated
// top byte back into the vacated top position.
1177 psrlq mm1, 8
1178 psrlq mm3, 56
1180 psllq mm3, 56
1181 por mm1, mm3
1183 movq mm3, mm1
1184 movq mm4, mm0
1186 punpcklbw mm0, mm7
1187 punpcklbw mm1, mm7
1189 paddw mm0, mm1
1190 paddw mm0, mm6
1192 punpckhbw mm2, mm7
1193 punpckhbw mm3, mm7
1195 paddw mm2, mm3
1196 paddw mm2, mm6
1198 psraw mm0, 1
1199 psraw mm2, 1
1201 packuswb mm0, mm2
1202 movq mm2, mm4
1204 punpcklbw mm2, mm0
1205 movq [rdi], mm2
1207 punpckhbw mm4, mm0
1208 movq [rdi+8], mm4
/*
 * Per-lane weights (value/256) for the 5:4 horizontal scaler: const54_1
 * multiplies pixels p0..p3 and const54_2 multiplies p1..p4, so each lane
 * pair sums to 256.
 */
1216 __declspec(align(16)) const static unsigned short const54_2[] = { 0, 64, 128, 192 };
1217 __declspec(align(16)) const static unsigned short const54_1[] = {256, 192, 128, 64 };
1220 /****************************************************************************
1222 * ROUTINE : horizontal_line_5_4_scale_mmx
1224 * INPUTS : const unsigned char *source : Pointer to source data.
1225 * unsigned int source_width : Width of source line in pixels.
1226 * unsigned char *dest : Pointer to destination data.
1227 * unsigned int dest_width : Width of destination line (NOT USED).
1229 * OUTPUTS : None.
1231 * RETURNS : void
1233 * FUNCTION : Copies horizontal line of pixels from source to
1234 * destination scaling down by 5 to 4.
1236 * SPECIAL NOTES : None.
1238 ****************************************************************************/
/*
 * Shrinks one row of pixels horizontally by 5:4.
 *
 * For each group of five source pixels a b c d e four bytes are produced:
 *   out0 = a
 *   out1 = (192*b +  64*c + 128) >> 8
 *   out2 = (128*c + 128*d + 128) >> 8
 *   out3 = ( 64*d + 192*e + 128) >> 8
 *
 * source       - first pixel of the input row
 * source_width - number of input pixels, consumed five at a time
 * dest         - output row, 4 bytes written per group
 * dest_width   - unused
 *
 * NOTE(review): as it appears here, the C loop and the __asm block compute
 * the same outputs over the same buffers. The C version may have been
 * commented out in the original file - confirm against the repository.
 */
1239 static
1240 void horizontal_line_5_4_scale_mmx
1242 const unsigned char *source,
1243 unsigned int source_width,
1244 unsigned char *dest,
1245 unsigned int dest_width
1249 unsigned i;
1250 unsigned int a, b, c, d, e;
1251 unsigned char *des = dest;
1252 const unsigned char *src = source;
1254 (void) dest_width;
// Reference implementation: one group of five pixels per iteration.
1256 for ( i=0; i<source_width; i+=5 )
1258 a = src[0];
1259 b = src[1];
1260 c = src[2];
1261 d = src[3];
1262 e = src[4];
1264 des[0] = a;
1265 des[1] = ((b*192 + c* 64 + 128)>>8);
1266 des[2] = ((c*128 + d*128 + 128)>>8);
1267 des[3] = ((d* 64 + e*192 + 128)>>8);
1269 src += 5;
1270 des += 4;
// MMX version of the same computation.
// NOTE(review): the bare "00 01 02 ..." lines below look like byte-layout
// annotations that were separated from the trailing ';' of the instruction
// above them; they are not instructions - confirm and fold them back into
// comments.
1273 __asm
1276 mov rsi, source ;
1277 mov rdi, dest ;
1279 mov ecx, source_width ;
1280 movq mm5, const54_1 ;
1282 pxor mm7, mm7 ;
1283 movq mm6, const54_2 ;
1285 movq mm4, round_values ;
// rdx = source + source_width: loop bound (the loop stops once rsi reaches it).
1286 lea rdx, [rsi+rcx] ;
1287 horizontal_line_5_4_loop:
// 8 bytes are loaded for every 5-pixel group, so the load for the final
// group covers 3 bytes past it.
1289 movq mm0, QWORD PTR [rsi] ;
1290 00 01 02 03 04 05 06 07
1291 movq mm1, mm0 ;
1292 00 01 02 03 04 05 06 07
1294 psrlq mm0, 8 ;
1295 01 02 03 04 05 06 07 xx
1296 punpcklbw mm1, mm7 ;
1297 xx 00 xx 01 xx 02 xx 03
1299 punpcklbw mm0, mm7 ;
1300 xx 01 xx 02 xx 03 xx 04
// mm1 = (p0..p3) * const54_1, mm0 = (p1..p4) * const54_2
1301 pmullw mm1, mm5
1303 pmullw mm0, mm6
1304 add rsi, 5
1306 add rdi, 4
1307 paddw mm1, mm0
1309 paddw mm1, mm4
1310 psrlw mm1, 8
1312 cmp rsi, rdx
1313 packuswb mm1, mm7
// rdi was already advanced by 4, so [rdi-4] is this group's output slot.
1315 movd DWORD PTR [rdi-4], mm1
1317 jl horizontal_line_5_4_loop
/*
 * Uniform blend weights in 8-bit fixed point (value/256), four 16-bit lanes
 * each: 64 = 1/4, 128 = 2/4, 192 = 3/4.
 */
1322 __declspec(align(16)) const static unsigned short one_fourths[] = { 64, 64, 64, 64 };
1323 __declspec(align(16)) const static unsigned short two_fourths[] = { 128, 128, 128, 128 };
1324 __declspec(align(16)) const static unsigned short three_fourths[] = { 192, 192, 192, 192 };
/*
 * Shrinks a 5-line source band to 4 destination lines, 4 columns per pass.
 *
 * With a, b, c, d, e the source lines at source + 0..4 * src_pitch, the
 * destination lines at dest + 0..3 * dest_pitch receive:
 *   line 0 = a
 *   line 1 = (192*b +  64*c + 128) >> 8
 *   line 2 = (128*c + 128*d + 128) >> 8
 *   line 3 = ( 64*d + 192*e + 128) >> 8
 *
 * source     - first line of the source band
 * src_pitch  - byte distance between consecutive source lines
 * dest       - first destination line
 * dest_pitch - byte distance between consecutive destination lines
 * dest_width - columns to process; consumed in steps of 4 with the test at
 *              the bottom of the loop, so at least 4 columns are handled and
 *              a width that is not a multiple of 4 is rounded up
 *
 * NOTE(review): rbx is used as the loop counter without an explicit
 * push/pop, unlike horizontal_line_3_5_scale_mmx - confirm that the
 * toolchain preserves it around __asm blocks.
 */
1326 static
1327 void vertical_band_5_4_scale_mmx
1329 unsigned char *source,
1330 unsigned int src_pitch,
1331 unsigned char *dest,
1332 unsigned int dest_pitch,
1333 unsigned int dest_width
1337 __asm
1340 mov rsi, source // Get the source and destination pointer
1341 mov ecx, src_pitch // Get the pitch size
1343 mov rdi, dest // destination pointer
1344 pxor mm7, mm7 // clear out mm7
1346 mov edx, dest_pitch // destination pitch
// ebx/rbx counts the remaining columns.
1347 mov ebx, dest_width
1349 vs_5_4_loop:
1351 movd mm0, DWORD ptr [rsi] // src[0];
1352 movd mm1, DWORD ptr [rsi+rcx] // src[1];
1354 movd mm2, DWORD ptr [rsi+rcx*2]
// rax = source line 2, used as the base for reaching lines 3 and 4.
1355 lea rax, [rsi+rcx*2] //
1357 punpcklbw mm1, mm7
1358 punpcklbw mm2, mm7
1360 movq mm3, mm2
1361 pmullw mm1, three_fourths
1363 pmullw mm2, one_fourths
1364 movd mm4, [rax+rcx]
1366 pmullw mm3, two_fourths
1367 punpcklbw mm4, mm7
1369 movq mm5, mm4
1370 pmullw mm4, two_fourths
1372 paddw mm1, mm2
1373 movd mm6, [rax+rcx*2]
1375 pmullw mm5, one_fourths
1376 paddw mm1, round_values;
1378 paddw mm3, mm4
1379 psrlw mm1, 8
1381 punpcklbw mm6, mm7
1382 paddw mm3, round_values
1384 pmullw mm6, three_fourths
1385 psrlw mm3, 8
1387 packuswb mm1, mm7
1388 packuswb mm3, mm7
// Destination line 0 is source line 0 unchanged.
1390 movd DWORD PTR [rdi], mm0
1391 movd DWORD PTR [rdi+rdx], mm1
1394 paddw mm5, mm6
1395 movd DWORD PTR [rdi+rdx*2], mm3
// rax = destination line 2, captured before rdi moves to the next columns.
1397 lea rax, [rdi+rdx*2]
1398 paddw mm5, round_values
1400 psrlw mm5, 8
1401 add rdi, 4
1403 packuswb mm5, mm7
// Destination line 3 for the columns just processed.
1404 movd DWORD PTR [rax+rdx], mm5
1406 add rsi, 4
1407 sub rbx, 4
1409 jg vs_5_4_loop
/*
 * Per-lane weights (value/256) for the 5:3 horizontal scaler: const53_2
 * multiplies the even pixels p0 p2 p4 p6 and const53_1 multiplies the lanes
 * 0 p1 p3 p5. 85 ~ 1/3 and 171 ~ 2/3; the fourth lane is unused (weight 0).
 */
1414 __declspec(align(16)) const static unsigned short const53_1[] = { 0, 85, 171, 0 };
1415 __declspec(align(16)) const static unsigned short const53_2[] = {256, 171, 85, 0 };
/*
 * Shrinks one row of pixels horizontally by 5:3 with MMX arithmetic.
 *
 * For each group of five source pixels p0..p4 three bytes are produced:
 *   out0 = p0
 *   out1 = ( 85*p1 + 171*p2 + 128) >> 8
 *   out2 = (171*p3 +  85*p4 + 128) >> 8
 * Inside the loop a fourth (zero) byte is stored as well and is overwritten
 * by the next group; the final group, handled after the loop, stores exactly
 * three bytes.
 *
 * source       - first pixel of the input row
 * source_width - number of input pixels; the loop body runs once before its
 *                bound is tested and the tail group always runs, so at
 *                least two groups are processed
 * dest         - output row, 3 bytes produced per group
 * dest_width   - unused (not referenced anywhere in the body)
 *
 * Each group is fetched with an 8-byte load, so the load for the final
 * group covers 3 bytes past it.
 *
 * NOTE(review): the bare "00 01 02 ..." lines below look like byte-layout
 * annotations that were separated from the trailing ';' of the instruction
 * above them; they are not instructions - confirm and fold them back into
 * comments.
 */
1418 static
1419 void horizontal_line_5_3_scale_mmx
1421 const unsigned char *source,
1422 unsigned int source_width,
1423 unsigned char *dest,
1424 unsigned int dest_width
1427 __asm
1430 mov rsi, source ;
1431 mov rdi, dest ;
1433 mov ecx, source_width ;
1434 movq mm5, const53_1 ;
1436 pxor mm7, mm7 ;
1437 movq mm6, const53_2 ;
1439 movq mm4, round_values ;
// rdx = source + source_width - 5: loop bound (the main loop stops once rsi reaches it).
1440 lea rdx, [rsi+rcx-5] ;
1441 horizontal_line_5_3_loop:
1443 movq mm0, QWORD PTR [rsi] ;
1444 00 01 02 03 04 05 06 07
1445 movq mm1, mm0 ;
1446 00 01 02 03 04 05 06 07
// Split into even pixels (mm0) and odd pixels (mm1) as 16-bit lanes, then
// shift the odd lanes up one position so p1 lines up with p2 and p3 with p4.
1448 psllw mm0, 8 ;
1449 xx 00 xx 02 xx 04 xx 06
1450 psrlw mm1, 8 ;
1451 01 xx 03 xx 05 xx 07 xx
1453 psrlw mm0, 8 ;
1454 00 xx 02 xx 04 xx 06 xx
1455 psllq mm1, 16 ;
1456 xx xx 01 xx 03 xx 05 xx
1458 pmullw mm0, mm6
1460 pmullw mm1, mm5
1461 add rsi, 5
1463 add rdi, 3
1464 paddw mm1, mm0
1466 paddw mm1, mm4
1467 psrlw mm1, 8
1469 cmp rsi, rdx
1470 packuswb mm1, mm7
// 4-byte store at the group's slot; the fourth byte is rewritten by the next group.
1472 movd DWORD PTR [rdi-3], mm1
1473 jl horizontal_line_5_3_loop
1475 //exit condition
// Final group: identical arithmetic, but only three bytes are stored.
1476 movq mm0, QWORD PTR [rsi] ;
1477 00 01 02 03 04 05 06 07
1478 movq mm1, mm0 ;
1479 00 01 02 03 04 05 06 07
1481 psllw mm0, 8 ;
1482 xx 00 xx 02 xx 04 xx 06
1483 psrlw mm1, 8 ;
1484 01 xx 03 xx 05 xx 07 xx
1486 psrlw mm0, 8 ;
1487 00 xx 02 xx 04 xx 06 xx
1488 psllq mm1, 16 ;
1489 xx xx 01 xx 03 xx 05 xx
1491 pmullw mm0, mm6
1493 pmullw mm1, mm5
1494 paddw mm1, mm0
1496 paddw mm1, mm4
1497 psrlw mm1, 8
1499 packuswb mm1, mm7
1500 movd rax, mm1
// ax carries out0/out1; dl (after the shift) carries out2.
1502 mov rdx, rax
1503 shr rdx, 16
1505 mov WORD PTR[rdi], ax
1506 mov BYTE PTR[rdi+2], dl
1512 __declspec(align(16)) const static unsigned short one_thirds[] = { 85, 85, 85, 85 };
1513 __declspec(align(16)) const static unsigned short two_thirds[] = { 171, 171, 171, 171 };
1515 static
1516 void vertical_band_5_3_scale_mmx
1518 unsigned char *source,
1519 unsigned int src_pitch,
1520 unsigned char *dest,
1521 unsigned int dest_pitch,
1522 unsigned int dest_width
1526 __asm
1529 mov rsi, source // Get the source and destination pointer
1530 mov ecx, src_pitch // Get the pitch size
1532 mov rdi, dest // tow lines below
1533 pxor mm7, mm7 // clear out mm7
1535 mov edx, dest_pitch // Loop counter
1536 movq mm5, one_thirds
1538 movq mm6, two_thirds
1539 mov ebx, dest_width;
1541 vs_5_3_loop:
1543 movd mm0, DWORD ptr [rsi] // src[0];
1544 movd mm1, DWORD ptr [rsi+rcx] // src[1];
1546 movd mm2, DWORD ptr [rsi+rcx*2]
1547 lea rax, [rsi+rcx*2] //
1549 punpcklbw mm1, mm7
1550 punpcklbw mm2, mm7
1552 pmullw mm1, mm5
1553 pmullw mm2, mm6
1555 movd mm3, DWORD ptr [rax+rcx]
1556 movd mm4, DWORD ptr [rax+rcx*2]
1558 punpcklbw mm3, mm7
1559 punpcklbw mm4, mm7
1561 pmullw mm3, mm6
1562 pmullw mm4, mm5
1565 movd DWORD PTR [rdi], mm0
1566 paddw mm1, mm2
1568 paddw mm1, round_values
1569 psrlw mm1, 8
1571 packuswb mm1, mm7
1572 paddw mm3, mm4
1574 paddw mm3, round_values
1575 movd DWORD PTR [rdi+rdx], mm1
1577 psrlw mm3, 8
1578 packuswb mm3, mm7
1580 movd DWORD PTR [rdi+rdx*2], mm3
1583 add rdi, 4
1584 add rsi, 4
1586 sub rbx, 4
1587 jg vs_5_3_loop
1594 /****************************************************************************
1596 * ROUTINE : horizontal_line_2_1_scale
1598 * INPUTS : const unsigned char *source :
1599 * unsigned int source_width :
1600 * unsigned char *dest :
1601 * unsigned int dest_width :
1603 * OUTPUTS : None.
1605 * RETURNS : void
1607 * FUNCTION : 1 to 2 up-scaling of a horizontal line of pixels.
1609 * SPECIAL NOTES : None.
1611 ****************************************************************************/
1612 static
1613 void horizontal_line_2_1_scale_mmx
1615 const unsigned char *source,
1616 unsigned int source_width,
1617 unsigned char *dest,
1618 unsigned int dest_width
1621 (void) dest_width;
1623 __asm
1625 mov rsi, source
1626 mov rdi, dest
1628 pxor mm7, mm7
1629 mov ecx, dest_width
1631 xor rdx, rdx
1632 hs_2_1_loop:
1634 movq mm0, [rsi+rdx*2]
1635 psllw mm0, 8
1637 psrlw mm0, 8
1638 packuswb mm0, mm7
1640 movd DWORD Ptr [rdi+rdx], mm0;
1641 add rdx, 4
1643 cmp rdx, rcx
1644 jl hs_2_1_loop
/****************************************************************************
 *
 *  ROUTINE       : vertical_band_2_1_scale_mmx
 *
 *  INPUTS        : unsigned char *source   : Source row.
 *                  unsigned int src_pitch  : Not used.
 *                  unsigned char *dest     : Destination row.
 *                  unsigned int dest_pitch : Not used.
 *                  unsigned int dest_width : Number of bytes to copy.
 *
 *  OUTPUTS       : None.
 *
 *  RETURNS       : void
 *
 *  FUNCTION      : Copies dest_width bytes from source to dest with
 *                  vpx_memcpy; no filtering is applied.
 *
 *  SPECIAL NOTES : None.
 *
 ****************************************************************************/
static
void vertical_band_2_1_scale_mmx
(
    unsigned char *source,
    unsigned int src_pitch,
    unsigned char *dest,
    unsigned int dest_pitch,
    unsigned int dest_width
)
{
    (void) src_pitch;
    (void) dest_pitch;

    vpx_memcpy(dest, source, dest_width);
}
1664 __declspec(align(16)) const static unsigned short three_sixteenths[] = { 48, 48, 48, 48 };
1665 __declspec(align(16)) const static unsigned short ten_sixteenths[] = { 160, 160, 160, 160 };
1667 static
1668 void vertical_band_2_1_scale_i_mmx
1670 unsigned char *source,
1671 unsigned int src_pitch,
1672 unsigned char *dest,
1673 unsigned int dest_pitch,
1674 unsigned int dest_width
1677 __asm
1679 mov rsi, source
1680 mov rdi, dest
1682 mov eax, src_pitch
1683 mov edx, dest_width
1685 pxor mm7, mm7
1686 sub rsi, rax //back one line
1689 lea rcx, [rsi+rdx];
1690 movq mm6, round_values;
1692 movq mm5, three_sixteenths;
1693 movq mm4, ten_sixteenths;
1695 vs_2_1_i_loop:
1696 movd mm0, [rsi] //
1697 movd mm1, [rsi+rax] //
1699 movd mm2, [rsi+rax*2] //
1700 punpcklbw mm0, mm7
1702 pmullw mm0, mm5
1703 punpcklbw mm1, mm7
1705 pmullw mm1, mm4
1706 punpcklbw mm2, mm7
1708 pmullw mm2, mm5
1709 paddw mm0, round_values
1711 paddw mm1, mm2
1712 paddw mm0, mm1
1714 psrlw mm0, 8
1715 packuswb mm0, mm7
1717 movd DWORD PTR [rdi], mm0
1718 add rsi, 4
1720 add rdi, 4;
1721 cmp rsi, rcx
1722 jl vs_2_1_i_loop
1729 void
1730 register_mmxscalers(void)
1732 vp8_horizontal_line_1_2_scale = horizontal_line_1_2_scale_mmx;
1733 vp8_horizontal_line_3_5_scale = horizontal_line_3_5_scale_mmx;
1734 vp8_horizontal_line_4_5_scale = horizontal_line_4_5_scale_mmx;
1735 vp8_vertical_band_1_2_scale = vertical_band_1_2_scale_mmx;
1736 vp8_last_vertical_band_1_2_scale = last_vertical_band_1_2_scale_mmx;
1737 vp8_vertical_band_3_5_scale = vertical_band_3_5_scale_mmx;
1738 vp8_last_vertical_band_3_5_scale = last_vertical_band_3_5_scale_mmx;
1739 vp8_vertical_band_4_5_scale = vertical_band_4_5_scale_mmx;
1740 vp8_last_vertical_band_4_5_scale = last_vertical_band_4_5_scale_mmx;
1742 vp8_vertical_band_5_4_scale = vertical_band_5_4_scale_mmx;
1743 vp8_vertical_band_5_3_scale = vertical_band_5_3_scale_mmx;
1744 vp8_vertical_band_2_1_scale = vertical_band_2_1_scale_mmx;
1745 vp8_vertical_band_2_1_scale_i = vertical_band_2_1_scale_i_mmx;
1746 vp8_horizontal_line_2_1_scale = horizontal_line_2_1_scale_mmx;
1747 vp8_horizontal_line_5_3_scale = horizontal_line_5_3_scale_mmx;
1748 vp8_horizontal_line_5_4_scale = horizontal_line_5_4_scale_mmx;