; Initial WebM release
; [libvpx.git] / vp8 / common / x86 / subpixel_mmx.asm
; blob c5021181387f0a5d7a5729bf7c9bbf44f0ee034c
;
;  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license and patent
;  grant that can be found in the LICENSE file in the root of the source
;  tree. All contributing project authors may be found in the AUTHORS
;  file in the root of the source tree.
;
%include "vpx_ports/x86_abi_support.asm"


; Constants for this file. Only VP8_FILTER_SHIFT is referenced by the
; routines visible here; the other two are kept for compatibility.
%define BLOCK_HEIGHT_WIDTH 4
%define vp8_filter_weight 128
%define VP8_FILTER_SHIFT  7                     ; right shift applied after adding the rounding value
;void vp8_filter_block1d_h6_mmx
;(
;    unsigned char   *src_ptr,
;    unsigned short  *output_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned int    pixel_step,
;    unsigned int    output_height,
;    unsigned int    output_width,
;    short           * vp8_filter
;)
;
; Horizontal 6-tap filter producing four samples per row.
;
; For each of output_height rows, with p[i] = src_ptr[i] and tapN = the
; first four words stored at vp8_filter + 16*N:
;     sum[i] = p[i-2]*tap0 + p[i-1]*tap1 + p[i]*tap2
;            + p[i+1]*tap3 + p[i+2]*tap4 + p[i+3]*tap5        (i = 0..3)
;     out[i] = clamp to 0..255 of ((sum[i] + rd) >> VP8_FILTER_SHIFT)
; out[0..3] are written as four 16-bit words at output_ptr.
;
; Products keep only their low 16 bits (pmullw) and the sum is built with
; signed saturating adds (paddsw), so the accumulation order below
; (tap1, tap4, tap2, tap3, tap5, tap0, rounding) is significant.
;
; Per row, bytes src_ptr[-2] .. src_ptr[6] are read. After each row
; src_ptr advances by src_pixels_per_line bytes and output_ptr by
; output_width bytes. pixel_step is not read.
global sym(vp8_filter_block1d_h6_mmx)
sym(vp8_filter_block1d_h6_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rdx,    arg(6)                  ; rdx = vp8_filter

    ; original author's note: do both the negative taps first
    movq        mm1,    [rdx + 16]              ; mm1 = tap 1
    movq        mm2,    [rdx + 32]              ; mm2 = tap 2
    movq        mm6,    [rdx + 48]              ; mm6 = tap 3
    movq        mm7,    [rdx + 64]              ; mm7 = tap 4

    mov         rdi,    arg(1)                  ; rdi = output_ptr
    mov         rsi,    arg(0)                  ; rsi = src_ptr
    movsxd      rcx,    dword ptr arg(4)        ; rcx = output_height (row counter)
    movsxd      rax,    dword ptr arg(5)        ; rax = output_width, used as the destination step
    pxor        mm0,    mm0                     ; mm0 = 0, for byte -> word unpacking

nextrow:
    movq        mm3,    [rsi-2]                 ; mm3 = p-2..p5 (bytes)
    movq        mm4,    mm3                     ; mm4 = p-2..p5
    psrlq       mm3,    8                       ; mm3 = p-1..p5
    punpcklbw   mm3,    mm0                     ; mm3 = p-1..p2 (words)
    pmullw      mm3,    mm1                     ; mm3 = p-1..p2 * tap 1

    movq        mm5,    mm4                     ; mm5 = p-2..p5
    punpckhbw   mm4,    mm0                     ; mm4 = p2..p5 (words)
    pmullw      mm4,    mm7                     ; mm4 = p2..p5 * tap 4
    paddsw      mm3,    mm4                     ; mm3 += mm4

    movq        mm4,    mm5                     ; mm4 = p-2..p5
    psrlq       mm5,    16                      ; mm5 = p0..p5
    punpcklbw   mm5,    mm0                     ; mm5 = p0..p3 (words)
    pmullw      mm5,    mm2                     ; mm5 = p0..p3 * tap 2
    paddsw      mm3,    mm5                     ; mm3 += mm5

    movq        mm5,    mm4                     ; mm5 = p-2..p5 (kept for tap 0 below)
    psrlq       mm4,    24                      ; mm4 = p1..p5
    punpcklbw   mm4,    mm0                     ; mm4 = p1..p4 (words)
    pmullw      mm4,    mm6                     ; mm4 = p1..p4 * tap 3
    paddsw      mm3,    mm4                     ; mm3 += mm4

    ; outer taps
    movd        mm4,    [rsi+3]                 ; mm4 = p3..p6 (bytes)
    punpcklbw   mm4,    mm0                     ; mm4 = p3..p6 (words)
    pmullw      mm4,    [rdx+80]                ; mm4 = p3..p6 * tap 5
    paddsw      mm3,    mm4                     ; mm3 += mm4

    punpcklbw   mm5,    mm0                     ; mm5 = p-2..p1 (words)
    pmullw      mm5,    [rdx]                   ; mm5 = p-2..p1 * tap 0
    paddsw      mm3,    mm5                     ; mm3 += mm5

    paddsw      mm3,    [rd GLOBAL]             ; mm3 += round value
    psraw       mm3,    VP8_FILTER_SHIFT        ; mm3 >>= VP8_FILTER_SHIFT (arithmetic)
    packuswb    mm3,    mm0                     ; pack and unpack to clamp to 0..255
    punpcklbw   mm3,    mm0                     ;

    movq        [rdi],  mm3                     ; store four 16-bit results

%if ABI_IS_32BIT
    add         rsi,    dword ptr arg(2)        ; src += src_pixels_per_line
    add         rdi,    rax                     ; dst += output_width
%else
    movsxd      r8,     dword ptr arg(2)        ; r8 = src_pixels_per_line
    add         rdi,    rax                     ; dst += output_width

    add         rsi,    r8                      ; src += src_pixels_per_line
%endif

    dec         rcx                             ; one row done
    jnz         nextrow                         ; loop while rows remain

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
;
; THIS FUNCTION APPEARS TO BE UNUSED
;
;void vp8_filter_block1d_v6_mmx
;(
;   short *src_ptr,
;   unsigned char *output_ptr,
;   unsigned int pixels_per_line,
;   unsigned int pixel_step,
;   unsigned int output_height,
;   unsigned int output_width,
;   short * vp8_filter
;)
;
; Vertical 6-tap filter over 16-bit input, four samples per row.
;
; With P = pixels_per_line (applied as a byte offset), row(k) = the four
; words at src_ptr + k*P, and tapN = the first four words stored at
; vp8_filter + 16*N:
;     sum = row(-2)*tap0 + row(-1)*tap1 + row(0)*tap2
;         + row(1)*tap3  + row(2)*tap4  + row(3)*tap5
;     out = clamp to 0..255 of ((sum + rd) >> VP8_FILTER_SHIFT)
; The four result bytes are stored at output_ptr. Products keep only the
; low 16 bits (pmullw); the sum uses signed saturating adds (paddsw) in
; the order tap1, tap4, tap2, tap0, tap3, tap5, rounding.
;
; After each of output_height rows the source advances by P bytes and
; output_ptr by output_width bytes. pixel_step is not read.
global sym(vp8_filter_block1d_v6_mmx)
sym(vp8_filter_block1d_v6_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ; NOTE(review): the rounding value is fetched before rbx is reused below,
    ; presumably because GLOBAL may address through the register set up by
    ; GET_GOT -- confirm against x86_abi_support.asm.
    movq        mm5,    [rd GLOBAL]             ; mm5 = round value
    push        rbx                             ; rbx is reused as the filter pointer
    mov         rbx,    arg(6)                  ; rbx = vp8_filter
    movq        mm1,    [rbx + 16]              ; mm1 = tap 1
    movq        mm2,    [rbx + 32]              ; mm2 = tap 2
    movq        mm6,    [rbx + 48]              ; mm6 = tap 3
    movq        mm7,    [rbx + 64]              ; mm7 = tap 4

    movsxd      rdx,    dword ptr arg(2)        ; rdx = pixels_per_line
    mov         rdi,    arg(1)                  ; rdi = output_ptr
    mov         rsi,    arg(0)                  ; rsi = src_ptr
    sub         rsi,    rdx
    sub         rsi,    rdx                     ; rsi = src_ptr - 2*P, i.e. row -2
    movsxd      rcx,    DWORD PTR arg(4)        ; rcx = output_height (row counter)
    movsxd      rax,    DWORD PTR arg(5)        ; rax = output_width, used as the destination step
    pxor        mm0,    mm0                     ; mm0 = 0


nextrow_v:
    movq        mm3,    [rsi+rdx]               ; mm3 = row -1
    pmullw      mm3,    mm1                     ; mm3 *= tap 1


    movq        mm4,    [rsi + 4*rdx]           ; mm4 = row 2
    pmullw      mm4,    mm7                     ; mm4 *= tap 4
    paddsw      mm3,    mm4                     ; mm3 += mm4

    movq        mm4,    [rsi + 2*rdx]           ; mm4 = row 0
    pmullw      mm4,    mm2                     ; mm4 *= tap 2
    paddsw      mm3,    mm4                     ; mm3 += mm4

    movq        mm4,    [rsi]                   ; mm4 = row -2
    pmullw      mm4,    [rbx]                   ; mm4 *= tap 0
    paddsw      mm3,    mm4                     ; mm3 += mm4


    add         rsi,    rdx                     ; move source forward 1 line to avoid 3 * pitch
    movq        mm4,    [rsi + 2*rdx]           ; mm4 = row 1
    pmullw      mm4,    mm6                     ; mm4 *= tap 3
    paddsw      mm3,    mm4                     ; mm3 += mm4

    movq        mm4,    [rsi + 4*rdx]           ; mm4 = row 3
    pmullw      mm4,    [rbx +80]               ; mm4 *= tap 5
    paddsw      mm3,    mm4                     ; mm3 += mm4


    paddsw      mm3,    mm5                     ; mm3 += round value
    psraw       mm3,    VP8_FILTER_SHIFT        ; mm3 >>= VP8_FILTER_SHIFT (arithmetic)
    packuswb    mm3,    mm0                     ; pack to bytes, clamping to 0..255

    movd        [rdi],  mm3                     ; store four result bytes

    add         rdi,    rax                     ; dst += output_width

    dec         rcx                             ; one row done
    jnz         nextrow_v                       ; loop while rows remain

    pop         rbx                             ; undo the extra push made above

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
;void vp8_filter_block1dc_v6_mmx
;(
;   short *src_ptr,
;   unsigned char *output_ptr,
;    int output_pitch,
;   unsigned int pixels_per_line,
;   unsigned int pixel_step,
;   unsigned int output_height,
;   unsigned int output_width,
;   short * vp8_filter
;)
;
; Vertical 6-tap filter over 16-bit input, four samples per row, writing
; to a destination with its own pitch.
;
; With P = pixels_per_line (applied as a byte offset), row(k) = the four
; words at src_ptr + k*P, and tapN = the first four words stored at
; vp8_filter + 16*N:
;     sum = row(-2)*tap0 + row(-1)*tap1 + row(0)*tap2
;         + row(1)*tap3  + row(2)*tap4  + row(3)*tap5
;     out = clamp to 0..255 of ((sum + rd) >> VP8_FILTER_SHIFT)
; The four result bytes are stored at output_ptr. Products keep only the
; low 16 bits (pmullw); the sum uses signed saturating adds (paddsw) in
; the order tap1, tap4, tap2, tap0, tap3, tap5, rounding.
;
; After each of output_height rows the source advances by P bytes and
; output_ptr by output_pitch bytes. pixel_step and output_width are not
; read.
global sym(vp8_filter_block1dc_v6_mmx)
sym(vp8_filter_block1dc_v6_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 8
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ; NOTE(review): the rounding value is fetched before rbx is reused below,
    ; presumably because GLOBAL may address through the register set up by
    ; GET_GOT -- confirm against x86_abi_support.asm.
    movq        mm5,    [rd GLOBAL]             ; mm5 = round value
    push        rbx                             ; rbx is reused as the filter pointer
    mov         rbx,    arg(7)                  ; rbx = vp8_filter
    movq        mm1,    [rbx + 16]              ; mm1 = tap 1
    movq        mm2,    [rbx + 32]              ; mm2 = tap 2
    movq        mm6,    [rbx + 48]              ; mm6 = tap 3
    movq        mm7,    [rbx + 64]              ; mm7 = tap 4

    movsxd      rdx,    dword ptr arg(3)        ; rdx = pixels_per_line
    mov         rdi,    arg(1)                  ; rdi = output_ptr
    mov         rsi,    arg(0)                  ; rsi = src_ptr
    sub         rsi,    rdx
    sub         rsi,    rdx                     ; rsi = src_ptr - 2*P, i.e. row -2
    movsxd      rcx,    DWORD PTR arg(5)        ; rcx = output_height (row counter)
    movsxd      rax,    DWORD PTR arg(2)        ; rax = output_pitch (destination step)
    pxor        mm0,    mm0                     ; mm0 = 0


nextrow_cv:
    movq        mm3,    [rsi+rdx]               ; mm3 = row -1
    pmullw      mm3,    mm1                     ; mm3 *= tap 1


    movq        mm4,    [rsi + 4*rdx]           ; mm4 = row 2
    pmullw      mm4,    mm7                     ; mm4 *= tap 4
    paddsw      mm3,    mm4                     ; mm3 += mm4

    movq        mm4,    [rsi + 2*rdx]           ; mm4 = row 0
    pmullw      mm4,    mm2                     ; mm4 *= tap 2
    paddsw      mm3,    mm4                     ; mm3 += mm4

    movq        mm4,    [rsi]                   ; mm4 = row -2
    pmullw      mm4,    [rbx]                   ; mm4 *= tap 0
    paddsw      mm3,    mm4                     ; mm3 += mm4


    add         rsi,    rdx                     ; move source forward 1 line to avoid 3 * pitch
    movq        mm4,    [rsi + 2*rdx]           ; mm4 = row 1
    pmullw      mm4,    mm6                     ; mm4 *= tap 3
    paddsw      mm3,    mm4                     ; mm3 += mm4

    movq        mm4,    [rsi + 4*rdx]           ; mm4 = row 3
    pmullw      mm4,    [rbx +80]               ; mm4 *= tap 5
    paddsw      mm3,    mm4                     ; mm3 += mm4


    paddsw      mm3,    mm5                     ; mm3 += round value
    psraw       mm3,    VP8_FILTER_SHIFT        ; mm3 >>= VP8_FILTER_SHIFT (arithmetic)
    packuswb    mm3,    mm0                     ; pack to bytes, clamping to 0..255

    movd        [rdi],  mm3                     ; store four result bytes
    ; the subsequent iterations repeat 3 out of 4 of these reads.  Since the
    ; recon block should be in cache this shouldn't cost much.  It's obviously
    ; avoidable!!!
    lea         rdi,    [rdi+rax]               ; dst += output_pitch
    dec         rcx                             ; one row done
    jnz         nextrow_cv                      ; loop while rows remain

    pop         rbx                             ; undo the extra push made above

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
;void vp8_bilinear_predict8x8_mmx
;(
;    unsigned char  *src_ptr,
;    int  src_pixels_per_line,
;    int  xoffset,
;    int  yoffset,
;   unsigned char *dst_ptr,
;    int dst_pitch
;)
;
; Two-pass bilinear prediction, 8 pixels wide, 8 rows high.
;
;   HFilter = vp8_bilinear_filters_mmx + 32*xoffset   (two 16-byte rows)
;   VFilter = vp8_bilinear_filters_mmx + 32*yoffset
;
; Horizontal pass, for source row r and column i = 0..7:
;     h[r][i] = (src[r][i]*HFilter[0] + src[r][i+1]*HFilter[1] + rd)
;               >> VP8_FILTER_SHIFT
; Vertical pass, for output row r = 0..7:
;     dst[r][i] = (h[r][i]*VFilter[0] + h[r+1][i]*VFilter[1] + rd)
;                 >> VP8_FILTER_SHIFT, packed to bytes with unsigned saturation
;
; The previous horizontal row is carried between iterations in mm7 as
; packed bytes. Nine source rows of nine bytes each are read. The loop
; ends when dst_ptr reaches dst_ptr + 8*dst_pitch.
global sym(vp8_bilinear_predict8x8_mmx)
sym(vp8_bilinear_predict8x8_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ;const short *HFilter = bilinear_filters_mmx[xoffset];
    ;const short *VFilter = bilinear_filters_mmx[yoffset];

    movsxd      rax,    dword ptr arg(2)        ; rax = xoffset
    mov         rdi,    arg(4)                  ; rdi = dst_ptr

    shl         rax,    5                       ; xoffset * 32 (bytes per filter entry)
    lea         rcx,    [sym(vp8_bilinear_filters_mmx) GLOBAL]

    add         rax,    rcx                     ; rax = HFilter
    mov         rsi,    arg(0)                  ; rsi = src_ptr

    movsxd      rdx,    dword ptr arg(5)        ; rdx = dst_pitch (temporarily)
    movq        mm1,    [rax]                   ; mm1 = HFilter[0]

    movq        mm2,    [rax+16]                ; mm2 = HFilter[1]
    movsxd      rax,    dword ptr arg(3)        ; rax = yoffset

    pxor        mm0,    mm0                     ; mm0 = 0, for byte -> word unpacking

    shl         rax,    5                       ; yoffset * 32
    add         rax,    rcx                     ; rax = VFilter

    lea         rcx,    [rdi+rdx*8]             ; rcx = dst_ptr + 8*dst_pitch (loop end)
    movsxd      rdx,    dword ptr arg(1)        ; rdx = src_pixels_per_line

    ; horizontal pass for the first source row
    movq        mm3,    [rsi]                   ; mm3 = src[0..7]
    movq        mm4,    mm3                     ; copy of the current line

    punpcklbw   mm3,    mm0                     ; mm3 = src[0..3] (words)
    punpckhbw   mm4,    mm0                     ; mm4 = src[4..7] (words)

    pmullw      mm3,    mm1                     ; * HFilter[0]
    pmullw      mm4,    mm1                     ;

    movq        mm5,    [rsi+1]                 ; mm5 = src[1..8]
    movq        mm6,    mm5                     ;

    punpcklbw   mm5,    mm0                     ; mm5 = src[1..4] (words)
    punpckhbw   mm6,    mm0                     ; mm6 = src[5..8] (words)

    pmullw      mm5,    mm2                     ; * HFilter[1]
    pmullw      mm6,    mm2                     ;

    paddw       mm3,    mm5                     ;
    paddw       mm4,    mm6                     ;

    paddw       mm3,    [rd GLOBAL]             ; mm3 += round value
    psraw       mm3,    VP8_FILTER_SHIFT        ; mm3 >>= VP8_FILTER_SHIFT

    paddw       mm4,    [rd GLOBAL]             ;
    psraw       mm4,    VP8_FILTER_SHIFT        ;

    movq        mm7,    mm3                     ;
    packuswb    mm7,    mm4                     ; mm7 = first horizontal row, packed bytes

    add         rsi,    rdx                     ; next source line
next_row_8x8:
    movq        mm3,    [rsi]                   ; mm3 = src[0..7]
    movq        mm4,    mm3                     ; copy of the current line

    punpcklbw   mm3,    mm0                     ; mm3 = src[0..3] (words)
    punpckhbw   mm4,    mm0                     ; mm4 = src[4..7] (words)

    pmullw      mm3,    mm1                     ; * HFilter[0]
    pmullw      mm4,    mm1                     ;

    movq        mm5,    [rsi+1]                 ; mm5 = src[1..8]
    movq        mm6,    mm5                     ;

    punpcklbw   mm5,    mm0                     ; mm5 = src[1..4] (words)
    punpckhbw   mm6,    mm0                     ; mm6 = src[5..8] (words)

    pmullw      mm5,    mm2                     ; * HFilter[1]
    pmullw      mm6,    mm2                     ;

    paddw       mm3,    mm5                     ;
    paddw       mm4,    mm6                     ;

    movq        mm5,    mm7                     ; previous horizontal row
    movq        mm6,    mm7                     ;

    punpcklbw   mm5,    mm0                     ; back to words, low half
    punpckhbw   mm6,    mm0                     ; back to words, high half

    pmullw      mm5,    [rax]                   ; previous row * VFilter[0]
    pmullw      mm6,    [rax]                   ;

    paddw       mm3,    [rd GLOBAL]             ; mm3 += round value
    psraw       mm3,    VP8_FILTER_SHIFT        ; mm3 >>= VP8_FILTER_SHIFT

    paddw       mm4,    [rd GLOBAL]             ;
    psraw       mm4,    VP8_FILTER_SHIFT        ;

    movq        mm7,    mm3                     ;
    packuswb    mm7,    mm4                     ; mm7 = this horizontal row, kept for the next iteration


    pmullw      mm3,    [rax+16]                ; current row * VFilter[1]
    pmullw      mm4,    [rax+16]                ;

    paddw       mm3,    mm5                     ;
    paddw       mm4,    mm6                     ;


    paddw       mm3,    [rd GLOBAL]             ; mm3 += round value
    psraw       mm3,    VP8_FILTER_SHIFT        ; mm3 >>= VP8_FILTER_SHIFT

    paddw       mm4,    [rd GLOBAL]             ;
    psraw       mm4,    VP8_FILTER_SHIFT        ;

    packuswb    mm3,    mm4                     ; eight result bytes

    movq        [rdi],  mm3                     ; store the results in the destination

%if ABI_IS_32BIT
    add         rsi,    rdx                     ; next source line
    add         rdi,    dword ptr arg(5)        ; dst += dst_pitch
%else
    movsxd      r8,     dword ptr arg(5)        ; r8 = dst_pitch
    add         rsi,    rdx                     ; next source line
    add         rdi,    r8                      ; dst += dst_pitch
%endif
    cmp         rdi,    rcx                     ; reached dst_ptr + 8*dst_pitch?
    jne         next_row_8x8

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
;void vp8_bilinear_predict8x4_mmx
;(
;    unsigned char  *src_ptr,
;    int  src_pixels_per_line,
;    int  xoffset,
;    int  yoffset,
;    unsigned char *dst_ptr,
;    int dst_pitch
;)
;
; Two-pass bilinear prediction, 8 pixels wide, 4 rows high.
;
;   HFilter = vp8_bilinear_filters_mmx + 32*xoffset   (two 16-byte rows)
;   VFilter = vp8_bilinear_filters_mmx + 32*yoffset
;
; Horizontal pass, for source row r and column i = 0..7:
;     h[r][i] = (src[r][i]*HFilter[0] + src[r][i+1]*HFilter[1] + rd)
;               >> VP8_FILTER_SHIFT
; Vertical pass, for output row r = 0..3:
;     dst[r][i] = (h[r][i]*VFilter[0] + h[r+1][i]*VFilter[1] + rd)
;                 >> VP8_FILTER_SHIFT, packed to bytes with unsigned saturation
;
; The previous horizontal row is carried between iterations in mm7 as
; packed bytes. Five source rows of nine bytes each are read. The loop
; ends when dst_ptr reaches dst_ptr + 4*dst_pitch.
global sym(vp8_bilinear_predict8x4_mmx)
sym(vp8_bilinear_predict8x4_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ;const short *HFilter = bilinear_filters_mmx[xoffset];
    ;const short *VFilter = bilinear_filters_mmx[yoffset];

    movsxd      rax,    dword ptr arg(2)        ; rax = xoffset
    mov         rdi,    arg(4)                  ; rdi = dst_ptr

    lea         rcx,    [sym(vp8_bilinear_filters_mmx) GLOBAL]
    shl         rax,    5                       ; xoffset * 32 (bytes per filter entry)

    mov         rsi,    arg(0)                  ; rsi = src_ptr
    add         rax,    rcx                     ; rax = HFilter

    movsxd      rdx,    dword ptr arg(5)        ; rdx = dst_pitch (temporarily)
    movq        mm1,    [rax]                   ; mm1 = HFilter[0]

    movq        mm2,    [rax+16]                ; mm2 = HFilter[1]
    movsxd      rax,    dword ptr arg(3)        ; rax = yoffset

    pxor        mm0,    mm0                     ; mm0 = 0, for byte -> word unpacking
    shl         rax,    5                       ; yoffset * 32

    add         rax,    rcx                     ; rax = VFilter
    lea         rcx,    [rdi+rdx*4]             ; rcx = dst_ptr + 4*dst_pitch (loop end)

    movsxd      rdx,    dword ptr arg(1)        ; rdx = src_pixels_per_line

    ; horizontal pass for the first source row
    movq        mm3,    [rsi]                   ; mm3 = src[0..7]
    movq        mm4,    mm3                     ; copy of the current line

    punpcklbw   mm3,    mm0                     ; mm3 = src[0..3] (words)
    punpckhbw   mm4,    mm0                     ; mm4 = src[4..7] (words)

    pmullw      mm3,    mm1                     ; * HFilter[0]
    pmullw      mm4,    mm1                     ;

    movq        mm5,    [rsi+1]                 ; mm5 = src[1..8]
    movq        mm6,    mm5                     ;

    punpcklbw   mm5,    mm0                     ; mm5 = src[1..4] (words)
    punpckhbw   mm6,    mm0                     ; mm6 = src[5..8] (words)

    pmullw      mm5,    mm2                     ; * HFilter[1]
    pmullw      mm6,    mm2                     ;

    paddw       mm3,    mm5                     ;
    paddw       mm4,    mm6                     ;

    paddw       mm3,    [rd GLOBAL]             ; mm3 += round value
    psraw       mm3,    VP8_FILTER_SHIFT        ; mm3 >>= VP8_FILTER_SHIFT

    paddw       mm4,    [rd GLOBAL]             ;
    psraw       mm4,    VP8_FILTER_SHIFT        ;

    movq        mm7,    mm3                     ;
    packuswb    mm7,    mm4                     ; mm7 = first horizontal row, packed bytes

    add         rsi,    rdx                     ; next source line
next_row_8x4:
    movq        mm3,    [rsi]                   ; mm3 = src[0..7]
    movq        mm4,    mm3                     ; copy of the current line

    punpcklbw   mm3,    mm0                     ; mm3 = src[0..3] (words)
    punpckhbw   mm4,    mm0                     ; mm4 = src[4..7] (words)

    pmullw      mm3,    mm1                     ; * HFilter[0]
    pmullw      mm4,    mm1                     ;

    movq        mm5,    [rsi+1]                 ; mm5 = src[1..8]
    movq        mm6,    mm5                     ;

    punpcklbw   mm5,    mm0                     ; mm5 = src[1..4] (words)
    punpckhbw   mm6,    mm0                     ; mm6 = src[5..8] (words)

    pmullw      mm5,    mm2                     ; * HFilter[1]
    pmullw      mm6,    mm2                     ;

    paddw       mm3,    mm5                     ;
    paddw       mm4,    mm6                     ;

    movq        mm5,    mm7                     ; previous horizontal row
    movq        mm6,    mm7                     ;

    punpcklbw   mm5,    mm0                     ; back to words, low half
    punpckhbw   mm6,    mm0                     ; back to words, high half

    pmullw      mm5,    [rax]                   ; previous row * VFilter[0]
    pmullw      mm6,    [rax]                   ;

    paddw       mm3,    [rd GLOBAL]             ; mm3 += round value
    psraw       mm3,    VP8_FILTER_SHIFT        ; mm3 >>= VP8_FILTER_SHIFT

    paddw       mm4,    [rd GLOBAL]             ;
    psraw       mm4,    VP8_FILTER_SHIFT        ;

    movq        mm7,    mm3                     ;
    packuswb    mm7,    mm4                     ; mm7 = this horizontal row, kept for the next iteration


    pmullw      mm3,    [rax+16]                ; current row * VFilter[1]
    pmullw      mm4,    [rax+16]                ;

    paddw       mm3,    mm5                     ;
    paddw       mm4,    mm6                     ;


    paddw       mm3,    [rd GLOBAL]             ; mm3 += round value
    psraw       mm3,    VP8_FILTER_SHIFT        ; mm3 >>= VP8_FILTER_SHIFT

    paddw       mm4,    [rd GLOBAL]             ;
    psraw       mm4,    VP8_FILTER_SHIFT        ;

    packuswb    mm3,    mm4                     ; eight result bytes

    movq        [rdi],  mm3                     ; store the results in the destination

%if ABI_IS_32BIT
    add         rsi,    rdx                     ; next source line
    add         rdi,    dword ptr arg(5)        ; dst += dst_pitch
%else
    movsxd      r8,     dword ptr arg(5)        ; r8 = dst_pitch
    add         rsi,    rdx                     ; next source line
    add         rdi,    r8                      ; dst += dst_pitch
%endif
    cmp         rdi,    rcx                     ; reached dst_ptr + 4*dst_pitch?
    jne         next_row_8x4

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
;void vp8_bilinear_predict4x4_mmx
;(
;    unsigned char  *src_ptr,
;    int  src_pixels_per_line,
;    int  xoffset,
;    int  yoffset,
;    unsigned char *dst_ptr,
;    int dst_pitch
;)
;
; Two-pass bilinear prediction, 4 pixels wide, 4 rows high.
;
;   HFilter = vp8_bilinear_filters_mmx + 32*xoffset   (two 16-byte rows)
;   VFilter = vp8_bilinear_filters_mmx + 32*yoffset
;
; Horizontal pass, for source row r and column i = 0..3:
;     h[r][i] = (src[r][i]*HFilter[0] + src[r][i+1]*HFilter[1] + rd)
;               >> VP8_FILTER_SHIFT
; Vertical pass, for output row r = 0..3:
;     dst[r][i] = (h[r][i]*VFilter[0] + h[r+1][i]*VFilter[1] + rd)
;                 >> VP8_FILTER_SHIFT, packed to bytes with unsigned saturation
;
; The previous horizontal row is carried between iterations in mm7 as
; packed bytes. Five source rows of five bytes each are read. The loop
; ends when dst_ptr reaches dst_ptr + 4*dst_pitch.
global sym(vp8_bilinear_predict4x4_mmx)
sym(vp8_bilinear_predict4x4_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ;const short *HFilter = bilinear_filters_mmx[xoffset];
    ;const short *VFilter = bilinear_filters_mmx[yoffset];

    movsxd      rax,    dword ptr arg(2)        ; rax = xoffset
    mov         rdi,    arg(4)                  ; rdi = dst_ptr

    lea         rcx,    [sym(vp8_bilinear_filters_mmx) GLOBAL]
    shl         rax,    5                       ; xoffset * 32 (bytes per filter entry)

    add         rax,    rcx                     ; rax = HFilter
    mov         rsi,    arg(0)                  ; rsi = src_ptr

    movsxd      rdx,    dword ptr arg(5)        ; rdx = dst_pitch (temporarily)
    movq        mm1,    [rax]                   ; mm1 = HFilter[0]

    movq        mm2,    [rax+16]                ; mm2 = HFilter[1]
    movsxd      rax,    dword ptr arg(3)        ; rax = yoffset

    pxor        mm0,    mm0                     ; mm0 = 0, for byte -> word unpacking
    shl         rax,    5                       ; yoffset * 32

    add         rax,    rcx                     ; rax = VFilter
    lea         rcx,    [rdi+rdx*4]             ; rcx = dst_ptr + 4*dst_pitch (loop end)

    movsxd      rdx,    dword ptr arg(1)        ; rdx = src_pixels_per_line

    ; horizontal pass for the first source row
    movd        mm3,    [rsi]                   ; mm3 = src[0..3]
    punpcklbw   mm3,    mm0                     ; to words

    pmullw      mm3,    mm1                     ; * HFilter[0]
    movd        mm5,    [rsi+1]                 ; mm5 = src[1..4]

    punpcklbw   mm5,    mm0                     ; to words
    pmullw      mm5,    mm2                     ; * HFilter[1]

    paddw       mm3,    mm5                     ;
    paddw       mm3,    [rd GLOBAL]             ; mm3 += round value

    psraw       mm3,    VP8_FILTER_SHIFT        ; mm3 >>= VP8_FILTER_SHIFT

    movq        mm7,    mm3                     ;
    packuswb    mm7,    mm0                     ; mm7 = first horizontal row, packed bytes

    add         rsi,    rdx                     ; next source line
next_row_4x4:
    movd        mm3,    [rsi]                   ; mm3 = src[0..3]
    punpcklbw   mm3,    mm0                     ; to words

    pmullw      mm3,    mm1                     ; * HFilter[0]
    movd        mm5,    [rsi+1]                 ; mm5 = src[1..4]

    punpcklbw   mm5,    mm0                     ; to words
    pmullw      mm5,    mm2                     ; * HFilter[1]

    paddw       mm3,    mm5                     ;

    movq        mm5,    mm7                     ; previous horizontal row
    punpcklbw   mm5,    mm0                     ; back to words

    pmullw      mm5,    [rax]                   ; previous row * VFilter[0]
    paddw       mm3,    [rd GLOBAL]             ; mm3 += round value

    psraw       mm3,    VP8_FILTER_SHIFT        ; mm3 >>= VP8_FILTER_SHIFT
    movq        mm7,    mm3                     ;

    packuswb    mm7,    mm0                     ; mm7 = this horizontal row, kept for the next iteration

    pmullw      mm3,    [rax+16]                ; current row * VFilter[1]
    paddw       mm3,    mm5                     ;


    paddw       mm3,    [rd GLOBAL]             ; mm3 += round value
    psraw       mm3,    VP8_FILTER_SHIFT        ; mm3 >>= VP8_FILTER_SHIFT

    packuswb    mm3,    mm0                     ; four result bytes in the low dword
    movd        [rdi],  mm3                     ; store the results in the destination

%if ABI_IS_32BIT
    add         rsi,    rdx                     ; next source line
    add         rdi,    dword ptr arg(5)        ; dst += dst_pitch
%else
    movsxd      r8,     dword ptr arg(5)        ; r8 = dst_pitch
    add         rsi,    rdx                     ; next source line
    add         rdi,    r8                      ; dst += dst_pitch
%endif

    cmp         rdi,    rcx                     ; reached dst_ptr + 4*dst_pitch?
    jne         next_row_4x4

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
SECTION_RODATA
; Rounding value added before the VP8_FILTER_SHIFT right shift:
; 0x40 = 64 = half of (1 << 7). Referenced by the code as [rd GLOBAL].
align 16
rd:
    times 4 dw 0x40

; Eight 6-tap filters. Each tap occupies one 16-byte row (the value
; replicated in eight words), so one filter is 6 * 16 = 96 bytes.
; The six taps of every filter sum to 128.
align 16
global sym(vp8_six_tap_mmx)
sym(vp8_six_tap_mmx):
    times 8 dw 0
    times 8 dw 0
    times 8 dw 128
    times 8 dw 0
    times 8 dw 0
    times 8 dw 0

    times 8 dw 0
    times 8 dw -6
    times 8 dw 123
    times 8 dw 12
    times 8 dw -1
    times 8 dw 0

    times 8 dw 2
    times 8 dw -11
    times 8 dw 108
    times 8 dw 36
    times 8 dw -8
    times 8 dw 1

    times 8 dw 0
    times 8 dw -9
    times 8 dw 93
    times 8 dw 50
    times 8 dw -6
    times 8 dw 0

    times 8 dw 3
    times 8 dw -16
    times 8 dw 77
    times 8 dw 77
    times 8 dw -16
    times 8 dw 3

    times 8 dw 0
    times 8 dw -6
    times 8 dw 50
    times 8 dw 93
    times 8 dw -9
    times 8 dw 0

    times 8 dw 1
    times 8 dw -8
    times 8 dw 36
    times 8 dw 108
    times 8 dw -11
    times 8 dw 2

    times 8 dw 0
    times 8 dw -1
    times 8 dw 12
    times 8 dw 123
    times 8 dw -6
    times 8 dw 0


; Eight 2-tap bilinear filters, indexed by offset 0..7. Each entry is two
; 16-byte rows (32 bytes), which is why the code scales the offset with
; "shl rax, 5". The two taps of every entry sum to 128.
align 16
global sym(vp8_bilinear_filters_mmx)
sym(vp8_bilinear_filters_mmx):
    times 8 dw 128
    times 8 dw 0

    times 8 dw 112
    times 8 dw 16

    times 8 dw 96
    times 8 dw 32

    times 8 dw 80
    times 8 dw 48

    times 8 dw 64
    times 8 dw 64

    times 8 dw 48
    times 8 dw 80

    times 8 dw 32
    times 8 dw 96

    times 8 dw 16
    times 8 dw 112