; Merge "vp8_rd_pick_best_mbsegmentation code restructure"
; [libvpx.git] / vp8 / common / x86 / subpixel_mmx.asm
; blob 23ed4e2087115e3543feea74468c0df32ef3510e
;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;
; ABI helper macros used throughout this file (sym, arg, GLOBAL, GET_GOT,
; RESTORE_GOT, SHADOW_ARGS_TO_STACK, UNSHADOW_ARGS, SECTION_RODATA,
; HIDDEN_DATA, ABI_IS_32BIT) are expected to come from this include.
%include "vpx_ports/x86_abi_support.asm"

; BLOCK_HEIGHT_WIDTH and vp8_filter_weight are not referenced by the code
; visible in this file; they are kept for anything that may rely on them.
%define BLOCK_HEIGHT_WIDTH 4
%define vp8_filter_weight 128

; Arithmetic right-shift count applied after every filter sum
; (1 << VP8_FILTER_SHIFT == 128 == vp8_filter_weight).
%define VP8_FILTER_SHIFT  7
;-----------------------------------------------------------------------
; void vp8_filter_block1d_h6_mmx
; (
;     unsigned char  *src_ptr,
;     unsigned short *output_ptr,
;     unsigned int    src_pixels_per_line,
;     unsigned int    pixel_step,
;     unsigned int    output_height,
;     unsigned int    output_width,
;     short          *vp8_filter
; )
;
; Horizontal 6-tap filter, four output pixels per row.
;
; Per row the loop reads source bytes src[-2] .. src[6], forms
;     sum = t0*p[-2] + t1*p[-1] + t2*p[0] + t3*p[1] + t4*p[2] + t5*p[3]
; for four adjacent positions (saturating 16-bit adds), adds the rounding
; constant rd, shifts right by VP8_FILTER_SHIFT, clamps to 0..255 and
; stores the four results as 16-bit words (8 bytes) at output_ptr.
;
; vp8_filter: six taps spaced 16 bytes apart (tap k at byte offset 16*k);
;             only the first four words of each tap are read.
; pixel_step (arg 3) is never read by this routine.
; output_width is added to the destination pointer as-is after each row,
;             i.e. it is used as a byte stride.
; output_height must be non-zero: the row loop is dec/jnz (do-while).
;
; Register roles inside the loop:
;     rsi = source row, rdi = destination row, rcx = rows remaining,
;     rax = destination stride, rdx = vp8_filter,
;     mm0 = 0, mm1 = tap 1, mm2 = tap 2, mm6 = tap 3, mm7 = tap 4
;     (taps 0 and 5 are read from memory each row).
;-----------------------------------------------------------------------
global sym(vp8_filter_block1d_h6_mmx)
sym(vp8_filter_block1d_h6_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rdx, arg(6)                 ; rdx = vp8_filter

    movq        mm1, [rdx + 16]             ; mm1 = tap 1
    movq        mm2, [rdx + 32]             ; mm2 = tap 2
    movq        mm6, [rdx + 48]             ; mm6 = tap 3
    movq        mm7, [rdx + 64]             ; mm7 = tap 4

    mov         rdi, arg(1)                 ; rdi = output_ptr
    mov         rsi, arg(0)                 ; rsi = src_ptr
    movsxd      rcx, dword ptr arg(4)       ; rcx = output_height (row counter)
    movsxd      rax, dword ptr arg(5)       ; rax = output_width (destination stride)
    pxor        mm0, mm0                    ; mm0 = 0, used to widen bytes to words

nextrow:
    movq        mm3, [rsi-2]                ; mm3 = bytes p-2..p5
    movq        mm4, mm3                    ; mm4 = bytes p-2..p5
    psrlq       mm3, 8                      ; mm3 = bytes p-1..p5
    punpcklbw   mm3, mm0                    ; mm3 = words p-1..p2
    pmullw      mm3, mm1                    ; mm3 = tap 1 products (accumulator)

    movq        mm5, mm4                    ; mm5 = bytes p-2..p5 (saved copy)
    punpckhbw   mm4, mm0                    ; mm4 = words p2..p5
    pmullw      mm4, mm7                    ; mm4 *= tap 4
    paddsw      mm3, mm4                    ; mm3 += mm4

    movq        mm4, mm5                    ; mm4 = bytes p-2..p5 (saved copy)
    psrlq       mm5, 16                     ; mm5 = bytes p0..p5
    punpcklbw   mm5, mm0                    ; mm5 = words p0..p3
    pmullw      mm5, mm2                    ; mm5 *= tap 2
    paddsw      mm3, mm5                    ; mm3 += mm5

    movq        mm5, mm4                    ; mm5 = bytes p-2..p5 (saved copy)
    psrlq       mm4, 24                     ; mm4 = bytes p1..p5
    punpcklbw   mm4, mm0                    ; mm4 = words p1..p4
    pmullw      mm4, mm6                    ; mm4 *= tap 3
    paddsw      mm3, mm4                    ; mm3 += mm4

    ; outer taps: coefficients are read from memory (no MMX register left)
    movd        mm4, [rsi+3]                ; mm4 = bytes p3..p6
    punpcklbw   mm4, mm0                    ; mm4 = words p3..p6
    pmullw      mm4, [rdx+80]               ; mm4 *= tap 5
    paddsw      mm3, mm4                    ; mm3 += mm4

    punpcklbw   mm5, mm0                    ; mm5 = words p-2..p1
    pmullw      mm5, [rdx]                  ; mm5 *= tap 0
    paddsw      mm3, mm5                    ; mm3 += mm5

    paddsw      mm3, [GLOBAL(rd)]           ; mm3 += rounding constant
    psraw       mm3, VP8_FILTER_SHIFT       ; mm3 >>= 7 (arithmetic)
    packuswb    mm3, mm0                    ; clamp to 0..255 by packing to bytes ...
    punpcklbw   mm3, mm0                    ; ... then widen back to four words

    movq        [rdi], mm3                  ; store four 16-bit results

%if ABI_IS_32BIT
    add         rsi, dword ptr arg(2)       ; rsi += src_pixels_per_line (next source row)
    add         rdi, rax                    ; next destination row
%else
    movsxd      r8, dword ptr arg(2)        ; r8 = src_pixels_per_line
    add         rdi, rax                    ; next destination row

    add         rsi, r8                     ; next source row
%endif

    dec         rcx                         ; one row done
    jnz         nextrow                     ; loop while rows remain

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret                                     ; without this, execution runs into the next function


;-----------------------------------------------------------------------
; THIS FUNCTION APPEARS TO BE UNUSED
;
; void vp8_filter_block1d_v6_mmx
; (
;     short         *src_ptr,
;     unsigned char *output_ptr,
;     unsigned int   pixels_per_line,
;     unsigned int   pixel_step,
;     unsigned int   output_height,
;     unsigned int   output_width,
;     short         *vp8_filter
; )
;
; Vertical 6-tap filter, four output pixels per row.
;
; The source holds 16-bit samples. For every output row, four words are
; loaded from each of source rows -2 .. +3 (relative to the current row),
; multiplied by taps 0 .. 5, summed with saturating 16-bit adds, rounded
; with rd, shifted right by VP8_FILTER_SHIFT and packed with unsigned
; saturation into four bytes written to output_ptr.
;
; pixels_per_line is added to the source address unscaled, i.e. it is the
;             distance in bytes between consecutive source rows.
; pixel_step (arg 3) is never read by this routine.
; output_width is the value added to the destination pointer per row.
; vp8_filter: six taps spaced 16 bytes apart; four words of each are used.
; output_height must be non-zero: the row loop is dec/jnz (do-while).
;
; Register roles inside the loop:
;     rsi = address of source row -2, rdx = source row stride,
;     rdi = destination row, rax = destination stride, rcx = rows remaining,
;     rbx = vp8_filter, mm0 = 0, mm5 = rounding constant,
;     mm1 = tap 1, mm2 = tap 2, mm6 = tap 3, mm7 = tap 4
;     (taps 0 and 5 are read from memory each row).
;-----------------------------------------------------------------------
global sym(vp8_filter_block1d_v6_mmx)
sym(vp8_filter_block1d_v6_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ; The rounding constant is fetched before rbx is reused as the filter
    ; pointer (presumably GLOBAL() may address through the GET_GOT register).
    movq        mm5, [GLOBAL(rd)]           ; mm5 = rounding constant
    push        rbx                         ; keep the value GET_GOT left in rbx
    mov         rbx, arg(6)                 ; rbx = vp8_filter
    movq        mm1, [rbx + 16]             ; mm1 = tap 1
    movq        mm2, [rbx + 32]             ; mm2 = tap 2
    movq        mm6, [rbx + 48]             ; mm6 = tap 3
    movq        mm7, [rbx + 64]             ; mm7 = tap 4

    movsxd      rdx, dword ptr arg(2)       ; rdx = pixels_per_line (source row stride)
    mov         rdi, arg(1)                 ; rdi = output_ptr
    mov         rsi, arg(0)                 ; rsi = src_ptr
    sub         rsi, rdx
    sub         rsi, rdx                    ; rsi -> source row -2
    movsxd      rcx, DWORD PTR arg(4)       ; rcx = output_height (row counter)
    movsxd      rax, DWORD PTR arg(5)       ; rax = output_width (destination stride)
    pxor        mm0, mm0                    ; mm0 = 0


nextrow_v:
    movq        mm3, [rsi+rdx]              ; mm3 = four words of row -1
    pmullw      mm3, mm1                    ; mm3 = tap 1 products (accumulator)


    movq        mm4, [rsi + 4*rdx]          ; mm4 = four words of row +2
    pmullw      mm4, mm7                    ; mm4 *= tap 4
    paddsw      mm3, mm4                    ; mm3 += mm4

    movq        mm4, [rsi + 2*rdx]          ; mm4 = four words of row 0
    pmullw      mm4, mm2                    ; mm4 *= tap 2
    paddsw      mm3, mm4                    ; mm3 += mm4

    movq        mm4, [rsi]                  ; mm4 = four words of row -2
    pmullw      mm4, [rbx]                  ; mm4 *= tap 0
    paddsw      mm3, mm4                    ; mm3 += mm4


    ; Advancing here both steps to the next output row and lets rows +1
    ; and +3 be addressed as 2*rdx / 4*rdx instead of 3*rdx / 5*rdx.
    add         rsi, rdx                    ; rsi -> row -1 (row -2 of the next iteration)
    movq        mm4, [rsi + 2*rdx]          ; mm4 = four words of row +1
    pmullw      mm4, mm6                    ; mm4 *= tap 3
    paddsw      mm3, mm4                    ; mm3 += mm4

    movq        mm4, [rsi + 4*rdx]          ; mm4 = four words of row +3
    pmullw      mm4, [rbx +80]              ; mm4 *= tap 5
    paddsw      mm3, mm4                    ; mm3 += mm4


    paddsw      mm3, mm5                    ; mm3 += rounding constant
    psraw       mm3, VP8_FILTER_SHIFT       ; mm3 >>= 7 (arithmetic)
    packuswb    mm3, mm0                    ; clamp to 0..255 and pack to bytes

    movd        [rdi],mm3                   ; store four result bytes

    add         rdi,rax                     ; next destination row

    dec         rcx                         ; one row done
    jnz         nextrow_v                   ; loop while rows remain

    pop         rbx                         ; undo the push made before loading the filter

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret                                     ; without this, execution runs into the next function


;-----------------------------------------------------------------------
; void vp8_filter_block1dc_v6_mmx
; (
;     short         *src_ptr,
;     unsigned char *output_ptr,
;     int            output_pitch,
;     unsigned int   pixels_per_line,
;     unsigned int   pixel_step,
;     unsigned int   output_height,
;     unsigned int   output_width,
;     short         *vp8_filter
; )
;
; Vertical 6-tap filter, four output pixels per row, with an explicit
; destination pitch.
;
; The source holds 16-bit samples. For every output row, four words are
; loaded from each of source rows -2 .. +3 (relative to the current row),
; multiplied by taps 0 .. 5, summed with saturating 16-bit adds, rounded
; with rd, shifted right by VP8_FILTER_SHIFT and packed with unsigned
; saturation into four bytes written to output_ptr.
;
; output_pitch is added to the destination pointer after each row.
; pixels_per_line is added to the source address unscaled, i.e. it is the
;             distance in bytes between consecutive source rows.
; pixel_step (arg 4) and output_width (arg 6) are never read here.
; vp8_filter: six taps spaced 16 bytes apart; four words of each are used.
; output_height must be non-zero: the row loop is dec/jnz (do-while).
;
; Register roles inside the loop:
;     rsi = address of source row -2, rdx = source row stride,
;     rdi = destination row, rax = output_pitch, rcx = rows remaining,
;     rbx = vp8_filter, mm0 = 0, mm5 = rounding constant,
;     mm1 = tap 1, mm2 = tap 2, mm6 = tap 3, mm7 = tap 4
;     (taps 0 and 5 are read from memory each row).
;-----------------------------------------------------------------------
global sym(vp8_filter_block1dc_v6_mmx)
sym(vp8_filter_block1dc_v6_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 8
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ; The rounding constant is fetched before rbx is reused as the filter
    ; pointer (presumably GLOBAL() may address through the GET_GOT register).
    movq        mm5, [GLOBAL(rd)]           ; mm5 = rounding constant
    push        rbx                         ; keep the value GET_GOT left in rbx
    mov         rbx, arg(7)                 ; rbx = vp8_filter
    movq        mm1, [rbx + 16]             ; mm1 = tap 1
    movq        mm2, [rbx + 32]             ; mm2 = tap 2
    movq        mm6, [rbx + 48]             ; mm6 = tap 3
    movq        mm7, [rbx + 64]             ; mm7 = tap 4

    movsxd      rdx, dword ptr arg(3)       ; rdx = pixels_per_line (source row stride)
    mov         rdi, arg(1)                 ; rdi = output_ptr
    mov         rsi, arg(0)                 ; rsi = src_ptr
    sub         rsi, rdx
    sub         rsi, rdx                    ; rsi -> source row -2
    movsxd      rcx, DWORD PTR arg(5)       ; rcx = output_height (row counter)
    movsxd      rax, DWORD PTR arg(2)       ; rax = output_pitch (destination stride)
    pxor        mm0, mm0                    ; mm0 = 0


nextrow_cv:
    movq        mm3, [rsi+rdx]              ; mm3 = four words of row -1
    pmullw      mm3, mm1                    ; mm3 = tap 1 products (accumulator)


    movq        mm4, [rsi + 4*rdx]          ; mm4 = four words of row +2
    pmullw      mm4, mm7                    ; mm4 *= tap 4
    paddsw      mm3, mm4                    ; mm3 += mm4

    movq        mm4, [rsi + 2*rdx]          ; mm4 = four words of row 0
    pmullw      mm4, mm2                    ; mm4 *= tap 2
    paddsw      mm3, mm4                    ; mm3 += mm4

    movq        mm4, [rsi]                  ; mm4 = four words of row -2
    pmullw      mm4, [rbx]                  ; mm4 *= tap 0
    paddsw      mm3, mm4                    ; mm3 += mm4


    ; Advancing here both steps to the next output row and lets rows +1
    ; and +3 be addressed as 2*rdx / 4*rdx instead of 3*rdx / 5*rdx.
    add         rsi, rdx                    ; rsi -> row -1 (row -2 of the next iteration)
    movq        mm4, [rsi + 2*rdx]          ; mm4 = four words of row +1
    pmullw      mm4, mm6                    ; mm4 *= tap 3
    paddsw      mm3, mm4                    ; mm3 += mm4

    movq        mm4, [rsi + 4*rdx]          ; mm4 = four words of row +3
    pmullw      mm4, [rbx +80]              ; mm4 *= tap 5
    paddsw      mm3, mm4                    ; mm3 += mm4


    paddsw      mm3, mm5                    ; mm3 += rounding constant
    psraw       mm3, VP8_FILTER_SHIFT       ; mm3 >>= 7 (arithmetic)
    packuswb    mm3, mm0                    ; clamp to 0..255 and pack to bytes

    movd        [rdi],mm3                   ; store four result bytes
    ; Each iteration reloads five of the six source rows that the previous
    ; iteration already read. The original author's note: since the recon
    ; block should be in cache this shouldn't cost much, but it is
    ; obviously avoidable.
    lea         rdi, [rdi+rax]              ; next destination row (flags untouched)
    dec         rcx                         ; one row done
    jnz         nextrow_cv                  ; loop while rows remain

    pop         rbx                         ; undo the push made before loading the filter

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret                                     ; without this, execution runs into the next function


;-----------------------------------------------------------------------
; void bilinear_predict8x8_mmx
; (
;     unsigned char *src_ptr,
;     int            src_pixels_per_line,
;     int            xoffset,
;     int            yoffset,
;     unsigned char *dst_ptr,
;     int            dst_pitch
; )
;
; Two-pass bilinear prediction of an 8x8 block.
;
; xoffset and yoffset each select a 32-byte entry of
; vp8_bilinear_filters_mmx (offset * 32); an entry is two taps of 16 bytes
; each, of which the first four words are used. The table in this file has
; eight entries, so offsets are expected to be in 0..7.
;
; Horizontal pass (per source row):
;     h[x] = (src[x]*HFilter[0] + src[x+1]*HFilter[1] + rd) >> VP8_FILTER_SHIFT
; Vertical pass (per output row):
;     dst[x] = (h_prev[x]*VFilter[0] + h_cur[x]*VFilter[1] + rd) >> VP8_FILTER_SHIFT
; The previous row's horizontal result is carried between iterations as
; eight packed bytes in mm7.
;
; Nine source rows of nine bytes each are read. The loop ends when the
; destination pointer equals dst_ptr + 8*dst_pitch, so dst_pitch must be
; non-zero.
;
; Register roles inside the loop:
;     rsi = source row, rdx = src_pixels_per_line, rdi = destination row,
;     rcx = end-of-destination sentinel, rax = VFilter,
;     mm0 = 0, mm1 = HFilter[0], mm2 = HFilter[1], mm7 = previous h row.
;-----------------------------------------------------------------------
global sym(vp8_bilinear_predict8x8_mmx)
sym(vp8_bilinear_predict8x8_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ;const short *HFilter = bilinear_filters_mmx[xoffset];
    ;const short *VFilter = bilinear_filters_mmx[yoffset];

    movsxd      rax, dword ptr arg(2)       ; rax = xoffset
    mov         rdi, arg(4)                 ; rdi = dst_ptr

    shl         rax, 5                      ; xoffset * 32 (bytes per table entry)
    lea         rcx, [GLOBAL(sym(vp8_bilinear_filters_mmx))]

    add         rax, rcx                    ; rax = HFilter
    mov         rsi, arg(0)                 ; rsi = src_ptr

    movsxd      rdx, dword ptr arg(5)       ; rdx = dst_pitch (temporarily)
    movq        mm1, [rax]                  ; mm1 = HFilter[0]

    movq        mm2, [rax+16]               ; mm2 = HFilter[1]
    movsxd      rax, dword ptr arg(3)       ; rax = yoffset

    pxor        mm0, mm0                    ; mm0 = 0

    shl         rax, 5                      ; yoffset * 32
    add         rax, rcx                    ; rax = VFilter

    lea         rcx, [rdi+rdx*8]            ; rcx = dst_ptr + 8 * dst_pitch (loop sentinel)
    movsxd      rdx, dword ptr arg(1)       ; rdx = src_pixels_per_line


    ; horizontal pass over the first source row; result is kept in mm7
    movq        mm3, [rsi]                  ; mm3 = bytes src[0..7]
    movq        mm4, mm3                    ; copy for the high half

    punpcklbw   mm3, mm0                    ; mm3 = words src[0..3]
    punpckhbw   mm4, mm0                    ; mm4 = words src[4..7]

    pmullw      mm3, mm1                    ; *= HFilter[0]
    pmullw      mm4, mm1

    movq        mm5, [rsi+1]                ; mm5 = bytes src[1..8]
    movq        mm6, mm5

    punpcklbw   mm5, mm0                    ; mm5 = words src[1..4]
    punpckhbw   mm6, mm0                    ; mm6 = words src[5..8]

    pmullw      mm5, mm2                    ; *= HFilter[1]
    pmullw      mm6, mm2

    paddw       mm3, mm5
    paddw       mm4, mm6

    paddw       mm3, [GLOBAL(rd)]           ; mm3 += rounding constant
    psraw       mm3, VP8_FILTER_SHIFT       ; mm3 >>= 7

    paddw       mm4, [GLOBAL(rd)]
    psraw       mm4, VP8_FILTER_SHIFT

    movq        mm7, mm3
    packuswb    mm7, mm4                    ; mm7 = first h row as 8 bytes

    add         rsi, rdx                    ; next source row
next_row_8x8:
    ; horizontal pass over the current source row
    movq        mm3, [rsi]                  ; mm3 = bytes src[0..7]
    movq        mm4, mm3                    ; copy for the high half

    punpcklbw   mm3, mm0                    ; mm3 = words src[0..3]
    punpckhbw   mm4, mm0                    ; mm4 = words src[4..7]

    pmullw      mm3, mm1                    ; *= HFilter[0]
    pmullw      mm4, mm1

    movq        mm5, [rsi+1]                ; mm5 = bytes src[1..8]
    movq        mm6, mm5

    punpcklbw   mm5, mm0                    ; mm5 = words src[1..4]
    punpckhbw   mm6, mm0                    ; mm6 = words src[5..8]

    pmullw      mm5, mm2                    ; *= HFilter[1]
    pmullw      mm6, mm2

    paddw       mm3, mm5
    paddw       mm4, mm6

    ; previous h row * VFilter[0]
    movq        mm5, mm7
    movq        mm6, mm7

    punpcklbw   mm5, mm0
    punpckhbw   mm6, mm0

    pmullw      mm5, [rax]
    pmullw      mm6, [rax]

    paddw       mm3, [GLOBAL(rd)]           ; finish the current h row: round ...
    psraw       mm3, VP8_FILTER_SHIFT       ; ... and shift

    paddw       mm4, [GLOBAL(rd)]
    psraw       mm4, VP8_FILTER_SHIFT

    movq        mm7, mm3
    packuswb    mm7, mm4                    ; mm7 = current h row, saved for the next iteration


    pmullw      mm3, [rax+16]               ; current h row * VFilter[1]
    pmullw      mm4, [rax+16]

    paddw       mm3, mm5                    ; + previous h row * VFilter[0]
    paddw       mm4, mm6


    paddw       mm3, [GLOBAL(rd)]           ; mm3 += rounding constant
    psraw       mm3, VP8_FILTER_SHIFT       ; mm3 >>= 7

    paddw       mm4, [GLOBAL(rd)]
    psraw       mm4, VP8_FILTER_SHIFT

    packuswb    mm3, mm4                    ; clamp to 0..255, 8 bytes

    movq        [rdi], mm3                  ; store one 8-pixel output row

%if ABI_IS_32BIT
    add         rsi, rdx                    ; next source row
    add         rdi, dword ptr arg(5)       ; rdi += dst_pitch
%else
    movsxd      r8, dword ptr arg(5)        ; r8 = dst_pitch
    add         rsi, rdx                    ; next source row
    add         rdi, r8                     ; rdi += dst_pitch
%endif
    cmp         rdi, rcx                    ; reached dst_ptr + 8 * dst_pitch?
    jne         next_row_8x8

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret                                     ; without this, execution runs into the next function


;-----------------------------------------------------------------------
; void bilinear_predict8x4_mmx
; (
;     unsigned char *src_ptr,
;     int            src_pixels_per_line,
;     int            xoffset,
;     int            yoffset,
;     unsigned char *dst_ptr,
;     int            dst_pitch
; )
;
; Two-pass bilinear prediction of a block 8 pixels wide and 4 rows high.
;
; xoffset and yoffset each select a 32-byte entry of
; vp8_bilinear_filters_mmx (offset * 32); an entry is two taps of 16 bytes
; each, of which the first four words are used. The table in this file has
; eight entries, so offsets are expected to be in 0..7.
;
; Horizontal pass (per source row):
;     h[x] = (src[x]*HFilter[0] + src[x+1]*HFilter[1] + rd) >> VP8_FILTER_SHIFT
; Vertical pass (per output row):
;     dst[x] = (h_prev[x]*VFilter[0] + h_cur[x]*VFilter[1] + rd) >> VP8_FILTER_SHIFT
; The previous row's horizontal result is carried between iterations as
; eight packed bytes in mm7.
;
; Five source rows of nine bytes each are read. The loop ends when the
; destination pointer equals dst_ptr + 4*dst_pitch, so dst_pitch must be
; non-zero.
;
; Register roles inside the loop:
;     rsi = source row, rdx = src_pixels_per_line, rdi = destination row,
;     rcx = end-of-destination sentinel, rax = VFilter,
;     mm0 = 0, mm1 = HFilter[0], mm2 = HFilter[1], mm7 = previous h row.
;-----------------------------------------------------------------------
global sym(vp8_bilinear_predict8x4_mmx)
sym(vp8_bilinear_predict8x4_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ;const short *HFilter = bilinear_filters_mmx[xoffset];
    ;const short *VFilter = bilinear_filters_mmx[yoffset];

    movsxd      rax, dword ptr arg(2)       ; rax = xoffset
    mov         rdi, arg(4)                 ; rdi = dst_ptr

    lea         rcx, [GLOBAL(sym(vp8_bilinear_filters_mmx))]
    shl         rax, 5                      ; xoffset * 32 (bytes per table entry)

    mov         rsi, arg(0)                 ; rsi = src_ptr
    add         rax, rcx                    ; rax = HFilter

    movsxd      rdx, dword ptr arg(5)       ; rdx = dst_pitch (temporarily)
    movq        mm1, [rax]                  ; mm1 = HFilter[0]

    movq        mm2, [rax+16]               ; mm2 = HFilter[1]
    movsxd      rax, dword ptr arg(3)       ; rax = yoffset

    pxor        mm0, mm0                    ; mm0 = 0
    shl         rax, 5                      ; yoffset * 32

    add         rax, rcx                    ; rax = VFilter
    lea         rcx, [rdi+rdx*4]            ; rcx = dst_ptr + 4 * dst_pitch (loop sentinel)

    movsxd      rdx, dword ptr arg(1)       ; rdx = src_pixels_per_line

    ; horizontal pass over the first source row; result is kept in mm7
    movq        mm3, [rsi]                  ; mm3 = bytes src[0..7]
    movq        mm4, mm3                    ; copy for the high half

    punpcklbw   mm3, mm0                    ; mm3 = words src[0..3]
    punpckhbw   mm4, mm0                    ; mm4 = words src[4..7]

    pmullw      mm3, mm1                    ; *= HFilter[0]
    pmullw      mm4, mm1

    movq        mm5, [rsi+1]                ; mm5 = bytes src[1..8]
    movq        mm6, mm5

    punpcklbw   mm5, mm0                    ; mm5 = words src[1..4]
    punpckhbw   mm6, mm0                    ; mm6 = words src[5..8]

    pmullw      mm5, mm2                    ; *= HFilter[1]
    pmullw      mm6, mm2

    paddw       mm3, mm5
    paddw       mm4, mm6

    paddw       mm3, [GLOBAL(rd)]           ; mm3 += rounding constant
    psraw       mm3, VP8_FILTER_SHIFT       ; mm3 >>= 7

    paddw       mm4, [GLOBAL(rd)]
    psraw       mm4, VP8_FILTER_SHIFT

    movq        mm7, mm3
    packuswb    mm7, mm4                    ; mm7 = first h row as 8 bytes

    add         rsi, rdx                    ; next source row
next_row_8x4:
    ; horizontal pass over the current source row
    movq        mm3, [rsi]                  ; mm3 = bytes src[0..7]
    movq        mm4, mm3                    ; copy for the high half

    punpcklbw   mm3, mm0                    ; mm3 = words src[0..3]
    punpckhbw   mm4, mm0                    ; mm4 = words src[4..7]

    pmullw      mm3, mm1                    ; *= HFilter[0]
    pmullw      mm4, mm1

    movq        mm5, [rsi+1]                ; mm5 = bytes src[1..8]
    movq        mm6, mm5

    punpcklbw   mm5, mm0                    ; mm5 = words src[1..4]
    punpckhbw   mm6, mm0                    ; mm6 = words src[5..8]

    pmullw      mm5, mm2                    ; *= HFilter[1]
    pmullw      mm6, mm2

    paddw       mm3, mm5
    paddw       mm4, mm6

    ; previous h row * VFilter[0]
    movq        mm5, mm7
    movq        mm6, mm7

    punpcklbw   mm5, mm0
    punpckhbw   mm6, mm0

    pmullw      mm5, [rax]
    pmullw      mm6, [rax]

    paddw       mm3, [GLOBAL(rd)]           ; finish the current h row: round ...
    psraw       mm3, VP8_FILTER_SHIFT       ; ... and shift

    paddw       mm4, [GLOBAL(rd)]
    psraw       mm4, VP8_FILTER_SHIFT

    movq        mm7, mm3
    packuswb    mm7, mm4                    ; mm7 = current h row, saved for the next iteration


    pmullw      mm3, [rax+16]               ; current h row * VFilter[1]
    pmullw      mm4, [rax+16]

    paddw       mm3, mm5                    ; + previous h row * VFilter[0]
    paddw       mm4, mm6


    paddw       mm3, [GLOBAL(rd)]           ; mm3 += rounding constant
    psraw       mm3, VP8_FILTER_SHIFT       ; mm3 >>= 7

    paddw       mm4, [GLOBAL(rd)]
    psraw       mm4, VP8_FILTER_SHIFT

    packuswb    mm3, mm4                    ; clamp to 0..255, 8 bytes

    movq        [rdi], mm3                  ; store one 8-pixel output row

%if ABI_IS_32BIT
    add         rsi, rdx                    ; next source row
    add         rdi, dword ptr arg(5)       ; rdi += dst_pitch
%else
    movsxd      r8, dword ptr arg(5)        ; r8 = dst_pitch
    add         rsi, rdx                    ; next source row
    add         rdi, r8                     ; rdi += dst_pitch
%endif
    cmp         rdi, rcx                    ; reached dst_ptr + 4 * dst_pitch?
    jne         next_row_8x4

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret                                     ; without this, execution runs into the next function


;-----------------------------------------------------------------------
; void bilinear_predict4x4_mmx
; (
;     unsigned char *src_ptr,
;     int            src_pixels_per_line,
;     int            xoffset,
;     int            yoffset,
;     unsigned char *dst_ptr,
;     int            dst_pitch
; )
;
; Two-pass bilinear prediction of a 4x4 block.
;
; xoffset and yoffset each select a 32-byte entry of
; vp8_bilinear_filters_mmx (offset * 32); an entry is two taps of 16 bytes
; each, of which the first four words are used. The table in this file has
; eight entries, so offsets are expected to be in 0..7.
;
; Horizontal pass (per source row):
;     h[x] = (src[x]*HFilter[0] + src[x+1]*HFilter[1] + rd) >> VP8_FILTER_SHIFT
; Vertical pass (per output row):
;     dst[x] = (h_prev[x]*VFilter[0] + h_cur[x]*VFilter[1] + rd) >> VP8_FILTER_SHIFT
; The previous row's horizontal result is carried between iterations as
; four packed bytes in the low half of mm7.
;
; Five source rows of five bytes each are read. The loop ends when the
; destination pointer equals dst_ptr + 4*dst_pitch, so dst_pitch must be
; non-zero.
;
; Register roles inside the loop:
;     rsi = source row, rdx = src_pixels_per_line, rdi = destination row,
;     rcx = end-of-destination sentinel, rax = VFilter,
;     mm0 = 0, mm1 = HFilter[0], mm2 = HFilter[1], mm7 = previous h row.
;-----------------------------------------------------------------------
global sym(vp8_bilinear_predict4x4_mmx)
sym(vp8_bilinear_predict4x4_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ;const short *HFilter = bilinear_filters_mmx[xoffset];
    ;const short *VFilter = bilinear_filters_mmx[yoffset];

    movsxd      rax, dword ptr arg(2)       ; rax = xoffset
    mov         rdi, arg(4)                 ; rdi = dst_ptr

    lea         rcx, [GLOBAL(sym(vp8_bilinear_filters_mmx))]
    shl         rax, 5                      ; xoffset * 32 (bytes per table entry)

    add         rax, rcx                    ; rax = HFilter
    mov         rsi, arg(0)                 ; rsi = src_ptr

    movsxd      rdx, dword ptr arg(5)       ; rdx = dst_pitch (temporarily)
    movq        mm1, [rax]                  ; mm1 = HFilter[0]

    movq        mm2, [rax+16]               ; mm2 = HFilter[1]
    movsxd      rax, dword ptr arg(3)       ; rax = yoffset

    pxor        mm0, mm0                    ; mm0 = 0
    shl         rax, 5                      ; yoffset * 32

    add         rax, rcx                    ; rax = VFilter
    lea         rcx, [rdi+rdx*4]            ; rcx = dst_ptr + 4 * dst_pitch (loop sentinel)

    movsxd      rdx, dword ptr arg(1)       ; rdx = src_pixels_per_line

    ; horizontal pass over the first source row; result is kept in mm7
    movd        mm3, [rsi]                  ; mm3 = bytes src[0..3]
    punpcklbw   mm3, mm0                    ; mm3 = words src[0..3]

    pmullw      mm3, mm1                    ; *= HFilter[0]
    movd        mm5, [rsi+1]                ; mm5 = bytes src[1..4]

    punpcklbw   mm5, mm0                    ; mm5 = words src[1..4]
    pmullw      mm5, mm2                    ; *= HFilter[1]

    paddw       mm3, mm5
    paddw       mm3, [GLOBAL(rd)]           ; mm3 += rounding constant

    psraw       mm3, VP8_FILTER_SHIFT       ; mm3 >>= 7

    movq        mm7, mm3
    packuswb    mm7, mm0                    ; mm7 = first h row as 4 bytes

    add         rsi, rdx                    ; next source row
next_row_4x4:
    ; horizontal pass over the current source row
    movd        mm3, [rsi]                  ; mm3 = bytes src[0..3]
    punpcklbw   mm3, mm0                    ; mm3 = words src[0..3]

    pmullw      mm3, mm1                    ; *= HFilter[0]
    movd        mm5, [rsi+1]                ; mm5 = bytes src[1..4]

    punpcklbw   mm5, mm0                    ; mm5 = words src[1..4]
    pmullw      mm5, mm2                    ; *= HFilter[1]

    paddw       mm3, mm5

    ; previous h row * VFilter[0]
    movq        mm5, mm7
    punpcklbw   mm5, mm0

    pmullw      mm5, [rax]
    paddw       mm3, [GLOBAL(rd)]           ; finish the current h row: round ...

    psraw       mm3, VP8_FILTER_SHIFT       ; ... and shift
    movq        mm7, mm3

    packuswb    mm7, mm0                    ; mm7 = current h row, saved for the next iteration

    pmullw      mm3, [rax+16]               ; current h row * VFilter[1]
    paddw       mm3, mm5                    ; + previous h row * VFilter[0]


    paddw       mm3, [GLOBAL(rd)]           ; mm3 += rounding constant
    psraw       mm3, VP8_FILTER_SHIFT       ; mm3 >>= 7

    packuswb    mm3, mm0                    ; clamp to 0..255, 4 bytes in the low half
    movd        [rdi], mm3                  ; store one 4-pixel output row

%if ABI_IS_32BIT
    add         rsi, rdx                    ; next source row
    add         rdi, dword ptr arg(5)       ; rdi += dst_pitch
%else
    movsxd      r8, dword ptr arg(5)        ; r8 = dst_pitch
    add         rsi, rdx                    ; next source row
    add         rdi, r8                     ; rdi += dst_pitch
%endif

    cmp         rdi, rcx                    ; reached dst_ptr + 4 * dst_pitch?
    jne         next_row_4x4

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret                                     ; without this, execution runs past the end of the code



SECTION_RODATA
; Rounding constant: four words of 0x40 (64), i.e. half of 1 << VP8_FILTER_SHIFT.
; The filter routines in this file add [GLOBAL(rd)] to each sum before the
; psraw by VP8_FILTER_SHIFT, so the label must exist for them to resolve.
align 16
rd:
    times 4 dw 0x40

; Six-tap filter table: 8 filters x 6 taps. Every tap is replicated into
; eight words (16 bytes), so one filter occupies 96 bytes and tap k of a
; filter sits at byte offset 16*k, matching the [filter + 16*k] loads in
; the six-tap routines in this file.
align 16
global HIDDEN_DATA(sym(vp8_six_tap_mmx))
sym(vp8_six_tap_mmx):
    ; filter 0:  0, 0, 128, 0, 0, 0
    times 8 dw 0
    times 8 dw 0
    times 8 dw 128
    times 8 dw 0
    times 8 dw 0
    times 8 dw 0

    ; filter 1:  0, -6, 123, 12, -1, 0
    times 8 dw 0
    times 8 dw -6
    times 8 dw 123
    times 8 dw 12
    times 8 dw -1
    times 8 dw 0

    ; filter 2:  2, -11, 108, 36, -8, 1
    times 8 dw 2
    times 8 dw -11
    times 8 dw 108
    times 8 dw 36
    times 8 dw -8
    times 8 dw 1

    ; filter 3:  0, -9, 93, 50, -6, 0
    times 8 dw 0
    times 8 dw -9
    times 8 dw 93
    times 8 dw 50
    times 8 dw -6
    times 8 dw 0

    ; filter 4:  3, -16, 77, 77, -16, 3
    times 8 dw 3
    times 8 dw -16
    times 8 dw 77
    times 8 dw 77
    times 8 dw -16
    times 8 dw 3

    ; filter 5:  0, -6, 50, 93, -9, 0
    times 8 dw 0
    times 8 dw -6
    times 8 dw 50
    times 8 dw 93
    times 8 dw -9
    times 8 dw 0

    ; filter 6:  1, -8, 36, 108, -11, 2
    times 8 dw 1
    times 8 dw -8
    times 8 dw 36
    times 8 dw 108
    times 8 dw -11
    times 8 dw 2

    ; filter 7:  0, -1, 12, 123, -6, 0
    times 8 dw 0
    times 8 dw -1
    times 8 dw 12
    times 8 dw 123
    times 8 dw -6
    times 8 dw 0


; Bilinear filter table: 8 entries x 2 taps. Every tap is replicated into
; eight words (16 bytes), so one entry occupies 32 bytes; the bilinear
; routines in this file index it with offset * 32 and read the taps at
; +0 and +16. The two taps of every entry sum to 128.
align 16
global HIDDEN_DATA(sym(vp8_bilinear_filters_mmx))
sym(vp8_bilinear_filters_mmx):
    times 8 dw 128
    times 8 dw 0

    times 8 dw 112
    times 8 dw 16

    times 8 dw 96
    times 8 dw 32

    times 8 dw 80
    times 8 dw 48

    times 8 dw 64
    times 8 dw 64

    times 8 dw 48
    times 8 dw 80

    times 8 dw 32
    times 8 dw 96

    times 8 dw 16
    times 8 dw 112