2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
4 ; Use of this source code is governed by a BSD-style license
5 ; that can be found in the LICENSE file in the root of the source
6 ; tree. An additional intellectual property rights grant can be found
7 ; in the file PATENTS. All contributing project authors may
8 ; be found in the AUTHORS file in the root of the source tree.
12 %include "vpx_ports/x86_abi_support.asm"
; Block dimension constant; not referenced in the lines visible in this excerpt.
14 %define BLOCK_HEIGHT_WIDTH
4
; Filter weight total (128 == 1 << VP8_FILTER_SHIFT); not referenced in the
; lines visible in this excerpt.
15 %define VP8_FILTER_WEIGHT
128
; Right-shift count used with psraw after the rd rounding vector is added in
; the bilinear predictors below (their inline comments describe it as "/= 128").
16 %define VP8_FILTER_SHIFT
7
19 ;/************************************************************************************
20 ; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The
21 ; input pixel array has output_height rows. This routine assumes that output_height is an
22 ; even number. This function handles 8 pixels in the horizontal direction, calculating ONE
23 ; row each iteration to take advantage of the 128-bit operations.
25 ; This is an implementation of some of the SSE optimizations first seen in ffvp8
27 ;*************************************************************************************/
; vp8_filter_block1d8_h6_ssse3
; Horizontal sub-pixel filter that stores 8 output bytes per row (movq to [rdi]).
; Arguments as read below: arg(0) src_ptr, arg(1) src_pixels_per_line,
; arg(2) output_ptr, arg(3) output_pitch, arg(4) output_height,
; arg(5) filter table index.
; Coefficient vectors are read relative to rax: k0_k5 at +0, k1_k3 at +128,
; k2_k4 at +256.
; NOTE(review): this excerpt appears to be missing instructions (nothing visible
; sets esi before the cmp, applies the table index to rax, advances rsi/rdi,
; or updates the flags tested by jnz) -- confirm against the complete file.
28 ;void vp8_filter_block1d8_h6_ssse3
30 ; unsigned char *src_ptr,
31 ; unsigned int src_pixels_per_line,
32 ; unsigned char *output_ptr,
33 ; unsigned int output_pitch,
34 ; unsigned int output_height,
35 ; unsigned int vp8_filter_index
37 global sym
(vp8_filter_block1d8_h6_ssse3
)
38 sym
(vp8_filter_block1d8_h6_ssse3
):
41 SHADOW_ARGS_TO_STACK
6
47 movsxd rdx
, DWORD PTR arg
(5) ;table index
51 movdqa xmm7
, [GLOBAL(rd
)]
53 lea rax
, [GLOBAL(k0_k5
)]
55 mov rdi
, arg
(2) ;output_ptr
; If the first dword at [rax] equals esi, take the path that loads only the
; k2_k4 and k1_k3 vectors (label vp8_filter_block1d8_h4_ssse3).
57 cmp esi, DWORD PTR [rax
]
58 je vp8_filter_block1d8_h4_ssse3
60 movdqa xmm4
, XMMWORD
PTR [rax
] ;k0_k5
61 movdqa xmm5
, XMMWORD
PTR [rax
+256] ;k2_k4
62 movdqa xmm6
, XMMWORD
PTR [rax
+128] ;k1_k3
; rax is reused for the source stride from here on; the table pointer is no
; longer needed once the coefficient vectors are in xmm4-xmm6.
64 mov rsi
, arg
(0) ;src_ptr
65 movsxd rax
, dword ptr arg
(1) ;src_pixels_per_line
66 movsxd rcx
, dword ptr arg
(4) ;output_height
68 movsxd rdx
, dword ptr arg
(3) ;output_pitch
; Row loop of the path that uses all three coefficient vectors. Two 8-byte
; loads at src-2 and src+3 are byte-interleaved so each pair holds the pixels
; for the outermost taps (see the index comment on the punpcklbw line).
72 filter_block1d8_h6_rowloop_ssse3:
73 movq xmm0
, MMWORD
PTR [rsi
- 2] ; -2 -1 0 1 2 3 4 5
75 movq xmm2
, MMWORD
PTR [rsi
+ 3] ; 3 4 5 6 7 8 9 10
77 punpcklbw xmm0
, xmm2
; -2 3 -1 4 0 5 1 6 2 7 3 8 4 9 5 10
83 pshufb xmm1
, [GLOBAL(shuf2bfrom1
)]
85 pshufb xmm2
, [GLOBAL(shuf3bfrom1
)]
103 movq MMWORD
Ptr [rdi
], xmm0
104 jnz filter_block1d8_h6_rowloop_ssse3
; Path taken from the je above: only k2_k4 and k1_k3 are loaded, and the two
; shuffle masks are kept in xmm3/xmm4 instead of being read from memory per row.
114 vp8_filter_block1d8_h4_ssse3:
115 movdqa xmm5
, XMMWORD
PTR [rax
+256] ;k2_k4
116 movdqa xmm6
, XMMWORD
PTR [rax
+128] ;k1_k3
118 movdqa xmm3
, XMMWORD
PTR [GLOBAL(shuf2bfrom1
)]
119 movdqa xmm4
, XMMWORD
PTR [GLOBAL(shuf3bfrom1
)]
121 mov rsi
, arg
(0) ;src_ptr
123 movsxd rax
, dword ptr arg
(1) ;src_pixels_per_line
124 movsxd rcx
, dword ptr arg
(4) ;output_height
126 movsxd rdx
, dword ptr arg
(3) ;output_pitch
130 filter_block1d8_h4_rowloop_ssse3:
131 movq xmm0
, MMWORD
PTR [rsi
- 2] ; -2 -1 0 1 2 3 4 5
133 movq xmm1
, MMWORD
PTR [rsi
+ 3] ; 3 4 5 6 7 8 9 10
135 punpcklbw xmm0
, xmm1
; -2 3 -1 4 0 5 1 6 2 7 3 8 4 9 5 10
157 movq MMWORD
Ptr [rdi
], xmm0
159 jnz filter_block1d8_h4_rowloop_ssse3
; vp8_filter_block1d16_h6_ssse3
; Horizontal sub-pixel filter that stores 16 output bytes per row. Two 8-word
; halves are joined with punpcklqdq and written with movdqa, so each output
; row address must be 16-byte aligned.
; Arguments as read below: arg(0) src_ptr, arg(1) src_pixels_per_line,
; arg(2) output_ptr, arg(3) output_pitch, arg(4) output_height,
; arg(5) filter table index.
; The compare-and-branch to vp8_filter_block1d16_h4_ssse3 is commented out, so
; no jump to that label is visible in this excerpt.
; NOTE(review): instructions appear to be missing from this excerpt (pointer
; advances, loop counter update, packing, epilogue) -- confirm against the
; complete file.
168 ;void vp8_filter_block1d16_h6_ssse3
170 ; unsigned char *src_ptr,
171 ; unsigned int src_pixels_per_line,
172 ; unsigned char *output_ptr,
173 ; unsigned int output_pitch,
174 ; unsigned int output_height,
175 ; unsigned int vp8_filter_index
177 global sym
(vp8_filter_block1d16_h6_ssse3
)
178 sym
(vp8_filter_block1d16_h6_ssse3
):
181 SHADOW_ARGS_TO_STACK
6
188 movsxd rdx
, DWORD PTR arg
(5) ;table index
192 lea rax
, [GLOBAL(k0_k5
)]
195 mov rdi
, arg
(2) ;output_ptr
198 ;; cmp esi, DWORD PTR [rax]
199 ;; je vp8_filter_block1d16_h4_ssse3
201 mov rsi
, arg
(0) ;src_ptr
; Coefficient vectors relative to rax: k0_k5 at +0, k2_k4 at +256, k1_k3 at +128.
203 movdqa xmm4
, XMMWORD
PTR [rax
] ;k0_k5
204 movdqa xmm5
, XMMWORD
PTR [rax
+256] ;k2_k4
205 movdqa xmm6
, XMMWORD
PTR [rax
+128] ;k1_k3
207 movsxd rax
, dword ptr arg
(1) ;src_pixels_per_line
208 movsxd rcx
, dword ptr arg
(4) ;output_height
209 movsxd rdx
, dword ptr arg
(3) ;output_pitch
; Row loop: the first half loads from src-2 / src+3, the second half from
; src+6 / src+11; the rd rounding vector is added to each half (xmm0, xmm3)
; before they are combined into one 16-byte store.
211 filter_block1d16_h6_rowloop_ssse3:
212 movq xmm0
, MMWORD
PTR [rsi
- 2] ; -2 -1 0 1 2 3 4 5
214 movq xmm3
, MMWORD
PTR [rsi
+ 3] ; 3 4 5 6 7 8 9 10
216 punpcklbw xmm0
, xmm3
; -2 3 -1 4 0 5 1 6 2 7 3 8 4 9 5 10
222 pshufb xmm1
, [GLOBAL(shuf2bfrom1
)]
224 pshufb xmm2
, [GLOBAL(shuf3bfrom1
)]
225 movq xmm3
, MMWORD
PTR [rsi
+ 6]
228 movq xmm7
, MMWORD
PTR [rsi
+ 11]
240 paddsw xmm0
, [GLOBAL(rd
)]
242 pshufb xmm1
, [GLOBAL(shuf2bfrom1
)]
243 pshufb xmm2
, [GLOBAL(shuf3bfrom1
)]
256 paddsw xmm3
, [GLOBAL(rd
)]
262 punpcklqdq xmm0
, xmm3
264 movdqa XMMWORD
Ptr [rdi
], xmm0
268 jnz filter_block1d16_h6_rowloop_ssse3
; Variant that loads only k2_k4 and k1_k3 and reads the source with unaligned
; 16-byte loads (movdqu) at src-2 and src+6, shuffled with shuf2b/shuf3b.
278 vp8_filter_block1d16_h4_ssse3:
279 movdqa xmm5
, XMMWORD
PTR [rax
+256] ;k2_k4
280 movdqa xmm6
, XMMWORD
PTR [rax
+128] ;k1_k3
282 mov rsi
, arg
(0) ;src_ptr
283 movsxd rax
, dword ptr arg
(1) ;src_pixels_per_line
284 movsxd rcx
, dword ptr arg
(4) ;output_height
285 movsxd rdx
, dword ptr arg
(3) ;output_pitch
287 filter_block1d16_h4_rowloop_ssse3:
288 movdqu xmm1
, XMMWORD
PTR [rsi
- 2]
291 pshufb xmm1
, [GLOBAL(shuf2b
)]
292 pshufb xmm2
, [GLOBAL(shuf3b
)]
295 movdqu xmm3
, XMMWORD
PTR [rsi
+ 6]
299 pshufb xmm3
, [GLOBAL(shuf3b
)]
300 pshufb xmm0
, [GLOBAL(shuf2b
)]
302 paddsw xmm1
, [GLOBAL(rd
)]
312 paddsw xmm3
, [GLOBAL(rd
)]
316 punpcklqdq xmm1
, xmm3
318 movdqa XMMWORD
Ptr [rdi
], xmm1
322 jnz filter_block1d16_h4_rowloop_ssse3
; vp8_filter_block1d4_h6_ssse3
; Horizontal sub-pixel filter that stores 4 output bytes per row (movd to [rdi]).
; Each row is read with one unaligned 16-byte load at src-2 and rearranged with
; pshufb masks.
; Arguments as read below: arg(0) src_ptr, arg(1) src_pixels_per_line,
; arg(2) output_ptr, arg(3) output_pitch, arg(4) output_height,
; arg(5) filter table index.
; NOTE(review): instructions appear to be missing from this excerpt (nothing
; visible sets esi before the cmp, applies the table index to rax, or updates
; the flags tested by jnz) -- confirm against the complete file.
333 ;void vp8_filter_block1d4_h6_ssse3
335 ; unsigned char *src_ptr,
336 ; unsigned int src_pixels_per_line,
337 ; unsigned char *output_ptr,
338 ; unsigned int output_pitch,
339 ; unsigned int output_height,
340 ; unsigned int vp8_filter_index
342 global sym
(vp8_filter_block1d4_h6_ssse3
)
343 sym
(vp8_filter_block1d4_h6_ssse3
):
346 SHADOW_ARGS_TO_STACK
6
352 movsxd rdx
, DWORD PTR arg
(5) ;table index
356 lea rax
, [GLOBAL(k0_k5
)]
358 movdqa xmm7
, [GLOBAL(rd
)]
; If the first dword at [rax] equals esi, take the path that loads only the
; k2_k4 and k1_k3 vectors (label vp8_filter_block1d4_h4_ssse3).
360 cmp esi, DWORD PTR [rax
]
361 je vp8_filter_block1d4_h4_ssse3
363 movdqa xmm4
, XMMWORD
PTR [rax
] ;k0_k5
364 movdqa xmm5
, XMMWORD
PTR [rax
+256] ;k2_k4
365 movdqa xmm6
, XMMWORD
PTR [rax
+128] ;k1_k3
367 mov rsi
, arg
(0) ;src_ptr
368 mov rdi
, arg
(2) ;output_ptr
369 movsxd rax
, dword ptr arg
(1) ;src_pixels_per_line
370 movsxd rcx
, dword ptr arg
(4) ;output_height
372 movsxd rdx
, dword ptr arg
(3) ;output_pitch
; Row loop: the shuffle masks shuf1b/shuf2b/shuf3b are read from memory each
; iteration.
375 filter_block1d4_h6_rowloop_ssse3:
376 movdqu xmm0
, XMMWORD
PTR [rsi
- 2]
379 pshufb xmm0
, [GLOBAL(shuf1b
)]
382 pshufb xmm1
, [GLOBAL(shuf2b
)]
384 pshufb xmm2
, [GLOBAL(shuf3b
)]
399 movd
DWORD PTR [rdi
], xmm0
403 jnz filter_block1d4_h6_rowloop_ssse3
; Path taken from the je above: shuf2b and shuf3b are preloaded into xmm0 and
; xmm3 so the row loop shuffles register-to-register.
413 vp8_filter_block1d4_h4_ssse3:
414 movdqa xmm5
, XMMWORD
PTR [rax
+256] ;k2_k4
415 movdqa xmm6
, XMMWORD
PTR [rax
+128] ;k1_k3
416 movdqa xmm0
, XMMWORD
PTR [GLOBAL(shuf2b
)]
417 movdqa xmm3
, XMMWORD
PTR [GLOBAL(shuf3b
)]
419 mov rsi
, arg
(0) ;src_ptr
420 mov rdi
, arg
(2) ;output_ptr
421 movsxd rax
, dword ptr arg
(1) ;src_pixels_per_line
422 movsxd rcx
, dword ptr arg
(4) ;output_height
424 movsxd rdx
, dword ptr arg
(3) ;output_pitch
426 filter_block1d4_h4_rowloop_ssse3:
427 movdqu xmm1
, XMMWORD
PTR [rsi
- 2]
430 pshufb xmm1
, xmm0
;;[GLOBAL(shuf2b)]
431 pshufb xmm2
, xmm3
;;[GLOBAL(shuf3b)]
444 movd
DWORD PTR [rdi
], xmm1
448 jnz filter_block1d4_h4_rowloop_ssse3
; vp8_filter_block1d16_v6_ssse3
; Vertical sub-pixel filter for 16-byte-wide rows, processed as two 8-byte
; column halves (offsets +0 and +8). Source rows are labelled A..F in the
; inline comments and are byte-interleaved in pairs (B with D, C with E,
; A with F).
; Arguments as read below: arg(0) src_ptr, arg(1) src_pitch, arg(2) output_ptr,
; arg(3) out_pitch, arg(4) output_height, arg(5) filter table index.
; NOTE(review): instructions appear to be missing from this excerpt. Nothing
; visible sets esi before the cmp, and rows D and F are addressed through rax
; although the only visible write to rax is the table pointer; presumably rax
; is repointed at a source row in lines not shown -- confirm.
; NOTE(review): out_pitch is both loaded into r8 and added to rdi from arg(3)
; as a memory operand; presumably these are alternatives selected by an ABI
; conditional that is not visible here -- confirm.
460 ;void vp8_filter_block1d16_v6_ssse3
462 ; unsigned char *src_ptr,
463 ; unsigned int src_pitch,
464 ; unsigned char *output_ptr,
465 ; unsigned int out_pitch,
466 ; unsigned int output_height,
467 ; unsigned int vp8_filter_index
469 global sym
(vp8_filter_block1d16_v6_ssse3
)
470 sym
(vp8_filter_block1d16_v6_ssse3
):
473 SHADOW_ARGS_TO_STACK
6
479 movsxd rdx
, DWORD PTR arg
(5) ;table index
483 lea rax
, [GLOBAL(k0_k5
)]
; If the first dword at [rax] equals esi, take the path that loads only the
; k2_k4 and k1_k3 vectors (label vp8_filter_block1d16_v4_ssse3).
486 cmp esi, DWORD PTR [rax
]
487 je vp8_filter_block1d16_v4_ssse3
489 movdqa xmm5
, XMMWORD
PTR [rax
] ;k0_k5
490 movdqa xmm6
, XMMWORD
PTR [rax
+256] ;k2_k4
491 movdqa xmm7
, XMMWORD
PTR [rax
+128] ;k1_k3
493 mov rsi
, arg
(0) ;src_ptr
494 movsxd rdx
, DWORD PTR arg
(1) ;pixels_per_line
495 mov rdi
, arg
(2) ;output_ptr
498 movsxd r8
, DWORD PTR arg
(3) ;out_pitch
501 movsxd rcx
, DWORD PTR arg
(4) ;output_height
; Row loop: left 8 columns first (stored to [rdi]), then the right 8 columns
; (stored to [rdi+8]); rd is added to xmm2 before each store.
505 vp8_filter_block1d16_v6_ssse3_loop:
506 movq xmm1
, MMWORD
PTR [rsi
] ;A
507 movq xmm2
, MMWORD
PTR [rsi
+ rdx
] ;B
508 movq xmm3
, MMWORD
PTR [rsi
+ rdx
* 2] ;C
509 movq xmm4
, MMWORD
PTR [rax
+ rdx
* 2] ;D
510 movq xmm0
, MMWORD
PTR [rsi
+ rdx
* 4] ;E
512 punpcklbw xmm2
, xmm4
;B D
513 punpcklbw xmm3
, xmm0
;C E
515 movq xmm0
, MMWORD
PTR [rax
+ rdx
* 4] ;F
518 punpcklbw xmm1
, xmm0
;A F
524 paddsw xmm2
, [GLOBAL(rd
)]
528 movq MMWORD
PTR [rdi
], xmm2
;store the results
530 movq xmm1
, MMWORD
PTR [rsi
+ 8] ;A
531 movq xmm2
, MMWORD
PTR [rsi
+ rdx
+ 8] ;B
532 movq xmm3
, MMWORD
PTR [rsi
+ rdx
* 2 + 8] ;C
533 movq xmm4
, MMWORD
PTR [rax
+ rdx
* 2 + 8] ;D
534 movq xmm0
, MMWORD
PTR [rsi
+ rdx
* 4 + 8] ;E
536 punpcklbw xmm2
, xmm4
;B D
537 punpcklbw xmm3
, xmm0
;C E
539 movq xmm0
, MMWORD
PTR [rax
+ rdx
* 4 + 8] ;F
541 punpcklbw xmm1
, xmm0
;A F
551 paddsw xmm2
, [GLOBAL(rd
)]
555 movq MMWORD
PTR [rdi
+8], xmm2
558 add rdi
, DWORD PTR arg
(3) ;out_pitch
563 jnz vp8_filter_block1d16_v6_ssse3_loop
; Path taken from the je above: rows A and F are not loaded; both column
; halves are joined with punpcklqdq and written with one movdqa, so each
; output row address must be 16-byte aligned on this path.
573 vp8_filter_block1d16_v4_ssse3:
574 movdqa xmm6
, XMMWORD
PTR [rax
+256] ;k2_k4
575 movdqa xmm7
, XMMWORD
PTR [rax
+128] ;k1_k3
577 mov rsi
, arg
(0) ;src_ptr
578 movsxd rdx
, DWORD PTR arg
(1) ;pixels_per_line
579 mov rdi
, arg
(2) ;output_ptr
582 movsxd r8
, DWORD PTR arg
(3) ;out_pitch
585 movsxd rcx
, DWORD PTR arg
(4) ;output_height
588 vp8_filter_block1d16_v4_ssse3_loop:
589 movq xmm2
, MMWORD
PTR [rsi
+ rdx
] ;B
590 movq xmm3
, MMWORD
PTR [rsi
+ rdx
* 2] ;C
591 movq xmm4
, MMWORD
PTR [rax
+ rdx
* 2] ;D
592 movq xmm0
, MMWORD
PTR [rsi
+ rdx
* 4] ;E
594 punpcklbw xmm2
, xmm4
;B D
595 punpcklbw xmm3
, xmm0
;C E
599 movq xmm5
, MMWORD
PTR [rsi
+ rdx
+ 8] ;B
600 movq xmm1
, MMWORD
PTR [rsi
+ rdx
* 2 + 8] ;C
601 movq xmm4
, MMWORD
PTR [rax
+ rdx
* 2 + 8] ;D
602 movq xmm0
, MMWORD
PTR [rsi
+ rdx
* 4 + 8] ;E
604 paddsw xmm2
, [GLOBAL(rd
)]
609 punpcklbw xmm5
, xmm4
;B D
610 punpcklbw xmm1
, xmm0
;C E
615 movdqa xmm4
, [GLOBAL(rd
)]
625 punpcklqdq xmm2
, xmm5
627 movdqa XMMWORD
PTR [rdi
], xmm2
630 add rdi
, DWORD PTR arg
(3) ;out_pitch
635 jnz vp8_filter_block1d16_v4_ssse3_loop
; vp8_filter_block1d8_v6_ssse3
; Vertical sub-pixel filter that stores 8 output bytes per row (movq to [rdi]).
; Source rows are labelled A..F in the inline comments and are byte-interleaved
; in pairs (B with D, C with E, A with F).
; Arguments as read below: arg(0) src_ptr, arg(1) src_pitch, arg(2) output_ptr,
; arg(3) out_pitch, arg(4) output_height, arg(5) filter table index.
; rdx is loaded with the table index and then overwritten with pixels_per_line;
; presumably the index is consumed by instructions between the two loads that
; are not visible in this excerpt -- confirm.
; NOTE(review): nothing visible sets esi before the cmp, and rows D and F are
; addressed through rax although the only visible write to rax is the table
; pointer -- confirm against the complete file.
645 ;void vp8_filter_block1d8_v6_ssse3
647 ; unsigned char *src_ptr,
648 ; unsigned int src_pitch,
649 ; unsigned char *output_ptr,
650 ; unsigned int out_pitch,
651 ; unsigned int output_height,
652 ; unsigned int vp8_filter_index
654 global sym
(vp8_filter_block1d8_v6_ssse3
)
655 sym
(vp8_filter_block1d8_v6_ssse3
):
658 SHADOW_ARGS_TO_STACK
6
664 movsxd rdx
, DWORD PTR arg
(5) ;table index
668 lea rax
, [GLOBAL(k0_k5
)]
671 movsxd rdx
, DWORD PTR arg
(1) ;pixels_per_line
672 mov rdi
, arg
(2) ;output_ptr
674 movsxd r8
, DWORD PTR arg
(3) ; out_pitch
676 movsxd rcx
, DWORD PTR arg
(4) ;[output_height]
; If the first dword at [rax] equals esi, take the path that loads only the
; k2_k4 and k1_k3 vectors (label vp8_filter_block1d8_v4_ssse3).
678 cmp esi, DWORD PTR [rax
]
679 je vp8_filter_block1d8_v4_ssse3
681 movdqa xmm5
, XMMWORD
PTR [rax
] ;k0_k5
682 movdqa xmm6
, XMMWORD
PTR [rax
+256] ;k2_k4
683 movdqa xmm7
, XMMWORD
PTR [rax
+128] ;k1_k3
685 mov rsi
, arg
(0) ;src_ptr
; Row loop: xmm4 is reloaded with the rd rounding vector every iteration
; because it is also used as the temporary for row D.
690 vp8_filter_block1d8_v6_ssse3_loop:
691 movq xmm1
, MMWORD
PTR [rsi
] ;A
692 movq xmm2
, MMWORD
PTR [rsi
+ rdx
] ;B
693 movq xmm3
, MMWORD
PTR [rsi
+ rdx
* 2] ;C
694 movq xmm4
, MMWORD
PTR [rax
+ rdx
* 2] ;D
695 movq xmm0
, MMWORD
PTR [rsi
+ rdx
* 4] ;E
697 punpcklbw xmm2
, xmm4
;B D
698 punpcklbw xmm3
, xmm0
;C E
700 movq xmm0
, MMWORD
PTR [rax
+ rdx
* 4] ;F
701 movdqa xmm4
, [GLOBAL(rd
)]
704 punpcklbw xmm1
, xmm0
;A F
717 movq MMWORD
PTR [rdi
], xmm2
720 add rdi
, DWORD PTR arg
(3) ;[out_pitch]
725 jnz vp8_filter_block1d8_v6_ssse3_loop
; Path taken from the je above: rows A and F are not loaded, and rd is kept
; in xmm5 for the whole loop.
735 vp8_filter_block1d8_v4_ssse3:
736 movdqa xmm6
, XMMWORD
PTR [rax
+256] ;k2_k4
737 movdqa xmm7
, XMMWORD
PTR [rax
+128] ;k1_k3
738 movdqa xmm5
, [GLOBAL(rd
)]
740 mov rsi
, arg
(0) ;src_ptr
745 vp8_filter_block1d8_v4_ssse3_loop:
746 movq xmm2
, MMWORD
PTR [rsi
+ rdx
] ;B
747 movq xmm3
, MMWORD
PTR [rsi
+ rdx
* 2] ;C
748 movq xmm4
, MMWORD
PTR [rax
+ rdx
* 2] ;D
749 movq xmm0
, MMWORD
PTR [rsi
+ rdx
* 4] ;E
751 punpcklbw xmm2
, xmm4
;B D
752 punpcklbw xmm3
, xmm0
;C E
765 movq MMWORD
PTR [rdi
], xmm2
768 add rdi
, DWORD PTR arg
(3) ;[out_pitch]
773 jnz vp8_filter_block1d8_v4_ssse3_loop
; vp8_filter_block1d4_v6_ssse3
; Vertical sub-pixel filter that stores 4 output bytes per row (movd to [rdi]).
; Unlike the wider variants this one works in the 64-bit MMX registers
; (mm0-mm7) and reads only the low 8 bytes of each coefficient table entry.
; Arguments as read below: arg(0) src_ptr, arg(1) src_pitch, arg(2) output_ptr,
; arg(3) out_pitch, arg(4) output_height, arg(5) filter table index.
; rdx is loaded with the table index and then overwritten with pixels_per_line;
; presumably the index is consumed by instructions between the two loads that
; are not visible in this excerpt -- confirm.
; NOTE(review): nothing visible sets esi before the cmp, and rows D and F are
; addressed through rax although the only visible write to rax is the table
; pointer -- confirm against the complete file.
782 ;void vp8_filter_block1d4_v6_ssse3
784 ; unsigned char *src_ptr,
785 ; unsigned int src_pitch,
786 ; unsigned char *output_ptr,
787 ; unsigned int out_pitch,
788 ; unsigned int output_height,
789 ; unsigned int vp8_filter_index
791 global sym
(vp8_filter_block1d4_v6_ssse3
)
792 sym
(vp8_filter_block1d4_v6_ssse3
):
795 SHADOW_ARGS_TO_STACK
6
801 movsxd rdx
, DWORD PTR arg
(5) ;table index
805 lea rax
, [GLOBAL(k0_k5
)]
808 movsxd rdx
, DWORD PTR arg
(1) ;pixels_per_line
809 mov rdi
, arg
(2) ;output_ptr
811 movsxd r8
, DWORD PTR arg
(3) ; out_pitch
813 movsxd rcx
, DWORD PTR arg
(4) ;[output_height]
; If the first dword at [rax] equals esi, take the path that loads only the
; k2_k4 and k1_k3 vectors (label vp8_filter_block1d4_v4_ssse3).
815 cmp esi, DWORD PTR [rax
]
816 je vp8_filter_block1d4_v4_ssse3
818 movq mm5
, MMWORD
PTR [rax
] ;k0_k5
819 movq mm6
, MMWORD
PTR [rax
+256] ;k2_k4
820 movq mm7
, MMWORD
PTR [rax
+128] ;k1_k3
822 mov rsi
, arg
(0) ;src_ptr
; Row loop: mm4 is reloaded with the low 8 bytes of rd every iteration because
; it is also used as the temporary for row D.
827 vp8_filter_block1d4_v6_ssse3_loop:
828 movd mm1
, DWORD PTR [rsi
] ;A
829 movd mm2
, DWORD PTR [rsi
+ rdx
] ;B
830 movd mm3
, DWORD PTR [rsi
+ rdx
* 2] ;C
831 movd mm4
, DWORD PTR [rax
+ rdx
* 2] ;D
832 movd mm0
, DWORD PTR [rsi
+ rdx
* 4] ;E
834 punpcklbw mm2
, mm4
;B D
835 punpcklbw mm3
, mm0
;C E
837 movd mm0
, DWORD PTR [rax
+ rdx
* 4] ;F
839 movq mm4
, [GLOBAL(rd
)]
842 punpcklbw mm1
, mm0
;A F
855 movd
DWORD PTR [rdi
], mm2
858 add rdi
, DWORD PTR arg
(3) ;[out_pitch]
863 jnz vp8_filter_block1d4_v6_ssse3_loop
; Path taken from the je above: rows A and F are not loaded, and rd is kept
; in mm5 for the whole loop.
873 vp8_filter_block1d4_v4_ssse3:
874 movq mm6
, MMWORD
PTR [rax
+256] ;k2_k4
875 movq mm7
, MMWORD
PTR [rax
+128] ;k1_k3
876 movq mm5
, MMWORD
PTR [GLOBAL(rd
)]
878 mov rsi
, arg
(0) ;src_ptr
883 vp8_filter_block1d4_v4_ssse3_loop:
884 movd mm2
, DWORD PTR [rsi
+ rdx
] ;B
885 movd mm3
, DWORD PTR [rsi
+ rdx
* 2] ;C
886 movd mm4
, DWORD PTR [rax
+ rdx
* 2] ;D
887 movd mm0
, DWORD PTR [rsi
+ rdx
* 4] ;E
889 punpcklbw mm2
, mm4
;B D
890 punpcklbw mm3
, mm0
;C E
903 movd
DWORD PTR [rdi
], mm2
906 add rdi
, DWORD PTR arg
(3) ;[out_pitch]
911 jnz vp8_filter_block1d4_v4_ssse3_loop
; vp8_bilinear_predict16x16_ssse3
; Bilinear prediction that writes 16-byte rows to dst_ptr.
; Arguments as read below: arg(0) src_ptr, arg(1) src_pixels_per_line,
; arg(2) xoffset, arg(3) yoffset, arg(4) dst_ptr, arg(5) dst_pitch.
; xoffset and yoffset are each compared with 0 so the matching filter pass can
; be skipped (per the inline comments). Filter vectors are addressed relative
; to vp8_bilinear_filters_ssse3 (rcx).
; Every filtered word has the rd vector added and is then arithmetically
; shifted right by VP8_FILTER_SHIFT.
; Rows are stored with movdqa, so each destination row address must be
; 16-byte aligned.
; NOTE(review): instructions and labels appear to be missing from this excerpt
; (the conditional branches after each cmp, offset scaling, the filter loads
; into xmm1/xmm2 on the two-pass path, loop-end compares, epilogue) -- confirm
; against the complete file.
921 ;void vp8_bilinear_predict16x16_ssse3
923 ; unsigned char *src_ptr,
924 ; int src_pixels_per_line,
927 ; unsigned char *dst_ptr,
930 global sym
(vp8_bilinear_predict16x16_ssse3
)
931 sym
(vp8_bilinear_predict16x16_ssse3
):
934 SHADOW_ARGS_TO_STACK
6
941 lea rcx
, [GLOBAL(vp8_bilinear_filters_ssse3
)]
942 movsxd rax
, dword ptr arg
(2) ; xoffset
944 cmp rax
, 0 ; skip first_pass filter if xoffset=0
948 lea rax
, [rax
+ rcx
] ; HFilter
950 mov rdi
, arg
(4) ; dst_ptr
951 mov rsi
, arg
(0) ; src_ptr
952 movsxd rdx
, dword ptr arg
(5) ; dst_pitch
956 movsxd rax
, dword ptr arg
(3) ; yoffset
958 cmp rax
, 0 ; skip second_pass filter if yoffset=0
962 lea rax
, [rax
+ rcx
] ; VFilter
966 movsxd rdx
, dword ptr arg
(1) ; src_pixels_per_line
971 movsxd r8
, dword ptr arg
(5) ; dst_pitch
; Horizontal filtering of one source row: each 8-byte load is interleaved with
; the same row shifted by one byte, then multiplied-and-added against xmm1.
973 movq xmm3
, [rsi
] ; 00 01 02 03 04 05 06 07
974 movq xmm5
, [rsi
+1] ; 01 02 03 04 05 06 07 08
976 punpcklbw xmm3
, xmm5
; 00 01 01 02 02 03 03 04 04 05 05 06 06 07 07 08
977 movq xmm4
, [rsi
+8] ; 08 09 10 11 12 13 14 15
979 movq xmm5
, [rsi
+9] ; 09 10 11 12 13 14 15 16
981 lea rsi
, [rsi
+ rdx
] ; next line
983 pmaddubsw xmm3
, xmm1
; 00 02 04 06 08 10 12 14
985 punpcklbw xmm4
, xmm5
; 08 09 09 10 10 11 11 12 12 13 13 14 14 15 15 16
986 pmaddubsw xmm4
, xmm1
; 01 03 05 07 09 11 13 15
988 paddw xmm3
, [GLOBAL(rd
)] ; xmm3 += round value
989 psraw xmm3
, VP8_FILTER_SHIFT
; xmm3 /= 128
991 paddw xmm4
, [GLOBAL(rd
)] ; xmm4 += round value
992 psraw xmm4
, VP8_FILTER_SHIFT
; xmm4 /= 128
995 packuswb xmm7
, xmm4
; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
; Same horizontal filtering for the following source row (result in xmm6/xmm4),
; after which pairs of rows are interleaved and filtered against xmm2.
998 movq xmm6
, [rsi
] ; 00 01 02 03 04 05 06 07
999 movq xmm5
, [rsi
+1] ; 01 02 03 04 05 06 07 08
1001 punpcklbw xmm6
, xmm5
1002 movq xmm4
, [rsi
+8] ; 08 09 10 11 12 13 14 15
1004 movq xmm5
, [rsi
+9] ; 09 10 11 12 13 14 15 16
1005 lea rsi
, [rsi
+ rdx
] ; next line
1007 pmaddubsw xmm6
, xmm1
1009 punpcklbw xmm4
, xmm5
1010 pmaddubsw xmm4
, xmm1
1012 paddw xmm6
, [GLOBAL(rd
)] ; xmm6 += round value
1013 psraw xmm6
, VP8_FILTER_SHIFT
; xmm6 /= 128
1015 paddw xmm4
, [GLOBAL(rd
)] ; xmm4 += round value
1016 psraw xmm4
, VP8_FILTER_SHIFT
; xmm4 /= 128
1021 punpcklbw xmm5
, xmm6
1022 pmaddubsw xmm5
, xmm2
1024 punpckhbw xmm7
, xmm6
1025 pmaddubsw xmm7
, xmm2
1027 paddw xmm5
, [GLOBAL(rd
)] ; xmm5 += round value
1028 psraw xmm5
, VP8_FILTER_SHIFT
; xmm5 /= 128
1030 paddw xmm7
, [GLOBAL(rd
)] ; xmm7 += round value
1031 psraw xmm7
, VP8_FILTER_SHIFT
; xmm7 /= 128
1036 movdqa
[rdi
], xmm5
; store the results in the destination
1038 add rdi
, DWORD PTR arg
(5) ; dst_pitch
; The sequence below reloads yoffset, loads the vertical filter into xmm1 and
; filters vertically adjacent rows (row with row+1, row+1 with row+2) without a
; horizontal pass. rcx is set to rdi + 16*dst_pitch by the two lea instructions.
; NOTE(review): a label presumably precedes this point in the complete file.
1049 movsxd rax
, dword ptr arg
(3) ; yoffset
1051 lea rax
, [rax
+ rcx
] ; VFilter
1053 mov rdi
, arg
(4) ; dst_ptr
1054 mov rsi
, arg
(0) ; src_ptr
1055 movsxd rdx
, dword ptr arg
(5) ; dst_pitch
1057 movdqa xmm1
, [rax
] ; VFilter
1059 lea rcx
, [rdi
+rdx
*8]
1060 lea rcx
, [rcx
+rdx
*8]
1061 movsxd rax
, dword ptr arg
(1) ; src_pixels_per_line
1063 ; get the first horizontal line done
1064 movq xmm4
, [rsi
] ; load row 0
1065 movq xmm2
, [rsi
+ 8] ; load row 0
1067 lea rsi
, [rsi
+ rax
] ; next line
1069 movq xmm3
, [rsi
] ; load row + 1
1070 movq xmm5
, [rsi
+ 8] ; load row + 1
1072 punpcklbw xmm4
, xmm3
1073 punpcklbw xmm2
, xmm5
1075 pmaddubsw xmm4
, xmm1
1076 movq xmm7
, [rsi
+ rax
] ; load row + 2
1078 pmaddubsw xmm2
, xmm1
1079 movq xmm6
, [rsi
+ rax
+ 8] ; load row + 2
1081 punpcklbw xmm3
, xmm7
1082 punpcklbw xmm5
, xmm6
1084 pmaddubsw xmm3
, xmm1
1085 paddw xmm4
, [GLOBAL(rd
)]
1087 pmaddubsw xmm5
, xmm1
1088 paddw xmm2
, [GLOBAL(rd
)]
1090 psraw xmm4
, VP8_FILTER_SHIFT
1091 psraw xmm2
, VP8_FILTER_SHIFT
1094 paddw xmm3
, [GLOBAL(rd
)]
1096 movdqa
[rdi
], xmm4
; store row 0
1097 paddw xmm5
, [GLOBAL(rd
)]
1099 psraw xmm3
, VP8_FILTER_SHIFT
1100 psraw xmm5
, VP8_FILTER_SHIFT
1105 movdqa
[rdi
+ rdx
],xmm3
; store row 1
1106 lea rsi
, [rsi
+ 2*rax
]
1109 lea rdi
, [rdi
+ 2*rdx
]
; The sequence below filters horizontally only: each row is interleaved with
; itself shifted by one byte ([rsi] with [rsi+1], [rsi+8] with [rsi+9]) and
; multiplied-and-added against xmm1. rcx is again rdi + 16*dst_pitch.
; NOTE(review): a label presumably precedes this point in the complete file.
1117 lea rcx
, [rdi
+rdx
*8]
1118 lea rcx
, [rcx
+rdx
*8]
1119 movsxd rax
, dword ptr arg
(1) ; src_pixels_per_line
1122 movq xmm2
, [rsi
] ; 00 01 02 03 04 05 06 07
1123 movq xmm4
, [rsi
+1] ; 01 02 03 04 05 06 07 08
1125 punpcklbw xmm2
, xmm4
1126 movq xmm3
, [rsi
+8] ; 08 09 10 11 12 13 14 15
1128 pmaddubsw xmm2
, xmm1
1129 movq xmm4
, [rsi
+9] ; 09 10 11 12 13 14 15 16
1131 lea rsi
, [rsi
+ rax
] ; next line
1132 punpcklbw xmm3
, xmm4
1134 pmaddubsw xmm3
, xmm1
1137 paddw xmm2
, [GLOBAL(rd
)]
1141 psraw xmm2
, VP8_FILTER_SHIFT
1143 punpcklbw xmm5
, xmm7
1146 paddw xmm3
, [GLOBAL(rd
)]
1147 pmaddubsw xmm5
, xmm1
1149 psraw xmm3
, VP8_FILTER_SHIFT
1150 punpcklbw xmm6
, xmm7
1153 pmaddubsw xmm6
, xmm1
1155 movdqa
[rdi
], xmm2
; store the results in the destination
1156 paddw xmm5
, [GLOBAL(rd
)]
1158 lea rdi
, [rdi
+ rdx
] ; dst_pitch
1159 psraw xmm5
, VP8_FILTER_SHIFT
1161 paddw xmm6
, [GLOBAL(rd
)]
1162 psraw xmm6
, VP8_FILTER_SHIFT
1165 lea rsi
, [rsi
+ rax
] ; next line
1167 movdqa
[rdi
], xmm5
; store the results in the destination
1168 lea rdi
, [rdi
+ rdx
] ; dst_pitch
; vp8_bilinear_predict8x8_ssse3
; Bilinear prediction that writes 8-byte rows to dst_ptr (movq stores).
; Arguments as read below: arg(0) src_ptr, arg(1) src_pixels_per_line,
; arg(2) xoffset, arg(3) yoffset, arg(4) dst_ptr, arg(5) dst_pitch.
; The source rows are first copied with unaligned 16-byte loads into a 144-byte
; stack buffer (9 slots of 16 bytes at rsp+0 .. rsp+128); the filter passes
; then read from that buffer. rsp itself is advanced as the row cursor, in
; steps of 16 on the two-pass path and 4*16 on the final sequence.
; The buffer is written with movdqa, so rsp must be 16-byte aligned after the
; reservation; the instruction establishing that alignment is not visible in
; this excerpt.
; Every filtered word has the rd vector added and is then arithmetically
; shifted right by VP8_FILTER_SHIFT.
; NOTE(review): instructions and labels appear to be missing from this excerpt
; (several row loads and rsi advances, the branches after each cmp, filter
; loads, loop-end compares, stack restore on every path, epilogue) -- confirm
; against the complete file.
1184 ;void vp8_bilinear_predict8x8_ssse3
1186 ; unsigned char *src_ptr,
1187 ; int src_pixels_per_line,
1190 ; unsigned char *dst_ptr,
1193 global sym
(vp8_bilinear_predict8x8_ssse3
)
1194 sym
(vp8_bilinear_predict8x8_ssse3
):
1197 SHADOW_ARGS_TO_STACK
6
1205 sub rsp
, 144 ; reserve 144 bytes
1207 lea rcx
, [GLOBAL(vp8_bilinear_filters_ssse3
)]
1209 mov rsi
, arg
(0) ;src_ptr
1210 movsxd rdx
, dword ptr arg
(1) ;src_pixels_per_line
1212 ;Read 9-line unaligned data in and put them on stack. This gives a big
; rax = 3 * src_pixels_per_line.
1215 lea rax
, [rdx
+ rdx
*2]
1216 movdqu xmm1
, [rsi
+rdx
]
1217 movdqu xmm2
, [rsi
+rdx
*2]
1220 movdqu xmm4
, [rsi
+rdx
]
1221 movdqu xmm5
, [rsi
+rdx
*2]
1224 movdqu xmm7
, [rsi
+rdx
]
; xmm0 is spilled to slot 0 before being reused for the ninth row load.
1226 movdqa XMMWORD
PTR [rsp
], xmm0
1228 movdqu xmm0
, [rsi
+rdx
*2]
1230 movdqa XMMWORD
PTR [rsp
+16], xmm1
1231 movdqa XMMWORD
PTR [rsp
+32], xmm2
1232 movdqa XMMWORD
PTR [rsp
+48], xmm3
1233 movdqa XMMWORD
PTR [rsp
+64], xmm4
1234 movdqa XMMWORD
PTR [rsp
+80], xmm5
1235 movdqa XMMWORD
PTR [rsp
+96], xmm6
1236 movdqa XMMWORD
PTR [rsp
+112], xmm7
1237 movdqa XMMWORD
PTR [rsp
+128], xmm0
1239 movsxd rax
, dword ptr arg
(2) ; xoffset
1240 cmp rax
, 0 ; skip first_pass filter if xoffset=0
1244 add rax
, rcx
; HFilter
1246 mov rdi
, arg
(4) ; dst_ptr
1247 movsxd rdx
, dword ptr arg
(5) ; dst_pitch
1251 movsxd rax
, dword ptr arg
(3) ; yoffset
1252 cmp rax
, 0 ; skip second_pass filter if yoffset=0
1256 lea rax
, [rax
+ rcx
] ; VFilter
; rcx = rdi + 8*dst_pitch.
1258 lea rcx
, [rdi
+rdx
*8]
1262 ; get the first horizontal line done
1263 movdqa xmm3
, [rsp
] ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
1264 movdqa xmm5
, xmm3
; 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 xx
1267 lea rsp
, [rsp
+ 16] ; next line
1269 punpcklbw xmm3
, xmm5
; 00 01 01 02 02 03 03 04 04 05 05 06 06 07 07 08
1270 pmaddubsw xmm3
, xmm0
; 00 02 04 06 08 10 12 14
1272 paddw xmm3
, [GLOBAL(rd
)] ; xmm3 += round value
1273 psraw xmm3
, VP8_FILTER_SHIFT
; xmm3 /= 128
1276 packuswb xmm7
, xmm7
; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
; Next buffered row: horizontal filter against xmm0 (result in xmm6), then the
; previous and current rows are interleaved and filtered against xmm1.
1279 movdqa xmm6
, [rsp
] ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
1280 lea rsp
, [rsp
+ 16] ; next line
1286 punpcklbw xmm6
, xmm5
1287 pmaddubsw xmm6
, xmm0
1289 paddw xmm6
, [GLOBAL(rd
)] ; xmm6 += round value
1290 psraw xmm6
, VP8_FILTER_SHIFT
; xmm6 /= 128
1294 punpcklbw xmm7
, xmm6
1295 pmaddubsw xmm7
, xmm1
1297 paddw xmm7
, [GLOBAL(rd
)] ; xmm7 += round value
1298 psraw xmm7
, VP8_FILTER_SHIFT
; xmm7 /= 128
1302 movq
[rdi
], xmm7
; store the results in the destination
1303 lea rdi
, [rdi
+ rdx
]
; The sequence below applies only the vertical filter (xmm0): the nine buffered
; rows are loaded from rsp+0 .. rsp+128, adjacent rows are interleaved
; pairwise and filtered, and the 144-byte buffer is released by the final lea.
; NOTE(review): a label presumably precedes this point in the complete file.
1313 movsxd rax
, dword ptr arg
(3) ; yoffset
1315 lea rax
, [rax
+ rcx
] ; VFilter
1317 mov rdi
, arg
(4) ;dst_ptr
1318 movsxd rdx
, dword ptr arg
(5) ; dst_pitch
1320 movdqa xmm0
, [rax
] ; VFilter
1322 movq xmm1
, XMMWORD
PTR [rsp
]
1323 movq xmm2
, XMMWORD
PTR [rsp
+16]
1325 movq xmm3
, XMMWORD
PTR [rsp
+32]
1326 punpcklbw xmm1
, xmm2
1328 movq xmm4
, XMMWORD
PTR [rsp
+48]
1329 punpcklbw xmm2
, xmm3
1331 movq xmm5
, XMMWORD
PTR [rsp
+64]
1332 punpcklbw xmm3
, xmm4
1334 movq xmm6
, XMMWORD
PTR [rsp
+80]
1335 punpcklbw xmm4
, xmm5
1337 movq xmm7
, XMMWORD
PTR [rsp
+96]
1338 punpcklbw xmm5
, xmm6
1340 pmaddubsw xmm1
, xmm0
1341 pmaddubsw xmm2
, xmm0
1343 pmaddubsw xmm3
, xmm0
1344 pmaddubsw xmm4
, xmm0
1346 pmaddubsw xmm5
, xmm0
1347 punpcklbw xmm6
, xmm7
1349 pmaddubsw xmm6
, xmm0
1350 paddw xmm1
, [GLOBAL(rd
)]
1352 paddw xmm2
, [GLOBAL(rd
)]
1353 psraw xmm1
, VP8_FILTER_SHIFT
1355 paddw xmm3
, [GLOBAL(rd
)]
1356 psraw xmm2
, VP8_FILTER_SHIFT
1358 paddw xmm4
, [GLOBAL(rd
)]
1359 psraw xmm3
, VP8_FILTER_SHIFT
1361 paddw xmm5
, [GLOBAL(rd
)]
1362 psraw xmm4
, VP8_FILTER_SHIFT
1364 paddw xmm6
, [GLOBAL(rd
)]
1365 psraw xmm5
, VP8_FILTER_SHIFT
1367 psraw xmm6
, VP8_FILTER_SHIFT
1374 movq
[rdi
+rdx
], xmm2
1377 movq xmm1
, XMMWORD
PTR [rsp
+112]
1379 lea rdi
, [rdi
+ 2*rdx
]
1380 movq xmm2
, XMMWORD
PTR [rsp
+128]
1386 movq
[rdi
+rdx
], xmm4
1388 lea rdi
, [rdi
+ 2*rdx
]
1389 punpcklbw xmm7
, xmm1
1392 pmaddubsw xmm7
, xmm0
1394 movq
[rdi
+rdx
], xmm6
1395 punpcklbw xmm1
, xmm2
1397 pmaddubsw xmm1
, xmm0
1398 paddw xmm7
, [GLOBAL(rd
)]
1400 psraw xmm7
, VP8_FILTER_SHIFT
1401 paddw xmm1
, [GLOBAL(rd
)]
1403 psraw xmm1
, VP8_FILTER_SHIFT
1407 lea rdi
, [rdi
+ 2*rdx
]
1411 movq
[rdi
+rdx
], xmm1
1412 lea rsp
, [rsp
+ 144]
; The sequence below handles four buffered rows per pass (rsp+0 .. rsp+48),
; filtering each against xmm0, and then advances rsp by four 16-byte slots.
; rcx = rdi + 8*dst_pitch.
; NOTE(review): a label presumably precedes this point in the complete file.
1417 lea rcx
, [rdi
+rdx
*8]
1420 movdqa xmm1
, XMMWORD
PTR [rsp
]
1421 movdqa xmm3
, XMMWORD
PTR [rsp
+16]
1424 movdqa xmm5
, XMMWORD
PTR [rsp
+32]
1427 movdqa xmm7
, XMMWORD
PTR [rsp
+48]
1435 punpcklbw xmm1
, xmm2
1436 pmaddubsw xmm1
, xmm0
1438 punpcklbw xmm3
, xmm4
1439 pmaddubsw xmm3
, xmm0
1441 punpcklbw xmm5
, xmm6
1442 pmaddubsw xmm5
, xmm0
1447 punpcklbw xmm7
, xmm2
1448 pmaddubsw xmm7
, xmm0
1450 paddw xmm1
, [GLOBAL(rd
)]
1451 psraw xmm1
, VP8_FILTER_SHIFT
1453 paddw xmm3
, [GLOBAL(rd
)]
1454 psraw xmm3
, VP8_FILTER_SHIFT
1456 paddw xmm5
, [GLOBAL(rd
)]
1457 psraw xmm5
, VP8_FILTER_SHIFT
1459 paddw xmm7
, [GLOBAL(rd
)]
1460 psraw xmm7
, VP8_FILTER_SHIFT
1469 movq
[rdi
+rdx
], xmm3
1471 lea rdi
, [rdi
+ 2*rdx
]
1474 lea rsp
, [rsp
+ 4*16]
1475 movq
[rdi
+rdx
], xmm7
1477 lea rdi
, [rdi
+ 2*rdx
]
; Constant data. Each db row below is 16 bytes of byte indices.
; NOTE(review): the labels for these rows are not visible in this excerpt;
; presumably they are the pshufb control masks referenced by the filters above
; (shuf1b, shuf2b, shuf3b, shuf2bfrom1, shuf3bfrom1) -- confirm which row
; belongs to which label in the complete file.
1499 db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12
1501 db 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11
1503 db 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10
1507 db 4, 8, 6, 1, 8, 3, 1, 5, 3, 7, 5, 9, 7,11, 9,13
1510 db 2, 6, 4, 8, 6, 1, 8, 3, 1, 5, 3, 7, 5, 9, 7,11
; Each "times 8 db a, b" emits the byte pair (a, b) eight times, i.e. one
; 16-byte row. NOTE(review): the labels and neighbouring rows of these tables
; are not visible in this excerpt.
1518 times
8 db 0, 0 ;placeholder
1527 times
8 db 0, 0 ;placeholder
1536 times
8 db 128, 0 ;placeholder
1545 vp8_bilinear_filters_ssse3: