2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
4 ; Use of this source code is governed by a BSD-style license
5 ; that can be found in the LICENSE file in the root of the source
6 ; tree. An additional intellectual property rights grant can be found
7 ; in the file PATENTS. All contributing project authors may
8 ; be found in the AUTHORS file in the root of the source tree.
; NOTE(review): arg(), sym(), GLOBAL() and SHADOW_ARGS_TO_STACK are used below
; but never defined in this listing; presumably they come from this include --
; confirm.
12 %include "vpx_ports/x86_abi_support.asm"
14 %define BLOCK_HEIGHT_WIDTH
4
; Fixed-point scaling constants: 128 == 1 << 7, i.e. the weight equals
; 1 << VP8_FILTER_SHIFT.
15 %define VP8_FILTER_WEIGHT
128
16 %define VP8_FILTER_SHIFT
7
19 ;/************************************************************************************
20 ; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The
21 ; input pixel array has output_height rows. This routine assumes that output_height is an
22 ; even number. This function handles 8 pixels in horizontal direction, calculating ONE
23 ; row each iteration to take advantage of the 128 bits operations.
24 ;*************************************************************************************/
; NOTE(review): the embedded source line numbers skip, so this listing is an
; excerpt. Register copies, tap accumulation, the final shift, the row-counter
; update and the epilogue are not visible here. The code below also reads
; arg(6) as vp8_filter; that parameter's line is missing from the prototype.
25 ;void vp8_filter_block1d8_h6_sse2
27 ; unsigned char *src_ptr,
28 ; unsigned short *output_ptr,
29 ; unsigned int src_pixels_per_line,
30 ; unsigned int pixel_step,
31 ; unsigned int output_height,
32 ; unsigned int output_width,
35 global sym
(vp8_filter_block1d8_h6_sse2
)
36 sym
(vp8_filter_block1d8_h6_sse2
):
39 SHADOW_ARGS_TO_STACK
7
; Register roles set up here: rdx = filter table, rsi = source, rdi = output,
; rcx = row count, rax = source pitch.
46 mov rdx
, arg
(6) ;vp8_filter
47 mov rsi
, arg
(0) ;src_ptr
49 mov rdi
, arg
(1) ;output_ptr
51 movsxd rcx
, dword ptr arg
(4) ;output_height
52 movsxd rax
, dword ptr arg
(2) ;src_pixels_per_line ; Pitch for Source
; NOTE(review): r8 receives the same argument that is later added to rdi
; straight from memory; presumably the two forms belong to different
; ABI-conditional paths whose %if/%else lines are not shown -- confirm.
54 movsxd r8
, dword ptr arg
(5) ;output_width
56 pxor xmm0
, xmm0
; clear xmm0 for unpack
58 filter_block1d8_h6_rowloop:
; Two 8-byte loads cover source bytes [rsi-2 .. rsi+13].
59 movq xmm3
, MMWORD
PTR [rsi
- 2]
60 movq xmm1
, MMWORD
PTR [rsi
+ 6]
; Prefetch the same window of the next source row (rsi + source pitch).
62 prefetcht2
[rsi
+rax
-2]
; Per tap: byte-shift a copy of the row, zero-extend the low 8 bytes to words
; (xmm0 == 0) and multiply by that tap's 16-byte coefficient row at
; [rdx + 16*k]. The copies into xmm4..xmm7 are not visible in this excerpt.
73 punpcklbw xmm3
, xmm0
; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2
74 psrldq xmm4
, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
76 pmullw xmm3
, XMMWORD
PTR [rdx
] ; x[-2] * H[-2]; Tap 1
77 punpcklbw xmm4
, xmm0
; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
79 psrldq xmm5
, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
80 pmullw xmm4
, XMMWORD
PTR [rdx
+16] ; x[-1] * H[-1]; Tap 2
83 punpcklbw xmm5
, xmm0
; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
84 psrldq xmm6
, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
86 pmullw xmm5
, [rdx
+32] ; x[ 0] * H[ 0]; Tap 3
88 punpcklbw xmm6
, xmm0
; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
89 psrldq xmm7
, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
91 pmullw xmm6
, [rdx
+48] ; x[ 1] * h[ 1] ; Tap 4
93 punpcklbw xmm7
, xmm0
; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
94 psrldq xmm1
, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
97 pmullw xmm7
, [rdx
+64] ; x[ 2] * h[ 2] ; Tap 5
99 punpcklbw xmm1
, xmm0
; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
100 pmullw xmm1
, [rdx
+80] ; x[ 3] * h[ 3] ; Tap 6
; Saturating add of the rounding constant rd (defined outside this excerpt).
110 paddsw xmm4
, [GLOBAL(rd
)]
; Aligned 16-byte store of eight 16-bit results to the output row.
117 movdqa XMMWORD
Ptr [rdi
], xmm4
; Step the output pointer by output_width.
121 add rdi
, DWORD Ptr arg
(5) ;[output_width]
; NOTE(review): the instruction that sets the flags tested by this jnz
; (presumably a row-counter decrement) is not visible here.
127 jnz filter_block1d8_h6_rowloop
; next row
139 ;void vp8_filter_block1d16_h6_sse2
141 ; unsigned char *src_ptr,
142 ; unsigned short *output_ptr,
143 ; unsigned int src_pixels_per_line,
144 ; unsigned int pixel_step,
145 ; unsigned int output_height,
146 ; unsigned int output_width,
149 ;/************************************************************************************
150 ; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The
151 ; input pixel array has output_height rows. This routine assumes that output_height is an
152 ; even number. This function handles 16 pixels in horizontal direction (two 8-word
153 ; stores per row), calculating ONE row each iteration to take advantage of the 128 bits operations.
154 ;*************************************************************************************/
; NOTE(review): the embedded source line numbers skip, so this listing is an
; excerpt. Register copies, tap accumulation, the final shift, the row-counter
; update and the epilogue are not visible here. The code below also reads
; arg(6) as vp8_filter; that parameter's line is missing from the prototype.
155 global sym
(vp8_filter_block1d16_h6_sse2
)
156 sym
(vp8_filter_block1d16_h6_sse2
):
159 SHADOW_ARGS_TO_STACK
7
; Register roles set up here: rdx = filter table, rsi = source, rdi = output,
; rcx = row count, rax = source pitch.
166 mov rdx
, arg
(6) ;vp8_filter
167 mov rsi
, arg
(0) ;src_ptr
169 mov rdi
, arg
(1) ;output_ptr
171 movsxd rcx
, dword ptr arg
(4) ;output_height
172 movsxd rax
, dword ptr arg
(2) ;src_pixels_per_line ; Pitch for Source
; NOTE(review): r8 receives the same argument that is later added to rdi
; straight from memory; presumably the two forms belong to different
; ABI-conditional paths whose %if/%else lines are not shown -- confirm.
174 movsxd r8
, dword ptr arg
(5) ;output_width
177 pxor xmm0
, xmm0
; clear xmm0 for unpack
179 filter_block1d16_h6_sse2_rowloop:
; Three 8-byte loads cover source bytes [rsi-2 .. rsi+21].
180 movq xmm3
, MMWORD
PTR [rsi
- 2]
181 movq xmm1
, MMWORD
PTR [rsi
+ 6]
183 movq xmm2
, MMWORD
PTR [rsi
+14]
; Prefetch the same window of the next source row (rsi + source pitch).
187 prefetcht2
[rsi
+rax
-2]
; First half of the row: per tap, byte-shift a copy, zero-extend the low
; 8 bytes to words (xmm0 == 0) and multiply by the tap's 16-byte coefficient
; row at [rdx + 16*k]. The copies into xmm4..xmm7 are not visible here.
198 punpcklbw xmm3
, xmm0
; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2
199 psrldq xmm4
, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
201 pmullw xmm3
, XMMWORD
PTR [rdx
] ; x[-2] * H[-2]; Tap 1
202 punpcklbw xmm4
, xmm0
; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
204 psrldq xmm5
, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
205 pmullw xmm4
, XMMWORD
PTR [rdx
+16] ; x[-1] * H[-1]; Tap 2
208 punpcklbw xmm5
, xmm0
; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
209 psrldq xmm6
, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
211 pmullw xmm5
, [rdx
+32] ; x[ 0] * H[ 0]; Tap 3
213 punpcklbw xmm6
, xmm0
; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
214 psrldq xmm7
, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
216 pmullw xmm6
, [rdx
+48] ; x[ 1] * h[ 1] ; Tap 4
218 punpcklbw xmm7
, xmm0
; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
219 psrldq xmm1
, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
222 pmullw xmm7
, [rdx
+64] ; x[ 2] * h[ 2] ; Tap 5
224 punpcklbw xmm1
, xmm0
; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
225 pmullw xmm1
, [rdx
+80] ; x[ 3] * h[ 3] ; Tap 6
; Saturating add of the rounding constant rd (defined outside this excerpt).
234 paddsw xmm4
, [GLOBAL(rd
)]
; Aligned store of the first eight 16-bit results.
241 movdqa XMMWORD
Ptr [rdi
], xmm4
; Second half of the row: same tap sequence, with xmm2 taking the role xmm1
; had above. NOTE(review): the lane labels below repeat the first half's;
; presumably they are relative to pixel 8 here -- confirm.
251 punpcklbw xmm3
, xmm0
; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2
252 psrldq xmm4
, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
254 pmullw xmm3
, XMMWORD
PTR [rdx
] ; x[-2] * H[-2]; Tap 1
255 punpcklbw xmm4
, xmm0
; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
257 psrldq xmm5
, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
258 pmullw xmm4
, XMMWORD
PTR [rdx
+16] ; x[-1] * H[-1]; Tap 2
261 punpcklbw xmm5
, xmm0
; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
262 psrldq xmm6
, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
264 pmullw xmm5
, [rdx
+32] ; x[ 0] * H[ 0]; Tap 3
266 punpcklbw xmm6
, xmm0
; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
267 psrldq xmm7
, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
269 pmullw xmm6
, [rdx
+48] ; x[ 1] * h[ 1] ; Tap 4
271 punpcklbw xmm7
, xmm0
; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
272 psrldq xmm2
, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
274 pmullw xmm7
, [rdx
+64] ; x[ 2] * h[ 2] ; Tap 5
276 punpcklbw xmm2
, xmm0
; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
277 pmullw xmm2
, [rdx
+80] ; x[ 3] * h[ 3] ; Tap 6
287 paddsw xmm4
, [GLOBAL(rd
)]
; Aligned store of the second eight 16-bit results, 16 bytes further on.
294 movdqa XMMWORD
Ptr [rdi
+16], xmm4
; Step the output pointer by output_width.
298 add rdi
, DWORD Ptr arg
(5) ;[output_width]
; NOTE(review): the instruction that sets the flags tested by this jnz
; (presumably a row-counter decrement) is not visible here.
304 jnz filter_block1d16_h6_sse2_rowloop
; next row
316 ;void vp8_filter_block1d8_v6_sse2
319 ; unsigned char *output_ptr,
321 ; unsigned int pixels_per_line,
322 ; unsigned int pixel_step,
323 ; unsigned int output_height,
324 ; unsigned int output_width,
327 ;/************************************************************************************
328 ; Notes: filter_block1d8_v6 applies a 6 tap filter vertically to the input pixels. The
329 ; input pixel array has output_height rows.
330 ;*************************************************************************************/
; NOTE(review): the embedded source line numbers skip, so this listing is an
; excerpt. The prototype above is missing lines: the code reads arg(0) as
; src_ptr, arg(2) as the destination pitch and arg(7) as vp8_filter. Tap 1's
; multiply, the accumulation, the rounding/shift and the source-pointer
; advance are not visible here.
331 global sym
(vp8_filter_block1d8_v6_sse2
)
332 sym
(vp8_filter_block1d8_v6_sse2
):
335 SHADOW_ARGS_TO_STACK
8
; Register roles set up here: rax = filter table, rdx = source row pitch,
; rdi = output, rsi = source, rcx = row count.
342 mov rax
, arg
(7) ;vp8_filter
343 movsxd rdx
, dword ptr arg
(3) ;pixels_per_line
345 mov rdi
, arg
(1) ;output_ptr
346 mov rsi
, arg
(0) ;src_ptr
351 movsxd rcx
, DWORD PTR arg
(5) ;[output_height]
352 pxor xmm0
, xmm0
; clear xmm0
; xmm7 = rounding constant rd (defined outside this excerpt).
354 movdqa xmm7
, XMMWORD
PTR [GLOBAL(rd
)]
356 movsxd r8
, dword ptr arg
(2) ; dst_pitch
359 vp8_filter_block1d8_v6_sse2_loop:
; Aligned 16-byte row loads, each multiplied by a 16-byte coefficient row at
; [rax + 16*k].
360 movdqa xmm1
, XMMWORD
PTR [rsi
]
363 movdqa xmm2
, XMMWORD
PTR [rsi
+ rdx
]
364 pmullw xmm2
, [rax
+ 16]
366 movdqa xmm3
, XMMWORD
PTR [rsi
+ rdx
* 2]
367 pmullw xmm3
, [rax
+ 32]
369 movdqa xmm5
, XMMWORD
PTR [rsi
+ rdx
* 4]
370 pmullw xmm5
, [rax
+ 64]
; NOTE(review): the next two loads repeat the [rsi + rdx*2] / [rsi + rdx*4]
; addresses used above but pair with different coefficient rows (+48, +80);
; presumably rsi is advanced by one row on a line not shown -- confirm.
373 movdqa xmm4
, XMMWORD
PTR [rsi
+ rdx
* 2]
375 pmullw xmm4
, [rax
+ 48]
376 movdqa xmm6
, XMMWORD
PTR [rsi
+ rdx
* 4]
378 pmullw xmm6
, [rax
+ 80]
390 packuswb xmm2
, xmm0
; pack and saturate
; 8-byte store: one row of eight output pixels.
392 movq
QWORD PTR [rdi
], xmm2
; store the results in the destination
394 add rdi
, DWORD PTR arg
(2) ;[dst_pitch]
398 dec rcx
; decrement count
399 jnz vp8_filter_block1d8_v6_sse2_loop
; next row
411 ;void vp8_filter_block1d16_v6_sse2
413 ; unsigned short *src_ptr,
414 ; unsigned char *output_ptr,
416 ; unsigned int pixels_per_line,
417 ; unsigned int pixel_step,
418 ; unsigned int output_height,
419 ; unsigned int output_width,
420 ; const short *vp8_filter
422 ;/************************************************************************************
423 ; Notes: filter_block1d16_v6 applies a 6 tap filter vertically to the input pixels. The
424 ; input pixel array has output_height rows.
425 ;*************************************************************************************/
; NOTE(review): the embedded source line numbers skip, so this listing is an
; excerpt. The prototype above is missing one line: the code reads arg(2) as
; the destination pitch. Line 1's multiply, the accumulation, the
; rounding/shift and the source-pointer advance are not visible here.
426 global sym
(vp8_filter_block1d16_v6_sse2
)
427 sym
(vp8_filter_block1d16_v6_sse2
):
430 SHADOW_ARGS_TO_STACK
8
; Register roles set up here: rax = filter table, rdx = source row pitch,
; rdi = output, rsi = source, rcx = row count.
437 mov rax
, arg
(7) ;vp8_filter
438 movsxd rdx
, dword ptr arg
(3) ;pixels_per_line
440 mov rdi
, arg
(1) ;output_ptr
441 mov rsi
, arg
(0) ;src_ptr
446 movsxd rcx
, DWORD PTR arg
(5) ;[output_height]
448 movsxd r8
, dword ptr arg
(2) ; dst_pitch
451 vp8_filter_block1d16_v6_sse2_loop:
452 ; The order for adding 6-tap is 2 5 3 1 4 6. Read in data in that order.
; Each source line is read as two aligned 16-byte halves (offsets +0 and +16)
; and both halves are multiplied by the same coefficient row.
453 movdqa xmm1
, XMMWORD
PTR [rsi
+ rdx
] ; line 2
454 movdqa xmm2
, XMMWORD
PTR [rsi
+ rdx
+ 16]
455 pmullw xmm1
, [rax
+ 16]
456 pmullw xmm2
, [rax
+ 16]
458 movdqa xmm3
, XMMWORD
PTR [rsi
+ rdx
* 4] ; line 5
459 movdqa xmm4
, XMMWORD
PTR [rsi
+ rdx
* 4 + 16]
460 pmullw xmm3
, [rax
+ 64]
461 pmullw xmm4
, [rax
+ 64]
463 movdqa xmm5
, XMMWORD
PTR [rsi
+ rdx
* 2] ; line 3
464 movdqa xmm6
, XMMWORD
PTR [rsi
+ rdx
* 2 + 16]
465 pmullw xmm5
, [rax
+ 32]
466 pmullw xmm6
, [rax
+ 32]
468 movdqa xmm7
, XMMWORD
PTR [rsi
] ; line 1
469 movdqa xmm0
, XMMWORD
PTR [rsi
+ 16]
; NOTE(review): lines 4 and 6 reuse the [rsi + rdx*2] / [rsi + rdx*4]
; addresses of lines 3 and 5; presumably rsi is advanced by one row on a line
; not shown -- confirm.
482 movdqa xmm3
, XMMWORD
PTR [rsi
+ rdx
* 2] ; line 4
483 movdqa xmm4
, XMMWORD
PTR [rsi
+ rdx
* 2 + 16]
484 pmullw xmm3
, [rax
+ 48]
485 pmullw xmm4
, [rax
+ 48]
487 movdqa xmm5
, XMMWORD
PTR [rsi
+ rdx
* 4] ; line 6
488 movdqa xmm6
, XMMWORD
PTR [rsi
+ rdx
* 4 + 16]
489 pmullw xmm5
, [rax
+ 80]
490 pmullw xmm6
, [rax
+ 80]
; xmm7 and xmm0 held line 1 data above; they are reloaded here with the
; rounding constant rd (defined outside this excerpt) and zero.
492 movdqa xmm7
, XMMWORD
PTR [GLOBAL(rd
)]
493 pxor xmm0
, xmm0
; clear xmm0
; Pack both 8-word halves into 16 unsigned bytes and store them aligned.
506 packuswb xmm1
, xmm2
; pack and saturate
507 movdqa XMMWORD
PTR [rdi
], xmm1
; store the results in the destination
509 add rdi
, DWORD PTR arg
(2) ;[dst_pitch]
513 dec rcx
; decrement count
514 jnz vp8_filter_block1d16_v6_sse2_loop
; next row
526 ;void vp8_filter_block1d8_h6_only_sse2
528 ; unsigned char *src_ptr,
529 ; unsigned int src_pixels_per_line,
530 ; unsigned char *output_ptr,
532 ; unsigned int output_height,
533 ; const short *vp8_filter
535 ; First-pass filter only when yoffset==0
; NOTE(review): the embedded source line numbers skip, so this listing is an
; excerpt. The prototype above is missing one line: the code reads arg(3) as
; the destination pitch. Register copies, tap accumulation, the shift/pack,
; the row-counter update and the epilogue are not visible here.
536 global sym
(vp8_filter_block1d8_h6_only_sse2
)
537 sym
(vp8_filter_block1d8_h6_only_sse2
):
540 SHADOW_ARGS_TO_STACK
6
; Register roles set up here: rdx = filter table, rsi = source, rdi = output,
; rcx = row count, rax = source pitch.
547 mov rdx
, arg
(5) ;vp8_filter
548 mov rsi
, arg
(0) ;src_ptr
550 mov rdi
, arg
(2) ;output_ptr
552 movsxd rcx
, dword ptr arg
(4) ;output_height
553 movsxd rax
, dword ptr arg
(1) ;src_pixels_per_line ; Pitch for Source
; NOTE(review): r8 receives the same argument that is later added to rdi
; straight from memory; presumably the two forms belong to different
; ABI-conditional paths whose %if/%else lines are not shown -- confirm.
555 movsxd r8
, dword ptr arg
(3) ;dst_pitch
557 pxor xmm0
, xmm0
; clear xmm0 for unpack
559 filter_block1d8_h6_only_rowloop:
; Two 8-byte loads cover source bytes [rsi-2 .. rsi+13].
560 movq xmm3
, MMWORD
PTR [rsi
- 2]
561 movq xmm1
, MMWORD
PTR [rsi
+ 6]
; Prefetch the same window of the next source row (rsi + source pitch).
563 prefetcht2
[rsi
+rax
-2]
; Per tap: byte-shift a copy of the row, zero-extend the low 8 bytes to words
; (xmm0 == 0) and multiply by that tap's 16-byte coefficient row at
; [rdx + 16*k]. The copies into xmm4..xmm7 are not visible in this excerpt.
574 punpcklbw xmm3
, xmm0
; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2
575 psrldq xmm4
, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
577 pmullw xmm3
, XMMWORD
PTR [rdx
] ; x[-2] * H[-2]; Tap 1
578 punpcklbw xmm4
, xmm0
; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
580 psrldq xmm5
, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
581 pmullw xmm4
, XMMWORD
PTR [rdx
+16] ; x[-1] * H[-1]; Tap 2
584 punpcklbw xmm5
, xmm0
; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
585 psrldq xmm6
, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
587 pmullw xmm5
, [rdx
+32] ; x[ 0] * H[ 0]; Tap 3
589 punpcklbw xmm6
, xmm0
; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
590 psrldq xmm7
, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
592 pmullw xmm6
, [rdx
+48] ; x[ 1] * h[ 1] ; Tap 4
594 punpcklbw xmm7
, xmm0
; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
595 psrldq xmm1
, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
598 pmullw xmm7
, [rdx
+64] ; x[ 2] * h[ 2] ; Tap 5
600 punpcklbw xmm1
, xmm0
; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
601 pmullw xmm1
, [rdx
+80] ; x[ 3] * h[ 3] ; Tap 6
; Saturating add of the rounding constant rd (defined outside this excerpt).
611 paddsw xmm4
, [GLOBAL(rd
)]
; 8-byte store of the low half of xmm4 to the output row.
617 movq
QWORD PTR [rdi
], xmm4
; store the results in the destination
621 add rdi
, DWORD Ptr arg
(3) ;dst_pitch
; NOTE(review): the instruction that sets the flags tested by this jnz
; (presumably a row-counter decrement) is not visible here.
627 jnz filter_block1d8_h6_only_rowloop
; next row
639 ;void vp8_filter_block1d16_h6_only_sse2
641 ; unsigned char *src_ptr,
642 ; unsigned int src_pixels_per_line,
643 ; unsigned char *output_ptr,
645 ; unsigned int output_height,
646 ; const short *vp8_filter
648 ; First-pass filter only when yoffset==0
; NOTE(review): the embedded source line numbers skip, so this listing is an
; excerpt. The prototype above is missing one line: the code reads arg(3) as
; the destination pitch. Register copies, tap accumulation, the shift, the
; row-counter update and the epilogue are not visible here.
649 global sym
(vp8_filter_block1d16_h6_only_sse2
)
650 sym
(vp8_filter_block1d16_h6_only_sse2
):
653 SHADOW_ARGS_TO_STACK
6
; Register roles set up here: rdx = filter table, rsi = source, rdi = output,
; rcx = row count, rax = source pitch.
660 mov rdx
, arg
(5) ;vp8_filter
661 mov rsi
, arg
(0) ;src_ptr
663 mov rdi
, arg
(2) ;output_ptr
665 movsxd rcx
, dword ptr arg
(4) ;output_height
666 movsxd rax
, dword ptr arg
(1) ;src_pixels_per_line ; Pitch for Source
; NOTE(review): r8 receives the same argument that is later added to rdi
; straight from memory; presumably the two forms belong to different
; ABI-conditional paths whose %if/%else lines are not shown -- confirm.
668 movsxd r8
, dword ptr arg
(3) ;dst_pitch
671 pxor xmm0
, xmm0
; clear xmm0 for unpack
673 filter_block1d16_h6_only_sse2_rowloop:
; Three 8-byte loads cover source bytes [rsi-2 .. rsi+21].
674 movq xmm3
, MMWORD
PTR [rsi
- 2]
675 movq xmm1
, MMWORD
PTR [rsi
+ 6]
677 movq xmm2
, MMWORD
PTR [rsi
+14]
; Prefetch the same window of the next source row (rsi + source pitch).
681 prefetcht2
[rsi
+rax
-2]
; First half of the row: per tap, byte-shift a copy, zero-extend the low
; 8 bytes to words (xmm0 == 0) and multiply by the tap's 16-byte coefficient
; row at [rdx + 16*k]. The copies into xmm4..xmm7 are not visible here.
692 punpcklbw xmm3
, xmm0
; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2
693 psrldq xmm4
, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
695 pmullw xmm3
, XMMWORD
PTR [rdx
] ; x[-2] * H[-2]; Tap 1
696 punpcklbw xmm4
, xmm0
; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
698 psrldq xmm5
, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
699 pmullw xmm4
, XMMWORD
PTR [rdx
+16] ; x[-1] * H[-1]; Tap 2
701 punpcklbw xmm5
, xmm0
; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
702 psrldq xmm6
, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
704 pmullw xmm5
, [rdx
+32] ; x[ 0] * H[ 0]; Tap 3
706 punpcklbw xmm6
, xmm0
; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
707 psrldq xmm7
, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
709 pmullw xmm6
, [rdx
+48] ; x[ 1] * h[ 1] ; Tap 4
711 punpcklbw xmm7
, xmm0
; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
712 psrldq xmm1
, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
714 pmullw xmm7
, [rdx
+64] ; x[ 2] * h[ 2] ; Tap 5
716 punpcklbw xmm1
, xmm0
; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
717 pmullw xmm1
, [rdx
+80] ; x[ 3] * h[ 3] ; Tap 6
; Saturating add of the rounding constant rd (defined outside this excerpt).
726 paddsw xmm4
, [GLOBAL(rd
)]
730 packuswb xmm4
, xmm0
; lower 8 bytes
732 movq
QWORD Ptr [rdi
], xmm4
; store the results in the destination
; Second half of the row: same tap sequence, with xmm2 taking the role xmm1
; had above. NOTE(review): the lane labels below repeat the first half's;
; presumably they are relative to pixel 8 here -- confirm.
742 punpcklbw xmm3
, xmm0
; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2
743 psrldq xmm4
, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
745 pmullw xmm3
, XMMWORD
PTR [rdx
] ; x[-2] * H[-2]; Tap 1
746 punpcklbw xmm4
, xmm0
; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
748 psrldq xmm5
, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
749 pmullw xmm4
, XMMWORD
PTR [rdx
+16] ; x[-1] * H[-1]; Tap 2
751 punpcklbw xmm5
, xmm0
; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
752 psrldq xmm6
, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
754 pmullw xmm5
, [rdx
+32] ; x[ 0] * H[ 0]; Tap 3
756 punpcklbw xmm6
, xmm0
; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
757 psrldq xmm7
, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
759 pmullw xmm6
, [rdx
+48] ; x[ 1] * h[ 1] ; Tap 4
761 punpcklbw xmm7
, xmm0
; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
762 psrldq xmm2
, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
764 pmullw xmm7
, [rdx
+64] ; x[ 2] * h[ 2] ; Tap 5
766 punpcklbw xmm2
, xmm0
; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
767 pmullw xmm2
, [rdx
+80] ; x[ 3] * h[ 3] ; Tap 6
776 paddsw xmm4
, [GLOBAL(rd
)]
780 packuswb xmm4
, xmm0
; higher 8 bytes
; Second 8-byte store lands directly after the first one.
782 movq
QWORD Ptr [rdi
+8], xmm4
; store the results in the destination
786 add rdi
, DWORD Ptr arg
(3) ;dst_pitch
; NOTE(review): the instruction that sets the flags tested by this jnz
; (presumably a row-counter decrement) is not visible here.
792 jnz filter_block1d16_h6_only_sse2_rowloop
; next row
804 ;void vp8_filter_block1d8_v6_only_sse2
806 ; unsigned char *src_ptr,
807 ; unsigned int src_pixels_per_line,
808 ; unsigned char *output_ptr,
810 ; unsigned int output_height,
811 ; const short *vp8_filter
813 ; Second-pass filter only when xoffset==0
; NOTE(review): the embedded source line numbers skip, so this listing is an
; excerpt. The prototype above is missing one line: the code reads arg(3) as
; the destination pitch. The byte-to-word unpacks, tap 1's multiply, the
; accumulation, the rounding/shift and the source-pointer advance are not
; visible here.
814 global sym
(vp8_filter_block1d8_v6_only_sse2
)
815 sym
(vp8_filter_block1d8_v6_only_sse2
):
818 SHADOW_ARGS_TO_STACK
6
; Register roles set up here: rsi = source, rdi = output, rcx = row count,
; rdx = source pitch, rax = filter table.
825 mov rsi
, arg
(0) ;src_ptr
826 mov rdi
, arg
(2) ;output_ptr
828 movsxd rcx
, dword ptr arg
(4) ;output_height
829 movsxd rdx
, dword ptr arg
(1) ;src_pixels_per_line
831 mov rax
, arg
(5) ;vp8_filter
833 pxor xmm0
, xmm0
; clear xmm0
; xmm7 = rounding constant rd (defined outside this excerpt).
835 movdqa xmm7
, XMMWORD
PTR [GLOBAL(rd
)]
837 movsxd r8
, dword ptr arg
(3) ; dst_pitch
840 vp8_filter_block1d8_v6_only_sse2_loop:
; 8-byte loads, one per source row feeding the vertical taps.
841 movq xmm1
, MMWORD
PTR [rsi
]
842 movq xmm2
, MMWORD
PTR [rsi
+ rdx
]
843 movq xmm3
, MMWORD
PTR [rsi
+ rdx
* 2]
844 movq xmm5
, MMWORD
PTR [rsi
+ rdx
* 4]
; NOTE(review): the next two loads repeat the [rsi + rdx*2] / [rsi + rdx*4]
; addresses used just above; presumably rsi is advanced by one row on a line
; not shown -- confirm.
846 movq xmm4
, MMWORD
PTR [rsi
+ rdx
* 2]
847 movq xmm6
, MMWORD
PTR [rsi
+ rdx
* 4]
; Multiply each row by its 16-byte coefficient row at [rax + 16*k].
853 pmullw xmm2
, [rax
+ 16]
856 pmullw xmm3
, [rax
+ 32]
859 pmullw xmm5
, [rax
+ 64]
862 pmullw xmm4
, [rax
+ 48]
865 pmullw xmm6
, [rax
+ 80]
877 packuswb xmm2
, xmm0
; pack and saturate
; 8-byte store: one row of eight output pixels.
879 movq
QWORD PTR [rdi
], xmm2
; store the results in the destination
881 add rdi
, DWORD PTR arg
(3) ;[dst_pitch]
885 dec rcx
; decrement count
886 jnz vp8_filter_block1d8_v6_only_sse2_loop
; next row
898 ;void vp8_unpack_block1d16_h6_sse2
900 ; unsigned char *src_ptr,
901 ; unsigned short *output_ptr,
902 ; unsigned int src_pixels_per_line,
903 ; unsigned int output_height,
904 ; unsigned int output_width
; Per row this loads 16 source bytes as two 8-byte halves and writes two
; aligned 16-byte vectors to the output.
; NOTE(review): the embedded source line numbers skip, so this listing is an
; excerpt. The unpack of xmm1, the source-pointer advance, the row-counter
; update and the epilogue are not visible here.
906 global sym
(vp8_unpack_block1d16_h6_sse2
)
907 sym
(vp8_unpack_block1d16_h6_sse2
):
910 SHADOW_ARGS_TO_STACK
5
911 ;SAVE_XMM ;xmm6, xmm7 are not used here.
917 mov rsi
, arg
(0) ;src_ptr
918 mov rdi
, arg
(1) ;output_ptr
920 movsxd rcx
, dword ptr arg
(3) ;output_height
921 movsxd rax
, dword ptr arg
(2) ;src_pixels_per_line ; Pitch for Source
923 pxor xmm0
, xmm0
; clear xmm0 for unpack
; NOTE(review): r8 receives the same argument that is later added to rdi
; straight from memory; presumably the two forms belong to different
; ABI-conditional paths whose %if/%else lines are not shown -- confirm.
925 movsxd r8
, dword ptr arg
(4) ;output_width ; step added to the output pointer each row
928 unpack_block1d16_h6_sse2_rowloop:
929 movq xmm1
, MMWORD
PTR [rsi
] ; source bytes 0..7 of the row
930 movq xmm3
, MMWORD
PTR [rsi
+8] ; source bytes 8..15 of the row
932 punpcklbw xmm3
, xmm0
; bytes 8..15 zero-extended to eight 16-bit words
935 movdqa XMMWORD
Ptr [rdi
], xmm1
936 movdqa XMMWORD
Ptr [rdi
+ 16], xmm3
940 add rdi
, DWORD Ptr arg
(4) ;[output_width]
; NOTE(review): the instruction that sets the flags tested by this jnz
; (presumably a row-counter decrement) is not visible here.
945 jnz unpack_block1d16_h6_sse2_rowloop
; next row
957 ;void vp8_bilinear_predict16x16_sse2
959 ; unsigned char *src_ptr,
960 ; int src_pixels_per_line,
963 ; unsigned char *dst_ptr,
; NOTE(review): the embedded source line numbers skip, so this listing is an
; excerpt. The prototype above is missing lines: the code reads arg(2) as
; xoffset, arg(3) as yoffset and arg(5) as dst_pitch. The conditional branches
; after the two "cmp rax, 0" tests, the path/loop labels, the filter
; multiplies and adds, the packs, the loop-end compares and the epilogue are
; not visible here. xmm0 is used as the zero operand for the unpacks, but its
; clearing is not shown either.
966 extern sym
(vp8_bilinear_filters_mmx
)
967 global sym
(vp8_bilinear_predict16x16_sse2
)
968 sym
(vp8_bilinear_predict16x16_sse2
):
971 SHADOW_ARGS_TO_STACK
6
978 ;const short *HFilter = bilinear_filters_mmx[xoffset]
979 ;const short *VFilter = bilinear_filters_mmx[yoffset]
; rcx = base address of the bilinear filter table.
981 lea rcx
, [GLOBAL(sym
(vp8_bilinear_filters_mmx
))]
982 movsxd rax
, dword ptr arg
(2) ;xoffset
984 cmp rax
, 0 ;skip first_pass filter if xoffset=0
; NOTE(review): any scaling of the offset into a table byte offset happens on
; a line not shown -- confirm.
988 add rax
, rcx
;HFilter
990 mov rdi
, arg
(4) ;dst_ptr
991 mov rsi
, arg
(0) ;src_ptr
992 movsxd rdx
, dword ptr arg
(5) ;dst_pitch
; xmm2 = the 16 bytes located 16 bytes past the HFilter pointer.
995 movdqa xmm2
, [rax
+16]
997 movsxd rax
, dword ptr arg
(3) ;yoffset
999 cmp rax
, 0 ;skip second_pass filter if yoffset=0
1003 add rax
, rcx
;VFilter
; rcx = dst_ptr + 16 * dst_pitch (two steps of rdx*8); presumably the
; end-of-block pointer for the row loop -- the compare is not shown.
1005 lea rcx
, [rdi
+rdx
*8]
1006 lea rcx
, [rcx
+rdx
*8]
; rdx is repurposed from dst_pitch to the source pitch from here on.
1007 movsxd rdx
, dword ptr arg
(1) ;src_pixels_per_line
1012 movsxd r8
, dword ptr arg
(5) ;dst_pitch
1014 ; get the first horizontal line done
1015 movdqu xmm3
, [rsi
] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
1016 movdqa xmm4
, xmm3
; make a copy of current line
1018 punpcklbw xmm3
, xmm0
; xx 00 01 02 03 04 05 06
1019 punpckhbw xmm4
, xmm0
; Same row shifted by one pixel (unaligned load at rsi+1).
1024 movdqu xmm5
, [rsi
+1]
1027 punpcklbw xmm5
, xmm0
1028 punpckhbw xmm6
, xmm0
1036 paddw xmm3
, [GLOBAL(rd
)] ; xmm3 += round value
1037 psraw xmm3
, VP8_FILTER_SHIFT
; xmm3 /= 128
1039 paddw xmm4
, [GLOBAL(rd
)]
1040 psraw xmm4
, VP8_FILTER_SHIFT
1045 add rsi
, rdx
; next line
1047 movdqu xmm3
, [rsi
] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
1048 movdqa xmm4
, xmm3
; make a copy of current line
1050 punpcklbw xmm3
, xmm0
; xx 00 01 02 03 04 05 06
1051 punpckhbw xmm4
, xmm0
1056 movdqu xmm5
, [rsi
+1]
1059 punpcklbw xmm5
, xmm0
1060 punpckhbw xmm6
, xmm0
1071 punpcklbw xmm5
, xmm0
1072 punpckhbw xmm6
, xmm0
1077 paddw xmm3
, [GLOBAL(rd
)] ; xmm3 += round value
1078 psraw xmm3
, VP8_FILTER_SHIFT
; xmm3 /= 128
1080 paddw xmm4
, [GLOBAL(rd
)]
1081 psraw xmm4
, VP8_FILTER_SHIFT
; Multiply both halves by the 16 bytes located 16 bytes past the VFilter
; pointer.
1086 pmullw xmm3
, [rax
+16]
1087 pmullw xmm4
, [rax
+16]
1092 paddw xmm3
, [GLOBAL(rd
)] ; xmm3 += round value
1093 psraw xmm3
, VP8_FILTER_SHIFT
; xmm3 /= 128
1095 paddw xmm4
, [GLOBAL(rd
)]
1096 psraw xmm4
, VP8_FILTER_SHIFT
; Aligned 16-byte store to the destination row.
1099 movdqa
[rdi
], xmm3
; store the results in the destination
1101 add rsi
, rdx
; next line
1103 add rdi
, DWORD PTR arg
(5) ;dst_pitch
; NOTE(review): from here the setup is repeated with the vertical filter
; only; presumably this is the path taken when xoffset == 0 -- its label is
; not shown.
1114 movsxd rax
, dword ptr arg
(3) ;yoffset
1116 add rax
, rcx
;VFilter
1118 mov rdi
, arg
(4) ;dst_ptr
1119 mov rsi
, arg
(0) ;src_ptr
1120 movsxd rdx
, dword ptr arg
(5) ;dst_pitch
1123 movdqa xmm2
, [rax
+16]
; rcx = dst_ptr + 16 * dst_pitch.
1125 lea rcx
, [rdi
+rdx
*8]
1126 lea rcx
, [rcx
+rdx
*8]
; On this path rax holds the source pitch and rdx keeps dst_pitch.
1127 movsxd rax
, dword ptr arg
(1) ;src_pixels_per_line
1131 ; get the first horizontal line done
1132 movdqu xmm7
, [rsi
] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
1134 add rsi
, rax
; next line
1136 movdqu xmm3
, [rsi
] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
1141 movdqa xmm4
, xmm3
; make a copy of current line
1144 punpcklbw xmm5
, xmm0
1145 punpckhbw xmm6
, xmm0
1146 punpcklbw xmm3
, xmm0
; xx 00 01 02 03 04 05 06
1147 punpckhbw xmm4
, xmm0
1157 paddw xmm3
, [GLOBAL(rd
)] ; xmm3 += round value
1158 psraw xmm3
, VP8_FILTER_SHIFT
; xmm3 /= 128
1160 paddw xmm4
, [GLOBAL(rd
)]
1161 psraw xmm4
, VP8_FILTER_SHIFT
1164 movdqa
[rdi
], xmm3
; store the results in the destination
1166 add rsi
, rax
; next line
1167 add rdi
, rdx
;dst_pitch
; NOTE(review): a third setup follows that loads rows at [rsi] and [rsi+1]
; only; presumably this is the path taken when yoffset == 0 -- its label is
; not shown.
1174 lea rcx
, [rdi
+rdx
*8]
1175 lea rcx
, [rcx
+rdx
*8]
1176 movsxd rax
, dword ptr arg
(1) ;src_pixels_per_line
1180 movdqu xmm3
, [rsi
] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
1181 movdqa xmm4
, xmm3
; make a copy of current line
1183 punpcklbw xmm3
, xmm0
; xx 00 01 02 03 04 05 06
1184 punpckhbw xmm4
, xmm0
1189 movdqu xmm5
, [rsi
+1]
1192 punpcklbw xmm5
, xmm0
1193 punpckhbw xmm6
, xmm0
1201 paddw xmm3
, [GLOBAL(rd
)] ; xmm3 += round value
1202 psraw xmm3
, VP8_FILTER_SHIFT
; xmm3 /= 128
1204 paddw xmm4
, [GLOBAL(rd
)]
1205 psraw xmm4
, VP8_FILTER_SHIFT
1208 movdqa
[rdi
], xmm3
; store the results in the destination
1210 add rsi
, rax
; next line
1211 add rdi
, rdx
; dst_pitch
1226 ;void vp8_bilinear_predict8x8_sse2
1228 ; unsigned char *src_ptr,
1229 ; int src_pixels_per_line,
1232 ; unsigned char *dst_ptr,
1235 extern sym
(vp8_bilinear_filters_mmx
)
1236 global sym
(vp8_bilinear_predict8x8_sse2
)
1237 sym
(vp8_bilinear_predict8x8_sse2
):
1240 SHADOW_ARGS_TO_STACK
6
1248 sub rsp
, 144 ; reserve 144 bytes
1250 ;const short *HFilter = bilinear_filters_mmx[xoffset]
1251 ;const short *VFilter = bilinear_filters_mmx[yoffset]
1252 lea rcx
, [GLOBAL(sym
(vp8_bilinear_filters_mmx
))]
1254 mov rsi
, arg
(0) ;src_ptr
1255 movsxd rdx
, dword ptr arg
(1) ;src_pixels_per_line
1257 ;Read 9-line unaligned data in and put them on stack. This gives a big
1260 lea rax
, [rdx
+ rdx
*2]
1261 movdqu xmm1
, [rsi
+rdx
]
1262 movdqu xmm2
, [rsi
+rdx
*2]
1265 movdqu xmm4
, [rsi
+rdx
]
1266 movdqu xmm5
, [rsi
+rdx
*2]
1269 movdqu xmm7
, [rsi
+rdx
]
1271 movdqa XMMWORD
PTR [rsp
], xmm0
1273 movdqu xmm0
, [rsi
+rdx
*2]
1275 movdqa XMMWORD
PTR [rsp
+16], xmm1
1276 movdqa XMMWORD
PTR [rsp
+32], xmm2
1277 movdqa XMMWORD
PTR [rsp
+48], xmm3
1278 movdqa XMMWORD
PTR [rsp
+64], xmm4
1279 movdqa XMMWORD
PTR [rsp
+80], xmm5
1280 movdqa XMMWORD
PTR [rsp
+96], xmm6
1281 movdqa XMMWORD
PTR [rsp
+112], xmm7
1282 movdqa XMMWORD
PTR [rsp
+128], xmm0
1284 movsxd rax
, dword ptr arg
(2) ;xoffset
1286 add rax
, rcx
;HFilter
1288 mov rdi
, arg
(4) ;dst_ptr
1289 movsxd rdx
, dword ptr arg
(5) ;dst_pitch
1292 movdqa xmm2
, [rax
+16]
1294 movsxd rax
, dword ptr arg
(3) ;yoffset
1296 add rax
, rcx
;VFilter
1298 lea rcx
, [rdi
+rdx
*8]
1301 movdqa xmm6
, [rax
+16]
1305 ; get the first horizontal line done
1306 movdqa xmm3
, XMMWORD
PTR [rsp
]
1307 movdqa xmm4
, xmm3
; make a copy of current line
1310 punpcklbw xmm3
, xmm0
; 00 01 02 03 04 05 06 07
1311 punpcklbw xmm4
, xmm0
; 01 02 03 04 05 06 07 08
1318 paddw xmm3
, [GLOBAL(rd
)] ; xmm3 += round value
1319 psraw xmm3
, VP8_FILTER_SHIFT
; xmm3 /= 128
1322 add rsp
, 16 ; next line
1324 movdqa xmm3
, XMMWORD
PTR [rsp
] ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
1325 movdqa xmm4
, xmm3
; make a copy of current line
1328 punpcklbw xmm3
, xmm0
; 00 01 02 03 04 05 06 07
1329 punpcklbw xmm4
, xmm0
; 01 02 03 04 05 06 07 08
1337 paddw xmm3
, [GLOBAL(rd
)] ; xmm3 += round value
1338 psraw xmm3
, VP8_FILTER_SHIFT
; xmm3 /= 128
1347 paddw xmm3
, [GLOBAL(rd
)] ; xmm3 += round value
1348 psraw xmm3
, VP8_FILTER_SHIFT
; xmm3 /= 128
1351 movq
[rdi
], xmm3
; store the results in the destination
1353 add rsp
, 16 ; next line