2 ; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
4 ; Use of this source code is governed by a BSD-style license and patent
5 ; grant that can be found in the LICENSE file in the root of the source
6 ; tree. All contributing project authors may be found in the AUTHORS
7 ; file in the root of the source tree.
11 %include "vpx_ports/x86_abi_support.asm"
; Block dimension constant; no use of it is visible in this extract.
14 %define BLOCK_HEIGHT_WIDTH
4
; NOTE(review): 128 == 1 << VP8_FILTER_SHIFT, so this looks like the fixed-point
; scale of the filter taps -- confirm against the tap tables.
15 %define vp8_filter_weight
128
; Right-shift count used by every `psraw ..., VP8_FILTER_SHIFT` below, applied
; after the rounding add (a shift by 7 is a divide by 128).
16 %define VP8_FILTER_SHIFT
7
; ---------------------------------------------------------------------------
; vp8_filter_block1d_h6_mmx: horizontal 6-tap filter pass.
; Per row: the 8 source bytes at [rsi-2] are copied, shifted right by 0..3
; bytes, widened to words against the zero register mm0, multiplied by the
; taps of the table at arg(6) (taps are 16 bytes apart), summed with
; saturating adds, rounded with `rd`, shifted right by VP8_FILTER_SHIFT and
; stored through rdi.
; NOTE(review): this extract is missing lines -- the register-saving prologue,
; the `nextrow` label targeted by the jnz, the output-pointer advance and the
; epilogue/ret are not visible. Consult the full file before changing code.
; ---------------------------------------------------------------------------
19 ;void vp8_filter_block1d_h6_mmx
21 ; unsigned char *src_ptr,              read below as arg(0)
22 ; unsigned short *output_ptr,          read below as arg(1)
23 ; unsigned int src_pixels_per_line,    read below as arg(2)
24 ; unsigned int pixel_step,             not read in the visible code
25 ; unsigned int output_height,          read below as arg(4)
26 ; unsigned int output_width,           read below as arg(5)
; The filter tap table is read below as arg(6); its prototype line is not
; visible in this extract.
29 global sym
(vp8_filter_block1d_h6_mmx
)
30 sym
(vp8_filter_block1d_h6_mmx
):
; NOTE(review): SHADOW_ARGS_TO_STACK, arg(), sym() and GLOBAL are not defined
; in this file; presumably they come from the included x86_abi_support.asm.
33 SHADOW_ARGS_TO_STACK
7
; rdx = base of the tap table. Taps 1..4 are cached in registers; taps 0 and 5
; are read from memory ([rdx] and [rdx+80]) further down.
39 mov rdx
, arg
(6) ;vp8_filter
41 movq mm1
, [rdx
+ 16] ; mm1 = tap 1; do both the negative taps first!!!
42 movq mm2
, [rdx
+ 32] ; mm2 = tap 2
43 movq mm6
, [rdx
+ 48] ; mm6 = tap 3
44 movq mm7
, [rdx
+ 64] ; mm7 = tap 4
46 mov rdi
, arg
(1) ;output_ptr
47 mov rsi
, arg
(0) ;src_ptr
; rcx is the row counter decremented at the bottom of the block.
48 movsxd rcx
, dword ptr arg
(4) ;output_height
; The use of rax is not visible in this extract.
49 movsxd rax
, dword ptr arg
(5) ;output_width ; destination pitch?
50 pxor mm0
, mm0
; mm0 = 00000000 (zero source for the byte->word unpacks)
53 movq mm3
, [rsi
-2] ; mm3 = p-2..p5
54 movq mm4
, mm3
; mm4 = p-2..p5
55 psrlq mm3
, 8 ; mm3 = p-1..p5
56 punpcklbw mm3
, mm0
; mm3 = p-1..p2
57 pmullw mm3
, mm1
; mm3 *= kernel 1 modifiers.
59 movq mm5
, mm4
; mm5 = p-2..p5
60 punpckhbw mm4
, mm0
; mm4 = p2..p5 (high four bytes widened to words)
61 pmullw mm4
, mm7
; mm4 *= kernel 4 modifiers
62 paddsw mm3
, mm4
; mm3 += mm4
64 movq mm4
, mm5
; mm4 = p-2..p5;
65 psrlq mm5
, 16 ; mm5 = p0..p5;
66 punpcklbw mm5
, mm0
; mm5 = p0..p3
67 pmullw mm5
, mm2
; mm5 *= kernel 2 modifiers
68 paddsw mm3
, mm5
; mm3 += mm5
70 movq mm5
, mm4
; mm5 = p-2..p5
71 psrlq mm4
, 24 ; mm4 = p1..p5
72 punpcklbw mm4
, mm0
; mm4 = p1..p4
73 pmullw mm4
, mm6
; mm4 *= kernel 3 modifiers
74 paddsw mm3
, mm4
; mm3 += mm4
76 ; do outer positive taps
; NOTE(review): the load that refills mm4 with p3..p6 before this unpack is
; not visible in this extract.
78 punpcklbw mm4
, mm0
; mm4 = p3..p6
79 pmullw mm4
, [rdx
+80] ; mm4 *= kernel 5 modifiers (table offset 80 = 5 * 16)
80 paddsw mm3
, mm4
; mm3 += mm4
82 punpcklbw mm5
, mm0
; mm5 = p-2..p1
83 pmullw mm5
, [rdx
] ; mm5 *= kernel 0 modifiers (table offset 0)
84 paddsw mm3
, mm5
; mm3 += mm5
86 paddsw mm3
, [rd
GLOBAL] ; mm3 += round value
87 psraw mm3
, VP8_FILTER_SHIFT
; mm3 >>= 7 (arithmetic), i.e. /= 128
88 packuswb mm3
, mm0
; pack and unpack to saturate (the matching unpack is not visible here)
91 movq
[rdi
], mm3
; store the results in the destination
; NOTE(review): two alternative ways of advancing rsi by src_pixels_per_line
; appear back to back (memory operand vs. sign-extended through r8).
; Presumably a conditional-assembly directive that is not visible here selects
; one of them -- confirm.
94 add rsi
, dword ptr arg
(2) ;src_pixels_per_line ; next line
97 movsxd r8
, dword ptr arg
(2) ;src_pixels_per_line
100 add rsi
, r8
; next line
103 dec rcx
; decrement count
104 jnz nextrow
; next row
116 ; THIS FUNCTION APPEARS TO BE UNUSED
; ---------------------------------------------------------------------------
; vp8_filter_block1d_v6_mmx: vertical 6-tap filter pass.
; Per output row: six rows of 4 words, pixels_per_line (rdx) bytes apart, are
; each multiplied by one tap of the table at arg(6), summed with saturating
; adds, rounded with `rd`, shifted right by VP8_FILTER_SHIFT, packed to bytes
; and the low 4 bytes stored through rdi.
; NOTE(review): this extract is missing lines -- the prologue, the `nextrow_v`
; label, the output-pointer advance and the epilogue/ret are not visible.
; ---------------------------------------------------------------------------
118 ;void vp8_filter_block1d_v6_mmx
; src_ptr is read below as arg(0); its prototype line is not visible here.
121 ; unsigned char *output_ptr,       read below as arg(1)
122 ; unsigned int pixels_per_line,    read below as arg(2)
123 ; unsigned int pixel_step,         not read in the visible code
124 ; unsigned int output_height,      read below as arg(4)
125 ; unsigned int output_width,       read below as arg(5)
; The filter tap table is read below as arg(6).
128 global sym
(vp8_filter_block1d_v6_mmx
)
129 sym
(vp8_filter_block1d_v6_mmx
):
132 SHADOW_ARGS_TO_STACK
7
; mm5 = rounding constant, added once per row just before the shift.
138 movq mm5
, [rd
GLOBAL]
; rbx = base of the tap table; taps are 16 bytes apart. Taps 1..4 are cached
; in registers, taps 0 and 5 are read from [rbx] and [rbx+80] below.
140 mov rbx
, arg
(6) ;vp8_filter
141 movq mm1
, [rbx
+ 16] ; mm1 = tap 1; do both the negative taps first!!!
142 movq mm2
, [rbx
+ 32] ; mm2 = tap 2
143 movq mm6
, [rbx
+ 48] ; mm6 = tap 3
144 movq mm7
, [rbx
+ 64] ; mm7 = tap 4
146 movsxd rdx
, dword ptr arg
(2) ;pixels_per_line
147 mov rdi
, arg
(1) ;output_ptr
148 mov rsi
, arg
(0) ;src_ptr
; rcx is the row counter decremented at the bottom of the block.
151 movsxd rcx
, DWORD PTR arg
(4) ;output_height
; The use of rax is not visible in this extract.
152 movsxd rax
, DWORD PTR arg
(5) ;output_width ; destination pitch?
153 pxor mm0
, mm0
; mm0 = 00000000
157 movq mm3
, [rsi
+rdx
] ; mm3 = p0..p3 = row -1 (4 words, as consumed by pmullw)
158 pmullw mm3
, mm1
; mm3 *= kernel 1 modifiers.
161 movq mm4
, [rsi
+ 4*rdx
] ; mm4 = p0..p3 = row 2
162 pmullw mm4
, mm7
; mm4 *= kernel 4 modifiers.
163 paddsw mm3
, mm4
; mm3 += mm4
165 movq mm4
, [rsi
+ 2*rdx
] ; mm4 = p0..p3 = row 0
166 pmullw mm4
, mm2
; mm4 *= kernel 2 modifiers.
167 paddsw mm3
, mm4
; mm3 += mm4
169 movq mm4
, [rsi
] ; mm4 = p0..p3 = row -2
170 pmullw mm4
, [rbx
] ; mm4 *= kernel 0 modifiers.
171 paddsw mm3
, mm4
; mm3 += mm4
; After this add, rsi is one row further on, so the 2*rdx and 4*rdx offsets
; below reach what were rows 1 and 3 (originally 3*pitch and 5*pitch away).
; No other rsi update is visible, so this presumably also serves as the
; per-iteration source advance -- confirm against the full loop.
174 add rsi
, rdx
; move source forward 1 line to avoid 3 * pitch
175 movq mm4
, [rsi
+ 2*rdx
] ; mm4 = p0..p3 = row 1
176 pmullw mm4
, mm6
; mm4 *= kernel 3 modifiers.
177 paddsw mm3
, mm4
; mm3 += mm4
179 movq mm4
, [rsi
+ 4*rdx
] ; mm4 = p0..p3 = row 3
180 pmullw mm4
, [rbx
+80] ; mm4 *= kernel 5 modifiers (table offset 80 = 5 * 16).
181 paddsw mm3
, mm4
; mm3 += mm4
184 paddsw mm3
, mm5
; mm3 += round value
185 psraw mm3
, VP8_FILTER_SHIFT
; mm3 >>= 7 (arithmetic), i.e. /= 128
186 packuswb mm3
, mm0
; pack and saturate
188 movd
[rdi
],mm3
; store the results in the destination (low 4 bytes only)
192 dec rcx
; decrement count
193 jnz nextrow_v
; next row
; ---------------------------------------------------------------------------
; vp8_filter_block1dc_v6_mmx: vertical 6-tap filter pass taking eight shadowed
; arguments. The visible arithmetic is the same as in
; vp8_filter_block1d_v6_mmx, but the arguments are read from different slots:
; output_pitch from arg(2), pixels_per_line from arg(3), output_height from
; arg(5) and the tap table from arg(7).
; NOTE(review): this extract is missing lines -- the prologue, the
; `nextrow_cv` label, the output-pointer advance and the epilogue/ret are not
; visible.
; ---------------------------------------------------------------------------
206 ;void vp8_filter_block1dc_v6_mmx
; src_ptr is read below as arg(0); its prototype line is not visible here.
209 ; unsigned char *output_ptr,       read below as arg(1)
; output_pitch is read below as arg(2); its prototype line is not visible here.
211 ; unsigned int pixels_per_line,    read below as arg(3)
212 ; unsigned int pixel_step,         not read in the visible code
213 ; unsigned int output_height,      read below as arg(5)
214 ; unsigned int output_width,       not read in the visible code
; The filter tap table is read below as arg(7).
217 global sym
(vp8_filter_block1dc_v6_mmx
)
218 sym
(vp8_filter_block1dc_v6_mmx
):
221 SHADOW_ARGS_TO_STACK
8
; mm5 = rounding constant, added once per row just before the shift.
227 movq mm5
, [rd
GLOBAL]
; rbx = base of the tap table; taps are 16 bytes apart. Taps 1..4 are cached
; in registers, taps 0 and 5 are read from [rbx] and [rbx+80] below.
229 mov rbx
, arg
(7) ;vp8_filter
230 movq mm1
, [rbx
+ 16] ; mm1 = tap 1; do both the negative taps first!!!
231 movq mm2
, [rbx
+ 32] ; mm2 = tap 2
232 movq mm6
, [rbx
+ 48] ; mm6 = tap 3
233 movq mm7
, [rbx
+ 64] ; mm7 = tap 4
235 movsxd rdx
, dword ptr arg
(3) ;pixels_per_line
236 mov rdi
, arg
(1) ;output_ptr
237 mov rsi
, arg
(0) ;src_ptr
; rcx is the row counter decremented at the bottom of the block.
240 movsxd rcx
, DWORD PTR arg
(5) ;output_height
; The use of rax is not visible in this extract.
241 movsxd rax
, DWORD PTR arg
(2) ;output_pitch ; destination pitch?
242 pxor mm0
, mm0
; mm0 = 00000000
246 movq mm3
, [rsi
+rdx
] ; mm3 = p0..p3 = row -1 (4 words, as consumed by pmullw)
247 pmullw mm3
, mm1
; mm3 *= kernel 1 modifiers.
250 movq mm4
, [rsi
+ 4*rdx
] ; mm4 = p0..p3 = row 2
251 pmullw mm4
, mm7
; mm4 *= kernel 4 modifiers.
252 paddsw mm3
, mm4
; mm3 += mm4
254 movq mm4
, [rsi
+ 2*rdx
] ; mm4 = p0..p3 = row 0
255 pmullw mm4
, mm2
; mm4 *= kernel 2 modifiers.
256 paddsw mm3
, mm4
; mm3 += mm4
258 movq mm4
, [rsi
] ; mm4 = p0..p3 = row -2
259 pmullw mm4
, [rbx
] ; mm4 *= kernel 0 modifiers.
260 paddsw mm3
, mm4
; mm3 += mm4
; After this add, rsi is one row further on, so the 2*rdx and 4*rdx offsets
; below reach what were rows 1 and 3 (originally 3*pitch and 5*pitch away).
263 add rsi
, rdx
; move source forward 1 line to avoid 3 * pitch
264 movq mm4
, [rsi
+ 2*rdx
] ; mm4 = p0..p3 = row 1
265 pmullw mm4
, mm6
; mm4 *= kernel 3 modifiers.
266 paddsw mm3
, mm4
; mm3 += mm4
268 movq mm4
, [rsi
+ 4*rdx
] ; mm4 = p0..p3 = row 3
269 pmullw mm4
, [rbx
+80] ; mm4 *= kernel 5 modifiers (table offset 80 = 5 * 16).
270 paddsw mm3
, mm4
; mm3 += mm4
273 paddsw mm3
, mm5
; mm3 += round value
274 psraw mm3
, VP8_FILTER_SHIFT
; mm3 >>= 7 (arithmetic), i.e. /= 128
275 packuswb mm3
, mm0
; pack and saturate
277 movd
[rdi
],mm3
; store the results in the destination (low 4 bytes only)
278 ; The subsequent iterations repeat most of these row reads. Since the
279 ; recon block should be in cache this shouldn't cost much. It's obviously
; (NOTE(review): the remainder of the sentence above is not visible in this extract.)
282 dec rcx
; decrement count
283 jnz nextrow_cv
; next row
; ---------------------------------------------------------------------------
; vp8_bilinear_predict8x8_mmx: two-pass bilinear prediction writing through
; dst_ptr with stride dst_pitch. xoffset and yoffset each select a 32-byte
; entry of vp8_bilinear_filters_mmx (offset << 5); every pass rounds with `rd`
; and shifts right by VP8_FILTER_SHIFT.
; NOTE(review): large parts of this function are missing from the extract (the
; prologue, the zeroing of mm0, the horizontal multiplies, the loop label and
; compare, and the epilogue/ret). The comments below describe only the visible
; lines.
; ---------------------------------------------------------------------------
296 ;void bilinear_predict8x8_mmx
298 ; unsigned char *src_ptr,          read below as arg(0)
299 ; int src_pixels_per_line,         read below as arg(1)
; xoffset and yoffset are read below as arg(2) and arg(3); their prototype
; lines are not visible in this extract.
302 ; unsigned char *dst_ptr,          read below as arg(4)
; dst_pitch is read below as arg(5).
305 global sym
(vp8_bilinear_predict8x8_mmx
)
306 sym
(vp8_bilinear_predict8x8_mmx
):
309 SHADOW_ARGS_TO_STACK
6
315 ;const short *HFilter = bilinear_filters_mmx[xoffset];
316 ;const short *VFilter = bilinear_filters_mmx[yoffset];
318 movsxd rax
, dword ptr arg
(2) ;xoffset
319 mov rdi
, arg
(4) ;dst_ptr ;
321 shl rax
, 5 ; offset * 32
322 lea rcx
, [sym
(vp8_bilinear_filters_mmx
) GLOBAL]
324 add rax
, rcx
; HFilter
325 mov rsi
, arg
(0) ;src_ptr ;
327 movsxd rdx
, dword ptr arg
(5) ;dst_pitch
; NOTE(review): rax (HFilter) is overwritten here; whatever consumed or saved
; it first is not visible in this extract.
331 movsxd rax
, dword ptr arg
(3) ;yoffset
335 shl rax
, 5 ; offset*32
336 add rax
, rcx
; VFilter
; rcx = dst_ptr + 8 * dst_pitch. Presumably the loop's end-of-block marker;
; the compare that would use it is not visible here.
338 lea rcx
, [rdi
+rdx
*8] ;
; rdx is reused from here on as the source stride.
339 movsxd rdx
, dword ptr arg
(1) ;src_pixels_per_line ;
343 ; get the first horizontal line done ;
344 movq mm3
, [rsi
] ; load 8 source bytes
345 movq mm4
, mm3
; make a copy of current line
; punpcklbw interleaves the low four bytes of mm3 with mm0 (mm0 is presumably
; zero; its initialisation is not visible in this extract).
347 punpcklbw mm3
, mm0
; low 4 pixels -> words
365 paddw mm3
, [rd
GLOBAL] ; mm3 += round value
366 psraw mm3
, VP8_FILTER_SHIFT
; mm3 >>= 7, i.e. /= 128
368 paddw mm4
, [rd
GLOBAL] ; mm4 += round value
369 psraw mm4
, VP8_FILTER_SHIFT
; mm4 >>= 7, i.e. /= 128
374 add rsi
, rdx
; next line
376 movq mm3
, [rsi
] ; load 8 source bytes
377 movq mm4
, mm3
; make a copy of current line
379 punpcklbw mm3
, mm0
; low 4 pixels -> words
406 paddw mm3
, [rd
GLOBAL] ; mm3 += round value
407 psraw mm3
, VP8_FILTER_SHIFT
; mm3 >>= 7, i.e. /= 128
409 paddw mm4
, [rd
GLOBAL] ; mm4 += round value
410 psraw mm4
, VP8_FILTER_SHIFT
; mm4 >>= 7, i.e. /= 128
; [rax+16] is the second 16-byte half of the VFilter entry selected above.
416 pmullw mm3
, [rax
+16] ;
417 pmullw mm4
, [rax
+16] ;
423 paddw mm3
, [rd
GLOBAL] ; mm3 += round value
424 psraw mm3
, VP8_FILTER_SHIFT
; mm3 >>= 7, i.e. /= 128
426 paddw mm4
, [rd
GLOBAL] ; mm4 += round value
427 psraw mm4
, VP8_FILTER_SHIFT
; mm4 >>= 7, i.e. /= 128
431 movq
[rdi
], mm3
; store the results in the destination
; NOTE(review): two alternative pointer-advance sequences appear back to back
; (dst_pitch as a memory operand vs. sign-extended through r8). Presumably a
; conditional-assembly directive that is not visible here selects one --
; confirm.
434 add rsi
, rdx
; next line
435 add rdi
, dword ptr arg
(5) ;dst_pitch ;
437 movsxd r8
, dword ptr arg
(5) ;dst_pitch
438 add rsi
, rdx
; next line
439 add rdi
, r8
;dst_pitch
; ---------------------------------------------------------------------------
; vp8_bilinear_predict8x4_mmx: bilinear prediction writing through dst_ptr.
; The visible lines follow the same shape as vp8_bilinear_predict8x8_mmx, with
; the end marker at dst_ptr + 4 * dst_pitch instead of 8.
; NOTE(review): large parts of this function are missing from the extract (the
; prologue, the scaling of xoffset/yoffset into table addresses, the zeroing
; of mm0, the horizontal multiplies, the loop label and compare, the final
; destination advance and the epilogue/ret). The comments below describe only
; the visible lines.
; ---------------------------------------------------------------------------
453 ;void bilinear_predict8x4_mmx
455 ; unsigned char *src_ptr,          read below as arg(0)
456 ; int src_pixels_per_line,         read below as arg(1)
; xoffset and yoffset are read below as arg(2) and arg(3); their prototype
; lines are not visible in this extract.
459 ; unsigned char *dst_ptr,          read below as arg(4)
; dst_pitch is read below as arg(5).
462 global sym
(vp8_bilinear_predict8x4_mmx
)
463 sym
(vp8_bilinear_predict8x4_mmx
):
466 SHADOW_ARGS_TO_STACK
6
472 ;const short *HFilter = bilinear_filters_mmx[xoffset];
473 ;const short *VFilter = bilinear_filters_mmx[yoffset];
475 movsxd rax
, dword ptr arg
(2) ;xoffset
476 mov rdi
, arg
(4) ;dst_ptr ;
; rcx = address of the bilinear filter table.
478 lea rcx
, [sym
(vp8_bilinear_filters_mmx
) GLOBAL]
481 mov rsi
, arg
(0) ;src_ptr ;
484 movsxd rdx
, dword ptr arg
(5) ;dst_pitch
; NOTE(review): the steps that turn this offset into a table address (shift
; and add of rcx, as in the 8x8 variant) are not visible in this extract.
488 movsxd rax
, dword ptr arg
(3) ;yoffset
; rcx = dst_ptr + 4 * dst_pitch. Presumably the loop's end-of-block marker;
; the compare that would use it is not visible here.
494 lea rcx
, [rdi
+rdx
*4] ;
; rdx is reused from here on as the source stride.
496 movsxd rdx
, dword ptr arg
(1) ;src_pixels_per_line ;
498 ; get the first horizontal line done ;
499 movq mm3
, [rsi
] ; load 8 source bytes
500 movq mm4
, mm3
; make a copy of current line
; punpcklbw interleaves the low four bytes of mm3 with mm0 (mm0 is presumably
; zero; its initialisation is not visible in this extract).
502 punpcklbw mm3
, mm0
; low 4 pixels -> words
520 paddw mm3
, [rd
GLOBAL] ; mm3 += round value
521 psraw mm3
, VP8_FILTER_SHIFT
; mm3 >>= 7, i.e. /= 128
523 paddw mm4
, [rd
GLOBAL] ; mm4 += round value
524 psraw mm4
, VP8_FILTER_SHIFT
; mm4 >>= 7, i.e. /= 128
529 add rsi
, rdx
; next line
531 movq mm3
, [rsi
] ; load 8 source bytes
532 movq mm4
, mm3
; make a copy of current line
534 punpcklbw mm3
, mm0
; low 4 pixels -> words
561 paddw mm3
, [rd
GLOBAL] ; mm3 += round value
562 psraw mm3
, VP8_FILTER_SHIFT
; mm3 >>= 7, i.e. /= 128
564 paddw mm4
, [rd
GLOBAL] ; mm4 += round value
565 psraw mm4
, VP8_FILTER_SHIFT
; mm4 >>= 7, i.e. /= 128
; Multiply by the 16 bytes at rax+16 (presumably the second half of the
; VFilter entry, as in the 8x8 variant -- confirm).
571 pmullw mm3
, [rax
+16] ;
572 pmullw mm4
, [rax
+16] ;
578 paddw mm3
, [rd
GLOBAL] ; mm3 += round value
579 psraw mm3
, VP8_FILTER_SHIFT
; mm3 >>= 7, i.e. /= 128
581 paddw mm4
, [rd
GLOBAL] ; mm4 += round value
582 psraw mm4
, VP8_FILTER_SHIFT
; mm4 >>= 7, i.e. /= 128
586 movq
[rdi
], mm3
; store the results in the destination
; NOTE(review): two alternative pointer-advance sequences appear back to back
; (dst_pitch as a memory operand vs. sign-extended through r8). Presumably a
; conditional-assembly directive that is not visible here selects one; the
; `add rdi, r8` that would complete the second sequence is also not visible.
589 add rsi
, rdx
; next line
590 add rdi
, dword ptr arg
(5) ;dst_pitch ;
592 movsxd r8
, dword ptr arg
(5) ;dst_pitch
593 add rsi
, rdx
; next line
; ---------------------------------------------------------------------------
; vp8_bilinear_predict4x4_mmx: bilinear prediction writing through dst_ptr.
; Unlike the 8-wide variants it loads and stores 4 bytes per row with movd and
; works in mm3 only.
; NOTE(review): large parts of this function are missing from the extract (the
; prologue, the scaling of xoffset/yoffset, the zeroing of mm0, the horizontal
; multiplies, the loop label and compare, the final destination advance and
; the epilogue/ret). The comments below describe only the visible lines.
; ---------------------------------------------------------------------------
608 ;void bilinear_predict4x4_mmx
610 ; unsigned char *src_ptr,          read below as arg(0)
611 ; int src_pixels_per_line,         read below as arg(1)
; xoffset and yoffset are read below as arg(2) and arg(3); their prototype
; lines are not visible in this extract.
614 ; unsigned char *dst_ptr,          read below as arg(4)
; dst_pitch is read below as arg(5).
617 global sym
(vp8_bilinear_predict4x4_mmx
)
618 sym
(vp8_bilinear_predict4x4_mmx
):
621 SHADOW_ARGS_TO_STACK
6
627 ;const short *HFilter = bilinear_filters_mmx[xoffset];
628 ;const short *VFilter = bilinear_filters_mmx[yoffset];
630 movsxd rax
, dword ptr arg
(2) ;xoffset
631 mov rdi
, arg
(4) ;dst_ptr ;
; rcx = address of the bilinear filter table.
633 lea rcx
, [sym
(vp8_bilinear_filters_mmx
) GLOBAL]
; NOTE(review): the shift that scales xoffset before this add (present in the
; 8x8 variant) is not visible in this extract.
636 add rax
, rcx
; HFilter
637 mov rsi
, arg
(0) ;src_ptr ;
639 movsxd rdx
, dword ptr arg
(5) ;dst_pitch
; NOTE(review): rax (HFilter) is overwritten here; whatever consumed or saved
; it first, and the later scaling of yoffset, are not visible in this extract.
643 movsxd rax
, dword ptr arg
(3) ;yoffset
; rcx = dst_ptr + 4 * dst_pitch. Presumably the loop's end-of-block marker;
; the compare that would use it is not visible here.
649 lea rcx
, [rdi
+rdx
*4] ;
; rdx is reused from here on as the source stride.
651 movsxd rdx
, dword ptr arg
(1) ;src_pixels_per_line ;
653 ; get the first horizontal line done ;
654 movd mm3
, [rsi
] ; load 4 source bytes
; punpcklbw interleaves the low four bytes of mm3 with mm0 (mm0 is presumably
; zero; its initialisation is not visible in this extract).
655 punpcklbw mm3
, mm0
; 4 pixels -> words
664 paddw mm3
, [rd
GLOBAL] ; mm3 += round value
666 psraw mm3
, VP8_FILTER_SHIFT
; mm3 >>= 7, i.e. /= 128
671 add rsi
, rdx
; next line
673 movd mm3
, [rsi
] ; load 4 source bytes
674 punpcklbw mm3
, mm0
; 4 pixels -> words
688 paddw mm3
, [rd
GLOBAL] ; mm3 += round value
690 psraw mm3
, VP8_FILTER_SHIFT
; mm3 >>= 7, i.e. /= 128
; Multiply by the 16 bytes at rax+16 (presumably the second half of the
; VFilter entry, as in the 8x8 variant -- confirm).
695 pmullw mm3
, [rax
+16] ;
699 paddw mm3
, [rd
GLOBAL] ; mm3 += round value
700 psraw mm3
, VP8_FILTER_SHIFT
; mm3 >>= 7, i.e. /= 128
703 movd
[rdi
], mm3
; store the results in the destination
; NOTE(review): two alternative pointer-advance sequences appear back to back
; (dst_pitch as a memory operand vs. sign-extended through r8). Presumably a
; conditional-assembly directive that is not visible here selects one; the
; `add rdi, r8` that would complete the second sequence is also not visible.
706 add rsi
, rdx
; next line
707 add rdi
, dword ptr arg
(5) ;dst_pitch ;
709 movsxd r8
, dword ptr arg
(5) ;dst_pitch ;
710 add rsi
, rdx
; next line
; Exported data label. Judging by the name this is presumably the six-tap
; coefficient table -- confirm.
; NOTE(review): the section/alignment directives and the table rows that
; should surround this label are not visible in this extract.
733 global sym
(vp8_six_tap_mmx
)
734 sym
(vp8_six_tap_mmx
):
793 global sym
(vp8_bilinear_filters_mmx
)
794 sym
(vp8_bilinear_filters_mmx
):