2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
4 ; Use of this source code is governed by a BSD-style license
5 ; that can be found in the LICENSE file in the root of the source
6 ; tree. An additional intellectual property rights grant can be found
7 ; in the file PATENTS. All contributing project authors may
8 ; be found in the AUTHORS file in the root of the source tree.
12 %include "vpx_ports/x86_abi_support.asm"
; Assembly-time constants for the filter routines below.
; NOTE(review): BLOCK_HEIGHT_WIDTH is not referenced by any line visible in
; this listing -- confirm whether it is still needed.
15 %define BLOCK_HEIGHT_WIDTH
4
; 128 == 1 << VP8_FILTER_SHIFT. Not referenced by any line visible in this
; listing.
16 %define vp8_filter_weight
128
; Shift count passed to the `psraw` that follows each rounding add in the
; routines below (the in-line comments describe it as "/= 128").
17 %define VP8_FILTER_SHIFT
7
;-----------------------------------------------------------------------
; vp8_filter_block1d_h6_mmx -- horizontal six-tap filter pass (MMX).
;
; Register roles established by the visible code:
;   rdx = arg(6) vp8_filter; eight bytes are read at each of the byte
;         offsets 0, 16, 32, 48, 64 and 80 from this pointer
;   rsi = arg(0) src_ptr          rdi = arg(1) output_ptr
;   rcx = arg(4) output_height, sign-extended; counted down to zero
;   rax = arg(5) output_width, sign-extended; loaded but not used by
;         any line visible here
;   mm0 = zero                    mm3 = running sum of tap products
;
; The products are accumulated with saturating adds (paddsw), the
; rounding constant at [GLOBAL(rd)] is added, the sum is shifted right
; by VP8_FILTER_SHIFT, packed with unsigned saturation and stored.
;
; NOTE(review): this listing appears to be missing lines -- the `nextrow`
; label targeted by the jnz, any register save/restore and return, and
; any conditional-assembly directives. Verify against the full file
; before relying on the instruction sequence shown here.
;-----------------------------------------------------------------------
20 ;void vp8_filter_block1d_h6_mmx
22 ; unsigned char *src_ptr,
23 ; unsigned short *output_ptr,
24 ; unsigned int src_pixels_per_line,
25 ; unsigned int pixel_step,
26 ; unsigned int output_height,
27 ; unsigned int output_width,
30 global sym
(vp8_filter_block1d_h6_mmx
)
31 sym
(vp8_filter_block1d_h6_mmx
):
34 SHADOW_ARGS_TO_STACK
7
40 mov rdx
, arg
(6) ;vp8_filter
42 movq mm1
, [rdx
+ 16] ; do both the negative taps first!!!
43 movq mm2
, [rdx
+ 32] ; mm2 = filter row at byte offset 32
44 movq mm6
, [rdx
+ 48] ; mm6 = filter row at byte offset 48
45 movq mm7
, [rdx
+ 64] ; mm7 = filter row at byte offset 64
47 mov rdi
, arg
(1) ;output_ptr
48 mov rsi
, arg
(0) ;src_ptr
49 movsxd rcx
, dword ptr arg
(4) ;output_height
50 movsxd rax
, dword ptr arg
(5) ;output_width ; destination pitch?
51 pxor mm0
, mm0
; mm0 = 00000000
54 movq mm3
, [rsi
-2] ; mm3 = p-2..p5
55 movq mm4
, mm3
; mm4 = p-2..p5
56 psrlq mm3
, 8 ; mm3 = p-1..p5
57 punpcklbw mm3
, mm0
; mm3 = p-1..p2
58 pmullw mm3
, mm1
; mm3 *= kernel 1 modifiers.
60 movq mm5
, mm4
; mm5 = p-2..p5
61 punpckhbw mm4
, mm0
; mm4 = p2..p5 (high four bytes widened to words)
62 pmullw mm4
, mm7
; mm4 *= kernel 4 modifiers (mm7, byte offset 64)
63 paddsw mm3
, mm4
; mm3 += mm4
65 movq mm4
, mm5
; mm4 = p-2..p5;
66 psrlq mm5
, 16 ; mm5 = p0..p5;
67 punpcklbw mm5
, mm0
; mm5 = p0..p3
68 pmullw mm5
, mm2
; mm5 *= kernel 2 modifiers
69 paddsw mm3
, mm5
; mm3 += mm5
71 movq mm5
, mm4
; mm5 = p-2..p5
72 psrlq mm4
, 24 ; mm4 = p1..p5
73 punpcklbw mm4
, mm0
; mm4 = p1..p4
74 pmullw mm4
, mm6
; mm4 *= kernel 3 modifiers (mm6, byte offset 48)
75 paddsw mm3
, mm4
; mm3 += mm4
77 ; do outer positive taps
; NOTE(review): at this point mm4 still holds the products computed just
; above; no load of p3..p6 into mm4 is visible before the unpack below.
; A line is presumably missing from this listing -- confirm against the
; full file.
79 punpcklbw mm4
, mm0
; mm4 = p3..p6 according to the original comment (see note above)
80 pmullw mm4
, [rdx
+80] ; mm4 *= filter row at byte offset 80
81 paddsw mm3
, mm4
; mm3 += mm4
83 punpcklbw mm5
, mm0
; mm5 = p-2..p1
84 pmullw mm5
, [rdx
] ; mm5 *= filter row at byte offset 0
85 paddsw mm3
, mm5
; mm3 += mm5
87 paddsw mm3
, [GLOBAL(rd
)] ; mm3 += round value
88 psraw mm3
, VP8_FILTER_SHIFT
; mm3 /= 128
89 packuswb mm3
, mm0
; pack words to bytes with unsigned saturation (the matching unpack mentioned by the original comment is not visible in this listing)
92 movq
[rdi
], mm3
; store the results in the destination
; NOTE(review): two different sequences that advance rsi by
; src_pixels_per_line follow back to back. Presumably they are the two
; arms of an ABI conditional (32-bit vs 64-bit) whose directives are not
; visible here; if both were assembled, rsi would advance by two lines
; per iteration. Confirm against the full file.
95 add rsi
, dword ptr arg
(2) ;src_pixels_per_line ; next line
98 movsxd r8
, dword ptr arg
(2) ;src_pixels_per_line
101 add rsi
, r8
; next line
104 dec rcx
; decrement count
105 jnz nextrow
; next row
;-----------------------------------------------------------------------
; vp8_filter_block1d_v6_mmx -- vertical six-tap filter pass (MMX).
;
; Register roles established by the visible code:
;   rbx = arg(6) vp8_filter; rows read at byte offsets 0,16,32,48,64,80
;   rdx = arg(2) pixels_per_line, sign-extended; used as the row stride
;   rsi = arg(0) src_ptr          rdi = arg(1) output_ptr
;   rcx = arg(4) output_height, sign-extended; counted down to zero
;   rax = arg(5) output_width, sign-extended; loaded but not used by
;         any line visible here
;   mm5 = rounding constant from [GLOBAL(rd)]      mm0 = zero
;
; Per visible iteration: the quadword at rsi + k*rdx (k = 0..5, relative
; to rsi at the top of the iteration) is multiplied word-wise by the
; filter row at byte offset 16*k; the six products are summed with
; saturating adds, rounded, shifted right by VP8_FILTER_SHIFT, packed to
; bytes with unsigned saturation, and four bytes are stored at [rdi].
; rsi advances by one stride (rdx) per iteration.
;
; NOTE(review): rbx is callee-saved under both the SysV AMD64 and the
; Microsoft x64 conventions; its save/restore, the `nextrow_v` label and
; any advance of rdi are not visible in this listing. Confirm against
; the full file.
;-----------------------------------------------------------------------
117 ; THIS FUNCTION APPEARS TO BE UNUSED
119 ;void vp8_filter_block1d_v6_mmx
122 ; unsigned char *output_ptr,
123 ; unsigned int pixels_per_line,
124 ; unsigned int pixel_step,
125 ; unsigned int output_height,
126 ; unsigned int output_width,
129 global sym
(vp8_filter_block1d_v6_mmx
)
130 sym
(vp8_filter_block1d_v6_mmx
):
133 SHADOW_ARGS_TO_STACK
7
139 movq mm5
, [GLOBAL(rd
)]
141 mov rbx
, arg
(6) ;vp8_filter
142 movq mm1
, [rbx
+ 16] ; do both the negative taps first!!!
143 movq mm2
, [rbx
+ 32] ; mm2 = filter row at byte offset 32
144 movq mm6
, [rbx
+ 48] ; mm6 = filter row at byte offset 48
145 movq mm7
, [rbx
+ 64] ; mm7 = filter row at byte offset 64
147 movsxd rdx
, dword ptr arg
(2) ;pixels_per_line
148 mov rdi
, arg
(1) ;output_ptr
149 mov rsi
, arg
(0) ;src_ptr
152 movsxd rcx
, DWORD PTR arg
(4) ;output_height
153 movsxd rax
, DWORD PTR arg
(5) ;output_width ; destination pitch?
154 pxor mm0
, mm0
; mm0 = 00000000
158 movq mm3
, [rsi
+rdx
] ; mm3 = four words of row -1 (rsi + 1*stride)
159 pmullw mm3
, mm1
; mm3 *= kernel 1 modifiers.
162 movq mm4
, [rsi
+ 4*rdx
] ; mm4 = p0..p3 = row 2
163 pmullw mm4
, mm7
; mm4 *= kernel 4 modifiers.
164 paddsw mm3
, mm4
; mm3 += mm4
166 movq mm4
, [rsi
+ 2*rdx
] ; mm4 = p0..p3 = row 0
167 pmullw mm4
, mm2
; mm4 *= kernel 2 modifiers.
168 paddsw mm3
, mm4
; mm3 += mm4
170 movq mm4
, [rsi
] ; mm4 = p0..p3 = row -2
171 pmullw mm4
, [rbx
] ; mm4 *= kernel 0 modifiers.
172 paddsw mm3
, mm4
; mm3 += mm4
175 add rsi
, rdx
; move source forward 1 line to avoid 3 * pitch
176 movq mm4
, [rsi
+ 2*rdx
] ; mm4 = p0..p3 = row 1
177 pmullw mm4
, mm6
; mm4 *= kernel 3 modifiers.
178 paddsw mm3
, mm4
; mm3 += mm4
180 movq mm4
, [rsi
+ 4*rdx
] ; mm4 = p0..p3 = row 3
181 pmullw mm4
, [rbx
+80] ; mm4 *= kernel 5 modifiers (byte offset 80).
182 paddsw mm3
, mm4
; mm3 += mm4
185 paddsw mm3
, mm5
; mm3 += round value
186 psraw mm3
, VP8_FILTER_SHIFT
; mm3 /= 128
187 packuswb mm3
, mm0
; pack and saturate
189 movd
[rdi
],mm3
; store the low four bytes of the result in the destination
193 dec rcx
; decrement count
194 jnz nextrow_v
; next row
;-----------------------------------------------------------------------
; vp8_filter_block1dc_v6_mmx -- vertical six-tap filter pass (MMX),
; eight-argument variant.
;
; Register roles established by the visible code:
;   rbx = arg(7) vp8_filter; rows read at byte offsets 0,16,32,48,64,80
;   rdx = arg(3) pixels_per_line, sign-extended; used as the row stride
;   rsi = arg(0) src_ptr          rdi = arg(1) output_ptr
;   rcx = arg(5) output_height, sign-extended; counted down to zero
;   rax = arg(2) output_pitch, sign-extended; loaded but not used by
;         any line visible here
;   mm5 = rounding constant from [GLOBAL(rd)]      mm0 = zero
;
; Per visible iteration: the quadword at rsi + k*rdx (k = 0..5, relative
; to rsi at the top of the iteration) is multiplied word-wise by the
; filter row at byte offset 16*k; the six products are summed with
; saturating adds, rounded, shifted right by VP8_FILTER_SHIFT, packed to
; bytes with unsigned saturation, and four bytes are stored at [rdi].
;
; NOTE(review): rbx is callee-saved under both the SysV AMD64 and the
; Microsoft x64 conventions; its save/restore, the `nextrow_cv` label
; and any advance of rdi by the output pitch are not visible in this
; listing. Confirm against the full file.
;-----------------------------------------------------------------------
207 ;void vp8_filter_block1dc_v6_mmx
210 ; unsigned char *output_ptr,
212 ; unsigned int pixels_per_line,
213 ; unsigned int pixel_step,
214 ; unsigned int output_height,
215 ; unsigned int output_width,
218 global sym
(vp8_filter_block1dc_v6_mmx
)
219 sym
(vp8_filter_block1dc_v6_mmx
):
222 SHADOW_ARGS_TO_STACK
8
228 movq mm5
, [GLOBAL(rd
)]
230 mov rbx
, arg
(7) ;vp8_filter
231 movq mm1
, [rbx
+ 16] ; do both the negative taps first!!!
232 movq mm2
, [rbx
+ 32] ; mm2 = filter row at byte offset 32
233 movq mm6
, [rbx
+ 48] ; mm6 = filter row at byte offset 48
234 movq mm7
, [rbx
+ 64] ; mm7 = filter row at byte offset 64
236 movsxd rdx
, dword ptr arg
(3) ;pixels_per_line
237 mov rdi
, arg
(1) ;output_ptr
238 mov rsi
, arg
(0) ;src_ptr
241 movsxd rcx
, DWORD PTR arg
(5) ;output_height
242 movsxd rax
, DWORD PTR arg
(2) ;output_pitch ; destination pitch?
243 pxor mm0
, mm0
; mm0 = 00000000
247 movq mm3
, [rsi
+rdx
] ; mm3 = four words of row -1 (rsi + 1*stride)
248 pmullw mm3
, mm1
; mm3 *= kernel 1 modifiers.
251 movq mm4
, [rsi
+ 4*rdx
] ; mm4 = p0..p3 = row 2
252 pmullw mm4
, mm7
; mm4 *= kernel 4 modifiers.
253 paddsw mm3
, mm4
; mm3 += mm4
255 movq mm4
, [rsi
+ 2*rdx
] ; mm4 = p0..p3 = row 0
256 pmullw mm4
, mm2
; mm4 *= kernel 2 modifiers.
257 paddsw mm3
, mm4
; mm3 += mm4
259 movq mm4
, [rsi
] ; mm4 = p0..p3 = row -2
260 pmullw mm4
, [rbx
] ; mm4 *= kernel 0 modifiers.
261 paddsw mm3
, mm4
; mm3 += mm4
264 add rsi
, rdx
; move source forward 1 line to avoid 3 * pitch
265 movq mm4
, [rsi
+ 2*rdx
] ; mm4 = p0..p3 = row 1
266 pmullw mm4
, mm6
; mm4 *= kernel 3 modifiers.
267 paddsw mm3
, mm4
; mm3 += mm4
269 movq mm4
, [rsi
+ 4*rdx
] ; mm4 = p0..p3 = row 3
270 pmullw mm4
, [rbx
+80] ; mm4 *= kernel 5 modifiers (byte offset 80).
271 paddsw mm3
, mm4
; mm3 += mm4
274 paddsw mm3
, mm5
; mm3 += round value
275 psraw mm3
, VP8_FILTER_SHIFT
; mm3 /= 128
276 packuswb mm3
, mm0
; pack and saturate
278 movd
[rdi
],mm3
; store the low four bytes of the result in the destination
279 ; the subsequent iterations repeat 3 out of 4 of these reads. Since the
280 ; recon block should be in cache this shouldn't cost much. It's obviously
; NOTE(review): the sentence above is cut off in this listing; the rest of
; the original remark is not visible.
283 dec rcx
; decrement count
284 jnz nextrow_cv
; next row
;-----------------------------------------------------------------------
; vp8_bilinear_predict8x8_mmx -- bilinear prediction (MMX).
;
; Setup performed by the visible code:
;   rax = vp8_bilinear_filters_mmx + xoffset*32   (HFilter), and later
;         vp8_bilinear_filters_mmx + yoffset*32   (VFilter)
;   rdi = arg(4) dst_ptr          rsi = arg(0) src_ptr
;   rcx = filter table base, then dst_ptr + 8*dst_pitch
;   rdx = arg(5) dst_pitch, then arg(1) src_pixels_per_line
;
; NOTE(review): this listing appears to be missing lines. No instruction
; that clears mm0, reads through HFilter, multiplies by the first filter
; row, packs the result, compares against rcx, or returns is visible,
; and no loop label is shown. Verify against the full file before
; relying on the instruction sequence shown here.
;-----------------------------------------------------------------------
297 ;void bilinear_predict8x8_mmx
299 ; unsigned char *src_ptr,
300 ; int src_pixels_per_line,
303 ; unsigned char *dst_ptr,
306 global sym
(vp8_bilinear_predict8x8_mmx
)
307 sym
(vp8_bilinear_predict8x8_mmx
):
310 SHADOW_ARGS_TO_STACK
6
316 ;const short *HFilter = bilinear_filters_mmx[xoffset];
317 ;const short *VFilter = bilinear_filters_mmx[yoffset];
319 movsxd rax
, dword ptr arg
(2) ;xoffset
320 mov rdi
, arg
(4) ;dst_ptr ;
322 shl rax
, 5 ; offset * 32
323 lea rcx
, [GLOBAL(sym
(vp8_bilinear_filters_mmx
))]
325 add rax
, rcx
; HFilter
326 mov rsi
, arg
(0) ;src_ptr ;
328 movsxd rdx
, dword ptr arg
(5) ;dst_pitch
; NOTE(review): rax (HFilter) is overwritten by the load below; any reads
; through HFilter are not visible in this listing.
332 movsxd rax
, dword ptr arg
(3) ;yoffset
336 shl rax
, 5 ; offset*32
337 add rax
, rcx
; VFilter
339 lea rcx
, [rdi
+rdx
*8] ; rcx = dst_ptr + 8*dst_pitch (presumably the loop end pointer; the compare is not visible)
340 movsxd rdx
, dword ptr arg
(1) ;src_pixels_per_line ;
344 ; get the first horizontal line done ;
345 movq mm3
, [rsi
] ; mm3 = eight source bytes of the current line
346 movq mm4
, mm3
; make a copy of current line
348 punpcklbw mm3
, mm0
; interleave the low four bytes with mm0 (widens to words if mm0 == 0; its clearing is not visible here)
366 paddw mm3
, [GLOBAL(rd
)] ; mm3 += round value
367 psraw mm3
, VP8_FILTER_SHIFT
; mm3 /= 128
369 paddw mm4
, [GLOBAL(rd
)] ; mm4 += round value
370 psraw mm4
, VP8_FILTER_SHIFT
; mm4 /= 128
375 add rsi
, rdx
; next line
377 movq mm3
, [rsi
] ; mm3 = eight source bytes of the current line
378 movq mm4
, mm3
; make a copy of current line
380 punpcklbw mm3
, mm0
; interleave the low four bytes with mm0 (widens to words if mm0 == 0)
407 paddw mm3
, [GLOBAL(rd
)] ; mm3 += round value
408 psraw mm3
, VP8_FILTER_SHIFT
; mm3 /= 128
410 paddw mm4
, [GLOBAL(rd
)] ; mm4 += round value
411 psraw mm4
, VP8_FILTER_SHIFT
; mm4 /= 128
417 pmullw mm3
, [rax
+16] ; mm3 *= VFilter row at byte offset 16
418 pmullw mm4
, [rax
+16] ; mm4 *= VFilter row at byte offset 16
424 paddw mm3
, [GLOBAL(rd
)] ; mm3 += round value
425 psraw mm3
, VP8_FILTER_SHIFT
; mm3 /= 128
427 paddw mm4
, [GLOBAL(rd
)] ; mm4 += round value
428 psraw mm4
, VP8_FILTER_SHIFT
; mm4 /= 128
432 movq
[rdi
], mm3
; store the results in the destination
; NOTE(review): two different pointer-advance sequences follow back to
; back. Presumably they are the two arms of an ABI conditional (32-bit vs
; 64-bit) whose directives are not visible here; if both were assembled,
; rsi and rdi would each advance twice per iteration. Confirm against the
; full file.
435 add rsi
, rdx
; next line
436 add rdi
, dword ptr arg
(5) ;dst_pitch ;
438 movsxd r8
, dword ptr arg
(5) ;dst_pitch
439 add rsi
, rdx
; next line
440 add rdi
, r8
;dst_pitch
;-----------------------------------------------------------------------
; vp8_bilinear_predict8x4_mmx -- bilinear prediction (MMX).
;
; Setup performed by the visible code:
;   rax = arg(2) xoffset, later reloaded with arg(3) yoffset
;   rdi = arg(4) dst_ptr          rsi = arg(0) src_ptr
;   rcx = address of vp8_bilinear_filters_mmx, then dst_ptr + 4*dst_pitch
;   rdx = arg(5) dst_pitch, then arg(1) src_pixels_per_line
;
; NOTE(review): this listing appears to be missing lines. The scaling of
; the offsets and the addition of the table base that would form
; HFilter/VFilter are not visible, so as shown rax holds the raw yoffset
; when [rax+16] is read. Likewise no instruction that clears mm0, packs
; the result, compares against rcx, advances rdi in the second sequence
; at the end, or returns is visible. Verify against the full file before
; relying on the instruction sequence shown here.
;-----------------------------------------------------------------------
454 ;void bilinear_predict8x4_mmx
456 ; unsigned char *src_ptr,
457 ; int src_pixels_per_line,
460 ; unsigned char *dst_ptr,
463 global sym
(vp8_bilinear_predict8x4_mmx
)
464 sym
(vp8_bilinear_predict8x4_mmx
):
467 SHADOW_ARGS_TO_STACK
6
473 ;const short *HFilter = bilinear_filters_mmx[xoffset];
474 ;const short *VFilter = bilinear_filters_mmx[yoffset];
476 movsxd rax
, dword ptr arg
(2) ;xoffset
477 mov rdi
, arg
(4) ;dst_ptr ;
479 lea rcx
, [GLOBAL(sym
(vp8_bilinear_filters_mmx
))]
482 mov rsi
, arg
(0) ;src_ptr ;
485 movsxd rdx
, dword ptr arg
(5) ;dst_pitch
489 movsxd rax
, dword ptr arg
(3) ;yoffset
495 lea rcx
, [rdi
+rdx
*4] ; rcx = dst_ptr + 4*dst_pitch (presumably the loop end pointer; the compare is not visible)
497 movsxd rdx
, dword ptr arg
(1) ;src_pixels_per_line ;
499 ; get the first horizontal line done ;
500 movq mm3
, [rsi
] ; mm3 = eight source bytes of the current line
501 movq mm4
, mm3
; make a copy of current line
503 punpcklbw mm3
, mm0
; interleave the low four bytes with mm0 (widens to words if mm0 == 0; its clearing is not visible here)
521 paddw mm3
, [GLOBAL(rd
)] ; mm3 += round value
522 psraw mm3
, VP8_FILTER_SHIFT
; mm3 /= 128
524 paddw mm4
, [GLOBAL(rd
)] ; mm4 += round value
525 psraw mm4
, VP8_FILTER_SHIFT
; mm4 /= 128
530 add rsi
, rdx
; next line
532 movq mm3
, [rsi
] ; mm3 = eight source bytes of the current line
533 movq mm4
, mm3
; make a copy of current line
535 punpcklbw mm3
, mm0
; interleave the low four bytes with mm0 (widens to words if mm0 == 0)
562 paddw mm3
, [GLOBAL(rd
)] ; mm3 += round value
563 psraw mm3
, VP8_FILTER_SHIFT
; mm3 /= 128
565 paddw mm4
, [GLOBAL(rd
)] ; mm4 += round value
566 psraw mm4
, VP8_FILTER_SHIFT
; mm4 /= 128
572 pmullw mm3
, [rax
+16] ; mm3 *= eight bytes at [rax+16] (second filter row if rax is VFilter -- see header note)
573 pmullw mm4
, [rax
+16] ; mm4 *= eight bytes at [rax+16]
579 paddw mm3
, [GLOBAL(rd
)] ; mm3 += round value
580 psraw mm3
, VP8_FILTER_SHIFT
; mm3 /= 128
582 paddw mm4
, [GLOBAL(rd
)] ; mm4 += round value
583 psraw mm4
, VP8_FILTER_SHIFT
; mm4 /= 128
587 movq
[rdi
], mm3
; store the results in the destination
; NOTE(review): two different pointer-advance sequences follow back to
; back. Presumably they are the two arms of an ABI conditional (32-bit vs
; 64-bit) whose directives are not visible here; if both were assembled,
; rsi would advance twice per iteration. Confirm against the full file.
590 add rsi
, rdx
; next line
591 add rdi
, dword ptr arg
(5) ;dst_pitch ;
593 movsxd r8
, dword ptr arg
(5) ;dst_pitch
594 add rsi
, rdx
; next line
;-----------------------------------------------------------------------
; vp8_bilinear_predict4x4_mmx -- bilinear prediction (MMX).
;
; Setup performed by the visible code:
;   rax = arg(2) xoffset + address of vp8_bilinear_filters_mmx (HFilter),
;         later reloaded with arg(3) yoffset
;   rdi = arg(4) dst_ptr          rsi = arg(0) src_ptr
;   rcx = address of vp8_bilinear_filters_mmx, then dst_ptr + 4*dst_pitch
;   rdx = arg(5) dst_pitch, then arg(1) src_pixels_per_line
; Source rows are read four bytes at a time (movd) and four bytes are
; stored per row.
;
; NOTE(review): this listing appears to be missing lines. The scaling of
; the offsets (and, for yoffset, the addition of the table base) is not
; visible, so as shown rax holds the raw yoffset when [rax+16] is read.
; Likewise no instruction that clears mm0, reads through HFilter, packs
; the result, compares against rcx, advances rdi in the second sequence
; at the end, or returns is visible. Verify against the full file before
; relying on the instruction sequence shown here.
;-----------------------------------------------------------------------
609 ;void bilinear_predict4x4_mmx
611 ; unsigned char *src_ptr,
612 ; int src_pixels_per_line,
615 ; unsigned char *dst_ptr,
618 global sym
(vp8_bilinear_predict4x4_mmx
)
619 sym
(vp8_bilinear_predict4x4_mmx
):
622 SHADOW_ARGS_TO_STACK
6
628 ;const short *HFilter = bilinear_filters_mmx[xoffset];
629 ;const short *VFilter = bilinear_filters_mmx[yoffset];
631 movsxd rax
, dword ptr arg
(2) ;xoffset
632 mov rdi
, arg
(4) ;dst_ptr ;
634 lea rcx
, [GLOBAL(sym
(vp8_bilinear_filters_mmx
))]
637 add rax
, rcx
; HFilter
638 mov rsi
, arg
(0) ;src_ptr ;
640 movsxd rdx
, dword ptr arg
(5) ;dst_pitch
; NOTE(review): rax (HFilter) is overwritten by the load below; any reads
; through HFilter are not visible in this listing.
644 movsxd rax
, dword ptr arg
(3) ;yoffset
650 lea rcx
, [rdi
+rdx
*4] ; rcx = dst_ptr + 4*dst_pitch (presumably the loop end pointer; the compare is not visible)
652 movsxd rdx
, dword ptr arg
(1) ;src_pixels_per_line ;
654 ; get the first horizontal line done ;
655 movd mm3
, [rsi
] ; mm3 = four source bytes of the current line
656 punpcklbw mm3
, mm0
; interleave the low four bytes with mm0 (widens to words if mm0 == 0; its clearing is not visible here)
665 paddw mm3
, [GLOBAL(rd
)] ; mm3 += round value
667 psraw mm3
, VP8_FILTER_SHIFT
; mm3 /= 128
672 add rsi
, rdx
; next line
674 movd mm3
, [rsi
] ; mm3 = four source bytes of the current line
675 punpcklbw mm3
, mm0
; interleave the low four bytes with mm0 (widens to words if mm0 == 0)
689 paddw mm3
, [GLOBAL(rd
)] ; mm3 += round value
691 psraw mm3
, VP8_FILTER_SHIFT
; mm3 /= 128
696 pmullw mm3
, [rax
+16] ; mm3 *= eight bytes at [rax+16] (second filter row if rax is VFilter -- see header note)
700 paddw mm3
, [GLOBAL(rd
)] ; mm3 += round value
701 psraw mm3
, VP8_FILTER_SHIFT
; mm3 /= 128
704 movd
[rdi
], mm3
; store the low four bytes of the result in the destination
; NOTE(review): two different pointer-advance sequences follow back to
; back. Presumably they are the two arms of an ABI conditional (32-bit vs
; 64-bit) whose directives are not visible here; if both were assembled,
; rsi would advance twice per iteration. Confirm against the full file.
707 add rsi
, rdx
; next line
708 add rdi
, dword ptr arg
(5) ;dst_pitch ;
710 movsxd r8
, dword ptr arg
(5) ;dst_pitch ;
711 add rsi
, rdx
; next line
; Exported data label for the six-tap filter table.
; NOTE(review): the table contents are not visible in this listing. The
; six-tap routines in this file read their vp8_filter argument at byte
; offsets 0, 16, 32, 48, 64 and 80; presumably callers pass a pointer into
; this table, which would imply six 16-byte rows per filter -- verify
; against the data definition and the callers.
734 global HIDDEN_DATA
(sym
(vp8_six_tap_mmx
))
735 sym
(vp8_six_tap_mmx
):
794 global HIDDEN_DATA
(sym
(vp8_bilinear_filters_mmx
))
795 sym
(vp8_bilinear_filters_mmx
):