; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
12 %include "vpx_ports/x86_abi_support.asm"
13 ;void vp8_recon2b_sse2(unsigned char *s, short *q, unsigned char *d, int stride)
14 global sym
(vp8_recon2b_sse2
)
15 sym
(vp8_recon2b_sse2
):
18 SHADOW_ARGS_TO_STACK
4
26 movsxd rax
, dword ptr arg
(3) ;stride
29 movq xmm1
, MMWORD
PTR [rsi
]
31 paddsw xmm1
, XMMWORD
PTR [rdx
]
32 packuswb xmm1
, xmm0
; pack and unpack to saturate
33 movq MMWORD
PTR [rdi
], xmm1
36 movq xmm2
, MMWORD
PTR [rsi
+8]
38 paddsw xmm2
, XMMWORD
PTR [rdx
+16]
39 packuswb xmm2
, xmm0
; pack and unpack to saturate
40 movq MMWORD
PTR [rdi
+rax
], xmm2
43 movq xmm3
, MMWORD
PTR [rsi
+16]
45 paddsw xmm3
, XMMWORD
PTR [rdx
+32]
46 packuswb xmm3
, xmm0
; pack and unpack to saturate
47 movq MMWORD
PTR [rdi
+rax
*2], xmm3
50 movq xmm4
, MMWORD
PTR [rsi
+24]
52 paddsw xmm4
, XMMWORD
PTR [rdx
+48]
53 packuswb xmm4
, xmm0
; pack and unpack to saturate
54 movq MMWORD
PTR [rdi
+rax
*2], xmm4
64 ;void vp8_recon4b_sse2(unsigned char *s, short *q, unsigned char *d, int stride)
65 global sym
(vp8_recon4b_sse2
)
66 sym
(vp8_recon4b_sse2
):
69 SHADOW_ARGS_TO_STACK
4
78 movsxd rax
, dword ptr arg
(3) ;stride
81 movdqa xmm1
, XMMWORD
PTR [rsi
]
85 paddsw xmm1
, XMMWORD
PTR [rdx
]
86 paddsw xmm5
, XMMWORD
PTR [rdx
+16]
87 packuswb xmm1
, xmm5
; pack and unpack to saturate
88 movdqa XMMWORD
PTR [rdi
], xmm1
91 movdqa xmm2
, XMMWORD
PTR [rsi
+16]
95 paddsw xmm2
, XMMWORD
PTR [rdx
+32]
96 paddsw xmm6
, XMMWORD
PTR [rdx
+48]
97 packuswb xmm2
, xmm6
; pack and unpack to saturate
98 movdqa XMMWORD
PTR [rdi
+rax
], xmm2
101 movdqa xmm3
, XMMWORD
PTR [rsi
+32]
105 paddsw xmm3
, XMMWORD
PTR [rdx
+64]
106 paddsw xmm7
, XMMWORD
PTR [rdx
+80]
107 packuswb xmm3
, xmm7
; pack and unpack to saturate
108 movdqa XMMWORD
PTR [rdi
+rax
*2], xmm3
111 movdqa xmm4
, XMMWORD
PTR [rsi
+48]
115 paddsw xmm4
, XMMWORD
PTR [rdx
+96]
116 paddsw xmm5
, XMMWORD
PTR [rdx
+112]
117 packuswb xmm4
, xmm5
; pack and unpack to saturate
118 movdqa XMMWORD
PTR [rdi
+rax
*2], xmm4
129 ;void copy_mem16x16_sse2(
130 ; unsigned char *src,
132 ; unsigned char *dst,
135 global sym
(vp8_copy_mem16x16_sse2
)
136 sym
(vp8_copy_mem16x16_sse2
):
139 SHADOW_ARGS_TO_STACK
4
144 mov rsi
, arg
(0) ;src;
147 movsxd rax
, dword ptr arg
(1) ;src_stride;
148 mov rdi
, arg
(2) ;dst;
150 movdqu xmm1
, [rsi
+rax
]
151 movdqu xmm2
, [rsi
+rax
*2]
153 movsxd rcx
, dword ptr arg
(3) ;dst_stride
159 movdqa
[rdi
+rcx
], xmm1
160 movdqa
[rdi
+rcx
*2],xmm2
166 movdqu xmm4
, [rsi
+rax
]
168 movdqu xmm5
, [rsi
+rax
*2]
174 movdqa
[rdi
+rcx
], xmm4
175 movdqa
[rdi
+rcx
*2],xmm5
181 movdqu xmm1
, [rsi
+rax
]
183 movdqu xmm2
, [rsi
+rax
*2]
189 movdqa
[rdi
+rcx
], xmm1
191 movdqa
[rdi
+rcx
*2], xmm2
194 movdqu xmm4
, [rsi
+rax
]
198 movdqu xmm5
, [rsi
+rax
*2]
204 movdqa
[rdi
+rcx
], xmm4
206 movdqa
[rdi
+rcx
*2],xmm5
210 movdqu xmm1
, [rsi
+rax
]
213 movdqu xmm2
, [rsi
+rax
*2]
218 movdqa
[rdi
+rcx
], xmm1
219 movdqa
[rdi
+rcx
*2],xmm2
221 movdqu xmm3
, [rsi
+rax
]
224 movdqa
[rdi
+rcx
], xmm3
234 ;void vp8_intra_pred_uv_dc_mmx2(
235 ; unsigned char *dst,
237 ; unsigned char *src,
240 global sym
(vp8_intra_pred_uv_dc_mmx2
)
241 sym
(vp8_intra_pred_uv_dc_mmx2
):
244 SHADOW_ARGS_TO_STACK
4
250 mov rsi
, arg
(2) ;src;
251 movsxd rax
, dword ptr arg
(3) ;src_stride;
260 movzx ecx, byte [rsi
+rax
]
261 movzx edx, byte [rsi
+rax
*2]
263 movzx edx, byte [rsi
+rdi
]
266 movzx edx, byte [rsi
]
268 movzx edx, byte [rsi
+rax
]
270 movzx edx, byte [rsi
+rax
*2]
272 movzx edx, byte [rsi
+rdi
]
274 movzx edx, byte [rsi
+rax
*4]
286 mov rdi
, arg
(0) ;dst;
287 movsxd rcx
, dword ptr arg
(1) ;dst_stride
292 movq
[rdi
+rcx
*2], mm1
297 movq
[rdi
+rcx
*2], mm1
307 ;void vp8_intra_pred_uv_dctop_mmx2(
308 ; unsigned char *dst,
310 ; unsigned char *src,
313 global sym
(vp8_intra_pred_uv_dctop_mmx2
)
314 sym
(vp8_intra_pred_uv_dctop_mmx2
):
317 SHADOW_ARGS_TO_STACK
4
324 mov rsi
, arg
(2) ;src;
325 movsxd rax
, dword ptr arg
(3) ;src_stride;
332 paddw mm1
, [GLOBAL(dc_4
)]
338 mov rdi
, arg
(0) ;dst;
339 movsxd rcx
, dword ptr arg
(1) ;dst_stride
344 movq
[rdi
+rcx
*2], mm1
349 movq
[rdi
+rcx
*2], mm1
360 ;void vp8_intra_pred_uv_dcleft_mmx2(
361 ; unsigned char *dst,
363 ; unsigned char *src,
366 global sym
(vp8_intra_pred_uv_dcleft_mmx2
)
367 sym
(vp8_intra_pred_uv_dcleft_mmx2
):
370 SHADOW_ARGS_TO_STACK
4
376 mov rsi
, arg
(2) ;src;
377 movsxd rax
, dword ptr arg
(3) ;src_stride;
380 movzx ecx, byte [rsi
]
381 movzx edx, byte [rsi
+rax
]
383 movzx edx, byte [rsi
+rax
*2]
385 movzx edx, byte [rsi
+rdi
]
388 movzx edx, byte [rsi
]
390 movzx edx, byte [rsi
+rax
]
392 movzx edx, byte [rsi
+rax
*2]
394 movzx edx, byte [rsi
+rdi
]
404 mov rdi
, arg
(0) ;dst;
405 movsxd rcx
, dword ptr arg
(1) ;dst_stride
410 movq
[rdi
+rcx
*2], mm1
415 movq
[rdi
+rcx
*2], mm1
425 ;void vp8_intra_pred_uv_dc128_mmx(
426 ; unsigned char *dst,
428 ; unsigned char *src,
431 global sym
(vp8_intra_pred_uv_dc128_mmx
)
432 sym
(vp8_intra_pred_uv_dc128_mmx
):
435 SHADOW_ARGS_TO_STACK
4
440 movq mm1
, [GLOBAL(dc_128
)]
441 mov rax
, arg
(0) ;dst;
442 movsxd rdx
, dword ptr arg
(1) ;dst_stride
447 movq
[rax
+rdx
*2], mm1
452 movq
[rax
+rdx
*2], mm1
461 ;void vp8_intra_pred_uv_tm_sse2(
462 ; unsigned char *dst,
464 ; unsigned char *src,
467 %macro vp8_intra_pred_uv_tm
1
468 global sym
(vp8_intra_pred_uv_tm_
%1)
469 sym
(vp8_intra_pred_uv_tm_
%1):
472 SHADOW_ARGS_TO_STACK
4
480 mov rsi
, arg
(2) ;src;
481 movsxd rax
, dword ptr arg
(3) ;src_stride;
485 movdqa xmm2
, [GLOBAL(dc_1024
)]
490 ; set up left ptrs ans subtract topleft
495 pshuflw xmm3
, xmm3
, 0x0
496 punpcklqdq xmm3
, xmm3
503 mov rdi
, arg
(0) ;dst;
504 movsxd rcx
, dword ptr arg
(1) ;dst_stride
506 .vp8_intra_pred_uv_tm_
%1_loop:
512 pshuflw xmm3
, xmm3
, 0x0
513 pshuflw xmm5
, xmm5
, 0x0
514 punpcklqdq xmm3
, xmm3
515 punpcklqdq xmm5
, xmm5
524 movhps
[rdi
+rcx
], xmm3
528 jnz .vp8_intra_pred_uv_tm_
%1_loop
539 vp8_intra_pred_uv_tm sse2
540 vp8_intra_pred_uv_tm ssse3
542 ;void vp8_intra_pred_uv_ve_mmx(
543 ; unsigned char *dst,
545 ; unsigned char *src,
548 global sym
(vp8_intra_pred_uv_ve_mmx
)
549 sym
(vp8_intra_pred_uv_ve_mmx
):
552 SHADOW_ARGS_TO_STACK
4
556 mov rax
, arg
(2) ;src;
557 movsxd rdx
, dword ptr arg
(3) ;src_stride;
562 mov rax
, arg
(0) ;dst;
563 movsxd rdx
, dword ptr arg
(1) ;dst_stride
568 movq
[rax
+rdx
*2], mm1
573 movq
[rax
+rdx
*2], mm1
581 ;void vp8_intra_pred_uv_ho_mmx2(
582 ; unsigned char *dst,
584 ; unsigned char *src,
587 %macro vp8_intra_pred_uv_ho
1
588 global sym
(vp8_intra_pred_uv_ho_
%1)
589 sym
(vp8_intra_pred_uv_ho_
%1):
592 SHADOW_ARGS_TO_STACK
4
596 %ifndef GET_GOT_SAVE_ARG
603 ; read from left and write out
607 mov rsi
, arg
(2) ;src;
608 movsxd rax
, dword ptr arg
(3) ;src_stride;
609 mov rdi
, arg
(0) ;dst;
610 movsxd rcx
, dword ptr arg
(1) ;dst_stride
613 movdqa xmm2
, [GLOBAL(dc_00001111
)]
618 .vp8_intra_pred_uv_ho_
%1_loop:
630 jnz .vp8_intra_pred_uv_ho_
%1_loop
634 movd xmm1
, [rsi
+rax
*2]
641 movhps
[rdi
+rcx
], xmm0
642 movq
[rdi
+rcx
*2], xmm1
643 movhps
[rdi
+rdx
], xmm1
648 movd xmm1
, [rsi
+rax
*2]
655 movhps
[rdi
+rcx
], xmm0
656 movq
[rdi
+rcx
*2], xmm1
657 movhps
[rdi
+rdx
], xmm1
663 %ifndef GET_GOT_SAVE_ARG
674 vp8_intra_pred_uv_ho mmx2
675 vp8_intra_pred_uv_ho ssse3