; clarify *_offsets.asm differences
; [libvpx.git] / vp8 / common / x86 / recon_sse2.asm
; blob 4ad3973ecb73505a4956d9ec3eed3232bc9e13b3
2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
4 ; Use of this source code is governed by a BSD-style license
5 ; that can be found in the LICENSE file in the root of the source
6 ; tree. An additional intellectual property rights grant can be found
7 ; in the file PATENTS. All contributing project authors may
8 ; be found in the AUTHORS file in the root of the source tree.
12 %include "vpx_ports/x86_abi_support.asm"
;-----------------------------------------------------------------------
; RECON2B_ROW scratch, src_addr, diff_addr, dst_addr
;
; Reconstructs one row of 8 pixels:
;   loads 8 bytes from [src_addr], widens them to words against xmm0,
;   adds the 8 signed words at [diff_addr] with signed saturation,
;   packs back to bytes with unsigned saturation, and stores the low
;   8 bytes to [dst_addr].
;
;   %1 = xmm scratch register (overwritten)
;   %2 = address expression of the 8 source bytes
;   %3 = address expression of the 8 words (16-byte memory operand)
;   %4 = address expression of the 8 destination bytes
;
; Requires xmm0 == 0 on entry; leaves xmm0 untouched.
;-----------------------------------------------------------------------
%macro RECON2B_ROW 4
        movq        %1, MMWORD PTR [%2]
        punpcklbw   %1, xmm0                    ; bytes -> words (zero-extended)
        paddsw      %1, XMMWORD PTR [%3]
        packuswb    %1, xmm0                    ; pack and unpack to saturate
        movq        MMWORD PTR [%4], %1
%endmacro

;-----------------------------------------------------------------------
; void vp8_recon2b_sse2(unsigned char *s, short *q, unsigned char *d, int stride)
;
; Adds a 4-row x 8-column block of signed words (q) to a 4-row x 8-column
; block of unsigned bytes (s) and writes the saturated byte result to d.
;
;   s rows are read at byte offsets 0, 8, 16, 24
;   q rows are read at byte offsets 0, 16, 32, 48
;   d rows are written at d, d+stride, d+2*stride, d+3*stride
;
; Register roles after the prolog:
;   rsi = s, rdx = q, rdi = d, rax = stride (sign-extended), xmm0 = 0
;   xmm1..xmm4 = per-row scratch
;
; q is consumed as a 16-byte paddsw memory operand for every row.
;-----------------------------------------------------------------------
global sym(vp8_recon2b_sse2)
sym(vp8_recon2b_sse2):
        push        rbp
        mov         rbp, rsp
        SHADOW_ARGS_TO_STACK 4
        push        rsi
        push        rdi
        ; end prolog

        mov         rsi, arg(0)                 ; s
        mov         rdi, arg(2)                 ; d
        mov         rdx, arg(1)                 ; q
        movsxd      rax, dword ptr arg(3)       ; stride
        pxor        xmm0, xmm0                  ; zero, used for unpack/pack

        RECON2B_ROW xmm1, rsi,    rdx,    rdi           ; row 0
        RECON2B_ROW xmm2, rsi+8,  rdx+16, rdi+rax       ; row 1
        RECON2B_ROW xmm3, rsi+16, rdx+32, rdi+rax*2     ; row 2

        add         rdi, rax                    ; rdi = d + stride, so
        RECON2B_ROW xmm4, rsi+24, rdx+48, rdi+rax*2     ; row 3 = d + 3*stride

        ; begin epilog
        pop         rdi
        pop         rsi
        UNSHADOW_ARGS
        pop         rbp
        ret
;-----------------------------------------------------------------------
; void vp8_recon4b_sse2(unsigned char *s, short *q, unsigned char *d, int stride)
;
; Adds a 4-row x 16-column block of signed words (q) to a 4-row x 16-column
; block of unsigned bytes (s) and writes the saturated byte result to d.
;
; Per row:
;   - 16 bytes are loaded from s and split into low/high 8 bytes, each
;     zero-extended to words against xmm0,
;   - the two matching 16-byte halves of q are added with signed
;     saturation (paddsw),
;   - both halves are packed back to 16 bytes with unsigned saturation
;     (packuswb) and stored to d.
;
;   s rows are read at byte offsets 0, 16, 32, 48
;   q rows are read at byte offsets 0, 32, 64, 96 (+16 for the high half)
;   d rows are written at d, d+stride, d+2*stride, d+3*stride
;
; Every s load and d store is a movdqa and every q access is a 16-byte
; paddsw memory operand, so all of those addresses must be 16-byte
; aligned.
;
; Register roles after the prolog:
;   rsi = s, rdx = q, rdi = d, rax = stride (sign-extended), xmm0 = 0
;   xmm1..xmm4 = low halves / packed results, xmm5..xmm7 = high halves
;
; xmm6 and xmm7 are used as scratch between SAVE_XMM and RESTORE_XMM
; (macros not defined in this file; presumably they preserve those
; registers where the ABI requires it -- confirm in x86_abi_support.asm).
;-----------------------------------------------------------------------
global sym(vp8_recon4b_sse2)
sym(vp8_recon4b_sse2):
        push        rbp
        mov         rbp, rsp
        SHADOW_ARGS_TO_STACK 4
        SAVE_XMM
        push        rsi
        push        rdi
        ; end prolog

        mov         rsi, arg(0)                 ; s
        mov         rdi, arg(2)                 ; d
        mov         rdx, arg(1)                 ; q
        movsxd      rax, dword ptr arg(3)       ; stride
        pxor        xmm0, xmm0                  ; zero, used for unpacking

        ; row 0
        movdqa      xmm1, XMMWORD PTR [rsi]
        movdqa      xmm5, xmm1
        punpcklbw   xmm1, xmm0                  ; low 8 bytes  -> words
        punpckhbw   xmm5, xmm0                  ; high 8 bytes -> words
        paddsw      xmm1, XMMWORD PTR [rdx]
        paddsw      xmm5, XMMWORD PTR [rdx+16]
        packuswb    xmm1, xmm5                  ; pack and unpack to saturate
        movdqa      XMMWORD PTR [rdi], xmm1

        ; row 1
        movdqa      xmm2, XMMWORD PTR [rsi+16]
        movdqa      xmm6, xmm2
        punpcklbw   xmm2, xmm0
        punpckhbw   xmm6, xmm0
        paddsw      xmm2, XMMWORD PTR [rdx+32]
        paddsw      xmm6, XMMWORD PTR [rdx+48]
        packuswb    xmm2, xmm6                  ; pack and unpack to saturate
        movdqa      XMMWORD PTR [rdi+rax], xmm2

        ; row 2
        movdqa      xmm3, XMMWORD PTR [rsi+32]
        movdqa      xmm7, xmm3
        punpcklbw   xmm3, xmm0
        punpckhbw   xmm7, xmm0
        paddsw      xmm3, XMMWORD PTR [rdx+64]
        paddsw      xmm7, XMMWORD PTR [rdx+80]
        packuswb    xmm3, xmm7                  ; pack and unpack to saturate
        movdqa      XMMWORD PTR [rdi+rax*2], xmm3

        ; row 3: advance rdi by one row so [rdi+rax*2] addresses d + 3*stride
        add         rdi, rax
        movdqa      xmm4, XMMWORD PTR [rsi+48]
        movdqa      xmm5, xmm4
        punpcklbw   xmm4, xmm0
        punpckhbw   xmm5, xmm0
        paddsw      xmm4, XMMWORD PTR [rdx+96]
        paddsw      xmm5, XMMWORD PTR [rdx+112]
        packuswb    xmm4, xmm5                  ; pack and unpack to saturate
        movdqa      XMMWORD PTR [rdi+rax*2], xmm4

        ; begin epilog
        pop         rdi
        pop         rsi
        RESTORE_XMM
        UNSHADOW_ARGS
        pop         rbp
        ret                                     ; must not fall through into the next routine
;-----------------------------------------------------------------------
; void vp8_copy_mem16x16_sse2(
;     unsigned char *src,
;     int src_stride,
;     unsigned char *dst,
;     int dst_stride
;     )
;
; Copies 16 rows of 16 bytes from src (rows src_stride bytes apart) to
; dst (rows dst_stride bytes apart).
;
; Source rows are read with movdqu, so src needs no particular alignment.
; Destination rows are written with movdqa, so every destination row
; address (dst + n*dst_stride) must be 16-byte aligned.
;
; Rows are moved in groups of three (0-2, 3-5, 6-8, 9-11, 12-14) plus a
; final single row 15, with loads for the next group interleaved between
; the stores of the current one.
;
; Register roles:
;   rsi = current source row pointer,      rax = src_stride (sign-extended)
;   rdi = current destination row pointer, rcx = dst_stride (sign-extended)
;   xmm0..xmm5 = row data in flight
;-----------------------------------------------------------------------
global sym(vp8_copy_mem16x16_sse2)
sym(vp8_copy_mem16x16_sse2):
        push        rbp
        mov         rbp, rsp
        SHADOW_ARGS_TO_STACK 4
        push        rsi
        push        rdi
        ; end prolog

        mov         rsi, arg(0)                 ; src
        movdqu      xmm0, [rsi]                 ; load row 0

        movsxd      rax, dword ptr arg(1)       ; src_stride
        mov         rdi, arg(2)                 ; dst

        movdqu      xmm1, [rsi+rax]             ; load row 1
        movdqu      xmm2, [rsi+rax*2]           ; load row 2

        movsxd      rcx, dword ptr arg(3)       ; dst_stride
        lea         rsi, [rsi+rax*2]

        movdqa      [rdi], xmm0                 ; store row 0
        add         rsi, rax                    ; rsi -> source row 3

        movdqa      [rdi+rcx], xmm1             ; store row 1
        movdqa      [rdi+rcx*2], xmm2           ; store row 2

        lea         rdi, [rdi+rcx*2]
        movdqu      xmm3, [rsi]                 ; load row 3

        add         rdi, rcx                    ; rdi -> destination row 3
        movdqu      xmm4, [rsi+rax]             ; load row 4

        movdqu      xmm5, [rsi+rax*2]           ; load row 5
        lea         rsi, [rsi+rax*2]

        movdqa      [rdi], xmm3                 ; store row 3
        add         rsi, rax                    ; rsi -> source row 6

        movdqa      [rdi+rcx], xmm4             ; store row 4
        movdqa      [rdi+rcx*2], xmm5           ; store row 5

        lea         rdi, [rdi+rcx*2]
        movdqu      xmm0, [rsi]                 ; load row 6

        add         rdi, rcx                    ; rdi -> destination row 6
        movdqu      xmm1, [rsi+rax]             ; load row 7

        movdqu      xmm2, [rsi+rax*2]           ; load row 8
        lea         rsi, [rsi+rax*2]

        movdqa      [rdi], xmm0                 ; store row 6
        add         rsi, rax                    ; rsi -> source row 9

        movdqa      [rdi+rcx], xmm1             ; store row 7

        movdqa      [rdi+rcx*2], xmm2           ; store row 8
        movdqu      xmm3, [rsi]                 ; load row 9

        movdqu      xmm4, [rsi+rax]             ; load row 10
        lea         rdi, [rdi+rcx*2]

        add         rdi, rcx                    ; rdi -> destination row 9
        movdqu      xmm5, [rsi+rax*2]           ; load row 11

        lea         rsi, [rsi+rax*2]
        movdqa      [rdi], xmm3                 ; store row 9

        add         rsi, rax                    ; rsi -> source row 12
        movdqa      [rdi+rcx], xmm4             ; store row 10

        movdqa      [rdi+rcx*2], xmm5           ; store row 11
        movdqu      xmm0, [rsi]                 ; load row 12

        lea         rdi, [rdi+rcx*2]
        movdqu      xmm1, [rsi+rax]             ; load row 13

        add         rdi, rcx                    ; rdi -> destination row 12
        movdqu      xmm2, [rsi+rax*2]           ; load row 14

        lea         rsi, [rsi+rax*2]            ; rsi -> source row 14
        movdqa      [rdi], xmm0                 ; store row 12

        movdqa      [rdi+rcx], xmm1             ; store row 13
        movdqa      [rdi+rcx*2], xmm2           ; store row 14

        movdqu      xmm3, [rsi+rax]             ; load row 15
        lea         rdi, [rdi+rcx*2]            ; rdi -> destination row 14

        movdqa      [rdi+rcx], xmm3             ; store row 15

        ; begin epilog
        pop         rdi
        pop         rsi
        UNSHADOW_ARGS
        pop         rbp
        ret                                     ; return to caller