Removed unused vp8_recon_intra4x4mb function
[libvpx.git] / vp8 / encoder / x86 / sad_sse4.asm
blob21e2e50072858eb464288d452cbe162481890316
2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
4 ; Use of this source code is governed by a BSD-style license
5 ; that can be found in the LICENSE file in the root of the source
6 ; tree. An additional intellectual property rights grant can be found
7 ; in the file PATENTS. All contributing project authors may
8 ; be found in the AUTHORS file in the root of the source tree.
12 %include "vpx_ports/x86_abi_support.asm"
;-----------------------------------------------------------------------
; SAD_ROW_16X8 acc, src_row, ref_row
;   acc     : xmm register that receives eight 16-bit SADs
;   src_row : address expression of 16 source bytes (loaded with movdqa,
;             so the address has to be 16-byte aligned)
;   ref_row : address expression of the reference row; 24 bytes are read
;
; Word i of acc (i = 0..7) becomes the sum of absolute differences
; between the 16 source bytes and the 16 reference bytes starting at
; ref_row + i.  Each mpsadbw handles one 4-byte source group:
;   0x0 -> source bytes 0..3 against reference windows starting at 0
;   0x5 -> source bytes 4..7 against reference windows starting at 4
; The upper 8 source bytes are shifted down with psrldq and matched
; against reference bytes 8..23 in the same way.
; Clobbers xmm0, xmm2, xmm3, xmm4.
;-----------------------------------------------------------------------
%macro SAD_ROW_16X8 3
        movdqa          xmm0,       XMMWORD PTR [%2]
        movq            %1,         MMWORD PTR [%3]
        movq            xmm3,       MMWORD PTR [%3+8]
        movq            xmm2,       MMWORD PTR [%3+16]
        punpcklqdq      %1,         xmm3        ; ref bytes  0..15
        punpcklqdq      xmm3,       xmm2        ; ref bytes  8..23

        movdqa          xmm2,       %1
        mpsadbw         %1,         xmm0,  0x0
        mpsadbw         xmm2,       xmm0,  0x5

        psrldq          xmm0,       8           ; src bytes 8..15 -> 0..7

        movdqa          xmm4,       xmm3
        mpsadbw         xmm3,       xmm0,  0x0
        mpsadbw         xmm4,       xmm0,  0x5

        paddw           %1,         xmm2
        paddw           %1,         xmm3
        paddw           %1,         xmm4
%endmacro

;-----------------------------------------------------------------------
; PROCESS_16X2X8 first
;   Handles two consecutive rows of a 16-pixel-wide block and advances
;   both row pointers by two strides.
;   first != 0 : xmm1 is initialised with the SADs of the first row
;   first == 0 : the SADs are added to the running totals in xmm1
; In:   rsi = source row, rax = source stride
;       rdi = reference row, rdx = reference stride
; Out:  xmm1 = eight 16-bit running SAD totals; rsi and rdi advanced
; Clobbers xmm0, xmm2, xmm3, xmm4, xmm5.
;-----------------------------------------------------------------------
%macro PROCESS_16X2X8 1
%if %1
        SAD_ROW_16X8    xmm1, rsi, rdi
%else
        SAD_ROW_16X8    xmm5, rsi, rdi
        paddw           xmm1,       xmm5
%endif
        SAD_ROW_16X8    xmm5, rsi + rax, rdi + rdx

        lea             rsi,        [rsi+rax*2]
        lea             rdi,        [rdi+rdx*2]

        paddw           xmm1,       xmm5
%endmacro
;-----------------------------------------------------------------------
; SAD_ROW_8X8 acc, src_row, ref_row
;   acc     : xmm register that receives eight 16-bit SADs
;   src_row : address expression of 8 source bytes
;   ref_row : address expression of the reference row; 16 bytes are read
;
; Word i of acc (i = 0..7) becomes the sum of absolute differences
; between the 8 source bytes and the 8 reference bytes starting at
; ref_row + i.  Selector 0x0 covers source bytes 0..3, selector 0x5
; covers source bytes 4..7 against the windows shifted by four.
; Clobbers xmm0, xmm2, xmm3.
;-----------------------------------------------------------------------
%macro SAD_ROW_8X8 3
        movq            xmm0,       MMWORD PTR [%2]
        movq            %1,         MMWORD PTR [%3]
        movq            xmm3,       MMWORD PTR [%3+8]
        punpcklqdq      %1,         xmm3        ; ref bytes 0..15

        movdqa          xmm2,       %1
        mpsadbw         %1,         xmm0,  0x0
        mpsadbw         xmm2,       xmm0,  0x5
        paddw           %1,         xmm2
%endmacro

;-----------------------------------------------------------------------
; PROCESS_8X2X8 first
;   Handles two consecutive rows of an 8-pixel-wide block and advances
;   both row pointers by two strides.
;   first != 0 : xmm1 is initialised with the SADs of the first row
;   first == 0 : the SADs are added to the running totals in xmm1
; In:   rsi = source row, rax = source stride
;       rdi = reference row, rdx = reference stride
; Out:  xmm1 = eight 16-bit running SAD totals; rsi and rdi advanced
; Clobbers xmm0, xmm2, xmm3, xmm5.
;-----------------------------------------------------------------------
%macro PROCESS_8X2X8 1
%if %1
        SAD_ROW_8X8     xmm1, rsi, rdi
%else
        SAD_ROW_8X8     xmm5, rsi, rdi
        paddw           xmm1,       xmm5
%endif
        SAD_ROW_8X8     xmm5, rsi + rax, rdi + rdx

        lea             rsi,        [rsi+rax*2]
        lea             rdi,        [rdi+rdx*2]

        paddw           xmm1,       xmm5
%endmacro
;-----------------------------------------------------------------------
; SAD_ROW_4X8 acc, src_row, ref_row
;   acc     : xmm register that receives eight 16-bit SADs
;   src_row : address expression of 4 source bytes
;   ref_row : address expression of the reference row; 16 bytes are read
;
; Word i of acc (i = 0..7) becomes the sum of absolute differences
; between the 4 source bytes and the 4 reference bytes starting at
; ref_row + i.  A single mpsadbw with selector 0x0 does the whole row.
; Clobbers xmm0, xmm3.
;-----------------------------------------------------------------------
%macro SAD_ROW_4X8 3
        movd            xmm0,       [%2]
        movq            %1,         MMWORD PTR [%3]
        movq            xmm3,       MMWORD PTR [%3+8]
        punpcklqdq      %1,         xmm3        ; ref bytes 0..15

        mpsadbw         %1,         xmm0,  0x0
%endmacro

;-----------------------------------------------------------------------
; PROCESS_4X2X8 first
;   Handles two consecutive rows of a 4-pixel-wide block and advances
;   both row pointers by two strides.
;   first != 0 : xmm1 is initialised with the SADs of the first row
;   first == 0 : the SADs are added to the running totals in xmm1
; In:   rsi = source row, rax = source stride
;       rdi = reference row, rdx = reference stride
; Out:  xmm1 = eight 16-bit running SAD totals; rsi and rdi advanced
; Clobbers xmm0, xmm3, xmm5.
;-----------------------------------------------------------------------
%macro PROCESS_4X2X8 1
%if %1
        SAD_ROW_4X8     xmm1, rsi, rdi
%else
        SAD_ROW_4X8     xmm5, rsi, rdi
        paddw           xmm1,       xmm5
%endif
        SAD_ROW_4X8     xmm5, rsi + rax, rdi + rdx

        lea             rsi,        [rsi+rax*2]
        lea             rdi,        [rdi+rdx*2]

        paddw           xmm1,       xmm5
%endmacro
;void vp8_sad16x16x8_sse4(
;    const unsigned char *src_ptr,
;    int  src_stride,
;    const unsigned char *ref_ptr,
;    int  ref_stride,
;    unsigned short *sad_array);
;
; Stores eight 16-bit SADs in sad_array: entry i (i = 0..7) is the sum of
; absolute differences between the 16x16 block at src_ptr and the 16x16
; block at ref_ptr + i.
;
; Each source row is loaded with movdqa, so src_ptr and src_stride must
; keep every row 16-byte aligned.  24 bytes are read from each reference
; row.  Uses rax, rdx and xmm0-xmm5 as scratch; rsi and rdi are saved and
; restored.
global sym(vp8_sad16x16x8_sse4)
sym(vp8_sad16x16x8_sse4):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5      ; NOTE(review): defined in x86_abi_support.asm;
                                ; presumably makes the 5 args reachable via arg(n)
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi,        arg(0)           ;src_ptr
    mov         rdi,        arg(2)           ;ref_ptr

    movsxd      rax,        dword ptr arg(1) ;src_stride
    movsxd      rdx,        dword ptr arg(3) ;ref_stride

    ; 8 macro invocations x 2 rows each = 16 rows
    PROCESS_16X2X8 1                         ; first pair initialises xmm1
    PROCESS_16X2X8 0
    PROCESS_16X2X8 0
    PROCESS_16X2X8 0
    PROCESS_16X2X8 0
    PROCESS_16X2X8 0
    PROCESS_16X2X8 0
    PROCESS_16X2X8 0

    mov         rdi,        arg(4)           ;Results
    movdqu      XMMWORD PTR [rdi],    xmm1   ; unaligned store of the 8 totals

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret                                      ; was missing: execution fell through
                                             ; into the following function
;void vp8_sad16x8x8_sse4(
;    const unsigned char *src_ptr,
;    int  src_stride,
;    const unsigned char *ref_ptr,
;    int  ref_stride,
;    unsigned short *sad_array);
;
; Stores eight 16-bit SADs in sad_array: entry i (i = 0..7) is the sum of
; absolute differences between the 16-wide, 8-row block at src_ptr and the
; same-sized block at ref_ptr + i.
;
; Each source row is loaded with movdqa, so src_ptr and src_stride must
; keep every row 16-byte aligned.  24 bytes are read from each reference
; row.  Uses rax, rdx and xmm0-xmm5 as scratch; rsi and rdi are saved and
; restored.
global sym(vp8_sad16x8x8_sse4)
sym(vp8_sad16x8x8_sse4):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5      ; NOTE(review): defined in x86_abi_support.asm;
                                ; presumably makes the 5 args reachable via arg(n)
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi,        arg(0)           ;src_ptr
    mov         rdi,        arg(2)           ;ref_ptr

    movsxd      rax,        dword ptr arg(1) ;src_stride
    movsxd      rdx,        dword ptr arg(3) ;ref_stride

    ; 4 macro invocations x 2 rows each = 8 rows
    PROCESS_16X2X8 1                         ; first pair initialises xmm1
    PROCESS_16X2X8 0
    PROCESS_16X2X8 0
    PROCESS_16X2X8 0

    mov         rdi,        arg(4)           ;Results
    movdqu      XMMWORD PTR [rdi],    xmm1   ; unaligned store of the 8 totals

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret                                      ; was missing: execution fell through
                                             ; into the following function
;void vp8_sad8x8x8_sse4(
;    const unsigned char *src_ptr,
;    int  src_stride,
;    const unsigned char *ref_ptr,
;    int  ref_stride,
;    unsigned short *sad_array);
;
; Stores eight 16-bit SADs in sad_array: entry i (i = 0..7) is the sum of
; absolute differences between the 8x8 block at src_ptr and the 8x8 block
; at ref_ptr + i.
;
; 8 bytes are read from each source row and 16 bytes from each reference
; row.  Uses rax, rdx and xmm0-xmm3, xmm5 as scratch; rsi and rdi are
; saved and restored.
global sym(vp8_sad8x8x8_sse4)
sym(vp8_sad8x8x8_sse4):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5      ; NOTE(review): defined in x86_abi_support.asm;
                                ; presumably makes the 5 args reachable via arg(n)
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi,        arg(0)           ;src_ptr
    mov         rdi,        arg(2)           ;ref_ptr

    movsxd      rax,        dword ptr arg(1) ;src_stride
    movsxd      rdx,        dword ptr arg(3) ;ref_stride

    ; 4 macro invocations x 2 rows each = 8 rows
    PROCESS_8X2X8 1                          ; first pair initialises xmm1
    PROCESS_8X2X8 0
    PROCESS_8X2X8 0
    PROCESS_8X2X8 0

    mov         rdi,        arg(4)           ;Results
    movdqu      XMMWORD PTR [rdi],    xmm1   ; unaligned store of the 8 totals

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret                                      ; was missing: execution fell through
                                             ; into the following function
;void vp8_sad8x16x8_sse4(
;    const unsigned char *src_ptr,
;    int  src_stride,
;    const unsigned char *ref_ptr,
;    int  ref_stride,
;    unsigned short *sad_array);
;
; Stores eight 16-bit SADs in sad_array: entry i (i = 0..7) is the sum of
; absolute differences between the 8-wide, 16-row block at src_ptr and the
; same-sized block at ref_ptr + i.
;
; 8 bytes are read from each source row and 16 bytes from each reference
; row.  Uses rax, rdx and xmm0-xmm3, xmm5 as scratch; rsi and rdi are
; saved and restored.
global sym(vp8_sad8x16x8_sse4)
sym(vp8_sad8x16x8_sse4):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5      ; NOTE(review): defined in x86_abi_support.asm;
                                ; presumably makes the 5 args reachable via arg(n)
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi,        arg(0)           ;src_ptr
    mov         rdi,        arg(2)           ;ref_ptr

    movsxd      rax,        dword ptr arg(1) ;src_stride
    movsxd      rdx,        dword ptr arg(3) ;ref_stride

    ; 8 macro invocations x 2 rows each = 16 rows
    PROCESS_8X2X8 1                          ; first pair initialises xmm1
    PROCESS_8X2X8 0
    PROCESS_8X2X8 0
    PROCESS_8X2X8 0
    PROCESS_8X2X8 0
    PROCESS_8X2X8 0
    PROCESS_8X2X8 0
    PROCESS_8X2X8 0

    mov         rdi,        arg(4)           ;Results
    movdqu      XMMWORD PTR [rdi],    xmm1   ; unaligned store of the 8 totals

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret                                      ; was missing: execution fell through
                                             ; into the following function
;void vp8_sad4x4x8_sse4(
;    const unsigned char *src_ptr,
;    int  src_stride,
;    const unsigned char *ref_ptr,
;    int  ref_stride,
;    unsigned short *sad_array);
;
; Stores eight 16-bit SADs in sad_array: entry i (i = 0..7) is the sum of
; absolute differences between the 4x4 block at src_ptr and the 4x4 block
; at ref_ptr + i.
;
; 4 bytes are read from each source row and 16 bytes from each reference
; row.  Uses rax, rdx and xmm0, xmm1, xmm3, xmm5 as scratch; rsi and rdi
; are saved and restored.
global sym(vp8_sad4x4x8_sse4)
sym(vp8_sad4x4x8_sse4):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5      ; NOTE(review): defined in x86_abi_support.asm;
                                ; presumably makes the 5 args reachable via arg(n)
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi,        arg(0)           ;src_ptr
    mov         rdi,        arg(2)           ;ref_ptr

    movsxd      rax,        dword ptr arg(1) ;src_stride
    movsxd      rdx,        dword ptr arg(3) ;ref_stride

    ; 2 macro invocations x 2 rows each = 4 rows
    PROCESS_4X2X8 1                          ; first pair initialises xmm1
    PROCESS_4X2X8 0

    mov         rdi,        arg(4)           ;Results
    movdqu      XMMWORD PTR [rdi],    xmm1   ; unaligned store of the 8 totals

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret                                      ; was missing: control ran off the
                                             ; end of the function