;
;  vp8/encoder/x86/sad_ssse3.asm  (initial WebM release)
;  blob 1bb956121f7db525f85009b79628485bb0a4d0e4
;
;  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license and patent
;  grant that can be found in the LICENSE file in the root of the source
;  tree. All contributing project authors may be found in the AUTHORS
;  file in the root of the source tree.
;
%include "vpx_ports/x86_abi_support.asm"

; Define QWORD as empty so "QWORD PTR [...]" size hints (MASM style)
; collapse to plain NASM memory operands.
%idefine QWORD
;------------------------------------------------------------------------
; PROCESS_16X2X3 %1
;
; Accumulate SADs for two 16-pixel source rows against three reference
; candidates (ref, ref+1, ref+2) using unaligned loads (lddqu).
; %1 != 0: first invocation -- initializes the xmm5/xmm6/xmm7 accumulators.
; %1 == 0: subsequent invocation -- adds into the accumulators.
; In:   rsi = src_ptr (16-byte aligned), rax = src_stride,
;       rdi = ref_ptr, rdx = ref_stride
; Out:  xmm5/xmm6/xmm7 updated; rsi/rdi advanced by two rows.
; Clobb: xmm0-xmm3
;------------------------------------------------------------------------
%macro PROCESS_16X2X3 1
%if %1
        movdqa          xmm0,       [rsi]            ; src row 0
        lddqu           xmm5,       [rdi]            ; ref
        lddqu           xmm6,       [rdi+1]          ; ref+1
        lddqu           xmm7,       [rdi+2]          ; ref+2

        psadbw          xmm5,       xmm0             ; SADs initialize accumulators
        psadbw          xmm6,       xmm0
        psadbw          xmm7,       xmm0
%else
        movdqa          xmm0,       [rsi]
        lddqu           xmm1,       [rdi]
        lddqu           xmm2,       [rdi+1]
        lddqu           xmm3,       [rdi+2]

        psadbw          xmm1,       xmm0
        psadbw          xmm2,       xmm0
        psadbw          xmm3,       xmm0

        paddw           xmm5,       xmm1             ; accumulate
        paddw           xmm6,       xmm2
        paddw           xmm7,       xmm3
%endif
        movdqa          xmm0,       QWORD PTR [rsi+rax]      ; src row 1
        lddqu           xmm1,       QWORD PTR [rdi+rdx]
        lddqu           xmm2,       QWORD PTR [rdi+rdx+1]
        lddqu           xmm3,       QWORD PTR [rdi+rdx+2]

        lea             rsi,        [rsi+rax*2]      ; advance both pointers two rows
        lea             rdi,        [rdi+rdx*2]

        psadbw          xmm1,       xmm0
        psadbw          xmm2,       xmm0
        psadbw          xmm3,       xmm0

        paddw           xmm5,       xmm1
        paddw           xmm6,       xmm2
        paddw           xmm7,       xmm3
%endmacro
;------------------------------------------------------------------------
; PROCESS_16X2X3_OFFSET %1, %2
;
; Same work as PROCESS_16X2X3, but for a reference pointer that has been
; rounded down to 16-byte alignment: two aligned 16-byte loads per row,
; then palignr reconstructs the three candidate rows at byte offsets
; %2, %2+1 and %2+2.
; %1 != 0: first invocation -- initializes xmm5/xmm6/xmm7.
; %2     : byte misalignment of the original ref_ptr.
; In:   rsi = src_ptr (aligned), rax = src_stride,
;       rdi = aligned-down ref_ptr, rdx = ref_stride
; Out:  xmm5/xmm6/xmm7 updated; rsi/rdi advanced by two rows.
; Clobb: xmm0-xmm4
;------------------------------------------------------------------------
%macro PROCESS_16X2X3_OFFSET 2
%if %1
        movdqa          xmm0,       [rsi]            ; src row 0
        movdqa          xmm4,       [rdi]            ; aligned ref chunk 0
        movdqa          xmm7,       [rdi+16]         ; aligned ref chunk 1

        movdqa          xmm5,       xmm7
        palignr         xmm5,       xmm4, %2         ; ref row at offset %2

        movdqa          xmm6,       xmm7
        palignr         xmm6,       xmm4, (%2+1)     ; ref row at offset %2+1

        palignr         xmm7,       xmm4, (%2+2)     ; ref row at offset %2+2

        psadbw          xmm5,       xmm0             ; SADs initialize accumulators
        psadbw          xmm6,       xmm0
        psadbw          xmm7,       xmm0
%else
        movdqa          xmm0,       [rsi]
        movdqa          xmm4,       [rdi]
        movdqa          xmm3,       [rdi+16]

        movdqa          xmm1,       xmm3
        palignr         xmm1,       xmm4, %2

        movdqa          xmm2,       xmm3
        palignr         xmm2,       xmm4, (%2+1)

        palignr         xmm3,       xmm4, (%2+2)

        psadbw          xmm1,       xmm0
        psadbw          xmm2,       xmm0
        psadbw          xmm3,       xmm0

        paddw           xmm5,       xmm1             ; accumulate
        paddw           xmm6,       xmm2
        paddw           xmm7,       xmm3
%endif
        movdqa          xmm0,       QWORD PTR [rsi+rax]      ; src row 1
        movdqa          xmm4,       QWORD PTR [rdi+rdx]
        movdqa          xmm3,       QWORD PTR [rdi+rdx+16]

        movdqa          xmm1,       xmm3
        palignr         xmm1,       xmm4, %2

        movdqa          xmm2,       xmm3
        palignr         xmm2,       xmm4, (%2+1)

        palignr         xmm3,       xmm4, (%2+2)

        lea             rsi,        [rsi+rax*2]      ; advance both pointers two rows
        lea             rdi,        [rdi+rdx*2]

        psadbw          xmm1,       xmm0
        psadbw          xmm2,       xmm0
        psadbw          xmm3,       xmm0

        paddw           xmm5,       xmm1
        paddw           xmm6,       xmm2
        paddw           xmm7,       xmm3
%endmacro
;------------------------------------------------------------------------
; PROCESS_16X16X3_OFFSET %1, %2
;
; Emit the "%2_aligned_by_%1" entry point: align ref_ptr down by %1
; bytes, process the eight row-pairs of a 16x16 block, then jump to the
; shared "%2_store_off" result-store code.
; %1: ref_ptr misalignment this entry handles.
; %2: function label prefix (e.g. vp8_sad16x16x3_ssse3).
;------------------------------------------------------------------------
%macro PROCESS_16X16X3_OFFSET 2
%2_aligned_by_%1:

        sub             rdi,        %1               ; round ref_ptr down to 16-byte boundary

        PROCESS_16X2X3_OFFSET 1, %1                  ; rows 0-1 (initialize accumulators)
        PROCESS_16X2X3_OFFSET 0, %1                  ; rows 2-3
        PROCESS_16X2X3_OFFSET 0, %1                  ; rows 4-5
        PROCESS_16X2X3_OFFSET 0, %1                  ; rows 6-7
        PROCESS_16X2X3_OFFSET 0, %1                  ; rows 8-9
        PROCESS_16X2X3_OFFSET 0, %1                  ; rows 10-11
        PROCESS_16X2X3_OFFSET 0, %1                  ; rows 12-13
        PROCESS_16X2X3_OFFSET 0, %1                  ; rows 14-15

        jmp             %2_store_off

%endmacro
;------------------------------------------------------------------------
; PROCESS_16X8X3_OFFSET %1, %2
;
; Like PROCESS_16X16X3_OFFSET, but for a 16x8 block: four row-pairs
; instead of eight, then jump to the shared "%2_store_off" code.
; %1: ref_ptr misalignment this entry handles.
; %2: function label prefix (e.g. vp8_sad16x8x3_ssse3).
;------------------------------------------------------------------------
%macro PROCESS_16X8X3_OFFSET 2
%2_aligned_by_%1:

        sub             rdi,        %1               ; round ref_ptr down to 16-byte boundary

        PROCESS_16X2X3_OFFSET 1, %1                  ; rows 0-1 (initialize accumulators)
        PROCESS_16X2X3_OFFSET 0, %1                  ; rows 2-3
        PROCESS_16X2X3_OFFSET 0, %1                  ; rows 4-5
        PROCESS_16X2X3_OFFSET 0, %1                  ; rows 6-7

        jmp             %2_store_off

%endmacro
;-----------------------------------------------------------------------
; void vp8_sad16x16x3_ssse3(
;     unsigned char *src_ptr,
;     int            src_stride,
;     unsigned char *ref_ptr,
;     int            ref_stride,
;     int           *results)
;
; Computes three 16x16 SADs at once -- against ref, ref+1 and ref+2 --
; and stores them to results[0..2]. Dispatches on (ref_ptr & 15) via a
; position-independent jump table: misaligned cases use the palignr
; path (aligned loads only); offset 15 falls back to lddqu.
;-----------------------------------------------------------------------
global sym(vp8_sad16x16x3_ssse3)
sym(vp8_sad16x16x3_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    push        rdi
    push        rcx
    ; end prolog

        mov             rsi,        arg(0) ;src_ptr
        mov             rdi,        arg(2) ;ref_ptr

        mov             rdx,        0xf              ; rdx = ref_ptr & 15 (table index)
        and             rdx,        rdi

        jmp .vp8_sad16x16x3_ssse3_skiptable
.vp8_sad16x16x3_ssse3_jumptable:
        ; 32-bit offsets relative to do_jump, so the table is PIC-safe
        dd vp8_sad16x16x3_ssse3_aligned_by_0  - .vp8_sad16x16x3_ssse3_do_jump
        dd vp8_sad16x16x3_ssse3_aligned_by_1  - .vp8_sad16x16x3_ssse3_do_jump
        dd vp8_sad16x16x3_ssse3_aligned_by_2  - .vp8_sad16x16x3_ssse3_do_jump
        dd vp8_sad16x16x3_ssse3_aligned_by_3  - .vp8_sad16x16x3_ssse3_do_jump
        dd vp8_sad16x16x3_ssse3_aligned_by_4  - .vp8_sad16x16x3_ssse3_do_jump
        dd vp8_sad16x16x3_ssse3_aligned_by_5  - .vp8_sad16x16x3_ssse3_do_jump
        dd vp8_sad16x16x3_ssse3_aligned_by_6  - .vp8_sad16x16x3_ssse3_do_jump
        dd vp8_sad16x16x3_ssse3_aligned_by_7  - .vp8_sad16x16x3_ssse3_do_jump
        dd vp8_sad16x16x3_ssse3_aligned_by_8  - .vp8_sad16x16x3_ssse3_do_jump
        dd vp8_sad16x16x3_ssse3_aligned_by_9  - .vp8_sad16x16x3_ssse3_do_jump
        dd vp8_sad16x16x3_ssse3_aligned_by_10 - .vp8_sad16x16x3_ssse3_do_jump
        dd vp8_sad16x16x3_ssse3_aligned_by_11 - .vp8_sad16x16x3_ssse3_do_jump
        dd vp8_sad16x16x3_ssse3_aligned_by_12 - .vp8_sad16x16x3_ssse3_do_jump
        dd vp8_sad16x16x3_ssse3_aligned_by_13 - .vp8_sad16x16x3_ssse3_do_jump
        dd vp8_sad16x16x3_ssse3_aligned_by_14 - .vp8_sad16x16x3_ssse3_do_jump
        dd vp8_sad16x16x3_ssse3_aligned_by_15 - .vp8_sad16x16x3_ssse3_do_jump
.vp8_sad16x16x3_ssse3_skiptable:

        call .vp8_sad16x16x3_ssse3_do_jump
.vp8_sad16x16x3_ssse3_do_jump:
        pop             rcx                          ; get the address of do_jump
        mov             rax,  .vp8_sad16x16x3_ssse3_jumptable - .vp8_sad16x16x3_ssse3_do_jump
        add             rax,  rcx                    ; absolute address of the jumptable

        movsxd          rax,  dword [rax + 4*rdx]    ; 32-bit offset from the jumptable
        add             rcx,        rax              ; absolute target address

        movsxd          rax,        dword ptr arg(1) ;src_stride
        movsxd          rdx,        dword ptr arg(3) ;ref_stride

        jmp             rcx

        PROCESS_16X16X3_OFFSET 0,  vp8_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 1,  vp8_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 2,  vp8_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 3,  vp8_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 4,  vp8_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 5,  vp8_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 6,  vp8_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 7,  vp8_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 8,  vp8_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 9,  vp8_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 10, vp8_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 11, vp8_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 12, vp8_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 13, vp8_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 14, vp8_sad16x16x3_ssse3

vp8_sad16x16x3_ssse3_aligned_by_15:
        PROCESS_16X2X3 1                             ; rows 0-1 (initialize accumulators)
        PROCESS_16X2X3 0                             ; rows 2-3
        PROCESS_16X2X3 0                             ; rows 4-5
        PROCESS_16X2X3 0                             ; rows 6-7
        PROCESS_16X2X3 0                             ; rows 8-9
        PROCESS_16X2X3 0                             ; rows 10-11
        PROCESS_16X2X3 0                             ; rows 12-13
        PROCESS_16X2X3 0                             ; rows 14-15

vp8_sad16x16x3_ssse3_store_off:
        mov             rdi,        arg(4) ;Results

        ; each accumulator holds two partial SADs (low/high qword); fold and store
        movq            xmm0,       xmm5
        psrldq          xmm5,       8
        paddw           xmm0,       xmm5
        movd            [rdi],      xmm0             ; results[0] = SAD(ref)

        movq            xmm0,       xmm6
        psrldq          xmm6,       8
        paddw           xmm0,       xmm6
        movd            [rdi+4],    xmm0             ; results[1] = SAD(ref+1)

        movq            xmm0,       xmm7
        psrldq          xmm7,       8
        paddw           xmm0,       xmm7
        movd            [rdi+8],    xmm0             ; results[2] = SAD(ref+2)

    ; begin epilog
    pop         rcx
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret                     ; was missing: without it control fell through into the next function
;-----------------------------------------------------------------------
; void vp8_sad16x8x3_ssse3(
;     unsigned char *src_ptr,
;     int            src_stride,
;     unsigned char *ref_ptr,
;     int            ref_stride,
;     int           *results)
;
; Computes three 16x8 SADs at once -- against ref, ref+1 and ref+2 --
; and stores them to results[0..2]. Same dispatch scheme as the 16x16
; variant: a PIC jump table keyed on (ref_ptr & 15) selects a palignr
; path; offset 15 falls back to lddqu.
;-----------------------------------------------------------------------
global sym(vp8_sad16x8x3_ssse3)
sym(vp8_sad16x8x3_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    push        rdi
    push        rcx
    ; end prolog

        mov             rsi,        arg(0) ;src_ptr
        mov             rdi,        arg(2) ;ref_ptr

        mov             rdx,        0xf              ; rdx = ref_ptr & 15 (table index)
        and             rdx,        rdi

        jmp .vp8_sad16x8x3_ssse3_skiptable
.vp8_sad16x8x3_ssse3_jumptable:
        ; 32-bit offsets relative to do_jump, so the table is PIC-safe
        dd vp8_sad16x8x3_ssse3_aligned_by_0  - .vp8_sad16x8x3_ssse3_do_jump
        dd vp8_sad16x8x3_ssse3_aligned_by_1  - .vp8_sad16x8x3_ssse3_do_jump
        dd vp8_sad16x8x3_ssse3_aligned_by_2  - .vp8_sad16x8x3_ssse3_do_jump
        dd vp8_sad16x8x3_ssse3_aligned_by_3  - .vp8_sad16x8x3_ssse3_do_jump
        dd vp8_sad16x8x3_ssse3_aligned_by_4  - .vp8_sad16x8x3_ssse3_do_jump
        dd vp8_sad16x8x3_ssse3_aligned_by_5  - .vp8_sad16x8x3_ssse3_do_jump
        dd vp8_sad16x8x3_ssse3_aligned_by_6  - .vp8_sad16x8x3_ssse3_do_jump
        dd vp8_sad16x8x3_ssse3_aligned_by_7  - .vp8_sad16x8x3_ssse3_do_jump
        dd vp8_sad16x8x3_ssse3_aligned_by_8  - .vp8_sad16x8x3_ssse3_do_jump
        dd vp8_sad16x8x3_ssse3_aligned_by_9  - .vp8_sad16x8x3_ssse3_do_jump
        dd vp8_sad16x8x3_ssse3_aligned_by_10 - .vp8_sad16x8x3_ssse3_do_jump
        dd vp8_sad16x8x3_ssse3_aligned_by_11 - .vp8_sad16x8x3_ssse3_do_jump
        dd vp8_sad16x8x3_ssse3_aligned_by_12 - .vp8_sad16x8x3_ssse3_do_jump
        dd vp8_sad16x8x3_ssse3_aligned_by_13 - .vp8_sad16x8x3_ssse3_do_jump
        dd vp8_sad16x8x3_ssse3_aligned_by_14 - .vp8_sad16x8x3_ssse3_do_jump
        dd vp8_sad16x8x3_ssse3_aligned_by_15 - .vp8_sad16x8x3_ssse3_do_jump
.vp8_sad16x8x3_ssse3_skiptable:

        call .vp8_sad16x8x3_ssse3_do_jump
.vp8_sad16x8x3_ssse3_do_jump:
        pop             rcx                          ; get the address of do_jump
        mov             rax,  .vp8_sad16x8x3_ssse3_jumptable - .vp8_sad16x8x3_ssse3_do_jump
        add             rax,  rcx                    ; absolute address of the jumptable

        movsxd          rax,  dword [rax + 4*rdx]    ; 32-bit offset from the jumptable
        add             rcx,        rax              ; absolute target address

        movsxd          rax,        dword ptr arg(1) ;src_stride
        movsxd          rdx,        dword ptr arg(3) ;ref_stride

        jmp             rcx

        PROCESS_16X8X3_OFFSET 0,  vp8_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 1,  vp8_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 2,  vp8_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 3,  vp8_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 4,  vp8_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 5,  vp8_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 6,  vp8_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 7,  vp8_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 8,  vp8_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 9,  vp8_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 10, vp8_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 11, vp8_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 12, vp8_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 13, vp8_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 14, vp8_sad16x8x3_ssse3

vp8_sad16x8x3_ssse3_aligned_by_15:

        PROCESS_16X2X3 1                             ; rows 0-1 (initialize accumulators)
        PROCESS_16X2X3 0                             ; rows 2-3
        PROCESS_16X2X3 0                             ; rows 4-5
        PROCESS_16X2X3 0                             ; rows 6-7

vp8_sad16x8x3_ssse3_store_off:
        mov             rdi,        arg(4) ;Results

        ; each accumulator holds two partial SADs (low/high qword); fold and store
        movq            xmm0,       xmm5
        psrldq          xmm5,       8
        paddw           xmm0,       xmm5
        movd            [rdi],      xmm0             ; results[0] = SAD(ref)

        movq            xmm0,       xmm6
        psrldq          xmm6,       8
        paddw           xmm0,       xmm6
        movd            [rdi+4],    xmm0             ; results[1] = SAD(ref+1)

        movq            xmm0,       xmm7
        psrldq          xmm7,       8
        paddw           xmm0,       xmm7
        movd            [rdi+8],    xmm0             ; results[2] = SAD(ref+2)

    ; begin epilog
    pop         rcx
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret                     ; was missing: without it control fell off the end of the file