Merge "Skip computation of distortion in vp8_pick_inter_mode if active_map is used"
[libvpx.git] / vp8 / encoder / x86 / sad_ssse3.asm
blob95b6c89e64832e2635b1636b11889ddb558a5d36
2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
4 ; Use of this source code is governed by a BSD-style license
5 ; that can be found in the LICENSE file in the root of the source
6 ; tree. An additional intellectual property rights grant can be found
7 ; in the file PATENTS. All contributing project authors may
8 ; be found in the AUTHORS file in the root of the source tree.
12 %include "vpx_ports/x86_abi_support.asm"
;------------------------------------------------------------------------
; PROCESS_16X2X3 <first>
;
; SAD of two rows of a 16-wide source block against three reference
; positions (ref+0, ref+1, ref+2), using unaligned loads (lddqu).
; Running sums are kept in xmm5 / xmm6 / xmm7, one per reference offset.
;
; %1 = 1 on the first invocation: SAD results initialise the
;        accumulators directly;
;      0 on later invocations: SADs go through xmm1-xmm3 and are
;        added into the accumulators.
;
; Register contract (established by the callers below):
;   rsi = src_ptr (16-byte aligned)   rax = src_stride
;   rdi = ref_ptr                     rdx = ref_stride
; Advances rsi/rdi by two rows.  Clobbers xmm0-xmm3.
;------------------------------------------------------------------------
%macro PROCESS_16X2X3 1
%if %1
        movdqa          xmm0, XMMWORD PTR [rsi]         ; source row 0 (aligned)
        lddqu           xmm5, XMMWORD PTR [rdi]         ; three unaligned candidate
        lddqu           xmm6, XMMWORD PTR [rdi+1]       ; reference rows at +0/+1/+2
        lddqu           xmm7, XMMWORD PTR [rdi+2]

        psadbw          xmm5, xmm0                      ; first rows: SADs land
        psadbw          xmm6, xmm0                      ; directly in the accumulators
        psadbw          xmm7, xmm0
%else
        movdqa          xmm0, XMMWORD PTR [rsi]
        lddqu           xmm1, XMMWORD PTR [rdi]
        lddqu           xmm2, XMMWORD PTR [rdi+1]
        lddqu           xmm3, XMMWORD PTR [rdi+2]

        psadbw          xmm1, xmm0
        psadbw          xmm2, xmm0
        psadbw          xmm3, xmm0

        paddw           xmm5, xmm1                      ; fold row SADs into sums
        paddw           xmm6, xmm2
        paddw           xmm7, xmm3
%endif
        movdqa          xmm0, XMMWORD PTR [rsi+rax]     ; second row of the pair
        lddqu           xmm1, XMMWORD PTR [rdi+rdx]
        lddqu           xmm2, XMMWORD PTR [rdi+rdx+1]
        lddqu           xmm3, XMMWORD PTR [rdi+rdx+2]

        lea             rsi, [rsi+rax*2]                ; advance both pointers by
        lea             rdi, [rdi+rdx*2]                ; two rows for the next pair

        psadbw          xmm1, xmm0
        psadbw          xmm2, xmm0
        psadbw          xmm3, xmm0

        paddw           xmm5, xmm1
        paddw           xmm6, xmm2
        paddw           xmm7, xmm3
%endmacro
;------------------------------------------------------------------------
; PROCESS_16X2X3_OFFSET <first>, <offset>
;
; Same two-row, three-position SAD as PROCESS_16X2X3, but for the case
; where ref_ptr has been rounded DOWN to 16-byte alignment by the caller
; (see PROCESS_16X16X3_OFFSET / PROCESS_16X8X3_OFFSET: "sub rdi, %1").
; Instead of unaligned lddqu loads, it does two aligned movdqa loads of
; [rdi] and [rdi+16] and reconstructs the misaligned reference rows with
; palignr by (%2), (%2+1), (%2+2) — faster on CPUs where lddqu/movdqu
; across a boundary is expensive.
;
; %1 = 1 to initialise accumulators xmm5-xmm7, 0 to accumulate.
; %2 = byte misalignment of the original ref_ptr (1..14).
;
; Register contract: rsi=src, rax=src_stride, rdi=aligned ref,
; rdx=ref_stride.  Advances rsi/rdi by two rows.  Clobbers xmm0-xmm4.
;------------------------------------------------------------------------
%macro PROCESS_16X2X3_OFFSET 2
%if %1
        movdqa          xmm0, XMMWORD PTR [rsi]         ; source row 0
        movdqa          xmm4, XMMWORD PTR [rdi]         ; aligned low half
        movdqa          xmm7, XMMWORD PTR [rdi+16]      ; aligned high half

        movdqa          xmm5, xmm7
        palignr         xmm5, xmm4, %2                  ; ref bytes at offset %2

        movdqa          xmm6, xmm7
        palignr         xmm6, xmm4, (%2+1)              ; ... at %2+1

        palignr         xmm7, xmm4, (%2+2)              ; ... at %2+2

        psadbw          xmm5, xmm0                      ; initialise accumulators
        psadbw          xmm6, xmm0
        psadbw          xmm7, xmm0
%else
        movdqa          xmm0, XMMWORD PTR [rsi]
        movdqa          xmm4, XMMWORD PTR [rdi]
        movdqa          xmm3, XMMWORD PTR [rdi+16]

        movdqa          xmm1, xmm3
        palignr         xmm1, xmm4, %2

        movdqa          xmm2, xmm3
        palignr         xmm2, xmm4, (%2+1)

        palignr         xmm3, xmm4, (%2+2)

        psadbw          xmm1, xmm0
        psadbw          xmm2, xmm0
        psadbw          xmm3, xmm0

        paddw           xmm5, xmm1                      ; accumulate row SADs
        paddw           xmm6, xmm2
        paddw           xmm7, xmm3
%endif
        movdqa          xmm0, XMMWORD PTR [rsi+rax]     ; second row of the pair
        movdqa          xmm4, XMMWORD PTR [rdi+rdx]
        movdqa          xmm3, XMMWORD PTR [rdi+rdx+16]

        movdqa          xmm1, xmm3
        palignr         xmm1, xmm4, %2

        movdqa          xmm2, xmm3
        palignr         xmm2, xmm4, (%2+1)

        palignr         xmm3, xmm4, (%2+2)

        lea             rsi, [rsi+rax*2]                ; step to the next row pair
        lea             rdi, [rdi+rdx*2]

        psadbw          xmm1, xmm0
        psadbw          xmm2, xmm0
        psadbw          xmm3, xmm0

        paddw           xmm5, xmm1
        paddw           xmm6, xmm2
        paddw           xmm7, xmm3
%endmacro
;------------------------------------------------------------------------
; PROCESS_16X16X3_OFFSET <align>, <prefix>
;
; Emits one jump-table target "<prefix>_aligned_by_<align>" that handles
; a full 16x16 block when ref_ptr is misaligned by <align> bytes:
; rounds rdi down to 16-byte alignment (palignr inside the worker macro
; re-applies the offset), runs eight two-row passes (first one
; initialises xmm5-xmm7), then jumps to the shared "<prefix>_store_off"
; result-store tail.
;
; %1 = misalignment in bytes (1..14)
; %2 = label prefix of the enclosing function
;------------------------------------------------------------------------
%macro PROCESS_16X16X3_OFFSET 2
%2_aligned_by_%1:

        sub             rdi, %1                 ; align ref_ptr down to 16 bytes

        PROCESS_16X2X3_OFFSET 1, %1             ; rows 0-1: initialise sums
        PROCESS_16X2X3_OFFSET 0, %1             ; rows 2-3
        PROCESS_16X2X3_OFFSET 0, %1             ; rows 4-5
        PROCESS_16X2X3_OFFSET 0, %1             ; rows 6-7
        PROCESS_16X2X3_OFFSET 0, %1             ; rows 8-9
        PROCESS_16X2X3_OFFSET 0, %1             ; rows 10-11
        PROCESS_16X2X3_OFFSET 0, %1             ; rows 12-13
        PROCESS_16X2X3_OFFSET 0, %1             ; rows 14-15

        jmp             %2_store_off            ; common store/epilog path

%endmacro
;------------------------------------------------------------------------
; PROCESS_16X8X3_OFFSET <align>, <prefix>
;
; 16x8 variant of PROCESS_16X16X3_OFFSET: same aligned-load + palignr
; strategy, but only four two-row passes (8 rows) before jumping to the
; shared "<prefix>_store_off" tail.
;
; %1 = misalignment in bytes (1..14)
; %2 = label prefix of the enclosing function
;------------------------------------------------------------------------
%macro PROCESS_16X8X3_OFFSET 2
%2_aligned_by_%1:

        sub             rdi, %1                 ; align ref_ptr down to 16 bytes

        PROCESS_16X2X3_OFFSET 1, %1             ; rows 0-1: initialise sums
        PROCESS_16X2X3_OFFSET 0, %1             ; rows 2-3
        PROCESS_16X2X3_OFFSET 0, %1             ; rows 4-5
        PROCESS_16X2X3_OFFSET 0, %1             ; rows 6-7

        jmp             %2_store_off            ; common store/epilog path

%endmacro
;------------------------------------------------------------------------
;void vp8_sad16x16x3_ssse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
;
; Computes three 16x16 SADs against ref_ptr, ref_ptr+1 and ref_ptr+2,
; storing them to results[0..2].  Dispatches through a position-
; independent jump table on (ref_ptr & 15): misalignments 1..14 use the
; aligned-load + palignr path; 0 and 15 use the lddqu path.
;
; Fix: epilogue previously ended at "pop rbp" with no "ret", so
; execution fell through past the end of the function.
;------------------------------------------------------------------------
global sym(vp8_sad16x16x3_ssse3)
sym(vp8_sad16x16x3_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rcx
    ; end prolog

    mov         rsi, arg(0)                 ; src_ptr
    mov         rdi, arg(2)                 ; ref_ptr

    mov         rdx, 0xf
    and         rdx, rdi                    ; rdx = ref_ptr & 15 (misalignment)

    jmp         .vp8_sad16x16x3_ssse3_skiptable
.vp8_sad16x16x3_ssse3_jumptable:
    dd .vp8_sad16x16x3_ssse3_aligned_by_0  - .vp8_sad16x16x3_ssse3_do_jump
    dd .vp8_sad16x16x3_ssse3_aligned_by_1  - .vp8_sad16x16x3_ssse3_do_jump
    dd .vp8_sad16x16x3_ssse3_aligned_by_2  - .vp8_sad16x16x3_ssse3_do_jump
    dd .vp8_sad16x16x3_ssse3_aligned_by_3  - .vp8_sad16x16x3_ssse3_do_jump
    dd .vp8_sad16x16x3_ssse3_aligned_by_4  - .vp8_sad16x16x3_ssse3_do_jump
    dd .vp8_sad16x16x3_ssse3_aligned_by_5  - .vp8_sad16x16x3_ssse3_do_jump
    dd .vp8_sad16x16x3_ssse3_aligned_by_6  - .vp8_sad16x16x3_ssse3_do_jump
    dd .vp8_sad16x16x3_ssse3_aligned_by_7  - .vp8_sad16x16x3_ssse3_do_jump
    dd .vp8_sad16x16x3_ssse3_aligned_by_8  - .vp8_sad16x16x3_ssse3_do_jump
    dd .vp8_sad16x16x3_ssse3_aligned_by_9  - .vp8_sad16x16x3_ssse3_do_jump
    dd .vp8_sad16x16x3_ssse3_aligned_by_10 - .vp8_sad16x16x3_ssse3_do_jump
    dd .vp8_sad16x16x3_ssse3_aligned_by_11 - .vp8_sad16x16x3_ssse3_do_jump
    dd .vp8_sad16x16x3_ssse3_aligned_by_12 - .vp8_sad16x16x3_ssse3_do_jump
    dd .vp8_sad16x16x3_ssse3_aligned_by_13 - .vp8_sad16x16x3_ssse3_do_jump
    dd .vp8_sad16x16x3_ssse3_aligned_by_14 - .vp8_sad16x16x3_ssse3_do_jump
    dd .vp8_sad16x16x3_ssse3_aligned_by_15 - .vp8_sad16x16x3_ssse3_do_jump
.vp8_sad16x16x3_ssse3_skiptable:

    call        .vp8_sad16x16x3_ssse3_do_jump
.vp8_sad16x16x3_ssse3_do_jump:
    pop         rcx                         ; PIC trick: rcx = address of do_jump
    mov         rax, .vp8_sad16x16x3_ssse3_jumptable - .vp8_sad16x16x3_ssse3_do_jump
    add         rax, rcx                    ; absolute address of the jumptable

    movsxd      rax, dword [rax + 4*rdx]    ; 32-bit offset for this misalignment
    add         rcx, rax                    ; absolute target address

    movsxd      rax, dword ptr arg(1)       ; src_stride
    movsxd      rdx, dword ptr arg(3)       ; ref_stride

    jmp         rcx                         ; dispatch to the aligned_by_N handler

    ; handlers for misalignments 0..14 (aligned loads + palignr)
    PROCESS_16X16X3_OFFSET 0,  .vp8_sad16x16x3_ssse3
    PROCESS_16X16X3_OFFSET 1,  .vp8_sad16x16x3_ssse3
    PROCESS_16X16X3_OFFSET 2,  .vp8_sad16x16x3_ssse3
    PROCESS_16X16X3_OFFSET 3,  .vp8_sad16x16x3_ssse3
    PROCESS_16X16X3_OFFSET 4,  .vp8_sad16x16x3_ssse3
    PROCESS_16X16X3_OFFSET 5,  .vp8_sad16x16x3_ssse3
    PROCESS_16X16X3_OFFSET 6,  .vp8_sad16x16x3_ssse3
    PROCESS_16X16X3_OFFSET 7,  .vp8_sad16x16x3_ssse3
    PROCESS_16X16X3_OFFSET 8,  .vp8_sad16x16x3_ssse3
    PROCESS_16X16X3_OFFSET 9,  .vp8_sad16x16x3_ssse3
    PROCESS_16X16X3_OFFSET 10, .vp8_sad16x16x3_ssse3
    PROCESS_16X16X3_OFFSET 11, .vp8_sad16x16x3_ssse3
    PROCESS_16X16X3_OFFSET 12, .vp8_sad16x16x3_ssse3
    PROCESS_16X16X3_OFFSET 13, .vp8_sad16x16x3_ssse3
    PROCESS_16X16X3_OFFSET 14, .vp8_sad16x16x3_ssse3

.vp8_sad16x16x3_ssse3_aligned_by_15:
    ; misalignment 15: palignr shifts up to %2+2 = 17 are impossible,
    ; so fall back to unaligned lddqu loads for all 16 rows
    PROCESS_16X2X3 1
    PROCESS_16X2X3 0
    PROCESS_16X2X3 0
    PROCESS_16X2X3 0
    PROCESS_16X2X3 0
    PROCESS_16X2X3 0
    PROCESS_16X2X3 0
    PROCESS_16X2X3 0

.vp8_sad16x16x3_ssse3_store_off:
    mov         rdi, arg(4)                 ; results

    ; each accumulator holds two partial 64-bit SADs; add the halves
    ; and store the low 32 bits
    movq        xmm0, xmm5
    psrldq      xmm5, 8
    paddw       xmm0, xmm5
    movd        [rdi], xmm0                 ; results[0]: SAD vs ref+0

    movq        xmm0, xmm6
    psrldq      xmm6, 8
    paddw       xmm0, xmm6
    movd        [rdi+4], xmm0               ; results[1]: SAD vs ref+1

    movq        xmm0, xmm7
    psrldq      xmm7, 8
    paddw       xmm0, xmm7
    movd        [rdi+8], xmm0               ; results[2]: SAD vs ref+2

    ; begin epilog
    pop         rcx
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret                                     ; FIX: was missing; control fell through
;------------------------------------------------------------------------
;void vp8_sad16x8x3_ssse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
;
; 16x8 variant of vp8_sad16x16x3_ssse3: three SADs against ref_ptr,
; ref_ptr+1 and ref_ptr+2, dispatched through a PIC jump table on
; (ref_ptr & 15).
;
; Fix: epilogue previously ended at "pop rbp" with no "ret", so
; execution ran off the end of the function.
;------------------------------------------------------------------------
global sym(vp8_sad16x8x3_ssse3)
sym(vp8_sad16x8x3_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rcx
    ; end prolog

    mov         rsi, arg(0)                 ; src_ptr
    mov         rdi, arg(2)                 ; ref_ptr

    mov         rdx, 0xf
    and         rdx, rdi                    ; rdx = ref_ptr & 15 (misalignment)

    jmp         .vp8_sad16x8x3_ssse3_skiptable
.vp8_sad16x8x3_ssse3_jumptable:
    dd .vp8_sad16x8x3_ssse3_aligned_by_0  - .vp8_sad16x8x3_ssse3_do_jump
    dd .vp8_sad16x8x3_ssse3_aligned_by_1  - .vp8_sad16x8x3_ssse3_do_jump
    dd .vp8_sad16x8x3_ssse3_aligned_by_2  - .vp8_sad16x8x3_ssse3_do_jump
    dd .vp8_sad16x8x3_ssse3_aligned_by_3  - .vp8_sad16x8x3_ssse3_do_jump
    dd .vp8_sad16x8x3_ssse3_aligned_by_4  - .vp8_sad16x8x3_ssse3_do_jump
    dd .vp8_sad16x8x3_ssse3_aligned_by_5  - .vp8_sad16x8x3_ssse3_do_jump
    dd .vp8_sad16x8x3_ssse3_aligned_by_6  - .vp8_sad16x8x3_ssse3_do_jump
    dd .vp8_sad16x8x3_ssse3_aligned_by_7  - .vp8_sad16x8x3_ssse3_do_jump
    dd .vp8_sad16x8x3_ssse3_aligned_by_8  - .vp8_sad16x8x3_ssse3_do_jump
    dd .vp8_sad16x8x3_ssse3_aligned_by_9  - .vp8_sad16x8x3_ssse3_do_jump
    dd .vp8_sad16x8x3_ssse3_aligned_by_10 - .vp8_sad16x8x3_ssse3_do_jump
    dd .vp8_sad16x8x3_ssse3_aligned_by_11 - .vp8_sad16x8x3_ssse3_do_jump
    dd .vp8_sad16x8x3_ssse3_aligned_by_12 - .vp8_sad16x8x3_ssse3_do_jump
    dd .vp8_sad16x8x3_ssse3_aligned_by_13 - .vp8_sad16x8x3_ssse3_do_jump
    dd .vp8_sad16x8x3_ssse3_aligned_by_14 - .vp8_sad16x8x3_ssse3_do_jump
    dd .vp8_sad16x8x3_ssse3_aligned_by_15 - .vp8_sad16x8x3_ssse3_do_jump
.vp8_sad16x8x3_ssse3_skiptable:

    call        .vp8_sad16x8x3_ssse3_do_jump
.vp8_sad16x8x3_ssse3_do_jump:
    pop         rcx                         ; PIC trick: rcx = address of do_jump
    mov         rax, .vp8_sad16x8x3_ssse3_jumptable - .vp8_sad16x8x3_ssse3_do_jump
    add         rax, rcx                    ; absolute address of the jumptable

    movsxd      rax, dword [rax + 4*rdx]    ; 32-bit offset for this misalignment
    add         rcx, rax                    ; absolute target address

    movsxd      rax, dword ptr arg(1)       ; src_stride
    movsxd      rdx, dword ptr arg(3)       ; ref_stride

    jmp         rcx                         ; dispatch to the aligned_by_N handler

    ; handlers for misalignments 0..14 (aligned loads + palignr)
    PROCESS_16X8X3_OFFSET 0,  .vp8_sad16x8x3_ssse3
    PROCESS_16X8X3_OFFSET 1,  .vp8_sad16x8x3_ssse3
    PROCESS_16X8X3_OFFSET 2,  .vp8_sad16x8x3_ssse3
    PROCESS_16X8X3_OFFSET 3,  .vp8_sad16x8x3_ssse3
    PROCESS_16X8X3_OFFSET 4,  .vp8_sad16x8x3_ssse3
    PROCESS_16X8X3_OFFSET 5,  .vp8_sad16x8x3_ssse3
    PROCESS_16X8X3_OFFSET 6,  .vp8_sad16x8x3_ssse3
    PROCESS_16X8X3_OFFSET 7,  .vp8_sad16x8x3_ssse3
    PROCESS_16X8X3_OFFSET 8,  .vp8_sad16x8x3_ssse3
    PROCESS_16X8X3_OFFSET 9,  .vp8_sad16x8x3_ssse3
    PROCESS_16X8X3_OFFSET 10, .vp8_sad16x8x3_ssse3
    PROCESS_16X8X3_OFFSET 11, .vp8_sad16x8x3_ssse3
    PROCESS_16X8X3_OFFSET 12, .vp8_sad16x8x3_ssse3
    PROCESS_16X8X3_OFFSET 13, .vp8_sad16x8x3_ssse3
    PROCESS_16X8X3_OFFSET 14, .vp8_sad16x8x3_ssse3

.vp8_sad16x8x3_ssse3_aligned_by_15:
    ; misalignment 15 cannot be handled with palignr (shift > 16);
    ; use unaligned lddqu loads for all 8 rows
    PROCESS_16X2X3 1
    PROCESS_16X2X3 0
    PROCESS_16X2X3 0
    PROCESS_16X2X3 0

.vp8_sad16x8x3_ssse3_store_off:
    mov         rdi, arg(4)                 ; results

    ; each accumulator holds two partial 64-bit SADs; add the halves
    ; and store the low 32 bits
    movq        xmm0, xmm5
    psrldq      xmm5, 8
    paddw       xmm0, xmm5
    movd        [rdi], xmm0                 ; results[0]: SAD vs ref+0

    movq        xmm0, xmm6
    psrldq      xmm6, 8
    paddw       xmm0, xmm6
    movd        [rdi+4], xmm0               ; results[1]: SAD vs ref+1

    movq        xmm0, xmm7
    psrldq      xmm7, 8
    paddw       xmm0, xmm7
    movd        [rdi+8], xmm0               ; results[2]: SAD vs ref+2

    ; begin epilog
    pop         rcx
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret                                     ; FIX: was missing; control fell through