2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
4 ; Use of this source code is governed by a BSD-style license
5 ; that can be found in the LICENSE file in the root of the source
6 ; tree. An additional intellectual property rights grant can be found
7 ; in the file PATENTS. All contributing project authors may
8 ; be found in the AUTHORS file in the root of the source tree.
12 %include "vpx_ports/x86_abi_support.asm"
;-----------------------------------------------------------------------
; PROCESS_16X2X3 -- NASM multi-line macro, 1 argument (%1).
;
; NOTE(review): this chunk is extraction-mangled. Each instruction is
; split across several physical lines and the upstream file's line
; numbers ("14", "16", ...) are fused into the text. Interior lines are
; missing (fused numbers jump 19 -> 25 -> 38, and no %endmacro is
; visible) -- presumably the %if %1 / %else branching and the
; psadbw/paddw SAD-accumulation steps; recover from upstream before use.
;
; What the visible code shows: three load groups, each pairing one
; aligned 16-byte source load (movdqa [rsi]) with three unaligned
; 16-byte reference loads (lddqu) at byte offsets +0, +1, +2 from rdi
; -- the three horizontal candidate positions of a SAD-x3 search.
;-----------------------------------------------------------------------
14 %macro PROCESS_16X2X3
1
; Group 1: src row -> xmm0; ref windows +0/+1/+2 -> xmm5/xmm6/xmm7
; (xmm5..7 presumably become the running SAD accumulators -- confirm).
16 movdqa xmm0
, XMMWORD
PTR [rsi
]
17 lddqu xmm5
, XMMWORD
PTR [rdi
]
18 lddqu xmm6
, XMMWORD
PTR [rdi
+1]
19 lddqu xmm7
, XMMWORD
PTR [rdi
+2]
; Group 2 (upstream lines 20-24 missing before here): same addresses but
; loaded into xmm1/xmm2/xmm3, leaving xmm5..7 untouched.
25 movdqa xmm0
, XMMWORD
PTR [rsi
]
26 lddqu xmm1
, XMMWORD
PTR [rdi
]
27 lddqu xmm2
, XMMWORD
PTR [rdi
+1]
28 lddqu xmm3
, XMMWORD
PTR [rdi
+2]
; Group 3: second row of the 16x2 pair, addressed via [rsi+rax] and
; [rdi+rdx] -- rax/rdx look like the src/ref strides loaded by the
; callers (movsxd from arg(1)/arg(3)); TODO confirm against upstream.
38 movdqa xmm0
, XMMWORD
PTR [rsi
+rax
]
39 lddqu xmm1
, XMMWORD
PTR [rdi
+rdx
]
40 lddqu xmm2
, XMMWORD
PTR [rdi
+rdx
+1]
41 lddqu xmm3
, XMMWORD
PTR [rdi
+rdx
+2]
;-----------------------------------------------------------------------
; PROCESS_16X2X3_OFFSET -- NASM multi-line macro, 2 arguments.
; %2 = byte offset within the reference row (used by palignr below);
; %1 is not visible in this chunk (callers pass 1 for the first
; invocation, 0 afterwards -- presumably an init-vs-accumulate flag;
; TODO confirm against upstream).
;
; NOTE(review): extraction-mangled and internally truncated (fused
; upstream line numbers jump 59 -> 62, 67 -> 73, 83 -> 93; the movdqa
; copies feeding palignr, the psadbw/paddw steps, and %endmacro are not
; visible). Do not treat this as the complete macro body.
;
; Aligned-load variant of PROCESS_16X2X3: instead of three unaligned
; lddqu loads, it performs two aligned reference loads ([rdi] and
; [rdi+16]) and uses SSSE3 palignr with shift counts %2, %2+1, %2+2 to
; synthesize the three shifted 16-byte reference windows.
;-----------------------------------------------------------------------
55 %macro PROCESS_16X2X3_OFFSET
2
; Group 1: src -> xmm0; ref low/high halves -> xmm4 / xmm7.
57 movdqa xmm0
, XMMWORD
PTR [rsi
]
58 movdqa xmm4
, XMMWORD
PTR [rdi
]
59 movdqa xmm7
, XMMWORD
PTR [rdi
+16]
; Extract the three candidate windows at offsets %2 / %2+1 / %2+2.
; (The movdqa copies of xmm7 into xmm5/xmm6 that palignr shifts against
; xmm4 are among the missing lines -- presumably upstream 60-66.)
62 palignr xmm5
, xmm4
, %2
65 palignr xmm6
, xmm4
, (%2+1)
67 palignr xmm7
, xmm4
, (%2+2)
; Group 2: same pattern into xmm1/xmm2/xmm3.
73 movdqa xmm0
, XMMWORD
PTR [rsi
]
74 movdqa xmm4
, XMMWORD
PTR [rdi
]
75 movdqa xmm3
, XMMWORD
PTR [rdi
+16]
78 palignr xmm1
, xmm4
, %2
81 palignr xmm2
, xmm4
, (%2+1)
83 palignr xmm3
, xmm4
, (%2+2)
; Group 3: second row of the 16x2 pair via the stride registers
; ([rsi+rax] / [rdi+rdx]).
93 movdqa xmm0
, XMMWORD
PTR [rsi
+rax
]
94 movdqa xmm4
, XMMWORD
PTR [rdi
+rdx
]
95 movdqa xmm3
, XMMWORD
PTR [rdi
+rdx
+16]
98 palignr xmm1
, xmm4
, %2
101 palignr xmm2
, xmm4
, (%2+1)
103 palignr xmm3
, xmm4
, (%2+2)
;-----------------------------------------------------------------------
; PROCESS_16X16X3_OFFSET -- NASM multi-line macro, 2 arguments.
; Expands PROCESS_16X2X3_OFFSET eight times (8 x 2 = 16 rows): the
; first expansion passes 1 (presumably "initialize accumulators"), the
; remaining seven pass 0 (accumulate). %1 is forwarded as the second
; argument of each expansion (the palignr byte offset).
;
; NOTE(review): truncated by extraction -- fused upstream numbers jump
; 117 -> 122 and stop at 129; the per-alignment entry label, the
; rdi adjustment, the trailing jmp to the store label (%2 is unused in
; the visible lines), and %endmacro are not visible here.
;-----------------------------------------------------------------------
117 %macro PROCESS_16X16X3_OFFSET
2
122 PROCESS_16X2X3_OFFSET
1, %1
123 PROCESS_16X2X3_OFFSET
0, %1
124 PROCESS_16X2X3_OFFSET
0, %1
125 PROCESS_16X2X3_OFFSET
0, %1
126 PROCESS_16X2X3_OFFSET
0, %1
127 PROCESS_16X2X3_OFFSET
0, %1
128 PROCESS_16X2X3_OFFSET
0, %1
129 PROCESS_16X2X3_OFFSET
0, %1
;-----------------------------------------------------------------------
; PROCESS_16X8X3_OFFSET -- NASM multi-line macro, 2 arguments.
; 16x8 counterpart of PROCESS_16X16X3_OFFSET: four expansions of
; PROCESS_16X2X3_OFFSET (4 x 2 = 8 rows); first passes 1 (presumably
; init), the rest 0 (accumulate), forwarding %1 as the byte offset.
;
; NOTE(review): truncated by extraction -- fused upstream numbers jump
; 135 -> 140 and stop at 143; entry label, rdi adjustment, the jmp
; using %2, and %endmacro are not visible. Recover from upstream.
;-----------------------------------------------------------------------
135 %macro PROCESS_16X8X3_OFFSET
2
140 PROCESS_16X2X3_OFFSET
1, %1
141 PROCESS_16X2X3_OFFSET
0, %1
142 PROCESS_16X2X3_OFFSET
0, %1
143 PROCESS_16X2X3_OFFSET
0, %1
;-----------------------------------------------------------------------
; vp8_sad16x16x3_ssse3 -- 16x16 SAD at 3 horizontal offsets (SSSE3).
; Visible args: arg(0)=src_ptr, arg(1)=src_stride, arg(2)=ref_ptr,
; arg(3)=ref_stride, arg(4)=Results (from the inline ;comments below).
;
; NOTE(review): extraction-mangled and internally truncated (fused
; upstream numbers skip e.g. 156->159, 167->172, 198->201, 220->222,
; 233->end). Missing from view: the prologue around
; SHADOW_ARGS_TO_STACK, the computation that leaves the ref-pointer
; alignment in rdx before the table lookup, the indirect jmp that
; consumes the looked-up offset, all aligned_by_0..14 target bodies,
; the result stores after store_off, and the epilogue/ret.
;
; Visible structure: a position-independent jump table -- call/pop rcx
; obtains the current address, the dd table holds 32-bit offsets of the
; aligned_by_N handlers relative to do_jump, and each handler is
; generated by PROCESS_16X16X3_OFFSET for one of the 16 possible
; reference alignments.
;-----------------------------------------------------------------------
149 ;void int vp8_sad16x16x3_ssse3(
150 ; unsigned char *src_ptr,
152 ; unsigned char *ref_ptr,
155 global sym
(vp8_sad16x16x3_ssse3
)
156 sym
(vp8_sad16x16x3_ssse3
):
159 SHADOW_ARGS_TO_STACK
5
; Load pointer arguments.
166 mov rsi
, arg
(0) ;src_ptr
167 mov rdi
, arg
(2) ;ref_ptr
; Skip over the embedded jump table (data inside .text).
172 jmp .vp8_sad16x16x3_ssse3_skiptable
173 .
vp8_sad16x16x3_ssse3_jumptable:
; 16 dword entries: handler address minus do_jump, one per alignment.
174 dd .vp8_sad16x16x3_ssse3_aligned_by_0
- .vp8_sad16x16x3_ssse3_do_jump
175 dd .vp8_sad16x16x3_ssse3_aligned_by_1
- .vp8_sad16x16x3_ssse3_do_jump
176 dd .vp8_sad16x16x3_ssse3_aligned_by_2
- .vp8_sad16x16x3_ssse3_do_jump
177 dd .vp8_sad16x16x3_ssse3_aligned_by_3
- .vp8_sad16x16x3_ssse3_do_jump
178 dd .vp8_sad16x16x3_ssse3_aligned_by_4
- .vp8_sad16x16x3_ssse3_do_jump
179 dd .vp8_sad16x16x3_ssse3_aligned_by_5
- .vp8_sad16x16x3_ssse3_do_jump
180 dd .vp8_sad16x16x3_ssse3_aligned_by_6
- .vp8_sad16x16x3_ssse3_do_jump
181 dd .vp8_sad16x16x3_ssse3_aligned_by_7
- .vp8_sad16x16x3_ssse3_do_jump
182 dd .vp8_sad16x16x3_ssse3_aligned_by_8
- .vp8_sad16x16x3_ssse3_do_jump
183 dd .vp8_sad16x16x3_ssse3_aligned_by_9
- .vp8_sad16x16x3_ssse3_do_jump
184 dd .vp8_sad16x16x3_ssse3_aligned_by_10
- .vp8_sad16x16x3_ssse3_do_jump
185 dd .vp8_sad16x16x3_ssse3_aligned_by_11
- .vp8_sad16x16x3_ssse3_do_jump
186 dd .vp8_sad16x16x3_ssse3_aligned_by_12
- .vp8_sad16x16x3_ssse3_do_jump
187 dd .vp8_sad16x16x3_ssse3_aligned_by_13
- .vp8_sad16x16x3_ssse3_do_jump
188 dd .vp8_sad16x16x3_ssse3_aligned_by_14
- .vp8_sad16x16x3_ssse3_do_jump
189 dd .vp8_sad16x16x3_ssse3_aligned_by_15
- .vp8_sad16x16x3_ssse3_do_jump
190 .
vp8_sad16x16x3_ssse3_skiptable:
; PIC trick: call pushes the address of do_jump, which we pop into rcx.
192 call .vp8_sad16x16x3_ssse3_do_jump
193 .
vp8_sad16x16x3_ssse3_do_jump:
194 pop rcx
; get the address of do_jump
195 mov rax
, .vp8_sad16x16x3_ssse3_jumptable
- .vp8_sad16x16x3_ssse3_do_jump
196 add rax
, rcx
; get the absolute address of vp8_sad16x16x3_ssse3_jumptable
; Index the table by rdx (presumably ref_ptr & 15, computed on a line
; missing from this chunk) -- the indirect jmp that uses this offset is
; also missing here.
198 movsxd rax
, dword [rax
+ 4*rdx
] ; get the 32 bit offset from the jumptable
; Sign-extend the 32-bit stride arguments into the registers the
; PROCESS_* macros address rows with.
201 movsxd rax
, dword ptr arg
(1) ;src_stride
202 movsxd rdx
, dword ptr arg
(3) ;ref_stride
; Generate the 16 per-alignment handlers (alignment 15 falls through to
; the explicit label below).
206 PROCESS_16X16X3_OFFSET
0, .vp8_sad16x16x3_ssse3
207 PROCESS_16X16X3_OFFSET
1, .vp8_sad16x16x3_ssse3
208 PROCESS_16X16X3_OFFSET
2, .vp8_sad16x16x3_ssse3
209 PROCESS_16X16X3_OFFSET
3, .vp8_sad16x16x3_ssse3
210 PROCESS_16X16X3_OFFSET
4, .vp8_sad16x16x3_ssse3
211 PROCESS_16X16X3_OFFSET
5, .vp8_sad16x16x3_ssse3
212 PROCESS_16X16X3_OFFSET
6, .vp8_sad16x16x3_ssse3
213 PROCESS_16X16X3_OFFSET
7, .vp8_sad16x16x3_ssse3
214 PROCESS_16X16X3_OFFSET
8, .vp8_sad16x16x3_ssse3
215 PROCESS_16X16X3_OFFSET
9, .vp8_sad16x16x3_ssse3
216 PROCESS_16X16X3_OFFSET
10, .vp8_sad16x16x3_ssse3
217 PROCESS_16X16X3_OFFSET
11, .vp8_sad16x16x3_ssse3
218 PROCESS_16X16X3_OFFSET
12, .vp8_sad16x16x3_ssse3
219 PROCESS_16X16X3_OFFSET
13, .vp8_sad16x16x3_ssse3
220 PROCESS_16X16X3_OFFSET
14, .vp8_sad16x16x3_ssse3
222 .
vp8_sad16x16x3_ssse3_aligned_by_15:
; aligned_by_15 body (upstream lines 223-231) missing from this chunk.
232 .
vp8_sad16x16x3_ssse3_store_off:
; Load the output pointer; the stores of the three SADs and the
; epilogue (upstream lines 234+) are missing from this chunk.
233 mov rdi
, arg
(4) ;Results
;-----------------------------------------------------------------------
; vp8_sad16x8x3_ssse3 -- 16x8 SAD at 3 horizontal offsets (SSSE3).
; Same dispatch structure as vp8_sad16x16x3_ssse3 above, but the
; per-alignment handlers are generated by PROCESS_16X8X3_OFFSET
; (8 rows instead of 16). Visible args: arg(0)=src_ptr,
; arg(1)=src_stride, arg(2)=ref_ptr, arg(3)=ref_stride, arg(4)=Results.
;
; NOTE(review): extraction-mangled, internally truncated, and cut off
; at the end of this chunk (the chunk ends right after loading the
; Results pointer -- the stores and the epilogue/ret lie beyond view).
; Also missing: the rdx alignment computation before the table lookup
; and the indirect jmp that consumes the looked-up offset.
;-----------------------------------------------------------------------
262 ;void int vp8_sad16x8x3_ssse3(
263 ; unsigned char *src_ptr,
265 ; unsigned char *ref_ptr,
268 global sym
(vp8_sad16x8x3_ssse3
)
269 sym
(vp8_sad16x8x3_ssse3
):
272 SHADOW_ARGS_TO_STACK
5
; Load pointer arguments.
279 mov rsi
, arg
(0) ;src_ptr
280 mov rdi
, arg
(2) ;ref_ptr
; Skip over the embedded jump table (data inside .text).
285 jmp .vp8_sad16x8x3_ssse3_skiptable
286 .
vp8_sad16x8x3_ssse3_jumptable:
; 16 dword entries: handler address minus do_jump, one per alignment.
287 dd .vp8_sad16x8x3_ssse3_aligned_by_0
- .vp8_sad16x8x3_ssse3_do_jump
288 dd .vp8_sad16x8x3_ssse3_aligned_by_1
- .vp8_sad16x8x3_ssse3_do_jump
289 dd .vp8_sad16x8x3_ssse3_aligned_by_2
- .vp8_sad16x8x3_ssse3_do_jump
290 dd .vp8_sad16x8x3_ssse3_aligned_by_3
- .vp8_sad16x8x3_ssse3_do_jump
291 dd .vp8_sad16x8x3_ssse3_aligned_by_4
- .vp8_sad16x8x3_ssse3_do_jump
292 dd .vp8_sad16x8x3_ssse3_aligned_by_5
- .vp8_sad16x8x3_ssse3_do_jump
293 dd .vp8_sad16x8x3_ssse3_aligned_by_6
- .vp8_sad16x8x3_ssse3_do_jump
294 dd .vp8_sad16x8x3_ssse3_aligned_by_7
- .vp8_sad16x8x3_ssse3_do_jump
295 dd .vp8_sad16x8x3_ssse3_aligned_by_8
- .vp8_sad16x8x3_ssse3_do_jump
296 dd .vp8_sad16x8x3_ssse3_aligned_by_9
- .vp8_sad16x8x3_ssse3_do_jump
297 dd .vp8_sad16x8x3_ssse3_aligned_by_10
- .vp8_sad16x8x3_ssse3_do_jump
298 dd .vp8_sad16x8x3_ssse3_aligned_by_11
- .vp8_sad16x8x3_ssse3_do_jump
299 dd .vp8_sad16x8x3_ssse3_aligned_by_12
- .vp8_sad16x8x3_ssse3_do_jump
300 dd .vp8_sad16x8x3_ssse3_aligned_by_13
- .vp8_sad16x8x3_ssse3_do_jump
301 dd .vp8_sad16x8x3_ssse3_aligned_by_14
- .vp8_sad16x8x3_ssse3_do_jump
302 dd .vp8_sad16x8x3_ssse3_aligned_by_15
- .vp8_sad16x8x3_ssse3_do_jump
303 .
vp8_sad16x8x3_ssse3_skiptable:
; PIC trick: call pushes the address of do_jump, which we pop into rcx.
305 call .vp8_sad16x8x3_ssse3_do_jump
306 .
vp8_sad16x8x3_ssse3_do_jump:
307 pop rcx
; get the address of do_jump
308 mov rax
, .vp8_sad16x8x3_ssse3_jumptable
- .vp8_sad16x8x3_ssse3_do_jump
309 add rax
, rcx
; get the absolute address of vp8_sad16x8x3_ssse3_jumptable
; Index the table by rdx (presumably ref_ptr & 15, computed on a line
; missing from this chunk); the consuming indirect jmp is also missing.
311 movsxd rax
, dword [rax
+ 4*rdx
] ; get the 32 bit offset from the jumptable
; Sign-extend the 32-bit stride arguments for the PROCESS_* macros.
314 movsxd rax
, dword ptr arg
(1) ;src_stride
315 movsxd rdx
, dword ptr arg
(3) ;ref_stride
; Generate the 16 per-alignment handlers (alignment 15 falls through to
; the explicit label below).
319 PROCESS_16X8X3_OFFSET
0, .vp8_sad16x8x3_ssse3
320 PROCESS_16X8X3_OFFSET
1, .vp8_sad16x8x3_ssse3
321 PROCESS_16X8X3_OFFSET
2, .vp8_sad16x8x3_ssse3
322 PROCESS_16X8X3_OFFSET
3, .vp8_sad16x8x3_ssse3
323 PROCESS_16X8X3_OFFSET
4, .vp8_sad16x8x3_ssse3
324 PROCESS_16X8X3_OFFSET
5, .vp8_sad16x8x3_ssse3
325 PROCESS_16X8X3_OFFSET
6, .vp8_sad16x8x3_ssse3
326 PROCESS_16X8X3_OFFSET
7, .vp8_sad16x8x3_ssse3
327 PROCESS_16X8X3_OFFSET
8, .vp8_sad16x8x3_ssse3
328 PROCESS_16X8X3_OFFSET
9, .vp8_sad16x8x3_ssse3
329 PROCESS_16X8X3_OFFSET
10, .vp8_sad16x8x3_ssse3
330 PROCESS_16X8X3_OFFSET
11, .vp8_sad16x8x3_ssse3
331 PROCESS_16X8X3_OFFSET
12, .vp8_sad16x8x3_ssse3
332 PROCESS_16X8X3_OFFSET
13, .vp8_sad16x8x3_ssse3
333 PROCESS_16X8X3_OFFSET
14, .vp8_sad16x8x3_ssse3
335 .
vp8_sad16x8x3_ssse3_aligned_by_15:
; aligned_by_15 body (upstream lines 336-341) missing from this chunk.
342 .
vp8_sad16x8x3_ssse3_store_off:
; Load the output pointer; the result stores and epilogue continue
; past the end of this chunk.
343 mov rdi
, arg
(4) ;Results