aarch64: Add assembly support for -fsanitize=hwaddress tagged globals.
[libav.git] / libavcodec / x86 / dct32.asm
blobcfd5f52ba817cb8440363ec212f44452a72378fd
1 ;******************************************************************************
2 ;* 32 point SSE-optimized DCT transform
3 ;* Copyright (c) 2010 Vitor Sessak
4 ;*
5 ;* This file is part of Libav.
6 ;*
7 ;* Libav is free software; you can redistribute it and/or
8 ;* modify it under the terms of the GNU Lesser General Public
9 ;* License as published by the Free Software Foundation; either
10 ;* version 2.1 of the License, or (at your option) any later version.
12 ;* Libav is distributed in the hope that it will be useful,
13 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
14 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 ;* Lesser General Public License for more details.
17 ;* You should have received a copy of the GNU Lesser General Public
18 ;* License along with Libav; if not, write to the Free Software
19 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20 ;******************************************************************************
22 %include "libavutil/x86/x86util.asm"
; Read-only constants; the section is requested with 32-byte alignment.
24 SECTION_RODATA 32
; XOR mask. 0x80000000 is the IEEE-754 single-precision sign bit, so XORing a
; vector with this mask negates lanes 2-3 (and 6-7 of a 256-bit register) and
; leaves lanes 0-1 (4-5) unchanged. It is passed as the %2 operand of
; BUTTERFLY2 in pass 4, and a shuffled copy of it is used in pass 5.
26 ps_p1p1m1m1: dd 0, 0, 0x80000000, 0x80000000, 0, 0, 0x80000000, 0x80000000
; Multiplier table: 13 rows of four floats (16 bytes per row). The code in
; this file addresses it by byte offset:
;   +0, +16, +32, +48   pass 1
;   +64, +80            pass 2
;   +96                 pass 3
;   +128                pass 4
;   +160                pass 5
;   +192                BUTTERFLY3V
; The rows at +96/+112, +128/+144 and +160/+176 are identical pairs,
; presumably so that a 32-byte load in the YMM code path sees the same four
; factors in both 128-bit halves.
; NOTE(review): the values look like precomputed DCT cosine factors (per the
; label name); their derivation is not shown in this file.
28 ps_cos_vec: dd 0.500603, 0.505471, 0.515447, 0.531043
29 dd 0.553104, 0.582935, 0.622504, 0.674808
30 dd -10.190008, -3.407609, -2.057781, -1.484165
31 dd -1.169440, -0.972568, -0.839350, -0.744536
32 dd 0.502419, 0.522499, 0.566944, 0.646822
33 dd 0.788155, 1.060678, 1.722447, 5.101149
34 dd 0.509796, 0.601345, 0.899976, 2.562916
35 dd 0.509796, 0.601345, 0.899976, 2.562916
36 dd 1.000000, 1.000000, 1.306563, 0.541196
37 dd 1.000000, 1.000000, 1.306563, 0.541196
38 dd 1.000000, 0.707107, 1.000000, -0.707107
39 dd 1.000000, 0.707107, 1.000000, -0.707107
40 dd 0.707107, 0.707107, 0.707107, 0.707107
; BUTTERFLY a, b, c, tmp
; Packed-float butterfly, written in three-operand form:
;   tmp = a - b
;   b   = b + a
;   a   = tmp * c
; On exit %1 holds the scaled difference and %2 the sum. %4 is scratch and
; is left holding the unscaled difference. %3 may be a register or a memory
; operand (both forms are used in this file).
42 %macro BUTTERFLY 4
43 subps %4, %1, %2
44 addps %2, %2, %1
45 mulps %1, %4, %3
46 %endmacro
; BUTTERFLY0 a, signmask, c, tmp, shuf
; In-register butterfly: combines a vector with a lane-shuffled copy of
; itself. Both code paths leave
;   a = (shuffle(a, shuf) + (a XOR signmask)) * c
; in %1 and treat %4 as scratch. Its final content differs between the two
; paths: the shuffled copy in the first, the unscaled sum in the second.
48 %macro BUTTERFLY0 5
; SSE2 without AVX: pshufd shuffles into a separate destination, so the
; remaining operations can use the destructive two-operand forms.
49 %if cpuflag(sse2) && notcpuflag(avx)
50 pshufd %4, %1, %5
51 xorps %1, %2
52 addps %1, %4
53 mulps %1, %3
; All other targets: shufps with both sources equal to %1 performs the same
; shuffle; the instructions are written in three-operand form.
54 %else
55 shufps %4, %1, %1, %5
56 xorps %1, %1, %2
57 addps %4, %4, %1
58 mulps %1, %4, %3
59 %endif
60 %endmacro
; BUTTERFLY2 a, signmask, c, tmp
; BUTTERFLY0 with shuffle immediate 0x1b, which reverses the order of the
; four dwords (within each 128-bit lane).
62 %macro BUTTERFLY2 4
63 BUTTERFLY0 %1, %2, %3, %4, 0x1b
64 %endmacro
; BUTTERFLY3 a, signmask, c, tmp
; BUTTERFLY0 with shuffle immediate 0xb1, which swaps the two dwords of each
; adjacent pair (lane 0 with 1, lane 2 with 3).
66 %macro BUTTERFLY3 4
67 BUTTERFLY0 %1, %2, %3, %4, 0xb1
68 %endmacro
; BUTTERFLY3V r1, r2, r3, r4, tmp
; Takes register NUMBERS, not names. Performs two vertical butterflies, on
; the pairs (r1, r2) and (r3, r4), both scaled by the row at ps_cos_vec+192:
;   r1 = r1 + r2        r2 = (r1 - r2) * cos
;   r3 = r3 + r4        r4 = (r4 - r3) * cos
; Note the second difference is taken in the opposite order from the first.
; The tmp register is clobbered.
70 %macro BUTTERFLY3V 5
71 movaps m%5, m%1
72 addps m%1, m%2
73 subps m%5, m%2
; SWAP is defined outside this file; it is assumed to exchange the register
; names, so that the difference just computed is referred to as m%2 from here
; on and the old m%2 becomes the scratch m%5.
74 SWAP %2, %5
75 mulps m%2, [ps_cos_vec+192]
76 movaps m%5, m%3
77 addps m%3, m%4
78 subps m%4, m%5
79 mulps m%4, [ps_cos_vec+192]
80 %endmacro
; PASS6_AND_PERMUTE
; Final pass, done entirely with scalar instructions (movss/addss) plus a few
; 32-bit integer moves through tmpd. It reads intermediate values that an
; earlier pass stored in the output buffer, forms sums of them, and writes
; the results back to their final positions in the same buffer.
;
; Inputs:
;   outq            buffer holding the intermediate values; updated in place
;   m0,m1,m4,m5,m6  lane 0 of each is read before it is written here, so
;                   these registers must be live on entry
; Clobbers: m0-m7, tmpd.
;
; The buffer is updated in place, so the order of loads and stores matters:
; several slots are read again after other slots have been overwritten. Do
; not reorder without tracing every offset.
82 %macro PASS6_AND_PERMUTE 0
83 mov tmpd, [outq+4] ; saved now, stored unchanged to outq+64 further down
84 movss m7, [outq+72]
85 addss m7, [outq+76]
86 movss m3, [outq+56]
87 addss m3, [outq+60]
88 addss m4, m3
89 movss m2, [outq+52]
90 addss m2, m3
91 movss m3, [outq+104]
92 addss m3, [outq+108]
93 addss m1, m3
94 addss m5, m4
95 movss [outq+ 16], m1
96 movss m1, [outq+100]
97 addss m1, m3
98 movss m3, [outq+40]
99 movss [outq+ 48], m1
100 addss m3, [outq+44]
101 movss m1, [outq+100]
102 addss m4, m3
103 addss m3, m2
104 addss m1, [outq+108]
105 movss [outq+ 40], m3
106 addss m2, [outq+36]
107 movss m3, [outq+8]
108 movss [outq+ 56], m2
109 addss m3, [outq+12]
110 movss [outq+ 32], m3
111 movss m3, [outq+80]
112 movss [outq+ 8], m5
113 movss [outq+ 80], m1
114 movss m2, [outq+52]
115 movss m5, [outq+120]
116 addss m5, [outq+124]
117 movss m1, [outq+64]
118 addss m2, [outq+60]
119 addss m0, m5
120 addss m5, [outq+116]
121 mov [outq+64], tmpd ; value loaded from outq+4 at the top of the macro
122 addss m6, m0
123 addss m1, m6
124 mov tmpd, [outq+12] ; plain copy: outq+12 -> outq+96
125 mov [outq+ 96], tmpd
126 movss [outq+ 4], m1
127 movss m1, [outq+24]
128 movss [outq+ 24], m4
129 movss m4, [outq+88]
130 addss m4, [outq+92]
131 addss m3, m4
132 addss m4, [outq+84]
133 mov tmpd, [outq+108] ; saved now, stored unchanged to outq+112 below
134 addss m1, [outq+28]
135 addss m0, m1
136 addss m1, m5
137 addss m6, m3
138 addss m3, m0
139 addss m0, m7
140 addss m5, [outq+20]
141 addss m7, m1
142 movss [outq+ 12], m6
143 mov [outq+112], tmpd
144 movss m6, [outq+28]
145 movss [outq+ 28], m0
146 movss m0, [outq+36]
147 movss [outq+ 36], m7
148 addss m1, m4
149 movss m7, [outq+116]
150 addss m0, m2
151 addss m7, [outq+124]
152 movss [outq+ 72], m0
153 movss m0, [outq+44]
154 addss m2, m0
155 movss [outq+ 44], m1
156 movss [outq+ 88], m2
157 addss m0, [outq+60]
158 mov tmpd, [outq+60] ; plain copy: outq+60 -> outq+120
159 mov [outq+120], tmpd
160 movss [outq+104], m0
161 addss m4, m5
162 addss m5, [outq+68]
163 movss [outq+52], m4
164 movss [outq+60], m5
165 movss m4, [outq+68]
166 movss m5, [outq+20]
167 movss [outq+ 20], m3
168 addss m5, m7
169 addss m7, m6
170 addss m4, m5
171 movss m2, [outq+84]
172 addss m2, [outq+92]
173 addss m5, m2
174 movss [outq+ 68], m4
175 addss m2, m7
176 movss m4, [outq+76]
177 movss [outq+ 84], m2
178 movss [outq+ 76], m5
179 addss m7, m4
180 addss m6, [outq+124]
181 addss m4, m6
182 addss m6, [outq+92]
183 movss [outq+100], m4
184 movss [outq+108], m6
185 movss m6, [outq+92]
186 movss [outq+92], m7
187 addss m6, [outq+124]
188 movss [outq+116], m6
189 %endmacro
191 INIT_YMM avx
192 SECTION .text
; 32-point float DCT, AVX version.
;   out  (arg 1)  destination, 32 floats; also used as scratch by pass 6
;   in   (arg 2)  source, 32 floats
;   tmp           integer scratch register used by PASS6_AND_PERMUTE
; Passes 1-5 work on 256-bit vectors; pass 6 is scalar. Both buffers are
; accessed with vmovaps on full vector registers, so they are assumed to be
; suitably aligned for that (32 bytes for YMM) -- TODO confirm with callers.
193 ; void ff_dct32_float_avx(FFTSample *out, const FFTSample *in)
194 cglobal dct32_float, 2,3,8, out, in, tmp
195 ; pass 1
; m4 = in[0..7]; m5 = in[31..24]: the two 16-byte halves are inserted in
; swapped order and vshufps 0x1b then reverses the dwords inside each half.
196 vmovaps m4, [inq+0]
197 vinsertf128 m5, m5, [inq+96], 1
198 vinsertf128 m5, m5, [inq+112], 0
199 vshufps m5, m5, m5, 0x1b
200 BUTTERFLY m4, m5, [ps_cos_vec], m6
; Same pattern for the middle of the input: m2 = in[16..23], m6 = in[15..8].
202 vmovaps m2, [inq+64]
203 vinsertf128 m6, m6, [inq+32], 1
204 vinsertf128 m6, m6, [inq+48], 0
205 vshufps m6, m6, m6, 0x1b
206 BUTTERFLY m2, m6, [ps_cos_vec+32], m0
208 ; pass 2
210 BUTTERFLY m5, m6, [ps_cos_vec+64], m0
211 BUTTERFLY m4, m2, [ps_cos_vec+64], m7
214 ; pass 3
; vperm2f128 regroups 128-bit halves: 0x31 selects the high halves of the two
; sources, 0x20 the low halves. The high-half vector is then dword-reversed
; within each half before the butterfly.
215 vperm2f128 m3, m6, m4, 0x31
216 vperm2f128 m1, m6, m4, 0x20
217 vshufps m3, m3, m3, 0x1b
219 BUTTERFLY m1, m3, [ps_cos_vec+96], m6
222 vperm2f128 m4, m5, m2, 0x20
223 vperm2f128 m5, m5, m2, 0x31
224 vshufps m5, m5, m5, 0x1b
226 BUTTERFLY m4, m5, [ps_cos_vec+96], m6
228 ; pass 4
; m6 = sign mask, m2 = multipliers, m7 = scratch for the in-register
; butterflies.
229 vmovaps m6, [ps_p1p1m1m1+0]
230 vmovaps m2, [ps_cos_vec+128]
232 BUTTERFLY2 m5, m6, m2, m7
233 BUTTERFLY2 m4, m6, m2, m7
234 BUTTERFLY2 m1, m6, m2, m7
235 BUTTERFLY2 m3, m6, m2, m7
238 ; pass 5
; Shuffling the pass-4 mask with 0xcc (lanes 0,3,0,3) moves the sign bits to
; lanes 1 and 3 of each half, which is the mask the pair-swap butterfly uses.
239 vshufps m6, m6, m6, 0xcc
240 vmovaps m2, [ps_cos_vec+160]
242 BUTTERFLY3 m5, m6, m2, m7
243 BUTTERFLY3 m4, m6, m2, m7
244 BUTTERFLY3 m1, m6, m2, m7
245 BUTTERFLY3 m3, m6, m2, m7
; Store the pass-5 results to the output buffer for pass 6 to read back.
; The high halves of m3 and m1 are also copied into the low halves of m6 and
; m0, which pass 6 expects in registers (along with m1, m4, m5).
247 vperm2f128 m6, m3, m3, 0x31
248 vmovaps [outq], m3
250 vextractf128 [outq+64], m5, 1
251 vextractf128 [outq+32], m5, 0
253 vextractf128 [outq+80], m4, 1
254 vextractf128 [outq+48], m4, 0
256 vperm2f128 m0, m1, m1, 0x31
257 vmovaps [outq+96], m1
; Clear the upper YMM halves before switching to legacy-encoded SSE code.
259 vzeroupper
261 ; pass 6, no SIMD...
262 INIT_XMM
263 PASS6_AND_PERMUTE
; Return through the x86inc epilogue. cglobal does not emit a return by
; itself; without this, execution would continue into whatever is assembled
; after this function.
264 RET
; x86-64 build: SPILL and UNSPILL are aliases for SWAP, so "spilling" a
; value is a register renaming among m0-m15 rather than a memory store
; (the SSE function below requests 16 vector registers from cglobal).
266 %if ARCH_X86_64
267 %define SPILL SWAP
268 %define UNSPILL SWAP
; PASS5 (x86-64)
; Pass 5 with all sixteen intermediate vectors kept in registers. The SWAP
; and PERMUTE lines first rename registers so that the sixteen values are
; called m8-m15 in the order the following code expects; they are defined
; outside this file and are assumed to emit no instructions.
; Each group of four registers is then transposed (m0 as scratch) so the
; butterflies can be done vertically with BUTTERFLY3V, followed by a few
; additions between the results. Results are left in m8-m15 for PASS6.
270 %macro PASS5 0
271 nop ; FIXME code alignment
272 SWAP 5, 8
273 SWAP 4, 12
274 SWAP 6, 14
275 SWAP 7, 13
276 SWAP 0, 15
277 PERMUTE 9,10, 10,12, 11,14, 12,9, 13,11, 14,13
278 TRANSPOSE4x4PS 8, 9, 10, 11, 0
279 BUTTERFLY3V 8, 9, 10, 11, 0
280 addps m10, m11
281 TRANSPOSE4x4PS 12, 13, 14, 15, 0
282 BUTTERFLY3V 12, 13, 14, 15, 0
283 addps m14, m15
284 addps m12, m14
285 addps m14, m13
286 addps m13, m15
287 %endmacro
; PASS6 (x86-64)
; Final pass working from m8-m15 as left by PASS5; writes all results to the
; output buffer. Clobbers m0-m15. Uses pshuflw/pshufd, which are SSE2
; integer shuffles.
; SWAP and PERMUTE are defined outside this file and are assumed to only
; rename registers; the comments below describe the code on that basis.
289 %macro PASS6 0
290 SWAP 9, 12
291 SWAP 11, 14
; Store lane 0 of m8..m14 at out+0x00, 0x10, ..., 0x60, and all four lanes
; of m15 at out+0x70. Each pshuflw with immediate 0xe copies dword lane 1 of
; the source into lane 0 of m0..m7.
292 movss [outq+0x00], m8
293 pshuflw m0, m8, 0xe
294 movss [outq+0x10], m9
295 pshuflw m1, m9, 0xe
296 movss [outq+0x20], m10
297 pshuflw m2, m10, 0xe
298 movss [outq+0x30], m11
299 pshuflw m3, m11, 0xe
300 movss [outq+0x40], m12
301 pshuflw m4, m12, 0xe
302 movss [outq+0x50], m13
303 pshuflw m5, m13, 0xe
304 movss [outq+0x60], m14
305 pshuflw m6, m14, 0xe
306 movaps [outq+0x70], m15
307 pshuflw m7, m15, 0xe
; Sums of neighbouring lane-1 values go to out+0x08, 0x18, ..., 0x68; the
; last value (m7) is stored on its own at out+0x78.
308 addss m0, m1
309 addss m1, m2
310 movss [outq+0x08], m0
311 addss m2, m3
312 movss [outq+0x18], m1
313 addss m3, m4
314 movss [outq+0x28], m2
315 addss m4, m5
316 movss [outq+0x38], m3
317 addss m5, m6
318 movss [outq+0x48], m4
319 addss m6, m7
320 movss [outq+0x58], m5
321 movss [outq+0x68], m6
322 movss [outq+0x78], m7
; Extract the two upper lanes of each source vector into lane 0 of a
; register pair: movhlps brings dword lane 2 down to lane 0 of m0, and
; pshufd with immediate 3 brings lane 3 down to lane 0 of m1. The SWAP
; lines rotate the register names so the same two instructions act on the
; next vector each time. In the repeated part, the lane-3 value is also
; accumulated into m15.
324 PERMUTE 1,8, 3,9, 5,10, 7,11, 9,12, 11,13, 13,14, 8,1, 10,3, 12,5, 14,7
325 movhlps m0, m1
326 pshufd m1, m1, 3
327 SWAP 0, 2, 4, 6, 8, 10, 12, 14
328 SWAP 1, 3, 5, 7, 9, 11, 13, 15
329 %rep 7
330 movhlps m0, m1
331 pshufd m1, m1, 3
332 addss m15, m1
333 SWAP 0, 2, 4, 6, 8, 10, 12, 14
334 SWAP 1, 3, 5, 7, 9, 11, 13, 15
335 %endrep
; Fifteen stores at byte offsets 4, 12, ..., 116: each is the sum of the
; registers currently named m0 and m1, after which all sixteen names are
; rotated by one.
336 %assign i 4
337 %rep 15
338 addss m0, m1
339 movss [outq+i], m0
340 SWAP 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
341 %assign i i+8
342 %endrep
343 %endmacro
345 %else ; ARCH_X86_32
; 32-bit build: spills are real memory traffic, and the output buffer itself
; is the spill area. Slot numbers 8..15 (the only ones used in this file)
; map to outq+0 .. outq+112 in 16-byte steps. movaps is used, so outq is
; assumed to be 16-byte aligned.
346 %macro SPILL 2 ; xmm#, mempos
347 movaps [outq+(%2-8)*16], m%1
348 %endmacro
; Reload register m%1 from spill slot %2 (same slot numbering as SPILL).
349 %macro UNSPILL 2
350 movaps m%1, [outq+(%2-8)*16]
351 %endmacro
; The 32-bit build uses the scalar final pass, which reads its inputs from
; the output buffer and from m0, m1, m4, m5 and m6.
353 %define PASS6 PASS6_AND_PERMUTE
; PASS5 (x86-32)
; Pass 5 with only eight vector registers: each of the sixteen intermediate
; vectors goes through BUTTERFLY3 in turn and is then stored to its slot in
; the output buffer. Values not currently in a register are reloaded with
; UNSPILL first.
; On entry m3 holds the pass-4 sign mask; shufps 0xcc (lanes 0,3,0,3) moves
; the sign bits to lanes 1 and 3, as the pair-swap butterfly requires.
; m2 = multipliers. The fourth BUTTERFLY3 argument is scratch; it changes
; from m1 to m5 to m7 as those registers become free.
; The results of the last five butterflies (m5, m4, m6, m0 and the earlier
; m1) are also still in registers when the macro ends, in addition to being
; stored.
354 %macro PASS5 0
355 movaps m2, [ps_cos_vec+160]
356 shufps m3, m3, 0xcc
358 BUTTERFLY3 m5, m3, m2, m1
359 SPILL 5, 8
361 UNSPILL 1, 9
362 BUTTERFLY3 m1, m3, m2, m5
363 SPILL 1, 14
365 BUTTERFLY3 m4, m3, m2, m5
366 SPILL 4, 12
368 BUTTERFLY3 m7, m3, m2, m5
369 SPILL 7, 13
371 UNSPILL 5, 10
372 BUTTERFLY3 m5, m3, m2, m7
373 SPILL 5, 10
375 UNSPILL 4, 11
376 BUTTERFLY3 m4, m3, m2, m7
377 SPILL 4, 11
379 BUTTERFLY3 m6, m3, m2, m7
380 SPILL 6, 9
382 BUTTERFLY3 m0, m3, m2, m7
383 SPILL 0, 15
384 %endmacro
385 %endif
; DCT32_FUNC
; Emits one 128-bit implementation of the 32-point float DCT for whichever
; instruction set was selected by the preceding INIT_XMM (the macro is
; instantiated for both sse and sse2 at the end of the file).
;   out  (arg 1)  destination, 32 floats; on x86-32 also used as spill space
;   in   (arg 2)  source, 32 floats
;   tmp           integer scratch register, used by the scalar final pass
; Both buffers are accessed with movaps, so they are assumed to be 16-byte
; aligned. SPILL/UNSPILL slot numbers run from 8 to 15; what they expand to
; depends on the target architecture (see their definitions).
; Passes 1 and 2 are interleaved, which is why the "pass 1" and "pass 2"
; markers each appear twice.
388 ; void ff_dct32_float_sse(FFTSample *out, const FFTSample *in)
389 %macro DCT32_FUNC 0
390 cglobal dct32_float, 2, 3, 16, out, in, tmp
391 ; pass 1
; Each butterfly pairs four inputs from the front of the buffer with the
; mirror-image four from the back, loaded in reversed order by LOAD_INV.
393 movaps m0, [inq+0]
394 LOAD_INV m1, [inq+112]
395 BUTTERFLY m0, m1, [ps_cos_vec], m3
397 movaps m7, [inq+64]
398 LOAD_INV m4, [inq+48]
399 BUTTERFLY m7, m4, [ps_cos_vec+32], m3
401 ; pass 2
402 movaps m2, [ps_cos_vec+64]
403 BUTTERFLY m1, m4, m2, m3
404 SPILL 1, 11
405 SPILL 4, 8
407 ; pass 1
408 movaps m1, [inq+16]
409 LOAD_INV m6, [inq+96]
410 BUTTERFLY m1, m6, [ps_cos_vec+16], m3
412 movaps m4, [inq+80]
413 LOAD_INV m5, [inq+32]
414 BUTTERFLY m4, m5, [ps_cos_vec+48], m3
416 ; pass 2
417 BUTTERFLY m0, m7, m2, m3
419 movaps m2, [ps_cos_vec+80]
420 BUTTERFLY m6, m5, m2, m3
422 BUTTERFLY m1, m4, m2, m3
424 ; pass 3
; The second operand of each butterfly is dword-reversed (shufps 0x1b)
; first; m2 holds the multipliers and m3 is scratch.
425 movaps m2, [ps_cos_vec+96]
426 shufps m1, m1, 0x1b
427 BUTTERFLY m0, m1, m2, m3
428 SPILL 0, 15
429 SPILL 1, 14
431 UNSPILL 0, 8
432 shufps m5, m5, 0x1b
433 BUTTERFLY m0, m5, m2, m3
435 UNSPILL 1, 11
436 shufps m6, m6, 0x1b
437 BUTTERFLY m1, m6, m2, m3
438 SPILL 1, 11
440 shufps m4, m4, 0x1b
441 BUTTERFLY m7, m4, m2, m3
443 ; pass 4
; m3 = sign mask, m2 = multipliers, m1 = scratch. Every vector is combined
; with its own dword-reversed copy.
444 movaps m3, [ps_p1p1m1m1+0]
445 movaps m2, [ps_cos_vec+128]
447 BUTTERFLY2 m5, m3, m2, m1
449 BUTTERFLY2 m0, m3, m2, m1
450 SPILL 0, 9
452 BUTTERFLY2 m6, m3, m2, m1
453 SPILL 6, 10
455 UNSPILL 0, 11
456 BUTTERFLY2 m0, m3, m2, m1
457 SPILL 0, 11
459 BUTTERFLY2 m4, m3, m2, m1
461 BUTTERFLY2 m7, m3, m2, m1
463 UNSPILL 6, 14
464 BUTTERFLY2 m6, m3, m2, m1
466 UNSPILL 0, 15
467 BUTTERFLY2 m0, m3, m2, m1
; Passes 5 and 6 are architecture-specific macros selected above.
469 PASS5
470 PASS6
; Return through the x86inc epilogue. cglobal does not emit a return by
; itself; without this, execution would run off the end of the function
; (into the next instantiation of this macro, or past the end of the code).
471 RET
472 %endmacro
; LOAD_INV dst, src
; Loads four dwords from src into dst in reversed order (shuffle immediate
; 0x1b). With SSE2 this is a single pshufd; with plain SSE it takes a movaps
; followed by an in-register shufps. If neither CPU flag is set the macro
; expands to nothing.
474 %macro LOAD_INV 2
475 %if cpuflag(sse2)
476 pshufd %1, %2, 0x1b
477 %elif cpuflag(sse)
478 movaps %1, %2
479 shufps %1, %1, 0x1b
480 %endif
481 %endmacro
; Instantiate the 128-bit implementation. The plain-SSE variant is assembled
; for 32-bit targets only; the 64-bit PASS6 macro uses pshuflw/pshufd, which
; are SSE2 instructions. The SSE2 variant is assembled for every target.
483 %if ARCH_X86_32
484 INIT_XMM sse
485 DCT32_FUNC
486 %endif
488 INIT_XMM sse2
489 DCT32_FUNC