1 ;******************************************************************************
2 ;* 32 point SSE-optimized DCT transform
3 ;* Copyright (c) 2010 Vitor Sessak
5 ;* This file is part of Libav.
7 ;* Libav is free software; you can redistribute it and/or
8 ;* modify it under the terms of the GNU Lesser General Public
9 ;* License as published by the Free Software Foundation; either
10 ;* version 2.1 of the License, or (at your option) any later version.
12 ;* Libav is distributed in the hope that it will be useful,
13 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
14 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 ;* Lesser General Public License for more details.
17 ;* You should have received a copy of the GNU Lesser General Public
18 ;* License along with Libav; if not, write to the Free Software
19 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20 ;******************************************************************************
22 %include "libavutil/x86/x86util.asm"
; Constant tables read by the transform code in this file.
; NOTE(review): the code loads these with aligned moves (movaps / 32-byte
; vmovaps), which require 16/32-byte alignment; the section and alignment
; directives are not visible in this excerpt - confirm they exist.
;
; ps_p1p1m1m1: eight dwords, the pattern 0, 0, 0x80000000, 0x80000000 twice.
; 0x80000000 is the sign bit of a single-precision float, so within each group
; of four lanes elements 2 and 3 carry a sign mask and elements 0 and 1 are
; zero. The visible code loads it into a register and passes that register as
; the second operand of BUTTERFLY2.
26 ps_p1p1m1m1: dd 0, 0, 0x80000000, 0x80000000, 0, 0, 0x80000000, 0x80000000
; ps_cos_vec: single-precision coefficient table, four floats (16 bytes) per
; row, addressed by byte offset from the label (row k starts at 16*k).
; Rows at +0 and +16:
28 ps_cos_vec: dd 0.500603, 0.505471, 0.515447, 0.531043
29 dd 0.553104, 0.582935, 0.622504, 0.674808
; Rows at +32 and +48:
30 dd -10.190008, -3.407609, -2.057781, -1.484165
31 dd -1.169440, -0.972568, -0.839350, -0.744536
; Rows at +64 and +80:
32 dd 0.502419, 0.522499, 0.566944, 0.646822
33 dd 0.788155, 1.060678, 1.722447, 5.101149
; Rows at +96/+112, +128/+144 and +160/+176 are stored twice each, so a
; 32-byte load from +96, +128 or +160 sees the same four values in both
; 128-bit halves, while a 16-byte load from the same offset sees them once.
34 dd 0.509796, 0.601345, 0.899976, 2.562916
35 dd 0.509796, 0.601345, 0.899976, 2.562916
36 dd 1.000000, 1.000000, 1.306563, 0.541196
37 dd 1.000000, 1.000000, 1.306563, 0.541196
38 dd 1.000000, 0.707107, 1.000000, -0.707107
39 dd 1.000000, 0.707107, 1.000000, -0.707107
; Row at +192: four copies of 0.707107 (approximately 1/sqrt(2)).
40 dd 0.707107, 0.707107, 0.707107, 0.707107
; NOTE(review): the lines below are isolated pieces of macro bodies. The
; enclosing %macro / %endmacro lines, the instructions around them and the
; %else / %endif matching the %if are not visible in this excerpt.
;
; Assemble-time condition: taken when the sse2 cpuflag is set and the avx
; cpuflag is not.
49 %if cpuflag
(sse2
) && notcpuflag
(avx
)
; Forwards the four macro arguments to BUTTERFLY0 with 0x1b as fifth argument.
; Presumably the body of BUTTERFLY2 and the fifth argument a shuffle
; immediate - confirm against the BUTTERFLY0 definition.
63 BUTTERFLY0
%1, %2, %3, %4, 0x1b
; Same forwarding with 0xb1 as fifth argument (presumably the body of
; BUTTERFLY3 - confirm).
67 BUTTERFLY0
%1, %2, %3, %4, 0xb1
; Multiply register m<%2> by the 16-byte row at ps_cos_vec+192, which holds
; four copies of 0.707107.
75 mulps m
%2, [ps_cos_vec
+192]
; Same scaling applied to register m<%4>.
79 mulps m
%4, [ps_cos_vec
+192]
; Opens a macro taking zero arguments; its body is not visible in this
; excerpt.
82 %macro PASS6_AND_PERMUTE
0
; AVX entry point of the 32-point float DCT.
; NOTE(review): this excerpt omits several instructions of the routine (for
; example the loads that fill m4 and m2 before their first use, and the
; return), so the comments below describe only the visible lines. BUTTERFLY,
; BUTTERFLY2 and BUTTERFLY3 are macros whose full definitions are not visible.
193 ; void ff_dct32_float_avx(FFTSample *out, const FFTSample *in)
; cglobal (x86inc): 2 arguments, 3 general-purpose registers, 8 vector
; registers; the named registers are out, in and tmp (used below as outq/inq).
194 cglobal dct32_float
, 2,3,8, out, in, tmp
; Build m5 from input bytes 96..127 with the two 16-byte halves swapped:
; bytes 96..111 go to the high lane, bytes 112..127 to the low lane.
197 vinsertf128 m5
, m5
, [inq
+96], 1
198 vinsertf128 m5
, m5
, [inq
+112], 0
; Immediate 0x1b selects elements 3,2,1,0, i.e. reverses the four floats in
; each 128-bit lane; together with the lane swap above, m5 now holds those
; eight input floats in fully reversed order.
199 vshufps m5
, m5
, m5
, 0x1b
; Butterfly of m4 with the reversed vector, using the first 32 bytes of
; ps_cos_vec; m6 is the fourth operand (presumably a temporary - confirm).
200 BUTTERFLY m4
, m5
, [ps_cos_vec
], m6
; Same lane-swapped, element-reversed load for input bytes 32..63 into m6.
203 vinsertf128 m6
, m6
, [inq
+32], 1
204 vinsertf128 m6
, m6
, [inq
+48], 0
205 vshufps m6
, m6
, m6
, 0x1b
; Butterfly of m2 with the reversed vector, coefficients at ps_cos_vec+32.
206 BUTTERFLY m2
, m6
, [ps_cos_vec
+32], m0
; Next stage: both butterflies use the coefficients at ps_cos_vec+64.
210 BUTTERFLY m5
, m6
, [ps_cos_vec
+64], m0
211 BUTTERFLY m4
, m2
, [ps_cos_vec
+64], m7
; Regroup 128-bit lanes: 0x31 takes the high lanes (m3 = m6.high, m4.high),
; 0x20 takes the low lanes (m1 = m6.low, m4.low).
215 vperm2f128 m3
, m6
, m4
, 0x31
216 vperm2f128 m1
, m6
, m4
, 0x20
; Reverse the floats within each lane of m3.
217 vshufps m3
, m3
, m3
, 0x1b
; Butterfly with the coefficients at ps_cos_vec+96 (rows +96/+112 are equal).
219 BUTTERFLY m1
, m3
, [ps_cos_vec
+96], m6
; Same regrouping for m5/m2: m4 = (m5.low, m2.low), m5 = (m5.high, m2.high),
; followed by the in-lane reversal of m5.
222 vperm2f128 m4
, m5
, m2
, 0x20
223 vperm2f128 m5
, m5
, m2
, 0x31
224 vshufps m5
, m5
, m5
, 0x1b
226 BUTTERFLY m4
, m5
, [ps_cos_vec
+96], m6
; Load the sign mask (0, 0, sign, sign per lane) and the coefficient rows at
; ps_cos_vec+128 for the BUTTERFLY2 stage.
229 vmovaps m6
, [ps_p1p1m1m1
+0]
230 vmovaps m2
, [ps_cos_vec
+128]
; BUTTERFLY2 on each of m5, m4, m1, m3 with mask m6, coefficients m2 and m7
; as fourth operand.
232 BUTTERFLY2 m5
, m6
, m2
, m7
233 BUTTERFLY2 m4
, m6
, m2
, m7
234 BUTTERFLY2 m1
, m6
, m2
, m7
235 BUTTERFLY2 m3
, m6
, m2
, m7
; Immediate 0xcc selects elements 0,3,0,3 of each lane. If m6 still holds the
; ps_p1p1m1m1 pattern (assumes BUTTERFLY2 leaves its second operand
; unchanged - confirm), the mask becomes (0, sign, 0, sign) per lane.
239 vshufps m6
, m6
, m6
, 0xcc
; Coefficient rows at ps_cos_vec+160 for the BUTTERFLY3 stage.
240 vmovaps m2
, [ps_cos_vec
+160]
; BUTTERFLY3 on each of m5, m4, m1, m3 with the reshuffled mask.
242 BUTTERFLY3 m5
, m6
, m2
, m7
243 BUTTERFLY3 m4
, m6
, m2
, m7
244 BUTTERFLY3 m1
, m6
, m2
, m7
245 BUTTERFLY3 m3
, m6
, m2
, m7
; Copy the high lane of m3 into both lanes of m6.
247 vperm2f128 m6
, m3
, m3
, 0x31
; Store 128-bit halves to the output buffer: m5.high -> out+64,
; m5.low -> out+32, m4.high -> out+80, m4.low -> out+48.
250 vextractf128
[outq
+64], m5
, 1
251 vextractf128
[outq
+32], m5
, 0
253 vextractf128
[outq
+80], m4
, 1
254 vextractf128
[outq
+48], m4
, 0
; Copy the high lane of m1 into both lanes of m0, then store all 32 bytes of
; m1 at out+96 (aligned store).
256 vperm2f128 m0
, m1
, m1
, 0x31
257 vmovaps
[outq
+96], m1
; NOTE(review): the lines below are pieces of macro bodies whose %macro /
; %endmacro lines and several interleaved instructions are not visible in this
; excerpt. They use registers m8..m15, which exist only in 64-bit mode.
; PERMUTE, TRANSPOSE4x4PS and SWAP are not defined in the visible part of the
; file (presumably they come from the included x86util/x86inc macros, where
; SWAP/PERMUTE rename registers at assembly time - confirm).
271 nop ; FIXME code alignment
; Reassign register numbers 9..14 according to the listed pairs.
277 PERMUTE
9,10, 10,12, 11,14, 12,9, 13,11, 14,13
; Transpose the 4x4 float block held in m8..m11 (last argument 0, presumably
; the temporary register), then run BUTTERFLY3V on the same registers.
278 TRANSPOSE4x4PS
8, 9, 10, 11, 0
279 BUTTERFLY3V
8, 9, 10, 11, 0
; Same transpose + BUTTERFLY3V for the block held in m12..m15.
281 TRANSPOSE4x4PS
12, 13, 14, 15, 0
282 BUTTERFLY3V
12, 13, 14, 15, 0
; Scalar stores: the low float of m8..m14 goes to out+0x00, +0x10, ... +0x60
; (every fourth float of the output).
292 movss
[outq
+0x00], m8
294 movss
[outq
+0x10], m9
296 movss
[outq
+0x20], m10
298 movss
[outq
+0x30], m11
300 movss
[outq
+0x40], m12
302 movss
[outq
+0x50], m13
304 movss
[outq
+0x60], m14
; NOTE(review): unlike its neighbours this is a full 16-byte aligned store of
; m15 covering out+0x70..0x7f, not a single float - presumably intentional;
; confirm against the surrounding (not visible) code.
306 movaps
[outq
+0x70], m15
; Scalar stores of the low float of m0..m7 to out+0x08, +0x18, ... +0x78.
; The store to +0x78 lands inside the 16 bytes written by the movaps above.
310 movss
[outq
+0x08], m0
312 movss
[outq
+0x18], m1
314 movss
[outq
+0x28], m2
316 movss
[outq
+0x38], m3
318 movss
[outq
+0x48], m4
320 movss
[outq
+0x58], m5
321 movss
[outq
+0x68], m6
322 movss
[outq
+0x78], m7
; Reassign register numbers according to the listed pairs.
324 PERMUTE
1,8, 3,9, 5,10, 7,11, 9,12, 11,13, 13,14, 8,1, 10,3, 12,5, 14,7
; Rotate the names of the even-numbered and of the odd-numbered registers.
; This pair appears twice; the instructions around and between the two
; occurrences are not visible in this excerpt.
327 SWAP
0, 2, 4, 6, 8, 10, 12, 14
328 SWAP
1, 3, 5, 7, 9, 11, 13, 15
333 SWAP
0, 2, 4, 6, 8, 10, 12, 14
334 SWAP
1, 3, 5, 7, 9, 11, 13, 15
; Rotate the names of all sixteen registers.
340 SWAP
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
; SPILL reg, mempos: store register m<reg> with an aligned 16-byte move to
; out + (mempos-8)*16, i.e. the output buffer is used as the save area and
; mempos 8 maps to offset 0.
; NOTE(review): the %endmacro of SPILL and the %macro line of the reload
; counterpart below are not visible in this excerpt.
346 %macro SPILL
2 ; xmm#, mempos
347 movaps
[outq
+(%2-8)*16], m
%1
; Reverse operation: reload m<%1> from the same slot, out + (%2-8)*16.
350 movaps m
%1, [outq
+(%2-8)*16]
; In this configuration PASS6 is an alias for PASS6_AND_PERMUTE.
353 %define PASS6 PASS6_AND_PERMUTE
; NOTE(review): fragment of a macro body; the enclosing %macro / %endmacro
; lines and the instructions between the invocations below (where the
; operand registers are presumably saved and reloaded) are not visible in
; this excerpt. Only registers m0..m7 are used here.
;
; Load the coefficient row at ps_cos_vec+160 (1, 0.707107, 1, -0.707107).
355 movaps m2
, [ps_cos_vec
+160]
; Series of BUTTERFLY3 invocations that all use m3 as second operand and the
; coefficients in m2 as third operand; the first and fourth operands vary.
358 BUTTERFLY3 m5
, m3
, m2
, m1
362 BUTTERFLY3 m1
, m3
, m2
, m5
365 BUTTERFLY3 m4
, m3
, m2
, m5
368 BUTTERFLY3 m7
, m3
, m2
, m5
372 BUTTERFLY3 m5
, m3
, m2
, m7
376 BUTTERFLY3 m4
, m3
, m2
, m7
379 BUTTERFLY3 m6
, m3
, m2
, m7
382 BUTTERFLY3 m0
, m3
, m2
, m7
; SSE entry point of the 32-point float DCT.
; NOTE(review): this excerpt omits many instructions of the routine (the
; plain loads feeding the first operand of each butterfly, register moves
; between stages) and ends before the routine does. LOAD_INV, BUTTERFLY and
; BUTTERFLY2 are macros whose full definitions are not visible here.
388 ; void ff_dct32_float_sse(FFTSample *out, const FFTSample *in)
; cglobal (x86inc): 2 arguments, 3 general-purpose registers, 16 vector
; registers; the named registers are out, in and tmp.
390 cglobal dct32_float
, 2, 3, 16, out, in, tmp
; LOAD_INV fills m1 from the 16 bytes at in+112 (presumably with the four
; floats in reversed order - confirm against the macro).
394 LOAD_INV m1
, [inq
+112]
; Butterfly with the coefficient row at ps_cos_vec+0; m3 is the fourth
; operand in every BUTTERFLY call of this routine.
395 BUTTERFLY m0
, m1
, [ps_cos_vec
], m3
; Same pattern for in+48 with the coefficient row at ps_cos_vec+32.
398 LOAD_INV m4
, [inq
+48]
399 BUTTERFLY m7
, m4
, [ps_cos_vec
+32], m3
; Keep the coefficient row at ps_cos_vec+64 in m2 for reuse.
402 movaps m2
, [ps_cos_vec
+64]
403 BUTTERFLY m1
, m4
, m2
, m3
; Second group of inputs: in+96 with row +16, in+32 with row +48.
409 LOAD_INV m6
, [inq
+96]
410 BUTTERFLY m1
, m6
, [ps_cos_vec
+16], m3
413 LOAD_INV m5
, [inq
+32]
414 BUTTERFLY m4
, m5
, [ps_cos_vec
+48], m3
; m2 still holds the row loaded from ps_cos_vec+64 unless an instruction not
; visible here changed it.
417 BUTTERFLY m0
, m7
, m2
, m3
; Switch m2 to the coefficient row at ps_cos_vec+80.
419 movaps m2
, [ps_cos_vec
+80]
420 BUTTERFLY m6
, m5
, m2
, m3
422 BUTTERFLY m1
, m4
, m2
, m3
; Switch m2 to the coefficient row at ps_cos_vec+96; the next four
; butterflies all use it.
425 movaps m2
, [ps_cos_vec
+96]
427 BUTTERFLY m0
, m1
, m2
, m3
433 BUTTERFLY m0
, m5
, m2
, m3
437 BUTTERFLY m1
, m6
, m2
, m3
441 BUTTERFLY m7
, m4
, m2
, m3
; Load the sign mask (0, 0, sign, sign) into m3 and the coefficient row at
; ps_cos_vec+128 into m2 for the BUTTERFLY2 stage.
444 movaps m3
, [ps_p1p1m1m1
+0]
445 movaps m2
, [ps_cos_vec
+128]
; Eight BUTTERFLY2 invocations with mask m3, coefficients m2 and m1 as fourth
; operand. m0 and m6 appear more than once as first operand, so their
; contents are presumably exchanged by instructions not visible here.
447 BUTTERFLY2 m5
, m3
, m2
, m1
449 BUTTERFLY2 m0
, m3
, m2
, m1
452 BUTTERFLY2 m6
, m3
, m2
, m1
456 BUTTERFLY2 m0
, m3
, m2
, m1
459 BUTTERFLY2 m4
, m3
, m2
, m1
461 BUTTERFLY2 m7
, m3
, m2
, m1
464 BUTTERFLY2 m6
, m3
, m2
, m1
467 BUTTERFLY2 m0
, m3
, m2
, m1