1 ;******************************************************************************
2 ;* FFT transform with SSE/3DNow optimizations
3 ;* Copyright (c) 2008 Loren Merritt
5 ;* This file is part of FFmpeg.
7 ;* FFmpeg is free software; you can redistribute it and/or
8 ;* modify it under the terms of the GNU Lesser General Public
9 ;* License as published by the Free Software Foundation; either
10 ;* version 2.1 of the License, or (at your option) any later version.
12 ;* FFmpeg is distributed in the hope that it will be useful,
13 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
14 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 ;* Lesser General Public License for more details.
17 ;* You should have received a copy of the GNU Lesser General Public
18 ;* License along with FFmpeg; if not, write to the Free Software
19 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20 ;******************************************************************************
22 ; These functions are not individually interchangeable with the C versions.
23 ; While C takes arrays of FFTComplex, SSE/3DNow leave intermediate results
24 ; in blocks as convenient to the vector size.
25 ; i.e. {4x real, 4x imaginary, 4x real, ...} (or 2x respectively)
; 1/sqrt(2) — twiddle factor for the radix-8 stage
%define M_SQRT1_2 0.70710678118654752440

ps_root2:     times 4 dd M_SQRT1_2
; sign pattern {-,+,+,-} * 1/sqrt(2); the "mppm" suffix names that order
ps_root2mppm: dd -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2
section .text align=16
; radix-2 butterfly, 3DNow! registers
; NOTE(review): the macro body and %endmacro are not visible in this chunk
%macro T2_3DN 4 ; z0, z1, mem0, mem1
; radix-4 butterfly, 3DNow! registers
; NOTE(review): some interior lines and %endmacro are missing from this chunk
%macro T4_3DN 6 ; z0, z1, z2, z3, tmp0, tmp1
    pfadd    %5, %4                   ; {t6,t5}
    pxor     %3, [ps_m1p1 GLOBAL]     ; {t8,t7}
    pfadd    %1, %5                   ; {r0,i0}
    pfsub    %6, %5                   ; {r2,i2}
    pfadd    %2, %3                   ; {r1,i1}
    pfsub    %4, %3                   ; {r3,i3}
; SSE radix-4 butterfly + deinterleave (macro header not visible in this chunk)
; in:  %1={r0,i0,r1,i1} %2={r2,i2,r3,i3}
; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3}
    shufps   %1, %2, 0x64             ; {r0,i0,r3,i2}
    shufps   %3, %2, 0xce             ; {r1,i1,r2,i3}
    addps    %1, %3                   ; {t1,t2,t6,t5}
    subps    %2, %3                   ; {t3,t4,t8,t7}
    shufps   %1, %2, 0x44             ; {t1,t2,t3,t4}
    shufps   %3, %2, 0xbe             ; {t6,t5,t7,t8}
    addps    %1, %3                   ; {r0,i0,r1,i1}
    subps    %2, %3                   ; {r2,i2,r3,i3}
    shufps   %1, %2, 0x88             ; {r0,r1,r2,r3}
    shufps   %3, %2, 0xdd             ; {i0,i1,i2,i3}
; SSE radix-8 butterfly; uses ps_root2/ps_root2mppm for the 1/sqrt(2) twiddles
; NOTE(review): some interior lines and %endmacro are missing from this chunk
%macro T8_SSE 6 ; r0,i0,r1,i1,t0,t1
    shufps   %3, %4, 0x44             ; {r4,i4,r6,i6}
    shufps   %5, %4, 0xee             ; {r5,i5,r7,i7}
    subps    %3, %5                   ; {r5,i5,r7,i7}
    addps    %6, %5                   ; {t1,t2,t3,t4}
    shufps   %5, %5, 0xb1             ; {i5,r5,i7,r7}
    mulps    %3, [ps_root2mppm GLOBAL] ; {-r5,i5,r7,-i7}
    mulps    %5, [ps_root2 GLOBAL]
    addps    %3, %5                   ; {t8,t7,ta,t9}
    shufps   %6, %3, 0x36             ; {t3,t2,t9,t8}
    shufps   %5, %3, 0x9c             ; {t1,t4,t7,ta}
    addps    %6, %5                   ; {t1,t2,t9,ta}
    subps    %3, %5                   ; {t6,t5,tc,tb}
    shufps   %6, %3, 0xd8             ; {t1,t9,t5,tb}
    shufps   %5, %3, 0x8d             ; {t2,ta,t6,tc}
    addps    %1, %6                   ; {r0,r1,r2,r3}
    addps    %2, %5                   ; {i0,i1,i2,i3}
    subps    %3, %6                   ; {r4,r5,r6,r7}
    subps    %4, %5                   ; {i4,i5,i6,i7}
; complex multiply by twiddle factors (wre/wim), scheduled for cpu-bound sizes
; NOTE(review): some interior lines and %endmacro are missing from this chunk
%macro PASS_SMALL 3 ; (to load m4-m7), wre, wim
    mulps    m2, m0                   ; r2*wre
    mulps    m3, m1                   ; i2*wim
    mulps    m4, m1                   ; r2*wim
    mulps    m5, m0                   ; i2*wre
    addps    m2, m3                   ; r2*wre + i2*wim
    mulps    m1, m6                   ; r3*wim
    subps    m5, m4                   ; i2*wre - r2*wim
    mulps    m3, m7                   ; i3*wim
    mulps    m4, m6                   ; r3*wre
    mulps    m0, m7                   ; i3*wre
    subps    m4, m3                   ; r3*wre - i3*wim
    addps    m0, m1                   ; i3*wre + r3*wim
; same complex-multiply pass, scheduled to avoid store->load aliasing
; NOTE(review): some interior lines and %endmacro are missing from this chunk
%macro PASS_BIG 1 ; (!interleave)
    mova     m1, [wq+o1q]             ; wim
    mulps    m2, m0                   ; r2*wre
    mulps    m3, m1                   ; i2*wim
    mulps    m4, m1                   ; r2*wim
    mulps    m5, m0                   ; i2*wre
    addps    m2, m3                   ; r2*wre + i2*wim
    mulps    m1, m6                   ; r3*wim
    subps    m5, m4                   ; i2*wre - r2*wim
    mulps    m3, m7                   ; i3*wim
    mulps    m4, m6                   ; r3*wre
    mulps    m0, m7                   ; i3*wre
    subps    m4, m3                   ; r3*wre - i3*wim
    addps    m0, m1                   ; i3*wre + r3*wim
; Z(x): the x'th mmsize-wide vector in the work buffer addressed by r0
%define Z(x) [r0+mmsize*x]
; butterfly invocations from the small SSE transforms (enclosing function
; bodies are not fully visible in this chunk)
    T8_SSE   m0, m1, m2, m3, m4, m5
    T8_SSE   m0, m1, m2, m3, m4, m5
    PASS_SMALL 0, [ff_cos_16 GLOBAL], [ff_cos_16+16 GLOBAL]
; butterfly invocations from the small 3DNow! transforms (enclosing function
; bodies are not fully visible in this chunk)
    T2_3DN   m0, m1, Z(0), Z(1)
    T4_3DN   m0, m1, m2, m3, m4, m5
    T2_3DN   m0, m1, Z(0), Z(1)
    T4_3DN   m0, m1, m2, m3, m4, m5
    T2_3DN   m4, m5, Z(4), Z(5)
    T2_3DN   m6, m7, Z(6), Z(7)
    pxor     m0, [ps_m1p1 GLOBAL]
    pxor     m2, [ps_m1p1 GLOBAL]
    pfmul    m5, [ps_root2 GLOBAL]
    pfmul    m7, [ps_root2 GLOBAL]
    T4_3DN   m1, m3, m5, m7, m0, m2
    T4_3DN   m0, m2, m4, m6, m5, m7
; Z(x): strided addressing into the pass work buffer at zq, using the
; o1q/o3q offsets and x's bit pattern to select the quadrant and vector
%define Z(x) [zq + o1q*(x&6)*((x/6)^1) + o3q*(x/6) + mmsize*(x&1)]
; declares one pass function around the given payload macro
; NOTE(review): the remainder of the macro body is not visible in this chunk
%macro DECL_PASS 2+ ; name, payload
DEFINE_ARGS z, w, n, o1, o3
DECL_PASS pass_sse, PASS_BIG 1
DECL_PASS pass_interleave_sse, PASS_BIG 0

; map the SSE unpack names onto their MMX equivalents for the 3DNow! passes
%define unpcklps punpckldq
%define unpckhps punpckhdq
DECL_PASS pass_3dn, PASS_SMALL 1, [wq], [wq+o1q]
DECL_PASS pass_interleave_3dn, PASS_BIG 0
; 3DNow!2 reuses the 3DNow! pass bodies
%define pass_3dn2 pass_3dn
%define pass_interleave_3dn2 pass_interleave_3dn
; builds the list of fft functions for the dispatch table and the pointer
; arithmetic selecting the sub-transform buffers
; NOTE(review): some interior lines and %endmacro are missing from this chunk
%macro DECL_FFT 2-3 ; nbits, cpu, suffix
%xdefine list_of_fft fft4%2, fft8%2
%xdefine list_of_fft list_of_fft, fft16%2
%xdefine list_of_fft list_of_fft, fft %+ n %+ %3%2
    add      r0, n*4 - (n&(-2<<%1))
    add      r0, n*2 - (n2&(-2<<%1))
    sub      r0, n*6 + (n2&(-2<<%1))
    lea      r1, [ff_cos_ %+ n GLOBAL]
; NOTE(review): the body of this %ifidn and surrounding lines are missing
%ifidn __OUTPUT_FORMAT__,macho64
dispatch_tab%3%2: pointer list_of_fft

; On x86_32, this function does the register saving and restoring for all of fft.
; The others pass args in registers and don't spill anything.
cglobal fft_dispatch%3%2, 2,5,0, z, nbits
    lea      r2, [dispatch_tab%3%2 GLOBAL]
    mov      r2, [r2 + (nbitsq-2)*gprsize] ; select fft by transform size
DECL_FFT 5, _sse, _interleave
DECL_FFT 4, _3dn, _interleave
DECL_FFT 4, _3dn2, _interleave