1 ;******************************************************************************
2 ;* 36 point SSE-optimized IMDCT transform
3 ;* Copyright (c) 2011 Vitor Sessak
5 ;* This file is part of Libav.
7 ;* Libav is free software; you can redistribute it and/or
8 ;* modify it under the terms of the GNU Lesser General Public
9 ;* License as published by the Free Software Foundation; either
10 ;* version 2.1 of the License, or (at your option) any later version.
12 ;* Libav is distributed in the hope that it will be useful,
13 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
14 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 ;* Lesser General Public License for more details.
17 ;* You should have received a copy of the GNU Lesser General Public
18 ;* License along with Libav; if not, write to the Free Software
19 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20 ;******************************************************************************
22 %include "libavutil/x86/x86util.asm"
; Lane-select masks, used as the memory operand of andps.
; A dword of ~0 (all bits set) keeps that 32-bit lane, a dword of 0 clears it.
;                lane 0  lane 1  lane 2  lane 3
ps_mask:      dd      0,     ~0,     ~0,     ~0     ; clear lane 0, keep lanes 1-3
ps_mask2:     dd      0,     ~0,      0,     ~0     ; keep lanes 1 and 3
ps_mask3:     dd      0,      0,      0,     ~0     ; keep lane 3 only
ps_mask4:     dd      0,     ~0,      0,      0     ; keep lane 1 only
; Packed single-precision multiplier rows, one 16-byte row per label, consumed
; by mulps. Every row has the shape {a, a, b, b}: one factor for the low lane
; pair and one for the high lane pair.
;                 lanes 0-1 (a, a)              lanes 2-3 (b, b)
ps_val1:      dd -0.5,          -0.5,          -0.8660254038, -0.8660254038
ps_val2:      dd  1.0,           1.0,           0.8660254038,  0.8660254038
ps_val3:      dd  0.1736481777,  0.1736481777,  0.3420201433,  0.3420201433
ps_val4:      dd -0.7660444431, -0.7660444431,  0.8660254038,  0.8660254038
ps_val5:      dd -0.9396926208, -0.9396926208, -0.9848077530, -0.9848077530
ps_val6:      dd  0.5,           0.5,          -0.6427876097, -0.6427876097
ps_val7:      dd  1.0,           1.0,          -0.6427876097, -0.6427876097
; Sign-flip patterns, used as the memory operand of xorps. 0x80000000 is the
; sign bit of a single-precision float, so xor-ing with it negates that lane;
; a 0 dword leaves the lane unchanged.
;                lane 0      lane 1      lane 2      lane 3
ps_p1p1m1m1:  dd 0,          0,          0x80000000, 0x80000000   ; negate lanes 2 and 3
ps_p1m1p1m1:  dd 0,          0x80000000, 0,          0x80000000   ; negate lanes 1 and 3
; Five 16-byte rows of packed single-precision factors, addressed by byte
; offset from the label (row k is at [ps_cosh + 16*k]; the code also reads the
; last row directly as [ps_cosh + 64]).
ps_cosh:      dd  1.0,  0.50190991877167369479,  1.0,  5.73685662283492756461
              dd  1.0,  0.51763809020504152469,  1.0,  1.93185165257813657349
              dd  1.0,  0.55168895948124587824, -1.0, -1.18310079157624925896
              dd  1.0,  0.61038729438072803416, -1.0, -0.87172339781054900991
              dd  1.0,  0.70710678118654752439,  0.0,  0.0

; Same layout and magnitudes as ps_cosh. In rows 0-3 lanes 1 and 3 carry the
; opposite sign; row 4 is identical to the ps_cosh row.
; NOTE(review): presumably this folds the ps_p1m1p1m1 sign flip into the
; multiply for the code path that uses this table - confirm against the macro
; that selects between the two tables.
ps_cosh_sse3: dd  1.0, -0.50190991877167369479,  1.0, -5.73685662283492756461
              dd  1.0, -0.51763809020504152469,  1.0, -1.93185165257813657349
              dd  1.0, -0.55168895948124587824, -1.0,  1.18310079157624925896
              dd  1.0, -0.61038729438072803416, -1.0,  0.87172339781054900991
              dd  1.0,  0.70710678118654752439,  0.0,  0.0
; costabs: table of scalar constants, each replicated into all four lanes of a
; 16-byte row by `times 4 dd`, so row k is read as [costabs + 16*k].
; NOTE(review): only six rows are visible in this view and the embedded line
; numbering skips (55 -> 58, 59 -> 61), yet the four-way transform reads rows
; up to [costabs + 16*17]. Confirm the full table (at least 18 rows) is
; present in the real file before relying on the row indices below.
55 costabs: times
4 dd 0.98480773
58 times
4 dd -0.76604444
59 times
4 dd -0.64278764
61 times
4 dd -0.50000000
62 times
4 dd -0.34202015
63 times
4 dd -0.17364818
78 %if cpuflag
(sse2
) && notcpuflag
(avx
)
85 ; input %2={x1,x2,x3,x4}, %3={y1,y2,y3,y4}
86 ; output %1={x3,x4,y1,y2}
87 %macro BUILDINVHIGHLOW
3
89 shufps
%1, %2, %3, 0x4e
96 ; input %2={x1,x2,x3,x4}, %3={y1,y2,y3,y4}
97 ; output %1={x4,y1,y2,y3}
100 palignr
%1, %3, %2, 12
102 BUILDINVHIGHLOW
%1, %2, %3
103 shufps
%1, %1, %3, 0x99
118 xorps
%1, [ps_p1p1m1m1
]
121 mulps
%1, %1, [ps_cosh_sse3
+ %3]
125 mulps
%1, [ps_cosh
+ %3]
127 xorps
%1, [ps_p1m1p1m1
]
135 movss
[%3 + 2*%4], %2
139 movss
[%3 + 3*%4], %2
145 movlps
%2, [%3 + 2*%4]
146 movhps
%2, [%3 + 3*%4]
159 %macro DEFINE_IMDCT
0
160 cglobal imdct36_float
, 4,4,9, out, buf
, in, win
162 ; for(i=17;i>=1;i--) in[i] += in[i-1];
169 andps m6
, m6
, [ps_mask
]
183 BUILDINVHIGHLOW m6
, m3
, m4
184 shufps m6
, m6
, m4
, 0xa9
190 ; for(i=17;i>=3;i-=2) in[i] += in[i-2];
192 andps m5
, m5
, [ps_mask3
]
194 BUILDINVHIGHLOW m7
, m0
, m1
195 andps m7
, m7
, [ps_mask2
]
199 BUILDINVHIGHLOW m6
, m1
, m2
200 andps m6
, m6
, [ps_mask2
]
204 BUILDINVHIGHLOW m7
, m2
, m3
205 andps m7
, m7
, [ps_mask2
]
210 andps m6
, m6
, [ps_mask4
]
216 movlhps m6
, m1
, m5
; zero out high values
225 mulps m7
, m2
, [ps_val1
]
228 mulps m5
, m8
, [ps_val2
]
230 mulps m5
, m5
, [ps_val2
]
234 mulps m5
, m6
, [ps_val1
]
246 shufps m6
, m4
, m3
, 0xe4
248 mulps m6
, m6
, [ps_val3
]
251 mulps m4
, m4
, [ps_val4
]
253 shufps m1
, m1
, m0
, 0xe4
255 mulps m1
, m1
, [ps_val5
]
257 mulps m3
, m3
, [ps_val6
]
258 mulps m0
, m0
, [ps_val7
]
261 xorps m2
, m1
, [ps_p1p1m1m1
]
267 xorps m3
, m3
, [ps_p1p1m1m1
]
269 shufps m0
, m0
, m4
, 0xe4
273 BUILDINVHIGHLOW m4
, m2
, m3
274 shufps m3
, m3
, m2
, 0x4e
276 ; we have tmp = {SWAPLH(m0), SWAPLH(m7), m3, m4, m5}
283 mulps m5
, m5
, [ps_cosh
+ 64]
285 xorps m5
, m5
, [ps_p1m1p1m1
]
289 ; m0 0 1 2 3 => 2 6 10 14 m1
290 ; m7 4 5 6 7 => 3 7 11 15 m2
291 ; m3 8 9 10 11 => 17 13 9 5 m3
292 ; m4 12 13 14 15 => 16 12 8 4 m5
293 ; m5 16 17 xx xx => 0 1 xx xx m0
308 movss m4
, [bufq
+ 4*68]
309 movss m7
, [bufq
+ 4*64]
311 mulps m6
, m6
, [winq
+ 16*4]
313 movss
[outq
+ 64*SBLIMIT
], m6
314 shufps m6
, m6
, m6
, 0xb1
315 movss
[outq
+ 68*SBLIMIT
], m6
317 mulps m6
, m3
, [winq
+ 4*4]
318 LOAD m4
, m7
, bufq
+ 4*16, 16
320 STORE m6
, m7
, outq
+ 16*SBLIMIT
, 4*SBLIMIT
322 shufps m4
, m0
, m3
, 0xb5
323 mulps m4
, m4
, [winq
+ 8*4]
324 LOAD m7
, m6
, bufq
+ 4*32, 16
326 STORE m4
, m6
, outq
+ 32*SBLIMIT
, 4*SBLIMIT
328 shufps m3
, m3
, m2
, 0xb1
329 mulps m3
, m3
, [winq
+ 12*4]
330 LOAD m7
, m6
, bufq
+ 4*48, 16
332 STORE m3
, m7
, outq
+ 48*SBLIMIT
, 4*SBLIMIT
335 LOAD m6
, m7
, bufq
, 16
337 STORE m2
, m7
, outq
, 4*SBLIMIT
339 mulps m4
, m1
, [winq
+ 20*4]
340 STORE m4
, m7
, bufq
, 16
342 mulps m3
, m5
, [winq
+ 24*4]
343 STORE m3
, m7
, bufq
+ 4*16, 16
345 shufps m0
, m0
, m5
, 0xb0
346 mulps m0
, m0
, [winq
+ 28*4]
347 STORE m0
, m7
, bufq
+ 4*32, 16
349 shufps m5
, m5
, m1
, 0xb1
350 mulps m5
, m5
, [winq
+ 32*4]
351 STORE m5
, m7
, bufq
+ 4*48, 16
353 shufps m1
, m1
, m1
, 0xb1
354 mulps m1
, m1
, [winq
+ 36*4]
355 movss
[bufq
+ 4*64], m1
357 movss
[bufq
+ 4*68], m1
; SPILLED(x) names the storage that holds logical vector register x.
;   Register form: expands to m<x> (the register name is built with %+).
;   Memory form:   expands to a 16-byte slot addressed from tmpq; x = 8 maps to
;                  [tmpq + 32*4], and each following x is 16 bytes further on.
; NOTE(review): the two definitions are adjacent in this view; presumably a
; preprocessor conditional that is not visible here selects exactly one of
; them - confirm in the full file.
%define SPILLED(x) m %+ x
%define SPILLED(x) [tmpq+(x-8)*16 + 32*4]
384 %macro SPILL
2 ; xmm#, mempos
385 movaps SPILLED
(%2), m
%1
388 movaps m
%1, SPILLED
(%2)
392 %macro DEFINE_FOUR_IMDCT
0
393 cglobal four_imdct36_float
, 5,5,16, out, buf
, in, win
, tmp
395 movhps m0
, [inq
+64 + 72]
396 movlps m3
, [inq
+64 + 2*72]
397 movhps m3
, [inq
+64 + 3*72]
399 shufps m5
, m0
, m3
, 0xdd
400 shufps m0
, m0
, m3
, 0x88
403 movu m6
, [inq
+48 + 72]
404 mova m7
, [inq
+48 + 2*72]
405 movu m3
, [inq
+48 + 3*72]
407 TRANSPOSE4x4PS
1, 6, 7, 3, 4
422 movu m5
, [inq
+32 + 72]
423 mova m2
, [inq
+32 + 2*72]
424 movu m7
, [inq
+32 + 3*72]
426 TRANSPOSE4x4PS
4, 5, 2, 7, 3
442 movu m7
, [inq
+16 + 72]
443 mova m1
, [inq
+16 + 2*72]
444 movu m6
, [inq
+16 + 3*72]
446 TRANSPOSE4x4PS
2, 7, 1, 6, 3
455 mulps m6
, [costabs
+ 16*2]
462 mova m3
, [inq
+ 2*72]
463 movu m5
, [inq
+ 3*72]
465 TRANSPOSE4x4PS
1, 6, 3, 5, 0
475 addps m6
, m4
, SPILLED
(12)
480 mulps m7
, [costabs
+ 16*5]
482 mulps m0
, m6
, [costabs
+ 16*6]
488 mulps m6
, [costabs
+ 16*1]
489 subps m4
, SPILLED
(12)
490 mulps m4
, [costabs
+ 16*8]
491 addps m2
, SPILLED
(12)
492 mulps m2
, [costabs
+ 16*3]
504 mulps m5
, [costabs
+ 16*7]
506 mulps m1
, [costabs
+ 16*2]
508 mulps m4
, [costabs
+ 16*4]
513 mulps m3
, [costabs
+ 16*2]
524 addps m1
, m0
, SPILLED
(15)
526 mova m4
, [costabs
+ 16*5]
531 mulps m5
, m1
, [costabs
+ 16*6]
539 mulps m5
, [costabs
+ 16*1]
540 mulps m7
, [costabs
+ 16*8]
542 mulps m0
, [costabs
+ 16*3]
554 subps m0
, SPILLED
(11)
555 mulps m0
, [costabs
+ 16*2]
556 addps m4
, m7
, SPILLED
(11)
559 mulps m7
, [costabs
+ 16*7]
560 addps m2
, SPILLED
(11)
561 mulps m2
, [costabs
+ 16*4]
562 addps m1
, m7
, [tmpq
+4*8]
569 addps m4
, m6
, SPILLED
(10)
570 subps m6
, SPILLED
(10)
572 mulps m2
, [costabs
+ 16*9]
574 mulps m5
, [costabs
+ 16*17]
577 mulps m2
, m1
, [winq
+4*36]
578 addps m2
, [bufq
+4*36]
580 mulps m1
, [winq
+4*32]
581 addps m1
, [bufq
+4*32]
583 mulps m1
, m4
, [winq
+4*116]
585 mulps m4
, [winq
+4*112]
589 mulps m1
, m6
, [winq
+4*68]
590 addps m1
, [bufq
+4*68]
595 mulps m1
, m2
, [winq
+4*148]
597 mulps m2
, [winq
+4*80]
599 addps m5
, m3
, [tmpq
+4*24]
604 mulps m1
, [costabs
+ 16*10]
606 mulps m0
, [costabs
+ 16*16]
609 mulps m3
, m5
, [winq
+4*40]
610 addps m3
, [bufq
+4*40]
612 mulps m5
, [winq
+4*28]
613 addps m5
, [bufq
+4*28]
615 mulps m1
, m6
, [winq
+4*120]
617 mulps m6
, [winq
+4*108]
621 mulps m5
, m2
, [winq
+4*64]
622 addps m5
, [bufq
+4*64]
627 mulps m0
, m1
, [winq
+4*144]
629 mulps m1
, [winq
+4*84]
633 addps m1
, SPILLED
(13)
634 subps m5
, SPILLED
(13)
637 mulps m2
, [costabs
+ 16*11]
639 mulps m3
, [costabs
+ 16*15]
643 mulps m6
, m1
, [winq
+4*44]
644 addps m6
, [bufq
+4*44]
646 mulps m1
, [winq
+4*24]
647 addps m1
, [bufq
+4*24]
649 mulps m0
, m2
, [winq
+4*124]
651 mulps m2
, [winq
+4*104]
655 mulps m1
, m5
, [winq
+4*60]
656 addps m1
, [bufq
+4*60]
661 mulps m1
, m0
, [winq
+4*140]
663 mulps m0
, [winq
+4*88]
666 addps m1
, SPILLED
(12)
668 subps m2
, SPILLED
(12)
670 subps m0
, m7
, SPILLED
(11)
671 addps m7
, SPILLED
(11)
672 mulps m4
, m7
, [costabs
+ 16*12]
673 mulps m0
, [costabs
+ 16*14]
676 mulps m7
, m1
, [winq
+4*48]
677 addps m7
, [bufq
+4*48]
679 mulps m1
, [winq
+4*20]
680 addps m1
, [bufq
+4*20]
682 mulps m1
, m5
, [winq
+4*128]
684 mulps m5
, [winq
+4*100]
688 mulps m1
, m2
, [winq
+4*56]
689 addps m1
, [bufq
+4*56]
691 mulps m2
, [winq
+4*12]
692 addps m2
, [bufq
+4*12]
694 mulps m0
, m6
, [winq
+4*136]
696 mulps m6
, [winq
+4*92]
699 mulps m0
, [costabs
+ 16*13]
703 mulps m0
, m3
, [winq
+4*52]
704 addps m0
, [bufq
+4*52]
706 mulps m3
, [winq
+4*16]
707 addps m3
, [bufq
+4*16]
709 mulps m0
, m2
, [winq
+4*132]
711 mulps m2
, [winq
+4*96]