lavfi: switch to AVFrame.
[FFMpeg-mirror/mplayer-patches.git] / libavcodec / x86 / imdct36.asm
blob633fcd9d5947101657a0c7610ca4452319e9ac62
1 ;******************************************************************************
2 ;* 36 point SSE-optimized IMDCT transform
3 ;* Copyright (c) 2011 Vitor Sessak
4 ;*
5 ;* This file is part of Libav.
6 ;*
7 ;* Libav is free software; you can redistribute it and/or
8 ;* modify it under the terms of the GNU Lesser General Public
9 ;* License as published by the Free Software Foundation; either
10 ;* version 2.1 of the License, or (at your option) any later version.
12 ;* Libav is distributed in the hope that it will be useful,
13 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
14 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 ;* Lesser General Public License for more details.
17 ;* You should have received a copy of the GNU Lesser General Public
18 ;* License along with Libav; if not, write to the Free Software
19 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20 ;******************************************************************************
22 %include "libavutil/x86/x86util.asm"
; Read-only constants shared by imdct36_float and four_imdct36_float.
24 SECTION_RODATA
26 align 16
; Lane masks for andps: a ~0 dword keeps the lane, a 0 dword clears it.
27 ps_mask: dd 0, ~0, ~0, ~0 ; clear lane 0 only
28 ps_mask2: dd 0, ~0, 0, ~0 ; keep the odd lanes (1 and 3)
29 ps_mask3: dd 0, 0, 0, ~0 ; keep lane 3 only
30 ps_mask4: dd 0, ~0, 0, 0 ; keep lane 1 only
; Packed multipliers used by imdct36_float while building tmp[].
; Their magnitudes are 0.5, 1.0 and cos(k*pi/18) values.
32 ps_val1: dd -0.5, -0.5, -0.8660254038, -0.8660254038
33 ps_val2: dd 1.0, 1.0, 0.8660254038, 0.8660254038
34 ps_val3: dd 0.1736481777, 0.1736481777, 0.3420201433, 0.3420201433
35 ps_val4: dd -0.7660444431, -0.7660444431, 0.8660254038, 0.8660254038
36 ps_val5: dd -0.9396926208, -0.9396926208, -0.9848077530, -0.9848077530
37 ps_val6: dd 0.5, 0.5, -0.6427876097, -0.6427876097
38 ps_val7: dd 1.0, 1.0, -0.6427876097, -0.6427876097
; Sign-bit masks for xorps: 0x80000000 flips the sign of that lane.
40 ps_p1p1m1m1: dd 0, 0, 0x80000000, 0x80000000 ; negate lanes 2 and 3
41 ps_p1m1p1m1: dd 0, 0x80000000, 0, 0x80000000 ; negate lanes 1 and 3
; Five 16-byte rows. BUTTERF addresses rows 0..3 through its byte offset
; (0/16/32/48); imdct36_float reads the last row as [ps_cosh + 64].
; The non-unit entries equal 0.5/cos((2i+1)*pi/36) up to sign.
43 ps_cosh: dd 1.0, 0.50190991877167369479, 1.0, 5.73685662283492756461
44 dd 1.0, 0.51763809020504152469, 1.0, 1.93185165257813657349
45 dd 1.0, 0.55168895948124587824, -1.0, -1.18310079157624925896
46 dd 1.0, 0.61038729438072803416, -1.0, -0.87172339781054900991
47 dd 1.0, 0.70710678118654752439, 0.0, 0.0
; Same layout as ps_cosh with the odd-lane entries of the first four rows
; negated; used by the addsubps (SSE3) path of BUTTERF. The last row is
; identical to the last row of ps_cosh.
49 ps_cosh_sse3: dd 1.0, -0.50190991877167369479, 1.0, -5.73685662283492756461
50 dd 1.0, -0.51763809020504152469, 1.0, -1.93185165257813657349
51 dd 1.0, -0.55168895948124587824, -1.0, 1.18310079157624925896
52 dd 1.0, -0.61038729438072803416, -1.0, 0.87172339781054900991
53 dd 1.0, 0.70710678118654752439, 0.0, 0.0
; Constants replicated across all four lanes, one 16-byte row each;
; four_imdct36_float addresses row k as [costabs + 16*k].
55 costabs: times 4 dd 0.98480773 ; row 0
56 times 4 dd 0.93969262 ; row 1
57 times 4 dd 0.86602539 ; row 2
58 times 4 dd -0.76604444 ; row 3
59 times 4 dd -0.64278764 ; row 4
60 times 4 dd 0.50000000 ; row 5
61 times 4 dd -0.50000000 ; row 6
62 times 4 dd -0.34202015 ; row 7
63 times 4 dd -0.17364818 ; row 8
64 times 4 dd 0.50190992 ; row 9
65 times 4 dd 0.51763808 ; row 10
66 times 4 dd 0.55168896 ; row 11
67 times 4 dd 0.61038726 ; row 12
68 times 4 dd 0.70710677 ; row 13
69 times 4 dd 0.87172341 ; row 14
70 times 4 dd 1.18310082 ; row 15
71 times 4 dd 1.93185163 ; row 16
72 times 4 dd 5.73685646 ; row 17
; SBLIMIT scales the outq offsets in imdct36_float: consecutive output
; samples are written SBLIMIT floats apart.
; NOTE(review): presumably the MPEG audio subband count - confirm against the C side.
74 %define SBLIMIT 32
75 SECTION_TEXT
; PSHUFD dst, src, imm8
; dst = the four dwords of src permuted according to imm8.
; Uses pshufd when SSE2 is enabled and AVX is not; otherwise falls back to
; shufps with src given as both sources, which selects the same dwords.
77 %macro PSHUFD 3
78 %if cpuflag(sse2) && notcpuflag(avx)
79 pshufd %1, %2, %3
80 %else
81 shufps %1, %2, %2, %3
82 %endif
83 %endmacro
; BUILDINVHIGHLOW dst, x, y
; Concatenates the high half of x with the low half of y.
85 ; input %2={x1,x2,x3,x4}, %3={y1,y2,y3,y4}
86 ; output %1={x3,x4,y1,y2}
; In the non-AVX path %1 is written by movlhps before %2 is read by movhlps,
; so %1 must not be the same register as %2 there.
87 %macro BUILDINVHIGHLOW 3
88 %if cpuflag(avx)
89 shufps %1, %2, %3, 0x4e ; lanes {%2[2], %2[3], %3[0], %3[1]}
90 %else
91 movlhps %1, %3 ; high half of %1 = low half of %3
92 movhlps %1, %2 ; low half of %1 = high half of %2
93 %endif
94 %endmacro
; ROTLEFT dst, x, y
; Takes the last element of x followed by the first three elements of y.
96 ; input %2={x1,x2,x3,x4}, %3={y1,y2,y3,y4}
97 ; output %1={x4,y1,y2,y3}
98 %macro ROTLEFT 3
99 %if cpuflag(ssse3)
100 palignr %1, %3, %2, 12 ; bytes 12..27 of the %3:%2 concatenation
101 %else
102 BUILDINVHIGHLOW %1, %2, %3 ; %1 = {x3,x4,y1,y2}
103 shufps %1, %1, %3, 0x99 ; lanes {%1[1], %1[2], %3[1], %3[2]}
104 %endif
105 %endmacro
; INVERTHL dst, src
; dst = src with its 64-bit halves swapped: {x1,x2,x3,x4} -> {x3,x4,x1,x2}.
; The plain-SSE fallback writes dst half by half while reading src, so it
; only gives this result when dst and src are different registers.
107 %macro INVERTHL 2
108 %if cpuflag(sse2)
109 PSHUFD %1, %2, 0x4e
110 %else
111 movhlps %1, %2 ; low half of dst = high half of src
112 movlhps %1, %2 ; high half of dst = low half of src
113 %endif
114 %endmacro
; BUTTERF data, scratch, offset
; %1 = register transformed in place, %2 = scratch register (clobbered),
; %3 = byte offset of the coefficient row in ps_cosh / ps_cosh_sse3.
; With %1 = {x1,x2,x3,x4} on entry:
;   stage 1: %1 = {x1+x3, x2+x4, x1-x3, x2-x4}
;   stage 2: %1 is scaled lane-wise by the coefficient row, giving
;            {a0,a1,a2,a3}, and becomes {a0+a1, a0-a1, a2+a3, a2-a3}.
; The SSE3 path gets the same result from addsubps because ps_cosh_sse3
; stores the odd-lane coefficients with the opposite sign.
116 %macro BUTTERF 3
117 INVERTHL %2, %1
118 xorps %1, [ps_p1p1m1m1] ; negate the two upper lanes
119 addps %1, %2
120 %if cpuflag(sse3)
121 mulps %1, %1, [ps_cosh_sse3 + %3]
122 PSHUFD %2, %1, 0xb1 ; swap the lanes of each adjacent pair
123 addsubps %1, %1, %2 ; even lanes subtract, odd lanes add
124 %else
125 mulps %1, [ps_cosh + %3]
126 PSHUFD %2, %1, 0xb1 ; swap the lanes of each adjacent pair
127 xorps %1, [ps_p1m1p1m1] ; negate the odd lanes
128 addps %1, %2
129 %endif
130 %endmacro
; STORE data, scratch, base, stride
; Scatters the four floats of %1 to [%3], [%3 + %4], [%3 + 2*%4] and
; [%3 + 3*%4] (lane i goes to base + i*stride, stride in bytes).
; Clobbers %2 and leaves %1 with its adjacent lanes swapped.
132 %macro STORE 4
133 movhlps %2, %1 ; %2[0] = lane 2
134 movss [%3 ], %1
135 movss [%3 + 2*%4], %2
136 shufps %1, %1, 0xb1 ; %1 = {lane1, lane0, lane3, lane2}
137 movss [%3 + %4], %1
138 movhlps %2, %1 ; %2[0] = original lane 3
139 movss [%3 + 3*%4], %2
140 %endmacro
; LOAD dst, scratch, base, stride
; Gathers four floats into %1: lane i = the float at base + i*stride
; (stride in bytes). %2 is clobbered.
; Each access is an 8-byte load, so the float following each gathered
; element is read as well and then discarded by the final shufps.
142 %macro LOAD 4
143 movlps %1, [%3 ]
144 movhps %1, [%3 + %4]
145 movlps %2, [%3 + 2*%4]
146 movhps %2, [%3 + 3*%4]
147 shufps %1, %2, 0x88 ; keep lanes {%1[0], %1[2], %2[0], %2[2]}
148 %endmacro
; LOADA64 dst, addr
; Loads the 16 bytes at addr into dst without using an aligned 16-byte load:
; a single movu with AVX, otherwise two 8-byte loads (low half, high half).
; NOTE(review): the name suggests addr is only guaranteed 8-byte (64-bit)
; aligned - confirm against the callers.
150 %macro LOADA64 2
151 %if cpuflag(avx)
152 movu %1, [%2]
153 %else
154 movlps %1, [%2]
155 movhps %1, [%2 + 8]
156 %endif
157 %endmacro
; DEFINE_IMDCT
; Emits imdct36_float(out, buf, in, win) for the instruction set selected by
; the preceding INIT_XMM.
;   in  : 18 input floats, read only (16 via LOADA64 plus 2 at in + 64);
;         the in[] updates quoted in the comments below happen in registers.
;   buf : 18 overlap values spaced 4 floats (16 bytes) apart; read for the
;         overlap-add and then overwritten with the new overlap values.
;   win : window coefficients, read as packed rows [win] .. [win + 36*4].
;   out : 18 results written SBLIMIT floats apart.
; Declares 4 arguments, 4 GPRs and 9 XMM registers; m8 is only touched when
; ARCH_X86_64 is set.
159 %macro DEFINE_IMDCT 0
160 cglobal imdct36_float, 4,4,9, out, buf, in, win
162 ; for(i=17;i>=1;i--) in[i] += in[i-1];
163 LOADA64 m0, inq
164 LOADA64 m1, inq + 16
166 ROTLEFT m5, m0, m1
168 PSHUFD m6, m0, 0x93
169 andps m6, m6, [ps_mask]
170 addps m0, m0, m6
172 LOADA64 m2, inq + 32
174 ROTLEFT m7, m1, m2
176 addps m1, m1, m5
177 LOADA64 m3, inq + 48
179 ROTLEFT m5, m2, m3
181 xorps m4, m4, m4
182 movlps m4, [inq+64] ; m4 = {in[16], in[17], 0, 0}
183 BUILDINVHIGHLOW m6, m3, m4
184 shufps m6, m6, m4, 0xa9
186 addps m4, m4, m6
187 addps m2, m2, m7
188 addps m3, m3, m5
190 ; for(i=17;i>=3;i-=2) in[i] += in[i-2];
191 movlhps m5, m5, m0
192 andps m5, m5, [ps_mask3]
194 BUILDINVHIGHLOW m7, m0, m1
195 andps m7, m7, [ps_mask2]
197 addps m0, m0, m5
199 BUILDINVHIGHLOW m6, m1, m2
200 andps m6, m6, [ps_mask2]
202 addps m1, m1, m7
204 BUILDINVHIGHLOW m7, m2, m3
205 andps m7, m7, [ps_mask2]
207 addps m2, m2, m6
209 movhlps m6, m6, m3
210 andps m6, m6, [ps_mask4]
212 addps m3, m3, m7
213 addps m4, m4, m6
215 ; Populate tmp[]
216 movlhps m6, m1, m5 ; zero out high values
217 subps m6, m6, m4
219 subps m5, m0, m3
; x86-64: park m0 - m3 in m8 so it can be reused below.
; x86-32: no m8 is used, so the difference is recomputed after the multiplies.
221 %if ARCH_X86_64
222 SWAP m5, m8
223 %endif
225 mulps m7, m2, [ps_val1]
227 %if ARCH_X86_64
228 mulps m5, m8, [ps_val2]
229 %else
230 mulps m5, m5, [ps_val2]
231 %endif
232 addps m7, m7, m5
234 mulps m5, m6, [ps_val1]
235 subps m7, m7, m5
237 %if ARCH_X86_64
238 SWAP m5, m8
239 %else
240 subps m5, m0, m3
241 %endif
243 subps m5, m5, m6
244 addps m5, m5, m2
246 shufps m6, m4, m3, 0xe4
247 subps m6, m6, m2
248 mulps m6, m6, [ps_val3]
250 addps m4, m4, m1
251 mulps m4, m4, [ps_val4]
253 shufps m1, m1, m0, 0xe4
254 addps m1, m1, m2
255 mulps m1, m1, [ps_val5]
257 mulps m3, m3, [ps_val6]
258 mulps m0, m0, [ps_val7]
259 addps m0, m0, m3
261 xorps m2, m1, [ps_p1p1m1m1]
262 subps m2, m2, m4
263 addps m2, m2, m0
265 addps m3, m4, m0
266 subps m3, m3, m6
267 xorps m3, m3, [ps_p1p1m1m1]
269 shufps m0, m0, m4, 0xe4
270 subps m0, m0, m1
271 addps m0, m0, m6
273 BUILDINVHIGHLOW m4, m2, m3
274 shufps m3, m3, m2, 0x4e
276 ; we have tmp = {SWAPLH(m0), SWAPLH(m7), m3, m4, m5}
; (SWAPLH = low and high halves exchanged)
278 BUTTERF m0, m1, 0
279 BUTTERF m7, m2, 16
280 BUTTERF m3, m6, 32
281 BUTTERF m4, m1, 48
; Last row: only the low two lanes of m5 carry data (see the table below),
; so the final butterfly is done inline with the [ps_cosh + 64] row.
283 mulps m5, m5, [ps_cosh + 64]
284 PSHUFD m1, m5, 0xe1
285 xorps m5, m5, [ps_p1m1p1m1]
286 addps m5, m5, m1
288 ; permutation (source register and indices => resulting indices and register):
289 ; m0 0 1 2 3 => 2 6 10 14 m1
290 ; m7 4 5 6 7 => 3 7 11 15 m2
291 ; m3 8 9 10 11 => 17 13 9 5 m3
292 ; m4 12 13 14 15 => 16 12 8 4 m5
293 ; m5 16 17 xx xx => 0 1 xx xx m0
295 unpckhps m1, m0, m7
296 unpckhps m6, m3, m4
297 movhlps m2, m6, m1
298 movlhps m1, m1, m6
300 unpcklps m5, m5, m4
301 unpcklps m3, m3, m7
302 movhlps m4, m3, m5
303 movlhps m5, m5, m3
304 SWAP m4, m3
305 ; permutation done
; Overlap-add: each group below multiplies by a window row, adds the values
; gathered from buf and scatters the result to out.
307 PSHUFD m6, m2, 0xb1
308 movss m4, [bufq + 4*68]
309 movss m7, [bufq + 4*64]
310 unpcklps m7, m7, m4
311 mulps m6, m6, [winq + 16*4]
312 addps m6, m6, m7
313 movss [outq + 64*SBLIMIT], m6
314 shufps m6, m6, m6, 0xb1
315 movss [outq + 68*SBLIMIT], m6
317 mulps m6, m3, [winq + 4*4]
318 LOAD m4, m7, bufq + 4*16, 16
319 addps m6, m6, m4
320 STORE m6, m7, outq + 16*SBLIMIT, 4*SBLIMIT
322 shufps m4, m0, m3, 0xb5
323 mulps m4, m4, [winq + 8*4]
324 LOAD m7, m6, bufq + 4*32, 16
325 addps m4, m4, m7
326 STORE m4, m6, outq + 32*SBLIMIT, 4*SBLIMIT
328 shufps m3, m3, m2, 0xb1
329 mulps m3, m3, [winq + 12*4]
330 LOAD m7, m6, bufq + 4*48, 16
331 addps m3, m3, m7
332 STORE m3, m7, outq + 48*SBLIMIT, 4*SBLIMIT
334 mulps m2, m2, [winq]
335 LOAD m6, m7, bufq, 16
336 addps m2, m2, m6
337 STORE m2, m7, outq, 4*SBLIMIT
; New overlap: windowed values are scattered back into buf (no addition).
339 mulps m4, m1, [winq + 20*4]
340 STORE m4, m7, bufq, 16
342 mulps m3, m5, [winq + 24*4]
343 STORE m3, m7, bufq + 4*16, 16
345 shufps m0, m0, m5, 0xb0
346 mulps m0, m0, [winq + 28*4]
347 STORE m0, m7, bufq + 4*32, 16
349 shufps m5, m5, m1, 0xb1
350 mulps m5, m5, [winq + 32*4]
351 STORE m5, m7, bufq + 4*48, 16
353 shufps m1, m1, m1, 0xb1
354 mulps m1, m1, [winq + 36*4]
355 movss [bufq + 4*64], m1
356 shufps m1, m1, 0xb1
357 movss [bufq + 4*68], m1
; Return to the caller; without this the function would run on into
; whatever is assembled after it.
358 RET
359 %endmacro
; Instantiate imdct36_float once per instruction set. INIT_XMM selects the
; cpuflags that the %if cpuflag(...) tests inside the macros above evaluate.
361 INIT_XMM sse
362 DEFINE_IMDCT
364 INIT_XMM sse2
365 DEFINE_IMDCT
367 INIT_XMM sse3
368 DEFINE_IMDCT
370 INIT_XMM ssse3
371 DEFINE_IMDCT
373 INIT_XMM avx
374 DEFINE_IMDCT
; Register-spill helpers for four_imdct36_float, which uses register numbers
; 8..15 as extra storage.
;   x86-64: SPILL/UNSPILL are plain SWAPs of register names and SPILLED(x)
;           is the register m<x> itself.
;   x86-32: SPILLED(x) is a 16-byte memory slot at tmpq + 32*4 + (x-8)*16;
;           SPILL stores m<%1> to slot %2 and UNSPILL loads it back, both
;           with movaps.
376 INIT_XMM sse
378 %if ARCH_X86_64
379 %define SPILL SWAP
380 %define UNSPILL SWAP
381 %define SPILLED(x) m %+ x
382 %else
383 %define SPILLED(x) [tmpq+(x-8)*16 + 32*4]
384 %macro SPILL 2 ; xmm#, mempos
385 movaps SPILLED(%2), m%1
386 %endmacro
387 %macro UNSPILL 2 ; xmm#, mempos
388 movaps m%1, SPILLED(%2)
389 %endmacro
390 %endif
; DEFINE_FOUR_IMDCT
; Emits four_imdct36_float(out, buf, in, win, tmp) for the instruction set
; selected by the preceding INIT_XMM. Four input blocks are processed
; together, one block per SIMD lane.
;   in  : four blocks located 72 bytes (18 floats) apart; read only.
;   buf : overlap storage, one 16-byte row per sample; rows are added into
;         the results written to out and then overwritten.
;   win : window coefficients, read in 16-byte rows.
;   out : results, one 16-byte row every 128 bytes.
;   tmp : scratch; rows tmp .. tmp + 4*28 + 15 are always used, and on
;         x86-32 the SPILLED() slots at tmp + 128 .. tmp + 255 as well.
; mova is used on in, out, buf and tmp, so those rows must be 16-byte aligned.
; Declares 5 arguments, 5 GPRs and 16 XMM registers.
392 %macro DEFINE_FOUR_IMDCT 0
393 cglobal four_imdct36_float, 5,5,16, out, buf, in, win, tmp
; in[16] and in[17] of the four blocks
394 movlps m0, [inq+64]
395 movhps m0, [inq+64 + 72]
396 movlps m3, [inq+64 + 2*72]
397 movhps m3, [inq+64 + 3*72]
399 shufps m5, m0, m3, 0xdd ; m5 = in[17] of blocks 0..3
400 shufps m0, m0, m3, 0x88 ; m0 = in[16] of blocks 0..3
; in[12..15] of each block; blocks 1 and 3 start at an offset that is not a
; multiple of 16, hence the unaligned loads for them.
402 mova m1, [inq+48]
403 movu m6, [inq+48 + 72]
404 mova m7, [inq+48 + 2*72]
405 movu m3, [inq+48 + 3*72]
; NOTE(review): TRANSPOSE4x4PS comes from x86util.asm; presumably it leaves
; one coefficient index per register with one block per lane - confirm.
407 TRANSPOSE4x4PS 1, 6, 7, 3, 4
409 addps m4, m6, m7
410 mova [tmpq+4*28], m4
412 addps m7, m3
413 addps m6, m1
414 addps m3, m0
415 addps m0, m5
416 addps m0, m7
417 addps m7, m6
418 mova [tmpq+4*12], m7
419 SPILL 3, 12
; in[8..11] of each block
421 mova m4, [inq+32]
422 movu m5, [inq+32 + 72]
423 mova m2, [inq+32 + 2*72]
424 movu m7, [inq+32 + 3*72]
426 TRANSPOSE4x4PS 4, 5, 2, 7, 3
428 addps m1, m7
429 SPILL 1, 11
431 addps m3, m5, m2
432 SPILL 3, 13
434 addps m7, m2
435 addps m5, m4
436 addps m6, m7
437 mova [tmpq], m6
438 addps m7, m5
439 mova [tmpq+4*16], m7
; in[4..7] of each block
441 mova m2, [inq+16]
442 movu m7, [inq+16 + 72]
443 mova m1, [inq+16 + 2*72]
444 movu m6, [inq+16 + 3*72]
446 TRANSPOSE4x4PS 2, 7, 1, 6, 3
448 addps m4, m6
449 addps m6, m1
450 addps m1, m7
451 addps m7, m2
452 addps m5, m6
453 SPILL 5, 15
454 addps m6, m7
455 mulps m6, [costabs + 16*2]
456 mova [tmpq+4*8], m6
457 SPILL 1, 10
458 SPILL 0, 14
; in[0..3] of each block
460 mova m1, [inq]
461 movu m6, [inq + 72]
462 mova m3, [inq + 2*72]
463 movu m5, [inq + 3*72]
465 TRANSPOSE4x4PS 1, 6, 3, 5, 0
467 addps m2, m5
468 addps m5, m3
469 addps m7, m5
470 addps m3, m6
471 addps m6, m1
472 SPILL 7, 8
473 addps m5, m6
474 SPILL 6, 9
; Combine the sums with the costabs rows; intermediate results are kept in
; registers, in the SPILLED() slots and in the tmp rows.
475 addps m6, m4, SPILLED(12)
476 subps m6, m2
477 UNSPILL 7, 11
478 SPILL 5, 11
479 subps m5, m1, m7
480 mulps m7, [costabs + 16*5]
481 addps m7, m1
482 mulps m0, m6, [costabs + 16*6]
483 addps m0, m5
484 mova [tmpq+4*24], m0
485 addps m6, m5
486 mova [tmpq+4*4], m6
487 addps m6, m4, m2
488 mulps m6, [costabs + 16*1]
489 subps m4, SPILLED(12)
490 mulps m4, [costabs + 16*8]
491 addps m2, SPILLED(12)
492 mulps m2, [costabs + 16*3]
493 subps m5, m7, m6
494 subps m5, m2
495 addps m6, m7
496 addps m6, m4
497 addps m7, m2
498 subps m7, m4
499 mova [tmpq+4*20], m7
500 mova m2, [tmpq+4*28]
501 mova [tmpq+4*28], m5
502 UNSPILL 7, 13
503 subps m5, m7, m2
504 mulps m5, [costabs + 16*7]
505 UNSPILL 1, 10
506 mulps m1, [costabs + 16*2]
507 addps m4, m3, m2
508 mulps m4, [costabs + 16*4]
509 addps m2, m7
510 addps m7, m3
511 mulps m7, [costabs]
512 subps m3, m2
513 mulps m3, [costabs + 16*2]
514 addps m2, m7, m5
515 addps m2, m1
516 SPILL 2, 10
517 addps m7, m4
518 subps m7, m1
519 SPILL 7, 12
520 subps m5, m4
521 subps m5, m1
522 UNSPILL 0, 14
523 SPILL 5, 13
524 addps m1, m0, SPILLED(15)
525 subps m1, SPILLED(8)
526 mova m4, [costabs + 16*5]
527 mulps m4, [tmpq]
528 UNSPILL 2, 9
529 addps m4, m2
530 subps m2, [tmpq]
531 mulps m5, m1, [costabs + 16*6]
532 addps m5, m2
533 SPILL 5, 9
534 addps m2, m1
535 SPILL 2, 14
536 UNSPILL 5, 15
537 subps m7, m5, m0
538 addps m5, SPILLED(8)
539 mulps m5, [costabs + 16*1]
540 mulps m7, [costabs + 16*8]
541 addps m0, SPILLED(8)
542 mulps m0, [costabs + 16*3]
543 subps m2, m4, m5
544 subps m2, m0
545 SPILL 2, 15
546 addps m5, m4
547 addps m5, m7
548 addps m4, m0
549 subps m4, m7
550 SPILL 4, 8
551 mova m7, [tmpq+4*16]
552 mova m2, [tmpq+4*12]
553 addps m0, m7, m2
554 subps m0, SPILLED(11)
555 mulps m0, [costabs + 16*2]
556 addps m4, m7, SPILLED(11)
557 mulps m4, [costabs]
558 subps m7, m2
559 mulps m7, [costabs + 16*7]
560 addps m2, SPILLED(11)
561 mulps m2, [costabs + 16*4]
562 addps m1, m7, [tmpq+4*8]
563 addps m1, m4
564 addps m4, m2
565 subps m4, [tmpq+4*8]
566 SPILL 4, 11
567 subps m7, m2
568 subps m7, [tmpq+4*8]
; Output stage. Each group below forms a sum and a difference of two
; intermediate values (one of them scaled by a costabs row 9..17), then:
;   out row = value * win row + buf row
;   buf row = value * win row           (overwrites the row just read)
569 addps m4, m6, SPILLED(10)
570 subps m6, SPILLED(10)
571 addps m2, m5, m1
572 mulps m2, [costabs + 16*9]
573 subps m5, m1
574 mulps m5, [costabs + 16*17]
575 subps m1, m4, m2
576 addps m4, m2
577 mulps m2, m1, [winq+4*36]
578 addps m2, [bufq+4*36]
579 mova [outq+1152], m2
580 mulps m1, [winq+4*32]
581 addps m1, [bufq+4*32]
582 mova [outq+1024], m1
583 mulps m1, m4, [winq+4*116]
584 mova [bufq+4*36], m1
585 mulps m4, [winq+4*112]
586 mova [bufq+4*32], m4
587 addps m2, m6, m5
588 subps m6, m5
589 mulps m1, m6, [winq+4*68]
590 addps m1, [bufq+4*68]
591 mova [outq+2176], m1
592 mulps m6, [winq]
593 addps m6, [bufq]
594 mova [outq], m6
595 mulps m1, m2, [winq+4*148]
596 mova [bufq+4*68], m1
597 mulps m2, [winq+4*80]
598 mova [bufq], m2
599 addps m5, m3, [tmpq+4*24]
600 mova m2, [tmpq+4*24]
601 subps m2, m3
602 mova m1, SPILLED(9)
603 subps m1, m0
604 mulps m1, [costabs + 16*10]
605 addps m0, SPILLED(9)
606 mulps m0, [costabs + 16*16]
607 addps m6, m5, m1
608 subps m5, m1
609 mulps m3, m5, [winq+4*40]
610 addps m3, [bufq+4*40]
611 mova [outq+1280], m3
612 mulps m5, [winq+4*28]
613 addps m5, [bufq+4*28]
614 mova [outq+896], m5
615 mulps m1, m6, [winq+4*120]
616 mova [bufq+4*40], m1
617 mulps m6, [winq+4*108]
618 mova [bufq+4*28], m6
619 addps m1, m2, m0
620 subps m2, m0
621 mulps m5, m2, [winq+4*64]
622 addps m5, [bufq+4*64]
623 mova [outq+2048], m5
624 mulps m2, [winq+4*4]
625 addps m2, [bufq+4*4]
626 mova [outq+128], m2
627 mulps m0, m1, [winq+4*144]
628 mova [bufq+4*64], m0
629 mulps m1, [winq+4*84]
630 mova [bufq+4*4], m1
631 mova m1, [tmpq+4*28]
632 mova m5, m1
633 addps m1, SPILLED(13)
634 subps m5, SPILLED(13)
635 UNSPILL 3, 15
636 addps m2, m7, m3
637 mulps m2, [costabs + 16*11]
638 subps m3, m7
639 mulps m3, [costabs + 16*15]
640 addps m0, m2, m1
641 subps m1, m2
642 SWAP m0, m2
643 mulps m6, m1, [winq+4*44]
644 addps m6, [bufq+4*44]
645 mova [outq+1408], m6
646 mulps m1, [winq+4*24]
647 addps m1, [bufq+4*24]
648 mova [outq+768], m1
649 mulps m0, m2, [winq+4*124]
650 mova [bufq+4*44], m0
651 mulps m2, [winq+4*104]
652 mova [bufq+4*24], m2
653 addps m0, m5, m3
654 subps m5, m3
655 mulps m1, m5, [winq+4*60]
656 addps m1, [bufq+4*60]
657 mova [outq+1920], m1
658 mulps m5, [winq+4*8]
659 addps m5, [bufq+4*8]
660 mova [outq+256], m5
661 mulps m1, m0, [winq+4*140]
662 mova [bufq+4*60], m1
663 mulps m0, [winq+4*88]
664 mova [bufq+4*8], m0
665 mova m1, [tmpq+4*20]
666 addps m1, SPILLED(12)
667 mova m2, [tmpq+4*20]
668 subps m2, SPILLED(12)
669 UNSPILL 7, 8
670 subps m0, m7, SPILLED(11)
671 addps m7, SPILLED(11)
672 mulps m4, m7, [costabs + 16*12]
673 mulps m0, [costabs + 16*14]
674 addps m5, m1, m4
675 subps m1, m4
676 mulps m7, m1, [winq+4*48]
677 addps m7, [bufq+4*48]
678 mova [outq+1536], m7
679 mulps m1, [winq+4*20]
680 addps m1, [bufq+4*20]
681 mova [outq+640], m1
682 mulps m1, m5, [winq+4*128]
683 mova [bufq+4*48], m1
684 mulps m5, [winq+4*100]
685 mova [bufq+4*20], m5
686 addps m6, m2, m0
687 subps m2, m0
688 mulps m1, m2, [winq+4*56]
689 addps m1, [bufq+4*56]
690 mova [outq+1792], m1
691 mulps m2, [winq+4*12]
692 addps m2, [bufq+4*12]
693 mova [outq+384], m2
694 mulps m0, m6, [winq+4*136]
695 mova [bufq+4*56], m0
696 mulps m6, [winq+4*92]
697 mova [bufq+4*12], m6
698 UNSPILL 0, 14
699 mulps m0, [costabs + 16*13]
700 mova m3, [tmpq+4*4]
701 addps m2, m0, m3
702 subps m3, m0
703 mulps m0, m3, [winq+4*52]
704 addps m0, [bufq+4*52]
705 mova [outq+1664], m0
706 mulps m3, [winq+4*16]
707 addps m3, [bufq+4*16]
708 mova [outq+512], m3
709 mulps m0, m2, [winq+4*132]
710 mova [bufq+4*52], m0
711 mulps m2, [winq+4*96]
712 mova [bufq+4*16], m2
; Return to the caller; without this the function would run on into
; whatever is assembled after it.
713 RET
714 %endmacro
; Instantiate four_imdct36_float for SSE and for AVX.
716 INIT_XMM sse
717 DEFINE_FOUR_IMDCT
719 INIT_XMM avx
720 DEFINE_FOUR_IMDCT