;******************************************************************************
;* VP9 inverse transform x86 SIMD optimizations
;*
;* Copyright (C) 2015 Ronald S. Bultje <rsbultje gmail com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"
%include "vp9itxfm_template.asm"
pd_3fff: times 4 dd 0x3fff

cextern pw_15212_m13377
cextern pw_m5283_m15212
cextern pw_m13377_13377

pw_9929_m5283: times 4 dw 9929, -5283

COEF_PAIR  3196, 16069, 1
COEF_PAIR  6270, 15137, 1
COEF_PAIR 10394, 12665
COEF_PAIR 11003, 12140
COEF_PAIR 11585, 11585, 1
COEF_PAIR 13623,  9102, 1
%macro VP9_STORE_2X 6-7 dstq ; reg1, reg2, tmp1, tmp2, min, max, dst
    mova m%4, [%7+strideq]
    mova [%7+strideq], m%4

%macro ZERO_BLOCK 4 ; mem, stride, nnzcpl, zero_reg
    mova [%1+%%y+%%x], %4
%assign %%x (%%x+mmsize)
; the input coefficients are scaled up by 2 bits (which we downscale immediately
; in the iwht), and are otherwise orthonormally increased by 1 bit per iwht_1d.
; therefore, a diff of 10-12+sign bits will fit in 12-14+sign bits after scaling,
; i.e. everything can be done in 15+1bpp words. Since the quant fractional bits
; add 2 bits, we need to scale before converting to words in 12bpp, since the
; input will be 16+sign bits, which doesn't fit in 15+sign words, but in 10bpp
; we can scale after converting to words (which is half the instructions),
; since the input is only 14+sign bits, which fits in 15+sign words directly.
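;
; worked example (for illustration only): at 10bpp the dequantized input is at
; most 14+sign bits, i.e. |coef| < 2^14 = 16384, which already fits in a signed
; 16-bit word, so we can pack to words first and downscale afterwards; at 12bpp
; it is up to 16+sign bits, i.e. up to 2^16 = 65536 > 32767, so the >>2
; downscale has to happen on dwords before packing to words.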
%macro IWHT4_FN 2 ; bpp, max
cglobal vp9_iwht_iwht_4x4_add_%1, 3, 3, 8, dst, stride, block, eob
    mova m0, [blockq+0*16+0]
    mova m1, [blockq+1*16+0]
    mova m4, [blockq+0*16+8]
    mova m5, [blockq+1*16+8]
    packssdw m0, [blockq+0*16+8]
    packssdw m1, [blockq+1*16+8]
    mova m2, [blockq+2*16+0]
    mova m3, [blockq+3*16+0]
    mova m4, [blockq+2*16+8]
    mova m5, [blockq+3*16+8]
    packssdw m2, [blockq+2*16+8]
    packssdw m3, [blockq+3*16+8]
    TRANSPOSE4x4W 0, 1, 2, 3, 4
    VP9_STORE_2X 0, 1, 4, 5, 6, 7
    lea dstq, [dstq+strideq*2]
    VP9_STORE_2X 2, 3, 4, 5, 6, 7
    ZERO_BLOCK blockq, 16, 4, m6

%macro VP9_IDCT4_WRITEOUT 0
    VP9_STORE_2X 0, 1, 6, 7, 4, 5
    lea dstq, [dstq+2*strideq]
    VP9_STORE_2X 2, 3, 6, 7, 4, 5
%macro DC_ONLY 2 ; shift, zero
    mov coefd, dword [blockq]
    add coefd, ((1 << (%1 - 1)) << 14) + 8192
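    ; note on the constant above: it folds both roundings together - 8192
    ; (1 << 13) rounds the >> 14 of the 11585 multiply, and (1 << (%1 - 1)) << 14
    ; is the store rounding pre-scaled by 2^14, so a single arithmetic shift by
    ; 14 + %1 afterwards applies both at once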
; 4x4 coefficients are 5+depth+sign bits, so for 10bpp, everything still fits
; in 15+1 words without additional effort, since the coefficients are 15bpp.
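;
; e.g. at 10bpp that is 5 + 10 = 15 magnitude bits plus sign, i.e. exactly the
; int16 range; at 12bpp the same bound is 17+sign bits, which is why the 12bpp
; 4x4 functions below operate on dwords instead.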
cglobal vp9_idct_idct_4x4_add_10, 4, 4, 8, dst, stride, block, eob
    mova m5, [pw_11585x2]
    DEFINE_ARGS dst, stride, block, coef
    pmulhrsw m0, [pw_2048] ; (x*2048 + (1<<14))>>15 <=> (x+8)>>4
    VP9_STORE_2X 0, 0, 6, 7, 4, 5
    lea dstq, [dstq+2*strideq]
    VP9_STORE_2X 0, 0, 6, 7, 4, 5

    mova m0, [blockq+0*16+0]
    mova m1, [blockq+1*16+0]
    packssdw m0, [blockq+0*16+8]
    packssdw m1, [blockq+1*16+8]
    mova m2, [blockq+2*16+0]
    mova m3, [blockq+3*16+0]
    packssdw m2, [blockq+2*16+8]
    packssdw m3, [blockq+3*16+8]
    mova m6, [pw_11585x2]
    mova m7, [pd_8192] ; rounding
    TRANSPOSE4x4W 0, 1, 2, 3, 4
    ZERO_BLOCK blockq, 16, 4, m4

cglobal vp9_%1_%3_4x4_add_10, 3, 3, 0, dst, stride, block, eob
%if WIN64 && notcpuflag(ssse3)
    movdqa xmm5, [pd_8192]
    mova m0, [blockq+0*16+0]
    mova m1, [blockq+1*16+0]
    packssdw m0, [blockq+0*16+8]
    packssdw m1, [blockq+1*16+8]
    mova m2, [blockq+2*16+0]
    mova m3, [blockq+3*16+0]
    packssdw m2, [blockq+2*16+8]
    packssdw m3, [blockq+3*16+8]
    mova m6, [pw_11585x2]
%ifnidn %1%3, iadstiadst
    TRANSPOSE4x4W 0, 1, 2, 3, 4
    ZERO_BLOCK blockq, 16, 4, m4

IADST4_FN idct,  IDCT4,  iadst, IADST4
IADST4_FN iadst, IADST4, idct,  IDCT4
IADST4_FN iadst, IADST4, iadst, IADST4

IADST4_FN idct,  IDCT4,  iadst, IADST4
IADST4_FN iadst, IADST4, idct,  IDCT4
IADST4_FN iadst, IADST4, iadst, IADST4
; inputs and outputs are dwords, coefficients are words
;
; dst1 = src1 * coef1 + src2 * coef2 + rnd >> 14
; dst2 = src1 * coef2 - src2 * coef1 + rnd >> 14
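;
; for reference: the coefficients are 2.14 fixed point, e.g. 11585 is
; round(cos(pi/4) * 16384) and 15137/6270 are round(cos/sin(pi/8) * 16384),
; and rnd defaults to pd_8192 = 1 << 13, so "+ rnd >> 14" is a round-to-nearest
; conversion of the 14 fractional bits back to integer precision.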
%macro SUMSUB_MUL 6-8 [pd_8192], [pd_3fff] ; src/dst 1-2, tmp1-2, coef1-2, rnd, mask
    punpckhwd m%2, m%4, m%3
    pmaddwd m%3, m%4, [pw_%6_%5]
    pmaddwd m%1, m%2, [pw_%6_%5]
    pmaddwd m%4, [pw_m%5_%6]
    pmaddwd m%2, [pw_m%5_%6]

%macro IDCT4_12BPP_1D 0-8 [pd_8192], [pd_3fff], 0, 1, 2, 3, 4, 5 ; rnd, mask, in/out0-3, tmp0-1
    SUMSUB_MUL %3, %5, %7, %8, 11585, 11585, %1, %2
    SUMSUB_MUL %4, %6, %7, %8, 15137, 6270, %1, %2
    SUMSUB_BA d, %4, %3, %7
    SUMSUB_BA d, %6, %5, %7

%macro STORE_4x4 6 ; tmp1-2, reg1-2, min, max
    movh   m%1, [dstq+strideq*0]
    movh   m%2, [dstq+strideq*2]
    movhps m%1, [dstq+strideq*1]
    movhps m%2, [dstq+stride3q]
    movh   [dstq+strideq*0], m%1
    movhps [dstq+strideq*1], m%1
    movh   [dstq+strideq*2], m%2
    movhps [dstq+stride3q], m%2

%macro ROUND_AND_STORE_4x4 8 ; reg1-4, min, max, rnd, shift
    STORE_4x4 %2, %4, %1, %3, %5, %6

cglobal vp9_idct_idct_4x4_add_12, 4, 4, 8, dst, stride, block, eob
    ; dc-only - this is special, since for 4x4 12bpp, the max coef size is
    ; 17+sign bpp. Since the multiply is with 11585, which is 14bpp, the
    ; result of each multiply is 31+sign bit, i.e. it _exactly_ fits in a
    ; dword. After the final shift (4), the result is 13+sign bits, so we
    ; don't need any additional processing to fit it in a word
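    ; (e.g. |coef| < 2^17 and 11585 < 2^14, so |coef * 11585| < 2^31, the
    ; largest magnitude a signed dword can hold)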
    DEFINE_ARGS dst, stride, block, coef
    pshuflw m0, m0, q0000
    DEFINE_ARGS dst, stride, stride3
    lea stride3q, [strideq*3]
    STORE_4x4 1, 3, 0, 0, m4, m5

    DEFINE_ARGS dst, stride, block, eob
    mova m0, [blockq+0*16]
    mova m1, [blockq+1*16]
    mova m2, [blockq+2*16]
    mova m3, [blockq+3*16]
    IDCT4_12BPP_1D m6, m7
    TRANSPOSE4x4D 0, 1, 2, 3, 4
    IDCT4_12BPP_1D m6, m7
    ZERO_BLOCK blockq, 16, 4, m4
    DEFINE_ARGS dst, stride, stride3
    lea stride3q, [strideq*3]
    ROUND_AND_STORE_4x4 0, 1, 2, 3, m4, m5, m6, 4
; out0 =  5283 * in0 + 13377 * in1 + 15212 * in2 +  9929 * in3 + rnd >> 14
; out1 =  9929 * in0 + 13377 * in1 -  5283 * in2 - 15212 * in3 + rnd >> 14
; out2 = 13377 * in0               - 13377 * in2 + 13377 * in3 + rnd >> 14
; out3 = 15212 * in0 - 13377 * in1 +  9929 * in2 -  5283 * in3 + rnd >> 14
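;
; for reference: 5283, 9929, 13377 and 15212 are the VP9 iadst4 constants
; round(16384 * 2*sqrt(2)/3 * sin(k*pi/9)) for k = 1..4; like the idct
; coefficients they are 2.14 fixed point, hence the "+ rnd >> 14".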
%macro IADST4_12BPP_1D 0-2 [pd_8192], [pd_3fff] ; rnd, mask
    SCRATCH 1, 8, rsp+0*mmsize, a
    SCRATCH 5, 9, rsp+1*mmsize, b

    ; m1/3 have the high bits of 0,1,2,3
    ; m4/5 have the low bits of 0,1,2,3

    mova m2, [pw_15212_9929]
    mova m0, [pw_5283_13377]
    pmaddwd m7, m2, reg_b
    mova m1, [pw_m13377_13377]
    mova m5, [pw_13377_0]
    pmaddwd m7, m1, reg_b
    mova m7, [pw_m5283_m15212]
    mova m5, [pw_9929_13377]
    pmaddwd m1, m7, reg_b
    UNSCRATCH 5, 9, rsp+1*mmsize, b
    pmaddwd m5, [pw_9929_m5283]
    pmaddwd m4, [pw_15212_m13377]
    pmaddwd m3, [pw_9929_m5283]
    UNSCRATCH 1, 8, rsp+0*mmsize, a
    pmaddwd m1, [pw_15212_m13377]

%macro IADST4_12BPP_FN 4
cglobal vp9_%1_%3_4x4_add_12, 3, 3, 12, 2 * ARCH_X86_32 * mmsize, dst, stride, block, eob
    mova m0, [blockq+0*16]
    mova m1, [blockq+1*16]
    mova m2, [blockq+2*16]
    mova m3, [blockq+3*16]
    PRELOAD 10, pd_8192, rnd
    PRELOAD 11, pd_3fff, mask
    %2_12BPP_1D reg_rnd, reg_mask
    TRANSPOSE4x4D 0, 1, 2, 3, 4
    %4_12BPP_1D reg_rnd, reg_mask
    ZERO_BLOCK blockq, 16, 4, m4
    DEFINE_ARGS dst, stride, stride3
    lea stride3q, [strideq*3]
    ROUND_AND_STORE_4x4 0, 1, 2, 3, m4, m5, m6, 4

IADST4_12BPP_FN idct,  IDCT4,  iadst, IADST4
IADST4_12BPP_FN iadst, IADST4, idct,  IDCT4
IADST4_12BPP_FN iadst, IADST4, iadst, IADST4
; the following line has not been executed at the end of this macro:
; UNSCRATCH 6, 8, rsp+%3*mmsize
%macro IDCT8_1D 1-5 [pd_8192], [pd_3fff], 2 * mmsize, 17 ; src, rnd, mask, src_stride, stack_offset
    IDCT4_12BPP_1D %2, %3, 0, 2, 4, 6, 1, 3 ; m0/2/4/6 have t0/1/2/3
    SCRATCH 4, 8, rsp+(%5+0)*mmsize
    SCRATCH 6, 9, rsp+(%5+1)*mmsize
    SUMSUB_MUL 1, 7, 4, 6, 16069, 3196, %2, %3 ; m1=t7a, m7=t4a
    SUMSUB_MUL 5, 3, 4, 6, 9102, 13623, %2, %3 ; m5=t6a, m3=t5a
    SUMSUB_BA d, 3, 7, 4 ; m3=t4, m7=t5a
    SUMSUB_BA d, 5, 1, 4 ; m5=t7, m1=t6a
    SUMSUB_MUL 1, 7, 4, 6, 11585, 11585, %2, %3 ; m1=t6, m7=t5
    SUMSUB_BA d, 5, 0, 4 ; m5=out0, m0=out7
    SUMSUB_BA d, 1, 2, 4 ; m1=out1, m2=out6
    UNSCRATCH 4, 8, rsp+(%5+0)*mmsize
    UNSCRATCH 6, 9, rsp+(%5+1)*mmsize
    SCRATCH 2, 8, rsp+(%5+0)*mmsize
    SUMSUB_BA d, 7, 4, 2 ; m7=out2, m4=out5
    SUMSUB_BA d, 3, 6, 2 ; m3=out3, m6=out4
    SWAP 0, 5, 4, 6, 2, 7

%macro STORE_2x8 5-7 dstq, strideq ; tmp1-2, reg, min, max
; FIXME on x86-32 we could also reuse the intermediate storage (rsp[0-15]) for
; temp storage instead of allocating two more stack slots. This doesn't matter
; much, but it's something...
cglobal vp9_idct_idct_8x8_add_10, 4, 6 + ARCH_X86_64, 14, \
                                  16 * mmsize + 3 * ARCH_X86_32 * mmsize, \
                                  dst, stride, block, eob
    ; dc-only - the 10bit version can be done entirely in 32bit, since the max
    ; coef values are 16+sign bit, and the coef is 14bit, so 30+sign easily
    DEFINE_ARGS dst, stride, block, coef
    pshuflw m1, m1, q0000
    DEFINE_ARGS dst, stride, cnt
    STORE_2x8 3, 4, 1, m2, m0
    lea dstq, [dstq+strideq*2]

    SCRATCH 0, 12, rsp+16*mmsize, max
    DEFINE_ARGS dst, stride, block, cnt, ptr, skip, dstbak
    lea ptrq, [default_8x8]
    movzx cntd, byte [ptrq+cntq-1]
    movzx cntd, byte [default_8x8+cntq-1]

    PRELOAD 10, pd_8192, rnd
    PRELOAD 11, pd_3fff, mask
    PRELOAD 13, pd_16, srnd
    IDCT8_1D blockq, reg_rnd, reg_mask
    TRANSPOSE4x4D 0, 1, 2, 3, 6
    mova [ptrq+ 0*mmsize], m0
    mova [ptrq+ 2*mmsize], m1
    mova [ptrq+ 4*mmsize], m2
    mova [ptrq+ 6*mmsize], m3
    UNSCRATCH 6, 8, rsp+17*mmsize
    TRANSPOSE4x4D 4, 5, 6, 7, 0
    mova [ptrq+ 1*mmsize], m4
    mova [ptrq+ 3*mmsize], m5
    mova [ptrq+ 5*mmsize], m6
    mova [ptrq+ 7*mmsize], m7

    ; zero-pad the remainder (skipped cols)
    lea blockq, [blockq+skipq*(mmsize/2)]
    mova [ptrq+mmsize*0], m0
    mova [ptrq+mmsize*1], m0
    mova [ptrq+mmsize*2], m0
    mova [ptrq+mmsize*3], m0

    DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak
    lea stride3q, [strideq*3]
    IDCT8_1D ptrq, reg_rnd, reg_mask
    ROUND_AND_STORE_4x4 0, 1, 2, 3, m6, reg_max, reg_srnd, 5
    lea dstq, [dstq+strideq*4]
    UNSCRATCH 0, 8, rsp+17*mmsize
    UNSCRATCH 1, 12, rsp+16*mmsize, max
    UNSCRATCH 2, 13, pd_16, srnd
    ROUND_AND_STORE_4x4 4, 5, 0, 7, m6, m1, m2, 5
    lea dstq, [dstbakq+8]
    ZERO_BLOCK blockq-2*mmsize, 32, 8, m6
%macro DC_ONLY_64BIT 2 ; shift, zero
    movsxd coefq, dword [blockq]
    add coefq, ((1 << (%1 - 1)) << 14) + 8192
    mov coefd, dword [blockq]
    DEFINE_ARGS dst, stride, cnt, coef, coefl
    add coefd, 1 << (%1 - 1)
cglobal vp9_idct_idct_8x8_add_12, 4, 6 + ARCH_X86_64, 14, \
                                  16 * mmsize + 3 * ARCH_X86_32 * mmsize, \
                                  dst, stride, block, eob
    jg mangle(private_prefix %+ _ %+ vp9_idct_idct_8x8_add_10 %+ SUFFIX).idctfull

    ; dc-only - unfortunately, this one can overflow, since coefs are 18+sign
    ; bpp, and 18+14+sign does not fit in 32bit, so we do 2-stage multiplies
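    ; (e.g. a dc coefficient of 2^18 multiplied by 11585 is already ~3.04e9,
    ; which is above the 2^31 - 1 limit of a signed dword)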
    DEFINE_ARGS dst, stride, block, coef, coefl
    pshuflw m1, m1, q0000
    DEFINE_ARGS dst, stride, cnt
    STORE_2x8 3, 4, 1, m2, m0
    lea dstq, [dstq+strideq*2]
; inputs and outputs are dwords, coefficients are words
;
; dst1[hi]:dst3[lo] = src1 * coef1 + src2 * coef2
; dst2[hi]:dst4[lo] = src1 * coef2 - src2 * coef1
%macro SUMSUB_MUL_D 6-7 [pd_3fff] ; src/dst 1-2, dst3-4, coef1-2, mask
    punpckhwd m%2, m%4, m%3
    pmaddwd m%3, m%4, [pw_%6_%5]
    pmaddwd m%1, m%2, [pw_%6_%5]
    pmaddwd m%4, [pw_m%5_%6]
    pmaddwd m%2, [pw_m%5_%6]
; dst1 = src2[hi]:src4[lo] + src1[hi]:src3[lo] + rnd >> 14
; dst2 = src2[hi]:src4[lo] - src1[hi]:src3[lo] + rnd >> 14
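;
; note: SUMSUB_MUL_D above deliberately leaves its products as unrounded hi:lo
; halves, so that this macro can add two such products together and apply the
; single "+ rnd >> 14" rounding to the combined result.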
%macro SUMSUB_PACK_D 5-6 [pd_8192] ; src/dst 1-2, src3-4, tmp, rnd
    SUMSUB_BA d, %1, %2, %5
    SUMSUB_BA d, %3, %4, %5
; the following line has not been executed at the end of this macro:
; UNSCRATCH 6, 8, rsp+17*mmsize
%macro IADST8_1D 1-3 [pd_8192], [pd_3fff] ; src, rnd, mask
    mova m0, [%1+ 0*mmsize]
    mova m3, [%1+ 6*mmsize]
    mova m4, [%1+ 8*mmsize]
    mova m7, [%1+14*mmsize]
    SUMSUB_MUL_D 7, 0, 1, 2, 16305, 1606, %3 ; m7/1=t0a, m0/2=t1a
    SUMSUB_MUL_D 3, 4, 5, 6, 10394, 12665, %3 ; m3/5=t4a, m4/6=t5a
    SCRATCH 0, 8, rsp+17*mmsize
    SUMSUB_PACK_D 3, 7, 5, 1, 0, %2 ; m3=t0, m7=t4
    UNSCRATCH 0, 8, rsp+17*mmsize
    SUMSUB_PACK_D 4, 0, 6, 2, 1, %2 ; m4=t1, m0=t5
    SCRATCH 3, 8, rsp+17*mmsize
    SCRATCH 4, 9, rsp+18*mmsize
    SCRATCH 7, 10, rsp+19*mmsize
    SCRATCH 0, 11, rsp+20*mmsize

    mova m1, [%1+ 2*mmsize]
    mova m2, [%1+ 4*mmsize]
    mova m5, [%1+10*mmsize]
    mova m6, [%1+12*mmsize]
    SUMSUB_MUL_D 5, 2, 3, 4, 14449, 7723, %3 ; m5/8=t2a, m2/9=t3a
    SUMSUB_MUL_D 1, 6, 7, 0, 4756, 15679, %3 ; m1/10=t6a, m6/11=t7a
    SCRATCH 2, 12, rsp+21*mmsize
    SUMSUB_PACK_D 1, 5, 7, 3, 2, %2 ; m1=t2, m5=t6
    UNSCRATCH 2, 12, rsp+21*mmsize
    SUMSUB_PACK_D 6, 2, 0, 4, 3, %2 ; m6=t3, m2=t7

    UNSCRATCH 7, 10, rsp+19*mmsize
    UNSCRATCH 0, 11, rsp+20*mmsize
    SCRATCH 1, 10, rsp+19*mmsize
    SCRATCH 6, 11, rsp+20*mmsize

    SUMSUB_MUL_D 7, 0, 3, 4, 15137, 6270, %3 ; m7/8=t4a, m0/9=t5a
    SUMSUB_MUL_D 2, 5, 1, 6, 6270, 15137, %3 ; m2/10=t7a, m5/11=t6a
    SCRATCH 2, 12, rsp+21*mmsize
    SUMSUB_PACK_D 5, 7, 6, 3, 2, %2 ; m5=-out1, m7=t6
    UNSCRATCH 2, 12, rsp+21*mmsize
    SUMSUB_PACK_D 2, 0, 1, 4, 3, %2 ; m2=out6, m0=t7
    SUMSUB_MUL 7, 0, 3, 4, 11585, 11585, %2, %3 ; m7=out2, m0=-out5

    UNSCRATCH 3, 8, rsp+17*mmsize
    UNSCRATCH 4, 9, rsp+18*mmsize
    UNSCRATCH 1, 10, rsp+19*mmsize
    UNSCRATCH 6, 11, rsp+20*mmsize
    SCRATCH 2, 8, rsp+17*mmsize
    SCRATCH 0, 9, rsp+18*mmsize

    SUMSUB_BA d, 1, 3, 2 ; m1=out0, m3=t2
    SUMSUB_BA d, 6, 4, 2 ; m6=-out7, m4=t3
    SUMSUB_MUL 3, 4, 2, 0, 11585, 11585, %2, %3 ; m3=-out3, m4=out4
    UNSCRATCH 0, 9, rsp+18*mmsize
cglobal vp9_%1_%3_8x8_add_10, 4, 6 + ARCH_X86_64, 16, \
                              16 * mmsize + ARCH_X86_32 * 6 * mmsize, \
                              dst, stride, block, eob
    SCRATCH 0, 13, rsp+16*mmsize, max
    DEFINE_ARGS dst, stride, block, cnt, ptr, skip, dstbak
    movzx cntd, byte [ptrq+cntq-1]
    movzx cntd, byte [%5_8x8+cntq-1]

    PRELOAD 14, pd_8192, rnd
    PRELOAD 15, pd_3fff, mask
    %2_1D blockq, reg_rnd, reg_mask
    TRANSPOSE4x4D 0, 1, 2, 3, 6
    mova [ptrq+ 0*mmsize], m0
    mova [ptrq+ 2*mmsize], m1
    mova [ptrq+ 4*mmsize], m2
    mova [ptrq+ 6*mmsize], m3
    UNSCRATCH 6, 8, rsp+17*mmsize
    TRANSPOSE4x4D 4, 5, 6, 7, 0
    mova [ptrq+ 1*mmsize], m4
    mova [ptrq+ 3*mmsize], m5
    mova [ptrq+ 5*mmsize], m6
    mova [ptrq+ 7*mmsize], m7

    ; zero-pad the remainder (skipped cols)
    lea blockq, [blockq+skipq*(mmsize/2)]
    mova [ptrq+mmsize*0], m0
    mova [ptrq+mmsize*1], m0
    mova [ptrq+mmsize*2], m0
    mova [ptrq+mmsize*3], m0

    DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak
    lea stride3q, [strideq*3]
    %4_1D ptrq, reg_rnd, reg_mask
    PRELOAD 9, pd_16, srnd
    ROUND_AND_STORE_4x4 0, 1, 2, 3, m6, reg_max, reg_srnd, 5
    lea dstq, [dstq+strideq*4]
    UNSCRATCH 0, 8, rsp+17*mmsize
    UNSCRATCH 1, 13, rsp+16*mmsize, max
    UNSCRATCH 2, 9, pd_16, srnd
    ROUND_AND_STORE_4x4 4, 5, 0, 7, m6, m1, m2, 5
    lea dstq, [dstbakq+8]
    ZERO_BLOCK blockq-2*mmsize, 32, 8, m6

cglobal vp9_%1_%3_8x8_add_12, 4, 6 + ARCH_X86_64, 16, \
                              16 * mmsize + ARCH_X86_32 * 6 * mmsize, \
                              dst, stride, block, eob
    jmp mangle(private_prefix %+ _ %+ vp9_%1_%3_8x8_add_10 %+ SUFFIX).body

IADST8_FN idct,  IDCT8,  iadst, IADST8, row
IADST8_FN iadst, IADST8, idct,  IDCT8,  col
IADST8_FN iadst, IADST8, iadst, IADST8, default
%macro IDCT16_1D 1-4 4 * mmsize, 65, 67 ; src, src_stride, stack_offset, mm32bit_stack_offset
    IDCT8_1D %1, [pd_8192], [pd_3fff], %2 * 2, %4 ; m0-3=t0-3a, m4-5/m8|r67/m7=t4-7
    ; SCRATCH 6, 8, rsp+(%4+0)*mmsize ; t6
    SCRATCH 0, 15, rsp+(%4+7)*mmsize ; t0a
    SCRATCH 1, 14, rsp+(%4+6)*mmsize ; t1a
    SCRATCH 2, 13, rsp+(%4+5)*mmsize ; t2a
    SCRATCH 3, 12, rsp+(%4+4)*mmsize ; t3a
    SCRATCH 4, 11, rsp+(%4+3)*mmsize ; t4
    mova [rsp+(%3+0)*mmsize], m5 ; t5
    mova [rsp+(%3+1)*mmsize], m7 ; t7

    mova m0, [%1+ 1*%2] ; in1
    mova m3, [%1+ 7*%2] ; in7
    mova m4, [%1+ 9*%2] ; in9
    mova m7, [%1+15*%2] ; in15

    SUMSUB_MUL 0, 7, 1, 2, 16305, 1606 ; m0=t15a, m7=t8a
    SUMSUB_MUL 4, 3, 1, 2, 10394, 12665 ; m4=t14a, m3=t9a
    SUMSUB_BA d, 3, 7, 1 ; m3=t8, m7=t9
    SUMSUB_BA d, 4, 0, 1 ; m4=t15,m0=t14
    SUMSUB_MUL 0, 7, 1, 2, 15137, 6270 ; m0=t14a, m7=t9a

    mova m1, [%1+ 3*%2] ; in3
    mova m2, [%1+ 5*%2] ; in5
    mova m5, [%1+11*%2] ; in11
    mova m6, [%1+13*%2] ; in13

    SCRATCH 0, 9, rsp+(%4+1)*mmsize
    SCRATCH 7, 10, rsp+(%4+2)*mmsize

    SUMSUB_MUL 2, 5, 0, 7, 14449, 7723 ; m2=t13a, m5=t10a
    SUMSUB_MUL 6, 1, 0, 7, 4756, 15679 ; m6=t12a, m1=t11a
    SUMSUB_BA d, 5, 1, 0 ; m5=t11,m1=t10
    SUMSUB_BA d, 2, 6, 0 ; m2=t12,m6=t13
    SUMSUB_MUL 1, 6, 0, 7, 15137, 6270 ; m1=t13a, m6=t10a

    UNSCRATCH 7, 10, rsp+(%4+2)*mmsize
    SUMSUB_BA d, 5, 3, 0 ; m5=t8a, m3=t11a
    SUMSUB_BA d, 6, 7, 0 ; m6=t9, m7=t10
    SUMSUB_BA d, 2, 4, 0 ; m2=t15a,m4=t12a
    SCRATCH 5, 10, rsp+(%4+2)*mmsize
    SUMSUB_MUL 4, 3, 0, 5, 11585, 11585 ; m4=t12, m3=t11
    UNSCRATCH 0, 9, rsp+(%4+1)*mmsize
    SUMSUB_BA d, 1, 0, 5 ; m1=t14, m0=t13
    SCRATCH 6, 9, rsp+(%4+1)*mmsize
    SUMSUB_MUL 0, 7, 6, 5, 11585, 11585 ; m0=t13a,m7=t10a

    ; order: 15|r74,14|r73,13|r72,12|r71,11|r70,r65,8|r67,r66,10|r69,9|r68,7,3,4,0,1,2
    UNSCRATCH 5, 15, rsp+(%4+7)*mmsize
    SUMSUB_BA d, 2, 5, 6 ; m2=out0, m5=out15
    SCRATCH 5, 15, rsp+(%4+7)*mmsize
    UNSCRATCH 5, 14, rsp+(%4+6)*mmsize
    SUMSUB_BA d, 1, 5, 6 ; m1=out1, m5=out14
    SCRATCH 5, 14, rsp+(%4+6)*mmsize
    UNSCRATCH 5, 13, rsp+(%4+5)*mmsize
    SUMSUB_BA d, 0, 5, 6 ; m0=out2, m5=out13
    SCRATCH 5, 13, rsp+(%4+5)*mmsize
    UNSCRATCH 5, 12, rsp+(%4+4)*mmsize
    SUMSUB_BA d, 4, 5, 6 ; m4=out3, m5=out12
    SCRATCH 5, 12, rsp+(%4+4)*mmsize
    UNSCRATCH 5, 11, rsp+(%4+3)*mmsize
    SUMSUB_BA d, 3, 5, 6 ; m3=out4, m5=out11
    SCRATCH 4, 11, rsp+(%4+3)*mmsize
    mova m4, [rsp+(%3+0)*mmsize]
    SUMSUB_BA d, 7, 4, 6 ; m7=out5, m4=out10
    mova [rsp+(%3+0)*mmsize], m5
    UNSCRATCH 5, 8, rsp+(%4+0)*mmsize
    UNSCRATCH 6, 9, rsp+(%4+1)*mmsize
    SCRATCH 2, 8, rsp+(%4+0)*mmsize
    SCRATCH 1, 9, rsp+(%4+1)*mmsize
    UNSCRATCH 1, 10, rsp+(%4+2)*mmsize
    SCRATCH 0, 10, rsp+(%4+2)*mmsize
    mova m0, [rsp+(%3+1)*mmsize]
    SUMSUB_BA d, 6, 5, 2 ; m6=out6, m5=out9
    SUMSUB_BA d, 1, 0, 2 ; m1=out7, m0=out8
    SWAP 0, 3, 1, 7, 2, 6, 4

    ; output order: 8-11|r67-70=out0-3
    ;               12-15|r71-74=out12-15
cglobal vp9_idct_idct_16x16_add_10, 4, 6 + ARCH_X86_64, 16, \
                                    67 * mmsize + ARCH_X86_32 * 8 * mmsize, \
                                    dst, stride, block, eob
    ; dc-only - the 10bit version can be done entirely in 32bit, since the max
    ; coef values are 17+sign bit, and the coef is 14bit, so 31+sign easily
    DEFINE_ARGS dst, stride, block, coef
    pshuflw m1, m1, q0000
    DEFINE_ARGS dst, stride, cnt
    STORE_2x8 3, 4, 1, m2, m0, dstq, mmsize
    STORE_2x8 3, 4, 1, m2, m0, dstq+strideq, mmsize
    lea dstq, [dstq+strideq*2]

    mova [rsp+64*mmsize], m0
    DEFINE_ARGS dst, stride, block, cnt, ptr, skip, dstbak
    lea ptrq, [default_16x16]
    movzx cntd, byte [ptrq+cntq-1]
    movzx cntd, byte [default_16x16+cntq-1]
    TRANSPOSE4x4D 0, 1, 2, 3, 7
    mova [ptrq+ 1*mmsize], m0
    mova [ptrq+ 5*mmsize], m1
    mova [ptrq+ 9*mmsize], m2
    mova [ptrq+13*mmsize], m3
    mova m7, [rsp+65*mmsize]
    TRANSPOSE4x4D 4, 5, 6, 7, 0
    mova [ptrq+ 2*mmsize], m4
    mova [ptrq+ 6*mmsize], m5
    mova [ptrq+10*mmsize], m6
    mova [ptrq+14*mmsize], m7
    UNSCRATCH 0, 8, rsp+67*mmsize
    UNSCRATCH 1, 9, rsp+68*mmsize
    UNSCRATCH 2, 10, rsp+69*mmsize
    UNSCRATCH 3, 11, rsp+70*mmsize
    TRANSPOSE4x4D 0, 1, 2, 3, 7
    mova [ptrq+ 0*mmsize], m0
    mova [ptrq+ 4*mmsize], m1
    mova [ptrq+ 8*mmsize], m2
    mova [ptrq+12*mmsize], m3
    UNSCRATCH 4, 12, rsp+71*mmsize
    UNSCRATCH 5, 13, rsp+72*mmsize
    UNSCRATCH 6, 14, rsp+73*mmsize
    UNSCRATCH 7, 15, rsp+74*mmsize
    TRANSPOSE4x4D 4, 5, 6, 7, 0
    mova [ptrq+ 3*mmsize], m4
    mova [ptrq+ 7*mmsize], m5
    mova [ptrq+11*mmsize], m6
    mova [ptrq+15*mmsize], m7
    add ptrq, 16 * mmsize

    ; zero-pad the remainder (skipped cols)
    lea blockq, [blockq+skipq*(mmsize/2)]
    mova [ptrq+mmsize*0], m0
    mova [ptrq+mmsize*1], m0
    mova [ptrq+mmsize*2], m0
    mova [ptrq+mmsize*3], m0
    mova [ptrq+mmsize*4], m0
    mova [ptrq+mmsize*5], m0
    mova [ptrq+mmsize*6], m0
    mova [ptrq+mmsize*7], m0
    add ptrq, 8 * mmsize
    DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak
    lea stride3q, [strideq*3]
    lea dstq, [dstq+strideq*4]
    ROUND_AND_STORE_4x4 0, 1, 2, 3, m7, [rsp+64*mmsize], [pd_32], 6
    lea dstq, [dstq+strideq*4]
    mova m0, [rsp+65*mmsize]
    mova m1, [rsp+64*mmsize]
    ROUND_AND_STORE_4x4 4, 5, 6, 0, m7, m1, m2, 6
    DEFINE_ARGS dstbak, stride, block, cnt, ptr, stride3, dst
    UNSCRATCH 0, 8, rsp+67*mmsize
    UNSCRATCH 4, 9, rsp+68*mmsize
    UNSCRATCH 5, 10, rsp+69*mmsize
    UNSCRATCH 3, 11, rsp+70*mmsize
    ROUND_AND_STORE_4x4 0, 4, 5, 3, m7, m1, m2, 6
    DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak
    lea dstq, [dstbakq+stride3q*4]
    lea dstq, [dstq+stride3q*4]
    UNSCRATCH 4, 12, rsp+71*mmsize
    UNSCRATCH 5, 13, rsp+72*mmsize
    UNSCRATCH 6, 14, rsp+73*mmsize
    UNSCRATCH 0, 15, rsp+74*mmsize
    ROUND_AND_STORE_4x4 4, 5, 6, 0, m7, m1, m2, 6
    ZERO_BLOCK blockq-4*mmsize, 64, 16, m7
cglobal vp9_idct_idct_16x16_add_12, 4, 6 + ARCH_X86_64, 16, \
                                    67 * mmsize + ARCH_X86_32 * 8 * mmsize, \
                                    dst, stride, block, eob
    jg mangle(private_prefix %+ _ %+ vp9_idct_idct_16x16_add_10 %+ SUFFIX).idctfull

    ; dc-only - unfortunately, this one can overflow, since coefs are 19+sign
    ; bpp, and 19+14+sign does not fit in 32bit, so we do 2-stage multiplies
    DEFINE_ARGS dst, stride, block, coef, coefl
    pshuflw m1, m1, q0000
    DEFINE_ARGS dst, stride, cnt
    STORE_2x8 3, 4, 1, m2, m0, dstq, mmsize
    STORE_2x8 3, 4, 1, m2, m0, dstq+strideq, mmsize
    lea dstq, [dstq+strideq*2]
; r65-69 are available for spills
; r70-77 are available on x86-32 only (x86-64 should use m8-15)
; output should be in m8-11|r70-73, m0-6,r65 and m12-15|r74-77
%macro IADST16_1D 1 ; src
    mova m0, [%1+ 0*4*mmsize] ; in0
    mova m1, [%1+ 7*4*mmsize] ; in7
    mova m2, [%1+ 8*4*mmsize] ; in8
    mova m3, [%1+15*4*mmsize] ; in15
    SUMSUB_MUL_D 3, 0, 4, 5, 16364, 804 ; m3/4=t0, m0/5=t1
    SUMSUB_MUL_D 1, 2, 6, 7, 11003, 12140 ; m1/6=t8, m2/7=t9
    SCRATCH 0, 8, rsp+70*mmsize
    SUMSUB_PACK_D 1, 3, 6, 4, 0 ; m1=t0a, m3=t8a
    UNSCRATCH 0, 8, rsp+70*mmsize
    SUMSUB_PACK_D 2, 0, 7, 5, 4 ; m2=t1a, m0=t9a
    mova [rsp+67*mmsize], m1
    SCRATCH 2, 9, rsp+71*mmsize
    SCRATCH 3, 12, rsp+74*mmsize
    SCRATCH 0, 13, rsp+75*mmsize

    mova m0, [%1+ 3*4*mmsize] ; in3
    mova m1, [%1+ 4*4*mmsize] ; in4
    mova m2, [%1+11*4*mmsize] ; in11
    mova m3, [%1+12*4*mmsize] ; in12
    SUMSUB_MUL_D 2, 1, 4, 5, 14811, 7005 ; m2/4=t4, m1/5=t5
    SUMSUB_MUL_D 0, 3, 6, 7, 5520, 15426 ; m0/6=t12, m3/7=t13
    SCRATCH 1, 10, rsp+72*mmsize
    SUMSUB_PACK_D 0, 2, 6, 4, 1 ; m0=t4a, m2=t12a
    UNSCRATCH 1, 10, rsp+72*mmsize
    SUMSUB_PACK_D 3, 1, 7, 5, 4 ; m3=t5a, m1=t13a
    SCRATCH 0, 15, rsp+77*mmsize
    SCRATCH 3, 11, rsp+73*mmsize

    UNSCRATCH 0, 12, rsp+74*mmsize ; t8a
    UNSCRATCH 3, 13, rsp+75*mmsize ; t9a
    SUMSUB_MUL_D 0, 3, 4, 5, 16069, 3196 ; m0/4=t8, m3/5=t9
    SUMSUB_MUL_D 1, 2, 6, 7, 3196, 16069 ; m1/6=t13, m2/7=t12
    SCRATCH 1, 12, rsp+74*mmsize
    SUMSUB_PACK_D 2, 0, 7, 4, 1 ; m2=t8a, m0=t12a
    UNSCRATCH 1, 12, rsp+74*mmsize
    SUMSUB_PACK_D 1, 3, 6, 5, 4 ; m1=t9a, m3=t13a
    mova [rsp+65*mmsize], m2
    mova [rsp+66*mmsize], m1
    SCRATCH 0, 8, rsp+70*mmsize
    SCRATCH 3, 12, rsp+74*mmsize

    mova m0, [%1+ 2*4*mmsize] ; in2
    mova m1, [%1+ 5*4*mmsize] ; in5
    mova m2, [%1+10*4*mmsize] ; in10
    mova m3, [%1+13*4*mmsize] ; in13
    SUMSUB_MUL_D 3, 0, 4, 5, 15893, 3981 ; m3/4=t2, m0/5=t3
    SUMSUB_MUL_D 1, 2, 6, 7, 8423, 14053 ; m1/6=t10, m2/7=t11
    SCRATCH 0, 10, rsp+72*mmsize
    SUMSUB_PACK_D 1, 3, 6, 4, 0 ; m1=t2a, m3=t10a
    UNSCRATCH 0, 10, rsp+72*mmsize
    SUMSUB_PACK_D 2, 0, 7, 5, 4 ; m2=t3a, m0=t11a
    mova [rsp+68*mmsize], m1
    mova [rsp+69*mmsize], m2
    SCRATCH 3, 13, rsp+75*mmsize
    SCRATCH 0, 14, rsp+76*mmsize

    mova m0, [%1+ 1*4*mmsize] ; in1
    mova m1, [%1+ 6*4*mmsize] ; in6
    mova m2, [%1+ 9*4*mmsize] ; in9
    mova m3, [%1+14*4*mmsize] ; in14
    SUMSUB_MUL_D 2, 1, 4, 5, 13160, 9760 ; m2/4=t6, m1/5=t7
    SUMSUB_MUL_D 0, 3, 6, 7, 2404, 16207 ; m0/6=t14, m3/7=t15
    SCRATCH 1, 10, rsp+72*mmsize
    SUMSUB_PACK_D 0, 2, 6, 4, 1 ; m0=t6a, m2=t14a
    UNSCRATCH 1, 10, rsp+72*mmsize
    SUMSUB_PACK_D 3, 1, 7, 5, 4 ; m3=t7a, m1=t15a

    UNSCRATCH 4, 13, rsp+75*mmsize ; t10a
    UNSCRATCH 5, 14, rsp+76*mmsize ; t11a
    SCRATCH 0, 13, rsp+75*mmsize
    SCRATCH 3, 14, rsp+76*mmsize
    SUMSUB_MUL_D 4, 5, 6, 7, 9102, 13623 ; m4/6=t10, m5/7=t11
    SUMSUB_MUL_D 1, 2, 0, 3, 13623, 9102 ; m1/0=t15, m2/3=t14
    SCRATCH 0, 10, rsp+72*mmsize
    SUMSUB_PACK_D 2, 4, 3, 6, 0 ; m2=t10a, m4=t14a
    UNSCRATCH 0, 10, rsp+72*mmsize
    SUMSUB_PACK_D 1, 5, 0, 7, 6 ; m1=t11a, m5=t15a

    UNSCRATCH 0, 8, rsp+70*mmsize ; t12a
    UNSCRATCH 3, 12, rsp+74*mmsize ; t13a
    SCRATCH 2, 8, rsp+70*mmsize
    SCRATCH 1, 12, rsp+74*mmsize
    SUMSUB_MUL_D 0, 3, 1, 2, 15137, 6270 ; m0/1=t12, m3/2=t13
    SUMSUB_MUL_D 5, 4, 7, 6, 6270, 15137 ; m5/7=t15, m4/6=t14
    SCRATCH 2, 10, rsp+72*mmsize
    SUMSUB_PACK_D 4, 0, 6, 1, 2 ; m4=out2, m0=t14a
    UNSCRATCH 2, 10, rsp+72*mmsize
    SUMSUB_PACK_D 5, 3, 7, 2, 1 ; m5=-out13, m3=t15a

    UNSCRATCH 1, 9, rsp+71*mmsize ; t1a
    mova m2, [rsp+68*mmsize] ; t2a
    UNSCRATCH 6, 13, rsp+75*mmsize ; t6a
    UNSCRATCH 7, 14, rsp+76*mmsize ; t7a
    SCRATCH 4, 10, rsp+72*mmsize
    SCRATCH 5, 13, rsp+75*mmsize
    UNSCRATCH 4, 15, rsp+77*mmsize ; t4a
    UNSCRATCH 5, 11, rsp+73*mmsize ; t5a
    SCRATCH 0, 14, rsp+76*mmsize
    SCRATCH 3, 15, rsp+77*mmsize
    mova m0, [rsp+67*mmsize] ; t0a
    SUMSUB_BA d, 4, 0, 3 ; m4=t0, m0=t4
    SUMSUB_BA d, 5, 1, 3 ; m5=t1, m1=t5
    SUMSUB_BA d, 6, 2, 3 ; m6=t2, m2=t6
    SCRATCH 4, 9, rsp+71*mmsize
    mova m3, [rsp+69*mmsize] ; t3a
    SUMSUB_BA d, 7, 3, 4 ; m7=t3, m3=t7

    mova [rsp+67*mmsize], m5
    mova [rsp+68*mmsize], m6
    mova [rsp+69*mmsize], m7
    SUMSUB_MUL_D 0, 1, 4, 5, 15137, 6270 ; m0/4=t4a, m1/5=t5a
    SUMSUB_MUL_D 3, 2, 7, 6, 6270, 15137 ; m3/7=t7a, m2/6=t6a
    SCRATCH 1, 11, rsp+73*mmsize
    SUMSUB_PACK_D 2, 0, 6, 4, 1 ; m2=-out3, m0=t6
    UNSCRATCH 1, 11, rsp+73*mmsize
    SUMSUB_PACK_D 3, 1, 7, 5, 4 ; m3=out12, m1=t7
    SCRATCH 2, 11, rsp+73*mmsize
    UNSCRATCH 2, 12, rsp+74*mmsize ; t11a
    SCRATCH 3, 12, rsp+74*mmsize

    UNSCRATCH 3, 8, rsp+70*mmsize ; t10a
    mova m4, [rsp+65*mmsize] ; t8a
    mova m5, [rsp+66*mmsize] ; t9a
    SUMSUB_BA d, 3, 4, 6 ; m3=-out1, m4=t10
    SUMSUB_BA d, 2, 5, 6 ; m2=out14, m5=t11
    UNSCRATCH 6, 9, rsp+71*mmsize ; t0
    UNSCRATCH 7, 14, rsp+76*mmsize ; t14a
    SCRATCH 3, 9, rsp+71*mmsize
    SCRATCH 2, 14, rsp+76*mmsize

    SUMSUB_MUL 1, 0, 2, 3, 11585, 11585 ; m1=out4, m0=out11
    mova [rsp+65*mmsize], m0
    SUMSUB_MUL 5, 4, 2, 3, 11585, 11585 ; m5=out6, m4=out9
    UNSCRATCH 0, 15, rsp+77*mmsize ; t15a
    SUMSUB_MUL 7, 0, 2, 3, 11585, m11585 ; m7=out10, m0=out5

    mova m2, [rsp+68*mmsize] ; t2
    SUMSUB_BA d, 2, 6, 3 ; m2=out0, m6=t2a
    SCRATCH 2, 8, rsp+70*mmsize
    mova m2, [rsp+67*mmsize] ; t1
    mova m3, [rsp+69*mmsize] ; t3
    mova [rsp+67*mmsize], m7
    SUMSUB_BA d, 3, 2, 7 ; m3=-out15, m2=t3a
    SCRATCH 3, 15, rsp+77*mmsize
    SUMSUB_MUL 6, 2, 7, 3, 11585, m11585 ; m6=out8, m2=out7
    mova m7, [rsp+67*mmsize]
    SWAP 2, 5, 4, 6, 7, 3
cglobal vp9_%1_%4_16x16_add_10, 4, 6 + ARCH_X86_64, 16, \
                                70 * mmsize + ARCH_X86_32 * 8 * mmsize, \
                                dst, stride, block, eob
    mova [rsp+64*mmsize], m0
    DEFINE_ARGS dst, stride, block, cnt, ptr, skip, dstbak
    lea ptrq, [%7_16x16]
    movzx cntd, byte [ptrq+cntq-1]
    movzx cntd, byte [%7_16x16+cntq-1]

    TRANSPOSE4x4D 0, 1, 2, 3, 7
    mova [ptrq+ 1*mmsize], m0
    mova [ptrq+ 5*mmsize], m1
    mova [ptrq+ 9*mmsize], m2
    mova [ptrq+13*mmsize], m3
    mova m7, [rsp+65*mmsize]
    TRANSPOSE4x4D 4, 5, 6, 7, 0
    mova [ptrq+ 2*mmsize], m4
    mova [ptrq+ 6*mmsize], m5
    mova [ptrq+10*mmsize], m6
    mova [ptrq+14*mmsize], m7
    UNSCRATCH 0, 8, rsp+(%3+0)*mmsize
    UNSCRATCH 1, 9, rsp+(%3+1)*mmsize
    UNSCRATCH 2, 10, rsp+(%3+2)*mmsize
    UNSCRATCH 3, 11, rsp+(%3+3)*mmsize
    TRANSPOSE4x4D 0, 1, 2, 3, 7
    mova [ptrq+ 0*mmsize], m0
    mova [ptrq+ 4*mmsize], m1
    mova [ptrq+ 8*mmsize], m2
    mova [ptrq+12*mmsize], m3
    UNSCRATCH 4, 12, rsp+(%3+4)*mmsize
    UNSCRATCH 5, 13, rsp+(%3+5)*mmsize
    UNSCRATCH 6, 14, rsp+(%3+6)*mmsize
    UNSCRATCH 7, 15, rsp+(%3+7)*mmsize
    TRANSPOSE4x4D 4, 5, 6, 7, 0
    mova [ptrq+ 3*mmsize], m4
    mova [ptrq+ 7*mmsize], m5
    mova [ptrq+11*mmsize], m6
    mova [ptrq+15*mmsize], m7
    add ptrq, 16 * mmsize

    ; zero-pad the remainder (skipped cols)
    lea blockq, [blockq+skipq*(mmsize/2)]
    mova [ptrq+mmsize*0], m0
    mova [ptrq+mmsize*1], m0
    mova [ptrq+mmsize*2], m0
    mova [ptrq+mmsize*3], m0
    mova [ptrq+mmsize*4], m0
    mova [ptrq+mmsize*5], m0
    mova [ptrq+mmsize*6], m0
    mova [ptrq+mmsize*7], m0
    add ptrq, 8 * mmsize
    DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak
    lea stride3q, [strideq*3]
    lea dstq, [dstq+strideq*4]
    ROUND_AND_STORE_4x4 0, 1, 2, 3, m7, [rsp+64*mmsize], [pd_32], 6
    lea dstq, [dstq+strideq*4]
    mova m0, [rsp+65*mmsize]
    mova m1, [rsp+64*mmsize]
    ROUND_AND_STORE_4x4 4, 5, 6, 0, m7, m1, m2, 6
    DEFINE_ARGS dstbak, stride, block, cnt, ptr, stride3, dst
    UNSCRATCH 0, 8, rsp+(%6+0)*mmsize
    UNSCRATCH 4, 9, rsp+(%6+1)*mmsize
    UNSCRATCH 5, 10, rsp+(%6+2)*mmsize
    UNSCRATCH 3, 11, rsp+(%6+3)*mmsize
    ROUND_AND_STORE_4x4 0, 4, 5, 3, m7, m1, m2, 6
    DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak
    lea dstq, [dstbakq+stride3q*4]
    lea dstq, [dstq+stride3q*4]
    UNSCRATCH 4, 12, rsp+(%6+4)*mmsize
    UNSCRATCH 5, 13, rsp+(%6+5)*mmsize
    UNSCRATCH 6, 14, rsp+(%6+6)*mmsize
    UNSCRATCH 0, 15, rsp+(%6+7)*mmsize
    ROUND_AND_STORE_4x4 4, 5, 6, 0, m7, m1, m2, 6
    ZERO_BLOCK blockq-4*mmsize, 64, 16, m7

cglobal vp9_%1_%4_16x16_add_12, 4, 6 + ARCH_X86_64, 16, \
                                70 * mmsize + ARCH_X86_32 * 8 * mmsize, \
                                dst, stride, block, eob
    jmp mangle(private_prefix %+ _ %+ vp9_%1_%4_16x16_add_10 %+ SUFFIX).body

IADST16_FN idct,  IDCT16,  67, iadst, IADST16, 70, row
IADST16_FN iadst, IADST16, 70, idct,  IDCT16,  67, col
IADST16_FN iadst, IADST16, 70, iadst, IADST16, 70, default
%macro IDCT32_1D 2-3 8 * mmsize ; pass[1/2], src, src_stride
    IDCT16_1D %2, 2 * %3, 272, 257
    mova [rsp+257*mmsize], m8
    mova [rsp+258*mmsize], m9
    mova [rsp+259*mmsize], m10
    mova [rsp+260*mmsize], m11
    mova [rsp+261*mmsize], m12
    mova [rsp+262*mmsize], m13
    mova [rsp+263*mmsize], m14
    mova [rsp+264*mmsize], m15
    mova [rsp+265*mmsize], m0
    mova [rsp+266*mmsize], m1
    mova [rsp+267*mmsize], m2
    mova [rsp+268*mmsize], m3
    mova [rsp+269*mmsize], m4
    mova [rsp+270*mmsize], m5
    mova [rsp+271*mmsize], m6

    ; r265-272: t4/5a/6a/7/8/9a/10/11a
    ; r261-264: t12a/13/14a/15
    ; r273-274 is free as scratch space, and 275-282 mirrors m8-15 on 32bit

    mova m0, [%2+ 1*%3] ; in1
    mova m1, [%2+15*%3] ; in15
    mova m2, [%2+17*%3] ; in17
    mova m3, [%2+31*%3] ; in31
    SUMSUB_MUL 0, 3, 4, 5, 16364, 804 ; m0=t31a, m3=t16a
    SUMSUB_MUL 2, 1, 4, 5, 11003, 12140 ; m2=t30a, m1=t17a
    SUMSUB_BA d, 1, 3, 4 ; m1=t16, m3=t17
    SUMSUB_BA d, 2, 0, 4 ; m2=t31, m0=t30
    SUMSUB_MUL 0, 3, 4, 5, 16069, 3196 ; m0=t30a, m3=t17a
    SCRATCH 0, 8, rsp+275*mmsize
    SCRATCH 2, 9, rsp+276*mmsize

    ; end of stage 1-3 first quart

    mova m0, [%2+ 7*%3] ; in7
    mova m2, [%2+ 9*%3] ; in9
    mova m4, [%2+23*%3] ; in23
    mova m5, [%2+25*%3] ; in25
    SUMSUB_MUL 2, 4, 6, 7, 14811, 7005 ; m2=t29a, m4=t18a
    SUMSUB_MUL 5, 0, 6, 7, 5520, 15426 ; m5=t28a, m0=t19a
    SUMSUB_BA d, 4, 0, 6 ; m4=t19, m0=t18
    SUMSUB_BA d, 2, 5, 6 ; m2=t28, m5=t29
    SUMSUB_MUL 5, 0, 6, 7, 3196, m16069 ; m5=t29a, m0=t18a

    ; end of stage 1-3 second quart

    SUMSUB_BA d, 4, 1, 6 ; m4=t16a, m1=t19a
    SUMSUB_BA d, 0, 3, 6 ; m0=t17, m3=t18
    UNSCRATCH 6, 8, rsp+275*mmsize ; t30a
    UNSCRATCH 7, 9, rsp+276*mmsize ; t31
    mova [rsp+273*mmsize], m4
    mova [rsp+274*mmsize], m0
    SUMSUB_BA d, 2, 7, 0 ; m2=t31a, m7=t28a
    SUMSUB_BA d, 5, 6, 0 ; m5=t30, m6=t29
    SUMSUB_MUL 6, 3, 0, 4, 15137, 6270 ; m6=t29a, m3=t18a
    SUMSUB_MUL 7, 1, 0, 4, 15137, 6270 ; m7=t28, m1=t19
    SCRATCH 3, 10, rsp+277*mmsize
    SCRATCH 1, 11, rsp+278*mmsize
    SCRATCH 7, 12, rsp+279*mmsize
    SCRATCH 6, 13, rsp+280*mmsize
    SCRATCH 5, 14, rsp+281*mmsize
    SCRATCH 2, 15, rsp+282*mmsize

    ; end of stage 4-5 first half

    mova m0, [%2+ 5*%3] ; in5
    mova m1, [%2+11*%3] ; in11
    mova m2, [%2+21*%3] ; in21
    mova m3, [%2+27*%3] ; in27
    SUMSUB_MUL 0, 3, 4, 5, 15893, 3981 ; m0=t27a, m3=t20a
    SUMSUB_MUL 2, 1, 4, 5, 8423, 14053 ; m2=t26a, m1=t21a
    SUMSUB_BA d, 1, 3, 4 ; m1=t20, m3=t21
    SUMSUB_BA d, 2, 0, 4 ; m2=t27, m0=t26
    SUMSUB_MUL 0, 3, 4, 5, 9102, 13623 ; m0=t26a, m3=t21a
    SCRATCH 0, 8, rsp+275*mmsize
    SCRATCH 2, 9, rsp+276*mmsize

    ; end of stage 1-3 third quart

    mova m0, [%2+ 3*%3] ; in3
    mova m2, [%2+13*%3] ; in13
    mova m4, [%2+19*%3] ; in19
    mova m5, [%2+29*%3] ; in29
    SUMSUB_MUL 2, 4, 6, 7, 13160, 9760 ; m2=t25a, m4=t22a
    SUMSUB_MUL 5, 0, 6, 7, 2404, 16207 ; m5=t24a, m0=t23a
    SUMSUB_BA d, 4, 0, 6 ; m4=t23, m0=t22
    SUMSUB_BA d, 2, 5, 6 ; m2=t24, m5=t25
    SUMSUB_MUL 5, 0, 6, 7, 13623, m9102 ; m5=t25a, m0=t22a

    ; end of stage 1-3 fourth quart

    SUMSUB_BA d, 1, 4, 6 ; m1=t23a, m4=t20a
    SUMSUB_BA d, 3, 0, 6 ; m3=t22, m0=t21
    UNSCRATCH 6, 8, rsp+275*mmsize ; t26a
    UNSCRATCH 7, 9, rsp+276*mmsize ; t27
    SCRATCH 3, 8, rsp+275*mmsize
    SCRATCH 1, 9, rsp+276*mmsize
    SUMSUB_BA d, 7, 2, 1 ; m7=t24a, m2=t27a
    SUMSUB_BA d, 6, 5, 1 ; m6=t25, m5=t26
    SUMSUB_MUL 2, 4, 1, 3, 6270, m15137 ; m2=t27, m4=t20
    SUMSUB_MUL 5, 0, 1, 3, 6270, m15137 ; m5=t26a, m0=t21a

    ; end of stage 4-5 second half

    UNSCRATCH 1, 12, rsp+279*mmsize ; t28
    UNSCRATCH 3, 13, rsp+280*mmsize ; t29a
    SCRATCH 4, 12, rsp+279*mmsize
    SCRATCH 0, 13, rsp+280*mmsize
    SUMSUB_BA d, 5, 3, 0 ; m5=t29, m3=t26
    SUMSUB_BA d, 2, 1, 0 ; m2=t28a, m1=t27a
    UNSCRATCH 0, 14, rsp+281*mmsize ; t30
    UNSCRATCH 4, 15, rsp+282*mmsize ; t31a
    SCRATCH 2, 14, rsp+281*mmsize
    SCRATCH 5, 15, rsp+282*mmsize
    SUMSUB_BA d, 6, 0, 2 ; m6=t30a, m0=t25a
    SUMSUB_BA d, 7, 4, 2 ; m7=t31, m4=t24

    mova m2, [rsp+273*mmsize] ; t16a
    mova m5, [rsp+274*mmsize] ; t17
    mova [rsp+273*mmsize], m6
    mova [rsp+274*mmsize], m7
    UNSCRATCH 6, 10, rsp+277*mmsize ; t18a
    UNSCRATCH 7, 11, rsp+278*mmsize ; t19
    SCRATCH 4, 10, rsp+277*mmsize
    SCRATCH 0, 11, rsp+278*mmsize
    UNSCRATCH 4, 12, rsp+279*mmsize ; t20
    UNSCRATCH 0, 13, rsp+280*mmsize ; t21a
    SCRATCH 3, 12, rsp+279*mmsize
    SCRATCH 1, 13, rsp+280*mmsize
    SUMSUB_BA d, 0, 6, 1 ; m0=t18, m6=t21
    SUMSUB_BA d, 4, 7, 1 ; m4=t19a, m7=t20a
    UNSCRATCH 3, 8, rsp+275*mmsize ; t22
    UNSCRATCH 1, 9, rsp+276*mmsize ; t23a
    SCRATCH 0, 8, rsp+275*mmsize
    SCRATCH 4, 9, rsp+276*mmsize
    SUMSUB_BA d, 3, 5, 0 ; m3=t17a, m5=t22a
    SUMSUB_BA d, 1, 2, 0 ; m1=t16, m2=t23

    UNSCRATCH 0, 10, rsp+277*mmsize ; t24
    UNSCRATCH 4, 11, rsp+278*mmsize ; t25a
    SCRATCH 1, 10, rsp+277*mmsize
    SCRATCH 3, 11, rsp+278*mmsize
    SUMSUB_MUL 0, 2, 1, 3, 11585, 11585 ; m0=t24a, m2=t23a
    SUMSUB_MUL 4, 5, 1, 3, 11585, 11585 ; m4=t25, m5=t22
    UNSCRATCH 1, 12, rsp+279*mmsize ; t26
    UNSCRATCH 3, 13, rsp+280*mmsize ; t27a
    SCRATCH 0, 12, rsp+279*mmsize
    SCRATCH 4, 13, rsp+280*mmsize
    SUMSUB_MUL 3, 7, 0, 4, 11585, 11585 ; m3=t27, m7=t20
    SUMSUB_MUL 1, 6, 0, 4, 11585, 11585 ; m1=t26a, m6=t21a

    mova m0, [rsp+269*mmsize] ; t8
    mova m4, [rsp+270*mmsize] ; t9a
    mova [rsp+269*mmsize], m1 ; t26a
    mova [rsp+270*mmsize], m3 ; t27
    mova m3, [rsp+271*mmsize] ; t10
    SUMSUB_BA d, 2, 0, 1 ; m2=out8, m0=out23
    SUMSUB_BA d, 5, 4, 1 ; m5=out9, m4=out22
    SUMSUB_BA d, 6, 3, 1 ; m6=out10, m3=out21
    mova m1, [rsp+272*mmsize] ; t11a
    mova [rsp+271*mmsize], m0
    SUMSUB_BA d, 7, 1, 0 ; m7=out11, m1=out20

    TRANSPOSE4x4D 2, 5, 6, 7, 0
    mova [ptrq+ 2*mmsize], m2
    mova [ptrq+10*mmsize], m5
    mova [ptrq+18*mmsize], m6
    mova [ptrq+26*mmsize], m7
    lea dstq, [dstq+strideq*8]
    ROUND_AND_STORE_4x4 2, 5, 6, 7, m0, [rsp+256*mmsize], [pd_32], 6
    mova m2, [rsp+271*mmsize]
    TRANSPOSE4x4D 1, 3, 4, 2, 0
    mova [ptrq+ 5*mmsize], m1
    mova [ptrq+13*mmsize], m3
    mova [ptrq+21*mmsize], m4
    mova [ptrq+29*mmsize], m2
    lea dstq, [dstq+stride3q*4]
    ROUND_AND_STORE_4x4 1, 3, 4, 2, m0, [rsp+256*mmsize], [pd_32], 6

    ; end of last stage + store for out8-11 and out20-23
    UNSCRATCH 0, 9, rsp+276*mmsize ; t19a
    UNSCRATCH 1, 8, rsp+275*mmsize ; t18
    UNSCRATCH 2, 11, rsp+278*mmsize ; t17a
    UNSCRATCH 3, 10, rsp+277*mmsize ; t16
    mova m7, [rsp+261*mmsize] ; t12a
    mova m6, [rsp+262*mmsize] ; t13
    mova m5, [rsp+263*mmsize] ; t14a
    SUMSUB_BA d, 0, 7, 4 ; m0=out12, m7=out19
    SUMSUB_BA d, 1, 6, 4 ; m1=out13, m6=out18
    SUMSUB_BA d, 2, 5, 4 ; m2=out14, m5=out17
    mova m4, [rsp+264*mmsize] ; t15
    SCRATCH 7, 8, rsp+275*mmsize
    SUMSUB_BA d, 3, 4, 7 ; m3=out15, m4=out16

    TRANSPOSE4x4D 0, 1, 2, 3, 7
    mova [ptrq+ 3*mmsize], m0
    mova [ptrq+11*mmsize], m1
    mova [ptrq+19*mmsize], m2
    mova [ptrq+27*mmsize], m3
    lea dstq, [dstbakq+stride3q*4]
    lea dstq, [dstq+stride3q*4]
    ROUND_AND_STORE_4x4 0, 1, 2, 3, m7, [rsp+256*mmsize], [pd_32], 6
    UNSCRATCH 0, 8, rsp+275*mmsize ; out19
    TRANSPOSE4x4D 4, 5, 6, 0, 7
    mova [ptrq+ 4*mmsize], m4
    mova [ptrq+12*mmsize], m5
    mova [ptrq+20*mmsize], m6
    mova [ptrq+28*mmsize], m0
    lea dstq, [dstq+strideq*4]
    ROUND_AND_STORE_4x4 4, 5, 6, 0, m7, [rsp+256*mmsize], [pd_32], 6

    ; end of last stage + store for out12-19

    mova m7, [rsp+257*mmsize] ; t0
    mova m6, [rsp+258*mmsize] ; t1
    mova m5, [rsp+259*mmsize] ; t2
    mova m4, [rsp+260*mmsize] ; t3
    mova m0, [rsp+274*mmsize] ; t31
    mova m1, [rsp+273*mmsize] ; t30a
    UNSCRATCH 2, 15, rsp+282*mmsize ; t29
    SUMSUB_BA d, 0, 7, 3 ; m0=out0, m7=out31
    SUMSUB_BA d, 1, 6, 3 ; m1=out1, m6=out30
    SUMSUB_BA d, 2, 5, 3 ; m2=out2, m5=out29
    SCRATCH 0, 9, rsp+276*mmsize
    UNSCRATCH 3, 14, rsp+281*mmsize ; t28a
    SUMSUB_BA d, 3, 4, 0 ; m3=out3, m4=out28

    TRANSPOSE4x4D 4, 5, 6, 7, 0
    mova [ptrq+ 7*mmsize], m4
    mova [ptrq+15*mmsize], m5
    mova [ptrq+23*mmsize], m6
    mova [ptrq+31*mmsize], m7
    lea dstq, [dstq+stride3q*4]
    ROUND_AND_STORE_4x4 4, 5, 6, 7, m0, [rsp+256*mmsize], [pd_32], 6
    UNSCRATCH 7, 9, rsp+276*mmsize ; out0
    TRANSPOSE4x4D 7, 1, 2, 3, 0
    mova [ptrq+ 0*mmsize], m7
    mova [ptrq+ 8*mmsize], m1
    mova [ptrq+16*mmsize], m2
    mova [ptrq+24*mmsize], m3
    DEFINE_ARGS dstbak, stride, block, cnt, ptr, stride3, dst
    ROUND_AND_STORE_4x4 7, 1, 2, 3, m0, [rsp+256*mmsize], [pd_32], 6
    DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak

    ; end of last stage + store for out0-3 and out28-31

    mova m7, [rsp+265*mmsize] ; t4
    mova m6, [rsp+266*mmsize] ; t5a
    mova m5, [rsp+267*mmsize] ; t6a
    mova m4, [rsp+268*mmsize] ; t7
    mova m0, [rsp+270*mmsize] ; t27
    mova m1, [rsp+269*mmsize] ; t26a
    UNSCRATCH 2, 13, rsp+280*mmsize ; t25
    SUMSUB_BA d, 0, 7, 3 ; m0=out4, m7=out27
    SUMSUB_BA d, 1, 6, 3 ; m1=out5, m6=out26
    SUMSUB_BA d, 2, 5, 3 ; m2=out6, m5=out25
    UNSCRATCH 3, 12, rsp+279*mmsize ; t24a
    SCRATCH 7, 9, rsp+276*mmsize
    SUMSUB_BA d, 3, 4, 7 ; m3=out7, m4=out24

    TRANSPOSE4x4D 0, 1, 2, 3, 7
    mova [ptrq+ 1*mmsize], m0
    mova [ptrq+ 9*mmsize], m1
    mova [ptrq+17*mmsize], m2
    mova [ptrq+25*mmsize], m3
    lea dstq, [dstbakq+strideq*4]
    lea dstq, [dstq+strideq*4]
    ROUND_AND_STORE_4x4 0, 1, 2, 3, m7, [rsp+256*mmsize], [pd_32], 6
    UNSCRATCH 0, 9, rsp+276*mmsize ; out27
    TRANSPOSE4x4D 4, 5, 6, 0, 7
    mova [ptrq+ 6*mmsize], m4
    mova [ptrq+14*mmsize], m5
    mova [ptrq+22*mmsize], m6
    mova [ptrq+30*mmsize], m0
    lea dstq, [dstbakq+stride3q*8]
    lea dstq, [dstq+stride3q*8]
    ROUND_AND_STORE_4x4 4, 5, 6, 0, m7, [rsp+256*mmsize], [pd_32], 6

    ; end of last stage + store for out4-7 and out24-27
cglobal vp9_idct_idct_32x32_add_10, 4, 6 + ARCH_X86_64, 16, \
                                    275 * mmsize + ARCH_X86_32 * 8 * mmsize, \
                                    dst, stride, block, eob
    ; dc-only - the 10bit version can be done entirely in 32bit, since the max
    ; coef values are 17+sign bit, and the coef is 14bit, so 31+sign easily
    DEFINE_ARGS dst, stride, block, coef
    pshuflw m1, m1, q0000
    DEFINE_ARGS dst, stride, cnt
    STORE_2x8 3, 4, 1, m2, m0, dstq, mmsize
    STORE_2x8 3, 4, 1, m2, m0, dstq+mmsize*2, mmsize

    mova [rsp+256*mmsize], m0
    DEFINE_ARGS dst, stride, block, cnt, ptr, skip, dstbak
    lea ptrq, [default_32x32]
    movzx cntd, byte [ptrq+cntq-1]
    movzx cntd, byte [default_32x32+cntq-1]
    add ptrq, 32 * mmsize

    ; zero-pad the remainder (skipped cols)
    lea blockq, [blockq+skipq*(mmsize/4)]
    mova [ptrq+mmsize*0], m0
    mova [ptrq+mmsize*1], m0
    mova [ptrq+mmsize*2], m0
    mova [ptrq+mmsize*3], m0
    mova [ptrq+mmsize*4], m0
    mova [ptrq+mmsize*5], m0
    mova [ptrq+mmsize*6], m0
    mova [ptrq+mmsize*7], m0
    add ptrq, 8 * mmsize

    DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak
    lea stride3q, [strideq*3]
    ZERO_BLOCK blockq-8*mmsize, 128, 32, m7
cglobal vp9_idct_idct_32x32_add_12, 4, 6 + ARCH_X86_64, 16, \
                                    275 * mmsize + ARCH_X86_32 * 8 * mmsize, \
                                    dst, stride, block, eob
    jg mangle(private_prefix %+ _ %+ vp9_idct_idct_32x32_add_10 %+ SUFFIX).idctfull

    ; dc-only - unfortunately, this one can overflow, since coefs are 19+sign
    ; bpp, and 19+14+sign does not fit in 32bit, so we do 2-stage multiplies
    DEFINE_ARGS dst, stride, block, coef, coefl
    pshuflw m1, m1, q0000
    DEFINE_ARGS dst, stride, cnt
    STORE_2x8 3, 4, 1, m2, m0, dstq, mmsize
    STORE_2x8 3, 4, 1, m2, m0, dstq+mmsize*2, mmsize