1 ;******************************************************************************
2 ;* VP9 inverse transform x86 SIMD optimizations
3 ;*
4 ;* Copyright (C) 2015 Ronald S. Bultje <rsbultje gmail com>
5 ;*
6 ;* This file is part of FFmpeg.
7 ;*
8 ;* FFmpeg is free software; you can redistribute it and/or
9 ;* modify it under the terms of the GNU Lesser General Public
10 ;* License as published by the Free Software Foundation; either
11 ;* version 2.1 of the License, or (at your option) any later version.
12 ;*
13 ;* FFmpeg is distributed in the hope that it will be useful,
14 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
15 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 ;* Lesser General Public License for more details.
17 ;*
18 ;* You should have received a copy of the GNU Lesser General Public
19 ;* License along with FFmpeg; if not, write to the Free Software
20 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 ;******************************************************************************
23 %include "libavutil/x86/x86util.asm"
24 %include "vp9itxfm_template.asm"
26 SECTION_RODATA
28 cextern pw_8
29 cextern pw_1023
30 cextern pw_2048
31 cextern pw_4095
32 cextern pw_m1
33 cextern pd_1
34 cextern pd_16
35 cextern pd_32
36 cextern pd_8192
38 pd_8: times 4 dd 8
39 pd_3fff: times 4 dd 0x3fff
41 cextern pw_11585x2
43 cextern pw_5283_13377
44 cextern pw_9929_13377
45 cextern pw_15212_m13377
46 cextern pw_15212_9929
47 cextern pw_m5283_m15212
48 cextern pw_13377x2
49 cextern pw_m13377_13377
50 cextern pw_13377_0
52 pw_9929_m5283: times 4 dw 9929, -5283
54 %macro COEF_PAIR 2-3
55 cextern pw_m%1_%2
56 cextern pw_%2_%1
57 %if %0 == 3
58 cextern pw_m%1_m%2
59 %if %1 != %2
60 cextern pw_m%2_%1
61 cextern pw_%1_%2
62 %endif
63 %endif
64 %endmacro
66 COEF_PAIR 2404, 16207
67 COEF_PAIR 3196, 16069, 1
68 COEF_PAIR 4756, 15679
69 COEF_PAIR 5520, 15426
70 COEF_PAIR 6270, 15137, 1
71 COEF_PAIR 8423, 14053
72 COEF_PAIR 10394, 12665
73 COEF_PAIR 11003, 12140
74 COEF_PAIR 11585, 11585, 1
75 COEF_PAIR 13160, 9760
76 COEF_PAIR 13623, 9102, 1
77 COEF_PAIR 14449, 7723
78 COEF_PAIR 14811, 7005
79 COEF_PAIR 15893, 3981
80 COEF_PAIR 16305, 1606
81 COEF_PAIR 16364, 804
83 default_8x8:
84 times 12 db 1
85 times 52 db 2
86 row_8x8:
87 times 18 db 1
88 times 46 db 2
89 col_8x8:
90 times 6 db 1
91 times 58 db 2
92 default_16x16:
93 times 10 db 1
94 times 28 db 2
95 times 51 db 3
96 times 167 db 4
97 row_16x16:
98 times 21 db 1
99 times 45 db 2
100 times 60 db 3
101 times 130 db 4
102 col_16x16:
103 times 5 db 1
104 times 12 db 2
105 times 25 db 3
106 times 214 db 4
107 default_32x32:
108 times 9 db 1
109 times 25 db 2
110 times 36 db 3
111 times 65 db 4
112 times 105 db 5
113 times 96 db 6
114 times 112 db 7
115 times 576 db 8
117 SECTION .text
119 %macro VP9_STORE_2X 6-7 dstq ; reg1, reg2, tmp1, tmp2, min, max, dst
120 mova m%3, [%7]
121 mova m%4, [%7+strideq]
122 paddw m%3, m%1
123 paddw m%4, m%2
124 pmaxsw m%3, m%5
125 pmaxsw m%4, m%5
126 pminsw m%3, m%6
127 pminsw m%4, m%6
128 mova [%7], m%3
129 mova [%7+strideq], m%4
130 %endmacro
132 %macro ZERO_BLOCK 4 ; mem, stride, nnzcpl, zero_reg
133 %assign %%y 0
134 %rep %3
135 %assign %%x 0
136 %rep %3*4/mmsize
137 mova [%1+%%y+%%x], %4
138 %assign %%x (%%x+mmsize)
139 %endrep
140 %assign %%y (%%y+%2)
141 %endrep
142 %endmacro
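; e.g. "ZERO_BLOCK blockq, 16, 4, m4" clears a 4x4 block of dword coefficients
; (4 rows of 4 dwords = 16 bytes each, at a 16-byte row stride) using the
; already-zeroed register m4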
144 ; the input coefficients are scaled up by 2 bits (which we downscale immediately
145 ; in the iwht), and are otherwise orthonormally increased by 1 bit per iwht_1d.
146 ; therefore, a diff of 10-12+sign bit will fit in 12-14+sign bit after scaling,
147 ; i.e. everything can be done in 15+1bpp words. Since the quant fractional bits
148 ; add 2 bits, we need to scale before converting to word in 12bpp, since the
149 ; input will be 16+sign bit which doesn't fit in 15+sign words, but in 10bpp
150 ; we can scale after converting to words (which is half the instructions),
151 ; since the input is only 14+sign bit, which fits in 15+sign words directly.
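; as a rough illustration of the ordering above: a 12bpp coefficient such as
; 40000 (16+sign bits) would saturate to 32767 if packssdw ran first, whereas
; psrad by 2 first yields 10000, which packs losslessly; at 10bpp the inputs
; are at most 14+sign bits, so packing first and doing a single psraw on the
; packed words is both safe and cheaper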
153 %macro IWHT4_FN 2 ; bpp, max
154 cglobal vp9_iwht_iwht_4x4_add_%1, 3, 3, 8, dst, stride, block, eob
155 mova m7, [pw_%2]
156 mova m0, [blockq+0*16+0]
157 mova m1, [blockq+1*16+0]
158 %if %1 >= 12
159 mova m4, [blockq+0*16+8]
160 mova m5, [blockq+1*16+8]
161 psrad m0, 2
162 psrad m1, 2
163 psrad m4, 2
164 psrad m5, 2
165 packssdw m0, m4
166 packssdw m1, m5
167 %else
168 packssdw m0, [blockq+0*16+8]
169 packssdw m1, [blockq+1*16+8]
170 psraw m0, 2
171 psraw m1, 2
172 %endif
173 mova m2, [blockq+2*16+0]
174 mova m3, [blockq+3*16+0]
175 %if %1 >= 12
176 mova m4, [blockq+2*16+8]
177 mova m5, [blockq+3*16+8]
178 psrad m2, 2
179 psrad m3, 2
180 psrad m4, 2
181 psrad m5, 2
182 packssdw m2, m4
183 packssdw m3, m5
184 %else
185 packssdw m2, [blockq+2*16+8]
186 packssdw m3, [blockq+3*16+8]
187 psraw m2, 2
188 psraw m3, 2
189 %endif
191 VP9_IWHT4_1D
192 TRANSPOSE4x4W 0, 1, 2, 3, 4
193 VP9_IWHT4_1D
195 pxor m6, m6
196 VP9_STORE_2X 0, 1, 4, 5, 6, 7
197 lea dstq, [dstq+strideq*2]
198 VP9_STORE_2X 2, 3, 4, 5, 6, 7
199 ZERO_BLOCK blockq, 16, 4, m6
200 RET
201 %endmacro
203 INIT_MMX mmxext
204 IWHT4_FN 10, 1023
205 INIT_MMX mmxext
206 IWHT4_FN 12, 4095
208 %macro VP9_IDCT4_WRITEOUT 0
209 %if cpuflag(ssse3)
210 mova m5, [pw_2048]
211 pmulhrsw m0, m5
212 pmulhrsw m1, m5
213 pmulhrsw m2, m5
214 pmulhrsw m3, m5
215 %else
216 mova m5, [pw_8]
217 paddw m0, m5
218 paddw m1, m5
219 paddw m2, m5
220 paddw m3, m5
221 psraw m0, 4
222 psraw m1, 4
223 psraw m2, 4
224 psraw m3, 4
225 %endif
226 mova m5, [pw_1023]
227 VP9_STORE_2X 0, 1, 6, 7, 4, 5
228 lea dstq, [dstq+2*strideq]
229 VP9_STORE_2X 2, 3, 6, 7, 4, 5
230 %endmacro
232 %macro DC_ONLY 2 ; shift, zero
233 mov coefd, dword [blockq]
234 movd [blockq], %2
235 imul coefd, 11585
236 add coefd, 8192
237 sar coefd, 14
238 imul coefd, 11585
239 add coefd, ((1 << (%1 - 1)) << 14) + 8192
240 sar coefd, 14 + %1
241 %endmacro
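; in C-like pseudocode, DC_ONLY computes (with %1 = final shift, %2 = zero reg):
;   t  = (block[0] * 11585 + 8192) >> 14;
;   dc = (t * 11585 + ((1 << (%1 - 1)) << 14) + 8192) >> (14 + %1);
; i.e. the dc coefficient is run through both idct passes (two multiplies by
; 11585, cos(pi/4) in Q14) with the output rounding folded into the final
; shift; block[0] is cleared as a side effect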
243 ; 4x4 coefficients are 5+depth+sign bits, so for 10bpp, everything still fits
244 ; in 15+1 words without additional effort, since the coefficients are 15bpp.
246 %macro IDCT4_10_FN 0
247 cglobal vp9_idct_idct_4x4_add_10, 4, 4, 8, dst, stride, block, eob
248 cmp eobd, 1
249 jg .idctfull
251 ; dc-only
252 pxor m4, m4
253 %if cpuflag(ssse3)
254 movd m0, [blockq]
255 movd [blockq], m4
256 mova m5, [pw_11585x2]
257 pmulhrsw m0, m5
258 pmulhrsw m0, m5
259 %else
260 DEFINE_ARGS dst, stride, block, coef
261 DC_ONLY 4, m4
262 movd m0, coefd
263 %endif
264 pshufw m0, m0, 0
265 mova m5, [pw_1023]
266 %if cpuflag(ssse3)
267 pmulhrsw m0, [pw_2048] ; (x*2048 + (1<<14))>>15 <=> (x+8)>>4
268 %endif
269 VP9_STORE_2X 0, 0, 6, 7, 4, 5
270 lea dstq, [dstq+2*strideq]
271 VP9_STORE_2X 0, 0, 6, 7, 4, 5
272 RET
274 .idctfull:
275 mova m0, [blockq+0*16+0]
276 mova m1, [blockq+1*16+0]
277 packssdw m0, [blockq+0*16+8]
278 packssdw m1, [blockq+1*16+8]
279 mova m2, [blockq+2*16+0]
280 mova m3, [blockq+3*16+0]
281 packssdw m2, [blockq+2*16+8]
282 packssdw m3, [blockq+3*16+8]
284 %if cpuflag(ssse3)
285 mova m6, [pw_11585x2]
286 %endif
287 mova m7, [pd_8192] ; rounding
288 VP9_IDCT4_1D
289 TRANSPOSE4x4W 0, 1, 2, 3, 4
290 VP9_IDCT4_1D
292 pxor m4, m4
293 ZERO_BLOCK blockq, 16, 4, m4
294 VP9_IDCT4_WRITEOUT
295 RET
296 %endmacro
298 INIT_MMX mmxext
299 IDCT4_10_FN
300 INIT_MMX ssse3
301 IDCT4_10_FN
303 %macro IADST4_FN 4
304 cglobal vp9_%1_%3_4x4_add_10, 3, 3, 0, dst, stride, block, eob
305 %if WIN64 && notcpuflag(ssse3)
306 INIT_XMM cpuname
307 WIN64_SPILL_XMM 8
308 INIT_MMX cpuname
309 %endif
310 movdqa xmm5, [pd_8192]
311 mova m0, [blockq+0*16+0]
312 mova m1, [blockq+1*16+0]
313 packssdw m0, [blockq+0*16+8]
314 packssdw m1, [blockq+1*16+8]
315 mova m2, [blockq+2*16+0]
316 mova m3, [blockq+3*16+0]
317 packssdw m2, [blockq+2*16+8]
318 packssdw m3, [blockq+3*16+8]
320 %if cpuflag(ssse3)
321 mova m6, [pw_11585x2]
322 %endif
323 %ifnidn %1%3, iadstiadst
324 movdq2q m7, xmm5
325 %endif
326 VP9_%2_1D
327 TRANSPOSE4x4W 0, 1, 2, 3, 4
328 VP9_%4_1D
330 pxor m4, m4
331 ZERO_BLOCK blockq, 16, 4, m4
332 VP9_IDCT4_WRITEOUT
333 RET
334 %endmacro
336 INIT_MMX sse2
337 IADST4_FN idct, IDCT4, iadst, IADST4
338 IADST4_FN iadst, IADST4, idct, IDCT4
339 IADST4_FN iadst, IADST4, iadst, IADST4
341 INIT_MMX ssse3
342 IADST4_FN idct, IDCT4, iadst, IADST4
343 IADST4_FN iadst, IADST4, idct, IDCT4
344 IADST4_FN iadst, IADST4, iadst, IADST4
346 ; inputs and outputs are dwords, coefficients are words
348 ; dst1 = src1 * coef1 + src2 * coef2 + rnd >> 14
349 ; dst2 = src1 * coef2 - src2 * coef1 + rnd >> 14
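; implementation note: each dword source is split as src = (src >> 14) * 16384
; + (src & 0x3fff), and since
;   (src1 * coef1 + src2 * coef2 + rnd) >> 14
;     == hi1 * coef1 + hi2 * coef2 + ((lo1 * coef1 + lo2 * coef2 + rnd) >> 14)
; holds exactly, the products can be formed with pmaddwd on interleaved word
; pairs instead of full 32x32-bit multiplies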
350 %macro SUMSUB_MUL 6-8 [pd_8192], [pd_3fff] ; src/dst 1-2, tmp1-2, coef1-2, rnd, mask
351 pand m%3, m%1, %8
352 pand m%4, m%2, %8
353 psrad m%1, 14
354 psrad m%2, 14
355 packssdw m%4, m%2
356 packssdw m%3, m%1
357 punpckhwd m%2, m%4, m%3
358 punpcklwd m%4, m%3
359 pmaddwd m%3, m%4, [pw_%6_%5]
360 pmaddwd m%1, m%2, [pw_%6_%5]
361 pmaddwd m%4, [pw_m%5_%6]
362 pmaddwd m%2, [pw_m%5_%6]
363 paddd m%3, %7
364 paddd m%4, %7
365 psrad m%3, 14
366 psrad m%4, 14
367 paddd m%1, m%3
368 paddd m%2, m%4
369 %endmacro
371 %macro IDCT4_12BPP_1D 0-8 [pd_8192], [pd_3fff], 0, 1, 2, 3, 4, 5 ; rnd, mask, in/out0-3, tmp0-1
372 SUMSUB_MUL %3, %5, %7, %8, 11585, 11585, %1, %2
373 SUMSUB_MUL %4, %6, %7, %8, 15137, 6270, %1, %2
374 SUMSUB_BA d, %4, %3, %7
375 SUMSUB_BA d, %6, %5, %7
376 SWAP %4, %6, %3
377 %endmacro
379 %macro STORE_4x4 6 ; tmp1-2, reg1-2, min, max
380 movh m%1, [dstq+strideq*0]
381 movh m%2, [dstq+strideq*2]
382 movhps m%1, [dstq+strideq*1]
383 movhps m%2, [dstq+stride3q ]
384 paddw m%1, m%3
385 paddw m%2, m%4
386 pmaxsw m%1, %5
387 pmaxsw m%2, %5
388 pminsw m%1, %6
389 pminsw m%2, %6
390 movh [dstq+strideq*0], m%1
391 movhps [dstq+strideq*1], m%1
392 movh [dstq+strideq*2], m%2
393 movhps [dstq+stride3q ], m%2
394 %endmacro
396 %macro ROUND_AND_STORE_4x4 8 ; reg1-4, min, max, rnd, shift
397 paddd m%1, %7
398 paddd m%2, %7
399 paddd m%3, %7
400 paddd m%4, %7
401 psrad m%1, %8
402 psrad m%2, %8
403 psrad m%3, %8
404 psrad m%4, %8
405 packssdw m%1, m%2
406 packssdw m%3, m%4
407 STORE_4x4 %2, %4, %1, %3, %5, %6
408 %endmacro
410 INIT_XMM sse2
411 cglobal vp9_idct_idct_4x4_add_12, 4, 4, 8, dst, stride, block, eob
412 cmp eobd, 1
413 jg .idctfull
415 ; dc-only - this is special, since for 4x4 12bpp, the max coef size is
416 ; 17+sign bpp. Since the multiply is with 11585, which is 14bpp, the
417 ; result of each multiply is 31+sign bit, i.e. it _exactly_ fits in a
418 ; dword. After the final shift (4), the result is 13+sign bits, so we
419 ; don't need any additional processing to fit it in a word
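; (worked bound: |coef| < 1 << 17 and 11585 < 1 << 14, so e.g.
; 131072 * 11585 = 1518469120, which is below 2^31 - 1 = 2147483647)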
420 DEFINE_ARGS dst, stride, block, coef
421 pxor m4, m4
422 DC_ONLY 4, m4
423 movd m0, coefd
424 pshuflw m0, m0, q0000
425 punpcklqdq m0, m0
426 mova m5, [pw_4095]
427 DEFINE_ARGS dst, stride, stride3
428 lea stride3q, [strideq*3]
429 STORE_4x4 1, 3, 0, 0, m4, m5
430 RET
432 .idctfull:
433 DEFINE_ARGS dst, stride, block, eob
434 mova m0, [blockq+0*16]
435 mova m1, [blockq+1*16]
436 mova m2, [blockq+2*16]
437 mova m3, [blockq+3*16]
438 mova m6, [pd_8192]
439 mova m7, [pd_3fff]
441 IDCT4_12BPP_1D m6, m7
442 TRANSPOSE4x4D 0, 1, 2, 3, 4
443 IDCT4_12BPP_1D m6, m7
445 pxor m4, m4
446 ZERO_BLOCK blockq, 16, 4, m4
448 ; writeout
449 DEFINE_ARGS dst, stride, stride3
450 lea stride3q, [strideq*3]
451 mova m5, [pw_4095]
452 mova m6, [pd_8]
453 ROUND_AND_STORE_4x4 0, 1, 2, 3, m4, m5, m6, 4
454 RET
456 %macro SCRATCH 3-4
457 %if ARCH_X86_64
458 SWAP %1, %2
459 %if %0 == 4
460 %define reg_%4 m%2
461 %endif
462 %else
463 mova [%3], m%1
464 %if %0 == 4
465 %define reg_%4 [%3]
466 %endif
467 %endif
468 %endmacro
470 %macro UNSCRATCH 3-4
471 %if ARCH_X86_64
472 SWAP %1, %2
473 %else
474 mova m%1, [%3]
475 %endif
476 %if %0 == 4
477 %undef reg_%4
478 %endif
479 %endmacro
481 %macro PRELOAD 2-3
482 %if ARCH_X86_64
483 mova m%1, [%2]
484 %if %0 == 3
485 %define reg_%3 m%1
486 %endif
487 %elif %0 == 3
488 %define reg_%3 [%2]
489 %endif
490 %endmacro
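; SCRATCH/UNSCRATCH/PRELOAD hide the register-count difference between the two
; ABIs: on x86-64 a value is kept in (or loaded into) one of the high registers
; m8-m15, on x86-32 it is spilled to (or read from) the given stack slot or
; memory operand instead; the optional name argument defines reg_<name>, which
; expands to whichever of the two locations is in use, so the code below can
; use a single spelling for both cases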
492 ; out0 = 5283 * in0 + 13377 * in1 + 15212 * in2 + 9929 * in3 + rnd >> 14
493 ; out1 = 9929 * in0 + 13377 * in1 - 5283 * in2 - 15212 * in3 + rnd >> 14
494 ; out2 = 13377 * in0 - 13377 * in2 + 13377 * in3 + rnd >> 14
495 ; out3 = 15212 * in0 - 13377 * in1 + 9929 * in2 - 5283 * in3 + rnd >> 14
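; (the ">> 14" applies to the whole rounded sum; e.g. with in0 = 1024 and
; in1 = in2 = in3 = 0, out2 = (13377 * 1024 + 8192) >> 14 = 836)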
496 %macro IADST4_12BPP_1D 0-2 [pd_8192], [pd_3fff] ; rnd, mask
497 pand m4, m0, %2
498 pand m5, m1, %2
499 psrad m0, 14
500 psrad m1, 14
501 packssdw m5, m1
502 packssdw m4, m0
503 punpckhwd m1, m4, m5
504 punpcklwd m4, m5
505 pand m5, m2, %2
506 pand m6, m3, %2
507 psrad m2, 14
508 psrad m3, 14
509 packssdw m6, m3
510 packssdw m5, m2
511 punpckhwd m3, m5, m6
512 punpcklwd m5, m6
513 SCRATCH 1, 8, rsp+0*mmsize, a
514 SCRATCH 5, 9, rsp+1*mmsize, b
516 ; m1/3 have the high bits of 0,1,2,3
517 ; m4/5 have the low bits of 0,1,2,3
518 ; m0/2/6/7 are free
520 mova m2, [pw_15212_9929]
521 mova m0, [pw_5283_13377]
522 pmaddwd m7, m2, reg_b
523 pmaddwd m6, m4, m0
524 pmaddwd m2, m3
525 pmaddwd m0, reg_a
526 paddd m6, m7
527 paddd m0, m2
528 mova m1, [pw_m13377_13377]
529 mova m5, [pw_13377_0]
530 pmaddwd m7, m1, reg_b
531 pmaddwd m2, m4, m5
532 pmaddwd m1, m3
533 pmaddwd m5, reg_a
534 paddd m2, m7
535 paddd m1, m5
536 paddd m6, %1
537 paddd m2, %1
538 psrad m6, 14
539 psrad m2, 14
540 paddd m0, m6 ; t0
541 paddd m2, m1 ; t2
543 mova m7, [pw_m5283_m15212]
544 mova m5, [pw_9929_13377]
545 pmaddwd m1, m7, reg_b
546 pmaddwd m6, m4, m5
547 pmaddwd m7, m3
548 pmaddwd m5, reg_a
549 paddd m6, m1
550 paddd m7, m5
551 UNSCRATCH 5, 9, rsp+1*mmsize, b
552 pmaddwd m5, [pw_9929_m5283]
553 pmaddwd m4, [pw_15212_m13377]
554 pmaddwd m3, [pw_9929_m5283]
555 UNSCRATCH 1, 8, rsp+0*mmsize, a
556 pmaddwd m1, [pw_15212_m13377]
557 paddd m4, m5
558 paddd m3, m1
559 paddd m6, %1
560 paddd m4, %1
561 psrad m6, 14
562 psrad m4, 14
563 paddd m7, m6 ; t1
564 paddd m3, m4 ; t3
566 SWAP 1, 7
567 %endmacro
569 %macro IADST4_12BPP_FN 4
570 cglobal vp9_%1_%3_4x4_add_12, 3, 3, 12, 2 * ARCH_X86_32 * mmsize, dst, stride, block, eob
571 mova m0, [blockq+0*16]
572 mova m1, [blockq+1*16]
573 mova m2, [blockq+2*16]
574 mova m3, [blockq+3*16]
576 PRELOAD 10, pd_8192, rnd
577 PRELOAD 11, pd_3fff, mask
578 %2_12BPP_1D reg_rnd, reg_mask
579 TRANSPOSE4x4D 0, 1, 2, 3, 4
580 %4_12BPP_1D reg_rnd, reg_mask
582 pxor m4, m4
583 ZERO_BLOCK blockq, 16, 4, m4
585 ; writeout
586 DEFINE_ARGS dst, stride, stride3
587 lea stride3q, [strideq*3]
588 mova m5, [pw_4095]
589 mova m6, [pd_8]
590 ROUND_AND_STORE_4x4 0, 1, 2, 3, m4, m5, m6, 4
591 RET
592 %endmacro
594 INIT_XMM sse2
595 IADST4_12BPP_FN idct, IDCT4, iadst, IADST4
596 IADST4_12BPP_FN iadst, IADST4, idct, IDCT4
597 IADST4_12BPP_FN iadst, IADST4, iadst, IADST4
599 ; the following line has not been executed at the end of this macro:
600 ; UNSCRATCH 6, 8, rsp+(%5+0)*mmsize
601 %macro IDCT8_1D 1-5 [pd_8192], [pd_3fff], 2 * mmsize, 17 ; src, rnd, mask, src_stride, stack_offset
602 mova m0, [%1+0*%4]
603 mova m2, [%1+2*%4]
604 mova m4, [%1+4*%4]
605 mova m6, [%1+6*%4]
606 IDCT4_12BPP_1D %2, %3, 0, 2, 4, 6, 1, 3 ; m0/2/4/6 have t0/1/2/3
607 SCRATCH 4, 8, rsp+(%5+0)*mmsize
608 SCRATCH 6, 9, rsp+(%5+1)*mmsize
609 mova m1, [%1+1*%4]
610 mova m3, [%1+3*%4]
611 mova m5, [%1+5*%4]
612 mova m7, [%1+7*%4]
613 SUMSUB_MUL 1, 7, 4, 6, 16069, 3196, %2, %3 ; m1=t7a, m7=t4a
614 SUMSUB_MUL 5, 3, 4, 6, 9102, 13623, %2, %3 ; m5=t6a, m3=t5a
615 SUMSUB_BA d, 3, 7, 4 ; m3=t4, m7=t5a
616 SUMSUB_BA d, 5, 1, 4 ; m5=t7, m1=t6a
617 SUMSUB_MUL 1, 7, 4, 6, 11585, 11585, %2, %3 ; m1=t6, m7=t5
618 SUMSUB_BA d, 5, 0, 4 ; m5=out0, m0=out7
619 SUMSUB_BA d, 1, 2, 4 ; m1=out1, m2=out6
620 UNSCRATCH 4, 8, rsp+(%5+0)*mmsize
621 UNSCRATCH 6, 9, rsp+(%5+1)*mmsize
622 SCRATCH 2, 8, rsp+(%5+0)*mmsize
623 SUMSUB_BA d, 7, 4, 2 ; m7=out2, m4=out5
624 SUMSUB_BA d, 3, 6, 2 ; m3=out3, m6=out4
625 SWAP 0, 5, 4, 6, 2, 7
626 %endmacro
628 %macro STORE_2x8 5-7 dstq, strideq ; tmp1-2, reg, min, max
629 mova m%1, [%6+%7*0]
630 mova m%2, [%6+%7*1]
631 paddw m%1, m%3
632 paddw m%2, m%3
633 pmaxsw m%1, %4
634 pmaxsw m%2, %4
635 pminsw m%1, %5
636 pminsw m%2, %5
637 mova [%6+%7*0], m%1
638 mova [%6+%7*1], m%2
639 %endmacro
641 ; FIXME we can use the intermediate storage (rsp[0-15]) on x86-32 for temp
642 ; storage also instead of allocating two more stack spaces. This doesn't
643 ; matter much but it's something...
644 INIT_XMM sse2
645 cglobal vp9_idct_idct_8x8_add_10, 4, 6 + ARCH_X86_64, 14, \
646 16 * mmsize + 3 * ARCH_X86_32 * mmsize, \
647 dst, stride, block, eob
648 mova m0, [pw_1023]
649 cmp eobd, 1
650 jg .idctfull
652 ; dc-only - the 10bit version can be done entirely in 32bit, since the max
653 ; coef values are 16+sign bit, and the coef is 14bit, so 30+sign easily
654 ; fits in 32bit
655 DEFINE_ARGS dst, stride, block, coef
656 pxor m2, m2
657 DC_ONLY 5, m2
658 movd m1, coefd
659 pshuflw m1, m1, q0000
660 punpcklqdq m1, m1
661 DEFINE_ARGS dst, stride, cnt
662 mov cntd, 4
663 .loop_dc:
664 STORE_2x8 3, 4, 1, m2, m0
665 lea dstq, [dstq+strideq*2]
666 dec cntd
667 jg .loop_dc
668 RET
670 .idctfull:
671 SCRATCH 0, 12, rsp+16*mmsize, max
672 DEFINE_ARGS dst, stride, block, cnt, ptr, skip, dstbak
673 %if ARCH_X86_64
674 mov dstbakq, dstq
675 movsxd cntq, cntd
676 %endif
677 %if PIC
678 lea ptrq, [default_8x8]
679 movzx cntd, byte [ptrq+cntq-1]
680 %else
681 movzx cntd, byte [default_8x8+cntq-1]
682 %endif
683 mov skipd, 2
684 sub skipd, cntd
685 mov ptrq, rsp
686 PRELOAD 10, pd_8192, rnd
687 PRELOAD 11, pd_3fff, mask
688 PRELOAD 13, pd_16, srnd
689 .loop_1:
690 IDCT8_1D blockq, reg_rnd, reg_mask
692 TRANSPOSE4x4D 0, 1, 2, 3, 6
693 mova [ptrq+ 0*mmsize], m0
694 mova [ptrq+ 2*mmsize], m1
695 mova [ptrq+ 4*mmsize], m2
696 mova [ptrq+ 6*mmsize], m3
697 UNSCRATCH 6, 8, rsp+17*mmsize
698 TRANSPOSE4x4D 4, 5, 6, 7, 0
699 mova [ptrq+ 1*mmsize], m4
700 mova [ptrq+ 3*mmsize], m5
701 mova [ptrq+ 5*mmsize], m6
702 mova [ptrq+ 7*mmsize], m7
703 add ptrq, 8 * mmsize
704 add blockq, mmsize
705 dec cntd
706 jg .loop_1
708 ; zero-pad the remainder (skipped cols)
709 test skipd, skipd
710 jz .end
711 add skipd, skipd
712 lea blockq, [blockq+skipq*(mmsize/2)]
713 pxor m0, m0
714 .loop_z:
715 mova [ptrq+mmsize*0], m0
716 mova [ptrq+mmsize*1], m0
717 mova [ptrq+mmsize*2], m0
718 mova [ptrq+mmsize*3], m0
719 add ptrq, 4 * mmsize
720 dec skipd
721 jg .loop_z
722 .end:
724 DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak
725 lea stride3q, [strideq*3]
726 mov cntd, 2
727 mov ptrq, rsp
728 .loop_2:
729 IDCT8_1D ptrq, reg_rnd, reg_mask
731 pxor m6, m6
732 ROUND_AND_STORE_4x4 0, 1, 2, 3, m6, reg_max, reg_srnd, 5
733 lea dstq, [dstq+strideq*4]
734 UNSCRATCH 0, 8, rsp+17*mmsize
735 UNSCRATCH 1, 12, rsp+16*mmsize, max
736 UNSCRATCH 2, 13, pd_16, srnd
737 ROUND_AND_STORE_4x4 4, 5, 0, 7, m6, m1, m2, 5
738 add ptrq, 16
739 %if ARCH_X86_64
740 lea dstq, [dstbakq+8]
741 %else
742 mov dstq, dstm
743 add dstq, 8
744 %endif
745 dec cntd
746 jg .loop_2
748 ; m6 is still zero
749 ZERO_BLOCK blockq-2*mmsize, 32, 8, m6
750 RET
752 %macro DC_ONLY_64BIT 2 ; shift, zero
753 %if ARCH_X86_64
754 movsxd coefq, dword [blockq]
755 movd [blockq], %2
756 imul coefq, 11585
757 add coefq, 8192
758 sar coefq, 14
759 imul coefq, 11585
760 add coefq, ((1 << (%1 - 1)) << 14) + 8192
761 sar coefq, 14 + %1
762 %else
763 mov coefd, dword [blockq]
764 movd [blockq], %2
765 DEFINE_ARGS dst, stride, cnt, coef, coefl
766 mov cntd, 2
767 .loop_dc_calc:
768 mov coefld, coefd
769 sar coefd, 14
770 and coefld, 0x3fff
771 imul coefd, 11585
772 imul coefld, 11585
773 add coefld, 8192
774 sar coefld, 14
775 add coefd, coefld
776 dec cntd
777 jg .loop_dc_calc
778 add coefd, 1 << (%1 - 1)
779 sar coefd, %1
780 %endif
781 %endmacro
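; the x86-32 fallback reuses the SUMSUB_MUL split: per loop iteration,
;   coef = (coef >> 14) * 11585 + (((coef & 0x3fff) * 11585 + 8192) >> 14)
; so each stage stays within 32 bits even for the 18-19+sign-bit coefficients
; seen at 12bpp; the loop runs twice (once per 11585 multiply), after which the
; final rounding shift is applied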
783 INIT_XMM sse2
784 cglobal vp9_idct_idct_8x8_add_12, 4, 6 + ARCH_X86_64, 14, \
785 16 * mmsize + 3 * ARCH_X86_32 * mmsize, \
786 dst, stride, block, eob
787 mova m0, [pw_4095]
788 cmp eobd, 1
789 jg mangle(private_prefix %+ _ %+ vp9_idct_idct_8x8_add_10 %+ SUFFIX).idctfull
791 ; dc-only - unfortunately, this one can overflow, since coefs are 18+sign
792 ; bpp, and 18+14+sign does not fit in 32bit, so we do 2-stage multiplies
793 DEFINE_ARGS dst, stride, block, coef, coefl
794 pxor m2, m2
795 DC_ONLY_64BIT 5, m2
796 movd m1, coefd
797 pshuflw m1, m1, q0000
798 punpcklqdq m1, m1
799 DEFINE_ARGS dst, stride, cnt
800 mov cntd, 4
801 .loop_dc:
802 STORE_2x8 3, 4, 1, m2, m0
803 lea dstq, [dstq+strideq*2]
804 dec cntd
805 jg .loop_dc
806 RET
808 ; inputs and outputs are dwords, coefficients are words
810 ; dst1[hi]:dst3[lo] = src1 * coef1 + src2 * coef2
811 ; dst2[hi]:dst4[lo] = src1 * coef2 - src2 * coef1
812 %macro SUMSUB_MUL_D 6-7 [pd_3fff] ; src/dst 1-2, dst3-4, coef1-2, mask
813 pand m%3, m%1, %7
814 pand m%4, m%2, %7
815 psrad m%1, 14
816 psrad m%2, 14
817 packssdw m%4, m%2
818 packssdw m%3, m%1
819 punpckhwd m%2, m%4, m%3
820 punpcklwd m%4, m%3
821 pmaddwd m%3, m%4, [pw_%6_%5]
822 pmaddwd m%1, m%2, [pw_%6_%5]
823 pmaddwd m%4, [pw_m%5_%6]
824 pmaddwd m%2, [pw_m%5_%6]
825 %endmacro
827 ; dst1 = src2[hi]:src4[lo] + src1[hi]:src3[lo] + rnd >> 14
828 ; dst2 = src2[hi]:src4[lo] - src1[hi]:src3[lo] + rnd >> 14
829 %macro SUMSUB_PACK_D 5-6 [pd_8192] ; src/dst 1-2, src3-4, tmp, rnd
830 SUMSUB_BA d, %1, %2, %5
831 SUMSUB_BA d, %3, %4, %5
832 paddd m%3, %6
833 paddd m%4, %6
834 psrad m%3, 14
835 psrad m%4, 14
836 paddd m%1, m%3
837 paddd m%2, m%4
838 %endmacro
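; SUMSUB_MUL_D leaves the high-word and low-word partial products unrounded;
; SUMSUB_PACK_D then forms the sum/difference of both halves and only rounds,
; shifts and re-adds the low halves, so the two macros together give the same
; full-precision result as SUMSUB_MUL, but with a butterfly folded in between
; the multiply and the rounding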
840 %macro NEGD 1
841 %if cpuflag(ssse3)
842 psignd %1, [pw_m1]
843 %else
844 pxor %1, [pw_m1]
845 paddd %1, [pd_1]
846 %endif
847 %endmacro
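; NEGD negates packed dwords: with SSSE3, psignd by -1 does it directly;
; otherwise it is done as a two's complement negate (invert, then add 1);
; pw_m1 can be reused since a vector of word -1s reads the same as dword -1s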
849 ; the following line has not been executed at the end of this macro:
850 ; UNSCRATCH 6, 8, rsp+17*mmsize
851 %macro IADST8_1D 1-3 [pd_8192], [pd_3fff] ; src, rnd, mask
852 mova m0, [%1+ 0*mmsize]
853 mova m3, [%1+ 6*mmsize]
854 mova m4, [%1+ 8*mmsize]
855 mova m7, [%1+14*mmsize]
856 SUMSUB_MUL_D 7, 0, 1, 2, 16305, 1606, %3 ; m7/1=t0a, m0/2=t1a
857 SUMSUB_MUL_D 3, 4, 5, 6, 10394, 12665, %3 ; m3/5=t4a, m4/6=t5a
858 SCRATCH 0, 8, rsp+17*mmsize
859 SUMSUB_PACK_D 3, 7, 5, 1, 0, %2 ; m3=t0, m7=t4
860 UNSCRATCH 0, 8, rsp+17*mmsize
861 SUMSUB_PACK_D 4, 0, 6, 2, 1, %2 ; m4=t1, m0=t5
863 SCRATCH 3, 8, rsp+17*mmsize
864 SCRATCH 4, 9, rsp+18*mmsize
865 SCRATCH 7, 10, rsp+19*mmsize
866 SCRATCH 0, 11, rsp+20*mmsize
868 mova m1, [%1+ 2*mmsize]
869 mova m2, [%1+ 4*mmsize]
870 mova m5, [%1+10*mmsize]
871 mova m6, [%1+12*mmsize]
872 SUMSUB_MUL_D 5, 2, 3, 4, 14449, 7723, %3 ; m5/8=t2a, m2/9=t3a
873 SUMSUB_MUL_D 1, 6, 7, 0, 4756, 15679, %3 ; m1/10=t6a, m6/11=t7a
874 SCRATCH 2, 12, rsp+21*mmsize
875 SUMSUB_PACK_D 1, 5, 7, 3, 2, %2 ; m1=t2, m5=t6
876 UNSCRATCH 2, 12, rsp+21*mmsize
877 SUMSUB_PACK_D 6, 2, 0, 4, 3, %2 ; m6=t3, m2=t7
879 UNSCRATCH 7, 10, rsp+19*mmsize
880 UNSCRATCH 0, 11, rsp+20*mmsize
881 SCRATCH 1, 10, rsp+19*mmsize
882 SCRATCH 6, 11, rsp+20*mmsize
884 SUMSUB_MUL_D 7, 0, 3, 4, 15137, 6270, %3 ; m7/8=t4a, m0/9=t5a
885 SUMSUB_MUL_D 2, 5, 1, 6, 6270, 15137, %3 ; m2/10=t7a, m5/11=t6a
886 SCRATCH 2, 12, rsp+21*mmsize
887 SUMSUB_PACK_D 5, 7, 6, 3, 2, %2 ; m5=-out1, m7=t6
888 UNSCRATCH 2, 12, rsp+21*mmsize
889 NEGD m5 ; m5=out1
890 SUMSUB_PACK_D 2, 0, 1, 4, 3, %2 ; m2=out6, m0=t7
891 SUMSUB_MUL 7, 0, 3, 4, 11585, 11585, %2, %3 ; m7=out2, m0=-out5
892 NEGD m0 ; m0=out5
894 UNSCRATCH 3, 8, rsp+17*mmsize
895 UNSCRATCH 4, 9, rsp+18*mmsize
896 UNSCRATCH 1, 10, rsp+19*mmsize
897 UNSCRATCH 6, 11, rsp+20*mmsize
898 SCRATCH 2, 8, rsp+17*mmsize
899 SCRATCH 0, 9, rsp+18*mmsize
901 SUMSUB_BA d, 1, 3, 2 ; m1=out0, m3=t2
902 SUMSUB_BA d, 6, 4, 2 ; m6=-out7, m4=t3
903 NEGD m6 ; m6=out7
904 SUMSUB_MUL 3, 4, 2, 0, 11585, 11585, %2, %3 ; m3=-out3, m4=out4
905 NEGD m3 ; m3=out3
907 UNSCRATCH 0, 9, rsp+18*mmsize
909 SWAP 0, 1, 5
910 SWAP 2, 7, 6
911 %endmacro
913 %macro IADST8_FN 5
914 cglobal vp9_%1_%3_8x8_add_10, 4, 6 + ARCH_X86_64, 16, \
915 16 * mmsize + ARCH_X86_32 * 6 * mmsize, \
916 dst, stride, block, eob
917 mova m0, [pw_1023]
919 .body:
920 SCRATCH 0, 13, rsp+16*mmsize, max
921 DEFINE_ARGS dst, stride, block, cnt, ptr, skip, dstbak
922 %if ARCH_X86_64
923 mov dstbakq, dstq
924 movsxd cntq, cntd
925 %endif
926 %if PIC
927 lea ptrq, [%5_8x8]
928 movzx cntd, byte [ptrq+cntq-1]
929 %else
930 movzx cntd, byte [%5_8x8+cntq-1]
931 %endif
932 mov skipd, 2
933 sub skipd, cntd
934 mov ptrq, rsp
935 PRELOAD 14, pd_8192, rnd
936 PRELOAD 15, pd_3fff, mask
937 .loop_1:
938 %2_1D blockq, reg_rnd, reg_mask
940 TRANSPOSE4x4D 0, 1, 2, 3, 6
941 mova [ptrq+ 0*mmsize], m0
942 mova [ptrq+ 2*mmsize], m1
943 mova [ptrq+ 4*mmsize], m2
944 mova [ptrq+ 6*mmsize], m3
945 UNSCRATCH 6, 8, rsp+17*mmsize
946 TRANSPOSE4x4D 4, 5, 6, 7, 0
947 mova [ptrq+ 1*mmsize], m4
948 mova [ptrq+ 3*mmsize], m5
949 mova [ptrq+ 5*mmsize], m6
950 mova [ptrq+ 7*mmsize], m7
951 add ptrq, 8 * mmsize
952 add blockq, mmsize
953 dec cntd
954 jg .loop_1
956 ; zero-pad the remainder (skipped cols)
957 test skipd, skipd
958 jz .end
959 add skipd, skipd
960 lea blockq, [blockq+skipq*(mmsize/2)]
961 pxor m0, m0
962 .loop_z:
963 mova [ptrq+mmsize*0], m0
964 mova [ptrq+mmsize*1], m0
965 mova [ptrq+mmsize*2], m0
966 mova [ptrq+mmsize*3], m0
967 add ptrq, 4 * mmsize
968 dec skipd
969 jg .loop_z
970 .end:
972 DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak
973 lea stride3q, [strideq*3]
974 mov cntd, 2
975 mov ptrq, rsp
976 .loop_2:
977 %4_1D ptrq, reg_rnd, reg_mask
979 pxor m6, m6
980 PRELOAD 9, pd_16, srnd
981 ROUND_AND_STORE_4x4 0, 1, 2, 3, m6, reg_max, reg_srnd, 5
982 lea dstq, [dstq+strideq*4]
983 UNSCRATCH 0, 8, rsp+17*mmsize
984 UNSCRATCH 1, 13, rsp+16*mmsize, max
985 UNSCRATCH 2, 9, pd_16, srnd
986 ROUND_AND_STORE_4x4 4, 5, 0, 7, m6, m1, m2, 5
987 add ptrq, 16
988 %if ARCH_X86_64
989 lea dstq, [dstbakq+8]
990 %else
991 mov dstq, dstm
992 add dstq, 8
993 %endif
994 dec cntd
995 jg .loop_2
997 ; m6 is still zero
998 ZERO_BLOCK blockq-2*mmsize, 32, 8, m6
999 RET
1001 cglobal vp9_%1_%3_8x8_add_12, 4, 6 + ARCH_X86_64, 16, \
1002 16 * mmsize + ARCH_X86_32 * 6 * mmsize, \
1003 dst, stride, block, eob
1004 mova m0, [pw_4095]
1005 jmp mangle(private_prefix %+ _ %+ vp9_%1_%3_8x8_add_10 %+ SUFFIX).body
1006 %endmacro
1008 INIT_XMM sse2
1009 IADST8_FN idct, IDCT8, iadst, IADST8, row
1010 IADST8_FN iadst, IADST8, idct, IDCT8, col
1011 IADST8_FN iadst, IADST8, iadst, IADST8, default
1013 %macro IDCT16_1D 1-4 4 * mmsize, 65, 67 ; src, src_stride, stack_offset, mm32bit_stack_offset
1014 IDCT8_1D %1, [pd_8192], [pd_3fff], %2 * 2, %4 ; m0-3=t0-3a, m4-5/m8|r67/m7=t4-7
1015 ; SCRATCH 6, 8, rsp+(%4+0)*mmsize ; t6
1016 SCRATCH 0, 15, rsp+(%4+7)*mmsize ; t0a
1017 SCRATCH 1, 14, rsp+(%4+6)*mmsize ; t1a
1018 SCRATCH 2, 13, rsp+(%4+5)*mmsize ; t2a
1019 SCRATCH 3, 12, rsp+(%4+4)*mmsize ; t3a
1020 SCRATCH 4, 11, rsp+(%4+3)*mmsize ; t4
1021 mova [rsp+(%3+0)*mmsize], m5 ; t5
1022 mova [rsp+(%3+1)*mmsize], m7 ; t7
1024 mova m0, [%1+ 1*%2] ; in1
1025 mova m3, [%1+ 7*%2] ; in7
1026 mova m4, [%1+ 9*%2] ; in9
1027 mova m7, [%1+15*%2] ; in15
1029 SUMSUB_MUL 0, 7, 1, 2, 16305, 1606 ; m0=t15a, m7=t8a
1030 SUMSUB_MUL 4, 3, 1, 2, 10394, 12665 ; m4=t14a, m3=t9a
1031 SUMSUB_BA d, 3, 7, 1 ; m3=t8, m7=t9
1032 SUMSUB_BA d, 4, 0, 1 ; m4=t15,m0=t14
1033 SUMSUB_MUL 0, 7, 1, 2, 15137, 6270 ; m0=t14a, m7=t9a
1035 mova m1, [%1+ 3*%2] ; in3
1036 mova m2, [%1+ 5*%2] ; in5
1037 mova m5, [%1+11*%2] ; in11
1038 mova m6, [%1+13*%2] ; in13
1040 SCRATCH 0, 9, rsp+(%4+1)*mmsize
1041 SCRATCH 7, 10, rsp+(%4+2)*mmsize
1043 SUMSUB_MUL 2, 5, 0, 7, 14449, 7723 ; m2=t13a, m5=t10a
1044 SUMSUB_MUL 6, 1, 0, 7, 4756, 15679 ; m6=t12a, m1=t11a
1045 SUMSUB_BA d, 5, 1, 0 ; m5=t11,m1=t10
1046 SUMSUB_BA d, 2, 6, 0 ; m2=t12,m6=t13
1047 NEGD m1 ; m1=-t10
1048 SUMSUB_MUL 1, 6, 0, 7, 15137, 6270 ; m1=t13a, m6=t10a
1050 UNSCRATCH 7, 10, rsp+(%4+2)*mmsize
1051 SUMSUB_BA d, 5, 3, 0 ; m5=t8a, m3=t11a
1052 SUMSUB_BA d, 6, 7, 0 ; m6=t9, m7=t10
1053 SUMSUB_BA d, 2, 4, 0 ; m2=t15a,m4=t12a
1054 SCRATCH 5, 10, rsp+(%4+2)*mmsize
1055 SUMSUB_MUL 4, 3, 0, 5, 11585, 11585 ; m4=t12, m3=t11
1056 UNSCRATCH 0, 9, rsp+(%4+1)*mmsize
1057 SUMSUB_BA d, 1, 0, 5 ; m1=t14, m0=t13
1058 SCRATCH 6, 9, rsp+(%4+1)*mmsize
1059 SUMSUB_MUL 0, 7, 6, 5, 11585, 11585 ; m0=t13a,m7=t10a
1061 ; order: 15|r74,14|r73,13|r72,12|r71,11|r70,r65,8|r67,r66,10|r69,9|r68,7,3,4,0,1,2
1062 ; free: 6,5
1064 UNSCRATCH 5, 15, rsp+(%4+7)*mmsize
1065 SUMSUB_BA d, 2, 5, 6 ; m2=out0, m5=out15
1066 SCRATCH 5, 15, rsp+(%4+7)*mmsize
1067 UNSCRATCH 5, 14, rsp+(%4+6)*mmsize
1068 SUMSUB_BA d, 1, 5, 6 ; m1=out1, m5=out14
1069 SCRATCH 5, 14, rsp+(%4+6)*mmsize
1070 UNSCRATCH 5, 13, rsp+(%4+5)*mmsize
1071 SUMSUB_BA d, 0, 5, 6 ; m0=out2, m5=out13
1072 SCRATCH 5, 13, rsp+(%4+5)*mmsize
1073 UNSCRATCH 5, 12, rsp+(%4+4)*mmsize
1074 SUMSUB_BA d, 4, 5, 6 ; m4=out3, m5=out12
1075 SCRATCH 5, 12, rsp+(%4+4)*mmsize
1076 UNSCRATCH 5, 11, rsp+(%4+3)*mmsize
1077 SUMSUB_BA d, 3, 5, 6 ; m3=out4, m5=out11
1078 SCRATCH 4, 11, rsp+(%4+3)*mmsize
1079 mova m4, [rsp+(%3+0)*mmsize]
1080 SUMSUB_BA d, 7, 4, 6 ; m7=out5, m4=out10
1081 mova [rsp+(%3+0)*mmsize], m5
1082 UNSCRATCH 5, 8, rsp+(%4+0)*mmsize
1083 UNSCRATCH 6, 9, rsp+(%4+1)*mmsize
1084 SCRATCH 2, 8, rsp+(%4+0)*mmsize
1085 SCRATCH 1, 9, rsp+(%4+1)*mmsize
1086 UNSCRATCH 1, 10, rsp+(%4+2)*mmsize
1087 SCRATCH 0, 10, rsp+(%4+2)*mmsize
1088 mova m0, [rsp+(%3+1)*mmsize]
1089 SUMSUB_BA d, 6, 5, 2 ; m6=out6, m5=out9
1090 SUMSUB_BA d, 1, 0, 2 ; m1=out7, m0=out8
1092 SWAP 0, 3, 1, 7, 2, 6, 4
1094 ; output order: 8-11|r67-70=out0-3
1095 ; 0-6,r65=out4-11
1096 ; 12-15|r71-74=out12-15
1097 %endmacro
1099 INIT_XMM sse2
1100 cglobal vp9_idct_idct_16x16_add_10, 4, 6 + ARCH_X86_64, 16, \
1101 67 * mmsize + ARCH_X86_32 * 8 * mmsize, \
1102 dst, stride, block, eob
1103 mova m0, [pw_1023]
1104 cmp eobd, 1
1105 jg .idctfull
1107 ; dc-only - the 10bit version can be done entirely in 32bit, since the max
1108 ; coef values are 17+sign bit, and the coef is 14bit, so 31+sign easily
1109 ; fits in 32bit
1110 DEFINE_ARGS dst, stride, block, coef
1111 pxor m2, m2
1112 DC_ONLY 6, m2
1113 movd m1, coefd
1114 pshuflw m1, m1, q0000
1115 punpcklqdq m1, m1
1116 DEFINE_ARGS dst, stride, cnt
1117 mov cntd, 8
1118 .loop_dc:
1119 STORE_2x8 3, 4, 1, m2, m0, dstq, mmsize
1120 STORE_2x8 3, 4, 1, m2, m0, dstq+strideq, mmsize
1121 lea dstq, [dstq+strideq*2]
1122 dec cntd
1123 jg .loop_dc
1124 RET
1126 .idctfull:
1127 mova [rsp+64*mmsize], m0
1128 DEFINE_ARGS dst, stride, block, cnt, ptr, skip, dstbak
1129 %if ARCH_X86_64
1130 mov dstbakq, dstq
1131 movsxd cntq, cntd
1132 %endif
1133 %if PIC
1134 lea ptrq, [default_16x16]
1135 movzx cntd, byte [ptrq+cntq-1]
1136 %else
1137 movzx cntd, byte [default_16x16+cntq-1]
1138 %endif
1139 mov skipd, 4
1140 sub skipd, cntd
1141 mov ptrq, rsp
1142 .loop_1:
1143 IDCT16_1D blockq
1145 TRANSPOSE4x4D 0, 1, 2, 3, 7
1146 mova [ptrq+ 1*mmsize], m0
1147 mova [ptrq+ 5*mmsize], m1
1148 mova [ptrq+ 9*mmsize], m2
1149 mova [ptrq+13*mmsize], m3
1150 mova m7, [rsp+65*mmsize]
1151 TRANSPOSE4x4D 4, 5, 6, 7, 0
1152 mova [ptrq+ 2*mmsize], m4
1153 mova [ptrq+ 6*mmsize], m5
1154 mova [ptrq+10*mmsize], m6
1155 mova [ptrq+14*mmsize], m7
1156 UNSCRATCH 0, 8, rsp+67*mmsize
1157 UNSCRATCH 1, 9, rsp+68*mmsize
1158 UNSCRATCH 2, 10, rsp+69*mmsize
1159 UNSCRATCH 3, 11, rsp+70*mmsize
1160 TRANSPOSE4x4D 0, 1, 2, 3, 7
1161 mova [ptrq+ 0*mmsize], m0
1162 mova [ptrq+ 4*mmsize], m1
1163 mova [ptrq+ 8*mmsize], m2
1164 mova [ptrq+12*mmsize], m3
1165 UNSCRATCH 4, 12, rsp+71*mmsize
1166 UNSCRATCH 5, 13, rsp+72*mmsize
1167 UNSCRATCH 6, 14, rsp+73*mmsize
1168 UNSCRATCH 7, 15, rsp+74*mmsize
1169 TRANSPOSE4x4D 4, 5, 6, 7, 0
1170 mova [ptrq+ 3*mmsize], m4
1171 mova [ptrq+ 7*mmsize], m5
1172 mova [ptrq+11*mmsize], m6
1173 mova [ptrq+15*mmsize], m7
1174 add ptrq, 16 * mmsize
1175 add blockq, mmsize
1176 dec cntd
1177 jg .loop_1
1179 ; zero-pad the remainder (skipped cols)
1180 test skipd, skipd
1181 jz .end
1182 add skipd, skipd
1183 lea blockq, [blockq+skipq*(mmsize/2)]
1184 pxor m0, m0
1185 .loop_z:
1186 mova [ptrq+mmsize*0], m0
1187 mova [ptrq+mmsize*1], m0
1188 mova [ptrq+mmsize*2], m0
1189 mova [ptrq+mmsize*3], m0
1190 mova [ptrq+mmsize*4], m0
1191 mova [ptrq+mmsize*5], m0
1192 mova [ptrq+mmsize*6], m0
1193 mova [ptrq+mmsize*7], m0
1194 add ptrq, 8 * mmsize
1195 dec skipd
1196 jg .loop_z
1197 .end:
1199 DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak
1200 lea stride3q, [strideq*3]
1201 mov cntd, 4
1202 mov ptrq, rsp
1203 .loop_2:
1204 IDCT16_1D ptrq
1206 pxor m7, m7
1207 lea dstq, [dstq+strideq*4]
1208 ROUND_AND_STORE_4x4 0, 1, 2, 3, m7, [rsp+64*mmsize], [pd_32], 6
1209 lea dstq, [dstq+strideq*4]
1210 mova m0, [rsp+65*mmsize]
1211 mova m1, [rsp+64*mmsize]
1212 mova m2, [pd_32]
1213 ROUND_AND_STORE_4x4 4, 5, 6, 0, m7, m1, m2, 6
1215 %if ARCH_X86_64
1216 DEFINE_ARGS dstbak, stride, block, cnt, ptr, stride3, dst
1217 %else
1218 mov dstq, dstm
1219 %endif
1220 UNSCRATCH 0, 8, rsp+67*mmsize
1221 UNSCRATCH 4, 9, rsp+68*mmsize
1222 UNSCRATCH 5, 10, rsp+69*mmsize
1223 UNSCRATCH 3, 11, rsp+70*mmsize
1224 ROUND_AND_STORE_4x4 0, 4, 5, 3, m7, m1, m2, 6
1225 %if ARCH_X86_64
1226 DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak
1227 lea dstq, [dstbakq+stride3q*4]
1228 %else
1229 lea dstq, [dstq+stride3q*4]
1230 %endif
1231 UNSCRATCH 4, 12, rsp+71*mmsize
1232 UNSCRATCH 5, 13, rsp+72*mmsize
1233 UNSCRATCH 6, 14, rsp+73*mmsize
1234 UNSCRATCH 0, 15, rsp+74*mmsize
1235 ROUND_AND_STORE_4x4 4, 5, 6, 0, m7, m1, m2, 6
1237 add ptrq, mmsize
1238 %if ARCH_X86_64
1239 add dstbakq, 8
1240 mov dstq, dstbakq
1241 %else
1242 add dword dstm, 8
1243 mov dstq, dstm
1244 %endif
1245 dec cntd
1246 jg .loop_2
1248 ; m7 is still zero
1249 ZERO_BLOCK blockq-4*mmsize, 64, 16, m7
1250 RET
1252 INIT_XMM sse2
1253 cglobal vp9_idct_idct_16x16_add_12, 4, 6 + ARCH_X86_64, 16, \
1254 67 * mmsize + ARCH_X86_32 * 8 * mmsize, \
1255 dst, stride, block, eob
1256 mova m0, [pw_4095]
1257 cmp eobd, 1
1258 jg mangle(private_prefix %+ _ %+ vp9_idct_idct_16x16_add_10 %+ SUFFIX).idctfull
1260 ; dc-only - unfortunately, this one can overflow, since coefs are 19+sign
1261 ; bpp, and 19+14+sign does not fit in 32bit, so we do 2-stage multiplies
1262 DEFINE_ARGS dst, stride, block, coef, coefl
1263 pxor m2, m2
1264 DC_ONLY_64BIT 6, m2
1265 movd m1, coefd
1266 pshuflw m1, m1, q0000
1267 punpcklqdq m1, m1
1268 DEFINE_ARGS dst, stride, cnt
1269 mov cntd, 8
1270 .loop_dc:
1271 STORE_2x8 3, 4, 1, m2, m0, dstq, mmsize
1272 STORE_2x8 3, 4, 1, m2, m0, dstq+strideq, mmsize
1273 lea dstq, [dstq+strideq*2]
1274 dec cntd
1275 jg .loop_dc
1276 RET
1278 ; r65-69 are available for spills
1279 ; r70-77 are available on x86-32 only (x86-64 should use m8-15)
1280 ; output should be in m8-11|r70-73, m0-6,r65 and m12-15|r74-77
1281 %macro IADST16_1D 1 ; src
1282 mova m0, [%1+ 0*4*mmsize] ; in0
1283 mova m1, [%1+ 7*4*mmsize] ; in7
1284 mova m2, [%1+ 8*4*mmsize] ; in8
1285 mova m3, [%1+15*4*mmsize] ; in15
1286 SUMSUB_MUL_D 3, 0, 4, 5, 16364, 804 ; m3/4=t0, m0/5=t1
1287 SUMSUB_MUL_D 1, 2, 6, 7, 11003, 12140 ; m1/6=t8, m2/7=t9
1288 SCRATCH 0, 8, rsp+70*mmsize
1289 SUMSUB_PACK_D 1, 3, 6, 4, 0 ; m1=t0a, m3=t8a
1290 UNSCRATCH 0, 8, rsp+70*mmsize
1291 SUMSUB_PACK_D 2, 0, 7, 5, 4 ; m2=t1a, m0=t9a
1292 mova [rsp+67*mmsize], m1
1293 SCRATCH 2, 9, rsp+71*mmsize
1294 SCRATCH 3, 12, rsp+74*mmsize
1295 SCRATCH 0, 13, rsp+75*mmsize
1297 mova m0, [%1+ 3*4*mmsize] ; in3
1298 mova m1, [%1+ 4*4*mmsize] ; in4
1299 mova m2, [%1+11*4*mmsize] ; in11
1300 mova m3, [%1+12*4*mmsize] ; in12
1301 SUMSUB_MUL_D 2, 1, 4, 5, 14811, 7005 ; m2/4=t4, m1/5=t5
1302 SUMSUB_MUL_D 0, 3, 6, 7, 5520, 15426 ; m0/6=t12, m3/7=t13
1303 SCRATCH 1, 10, rsp+72*mmsize
1304 SUMSUB_PACK_D 0, 2, 6, 4, 1 ; m0=t4a, m2=t12a
1305 UNSCRATCH 1, 10, rsp+72*mmsize
1306 SUMSUB_PACK_D 3, 1, 7, 5, 4 ; m3=t5a, m1=t13a
1307 SCRATCH 0, 15, rsp+77*mmsize
1308 SCRATCH 3, 11, rsp+73*mmsize
1310 UNSCRATCH 0, 12, rsp+74*mmsize ; t8a
1311 UNSCRATCH 3, 13, rsp+75*mmsize ; t9a
1312 SUMSUB_MUL_D 0, 3, 4, 5, 16069, 3196 ; m0/4=t8, m3/5=t9
1313 SUMSUB_MUL_D 1, 2, 6, 7, 3196, 16069 ; m1/6=t13, m2/7=t12
1314 SCRATCH 1, 12, rsp+74*mmsize
1315 SUMSUB_PACK_D 2, 0, 7, 4, 1 ; m2=t8a, m0=t12a
1316 UNSCRATCH 1, 12, rsp+74*mmsize
1317 SUMSUB_PACK_D 1, 3, 6, 5, 4 ; m1=t9a, m3=t13a
1318 mova [rsp+65*mmsize], m2
1319 mova [rsp+66*mmsize], m1
1320 SCRATCH 0, 8, rsp+70*mmsize
1321 SCRATCH 3, 12, rsp+74*mmsize
1323 mova m0, [%1+ 2*4*mmsize] ; in2
1324 mova m1, [%1+ 5*4*mmsize] ; in5
1325 mova m2, [%1+10*4*mmsize] ; in10
1326 mova m3, [%1+13*4*mmsize] ; in13
1327 SUMSUB_MUL_D 3, 0, 4, 5, 15893, 3981 ; m3/4=t2, m0/5=t3
1328 SUMSUB_MUL_D 1, 2, 6, 7, 8423, 14053 ; m1/6=t10, m2/7=t11
1329 SCRATCH 0, 10, rsp+72*mmsize
1330 SUMSUB_PACK_D 1, 3, 6, 4, 0 ; m1=t2a, m3=t10a
1331 UNSCRATCH 0, 10, rsp+72*mmsize
1332 SUMSUB_PACK_D 2, 0, 7, 5, 4 ; m2=t3a, m0=t11a
1333 mova [rsp+68*mmsize], m1
1334 mova [rsp+69*mmsize], m2
1335 SCRATCH 3, 13, rsp+75*mmsize
1336 SCRATCH 0, 14, rsp+76*mmsize
1338 mova m0, [%1+ 1*4*mmsize] ; in1
1339 mova m1, [%1+ 6*4*mmsize] ; in6
1340 mova m2, [%1+ 9*4*mmsize] ; in9
1341 mova m3, [%1+14*4*mmsize] ; in14
1342 SUMSUB_MUL_D 2, 1, 4, 5, 13160, 9760 ; m2/4=t6, m1/5=t7
1343 SUMSUB_MUL_D 0, 3, 6, 7, 2404, 16207 ; m0/6=t14, m3/7=t15
1344 SCRATCH 1, 10, rsp+72*mmsize
1345 SUMSUB_PACK_D 0, 2, 6, 4, 1 ; m0=t6a, m2=t14a
1346 UNSCRATCH 1, 10, rsp+72*mmsize
1347 SUMSUB_PACK_D 3, 1, 7, 5, 4 ; m3=t7a, m1=t15a
1349 UNSCRATCH 4, 13, rsp+75*mmsize ; t10a
1350 UNSCRATCH 5, 14, rsp+76*mmsize ; t11a
1351 SCRATCH 0, 13, rsp+75*mmsize
1352 SCRATCH 3, 14, rsp+76*mmsize
1353 SUMSUB_MUL_D 4, 5, 6, 7, 9102, 13623 ; m4/6=t10, m5/7=t11
1354 SUMSUB_MUL_D 1, 2, 0, 3, 13623, 9102 ; m1/0=t15, m2/3=t14
1355 SCRATCH 0, 10, rsp+72*mmsize
1356 SUMSUB_PACK_D 2, 4, 3, 6, 0 ; m2=t10a, m4=t14a
1357 UNSCRATCH 0, 10, rsp+72*mmsize
1358 SUMSUB_PACK_D 1, 5, 0, 7, 6 ; m1=t11a, m5=t15a
1360 UNSCRATCH 0, 8, rsp+70*mmsize ; t12a
1361 UNSCRATCH 3, 12, rsp+74*mmsize ; t13a
1362 SCRATCH 2, 8, rsp+70*mmsize
1363 SCRATCH 1, 12, rsp+74*mmsize
1364 SUMSUB_MUL_D 0, 3, 1, 2, 15137, 6270 ; m0/1=t12, m3/2=t13
1365 SUMSUB_MUL_D 5, 4, 7, 6, 6270, 15137 ; m5/7=t15, m4/6=t14
1366 SCRATCH 2, 10, rsp+72*mmsize
1367 SUMSUB_PACK_D 4, 0, 6, 1, 2 ; m4=out2, m0=t14a
1368 UNSCRATCH 2, 10, rsp+72*mmsize
1369 SUMSUB_PACK_D 5, 3, 7, 2, 1 ; m5=-out13, m3=t15a
1370 NEGD m5 ; m5=out13
1372 UNSCRATCH 1, 9, rsp+71*mmsize ; t1a
1373 mova m2, [rsp+68*mmsize] ; t2a
1374 UNSCRATCH 6, 13, rsp+75*mmsize ; t6a
1375 UNSCRATCH 7, 14, rsp+76*mmsize ; t7a
1376 SCRATCH 4, 10, rsp+72*mmsize
1377 SCRATCH 5, 13, rsp+75*mmsize
1378 UNSCRATCH 4, 15, rsp+77*mmsize ; t4a
1379 UNSCRATCH 5, 11, rsp+73*mmsize ; t5a
1380 SCRATCH 0, 14, rsp+76*mmsize
1381 SCRATCH 3, 15, rsp+77*mmsize
1382 mova m0, [rsp+67*mmsize] ; t0a
1383 SUMSUB_BA d, 4, 0, 3 ; m4=t0, m0=t4
1384 SUMSUB_BA d, 5, 1, 3 ; m5=t1, m1=t5
1385 SUMSUB_BA d, 6, 2, 3 ; m6=t2, m2=t6
1386 SCRATCH 4, 9, rsp+71*mmsize
1387 mova m3, [rsp+69*mmsize] ; t3a
1388 SUMSUB_BA d, 7, 3, 4 ; m7=t3, m3=t7
1390 mova [rsp+67*mmsize], m5
1391 mova [rsp+68*mmsize], m6
1392 mova [rsp+69*mmsize], m7
1393 SUMSUB_MUL_D 0, 1, 4, 5, 15137, 6270 ; m0/4=t4a, m1/5=t5a
1394 SUMSUB_MUL_D 3, 2, 7, 6, 6270, 15137 ; m3/7=t7a, m2/6=t6a
1395 SCRATCH 1, 11, rsp+73*mmsize
1396 SUMSUB_PACK_D 2, 0, 6, 4, 1 ; m2=-out3, m0=t6
1397 NEGD m2 ; m2=out3
1398 UNSCRATCH 1, 11, rsp+73*mmsize
1399 SUMSUB_PACK_D 3, 1, 7, 5, 4 ; m3=out12, m1=t7
1400 SCRATCH 2, 11, rsp+73*mmsize
1401 UNSCRATCH 2, 12, rsp+74*mmsize ; t11a
1402 SCRATCH 3, 12, rsp+74*mmsize
1404 UNSCRATCH 3, 8, rsp+70*mmsize ; t10a
1405 mova m4, [rsp+65*mmsize] ; t8a
1406 mova m5, [rsp+66*mmsize] ; t9a
1407 SUMSUB_BA d, 3, 4, 6 ; m3=-out1, m4=t10
1408 NEGD m3 ; m3=out1
1409 SUMSUB_BA d, 2, 5, 6 ; m2=out14, m5=t11
1410 UNSCRATCH 6, 9, rsp+71*mmsize ; t0
1411 UNSCRATCH 7, 14, rsp+76*mmsize ; t14a
1412 SCRATCH 3, 9, rsp+71*mmsize
1413 SCRATCH 2, 14, rsp+76*mmsize
1415 SUMSUB_MUL 1, 0, 2, 3, 11585, 11585 ; m1=out4, m0=out11
1416 mova [rsp+65*mmsize], m0
1417 SUMSUB_MUL 5, 4, 2, 3, 11585, 11585 ; m5=out6, m4=out9
1418 UNSCRATCH 0, 15, rsp+77*mmsize ; t15a
1419 SUMSUB_MUL 7, 0, 2, 3, 11585, m11585 ; m7=out10, m0=out5
1421 mova m2, [rsp+68*mmsize] ; t2
1422 SUMSUB_BA d, 2, 6, 3 ; m2=out0, m6=t2a
1423 SCRATCH 2, 8, rsp+70*mmsize
1424 mova m2, [rsp+67*mmsize] ; t1
1425 mova m3, [rsp+69*mmsize] ; t3
1426 mova [rsp+67*mmsize], m7
1427 SUMSUB_BA d, 3, 2, 7 ; m3=-out15, m2=t3a
1428 NEGD m3 ; m3=out15
1429 SCRATCH 3, 15, rsp+77*mmsize
1430 SUMSUB_MUL 6, 2, 7, 3, 11585, m11585 ; m6=out8, m2=out7
1431 mova m7, [rsp+67*mmsize]
1433 SWAP 0, 1
1434 SWAP 2, 5, 4, 6, 7, 3
1435 %endmacro
1437 %macro IADST16_FN 7
1438 cglobal vp9_%1_%4_16x16_add_10, 4, 6 + ARCH_X86_64, 16, \
1439 70 * mmsize + ARCH_X86_32 * 8 * mmsize, \
1440 dst, stride, block, eob
1441 mova m0, [pw_1023]
1443 .body:
1444 mova [rsp+64*mmsize], m0
1445 DEFINE_ARGS dst, stride, block, cnt, ptr, skip, dstbak
1446 %if ARCH_X86_64
1447 mov dstbakq, dstq
1448 movsxd cntq, cntd
1449 %endif
1450 %if PIC
1451 lea ptrq, [%7_16x16]
1452 movzx cntd, byte [ptrq+cntq-1]
1453 %else
1454 movzx cntd, byte [%7_16x16+cntq-1]
1455 %endif
1456 mov skipd, 4
1457 sub skipd, cntd
1458 mov ptrq, rsp
1459 .loop_1:
1460 %2_1D blockq
1462 TRANSPOSE4x4D 0, 1, 2, 3, 7
1463 mova [ptrq+ 1*mmsize], m0
1464 mova [ptrq+ 5*mmsize], m1
1465 mova [ptrq+ 9*mmsize], m2
1466 mova [ptrq+13*mmsize], m3
1467 mova m7, [rsp+65*mmsize]
1468 TRANSPOSE4x4D 4, 5, 6, 7, 0
1469 mova [ptrq+ 2*mmsize], m4
1470 mova [ptrq+ 6*mmsize], m5
1471 mova [ptrq+10*mmsize], m6
1472 mova [ptrq+14*mmsize], m7
1473 UNSCRATCH 0, 8, rsp+(%3+0)*mmsize
1474 UNSCRATCH 1, 9, rsp+(%3+1)*mmsize
1475 UNSCRATCH 2, 10, rsp+(%3+2)*mmsize
1476 UNSCRATCH 3, 11, rsp+(%3+3)*mmsize
1477 TRANSPOSE4x4D 0, 1, 2, 3, 7
1478 mova [ptrq+ 0*mmsize], m0
1479 mova [ptrq+ 4*mmsize], m1
1480 mova [ptrq+ 8*mmsize], m2
1481 mova [ptrq+12*mmsize], m3
1482 UNSCRATCH 4, 12, rsp+(%3+4)*mmsize
1483 UNSCRATCH 5, 13, rsp+(%3+5)*mmsize
1484 UNSCRATCH 6, 14, rsp+(%3+6)*mmsize
1485 UNSCRATCH 7, 15, rsp+(%3+7)*mmsize
1486 TRANSPOSE4x4D 4, 5, 6, 7, 0
1487 mova [ptrq+ 3*mmsize], m4
1488 mova [ptrq+ 7*mmsize], m5
1489 mova [ptrq+11*mmsize], m6
1490 mova [ptrq+15*mmsize], m7
1491 add ptrq, 16 * mmsize
1492 add blockq, mmsize
1493 dec cntd
1494 jg .loop_1
1496 ; zero-pad the remainder (skipped cols)
1497 test skipd, skipd
1498 jz .end
1499 add skipd, skipd
1500 lea blockq, [blockq+skipq*(mmsize/2)]
1501 pxor m0, m0
1502 .loop_z:
1503 mova [ptrq+mmsize*0], m0
1504 mova [ptrq+mmsize*1], m0
1505 mova [ptrq+mmsize*2], m0
1506 mova [ptrq+mmsize*3], m0
1507 mova [ptrq+mmsize*4], m0
1508 mova [ptrq+mmsize*5], m0
1509 mova [ptrq+mmsize*6], m0
1510 mova [ptrq+mmsize*7], m0
1511 add ptrq, 8 * mmsize
1512 dec skipd
1513 jg .loop_z
1514 .end:
1516 DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak
1517 lea stride3q, [strideq*3]
1518 mov cntd, 4
1519 mov ptrq, rsp
1520 .loop_2:
1521 %5_1D ptrq
1523 pxor m7, m7
1524 lea dstq, [dstq+strideq*4]
1525 ROUND_AND_STORE_4x4 0, 1, 2, 3, m7, [rsp+64*mmsize], [pd_32], 6
1526 lea dstq, [dstq+strideq*4]
1527 mova m0, [rsp+65*mmsize]
1528 mova m1, [rsp+64*mmsize]
1529 mova m2, [pd_32]
1530 ROUND_AND_STORE_4x4 4, 5, 6, 0, m7, m1, m2, 6
1532 %if ARCH_X86_64
1533 DEFINE_ARGS dstbak, stride, block, cnt, ptr, stride3, dst
1534 %else
1535 mov dstq, dstm
1536 %endif
1537 UNSCRATCH 0, 8, rsp+(%6+0)*mmsize
1538 UNSCRATCH 4, 9, rsp+(%6+1)*mmsize
1539 UNSCRATCH 5, 10, rsp+(%6+2)*mmsize
1540 UNSCRATCH 3, 11, rsp+(%6+3)*mmsize
1541 ROUND_AND_STORE_4x4 0, 4, 5, 3, m7, m1, m2, 6
1542 %if ARCH_X86_64
1543 DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak
1544 lea dstq, [dstbakq+stride3q*4]
1545 %else
1546 lea dstq, [dstq+stride3q*4]
1547 %endif
1548 UNSCRATCH 4, 12, rsp+(%6+4)*mmsize
1549 UNSCRATCH 5, 13, rsp+(%6+5)*mmsize
1550 UNSCRATCH 6, 14, rsp+(%6+6)*mmsize
1551 UNSCRATCH 0, 15, rsp+(%6+7)*mmsize
1552 ROUND_AND_STORE_4x4 4, 5, 6, 0, m7, m1, m2, 6
1554 add ptrq, mmsize
1555 %if ARCH_X86_64
1556 add dstbakq, 8
1557 mov dstq, dstbakq
1558 %else
1559 add dword dstm, 8
1560 mov dstq, dstm
1561 %endif
1562 dec cntd
1563 jg .loop_2
1565 ; m7 is still zero
1566 ZERO_BLOCK blockq-4*mmsize, 64, 16, m7
1567 RET
1569 cglobal vp9_%1_%4_16x16_add_12, 4, 6 + ARCH_X86_64, 16, \
1570 70 * mmsize + ARCH_X86_32 * 8 * mmsize, \
1571 dst, stride, block, eob
1572 mova m0, [pw_4095]
1573 jmp mangle(private_prefix %+ _ %+ vp9_%1_%4_16x16_add_10 %+ SUFFIX).body
1574 %endmacro
1576 INIT_XMM sse2
1577 IADST16_FN idct, IDCT16, 67, iadst, IADST16, 70, row
1578 IADST16_FN iadst, IADST16, 70, idct, IDCT16, 67, col
1579 IADST16_FN iadst, IADST16, 70, iadst, IADST16, 70, default
1581 %macro IDCT32_1D 2-3 8 * mmsize; pass[1/2], src, src_stride
1582 IDCT16_1D %2, 2 * %3, 272, 257
1583 %if ARCH_X86_64
1584 mova [rsp+257*mmsize], m8
1585 mova [rsp+258*mmsize], m9
1586 mova [rsp+259*mmsize], m10
1587 mova [rsp+260*mmsize], m11
1588 mova [rsp+261*mmsize], m12
1589 mova [rsp+262*mmsize], m13
1590 mova [rsp+263*mmsize], m14
1591 mova [rsp+264*mmsize], m15
1592 %endif
1593 mova [rsp+265*mmsize], m0
1594 mova [rsp+266*mmsize], m1
1595 mova [rsp+267*mmsize], m2
1596 mova [rsp+268*mmsize], m3
1597 mova [rsp+269*mmsize], m4
1598 mova [rsp+270*mmsize], m5
1599 mova [rsp+271*mmsize], m6
1601 ; r257-260: t0-3
1602 ; r265-272: t4/5a/6a/7/8/9a/10/11a
1603 ; r261-264: t12a/13/14a/15
1604 ; r273-274 are free as scratch space, and 275-282 mirror m8-15 on 32bit
1606 mova m0, [%2+ 1*%3] ; in1
1607 mova m1, [%2+15*%3] ; in15
1608 mova m2, [%2+17*%3] ; in17
1609 mova m3, [%2+31*%3] ; in31
1610 SUMSUB_MUL 0, 3, 4, 5, 16364, 804 ; m0=t31a, m3=t16a
1611 SUMSUB_MUL 2, 1, 4, 5, 11003, 12140 ; m2=t30a, m1=t17a
1612 SUMSUB_BA d, 1, 3, 4 ; m1=t16, m3=t17
1613 SUMSUB_BA d, 2, 0, 4 ; m2=t31, m0=t30
1614 SUMSUB_MUL 0, 3, 4, 5, 16069, 3196 ; m0=t30a, m3=t17a
1615 SCRATCH 0, 8, rsp+275*mmsize
1616 SCRATCH 2, 9, rsp+276*mmsize
1618 ; end of stage 1-3 first quart
1620 mova m0, [%2+ 7*%3] ; in7
1621 mova m2, [%2+ 9*%3] ; in9
1622 mova m4, [%2+23*%3] ; in23
1623 mova m5, [%2+25*%3] ; in25
1624 SUMSUB_MUL 2, 4, 6, 7, 14811, 7005 ; m2=t29a, m4=t18a
1625 SUMSUB_MUL 5, 0, 6, 7, 5520, 15426 ; m5=t28a, m0=t19a
1626 SUMSUB_BA d, 4, 0, 6 ; m4=t19, m0=t18
1627 SUMSUB_BA d, 2, 5, 6 ; m2=t28, m5=t29
1628 SUMSUB_MUL 5, 0, 6, 7, 3196, m16069 ; m5=t29a, m0=t18a
1630 ; end of stage 1-3 second quart
1632 SUMSUB_BA d, 4, 1, 6 ; m4=t16a, m1=t19a
1633 SUMSUB_BA d, 0, 3, 6 ; m0=t17, m3=t18
1634 UNSCRATCH 6, 8, rsp+275*mmsize ; t30a
1635 UNSCRATCH 7, 9, rsp+276*mmsize ; t31
1636 mova [rsp+273*mmsize], m4
1637 mova [rsp+274*mmsize], m0
1638 SUMSUB_BA d, 2, 7, 0 ; m2=t31a, m7=t28a
1639 SUMSUB_BA d, 5, 6, 0 ; m5=t30, m6=t29
1640 SUMSUB_MUL 6, 3, 0, 4, 15137, 6270 ; m6=t29a, m3=t18a
1641 SUMSUB_MUL 7, 1, 0, 4, 15137, 6270 ; m7=t28, m1=t19
1642 SCRATCH 3, 10, rsp+277*mmsize
1643 SCRATCH 1, 11, rsp+278*mmsize
1644 SCRATCH 7, 12, rsp+279*mmsize
1645 SCRATCH 6, 13, rsp+280*mmsize
1646 SCRATCH 5, 14, rsp+281*mmsize
1647 SCRATCH 2, 15, rsp+282*mmsize
1649 ; end of stage 4-5 first half
1651 mova m0, [%2+ 5*%3] ; in5
1652 mova m1, [%2+11*%3] ; in11
1653 mova m2, [%2+21*%3] ; in21
1654 mova m3, [%2+27*%3] ; in27
1655 SUMSUB_MUL 0, 3, 4, 5, 15893, 3981 ; m0=t27a, m3=t20a
1656 SUMSUB_MUL 2, 1, 4, 5, 8423, 14053 ; m2=t26a, m1=t21a
1657 SUMSUB_BA d, 1, 3, 4 ; m1=t20, m3=t21
1658 SUMSUB_BA d, 2, 0, 4 ; m2=t27, m0=t26
1659 SUMSUB_MUL 0, 3, 4, 5, 9102, 13623 ; m0=t26a, m3=t21a
1660 SCRATCH 0, 8, rsp+275*mmsize
1661 SCRATCH 2, 9, rsp+276*mmsize
1663 ; end of stage 1-3 third quart
1665 mova m0, [%2+ 3*%3] ; in3
1666 mova m2, [%2+13*%3] ; in13
1667 mova m4, [%2+19*%3] ; in19
1668 mova m5, [%2+29*%3] ; in29
1669 SUMSUB_MUL 2, 4, 6, 7, 13160, 9760 ; m2=t25a, m4=t22a
1670 SUMSUB_MUL 5, 0, 6, 7, 2404, 16207 ; m5=t24a, m0=t23a
1671 SUMSUB_BA d, 4, 0, 6 ; m4=t23, m0=t22
1672 SUMSUB_BA d, 2, 5, 6 ; m2=t24, m5=t25
1673 SUMSUB_MUL 5, 0, 6, 7, 13623, m9102 ; m5=t25a, m0=t22a
1675 ; end of stage 1-3 fourth quart
1677 SUMSUB_BA d, 1, 4, 6 ; m1=t23a, m4=t20a
1678 SUMSUB_BA d, 3, 0, 6 ; m3=t22, m0=t21
1679 UNSCRATCH 6, 8, rsp+275*mmsize ; t26a
1680 UNSCRATCH 7, 9, rsp+276*mmsize ; t27
1681 SCRATCH 3, 8, rsp+275*mmsize
1682 SCRATCH 1, 9, rsp+276*mmsize
1683 SUMSUB_BA d, 7, 2, 1 ; m7=t24a, m2=t27a
1684 SUMSUB_BA d, 6, 5, 1 ; m6=t25, m5=t26
1685 SUMSUB_MUL 2, 4, 1, 3, 6270, m15137 ; m2=t27, m4=t20
1686 SUMSUB_MUL 5, 0, 1, 3, 6270, m15137 ; m5=t26a, m0=t21a
1688 ; end of stage 4-5 second half
1690 UNSCRATCH 1, 12, rsp+279*mmsize ; t28
1691 UNSCRATCH 3, 13, rsp+280*mmsize ; t29a
1692 SCRATCH 4, 12, rsp+279*mmsize
1693 SCRATCH 0, 13, rsp+280*mmsize
1694 SUMSUB_BA d, 5, 3, 0 ; m5=t29, m3=t26
1695 SUMSUB_BA d, 2, 1, 0 ; m2=t28a, m1=t27a
1696 UNSCRATCH 0, 14, rsp+281*mmsize ; t30
1697 UNSCRATCH 4, 15, rsp+282*mmsize ; t31a
1698 SCRATCH 2, 14, rsp+281*mmsize
1699 SCRATCH 5, 15, rsp+282*mmsize
1700 SUMSUB_BA d, 6, 0, 2 ; m6=t30a, m0=t25a
1701 SUMSUB_BA d, 7, 4, 2 ; m7=t31, m4=t24
1703 mova m2, [rsp+273*mmsize] ; t16a
1704 mova m5, [rsp+274*mmsize] ; t17
1705 mova [rsp+273*mmsize], m6
1706 mova [rsp+274*mmsize], m7
1707 UNSCRATCH 6, 10, rsp+277*mmsize ; t18a
1708 UNSCRATCH 7, 11, rsp+278*mmsize ; t19
1709 SCRATCH 4, 10, rsp+277*mmsize
1710 SCRATCH 0, 11, rsp+278*mmsize
1711 UNSCRATCH 4, 12, rsp+279*mmsize ; t20
1712 UNSCRATCH 0, 13, rsp+280*mmsize ; t21a
1713 SCRATCH 3, 12, rsp+279*mmsize
1714 SCRATCH 1, 13, rsp+280*mmsize
1715 SUMSUB_BA d, 0, 6, 1 ; m0=t18, m6=t21
1716 SUMSUB_BA d, 4, 7, 1 ; m4=t19a, m7=t20a
1717 UNSCRATCH 3, 8, rsp+275*mmsize ; t22
1718 UNSCRATCH 1, 9, rsp+276*mmsize ; t23a
1719 SCRATCH 0, 8, rsp+275*mmsize
1720 SCRATCH 4, 9, rsp+276*mmsize
1721 SUMSUB_BA d, 3, 5, 0 ; m3=t17a, m5=t22a
1722 SUMSUB_BA d, 1, 2, 0 ; m1=t16, m2=t23
1724 ; end of stage 6
1726 UNSCRATCH 0, 10, rsp+277*mmsize ; t24
1727 UNSCRATCH 4, 11, rsp+278*mmsize ; t25a
1728 SCRATCH 1, 10, rsp+277*mmsize
1729 SCRATCH 3, 11, rsp+278*mmsize
1730 SUMSUB_MUL 0, 2, 1, 3, 11585, 11585 ; m0=t24a, m2=t23a
1731 SUMSUB_MUL 4, 5, 1, 3, 11585, 11585 ; m4=t25, m5=t22
1732 UNSCRATCH 1, 12, rsp+279*mmsize ; t26
1733 UNSCRATCH 3, 13, rsp+280*mmsize ; t27a
1734 SCRATCH 0, 12, rsp+279*mmsize
1735 SCRATCH 4, 13, rsp+280*mmsize
1736 SUMSUB_MUL 3, 7, 0, 4, 11585, 11585 ; m3=t27, m7=t20
1737 SUMSUB_MUL 1, 6, 0, 4, 11585, 11585 ; m1=t26a, m6=t21a
1739 ; end of stage 7
1741 mova m0, [rsp+269*mmsize] ; t8
1742 mova m4, [rsp+270*mmsize] ; t9a
1743 mova [rsp+269*mmsize], m1 ; t26a
1744 mova [rsp+270*mmsize], m3 ; t27
1745 mova m3, [rsp+271*mmsize] ; t10
1746 SUMSUB_BA d, 2, 0, 1 ; m2=out8, m0=out23
1747 SUMSUB_BA d, 5, 4, 1 ; m5=out9, m4=out22
1748 SUMSUB_BA d, 6, 3, 1 ; m6=out10, m3=out21
1749 mova m1, [rsp+272*mmsize] ; t11a
1750 mova [rsp+271*mmsize], m0
1751 SUMSUB_BA d, 7, 1, 0 ; m7=out11, m1=out20
1753 %if %1 == 1
1754 TRANSPOSE4x4D 2, 5, 6, 7, 0
1755 mova [ptrq+ 2*mmsize], m2
1756 mova [ptrq+10*mmsize], m5
1757 mova [ptrq+18*mmsize], m6
1758 mova [ptrq+26*mmsize], m7
1759 %else ; %1 == 2
1760 pxor m0, m0
1761 lea dstq, [dstq+strideq*8]
1762 ROUND_AND_STORE_4x4 2, 5, 6, 7, m0, [rsp+256*mmsize], [pd_32], 6
1763 %endif
1764 mova m2, [rsp+271*mmsize]
1765 %if %1 == 1
1766 TRANSPOSE4x4D 1, 3, 4, 2, 0
1767 mova [ptrq+ 5*mmsize], m1
1768 mova [ptrq+13*mmsize], m3
1769 mova [ptrq+21*mmsize], m4
1770 mova [ptrq+29*mmsize], m2
1771 %else ; %1 == 2
1772 lea dstq, [dstq+stride3q*4]
1773 ROUND_AND_STORE_4x4 1, 3, 4, 2, m0, [rsp+256*mmsize], [pd_32], 6
1774 %endif
1776 ; end of last stage + store for out8-11 and out20-23
1778 UNSCRATCH 0, 9, rsp+276*mmsize ; t19a
1779 UNSCRATCH 1, 8, rsp+275*mmsize ; t18
1780 UNSCRATCH 2, 11, rsp+278*mmsize ; t17a
1781 UNSCRATCH 3, 10, rsp+277*mmsize ; t16
1782 mova m7, [rsp+261*mmsize] ; t12a
1783 mova m6, [rsp+262*mmsize] ; t13
1784 mova m5, [rsp+263*mmsize] ; t14a
1785 SUMSUB_BA d, 0, 7, 4 ; m0=out12, m7=out19
1786 SUMSUB_BA d, 1, 6, 4 ; m1=out13, m6=out18
1787 SUMSUB_BA d, 2, 5, 4 ; m2=out14, m5=out17
1788 mova m4, [rsp+264*mmsize] ; t15
1789 SCRATCH 7, 8, rsp+275*mmsize
1790 SUMSUB_BA d, 3, 4, 7 ; m3=out15, m4=out16
1792 %if %1 == 1
1793 TRANSPOSE4x4D 0, 1, 2, 3, 7
1794 mova [ptrq+ 3*mmsize], m0
1795 mova [ptrq+11*mmsize], m1
1796 mova [ptrq+19*mmsize], m2
1797 mova [ptrq+27*mmsize], m3
1798 %else ; %1 == 2
1799 %if ARCH_X86_64
1800 SWAP 7, 9
1801 lea dstq, [dstbakq+stride3q*4]
1802 %else ; x86-32
1803 pxor m7, m7
1804 mov dstq, dstm
1805 lea dstq, [dstq+stride3q*4]
1806 %endif
1807 ROUND_AND_STORE_4x4 0, 1, 2, 3, m7, [rsp+256*mmsize], [pd_32], 6
1808 %endif
1809 UNSCRATCH 0, 8, rsp+275*mmsize ; out19
1810 %if %1 == 1
1811 TRANSPOSE4x4D 4, 5, 6, 0, 7
1812 mova [ptrq+ 4*mmsize], m4
1813 mova [ptrq+12*mmsize], m5
1814 mova [ptrq+20*mmsize], m6
1815 mova [ptrq+28*mmsize], m0
1816 %else ; %1 == 2
1817 lea dstq, [dstq+strideq*4]
1818 ROUND_AND_STORE_4x4 4, 5, 6, 0, m7, [rsp+256*mmsize], [pd_32], 6
1819 %endif
1821 ; end of last stage + store for out12-19
1823 %if ARCH_X86_64
1824 SWAP 7, 8
1825 %endif
1826 mova m7, [rsp+257*mmsize] ; t0
1827 mova m6, [rsp+258*mmsize] ; t1
1828 mova m5, [rsp+259*mmsize] ; t2
1829 mova m4, [rsp+260*mmsize] ; t3
1830 mova m0, [rsp+274*mmsize] ; t31
1831 mova m1, [rsp+273*mmsize] ; t30a
1832 UNSCRATCH 2, 15, rsp+282*mmsize ; t29
1833 SUMSUB_BA d, 0, 7, 3 ; m0=out0, m7=out31
1834 SUMSUB_BA d, 1, 6, 3 ; m1=out1, m6=out30
1835 SUMSUB_BA d, 2, 5, 3 ; m2=out2, m5=out29
1836 SCRATCH 0, 9, rsp+276*mmsize
1837 UNSCRATCH 3, 14, rsp+281*mmsize ; t28a
1838 SUMSUB_BA d, 3, 4, 0 ; m3=out3, m4=out28
1840 %if %1 == 1
1841 TRANSPOSE4x4D 4, 5, 6, 7, 0
1842 mova [ptrq+ 7*mmsize], m4
1843 mova [ptrq+15*mmsize], m5
1844 mova [ptrq+23*mmsize], m6
1845 mova [ptrq+31*mmsize], m7
1846 %else ; %1 == 2
1847 %if ARCH_X86_64
1848 SWAP 0, 8
1849 %else ; x86-32
1850 pxor m0, m0
1851 %endif
1852 lea dstq, [dstq+stride3q*4]
1853 ROUND_AND_STORE_4x4 4, 5, 6, 7, m0, [rsp+256*mmsize], [pd_32], 6
1854 %endif
1855 UNSCRATCH 7, 9, rsp+276*mmsize ; out0
1856 %if %1 == 1
1857 TRANSPOSE4x4D 7, 1, 2, 3, 0
1858 mova [ptrq+ 0*mmsize], m7
1859 mova [ptrq+ 8*mmsize], m1
1860 mova [ptrq+16*mmsize], m2
1861 mova [ptrq+24*mmsize], m3
1862 %else ; %1 == 2
1863 %if ARCH_X86_64
1864 DEFINE_ARGS dstbak, stride, block, cnt, ptr, stride3, dst
1865 %else ; x86-32
1866 mov dstq, dstm
1867 %endif
1868 ROUND_AND_STORE_4x4 7, 1, 2, 3, m0, [rsp+256*mmsize], [pd_32], 6
1869 %if ARCH_X86_64
1870 DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak
1871 %endif
1872 %endif
1874 ; end of last stage + store for out0-3 and out28-31
1876 %if ARCH_X86_64
1877 SWAP 0, 8
1878 %endif
1879 mova m7, [rsp+265*mmsize] ; t4
1880 mova m6, [rsp+266*mmsize] ; t5a
1881 mova m5, [rsp+267*mmsize] ; t6a
1882 mova m4, [rsp+268*mmsize] ; t7
1883 mova m0, [rsp+270*mmsize] ; t27
1884 mova m1, [rsp+269*mmsize] ; t26a
1885 UNSCRATCH 2, 13, rsp+280*mmsize ; t25
1886 SUMSUB_BA d, 0, 7, 3 ; m0=out4, m7=out27
1887 SUMSUB_BA d, 1, 6, 3 ; m1=out5, m6=out26
1888 SUMSUB_BA d, 2, 5, 3 ; m2=out6, m5=out25
1889 UNSCRATCH 3, 12, rsp+279*mmsize ; t24a
1890 SCRATCH 7, 9, rsp+276*mmsize
1891 SUMSUB_BA d, 3, 4, 7 ; m3=out7, m4=out24
1893 %if %1 == 1
1894 TRANSPOSE4x4D 0, 1, 2, 3, 7
1895 mova [ptrq+ 1*mmsize], m0
1896 mova [ptrq+ 9*mmsize], m1
1897 mova [ptrq+17*mmsize], m2
1898 mova [ptrq+25*mmsize], m3
1899 %else ; %1 == 2
1900 %if ARCH_X86_64
1901 SWAP 7, 8
1902 lea dstq, [dstbakq+strideq*4]
1903 %else ; x86-32
1904 pxor m7, m7
1905 lea dstq, [dstq+strideq*4]
1906 %endif
1907 ROUND_AND_STORE_4x4 0, 1, 2, 3, m7, [rsp+256*mmsize], [pd_32], 6
1908 %endif
1909 UNSCRATCH 0, 9, rsp+276*mmsize ; out27
1910 %if %1 == 1
1911 TRANSPOSE4x4D 4, 5, 6, 0, 7
1912 mova [ptrq+ 6*mmsize], m4
1913 mova [ptrq+14*mmsize], m5
1914 mova [ptrq+22*mmsize], m6
1915 mova [ptrq+30*mmsize], m0
1916 %else ; %1 == 2
1917 %if ARCH_X86_64
1918 lea dstq, [dstbakq+stride3q*8]
1919 %else
1920 mov dstq, dstm
1921 lea dstq, [dstq+stride3q*8]
1922 %endif
1923 ROUND_AND_STORE_4x4 4, 5, 6, 0, m7, [rsp+256*mmsize], [pd_32], 6
1924 %endif
1926 ; end of last stage + store for out4-7 and out24-27
1927 %endmacro
1929 INIT_XMM sse2
1930 cglobal vp9_idct_idct_32x32_add_10, 4, 6 + ARCH_X86_64, 16, \
1931 275 * mmsize + ARCH_X86_32 * 8 * mmsize, \
1932 dst, stride, block, eob
1933 mova m0, [pw_1023]
1934 cmp eobd, 1
1935 jg .idctfull
1937 ; dc-only - the 10bit version can be done entirely in 32bit, since the max
1938 ; coef values are 17+sign bit, and the coef is 14bit, so 31+sign easily
1939 ; fits in 32bit
1940 DEFINE_ARGS dst, stride, block, coef
1941 pxor m2, m2
1942 DC_ONLY 6, m2
1943 movd m1, coefd
1944 pshuflw m1, m1, q0000
1945 punpcklqdq m1, m1
1946 DEFINE_ARGS dst, stride, cnt
1947 mov cntd, 32
1948 .loop_dc:
1949 STORE_2x8 3, 4, 1, m2, m0, dstq, mmsize
1950 STORE_2x8 3, 4, 1, m2, m0, dstq+mmsize*2, mmsize
1951 add dstq, strideq
1952 dec cntd
1953 jg .loop_dc
1954 RET
1956 .idctfull:
1957 mova [rsp+256*mmsize], m0
1958 DEFINE_ARGS dst, stride, block, cnt, ptr, skip, dstbak
1959 %if ARCH_X86_64
1960 mov dstbakq, dstq
1961 movsxd cntq, cntd
1962 %endif
1963 %if PIC
1964 lea ptrq, [default_32x32]
1965 movzx cntd, byte [ptrq+cntq-1]
1966 %else
1967 movzx cntd, byte [default_32x32+cntq-1]
1968 %endif
1969 mov skipd, 8
1970 sub skipd, cntd
1971 mov ptrq, rsp
1972 .loop_1:
1973 IDCT32_1D 1, blockq
1975 add ptrq, 32 * mmsize
1976 add blockq, mmsize
1977 dec cntd
1978 jg .loop_1
1980 ; zero-pad the remainder (skipped cols)
1981 test skipd, skipd
1982 jz .end
1983 shl skipd, 2
1984 lea blockq, [blockq+skipq*(mmsize/4)]
1985 pxor m0, m0
1986 .loop_z:
1987 mova [ptrq+mmsize*0], m0
1988 mova [ptrq+mmsize*1], m0
1989 mova [ptrq+mmsize*2], m0
1990 mova [ptrq+mmsize*3], m0
1991 mova [ptrq+mmsize*4], m0
1992 mova [ptrq+mmsize*5], m0
1993 mova [ptrq+mmsize*6], m0
1994 mova [ptrq+mmsize*7], m0
1995 add ptrq, 8 * mmsize
1996 dec skipd
1997 jg .loop_z
1998 .end:
2000 DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak
2001 lea stride3q, [strideq*3]
2002 mov cntd, 8
2003 mov ptrq, rsp
2004 .loop_2:
2005 IDCT32_1D 2, ptrq
2007 add ptrq, mmsize
2008 %if ARCH_X86_64
2009 add dstbakq, 8
2010 mov dstq, dstbakq
2011 %else
2012 add dword dstm, 8
2013 mov dstq, dstm
2014 %endif
2015 dec cntd
2016 jg .loop_2
2018 ; m7 is still zero
2019 ZERO_BLOCK blockq-8*mmsize, 128, 32, m7
2020 RET
2022 INIT_XMM sse2
2023 cglobal vp9_idct_idct_32x32_add_12, 4, 6 + ARCH_X86_64, 16, \
2024 275 * mmsize + ARCH_X86_32 * 8 * mmsize, \
2025 dst, stride, block, eob
2026 mova m0, [pw_4095]
2027 cmp eobd, 1
2028 jg mangle(private_prefix %+ _ %+ vp9_idct_idct_32x32_add_10 %+ SUFFIX).idctfull
2030 ; dc-only - unfortunately, this one can overflow, since coefs are 19+sign
2031 ; bpp, and 19+14+sign does not fit in 32bit, so we do 2-stage multiplies
2032 DEFINE_ARGS dst, stride, block, coef, coefl
2033 pxor m2, m2
2034 DC_ONLY_64BIT 6, m2
2035 movd m1, coefd
2036 pshuflw m1, m1, q0000
2037 punpcklqdq m1, m1
2038 DEFINE_ARGS dst, stride, cnt
2039 mov cntd, 32
2040 .loop_dc:
2041 STORE_2x8 3, 4, 1, m2, m0, dstq, mmsize
2042 STORE_2x8 3, 4, 1, m2, m0, dstq+mmsize*2, mmsize
2043 add dstq, strideq
2044 dec cntd
2045 jg .loop_dc