;******************************************************************************
;* VP9 loop filter SIMD optimizations
;*
;* Copyright (C) 2015 Ronald S. Bultje <rsbultje@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
pw_511:   times 16 dw   511
pw_2047:  times 16 dw  2047
pw_16384: times 16 dw 16384
pw_m512:  times 16 dw  -512
pw_m2048: times 16 dw -2048
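
; Informational note (based on how these constants are used further down):
; pw_511/pw_m512 and pw_2047/pw_m2048 are the signed clipping bounds for the
; filter4 deltas at 10 and 12 bits per pixel respectively (roughly
; clip_intp2(x, bpp - 1)), and pw_16384 is the pmulhrsw factor used to form
; the rounded half (f1 + 1) >> 1 when adjusting p1/q1.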
; calculate p or q portion of flat8out
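; (roughly: for the wide filter, a half is only considered flat if abs(q4-q0),
; abs(q5-q0), abs(q6-q0) and abs(q7-q0) are all <= F, with F = 1 << (bpp - 8);
; the macro below computes the negation of that condition, hence "!flat8out")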
%macro FLAT8OUT_HALF 0
    ABS2                 m4, m5, m2, m3     ; abs(q4-q0) | abs(q5-q0)
    ABS2                 m6, m7, m2, m3     ; abs(q6-q0) | abs(q7-q0)
    pcmpgtw              m4, reg_F          ; abs(q4-q0) > F
    pcmpgtw              m5, reg_F          ; abs(q5-q0) > F
    pcmpgtw              m6, reg_F          ; abs(q6-q0) > F
    pcmpgtw              m7, reg_F          ; abs(q7-q0) > F
    por                  m7, m5             ; !flat8out, q portion
; calculate p or q portion of flat8in/hev/fm (excluding mb_edge condition)
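; (roughly: fm needs abs(q3-q2), abs(q2-q1) and abs(q1-q0) <= I on this half,
; flat8in needs abs(q3-q0), abs(q2-q0) and abs(q1-q0) <= F, and hev is
; abs(q1-q0) > H; the comparisons below produce the per-sample masks that are
; later combined into the filter4/filter8/filter16 masks)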
    psubw                m4, m3, m0         ; q3-q0
    psubw                m5, m2, m0         ; q2-q0
    ABS2                 m4, m5, m6, m7     ; abs(q3-q0) | abs(q2-q0)
    pcmpgtw              m4, reg_F          ; abs(q3-q0) > F
    pcmpgtw              m5, reg_F          ; abs(q2-q0) > F
    ABS2                 m3, m2, m6, m7     ; abs(q3-q2) | abs(q2-q1)
    pcmpgtw              m3, reg_I          ; abs(q3-q2) > I
    pcmpgtw              m2, reg_I          ; abs(q2-q1) > I
    psubw                m3, m1, m0         ; q1-q0
    ABS1                 m3, m5             ; abs(q1-q0)
    pcmpgtw              m6, m3, reg_F      ; abs(q1-q0) > F
    pcmpgtw              m7, m3, reg_H      ; abs(q1-q0) > H
    pcmpgtw              m3, reg_I          ; abs(q1-q0) > I
; one step in filter_14/filter_6
;
; take sum $reg, downshift, apply mask and write into dst
;
; if sub2/add1-2 are present, add/sub as appropriate to prepare for the next
; step's sum $reg. This is omitted for the last row in each filter.
;
; if dont_store is set, don't write the result into memory, instead keep the
; values in register so we can write it out later
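; (in other words, each step is roughly
;     dst = src + (((sum >> shift) - src) & mask)
; i.e. the downshifted running sum only replaces src where the filter mask is
; set, and the running sum is then updated by -sub1 -sub2 +add1 +add2 so the
; next step can reuse it)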
%macro FILTER_STEP 6-10 "", "", "", 0 ; tmp, reg, mask, shift, dst, \
                                      ; src/sub1, sub2, add1, add2, dont_store
    psubw                %1, %6             ; abs->delta
    pand                 %1, reg_%3         ; apply mask
    paddw                %6, %1             ; delta->abs
    paddw                %1, %6             ; delta->abs
; FIXME avx2 versions for 16_16 and mix2_{4,8}{4,8}

%macro LOOP_FILTER 3 ; dir[h/v], wd[4/8/16], bpp[10/12]
%assign %%num_xmm_regs 16
%assign %%num_xmm_regs 15
%assign %%num_xmm_regs 14
%assign %%num_xmm_regs 8
%endif ; ARCH_X86_64/32
%assign %%num_gpr_regs 6
%assign %%num_gpr_regs 5
%assign %%num_gpr_regs 5
%if ARCH_X86_32 && %2 == 8
%else ; ARCH_X86_64 || %2 == 4
%endif ; ARCH_X86_64/32 etc.
%elif %2 == 16 ; && %1 == h
%else ; %1 == h && %2 == 8/4
%assign %%off %%wd_mem
%assign %%tspoff %%bak_mem+%%wd_mem
%assign %%stack_mem ((%%bak_mem+%%wd_mem+%%tsp_mem)*mmsize)
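; (stack layout, as far as it can be read from the defines below: the first
; %%off slots are where mask/threshold registers get SCRATCH'ed when there are
; not enough XMM registers, and the %%tspoff area is the row buffer that the
; transposed pixels are written into for the h direction)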
%define %%minsgn m512
%define %%maxusgn 1023
%define %%maxsgn 2047
%define %%minsgn m2048
%define %%maxusgn 4095
cglobal vp9_loop_filter_%1_%2_%3, 5, %%num_gpr_regs, %%num_xmm_regs, %%stack_mem, dst, stride, E, I, H
; prepare E, I and H masks
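; (E, I and H are the per-block loop-filter thresholds from the VP9 spec; the
; shuffles below replicate them into words and, as the comments note, scale
; them by 1 << (bit_depth - 8) so they can be compared directly against
; %3-bit samples)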
    pshufb               m1, m0             ; E << (bit_depth - 8)
    pshufb               m2, m0             ; I << (bit_depth - 8)
    pshufb               m3, m0             ; H << (bit_depth - 8)
    SCRATCH               1,  8, rsp+(%%off+0)*mmsize, E
    SCRATCH               2,  9, rsp+(%%off+1)*mmsize, I
    SCRATCH               3, 10, rsp+(%%off+2)*mmsize, H
    PRELOAD              11, pw_ %+ %%maxf, F
; set up variables to load data
    DEFINE_ARGS dst8, stride, stride3, dst0, dst4, dst12
    lea            stride3q, [strideq*3]
    lea               dst0q, [dst8q+strideq*8]
    lea               dst4q, [dst8q+strideq*4]
    lea              dst12q, [dst8q+strideq*4]
    lea               dst4q, [dst0q+strideq*4]
%define %%p6 dst0q+strideq
%define %%p5 dst0q+strideq*2
%define %%p4 dst0q+stride3q
%define %%p2 dst4q+strideq
%define %%p1 dst4q+strideq*2
%define %%p0 dst4q+stride3q
%define %%q1 dst8q+strideq
%define %%q2 dst8q+strideq*2
%define %%q3 dst8q+stride3q
%define %%q5 dst12q+strideq
%define %%q6 dst12q+strideq*2
%define %%q7 dst12q+stride3q
    DEFINE_ARGS dst0, stride, stride3, dst4
    lea            stride3q, [strideq*3]
    lea               dst4q, [dst0q+strideq*4]
%define %%p3 rsp+(%%tspoff+0)*mmsize
%define %%p2 rsp+(%%tspoff+1)*mmsize
%define %%p1 rsp+(%%tspoff+2)*mmsize
%define %%p0 rsp+(%%tspoff+3)*mmsize
%define %%q0 rsp+(%%tspoff+4)*mmsize
%define %%q1 rsp+(%%tspoff+5)*mmsize
%define %%q2 rsp+(%%tspoff+6)*mmsize
%define %%q3 rsp+(%%tspoff+7)*mmsize
    movu                 m0, [dst0q+strideq*0-8]
    movu                 m1, [dst0q+strideq*1-8]
    movu                 m2, [dst0q+strideq*2-8]
    movu                 m3, [dst0q+stride3q-8]
    movu                 m4, [dst4q+strideq*0-8]
    movu                 m5, [dst4q+strideq*1-8]
    movu                 m6, [dst4q+strideq*2-8]
    movu                 m7, [dst4q+stride3q-8]
    TRANSPOSE8x8W         0, 1, 2, 3, 4, 5, 6, 7, 12
    TRANSPOSE8x8W         0, 1, 2, 3, 4, 5, 6, 7, [%%p0], [%%q0]
; FIXME investigate if we can _not_ load q0-3 below if h, and adjust register
; order here accordingly
%define %%p7 rsp+(%%tspoff+ 8)*mmsize
%define %%p6 rsp+(%%tspoff+ 9)*mmsize
%define %%p5 rsp+(%%tspoff+10)*mmsize
%define %%p4 rsp+(%%tspoff+11)*mmsize
%define %%q4 rsp+(%%tspoff+12)*mmsize
%define %%q5 rsp+(%%tspoff+13)*mmsize
%define %%q6 rsp+(%%tspoff+14)*mmsize
%define %%q7 rsp+(%%tspoff+15)*mmsize
    mova                 m0, [dst0q+strideq*0-16]
    mova                 m1, [dst0q+strideq*1-16]
    mova                 m2, [dst0q+strideq*2-16]
    mova                 m3, [dst0q+stride3q-16]
    mova                 m4, [dst4q+strideq*0-16]
    mova                 m5, [dst4q+strideq*1-16]
    mova                 m6, [dst4q+strideq*2-16]
    mova                 m7, [dst4q+stride3q-16]
    TRANSPOSE8x8W         0, 1, 2, 3, 4, 5, 6, 7, 12
    TRANSPOSE8x8W         0, 1, 2, 3, 4, 5, 6, 7, [dst4q+strideq*2-16], [%%p3], 1
    mova                 m0, [dst0q+strideq*0]
    mova                 m1, [dst0q+strideq*1]
    mova                 m2, [dst0q+strideq*2]
    mova                 m3, [dst0q+stride3q]
    mova                 m4, [dst4q+strideq*0]
    mova                 m5, [dst4q+strideq*1]
    mova                 m6, [dst4q+strideq*2]
    mova                 m7, [dst4q+stride3q]
    TRANSPOSE8x8W         0, 1, 2, 3, 4, 5, 6, 7, 12
    TRANSPOSE8x8W         0, 1, 2, 3, 4, 5, 6, 7, [dst4q+strideq*2], [%%q4], 1
; FIXME investigate if we can _not_ load q0|q4-7 below if h, and adjust register
; order here accordingly
    SCRATCH               7, 15, rsp+(%%off+6)*mmsize, F8O

; r6-8|pw_4[m8-11]=reg_E/I/H/F
; r9[m15]=!flat8out[q]

; flat8in|fm|hev q portion
    SCRATCH               7, 13, rsp+(%%off+4)*mmsize, HEV
    SCRATCH               4, 14, rsp+(%%off+5)*mmsize, F8I

; r6-8|pw_4[m8-11]=reg_E/I/H/F
; r9[m15]=!flat8out[q]
; r11[m14]=!flat8in[q]
    psubw                m5, m3, m0         ; q0-p0
    psubw                m6, m4, m1         ; q1-p1
    ABS2                 m5, m6, m7, m12    ; abs(q0-p0) | abs(q1-p1)
    ABS1                 m5, m7             ; abs(q0-p0)
    ABS1                 m6, m7             ; abs(q1-p1)
    paddw                m6, m5             ; abs(q0-p0)*2+(abs(q1-p1)>>1)
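; (this is the edge part of the VP9 fm condition: filtering is only enabled
; where abs(p0-q0)*2 + abs(p1-q1)/2 <= E, so the value accumulated in m6 above
; is the left-hand side of that comparison against E)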
    SCRATCH               2, 12, rsp+(%%off+3)*mmsize, FM

; r6-8|pw_4[m8-11]=reg_E/I/H/F
; r9[m15]=!flat8out[q]
; r11[m14]=!flat8in[q]

    SCRATCH               7, 15, rsp+(%%off+6)*mmsize, F8O

; r6-8|pw_4[m8-11]=reg_E/I/H/F
; r11[m14]=!flat8in[q]
; flat8in|fm|hev p portion
    por                  m4, m2             ; !flat8|!fm
    por                  m5, m4, reg_F8O    ; !flat16|!fm
    pandn                m2, m4             ; filter4_mask
    pandn                m4, m5             ; filter8_mask
    pxor                 m5, [pw_m1]        ; filter16_mask
    SCRATCH               5, 15, rsp+(%%off+6)*mmsize, F16M
    pandn                m2, m4             ; filter4_mask
    pxor                 m4, [pw_m1]        ; filter8_mask
    SCRATCH               4, 14, rsp+(%%off+5)*mmsize, F8M
    pxor                 m2, [pw_m1]        ; filter4_mask
    SCRATCH               7, 13, rsp+(%%off+4)*mmsize, HEV
    SCRATCH               2, 12, rsp+(%%off+3)*mmsize, F4M

; r9[m15]=filter16_mask
; r11[m14]=filter8_mask
; r12[m12]=filter4_mask
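; (the wide filter below uses the usual VP9 running-sum scheme: the first
; output is roughly (p7*7 + p6*2 + p5+p4+p3+p2+p1+p0+q0 + 8) >> 4, and each
; subsequent output reuses the sum, subtracting the sample that leaves the
; window and adding the one that enters it, which is what the sub/add
; arguments of FILTER_STEP express)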
    paddw                m4, reg_Q0         ; q0+p1+p3+p5+p7*8
    psubw                m5, m2             ; p0+p2+p4+p6*2-p7
    paddw                m5, m4             ; q0+p0+p1+p2+p3+p4+p5+p6*2+p7*7+8

; below, we use r0-5 for storing pre-filter pixels for subsequent subtraction
; at the end of the filter
    mova  [rsp+0*mmsize], m3
    FILTER_STEP          m4, m5, F16M, 4, %%p6, m3, m2, m6, reg_Q1
    mova  [rsp+1*mmsize], m6
    FILTER_STEP          m4, m5, F16M, 4, %%p5, m6, m2, m7, m3
    mova  [rsp+2*mmsize], m7
    FILTER_STEP          m4, m5, F16M, 4, %%p4, m7, m2, reg_P3, m6
    mova  [rsp+3*mmsize], reg_P3
    mova  [rsp+3*mmsize], m4
    FILTER_STEP          m4, m5, F16M, 4, %%p3, reg_P3, m2, reg_P2, m7
    mova  [rsp+4*mmsize], reg_P2
    mova  [rsp+4*mmsize], m4
    FILTER_STEP          m4, m5, F16M, 4, %%p2, reg_P2, m2, m1, reg_Q5
    mova  [rsp+5*mmsize], m1
    FILTER_STEP          m4, m5, F16M, 4, %%p1, m1, m2, m0, reg_Q6
    FILTER_STEP          m4, m5, F16M, 4, %%p0, m0, m2, reg_Q0, m1, 1
    FILTER_STEP          m4, m5, F16M, 4, %%q0, reg_Q0, [rsp+0*mmsize], reg_Q1, m1, ARCH_X86_64
    FILTER_STEP          m4, m5, F16M, 4, %%q1, reg_Q1, [rsp+1*mmsize], m3, m1, ARCH_X86_64
    FILTER_STEP          m4, m5, F16M, 4, %%q2, m3, [rsp+2*mmsize], m6, m1, 1
    FILTER_STEP          m4, m5, F16M, 4, %%q3, m6, [rsp+3*mmsize], m7, m1
    FILTER_STEP          m4, m5, F16M, 4, %%q4, m7, [rsp+4*mmsize], reg_Q5, m1
    FILTER_STEP          m4, m5, F16M, 4, %%q5, reg_Q5, [rsp+5*mmsize], reg_Q6, m1
    FILTER_STEP          m4, m5, F16M, 4, %%q6, reg_Q6
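; (the flat8 path below works the same way but with a 3-bit downshift, i.e.
; outputs of the form sum >> 3 under the F8M mask, presumably with the usual
; +4 rounding already folded into the running sum)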
    mova  [rsp+0*mmsize], m1
    mova  [rsp+1*mmsize], m7
    FILTER_STEP          m4, m5, F8M, 3, %%p2, m1, m2, m7, reg_Q1
    FILTER_STEP          m4, m5, F8M, 3, %%p2, m1, m2, m7, reg_Q1, 1
    FILTER_STEP          m4, m5, F8M, 3, %%p1, m7, m2, m0, m3, 1
    FILTER_STEP          m4, m5, F8M, 3, %%p0, m0, m2, reg_Q0, m6, 1
    FILTER_STEP          m4, m5, F8M, 3, %%q0, reg_Q0, m8, reg_Q1, m6, ARCH_X86_64
    FILTER_STEP          m4, m5, F8M, 3, %%q1, reg_Q1, m9, m3, m6, ARCH_X86_64
    FILTER_STEP          m4, m5, F8M, 3, %%q0, reg_Q0, [rsp+0*mmsize], reg_Q1, m6, ARCH_X86_64
    FILTER_STEP          m4, m5, F8M, 3, %%q1, reg_Q1, [rsp+1*mmsize], m3, m6, ARCH_X86_64
    FILTER_STEP          m4, m5, F8M, 3, %%q2, m3
    UNSCRATCH             2, 10, %%q0
    UNSCRATCH             6, 11, %%q1
    UNSCRATCH             3, 13, rsp+(%%off+4)*mmsize, HEV
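; (what follows is the basic filter4: f = clip(p1 - q1), kept only where hev
; is set, then f = clip(f + 3*(q0 - p0)); f1 = clip(f + 4) >> 3 and
; f2 = clip(f + 3) >> 3 adjust q0 and p0, and where hev is NOT set, p1/q1 are
; additionally adjusted by (f1 + 1) >> 1; all clips are to the signed range
; selected by %%maxsgn/%%minsgn for the current bit depth)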
    psubw                m4, m7, m6         ; p1-q1
    psubw                m5, m2, m0         ; q0-p0
    pminsw               m4, [pw_ %+ %%maxsgn]
    pmaxsw               m4, [pw_ %+ %%minsgn] ; clip_intp2(p1-q1, 9) -> f
    paddw                m4, m5             ; 3*(q0-p0)+f
    pminsw               m4, [pw_ %+ %%maxsgn]
    pmaxsw               m4, [pw_ %+ %%minsgn] ; clip_intp2(3*(q0-p0)+f, 9) -> f
    pminsw               m5, [pw_ %+ %%maxsgn]
    pminsw               m4, [pw_ %+ %%maxsgn]
    psraw                m5, 3              ; min_intp2(f+4, 9)>>3 -> f1
    psraw                m4, 3              ; min_intp2(f+3, 9)>>3 -> f2
    pandn                m3, m5             ; f1 & !hev (for p1/q1 adj)
    mova                 m5, [pw_ %+ %%maxusgn]
    pmulhrsw             m3, [pw_16384]     ; (f1+1)>>1
    TRANSPOSE4x4W         7, 0, 2, 6, 1
    movh   [dst0q+strideq*0-4], m7
    movhps [dst0q+strideq*1-4], m7
    movh   [dst0q+strideq*2-4], m0
    movhps [dst0q+stride3q-4], m0
    movh   [dst4q+strideq*0-4], m2
    movhps [dst4q+strideq*1-4], m2
    movh   [dst4q+strideq*2-4], m6
    movhps [dst4q+stride3q-4], m6
    TRANSPOSE8x8W         3, 1, 7, 0, 2, 6, 4, 5, 8
    TRANSPOSE8x8W         3, 1, 7, 0, 2, 6, 4, 5, [%%q2], [%%q0], 1
    movu   [dst0q+strideq*0-8], m3
    movu   [dst0q+strideq*1-8], m1
    movu   [dst0q+strideq*2-8], m7
    movu   [dst0q+stride3q-8], m0
    movu   [dst4q+strideq*0-8], m2
    movu   [dst4q+strideq*1-8], m6
    movu   [dst4q+strideq*2-8], m4
    movu   [dst4q+stride3q-8], m5
    TRANSPOSE8x8W         2, 3, 4, 5, 6, 1, 7, 0, 10
    TRANSPOSE8x8W         2, 3, 4, 5, 6, 1, 7, 0, [%%p1], [dst4q+strideq*0-16], 1
    mova  [dst0q+strideq*0-16], m2
    mova  [dst0q+strideq*1-16], m3
    mova  [dst0q+strideq*2-16], m4
    mova  [dst0q+stride3q-16], m5
    mova  [dst4q+strideq*0-16], m6
    mova  [dst4q+strideq*1-16], m1
    mova  [dst4q+strideq*2-16], m7
    mova  [dst4q+stride3q-16], m0
    TRANSPOSE8x8W         2, 6, 0, 1, 3, 4, 5, 7, 8
    TRANSPOSE8x8W         2, 6, 0, 1, 3, 4, 5, 7, [%%q6], [dst4q+strideq*0], 1
    mova     [dst0q+strideq*0], m2
    mova     [dst0q+strideq*1], m6
    mova     [dst0q+strideq*2], m0
    mova     [dst0q+stride3q], m1
    mova     [dst4q+strideq*0], m3
    mova     [dst4q+strideq*1], m4
    mova     [dst4q+strideq*2], m5
    mova     [dst4q+stride3q], m7
%macro LOOP_FILTER_CPUSETS 3
    LOOP_FILTER %1, %2, %3
    LOOP_FILTER %1, %2, %3
    LOOP_FILTER %1, %2, %3

%macro LOOP_FILTER_WDSETS 2
    LOOP_FILTER_CPUSETS %1,  4, %2
    LOOP_FILTER_CPUSETS %1,  8, %2
    LOOP_FILTER_CPUSETS %1, 16, %2
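; (each LOOP_FILTER_CPUSETS expansion emits one version of the filter per
; supported CPU feature set, and the LOOP_FILTER_WDSETS calls below
; instantiate both edge directions at 10 and 12 bits per pixel across all
; three filter widths)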
LOOP_FILTER_WDSETS h, 10
LOOP_FILTER_WDSETS v, 10
LOOP_FILTER_WDSETS h, 12
LOOP_FILTER_WDSETS v, 12