avformat/mxfdec: Check edit unit for overflow in mxf_set_current_edit_unit()
[FFMpeg-mirror.git] / libavfilter / x86 / vf_bwdif.asm
blobc93b41ec4886d9de71536fbf21a9d575c4b74aa4
1 ;*****************************************************************************
2 ;* x86-optimized functions for bwdif filter
3 ;*
4 ;* Copyright (C) 2016 Thomas Mundt <loudmax@yahoo.de>
5 ;*
6 ;* Based on yadif simd code
7 ;* Copyright (C) 2006 Michael Niedermayer <michaelni@gmx.at>
8 ;* 2013 Daniel Kang <daniel.d.kang@gmail.com>
9 ;*
10 ;* This file is part of FFmpeg.
12 ;* FFmpeg is free software; you can redistribute it and/or
13 ;* modify it under the terms of the GNU Lesser General Public
14 ;* License as published by the Free Software Foundation; either
15 ;* version 2.1 of the License, or (at your option) any later version.
17 ;* FFmpeg is distributed in the hope that it will be useful,
18 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
19 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 ;* Lesser General Public License for more details.
22 ;* You should have received a copy of the GNU Lesser General Public
23 ;* License along with FFmpeg; if not, write to the Free Software
24 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
25 ;******************************************************************************
27 %include "libavutil/x86/x86util.asm"
; 16-bit constant tables read by the FILTER macro (32-byte aligned section).
SECTION_RODATA 32

; Word pairs fed to pmaddwd against the interleaved (4-row-offset sum, %2+%3 sum).
pw_coefhf:  times 8  dw  1016,  5570
; Single multiplier; FILTER builds a 32-bit product from pmullw + pmulhw.
pw_coefhf1: times 16 dw -3801
; Base word pairs for the last pmaddwd in FILTER.
pw_coefsp:  times 8  dw  5077,  -981
; Added on top of pw_coefsp only in lanes where FILTER's compare mask is set.
pw_splfdif: times 8  dw  -768,   768

SECTION .text
; LOAD8 dst, src
;   Load packed 8-bit samples from src and zero-extend them to 16-bit words
;   in dst.  The non-YMM path interleaves with m7, so m7 must be all zeroes
;   when this macro is expanded.
%macro LOAD8 2
%if mmsize != 32
    movh            %1, %2
    punpcklbw       %1, m7          ; bytes -> words, high halves come from m7
%else
    pmovzxbw        %1, %2          ; YMM build: widen in one instruction
%endif
%endmacro
; LOAD12 dst, src
;   Samples are used as stored: one unaligned full-register load, no widening.
%macro LOAD12 2
    movu            %1, %2
%endmacro
; DISP8
;   Pack the 16-bit results in m2 down to unsigned bytes (with saturation)
;   and store them at [dstq].  The YMM path overwrites xm1.
%macro DISP8 0
%if mmsize != 32
    packuswb        m2, m2
    movh            [dstq], m2
%else
    vextracti128    xm1, m2, 1      ; xm1 = upper 128 bits of m2
    packuswb        xm2, xm1        ; low half | high half -> 16 bytes
    movu            [dstq], xm2
%endif
%endmacro
; DISP12
;   Clamp the 16-bit results in m2 to the range [m7, m12] and store the whole
;   register at [dstq].  FILTER zeroes m7 right before expanding this macro;
;   m12 is filled from clip_max by the 12-bit entry point.
%macro DISP12 0
    CLIPW           m2, m7, m12
    movu            [dstq], m2
%endmacro
; FILTER suffix, ptr_a, ptr_b, depth, scale
;   %1  suffix appended to the loop label so the macro can be expanded twice
;       inside one function
;   %2  first of the two sample pointers that are summed (prevq or curq)
;   %3  second of the two sample pointers (curq or nextq)
;   %4  8 or 12: selects the LOAD%4 / DISP%4 helper pair
;   %5  multiplier applied to every row offset
;
;   t0/t1 hold prefs/mrefs.  On x86-64 t2/t3 also hold prefs3/mrefs3; on
;   x86-32 t0/t1 are temporarily reloaded with prefs3/mrefs3 instead.
;   m8..m11 are spill slots (registers on x86-64, stack memory on x86-32).
;   m7 is kept at zero except for a short stretch near the end of the loop.
;   Each iteration produces mmsize/2 samples; the loop runs while w > 0.
%macro FILTER 5
    pxor            m7, m7
.loop%1:
    ; ---- rows directly below / above in the current picture ---------------
    LOAD%4          m0, [curq+t0*%5]            ; m0 = cur[prefs]
    LOAD%4          m1, [curq+t1*%5]            ; m1 = cur[mrefs]

    ; ---- sum and absolute difference of the two %2/%3 samples --------------
    LOAD%4          m2, [%2]
    LOAD%4          m3, [%3]
    mova            m4, m3
    paddw           m3, m2                      ; m3 = [%2] + [%3]
    psubw           m2, m4
    ABS1            m2, m4                      ; m2 = |[%2] - [%3]|
    mova            m8, m3                      ; m8 = sum (reused three times)
    mova            m9, m2                      ; m9 = unshifted abs difference

    ; ---- difference against prevq ------------------------------------------
    LOAD%4          m3, [prevq+t0*%5]
    LOAD%4          m4, [prevq+t1*%5]
    psubw           m3, m0
    psubw           m4, m1
    ABS2            m3, m4, m5, m6
    paddw           m3, m4
    psrlw           m3, 1
    psrlw           m2, 1
    pmaxsw          m2, m3

    ; ---- difference against nextq ------------------------------------------
    LOAD%4          m3, [nextq+t0*%5]
    LOAD%4          m4, [nextq+t1*%5]
    psubw           m3, m0
    psubw           m4, m1
    ABS2            m3, m4, m5, m6
    paddw           m3, m4
    psrlw           m3, 1
    pmaxsw          m2, m3                      ; m2 = largest of the three halves

    ; ---- rows at twice the offset: widen the allowed range -----------------
    LOAD%4          m3, [%2+t0*2*%5]
    LOAD%4          m4, [%3+t0*2*%5]
    paddw           m3, m4                      ; m3 = sum at +2*prefs
    LOAD%4          m5, [%2+t1*2*%5]
    LOAD%4          m6, [%3+t1*2*%5]
    paddw           m5, m6                      ; m5 = sum at +2*mrefs
    mova            m6, m3
    paddw           m6, m5
    mova            m10, m6                     ; m10 = both sums together
    psrlw           m3, 1
    psubw           m3, m0
    psrlw           m5, 1
    psubw           m5, m1
    mova            m6, m3
    pminsw          m3, m5                      ; m3 = min of the two
    pmaxsw          m5, m6                      ; m5 = max of the two
    mova            m4, m8
    psraw           m4, 1                       ; m4 = m8 >> 1
    mova            m6, m4
    psubw           m6, m0                      ; m6 = (m8 >> 1) - cur[prefs]
    psubw           m4, m1                      ; m4 = (m8 >> 1) - cur[mrefs]
    pmaxsw          m3, m6
    pmaxsw          m3, m4                      ; m3 = running maximum
    pminsw          m5, m6
    pminsw          m5, m4                      ; m5 = running minimum
    mova            m6, m7
    psubw           m6, m3                      ; m6 = -maximum (m7 is zero)
    pmaxsw          m6, m5
    mova            m3, m2
    pcmpgtw         m3, m7                      ; lanes where m2 > 0
    pand            m6, m3
    pmaxsw          m2, m6
    mova            m11, m2                     ; m11 = final clamp radius

    ; ---- rows at four times the offset: 32-bit weighted term ---------------
    LOAD%4          m2, [%2+t0*4*%5]
    LOAD%4          m3, [%3+t0*4*%5]
    paddw           m2, m3
    LOAD%4          m4, [%2+t1*4*%5]
    LOAD%4          m5, [%3+t1*4*%5]
    paddw           m4, m5
    paddw           m2, m4
    mova            m3, m2
    punpcklwd       m2, m8                      ; interleave with the m8 sum
    punpckhwd       m3, m8
    pmaddwd         m2, [pw_coefhf]
    pmaddwd         m3, [pw_coefhf]
    mova            m4, m10
    mova            m6, m4
    pmullw          m4, [pw_coefhf1]            ; low 16 bits of the product
    pmulhw          m6, [pw_coefhf1]            ; high 16 bits of the product
    mova            m5, m4
    punpcklwd       m4, m6                      ; rebuild full 32-bit products
    punpckhwd       m5, m6
    paddd           m2, m4
    psrad           m2, 2
    paddd           m3, m5
    psrad           m3, 2

    ; ---- spatial term from the current picture -----------------------------
    mova            m4, m0                      ; keep cur[prefs] for the compare
    paddw           m0, m1                      ; m0 = cur[prefs] + cur[mrefs]
%if ARCH_X86_64
    LOAD%4          m5, [curq+t2*%5]            ; cur[prefs3]
    LOAD%4          m6, [curq+t3*%5]            ; cur[mrefs3]
%else
    ; too few registers: borrow t0/t1 for prefs3/mrefs3, then restore them
    mov             r4, prefs3mp
    mov             r5, mrefs3mp
    LOAD%4          m5, [curq+t0*%5]
    LOAD%4          m6, [curq+t1*%5]
    mov             r4, prefsmp
    mov             r5, mrefsmp
%endif
    paddw           m6, m5                      ; m6 = cur[prefs3] + cur[mrefs3]
    psubw           m1, m4
    ABS1            m1, m4                      ; m1 = |cur[mrefs] - cur[prefs]|
    pcmpgtw         m1, m9                      ; word mask: m1 > m9
    mova            m4, m1
    punpcklwd       m1, m4                      ; widen the mask to dwords
    punpckhwd       m4, m4
    pand            m2, m1                      ; 32-bit term kept only where set
    pand            m3, m4
    mova            m5, [pw_splfdif]
    mova            m7, m5                      ; m7 is no longer zero from here
    pand            m5, m1
    pand            m7, m4
    paddw           m5, [pw_coefsp]             ; per-lane coefficient pairs
    paddw           m7, [pw_coefsp]
    mova            m4, m0
    punpcklwd       m0, m6
    punpckhwd       m4, m6
    pmaddwd         m0, m5
    pmaddwd         m4, m7
    paddd           m2, m0
    psrad           m2, 13
    paddd           m3, m4
    psrad           m3, 13
    packssdw        m2, m3                      ; back to 16-bit lanes

    ; ---- clamp to (m8 >> 1) +/- m11 and store ------------------------------
    mova            m4, m8
    psraw           m4, 1
    mova            m0, m11
    mova            m3, m4
    psubw           m4, m0                      ; lower bound
    paddw           m3, m0                      ; upper bound
    CLIPW           m2, m4, m3
    pxor            m7, m7                      ; restore the zero register
    DISP%4

    ; ---- advance all pointers, count down the remaining width --------------
    add             prevq, STEP
    add             curq,  STEP
    add             nextq, STEP
    add             dstq,  STEP
    sub             DWORD wm, mmsize/2
    jg .loop%1
%endmacro
; PROC depth, scale
;   Body shared by every bwdif_filter_line* entry point; it must be expanded
;   directly after the matching cglobal.
;   %1  8 or 12, forwarded to FILTER to select the LOAD/DISP helpers
;   %2  row-offset multiplier, forwarded to FILTER
;
;   Sets up the row-offset temporaries, then runs one of two FILTER
;   expansions depending on the parity argument:
;     parity != 0 -> FILTER over (prevq, curq)
;     parity == 0 -> FILTER over (curq, nextq)
%macro PROC 2
%if ARCH_X86_64
    ; sign-extend the 32-bit offsets so they can be used in 64-bit addressing
    movsxd          r5, DWORD prefsm
    movsxd          r6, DWORD mrefsm
    movsxd          r7, DWORD prefs3m
    movsxd          r8, DWORD mrefs3m
    DECLARE_REG_TMP 5, 6, 7, 8
%else
    ; only eight vector registers: m8..m11 live in the stack area that the
    ; entry point reserved through cglobal
    %define m8  [rsp+ 0]
    %define m9  [rsp+16]
    %define m10 [rsp+32]
    %define m11 [rsp+48]
    mov             r4, prefsmp
    mov             r5, mrefsmp
    DECLARE_REG_TMP 4, 5
%endif
    cmp             DWORD paritym, 0
    je .parity0
    FILTER 1, prevq, curq, %1, %2
    jmp .ret
.parity0:
    FILTER 0, curq, nextq, %1, %2
.ret:
    ; Function epilogue.  Without it the function never returns: execution
    ; would run past the end of the body, and the stack space and registers
    ; claimed by cglobal would not be released.
    RET
%endmacro
; BWDIF
;   Emits the two line-filter entry points for the instruction set selected
;   by the preceding INIT_* macro:
;     bwdif_filter_line        PROC 8, 1   with STEP = mmsize/2
;     bwdif_filter_line_12bit  PROC 12, 2  with STEP = mmsize
;   x86-64 builds use 9 GPRs and 12/13 vector registers with no stack area.
;   x86-32 builds use 6 GPRs and 8 vector registers and reserve 64/80 bytes
;   of stack for the values that do not fit in registers.
%macro BWDIF 0

; ---- 8-bit variant ----------------------------------------------------------
%if ARCH_X86_64
cglobal bwdif_filter_line, 4, 9, 12, 0, dst, prev, cur, next, w, prefs, \
                                        mrefs, prefs2, mrefs2, prefs3, mrefs3, \
                                        prefs4, mrefs4, parity, clip_max
%else
cglobal bwdif_filter_line, 4, 6, 8, 64, dst, prev, cur, next, w, prefs, \
                                        mrefs, prefs2, mrefs2, prefs3, mrefs3, \
                                        prefs4, mrefs4, parity, clip_max
%endif
    %define STEP mmsize/2
    PROC 8, 1

; ---- 12-bit variant ---------------------------------------------------------
; m12 must hold clip_max replicated into every 16-bit lane before PROC runs;
; DISP12 uses it as the upper clamp bound.
%if ARCH_X86_64
cglobal bwdif_filter_line_12bit, 4, 9, 13, 0, dst, prev, cur, next, w, \
                                              prefs, mrefs, prefs2, mrefs2, \
                                              prefs3, mrefs3, prefs4, \
                                              mrefs4, parity, clip_max
    %if mmsize != 32
        movd            m12, DWORD clip_maxm
        SPLATW          m12, m12, 0
    %else
        vpbroadcastw    m12, WORD clip_maxm
    %endif
%else
cglobal bwdif_filter_line_12bit, 4, 6, 8, 80, dst, prev, cur, next, w, \
                                              prefs, mrefs, prefs2, mrefs2, \
                                              prefs3, mrefs3, prefs4, \
                                              mrefs4, parity, clip_max
    ; m12 is a stack slot here, so build the value in m0 and spill it
    %define m12 [rsp+64]
    movd            m0, DWORD clip_maxm
    SPLATW          m0, m0, 0
    mova            m12, m0
%endif
    %define STEP mmsize
    PROC 12, 2
%endmacro
; Expand both entry points once per supported instruction set.
INIT_XMM ssse3
BWDIF

INIT_XMM sse2
BWDIF

; The YMM build is x86-64 only: the x86-32 path in PROC spills m8..m11 into
; stack slots spaced 16 bytes apart.
%if ARCH_X86_64 && HAVE_AVX2_EXTERNAL
INIT_YMM avx2
BWDIF
%endif