1 ;*****************************************************************************
2 ;* x86-optimized functions for bwdif filter
4 ;* Copyright (C) 2016 Thomas Mundt <loudmax@yahoo.de>
6 ;* Based on yadif simd code
7 ;* Copyright (C) 2006 Michael Niedermayer <michaelni@gmx.at>
8 ;* 2013 Daniel Kang <daniel.d.kang@gmail.com>
10 ;* This file is part of FFmpeg.
12 ;* FFmpeg is free software; you can redistribute it and/or
13 ;* modify it under the terms of the GNU Lesser General Public
14 ;* License as published by the Free Software Foundation; either
15 ;* version 2.1 of the License, or (at your option) any later version.
17 ;* FFmpeg is distributed in the hope that it will be useful,
18 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
19 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 ;* Lesser General Public License for more details.
22 ;* You should have received a copy of the GNU Lesser General Public
23 ;* License along with FFmpeg; if not, write to the Free Software
24 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
25 ;******************************************************************************
27 %include "libavutil/x86/x86util.asm"
31 pw_coefhf: times
8 dw 1016, 5570
32 pw_coefhf1: times
16 dw -3801
33 pw_coefsp: times
8 dw 5077, -981
34 pw_splfdif: times
8 dw -768, 768
53 vextracti128 xm1
, m2
, 1
70 LOAD
%4 m0
, [curq
+t0
*%5]
71 LOAD
%4 m1
, [curq
+t1
*%5]
80 LOAD
%4 m3
, [prevq
+t0
*%5]
81 LOAD
%4 m4
, [prevq
+t1
*%5]
89 LOAD
%4 m3
, [nextq
+t0
*%5]
90 LOAD
%4 m4
, [nextq
+t1
*%5]
98 LOAD
%4 m3
, [%2+t0
*2*%5]
99 LOAD
%4 m4
, [%3+t0
*2*%5]
100 LOAD
%4 m5
, [%2+t1
*2*%5]
101 LOAD
%4 m6
, [%3+t1
*2*%5]
132 LOAD
%4 m2
, [%2+t0
*4*%5]
133 LOAD
%4 m3
, [%3+t0
*4*%5]
134 LOAD
%4 m4
, [%2+t1
*4*%5]
135 LOAD
%4 m5
, [%3+t1
*4*%5]
142 pmaddwd m2
, [pw_coefhf
]
143 pmaddwd m3
, [pw_coefhf
]
146 pmullw m4
, [pw_coefhf1
]
147 pmulhw m6
, [pw_coefhf1
]
159 LOAD
%4 m5
, [curq
+t2
*%5]
160 LOAD
%4 m6
, [curq
+t3
*%5]
164 LOAD
%4 m5
, [curq
+t0
*%5]
165 LOAD
%4 m6
, [curq
+t1
*%5]
178 mova m5
, [pw_splfdif
]
182 paddw m5
, [pw_coefsp
]
183 paddw m7
, [pw_coefsp
]
209 sub DWORD wm
, mmsize
/2
215 movsxd r5
, DWORD prefsm
216 movsxd r6
, DWORD mrefsm
217 movsxd r7
, DWORD prefs3m
218 movsxd r8
, DWORD mrefs3m
219 DECLARE_REG_TMP
5, 6, 7, 8
231 FILTER
1, prevq
, curq
, %1, %2
234 FILTER
0, curq
, nextq
, %1, %2
241 cglobal bwdif_filter_line
, 4, 9, 12, 0, dst
, prev
, cur
, next
, w
, prefs
, \
242 mrefs
, prefs2
, mrefs2
, prefs3
, mrefs3
, \
243 prefs4
, mrefs4
, parity
, clip_max
245 cglobal bwdif_filter_line
, 4, 6, 8, 64, dst
, prev
, cur
, next
, w
, prefs
, \
246 mrefs
, prefs2
, mrefs2
, prefs3
, mrefs3
, \
247 prefs4
, mrefs4
, parity
, clip_max
249 %define STEP mmsize
/2
253 cglobal bwdif_filter_line_12bit
, 4, 9, 13, 0, dst
, prev
, cur
, next
, w
, \
254 prefs
, mrefs
, prefs2
, mrefs2
, \
255 prefs3
, mrefs3
, prefs4
, \
256 mrefs4
, parity
, clip_max
258 vpbroadcastw m12
, WORD clip_maxm
260 movd m12
, DWORD clip_maxm
264 cglobal bwdif_filter_line_12bit
, 4, 6, 8, 80, dst
, prev
, cur
, next
, w
, \
265 prefs
, mrefs
, prefs2
, mrefs2
, \
266 prefs3
, mrefs3
, prefs4
, \
267 mrefs4
, parity
, clip_max
269 movd m0
, DWORD clip_maxm
282 %if HAVE_AVX2_EXTERNAL
&& ARCH_X86_64