; File: libavfilter/x86/vf_convolution.asm (FFmpeg mirror)
; blob 9ac9ef5d73ef16e9bf6f0a7219c39182e95a0bca
;*****************************************************************************
;* x86-optimized functions for convolution filter
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"

SECTION_RODATA
half:    dd 0.5          ; added before truncation: round-to-nearest
data_p1: dd 1            ; Sobel tap weights for the scalar tail loop
data_n1: dd -1
data_p2: dd 2
data_n2: dd -2

ALIGN 64
; Byte shuffle for vpermb: gathers one byte from each of the four 16-byte
; lanes into consecutive positions, so every dword of the result holds the
; four u8 taps consumed by one vpdpbusd dot product.
sobel_perm: db 0, 16, 32, 48, 1, 17, 33, 49, 2, 18, 34, 50, 3, 19, 35, 51
            db 4, 20, 36, 52, 5, 21, 37, 53, 6, 22, 38, 54, 7, 23, 39, 55
            db 8, 24, 40, 56, 9, 25, 41, 57, 10, 26, 42, 58, 11, 27, 43, 59
            db 12, 28, 44, 60, 13, 29, 45, 61, 14, 30, 46, 62, 15, 31, 47, 63
; signed per-byte multipliers for the two Sobel gradients (vpdpbusd operand B)
sobel_mulA: db -1, 1, -2, 2
sobel_mulB: db 1, -1, 2, -2

SECTION .text
; void filter_3x3_sse4(uint8_t *dst, int width,
;                      float rdiv, float bias, const int *const matrix,
;                      const uint8_t *c[], int peak, int radius,
;                      int dstride, int stride)

; PROCESS_V %1: vector step of the 3x3 convolution.
; Accumulates matrix[%1] * c[%1][x .. x+3] into the four dword sums in m4.
; Requires: m6 == 0 (used to zero-extend u8 -> u32). Clobbers m2, m3.
%macro PROCESS_V 1
    movss m2, [matrixq + 4 * %1]
    VBROADCASTSS m2, m2          ; splat coefficient into every lane
    movss m3, [c%1q + xq]        ; load 4 source pixels (u8)
    punpcklbw m3, m6             ; u8  -> u16
    punpcklwd m3, m6             ; u16 -> u32
    pmulld m2, m3
    paddd m4, m2                 ; sum += coeff * pixel
%endmacro
; PROCESS_S %1: scalar step of the 3x3 convolution (tail pixels).
; rd += matrix[%1] * c[%1][x]. Clobbers ptrd.
%macro PROCESS_S 1
    movzx ptrd, byte [c%1q + xq]
    imul ptrd, [matrixq + 4 * %1]
    add rd, ptrd
%endmacro
; FILTER_3X3: emits filter_3x3_<cpu>(), a generic 3x3 convolution:
;   dst[x] = (int)(sum(matrix[i] * c[i][x]) * rdiv + bias + 0.5), clipped to u8.
; On UNIX64 rdiv/bias arrive in xmm0/xmm1 so they are not named in cglobal;
; on WIN64 they arrive in xmm2/xmm3 and matrix/ptr come from the stack.
%macro FILTER_3X3 0
%if UNIX64
cglobal filter_3x3, 4, 15, 7, dst, width, matrix, ptr, c0, c1, c2, c3, c4, c5, c6, c7, c8, r, x
%else
cglobal filter_3x3, 4, 15, 7, dst, width, rdiv, bias, matrix, ptr, c0, c1, c2, c3, c4, c5, c6, c7, c8, r, x
%endif

%if WIN64
    SWAP m0, m2                  ; move rdiv/bias into m0/m1, matching UNIX64
    SWAP m1, m3
    mov r2q, matrixmp            ; 5th/6th args live on the stack on Win64
    mov r3q, ptrmp
    DEFINE_ARGS dst, width, matrix, ptr, c0, c1, c2, c3, c4, c5, c6, c7, c8, r, x
%endif
    movsxdifnidn widthq, widthd
    VBROADCASTSS m0, m0          ; rdiv in all lanes
    VBROADCASTSS m1, m1          ; bias in all lanes
    pxor m6, m6                  ; zero, for u8->u32 unpacking in PROCESS_V
    movss m5, [half]
    VBROADCASTSS m5, m5          ; 0.5 in all lanes (round before truncation)
    ; load the nine row pointers c[0..8]
    mov c0q, [ptrq + 0*gprsize]
    mov c1q, [ptrq + 1*gprsize]
    mov c2q, [ptrq + 2*gprsize]
    mov c3q, [ptrq + 3*gprsize]
    mov c4q, [ptrq + 4*gprsize]
    mov c5q, [ptrq + 5*gprsize]
    mov c6q, [ptrq + 6*gprsize]
    mov c7q, [ptrq + 7*gprsize]
    mov c8q, [ptrq + 8*gprsize]

    xor xq, xq
    cmp widthq, mmsize/4         ; too narrow for even one vector iteration?
    jl .loop2

    mov rq, widthq               ; r = width % (pixels per iteration)
    and rq, mmsize/4-1
    sub widthq, rq               ; vector loop handles width - r pixels

.loop1:
    pxor m4, m4 ; sum = 0;

    PROCESS_V 0
    PROCESS_V 1
    PROCESS_V 2
    PROCESS_V 3
    PROCESS_V 4
    PROCESS_V 5
    PROCESS_V 6
    PROCESS_V 7
    PROCESS_V 8

    cvtdq2ps m4, m4
    mulps m4, m0 ; sum *= rdiv
    addps m4, m1 ; sum += bias
    addps m4, m5 ; sum += 0.5
    cvttps2dq m4, m4
    packssdw m4, m4              ; i32 -> i16 (signed saturation)
    packuswb m4, m4              ; i16 -> u8  (unsigned saturation)
    movss [dstq + xq], m4

    add xq, mmsize/4
    cmp xq, widthq
    jl .loop1

    add widthq, rq               ; restore full width for the tail loop
    cmp xq, widthq
    jge .end

.loop2:
    ; reuse r to hold sum, init with zero
    xor rd, rd

    PROCESS_S 0
    PROCESS_S 1
    PROCESS_S 2
    PROCESS_S 3
    PROCESS_S 4
    PROCESS_S 5
    PROCESS_S 6
    PROCESS_S 7
    PROCESS_S 8

    pxor m4, m4                  ; break false dependency before cvtsi2ss
    cvtsi2ss m4, rd
    mulss m4, m0 ; sum *= rdiv
    addss m4, m1 ; sum += bias
    addss m4, m5 ; sum += 0.5
    ; we don't have simple scalar instructions to convert
    ; from 32bit to 8bit with saturation, so here
    ; just use packed version SSE instructions for simplicity.
    cvttps2dq m4, m4 ; trunc to integer
    packssdw m4, m4
    packuswb m4, m4
    movd rd, m4
    mov [dstq + xq], rb

    add xq, 1
    cmp xq, widthq
    jl .loop2
.end:
    RET
%endmacro
; Instantiate the SSE4.1 version (x86-64 only: the macro needs 15 GPRs).
%if ARCH_X86_64
INIT_XMM sse4
FILTER_3X3
%endif
; SOBEL_MUL %1, %2: rd += c[%1][x] * [%2] (weight loaded from .rodata).
; Clobbers ptrd.
%macro SOBEL_MUL 2
    movzx ptrd, byte [c%1q + xq]
    imul ptrd, [%2]
    add rd, ptrd
%endmacro
; SOBEL_ADD %1: rd += c[%1][x] (tap weight +1, so no multiply needed).
; Clobbers ptrd.
%macro SOBEL_ADD 1
    movzx ptrd, byte [c%1q + xq]
    add rd, ptrd
%endmacro
; void filter_sobel_avx512(uint8_t *dst, int width,
;                          float scale, float delta, const int *const matrix,
;                          const uint8_t *c[], int peak, int radius,
;                          int dstride, int stride)

; FILTER_SOBEL: emits filter_sobel_<cpu>():
;   dst[x] = (int)(sqrt(Gx^2 + Gy^2) * scale + delta), clipped to u8,
; where Gx/Gy are the horizontal/vertical Sobel gradients over c[0..8].
%macro FILTER_SOBEL 0
%if UNIX64
cglobal filter_sobel, 4, 15, 7, dst, width, matrix, ptr, c0, c1, c2, c3, c4, c5, c6, c7, c8, r, x
%else
cglobal filter_sobel, 4, 15, 7, dst, width, rdiv, bias, matrix, ptr, c0, c1, c2, c3, c4, c5, c6, c7, c8, r, x
%endif
%if WIN64
    VBROADCASTSS m0, xmm2        ; scale (3rd arg: xmm2 on Win64)
    VBROADCASTSS m1, xmm3        ; delta
    mov r2q, matrixmp            ; 5th/6th args live on the stack on Win64
    mov r3q, ptrmp
    DEFINE_ARGS dst, width, matrix, ptr, c0, c1, c2, c3, c4, c5, c6, c7, c8, r, x
%else
    VBROADCASTSS m0, xmm0        ; scale
    VBROADCASTSS m1, xmm1        ; delta
%endif
    movsxdifnidn widthq, widthd
    pxor m6, m6
    ; load the nine row pointers c[0..8]
    mov c0q, [ptrq + 0*gprsize]
    mov c1q, [ptrq + 1*gprsize]
    mov c2q, [ptrq + 2*gprsize]
    mov c3q, [ptrq + 3*gprsize]
    mov c4q, [ptrq + 4*gprsize]
    mov c5q, [ptrq + 5*gprsize]
    mov c6q, [ptrq + 6*gprsize]
    mov c7q, [ptrq + 7*gprsize]
    mov c8q, [ptrq + 8*gprsize]

    xor xq, xq
    cmp widthq, mmsize/4         ; too narrow for one vector iteration?
    jl .loop2

    mov rq, widthq               ; r = width % (pixels per iteration)
    and rq, mmsize/4-1
    sub widthq, rq

    mova m6, [sobel_perm]
.loop1:
    ; Pack the ±1/±2-weighted rows into m2/m3 so a single vpdpbusd per
    ; register computes most of each gradient; the c8-c0 difference (the
    ; corner term shared by Gx and Gy) is handled separately in m4/m5.
    movu xm3, [c2q + xq]
    pmovzxbd m5, [c0q + xq]
    vinserti32x4 ym3, [c6q + xq], 1
    pmovzxbd m4, [c8q + xq]
    vinserti32x4 m2, m3, [c1q + xq], 2
    vinserti32x4 m3, [c5q + xq], 2
    vinserti32x4 m2, [c7q + xq], 3
    vinserti32x4 m3, [c3q + xq], 3
    vpermb m2, m6, m2            ; regroup: 4 taps per dword lane
    psubd m4, m5                 ; c8 - c0
    vpermb m3, m6, m3
    mova m5, m4
    vpdpbusd m4, m2, [sobel_mulA] {1to16} ; accumulate Gx dot products
    vpdpbusd m5, m3, [sobel_mulB] {1to16} ; accumulate Gy dot products

    cvtdq2ps m4, m4
    mulps m4, m4                 ; Gx^2

    cvtdq2ps m5, m5
    VFMADD231PS m4, m5, m5       ; Gx^2 + Gy^2

    sqrtps m4, m4
    fmaddps m4, m4, m0, m1       ; sqrt(..) * scale + delta
    cvttps2dq m4, m4
    vpmovusdb [dstq + xq], m4    ; i32 -> u8 with unsigned saturation

    add xq, mmsize/4
    cmp xq, widthq
    jl .loop1

    add widthq, rq               ; restore full width for the tail loop
    cmp xq, widthq
    jge .end

.loop2:
    xor rd, rd
    pxor m4, m4                  ; break false dependency before cvtsi2ss

    ; Gx
    SOBEL_MUL 0, data_n1
    SOBEL_MUL 1, data_n2
    SOBEL_MUL 2, data_n1
    SOBEL_ADD 6
    SOBEL_MUL 7, data_p2
    SOBEL_ADD 8

    cvtsi2ss xmm4, rd
    mulss xmm4, xmm4             ; Gx^2

    ; Gy
    xor rd, rd

    SOBEL_MUL 0, data_n1
    SOBEL_ADD 2
    SOBEL_MUL 3, data_n2
    SOBEL_MUL 5, data_p2
    SOBEL_MUL 6, data_n1
    SOBEL_ADD 8

    cvtsi2ss xmm5, rd
    fmaddss xmm4, xmm5, xmm5, xmm4 ; Gx^2 + Gy^2

    sqrtps xmm4, xmm4
    fmaddss xmm4, xmm4, xm0, xm1 ;sum = sum * rdiv + bias
    cvttps2dq xmm4, xmm4 ; trunc to integer
    packssdw xmm4, xmm4
    packuswb xmm4, xmm4
    movd rd, xmm4
    mov [dstq + xq], rb

    add xq, 1
    cmp xq, widthq
    jl .loop2
.end:
    RET
%endmacro
; Instantiate the AVX-512ICL version (needs VNNI vpdpbusd + VBMI vpermb).
%if ARCH_X86_64
%if HAVE_AVX512ICL_EXTERNAL
INIT_ZMM avx512icl
FILTER_SOBEL
%endif
%endif