libavfilter/x86/vf_v360.asm
;*****************************************************************************
;* x86-optimized functions for v360 filter
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"

%if HAVE_AVX2_EXTERNAL

SECTION_RODATA
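
; Shuffle masks and clamp constants used by the remap loops below: pb_mask
; packs the low byte of each gathered dword into the first four bytes of a
; 128-bit lane, pw_mask does the same for the low word, and pd_255/pd_65535
; mask a gathered dword down to a single 8-bit or 16-bit sample.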
pb_mask:  db 0,4,8,12,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
pw_mask:  db 0,1,4, 5, 8, 9,12,13,-1,-1,-1,-1,-1,-1,-1,-1
pd_255:   times 4 dd 255
pd_65535: times 4 dd 65535

SECTION .text
; void ff_remap1_8bit_line_avx2(uint8_t *dst, int width, const uint8_t *src, ptrdiff_t in_linesize,
;                               const int16_t *u, const int16_t *v, const int16_t *ker);
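;
; Nearest-neighbour remap: for each of the 8 output pixels handled per
; iteration, the signed 16-bit coordinates select one source byte at
; src[v * in_linesize + u], fetched with a dword gather and packed back to
; bytes via pb_mask.  ker is accepted for uniformity with the multi-tap
; variants below but is not read here (the remap1 name and the int16_t
; coordinate types are inferred from the cglobal declaration below and its
; sign-extending loads).
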
INIT_YMM avx2
cglobal remap1_8bit_line, 6, 7, 6, dst, width, src, in_linesize, u, v, x
    movsxdifnidn widthq, widthd
    xor xq, xq
    movd xm0, in_linesized
    pcmpeqw m4, m4
    VBROADCASTI128 m3, [pb_mask]
    vpbroadcastd m0, xm0

    .loop:
        pmovsxwd m1, [vq + xq * 2]
        pmovsxwd m2, [uq + xq * 2]

        pmulld m1, m0
        paddd m1, m2
        mova m2, m4
        vpgatherdd m5, [srcq + m1], m2
        pshufb m1, m5, m3
        vextracti128 xm2, m1, 1
        movd [dstq+xq], xm1
        movd [dstq+xq+4], xm2

        add xq, mmsize / 4
        cmp xq, widthq
        jl .loop
    RET
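
; 16-bit nearest-neighbour variant (presumably exported as
; ff_remap1_16bit_line_avx2): identical to the 8-bit routine above except that
; u is shifted left by 1 to turn a sample index into a byte offset and pw_mask
; keeps the low word of each gathered dword, so 8 uint16_t pixels are written
; per iteration.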
INIT_YMM avx2
cglobal remap1_16bit_line, 6, 7, 6, dst, width, src, in_linesize, u, v, x
    movsxdifnidn widthq, widthd
    xor xq, xq
    movd xm0, in_linesized
    pcmpeqw m4, m4
    VBROADCASTI128 m3, [pw_mask]
    vpbroadcastd m0, xm0

    .loop:
        pmovsxwd m1, [vq + xq * 2]
        pmovsxwd m2, [uq + xq * 2]

        pslld m2, 0x1
        pmulld m1, m0
        paddd m1, m2
        mova m2, m4
        vpgatherdd m5, [srcq + m1], m2
        pshufb m1, m5, m3
        vextracti128 xm2, m1, 1
        movq [dstq+xq*2], xm1
        movq [dstq+xq*2+8], xm2

        add xq, mmsize / 4
        cmp xq, widthq
        jl .loop
    RET
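
; Bilinear-style 2x2 remap (presumably ff_remap2_8bit_line_avx2): each output
; pixel is the weighted sum of 4 gathered source bytes, with the four 16-bit
; weights per pixel taken from ker; the sum is shifted right by 14, i.e. the
; weights appear to be Q14 fixed point.  Two pixels are produced per
; iteration.  On x86-32 there are not enough GPRs for an eighth register, so
; in_linesize is broadcast into m0 first and its register is then reused as
; the loop counter via DEFINE_ARGS.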
INIT_YMM avx2
cglobal remap2_8bit_line, 7, 8, 8, dst, width, src, in_linesize, u, v, ker, x
    movsxdifnidn widthq, widthd
    movd xm0, in_linesized
%if ARCH_X86_32
DEFINE_ARGS dst, width, src, x, u, v, ker
%endif
    xor xq, xq
    pcmpeqw m7, m7
    vpbroadcastd m0, xm0
    vpbroadcastd m6, [pd_255]

    .loop:
        pmovsxwd m1, [kerq + xq * 8]
        pmovsxwd m2, [vq + xq * 8]
        pmovsxwd m3, [uq + xq * 8]

        pmulld m4, m2, m0
        paddd m4, m3
        mova m3, m7
        vpgatherdd m2, [srcq + m4], m3
        pand m2, m6
        pmulld m2, m1
        phaddd m2, m2
        phaddd m1, m2, m2
        psrld m1, m1, 0xe
        vextracti128 xm2, m1, 1

        pextrb [dstq+xq], xm1, 0
        pextrb [dstq+xq+1], xm2, 0

        add xq, mmsize / 16
        cmp xq, widthq
        jl .loop
    RET
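
; 16-bit variant of the 2x2 remap (presumably ff_remap2_16bit_line_avx2):
; u is doubled to index uint16_t samples, pd_65535 masks each gathered dword
; down to one 16-bit sample, and the two results per iteration are stored
; with pextrw.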
INIT_YMM avx2
cglobal remap2_16bit_line, 7, 8, 8, dst, width, src, in_linesize, u, v, ker, x
    movsxdifnidn widthq, widthd
    movd xm0, in_linesized
%if ARCH_X86_32
DEFINE_ARGS dst, width, src, x, u, v, ker
%endif
    xor xq, xq
    pcmpeqw m7, m7
    vpbroadcastd m0, xm0
    vpbroadcastd m6, [pd_65535]

    .loop:
        pmovsxwd m1, [kerq + xq * 8]
        pmovsxwd m2, [vq + xq * 8]
        pmovsxwd m3, [uq + xq * 8]

        pslld m3, 0x1
        pmulld m4, m2, m0
        paddd m4, m3
        mova m3, m7
        vpgatherdd m2, [srcq + m4], m3
        pand m2, m6
        pmulld m2, m1
        phaddd m2, m2
        phaddd m1, m2, m2
        psrld m1, m1, 0xe
        vextracti128 xm2, m1, 1

        pextrw [dstq+xq*2], xm1, 0
        pextrw [dstq+xq*2+2], xm2, 0

        add xq, mmsize / 16
        cmp xq, widthq
        jl .loop
    RET

%if ARCH_X86_64
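
; 3x3 (9-tap) remap (presumably ff_remap3_8bit_line_avx2), built only on
; x86-64 where enough registers are available: eight taps are handled with a
; single gather and a HADDD reduction, the ninth tap is fetched and multiplied
; with scalar code, and u, v and ker therefore advance by 9 * 2 = 18 bytes per
; output pixel.  One pixel is written per iteration.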
INIT_YMM avx2
cglobal remap3_8bit_line, 7, 11, 8, dst, width, src, in_linesize, u, v, ker, x, y, tmp, z
    movsxdifnidn widthq, widthd
    xor zq, zq
    xor yq, yq
    xor xq, xq
    movd xm0, in_linesized
    pcmpeqw m7, m7
    vpbroadcastd m0, xm0
    vpbroadcastd m6, [pd_255]

    .loop:
        pmovsxwd m1, [kerq + yq]
        pmovsxwd m2, [vq + yq]
        pmovsxwd m3, [uq + yq]

        pmulld m4, m2, m0
        paddd m4, m3
        mova m3, m7
        vpgatherdd m2, [srcq + m4], m3
        pand m2, m6
        pmulld m2, m1
        HADDD m2, m1
        movzx tmpq, word [vq + yq + 16]
        imul tmpq, in_linesizeq
        movzx zq, word [uq + yq + 16]
        add tmpq, zq
        movzx zq, byte [srcq + tmpq]
        movzx tmpq, word [kerq + yq + 16]
        imul zd, tmpd
        movd xm1, zd
        paddd m2, m1
        psrld m2, m2, 0xe

        packuswb m2, m2
        pextrb [dstq+xq], xm2, 0

        add xq, 1
        add yq, 18
        cmp xq, widthq
        jl .loop
    RET
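
; 4x4 (16-tap) remap (presumably ff_remap4_8bit_line_avx2), x86-64 only: the
; 16 weights per pixel are split across two 8-wide gathers and multiplies
; whose partial sums are added and reduced with HADDD; y advances by
; 16 * 2 = 32 bytes per output pixel and one pixel is written per iteration.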
INIT_YMM avx2
cglobal remap4_8bit_line, 7, 9, 11, dst, width, src, in_linesize, u, v, ker, x, y
    movsxdifnidn widthq, widthd
    xor yq, yq
    xor xq, xq
    movd xm0, in_linesized
    pcmpeqw m7, m7
    vpbroadcastd m0, xm0
    vpbroadcastd m6, [pd_255]

    .loop:
        pmovsxwd m1, [kerq + yq]
        pmovsxwd m5, [kerq + yq + 16]
        pmovsxwd m2, [vq + yq]
        pmovsxwd m8, [vq + yq + 16]
        pmovsxwd m3, [uq + yq]
        pmovsxwd m9, [uq + yq + 16]

        pmulld m4, m2, m0
        pmulld m10, m8, m0
        paddd m4, m3
        paddd m10, m9
        mova m3, m7
        vpgatherdd m2, [srcq + m4], m3
        mova m3, m7
        vpgatherdd m4, [srcq + m10], m3
        pand m2, m6
        pand m4, m6
        pmulld m2, m1
        pmulld m4, m5

        paddd m2, m4
        HADDD m2, m1
        psrld m2, m2, 0xe
        packuswb m2, m2

        pextrb [dstq+xq], xm2, 0

        add xq, 1
        add yq, 32
        cmp xq, widthq
        jl .loop
    RET

%endif
%endif