;*****************************************************************************
;* x86-optimized functions for convolution filter
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
sobel_perm: db  0, 16, 32, 48,  1, 17, 33, 49,  2, 18, 34, 50,  3, 19, 35, 51
            db  4, 20, 36, 52,  5, 21, 37, 53,  6, 22, 38, 54,  7, 23, 39, 55
            db  8, 24, 40, 56,  9, 25, 41, 57, 10, 26, 42, 58, 11, 27, 43, 59
            db 12, 28, 44, 60, 13, 29, 45, 61, 14, 30, 46, 62, 15, 31, 47, 63
sobel_mulA: db -1, 1, -2, 2
sobel_mulB: db  1, -1, 2, -2
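
; Note on the constants above (an informal description, inferred from how they
; are consumed in the AVX512ICL kernel below): sobel_perm interleaves the bytes
; of four 16-byte source rows so that each dword lane holds the four taps of
; one vpdpbusd step, while sobel_mulA/sobel_mulB hold the signed Sobel weights
; that are broadcast to all 16 dword lanes via the {1to16} memory operand.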
; void filter_3x3_sse4(uint8_t *dst, int width,
;                      float rdiv, float bias, const int *const matrix,
;                      const uint8_t *c[], int peak, int radius,
;                      int dstride, int stride)
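;
; Rough scalar model of one output pixel (an orientation sketch only, assuming
; standard 3x3 convolution semantics and matching the rdiv/bias/0.5 and 8-bit
; saturation steps performed below):
;     sum = 0;
;     for (i = 0; i < 9; i++)
;         sum += matrix[i] * c[i][x];
;     dst[x] = clip_uint8((int)(sum * rdiv + bias + 0.5f));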
    movss m2, [matrixq + 4 * %1]
    movzx ptrd, byte [c%1q + xq]    ; load pixel c[%1][x]
    imul  ptrd, [matrixq + 4 * %1]  ; multiply by matrix[%1]
cglobal filter_3x3, 4, 15, 7, dst, width, matrix, ptr, c0, c1, c2, c3, c4, c5, c6, c7, c8, r, x
cglobal filter_3x3, 4, 15, 7, dst, width, rdiv, bias, matrix, ptr, c0, c1, c2, c3, c4, c5, c6, c7, c8, r, x
    DEFINE_ARGS dst, width, matrix, ptr, c0, c1, c2, c3, c4, c5, c6, c7, c8, r, x
    movsxdifnidn widthq, widthd
    ; load the nine input line pointers from the c[] array
    mov   c0q, [ptrq + 0*gprsize]
    mov   c1q, [ptrq + 1*gprsize]
    mov   c2q, [ptrq + 2*gprsize]
    mov   c3q, [ptrq + 3*gprsize]
    mov   c4q, [ptrq + 4*gprsize]
    mov   c5q, [ptrq + 5*gprsize]
    mov   c6q, [ptrq + 6*gprsize]
    mov   c7q, [ptrq + 7*gprsize]
    mov   c8q, [ptrq + 8*gprsize]
    pxor m4, m4                     ; sum = 0;
    mulps m4, m0                    ; sum *= rdiv
    addps m4, m1                    ; sum += bias
    addps m4, m5                    ; sum += 0.5
    movss [dstq + xq], m4           ; write 4 packed output pixels
    ; reuse r to hold sum, init with zero
    mulss m4, m0                    ; sum *= rdiv
    addss m4, m1                    ; sum += bias
    addss m4, m5                    ; sum += 0.5
    ; there is no simple scalar instruction to convert from 32-bit to 8-bit
    ; with saturation, so just use the packed SSE instructions here for
    ; simplicity.
    cvttps2dq m4, m4                ; trunc to integer
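    ; A minimal sketch of the packed saturating narrow referred to above
    ; (assuming the dword sum is still in m4):
    ;     packssdw m4, m4           ; int32 -> int16, signed saturation
    ;     packuswb m4, m4           ; int16 -> uint8, unsigned saturation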
    movzx ptrd, byte [c%1q + xq]
    movzx ptrd, byte [c%1q + xq]
; void filter_sobel_avx512(uint8_t *dst, int width,
;                          float scale, float delta, const int *const matrix,
;                          const uint8_t *c[], int peak, int radius,
;                          int dstride, int stride)
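;
; Rough per-pixel model (an orientation sketch only, inferred from the
; sobel_mulA/sobel_mulB weights and the square/accumulate/scale sequence
; in the loops below):
;     Gx, Gy = the two 3x3 Sobel gradient sums over c[0..8][x];
;     dst[x] = clip_uint8(sqrt(Gx*Gx + Gy*Gy) * scale + delta);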
%macro FILTER_SOBEL 0
cglobal filter_sobel, 4, 15, 7, dst, width, matrix, ptr, c0, c1, c2, c3, c4, c5, c6, c7, c8, r, x
cglobal filter_sobel, 4, 15, 7, dst, width, rdiv, bias, matrix, ptr, c0, c1, c2, c3, c4, c5, c6, c7, c8, r, x
    VBROADCASTSS m0, xmm2
    VBROADCASTSS m1, xmm3
    DEFINE_ARGS dst, width, matrix, ptr, c0, c1, c2, c3, c4, c5, c6, c7, c8, r, x
    VBROADCASTSS m0, xmm0
    VBROADCASTSS m1, xmm1
    movsxdifnidn widthq, widthd
    mov   c0q, [ptrq + 0*gprsize]
    mov   c1q, [ptrq + 1*gprsize]
    mov   c2q, [ptrq + 2*gprsize]
    mov   c3q, [ptrq + 3*gprsize]
    mov   c4q, [ptrq + 4*gprsize]
    mov   c5q, [ptrq + 5*gprsize]
    mov   c6q, [ptrq + 6*gprsize]
    mov   c7q, [ptrq + 7*gprsize]
    mov   c8q, [ptrq + 8*gprsize]
    mova m6, [sobel_perm]
    pmovzxbd     m5, [c0q + xq]
    vinserti32x4 ym3, [c6q + xq], 1
    pmovzxbd     m4, [c8q + xq]
    vinserti32x4 m2, m3, [c1q + xq], 2
    vinserti32x4 m3, [c5q + xq], 2
    vinserti32x4 m2, [c7q + xq], 3
    vinserti32x4 m3, [c3q + xq], 3
    vpdpbusd m4, m2, [sobel_mulA] {1to16} ; m4 += per-lane dot of 4 pixel bytes with the A weights
    vpdpbusd m5, m3, [sobel_mulB] {1to16} ; m5 += per-lane dot of 4 pixel bytes with the B weights
    VFMADD231PS m4, m5, m5          ; m4 += m5 * m5
    fmaddps m4, m4, m0, m1          ; sum = sum * scale + delta
    vpmovusdb [dstq + xq], m4       ; clip to unsigned 8-bit and store
    fmaddss xmm4, xmm5, xmm5, xmm4  ; xmm4 += xmm5 * xmm5
    fmaddss xmm4, xmm4, xm0, xm1    ; sum = sum * rdiv + bias
    cvttps2dq xmm4, xmm4            ; trunc to integer
%if HAVE_AVX512ICL_EXTERNAL