;******************************************************************************
;* x86-optimized horizontal line scaling functions
;* Copyright (c) 2011 Ronald S. Bultje <rsbultje@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA
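
; max_19bit_int is the positive clip bound for the 19-bit output path;
; minshort and unicoeff are the bias and its correction for the 16-bit-input
; pmaddwd trick described at the psubw/paddd sites below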
max_19bit_int: times 4 dd 0x7ffff
minshort:      times 8 dw 0x8000
unicoeff:      times 4 dd 0x20000000

SECTION .text

;-----------------------------------------------------------------------------
; horizontal line scaling
;
; void hscale<source_width>to<intermediate_nbits>_<filterSize>_<opt>
;                               (SwsInternal *c, int{16,32}_t *dst,
;                                int dstW, const uint{8,16}_t *src,
;                                const int16_t *filter,
;                                const int32_t *filterPos, int filterSize);
;
; Scale one horizontal line. Input pixels are either 8 bits wide or 9-16 bits
; wide, stored as 16-bit units ($source_width can be 8, 9, 10, 12, 14 or 16;
; the difference is whether we have to downscale before multiplying). The
; filter is 14 bits. Output is either 15 bits (in int16_t) or 19 bits (in
; int32_t), as given in $intermediate_nbits. Each output pixel is generated
; from $filterSize input pixels; the position of the first pixel is given in
; filterPos[nOutputPixel].
;-----------------------------------------------------------------------------
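;
; For reference, a rough scalar equivalent of the 8-bit input, 15-bit output
; case (a sketch along the lines of the C reference in libswscale; the names
; match the prototype above):
;
;     for (int i = 0; i < dstW; i++) {
;         int val = 0;
;         for (int j = 0; j < filterSize; j++)
;             val += src[filterPos[i] + j] * filter[filterSize * i + j];
;         dst[i] = FFMIN(val >> 7, (1 << 15) - 1);   // 7 == 14 + 8 - 15
;     }
;-----------------------------------------------------------------------------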

; SCALE_FUNC source_width, intermediate_nbits, filtersize, filtersuffix, n_args, n_xmm
%macro SCALE_FUNC 6
%ifnidn %3, X
cglobal hscale%1to%2_%4, %5, 7, %6, pos0, dst, w, src, filter, fltpos, pos1
%else
cglobal hscale%1to%2_%4, %5, 10, %6, pos0, dst, w, srcmem, filter, fltpos, fltsize
%endif
%if ARCH_X86_64
    movsxd            wq, wd
%define mov32 movsxd
%else ; x86-32
%define mov32 mov
%endif ; x86-64
%if %2 == 19
    mova              m2, [max_19bit_int]
%endif ; %2 == 19
%if %1 == 16
    mova              m6, [minshort]
    mova              m7, [unicoeff]
%elif %1 == 8
    pxor              m3, m3
%endif ; %1 == 8/16

%if %1 == 8
%define movlh movd
%define movbh movh
%define srcmul 1
%else ; %1 == 9-16
%define movlh movq
%define movbh movu
%define srcmul 2
%endif ; %1 == 8/9-16

%ifnidn %3, X

    ; setup loop
%if %3 == 8
    shl               wq, 1             ; this allows *16 (i.e. now *8) in lea instructions for the 8-tap filter
%define wshr 1
%else ; %3 == 4
%define wshr 0
%endif ; %3 == 8
    lea          filterq, [filterq+wq*8]
%if %2 == 15
    lea             dstq, [dstq+wq*(2>>wshr)]
%else ; %2 == 19
    lea             dstq, [dstq+wq*(4>>wshr)]
%endif ; %2 == 15/19
    lea          fltposq, [fltposq+wq*(4>>wshr)]
    neg               wq
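    ; (wq now runs from -w up to 0, so [base + wq*scale] addressing walks
    ;  forward through dst/filterPos while "add wq, N; jl .loop" does the
    ;  loop test for free)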

.loop:
%if %3 == 4 ; filterSize == 4 scaling
    ; load 2x4 or 4x4 source pixels into m0/m1
    mov32          pos0q, dword [fltposq+wq*4+ 0]  ; filterPos[0]
    mov32          pos1q, dword [fltposq+wq*4+ 4]  ; filterPos[1]
    movlh             m0, [srcq+pos0q*srcmul]      ; src[filterPos[0] + {0,1,2,3}]
%if mmsize == 8
    movlh             m1, [srcq+pos1q*srcmul]      ; src[filterPos[1] + {0,1,2,3}]
%else ; mmsize == 16
%if %1 > 8
    movhps            m0, [srcq+pos1q*srcmul]      ; src[filterPos[1] + {0,1,2,3}]
%else ; %1 == 8
    movd              m4, [srcq+pos1q*srcmul]      ; src[filterPos[1] + {0,1,2,3}]
%endif
    mov32          pos0q, dword [fltposq+wq*4+ 8]  ; filterPos[2]
    mov32          pos1q, dword [fltposq+wq*4+12]  ; filterPos[3]
    movlh             m1, [srcq+pos0q*srcmul]      ; src[filterPos[2] + {0,1,2,3}]
%if %1 > 8
    movhps            m1, [srcq+pos1q*srcmul]      ; src[filterPos[3] + {0,1,2,3}]
%else ; %1 == 8
    movd              m5, [srcq+pos1q*srcmul]      ; src[filterPos[3] + {0,1,2,3}]
    punpckldq         m0, m4
    punpckldq         m1, m5
%endif ; %1 == 8
%endif ; mmsize == 8/16
%if %1 == 8
    punpcklbw         m0, m3                       ; byte -> word
    punpcklbw         m1, m3                       ; byte -> word
%endif ; %1 == 8

    ; multiply with filter coefficients
%if %1 == 16 ; pmaddwd needs signed adds, so this moves unsigned -> signed, we'll
             ; add back 0x8000 * sum(coeffs) after the horizontal add
    psubw             m0, m6
    psubw             m1, m6
%endif ; %1 == 16
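    ; (for reference: with unsigned samples s and Q14 coeffs c,
    ;  sum((s - 0x8000) * c) + 0x8000*sum(c) == sum(s*c); the coefficient sum
    ;  is nominally unity in Q14 (0x4000), so the correction term
    ;  0x8000 * 0x4000 == 0x20000000 is exactly the unicoeff constant that is
    ;  added back before the final shift)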
    pmaddwd           m0, [filterq+wq*8+mmsize*0]  ; *= filter[{0,1,..,6,7}]
    pmaddwd           m1, [filterq+wq*8+mmsize*1]  ; *= filter[{8,9,..,14,15}]

    ; add up horizontally (4 srcpix * 4 coefficients -> 1 dstpix)
%if notcpuflag(ssse3) ; sse2
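    ; (no phaddd before ssse3; gather even/odd dwords with shufps instead:
    ;  10001000b picks {m0[0],m0[2],m1[0],m1[2]} and 11011101b picks
    ;  {m0[1],m0[3],m1[1],m1[3]}, so the paddd below yields the four
    ;  horizontal pair sums in dstpix order)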
    mova              m4, m0
    shufps            m0, m1, 10001000b
    shufps            m4, m1, 11011101b
    paddd             m0, m4
%else ; ssse3/sse4
    phaddd            m0, m1            ; filter[{ 0, 1, 2, 3}]*src[filterPos[0]+{0,1,2,3}],
                                        ; filter[{ 4, 5, 6, 7}]*src[filterPos[1]+{0,1,2,3}],
                                        ; filter[{ 8, 9,10,11}]*src[filterPos[2]+{0,1,2,3}],
                                        ; filter[{12,13,14,15}]*src[filterPos[3]+{0,1,2,3}]
%endif ; sse2/ssse3/sse4
%else ; %3 == 8, i.e. filterSize == 8 scaling
    ; load 2x8 or 4x8 source pixels into m0, m1, m4 and m5
    mov32          pos0q, dword [fltposq+wq*2+0]   ; filterPos[0]
    mov32          pos1q, dword [fltposq+wq*2+4]   ; filterPos[1]
    movbh             m0, [srcq+ pos0q   *srcmul]  ; src[filterPos[0] + {0,1,2,3,4,5,6,7}]
%if mmsize == 8
    movbh             m1, [srcq+(pos0q+4)*srcmul]  ; src[filterPos[0] + {4,5,6,7}]
    movbh             m4, [srcq+ pos1q   *srcmul]  ; src[filterPos[1] + {0,1,2,3}]
    movbh             m5, [srcq+(pos1q+4)*srcmul]  ; src[filterPos[1] + {4,5,6,7}]
%else ; mmsize == 16
    movbh             m1, [srcq+ pos1q   *srcmul]  ; src[filterPos[1] + {0,1,2,3,4,5,6,7}]
    mov32          pos0q, dword [fltposq+wq*2+8]   ; filterPos[2]
    mov32          pos1q, dword [fltposq+wq*2+12]  ; filterPos[3]
    movbh             m4, [srcq+ pos0q   *srcmul]  ; src[filterPos[2] + {0,1,2,3,4,5,6,7}]
    movbh             m5, [srcq+ pos1q   *srcmul]  ; src[filterPos[3] + {0,1,2,3,4,5,6,7}]
%endif ; mmsize == 8/16
%if %1 == 8
    punpcklbw         m0, m3                       ; byte -> word
    punpcklbw         m1, m3                       ; byte -> word
    punpcklbw         m4, m3                       ; byte -> word
    punpcklbw         m5, m3                       ; byte -> word
%endif ; %1 == 8

    ; multiply
%if %1 == 16 ; pmaddwd needs signed adds, so this moves unsigned -> signed, we'll
             ; add back 0x8000 * sum(coeffs) after the horizontal add
    psubw             m0, m6
    psubw             m1, m6
    psubw             m4, m6
    psubw             m5, m6
%endif ; %1 == 16
    pmaddwd           m0, [filterq+wq*8+mmsize*0]  ; *= filter[{0,1,..,6,7}]
    pmaddwd           m1, [filterq+wq*8+mmsize*1]  ; *= filter[{8,9,..,14,15}]
    pmaddwd           m4, [filterq+wq*8+mmsize*2]  ; *= filter[{16,17,..,22,23}]
    pmaddwd           m5, [filterq+wq*8+mmsize*3]  ; *= filter[{24,25,..,30,31}]

    ; add up horizontally (8 srcpix * 8 coefficients -> 1 dstpix)
%if notcpuflag(ssse3) ; sse2
%if %1 == 8
%define mex m6
%else
%define mex m3
%endif
    ; emulate horizontal add as transpose + vertical add
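    ; (mex is free scratch: m6 is unused in the 8-bit path, m3 otherwise;
    ;  each of m0/m1/m4/m5 holds 4 partial dword sums of one dstpix; the
    ;  punpck{l,h}dq + paddd rounds pair them up across pixels, and the final
    ;  punpck{l,h}qdq + paddd leaves one finished sum per dword, in dstpix
    ;  order)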
    mova             mex, m0
    punpckldq         m0, m1
    punpckhdq        mex, m1
    paddd             m0, mex
    mova              m1, m4
    punpckldq         m4, m5
    punpckhdq         m1, m5
    paddd             m4, m1
    mova              m1, m0
    punpcklqdq        m0, m4
    punpckhqdq        m1, m4
    paddd             m0, m1
%else ; ssse3/sse4
    ; FIXME if we rearrange the filter in pairs of 4, we can
    ; load pixels likewise and use 2 x paddd + phaddd instead
    ; of 3 x phaddd here, faster on older cpus
    phaddd            m0, m1
    phaddd            m4, m5
    phaddd            m0, m4            ; filter[{ 0, 1,..., 6, 7}]*src[filterPos[0]+{0,1,...,6,7}],
                                        ; filter[{ 8, 9,...,14,15}]*src[filterPos[1]+{0,1,...,6,7}],
                                        ; filter[{16,17,...,22,23}]*src[filterPos[2]+{0,1,...,6,7}],
                                        ; filter[{24,25,...,30,31}]*src[filterPos[3]+{0,1,...,6,7}]
%endif ; sse2/ssse3/sse4
%endif ; %3 == 4/8

%else ; %3 == X, i.e. any filterSize scaling

%ifidn %4, X4
%define dlt 4
%else ; %4 == X || %4 == X8
%define dlt 0
%endif ; %4 ==/!= X4
%if ARCH_X86_64
%define srcq    r8
%define pos1q   r7
%define srcendq r9
    movsxd      fltsizeq, fltsized      ; filterSize
    lea          srcendq, [srcmemq+(fltsizeq-dlt)*srcmul]   ; &src[filterSize&~4]
%else ; x86-32
%define srcq    srcmemq
%define pos1q   dstq
%define srcendq r6m
    lea            pos0q, [srcmemq+(fltsizeq-dlt)*srcmul]   ; &src[filterSize&~4]
    mov          srcendq, pos0q
%endif ; x86-32/64
    lea          fltposq, [fltposq+wq*4]
%if %2 == 15
    lea             dstq, [dstq+wq*2]
%else ; %2 == 19
    lea             dstq, [dstq+wq*4]
%endif ; %2 == 15/19
    movifnidn      dstmp, dstq
    neg               wq

.loop:
    mov32          pos0q, dword [fltposq+wq*4+0]   ; filterPos[0]
    mov32          pos1q, dword [fltposq+wq*4+4]   ; filterPos[1]
    ; FIXME maybe do 4px/iteration on x86-64 (x86-32 wouldn't have enough regs)?
    pxor              m4, m4
    pxor              m5, m5
    mov             srcq, srcmemmp

.innerloop:
    ; load 2x8 (sse) source pixels into m0/m1 -> m4/m5
    movbh             m0, [srcq+ pos0q     *srcmul] ; src[filterPos[0] + {0,1,2,3(,4,5,6,7)}]
    movbh             m1, [srcq+(pos1q+dlt)*srcmul] ; src[filterPos[1] + {0,1,2,3(,4,5,6,7)}]
%if %1 == 8
    punpcklbw         m0, m3
    punpcklbw         m1, m3
%endif ; %1 == 8

    ; multiply
%if %1 == 16 ; pmaddwd needs signed adds, so this moves unsigned -> signed, we'll
             ; add back 0x8000 * sum(coeffs) after the horizontal add
    psubw             m0, m6
    psubw             m1, m6
%endif ; %1 == 16
    pmaddwd           m0, [filterq]                 ; filter[{0,1,2,3(,4,5,6,7)}]
    pmaddwd           m1, [filterq+(fltsizeq+dlt)*2] ; filter[filtersize+{0,1,2,3(,4,5,6,7)}]
    paddd             m4, m0
    paddd             m5, m1
    add          filterq, mmsize
    add             srcq, srcmul*mmsize/2
    cmp             srcq, srcendq                   ; while (src += 4) < &src[filterSize]
    jl .innerloop
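
    ; (the X4 variant presumably serves filter sizes that are a multiple of 4
    ;  but not of 8: the inner loop above eats 8 taps per iteration, so the 4
    ;  leftover taps of dstpx[0] get packed together with the first 4 taps of
    ;  dstpx[1] below)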
%ifidn %4, X4
    mov32          pos1q, dword [fltposq+wq*4+4]   ; filterPos[1]
    movlh             m0, [srcq+ pos0q     *srcmul] ; split last 4 srcpx of dstpx[0]
    sub            pos1q, fltsizeq                  ; and first 4 srcpx of dstpx[1]
%if %1 > 8
    movhps            m0, [srcq+(pos1q+dlt)*srcmul]
%else ; %1 == 8
    movd              m1, [srcq+(pos1q+dlt)*srcmul]
    punpckldq         m0, m1
%endif ; %1 == 8
%if %1 == 8
    punpcklbw         m0, m3
%endif ; %1 == 8
%if %1 == 16 ; pmaddwd needs signed adds, so this moves unsigned -> signed, we'll
             ; add back 0x8000 * sum(coeffs) after the horizontal add
    psubw             m0, m6
%endif ; %1 == 16
    pmaddwd           m0, [filterq]
%endif ; %4 == X4

    lea          filterq, [filterq+(fltsizeq+dlt)*2]

%if notcpuflag(ssse3) ; sse2
    mova              m1, m4
    punpcklqdq        m4, m5
    punpckhqdq        m1, m5
    paddd             m4, m1
%else ; ssse3/sse4
    phaddd            m4, m5
%endif ; sse2/ssse3/sse4
%ifidn %4, X4
    paddd             m4, m0
%endif ; %4 == X4
%if notcpuflag(ssse3) ; sse2
    pshufd            m4, m4, 11011000b
    movhlps           m0, m4
    paddd             m0, m4
%else ; ssse3/sse4
    phaddd            m4, m4
    SWAP               0, 4
%endif ; sse2/ssse3/sse4
%endif ; %3 ==/!= X

%if %1 == 16 ; add 0x8000 * sum(coeffs), i.e. back from signed -> unsigned
    paddd             m0, m7
%endif ; %1 == 16

    ; clip, store
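    ; (shift amount: %1-bit samples times Q14 coefficients occupy 14 + %1
    ;  bits, so dropping 14 + %1 - %2 bits renormalizes the sum to the
    ;  %2-bit output range)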
    psrad             m0, 14 + %1 - %2
%ifidn %3, X
    movifnidn       dstq, dstmp
%endif ; %3 == X
%if %2 == 15
    packssdw          m0, m0
%ifnidn %3, X
    movh [dstq+wq*(2>>wshr)], m0
%else ; %3 == X
    movd   [dstq+wq*2], m0
%endif ; %3 ==/!= X
%else ; %2 == 19
    PMINSD            m0, m2, m4
%ifnidn %3, X
    mova [dstq+wq*(4>>wshr)], m0
%else ; %3 == X
    movq   [dstq+wq*4], m0
%endif ; %3 ==/!= X
%endif ; %2 == 15/19
%ifnidn %3, X
    add               wq, (mmsize<<wshr)/4 ; both 8tap and 4tap really only do 4 pixels
                                           ; per iteration; see "shl wq, 1" above for why
%else ; %3 == X
    add               wq, 2
%endif ; %3 ==/!= X
    jl .loop

%endmacro

; SCALE_FUNCS source_width, intermediate_nbits, n_xmm
%macro SCALE_FUNCS 3
SCALE_FUNC %1, %2,  4,  4, 6, %3
SCALE_FUNC %1, %2,  8,  8, 6, %3
SCALE_FUNC %1, %2,  X, X4, 7, %3
SCALE_FUNC %1, %2,  X, X8, 7, %3
%endmacro

; SCALE_FUNCS2 8_xmm_args, 9to10_xmm_args, 16_xmm_args
%macro SCALE_FUNCS2 3
%if notcpuflag(sse4)
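; (15-bit outputs clip with packssdw rather than pminsd, so an sse4 build
;  would just duplicate the ssse3 functions; only the 19-bit set, whose
;  PMINSD maps to a native instruction on sse4, is built for it)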
SCALE_FUNCS  8, 15, %1
SCALE_FUNCS  9, 15, %2
SCALE_FUNCS 10, 15, %2
SCALE_FUNCS 12, 15, %2
SCALE_FUNCS 14, 15, %2
SCALE_FUNCS 16, 15, %3
%endif ; !sse4
SCALE_FUNCS  8, 19, %1
SCALE_FUNCS  9, 19, %2
SCALE_FUNCS 10, 19, %2
SCALE_FUNCS 12, 19, %2
SCALE_FUNCS 14, 19, %2
SCALE_FUNCS 16, 19, %3
%endmacro
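
; each INIT_XMM + SCALE_FUNCS2 pair below instantiates the full set for one
; instruction set; e.g. the sse2 pass emits (modulo the configured symbol
; prefix) hscale8to15_4_sse2, hscale8to15_8_sse2, hscale8to15_X4_sse2 and
; hscale8to15_X8_sse2, and likewise for the other bit depths and for the
; 19-bit outputs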
INIT_XMM sse2
SCALE_FUNCS2 7, 6, 8
INIT_XMM ssse3
SCALE_FUNCS2 6, 6, 8
INIT_XMM sse4
SCALE_FUNCS2 6, 6, 8