;******************************************************************************
;* x86-optimized horizontal line scaling functions
;* Copyright (c) 2011 Ronald S. Bultje <rsbultje@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"

SECTION_RODATA

max_19bit_int: times 4 dd 0x7ffff
minshort:      times 8 dw 0x8000
unicoeff:      times 4 dd 0x20000000
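
; (max_19bit_int == (1 << 19) - 1, the clip ceiling for 19-bit output;
; minshort is the 0x8000 bias that maps uint16 samples onto int16 for
; pmaddwd; unicoeff == 0x8000 << 14, the bias correction added back after
; the horizontal add -- see the comments in the code below)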

SECTION .text

;-----------------------------------------------------------------------------
; horizontal line scaling
;
; void hscale<source_width>to<intermediate_nbits>_<filterSize>_<opt>
;                             (SwsInternal *c, int{16,32}_t *dst,
;                              int dstW, const uint{8,16}_t *src,
;                              const int16_t *filter,
;                              const int32_t *filterPos, int filterSize);
;
; Scale one horizontal line. Input is either 8-bit or 16-bit wide
; ($source_width can be 8, 9, 10, 12, 14 or 16; the difference is whether we
; have to downscale before multiplying). The filter is 14 bits. Output is
; either 15 bits (in int16_t) or 19 bits (in int32_t), as given in
; $intermediate_nbits. Each output pixel is generated from $filterSize input
; pixels; the position of the first pixel is given in filterPos[nOutputPixel].
;-----------------------------------------------------------------------------
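;
; For reference, a minimal scalar C sketch of what each generated function
; computes (illustrative only; NARROW() is a hypothetical helper standing in
; for the shift/clip to $intermediate_nbits performed below):
;
;     for (int i = 0; i < dstW; i++) {
;         int srcPos = filterPos[i];
;         int val    = 0;
;         for (int j = 0; j < filterSize; j++)
;             val += src[srcPos + j] * filter[filterSize * i + j];
;         dst[i] = NARROW(val);   /* >> (14 + source_width - intermediate_nbits),
;                                    then clip */
;     }
;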
; SCALE_FUNC source_width, intermediate_nbits, filtersize, filtersuffix, n_args, n_xmm
%macro SCALE_FUNC 6
%ifnidn %3, X
cglobal hscale%1to%2_%4, %5, 7, %6, pos0, dst, w, src, filter, fltpos, pos1
%else
cglobal hscale%1to%2_%4, %5, 10, %6, pos0, dst, w, srcmem, filter, fltpos, fltsize
%endif
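; e.g. "SCALE_FUNC 8, 15, 4, 4, 6, 6" would assemble hscale8to15_4_<opt>
; (8-bit input, 15-bit intermediates, 4-tap filter; the n_xmm value here is
; illustrative); the X/X4/X8 names cover the variable-filterSize variants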
    mova            m2, [max_19bit_int]     ; clip constant for 19-bit output
    shl             wq, 1                   ; this allows *16 (i.e. now *8) in lea instructions for the 8-tap filter
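    ; (lea can scale an index by at most 8; addressing 8 taps * 2 bytes per
    ; output pixel would need wq*16, so wq is pre-doubled and wq*8 reaches
    ; the same filter offsets)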
    lea        filterq, [filterq+wq*8]
%if %2 == 15
    lea           dstq, [dstq+wq*(2>>wshr)]
%else ; %2 == 19
    lea           dstq, [dstq+wq*(4>>wshr)]
%endif ; %2 == 15/19
    lea        fltposq, [fltposq+wq*(4>>wshr)]
.loop:
%if %3 == 4 ; filterSize == 4 scaling
    ; load 2x4 or 4x4 source pixels into m0/m1
    mov32        pos0q, dword [fltposq+wq*4+ 0] ; filterPos[0]
    mov32        pos1q, dword [fltposq+wq*4+ 4] ; filterPos[1]
    movlh           m0, [srcq+pos0q*srcmul]     ; src[filterPos[0] + {0,1,2,3}]
%if mmsize == 8
    movlh           m1, [srcq+pos1q*srcmul]     ; src[filterPos[1] + {0,1,2,3}]
%else ; mmsize == 16
%if %1 > 8
    movhps          m0, [srcq+pos1q*srcmul]     ; src[filterPos[1] + {0,1,2,3}]
%else ; %1 == 8
    movd            m4, [srcq+pos1q*srcmul]     ; src[filterPos[1] + {0,1,2,3}]
%endif
    mov32        pos0q, dword [fltposq+wq*4+ 8] ; filterPos[2]
    mov32        pos1q, dword [fltposq+wq*4+12] ; filterPos[3]
    movlh           m1, [srcq+pos0q*srcmul]     ; src[filterPos[2] + {0,1,2,3}]
%if %1 > 8
    movhps          m1, [srcq+pos1q*srcmul]     ; src[filterPos[3] + {0,1,2,3}]
%else ; %1 == 8
    movd            m5, [srcq+pos1q*srcmul]     ; src[filterPos[3] + {0,1,2,3}]
    punpckldq       m0, m4                      ; merge the per-pixel dwords
    punpckldq       m1, m5
%endif ; %1 == 8/9/10/16
%endif ; mmsize == 8/16
%if %1 == 8
    punpcklbw       m0, m3                  ; byte -> word (m3 is zero)
    punpcklbw       m1, m3                  ; byte -> word
%endif ; %1 == 8
    ; multiply with filter coefficients
%if %1 == 16 ; pmaddwd needs signed adds, so this moves unsigned -> signed, we'll
             ; add back 0x8000 * sum(coeffs) after the horizontal add
    psubw           m0, m6                  ; (m6 holds [minshort])
    psubw           m1, m6
%endif
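    ; (x - 0x8000 maps uint16 onto int16 for pmaddwd; since
    ; sum(coef * (x - 0x8000)) == sum(coef * x) - 0x8000 * sum(coeffs), and
    ; the 14-bit coefficients sum to 1 << 14, adding back unicoeff ==
    ; 0x8000 << 14 after the horizontal add restores the unsigned result)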
    pmaddwd         m0, [filterq+wq*8+mmsize*0] ; *= filter[{0,1,..,6,7}]
    pmaddwd         m1, [filterq+wq*8+mmsize*1] ; *= filter[{8,9,..,14,15}]
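    ; (pmaddwd computes dst.dw[k] = a.w[2k]*b.w[2k] + a.w[2k+1]*b.w[2k+1],
    ; so each doubleword already holds the sum over one pair of taps)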
    ; add up horizontally (4 srcpix * 4 coefficients -> 1 dstpix)
%if notcpuflag(ssse3) ; sse2
    mova            m4, m0
    shufps          m0, m1, 10001000b       ; even dwords of m0/m1
    shufps          m4, m1, 11011101b       ; odd dwords of m0/m1
    paddd           m0, m4                  ; pairwise sums, same as phaddd m0, m1
%else ; ssse3/sse4
    phaddd          m0, m1                  ; filter[{ 0, 1, 2, 3}]*src[filterPos[0]+{0,1,2,3}],
                                            ; filter[{ 4, 5, 6, 7}]*src[filterPos[1]+{0,1,2,3}],
                                            ; filter[{ 8, 9,10,11}]*src[filterPos[2]+{0,1,2,3}],
                                            ; filter[{12,13,14,15}]*src[filterPos[3]+{0,1,2,3}]
%endif ; sse2/ssse3/sse4
%else ; %3 == 8, i.e. filterSize == 8 scaling
    ; load 2x8 or 4x8 source pixels into m0, m1, m4 and m5
    mov32        pos0q, dword [fltposq+wq*2+0]  ; filterPos[0]
    mov32        pos1q, dword [fltposq+wq*2+4]  ; filterPos[1]
    movbh           m0, [srcq+ pos0q   *srcmul] ; src[filterPos[0] + {0,1,2,3,4,5,6,7}]
%if mmsize == 8
    movbh           m1, [srcq+(pos0q+4)*srcmul] ; src[filterPos[0] + {4,5,6,7}]
    movbh           m4, [srcq+ pos1q   *srcmul] ; src[filterPos[1] + {0,1,2,3}]
    movbh           m5, [srcq+(pos1q+4)*srcmul] ; src[filterPos[1] + {4,5,6,7}]
%else ; mmsize == 16
    movbh           m1, [srcq+ pos1q   *srcmul] ; src[filterPos[1] + {0,1,2,3,4,5,6,7}]
    mov32        pos0q, dword [fltposq+wq*2+8]  ; filterPos[2]
    mov32        pos1q, dword [fltposq+wq*2+12] ; filterPos[3]
    movbh           m4, [srcq+ pos0q   *srcmul] ; src[filterPos[2] + {0,1,2,3,4,5,6,7}]
    movbh           m5, [srcq+ pos1q   *srcmul] ; src[filterPos[3] + {0,1,2,3,4,5,6,7}]
%endif ; mmsize == 8/16
%if %1 == 8
    punpcklbw       m0, m3                  ; byte -> word (m3 is zero)
    punpcklbw       m1, m3                  ; byte -> word
    punpcklbw       m4, m3                  ; byte -> word
    punpcklbw       m5, m3                  ; byte -> word
%endif ; %1 == 8
%if %1 == 16 ; pmaddwd needs signed adds, so this moves unsigned -> signed, we'll
             ; add back 0x8000 * sum(coeffs) after the horizontal add
    psubw           m0, m6
    psubw           m1, m6
    psubw           m4, m6
    psubw           m5, m6
%endif
    pmaddwd         m0, [filterq+wq*8+mmsize*0] ; *= filter[{0,1,..,6,7}]
    pmaddwd         m1, [filterq+wq*8+mmsize*1] ; *= filter[{8,9,..,14,15}]
    pmaddwd         m4, [filterq+wq*8+mmsize*2] ; *= filter[{16,17,..,22,23}]
    pmaddwd         m5, [filterq+wq*8+mmsize*3] ; *= filter[{24,25,..,30,31}]
    ; add up horizontally (8 srcpix * 8 coefficients -> 1 dstpix)
%if notcpuflag(ssse3) ; sse2
    ; emulate horizontal add as transpose + vertical add
%else ; ssse3/sse4
    ; FIXME if we rearrange the filter in pairs of 4, we can
    ; load pixels likewise and use 2 x paddd + phaddd instead
    ; of 3 x phaddd here, faster on older cpus
    phaddd          m0, m1
    phaddd          m4, m5
    phaddd          m0, m4                  ; filter[{ 0, 1,..., 6, 7}]*src[filterPos[0]+{0,1,...,6,7}],
                                            ; filter[{ 8, 9,...,14,15}]*src[filterPos[1]+{0,1,...,6,7}],
                                            ; filter[{16,17,...,22,23}]*src[filterPos[2]+{0,1,...,6,7}],
                                            ; filter[{24,25,...,30,31}]*src[filterPos[3]+{0,1,...,6,7}]
%endif ; sse2/ssse3/sse4
%else ; %3 == X, i.e. any filterSize scaling
%ifidn %4, X4
%define dlt 4
%else ; %4 == X || %4 == X8
%define dlt 0
%endif ; %4 ==/!= X4
%if ARCH_X86_64
    movsxd    fltsizeq, fltsized                            ; filterSize
    lea        srcendq, [srcmemq+(fltsizeq-dlt)*srcmul]     ; &src[filterSize&~4]
%else ; x86-32
    lea          pos0q, [srcmemq+(fltsizeq-dlt)*srcmul]     ; &src[filterSize&~4]
    mov       srcendmp, pos0q
%endif ; x86-64/x86-32
    lea        fltposq, [fltposq+wq*4]
%if %2 == 15
    lea           dstq, [dstq+wq*2]
%else ; %2 == 19
    lea           dstq, [dstq+wq*4]
%endif ; %2 == 15/19
    movifnidn    dstmp, dstq
    mov32        pos0q, dword [fltposq+wq*4+0]  ; filterPos[0]
    mov32        pos1q, dword [fltposq+wq*4+4]  ; filterPos[1]
    ; FIXME maybe do 4px/iteration on x86-64 (x86-32 wouldn't have enough regs)?
    ; load 2x8 (sse) source pixels into m0/m1 -> m4/m5
    movbh           m0, [srcq+ pos0q     *srcmul] ; src[filterPos[0] + {0,1,2,3(,4,5,6,7)}]
    movbh           m1, [srcq+(pos1q+dlt)*srcmul] ; src[filterPos[1] + {0,1,2,3(,4,5,6,7)}]
%if %1 == 16 ; pmaddwd needs signed adds, so this moves unsigned -> signed, we'll
             ; add back 0x8000 * sum(coeffs) after the horizontal add
    psubw           m0, m6
    psubw           m1, m6
%endif
    pmaddwd         m0, [filterq]                 ; filter[{0,1,2,3(,4,5,6,7)}]
    pmaddwd         m1, [filterq+(fltsizeq+dlt)*2]; filter[filtersize+{0,1,2,3(,4,5,6,7)}]
    add           srcq, srcmul*mmsize/2
    cmp           srcq, srcendq                   ; while (src += 4) < &src[filterSize]
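    ; (i.e. keep accumulating 4 (sse: 8) products per pass until srcq
    ; reaches &src[filterSize&~4]; the leftover taps are stitched onto the
    ; next output pixel below)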
    mov32        pos1q, dword [fltposq+wq*4+4]    ; filterPos[1]
    movlh           m0, [srcq+ pos0q     *srcmul] ; split last 4 srcpx of dstpx[0]
    sub          pos1q, fltsizeq                  ; and first 4 srcpx of dstpx[1]
%if %1 > 8
    movhps          m0, [srcq+(pos1q+dlt)*srcmul]
%else ; %1 == 8
    movd            m1, [srcq+(pos1q+dlt)*srcmul]
    punpckldq       m0, m1
%endif ; %1 == 8/9/10/16
%if %1 == 16 ; pmaddwd needs signed adds, so this moves unsigned -> signed, we'll
             ; add back 0x8000 * sum(coeffs) after the horizontal add
    psubw           m0, m6
%endif
    pmaddwd         m0, [filterq]
    lea        filterq, [filterq+(fltsizeq+dlt)*2]
%if notcpuflag(ssse3) ; sse2
    mova            m4, m0
    shufps          m0, m1, 10001000b
    shufps          m4, m1, 11011101b
    paddd           m0, m4
%else ; ssse3/sse4
    phaddd          m0, m1
%endif ; sse2/ssse3/sse4
%if notcpuflag(ssse3) ; sse2
    pshufd          m4, m4, 11011000b
%endif ; sse2/ssse3/sse4
%if %1 == 16 ; add 0x8000 * sum(coeffs), i.e. back from signed -> unsigned
    paddd           m0, m7                  ; (m7 holds [unicoeff])
%endif ; %1 == 16
    psrad           m0, 14 + %1 - %2
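    ; (a %1-bit sample times a 14-bit coefficient carries 14 + %1 significant
    ; bits, so shifting right by 14 + %1 - %2 leaves the %2-bit intermediate)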
    movifnidn     dstq, dstmp
    movh [dstq+wq*(2>>wshr)], m0
    mova [dstq+wq*(4>>wshr)], m0
    add             wq, (mmsize<<wshr)/4    ; both 8tap and 4tap really only do 4 pixels
                                            ; per iteration; see "shl wq, 1" above for why
    jl .loop
    REP_RET
%endmacro
; SCALE_FUNCS source_width, intermediate_nbits, n_xmm
%macro SCALE_FUNCS 3
SCALE_FUNC %1, %2, 4, 4,  6, %3
SCALE_FUNC %1, %2, 8, 8,  6, %3
SCALE_FUNC %1, %2, X, X4, 7, %3
SCALE_FUNC %1, %2, X, X8, 7, %3
%endmacro
; SCALE_FUNCS2 8_xmm_args, 9to14_xmm_args, 16_xmm_args
%macro SCALE_FUNCS2 3
SCALE_FUNCS  8, 15, %1
SCALE_FUNCS  9, 15, %2
SCALE_FUNCS 10, 15, %2
SCALE_FUNCS 12, 15, %2
SCALE_FUNCS 14, 15, %2
SCALE_FUNCS 16, 15, %3

SCALE_FUNCS  8, 19, %1
SCALE_FUNCS  9, 19, %2
SCALE_FUNCS 10, 19, %2
SCALE_FUNCS 12, 19, %2
SCALE_FUNCS 14, 19, %2
SCALE_FUNCS 16, 19, %3
%endmacro
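
; Each supported instruction set then instantiates the whole family along
; these lines (a sketch; the xmm-count arguments shown are illustrative,
; not taken from this excerpt):
;
;   INIT_XMM sse2
;   SCALE_FUNCS2 7, 6, 8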