;******************************************************************************
;* x86 optimized Format Conversion Utils
;* Copyright (c) 2008 Loren Merritt
;*
;* This file is part of Libav.
;*
;* Libav is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* Libav is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with Libav; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_TEXT
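
; CVTPS2PI: load two packed single-precision floats from memory and convert
; them to two packed 32-bit integers in an MMX register (cvtps2pi on SSE,
; pf2id on 3DNow!).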
%macro CVTPS2PI 2
%if cpuflag(sse)
    cvtps2pi %1, %2
%elif cpuflag(3dnow)
    pf2id %1, %2
%endif
%endmacro

;---------------------------------------------------------------------------------
; void int32_to_float_fmul_scalar(float *dst, const int *src, float mul, int len);
;---------------------------------------------------------------------------------
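; In C terms this is simply (a sketch; dst and src are assumed 16-byte
; aligned and len a multiple of 8, matching the unrolled loop below):
;     for (i = 0; i < len; i++)
;         dst[i] = src[i] * mul;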
%macro INT32_TO_FLOAT_FMUL_SCALAR 1
%if UNIX64
cglobal int32_to_float_fmul_scalar, 3, 3, %1, dst, src, len
%else
cglobal int32_to_float_fmul_scalar, 4, 4, %1, dst, src, mul, len
%endif
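; The scalar multiplier arrives in xmm0 on UNIX64, in xmm2 on WIN64, and on
; the stack on x86_32; normalize it into m0 before broadcasting.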
%if WIN64
    SWAP 0, 2
%elif ARCH_X86_32
    movss   m0, mulm
%endif
    SPLATD  m0
    shl     lenq, 2
    add     srcq, lenq
    add     dstq, lenq
    neg     lenq
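; dst/src now point to the ends of the buffers and lenq is negative; each
; iteration indexes [base+lenq] and counts lenq up toward zero, so the add
; also sets the flags for the loop branch.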
.loop:
%if cpuflag(sse2)
    cvtdq2ps m1, [srcq+lenq   ]
    cvtdq2ps m2, [srcq+lenq+16]
%else
    cvtpi2ps m1, [srcq+lenq   ]
    cvtpi2ps m3, [srcq+lenq+ 8]
    cvtpi2ps m2, [srcq+lenq+16]
    cvtpi2ps m4, [srcq+lenq+24]
    movlhps  m1, m3
    movlhps  m2, m4
%endif
    mulps    m1, m0
    mulps    m2, m0
    mova     [dstq+lenq   ], m1
    mova     [dstq+lenq+16], m2
    add      lenq, 32
    jl .loop
    REP_RET
%endmacro

INIT_XMM sse
INT32_TO_FLOAT_FMUL_SCALAR 5
INIT_XMM sse2
INT32_TO_FLOAT_FMUL_SCALAR 3

;------------------------------------------------------------------------------
; void ff_float_to_int16(int16_t *dst, const float *src, long len);
;------------------------------------------------------------------------------
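; Rough C equivalent (a sketch: cvtps2dq/packssdw perform round-to-nearest
; with saturation; len is assumed to be a multiple of 8):
;     for (i = 0; i < len; i++)
;         dst[i] = av_clip_int16(lrintf(src[i]));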
%macro FLOAT_TO_INT16 1
cglobal float_to_int16, 3, 3, %1, dst, src, len
    add       lenq, lenq
    lea       srcq, [srcq+2*lenq]
    add       dstq, lenq
    neg       lenq
.loop:
%if cpuflag(sse2)
    cvtps2dq  m0, [srcq+2*lenq   ]
    cvtps2dq  m1, [srcq+2*lenq+16]
    packssdw  m0, m1
    mova      [dstq+lenq], m0
%else
    CVTPS2PI  m0, [srcq+2*lenq   ]
    CVTPS2PI  m1, [srcq+2*lenq+ 8]
    CVTPS2PI  m2, [srcq+2*lenq+16]
    CVTPS2PI  m3, [srcq+2*lenq+24]
    packssdw  m0, m1
    packssdw  m2, m3
    mova      [dstq+lenq  ], m0
    mova      [dstq+lenq+8], m2
%endif
    add       lenq, 16
    js .loop
%if mmsize == 8
    emms
%endif
    REP_RET
%endmacro

INIT_XMM sse2
FLOAT_TO_INT16 2
INIT_MMX sse
FLOAT_TO_INT16 0
INIT_MMX 3dnow
FLOAT_TO_INT16 0

;------------------------------------------------------------------------------
; void ff_float_to_int16_step(int16_t *dst, const float *src, long len, long step);
;------------------------------------------------------------------------------
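; Same conversion as float_to_int16, but writing with a configurable stride
; (a sketch; len assumed a multiple of 8):
;     for (i = 0; i < len; i++)
;         dst[i * step] = av_clip_int16(lrintf(src[i]));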
%macro FLOAT_TO_INT16_STEP 1
cglobal float_to_int16_step, 4, 7, %1, dst, src, len, step, step3, v1, v2
    add       lenq, lenq
    lea       srcq, [srcq+2*lenq]
    lea     step3q, [stepq*3]
    neg       lenq
.loop:
%if cpuflag(sse2)
    cvtps2dq  m0, [srcq+2*lenq   ]
    cvtps2dq  m1, [srcq+2*lenq+16]
    packssdw  m0, m1
    movd      v1d, m0
    psrldq    m0, 4
    movd      v2d, m0
    psrldq    m0, 4
    mov       [dstq], v1w
    mov       [dstq+stepq*4], v2w
    shr       v1d, 16
    shr       v2d, 16
    mov       [dstq+stepq*2], v1w
    mov       [dstq+step3q*2], v2w
    lea       dstq, [dstq+stepq*8]
    movd      v1d, m0
    psrldq    m0, 4
    movd      v2d, m0
    mov       [dstq], v1w
    mov       [dstq+stepq*4], v2w
    shr       v1d, 16
    shr       v2d, 16
    mov       [dstq+stepq*2], v1w
    mov       [dstq+step3q*2], v2w
    lea       dstq, [dstq+stepq*8]
%else
    CVTPS2PI  m0, [srcq+2*lenq   ]
    CVTPS2PI  m1, [srcq+2*lenq+ 8]
    CVTPS2PI  m2, [srcq+2*lenq+16]
    CVTPS2PI  m3, [srcq+2*lenq+24]
    packssdw  m0, m1
    packssdw  m2, m3
    movd      v1d, m0
    psrlq     m0, 32
    movd      v2d, m0
    mov       [dstq], v1w
    mov       [dstq+stepq*4], v2w
    shr       v1d, 16
    shr       v2d, 16
    mov       [dstq+stepq*2], v1w
    mov       [dstq+step3q*2], v2w
    lea       dstq, [dstq+stepq*8]
    movd      v1d, m2
    psrlq     m2, 32
    movd      v2d, m2
    mov       [dstq], v1w
    mov       [dstq+stepq*4], v2w
    shr       v1d, 16
    shr       v2d, 16
    mov       [dstq+stepq*2], v1w
    mov       [dstq+step3q*2], v2w
    lea       dstq, [dstq+stepq*8]
%endif
    add       lenq, 16
    js .loop
%if mmsize == 8
    emms
%endif
    REP_RET
%endmacro

INIT_XMM sse2
FLOAT_TO_INT16_STEP 2
INIT_MMX sse
FLOAT_TO_INT16_STEP 0
INIT_MMX 3dnow
FLOAT_TO_INT16_STEP 0

;-------------------------------------------------------------------------------
; void ff_float_to_int16_interleave2(int16_t *dst, const float **src, long len);
;-------------------------------------------------------------------------------
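; Interleave two planar float channels into one int16 stream (a sketch;
; len counts samples per channel and is assumed a multiple of 4):
;     for (i = 0; i < len; i++) {
;         dst[2*i    ] = av_clip_int16(lrintf(src[0][i]));
;         dst[2*i + 1] = av_clip_int16(lrintf(src[1][i]));
;     }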
%macro FLOAT_TO_INT16_INTERLEAVE2 0
cglobal float_to_int16_interleave2, 3, 4, 2, dst, src0, src1, len
    lea       lenq, [4*r2q]
    mov      src1q, [src0q+gprsize]
    mov      src0q, [src0q]
    add       dstq, lenq
    add      src0q, lenq
    add      src1q, lenq
    neg       lenq
.loop:
%if cpuflag(sse2)
    cvtps2dq  m0, [src0q+lenq]
    cvtps2dq  m1, [src1q+lenq]
    packssdw  m0, m1
    movhlps   m1, m0
    punpcklwd m0, m1
    mova      [dstq+lenq], m0
%else
    CVTPS2PI  m0, [src0q+lenq  ]
    CVTPS2PI  m1, [src0q+lenq+8]
    CVTPS2PI  m2, [src1q+lenq  ]
    CVTPS2PI  m3, [src1q+lenq+8]
    packssdw  m0, m1
    packssdw  m2, m3
    mova      m1, m0
    punpcklwd m0, m2
    punpckhwd m1, m2
    mova      [dstq+lenq  ], m0
    mova      [dstq+lenq+8], m1
%endif
    add       lenq, 16
    js .loop
%if mmsize == 8
    emms
%endif
    REP_RET
%endmacro

INIT_MMX 3dnow
FLOAT_TO_INT16_INTERLEAVE2
INIT_MMX sse
FLOAT_TO_INT16_INTERLEAVE2
INIT_XMM sse2
FLOAT_TO_INT16_INTERLEAVE2

%macro FLOAT_TO_INT16_INTERLEAVE6 0
; void float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len)
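; In C terms (a sketch; len counts samples per channel and is assumed even,
; since each pass emits two 6-channel frames):
;     for (i = 0; i < len; i++)
;         for (c = 0; c < 6; c++)
;             dst[6*i + c] = av_clip_int16(lrintf(src[c][i]));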
cglobal float_to_int16_interleave6, 2, 8, 0, dst, src, src1, src2, src3, src4, src5, len
%if ARCH_X86_64
    mov     lend, r2d
%else
    %define lend dword r2m
%endif
    mov    src1q, [srcq+1*gprsize]
    mov    src2q, [srcq+2*gprsize]
    mov    src3q, [srcq+3*gprsize]
    mov    src4q, [srcq+4*gprsize]
    mov    src5q, [srcq+5*gprsize]
    mov     srcq, [srcq]
    sub    src1q, srcq
    sub    src2q, srcq
    sub    src3q, srcq
    sub    src4q, srcq
    sub    src5q, srcq
.loop:
    CVTPS2PI   mm0, [srcq]
    CVTPS2PI   mm1, [srcq+src1q]
    CVTPS2PI   mm2, [srcq+src2q]
    CVTPS2PI   mm3, [srcq+src3q]
    CVTPS2PI   mm4, [srcq+src4q]
    CVTPS2PI   mm5, [srcq+src5q]
    packssdw   mm0, mm3
    packssdw   mm1, mm4
    packssdw   mm2, mm5
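; mm0..mm2 now hold word pairs {ch0,ch3}, {ch1,ch4}, {ch2,ch5}; the shuffles
; below rearrange them into two fully interleaved 6-channel output frames.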
    PSWAPD     mm3, mm0
    punpcklwd  mm0, mm1
    punpckhwd  mm1, mm2
    punpcklwd  mm2, mm3
    PSWAPD     mm3, mm0
    punpckldq  mm0, mm2
    punpckhdq  mm2, mm1
    punpckldq  mm1, mm3
    movq [dstq   ], mm0
    movq [dstq+16], mm2
    movq [dstq+ 8], mm1
    add srcq, 8
    add dstq, 24
    sub lend, 2
    jg .loop
    emms
    RET
%endmacro ; FLOAT_TO_INT16_INTERLEAVE6

INIT_MMX sse
FLOAT_TO_INT16_INTERLEAVE6
INIT_MMX 3dnow
FLOAT_TO_INT16_INTERLEAVE6
INIT_MMX 3dnowext
FLOAT_TO_INT16_INTERLEAVE6

;-----------------------------------------------------------------------------
; void ff_float_interleave6(float *dst, const float **src, unsigned int len);
;-----------------------------------------------------------------------------
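; Pure reordering, no format conversion (a sketch; len counts samples per
; channel):
;     for (i = 0; i < len; i++)
;         for (c = 0; c < 6; c++)
;             dst[6*i + c] = src[c][i];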

%macro FLOAT_INTERLEAVE6 1
cglobal float_interleave6, 2, 8, %1, dst, src, src1, src2, src3, src4, src5, len
%if ARCH_X86_64
    mov     lend, r2d
%else
    %define lend dword r2m
%endif
    mov    src1q, [srcq+1*gprsize]
    mov    src2q, [srcq+2*gprsize]
    mov    src3q, [srcq+3*gprsize]
    mov    src4q, [srcq+4*gprsize]
    mov    src5q, [srcq+5*gprsize]
    mov     srcq, [srcq]
    sub    src1q, srcq
    sub    src2q, srcq
    sub    src3q, srcq
    sub    src4q, srcq
    sub    src5q, srcq
.loop:
%if cpuflag(sse)
    movaps    m0, [srcq]
    movaps    m1, [srcq+src1q]
    movaps    m2, [srcq+src2q]
    movaps    m3, [srcq+src3q]
    movaps    m4, [srcq+src4q]
    movaps    m5, [srcq+src5q]

    SBUTTERFLYPS 0, 1, 6
    SBUTTERFLYPS 2, 3, 6
    SBUTTERFLYPS 4, 5, 6

    movaps    m6, m4
    shufps    m4, m0, 0xe4
    movlhps   m0, m2
    movhlps   m6, m2
    movaps [dstq   ], m0
    movaps [dstq+16], m4
    movaps [dstq+32], m6

    movaps    m6, m5
    shufps    m5, m1, 0xe4
    movlhps   m1, m3
    movhlps   m6, m3
    movaps [dstq+48], m1
    movaps [dstq+64], m5
    movaps [dstq+80], m6
%else ; mmx
    movq      m0, [srcq]
    movq      m1, [srcq+src1q]
    movq      m2, [srcq+src2q]
    movq      m3, [srcq+src3q]
    movq      m4, [srcq+src4q]
    movq      m5, [srcq+src5q]

    SBUTTERFLY dq, 0, 1, 6
    SBUTTERFLY dq, 2, 3, 6
    SBUTTERFLY dq, 4, 5, 6
    movq [dstq   ], m0
    movq [dstq+ 8], m2
    movq [dstq+16], m4
    movq [dstq+24], m1
    movq [dstq+32], m3
    movq [dstq+40], m5
%endif
    add     srcq, mmsize
    add     dstq, mmsize*6
    sub     lend, mmsize/4
    jg .loop
%if mmsize == 8
    emms
%endif
    REP_RET
%endmacro

INIT_MMX mmx
FLOAT_INTERLEAVE6 0
INIT_XMM sse
FLOAT_INTERLEAVE6 7

;-----------------------------------------------------------------------------
; void ff_float_interleave2(float *dst, const float **src, unsigned int len);
;-----------------------------------------------------------------------------
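; A C sketch of the operation (len counts samples per channel and is assumed
; a multiple of mmsize/2, matching the loop below):
;     for (i = 0; i < len; i++) {
;         dst[2*i    ] = src[0][i];
;         dst[2*i + 1] = src[1][i];
;     }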

%macro FLOAT_INTERLEAVE2 1
cglobal float_interleave2, 3, 4, %1, dst, src, len, src1
    mov     src1q, [srcq+gprsize]
    mov      srcq, [srcq        ]
    sub     src1q, srcq
.loop:
    mova       m0, [srcq             ]
    mova       m1, [srcq+src1q       ]
    mova       m3, [srcq      +mmsize]
    mova       m4, [srcq+src1q+mmsize]

    mova       m2, m0
    PUNPCKLDQ  m0, m1
    PUNPCKHDQ  m2, m1

    mova       m1, m3
    PUNPCKLDQ  m3, m4
    PUNPCKHDQ  m1, m4

    mova  [dstq         ], m0
    mova  [dstq+1*mmsize], m2
    mova  [dstq+2*mmsize], m3
    mova  [dstq+3*mmsize], m1

    add      srcq, mmsize*2
    add      dstq, mmsize*4
    sub      lend, mmsize/2
    jg .loop
%if mmsize == 8
    emms
%endif
    REP_RET
%endmacro

INIT_MMX mmx
%define PUNPCKLDQ punpckldq
%define PUNPCKHDQ punpckhdq
FLOAT_INTERLEAVE2 0
INIT_XMM sse
%define PUNPCKLDQ unpcklps
%define PUNPCKHDQ unpckhps
FLOAT_INTERLEAVE2 5