1 ;*****************************************************************************
2 ;* x86-optimized functions for volume filter
3 ;* Copyright (c) 2012 Justin Ruggles <justin.ruggles@gmail.com>
5 ;* This file is part of FFmpeg.
7 ;* FFmpeg is free software; you can redistribute it and/or
8 ;* modify it under the terms of the GNU Lesser General Public
9 ;* License as published by the Free Software Foundation; either
10 ;* version 2.1 of the License, or (at your option) any later version.
12 ;* FFmpeg is distributed in the hope that it will be useful,
13 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
14 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 ;* Lesser General Public License for more details.
17 ;* You should have received a copy of the GNU Lesser General Public
18 ;* License along with FFmpeg; if not, write to the Free Software
19 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20 ;******************************************************************************
22 %include "libavutil/x86/x86util.asm"
; Packed constant tables for the sample-scaling routines.
;   pd_1_256     - 4 doubles, each 2^-8 (1/256); 0x3F70000000000000 is the
;                  IEEE-754 binary64 encoding of that value.
;   pd_int32_max - 4 doubles, each 2147483647.0 (INT32_MAX);
;                  0x41DFFFFFFFC00000 is its binary64 encoding.
;   pw_128       - 8 words, each 128.
;   pq_128       - 2 quadwords, each 128.
; NOTE(review): 128 matches the "+ 128" rounding term in the per-sample
; formulas quoted in the routines; confirm against the code that loads
; pw_128 / pq_128.
26 pd_1_256: times
4 dq 0x3F70000000000000
27 pd_int32_max: times
4 dq 0x41DFFFFFFFC00000
29 pw_128: times
8 dw 128
30 pq_128: times
2 dq 128
34 ;------------------------------------------------------------------------------
35 ; void ff_scale_samples_s16(uint8_t *dst, const uint8_t *src, int len,
37 ;------------------------------------------------------------------------------
; Entry point declared through the cglobal macro with four named arguments:
; dst, src, len, volume.  The "4,4,4" fields are cglobal's argument-count and
; register-count parameters (see x86inc.asm for their exact meaning).
40 cglobal scale_samples_s16
, 4,4,4, dst
, src
, len
, volume
; lenq = len*2 - mmsize: the byte length of len 2-byte samples minus one
; vector width.
; NOTE(review): this is the offset of the last full vector only if the byte
; length is a multiple of mmsize -- confirm with the callers.
45 lea lenq
, [lend
*2-mmsize
]
47 ; dst[i] = av_clip_int16((src[i] * volume + 128) >> 8);
61 ;------------------------------------------------------------------------------
62 ; void ff_scale_samples_s32(uint8_t *dst, const uint8_t *src, int len,
64 ;------------------------------------------------------------------------------
; SCALE_SAMPLES_S32 (takes no macro parameters): template for
; scale_samples_s32.  It is written against the m#/mmsize abstractions and
; the CVTDQ2PD alias rather than fixed registers and mnemonics; CVTDQ2PD is
; %defined outside the macro (cvtdq2pd / vcvtdq2pd).
66 %macro SCALE_SAMPLES_S32
0
67 cglobal scale_samples_s32
, 4,4,4, dst
, src
, len
, volume
; 32-bit x86 with AVX enabled: broadcast the 32-bit volume argument from its
; memory form (volumem) into every 32-bit lane of xmm2.
68 %if ARCH_X86_32
&& cpuflag
(avx
)
69 vbroadcastss xmm2
, volumem
; m2 *= 1/256 (pd_1_256) in double precision -- the floating-point
; counterpart of the ">> 8" in the integer formula.
; NOTE(review): presumes m2 already holds the volume converted to double;
; confirm against the conversion that precedes this multiply.
75 mulpd m2
, m2
, [pd_1_256
]
; m3 = INT32_MAX in every double lane (pd_int32_max).
; NOTE(review): presumably the upper clipping bound -- confirm at its use.
76 mova m3
, [pd_int32_max
]
; lenq = len*4 - mmsize: the byte length of len 4-byte samples minus one
; vector width.
77 lea lenq
, [lend
*4-mmsize
]
; Convert two groups of packed 32-bit integers read from src into packed
; doubles.  The conversion widens 32-bit inputs to 64-bit outputs, so a
; full-width result consumes mmsize/2 bytes of input; the pair of loads at
; +0 and +mmsize/2 together covers one mmsize-byte chunk.
79 CVTDQ2PD m0
, [srcq
+lenq
]
80 CVTDQ2PD m1
, [srcq
+lenq
+mmsize
/2]
; VEX-encoded aligned 128-bit stores of xmm0 / xmm1 to dst, at the same
; offsets (+0 and +mmsize/2) the inputs were read from.
88 vmovdqa
[dstq
+lenq
], xmm0
89 vmovdqa
[dstq
+lenq
+mmsize
/2], xmm1
; 64-bit stores of the low quadword of xmm0 / xmm1 to the same dst offsets.
91 movq
[dstq
+lenq
], xmm0
92 movq
[dstq
+lenq
+mmsize
/2], xmm1
; Choose the mnemonic behind the CVTDQ2PD alias used inside
; SCALE_SAMPLES_S32: first the legacy SSE2 encoding (cvtdq2pd) ...
100 %define CVTDQ2PD cvtdq2pd
; ... then, only when HAVE_AVX_EXTERNAL is set, the VEX-encoded form
; (vcvtdq2pd).
102 %if HAVE_AVX_EXTERNAL
103 %define CVTDQ2PD vcvtdq2pd
109 ; NOTE: This is not bit-identical with the C version because it clips to
110 ; [-INT_MAX, INT_MAX] instead of [INT_MIN, INT_MAX]
113 cglobal scale_samples_s32
, 4,4,8, dst
, src
, len
, volume
118 lea lenq
, [lend
*4-mmsize
]
120 ; dst[i] = av_clipl_int32((src[i] * volume + 128) >> 8);
131 shufps m2
, m0
, m1
, q3131
132 shufps m0
, m0
, m1
, q2020