/**
 * OpenAL cross platform audio library
 * Copyright (C) 2014 by Timothy Arceri <t_arceri@yahoo.com.au>.
 * This library is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU Library General Public
 *  License as published by the Free Software Foundation; either
 *  version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 *  Library General Public License for more details.
 *
 * You should have received a copy of the GNU Library General Public
 *  License along with this library; if not, write to the
 *  Free Software Foundation, Inc.,
 *  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 * Or go to http://www.gnu.org/copyleft/lgpl.html
 */
23 #include <xmmintrin.h>
24 #include <emmintrin.h>
31 #include "alnumeric.h"
33 #include "core/cubic_defs.h"
34 #include "core/resampler_limits.h"
36 #include "opthelpers.h"
43 #if defined(__GNUC__) && !defined(__clang__) && !defined(__SSE2__)
44 #pragma GCC target("sse2")
47 using uint
= unsigned int;
51 constexpr uint CubicPhaseDiffBits
{MixerFracBits
- CubicPhaseBits
};
52 constexpr uint CubicPhaseDiffOne
{1 << CubicPhaseDiffBits
};
53 constexpr uint CubicPhaseDiffMask
{CubicPhaseDiffOne
- 1u};
55 force_inline __m128
vmadd(const __m128 x
, const __m128 y
, const __m128 z
) noexcept
56 { return _mm_add_ps(x
, _mm_mul_ps(y
, z
)); }
61 void Resample_
<LerpTag
,SSE2Tag
>(const InterpState
*, const al::span
<const float> src
, uint frac
,
62 const uint increment
, const al::span
<float> dst
)
64 ASSUME(frac
< MixerFracOne
);
66 const __m128i increment4
{_mm_set1_epi32(static_cast<int>(increment
*4))};
67 const __m128 fracOne4
{_mm_set1_ps(1.0f
/MixerFracOne
)};
68 const __m128i fracMask4
{_mm_set1_epi32(MixerFracMask
)};
70 std::array
<uint
,4> pos_
{}, frac_
{};
71 InitPosArrays(MaxResamplerEdge
, frac
, increment
, al::span
{frac_
}, al::span
{pos_
});
72 __m128i frac4
{_mm_setr_epi32(static_cast<int>(frac_
[0]), static_cast<int>(frac_
[1]),
73 static_cast<int>(frac_
[2]), static_cast<int>(frac_
[3]))};
74 __m128i pos4
{_mm_setr_epi32(static_cast<int>(pos_
[0]), static_cast<int>(pos_
[1]),
75 static_cast<int>(pos_
[2]), static_cast<int>(pos_
[3]))};
77 auto vecout
= al::span
{reinterpret_cast<__m128
*>(dst
.data()), dst
.size()/4};
78 std::generate(vecout
.begin(), vecout
.end(), [=,&pos4
,&frac4
]() -> __m128
80 const auto pos0
= static_cast<uint
>(_mm_cvtsi128_si32(pos4
));
81 const auto pos1
= static_cast<uint
>(_mm_cvtsi128_si32(_mm_srli_si128(pos4
, 4)));
82 const auto pos2
= static_cast<uint
>(_mm_cvtsi128_si32(_mm_srli_si128(pos4
, 8)));
83 const auto pos3
= static_cast<uint
>(_mm_cvtsi128_si32(_mm_srli_si128(pos4
, 12)));
84 ASSUME(pos0
<= pos1
); ASSUME(pos1
<= pos2
); ASSUME(pos2
<= pos3
);
85 const __m128 val1
{_mm_setr_ps(src
[pos0
], src
[pos1
], src
[pos2
], src
[pos3
])};
86 const __m128 val2
{_mm_setr_ps(src
[pos0
+1_uz
], src
[pos1
+1_uz
], src
[pos2
+1_uz
], src
[pos3
+1_uz
])};
88 /* val1 + (val2-val1)*mu */
89 const __m128 r0
{_mm_sub_ps(val2
, val1
)};
90 const __m128 mu
{_mm_mul_ps(_mm_cvtepi32_ps(frac4
), fracOne4
)};
91 const __m128 out
{_mm_add_ps(val1
, _mm_mul_ps(mu
, r0
))};
93 frac4
= _mm_add_epi32(frac4
, increment4
);
94 pos4
= _mm_add_epi32(pos4
, _mm_srli_epi32(frac4
, MixerFracBits
));
95 frac4
= _mm_and_si128(frac4
, fracMask4
);
99 if(size_t todo
{dst
.size()&3})
101 auto pos
= size_t{static_cast<uint
>(_mm_cvtsi128_si32(pos4
))};
102 frac
= static_cast<uint
>(_mm_cvtsi128_si32(frac4
));
104 const auto out
= dst
.last(todo
);
105 std::generate(out
.begin(), out
.end(), [&pos
,&frac
,src
,increment
]()
107 const float smp
{lerpf(src
[pos
+0], src
[pos
+1],
108 static_cast<float>(frac
) * (1.0f
/MixerFracOne
))};
111 pos
+= frac
>>MixerFracBits
;
112 frac
&= MixerFracMask
;
119 void Resample_
<CubicTag
,SSE2Tag
>(const InterpState
*state
, const al::span
<const float> src
,
120 uint frac
, const uint increment
, const al::span
<float> dst
)
122 ASSUME(frac
< MixerFracOne
);
124 const auto filter
= std::get
<CubicState
>(*state
).filter
;
126 const __m128i increment4
{_mm_set1_epi32(static_cast<int>(increment
*4))};
127 const __m128i fracMask4
{_mm_set1_epi32(MixerFracMask
)};
128 const __m128 fracDiffOne4
{_mm_set1_ps(1.0f
/CubicPhaseDiffOne
)};
129 const __m128i fracDiffMask4
{_mm_set1_epi32(CubicPhaseDiffMask
)};
131 std::array
<uint
,4> pos_
{}, frac_
{};
132 InitPosArrays(MaxResamplerEdge
-1, frac
, increment
, al::span
{frac_
}, al::span
{pos_
});
133 __m128i frac4
{_mm_setr_epi32(static_cast<int>(frac_
[0]), static_cast<int>(frac_
[1]),
134 static_cast<int>(frac_
[2]), static_cast<int>(frac_
[3]))};
135 __m128i pos4
{_mm_setr_epi32(static_cast<int>(pos_
[0]), static_cast<int>(pos_
[1]),
136 static_cast<int>(pos_
[2]), static_cast<int>(pos_
[3]))};
138 auto vecout
= al::span
{reinterpret_cast<__m128
*>(dst
.data()), dst
.size()/4};
139 std::generate(vecout
.begin(), vecout
.end(), [=,&pos4
,&frac4
]
141 const auto pos0
= static_cast<uint
>(_mm_cvtsi128_si32(pos4
));
142 const auto pos1
= static_cast<uint
>(_mm_cvtsi128_si32(_mm_srli_si128(pos4
, 4)));
143 const auto pos2
= static_cast<uint
>(_mm_cvtsi128_si32(_mm_srli_si128(pos4
, 8)));
144 const auto pos3
= static_cast<uint
>(_mm_cvtsi128_si32(_mm_srli_si128(pos4
, 12)));
145 ASSUME(pos0
<= pos1
); ASSUME(pos1
<= pos2
); ASSUME(pos2
<= pos3
);
146 const __m128 val0
{_mm_loadu_ps(&src
[pos0
])};
147 const __m128 val1
{_mm_loadu_ps(&src
[pos1
])};
148 const __m128 val2
{_mm_loadu_ps(&src
[pos2
])};
149 const __m128 val3
{_mm_loadu_ps(&src
[pos3
])};
151 const __m128i pi4
{_mm_srli_epi32(frac4
, CubicPhaseDiffBits
)};
152 const auto pi0
= static_cast<uint
>(_mm_cvtsi128_si32(pi4
));
153 const auto pi1
= static_cast<uint
>(_mm_cvtsi128_si32(_mm_srli_si128(pi4
, 4)));
154 const auto pi2
= static_cast<uint
>(_mm_cvtsi128_si32(_mm_srli_si128(pi4
, 8)));
155 const auto pi3
= static_cast<uint
>(_mm_cvtsi128_si32(_mm_srli_si128(pi4
, 12)));
156 ASSUME(pi0
< CubicPhaseCount
); ASSUME(pi1
< CubicPhaseCount
);
157 ASSUME(pi2
< CubicPhaseCount
); ASSUME(pi3
< CubicPhaseCount
);
159 const __m128 pf4
{_mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(frac4
, fracDiffMask4
)),
162 __m128 r0
{_mm_mul_ps(val0
,
163 vmadd(_mm_load_ps(filter
[pi0
].mCoeffs
.data()),
164 _mm_shuffle_ps(pf4
, pf4
, _MM_SHUFFLE(0, 0, 0, 0)),
165 _mm_load_ps(filter
[pi0
].mDeltas
.data())))};
166 __m128 r1
{_mm_mul_ps(val1
,
167 vmadd(_mm_load_ps(filter
[pi1
].mCoeffs
.data()),
168 _mm_shuffle_ps(pf4
, pf4
, _MM_SHUFFLE(1, 1, 1, 1)),
169 _mm_load_ps(filter
[pi1
].mDeltas
.data())))};
170 __m128 r2
{_mm_mul_ps(val2
,
171 vmadd(_mm_load_ps(filter
[pi2
].mCoeffs
.data()),
172 _mm_shuffle_ps(pf4
, pf4
, _MM_SHUFFLE(2, 2, 2, 2)),
173 _mm_load_ps(filter
[pi2
].mDeltas
.data())))};
174 __m128 r3
{_mm_mul_ps(val3
,
175 vmadd(_mm_load_ps(filter
[pi3
].mCoeffs
.data()),
176 _mm_shuffle_ps(pf4
, pf4
, _MM_SHUFFLE(3, 3, 3, 3)),
177 _mm_load_ps(filter
[pi3
].mDeltas
.data())))};
179 _MM_TRANSPOSE4_PS(r0
, r1
, r2
, r3
);
180 r0
= _mm_add_ps(_mm_add_ps(r0
, r1
), _mm_add_ps(r2
, r3
));
182 frac4
= _mm_add_epi32(frac4
, increment4
);
183 pos4
= _mm_add_epi32(pos4
, _mm_srli_epi32(frac4
, MixerFracBits
));
184 frac4
= _mm_and_si128(frac4
, fracMask4
);
188 if(const size_t todo
{dst
.size()&3})
190 auto pos
= size_t{static_cast<uint
>(_mm_cvtsi128_si32(pos4
))};
191 frac
= static_cast<uint
>(_mm_cvtsi128_si32(frac4
));
193 auto out
= dst
.last(todo
);
194 std::generate(out
.begin(), out
.end(), [&pos
,&frac
,src
,increment
,filter
]
196 const uint pi
{frac
>> CubicPhaseDiffBits
}; ASSUME(pi
< CubicPhaseCount
);
197 const float pf
{static_cast<float>(frac
&CubicPhaseDiffMask
) * (1.0f
/CubicPhaseDiffOne
)};
198 const __m128 pf4
{_mm_set1_ps(pf
)};
200 const __m128 f4
= vmadd(_mm_load_ps(filter
[pi
].mCoeffs
.data()), pf4
,
201 _mm_load_ps(filter
[pi
].mDeltas
.data()));
202 __m128 r4
{_mm_mul_ps(f4
, _mm_loadu_ps(&src
[pos
]))};
204 r4
= _mm_add_ps(r4
, _mm_shuffle_ps(r4
, r4
, _MM_SHUFFLE(0, 1, 2, 3)));
205 r4
= _mm_add_ps(r4
, _mm_movehl_ps(r4
, r4
));
206 const float output
{_mm_cvtss_f32(r4
)};
209 pos
+= frac
>>MixerFracBits
;
210 frac
&= MixerFracMask
;