/**
 * OpenAL cross platform audio library
 * Copyright (C) 2014 by Timothy Arceri <t_arceri@yahoo.com.au>.
 * This library is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU Library General Public
 *  License as published by the Free Software Foundation; either
 *  version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 *  Library General Public License for more details.
 *
 * You should have received a copy of the GNU Library General Public
 *  License along with this library; if not, write to the
 *  Free Software Foundation, Inc.,
 *  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 * Or go to http://www.gnu.org/copyleft/lgpl.html
 */
23 #include <xmmintrin.h>
24 #include <emmintrin.h>
25 #include <smmintrin.h>
32 #include "alnumeric.h"
34 #include "core/cubic_defs.h"
35 #include "core/resampler_limits.h"
37 #include "opthelpers.h"
/* On GCC (not Clang) builds where __SSE4_1__ is not already defined, switch
 * the code-generation target to SSE4.1 for the definitions that follow. The
 * conditional must be closed here, otherwise the remainder of the file would
 * be swallowed by the #if.
 */
#if defined(__GNUC__) && !defined(__clang__) && !defined(__SSE4_1__)
#pragma GCC target("sse4.1")
#endif
48 using uint
= unsigned int;
52 constexpr uint CubicPhaseDiffBits
{MixerFracBits
- CubicPhaseBits
};
53 constexpr uint CubicPhaseDiffOne
{1 << CubicPhaseDiffBits
};
54 constexpr uint CubicPhaseDiffMask
{CubicPhaseDiffOne
- 1u};
56 force_inline __m128
vmadd(const __m128 x
, const __m128 y
, const __m128 z
) noexcept
57 { return _mm_add_ps(x
, _mm_mul_ps(y
, z
)); }
62 void Resample_
<LerpTag
,SSE4Tag
>(const InterpState
*, const al::span
<const float> src
, uint frac
,
63 const uint increment
, const al::span
<float> dst
)
65 ASSUME(frac
< MixerFracOne
);
67 const __m128i increment4
{_mm_set1_epi32(static_cast<int>(increment
*4))};
68 const __m128 fracOne4
{_mm_set1_ps(1.0f
/MixerFracOne
)};
69 const __m128i fracMask4
{_mm_set1_epi32(MixerFracMask
)};
71 std::array
<uint
,4> pos_
{}, frac_
{};
72 InitPosArrays(MaxResamplerEdge
, frac
, increment
, al::span
{frac_
}, al::span
{pos_
});
73 __m128i frac4
{_mm_setr_epi32(static_cast<int>(frac_
[0]), static_cast<int>(frac_
[1]),
74 static_cast<int>(frac_
[2]), static_cast<int>(frac_
[3]))};
75 __m128i pos4
{_mm_setr_epi32(static_cast<int>(pos_
[0]), static_cast<int>(pos_
[1]),
76 static_cast<int>(pos_
[2]), static_cast<int>(pos_
[3]))};
78 auto vecout
= al::span
{reinterpret_cast<__m128
*>(dst
.data()), dst
.size()/4};
79 std::generate(vecout
.begin(), vecout
.end(), [=,&pos4
,&frac4
]
81 const auto pos0
= static_cast<uint
>(_mm_extract_epi32(pos4
, 0));
82 const auto pos1
= static_cast<uint
>(_mm_extract_epi32(pos4
, 1));
83 const auto pos2
= static_cast<uint
>(_mm_extract_epi32(pos4
, 2));
84 const auto pos3
= static_cast<uint
>(_mm_extract_epi32(pos4
, 3));
85 ASSUME(pos0
<= pos1
); ASSUME(pos1
<= pos2
); ASSUME(pos2
<= pos3
);
86 const __m128 val1
{_mm_setr_ps(src
[pos0
], src
[pos1
], src
[pos2
], src
[pos3
])};
87 const __m128 val2
{_mm_setr_ps(src
[pos0
+1_uz
], src
[pos1
+1_uz
], src
[pos2
+1_uz
], src
[pos3
+1_uz
])};
89 /* val1 + (val2-val1)*mu */
90 const __m128 r0
{_mm_sub_ps(val2
, val1
)};
91 const __m128 mu
{_mm_mul_ps(_mm_cvtepi32_ps(frac4
), fracOne4
)};
92 const __m128 out
{_mm_add_ps(val1
, _mm_mul_ps(mu
, r0
))};
94 frac4
= _mm_add_epi32(frac4
, increment4
);
95 pos4
= _mm_add_epi32(pos4
, _mm_srli_epi32(frac4
, MixerFracBits
));
96 frac4
= _mm_and_si128(frac4
, fracMask4
);
100 if(size_t todo
{dst
.size()&3})
102 /* NOTE: These four elements represent the position *after* the last
103 * four samples, so the lowest element is the next position to
106 auto pos
= size_t{static_cast<uint
>(_mm_cvtsi128_si32(pos4
))};
107 frac
= static_cast<uint
>(_mm_cvtsi128_si32(frac4
));
109 auto out
= dst
.last(todo
);
110 std::generate(out
.begin(), out
.end(), [&pos
,&frac
,src
,increment
]
112 const float smp
{lerpf(src
[pos
+0], src
[pos
+1],
113 static_cast<float>(frac
) * (1.0f
/MixerFracOne
))};
116 pos
+= frac
>>MixerFracBits
;
117 frac
&= MixerFracMask
;
124 void Resample_
<CubicTag
,SSE4Tag
>(const InterpState
*state
, const al::span
<const float> src
,
125 uint frac
, const uint increment
, const al::span
<float> dst
)
127 ASSUME(frac
< MixerFracOne
);
129 const auto filter
= std::get
<CubicState
>(*state
).filter
;
131 const __m128i increment4
{_mm_set1_epi32(static_cast<int>(increment
*4))};
132 const __m128i fracMask4
{_mm_set1_epi32(MixerFracMask
)};
133 const __m128 fracDiffOne4
{_mm_set1_ps(1.0f
/CubicPhaseDiffOne
)};
134 const __m128i fracDiffMask4
{_mm_set1_epi32(CubicPhaseDiffMask
)};
136 std::array
<uint
,4> pos_
{}, frac_
{};
137 InitPosArrays(MaxResamplerEdge
-1, frac
, increment
, al::span
{frac_
}, al::span
{pos_
});
138 __m128i frac4
{_mm_setr_epi32(static_cast<int>(frac_
[0]), static_cast<int>(frac_
[1]),
139 static_cast<int>(frac_
[2]), static_cast<int>(frac_
[3]))};
140 __m128i pos4
{_mm_setr_epi32(static_cast<int>(pos_
[0]), static_cast<int>(pos_
[1]),
141 static_cast<int>(pos_
[2]), static_cast<int>(pos_
[3]))};
143 auto vecout
= al::span
{reinterpret_cast<__m128
*>(dst
.data()), dst
.size()/4};
144 std::generate(vecout
.begin(), vecout
.end(), [=,&pos4
,&frac4
]
146 const auto pos0
= static_cast<uint
>(_mm_extract_epi32(pos4
, 0));
147 const auto pos1
= static_cast<uint
>(_mm_extract_epi32(pos4
, 1));
148 const auto pos2
= static_cast<uint
>(_mm_extract_epi32(pos4
, 2));
149 const auto pos3
= static_cast<uint
>(_mm_extract_epi32(pos4
, 3));
150 ASSUME(pos0
<= pos1
); ASSUME(pos1
<= pos2
); ASSUME(pos2
<= pos3
);
151 const __m128 val0
{_mm_loadu_ps(&src
[pos0
])};
152 const __m128 val1
{_mm_loadu_ps(&src
[pos1
])};
153 const __m128 val2
{_mm_loadu_ps(&src
[pos2
])};
154 const __m128 val3
{_mm_loadu_ps(&src
[pos3
])};
156 const __m128i pi4
{_mm_srli_epi32(frac4
, CubicPhaseDiffBits
)};
157 const auto pi0
= static_cast<uint
>(_mm_extract_epi32(pi4
, 0));
158 const auto pi1
= static_cast<uint
>(_mm_extract_epi32(pi4
, 1));
159 const auto pi2
= static_cast<uint
>(_mm_extract_epi32(pi4
, 2));
160 const auto pi3
= static_cast<uint
>(_mm_extract_epi32(pi4
, 3));
161 ASSUME(pi0
< CubicPhaseCount
); ASSUME(pi1
< CubicPhaseCount
);
162 ASSUME(pi2
< CubicPhaseCount
); ASSUME(pi3
< CubicPhaseCount
);
164 const __m128 pf4
{_mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(frac4
, fracDiffMask4
)),
167 __m128 r0
{_mm_mul_ps(val0
,
168 vmadd(_mm_load_ps(filter
[pi0
].mCoeffs
.data()),
169 _mm_shuffle_ps(pf4
, pf4
, _MM_SHUFFLE(0, 0, 0, 0)),
170 _mm_load_ps(filter
[pi0
].mDeltas
.data())))};
171 __m128 r1
{_mm_mul_ps(val1
,
172 vmadd(_mm_load_ps(filter
[pi1
].mCoeffs
.data()),
173 _mm_shuffle_ps(pf4
, pf4
, _MM_SHUFFLE(1, 1, 1, 1)),
174 _mm_load_ps(filter
[pi1
].mDeltas
.data())))};
175 __m128 r2
{_mm_mul_ps(val2
,
176 vmadd(_mm_load_ps(filter
[pi2
].mCoeffs
.data()),
177 _mm_shuffle_ps(pf4
, pf4
, _MM_SHUFFLE(2, 2, 2, 2)),
178 _mm_load_ps(filter
[pi2
].mDeltas
.data())))};
179 __m128 r3
{_mm_mul_ps(val3
,
180 vmadd(_mm_load_ps(filter
[pi3
].mCoeffs
.data()),
181 _mm_shuffle_ps(pf4
, pf4
, _MM_SHUFFLE(3, 3, 3, 3)),
182 _mm_load_ps(filter
[pi3
].mDeltas
.data())))};
184 _MM_TRANSPOSE4_PS(r0
, r1
, r2
, r3
);
185 r0
= _mm_add_ps(_mm_add_ps(r0
, r1
), _mm_add_ps(r2
, r3
));
187 frac4
= _mm_add_epi32(frac4
, increment4
);
188 pos4
= _mm_add_epi32(pos4
, _mm_srli_epi32(frac4
, MixerFracBits
));
189 frac4
= _mm_and_si128(frac4
, fracMask4
);
193 if(const size_t todo
{dst
.size()&3})
195 auto pos
= size_t{static_cast<uint
>(_mm_cvtsi128_si32(pos4
))};
196 frac
= static_cast<uint
>(_mm_cvtsi128_si32(frac4
));
198 auto out
= dst
.last(todo
);
199 std::generate(out
.begin(), out
.end(), [&pos
,&frac
,src
,increment
,filter
]
201 const uint pi
{frac
>> CubicPhaseDiffBits
}; ASSUME(pi
< CubicPhaseCount
);
202 const float pf
{static_cast<float>(frac
&CubicPhaseDiffMask
) * (1.0f
/CubicPhaseDiffOne
)};
203 const __m128 pf4
{_mm_set1_ps(pf
)};
205 const __m128 f4
= vmadd(_mm_load_ps(filter
[pi
].mCoeffs
.data()), pf4
,
206 _mm_load_ps(filter
[pi
].mDeltas
.data()));
207 __m128 r4
{_mm_mul_ps(f4
, _mm_loadu_ps(&src
[pos
]))};
209 r4
= _mm_add_ps(r4
, _mm_shuffle_ps(r4
, r4
, _MM_SHUFFLE(0, 1, 2, 3)));
210 r4
= _mm_add_ps(r4
, _mm_movehl_ps(r4
, r4
));
211 const float output
{_mm_cvtss_f32(r4
)};
214 pos
+= frac
>>MixerFracBits
;
215 frac
&= MixerFracMask
;