/*
    Copyright (C) 2000 Paul Davis

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program; if not, write to the Free Software
    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#define _ISOC9X_SOURCE  1
#define _ISOC99_SOURCE  1

#define __USE_ISOC9X    1
#define __USE_ISOC99    1

#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <stdint.h>
#include <limits.h>
#include <math.h>

#include "memops.h"
#if defined (__SSE2__) && !defined (__sun__)
#include <emmintrin.h>
#ifdef __SSE4_1__
#include <smmintrin.h>
#endif
#endif

#if defined (__ARM_NEON__) || defined (__ARM_NEON)
#include <arm_neon.h>
#endif
/* Notes about these *_SCALING values.

   The MAX_<N>BIT values are floating point. When multiplied by
   a full-scale normalized floating point sample value (-1.0..+1.0)
   they should give the maximum value representable with an integer
   sample type of N bits. Note that this is asymmetric. Sample ranges
   for signed integer, 2's complement values are -(2^(N-1)) to +(2^(N-1)-1).

   If we use +2^(N-1) for the scaling factors, we run into a problem:

   if we start with a normalized float value of -1.0, scaling
   to 24 bits would give -8388608 (-2^23), which is ideal.
   But with +1.0, we get +8388608, which is technically out of range.

   We never multiply a full-range normalized value by this constant,
   but we could multiply it by a positive value that is close enough to +1.0
   to produce a value > +(2^(N-1)-1).

   There is no way around this paradox without wasting CPU cycles to determine
   which scaling factor to use (i.e. determine whether the sample is negative
   or not, and use the right factor).

   So, for now (October 2008) we use 2^(N-1)-1 as the scaling factor.
   (A small example follows the scaling constants below.)
*/
#define SAMPLE_32BIT_SCALING  2147483647.0
#define SAMPLE_24BIT_SCALING  8388607.0f
#define SAMPLE_16BIT_SCALING  32767.0f
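
/* Illustrative sketch (not part of the original code): with the
   2^(N-1)-1 scaling factor above, both full-scale inputs stay inside
   the legal 16 bit range, at the cost of never producing -32768.
   The helper name is hypothetical and only demonstrates the
   clip-then-scale convention used by the macros further down. */
static inline int16_t example_scale_to_16bit (float s)
{
	/* clip to the normalized range first, then scale */
	if (s <= -1.0f) return (int16_t) -SAMPLE_16BIT_SCALING;	/* -32767 */
	if (s >=  1.0f) return (int16_t)  SAMPLE_16BIT_SCALING;	/* +32767 */
	return (int16_t) lrintf (s * SAMPLE_16BIT_SCALING);
}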
/* these are just values to use if the floating point value was out of range

   advice from Fons Adriaensen: make the limits symmetrical
*/

#define SAMPLE_32BIT_MAX    2147483647
#define SAMPLE_32BIT_MIN   -2147483647
#define SAMPLE_32BIT_MAX_D  2147483647.0
#define SAMPLE_32BIT_MIN_D -2147483647.0

#define SAMPLE_24BIT_MAX    8388607
#define SAMPLE_24BIT_MIN   -8388607
#define SAMPLE_24BIT_MAX_F  8388607.0f
#define SAMPLE_24BIT_MIN_F -8388607.0f

#define SAMPLE_16BIT_MAX    32767
#define SAMPLE_16BIT_MIN   -32767
#define SAMPLE_16BIT_MAX_F  32767.0f
#define SAMPLE_16BIT_MIN_F -32767.0f
/* these mark the outer edges of the range considered "within" range
   for a floating point sample value. values outside (and on the boundaries)
   of this range will be clipped before conversion; values within this
   range will be scaled to appropriate values for the target sample
   type.
*/

#define NORMALIZED_FLOAT_MIN -1.0f
#define NORMALIZED_FLOAT_MAX  1.0f
/* define this in case we end up on a platform that is missing
   the real lrintf functions
*/

#define f_round(f) lrintf(f)
#define d_round(f) lrint(f)
#define float_16(s, d)\
	if ((s) <= NORMALIZED_FLOAT_MIN) {\
		(d) = SAMPLE_16BIT_MIN;\
	} else if ((s) >= NORMALIZED_FLOAT_MAX) {\
		(d) = SAMPLE_16BIT_MAX;\
	} else {\
		(d) = f_round ((s) * SAMPLE_16BIT_SCALING);\
	}

/* call this when "s" has already been scaled (e.g. when dithering)
 */

#define float_16_scaled(s, d)\
	if ((s) <= SAMPLE_16BIT_MIN_F) {\
		(d) = SAMPLE_16BIT_MIN;\
	} else if ((s) >= SAMPLE_16BIT_MAX_F) {\
		(d) = SAMPLE_16BIT_MAX;\
	} else {\
		(d) = f_round ((s));\
	}
#define float_24u32(s, d) \
	if ((s) <= NORMALIZED_FLOAT_MIN) {\
		(d) = SAMPLE_24BIT_MIN << 8;\
	} else if ((s) >= NORMALIZED_FLOAT_MAX) {\
		(d) = SAMPLE_24BIT_MAX << 8;\
	} else {\
		(d) = f_round ((s) * SAMPLE_24BIT_SCALING) << 8;\
	}

#define float_24l32(s, d) \
	if ((s) <= NORMALIZED_FLOAT_MIN) {\
		(d) = SAMPLE_24BIT_MIN;\
	} else if ((s) >= NORMALIZED_FLOAT_MAX) {\
		(d) = SAMPLE_24BIT_MAX;\
	} else {\
		(d) = f_round ((s) * SAMPLE_24BIT_SCALING);\
	}

#define float_32(s, d) \
	do { \
		double clipped = fmin(NORMALIZED_FLOAT_MAX, \
				      fmax((double)(s), NORMALIZED_FLOAT_MIN)); \
		double scaled = clipped * SAMPLE_32BIT_MAX_D; \
		(d) = d_round(scaled); \
	} \
	while (0)
/* call this when "s" has already been scaled (e.g. when dithering)
 */

#define float_24u32_scaled(s, d)\
	if ((s) <= SAMPLE_24BIT_MIN_F) {\
		(d) = SAMPLE_24BIT_MIN << 8;\
	} else if ((s) >= SAMPLE_24BIT_MAX_F) {\
		(d) = SAMPLE_24BIT_MAX << 8;\
	} else {\
		(d) = f_round ((s)) << 8;\
	}

#define float_24(s, d) \
	if ((s) <= NORMALIZED_FLOAT_MIN) {\
		(d) = SAMPLE_24BIT_MIN;\
	} else if ((s) >= NORMALIZED_FLOAT_MAX) {\
		(d) = SAMPLE_24BIT_MAX;\
	} else {\
		(d) = f_round ((s) * SAMPLE_24BIT_SCALING);\
	}

/* call this when "s" has already been scaled (e.g. when dithering)
 */

#define float_24_scaled(s, d)\
	if ((s) <= SAMPLE_24BIT_MIN_F) {\
		(d) = SAMPLE_24BIT_MIN;\
	} else if ((s) >= SAMPLE_24BIT_MAX_F) {\
		(d) = SAMPLE_24BIT_MAX;\
	} else {\
		(d) = f_round ((s));\
	}
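
/* Illustrative sketch (not part of the original code): float_16()
   takes a normalized sample, while float_16_scaled() expects a value
   that has already been multiplied by SAMPLE_16BIT_SCALING (as the
   dither functions below do before adding noise). The helper name is
   hypothetical, and jack_default_audio_sample_t is assumed to come in
   via memops.h. */
static inline int16_t example_float_16_usage (jack_default_audio_sample_t s)
{
	int16_t direct, prescaled;
	jack_default_audio_sample_t scaled = s * SAMPLE_16BIT_SCALING;

	float_16 (s, direct);                /* clip in the normalized domain, then scale */
	float_16_scaled (scaled, prescaled); /* clip in the already-scaled domain */

	/* for undithered input both paths produce the same result */
	return (direct == prescaled) ? direct : prescaled;
}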
#if defined (__SSE2__) && !defined (__sun__)

/* generates the same as _mm_set_ps(1.f, 1.f, 1.f, 1.f) but faster */
static inline __m128 gen_one(void)
{
	volatile __m128i x = { 0 }; /* shut up, GCC */
	__m128i ones = _mm_cmpeq_epi32(x, x);
	return (__m128)_mm_slli_epi32 (_mm_srli_epi32(ones, 25), 23);
}

static inline __m128 clip(__m128 s, __m128 min, __m128 max)
{
	return _mm_min_ps(max, _mm_max_ps(s, min));
}

static inline __m128d clip_double(__m128d s, __m128d min, __m128d max)
{
	return _mm_min_pd(max, _mm_max_pd(s, min));
}

static inline __m128i float_24_sse(__m128 s)
{
	const __m128 upper_bound = gen_one(); /* NORMALIZED_FLOAT_MAX */
	const __m128 lower_bound = _mm_sub_ps(_mm_setzero_ps(), upper_bound);

	__m128 clipped = clip(s, lower_bound, upper_bound);
	__m128 scaled = _mm_mul_ps(clipped, _mm_set1_ps(SAMPLE_24BIT_SCALING));
	return _mm_cvtps_epi32(scaled);
}
#endif
#if defined (__ARM_NEON__) || defined (__ARM_NEON)

static inline float32x4_t clip(float32x4_t s, float32x4_t min, float32x4_t max)
{
	return vminq_f32(max, vmaxq_f32(s, min));
}

static inline int32x4_t float_24_neon(float32x4_t s)
{
	const float32x4_t upper_bound = vdupq_n_f32(NORMALIZED_FLOAT_MAX);
	const float32x4_t lower_bound = vdupq_n_f32(NORMALIZED_FLOAT_MIN);

	float32x4_t clipped = clip(s, lower_bound, upper_bound);
	float32x4_t scaled = vmulq_f32(clipped, vdupq_n_f32(SAMPLE_24BIT_SCALING));
	return vcvtq_s32_f32(scaled);
}

static inline int16x4_t float_16_neon(float32x4_t s)
{
	const float32x4_t upper_bound = vdupq_n_f32(NORMALIZED_FLOAT_MAX);
	const float32x4_t lower_bound = vdupq_n_f32(NORMALIZED_FLOAT_MIN);

	float32x4_t clipped = clip(s, lower_bound, upper_bound);
	float32x4_t scaled = vmulq_f32(clipped, vdupq_n_f32(SAMPLE_16BIT_SCALING));
	return vmovn_s32(vcvtq_s32_f32(scaled));
}
#endif
/* Linear Congruential noise generator. From the music-dsp list
 * less random than rand(), but good enough and 10x faster
 */
static unsigned int seed = 22222;

static inline unsigned int fast_rand() {
	seed = (seed * 196314165) + 907633515;
	return seed;
}
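
/* Illustrative sketch (not part of the original code): the dither
   functions further down map fast_rand() to a rectangular dither value
   of roughly [-0.5, +0.5) LSB before rounding. The helper name is
   hypothetical and only shows that mapping in isolation. */
static inline float example_rect_dither (void)
{
	/* fast_rand() spans [0, UINT_MAX]; dividing by UINT_MAX and
	   subtracting 0.5 recentres it around zero */
	return fast_rand() / (float) UINT_MAX - 0.5f;
}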
/* functions for native float sample data */

void sample_move_floatLE_sSs (jack_default_audio_sample_t *dst, char *src, unsigned long nsamples, unsigned long src_skip) {
	while (nsamples--) {
		*dst = *((float *) src);
		dst++;
		src += src_skip;
	}
}

void sample_move_dS_floatLE (char *dst, jack_default_audio_sample_t *src, unsigned long nsamples, unsigned long dst_skip, dither_state_t *state) {
	while (nsamples--) {
		*((float *) dst) = *src;
		dst += dst_skip;
		src++;
	}
}
/* NOTES on function naming:

   foo_bar_d<TYPE>_s<TYPE>

   the "d<TYPE>" component defines the destination type for the operation
   the "s<TYPE>" component defines the source type for the operation

   TYPEs are:

   S      - sample is a jack_default_audio_sample_t, currently (October 2008) a 32 bit floating point value
   Ss     - like S but reverse endian from the host CPU
   32     - sample is a signed 32 bit integer value
   32u24  - sample is a signed 32 bit integer value, but data is in upper 24 bits only
   32u24s - like 32u24 but reverse endian from the host CPU
   32l24  - sample is a signed 32 bit integer value, but data is in lower 24 bits only
   32l24s - like 32l24 but reverse endian from the host CPU
   24     - sample is a signed 24 bit integer value
   24s    - like 24 but reverse endian from the host CPU
   16     - sample is a signed 16 bit integer value
   16s    - like 16 but reverse endian from the host CPU

   For obvious reasons, the reverse endian versions only show as source types.

   This covers all known sample formats at 16 bits or larger.
   (An illustrative call is sketched just after this comment.)
*/
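
/* Illustrative sketch (not part of the original code): decoding one of
   the names above. sample_move_d16_sS() writes native-endian signed
   16 bit integers (d16) from jack_default_audio_sample_t input (sS).
   The helper name and the frame count are made up for the example,
   and the prototype is assumed to be visible via memops.h. */
static inline void example_convert_block (int16_t *out, jack_default_audio_sample_t *in)
{
	/* convert 64 float samples to 16 bit, writing one int16_t
	   (2 bytes) after another, i.e. a dst_skip of sizeof(int16_t);
	   the undithered variant ignores its dither_state_t argument */
	sample_move_d16_sS ((char *) out, in, 64, sizeof (int16_t), NULL);
}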
/* functions for native integer sample data */

void sample_move_d32_sSs (char *dst, jack_default_audio_sample_t *src, unsigned long nsamples, unsigned long dst_skip, dither_state_t *state)
{
	int32_t z;

	while (nsamples--) {
		float_32 (*src, z);
#if __BYTE_ORDER == __LITTLE_ENDIAN
		dst[0]=(char)(z>>24);
		dst[1]=(char)(z>>16);
		dst[2]=(char)(z>>8);
		dst[3]=(char)(z);
#elif __BYTE_ORDER == __BIG_ENDIAN
		dst[0]=(char)(z);
		dst[1]=(char)(z>>8);
		dst[2]=(char)(z>>16);
		dst[3]=(char)(z>>24);
#endif
		dst += dst_skip;
		src++;
	}
}
void sample_move_d32_sS (char *dst, jack_default_audio_sample_t *src, unsigned long nsamples, unsigned long dst_skip, dither_state_t *state)
{
	while (nsamples--) {
		float_32 (*src, *(int32_t *)dst);
		dst += dst_skip;
		src++;
	}
}
void sample_move_d32u24_sSs (char *dst, jack_default_audio_sample_t *src, unsigned long nsamples, unsigned long dst_skip, dither_state_t *state)
{
#if defined (__ARM_NEON__) || defined (__ARM_NEON)
	unsigned long unrolled = nsamples / 4;
	nsamples = nsamples & 3;

	while (unrolled--) {
		float32x4_t samples = vld1q_f32(src);
		int32x4_t converted = float_24_neon(samples);
		int32x4_t shifted = vshlq_n_s32(converted, 8);
		shifted = vreinterpretq_s32_u8(vrev32q_u8(vreinterpretq_u8_s32(shifted)));

		if (dst_skip == 4) {
			/* contiguous destination: one vector store */
			vst1q_s32((int32_t*)dst, shifted);
		} else {
			vst1q_lane_s32((int32_t*)(dst),            shifted, 0);
			vst1q_lane_s32((int32_t*)(dst+dst_skip),   shifted, 1);
			vst1q_lane_s32((int32_t*)(dst+2*dst_skip), shifted, 2);
			vst1q_lane_s32((int32_t*)(dst+3*dst_skip), shifted, 3);
		}
		dst += 4*dst_skip;
		src += 4;
	}
#endif

	int32_t z;

	while (nsamples--) {
		float_24u32 (*src, z);
#if __BYTE_ORDER == __LITTLE_ENDIAN
		dst[0]=(char)(z>>24);
		dst[1]=(char)(z>>16);
		dst[2]=(char)(z>>8);
		dst[3]=(char)(z);
#elif __BYTE_ORDER == __BIG_ENDIAN
		dst[0]=(char)(z);
		dst[1]=(char)(z>>8);
		dst[2]=(char)(z>>16);
		dst[3]=(char)(z>>24);
#endif
		dst += dst_skip;
		src++;
	}
}
void sample_move_d32u24_sS (char *dst, jack_default_audio_sample_t *src, unsigned long nsamples, unsigned long dst_skip, dither_state_t *state)
{
#if defined (__SSE2__) && !defined (__sun__)
	__m128 int_max = _mm_set1_ps(SAMPLE_24BIT_MAX_F);
	__m128 int_min = _mm_sub_ps(_mm_setzero_ps(), int_max);
	__m128 factor = int_max;

	unsigned long unrolled = nsamples / 4;
	nsamples = nsamples & 3;

	while (unrolled--) {
		__m128 in = _mm_load_ps(src);
		__m128 scaled = _mm_mul_ps(in, factor);
		__m128 clipped = clip(scaled, int_min, int_max);

		__m128i y = _mm_cvttps_epi32(clipped);
		__m128i shifted = _mm_slli_epi32(y, 8);

#ifdef __SSE4_1__
		*(int32_t*)dst              = _mm_extract_epi32(shifted, 0);
		*(int32_t*)(dst+dst_skip)   = _mm_extract_epi32(shifted, 1);
		*(int32_t*)(dst+2*dst_skip) = _mm_extract_epi32(shifted, 2);
		*(int32_t*)(dst+3*dst_skip) = _mm_extract_epi32(shifted, 3);
#else
		__m128i shuffled1 = _mm_shuffle_epi32(shifted, _MM_SHUFFLE(0, 3, 2, 1));
		__m128i shuffled2 = _mm_shuffle_epi32(shifted, _MM_SHUFFLE(1, 0, 3, 2));
		__m128i shuffled3 = _mm_shuffle_epi32(shifted, _MM_SHUFFLE(2, 1, 0, 3));

		_mm_store_ss((float*)dst, (__m128)shifted);
		_mm_store_ss((float*)(dst+dst_skip), (__m128)shuffled1);
		_mm_store_ss((float*)(dst+2*dst_skip), (__m128)shuffled2);
		_mm_store_ss((float*)(dst+3*dst_skip), (__m128)shuffled3);
#endif
		dst += 4*dst_skip;
		src += 4;
	}

	while (nsamples--) {
		__m128 in = _mm_load_ss(src);
		__m128 scaled = _mm_mul_ss(in, factor);
		__m128 clipped = _mm_min_ss(int_max, _mm_max_ss(scaled, int_min));

		int y = _mm_cvttss_si32(clipped);
		*((int *) dst) = y<<8;

		dst += dst_skip;
		src++;
	}

#elif defined (__ARM_NEON__) || defined (__ARM_NEON)
	unsigned long unrolled = nsamples / 4;
	nsamples = nsamples & 3;

	while (unrolled--) {
		float32x4_t samples = vld1q_f32(src);
		int32x4_t converted = float_24_neon(samples);
		int32x4_t shifted = vshlq_n_s32(converted, 8);

		if (dst_skip == 4) {
			vst1q_s32((int32_t*)dst, shifted);
		} else {
			vst1q_lane_s32((int32_t*)(dst),            shifted, 0);
			vst1q_lane_s32((int32_t*)(dst+dst_skip),   shifted, 1);
			vst1q_lane_s32((int32_t*)(dst+2*dst_skip), shifted, 2);
			vst1q_lane_s32((int32_t*)(dst+3*dst_skip), shifted, 3);
		}
		dst += 4*dst_skip;
		src += 4;
	}
#endif

#if !defined (__SSE2__)
	while (nsamples--) {
		float_24u32 (*src, *((int32_t*) dst));
		dst += dst_skip;
		src++;
	}
#endif
}
void sample_move_dS_s32u24s (jack_default_audio_sample_t *dst, char *src, unsigned long nsamples, unsigned long src_skip)
{
#if defined (__ARM_NEON__) || defined (__ARM_NEON)
	float32x4_t factor = vdupq_n_f32(1.0 / SAMPLE_24BIT_SCALING);
	unsigned long unrolled = nsamples / 4;
	while (unrolled--) {
		int32x4_t src128 = vdupq_n_s32(0);	/* initialised to keep the lane loads well-defined */
		switch (src_skip) {
			case 4:
				src128 = vld1q_s32((int32_t*)src);
				break;
			case 8:
				src128 = vld2q_s32((int32_t*)src).val[0];
				break;
			default:
				src128 = vld1q_lane_s32((int32_t*)src, src128, 0);
				src128 = vld1q_lane_s32((int32_t*)(src+src_skip), src128, 1);
				src128 = vld1q_lane_s32((int32_t*)(src+2*src_skip), src128, 2);
				src128 = vld1q_lane_s32((int32_t*)(src+3*src_skip), src128, 3);
				break;
		}
		src128 = vreinterpretq_s32_u8(vrev32q_u8(vreinterpretq_u8_s32(src128)));
		int32x4_t shifted = vshrq_n_s32(src128, 8);
		float32x4_t as_float = vcvtq_f32_s32(shifted);
		float32x4_t divided = vmulq_f32(as_float, factor);
		vst1q_f32(dst, divided);

		src += 4 * src_skip;
		dst += 4;
	}
	nsamples = nsamples & 3;
#endif

	/* ALERT: signed sign-extension portability !!! */

	const jack_default_audio_sample_t scaling = 1.0/SAMPLE_24BIT_SCALING;
	while (nsamples--) {
		int x;
#if __BYTE_ORDER == __LITTLE_ENDIAN
		x = (unsigned char)(src[0]);
		x <<= 8;
		x |= (unsigned char)(src[1]);
		x <<= 8;
		x |= (unsigned char)(src[2]);
		x <<= 8;
		x |= (unsigned char)(src[3]);
#elif __BYTE_ORDER == __BIG_ENDIAN
		x = (unsigned char)(src[3]);
		x <<= 8;
		x |= (unsigned char)(src[2]);
		x <<= 8;
		x |= (unsigned char)(src[1]);
		x <<= 8;
		x |= (unsigned char)(src[0]);
#endif
		*dst = (x >> 8) * scaling;
		dst++;
		src += src_skip;
	}
}
void sample_move_dS_s32u24 (jack_default_audio_sample_t *dst, char *src, unsigned long nsamples, unsigned long src_skip)
{
#if defined (__SSE2__) && !defined (__sun__)
	unsigned long unrolled = nsamples / 4;
	static float inv_sample_max_24bit = 1.0 / SAMPLE_24BIT_SCALING;
	__m128 factor = _mm_set1_ps(inv_sample_max_24bit);
	while (unrolled--)
	{
		int i1 = *((int *) src);
		src += src_skip;
		int i2 = *((int *) src);
		src += src_skip;
		int i3 = *((int *) src);
		src += src_skip;
		int i4 = *((int *) src);
		src += src_skip;

		__m128i packed = _mm_set_epi32(i4, i3, i2, i1);
		__m128i shifted = _mm_srai_epi32(packed, 8);

		__m128 as_float = _mm_cvtepi32_ps(shifted);
		__m128 divided = _mm_mul_ps(as_float, factor);

		_mm_storeu_ps(dst, divided);

		dst += 4;
	}
	nsamples = nsamples & 3;
#elif defined (__ARM_NEON__) || defined (__ARM_NEON)
	unsigned long unrolled = nsamples / 4;
	float32x4_t factor = vdupq_n_f32(1.0 / SAMPLE_24BIT_SCALING);
	while (unrolled--) {
		int32x4_t src128 = vdupq_n_s32(0);
		switch (src_skip) {
			case 4:
				src128 = vld1q_s32((int32_t*)src);
				break;
			case 8:
				src128 = vld2q_s32((int32_t*)src).val[0];
				break;
			default:
				src128 = vld1q_lane_s32((int32_t*)src, src128, 0);
				src128 = vld1q_lane_s32((int32_t*)(src+src_skip), src128, 1);
				src128 = vld1q_lane_s32((int32_t*)(src+2*src_skip), src128, 2);
				src128 = vld1q_lane_s32((int32_t*)(src+3*src_skip), src128, 3);
				break;
		}
		int32x4_t shifted = vshrq_n_s32(src128, 8);
		float32x4_t as_float = vcvtq_f32_s32(shifted);
		float32x4_t divided = vmulq_f32(as_float, factor);
		vst1q_f32(dst, divided);

		src += 4 * src_skip;
		dst += 4;
	}
	nsamples = nsamples & 3;
#endif

	/* ALERT: signed sign-extension portability !!! */

	const jack_default_audio_sample_t scaling = 1.0/SAMPLE_24BIT_SCALING;
	while (nsamples--) {
		*dst = (*((int *) src) >> 8) * scaling;
		dst++;
		src += src_skip;
	}
}
void sample_move_d32l24_sSs (char *dst, jack_default_audio_sample_t *src, unsigned long nsamples, unsigned long dst_skip, dither_state_t *state)
{
#if defined (__ARM_NEON__) || defined (__ARM_NEON)
	unsigned long unrolled = nsamples / 4;
	nsamples = nsamples & 3;

	while (unrolled--) {
		float32x4_t samples = vld1q_f32(src);
		int32x4_t converted = float_24_neon(samples);
		converted = vreinterpretq_s32_u8(vrev32q_u8(vreinterpretq_u8_s32(converted)));

		if (dst_skip == 4) {
			vst1q_s32((int32_t*)dst, converted);
		} else {
			vst1q_lane_s32((int32_t*)(dst),            converted, 0);
			vst1q_lane_s32((int32_t*)(dst+dst_skip),   converted, 1);
			vst1q_lane_s32((int32_t*)(dst+2*dst_skip), converted, 2);
			vst1q_lane_s32((int32_t*)(dst+3*dst_skip), converted, 3);
		}
		dst += 4*dst_skip;
		src += 4;
	}
#endif

	int32_t z;

	while (nsamples--) {
		float_24l32 (*src, z);
#if __BYTE_ORDER == __LITTLE_ENDIAN
		dst[0]=(char)(z>>24);
		dst[1]=(char)(z>>16);
		dst[2]=(char)(z>>8);
		dst[3]=(char)(z);
#elif __BYTE_ORDER == __BIG_ENDIAN
		dst[0]=(char)(z);
		dst[1]=(char)(z>>8);
		dst[2]=(char)(z>>16);
		dst[3]=(char)(z>>24);
#endif
		dst += dst_skip;
		src++;
	}
}
void sample_move_d32l24_sS (char *dst, jack_default_audio_sample_t *src, unsigned long nsamples, unsigned long dst_skip, dither_state_t *state)
{
#if defined (__SSE2__) && !defined (__sun__)
	__m128 int_max = _mm_set1_ps(SAMPLE_24BIT_MAX_F);
	__m128 int_min = _mm_sub_ps(_mm_setzero_ps(), int_max);
	__m128 factor = int_max;

	unsigned long unrolled = nsamples / 4;
	nsamples = nsamples & 3;

	while (unrolled--) {
		__m128 in = _mm_load_ps(src);
		__m128 scaled = _mm_mul_ps(in, factor);
		__m128 clipped = clip(scaled, int_min, int_max);

		__m128i shifted = _mm_cvttps_epi32(clipped);

#ifdef __SSE4_1__
		*(int32_t*)dst              = _mm_extract_epi32(shifted, 0);
		*(int32_t*)(dst+dst_skip)   = _mm_extract_epi32(shifted, 1);
		*(int32_t*)(dst+2*dst_skip) = _mm_extract_epi32(shifted, 2);
		*(int32_t*)(dst+3*dst_skip) = _mm_extract_epi32(shifted, 3);
#else
		__m128i shuffled1 = _mm_shuffle_epi32(shifted, _MM_SHUFFLE(0, 3, 2, 1));
		__m128i shuffled2 = _mm_shuffle_epi32(shifted, _MM_SHUFFLE(1, 0, 3, 2));
		__m128i shuffled3 = _mm_shuffle_epi32(shifted, _MM_SHUFFLE(2, 1, 0, 3));

		_mm_store_ss((float*)dst, (__m128)shifted);
		_mm_store_ss((float*)(dst+dst_skip), (__m128)shuffled1);
		_mm_store_ss((float*)(dst+2*dst_skip), (__m128)shuffled2);
		_mm_store_ss((float*)(dst+3*dst_skip), (__m128)shuffled3);
#endif
		dst += 4*dst_skip;
		src += 4;
	}

	while (nsamples--) {
		__m128 in = _mm_load_ss(src);
		__m128 scaled = _mm_mul_ss(in, factor);
		__m128 clipped = _mm_min_ss(int_max, _mm_max_ss(scaled, int_min));

		int y = _mm_cvttss_si32(clipped);
		/* data stays in the lower 24 bits, so no shift here */
		*((int *) dst) = y;

		dst += dst_skip;
		src++;
	}

#elif defined (__ARM_NEON__) || defined (__ARM_NEON)
	unsigned long unrolled = nsamples / 4;
	nsamples = nsamples & 3;

	while (unrolled--) {
		float32x4_t samples = vld1q_f32(src);
		int32x4_t converted = float_24_neon(samples);

		if (dst_skip == 4) {
			vst1q_s32((int32_t*)dst, converted);
		} else {
			vst1q_lane_s32((int32_t*)(dst),            converted, 0);
			vst1q_lane_s32((int32_t*)(dst+dst_skip),   converted, 1);
			vst1q_lane_s32((int32_t*)(dst+2*dst_skip), converted, 2);
			vst1q_lane_s32((int32_t*)(dst+3*dst_skip), converted, 3);
		}
		dst += 4*dst_skip;
		src += 4;
	}
#endif

#if !defined (__SSE2__)
	while (nsamples--) {
		float_24l32 (*src, *((int32_t*) dst));
		dst += dst_skip;
		src++;
	}
#endif
}
void sample_move_dS_s32s (jack_default_audio_sample_t *dst, char *src, unsigned long nsamples, unsigned long src_skip)
{
	const jack_default_audio_sample_t scaling = 1.0/SAMPLE_32BIT_SCALING;

	while (nsamples--) {
		int x;
#if __BYTE_ORDER == __LITTLE_ENDIAN
		x = (unsigned char)(src[0]);
		x <<= 8;
		x |= (unsigned char)(src[1]);
		x <<= 8;
		x |= (unsigned char)(src[2]);
		x <<= 8;
		x |= (unsigned char)(src[3]);
#elif __BYTE_ORDER == __BIG_ENDIAN
		x = (unsigned char)(src[3]);
		x <<= 8;
		x |= (unsigned char)(src[2]);
		x <<= 8;
		x |= (unsigned char)(src[1]);
		x <<= 8;
		x |= (unsigned char)(src[0]);
#endif
		double extended = x * scaling;
		*dst = (float)extended;
		dst++;
		src += src_skip;
	}
}
void sample_move_dS_s32l24s (jack_default_audio_sample_t *dst, char *src, unsigned long nsamples, unsigned long src_skip)
{
#if defined (__ARM_NEON__) || defined (__ARM_NEON)
	float32x4_t factor = vdupq_n_f32(1.0 / SAMPLE_24BIT_SCALING);
	unsigned long unrolled = nsamples / 4;
	while (unrolled--) {
		uint32x4_t src128 = vdupq_n_u32(0);
		switch (src_skip) {
			case 4:
				src128 = vld1q_u32((uint32_t*)src);
				break;
			case 8:
				src128 = vld2q_u32((uint32_t*)src).val[0];
				break;
			default:
				src128 = vld1q_lane_u32((uint32_t*)src, src128, 0);
				src128 = vld1q_lane_u32((uint32_t*)(src+src_skip), src128, 1);
				src128 = vld1q_lane_u32((uint32_t*)(src+2*src_skip), src128, 2);
				src128 = vld1q_lane_u32((uint32_t*)(src+3*src_skip), src128, 3);
				break;
		}
		src128 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(src128)));
		/* sign extend the lower 24 bits: shift up, then arithmetic shift back down */
		uint32x4_t toupper = vshlq_n_u32(src128, 8);
		int32x4_t shifted = vshrq_n_s32((int32x4_t)toupper, 8);
		float32x4_t as_float = vcvtq_f32_s32(shifted);
		float32x4_t divided = vmulq_f32(as_float, factor);
		vst1q_f32(dst, divided);

		src += 4 * src_skip;
		dst += 4;
	}
	nsamples = nsamples & 3;
#endif

	/* ALERT: signed sign-extension portability !!! */

	const jack_default_audio_sample_t scaling = 1.0/SAMPLE_24BIT_SCALING;
	while (nsamples--) {
		uint32_t x;
#if __BYTE_ORDER == __LITTLE_ENDIAN
		x = (unsigned char)(src[0]);
		x <<= 8;
		x |= (unsigned char)(src[1]);
		x <<= 8;
		x |= (unsigned char)(src[2]);
		x <<= 8;
		x |= (unsigned char)(src[3]);
#elif __BYTE_ORDER == __BIG_ENDIAN
		x = (unsigned char)(src[3]);
		x <<= 8;
		x |= (unsigned char)(src[2]);
		x <<= 8;
		x |= (unsigned char)(src[1]);
		x <<= 8;
		x |= (unsigned char)(src[0]);
#endif
		/* sign extend the 24 bit value, as in sample_move_dS_s32l24() */
		if (x & 0x800000u) x |= 0xFF000000u;
		*dst = (int32_t)x * scaling;
		dst++;
		src += src_skip;
	}
}
void sample_move_dS_s32 (jack_default_audio_sample_t *dst, char *src, unsigned long nsamples, unsigned long src_skip)
{
	const double scaling = 1.0 / SAMPLE_32BIT_SCALING;

	while (nsamples--) {
		int32_t val = (*((int32_t*)src));
		double extended = val * scaling;
		*dst = (float)extended;
		dst++;
		src += src_skip;
	}
}
void sample_move_dS_s32l24 (jack_default_audio_sample_t *dst, char *src, unsigned long nsamples, unsigned long src_skip)
{
#if defined (__SSE2__) && !defined (__sun__)
	unsigned long unrolled = nsamples / 4;
	static float inv_sample_max_24bit = 1.0 / SAMPLE_24BIT_SCALING;
	__m128 factor = _mm_set1_ps(inv_sample_max_24bit);
	while (unrolled--)
	{
		int i1 = *((int *) src);
		src += src_skip;
		int i2 = *((int *) src);
		src += src_skip;
		int i3 = *((int *) src);
		src += src_skip;
		int i4 = *((int *) src);
		src += src_skip;

		/* sign extend the lower 24 bits: shift up, then arithmetic shift back down */
		__m128i packed = _mm_set_epi32(i4, i3, i2, i1);
		__m128i shifted = _mm_srai_epi32(_mm_slli_epi32(packed, 8), 8);

		__m128 as_float = _mm_cvtepi32_ps(shifted);
		__m128 divided = _mm_mul_ps(as_float, factor);

		_mm_storeu_ps(dst, divided);

		dst += 4;
	}
	nsamples = nsamples & 3;
#elif defined (__ARM_NEON__) || defined (__ARM_NEON)
	unsigned long unrolled = nsamples / 4;
	float32x4_t factor = vdupq_n_f32(1.0 / SAMPLE_24BIT_SCALING);
	while (unrolled--) {
		uint32x4_t src128 = vdupq_n_u32(0);
		switch (src_skip) {
			case 4:
				src128 = vld1q_u32((uint32_t*)src);
				break;
			case 8:
				src128 = vld2q_u32((uint32_t*)src).val[0];
				break;
			default:
				src128 = vld1q_lane_u32((uint32_t*)src, src128, 0);
				src128 = vld1q_lane_u32((uint32_t*)(src+src_skip), src128, 1);
				src128 = vld1q_lane_u32((uint32_t*)(src+2*src_skip), src128, 2);
				src128 = vld1q_lane_u32((uint32_t*)(src+3*src_skip), src128, 3);
				break;
		}
		// Sign extension by moving to upper as unsigned, then back down as signed
		uint32x4_t toupper = vshlq_n_u32(src128, 8);
		int32x4_t shifted = vshrq_n_s32((int32x4_t)toupper, 8);
		float32x4_t as_float = vcvtq_f32_s32(shifted);
		float32x4_t divided = vmulq_f32(as_float, factor);
		vst1q_f32(dst, divided);

		src += 4 * src_skip;
		dst += 4;
	}
	nsamples = nsamples & 3;
#endif

	/* ALERT: signed sign-extension portability !!! */

	const jack_default_audio_sample_t scaling = 1.0/SAMPLE_24BIT_SCALING;
	while (nsamples--) {
		uint32_t val = (*((uint32_t*)src));
		if (val & 0x800000u) val |= 0xFF000000u;
		*dst = (*((int32_t *) &val)) * scaling;
		dst++;
		src += src_skip;
	}
}
void sample_move_d24_sSs (char *dst, jack_default_audio_sample_t *src, unsigned long nsamples, unsigned long dst_skip, dither_state_t *state)
{
#if defined (__ARM_NEON__) || defined (__ARM_NEON)
	unsigned long unrolled = nsamples / 4;
	while (unrolled--) {
		int i;
		int32_t z[4];
		float32x4_t samples = vld1q_f32(src);
		int32x4_t converted = float_24_neon(samples);
		converted = vreinterpretq_s32_u8(vrev32q_u8(vreinterpretq_u8_s32(converted)));
		vst1q_s32(z, converted);

		for (i = 0; i != 4; ++i) {
			memcpy (dst, ((char*)(z+i))+1, 3);
			dst += dst_skip;
		}
		src += 4;
	}
	nsamples = nsamples & 3;
#endif

	int32_t z;

	while (nsamples--) {
		float_24 (*src, z);
#if __BYTE_ORDER == __LITTLE_ENDIAN
		dst[0]=(char)(z>>16);
		dst[1]=(char)(z>>8);
		dst[2]=(char)(z);
#elif __BYTE_ORDER == __BIG_ENDIAN
		dst[0]=(char)(z);
		dst[1]=(char)(z>>8);
		dst[2]=(char)(z>>16);
#endif
		dst += dst_skip;
		src++;
	}
}
void sample_move_d24_sS (char *dst, jack_default_audio_sample_t *src, unsigned long nsamples, unsigned long dst_skip, dither_state_t *state)
{
#if defined (__SSE2__) && !defined (__sun__)
	_MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
	while (nsamples >= 4) {
		int i;
		int32_t z[4];
		__m128 samples = _mm_loadu_ps(src);
		__m128i converted = float_24_sse(samples);

#ifdef __SSE4_1__
		z[0] = _mm_extract_epi32(converted, 0);
		z[1] = _mm_extract_epi32(converted, 1);
		z[2] = _mm_extract_epi32(converted, 2);
		z[3] = _mm_extract_epi32(converted, 3);
#else
		__m128i shuffled1 = _mm_shuffle_epi32(converted, _MM_SHUFFLE(0, 3, 2, 1));
		__m128i shuffled2 = _mm_shuffle_epi32(converted, _MM_SHUFFLE(1, 0, 3, 2));
		__m128i shuffled3 = _mm_shuffle_epi32(converted, _MM_SHUFFLE(2, 1, 0, 3));

		_mm_store_ss((float*)z, (__m128)converted);
		_mm_store_ss((float*)z+1, (__m128)shuffled1);
		_mm_store_ss((float*)z+2, (__m128)shuffled2);
		_mm_store_ss((float*)z+3, (__m128)shuffled3);
#endif

		for (i = 0; i != 4; ++i) {
			memcpy (dst, z+i, 3);
			dst += dst_skip;
		}

		nsamples -= 4;
		src += 4;
	}
#elif defined (__ARM_NEON__) || defined (__ARM_NEON)
	unsigned long unrolled = nsamples / 4;
	while (unrolled--) {
		int i;
		int32_t z[4];
		float32x4_t samples = vld1q_f32(src);
		int32x4_t converted = float_24_neon(samples);
		vst1q_s32(z, converted);

		for (i = 0; i != 4; ++i) {
			memcpy (dst, z+i, 3);
			dst += dst_skip;
		}
		src += 4;
	}
	nsamples = nsamples & 3;
#endif

	int32_t z;

	while (nsamples--) {
		float_24 (*src, z);
#if __BYTE_ORDER == __LITTLE_ENDIAN
		memcpy (dst, &z, 3);
#elif __BYTE_ORDER == __BIG_ENDIAN
		memcpy (dst, (char *)&z + 1, 3);
#endif
		dst += dst_skip;
		src++;
	}
}
void sample_move_dS_s24s (jack_default_audio_sample_t *dst, char *src, unsigned long nsamples, unsigned long src_skip)
{
	const jack_default_audio_sample_t scaling = 1.0/SAMPLE_24BIT_SCALING;

#if defined (__ARM_NEON__) || defined (__ARM_NEON)
	// we shift 8 to the right by dividing by 256.0 -> no extra sign handling
	const float32x4_t vscaling = vdupq_n_f32(scaling/256.0);
	int32_t x[4];
	memset(x, 0, sizeof(x));
	unsigned long unrolled = nsamples / 4;
	while (unrolled--) {
#if __BYTE_ORDER == __BIG_ENDIAN	 /* ARM big endian?? */
		// right aligned / inverse sequence below -> *256
		memcpy(((char*)&x[0])+1, src, 3);
		memcpy(((char*)&x[1])+1, src+src_skip, 3);
		memcpy(((char*)&x[2])+1, src+2*src_skip, 3);
		memcpy(((char*)&x[3])+1, src+3*src_skip, 3);
#else
		memcpy(&x[0], src, 3);
		memcpy(&x[1], src+src_skip, 3);
		memcpy(&x[2], src+2*src_skip, 3);
		memcpy(&x[3], src+3*src_skip, 3);
#endif
		src += 4 * src_skip;

		int32x4_t source = vld1q_s32(x);
		source = vreinterpretq_s32_u8(vrev32q_u8(vreinterpretq_u8_s32(source)));
		float32x4_t converted = vcvtq_f32_s32(source);
		float32x4_t scaled = vmulq_f32(converted, vscaling);
		vst1q_f32(dst, scaled);
		dst += 4;
	}
	nsamples = nsamples & 3;
#endif

	/* ALERT: signed sign-extension portability !!! */

	while (nsamples--) {
		int x;
#if __BYTE_ORDER == __LITTLE_ENDIAN
		x = (unsigned char)(src[0]);
		x <<= 8;
		x |= (unsigned char)(src[1]);
		x <<= 8;
		x |= (unsigned char)(src[2]);
		/* correct sign bit and the rest of the top byte */
		if (src[0] & 0x80) {
			x |= 0xff000000;
		}
#elif __BYTE_ORDER == __BIG_ENDIAN
		x = (unsigned char)(src[2]);
		x <<= 8;
		x |= (unsigned char)(src[1]);
		x <<= 8;
		x |= (unsigned char)(src[0]);
		/* correct sign bit and the rest of the top byte */
		if (src[2] & 0x80) {
			x |= 0xff000000;
		}
#endif
		*dst = x * scaling;
		dst++;
		src += src_skip;
	}
}
void sample_move_dS_s24 (jack_default_audio_sample_t *dst, char *src, unsigned long nsamples, unsigned long src_skip)
{
	const jack_default_audio_sample_t scaling = 1.f/SAMPLE_24BIT_SCALING;

#if defined (__SSE2__) && !defined (__sun__)
	const __m128 scaling_block = _mm_set_ps1(scaling);
	while (nsamples >= 4) {
		int x0 = 0, x1 = 0, x2 = 0, x3 = 0;

		/* the 24 bit samples land in the upper 3 bytes, so the
		   arithmetic shift right by 8 below sign extends them */
		memcpy((char*)&x0 + 1, src, 3);
		memcpy((char*)&x1 + 1, src+src_skip, 3);
		memcpy((char*)&x2 + 1, src+2*src_skip, 3);
		memcpy((char*)&x3 + 1, src+3*src_skip, 3);
		src += 4 * src_skip;

		const __m128i block_i = _mm_set_epi32(x3, x2, x1, x0);
		const __m128i shifted = _mm_srai_epi32(block_i, 8);
		const __m128 converted = _mm_cvtepi32_ps (shifted);
		const __m128 scaled = _mm_mul_ps(converted, scaling_block);
		_mm_storeu_ps(dst, scaled);

		dst += 4;
		nsamples -= 4;
	}
#elif defined (__ARM_NEON__) || defined (__ARM_NEON)
	// we shift 8 to the right by dividing by 256.0 -> no extra sign handling
	const float32x4_t vscaling = vdupq_n_f32(scaling/256.0);
	int32_t x[4];
	memset(x, 0, sizeof(x));
	unsigned long unrolled = nsamples / 4;
	while (unrolled--) {
#if __BYTE_ORDER == __BIG_ENDIAN	 /* ARM big endian?? */
		// left aligned -> *256
		memcpy(&x[0], src, 3);
		memcpy(&x[1], src+src_skip, 3);
		memcpy(&x[2], src+2*src_skip, 3);
		memcpy(&x[3], src+3*src_skip, 3);
#else
		memcpy(((char*)&x[0])+1, src, 3);
		memcpy(((char*)&x[1])+1, src+src_skip, 3);
		memcpy(((char*)&x[2])+1, src+2*src_skip, 3);
		memcpy(((char*)&x[3])+1, src+3*src_skip, 3);
#endif
		src += 4 * src_skip;

		int32x4_t source = vld1q_s32(x);
		float32x4_t converted = vcvtq_f32_s32(source);
		float32x4_t scaled = vmulq_f32(converted, vscaling);
		vst1q_f32(dst, scaled);
		dst += 4;
	}
	nsamples = nsamples & 3;
#endif

	/* ALERT: signed sign-extension portability !!! */

	while (nsamples--) {
		int x = 0;
#if __BYTE_ORDER == __LITTLE_ENDIAN
		memcpy((char*)&x + 1, src, 3);
#elif __BYTE_ORDER == __BIG_ENDIAN
		memcpy(&x, src, 3);
#endif
		x >>= 8;
		*dst = x * scaling;
		dst++;
		src += src_skip;
	}
}
void sample_move_d16_sSs (char *dst, jack_default_audio_sample_t *src, unsigned long nsamples, unsigned long dst_skip, dither_state_t *state)
{
#if defined (__ARM_NEON__) || defined (__ARM_NEON)
	unsigned long unrolled = nsamples / 4;
	nsamples = nsamples & 3;

	while (unrolled--) {
		float32x4_t samples = vld1q_f32(src);
		int16x4_t converted = float_16_neon(samples);
		converted = vreinterpret_s16_u8(vrev16_u8(vreinterpret_u8_s16(converted)));

		if (dst_skip == 2) {
			vst1_s16((int16_t*)dst, converted);
		} else {
			vst1_lane_s16((int16_t*)(dst),            converted, 0);
			vst1_lane_s16((int16_t*)(dst+dst_skip),   converted, 1);
			vst1_lane_s16((int16_t*)(dst+2*dst_skip), converted, 2);
			vst1_lane_s16((int16_t*)(dst+3*dst_skip), converted, 3);
		}
		dst += 4*dst_skip;
		src += 4;
	}
#endif

	int16_t tmp;

	while (nsamples--) {
		// float_16 (*src, tmp);

		if (*src <= NORMALIZED_FLOAT_MIN) {
			tmp = SAMPLE_16BIT_MIN;
		} else if (*src >= NORMALIZED_FLOAT_MAX) {
			tmp = SAMPLE_16BIT_MAX;
		} else {
			tmp = (int16_t) f_round (*src * SAMPLE_16BIT_SCALING);
		}

#if __BYTE_ORDER == __LITTLE_ENDIAN
		dst[0]=(char)(tmp>>8);
		dst[1]=(char)(tmp);
#elif __BYTE_ORDER == __BIG_ENDIAN
		dst[0]=(char)(tmp);
		dst[1]=(char)(tmp>>8);
#endif
		dst += dst_skip;
		src++;
	}
}
void sample_move_d16_sS (char *dst, jack_default_audio_sample_t *src, unsigned long nsamples, unsigned long dst_skip, dither_state_t *state)
{
#if defined (__ARM_NEON__) || defined (__ARM_NEON)
	unsigned long unrolled = nsamples / 4;
	nsamples = nsamples & 3;

	while (unrolled--) {
		float32x4_t samples = vld1q_f32(src);
		int16x4_t converted = float_16_neon(samples);

		if (dst_skip == 2) {
			vst1_s16((int16_t*)dst, converted);
		} else {
			vst1_lane_s16((int16_t*)(dst),            converted, 0);
			vst1_lane_s16((int16_t*)(dst+dst_skip),   converted, 1);
			vst1_lane_s16((int16_t*)(dst+2*dst_skip), converted, 2);
			vst1_lane_s16((int16_t*)(dst+3*dst_skip), converted, 3);
		}
		dst += 4*dst_skip;
		src += 4;
	}
#endif

	while (nsamples--) {
		float_16 (*src, *((int16_t*) dst));
		dst += dst_skip;
		src++;
	}
}
void sample_move_dither_rect_d16_sSs (char *dst, jack_default_audio_sample_t *src, unsigned long nsamples, unsigned long dst_skip, dither_state_t *state)
{
	jack_default_audio_sample_t val;
	int16_t tmp;

	while (nsamples--) {
		val = (*src * SAMPLE_16BIT_SCALING) + fast_rand() / (float) UINT_MAX - 0.5f;
		float_16_scaled (val, tmp);
#if __BYTE_ORDER == __LITTLE_ENDIAN
		dst[0]=(char)(tmp>>8);
		dst[1]=(char)(tmp);
#elif __BYTE_ORDER == __BIG_ENDIAN
		dst[0]=(char)(tmp);
		dst[1]=(char)(tmp>>8);
#endif
		dst += dst_skip;
		src++;
	}
}
void sample_move_dither_rect_d16_sS (char *dst, jack_default_audio_sample_t *src, unsigned long nsamples, unsigned long dst_skip, dither_state_t *state)
{
	jack_default_audio_sample_t val;

	while (nsamples--) {
		val = (*src * SAMPLE_16BIT_SCALING) + fast_rand() / (float)UINT_MAX - 0.5f;
		float_16_scaled (val, *((int16_t*) dst));
		dst += dst_skip;
		src++;
	}
}
void sample_move_dither_tri_d16_sSs (char *dst, jack_default_audio_sample_t *src, unsigned long nsamples, unsigned long dst_skip, dither_state_t *state)
{
	jack_default_audio_sample_t val;
	int16_t tmp;

	while (nsamples--) {
		val = (*src * SAMPLE_16BIT_SCALING) + ((float)fast_rand() + (float)fast_rand()) / (float)UINT_MAX - 1.0f;
		float_16_scaled (val, tmp);

#if __BYTE_ORDER == __LITTLE_ENDIAN
		dst[0]=(char)(tmp>>8);
		dst[1]=(char)(tmp);
#elif __BYTE_ORDER == __BIG_ENDIAN
		dst[0]=(char)(tmp);
		dst[1]=(char)(tmp>>8);
#endif
		dst += dst_skip;
		src++;
	}
}
void sample_move_dither_tri_d16_sS (char *dst, jack_default_audio_sample_t *src, unsigned long nsamples, unsigned long dst_skip, dither_state_t *state)
{
	jack_default_audio_sample_t val;

	while (nsamples--) {
		val = (*src * SAMPLE_16BIT_SCALING) + ((float)fast_rand() + (float)fast_rand()) / (float)UINT_MAX - 1.0f;
		float_16_scaled (val, *((int16_t*) dst));
		dst += dst_skip;
		src++;
	}
}
void sample_move_dither_shaped_d16_sSs (char *dst, jack_default_audio_sample_t *src, unsigned long nsamples, unsigned long dst_skip, dither_state_t *state)
{
	jack_default_audio_sample_t x;
	jack_default_audio_sample_t xe; /* the input sample - filtered error */
	jack_default_audio_sample_t xp; /* x' */
	float r;
	float rm1 = state->rm1;
	unsigned int idx = state->idx;
	int16_t tmp;

	while (nsamples--) {
		x = *src * SAMPLE_16BIT_SCALING;
		r = ((float)fast_rand() + (float)fast_rand()) / (float)UINT_MAX - 1.0f;
		/* Filter the error with Lipshitz's minimally audible FIR:
		   [2.033 -2.165 1.959 -1.590 0.6149] */
		xe = x
		     - state->e[idx] * 2.033f
		     + state->e[(idx - 1) & DITHER_BUF_MASK] * 2.165f
		     - state->e[(idx - 2) & DITHER_BUF_MASK] * 1.959f
		     + state->e[(idx - 3) & DITHER_BUF_MASK] * 1.590f
		     - state->e[(idx - 4) & DITHER_BUF_MASK] * 0.6149f;

		/* Add the triangular noise to get the dithered value x' */
		xp = xe + r - rm1;
		rm1 = r;

		float_16_scaled (xp, tmp);

		/* Intrinsic z^-1 delay */
		idx = (idx + 1) & DITHER_BUF_MASK;
		state->e[idx] = xp - xe;

#if __BYTE_ORDER == __LITTLE_ENDIAN
		dst[0]=(char)(tmp>>8);
		dst[1]=(char)(tmp);
#elif __BYTE_ORDER == __BIG_ENDIAN
		dst[0]=(char)(tmp);
		dst[1]=(char)(tmp>>8);
#endif
		dst += dst_skip;
		src++;
	}
	state->rm1 = rm1;
	state->idx = idx;
}
void sample_move_dither_shaped_d16_sS (char *dst, jack_default_audio_sample_t *src, unsigned long nsamples, unsigned long dst_skip, dither_state_t *state)
{
	jack_default_audio_sample_t x;
	jack_default_audio_sample_t xe; /* the input sample - filtered error */
	jack_default_audio_sample_t xp; /* x' */
	float r;
	float rm1 = state->rm1;
	unsigned int idx = state->idx;

	while (nsamples--) {
		x = *src * SAMPLE_16BIT_SCALING;
		r = ((float)fast_rand() + (float)fast_rand()) / (float)UINT_MAX - 1.0f;
		/* Filter the error with Lipshitz's minimally audible FIR:
		   [2.033 -2.165 1.959 -1.590 0.6149] */
		xe = x
		     - state->e[idx] * 2.033f
		     + state->e[(idx - 1) & DITHER_BUF_MASK] * 2.165f
		     - state->e[(idx - 2) & DITHER_BUF_MASK] * 1.959f
		     + state->e[(idx - 3) & DITHER_BUF_MASK] * 1.590f
		     - state->e[(idx - 4) & DITHER_BUF_MASK] * 0.6149f;

		/* Add the triangular noise to get the dithered value x' */
		xp = xe + r - rm1;
		rm1 = r;

		float_16_scaled (xp, *((int16_t*) dst));

		/* Intrinsic z^-1 delay */
		idx = (idx + 1) & DITHER_BUF_MASK;
		state->e[idx] = *((int16_t*) dst) - xe;

		dst += dst_skip;
		src++;
	}
	state->rm1 = rm1;
	state->idx = idx;
}
void sample_move_dS_s16s (jack_default_audio_sample_t *dst, char *src, unsigned long nsamples, unsigned long src_skip)
{
	short z;
	const jack_default_audio_sample_t scaling = 1.0/SAMPLE_16BIT_SCALING;
#if defined (__ARM_NEON__) || defined (__ARM_NEON)
	const float32x4_t vscaling = vdupq_n_f32(scaling);
	unsigned long unrolled = nsamples / 4;
	while (unrolled--) {
		int16x4_t source16x4 = vdup_n_s16(0);
		switch (src_skip) {
			case 2:
				source16x4 = vld1_s16((int16_t*)src);
				break;
			case 4:
				source16x4 = vld2_s16((int16_t*)src).val[0];
				break;
			default:
				source16x4 = vld1_lane_s16((int16_t*)src, source16x4, 0);
				source16x4 = vld1_lane_s16((int16_t*)(src+src_skip), source16x4, 1);
				source16x4 = vld1_lane_s16((int16_t*)(src+2*src_skip), source16x4, 2);
				source16x4 = vld1_lane_s16((int16_t*)(src+3*src_skip), source16x4, 3);
				break;
		}
		source16x4 = vreinterpret_s16_u8(vrev16_u8(vreinterpret_u8_s16(source16x4)));
		int32x4_t source32x4 = vmovl_s16(source16x4);
		src += 4 * src_skip;

		float32x4_t converted = vcvtq_f32_s32(source32x4);
		float32x4_t scaled = vmulq_f32(converted, vscaling);
		vst1q_f32(dst, scaled);
		dst += 4;
	}
	nsamples = nsamples & 3;
#endif

	/* ALERT: signed sign-extension portability !!! */
	while (nsamples--) {
#if __BYTE_ORDER == __LITTLE_ENDIAN
		z = (unsigned char)(src[0]);
		z <<= 8;
		z |= (unsigned char)(src[1]);
#elif __BYTE_ORDER == __BIG_ENDIAN
		z = (unsigned char)(src[1]);
		z <<= 8;
		z |= (unsigned char)(src[0]);
#endif
		*dst = z * scaling;
		dst++;
		src += src_skip;
	}
}
void sample_move_dS_s16 (jack_default_audio_sample_t *dst, char *src, unsigned long nsamples, unsigned long src_skip)
{
	/* ALERT: signed sign-extension portability !!! */
	const jack_default_audio_sample_t scaling = 1.0/SAMPLE_16BIT_SCALING;
#if defined (__ARM_NEON__) || defined (__ARM_NEON)
	const float32x4_t vscaling = vdupq_n_f32(scaling);
	unsigned long unrolled = nsamples / 4;
	while (unrolled--) {
		int16x4_t source16x4 = vdup_n_s16(0);
		switch (src_skip) {
			case 2:
				source16x4 = vld1_s16((int16_t*)src);
				break;
			case 4:
				source16x4 = vld2_s16((int16_t*)src).val[0];
				break;
			default:
				source16x4 = vld1_lane_s16((int16_t*)src, source16x4, 0);
				source16x4 = vld1_lane_s16((int16_t*)(src+src_skip), source16x4, 1);
				source16x4 = vld1_lane_s16((int16_t*)(src+2*src_skip), source16x4, 2);
				source16x4 = vld1_lane_s16((int16_t*)(src+3*src_skip), source16x4, 3);
				break;
		}
		int32x4_t source32x4 = vmovl_s16(source16x4);
		src += 4 * src_skip;

		float32x4_t converted = vcvtq_f32_s32(source32x4);
		float32x4_t scaled = vmulq_f32(converted, vscaling);
		vst1q_f32(dst, scaled);
		dst += 4;
	}
	nsamples = nsamples & 3;
#endif

	while (nsamples--) {
		*dst = (*((short *) src)) * scaling;
		dst++;
		src += src_skip;
	}
}
void memset_interleave (char *dst, char val, unsigned long bytes,
			unsigned long unit_bytes,
			unsigned long skip_bytes)
{
	switch (unit_bytes) {
	case 1:
		while (bytes--) {
			*dst = val;
			dst += skip_bytes;
		}
		break;
	case 2:
		while (bytes) {
			*((short *) dst) = (short) val;
			dst += skip_bytes;
			bytes -= 2;
		}
		break;
	case 4:
		while (bytes) {
			*((int *) dst) = (int) val;
			dst += skip_bytes;
			bytes -= 4;
		}
		break;
	default:
		while (bytes) {
			memset(dst, val, unit_bytes);
			dst += skip_bytes;
			bytes -= unit_bytes;
		}
		break;
	}
}
/* COPY FUNCTIONS: used to move data from an input channel to an
   output channel. Note that we assume that the skip distance
   is the same for both channels. This is completely fine
   unless the input and output were on different audio interfaces that
   were interleaved differently. We don't try to handle that.
*/
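
/* Illustrative sketch (not part of the original code): copying one
   channel of 16 bit frames between two interleaved stereo buffers,
   i.e. a skip of two int16_t per frame. The helper name and frame
   count are made up, and the prototype is assumed to be visible via
   memops.h. */
static inline void example_copy_channel (char *out, char *in, unsigned long nframes)
{
	/* src_bytes counts the sample data to move (2 bytes per frame);
	   both skips are 4 bytes because the buffers are interleaved stereo */
	memcpy_interleave_d16_s16 (out, in, nframes * sizeof (int16_t),
				   2 * sizeof (int16_t), 2 * sizeof (int16_t));
}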
void
memcpy_fake (char *dst, char *src, unsigned long src_bytes, unsigned long foo, unsigned long bar)
{
	memcpy (dst, src, src_bytes);
}
void
memcpy_interleave_d16_s16 (char *dst, char *src, unsigned long src_bytes,
			   unsigned long dst_skip_bytes, unsigned long src_skip_bytes)
{
	while (src_bytes) {
		*((short *) dst) = *((short *) src);
		dst += dst_skip_bytes;
		src += src_skip_bytes;
		src_bytes -= 2;
	}
}
void
memcpy_interleave_d24_s24 (char *dst, char *src, unsigned long src_bytes,
			   unsigned long dst_skip_bytes, unsigned long src_skip_bytes)
{
	while (src_bytes) {
		memcpy(dst, src, 3);
		dst += dst_skip_bytes;
		src += src_skip_bytes;
		src_bytes -= 3;
	}
}
void
memcpy_interleave_d32_s32 (char *dst, char *src, unsigned long src_bytes,
			   unsigned long dst_skip_bytes, unsigned long src_skip_bytes)
{
	while (src_bytes) {
		*((int *) dst) = *((int *) src);
		dst += dst_skip_bytes;
		src += src_skip_bytes;
		src_bytes -= 4;
	}
}