/* common/memops.c (jackdbus) */
/*
    Copyright (C) 2000 Paul Davis

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program; if not, write to the Free Software
    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/

#define _ISOC9X_SOURCE 1
#define _ISOC99_SOURCE 1

#define __USE_ISOC9X 1
#define __USE_ISOC99 1

#include <stdio.h>
#include <string.h>
#include <math.h>
#include <memory.h>
#include <stdlib.h>
#include <stdint.h>
#include <limits.h>
#ifdef __linux__
#include <endian.h>
#endif
#include "memops.h"

#if defined (__SSE2__) && !defined (__sun__)
#include <emmintrin.h>
#ifdef __SSE4_1__
#include <smmintrin.h>
#endif
#endif

#if defined (__ARM_NEON__) || defined (__ARM_NEON)
#include <arm_neon.h>
#endif

/* Notes about these *_SCALING values.

   the MAX_<N>BIT values are floating point. when multiplied by
   a full-scale normalized floating point sample value (-1.0..+1.0)
   they should give the maximum value representable with an integer
   sample type of N bits. Note that this is asymmetric. Sample ranges
   for signed integer, 2's complement values are -(2^(N-1)) to +(2^(N-1)-1)

   Complications
   -------------
   If we use +2^(N-1) for the scaling factors, we run into a problem:

   if we start with a normalized float value of -1.0, scaling
   to 24 bits would give -8388608 (-2^23), which is ideal.
   But with +1.0, we get +8388608, which is technically out of range.

   We never multiply a full range normalized value by this constant,
   but we could multiply it by a positive value that is close enough to +1.0
   to produce a value > +(2^(N-1)-1).

   There is no way around this paradox without wasting CPU cycles to determine
   which scaling factor to use (i.e. determine if it's negative or not,
   use the right factor).

   So, for now (October 2008) we use 2^(N-1)-1 as the scaling factor.
*/

#define SAMPLE_32BIT_SCALING 2147483647.0
#define SAMPLE_24BIT_SCALING 8388607.0f
#define SAMPLE_16BIT_SCALING 32767.0f

/* these are just values to use if the floating point value was out of range

   advice from Fons Adriaensen: make the limits symmetrical
 */

#define SAMPLE_32BIT_MAX 2147483647
#define SAMPLE_32BIT_MIN -2147483647
#define SAMPLE_32BIT_MAX_D 2147483647.0
#define SAMPLE_32BIT_MIN_D -2147483647.0

#define SAMPLE_24BIT_MAX 8388607
#define SAMPLE_24BIT_MIN -8388607
#define SAMPLE_24BIT_MAX_F 8388607.0f
#define SAMPLE_24BIT_MIN_F -8388607.0f

#define SAMPLE_16BIT_MAX 32767
#define SAMPLE_16BIT_MIN -32767
#define SAMPLE_16BIT_MAX_F 32767.0f
#define SAMPLE_16BIT_MIN_F -32767.0f

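/* Worked example of the scaling choice described above: with the symmetric
   factor 2^23 = 8388608.0f, -1.0f would map to -8388608 (representable) but
   +1.0f would map to +8388608, one past the largest 24-bit value. With the
   factor actually used here, 2^23 - 1:

       -1.0f * 8388607.0f = -8388607   (SAMPLE_24BIT_MIN)
       +1.0f * 8388607.0f = +8388607   (SAMPLE_24BIT_MAX)

   so no clipped input in [-1.0, +1.0] can overflow the integer type. */
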
/* these mark the outer edges of the range considered "within" range
   for a floating point sample value. values outside (and on the boundaries)
   of this range will be clipped before conversion; values within this
   range will be scaled to appropriate values for the target sample
   type.
*/

#define NORMALIZED_FLOAT_MIN -1.0f
#define NORMALIZED_FLOAT_MAX  1.0f

/* define this in case we end up on a platform that is missing
   the real lrintf functions
*/

#define f_round(f) lrintf(f)
#define d_round(f) lrint(f)

#define float_16(s, d)\
    if ((s) <= NORMALIZED_FLOAT_MIN) {\
        (d) = SAMPLE_16BIT_MIN;\
    } else if ((s) >= NORMALIZED_FLOAT_MAX) {\
        (d) = SAMPLE_16BIT_MAX;\
    } else {\
        (d) = f_round ((s) * SAMPLE_16BIT_SCALING);\
    }

/* call this when "s" has already been scaled (e.g. when dithering)
 */

#define float_16_scaled(s, d)\
    if ((s) <= SAMPLE_16BIT_MIN_F) {\
        (d) = SAMPLE_16BIT_MIN;\
    } else if ((s) >= SAMPLE_16BIT_MAX_F) { \
        (d) = SAMPLE_16BIT_MAX;\
    } else {\
        (d) = f_round ((s));\
    }

#define float_24u32(s, d) \
    if ((s) <= NORMALIZED_FLOAT_MIN) {\
        (d) = SAMPLE_24BIT_MIN << 8;\
    } else if ((s) >= NORMALIZED_FLOAT_MAX) {\
        (d) = SAMPLE_24BIT_MAX << 8;\
    } else {\
        (d) = f_round ((s) * SAMPLE_24BIT_SCALING) << 8;\
    }

#define float_24l32(s, d) \
    if ((s) <= NORMALIZED_FLOAT_MIN) {\
        (d) = SAMPLE_24BIT_MIN; \
    } else if ((s) >= NORMALIZED_FLOAT_MAX) {\
        (d) = SAMPLE_24BIT_MAX; \
    } else {\
        (d) = f_round ((s) * SAMPLE_24BIT_SCALING); \
    }

#define float_32(s, d) \
    do { \
        double clipped = fmin(NORMALIZED_FLOAT_MAX, \
                              fmax((double)(s), NORMALIZED_FLOAT_MIN)); \
        double scaled = clipped * SAMPLE_32BIT_MAX_D; \
        (d) = d_round(scaled); \
    } \
    while (0)

/* call this when "s" has already been scaled (e.g. when dithering)
 */

#define float_24u32_scaled(s, d)\
    if ((s) <= SAMPLE_24BIT_MIN_F) {\
        (d) = SAMPLE_24BIT_MIN << 8;\
    } else if ((s) >= SAMPLE_24BIT_MAX_F) { \
        (d) = SAMPLE_24BIT_MAX << 8; \
    } else {\
        (d) = f_round ((s)) << 8; \
    }

#define float_24(s, d) \
    if ((s) <= NORMALIZED_FLOAT_MIN) {\
        (d) = SAMPLE_24BIT_MIN;\
    } else if ((s) >= NORMALIZED_FLOAT_MAX) {\
        (d) = SAMPLE_24BIT_MAX;\
    } else {\
        (d) = f_round ((s) * SAMPLE_24BIT_SCALING);\
    }

/* call this when "s" has already been scaled (e.g. when dithering)
 */

#define float_24_scaled(s, d)\
    if ((s) <= SAMPLE_24BIT_MIN_F) {\
        (d) = SAMPLE_24BIT_MIN;\
    } else if ((s) >= SAMPLE_24BIT_MAX_F) { \
        (d) = SAMPLE_24BIT_MAX; \
    } else {\
        (d) = f_round ((s)); \
    }

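/* A minimal usage sketch of the clip-and-scale macros above (illustrative
   values only, using the constants defined earlier in this file):

       int16_t d;
       float_16 (-1.0f, d);    =>  d == SAMPLE_16BIT_MIN   (clipped at the boundary)
       float_16 (+1.5f, d);    =>  d == SAMPLE_16BIT_MAX   (clipped)
       float_16 ( 0.0f, d);    =>  d == 0                  (scaled and rounded)
*/
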
#if defined (__SSE2__) && !defined (__sun__)

/* generates same as _mm_set_ps(1.f, 1.f, 1.f, 1.f) but faster */
static inline __m128 gen_one(void)
{
    volatile __m128i x = { 0 }; /* shut up, GCC */
    __m128i ones = _mm_cmpeq_epi32(x, x);
    /* all-ones >> 25 << 23 leaves 0x3f800000 (= 1.0f) in every lane */
    return (__m128)_mm_slli_epi32 (_mm_srli_epi32(ones, 25), 23);
}

static inline __m128 clip(__m128 s, __m128 min, __m128 max)
{
    return _mm_min_ps(max, _mm_max_ps(s, min));
}

static inline __m128d clip_double(__m128d s, __m128d min, __m128d max)
{
    return _mm_min_pd(max, _mm_max_pd(s, min));
}

static inline __m128i float_24_sse(__m128 s)
{
    const __m128 upper_bound = gen_one(); /* NORMALIZED_FLOAT_MAX */
    const __m128 lower_bound = _mm_sub_ps(_mm_setzero_ps(), upper_bound);

    __m128 clipped = clip(s, lower_bound, upper_bound);
    __m128 scaled = _mm_mul_ps(clipped, _mm_set1_ps(SAMPLE_24BIT_SCALING));
    return _mm_cvtps_epi32(scaled);
}
#endif

#if defined (__ARM_NEON__) || defined (__ARM_NEON)

static inline float32x4_t clip(float32x4_t s, float32x4_t min, float32x4_t max)
{
    return vminq_f32(max, vmaxq_f32(s, min));
}

static inline int32x4_t float_24_neon(float32x4_t s)
{
    const float32x4_t upper_bound = vdupq_n_f32(NORMALIZED_FLOAT_MAX);
    const float32x4_t lower_bound = vdupq_n_f32(NORMALIZED_FLOAT_MIN);

    float32x4_t clipped = clip(s, lower_bound, upper_bound);
    float32x4_t scaled = vmulq_f32(clipped, vdupq_n_f32(SAMPLE_24BIT_SCALING));
    return vcvtq_s32_f32(scaled);
}

static inline int16x4_t float_16_neon(float32x4_t s)
{
    const float32x4_t upper_bound = vdupq_n_f32(NORMALIZED_FLOAT_MAX);
    const float32x4_t lower_bound = vdupq_n_f32(NORMALIZED_FLOAT_MIN);

    float32x4_t clipped = clip(s, lower_bound, upper_bound);
    float32x4_t scaled = vmulq_f32(clipped, vdupq_n_f32(SAMPLE_16BIT_SCALING));
    return vmovn_s32(vcvtq_s32_f32(scaled));
}
#endif

/* Linear Congruential noise generator. From the music-dsp list
 * less random than rand(), but good enough and 10x faster
 */
static unsigned int seed = 22222;

static inline unsigned int fast_rand() {
    seed = (seed * 196314165) + 907633515;
    return seed;
}

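/* How the generator is used below: fast_rand() / (float) UINT_MAX is an
   approximately uniform draw in [0, 1]. The rectangular-dither converters add
   one draw minus 0.5 (about +/- 0.5 LSB); the triangular and noise-shaped
   converters add the sum of two draws minus 1.0 (about +/- 1 LSB, triangular
   PDF). */
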
/* functions for native float sample data */

void sample_move_floatLE_sSs (jack_default_audio_sample_t *dst, char *src, unsigned long nsamples, unsigned long src_skip) {
    while (nsamples--) {
        *dst = *((float *) src);
        dst++;
        src += src_skip;
    }
}

void sample_move_dS_floatLE (char *dst, jack_default_audio_sample_t *src, unsigned long nsamples, unsigned long dst_skip, dither_state_t *state) {
    while (nsamples--) {
        *((float *) dst) = *src;
        dst += dst_skip;
        src++;
    }
}

/* NOTES on function naming:

   foo_bar_d<TYPE>_s<TYPE>

   the "d<TYPE>" component defines the destination type for the operation
   the "s<TYPE>" component defines the source type for the operation

   TYPE can be one of:

   S      - sample is a jack_default_audio_sample_t, currently (October 2008) a 32 bit floating point value
   Ss     - like S but reverse endian from the host CPU
   32     - sample is a signed 32 bit integer value
   32u24  - sample is a signed 32 bit integer value, but data is in upper 24 bits only
   32u24s - like 32u24 but reverse endian from the host CPU
   32l24  - sample is a signed 32 bit integer value, but data is in lower 24 bits only
   32l24s - like 32l24 but reverse endian from the host CPU
   24     - sample is a signed 24 bit integer value
   24s    - like 24 but reverse endian from the host CPU
   16     - sample is a signed 16 bit integer value
   16s    - like 16 but reverse endian from the host CPU

   For obvious reasons, the reverse endian versions only show as source types.

   This covers all known sample formats at 16 bits or larger.
*/

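/* A usage sketch under the naming scheme above (hypothetical backend code,
   not part of this file's API): delivering two float port buffers into an
   interleaved, native-endian 16-bit buffer. The names "interleaved", "left"
   and "right" are illustrative; dst_skip is the frame size in bytes
   (2 channels * 2 bytes), and the dither state is unused by this converter. */
#if 0
static void example_deliver_stereo_s16 (int16_t *interleaved,
                                        jack_default_audio_sample_t *left,
                                        jack_default_audio_sample_t *right,
                                        unsigned long nframes)
{
    sample_move_d16_sS ((char *) interleaved,       left,  nframes, 4, NULL);
    sample_move_d16_sS ((char *) (interleaved + 1), right, nframes, 4, NULL);
}
#endif
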
/* functions for native integer sample data */

void sample_move_d32_sSs (char *dst, jack_default_audio_sample_t *src, unsigned long nsamples, unsigned long dst_skip, dither_state_t *state)
{
    while (nsamples--) {
        int32_t z;
        float_32(*src, z);
#if __BYTE_ORDER == __LITTLE_ENDIAN
        dst[0]=(char)(z>>24);
        dst[1]=(char)(z>>16);
        dst[2]=(char)(z>>8);
        dst[3]=(char)(z);
#elif __BYTE_ORDER == __BIG_ENDIAN
        dst[0]=(char)(z);
        dst[1]=(char)(z>>8);
        dst[2]=(char)(z>>16);
        dst[3]=(char)(z>>24);
#endif
        dst += dst_skip;
        src++;
    }
}

void sample_move_d32_sS (char *dst, jack_default_audio_sample_t *src, unsigned long nsamples, unsigned long dst_skip, dither_state_t *state)
{
    while (nsamples--) {
        float_32(*src, *(int32_t *)dst);
        dst += dst_skip;
        src++;
    }
}

void sample_move_d32u24_sSs (char *dst, jack_default_audio_sample_t *src, unsigned long nsamples, unsigned long dst_skip, dither_state_t *state)
{
#if defined (__ARM_NEON__) || defined (__ARM_NEON)
    unsigned long unrolled = nsamples / 4;
    nsamples = nsamples & 3;

    while (unrolled--) {
        float32x4_t samples = vld1q_f32(src);
        int32x4_t converted = float_24_neon(samples);
        int32x4_t shifted = vshlq_n_s32(converted, 8);
        shifted = vreinterpretq_s32_u8(vrev32q_u8(vreinterpretq_u8_s32(shifted)));

        switch(dst_skip) {
            case 4:
                vst1q_s32((int32_t*)dst, shifted);
                break;
            default:
                vst1q_lane_s32((int32_t*)(dst), shifted, 0);
                vst1q_lane_s32((int32_t*)(dst+dst_skip), shifted, 1);
                vst1q_lane_s32((int32_t*)(dst+2*dst_skip), shifted, 2);
                vst1q_lane_s32((int32_t*)(dst+3*dst_skip), shifted, 3);
                break;
        }
        dst += 4*dst_skip;
        src+= 4;
    }
#endif

    int32_t z;

    while (nsamples--) {
        float_24u32 (*src, z);
#if __BYTE_ORDER == __LITTLE_ENDIAN
        dst[0]=(char)(z>>24);
        dst[1]=(char)(z>>16);
        dst[2]=(char)(z>>8);
        dst[3]=(char)(z);
#elif __BYTE_ORDER == __BIG_ENDIAN
        dst[0]=(char)(z);
        dst[1]=(char)(z>>8);
        dst[2]=(char)(z>>16);
        dst[3]=(char)(z>>24);
#endif
        dst += dst_skip;
        src++;
    }
}

void sample_move_d32u24_sS (char *dst, jack_default_audio_sample_t *src, unsigned long nsamples, unsigned long dst_skip, dither_state_t *state)
{
#if defined (__SSE2__) && !defined (__sun__)
    __m128 int_max = _mm_set1_ps(SAMPLE_24BIT_MAX_F);
    __m128 int_min = _mm_sub_ps(_mm_setzero_ps(), int_max);
    __m128 factor = int_max;

    unsigned long unrolled = nsamples / 4;
    nsamples = nsamples & 3;

    while (unrolled--) {
        __m128 in = _mm_load_ps(src);
        __m128 scaled = _mm_mul_ps(in, factor);
        __m128 clipped = clip(scaled, int_min, int_max);

        __m128i y = _mm_cvttps_epi32(clipped);
        __m128i shifted = _mm_slli_epi32(y, 8);

#ifdef __SSE4_1__
        *(int32_t*)dst = _mm_extract_epi32(shifted, 0);
        *(int32_t*)(dst+dst_skip) = _mm_extract_epi32(shifted, 1);
        *(int32_t*)(dst+2*dst_skip) = _mm_extract_epi32(shifted, 2);
        *(int32_t*)(dst+3*dst_skip) = _mm_extract_epi32(shifted, 3);
#else
        __m128i shuffled1 = _mm_shuffle_epi32(shifted, _MM_SHUFFLE(0, 3, 2, 1));
        __m128i shuffled2 = _mm_shuffle_epi32(shifted, _MM_SHUFFLE(1, 0, 3, 2));
        __m128i shuffled3 = _mm_shuffle_epi32(shifted, _MM_SHUFFLE(2, 1, 0, 3));

        _mm_store_ss((float*)dst, (__m128)shifted);

        _mm_store_ss((float*)(dst+dst_skip), (__m128)shuffled1);
        _mm_store_ss((float*)(dst+2*dst_skip), (__m128)shuffled2);
        _mm_store_ss((float*)(dst+3*dst_skip), (__m128)shuffled3);
#endif
        dst += 4*dst_skip;

        src+= 4;
    }

    while (nsamples--) {
        __m128 in = _mm_load_ss(src);
        __m128 scaled = _mm_mul_ss(in, factor);
        __m128 clipped = _mm_min_ss(int_max, _mm_max_ss(scaled, int_min));

        int y = _mm_cvttss_si32(clipped);
        *((int *) dst) = y<<8;

        dst += dst_skip;
        src++;
    }
#elif defined (__ARM_NEON__) || defined (__ARM_NEON)
    unsigned long unrolled = nsamples / 4;
    nsamples = nsamples & 3;

    while (unrolled--) {
        float32x4_t samples = vld1q_f32(src);
        int32x4_t converted = float_24_neon(samples);
        int32x4_t shifted = vshlq_n_s32(converted, 8);

        switch(dst_skip) {
            case 4:
                vst1q_s32((int32_t*)dst, shifted);
                break;
            default:
                vst1q_lane_s32((int32_t*)(dst), shifted, 0);
                vst1q_lane_s32((int32_t*)(dst+dst_skip), shifted, 1);
                vst1q_lane_s32((int32_t*)(dst+2*dst_skip), shifted, 2);
                vst1q_lane_s32((int32_t*)(dst+3*dst_skip), shifted, 3);
                break;
        }
        dst += 4*dst_skip;

        src+= 4;
    }
#endif

#if !defined (__SSE2__)
    while (nsamples--) {
        float_24u32 (*src, *((int32_t*) dst));
        dst += dst_skip;
        src++;
    }
#endif
}

void sample_move_dS_s32u24s (jack_default_audio_sample_t *dst, char *src, unsigned long nsamples, unsigned long src_skip)
{
#if defined (__ARM_NEON__) || defined (__ARM_NEON)
    float32x4_t factor = vdupq_n_f32(1.0 / SAMPLE_24BIT_SCALING);
    unsigned long unrolled = nsamples / 4;
    while (unrolled--) {
        int32x4_t src128;
        switch(src_skip)
        {
            case 4:
                src128 = vld1q_s32((int32_t*)src);
                break;
            case 8:
                src128 = vld2q_s32((int32_t*)src).val[0];
                break;
            default:
                src128 = vld1q_lane_s32((int32_t*)src, src128, 0);
                src128 = vld1q_lane_s32((int32_t*)(src+src_skip), src128, 1);
                src128 = vld1q_lane_s32((int32_t*)(src+2*src_skip), src128, 2);
                src128 = vld1q_lane_s32((int32_t*)(src+3*src_skip), src128, 3);
                break;
        }
        src128 = vreinterpretq_s32_u8(vrev32q_u8(vreinterpretq_u8_s32(src128)));
        int32x4_t shifted = vshrq_n_s32(src128, 8);
        float32x4_t as_float = vcvtq_f32_s32(shifted);
        float32x4_t divided = vmulq_f32(as_float, factor);
        vst1q_f32(dst, divided);

        src += 4*src_skip;
        dst += 4;
    }
    nsamples = nsamples & 3;
#endif

    /* ALERT: signed sign-extension portability !!! */

    const jack_default_audio_sample_t scaling = 1.0/SAMPLE_24BIT_SCALING;

    while (nsamples--) {
        int x;
#if __BYTE_ORDER == __LITTLE_ENDIAN
        x = (unsigned char)(src[0]);
        x <<= 8;
        x |= (unsigned char)(src[1]);
        x <<= 8;
        x |= (unsigned char)(src[2]);
        x <<= 8;
        x |= (unsigned char)(src[3]);
#elif __BYTE_ORDER == __BIG_ENDIAN
        x = (unsigned char)(src[3]);
        x <<= 8;
        x |= (unsigned char)(src[2]);
        x <<= 8;
        x |= (unsigned char)(src[1]);
        x <<= 8;
        x |= (unsigned char)(src[0]);
#endif
        *dst = (x >> 8) * scaling;
        dst++;
        src += src_skip;
    }
}

void sample_move_dS_s32u24 (jack_default_audio_sample_t *dst, char *src, unsigned long nsamples, unsigned long src_skip)
{
#if defined (__SSE2__) && !defined (__sun__)
    unsigned long unrolled = nsamples / 4;
    static float inv_sample_max_24bit = 1.0 / SAMPLE_24BIT_SCALING;
    __m128 factor = _mm_set1_ps(inv_sample_max_24bit);
    while (unrolled--)
    {
        int i1 = *((int *) src);
        src+= src_skip;
        int i2 = *((int *) src);
        src+= src_skip;
        int i3 = *((int *) src);
        src+= src_skip;
        int i4 = *((int *) src);
        src+= src_skip;

        __m128i src = _mm_set_epi32(i4, i3, i2, i1);
        __m128i shifted = _mm_srai_epi32(src, 8);

        __m128 as_float = _mm_cvtepi32_ps(shifted);
        __m128 divided = _mm_mul_ps(as_float, factor);

        _mm_storeu_ps(dst, divided);

        dst += 4;
    }
    nsamples = nsamples & 3;
#elif defined (__ARM_NEON__) || defined (__ARM_NEON)
    unsigned long unrolled = nsamples / 4;
    float32x4_t factor = vdupq_n_f32(1.0 / SAMPLE_24BIT_SCALING);
    while (unrolled--) {
        int32x4_t src128;
        switch(src_skip) {
            case 4:
                src128 = vld1q_s32((int32_t*)src);
                break;
            case 8:
                src128 = vld2q_s32((int32_t*)src).val[0];
                break;
            default:
                src128 = vld1q_lane_s32((int32_t*)src, src128, 0);
                src128 = vld1q_lane_s32((int32_t*)(src+src_skip), src128, 1);
                src128 = vld1q_lane_s32((int32_t*)(src+2*src_skip), src128, 2);
                src128 = vld1q_lane_s32((int32_t*)(src+3*src_skip), src128, 3);
                break;
        }
        int32x4_t shifted = vshrq_n_s32(src128, 8);
        float32x4_t as_float = vcvtq_f32_s32(shifted);
        float32x4_t divided = vmulq_f32(as_float, factor);
        vst1q_f32(dst, divided);

        src += 4*src_skip;
        dst += 4;
    }
    nsamples = nsamples & 3;
#endif

    /* ALERT: signed sign-extension portability !!! */

    const jack_default_audio_sample_t scaling = 1.0/SAMPLE_24BIT_SCALING;
    while (nsamples--) {
        *dst = (*((int *) src) >> 8) * scaling;
        dst++;
        src += src_skip;
    }
}

void sample_move_d32l24_sSs (char *dst, jack_default_audio_sample_t *src, unsigned long nsamples, unsigned long dst_skip, dither_state_t *state)
{
#if defined (__ARM_NEON__) || defined (__ARM_NEON)
    unsigned long unrolled = nsamples / 4;
    nsamples = nsamples & 3;

    while (unrolled--) {
        float32x4_t samples = vld1q_f32(src);
        int32x4_t converted = float_24_neon(samples);
        converted = vreinterpretq_s32_u8(vrev32q_u8(vreinterpretq_u8_s32(converted)));

        switch(dst_skip) {
            case 4:
                vst1q_s32((int32_t*)dst, converted);
                break;
            default:
                vst1q_lane_s32((int32_t*)(dst), converted, 0);
                vst1q_lane_s32((int32_t*)(dst+dst_skip), converted, 1);
                vst1q_lane_s32((int32_t*)(dst+2*dst_skip), converted, 2);
                vst1q_lane_s32((int32_t*)(dst+3*dst_skip), converted, 3);
                break;
        }
        dst += 4*dst_skip;
        src+= 4;
    }
#endif

    int32_t z;

    while (nsamples--) {
        float_24l32 (*src, z);
#if __BYTE_ORDER == __LITTLE_ENDIAN
        dst[0]=(char)(z>>24);
        dst[1]=(char)(z>>16);
        dst[2]=(char)(z>>8);
        dst[3]=(char)(z);
#elif __BYTE_ORDER == __BIG_ENDIAN
        dst[0]=(char)(z);
        dst[1]=(char)(z>>8);
        dst[2]=(char)(z>>16);
        dst[3]=(char)(z>>24);
#endif
        dst += dst_skip;
        src++;
    }
}

void sample_move_d32l24_sS (char *dst, jack_default_audio_sample_t *src, unsigned long nsamples, unsigned long dst_skip, dither_state_t *state)
{
#if defined (__SSE2__) && !defined (__sun__)
    __m128 int_max = _mm_set1_ps(SAMPLE_24BIT_MAX_F);
    __m128 int_min = _mm_sub_ps(_mm_setzero_ps(), int_max);
    __m128 factor = int_max;

    unsigned long unrolled = nsamples / 4;
    nsamples = nsamples & 3;

    while (unrolled--) {
        __m128 in = _mm_load_ps(src);
        __m128 scaled = _mm_mul_ps(in, factor);
        __m128 clipped = clip(scaled, int_min, int_max);

        __m128i shifted = _mm_cvttps_epi32(clipped);

#ifdef __SSE4_1__
        *(int32_t*)dst = _mm_extract_epi32(shifted, 0);
        *(int32_t*)(dst+dst_skip) = _mm_extract_epi32(shifted, 1);
        *(int32_t*)(dst+2*dst_skip) = _mm_extract_epi32(shifted, 2);
        *(int32_t*)(dst+3*dst_skip) = _mm_extract_epi32(shifted, 3);
#else
        __m128i shuffled1 = _mm_shuffle_epi32(shifted, _MM_SHUFFLE(0, 3, 2, 1));
        __m128i shuffled2 = _mm_shuffle_epi32(shifted, _MM_SHUFFLE(1, 0, 3, 2));
        __m128i shuffled3 = _mm_shuffle_epi32(shifted, _MM_SHUFFLE(2, 1, 0, 3));

        _mm_store_ss((float*)dst, (__m128)shifted);

        _mm_store_ss((float*)(dst+dst_skip), (__m128)shuffled1);
        _mm_store_ss((float*)(dst+2*dst_skip), (__m128)shuffled2);
        _mm_store_ss((float*)(dst+3*dst_skip), (__m128)shuffled3);
#endif
        dst += 4*dst_skip;

        src+= 4;
    }

    while (nsamples--) {
        __m128 in = _mm_load_ss(src);
        __m128 scaled = _mm_mul_ss(in, factor);
        __m128 clipped = _mm_min_ss(int_max, _mm_max_ss(scaled, int_min));

        int y = _mm_cvttss_si32(clipped);
        /* data stays in the lower 24 bits, so no shift here (unlike the u24 variant) */
        *((int *) dst) = y;

        dst += dst_skip;
        src++;
    }
#elif defined (__ARM_NEON__) || defined (__ARM_NEON)
    unsigned long unrolled = nsamples / 4;
    nsamples = nsamples & 3;

    while (unrolled--) {
        float32x4_t samples = vld1q_f32(src);
        int32x4_t converted = float_24_neon(samples);

        switch(dst_skip) {
            case 4:
                vst1q_s32((int32_t*)dst, converted);
                break;
            default:
                vst1q_lane_s32((int32_t*)(dst), converted, 0);
                vst1q_lane_s32((int32_t*)(dst+dst_skip), converted, 1);
                vst1q_lane_s32((int32_t*)(dst+2*dst_skip), converted, 2);
                vst1q_lane_s32((int32_t*)(dst+3*dst_skip), converted, 3);
                break;
        }
        dst += 4*dst_skip;

        src+= 4;
    }
#endif

#if !defined (__SSE2__)
    while (nsamples--) {
        float_24l32 (*src, *((int32_t*) dst));
        dst += dst_skip;
        src++;
    }
#endif
}

void sample_move_dS_s32s (jack_default_audio_sample_t *dst, char *src, unsigned long nsamples, unsigned long src_skip)
{
    const jack_default_audio_sample_t scaling = 1.0/SAMPLE_32BIT_SCALING;
    while (nsamples--) {
        int32_t x;
#if __BYTE_ORDER == __LITTLE_ENDIAN
        x = (unsigned char)(src[0]);
        x <<= 8;
        x |= (unsigned char)(src[1]);
        x <<= 8;
        x |= (unsigned char)(src[2]);
        x <<= 8;
        x |= (unsigned char)(src[3]);
#elif __BYTE_ORDER == __BIG_ENDIAN
        x = (unsigned char)(src[3]);
        x <<= 8;
        x |= (unsigned char)(src[2]);
        x <<= 8;
        x |= (unsigned char)(src[1]);
        x <<= 8;
        x |= (unsigned char)(src[0]);
#endif
        double extended = x * scaling;
        *dst = (float)extended;
        dst++;
        src += src_skip;
    }
}

void sample_move_dS_s32l24s (jack_default_audio_sample_t *dst, char *src, unsigned long nsamples, unsigned long src_skip)
{
#if defined (__ARM_NEON__) || defined (__ARM_NEON)
    float32x4_t factor = vdupq_n_f32(1.0 / SAMPLE_24BIT_SCALING);
    unsigned long unrolled = nsamples / 4;
    while (unrolled--) {
        uint32x4_t src128;
        switch(src_skip)
        {
            case 4:
                src128 = vld1q_u32((uint32_t*)src);
                break;
            case 8:
                src128 = vld2q_u32((uint32_t*)src).val[0];
                break;
            default:
                src128 = vld1q_lane_u32((uint32_t*)src, src128, 0);
                src128 = vld1q_lane_u32((uint32_t*)(src+src_skip), src128, 1);
                src128 = vld1q_lane_u32((uint32_t*)(src+2*src_skip), src128, 2);
                src128 = vld1q_lane_u32((uint32_t*)(src+3*src_skip), src128, 3);
                break;
        }
        src128 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(src128)));
        uint32x4_t toupper = vshlq_n_u32(src128, 8);
        int32x4_t shifted = vshrq_n_s32((int32x4_t)toupper, 8);
        float32x4_t as_float = vcvtq_f32_s32(shifted);
        float32x4_t divided = vmulq_f32(as_float, factor);
        vst1q_f32(dst, divided);

        src += 4*src_skip;
        dst += 4;
    }
    nsamples = nsamples & 3;
#endif

    /* ALERT: signed sign-extension portability !!! */

    const jack_default_audio_sample_t scaling = 1.0/SAMPLE_24BIT_SCALING;

    while (nsamples--) {
        int32_t x;
#if __BYTE_ORDER == __LITTLE_ENDIAN
        x = (unsigned char)(src[0]);
        x <<= 8;
        x |= (unsigned char)(src[1]);
        x <<= 8;
        x |= (unsigned char)(src[2]);
        x <<= 8;
        x |= (unsigned char)(src[3]);
#elif __BYTE_ORDER == __BIG_ENDIAN
        x = (unsigned char)(src[3]);
        x <<= 8;
        x |= (unsigned char)(src[2]);
        x <<= 8;
        x |= (unsigned char)(src[1]);
        x <<= 8;
        x |= (unsigned char)(src[0]);
#endif
        *dst = (x >> 0) * scaling;
        dst++;
        src += src_skip;
    }
}

void sample_move_dS_s32 (jack_default_audio_sample_t *dst, char *src, unsigned long nsamples, unsigned long src_skip)
{
    const double scaling = 1.0 / SAMPLE_32BIT_SCALING;
    while (nsamples--) {
        int32_t val=(*((int32_t*)src));
        double extended = val * scaling;
        *dst = (float)extended;
        dst++;
        src += src_skip;
    }
}

void sample_move_dS_s32l24 (jack_default_audio_sample_t *dst, char *src, unsigned long nsamples, unsigned long src_skip)
{
#if defined (__SSE2__) && !defined (__sun__)
    unsigned long unrolled = nsamples / 4;
    static float inv_sample_max_24bit = 1.0 / SAMPLE_24BIT_SCALING;
    __m128 factor = _mm_set1_ps(inv_sample_max_24bit);
    while (unrolled--)
    {
        int i1 = *((int *) src);
        src+= src_skip;
        int i2 = *((int *) src);
        src+= src_skip;
        int i3 = *((int *) src);
        src+= src_skip;
        int i4 = *((int *) src);
        src+= src_skip;

        __m128i shifted = _mm_set_epi32(i4, i3, i2, i1);

        __m128 as_float = _mm_cvtepi32_ps(shifted);
        __m128 divided = _mm_mul_ps(as_float, factor);

        _mm_storeu_ps(dst, divided);

        dst += 4;
    }
    nsamples = nsamples & 3;
#elif defined (__ARM_NEON__) || defined (__ARM_NEON)
    unsigned long unrolled = nsamples / 4;
    float32x4_t factor = vdupq_n_f32(1.0 / SAMPLE_24BIT_SCALING);
    while (unrolled--) {
        uint32x4_t src128;
        switch(src_skip) {
            case 4:
                src128 = vld1q_u32((uint32_t*)src);
                break;
            case 8:
                src128 = vld2q_u32((uint32_t*)src).val[0];
                break;
            default:
                src128 = vld1q_lane_u32((uint32_t*)src, src128, 0);
                src128 = vld1q_lane_u32((uint32_t*)(src+src_skip), src128, 1);
                src128 = vld1q_lane_u32((uint32_t*)(src+2*src_skip), src128, 2);
                src128 = vld1q_lane_u32((uint32_t*)(src+3*src_skip), src128, 3);
                break;
        }
        // Sign extension by moving to upper as unsigned, then down
        uint32x4_t toupper = vshlq_n_u32(src128, 8);
        int32x4_t shifted = vshrq_n_s32((int32x4_t)toupper, 8);
        float32x4_t as_float = vcvtq_f32_s32(shifted);
        float32x4_t divided = vmulq_f32(as_float, factor);
        vst1q_f32(dst, divided);

        src += 4*src_skip;
        dst += 4;
    }
    nsamples = nsamples & 3;
#endif

    /* ALERT: signed sign-extension portability !!! */

    const jack_default_audio_sample_t scaling = 1.0/SAMPLE_24BIT_SCALING;
    while (nsamples--) {
        uint32_t val=(*((uint32_t*)src));
        if (val & 0x800000u) val|=0xFF000000u;
        *dst = (*((int32_t *) &val)) * scaling;
        dst++;
        src += src_skip;
    }
}

void sample_move_d24_sSs (char *dst, jack_default_audio_sample_t *src, unsigned long nsamples, unsigned long dst_skip, dither_state_t *state)
{
#if defined (__ARM_NEON__) || defined (__ARM_NEON)
    unsigned long unrolled = nsamples / 4;
    while (unrolled--) {
        int i;
        int32_t z[4];
        float32x4_t samples = vld1q_f32(src);
        int32x4_t converted = float_24_neon(samples);
        converted = vreinterpretq_s32_u8(vrev32q_u8(vreinterpretq_u8_s32(converted)));
        vst1q_s32(z, converted);

        for (i = 0; i != 4; ++i) {
            memcpy (dst, ((char*)(z+i))+1, 3);
            dst += dst_skip;
        }
        src += 4;
    }
    nsamples = nsamples & 3;
#endif

    int32_t z;

    while (nsamples--) {
        float_24 (*src, z);
#if __BYTE_ORDER == __LITTLE_ENDIAN
        dst[0]=(char)(z>>16);
        dst[1]=(char)(z>>8);
        dst[2]=(char)(z);
#elif __BYTE_ORDER == __BIG_ENDIAN
        dst[0]=(char)(z);
        dst[1]=(char)(z>>8);
        dst[2]=(char)(z>>16);
#endif
        dst += dst_skip;
        src++;
    }
}

void sample_move_d24_sS (char *dst, jack_default_audio_sample_t *src, unsigned long nsamples, unsigned long dst_skip, dither_state_t *state)
{
#if defined (__SSE2__) && !defined (__sun__)
    _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
    while (nsamples >= 4) {
        int i;
        int32_t z[4];
        __m128 samples = _mm_loadu_ps(src);
        __m128i converted = float_24_sse(samples);

#ifdef __SSE4_1__
        z[0] = _mm_extract_epi32(converted, 0);
        z[1] = _mm_extract_epi32(converted, 1);
        z[2] = _mm_extract_epi32(converted, 2);
        z[3] = _mm_extract_epi32(converted, 3);
#else
        __m128i shuffled1 = _mm_shuffle_epi32(converted, _MM_SHUFFLE(0, 3, 2, 1));
        __m128i shuffled2 = _mm_shuffle_epi32(converted, _MM_SHUFFLE(1, 0, 3, 2));
        __m128i shuffled3 = _mm_shuffle_epi32(converted, _MM_SHUFFLE(2, 1, 0, 3));

        _mm_store_ss((float*)z, (__m128)converted);
        _mm_store_ss((float*)z+1, (__m128)shuffled1);
        _mm_store_ss((float*)z+2, (__m128)shuffled2);
        _mm_store_ss((float*)z+3, (__m128)shuffled3);
#endif

        for (i = 0; i != 4; ++i) {
            memcpy (dst, z+i, 3);
            dst += dst_skip;
        }

        nsamples -= 4;
        src += 4;
    }
#elif defined (__ARM_NEON__) || defined (__ARM_NEON)
    unsigned long unrolled = nsamples / 4;
    while (unrolled--) {
        int i;
        int32_t z[4];
        float32x4_t samples = vld1q_f32(src);
        int32x4_t converted = float_24_neon(samples);
        vst1q_s32(z, converted);

        for (i = 0; i != 4; ++i) {
            memcpy (dst, z+i, 3);
            dst += dst_skip;
        }
        src += 4;
    }
    nsamples = nsamples & 3;
#endif

    int32_t z;

    while (nsamples--) {
        float_24 (*src, z);
#if __BYTE_ORDER == __LITTLE_ENDIAN
        memcpy (dst, &z, 3);
#elif __BYTE_ORDER == __BIG_ENDIAN
        memcpy (dst, (char *)&z + 1, 3);
#endif
        dst += dst_skip;
        src++;
    }
}

void sample_move_dS_s24s (jack_default_audio_sample_t *dst, char *src, unsigned long nsamples, unsigned long src_skip)
{
    const jack_default_audio_sample_t scaling = 1.0/SAMPLE_24BIT_SCALING;

#if defined (__ARM_NEON__) || defined (__ARM_NEON)
    // we shift 8 to the right by dividing by 256.0 -> no sign extra handling
    const float32x4_t vscaling = vdupq_n_f32(scaling/256.0);
    int32_t x[4];
    memset(x, 0, sizeof(x));
    unsigned long unrolled = nsamples / 4;
    while (unrolled--) {
#if __BYTE_ORDER == __BIG_ENDIAN /* ARM big endian?? */
        // right aligned / inverse sequence below -> *256
        memcpy(((char*)&x[0])+1, src, 3);
        memcpy(((char*)&x[1])+1, src+src_skip, 3);
        memcpy(((char*)&x[2])+1, src+2*src_skip, 3);
        memcpy(((char*)&x[3])+1, src+3*src_skip, 3);
#else
        memcpy(&x[0], src, 3);
        memcpy(&x[1], src+src_skip, 3);
        memcpy(&x[2], src+2*src_skip, 3);
        memcpy(&x[3], src+3*src_skip, 3);
#endif
        src += 4 * src_skip;

        int32x4_t source = vld1q_s32(x);
        source = vreinterpretq_s32_u8(vrev32q_u8(vreinterpretq_u8_s32(source)));
        float32x4_t converted = vcvtq_f32_s32(source);
        float32x4_t scaled = vmulq_f32(converted, vscaling);
        vst1q_f32(dst, scaled);
        dst += 4;
    }
    nsamples = nsamples & 3;
#endif

    /* ALERT: signed sign-extension portability !!! */

    while (nsamples--) {
        int x;
#if __BYTE_ORDER == __LITTLE_ENDIAN
        x = (unsigned char)(src[0]);
        x <<= 8;
        x |= (unsigned char)(src[1]);
        x <<= 8;
        x |= (unsigned char)(src[2]);
        /* correct sign bit and the rest of the top byte */
        if (src[0] & 0x80) {
            x |= 0xff << 24;
        }
#elif __BYTE_ORDER == __BIG_ENDIAN
        x = (unsigned char)(src[2]);
        x <<= 8;
        x |= (unsigned char)(src[1]);
        x <<= 8;
        x |= (unsigned char)(src[0]);
        /* correct sign bit and the rest of the top byte */
        if (src[2] & 0x80) {
            x |= 0xff << 24;
        }
#endif
        *dst = x * scaling;
        dst++;
        src += src_skip;
    }
}

void sample_move_dS_s24 (jack_default_audio_sample_t *dst, char *src, unsigned long nsamples, unsigned long src_skip)
{
    const jack_default_audio_sample_t scaling = 1.f/SAMPLE_24BIT_SCALING;

#if defined (__SSE2__) && !defined (__sun__)
    const __m128 scaling_block = _mm_set_ps1(scaling);
    while (nsamples >= 4) {
        int x0, x1, x2, x3;

        memcpy((char*)&x0 + 1, src, 3);
        memcpy((char*)&x1 + 1, src+src_skip, 3);
        memcpy((char*)&x2 + 1, src+2*src_skip, 3);
        memcpy((char*)&x3 + 1, src+3*src_skip, 3);
        src += 4 * src_skip;

        const __m128i block_i = _mm_set_epi32(x3, x2, x1, x0);
        const __m128i shifted = _mm_srai_epi32(block_i, 8);
        const __m128 converted = _mm_cvtepi32_ps (shifted);
        const __m128 scaled = _mm_mul_ps(converted, scaling_block);
        _mm_storeu_ps(dst, scaled);
        dst += 4;
        nsamples -= 4;
    }
#elif defined (__ARM_NEON__) || defined (__ARM_NEON)
    // we shift 8 to the right by dividing by 256.0 -> no sign extra handling
    const float32x4_t vscaling = vdupq_n_f32(scaling/256.0);
    int32_t x[4];
    memset(x, 0, sizeof(x));
    unsigned long unrolled = nsamples / 4;
    while (unrolled--) {
#if __BYTE_ORDER == __BIG_ENDIAN /* ARM big endian?? */
        // left aligned -> *256
        memcpy(&x[0], src, 3);
        memcpy(&x[1], src+src_skip, 3);
        memcpy(&x[2], src+2*src_skip, 3);
        memcpy(&x[3], src+3*src_skip, 3);
#else
        memcpy(((char*)&x[0])+1, src, 3);
        memcpy(((char*)&x[1])+1, src+src_skip, 3);
        memcpy(((char*)&x[2])+1, src+2*src_skip, 3);
        memcpy(((char*)&x[3])+1, src+3*src_skip, 3);
#endif
        src += 4 * src_skip;

        int32x4_t source = vld1q_s32(x);
        float32x4_t converted = vcvtq_f32_s32(source);
        float32x4_t scaled = vmulq_f32(converted, vscaling);
        vst1q_f32(dst, scaled);
        dst += 4;
    }
    nsamples = nsamples & 3;
#endif

    while (nsamples--) {
        int x;
#if __BYTE_ORDER == __LITTLE_ENDIAN
        memcpy((char*)&x + 1, src, 3);
#elif __BYTE_ORDER == __BIG_ENDIAN
        memcpy(&x, src, 3);
#endif
        x >>= 8;
        *dst = x * scaling;
        dst++;
        src += src_skip;
    }
}

void sample_move_d16_sSs (char *dst, jack_default_audio_sample_t *src, unsigned long nsamples, unsigned long dst_skip, dither_state_t *state)
{
#if defined (__ARM_NEON__) || defined (__ARM_NEON)
    unsigned long unrolled = nsamples / 4;
    nsamples = nsamples & 3;

    while (unrolled--) {
        float32x4_t samples = vld1q_f32(src);
        int16x4_t converted = float_16_neon(samples);
        converted = vreinterpret_s16_u8(vrev16_u8(vreinterpret_u8_s16(converted)));

        switch(dst_skip) {
            case 2:
                vst1_s16((int16_t*)dst, converted);
                break;
            default:
                vst1_lane_s16((int16_t*)(dst), converted, 0);
                vst1_lane_s16((int16_t*)(dst+dst_skip), converted, 1);
                vst1_lane_s16((int16_t*)(dst+2*dst_skip), converted, 2);
                vst1_lane_s16((int16_t*)(dst+3*dst_skip), converted, 3);
                break;
        }
        dst += 4*dst_skip;
        src+= 4;
    }
#endif
    int16_t tmp;

    while (nsamples--) {
        // float_16 (*src, tmp);

        if (*src <= NORMALIZED_FLOAT_MIN) {
            tmp = SAMPLE_16BIT_MIN;
        } else if (*src >= NORMALIZED_FLOAT_MAX) {
            tmp = SAMPLE_16BIT_MAX;
        } else {
            tmp = (int16_t) f_round (*src * SAMPLE_16BIT_SCALING);
        }

#if __BYTE_ORDER == __LITTLE_ENDIAN
        dst[0]=(char)(tmp>>8);
        dst[1]=(char)(tmp);
#elif __BYTE_ORDER == __BIG_ENDIAN
        dst[0]=(char)(tmp);
        dst[1]=(char)(tmp>>8);
#endif
        dst += dst_skip;
        src++;
    }
}

void sample_move_d16_sS (char *dst, jack_default_audio_sample_t *src, unsigned long nsamples, unsigned long dst_skip, dither_state_t *state)
{
#if defined (__ARM_NEON__) || defined (__ARM_NEON)
    unsigned long unrolled = nsamples / 4;
    nsamples = nsamples & 3;

    while (unrolled--) {
        float32x4_t samples = vld1q_f32(src);
        int16x4_t converted = float_16_neon(samples);

        switch(dst_skip) {
            case 2:
                vst1_s16((int16_t*)dst, converted);
                break;
            default:
                vst1_lane_s16((int16_t*)(dst), converted, 0);
                vst1_lane_s16((int16_t*)(dst+dst_skip), converted, 1);
                vst1_lane_s16((int16_t*)(dst+2*dst_skip), converted, 2);
                vst1_lane_s16((int16_t*)(dst+3*dst_skip), converted, 3);
                break;
        }
        dst += 4*dst_skip;
        src+= 4;
    }
#endif
    while (nsamples--) {
        float_16 (*src, *((int16_t*) dst));
        dst += dst_skip;
        src++;
    }
}

void sample_move_dither_rect_d16_sSs (char *dst, jack_default_audio_sample_t *src, unsigned long nsamples, unsigned long dst_skip, dither_state_t *state)
{
    jack_default_audio_sample_t val;
    int16_t tmp;

    while (nsamples--) {
        val = (*src * SAMPLE_16BIT_SCALING) + fast_rand() / (float) UINT_MAX - 0.5f;
        float_16_scaled (val, tmp);
#if __BYTE_ORDER == __LITTLE_ENDIAN
        dst[0]=(char)(tmp>>8);
        dst[1]=(char)(tmp);
#elif __BYTE_ORDER == __BIG_ENDIAN
        dst[0]=(char)(tmp);
        dst[1]=(char)(tmp>>8);
#endif
        dst += dst_skip;
        src++;
    }
}

void sample_move_dither_rect_d16_sS (char *dst, jack_default_audio_sample_t *src, unsigned long nsamples, unsigned long dst_skip, dither_state_t *state)
{
    jack_default_audio_sample_t val;

    while (nsamples--) {
        val = (*src * SAMPLE_16BIT_SCALING) + fast_rand() / (float)UINT_MAX - 0.5f;
        float_16_scaled (val, *((int16_t*) dst));
        dst += dst_skip;
        src++;
    }
}

void sample_move_dither_tri_d16_sSs (char *dst, jack_default_audio_sample_t *src, unsigned long nsamples, unsigned long dst_skip, dither_state_t *state)
{
    jack_default_audio_sample_t val;
    int16_t tmp;

    while (nsamples--) {
        val = (*src * SAMPLE_16BIT_SCALING) + ((float)fast_rand() + (float)fast_rand()) / (float)UINT_MAX - 1.0f;
        float_16_scaled (val, tmp);

#if __BYTE_ORDER == __LITTLE_ENDIAN
        dst[0]=(char)(tmp>>8);
        dst[1]=(char)(tmp);
#elif __BYTE_ORDER == __BIG_ENDIAN
        dst[0]=(char)(tmp);
        dst[1]=(char)(tmp>>8);
#endif
        dst += dst_skip;
        src++;
    }
}

void sample_move_dither_tri_d16_sS (char *dst, jack_default_audio_sample_t *src, unsigned long nsamples, unsigned long dst_skip, dither_state_t *state)
{
    jack_default_audio_sample_t val;

    while (nsamples--) {
        val = (*src * SAMPLE_16BIT_SCALING) + ((float)fast_rand() + (float)fast_rand()) / (float)UINT_MAX - 1.0f;
        float_16_scaled (val, *((int16_t*) dst));
        dst += dst_skip;
        src++;
    }
}

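/* The shaped-dither converters below use an error-feedback structure: the
   error made on each sample is kept in state->e[] and fed back through
   Lipshitz's minimally audible FIR before the next sample is quantized,
   while the triangular dither r is high-passed as (r - rm1). In outline,
   using the variable names that appear in the functions below:

       xe = x - (e filtered with {2.033, -2.165, 1.959, -1.590, 0.6149})
       xp = xe + r - rm1
       e[idx] = xp - xe   (the native-endian variant stores quantized output - xe)

   which pushes the requantization noise towards frequencies where it is
   least audible. */
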
void sample_move_dither_shaped_d16_sSs (char *dst, jack_default_audio_sample_t *src, unsigned long nsamples, unsigned long dst_skip, dither_state_t *state)
{
    jack_default_audio_sample_t x;
    jack_default_audio_sample_t xe; /* the input sample - filtered error */
    jack_default_audio_sample_t xp; /* x' */
    float r;
    float rm1 = state->rm1;
    unsigned int idx = state->idx;
    int16_t tmp;

    while (nsamples--) {
        x = *src * SAMPLE_16BIT_SCALING;
        r = ((float)fast_rand() + (float)fast_rand()) / (float)UINT_MAX - 1.0f;
        /* Filter the error with Lipshitz's minimally audible FIR:
           [2.033 -2.165 1.959 -1.590 0.6149] */
        xe = x
             - state->e[idx] * 2.033f
             + state->e[(idx - 1) & DITHER_BUF_MASK] * 2.165f
             - state->e[(idx - 2) & DITHER_BUF_MASK] * 1.959f
             + state->e[(idx - 3) & DITHER_BUF_MASK] * 1.590f
             - state->e[(idx - 4) & DITHER_BUF_MASK] * 0.6149f;
        xp = xe + r - rm1;
        rm1 = r;

        float_16_scaled (xp, tmp);

        /* Intrinsic z^-1 delay */
        idx = (idx + 1) & DITHER_BUF_MASK;
        state->e[idx] = xp - xe;

#if __BYTE_ORDER == __LITTLE_ENDIAN
        dst[0]=(char)(tmp>>8);
        dst[1]=(char)(tmp);
#elif __BYTE_ORDER == __BIG_ENDIAN
        dst[0]=(char)(tmp);
        dst[1]=(char)(tmp>>8);
#endif
        dst += dst_skip;
        src++;
    }
    state->rm1 = rm1;
    state->idx = idx;
}

void sample_move_dither_shaped_d16_sS (char *dst, jack_default_audio_sample_t *src, unsigned long nsamples, unsigned long dst_skip, dither_state_t *state)
{
    jack_default_audio_sample_t x;
    jack_default_audio_sample_t xe; /* the input sample - filtered error */
    jack_default_audio_sample_t xp; /* x' */
    float r;
    float rm1 = state->rm1;
    unsigned int idx = state->idx;

    while (nsamples--) {
        x = *src * SAMPLE_16BIT_SCALING;
        r = ((float)fast_rand() + (float)fast_rand()) / (float)UINT_MAX - 1.0f;
        /* Filter the error with Lipshitz's minimally audible FIR:
           [2.033 -2.165 1.959 -1.590 0.6149] */
        xe = x
             - state->e[idx] * 2.033f
             + state->e[(idx - 1) & DITHER_BUF_MASK] * 2.165f
             - state->e[(idx - 2) & DITHER_BUF_MASK] * 1.959f
             + state->e[(idx - 3) & DITHER_BUF_MASK] * 1.590f
             - state->e[(idx - 4) & DITHER_BUF_MASK] * 0.6149f;
        xp = xe + r - rm1;
        rm1 = r;

        float_16_scaled (xp, *((int16_t*) dst));

        /* Intrinsic z^-1 delay */
        idx = (idx + 1) & DITHER_BUF_MASK;
        state->e[idx] = *((int16_t*) dst) - xe;

        dst += dst_skip;
        src++;
    }
    state->rm1 = rm1;
    state->idx = idx;
}

void sample_move_dS_s16s (jack_default_audio_sample_t *dst, char *src, unsigned long nsamples, unsigned long src_skip)
{
    short z;
    const jack_default_audio_sample_t scaling = 1.0/SAMPLE_16BIT_SCALING;
#if defined (__ARM_NEON__) || defined (__ARM_NEON)
    const float32x4_t vscaling = vdupq_n_f32(scaling);
    unsigned long unrolled = nsamples / 4;
    while (unrolled--) {
        int16x4_t source16x4;
        switch(src_skip) {
            case 2:
                source16x4 = vld1_s16((int16_t*)src);
                break;
            case 4:
                source16x4 = vld2_s16((int16_t*)src).val[0];
                break;
            default:
                source16x4 = vld1_lane_s16((int16_t*)src, source16x4, 0);
                source16x4 = vld1_lane_s16((int16_t*)(src+src_skip), source16x4, 1);
                source16x4 = vld1_lane_s16((int16_t*)(src+2*src_skip), source16x4, 2);
                source16x4 = vld1_lane_s16((int16_t*)(src+3*src_skip), source16x4, 3);
                break;
        }
        source16x4 = vreinterpret_s16_u8(vrev16_u8(vreinterpret_u8_s16(source16x4)));
        int32x4_t source32x4 = vmovl_s16(source16x4);
        src += 4 * src_skip;

        float32x4_t converted = vcvtq_f32_s32(source32x4);
        float32x4_t scaled = vmulq_f32(converted, vscaling);
        vst1q_f32(dst, scaled);
        dst += 4;
    }
    nsamples = nsamples & 3;
#endif

    /* ALERT: signed sign-extension portability !!! */
    while (nsamples--) {
#if __BYTE_ORDER == __LITTLE_ENDIAN
        z = (unsigned char)(src[0]);
        z <<= 8;
        z |= (unsigned char)(src[1]);
#elif __BYTE_ORDER == __BIG_ENDIAN
        z = (unsigned char)(src[1]);
        z <<= 8;
        z |= (unsigned char)(src[0]);
#endif
        *dst = z * scaling;
        dst++;
        src += src_skip;
    }
}

void sample_move_dS_s16 (jack_default_audio_sample_t *dst, char *src, unsigned long nsamples, unsigned long src_skip)
{
    /* ALERT: signed sign-extension portability !!! */
    const jack_default_audio_sample_t scaling = 1.0/SAMPLE_16BIT_SCALING;
#if defined (__ARM_NEON__) || defined (__ARM_NEON)
    const float32x4_t vscaling = vdupq_n_f32(scaling);
    unsigned long unrolled = nsamples / 4;
    while (unrolled--) {
        int16x4_t source16x4;
        switch(src_skip) {
            case 2:
                source16x4 = vld1_s16((int16_t*)src);
                break;
            case 4:
                source16x4 = vld2_s16((int16_t*)src).val[0];
                break;
            default:
                source16x4 = vld1_lane_s16((int16_t*)src, source16x4, 0);
                source16x4 = vld1_lane_s16((int16_t*)(src+src_skip), source16x4, 1);
                source16x4 = vld1_lane_s16((int16_t*)(src+2*src_skip), source16x4, 2);
                source16x4 = vld1_lane_s16((int16_t*)(src+3*src_skip), source16x4, 3);
                break;
        }
        int32x4_t source32x4 = vmovl_s16(source16x4);
        src += 4 * src_skip;

        float32x4_t converted = vcvtq_f32_s32(source32x4);
        float32x4_t scaled = vmulq_f32(converted, vscaling);
        vst1q_f32(dst, scaled);
        dst += 4;
    }
    nsamples = nsamples & 3;
#endif

    while (nsamples--) {
        *dst = (*((short *) src)) * scaling;
        dst++;
        src += src_skip;
    }
}

void memset_interleave (char *dst, char val, unsigned long bytes,
                        unsigned long unit_bytes,
                        unsigned long skip_bytes)
{
    switch (unit_bytes) {
    case 1:
        while (bytes--) {
            *dst = val;
            dst += skip_bytes;
        }
        break;
    case 2:
        while (bytes) {
            *((short *) dst) = (short) val;
            dst += skip_bytes;
            bytes -= 2;
        }
        break;
    case 4:
        while (bytes) {
            *((int *) dst) = (int) val;
            dst += skip_bytes;
            bytes -= 4;
        }
        break;
    default:
        while (bytes) {
            memset(dst, val, unit_bytes);
            dst += skip_bytes;
            bytes -= unit_bytes;
        }
        break;
    }
}

/* COPY FUNCTIONS: used to move data from an input channel to an
   output channel. Note that we assume that the skip distance
   is the same for both channels. This is completely fine
   unless the input and output were on different audio interfaces that
   were interleaved differently. We don't try to handle that.
*/

void
memcpy_fake (char *dst, char *src, unsigned long src_bytes, unsigned long foo, unsigned long bar)
{
    memcpy (dst, src, src_bytes);
}

void
memcpy_interleave_d16_s16 (char *dst, char *src, unsigned long src_bytes,
                           unsigned long dst_skip_bytes, unsigned long src_skip_bytes)
{
    while (src_bytes) {
        *((short *) dst) = *((short *) src);
        dst += dst_skip_bytes;
        src += src_skip_bytes;
        src_bytes -= 2;
    }
}

void
memcpy_interleave_d24_s24 (char *dst, char *src, unsigned long src_bytes,
                           unsigned long dst_skip_bytes, unsigned long src_skip_bytes)
{
    while (src_bytes) {
        memcpy(dst, src, 3);
        dst += dst_skip_bytes;
        src += src_skip_bytes;
        src_bytes -= 3;
    }
}

void
memcpy_interleave_d32_s32 (char *dst, char *src, unsigned long src_bytes,
                           unsigned long dst_skip_bytes, unsigned long src_skip_bytes)
{
    while (src_bytes) {
        *((int *) dst) = *((int *) src);
        dst += dst_skip_bytes;
        src += src_skip_bytes;
        src_bytes -= 4;
    }
}
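
/* A usage sketch for the interleaved copy helpers (hypothetical monitoring
   code, not part of this file's API): mirroring one channel of a 2-channel
   interleaved 16-bit capture stream into the matching playback stream.
   "playback" and "capture" are illustrative names; src_bytes counts only the
   bytes actually copied (2 per frame), while both skips are the frame size
   in bytes. */
#if 0
static void example_monitor_channel_s16 (char *playback, char *capture,
                                         unsigned long nframes)
{
    memcpy_interleave_d16_s16 (playback, capture, nframes * 2, 4, 4);
}
#endif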