/* common/memops.c (jackdbus) */
/*
    Copyright (C) 2000 Paul Davis

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program; if not, write to the Free Software
    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/

#define _ISOC9X_SOURCE 1
#define _ISOC99_SOURCE 1

#define __USE_ISOC9X 1
#define __USE_ISOC99 1

#include <stdio.h>
#include <string.h>
#include <math.h>
#include <memory.h>
#include <stdlib.h>
#include <stdint.h>
#include <limits.h>
#ifdef __linux__
#include <endian.h>
#endif
#include "memops.h"

#if defined (__SSE2__) && !defined (__sun__)
#include <emmintrin.h>
#ifdef __SSE4_1__
#include <smmintrin.h>
#endif
#endif

#if defined (__ARM_NEON__) || defined (__ARM_NEON)
#include <arm_neon.h>
#endif

/* Notes about these *_SCALING values.

   the MAX_<N>BIT values are floating point. when multiplied by
   a full-scale normalized floating point sample value (-1.0..+1.0)
   they should give the maximum value representable with an integer
   sample type of N bits. Note that this is asymmetric. Sample ranges
   for signed integer, 2's complement values are -(2^(N-1)) to +(2^(N-1)-1)

   Complications
   -------------
   If we use +2^(N-1) for the scaling factors, we run into a problem:

   if we start with a normalized float value of -1.0, scaling
   to 24 bits would give -8388608 (-2^23), which is ideal.
   But with +1.0, we get +8388608, which is technically out of range.

   We never multiply a full range normalized value by this constant,
   but we could multiply it by a positive value that is close enough to +1.0
   to produce a value > +(2^(N-1)-1).

   There is no way around this paradox without wasting CPU cycles to determine
   which scaling factor to use (i.e. determine if it's negative or not,
   use the right factor).

   So, for now (October 2008) we use 2^(N-1)-1 as the scaling factor.
*/

#define SAMPLE_32BIT_SCALING 2147483647.0
#define SAMPLE_24BIT_SCALING 8388607.0f
#define SAMPLE_16BIT_SCALING 32767.0f

/* these are just values to use if the floating point value was out of range

   advice from Fons Adriaensen: make the limits symmetrical
 */

#define SAMPLE_32BIT_MAX 2147483647
#define SAMPLE_32BIT_MIN -2147483647
#define SAMPLE_32BIT_MAX_D 2147483647.0
#define SAMPLE_32BIT_MIN_D -2147483647.0

#define SAMPLE_24BIT_MAX 8388607
#define SAMPLE_24BIT_MIN -8388607
#define SAMPLE_24BIT_MAX_F 8388607.0f
#define SAMPLE_24BIT_MIN_F -8388607.0f

#define SAMPLE_16BIT_MAX 32767
#define SAMPLE_16BIT_MIN -32767
#define SAMPLE_16BIT_MAX_F 32767.0f
#define SAMPLE_16BIT_MIN_F -32767.0f

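/* Worked example of the scaling choice described above: with the symmetric
   factor 2^23 = 8388608.0f, -1.0f would map to -8388608 (representable) but
   +1.0f would map to +8388608, one past the largest 24-bit value. With the
   factor actually used here, 2^23 - 1:

       -1.0f * 8388607.0f = -8388607   (SAMPLE_24BIT_MIN)
       +1.0f * 8388607.0f = +8388607   (SAMPLE_24BIT_MAX)

   so no clipped input in [-1.0, +1.0] can overflow the integer type. */
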
/* these mark the outer edges of the range considered "within" range
   for a floating point sample value. values outside (and on the boundaries)
   of this range will be clipped before conversion; values within this
   range will be scaled to appropriate values for the target sample
   type.
*/

#define NORMALIZED_FLOAT_MIN -1.0f
#define NORMALIZED_FLOAT_MAX  1.0f

/* define this in case we end up on a platform that is missing
   the real lrintf functions
*/

#define f_round(f) lrintf(f)
#define d_round(f) lrint(f)

#define float_16(s, d)\
    if ((s) <= NORMALIZED_FLOAT_MIN) {\
        (d) = SAMPLE_16BIT_MIN;\
    } else if ((s) >= NORMALIZED_FLOAT_MAX) {\
        (d) = SAMPLE_16BIT_MAX;\
    } else {\
        (d) = f_round ((s) * SAMPLE_16BIT_SCALING);\
    }

/* call this when "s" has already been scaled (e.g. when dithering)
 */

#define float_16_scaled(s, d)\
    if ((s) <= SAMPLE_16BIT_MIN_F) {\
        (d) = SAMPLE_16BIT_MIN;\
    } else if ((s) >= SAMPLE_16BIT_MAX_F) { \
        (d) = SAMPLE_16BIT_MAX;\
    } else {\
        (d) = f_round ((s));\
    }

#define float_24u32(s, d) \
    if ((s) <= NORMALIZED_FLOAT_MIN) {\
        (d) = SAMPLE_24BIT_MIN << 8;\
    } else if ((s) >= NORMALIZED_FLOAT_MAX) {\
        (d) = SAMPLE_24BIT_MAX << 8;\
    } else {\
        (d) = f_round ((s) * SAMPLE_24BIT_SCALING) << 8;\
    }

#define float_24l32(s, d) \
    if ((s) <= NORMALIZED_FLOAT_MIN) {\
        (d) = SAMPLE_24BIT_MIN; \
    } else if ((s) >= NORMALIZED_FLOAT_MAX) {\
        (d) = SAMPLE_24BIT_MAX; \
    } else {\
        (d) = f_round ((s) * SAMPLE_24BIT_SCALING); \
    }

#define float_32(s, d) \
    do { \
        double clipped = fmin(NORMALIZED_FLOAT_MAX, \
                              fmax((double)(s), NORMALIZED_FLOAT_MIN)); \
        double scaled = clipped * SAMPLE_32BIT_MAX_D; \
        (d) = d_round(scaled); \
    } \
    while (0)

/* call this when "s" has already been scaled (e.g. when dithering)
 */

#define float_24u32_scaled(s, d)\
    if ((s) <= SAMPLE_24BIT_MIN_F) {\
        (d) = SAMPLE_24BIT_MIN << 8;\
    } else if ((s) >= SAMPLE_24BIT_MAX_F) { \
        (d) = SAMPLE_24BIT_MAX << 8; \
    } else {\
        (d) = f_round ((s)) << 8; \
    }

#define float_24(s, d) \
    if ((s) <= NORMALIZED_FLOAT_MIN) {\
        (d) = SAMPLE_24BIT_MIN;\
    } else if ((s) >= NORMALIZED_FLOAT_MAX) {\
        (d) = SAMPLE_24BIT_MAX;\
    } else {\
        (d) = f_round ((s) * SAMPLE_24BIT_SCALING);\
    }

/* call this when "s" has already been scaled (e.g. when dithering)
 */

#define float_24_scaled(s, d)\
    if ((s) <= SAMPLE_24BIT_MIN_F) {\
        (d) = SAMPLE_24BIT_MIN;\
    } else if ((s) >= SAMPLE_24BIT_MAX_F) { \
        (d) = SAMPLE_24BIT_MAX; \
    } else {\
        (d) = f_round ((s)); \
    }

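/* A minimal usage sketch of the clip-and-scale macros above (illustrative
   values only, using the constants defined earlier in this file):

       int16_t d;
       float_16 (-1.0f, d);    =>  d == SAMPLE_16BIT_MIN   (clipped at the boundary)
       float_16 (+1.5f, d);    =>  d == SAMPLE_16BIT_MAX   (clipped)
       float_16 ( 0.0f, d);    =>  d == 0                  (scaled and rounded)
*/
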
#if defined (__SSE2__) && !defined (__sun__)

/* generates same as _mm_set_ps(1.f, 1.f, 1.f, 1.f) but faster */
static inline __m128 gen_one(void)
{
    volatile __m128i x = { 0 }; /* shut up, GCC */
    __m128i ones = _mm_cmpeq_epi32(x, x);
    /* all-ones >> 25 << 23 leaves 0x3f800000 (= 1.0f) in every lane */
    return (__m128)_mm_slli_epi32 (_mm_srli_epi32(ones, 25), 23);
}

static inline __m128 clip(__m128 s, __m128 min, __m128 max)
{
    return _mm_min_ps(max, _mm_max_ps(s, min));
}

static inline __m128d clip_double(__m128d s, __m128d min, __m128d max)
{
    return _mm_min_pd(max, _mm_max_pd(s, min));
}

static inline __m128i float_24_sse(__m128 s)
{
    const __m128 upper_bound = gen_one(); /* NORMALIZED_FLOAT_MAX */
    const __m128 lower_bound = _mm_sub_ps(_mm_setzero_ps(), upper_bound);

    __m128 clipped = clip(s, lower_bound, upper_bound);
    __m128 scaled = _mm_mul_ps(clipped, _mm_set1_ps(SAMPLE_24BIT_SCALING));
    return _mm_cvtps_epi32(scaled);
}
#endif

#if defined (__ARM_NEON__) || defined (__ARM_NEON)

static inline float32x4_t clip(float32x4_t s, float32x4_t min, float32x4_t max)
{
    return vminq_f32(max, vmaxq_f32(s, min));
}

static inline int32x4_t float_24_neon(float32x4_t s)
{
    const float32x4_t upper_bound = vdupq_n_f32(NORMALIZED_FLOAT_MAX);
    const float32x4_t lower_bound = vdupq_n_f32(NORMALIZED_FLOAT_MIN);

    float32x4_t clipped = clip(s, lower_bound, upper_bound);
    float32x4_t scaled = vmulq_f32(clipped, vdupq_n_f32(SAMPLE_24BIT_SCALING));
    return vcvtq_s32_f32(scaled);
}

static inline int16x4_t float_16_neon(float32x4_t s)
{
    const float32x4_t upper_bound = vdupq_n_f32(NORMALIZED_FLOAT_MAX);
    const float32x4_t lower_bound = vdupq_n_f32(NORMALIZED_FLOAT_MIN);

    float32x4_t clipped = clip(s, lower_bound, upper_bound);
    float32x4_t scaled = vmulq_f32(clipped, vdupq_n_f32(SAMPLE_16BIT_SCALING));
    return vmovn_s32(vcvtq_s32_f32(scaled));
}
#endif

/* Linear Congruential noise generator. From the music-dsp list
 * less random than rand(), but good enough and 10x faster
 */
static unsigned int seed = 22222;

static inline unsigned int fast_rand() {
    seed = (seed * 196314165) + 907633515;
    return seed;
}

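/* How the generator is used below: fast_rand() / (float) UINT_MAX is an
   approximately uniform draw in [0, 1]. The rectangular-dither converters add
   one draw minus 0.5 (about +/- 0.5 LSB); the triangular and noise-shaped
   converters add the sum of two draws minus 1.0 (about +/- 1 LSB, triangular
   PDF). */
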
/* functions for native float sample data */

void sample_move_floatLE_sSs (jack_default_audio_sample_t *dst, char *src, unsigned long nsamples, unsigned long src_skip) {
    while (nsamples--) {
        *dst = *((float *) src);
        dst++;
        src += src_skip;
    }
}

void sample_move_dS_floatLE (char *dst, jack_default_audio_sample_t *src, unsigned long nsamples, unsigned long dst_skip, dither_state_t *state) {
    while (nsamples--) {
        *((float *) dst) = *src;
        dst += dst_skip;
        src++;
    }
}

/* NOTES on function naming:

   foo_bar_d<TYPE>_s<TYPE>

   the "d<TYPE>" component defines the destination type for the operation
   the "s<TYPE>" component defines the source type for the operation

   TYPE can be one of:

   S      - sample is a jack_default_audio_sample_t, currently (October 2008) a 32 bit floating point value
   Ss     - like S but reverse endian from the host CPU
   32     - sample is a signed 32 bit integer value
   32u24  - sample is a signed 32 bit integer value, but data is in upper 24 bits only
   32u24s - like 32u24 but reverse endian from the host CPU
   32l24  - sample is a signed 32 bit integer value, but data is in lower 24 bits only
   32l24s - like 32l24 but reverse endian from the host CPU
   24     - sample is a signed 24 bit integer value
   24s    - like 24 but reverse endian from the host CPU
   16     - sample is a signed 16 bit integer value
   16s    - like 16 but reverse endian from the host CPU

   For obvious reasons, the reverse endian versions only show as source types.

   This covers all known sample formats at 16 bits or larger.
*/

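/* A usage sketch under the naming scheme above (hypothetical backend code,
   not part of this file's API): delivering two float port buffers into an
   interleaved, native-endian 16-bit buffer. The names "interleaved", "left"
   and "right" are illustrative; dst_skip is the frame size in bytes
   (2 channels * 2 bytes), and the dither state is unused by this converter. */
#if 0
static void example_deliver_stereo_s16 (int16_t *interleaved,
                                        jack_default_audio_sample_t *left,
                                        jack_default_audio_sample_t *right,
                                        unsigned long nframes)
{
    sample_move_d16_sS ((char *) interleaved,       left,  nframes, 4, NULL);
    sample_move_d16_sS ((char *) (interleaved + 1), right, nframes, 4, NULL);
}
#endif
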
/* functions for native integer sample data */

void sample_move_d32_sSs (char *dst, jack_default_audio_sample_t *src, unsigned long nsamples, unsigned long dst_skip, dither_state_t *state)
{
    while (nsamples--) {
        int32_t z;
        float_32(*src, z);
#if __BYTE_ORDER == __LITTLE_ENDIAN
        dst[0]=(char)(z>>24);
        dst[1]=(char)(z>>16);
        dst[2]=(char)(z>>8);
        dst[3]=(char)(z);
#elif __BYTE_ORDER == __BIG_ENDIAN
        dst[0]=(char)(z);
        dst[1]=(char)(z>>8);
        dst[2]=(char)(z>>16);
        dst[3]=(char)(z>>24);
#endif
        dst += dst_skip;
        src++;
    }
}

void sample_move_d32_sS (char *dst, jack_default_audio_sample_t *src, unsigned long nsamples, unsigned long dst_skip, dither_state_t *state)
{
    while (nsamples--) {
        float_32(*src, *(int32_t *)dst);
        dst += dst_skip;
        src++;
    }
}

void sample_move_d32u24_sSs (char *dst, jack_default_audio_sample_t *src, unsigned long nsamples, unsigned long dst_skip, dither_state_t *state)
{
#if defined (__ARM_NEON__) || defined (__ARM_NEON)
    unsigned long unrolled = nsamples / 4;
    nsamples = nsamples & 3;

    while (unrolled--) {
        float32x4_t samples = vld1q_f32(src);
        int32x4_t converted = float_24_neon(samples);
        int32x4_t shifted = vshlq_n_s32(converted, 8);
        shifted = vreinterpretq_s32_u8(vrev32q_u8(vreinterpretq_u8_s32(shifted)));

        switch(dst_skip) {
            case 4:
                vst1q_s32((int32_t*)dst, shifted);
                break;
            default:
                vst1q_lane_s32((int32_t*)(dst), shifted, 0);
                vst1q_lane_s32((int32_t*)(dst+dst_skip), shifted, 1);
                vst1q_lane_s32((int32_t*)(dst+2*dst_skip), shifted, 2);
                vst1q_lane_s32((int32_t*)(dst+3*dst_skip), shifted, 3);
                break;
        }
        dst += 4*dst_skip;
        src+= 4;
    }
#endif

    int32_t z;

    while (nsamples--) {
        float_24u32 (*src, z);
#if __BYTE_ORDER == __LITTLE_ENDIAN
        dst[0]=(char)(z>>24);
        dst[1]=(char)(z>>16);
        dst[2]=(char)(z>>8);
        dst[3]=(char)(z);
#elif __BYTE_ORDER == __BIG_ENDIAN
        dst[0]=(char)(z);
        dst[1]=(char)(z>>8);
        dst[2]=(char)(z>>16);
        dst[3]=(char)(z>>24);
#endif
        dst += dst_skip;
        src++;
    }
}

void sample_move_d32u24_sS (char *dst, jack_default_audio_sample_t *src, unsigned long nsamples, unsigned long dst_skip, dither_state_t *state)
{
#if defined (__SSE2__) && !defined (__sun__)
    __m128 int_max = _mm_set1_ps(SAMPLE_24BIT_MAX_F);
    __m128 int_min = _mm_sub_ps(_mm_setzero_ps(), int_max);
    __m128 factor = int_max;

    unsigned long unrolled = nsamples / 4;
    nsamples = nsamples & 3;

    while (unrolled--) {
        __m128 in = _mm_load_ps(src);
        __m128 scaled = _mm_mul_ps(in, factor);
        __m128 clipped = clip(scaled, int_min, int_max);

        __m128i y = _mm_cvttps_epi32(clipped);
        __m128i shifted = _mm_slli_epi32(y, 8);

#ifdef __SSE4_1__
        *(int32_t*)dst = _mm_extract_epi32(shifted, 0);
        *(int32_t*)(dst+dst_skip) = _mm_extract_epi32(shifted, 1);
        *(int32_t*)(dst+2*dst_skip) = _mm_extract_epi32(shifted, 2);
        *(int32_t*)(dst+3*dst_skip) = _mm_extract_epi32(shifted, 3);
#else
        __m128i shuffled1 = _mm_shuffle_epi32(shifted, _MM_SHUFFLE(0, 3, 2, 1));
        __m128i shuffled2 = _mm_shuffle_epi32(shifted, _MM_SHUFFLE(1, 0, 3, 2));
        __m128i shuffled3 = _mm_shuffle_epi32(shifted, _MM_SHUFFLE(2, 1, 0, 3));

        _mm_store_ss((float*)dst, (__m128)shifted);

        _mm_store_ss((float*)(dst+dst_skip), (__m128)shuffled1);
        _mm_store_ss((float*)(dst+2*dst_skip), (__m128)shuffled2);
        _mm_store_ss((float*)(dst+3*dst_skip), (__m128)shuffled3);
#endif
        dst += 4*dst_skip;

        src+= 4;
    }

    while (nsamples--) {
        __m128 in = _mm_load_ss(src);
        __m128 scaled = _mm_mul_ss(in, factor);
        __m128 clipped = _mm_min_ss(int_max, _mm_max_ss(scaled, int_min));

        int y = _mm_cvttss_si32(clipped);
        *((int *) dst) = y<<8;

        dst += dst_skip;
        src++;
    }
#elif defined (__ARM_NEON__) || defined (__ARM_NEON)
    unsigned long unrolled = nsamples / 4;
    nsamples = nsamples & 3;

    while (unrolled--) {
        float32x4_t samples = vld1q_f32(src);
        int32x4_t converted = float_24_neon(samples);
        int32x4_t shifted = vshlq_n_s32(converted, 8);

        switch(dst_skip) {
            case 4:
                vst1q_s32((int32_t*)dst, shifted);
                break;
            default:
                vst1q_lane_s32((int32_t*)(dst), shifted, 0);
                vst1q_lane_s32((int32_t*)(dst+dst_skip), shifted, 1);
                vst1q_lane_s32((int32_t*)(dst+2*dst_skip), shifted, 2);
                vst1q_lane_s32((int32_t*)(dst+3*dst_skip), shifted, 3);
                break;
        }
        dst += 4*dst_skip;

        src+= 4;
    }
#endif

#if !defined (__SSE2__)
    while (nsamples--) {
        float_24u32 (*src, *((int32_t*) dst));
        dst += dst_skip;
        src++;
    }
#endif
}

void sample_move_dS_s32u24s (jack_default_audio_sample_t *dst, char *src, unsigned long nsamples, unsigned long src_skip)
{
#if defined (__ARM_NEON__) || defined (__ARM_NEON)
    float32x4_t factor = vdupq_n_f32(1.0 / SAMPLE_24BIT_SCALING);
    unsigned long unrolled = nsamples / 4;
    while (unrolled--) {
        int32x4_t src128;
        switch(src_skip)
        {
            case 4:
                src128 = vld1q_s32((int32_t*)src);
                break;
            case 8:
                src128 = vld2q_s32((int32_t*)src).val[0];
                break;
            default:
                src128 = vld1q_lane_s32((int32_t*)src, src128, 0);
                src128 = vld1q_lane_s32((int32_t*)(src+src_skip), src128, 1);
                src128 = vld1q_lane_s32((int32_t*)(src+2*src_skip), src128, 2);
                src128 = vld1q_lane_s32((int32_t*)(src+3*src_skip), src128, 3);
                break;
        }
        src128 = vreinterpretq_s32_u8(vrev32q_u8(vreinterpretq_u8_s32(src128)));
        int32x4_t shifted = vshrq_n_s32(src128, 8);
        float32x4_t as_float = vcvtq_f32_s32(shifted);
        float32x4_t divided = vmulq_f32(as_float, factor);
        vst1q_f32(dst, divided);

        src += 4*src_skip;
        dst += 4;
    }
    nsamples = nsamples & 3;
#endif

    /* ALERT: signed sign-extension portability !!! */

    const jack_default_audio_sample_t scaling = 1.0/SAMPLE_24BIT_SCALING;

    while (nsamples--) {
        int x;
#if __BYTE_ORDER == __LITTLE_ENDIAN
        x = (unsigned char)(src[0]);
        x <<= 8;
        x |= (unsigned char)(src[1]);
        x <<= 8;
        x |= (unsigned char)(src[2]);
        x <<= 8;
        x |= (unsigned char)(src[3]);
#elif __BYTE_ORDER == __BIG_ENDIAN
        x = (unsigned char)(src[3]);
        x <<= 8;
        x |= (unsigned char)(src[2]);
        x <<= 8;
        x |= (unsigned char)(src[1]);
        x <<= 8;
        x |= (unsigned char)(src[0]);
#endif
        *dst = (x >> 8) * scaling;
        dst++;
        src += src_skip;
    }
}

void sample_move_dS_s32u24 (jack_default_audio_sample_t *dst, char *src, unsigned long nsamples, unsigned long src_skip)
{
#if defined (__SSE2__) && !defined (__sun__)
    unsigned long unrolled = nsamples / 4;
    static float inv_sample_max_24bit = 1.0 / SAMPLE_24BIT_SCALING;
    __m128 factor = _mm_set1_ps(inv_sample_max_24bit);
    while (unrolled--)
    {
        int i1 = *((int *) src);
        src+= src_skip;
        int i2 = *((int *) src);
        src+= src_skip;
        int i3 = *((int *) src);
        src+= src_skip;
        int i4 = *((int *) src);
        src+= src_skip;

        __m128i src = _mm_set_epi32(i4, i3, i2, i1);
        __m128i shifted = _mm_srai_epi32(src, 8);

        __m128 as_float = _mm_cvtepi32_ps(shifted);
        __m128 divided = _mm_mul_ps(as_float, factor);

        _mm_storeu_ps(dst, divided);

        dst += 4;
    }
    nsamples = nsamples & 3;
#elif defined (__ARM_NEON__) || defined (__ARM_NEON)
    unsigned long unrolled = nsamples / 4;
    float32x4_t factor = vdupq_n_f32(1.0 / SAMPLE_24BIT_SCALING);
    while (unrolled--) {
        int32x4_t src128;
        switch(src_skip) {
            case 4:
                src128 = vld1q_s32((int32_t*)src);
                break;
            case 8:
                src128 = vld2q_s32((int32_t*)src).val[0];
                break;
            default:
                src128 = vld1q_lane_s32((int32_t*)src, src128, 0);
                src128 = vld1q_lane_s32((int32_t*)(src+src_skip), src128, 1);
                src128 = vld1q_lane_s32((int32_t*)(src+2*src_skip), src128, 2);
                src128 = vld1q_lane_s32((int32_t*)(src+3*src_skip), src128, 3);
                break;
        }
        int32x4_t shifted = vshrq_n_s32(src128, 8);
        float32x4_t as_float = vcvtq_f32_s32(shifted);
        float32x4_t divided = vmulq_f32(as_float, factor);
        vst1q_f32(dst, divided);

        src += 4*src_skip;
        dst += 4;
    }
    nsamples = nsamples & 3;
#endif

    /* ALERT: signed sign-extension portability !!! */

    const jack_default_audio_sample_t scaling = 1.0/SAMPLE_24BIT_SCALING;
    while (nsamples--) {
        *dst = (*((int *) src) >> 8) * scaling;
        dst++;
        src += src_skip;
    }
}

void sample_move_d32l24_sSs (char *dst, jack_default_audio_sample_t *src, unsigned long nsamples, unsigned long dst_skip, dither_state_t *state)
{
#if defined (__ARM_NEON__) || defined (__ARM_NEON)
    unsigned long unrolled = nsamples / 4;
    nsamples = nsamples & 3;

    while (unrolled--) {
        float32x4_t samples = vld1q_f32(src);
        int32x4_t converted = float_24_neon(samples);
        converted = vreinterpretq_s32_u8(vrev32q_u8(vreinterpretq_u8_s32(converted)));

        switch(dst_skip) {
            case 4:
                vst1q_s32((int32_t*)dst, converted);
                break;
            default:
                vst1q_lane_s32((int32_t*)(dst), converted, 0);
                vst1q_lane_s32((int32_t*)(dst+dst_skip), converted, 1);
                vst1q_lane_s32((int32_t*)(dst+2*dst_skip), converted, 2);
                vst1q_lane_s32((int32_t*)(dst+3*dst_skip), converted, 3);
                break;
        }
        dst += 4*dst_skip;
        src+= 4;
    }
#endif

    int32_t z;

    while (nsamples--) {
        float_24l32 (*src, z);
#if __BYTE_ORDER == __LITTLE_ENDIAN
        dst[0]=(char)(z>>24);
        dst[1]=(char)(z>>16);
        dst[2]=(char)(z>>8);
        dst[3]=(char)(z);
#elif __BYTE_ORDER == __BIG_ENDIAN
        dst[0]=(char)(z);
        dst[1]=(char)(z>>8);
        dst[2]=(char)(z>>16);
        dst[3]=(char)(z>>24);
#endif
        dst += dst_skip;
        src++;
    }
}

void sample_move_d32l24_sS (char *dst, jack_default_audio_sample_t *src, unsigned long nsamples, unsigned long dst_skip, dither_state_t *state)
{
#if defined (__SSE2__) && !defined (__sun__)
    __m128 int_max = _mm_set1_ps(SAMPLE_24BIT_MAX_F);
    __m128 int_min = _mm_sub_ps(_mm_setzero_ps(), int_max);
    __m128 factor = int_max;

    unsigned long unrolled = nsamples / 4;
    nsamples = nsamples & 3;

    while (unrolled--) {
        __m128 in = _mm_load_ps(src);
        __m128 scaled = _mm_mul_ps(in, factor);
        __m128 clipped = clip(scaled, int_min, int_max);

        __m128i shifted = _mm_cvttps_epi32(clipped);

#ifdef __SSE4_1__
        *(int32_t*)dst = _mm_extract_epi32(shifted, 0);
        *(int32_t*)(dst+dst_skip) = _mm_extract_epi32(shifted, 1);
        *(int32_t*)(dst+2*dst_skip) = _mm_extract_epi32(shifted, 2);
        *(int32_t*)(dst+3*dst_skip) = _mm_extract_epi32(shifted, 3);
#else
        __m128i shuffled1 = _mm_shuffle_epi32(shifted, _MM_SHUFFLE(0, 3, 2, 1));
        __m128i shuffled2 = _mm_shuffle_epi32(shifted, _MM_SHUFFLE(1, 0, 3, 2));
        __m128i shuffled3 = _mm_shuffle_epi32(shifted, _MM_SHUFFLE(2, 1, 0, 3));

        _mm_store_ss((float*)dst, (__m128)shifted);

        _mm_store_ss((float*)(dst+dst_skip), (__m128)shuffled1);
        _mm_store_ss((float*)(dst+2*dst_skip), (__m128)shuffled2);
        _mm_store_ss((float*)(dst+3*dst_skip), (__m128)shuffled3);
#endif
        dst += 4*dst_skip;

        src+= 4;
    }

    while (nsamples--) {
        __m128 in = _mm_load_ss(src);
        __m128 scaled = _mm_mul_ss(in, factor);
        __m128 clipped = _mm_min_ss(int_max, _mm_max_ss(scaled, int_min));

        int y = _mm_cvttss_si32(clipped);
        /* data stays in the lower 24 bits, so no shift here (unlike the u24 variant) */
        *((int *) dst) = y;

        dst += dst_skip;
        src++;
    }
#elif defined (__ARM_NEON__) || defined (__ARM_NEON)
    unsigned long unrolled = nsamples / 4;
    nsamples = nsamples & 3;

    while (unrolled--) {
        float32x4_t samples = vld1q_f32(src);
        int32x4_t converted = float_24_neon(samples);

        switch(dst_skip) {
            case 4:
                vst1q_s32((int32_t*)dst, converted);
                break;
            default:
                vst1q_lane_s32((int32_t*)(dst), converted, 0);
                vst1q_lane_s32((int32_t*)(dst+dst_skip), converted, 1);
                vst1q_lane_s32((int32_t*)(dst+2*dst_skip), converted, 2);
                vst1q_lane_s32((int32_t*)(dst+3*dst_skip), converted, 3);
                break;
        }
        dst += 4*dst_skip;

        src+= 4;
    }
#endif

#if !defined (__SSE2__)
    while (nsamples--) {
        float_24l32 (*src, *((int32_t*) dst));
        dst += dst_skip;
        src++;
    }
#endif
}

void sample_move_dS_s32s (jack_default_audio_sample_t *dst, char *src, unsigned long nsamples, unsigned long src_skip)
{
    const jack_default_audio_sample_t scaling = 1.0/SAMPLE_32BIT_SCALING;
    while (nsamples--) {
        int32_t x;
#if __BYTE_ORDER == __LITTLE_ENDIAN
        x = (unsigned char)(src[0]);
        x <<= 8;
        x |= (unsigned char)(src[1]);
        x <<= 8;
        x |= (unsigned char)(src[2]);
        x <<= 8;
        x |= (unsigned char)(src[3]);
#elif __BYTE_ORDER == __BIG_ENDIAN
        x = (unsigned char)(src[3]);
        x <<= 8;
        x |= (unsigned char)(src[2]);
        x <<= 8;
        x |= (unsigned char)(src[1]);
        x <<= 8;
        x |= (unsigned char)(src[0]);
#endif
        double extended = x * scaling;
        *dst = (float)extended;
        dst++;
        src += src_skip;
    }
}

void sample_move_dS_s32l24s (jack_default_audio_sample_t *dst, char *src, unsigned long nsamples, unsigned long src_skip)
{
#if defined (__ARM_NEON__) || defined (__ARM_NEON)
    float32x4_t factor = vdupq_n_f32(1.0 / SAMPLE_24BIT_SCALING);
    unsigned long unrolled = nsamples / 4;
    while (unrolled--) {
        uint32x4_t src128;
        switch(src_skip)
        {
            case 4:
                src128 = vld1q_u32((uint32_t*)src);
                break;
            case 8:
                src128 = vld2q_u32((uint32_t*)src).val[0];
                break;
            default:
                src128 = vld1q_lane_u32((uint32_t*)src, src128, 0);
                src128 = vld1q_lane_u32((uint32_t*)(src+src_skip), src128, 1);
                src128 = vld1q_lane_u32((uint32_t*)(src+2*src_skip), src128, 2);
                src128 = vld1q_lane_u32((uint32_t*)(src+3*src_skip), src128, 3);
                break;
        }
        src128 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(src128)));
        uint32x4_t toupper = vshlq_n_u32(src128, 8);
        int32x4_t shifted = vshrq_n_s32((int32x4_t)toupper, 8);
        float32x4_t as_float = vcvtq_f32_s32(shifted);
        float32x4_t divided = vmulq_f32(as_float, factor);
        vst1q_f32(dst, divided);

        src += 4*src_skip;
        dst += 4;
    }
    nsamples = nsamples & 3;
#endif

    /* ALERT: signed sign-extension portability !!! */

    const jack_default_audio_sample_t scaling = 1.0/SAMPLE_24BIT_SCALING;

    while (nsamples--) {
        int32_t x;
#if __BYTE_ORDER == __LITTLE_ENDIAN
        x = (unsigned char)(src[0]);
        x <<= 8;
        x |= (unsigned char)(src[1]);
        x <<= 8;
        x |= (unsigned char)(src[2]);
        x <<= 8;
        x |= (unsigned char)(src[3]);
#elif __BYTE_ORDER == __BIG_ENDIAN
        x = (unsigned char)(src[3]);
        x <<= 8;
        x |= (unsigned char)(src[2]);
        x <<= 8;
        x |= (unsigned char)(src[1]);
        x <<= 8;
        x |= (unsigned char)(src[0]);
#endif
        *dst = (x >> 0) * scaling;
        dst++;
        src += src_skip;
    }
}

void sample_move_dS_s32 (jack_default_audio_sample_t *dst, char *src, unsigned long nsamples, unsigned long src_skip)
{
    const double scaling = 1.0 / SAMPLE_32BIT_SCALING;
    while (nsamples--) {
        int32_t val=(*((int32_t*)src));
        double extended = val * scaling;
        *dst = (float)extended;
        dst++;
        src += src_skip;
    }
}

void sample_move_dS_s32l24 (jack_default_audio_sample_t *dst, char *src, unsigned long nsamples, unsigned long src_skip)
{
#if defined (__SSE2__) && !defined (__sun__)
    unsigned long unrolled = nsamples / 4;
    static float inv_sample_max_24bit = 1.0 / SAMPLE_24BIT_SCALING;
    __m128 factor = _mm_set1_ps(inv_sample_max_24bit);
    while (unrolled--)
    {
        int i1 = *((int *) src);
        src+= src_skip;
        int i2 = *((int *) src);
        src+= src_skip;
        int i3 = *((int *) src);
        src+= src_skip;
        int i4 = *((int *) src);
        src+= src_skip;

        __m128i shifted = _mm_set_epi32(i4, i3, i2, i1);

        __m128 as_float = _mm_cvtepi32_ps(shifted);
        __m128 divided = _mm_mul_ps(as_float, factor);

        _mm_storeu_ps(dst, divided);

        dst += 4;
    }
    nsamples = nsamples & 3;
#elif defined (__ARM_NEON__) || defined (__ARM_NEON)
    unsigned long unrolled = nsamples / 4;
    float32x4_t factor = vdupq_n_f32(1.0 / SAMPLE_24BIT_SCALING);
    while (unrolled--) {
        uint32x4_t src128;
        switch(src_skip) {
            case 4:
                src128 = vld1q_u32((uint32_t*)src);
                break;
            case 8:
                src128 = vld2q_u32((uint32_t*)src).val[0];
                break;
            default:
                src128 = vld1q_lane_u32((uint32_t*)src, src128, 0);
                src128 = vld1q_lane_u32((uint32_t*)(src+src_skip), src128, 1);
                src128 = vld1q_lane_u32((uint32_t*)(src+2*src_skip), src128, 2);
                src128 = vld1q_lane_u32((uint32_t*)(src+3*src_skip), src128, 3);
                break;
        }
        // Sign extension by moving to upper as unsigned, then down
        uint32x4_t toupper = vshlq_n_u32(src128, 8);
        int32x4_t shifted = vshrq_n_s32((int32x4_t)toupper, 8);
        float32x4_t as_float = vcvtq_f32_s32(shifted);
        float32x4_t divided = vmulq_f32(as_float, factor);
        vst1q_f32(dst, divided);

        src += 4*src_skip;
        dst += 4;
    }
    nsamples = nsamples & 3;
#endif

    /* ALERT: signed sign-extension portability !!! */

    const jack_default_audio_sample_t scaling = 1.0/SAMPLE_24BIT_SCALING;
    while (nsamples--) {
        uint32_t val=(*((uint32_t*)src));
        if (val & 0x800000u) val|=0xFF000000u;
        *dst = (*((int32_t *) &val)) * scaling;
        dst++;
        src += src_skip;
    }
}

void sample_move_d24_sSs (char *dst, jack_default_audio_sample_t *src, unsigned long nsamples, unsigned long dst_skip, dither_state_t *state)
{
#if defined (__ARM_NEON__) || defined (__ARM_NEON)
    unsigned long unrolled = nsamples / 4;
    while (unrolled--) {
        int i;
        int32_t z[4];
        float32x4_t samples = vld1q_f32(src);
        int32x4_t converted = float_24_neon(samples);
        converted = vreinterpretq_s32_u8(vrev32q_u8(vreinterpretq_u8_s32(converted)));
        vst1q_s32(z, converted);

        for (i = 0; i != 4; ++i) {
            memcpy (dst, ((char*)(z+i))+1, 3);
            dst += dst_skip;
        }
        src += 4;
    }
    nsamples = nsamples & 3;
#endif

    int32_t z;

    while (nsamples--) {
        float_24 (*src, z);
#if __BYTE_ORDER == __LITTLE_ENDIAN
        dst[0]=(char)(z>>16);
        dst[1]=(char)(z>>8);
        dst[2]=(char)(z);
#elif __BYTE_ORDER == __BIG_ENDIAN
        dst[0]=(char)(z);
        dst[1]=(char)(z>>8);
        dst[2]=(char)(z>>16);
#endif
        dst += dst_skip;
        src++;
    }
}

void sample_move_d24_sS (char *dst, jack_default_audio_sample_t *src, unsigned long nsamples, unsigned long dst_skip, dither_state_t *state)
{
#if defined (__SSE2__) && !defined (__sun__)
    _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
    while (nsamples >= 4) {
        int i;
        int32_t z[4];
        __m128 samples = _mm_loadu_ps(src);
        __m128i converted = float_24_sse(samples);

#ifdef __SSE4_1__
        z[0] = _mm_extract_epi32(converted, 0);
        z[1] = _mm_extract_epi32(converted, 1);
        z[2] = _mm_extract_epi32(converted, 2);
        z[3] = _mm_extract_epi32(converted, 3);
#else
        __m128i shuffled1 = _mm_shuffle_epi32(converted, _MM_SHUFFLE(0, 3, 2, 1));
        __m128i shuffled2 = _mm_shuffle_epi32(converted, _MM_SHUFFLE(1, 0, 3, 2));
        __m128i shuffled3 = _mm_shuffle_epi32(converted, _MM_SHUFFLE(2, 1, 0, 3));

        _mm_store_ss((float*)z, (__m128)converted);
        _mm_store_ss((float*)z+1, (__m128)shuffled1);
        _mm_store_ss((float*)z+2, (__m128)shuffled2);
        _mm_store_ss((float*)z+3, (__m128)shuffled3);
#endif

        for (i = 0; i != 4; ++i) {
            memcpy (dst, z+i, 3);
            dst += dst_skip;
        }

        nsamples -= 4;
        src += 4;
    }
#elif defined (__ARM_NEON__) || defined (__ARM_NEON)
    unsigned long unrolled = nsamples / 4;
    while (unrolled--) {
        int i;
        int32_t z[4];
        float32x4_t samples = vld1q_f32(src);
        int32x4_t converted = float_24_neon(samples);
        vst1q_s32(z, converted);

        for (i = 0; i != 4; ++i) {
            memcpy (dst, z+i, 3);
            dst += dst_skip;
        }
        src += 4;
    }
    nsamples = nsamples & 3;
#endif

    int32_t z;

    while (nsamples--) {
        float_24 (*src, z);
#if __BYTE_ORDER == __LITTLE_ENDIAN
        memcpy (dst, &z, 3);
#elif __BYTE_ORDER == __BIG_ENDIAN
        memcpy (dst, (char *)&z + 1, 3);
#endif
        dst += dst_skip;
        src++;
    }
}

void sample_move_dS_s24s (jack_default_audio_sample_t *dst, char *src, unsigned long nsamples, unsigned long src_skip)
{
    const jack_default_audio_sample_t scaling = 1.0/SAMPLE_24BIT_SCALING;

#if defined (__ARM_NEON__) || defined (__ARM_NEON)
    // we shift 8 to the right by dividing by 256.0 -> no sign extra handling
    const float32x4_t vscaling = vdupq_n_f32(scaling/256.0);
    int32_t x[4];
    memset(x, 0, sizeof(x));
    unsigned long unrolled = nsamples / 4;
    while (unrolled--) {
#if __BYTE_ORDER == __BIG_ENDIAN /* ARM big endian?? */
        // right aligned / inverse sequence below -> *256
        memcpy(((char*)&x[0])+1, src, 3);
        memcpy(((char*)&x[1])+1, src+src_skip, 3);
        memcpy(((char*)&x[2])+1, src+2*src_skip, 3);
        memcpy(((char*)&x[3])+1, src+3*src_skip, 3);
#else
        memcpy(&x[0], src, 3);
        memcpy(&x[1], src+src_skip, 3);
        memcpy(&x[2], src+2*src_skip, 3);
        memcpy(&x[3], src+3*src_skip, 3);
#endif
        src += 4 * src_skip;

        int32x4_t source = vld1q_s32(x);
        source = vreinterpretq_s32_u8(vrev32q_u8(vreinterpretq_u8_s32(source)));
        float32x4_t converted = vcvtq_f32_s32(source);
        float32x4_t scaled = vmulq_f32(converted, vscaling);
        vst1q_f32(dst, scaled);
        dst += 4;
    }
    nsamples = nsamples & 3;
#endif

    /* ALERT: signed sign-extension portability !!! */

    while (nsamples--) {
        int x;
#if __BYTE_ORDER == __LITTLE_ENDIAN
        x = (unsigned char)(src[0]);
        x <<= 8;
        x |= (unsigned char)(src[1]);
        x <<= 8;
        x |= (unsigned char)(src[2]);
        /* correct sign bit and the rest of the top byte */
        if (src[0] & 0x80) {
            x |= 0xff << 24;
        }
#elif __BYTE_ORDER == __BIG_ENDIAN
        x = (unsigned char)(src[2]);
        x <<= 8;
        x |= (unsigned char)(src[1]);
        x <<= 8;
        x |= (unsigned char)(src[0]);
        /* correct sign bit and the rest of the top byte */
        if (src[2] & 0x80) {
            x |= 0xff << 24;
        }
#endif
        *dst = x * scaling;
        dst++;
        src += src_skip;
    }
}

void sample_move_dS_s24 (jack_default_audio_sample_t *dst, char *src, unsigned long nsamples, unsigned long src_skip)
{
    const jack_default_audio_sample_t scaling = 1.f/SAMPLE_24BIT_SCALING;

#if defined (__SSE2__) && !defined (__sun__)
    const __m128 scaling_block = _mm_set_ps1(scaling);
    while (nsamples >= 4) {
        int x0, x1, x2, x3;

        memcpy((char*)&x0 + 1, src, 3);
        memcpy((char*)&x1 + 1, src+src_skip, 3);
        memcpy((char*)&x2 + 1, src+2*src_skip, 3);
        memcpy((char*)&x3 + 1, src+3*src_skip, 3);
        src += 4 * src_skip;

        const __m128i block_i = _mm_set_epi32(x3, x2, x1, x0);
        const __m128i shifted = _mm_srai_epi32(block_i, 8);
        const __m128 converted = _mm_cvtepi32_ps (shifted);
        const __m128 scaled = _mm_mul_ps(converted, scaling_block);
        _mm_storeu_ps(dst, scaled);
        dst += 4;
        nsamples -= 4;
    }
#elif defined (__ARM_NEON__) || defined (__ARM_NEON)
    // we shift 8 to the right by dividing by 256.0 -> no sign extra handling
    const float32x4_t vscaling = vdupq_n_f32(scaling/256.0);
    int32_t x[4];
    memset(x, 0, sizeof(x));
    unsigned long unrolled = nsamples / 4;
    while (unrolled--) {
#if __BYTE_ORDER == __BIG_ENDIAN /* ARM big endian?? */
        // left aligned -> *256
        memcpy(&x[0], src, 3);
        memcpy(&x[1], src+src_skip, 3);
        memcpy(&x[2], src+2*src_skip, 3);
        memcpy(&x[3], src+3*src_skip, 3);
#else
        memcpy(((char*)&x[0])+1, src, 3);
        memcpy(((char*)&x[1])+1, src+src_skip, 3);
        memcpy(((char*)&x[2])+1, src+2*src_skip, 3);
        memcpy(((char*)&x[3])+1, src+3*src_skip, 3);
#endif
        src += 4 * src_skip;

        int32x4_t source = vld1q_s32(x);
        float32x4_t converted = vcvtq_f32_s32(source);
        float32x4_t scaled = vmulq_f32(converted, vscaling);
        vst1q_f32(dst, scaled);
        dst += 4;
    }
    nsamples = nsamples & 3;
#endif

    while (nsamples--) {
        int x;
#if __BYTE_ORDER == __LITTLE_ENDIAN
        memcpy((char*)&x + 1, src, 3);
#elif __BYTE_ORDER == __BIG_ENDIAN
        memcpy(&x, src, 3);
#endif
        x >>= 8;
        *dst = x * scaling;
        dst++;
        src += src_skip;
    }
}

void sample_move_d16_sSs (char *dst, jack_default_audio_sample_t *src, unsigned long nsamples, unsigned long dst_skip, dither_state_t *state)
{
#if defined (__ARM_NEON__) || defined (__ARM_NEON)
    unsigned long unrolled = nsamples / 4;
    nsamples = nsamples & 3;

    while (unrolled--) {
        float32x4_t samples = vld1q_f32(src);
        int16x4_t converted = float_16_neon(samples);
        converted = vreinterpret_s16_u8(vrev16_u8(vreinterpret_u8_s16(converted)));

        switch(dst_skip) {
            case 2:
                vst1_s16((int16_t*)dst, converted);
                break;
            default:
                vst1_lane_s16((int16_t*)(dst), converted, 0);
                vst1_lane_s16((int16_t*)(dst+dst_skip), converted, 1);
                vst1_lane_s16((int16_t*)(dst+2*dst_skip), converted, 2);
                vst1_lane_s16((int16_t*)(dst+3*dst_skip), converted, 3);
                break;
        }
        dst += 4*dst_skip;
        src+= 4;
    }
#endif
    int16_t tmp;

    while (nsamples--) {
        // float_16 (*src, tmp);

        if (*src <= NORMALIZED_FLOAT_MIN) {
            tmp = SAMPLE_16BIT_MIN;
        } else if (*src >= NORMALIZED_FLOAT_MAX) {
            tmp = SAMPLE_16BIT_MAX;
        } else {
            tmp = (int16_t) f_round (*src * SAMPLE_16BIT_SCALING);
        }

#if __BYTE_ORDER == __LITTLE_ENDIAN
        dst[0]=(char)(tmp>>8);
        dst[1]=(char)(tmp);
#elif __BYTE_ORDER == __BIG_ENDIAN
        dst[0]=(char)(tmp);
        dst[1]=(char)(tmp>>8);
#endif
        dst += dst_skip;
        src++;
    }
}

void sample_move_d16_sS (char *dst, jack_default_audio_sample_t *src, unsigned long nsamples, unsigned long dst_skip, dither_state_t *state)
{
#if defined (__ARM_NEON__) || defined (__ARM_NEON)
    unsigned long unrolled = nsamples / 4;
    nsamples = nsamples & 3;

    while (unrolled--) {
        float32x4_t samples = vld1q_f32(src);
        int16x4_t converted = float_16_neon(samples);

        switch(dst_skip) {
            case 2:
                vst1_s16((int16_t*)dst, converted);
                break;
            default:
                vst1_lane_s16((int16_t*)(dst), converted, 0);
                vst1_lane_s16((int16_t*)(dst+dst_skip), converted, 1);
                vst1_lane_s16((int16_t*)(dst+2*dst_skip), converted, 2);
                vst1_lane_s16((int16_t*)(dst+3*dst_skip), converted, 3);
                break;
        }
        dst += 4*dst_skip;
        src+= 4;
    }
#endif
    while (nsamples--) {
        float_16 (*src, *((int16_t*) dst));
        dst += dst_skip;
        src++;
    }
}

void sample_move_dither_rect_d16_sSs (char *dst, jack_default_audio_sample_t *src, unsigned long nsamples, unsigned long dst_skip, dither_state_t *state)
{
    jack_default_audio_sample_t val;
    int16_t tmp;

    while (nsamples--) {
        val = (*src * SAMPLE_16BIT_SCALING) + fast_rand() / (float) UINT_MAX - 0.5f;
        float_16_scaled (val, tmp);
#if __BYTE_ORDER == __LITTLE_ENDIAN
        dst[0]=(char)(tmp>>8);
        dst[1]=(char)(tmp);
#elif __BYTE_ORDER == __BIG_ENDIAN
        dst[0]=(char)(tmp);
        dst[1]=(char)(tmp>>8);
#endif
        dst += dst_skip;
        src++;
    }
}

void sample_move_dither_rect_d16_sS (char *dst, jack_default_audio_sample_t *src, unsigned long nsamples, unsigned long dst_skip, dither_state_t *state)
{
    jack_default_audio_sample_t val;

    while (nsamples--) {
        val = (*src * SAMPLE_16BIT_SCALING) + fast_rand() / (float)UINT_MAX - 0.5f;
        float_16_scaled (val, *((int16_t*) dst));
        dst += dst_skip;
        src++;
    }
}

void sample_move_dither_tri_d16_sSs (char *dst, jack_default_audio_sample_t *src, unsigned long nsamples, unsigned long dst_skip, dither_state_t *state)
{
    jack_default_audio_sample_t val;
    int16_t tmp;

    while (nsamples--) {
        val = (*src * SAMPLE_16BIT_SCALING) + ((float)fast_rand() + (float)fast_rand()) / (float)UINT_MAX - 1.0f;
        float_16_scaled (val, tmp);

#if __BYTE_ORDER == __LITTLE_ENDIAN
        dst[0]=(char)(tmp>>8);
        dst[1]=(char)(tmp);
#elif __BYTE_ORDER == __BIG_ENDIAN
        dst[0]=(char)(tmp);
        dst[1]=(char)(tmp>>8);
#endif
        dst += dst_skip;
        src++;
    }
}

void sample_move_dither_tri_d16_sS (char *dst, jack_default_audio_sample_t *src, unsigned long nsamples, unsigned long dst_skip, dither_state_t *state)
{
    jack_default_audio_sample_t val;

    while (nsamples--) {
        val = (*src * SAMPLE_16BIT_SCALING) + ((float)fast_rand() + (float)fast_rand()) / (float)UINT_MAX - 1.0f;
        float_16_scaled (val, *((int16_t*) dst));
        dst += dst_skip;
        src++;
    }
}

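/* The shaped-dither converters below use an error-feedback structure: the
   error made on each sample is kept in state->e[] and fed back through
   Lipshitz's minimally audible FIR before the next sample is quantized,
   while the triangular dither r is high-passed as (r - rm1). In outline,
   using the variable names that appear in the functions below:

       xe = x - (e filtered with {2.033, -2.165, 1.959, -1.590, 0.6149})
       xp = xe + r - rm1
       e[idx] = xp - xe   (the native-endian variant stores quantized output - xe)

   which pushes the requantization noise towards frequencies where it is
   least audible. */
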
void sample_move_dither_shaped_d16_sSs (char *dst, jack_default_audio_sample_t *src, unsigned long nsamples, unsigned long dst_skip, dither_state_t *state)
{
    jack_default_audio_sample_t x;
    jack_default_audio_sample_t xe; /* the input sample - filtered error */
    jack_default_audio_sample_t xp; /* x' */
    float r;
    float rm1 = state->rm1;
    unsigned int idx = state->idx;
    int16_t tmp;

    while (nsamples--) {
        x = *src * SAMPLE_16BIT_SCALING;
        r = ((float)fast_rand() + (float)fast_rand()) / (float)UINT_MAX - 1.0f;
        /* Filter the error with Lipshitz's minimally audible FIR:
           [2.033 -2.165 1.959 -1.590 0.6149] */
        xe = x
             - state->e[idx] * 2.033f
             + state->e[(idx - 1) & DITHER_BUF_MASK] * 2.165f
             - state->e[(idx - 2) & DITHER_BUF_MASK] * 1.959f
             + state->e[(idx - 3) & DITHER_BUF_MASK] * 1.590f
             - state->e[(idx - 4) & DITHER_BUF_MASK] * 0.6149f;
        xp = xe + r - rm1;
        rm1 = r;

        float_16_scaled (xp, tmp);

        /* Intrinsic z^-1 delay */
        idx = (idx + 1) & DITHER_BUF_MASK;
        state->e[idx] = xp - xe;

#if __BYTE_ORDER == __LITTLE_ENDIAN
        dst[0]=(char)(tmp>>8);
        dst[1]=(char)(tmp);
#elif __BYTE_ORDER == __BIG_ENDIAN
        dst[0]=(char)(tmp);
        dst[1]=(char)(tmp>>8);
#endif
        dst += dst_skip;
        src++;
    }
    state->rm1 = rm1;
    state->idx = idx;
}

void sample_move_dither_shaped_d16_sS (char *dst, jack_default_audio_sample_t *src, unsigned long nsamples, unsigned long dst_skip, dither_state_t *state)
{
    jack_default_audio_sample_t x;
    jack_default_audio_sample_t xe; /* the input sample - filtered error */
    jack_default_audio_sample_t xp; /* x' */
    float r;
    float rm1 = state->rm1;
    unsigned int idx = state->idx;

    while (nsamples--) {
        x = *src * SAMPLE_16BIT_SCALING;
        r = ((float)fast_rand() + (float)fast_rand()) / (float)UINT_MAX - 1.0f;
        /* Filter the error with Lipshitz's minimally audible FIR:
           [2.033 -2.165 1.959 -1.590 0.6149] */
        xe = x
             - state->e[idx] * 2.033f
             + state->e[(idx - 1) & DITHER_BUF_MASK] * 2.165f
             - state->e[(idx - 2) & DITHER_BUF_MASK] * 1.959f
             + state->e[(idx - 3) & DITHER_BUF_MASK] * 1.590f
             - state->e[(idx - 4) & DITHER_BUF_MASK] * 0.6149f;
        xp = xe + r - rm1;
        rm1 = r;

        float_16_scaled (xp, *((int16_t*) dst));

        /* Intrinsic z^-1 delay */
        idx = (idx + 1) & DITHER_BUF_MASK;
        state->e[idx] = *((int16_t*) dst) - xe;

        dst += dst_skip;
        src++;
    }
    state->rm1 = rm1;
    state->idx = idx;
}

void sample_move_dS_s16s (jack_default_audio_sample_t *dst, char *src, unsigned long nsamples, unsigned long src_skip)
{
    short z;
    const jack_default_audio_sample_t scaling = 1.0/SAMPLE_16BIT_SCALING;
#if defined (__ARM_NEON__) || defined (__ARM_NEON)
    const float32x4_t vscaling = vdupq_n_f32(scaling);
    unsigned long unrolled = nsamples / 4;
    while (unrolled--) {
        int16x4_t source16x4;
        switch(src_skip) {
            case 2:
                source16x4 = vld1_s16((int16_t*)src);
                break;
            case 4:
                source16x4 = vld2_s16((int16_t*)src).val[0];
                break;
            default:
                source16x4 = vld1_lane_s16((int16_t*)src, source16x4, 0);
                source16x4 = vld1_lane_s16((int16_t*)(src+src_skip), source16x4, 1);
                source16x4 = vld1_lane_s16((int16_t*)(src+2*src_skip), source16x4, 2);
                source16x4 = vld1_lane_s16((int16_t*)(src+3*src_skip), source16x4, 3);
                break;
        }
        source16x4 = vreinterpret_s16_u8(vrev16_u8(vreinterpret_u8_s16(source16x4)));
        int32x4_t source32x4 = vmovl_s16(source16x4);
        src += 4 * src_skip;

        float32x4_t converted = vcvtq_f32_s32(source32x4);
        float32x4_t scaled = vmulq_f32(converted, vscaling);
        vst1q_f32(dst, scaled);
        dst += 4;
    }
    nsamples = nsamples & 3;
#endif

    /* ALERT: signed sign-extension portability !!! */
    while (nsamples--) {
#if __BYTE_ORDER == __LITTLE_ENDIAN
        z = (unsigned char)(src[0]);
        z <<= 8;
        z |= (unsigned char)(src[1]);
#elif __BYTE_ORDER == __BIG_ENDIAN
        z = (unsigned char)(src[1]);
        z <<= 8;
        z |= (unsigned char)(src[0]);
#endif
        *dst = z * scaling;
        dst++;
        src += src_skip;
    }
}

void sample_move_dS_s16 (jack_default_audio_sample_t *dst, char *src, unsigned long nsamples, unsigned long src_skip)
{
    /* ALERT: signed sign-extension portability !!! */
    const jack_default_audio_sample_t scaling = 1.0/SAMPLE_16BIT_SCALING;
#if defined (__ARM_NEON__) || defined (__ARM_NEON)
    const float32x4_t vscaling = vdupq_n_f32(scaling);
    unsigned long unrolled = nsamples / 4;
    while (unrolled--) {
        int16x4_t source16x4;
        switch(src_skip) {
            case 2:
                source16x4 = vld1_s16((int16_t*)src);
                break;
            case 4:
                source16x4 = vld2_s16((int16_t*)src).val[0];
                break;
            default:
                source16x4 = vld1_lane_s16((int16_t*)src, source16x4, 0);
                source16x4 = vld1_lane_s16((int16_t*)(src+src_skip), source16x4, 1);
                source16x4 = vld1_lane_s16((int16_t*)(src+2*src_skip), source16x4, 2);
                source16x4 = vld1_lane_s16((int16_t*)(src+3*src_skip), source16x4, 3);
                break;
        }
        int32x4_t source32x4 = vmovl_s16(source16x4);
        src += 4 * src_skip;

        float32x4_t converted = vcvtq_f32_s32(source32x4);
        float32x4_t scaled = vmulq_f32(converted, vscaling);
        vst1q_f32(dst, scaled);
        dst += 4;
    }
    nsamples = nsamples & 3;
#endif

    while (nsamples--) {
        *dst = (*((short *) src)) * scaling;
        dst++;
        src += src_skip;
    }
}

void memset_interleave (char *dst, char val, unsigned long bytes,
                        unsigned long unit_bytes,
                        unsigned long skip_bytes)
{
    switch (unit_bytes) {
    case 1:
        while (bytes--) {
            *dst = val;
            dst += skip_bytes;
        }
        break;
    case 2:
        while (bytes) {
            *((short *) dst) = (short) val;
            dst += skip_bytes;
            bytes -= 2;
        }
        break;
    case 4:
        while (bytes) {
            *((int *) dst) = (int) val;
            dst += skip_bytes;
            bytes -= 4;
        }
        break;
    default:
        while (bytes) {
            memset(dst, val, unit_bytes);
            dst += skip_bytes;
            bytes -= unit_bytes;
        }
        break;
    }
}

/* COPY FUNCTIONS: used to move data from an input channel to an
   output channel. Note that we assume that the skip distance
   is the same for both channels. This is completely fine
   unless the input and output were on different audio interfaces that
   were interleaved differently. We don't try to handle that.
*/

void
memcpy_fake (char *dst, char *src, unsigned long src_bytes, unsigned long foo, unsigned long bar)
{
    memcpy (dst, src, src_bytes);
}

void
memcpy_interleave_d16_s16 (char *dst, char *src, unsigned long src_bytes,
                           unsigned long dst_skip_bytes, unsigned long src_skip_bytes)
{
    while (src_bytes) {
        *((short *) dst) = *((short *) src);
        dst += dst_skip_bytes;
        src += src_skip_bytes;
        src_bytes -= 2;
    }
}

void
memcpy_interleave_d24_s24 (char *dst, char *src, unsigned long src_bytes,
                           unsigned long dst_skip_bytes, unsigned long src_skip_bytes)
{
    while (src_bytes) {
        memcpy(dst, src, 3);
        dst += dst_skip_bytes;
        src += src_skip_bytes;
        src_bytes -= 3;
    }
}

void
memcpy_interleave_d32_s32 (char *dst, char *src, unsigned long src_bytes,
                           unsigned long dst_skip_bytes, unsigned long src_skip_bytes)
{
    while (src_bytes) {
        *((int *) dst) = *((int *) src);
        dst += dst_skip_bytes;
        src += src_skip_bytes;
        src_bytes -= 4;
    }
}
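
/* A usage sketch for the interleaved copy helpers (hypothetical monitoring
   code, not part of this file's API): mirroring one channel of a 2-channel
   interleaved 16-bit capture stream into the matching playback stream.
   "playback" and "capture" are illustrative names; src_bytes counts only the
   bytes actually copied (2 per frame), while both skips are the frame size
   in bytes. */
#if 0
static void example_monitor_channel_s16 (char *playback, char *capture,
                                         unsigned long nframes)
{
    memcpy_interleave_d16_s16 (playback, capture, nframes * 2, 4, 4);
}
#endif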