source/modules/ysfx/thirdparty/dr_libs/dr_flac.h

   1 /*
   2 FLAC audio decoder. Choice of public domain or MIT-0. See license statements at the end of this file.
   3 dr_flac - v0.12.31 - 2021-08-16
   4
   5 David Reid - mackron@gmail.com
   6
   7 GitHub: https://github.com/mackron/dr_libs
   8 */
   9
  10 /*
  11 RELEASE NOTES - v0.12.0
  12 =======================
  13 Version 0.12.0 has breaking API changes including changes to the existing API and the removal of deprecated APIs.
  14
  15
  16 Improved Client-Defined Memory Allocation
  17 -----------------------------------------
  18 The main change with this release is the addition of a more flexible way of implementing custom memory allocation routines. The
  19 existing system of DRFLAC_MALLOC, DRFLAC_REALLOC and DRFLAC_FREE is still in place and will be used by default when no custom
  20 allocation callbacks are specified.
  21
  22 To use the new system, you pass in a pointer to a drflac_allocation_callbacks object to drflac_open() and family, like this:
  23
  24     void* my_malloc(size_t sz, void* pUserData)
  25     {
  26         return malloc(sz);
  27     }
  28     void* my_realloc(void* p, size_t sz, void* pUserData)
  29     {
  30         return realloc(p, sz);
  31     }
  32     void my_free(void* p, void* pUserData)
  33     {
  34         free(p);
  35     }
  36
  37     ...
  38
  39     drflac_allocation_callbacks allocationCallbacks;
  40     allocationCallbacks.pUserData = &myData;
  41     allocationCallbacks.onMalloc  = my_malloc;
  42     allocationCallbacks.onRealloc = my_realloc;
  43     allocationCallbacks.onFree    = my_free;
  44     drflac* pFlac = drflac_open_file("my_file.flac", &allocationCallbacks);
  45
  46 The advantage of this new system is that it allows you to specify user data which will be passed in to the allocation routines.
  47
  48 Passing in null for the allocation callbacks object will cause dr_flac to use defaults which is the same as DRFLAC_MALLOC,
  49 DRFLAC_REALLOC and DRFLAC_FREE and the equivalent of how it worked in previous versions.
  50
  51 Every API that opens a drflac object now takes this extra parameter. These include the following:
  52
  53     drflac_open()
  54     drflac_open_relaxed()
  55     drflac_open_with_metadata()
  56     drflac_open_with_metadata_relaxed()
  57     drflac_open_file()
  58     drflac_open_file_with_metadata()
  59     drflac_open_memory()
  60     drflac_open_memory_with_metadata()
  61     drflac_open_and_read_pcm_frames_s32()
  62     drflac_open_and_read_pcm_frames_s16()
  63     drflac_open_and_read_pcm_frames_f32()
  64     drflac_open_file_and_read_pcm_frames_s32()
  65     drflac_open_file_and_read_pcm_frames_s16()
  66     drflac_open_file_and_read_pcm_frames_f32()
  67     drflac_open_memory_and_read_pcm_frames_s32()
  68     drflac_open_memory_and_read_pcm_frames_s16()
  69     drflac_open_memory_and_read_pcm_frames_f32()
  70
  71
  72
  73 Optimizations
  74 -------------
  75 Seeking performance has been greatly improved. A new binary search based seeking algorithm has been introduced which significantly
  76 improves performance over the brute force method which was used when no seek table was present. Seek table based seeking also takes
  77 advantage of the new binary search seeking system to further improve performance there as well. Note that this depends on CRC which
  78 means it will be disabled when DR_FLAC_NO_CRC is used.
  79
  80 The SSE4.1 pipeline has been cleaned up and optimized. You should see some improvements with decoding speed of 24-bit files in
  81 particular. 16-bit streams should also see some improvement.
  82
  83 drflac_read_pcm_frames_s16() has been optimized. Previously this sat on top of drflac_read_pcm_frames_s32() and performed its s32
  84 to s16 conversion in a second pass. This is now all done in a single pass. This includes SSE2 and ARM NEON optimized paths.
  85
  86 A minor optimization has been implemented for drflac_read_pcm_frames_s32(). This will now use an SSE2 optimized pipeline for stereo
  87 channel reconstruction which is the last part of the decoding process.
  88
  89 The ARM build has seen a few improvements. The CLZ (count leading zeroes) and REV (byte swap) instructions are now used when
  90 compiling with GCC and Clang which is achieved using inline assembly. The CLZ instruction requires ARM architecture version 5 at
  91 compile time and the REV instruction requires ARM architecture version 6.
  92
  93 An ARM NEON optimized pipeline has been implemented. To enable this you'll need to add -mfpu=neon to the command line when compiling.
  94
  95
  96 Removed APIs
  97 ------------
  98 The following APIs were deprecated in version 0.11.0 and have been completely removed in version 0.12.0:
  99
 100     drflac_read_s32()                   -> drflac_read_pcm_frames_s32()
 101     drflac_read_s16()                   -> drflac_read_pcm_frames_s16()
 102     drflac_read_f32()                   -> drflac_read_pcm_frames_f32()
 103     drflac_seek_to_sample()             -> drflac_seek_to_pcm_frame()
 104     drflac_open_and_decode_s32()        -> drflac_open_and_read_pcm_frames_s32()
 105     drflac_open_and_decode_s16()        -> drflac_open_and_read_pcm_frames_s16()
 106     drflac_open_and_decode_f32()        -> drflac_open_and_read_pcm_frames_f32()
 107     drflac_open_and_decode_file_s32()   -> drflac_open_file_and_read_pcm_frames_s32()
 108     drflac_open_and_decode_file_s16()   -> drflac_open_file_and_read_pcm_frames_s16()
 109     drflac_open_and_decode_file_f32()   -> drflac_open_file_and_read_pcm_frames_f32()
 110     drflac_open_and_decode_memory_s32() -> drflac_open_memory_and_read_pcm_frames_s32()
 111     drflac_open_and_decode_memory_s16() -> drflac_open_memory_and_read_pcm_frames_s16()
 112     drflac_open_and_decode_memory_f32() -> drflac_open_memory_and_read_pcm_frames_f32()
 113
 114 Prior versions of dr_flac operated on a per-sample basis whereas now it operates on PCM frames. The removed APIs all relate
 115 to the old per-sample APIs. You now need to use the "pcm_frame" versions.
 116 */
 117
 118
 119 /*
 120 Introduction
 121 ============
 122 dr_flac is a single file library. To use it, do something like the following in one .c file.
 123
 124     ```c
 125     #define DR_FLAC_IMPLEMENTATION
 126     #include "dr_flac.h"
 127     ```
 128
 129 You can then #include this file in other parts of the program as you would with any other header file. To decode audio data, do something like the following:
 130
 131     ```c
 132     drflac* pFlac = drflac_open_file("MySong.flac", NULL);
 133     if (pFlac == NULL) {
 134         // Failed to open FLAC file
 135     }
 136
 137     drflac_int32* pSamples = malloc(pFlac->totalPCMFrameCount * pFlac->channels * sizeof(drflac_int32));
 138     drflac_uint64 numberOfPCMFramesActuallyRead = drflac_read_pcm_frames_s32(pFlac, pFlac->totalPCMFrameCount, pSamples);
 139     ```
 140
 141 The drflac object represents the decoder. It is a transparent type so all the information you need, such as the number of channels and the bits per sample,
 142 should be directly accessible - just make sure you don't change their values. Samples are always output as interleaved signed 32-bit PCM. In the example above
 143 a native FLAC stream was opened, however dr_flac has seamless support for Ogg encapsulated FLAC streams as well.
 144
 145 You do not need to decode the entire stream in one go - you just specify how many samples you'd like at any given time and the decoder will give you as many
 146 samples as it can, up to the amount requested. Later on when you need the next batch of samples, just call it again. Example:
 147
 148     ```c
 149     while (drflac_read_pcm_frames_s32(pFlac, chunkSizeInPCMFrames, pChunkSamples) > 0) {
 150         do_something();
 151     }
 152     ```
 153
 154 You can seek to a specific PCM frame with `drflac_seek_to_pcm_frame()`.
 155
 156 If you just want to quickly decode an entire FLAC file in one go you can do something like this:
 157
 158     ```c
 159     unsigned int channels;
 160     unsigned int sampleRate;
 161     drflac_uint64 totalPCMFrameCount;
 162     drflac_int32* pSampleData = drflac_open_file_and_read_pcm_frames_s32("MySong.flac", &channels, &sampleRate, &totalPCMFrameCount, NULL);
 163     if (pSampleData == NULL) {
 164         // Failed to open and decode FLAC file.
 165     }
 166
 167     ...
 168
 169     drflac_free(pSampleData, NULL);
 170     ```
 171
 172 You can read samples as signed 16-bit integer and 32-bit floating-point PCM with the *_s16() and *_f32() family of APIs respectively, but note that these
 173 should be considered lossy.
 174
 175
 176 If you need access to metadata (album art, etc.), use `drflac_open_with_metadata()`, `drflac_open_file_with_metadata()` or `drflac_open_memory_with_metadata()`.
 177 The rationale for keeping these APIs separate is that they're slightly slower than the normal versions and also just a little bit harder to use. dr_flac
 178 reports metadata to the application through the use of a callback, and every metadata block is reported before `drflac_open_with_metadata()` returns.
 179
 180 The main opening APIs (`drflac_open()`, etc.) will fail if the header is not present. This presents a problem in certain scenarios such as broadcast style
 181 streams or internet radio where the header may not be present because the user has started playback mid-stream. To handle this, use the relaxed APIs:
 182
 183     `drflac_open_relaxed()`
 184     `drflac_open_with_metadata_relaxed()`
 185
 186 It is not recommended to use these APIs for file based streams because a missing header would usually indicate a corrupt or perverse file. In addition, these
 187 APIs can take a long time to initialize because they may need to spend a lot of time finding the first frame.
 188
 189
 190
 191 Build Options
 192 =============
 193 #define these options before including this file.
 194
 195 #define DR_FLAC_NO_STDIO
 196   Disable `drflac_open_file()` and family.
 197
 198 #define DR_FLAC_NO_OGG
 199   Disables support for Ogg/FLAC streams.
 200
 201 #define DR_FLAC_BUFFER_SIZE <number>
 202   Defines the size of the internal buffer to store data from onRead(). This buffer is used to reduce the number of calls back to the client for more data.
 203   Larger values mean more memory, but better performance. My tests show diminishing returns after about 4KB (which is the default). Consider reducing this if
 204   you have a very efficient implementation of onRead(), or increase it if it's very inefficient. Must be a multiple of 8.
 205
 206 #define DR_FLAC_NO_CRC
 207   Disables CRC checks. This will offer a performance boost when CRC is unnecessary. This will disable binary search seeking. When seeking, the seek table will
 208   be used if available. Otherwise the seek will be performed using brute force.
 209
 210 #define DR_FLAC_NO_SIMD
 211   Disables SIMD optimizations (SSE on x86/x64 architectures, NEON on ARM architectures). Use this if you are having compatibility issues with your compiler.
 212
 213
 214
 215 Notes
 216 =====
 217 - dr_flac does not support changing the sample rate nor channel count mid stream.
 218 - dr_flac is not thread-safe, but its APIs can be called from any thread so long as you do your own synchronization.
 219 - When using Ogg encapsulation, a corrupted metadata block will result in `drflac_open_with_metadata()` and `drflac_open()` returning inconsistent samples due
 220   to differences in corrupted stream recovery logic between the two APIs.
 221 */
 222
 223 #ifndef dr_flac_h
 224 #define dr_flac_h
 225
 226 #ifdef __cplusplus
 227 extern "C" {
 228 #endif
 229
 230 #define DRFLAC_STRINGIFY(x)      #x
 231 #define DRFLAC_XSTRINGIFY(x)     DRFLAC_STRINGIFY(x)
 232
 233 #define DRFLAC_VERSION_MAJOR     0
 234 #define DRFLAC_VERSION_MINOR     12
 235 #define DRFLAC_VERSION_REVISION  31
 236 #define DRFLAC_VERSION_STRING    DRFLAC_XSTRINGIFY(DRFLAC_VERSION_MAJOR) "." DRFLAC_XSTRINGIFY(DRFLAC_VERSION_MINOR) "." DRFLAC_XSTRINGIFY(DRFLAC_VERSION_REVISION)
 237
 238 #include <stddef.h> /* For size_t. */
 239
 240 /* Sized types. */
 241 typedef   signed char           drflac_int8;
 242 typedef unsigned char           drflac_uint8;
 243 typedef   signed short          drflac_int16;
 244 typedef unsigned short          drflac_uint16;
 245 typedef   signed int            drflac_int32;
 246 typedef unsigned int            drflac_uint32;
 247 #if defined(_MSC_VER)
 248     typedef   signed __int64    drflac_int64;
 249     typedef unsigned __int64    drflac_uint64;
 250 #else
 251     #if defined(__clang__) || (defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)))
 252         #pragma GCC diagnostic push
 253         #pragma GCC diagnostic ignored "-Wlong-long"
 254         #if defined(__clang__)
 255             #pragma GCC diagnostic ignored "-Wc++11-long-long"
 256         #endif
 257     #endif
 258     typedef   signed long long  drflac_int64;
 259     typedef unsigned long long  drflac_uint64;
 260     #if defined(__clang__) || (defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)))
 261         #pragma GCC diagnostic pop
 262     #endif
 263 #endif
 264 #if defined(__LP64__) || defined(_WIN64) || (defined(__x86_64__) && !defined(__ILP32__)) || defined(_M_X64) || defined(__ia64) || defined (_M_IA64) || defined(__aarch64__) || defined(_M_ARM64) || defined(__powerpc64__)
 265     typedef drflac_uint64       drflac_uintptr;
 266 #else
 267     typedef drflac_uint32       drflac_uintptr;
 268 #endif
 269 typedef drflac_uint8            drflac_bool8;
 270 typedef drflac_uint32           drflac_bool32;
 271 #define DRFLAC_TRUE             1
 272 #define DRFLAC_FALSE            0
 273
 274 #if !defined(DRFLAC_API)
 275     #if defined(DRFLAC_DLL)
 276         #if defined(_WIN32)
 277             #define DRFLAC_DLL_IMPORT  __declspec(dllimport)
 278             #define DRFLAC_DLL_EXPORT  __declspec(dllexport)
 279             #define DRFLAC_DLL_PRIVATE static
 280         #else
 281             #if defined(__GNUC__) && __GNUC__ >= 4
 282                 #define DRFLAC_DLL_IMPORT  __attribute__((visibility("default")))
 283                 #define DRFLAC_DLL_EXPORT  __attribute__((visibility("default")))
 284                 #define DRFLAC_DLL_PRIVATE __attribute__((visibility("hidden")))
 285             #else
 286                 #define DRFLAC_DLL_IMPORT
 287                 #define DRFLAC_DLL_EXPORT
 288                 #define DRFLAC_DLL_PRIVATE static
 289             #endif
 290         #endif
 291
 292         #if defined(DR_FLAC_IMPLEMENTATION) || defined(DRFLAC_IMPLEMENTATION)
 293             #define DRFLAC_API  DRFLAC_DLL_EXPORT
 294         #else
 295             #define DRFLAC_API  DRFLAC_DLL_IMPORT
 296         #endif
 297         #define DRFLAC_PRIVATE DRFLAC_DLL_PRIVATE
 298     #else
 299         #define DRFLAC_API extern
 300         #define DRFLAC_PRIVATE static
 301     #endif
 302 #endif
 303
 304 #if defined(_MSC_VER) && _MSC_VER >= 1700   /* Visual Studio 2012 */
 305     #define DRFLAC_DEPRECATED       __declspec(deprecated)
 306 #elif (defined(__GNUC__) && __GNUC__ >= 4)  /* GCC 4 */
 307     #define DRFLAC_DEPRECATED       __attribute__((deprecated))
 308 #elif defined(__has_feature)                /* Clang */
 309     #if __has_feature(attribute_deprecated)
 310         #define DRFLAC_DEPRECATED   __attribute__((deprecated))
 311     #else
 312         #define DRFLAC_DEPRECATED
 313     #endif
 314 #else
 315     #define DRFLAC_DEPRECATED
 316 #endif
 317
 318 DRFLAC_API void drflac_version(drflac_uint32* pMajor, drflac_uint32* pMinor, drflac_uint32* pRevision);
 319 DRFLAC_API const char* drflac_version_string(void);
 320
 321 /*
 322 As data is read from the client it is placed into an internal buffer for fast access. This controls the size of that buffer. Larger values mean more speed,
 323 but also more memory. In my testing there are diminishing returns after about 4KB, but you can fiddle with this to suit your own needs. Must be a multiple of 8.
 324 */
 325 #ifndef DR_FLAC_BUFFER_SIZE
 326 #define DR_FLAC_BUFFER_SIZE   4096
 327 #endif
 328
 329 /* Check if we can enable 64-bit optimizations. */
 330 #if defined(_WIN64) || defined(_LP64) || defined(__LP64__)
 331 #define DRFLAC_64BIT
 332 #endif
 333
 334 #ifdef DRFLAC_64BIT
 335 typedef drflac_uint64 drflac_cache_t;
 336 #else
 337 typedef drflac_uint32 drflac_cache_t;
 338 #endif
 339
 340 /* The various metadata block types. */
 341 #define DRFLAC_METADATA_BLOCK_TYPE_STREAMINFO       0
 342 #define DRFLAC_METADATA_BLOCK_TYPE_PADDING          1
 343 #define DRFLAC_METADATA_BLOCK_TYPE_APPLICATION      2
 344 #define DRFLAC_METADATA_BLOCK_TYPE_SEEKTABLE        3
 345 #define DRFLAC_METADATA_BLOCK_TYPE_VORBIS_COMMENT   4
 346 #define DRFLAC_METADATA_BLOCK_TYPE_CUESHEET         5
 347 #define DRFLAC_METADATA_BLOCK_TYPE_PICTURE          6
 348 #define DRFLAC_METADATA_BLOCK_TYPE_INVALID          127
 349
 350 /* The various picture types specified in the PICTURE block. */
 351 #define DRFLAC_PICTURE_TYPE_OTHER                   0
 352 #define DRFLAC_PICTURE_TYPE_FILE_ICON               1
 353 #define DRFLAC_PICTURE_TYPE_OTHER_FILE_ICON         2
 354 #define DRFLAC_PICTURE_TYPE_COVER_FRONT             3
 355 #define DRFLAC_PICTURE_TYPE_COVER_BACK              4
 356 #define DRFLAC_PICTURE_TYPE_LEAFLET_PAGE            5
 357 #define DRFLAC_PICTURE_TYPE_MEDIA                   6
 358 #define DRFLAC_PICTURE_TYPE_LEAD_ARTIST             7
 359 #define DRFLAC_PICTURE_TYPE_ARTIST                  8
 360 #define DRFLAC_PICTURE_TYPE_CONDUCTOR               9
 361 #define DRFLAC_PICTURE_TYPE_BAND                    10
 362 #define DRFLAC_PICTURE_TYPE_COMPOSER                11
 363 #define DRFLAC_PICTURE_TYPE_LYRICIST                12
 364 #define DRFLAC_PICTURE_TYPE_RECORDING_LOCATION      13
 365 #define DRFLAC_PICTURE_TYPE_DURING_RECORDING        14
 366 #define DRFLAC_PICTURE_TYPE_DURING_PERFORMANCE      15
 367 #define DRFLAC_PICTURE_TYPE_SCREEN_CAPTURE          16
 368 #define DRFLAC_PICTURE_TYPE_BRIGHT_COLORED_FISH     17
 369 #define DRFLAC_PICTURE_TYPE_ILLUSTRATION            18
 370 #define DRFLAC_PICTURE_TYPE_BAND_LOGOTYPE           19
 371 #define DRFLAC_PICTURE_TYPE_PUBLISHER_LOGOTYPE      20
 372
 373 typedef enum   /* Identifies the container format the FLAC stream is wrapped in. */
 374 {
 375     drflac_container_native,    /* A native FLAC stream. */
 376     drflac_container_ogg,       /* An Ogg encapsulated FLAC stream. */
 377     drflac_container_unknown    /* NOTE(review): presumably "not yet determined / could not be determined" - confirm against the implementation. */
 378 } drflac_container;
 379
 380 typedef enum   /* The reference point for the offset passed to a drflac_seek_proc callback. */
 381 {
 382     drflac_seek_origin_start,   /* The offset is relative to the start of the stream. */
 383     drflac_seek_origin_current  /* The offset is relative to the current position. */
 384 } drflac_seek_origin;
 385
 386 /* Packing is important on this structure because we map this directly to the raw data within the SEEKTABLE metadata block. The 2-byte packing keeps the compiler from padding the struct out to the alignment of its 64-bit members. */
 387 #pragma pack(2)
 388 typedef struct
 389 {
 390     drflac_uint64 firstPCMFrame;     /* NOTE(review): presumably the index of the first PCM frame in the target FLAC frame - confirm. */
 391     drflac_uint64 flacFrameOffset;   /* The offset from the first byte of the header of the first frame. */
 392     drflac_uint16 pcmFrameCount;     /* NOTE(review): presumably the number of PCM frames in the target FLAC frame - confirm. */
 393 } drflac_seekpoint;
 394 #pragma pack()
 395
 396 typedef struct   /* The fields of a STREAMINFO metadata block (see the streaminfo member of drflac_metadata's data union). */
 397 {
 398     drflac_uint16 minBlockSizeInPCMFrames;
 399     drflac_uint16 maxBlockSizeInPCMFrames;
 400     drflac_uint32 minFrameSizeInPCMFrames;   /* NOTE(review): the FLAC format defines the STREAMINFO min/max frame size in bytes, so the "InPCMFrames" suffix looks like a misnomer - confirm before relying on the unit. */
 401     drflac_uint32 maxFrameSizeInPCMFrames;   /* NOTE(review): same unit caveat as minFrameSizeInPCMFrames. */
 402     drflac_uint32 sampleRate;
 403     drflac_uint8  channels;
 404     drflac_uint8  bitsPerSample;
 405     drflac_uint64 totalPCMFrameCount;
 406     drflac_uint8  md5[16];
 407 } drflac_streaminfo;
 408
 409 typedef struct   /* A single metadata block, as handed to the drflac_meta_proc callback. */
 410 {
 411     /*
 412     The metadata type. Use this to know how to interpret the data below. Will be set to one of the
 413     DRFLAC_METADATA_BLOCK_TYPE_* tokens.
 414     */
 415     drflac_uint32 type;
 416
 417     /*
 418     A pointer to the raw data. This points to a temporary buffer so don't hold on to it. It's best to
 419     not modify the contents of this buffer. Use the structures below for more meaningful and structured
 420     information about the metadata. It's possible for this to be null.
 421     */
 422     const void* pRawData;
 423
 424     /* The size in bytes of the block and the buffer pointed to by pRawData if it's non-NULL. */
 425     drflac_uint32 rawDataSize;
 426
 427     union   /* Structured view of the block. Which member applies is determined by `type`. NOTE(review): members presumably map one-to-one onto the DRFLAC_METADATA_BLOCK_TYPE_* tokens of the same name - confirm. */
 428     {
 429         drflac_streaminfo streaminfo;
 430
 431         struct
 432         {
 433             int unused;
 434         } padding;
 435
 436         struct
 437         {
 438             drflac_uint32 id;
 439             const void* pData;
 440             drflac_uint32 dataSize;
 441         } application;
 442
 443         struct
 444         {
 445             drflac_uint32 seekpointCount;
 446             const drflac_seekpoint* pSeekpoints;
 447         } seektable;
 448
 449         struct
 450         {
 451             drflac_uint32 vendorLength;
 452             const char* vendor;
 453             drflac_uint32 commentCount;
 454             const void* pComments;
 455         } vorbis_comment;
 456
 457         struct
 458         {
 459             char catalog[128];
 460             drflac_uint64 leadInSampleCount;
 461             drflac_bool32 isCD;
 462             drflac_uint8 trackCount;
 463             const void* pTrackData;
 464         } cuesheet;
 465
 466         struct
 467         {
 468             drflac_uint32 type;   /* NOTE(review): presumably one of the DRFLAC_PICTURE_TYPE_* tokens - confirm. */
 469             drflac_uint32 mimeLength;
 470             const char* mime;
 471             drflac_uint32 descriptionLength;
 472             const char* description;
 473             drflac_uint32 width;
 474             drflac_uint32 height;
 475             drflac_uint32 colorDepth;
 476             drflac_uint32 indexColorCount;
 477             drflac_uint32 pictureDataSize;
 478             const drflac_uint8* pPictureData;
 479         } picture;
 480     } data;
 481 } drflac_metadata;
 482
 483
 484 /*
 485 Callback for when data needs to be read from the client.
 486
 487
 488 Parameters
 489 ----------
 490 pUserData (in)
 491     The user data that was passed to drflac_open() and family.
 492
 493 pBufferOut (out)
 494     The output buffer.
 495
 496 bytesToRead (in)
 497     The number of bytes to read.
 498
 499
 500 Return Value
 501 ------------
 502 The number of bytes actually read.
 503
 504
 505 Remarks
 506 -------
 507 A return value of less than bytesToRead indicates the end of the stream. Do _not_ return from this callback until either the entire bytesToRead is filled or
 508 you have reached the end of the stream.
 509 */
 510 typedef size_t (* drflac_read_proc)(void* pUserData, void* pBufferOut, size_t bytesToRead);
 511
 512 /*
 513 Callback for when data needs to be seeked.
 514
 515
 516 Parameters
 517 ----------
 518 pUserData (in)
 519     The user data that was passed to drflac_open() and family.
 520
 521 offset (in)
 522     The number of bytes to move, relative to the origin. Will never be negative.
 523
 524 origin (in)
 525     The origin of the seek - the current position or the start of the stream.
 526
 527
 528 Return Value
 529 ------------
 530 Whether or not the seek was successful.
 531
 532
 533 Remarks
 534 -------
 535 The offset will never be negative. Whether or not it is relative to the beginning or current position is determined by the "origin" parameter which will be
 536 either drflac_seek_origin_start or drflac_seek_origin_current.
 537
 538 When seeking to a PCM frame using drflac_seek_to_pcm_frame(), dr_flac may call this with an offset beyond the end of the FLAC stream. This needs to be detected
 539 and handled by returning DRFLAC_FALSE.
 540 */
 541 typedef drflac_bool32 (* drflac_seek_proc)(void* pUserData, int offset, drflac_seek_origin origin);
 542
 543 /*
 544 Callback for when a metadata block is read.
 545
 546
 547 Parameters
 548 ----------
 549 pUserData (in)
 550     The user data that was passed to drflac_open() and family.
 551
 552 pMetadata (in)
 553     A pointer to a structure containing the data of the metadata block.
 554
 555
 556 Remarks
 557 -------
 558 Use pMetadata->type to determine which metadata block is being handled and how to read the data. This
 559 will be set to one of the DRFLAC_METADATA_BLOCK_TYPE_* tokens.
 560 */
 561 typedef void (* drflac_meta_proc)(void* pUserData, drflac_metadata* pMetadata);
 562
 563
 564 typedef struct   /* Client-defined memory allocation routines, passed to drflac_open() and family. */
 565 {
 566     void* pUserData;                                            /* Passed in to each of the allocation routines below. */
 567     void* (* onMalloc)(size_t sz, void* pUserData);
 568     void* (* onRealloc)(void* p, size_t sz, void* pUserData);
 569     void  (* onFree)(void* p, void* pUserData);
 570 } drflac_allocation_callbacks;
 571
 572 /* Structure for internal use. Only used for decoders opened with drflac_open_memory. */
 573 typedef struct
 574 {
 575     const drflac_uint8* data;   /* NOTE(review): presumably the caller's in-memory FLAC data - confirm. */
 576     size_t dataSize;            /* NOTE(review): presumably the size of `data` in bytes - confirm. */
 577     size_t currentReadPos;      /* NOTE(review): presumably the current read offset into `data` - confirm. */
 578 } drflac__memory_stream;
 579
 580 /* Structure for internal use. Used for bit streaming. */
 581 typedef struct
 582 {
 583     /* The function to call when more data needs to be read. */
 584     drflac_read_proc onRead;
 585
 586     /* The function to call when the current read position needs to be moved. */
 587     drflac_seek_proc onSeek;
 588
 589     /* The user data to pass around to onRead and onSeek. */
 590     void* pUserData;
 591
 592
 593     /*
 594     The number of unaligned bytes in the L2 cache. This will always be 0 until the end of the stream is hit. At the end of the
 595     stream there will be a number of bytes that don't cleanly fit in an L1 cache line, so we use this variable to know whether
 596     or not the bitstreamer needs to run on a slower path to read those last bytes. This will never be more than sizeof(drflac_cache_t).
 597     */
 598     size_t unalignedByteCount;
 599
 600     /* The content of the unaligned bytes. */
 601     drflac_cache_t unalignedCache;
 602
 603     /* The index of the next valid cache line in the "L2" cache. */
 604     drflac_uint32 nextL2Line;
 605
 606     /* The number of bits that have been consumed by the cache. This is used to determine how many valid bits are remaining. */
 607     drflac_uint32 consumedBits;
 608
 609     /*
 610     The cached data which was most recently read from the client. There are two levels of cache. Data flows as such:
 611     Client -> L2 -> L1. The L2 -> L1 movement is aligned and runs on a fast path in just a few instructions.
 612     */
 613     drflac_cache_t cacheL2[DR_FLAC_BUFFER_SIZE/sizeof(drflac_cache_t)];
 614     drflac_cache_t cache;
 615
 616     /*
 617     CRC-16. This is updated whenever bits are read from the bit stream. Manually set this to 0 to reset the CRC. For FLAC, this
 618     is reset to 0 at the beginning of each frame.
 619     */
 620     drflac_uint16 crc16;
 621     drflac_cache_t crc16Cache;              /* A cache for optimizing CRC calculations. This is filled when the L1 cache is reloaded. */
 622     drflac_uint32 crc16CacheIgnoredBytes;   /* The number of bytes to ignore when updating the CRC-16 from the CRC-16 cache. */
 623 } drflac_bs;
 624
 625 typedef struct   /* One sub-frame of a FLAC frame. A frame holds one sub-frame per channel (see drflac_frame). */
 626 {
 627     /* The type of the subframe: SUBFRAME_CONSTANT, SUBFRAME_VERBATIM, SUBFRAME_FIXED or SUBFRAME_LPC. */
 628     drflac_uint8 subframeType;
 629
 630     /* The number of wasted bits per sample as specified by the sub-frame header. */
 631     drflac_uint8 wastedBitsPerSample;
 632
 633     /* The order to use for the prediction stage for SUBFRAME_FIXED and SUBFRAME_LPC. */
 634     drflac_uint8 lpcOrder;
 635
 636     /* A pointer to the buffer containing the decoded samples in the subframe. This pointer is an offset from drflac::pExtraData. */
 637     drflac_int32* pSamplesS32;
 638 } drflac_subframe;
 639
 640 typedef struct   /* The header of a FLAC frame (stored in drflac_frame::header). */
 641 {
 642     /*
 643     If the stream uses variable block sizes, this will be set to the index of the first PCM frame. If fixed block sizes are used, this will
 644     always be set to 0. This is 64-bit because the decoded PCM frame number will be 36 bits.
 645     */
 646     drflac_uint64 pcmFrameNumber;
 647
 648     /*
 649     If the stream uses fixed block sizes, this will be set to the frame number. If variable block sizes are used, this will always be 0. This
 650     is 32-bit because in fixed block sizes, the maximum frame number will be 31 bits.
 651     */
 652     drflac_uint32 flacFrameNumber;
 653
 654     /* The sample rate of this frame. */
 655     drflac_uint32 sampleRate;
 656
 657     /* The number of PCM frames in each sub-frame within this frame. */
 658     drflac_uint16 blockSizeInPCMFrames;
 659
 660     /*
 661     The channel assignment of this frame. This is not always set to the channel count. If interchannel decorrelation is being used this
 662     will be set to DRFLAC_CHANNEL_ASSIGNMENT_LEFT_SIDE, DRFLAC_CHANNEL_ASSIGNMENT_RIGHT_SIDE or DRFLAC_CHANNEL_ASSIGNMENT_MID_SIDE.
 663     */
 664     drflac_uint8 channelAssignment;
 665
 666     /* The number of bits per sample within this frame. */
 667     drflac_uint8 bitsPerSample;
 668
 669     /* The frame's 8-bit CRC. NOTE(review): presumably the CRC-8 that covers the frame header - confirm. */
 670     drflac_uint8 crc8;
 671 } drflac_frame_header;
 672
 673 typedef struct   /* A FLAC frame: its header, how much of it is left to read, and its per-channel sub-frames. */
 674 {
 675     /* The header. */
 676     drflac_frame_header header;
 677
 678     /*
 679     The number of PCM frames left to be read in this FLAC frame. This is initially set to the block size. As PCM frames are read,
 680     this will be decremented. When it reaches 0, the decoder will see this frame as fully consumed and load the next frame.
 681     */
 682     drflac_uint32 pcmFramesRemaining;
 683
 684     /* The list of sub-frames within the frame. There is one sub-frame for each channel, and there's a maximum of 8 channels. */
 685     drflac_subframe subframes[8];
 686 } drflac_frame;
 687
 688 typedef struct   /* The decoder object. Returned by drflac_open() and family; close it with drflac_close(). Treat the members as read-only. */
 689 {
 690     /* The function to call when a metadata block is read. */
 691     drflac_meta_proc onMeta;
 692
 693     /* The user data posted to the metadata callback function. */
 694     void* pUserDataMD;
 695
 696     /* Memory allocation callbacks. */
 697     drflac_allocation_callbacks allocationCallbacks;
 698
 699
 700     /* The sample rate. Will be set to something like 44100. */
 701     drflac_uint32 sampleRate;
 702
 703     /*
 704     The number of channels. This will be set to 1 for monaural streams, 2 for stereo, etc. Maximum 8. This is set based on the
 705     value specified in the STREAMINFO block.
 706     */
 707     drflac_uint8 channels;
 708
 709     /* The bits per sample. Will be set to something like 16, 24, etc. */
 710     drflac_uint8 bitsPerSample;
 711
 712     /* The maximum block size, in PCM frames. This number represents the number of samples in each channel (not combined). */
 713     drflac_uint16 maxBlockSizeInPCMFrames;
 714
 715     /*
 716     The total number of PCM Frames making up the stream. Can be 0 in which case it's still a valid stream, but just means
 717     the total PCM frame count is unknown. Likely the case with streams like internet radio.
 718     */
 719     drflac_uint64 totalPCMFrameCount;
 720
 721
 722     /* The container type. This is set based on whether or not the decoder was opened from a native or Ogg stream. */
 723     drflac_container container;
 724
 725     /* The number of seekpoints in the seektable. */
 726     drflac_uint32 seekpointCount;
 727
 728
 729     /* Information about the frame the decoder is currently sitting on. */
 730     drflac_frame currentFLACFrame;
 731
 732
 733     /* The index of the PCM frame the decoder is currently sitting on. This is only used for seeking. */
 734     drflac_uint64 currentPCMFrame;
 735
 736     /* The position of the first FLAC frame in the stream. This is only ever used for seeking. */
 737     drflac_uint64 firstFLACFramePosInBytes;
 738
 739
 740     /* A hack to avoid a malloc() when opening a decoder with drflac_open_memory(). */
 741     drflac__memory_stream memoryStream;
 742
 743
 744     /* A pointer to the decoded sample data. This is an offset of pExtraData. */
 745     drflac_int32* pDecodedSamples;
 746
 747     /* A pointer to the seek table. This is an offset of pExtraData, or NULL if there is no seek table. */
 748     drflac_seekpoint* pSeekpoints;
 749
 750     /* Internal use only. Only used with Ogg containers. Points to a drflac_oggbs object. This is an offset of pExtraData. */
 751     void* _oggbs;
 752
 753     /* Internal use only. Used for profiling and testing different seeking modes. */
 754     drflac_bool32 _noSeekTableSeek    : 1;
 755     drflac_bool32 _noBinarySearchSeek : 1;
 756     drflac_bool32 _noBruteForceSeek   : 1;
 757
 758     /* The bit streamer. The raw FLAC data is fed through this object. */
 759     drflac_bs bs;
 760
 761     /* Variable length extra data. We attach this to the end of the object so we can avoid unnecessary mallocs. Declared with one element, but the usable size is whatever was allocated past the end of the struct. */
 762     drflac_uint8 pExtraData[1];
 763 } drflac;
 764
 765
 766 /*
 767 Opens a FLAC decoder.
 768
 769
 770 Parameters
 771 ----------
 772 onRead (in)
 773     The function to call when data needs to be read from the client.
 774
 775 onSeek (in)
 776     The function to call when the read position of the client data needs to move.
 777
 778 pUserData (in, optional)
 779     A pointer to application defined data that will be passed to onRead and onSeek.
 780
 781 pAllocationCallbacks (in, optional)
 782     A pointer to application defined callbacks for managing memory allocations.
 783
 784
 785 Return Value
 786 ------------
 787 Returns a pointer to an object representing the decoder.
 788
 789
 790 Remarks
 791 -------
 792 Close the decoder with `drflac_close()`.
 793
 794 `pAllocationCallbacks` can be NULL in which case it will use `DRFLAC_MALLOC`, `DRFLAC_REALLOC` and `DRFLAC_FREE`.
 795
 796 This function will automatically detect whether or not you are attempting to open a native or Ogg encapsulated FLAC, both of which should work seamlessly
 797 without any manual intervention. Ogg encapsulation also works with multiplexed streams which basically means it can play FLAC encoded audio tracks in videos.
 798
 799 This is the lowest level function for opening a FLAC stream. You can also use `drflac_open_file()` and `drflac_open_memory()` to open the stream from a file or
 800 from a block of memory respectively.
 801
 802 The STREAMINFO block must be present for this to succeed. Use `drflac_open_relaxed()` to open a FLAC stream where the header may not be present.
 803
 804 Use `drflac_open_with_metadata()` if you need access to metadata.
 805
 806
 807 See Also
 808 ---------
 809 drflac_open_file()
 810 drflac_open_memory()
 811 drflac_open_with_metadata()
 812 drflac_close()
 813 */
 814 DRFLAC_API drflac* drflac_open(drflac_read_proc onRead, drflac_seek_proc onSeek, void* pUserData, const drflac_allocation_callbacks* pAllocationCallbacks);
 815
 816 /*
 817 Opens a FLAC stream with relaxed validation of the header block.
 818
 819
 820 Parameters
 821 ----------
 822 onRead (in)
 823     The function to call when data needs to be read from the client.
 824
 825 onSeek (in)
 826     The function to call when the read position of the client data needs to move.
 827
 828 container (in)
 829     Whether or not the FLAC stream is encapsulated using standard FLAC encapsulation or Ogg encapsulation.
 830
 831 pUserData (in, optional)
 832     A pointer to application defined data that will be passed to onRead and onSeek.
 833
 834 pAllocationCallbacks (in, optional)
 835     A pointer to application defined callbacks for managing memory allocations.
 836
 837
 838 Return Value
 839 ------------
 840 A pointer to an object representing the decoder.
 841
 842
 843 Remarks
 844 -------
 845 The same as drflac_open(), except attempts to open the stream even when a header block is not present.
 846
 847 Because the header is not necessarily available, the caller must explicitly define the container (Native or Ogg). Do not set this to `drflac_container_unknown`
 848 as that is for internal use only.
 849
 850 Opening in relaxed mode will continue reading data from onRead until it finds a valid frame. If a frame is never found it will continue forever. To abort,
 851 force your `onRead` callback to return 0, which dr_flac will use as an indicator that the end of the stream was found.
 852
 853 Use `drflac_open_with_metadata_relaxed()` if you need access to metadata.
 854 */
 855 DRFLAC_API drflac* drflac_open_relaxed(drflac_read_proc onRead, drflac_seek_proc onSeek, drflac_container container, void* pUserData, const drflac_allocation_callbacks* pAllocationCallbacks);
 856
 857 /*
 858 Opens a FLAC decoder and notifies the caller of the metadata chunks (album art, etc.).
 859
 860
 861 Parameters
 862 ----------
 863 onRead (in)
 864     The function to call when data needs to be read from the client.
 865
 866 onSeek (in)
 867     The function to call when the read position of the client data needs to move.
 868
 869 onMeta (in)
 870     The function to call for every metadata block.
 871
 872 pUserData (in, optional)
 873     A pointer to application defined data that will be passed to onRead, onSeek and onMeta.
 874
 875 pAllocationCallbacks (in, optional)
 876     A pointer to application defined callbacks for managing memory allocations.
 877
 878
 879 Return Value
 880 ------------
 881 A pointer to an object representing the decoder.
 882
 883
 884 Remarks
 885 -------
 886 Close the decoder with `drflac_close()`.
 887
 888 `pAllocationCallbacks` can be NULL in which case it will use `DRFLAC_MALLOC`, `DRFLAC_REALLOC` and `DRFLAC_FREE`.
 889
 890 This is slower than `drflac_open()`, so avoid this one if you don't need metadata. Internally, this will allocate and free memory on the heap for every
 891 metadata block except for STREAMINFO and PADDING blocks.
 892
 893 The caller is notified of the metadata via the `onMeta` callback. All metadata blocks will be handled before the function returns. This callback takes a
 894 pointer to a `drflac_metadata` object which is a union containing the data of all relevant metadata blocks. Use the `type` member to discriminate between
 895 the different metadata types.
 896
 897 The STREAMINFO block must be present for this to succeed. Use `drflac_open_with_metadata_relaxed()` to open a FLAC stream where the header may not be present.
 898
 899 Note that this will behave inconsistently with `drflac_open()` if the stream is an Ogg encapsulated stream and a metadata block is corrupted. This is due to
 900 the way the Ogg stream recovers from corrupted pages. When `drflac_open_with_metadata()` is being used, the open routine will try to read the contents of the
 901 metadata block, whereas `drflac_open()` will simply seek past it (for the sake of efficiency). This inconsistency can result in different samples being
 902 returned depending on whether or not the stream is being opened with metadata.
 903
 904
 905 See Also
 906 ---------
 907 drflac_open_file_with_metadata()
 908 drflac_open_memory_with_metadata()
 909 drflac_open()
 910 drflac_close()
 911 */
 912 DRFLAC_API drflac* drflac_open_with_metadata(drflac_read_proc onRead, drflac_seek_proc onSeek, drflac_meta_proc onMeta, void* pUserData, const drflac_allocation_callbacks* pAllocationCallbacks);
 913
 914 /*
 915 The same as drflac_open_with_metadata(), except attempts to open the stream even when a header block is not present.
 916
 917 See Also
 918 --------
 919 drflac_open_with_metadata()
 920 drflac_open_relaxed()
 921 */
 922 DRFLAC_API drflac* drflac_open_with_metadata_relaxed(drflac_read_proc onRead, drflac_seek_proc onSeek, drflac_meta_proc onMeta, drflac_container container, void* pUserData, const drflac_allocation_callbacks* pAllocationCallbacks);
 923
 924 /*
 925 Closes the given FLAC decoder.
 926
 927
 928 Parameters
 929 ----------
 930 pFlac (in)
 931     The decoder to close.
 932
 933
 934 Remarks
 935 -------
 936 This will destroy the decoder object.
 937
 938
 939 See Also
 940 --------
 941 drflac_open()
 942 drflac_open_with_metadata()
 943 drflac_open_file()
 944 drflac_open_file_w()
 945 drflac_open_file_with_metadata()
 946 drflac_open_file_with_metadata_w()
 947 drflac_open_memory()
 948 drflac_open_memory_with_metadata()
 949 */
 950 DRFLAC_API void drflac_close(drflac* pFlac);
 951
 952
 953 /*
 954 Reads sample data from the given FLAC decoder, output as interleaved signed 32-bit PCM.
 955
 956
 957 Parameters
 958 ----------
 959 pFlac (in)
 960     The decoder.
 961
 962 framesToRead (in)
 963     The number of PCM frames to read.
 964
 965 pBufferOut (out, optional)
 966     A pointer to the buffer that will receive the decoded samples.
 967
 968
 969 Return Value
 970 ------------
 971 Returns the number of PCM frames actually read. If the return value is less than `framesToRead` it has reached the end.
 972
 973
 974 Remarks
 975 -------
 976 pBufferOut can be null, in which case the call will act as a seek, and the return value will be the number of frames seeked.
 977 */
 978 DRFLAC_API drflac_uint64 drflac_read_pcm_frames_s32(drflac* pFlac, drflac_uint64 framesToRead, drflac_int32* pBufferOut);
 979
 980
 981 /*
 982 Reads sample data from the given FLAC decoder, output as interleaved signed 16-bit PCM.
 983
 984
 985 Parameters
 986 ----------
 987 pFlac (in)
 988     The decoder.
 989
 990 framesToRead (in)
 991     The number of PCM frames to read.
 992
 993 pBufferOut (out, optional)
 994     A pointer to the buffer that will receive the decoded samples.
 995
 996
 997 Return Value
 998 ------------
 999 Returns the number of PCM frames actually read. If the return value is less than `framesToRead` it has reached the end.
1000
1001
1002 Remarks
1003 -------
1004 pBufferOut can be null, in which case the call will act as a seek, and the return value will be the number of frames seeked.
1005
1006 Note that this is lossy for streams where the bits per sample is larger than 16.
1007 */
1008 DRFLAC_API drflac_uint64 drflac_read_pcm_frames_s16(drflac* pFlac, drflac_uint64 framesToRead, drflac_int16* pBufferOut);
1009
1010 /*
1011 Reads sample data from the given FLAC decoder, output as interleaved 32-bit floating point PCM.
1012
1013
1014 Parameters
1015 ----------
1016 pFlac (in)
1017     The decoder.
1018
1019 framesToRead (in)
1020     The number of PCM frames to read.
1021
1022 pBufferOut (out, optional)
1023     A pointer to the buffer that will receive the decoded samples.
1024
1025
1026 Return Value
1027 ------------
1028 Returns the number of PCM frames actually read. If the return value is less than `framesToRead` it has reached the end.
1029
1030
1031 Remarks
1032 -------
1033 pBufferOut can be null, in which case the call will act as a seek, and the return value will be the number of frames seeked.
1034
1035 Note that this should be considered lossy due to the nature of floating point numbers not being able to exactly represent every possible number.
1036 */
1037 DRFLAC_API drflac_uint64 drflac_read_pcm_frames_f32(drflac* pFlac, drflac_uint64 framesToRead, float* pBufferOut);
1038
1039 /*
1040 Seeks to the PCM frame at the given index.
1041
1042
1043 Parameters
1044 ----------
1045 pFlac (in)
1046     The decoder.
1047
1048 pcmFrameIndex (in)
1049     The index of the PCM frame to seek to. See notes below.
1050
1051
1052 Return Value
1053 ------------
1054 `DRFLAC_TRUE` if successful; `DRFLAC_FALSE` otherwise.
1055 */
1056 DRFLAC_API drflac_bool32 drflac_seek_to_pcm_frame(drflac* pFlac, drflac_uint64 pcmFrameIndex);
1057
1058
1059
1060 #ifndef DR_FLAC_NO_STDIO
1061 /*
1062 Opens a FLAC decoder from the file at the given path.
1063
1064
1065 Parameters
1066 ----------
1067 pFileName (in)
1068     The path of the file to open, either absolute or relative to the current directory.
1069
1070 pAllocationCallbacks (in, optional)
1071     A pointer to application defined callbacks for managing memory allocations.
1072
1073
1074 Return Value
1075 ------------
1076 A pointer to an object representing the decoder.
1077
1078
1079 Remarks
1080 -------
1081 Close the decoder with drflac_close().
1082
1083
1084 Remarks
1085 -------
1086 This will hold a handle to the file until the decoder is closed with drflac_close(). Some platforms will restrict the number of files a process can have open
1087 at any given time, so keep this in mind if you have many decoders open at the same time.
1088
1089
1090 See Also
1091 --------
1092 drflac_open_file_with_metadata()
1093 drflac_open()
1094 drflac_close()
1095 */
1096 DRFLAC_API drflac* drflac_open_file(const char* pFileName, const drflac_allocation_callbacks* pAllocationCallbacks);
1097 DRFLAC_API drflac* drflac_open_file_w(const wchar_t* pFileName, const drflac_allocation_callbacks* pAllocationCallbacks);
1098
1099 /*
1100 Opens a FLAC decoder from the file at the given path and notifies the caller of the metadata chunks (album art, etc.)
1101
1102
1103 Parameters
1104 ----------
1105 pFileName (in)
1106     The path of the file to open, either absolute or relative to the current directory.
1107
1108 pAllocationCallbacks (in, optional)
1109     A pointer to application defined callbacks for managing memory allocations.
1110
1111 onMeta (in)
1112     The callback to fire for each metadata block.
1113
1114 pUserData (in)
1115     A pointer to the user data to pass to the metadata callback.
1116
1117 pAllocationCallbacks (in)
1118     A pointer to application defined callbacks for managing memory allocations.
1119
1120
1121 Remarks
1122 -------
1123 Look at the documentation for drflac_open_with_metadata() for more information on how metadata is handled.
1124
1125
1126 See Also
1127 --------
1128 drflac_open_with_metadata()
1129 drflac_open()
1130 drflac_close()
1131 */
1132 DRFLAC_API drflac* drflac_open_file_with_metadata(const char* pFileName, drflac_meta_proc onMeta, void* pUserData, const drflac_allocation_callbacks* pAllocationCallbacks);
1133 DRFLAC_API drflac* drflac_open_file_with_metadata_w(const wchar_t* pFileName, drflac_meta_proc onMeta, void* pUserData, const drflac_allocation_callbacks* pAllocationCallbacks);
1134 #endif
1135
1136 /*
1137 Opens a FLAC decoder from a pre-allocated block of memory
1138
1139
1140 Parameters
1141 ----------
1142 pData (in)
1143     A pointer to the raw encoded FLAC data.
1144
1145 dataSize (in)
1146     The size in bytes of `pData`.
1147
1148 pAllocationCallbacks (in)
1149     A pointer to application defined callbacks for managing memory allocations.
1150
1151
1152 Return Value
1153 ------------
1154 A pointer to an object representing the decoder.
1155
1156
1157 Remarks
1158 -------
1159 This does not create a copy of the data. It is up to the application to ensure the buffer remains valid for the lifetime of the decoder.
1160
1161
1162 See Also
1163 --------
1164 drflac_open()
1165 drflac_close()
1166 */
1167 DRFLAC_API drflac* drflac_open_memory(const void* pData, size_t dataSize, const drflac_allocation_callbacks* pAllocationCallbacks);
1168
1169 /*
1170 Opens a FLAC decoder from a pre-allocated block of memory and notifies the caller of the metadata chunks (album art, etc.)
1171
1172
1173 Parameters
1174 ----------
1175 pData (in)
1176     A pointer to the raw encoded FLAC data.
1177
1178 dataSize (in)
1179     The size in bytes of `pData`.
1180
1181 onMeta (in)
1182     The callback to fire for each metadata block.
1183
1184 pUserData (in)
1185     A pointer to the user data to pass to the metadata callback.
1186
1187 pAllocationCallbacks (in)
1188     A pointer to application defined callbacks for managing memory allocations.
1189
1190
1191 Remarks
1192 -------
1193 Look at the documentation for drflac_open_with_metadata() for more information on how metadata is handled.
1194
1195
1196 See Also
1197 --------
1198 drflac_open_with_metadata()
1199 drflac_open()
1200 drflac_close()
1201 */
1202 DRFLAC_API drflac* drflac_open_memory_with_metadata(const void* pData, size_t dataSize, drflac_meta_proc onMeta, void* pUserData, const drflac_allocation_callbacks* pAllocationCallbacks);
1203
1204
1205
1206 /* High Level APIs */
1207
1208 /*
1209 Opens a FLAC stream from the given callbacks and fully decodes it in a single operation. The return value is a
1210 pointer to the sample data as interleaved signed 32-bit PCM. The returned data must be freed with drflac_free().
1211
1212 You can pass in custom memory allocation callbacks via the pAllocationCallbacks parameter. This can be NULL in which
1213 case it will use DRFLAC_MALLOC, DRFLAC_REALLOC and DRFLAC_FREE.
1214
1215 Sometimes a FLAC file won't keep track of the total sample count. In this situation the function will continuously
1216 read samples into a dynamically sized buffer on the heap until no samples are left.
1217
1218 Do not call this function on a broadcast type of stream (like internet radio streams and whatnot).
1219 */
1220 DRFLAC_API drflac_int32* drflac_open_and_read_pcm_frames_s32(drflac_read_proc onRead, drflac_seek_proc onSeek, void* pUserData, unsigned int* channels, unsigned int* sampleRate, drflac_uint64* totalPCMFrameCount, const drflac_allocation_callbacks* pAllocationCallbacks);
1221
1222 /* Same as drflac_open_and_read_pcm_frames_s32(), except returns signed 16-bit integer samples. */
1223 DRFLAC_API drflac_int16* drflac_open_and_read_pcm_frames_s16(drflac_read_proc onRead, drflac_seek_proc onSeek, void* pUserData, unsigned int* channels, unsigned int* sampleRate, drflac_uint64* totalPCMFrameCount, const drflac_allocation_callbacks* pAllocationCallbacks);
1224
1225 /* Same as drflac_open_and_read_pcm_frames_s32(), except returns 32-bit floating-point samples. */
1226 DRFLAC_API float* drflac_open_and_read_pcm_frames_f32(drflac_read_proc onRead, drflac_seek_proc onSeek, void* pUserData, unsigned int* channels, unsigned int* sampleRate, drflac_uint64* totalPCMFrameCount, const drflac_allocation_callbacks* pAllocationCallbacks);
1227
1228 #ifndef DR_FLAC_NO_STDIO
1229 /* Same as drflac_open_and_read_pcm_frames_s32() except opens the decoder from a file. */
1230 DRFLAC_API drflac_int32* drflac_open_file_and_read_pcm_frames_s32(const char* filename, unsigned int* channels, unsigned int* sampleRate, drflac_uint64* totalPCMFrameCount, const drflac_allocation_callbacks* pAllocationCallbacks);
1231
1232 /* Same as drflac_open_file_and_read_pcm_frames_s32(), except returns signed 16-bit integer samples. */
1233 DRFLAC_API drflac_int16* drflac_open_file_and_read_pcm_frames_s16(const char* filename, unsigned int* channels, unsigned int* sampleRate, drflac_uint64* totalPCMFrameCount, const drflac_allocation_callbacks* pAllocationCallbacks);
1234
1235 /* Same as drflac_open_file_and_read_pcm_frames_s32(), except returns 32-bit floating-point samples. */
1236 DRFLAC_API float* drflac_open_file_and_read_pcm_frames_f32(const char* filename, unsigned int* channels, unsigned int* sampleRate, drflac_uint64* totalPCMFrameCount, const drflac_allocation_callbacks* pAllocationCallbacks);
1237 #endif
1238
1239 /* Same as drflac_open_and_read_pcm_frames_s32() except opens the decoder from a block of memory. */
1240 DRFLAC_API drflac_int32* drflac_open_memory_and_read_pcm_frames_s32(const void* data, size_t dataSize, unsigned int* channels, unsigned int* sampleRate, drflac_uint64* totalPCMFrameCount, const drflac_allocation_callbacks* pAllocationCallbacks);
1241
1242 /* Same as drflac_open_memory_and_read_pcm_frames_s32(), except returns signed 16-bit integer samples. */
1243 DRFLAC_API drflac_int16* drflac_open_memory_and_read_pcm_frames_s16(const void* data, size_t dataSize, unsigned int* channels, unsigned int* sampleRate, drflac_uint64* totalPCMFrameCount, const drflac_allocation_callbacks* pAllocationCallbacks);
1244
1245 /* Same as drflac_open_memory_and_read_pcm_frames_s32(), except returns 32-bit floating-point samples. */
1246 DRFLAC_API float* drflac_open_memory_and_read_pcm_frames_f32(const void* data, size_t dataSize, unsigned int* channels, unsigned int* sampleRate, drflac_uint64* totalPCMFrameCount, const drflac_allocation_callbacks* pAllocationCallbacks);
1247
1248 /*
1249 Frees memory that was allocated internally by dr_flac.
1250
1251 Set pAllocationCallbacks to the same object that was passed to drflac_open_*_and_read_pcm_frames_*(). If you originally passed in NULL, pass in NULL for this.
1252 */
1253 DRFLAC_API void drflac_free(void* p, const drflac_allocation_callbacks* pAllocationCallbacks);
1254
1255
1256 /* Iterator state for walking the vorbis comments in a VORBIS_COMMENT metadata block. Set up with drflac_init_vorbis_comment_iterator(). */
1257 typedef struct
1258 {
1259     drflac_uint32 countRemaining;   /* Presumably the number of comments not yet returned - confirm against drflac_next_vorbis_comment(). */
1260     const char* pRunningData;       /* Presumably the read cursor into the raw comment data - confirm against drflac_next_vorbis_comment(). */
1261 } drflac_vorbis_comment_iterator;
1262
1263 /*
1264 Initializes a vorbis comment iterator. This can be used for iterating over the vorbis comments in a VORBIS_COMMENT
1265 metadata block.
1266 */
1267 DRFLAC_API void drflac_init_vorbis_comment_iterator(drflac_vorbis_comment_iterator* pIter, drflac_uint32 commentCount, const void* pComments);
1268
1269 /*
1270 Goes to the next vorbis comment in the given iterator. If null is returned it means there are no more comments. The
1271 returned string is NOT null terminated.
1272 */
1273 DRFLAC_API const char* drflac_next_vorbis_comment(drflac_vorbis_comment_iterator* pIter, drflac_uint32* pCommentLengthOut);
1274
1275
1276 /* Iterator state for walking the cuesheet tracks in a CUESHEET metadata block. Set up with drflac_init_cuesheet_track_iterator(). */
1277 typedef struct
1278 {
1279     drflac_uint32 countRemaining;   /* Presumably the number of tracks not yet returned - confirm against drflac_next_cuesheet_track(). */
1280     const char* pRunningData;       /* Presumably the read cursor into the raw track data - confirm against drflac_next_cuesheet_track(). */
1281 } drflac_cuesheet_track_iterator;
1282
1283 /* One index point of a cuesheet track. Packing is important on this structure because we map this directly to the raw data within the CUESHEET metadata block. */
1284 #pragma pack(4)
1285 typedef struct
1286 {
1287     drflac_uint64 offset;       /* Position of the index point; presumably in samples relative to the track offset, as in the FLAC CUESHEET layout - confirm. */
1288     drflac_uint8 index;         /* The index point number. */
1289     drflac_uint8 reserved[3];   /* Reserved bytes. With pack(4) the structure is 8 + 1 + 3 bytes with no tail padding. */
1290 } drflac_cuesheet_track_index;
1291 #pragma pack()
1292
1293 typedef struct                                         /* One CUESHEET track, as filled in by drflac_next_cuesheet_track(). */
1294 {
1295     drflac_uint64 offset;                              /* Track offset; presumably in samples - confirm against the FLAC CUESHEET layout. */
1296     drflac_uint8 trackNumber;
1297     char ISRC[12];                                     /* NOTE(review): exactly 12 bytes, so presumably NOT null-terminated - confirm before treating as a C string. */
1298     drflac_bool8 isAudio;
1299     drflac_bool8 preEmphasis;
1300     drflac_uint8 indexCount;                           /* Presumably the number of entries behind pIndexPoints - confirm. */
1301     const drflac_cuesheet_track_index* pIndexPoints;   /* Index points of this track (read-only). NOTE(review): presumably points into the metadata block rather than owned memory - confirm. */
1302 } drflac_cuesheet_track;
1303
1304 /*
1305 Initializes a cuesheet track iterator. This can be used for iterating over the cuesheet tracks in a CUESHEET metadata
1306 block.
1307 */
1308 DRFLAC_API void drflac_init_cuesheet_track_iterator(drflac_cuesheet_track_iterator* pIter, drflac_uint32 trackCount, const void* pTrackData);
1309
1310 /* Goes to the next cuesheet track in the given iterator. If DRFLAC_FALSE is returned it means there are no more tracks. */
1311 DRFLAC_API drflac_bool32 drflac_next_cuesheet_track(drflac_cuesheet_track_iterator* pIter, drflac_cuesheet_track* pCuesheetTrack);
1312
1313
1314 #ifdef __cplusplus
1315 }
1316 #endif
1317 #endif  /* dr_flac_h */
1318
1319
1320 /************************************************************************************************************************************************************
1321  ************************************************************************************************************************************************************
1322
1323  IMPLEMENTATION
1324
1325  ************************************************************************************************************************************************************
1326  ************************************************************************************************************************************************************/
1327 #if defined(DR_FLAC_IMPLEMENTATION) || defined(DRFLAC_IMPLEMENTATION)
1328 #ifndef dr_flac_c
1329 #define dr_flac_c
1330
1331 /* Disable some annoying warnings. The diagnostic state is pushed here; the matching pop is expected elsewhere in the implementation section - confirm. */
1332 #if defined(__clang__) || (defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)))
1333     #pragma GCC diagnostic push
1334     #if __GNUC__ >= 7
1335     #pragma GCC diagnostic ignored "-Wimplicit-fallthrough"
1336     #endif  /* __GNUC__ >= 7 */
1337 #endif
1338
1339 #ifdef __linux__    /* Define the BSD/default feature-test macros (unless already set) before including <endian.h>. */
1340     #ifndef _BSD_SOURCE
1341         #define _BSD_SOURCE
1342     #endif
1343     #ifndef _DEFAULT_SOURCE
1344         #define _DEFAULT_SOURCE
1345     #endif
1346     #ifndef __USE_BSD
1347         #define __USE_BSD    /* NOTE(review): looks like a libc-internal macro; presumably a workaround for older C libraries - confirm. */
1348     #endif
1349     #include <endian.h>
1350 #endif  /* __linux__ */
1351
1352 #include <stdlib.h>
1353 #include <string.h>
1354
1355 #ifdef _MSC_VER    /* DRFLAC_INLINE: the strongest "always inline" qualifier known for the current compiler. */
1356     #define DRFLAC_INLINE __forceinline
1357 #elif defined(__GNUC__)
1358     /*
1359     I've had a bug report where GCC is emitting warnings about functions possibly not being inlineable. This warning happens when
1360     the __attribute__((always_inline)) attribute is defined without an "inline" statement. I think therefore there must be some
1361     case where "__inline__" is not always defined, thus the compiler emitting these warnings. When using -std=c89 or -ansi on the
1362     command line, we cannot use the "inline" keyword and instead need to use "__inline__". In an attempt to work around this issue
1363     I am using "__inline__" only when we're compiling in strict ANSI mode.
1364     */
1365     #if defined(__STRICT_ANSI__)
1366         #define DRFLAC_INLINE __inline__ __attribute__((always_inline))
1367     #else
1368         #define DRFLAC_INLINE inline __attribute__((always_inline))
1369     #endif
1370 #elif defined(__WATCOMC__)
1371     #define DRFLAC_INLINE __inline
1372 #else
1373     #define DRFLAC_INLINE    /* Unknown compiler: expands to nothing, so no inlining request is made. */
1374 #endif
1375
1376 /* CPU architecture. At most one of DRFLAC_X64, DRFLAC_X86 and DRFLAC_ARM ends up defined. */
1377 #if defined(__x86_64__) || defined(_M_X64)
1378     #define DRFLAC_X64
1379 #elif defined(__i386) || defined(_M_IX86)
1380     #define DRFLAC_X86
1381 #elif defined(__arm__) || defined(_M_ARM) || defined(_M_ARM64)    /* NOTE(review): __aarch64__ is not tested here, yet the NEON check further down tests it under DRFLAC_ARM - confirm. */
1382     #define DRFLAC_ARM
1383 #endif
1384
1385 /*
1386 Intrinsics Support
1387
1388 There's a bug in GCC 4.2.x which results in an incorrect compilation error when using _mm_slli_epi32() where it complains with
1389
1390     "error: shift must be an immediate"
1391
1392 Unfortunately dr_flac depends on this for a few things so we're just going to disable SSE on GCC 4.2 and below.
1393 */
1394 #if !defined(DR_FLAC_NO_SIMD)
1395     #if defined(DRFLAC_X64) || defined(DRFLAC_X86)
1396         #if defined(_MSC_VER) && !defined(__clang__)
1397             /* MSVC. Support is decided by the compiler version alone, so it says nothing about the CPU the program ends up running on. */
1398             #if _MSC_VER >= 1400 && !defined(DRFLAC_NO_SSE2)    /* 2005 */
1399                 #define DRFLAC_SUPPORT_SSE2
1400             #endif
1401             #if _MSC_VER >= 1600 && !defined(DRFLAC_NO_SSE41)   /* 2010 */
1402                 #define DRFLAC_SUPPORT_SSE41
1403             #endif
1404         #elif defined(__clang__) || (defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3)))
1405             /* Assume GNUC-style. Support follows the instruction sets enabled for this build (__SSE2__ / __SSE4_1__). */
1406             #if defined(__SSE2__) && !defined(DRFLAC_NO_SSE2)
1407                 #define DRFLAC_SUPPORT_SSE2
1408             #endif
1409             #if defined(__SSE4_1__) && !defined(DRFLAC_NO_SSE41)
1410                 #define DRFLAC_SUPPORT_SSE41
1411             #endif
1412         #endif  /* Compiler-specific detection. */
1413
1414         /* If at this point we still haven't determined compiler support for the intrinsics just fall back to __has_include. */
1415         #if !defined(__GNUC__) && !defined(__clang__) && defined(__has_include)
1416             #if !defined(DRFLAC_SUPPORT_SSE2) && !defined(DRFLAC_NO_SSE2) && __has_include(<emmintrin.h>)
1417                 #define DRFLAC_SUPPORT_SSE2
1418             #endif
1419             #if !defined(DRFLAC_SUPPORT_SSE41) && !defined(DRFLAC_NO_SSE41) && __has_include(<smmintrin.h>)
1420                 #define DRFLAC_SUPPORT_SSE41
1421             #endif
1422         #endif  /* __has_include fallback. */
1423
1424         #if defined(DRFLAC_SUPPORT_SSE41)
1425             #include <smmintrin.h>
1426         #elif defined(DRFLAC_SUPPORT_SSE2)
1427             #include <emmintrin.h>
1428         #endif
1429     #endif  /* DRFLAC_X64 || DRFLAC_X86 */
1430
1431     #if defined(DRFLAC_ARM)
1432         #if !defined(DRFLAC_NO_NEON) && (defined(__ARM_NEON) || defined(__aarch64__) || defined(_M_ARM64))
1433             #define DRFLAC_SUPPORT_NEON
1434         #endif
1435
1436         /* Fall back to looking for the #include file. */
1437         #if !defined(__GNUC__) && !defined(__clang__) && defined(__has_include)
1438             #if !defined(DRFLAC_SUPPORT_NEON) && !defined(DRFLAC_NO_NEON) && __has_include(<arm_neon.h>)
1439                 #define DRFLAC_SUPPORT_NEON
1440             #endif
1441         #endif
1442
1443         #if defined(DRFLAC_SUPPORT_NEON)
1444             #include <arm_neon.h>
1445         #endif
1446     #endif  /* DRFLAC_ARM */
1447 #endif  /* !DR_FLAC_NO_SIMD */
1448
1449 /* CPUID access. Provides drflac__cpuid() where the compiler gives us a way to execute the instruction; DRFLAC_NO_CPUID is defined everywhere else. */
1450 #if !defined(DR_FLAC_NO_SIMD) && (defined(DRFLAC_X86) || defined(DRFLAC_X64))
1451     #if defined(_MSC_VER) && !defined(__clang__)
1452         #if _MSC_VER >= 1400
1453             #include <intrin.h>
1454             static void drflac__cpuid(int info[4], int fid)    /* Thin wrapper over the MSVC __cpuid intrinsic; fid is the CPUID function to query. */
1455             {
1456                 __cpuid(info, fid);
1457             }
1458         #else
1459             #define DRFLAC_NO_CPUID    /* MSVC older than _MSC_VER 1400. */
1460         #endif
1461     #else
1462         #if defined(__GNUC__) || defined(__clang__)
1463             static void drflac__cpuid(int info[4], int fid)    /* Runs CPUID with EAX = fid and ECX = 0; info receives EAX, EBX, ECX, EDX in that order. */
1464             {
1465                 /*
1466                 It looks like the -fPIC option uses the ebx register which GCC complains about. We can work around this by just using a different register, the
1467                 specific register of which I'm letting the compiler decide on. The "k" operand modifier is used to specify a 32-bit register. The {...} syntax is
1468                 for supporting different assembly dialects.
1469
1470                 What's basically happening is that we're saving and restoring the ebx register manually, with the EBX result ending up in info[1].
1471                 */
1472                 #if defined(DRFLAC_X86) && defined(__PIC__)
1473                     __asm__ __volatile__ (
1474                         "xchg{l} {%%}ebx, %k1;"
1475                         "cpuid;"
1476                         "xchg{l} {%%}ebx, %k1;"
1477                         : "=a"(info[0]), "=&r"(info[1]), "=c"(info[2]), "=d"(info[3]) : "a"(fid), "c"(0)
1478                     );
1479                 #else
1480                     __asm__ __volatile__ (
1481                         "cpuid" : "=a"(info[0]), "=b"(info[1]), "=c"(info[2]), "=d"(info[3]) : "a"(fid), "c"(0)
1482                     );
1483                 #endif
1484             }
1485         #else
1486             #define DRFLAC_NO_CPUID    /* Neither MSVC nor a GNUC-style compiler. */
1487         #endif
1488     #endif
1489 #else
1490     #define DRFLAC_NO_CPUID    /* SIMD disabled, or not an x86/x64 build. */
1491 #endif
1492
1493 static DRFLAC_INLINE drflac_bool32 drflac_has_sse2(void)
1494 {
1495     /*
1496     Reports whether the SSE2 code paths may be used. Everything except the CPUID query is resolved by the preprocessor, so
1497     only one of the cases below is ever compiled in:
1498
1499       1. No compiler support, SSE2 disabled via DRFLAC_NO_SSE2, or not an x86/x64 build: never available.
1500       2. x64 build, or the compiler is already free to generate SSE2 code: always available.
1501       3. CPUID is unusable: treated as unavailable.
1502       4. Otherwise: ask the CPU.
1503     */
1504 #if !defined(DRFLAC_SUPPORT_SSE2) || defined(DRFLAC_NO_SSE2) || !(defined(DRFLAC_X64) || defined(DRFLAC_X86))
1505     return DRFLAC_FALSE;
1506 #elif defined(DRFLAC_X64) || (defined(_M_IX86_FP) && _M_IX86_FP == 2) || defined(__SSE2__)
1507     return DRFLAC_TRUE;
1508 #elif defined(DRFLAC_NO_CPUID)
1509     return DRFLAC_FALSE;
1510 #else
1511     /* Run-time query. */
1512     int cpuRegs[4];
1513     drflac__cpuid(cpuRegs, 1);
1514     return (cpuRegs[3] & (1 << 26)) != 0;   /* Function 1, fourth register (EDX), bit 26. */
1515 #endif
1516 }
1517
1518 static DRFLAC_INLINE drflac_bool32 drflac_has_sse41(void)
1519 {
1520 #if defined(DRFLAC_SUPPORT_SSE41)
1521     #if (defined(DRFLAC_X64) || defined(DRFLAC_X86)) && !defined(DRFLAC_NO_SSE41)
1522         /* Reports whether SSE4.1 code paths may be used. Neither an x64 target nor _M_IX86_FP == 2 (SSE2 code generation) implies SSE4.1, so both fall through to CPUID. */
1523         /* The query is only skipped when the compiler itself is targeting SSE4.1, or AVX, which requires SSE4.1-capable hardware. */
1524         #if defined(__SSE4_1__) || defined(__AVX__)
1525             return DRFLAC_TRUE;    /* If the compiler is allowed to freely generate SSE41 code we can assume support. */
1526         #else
1527             #if defined(DRFLAC_NO_CPUID)
1528                 return DRFLAC_FALSE;
1529             #else
1530                 int info[4];
1531                 drflac__cpuid(info, 1);
1532                 return (info[2] & (1 << 19)) != 0;    /* CPUID function 1, ECX bit 19 = SSE4.1. */
1533             #endif
1534         #endif
1535     #else
1536         return DRFLAC_FALSE;       /* SSE41 is only supported on x86 and x64 architectures. */
1537     #endif
1538 #else
1539     return DRFLAC_FALSE;           /* No compiler support. */
1540 #endif
1541 }
1542
1543
/*
DRFLAC_HAS_LZCNT_INTRINSIC is defined when the compiler is expected to provide a count-leading-zeros intrinsic:
  - Visual C++ 2008 (_MSC_VER 1500) or newer when targeting x86/x64,
  - GCC 4.7 or newer,
  - Clang when __has_builtin() reports __builtin_clzll or __builtin_clzl.
*/
#if defined(_MSC_VER) && !defined(__clang__) && (_MSC_VER >= 1500) && (defined(DRFLAC_X64) || defined(DRFLAC_X86))
    #define DRFLAC_HAS_LZCNT_INTRINSIC
#else
    #if defined(__GNUC__) && ((__GNUC__ >= 5) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 7))
        #define DRFLAC_HAS_LZCNT_INTRINSIC
    #else
        #if defined(__clang__)
            #if defined(__has_builtin)
                #if __has_builtin(__builtin_clzl) || __has_builtin(__builtin_clzll)
                    #define DRFLAC_HAS_LZCNT_INTRINSIC
                #endif
            #endif
        #endif
    #endif
#endif
1555
/*
Byte swap intrinsic detection. DRFLAC_HAS_BYTESWAP16_INTRINSIC, DRFLAC_HAS_BYTESWAP32_INTRINSIC and
DRFLAC_HAS_BYTESWAP64_INTRINSIC are each defined when the compiler offers a built-in byte swap of that width. The
drflac__swap_endian_uint*() functions use shifts and masks for any width that is left undefined here.
*/
#if defined(_MSC_VER) && (_MSC_VER >= 1400) && !defined(__clang__)
    /* Visual C++ 2005 and newer: all three widths are available. */
    #define DRFLAC_HAS_BYTESWAP64_INTRINSIC
    #define DRFLAC_HAS_BYTESWAP32_INTRINSIC
    #define DRFLAC_HAS_BYTESWAP16_INTRINSIC
#elif defined(__clang__)
    /* Clang: every builtin is probed on its own. */
    #if defined(__has_builtin)
        #if __has_builtin(__builtin_bswap64)
            #define DRFLAC_HAS_BYTESWAP64_INTRINSIC
        #endif
        #if __has_builtin(__builtin_bswap32)
            #define DRFLAC_HAS_BYTESWAP32_INTRINSIC
        #endif
        #if __has_builtin(__builtin_bswap16)
            #define DRFLAC_HAS_BYTESWAP16_INTRINSIC
        #endif
    #endif
#elif defined(__GNUC__)
    /* GCC: the 16-bit builtin is enabled from 4.8, the 32- and 64-bit builtins from 4.3. */
    #if ((__GNUC__ >= 5) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8))
        #define DRFLAC_HAS_BYTESWAP16_INTRINSIC
    #endif
    #if ((__GNUC__ >= 5) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3))
        #define DRFLAC_HAS_BYTESWAP64_INTRINSIC
        #define DRFLAC_HAS_BYTESWAP32_INTRINSIC
    #endif
#elif defined(__WATCOMC__) && defined(__386__)
    /* 32-bit Watcom: the "intrinsics" are implemented right here with inline assembly via #pragma aux. */
    #define DRFLAC_HAS_BYTESWAP64_INTRINSIC
    #define DRFLAC_HAS_BYTESWAP32_INTRINSIC
    #define DRFLAC_HAS_BYTESWAP16_INTRINSIC
    extern __inline drflac_uint16 _watcom_bswap16(drflac_uint16);
    extern __inline drflac_uint32 _watcom_bswap32(drflac_uint32);
    extern __inline drflac_uint64 _watcom_bswap64(drflac_uint64);
#pragma aux _watcom_bswap16 = \
    "xchg al, ah" \
    parm   [ax]   \
    modify [ax];
#pragma aux _watcom_bswap32 = \
    "bswap eax"  \
    parm   [eax] \
    modify [eax];
#pragma aux _watcom_bswap64 = \
    "bswap eax"     \
    "bswap edx"     \
    "xchg eax,edx"  \
    parm [eax edx]  \
    modify [eax edx];
#endif
1602
1603
/*
Standard library stuff. Each macro below is only given its default definition when the client has not already defined it,
which allows the C runtime functions to be swapped out.
*/
#if !defined(DRFLAC_ASSERT)
    #include <assert.h>
    #define DRFLAC_ASSERT(expression)           assert(expression)
#endif

#if !defined(DRFLAC_MALLOC)
    #define DRFLAC_MALLOC(sz)                   malloc((sz))
#endif

#if !defined(DRFLAC_REALLOC)
    #define DRFLAC_REALLOC(p, sz)               realloc((p), (sz))
#endif

#if !defined(DRFLAC_FREE)
    #define DRFLAC_FREE(p)                      free((p))
#endif

#if !defined(DRFLAC_COPY_MEMORY)
    #define DRFLAC_COPY_MEMORY(dst, src, sz)    memcpy((dst), (src), (sz))
#endif

#if !defined(DRFLAC_ZERO_MEMORY)
    #define DRFLAC_ZERO_MEMORY(p, sz)           memset((p), 0, (sz))
#endif

/* Zeroes the object that p points to; the size is taken from the pointed-to type. */
#if !defined(DRFLAC_ZERO_OBJECT)
    #define DRFLAC_ZERO_OBJECT(p)               DRFLAC_ZERO_MEMORY((p), sizeof(*(p)))
#endif
1627
1628 #define DRFLAC_MAX_SIMD_VECTOR_SIZE                     64  /* 64 for AVX-512 in the future. */
1629
1630 typedef drflac_int32 drflac_result;
1631 #define DRFLAC_SUCCESS                                   0
1632 #define DRFLAC_ERROR                                    -1   /* A generic error. */
1633 #define DRFLAC_INVALID_ARGS                             -2
1634 #define DRFLAC_INVALID_OPERATION                        -3
1635 #define DRFLAC_OUT_OF_MEMORY                            -4
1636 #define DRFLAC_OUT_OF_RANGE                             -5
1637 #define DRFLAC_ACCESS_DENIED                            -6
1638 #define DRFLAC_DOES_NOT_EXIST                           -7
1639 #define DRFLAC_ALREADY_EXISTS                           -8
1640 #define DRFLAC_TOO_MANY_OPEN_FILES                      -9
1641 #define DRFLAC_INVALID_FILE                             -10
1642 #define DRFLAC_TOO_BIG                                  -11
1643 #define DRFLAC_PATH_TOO_LONG                            -12
1644 #define DRFLAC_NAME_TOO_LONG                            -13
1645 #define DRFLAC_NOT_DIRECTORY                            -14
1646 #define DRFLAC_IS_DIRECTORY                             -15
1647 #define DRFLAC_DIRECTORY_NOT_EMPTY                      -16
1648 #define DRFLAC_END_OF_FILE                              -17
1649 #define DRFLAC_NO_SPACE                                 -18
1650 #define DRFLAC_BUSY                                     -19
1651 #define DRFLAC_IO_ERROR                                 -20
1652 #define DRFLAC_INTERRUPT                                -21
1653 #define DRFLAC_UNAVAILABLE                              -22
1654 #define DRFLAC_ALREADY_IN_USE                           -23
1655 #define DRFLAC_BAD_ADDRESS                              -24
1656 #define DRFLAC_BAD_SEEK                                 -25
1657 #define DRFLAC_BAD_PIPE                                 -26
1658 #define DRFLAC_DEADLOCK                                 -27
1659 #define DRFLAC_TOO_MANY_LINKS                           -28
1660 #define DRFLAC_NOT_IMPLEMENTED                          -29
1661 #define DRFLAC_NO_MESSAGE                               -30
1662 #define DRFLAC_BAD_MESSAGE                              -31
1663 #define DRFLAC_NO_DATA_AVAILABLE                        -32
1664 #define DRFLAC_INVALID_DATA                             -33
1665 #define DRFLAC_TIMEOUT                                  -34
1666 #define DRFLAC_NO_NETWORK                               -35
1667 #define DRFLAC_NOT_UNIQUE                               -36
1668 #define DRFLAC_NOT_SOCKET                               -37
1669 #define DRFLAC_NO_ADDRESS                               -38
1670 #define DRFLAC_BAD_PROTOCOL                             -39
1671 #define DRFLAC_PROTOCOL_UNAVAILABLE                     -40
1672 #define DRFLAC_PROTOCOL_NOT_SUPPORTED                   -41
1673 #define DRFLAC_PROTOCOL_FAMILY_NOT_SUPPORTED            -42
1674 #define DRFLAC_ADDRESS_FAMILY_NOT_SUPPORTED             -43
1675 #define DRFLAC_SOCKET_NOT_SUPPORTED                     -44
1676 #define DRFLAC_CONNECTION_RESET                         -45
1677 #define DRFLAC_ALREADY_CONNECTED                        -46
1678 #define DRFLAC_NOT_CONNECTED                            -47
1679 #define DRFLAC_CONNECTION_REFUSED                       -48
1680 #define DRFLAC_NO_HOST                                  -49
1681 #define DRFLAC_IN_PROGRESS                              -50
1682 #define DRFLAC_CANCELLED                                -51
1683 #define DRFLAC_MEMORY_ALREADY_MAPPED                    -52
1684 #define DRFLAC_AT_END                                   -53
1685 #define DRFLAC_CRC_MISMATCH                             -128
1686
1687 #define DRFLAC_SUBFRAME_CONSTANT                        0
1688 #define DRFLAC_SUBFRAME_VERBATIM                        1
1689 #define DRFLAC_SUBFRAME_FIXED                           8
1690 #define DRFLAC_SUBFRAME_LPC                             32
1691 #define DRFLAC_SUBFRAME_RESERVED                        255
1692
1693 #define DRFLAC_RESIDUAL_CODING_METHOD_PARTITIONED_RICE  0
1694 #define DRFLAC_RESIDUAL_CODING_METHOD_PARTITIONED_RICE2 1
1695
1696 #define DRFLAC_CHANNEL_ASSIGNMENT_INDEPENDENT           0
1697 #define DRFLAC_CHANNEL_ASSIGNMENT_LEFT_SIDE             8
1698 #define DRFLAC_CHANNEL_ASSIGNMENT_RIGHT_SIDE            9
1699 #define DRFLAC_CHANNEL_ASSIGNMENT_MID_SIDE              10
1700
1701 #define drflac_align(x, a)                              ((((x) + (a) - 1) / (a)) * (a))
1702
1703
1704 DRFLAC_API void drflac_version(drflac_uint32* pMajor, drflac_uint32* pMinor, drflac_uint32* pRevision)
1705 {
1706     if (pMajor) {
1707         *pMajor = DRFLAC_VERSION_MAJOR;
1708     }
1709
1710     if (pMinor) {
1711         *pMinor = DRFLAC_VERSION_MINOR;
1712     }
1713
1714     if (pRevision) {
1715         *pRevision = DRFLAC_VERSION_REVISION;
1716     }
1717 }
1718
1719 DRFLAC_API const char* drflac_version_string(void)
1720 {
1721     return DRFLAC_VERSION_STRING;
1722 }
1723
1724
/*
CPU caps.

DRFLAC_NO_THREAD_SANITIZE is placed in front of functions that should not be instrumented by ThreadSanitizer. It expands to
the no_sanitize("thread") attribute when __has_feature(thread_sanitizer) is true, and to nothing in every other case.
*/
#ifdef __has_feature
    #if __has_feature(thread_sanitizer)
        #define DRFLAC_NO_THREAD_SANITIZE __attribute__((no_sanitize("thread")))
    #else
        #define DRFLAC_NO_THREAD_SANITIZE
    #endif
#else
    #define DRFLAC_NO_THREAD_SANITIZE
#endif
1735
1736 #if defined(DRFLAC_HAS_LZCNT_INTRINSIC)
1737 static drflac_bool32 drflac__gIsLZCNTSupported = DRFLAC_FALSE;
1738 #endif
1739
1740 #ifndef DRFLAC_NO_CPUID
1741 static drflac_bool32 drflac__gIsSSE2Supported  = DRFLAC_FALSE;
1742 static drflac_bool32 drflac__gIsSSE41Supported = DRFLAC_FALSE;
1743
1744 /*
1745 I've had a bug report that Clang's ThreadSanitizer presents a warning in this function. Having reviewed this, this does
1746 actually make sense. However, since CPU caps should never differ for a running process, I don't think the trade off of
1747 complicating internal API's by passing around CPU caps versus just disabling the warnings is worthwhile. I'm therefore
1748 just going to disable these warnings. This is disabled via the DRFLAC_NO_THREAD_SANITIZE attribute.
1749 */
1750 DRFLAC_NO_THREAD_SANITIZE static void drflac__init_cpu_caps(void)
1751 {
1752     static drflac_bool32 isCPUCapsInitialized = DRFLAC_FALSE;
1753
1754     if (!isCPUCapsInitialized) {
1755         /* LZCNT */
1756 #if defined(DRFLAC_HAS_LZCNT_INTRINSIC)
1757         int info[4] = {0};
1758         drflac__cpuid(info, 0x80000001);
1759         drflac__gIsLZCNTSupported = (info[2] & (1 << 5)) != 0;
1760 #endif
1761
1762         /* SSE2 */
1763         drflac__gIsSSE2Supported = drflac_has_sse2();
1764
1765         /* SSE4.1 */
1766         drflac__gIsSSE41Supported = drflac_has_sse41();
1767
1768         /* Initialized. */
1769         isCPUCapsInitialized = DRFLAC_TRUE;
1770     }
1771 }
1772 #else
1773 static drflac_bool32 drflac__gIsNEONSupported  = DRFLAC_FALSE;
1774
1775 static DRFLAC_INLINE drflac_bool32 drflac__has_neon(void)
1776 {
1777 #if defined(DRFLAC_SUPPORT_NEON)
1778     #if defined(DRFLAC_ARM) && !defined(DRFLAC_NO_NEON)
1779         #if (defined(__ARM_NEON) || defined(__aarch64__) || defined(_M_ARM64))
1780             return DRFLAC_TRUE;    /* If the compiler is allowed to freely generate NEON code we can assume support. */
1781         #else
1782             /* TODO: Runtime check. */
1783             return DRFLAC_FALSE;
1784         #endif
1785     #else
1786         return DRFLAC_FALSE;       /* NEON is only supported on ARM architectures. */
1787     #endif
1788 #else
1789     return DRFLAC_FALSE;           /* No compiler support. */
1790 #endif
1791 }
1792
1793 DRFLAC_NO_THREAD_SANITIZE static void drflac__init_cpu_caps(void)
1794 {
1795     drflac__gIsNEONSupported = drflac__has_neon();
1796
1797 #if defined(DRFLAC_HAS_LZCNT_INTRINSIC) && defined(DRFLAC_ARM) && (defined(__ARM_ARCH) && __ARM_ARCH >= 5)
1798     drflac__gIsLZCNTSupported = DRFLAC_TRUE;
1799 #endif
1800 }
1801 #endif
1802
1803
1804 /* Endian Management */
1805 static DRFLAC_INLINE drflac_bool32 drflac__is_little_endian(void)
1806 {
1807 #if defined(DRFLAC_X86) || defined(DRFLAC_X64)
1808     return DRFLAC_TRUE;
1809 #elif defined(__BYTE_ORDER) && defined(__LITTLE_ENDIAN) && __BYTE_ORDER == __LITTLE_ENDIAN
1810     return DRFLAC_TRUE;
1811 #else
1812     int n = 1;
1813     return (*(char*)&n) == 1;
1814 #endif
1815 }
1816
1817 static DRFLAC_INLINE drflac_uint16 drflac__swap_endian_uint16(drflac_uint16 n)
1818 {
1819 #ifdef DRFLAC_HAS_BYTESWAP16_INTRINSIC
1820     #if defined(_MSC_VER) && !defined(__clang__)
1821         return _byteswap_ushort(n);
1822     #elif defined(__GNUC__) || defined(__clang__)
1823         return __builtin_bswap16(n);
1824     #elif defined(__WATCOMC__) && defined(__386__)
1825         return _watcom_bswap16(n);
1826     #else
1827         #error "This compiler does not support the byte swap intrinsic."
1828     #endif
1829 #else
1830     return ((n & 0xFF00) >> 8) |
1831            ((n & 0x00FF) << 8);
1832 #endif
1833 }
1834
1835 static DRFLAC_INLINE drflac_uint32 drflac__swap_endian_uint32(drflac_uint32 n)
1836 {
1837 #ifdef DRFLAC_HAS_BYTESWAP32_INTRINSIC
1838     #if defined(_MSC_VER) && !defined(__clang__)
1839         return _byteswap_ulong(n);
1840     #elif defined(__GNUC__) || defined(__clang__)
1841         #if defined(DRFLAC_ARM) && (defined(__ARM_ARCH) && __ARM_ARCH >= 6) && !defined(DRFLAC_64BIT)   /* <-- 64-bit inline assembly has not been tested, so disabling for now. */
1842             /* Inline assembly optimized implementation for ARM. In my testing, GCC does not generate optimized code with __builtin_bswap32(). */
1843             drflac_uint32 r;
1844             __asm__ __volatile__ (
1845             #if defined(DRFLAC_64BIT)
1846                 "rev %w[out], %w[in]" : [out]"=r"(r) : [in]"r"(n)   /* <-- This is untested. If someone in the community could test this, that would be appreciated! */
1847             #else
1848                 "rev %[out], %[in]" : [out]"=r"(r) : [in]"r"(n)
1849             #endif
1850             );
1851             return r;
1852         #else
1853             return __builtin_bswap32(n);
1854         #endif
1855     #elif defined(__WATCOMC__) && defined(__386__)
1856         return _watcom_bswap32(n);
1857     #else
1858         #error "This compiler does not support the byte swap intrinsic."
1859     #endif
1860 #else
1861     return ((n & 0xFF000000) >> 24) |
1862            ((n & 0x00FF0000) >>  8) |
1863            ((n & 0x0000FF00) <<  8) |
1864            ((n & 0x000000FF) << 24);
1865 #endif
1866 }
1867
1868 static DRFLAC_INLINE drflac_uint64 drflac__swap_endian_uint64(drflac_uint64 n)
1869 {
1870 #ifdef DRFLAC_HAS_BYTESWAP64_INTRINSIC
1871     #if defined(_MSC_VER) && !defined(__clang__)
1872         return _byteswap_uint64(n);
1873     #elif defined(__GNUC__) || defined(__clang__)
1874         return __builtin_bswap64(n);
1875     #elif defined(__WATCOMC__) && defined(__386__)
1876         return _watcom_bswap64(n);
1877     #else
1878         #error "This compiler does not support the byte swap intrinsic."
1879     #endif
1880 #else
1881     /* Weird "<< 32" bitshift is required for C89 because it doesn't support 64-bit constants. Should be optimized out by a good compiler. */
1882     return ((n & ((drflac_uint64)0xFF000000 << 32)) >> 56) |
1883            ((n & ((drflac_uint64)0x00FF0000 << 32)) >> 40) |
1884            ((n & ((drflac_uint64)0x0000FF00 << 32)) >> 24) |
1885            ((n & ((drflac_uint64)0x000000FF << 32)) >>  8) |
1886            ((n & ((drflac_uint64)0xFF000000      )) <<  8) |
1887            ((n & ((drflac_uint64)0x00FF0000      )) << 24) |
1888            ((n & ((drflac_uint64)0x0000FF00      )) << 40) |
1889            ((n & ((drflac_uint64)0x000000FF      )) << 56);
1890 #endif
1891 }
1892
1893
1894 static DRFLAC_INLINE drflac_uint16 drflac__be2host_16(drflac_uint16 n)
1895 {
1896     if (drflac__is_little_endian()) {
1897         return drflac__swap_endian_uint16(n);
1898     }
1899
1900     return n;
1901 }
1902
1903 static DRFLAC_INLINE drflac_uint32 drflac__be2host_32(drflac_uint32 n)
1904 {
1905     if (drflac__is_little_endian()) {
1906         return drflac__swap_endian_uint32(n);
1907     }
1908
1909     return n;
1910 }
1911
1912 static DRFLAC_INLINE drflac_uint64 drflac__be2host_64(drflac_uint64 n)
1913 {
1914     if (drflac__is_little_endian()) {
1915         return drflac__swap_endian_uint64(n);
1916     }
1917
1918     return n;
1919 }
1920
1921
1922 static DRFLAC_INLINE drflac_uint32 drflac__le2host_32(drflac_uint32 n)
1923 {
1924     if (!drflac__is_little_endian()) {
1925         return drflac__swap_endian_uint32(n);
1926     }
1927
1928     return n;
1929 }
1930
1931
1932 static DRFLAC_INLINE drflac_uint32 drflac__unsynchsafe_32(drflac_uint32 n)
1933 {
1934     drflac_uint32 result = 0;
1935     result |= (n & 0x7F000000) >> 3;
1936     result |= (n & 0x007F0000) >> 2;
1937     result |= (n & 0x00007F00) >> 1;
1938     result |= (n & 0x0000007F) >> 0;
1939
1940     return result;
1941 }
1942
1943
1944
1945 /* The CRC code below is based on this document: http://zlib.net/crc_v3.txt */
1946 static drflac_uint8 drflac__crc8_table[] = {
1947     0x00, 0x07, 0x0E, 0x09, 0x1C, 0x1B, 0x12, 0x15, 0x38, 0x3F, 0x36, 0x31, 0x24, 0x23, 0x2A, 0x2D,
1948     0x70, 0x77, 0x7E, 0x79, 0x6C, 0x6B, 0x62, 0x65, 0x48, 0x4F, 0x46, 0x41, 0x54, 0x53, 0x5A, 0x5D,
1949     0xE0, 0xE7, 0xEE, 0xE9, 0xFC, 0xFB, 0xF2, 0xF5, 0xD8, 0xDF, 0xD6, 0xD1, 0xC4, 0xC3, 0xCA, 0xCD,
1950     0x90, 0x97, 0x9E, 0x99, 0x8C, 0x8B, 0x82, 0x85, 0xA8, 0xAF, 0xA6, 0xA1, 0xB4, 0xB3, 0xBA, 0xBD,
1951     0xC7, 0xC0, 0xC9, 0xCE, 0xDB, 0xDC, 0xD5, 0xD2, 0xFF, 0xF8, 0xF1, 0xF6, 0xE3, 0xE4, 0xED, 0xEA,
1952     0xB7, 0xB0, 0xB9, 0xBE, 0xAB, 0xAC, 0xA5, 0xA2, 0x8F, 0x88, 0x81, 0x86, 0x93, 0x94, 0x9D, 0x9A,
1953     0x27, 0x20, 0x29, 0x2E, 0x3B, 0x3C, 0x35, 0x32, 0x1F, 0x18, 0x11, 0x16, 0x03, 0x04, 0x0D, 0x0A,
1954     0x57, 0x50, 0x59, 0x5E, 0x4B, 0x4C, 0x45, 0x42, 0x6F, 0x68, 0x61, 0x66, 0x73, 0x74, 0x7D, 0x7A,
1955     0x89, 0x8E, 0x87, 0x80, 0x95, 0x92, 0x9B, 0x9C, 0xB1, 0xB6, 0xBF, 0xB8, 0xAD, 0xAA, 0xA3, 0xA4,
1956     0xF9, 0xFE, 0xF7, 0xF0, 0xE5, 0xE2, 0xEB, 0xEC, 0xC1, 0xC6, 0xCF, 0xC8, 0xDD, 0xDA, 0xD3, 0xD4,
1957     0x69, 0x6E, 0x67, 0x60, 0x75, 0x72, 0x7B, 0x7C, 0x51, 0x56, 0x5F, 0x58, 0x4D, 0x4A, 0x43, 0x44,
1958     0x19, 0x1E, 0x17, 0x10, 0x05, 0x02, 0x0B, 0x0C, 0x21, 0x26, 0x2F, 0x28, 0x3D, 0x3A, 0x33, 0x34,
1959     0x4E, 0x49, 0x40, 0x47, 0x52, 0x55, 0x5C, 0x5B, 0x76, 0x71, 0x78, 0x7F, 0x6A, 0x6D, 0x64, 0x63,
1960     0x3E, 0x39, 0x30, 0x37, 0x22, 0x25, 0x2C, 0x2B, 0x06, 0x01, 0x08, 0x0F, 0x1A, 0x1D, 0x14, 0x13,
1961     0xAE, 0xA9, 0xA0, 0xA7, 0xB2, 0xB5, 0xBC, 0xBB, 0x96, 0x91, 0x98, 0x9F, 0x8A, 0x8D, 0x84, 0x83,
1962     0xDE, 0xD9, 0xD0, 0xD7, 0xC2, 0xC5, 0xCC, 0xCB, 0xE6, 0xE1, 0xE8, 0xEF, 0xFA, 0xFD, 0xF4, 0xF3
1963 };
1964
1965 static drflac_uint16 drflac__crc16_table[] = {
1966     0x0000, 0x8005, 0x800F, 0x000A, 0x801B, 0x001E, 0x0014, 0x8011,
1967     0x8033, 0x0036, 0x003C, 0x8039, 0x0028, 0x802D, 0x8027, 0x0022,
1968     0x8063, 0x0066, 0x006C, 0x8069, 0x0078, 0x807D, 0x8077, 0x0072,
1969     0x0050, 0x8055, 0x805F, 0x005A, 0x804B, 0x004E, 0x0044, 0x8041,
1970     0x80C3, 0x00C6, 0x00CC, 0x80C9, 0x00D8, 0x80DD, 0x80D7, 0x00D2,
1971     0x00F0, 0x80F5, 0x80FF, 0x00FA, 0x80EB, 0x00EE, 0x00E4, 0x80E1,
1972     0x00A0, 0x80A5, 0x80AF, 0x00AA, 0x80BB, 0x00BE, 0x00B4, 0x80B1,
1973     0x8093, 0x0096, 0x009C, 0x8099, 0x0088, 0x808D, 0x8087, 0x0082,
1974     0x8183, 0x0186, 0x018C, 0x8189, 0x0198, 0x819D, 0x8197, 0x0192,
1975     0x01B0, 0x81B5, 0x81BF, 0x01BA, 0x81AB, 0x01AE, 0x01A4, 0x81A1,
1976     0x01E0, 0x81E5, 0x81EF, 0x01EA, 0x81FB, 0x01FE, 0x01F4, 0x81F1,
1977     0x81D3, 0x01D6, 0x01DC, 0x81D9, 0x01C8, 0x81CD, 0x81C7, 0x01C2,
1978     0x0140, 0x8145, 0x814F, 0x014A, 0x815B, 0x015E, 0x0154, 0x8151,
1979     0x8173, 0x0176, 0x017C, 0x8179, 0x0168, 0x816D, 0x8167, 0x0162,
1980     0x8123, 0x0126, 0x012C, 0x8129, 0x0138, 0x813D, 0x8137, 0x0132,
1981     0x0110, 0x8115, 0x811F, 0x011A, 0x810B, 0x010E, 0x0104, 0x8101,
1982     0x8303, 0x0306, 0x030C, 0x8309, 0x0318, 0x831D, 0x8317, 0x0312,
1983     0x0330, 0x8335, 0x833F, 0x033A, 0x832B, 0x032E, 0x0324, 0x8321,
1984     0x0360, 0x8365, 0x836F, 0x036A, 0x837B, 0x037E, 0x0374, 0x8371,
1985     0x8353, 0x0356, 0x035C, 0x8359, 0x0348, 0x834D, 0x8347, 0x0342,
1986     0x03C0, 0x83C5, 0x83CF, 0x03CA, 0x83DB, 0x03DE, 0x03D4, 0x83D1,
1987     0x83F3, 0x03F6, 0x03FC, 0x83F9, 0x03E8, 0x83ED, 0x83E7, 0x03E2,
1988     0x83A3, 0x03A6, 0x03AC, 0x83A9, 0x03B8, 0x83BD, 0x83B7, 0x03B2,
1989     0x0390, 0x8395, 0x839F, 0x039A, 0x838B, 0x038E, 0x0384, 0x8381,
1990     0x0280, 0x8285, 0x828F, 0x028A, 0x829B, 0x029E, 0x0294, 0x8291,
1991     0x82B3, 0x02B6, 0x02BC, 0x82B9, 0x02A8, 0x82AD, 0x82A7, 0x02A2,
1992     0x82E3, 0x02E6, 0x02EC, 0x82E9, 0x02F8, 0x82FD, 0x82F7, 0x02F2,
1993     0x02D0, 0x82D5, 0x82DF, 0x02DA, 0x82CB, 0x02CE, 0x02C4, 0x82C1,
1994     0x8243, 0x0246, 0x024C, 0x8249, 0x0258, 0x825D, 0x8257, 0x0252,
1995     0x0270, 0x8275, 0x827F, 0x027A, 0x826B, 0x026E, 0x0264, 0x8261,
1996     0x0220, 0x8225, 0x822F, 0x022A, 0x823B, 0x023E, 0x0234, 0x8231,
1997     0x8213, 0x0216, 0x021C, 0x8219, 0x0208, 0x820D, 0x8207, 0x0202
1998 };
1999
2000 static DRFLAC_INLINE drflac_uint8 drflac_crc8_byte(drflac_uint8 crc, drflac_uint8 data)
2001 {
2002     return drflac__crc8_table[crc ^ data];
2003 }
2004
2005 static DRFLAC_INLINE drflac_uint8 drflac_crc8(drflac_uint8 crc, drflac_uint32 data, drflac_uint32 count)
2006 {
2007 #ifdef DR_FLAC_NO_CRC
2008     (void)crc;
2009     (void)data;
2010     (void)count;
2011     return 0;
2012 #else
2013 #if 0
2014     /* REFERENCE (use of this implementation requires an explicit flush by doing "drflac_crc8(crc, 0, 8);") */
2015     drflac_uint8 p = 0x07;
2016     for (int i = count-1; i >= 0; --i) {
2017         drflac_uint8 bit = (data & (1 << i)) >> i;
2018         if (crc & 0x80) {
2019             crc = ((crc << 1) | bit) ^ p;
2020         } else {
2021             crc = ((crc << 1) | bit);
2022         }
2023     }
2024     return crc;
2025 #else
2026     drflac_uint32 wholeBytes;
2027     drflac_uint32 leftoverBits;
2028     drflac_uint64 leftoverDataMask;
2029
2030     static drflac_uint64 leftoverDataMaskTable[8] = {
2031         0x00, 0x01, 0x03, 0x07, 0x0F, 0x1F, 0x3F, 0x7F
2032     };
2033
2034     DRFLAC_ASSERT(count <= 32);
2035
2036     wholeBytes = count >> 3;
2037     leftoverBits = count - (wholeBytes*8);
2038     leftoverDataMask = leftoverDataMaskTable[leftoverBits];
2039
2040     switch (wholeBytes) {
2041         case 4: crc = drflac_crc8_byte(crc, (drflac_uint8)((data & (0xFF000000UL << leftoverBits)) >> (24 + leftoverBits)));
2042         case 3: crc = drflac_crc8_byte(crc, (drflac_uint8)((data & (0x00FF0000UL << leftoverBits)) >> (16 + leftoverBits)));
2043         case 2: crc = drflac_crc8_byte(crc, (drflac_uint8)((data & (0x0000FF00UL << leftoverBits)) >> ( 8 + leftoverBits)));
2044         case 1: crc = drflac_crc8_byte(crc, (drflac_uint8)((data & (0x000000FFUL << leftoverBits)) >> ( 0 + leftoverBits)));
2045         case 0: if (leftoverBits > 0) crc = (drflac_uint8)((crc << leftoverBits) ^ drflac__crc8_table[(crc >> (8 - leftoverBits)) ^ (data & leftoverDataMask)]);
2046     }
2047     return crc;
2048 #endif
2049 #endif
2050 }
2051
2052 static DRFLAC_INLINE drflac_uint16 drflac_crc16_byte(drflac_uint16 crc, drflac_uint8 data)
2053 {
2054     return (crc << 8) ^ drflac__crc16_table[(drflac_uint8)(crc >> 8) ^ data];
2055 }
2056
2057 static DRFLAC_INLINE drflac_uint16 drflac_crc16_cache(drflac_uint16 crc, drflac_cache_t data)
2058 {
2059 #ifdef DRFLAC_64BIT
2060     crc = drflac_crc16_byte(crc, (drflac_uint8)((data >> 56) & 0xFF));
2061     crc = drflac_crc16_byte(crc, (drflac_uint8)((data >> 48) & 0xFF));
2062     crc = drflac_crc16_byte(crc, (drflac_uint8)((data >> 40) & 0xFF));
2063     crc = drflac_crc16_byte(crc, (drflac_uint8)((data >> 32) & 0xFF));
2064 #endif
2065     crc = drflac_crc16_byte(crc, (drflac_uint8)((data >> 24) & 0xFF));
2066     crc = drflac_crc16_byte(crc, (drflac_uint8)((data >> 16) & 0xFF));
2067     crc = drflac_crc16_byte(crc, (drflac_uint8)((data >>  8) & 0xFF));
2068     crc = drflac_crc16_byte(crc, (drflac_uint8)((data >>  0) & 0xFF));
2069
2070     return crc;
2071 }
2072
2073 static DRFLAC_INLINE drflac_uint16 drflac_crc16_bytes(drflac_uint16 crc, drflac_cache_t data, drflac_uint32 byteCount)
2074 {
2075     switch (byteCount)
2076     {
2077 #ifdef DRFLAC_64BIT
2078     case 8: crc = drflac_crc16_byte(crc, (drflac_uint8)((data >> 56) & 0xFF));
2079     case 7: crc = drflac_crc16_byte(crc, (drflac_uint8)((data >> 48) & 0xFF));
2080     case 6: crc = drflac_crc16_byte(crc, (drflac_uint8)((data >> 40) & 0xFF));
2081     case 5: crc = drflac_crc16_byte(crc, (drflac_uint8)((data >> 32) & 0xFF));
2082 #endif
2083     case 4: crc = drflac_crc16_byte(crc, (drflac_uint8)((data >> 24) & 0xFF));
2084     case 3: crc = drflac_crc16_byte(crc, (drflac_uint8)((data >> 16) & 0xFF));
2085     case 2: crc = drflac_crc16_byte(crc, (drflac_uint8)((data >>  8) & 0xFF));
2086     case 1: crc = drflac_crc16_byte(crc, (drflac_uint8)((data >>  0) & 0xFF));
2087     }
2088
2089     return crc;
2090 }
2091
/*
Disabled: bit-granular CRC-16 helpers. They are compiled out and kept for reference only; the byte-wise
drflac_crc16_byte(), drflac_crc16_cache() and drflac_crc16_bytes() are what the library actually uses.
*/
#if 0
/* Advances a 16-bit CRC by the low `count` bits (at most 32) of a 32-bit value. */
static DRFLAC_INLINE drflac_uint16 drflac_crc16__32bit(drflac_uint16 crc, drflac_uint32 data, drflac_uint32 count)
{
#ifdef DR_FLAC_NO_CRC
    (void)crc;
    (void)data;
    (void)count;
    return 0;
#else
#if 0
    /* REFERENCE (use of this implementation requires an explicit flush by doing "drflac_crc16(crc, 0, 16);") */
    drflac_uint16 p = 0x8005;
    for (int i = count-1; i >= 0; --i) {
        drflac_uint16 bit = (data & (1ULL << i)) >> i;
        if (crc & 0x8000) {
            crc = ((crc << 1) | bit) ^ p;
        } else {
            crc = ((crc << 1) | bit);
        }
    }

    return crc;
#else
    drflac_uint32 wholeBytes;
    drflac_uint32 leftoverBits;
    drflac_uint64 leftoverDataMask;

    static drflac_uint64 leftoverDataMaskTable[8] = {
        0x00, 0x01, 0x03, 0x07, 0x0F, 0x1F, 0x3F, 0x7F
    };

    /* The data argument is only 32 bits wide, so no more than 32 bits can be consumed. */
    DRFLAC_ASSERT(count <= 32);

    wholeBytes = count >> 3;
    leftoverBits = count & 7;
    leftoverDataMask = leftoverDataMaskTable[leftoverBits];

    /* Every case deliberately falls through to the next. */
    switch (wholeBytes) {
        default:
        case 4: crc = drflac_crc16_byte(crc, (drflac_uint8)((data & (0xFF000000UL << leftoverBits)) >> (24 + leftoverBits)));
        case 3: crc = drflac_crc16_byte(crc, (drflac_uint8)((data & (0x00FF0000UL << leftoverBits)) >> (16 + leftoverBits)));
        case 2: crc = drflac_crc16_byte(crc, (drflac_uint8)((data & (0x0000FF00UL << leftoverBits)) >> ( 8 + leftoverBits)));
        case 1: crc = drflac_crc16_byte(crc, (drflac_uint8)((data & (0x000000FFUL << leftoverBits)) >> ( 0 + leftoverBits)));
        case 0: if (leftoverBits > 0) crc = (crc << leftoverBits) ^ drflac__crc16_table[(crc >> (16 - leftoverBits)) ^ (data & leftoverDataMask)];
    }
    return crc;
#endif
#endif
}

/* Advances a 16-bit CRC by the low `count` bits (at most 64) of a 64-bit value. */
static DRFLAC_INLINE drflac_uint16 drflac_crc16__64bit(drflac_uint16 crc, drflac_uint64 data, drflac_uint32 count)
{
#ifdef DR_FLAC_NO_CRC
    (void)crc;
    (void)data;
    (void)count;
    return 0;
#else
    drflac_uint32 wholeBytes;
    drflac_uint32 leftoverBits;
    drflac_uint64 leftoverDataMask;

    static drflac_uint64 leftoverDataMaskTable[8] = {
        0x00, 0x01, 0x03, 0x07, 0x0F, 0x1F, 0x3F, 0x7F
    };

    DRFLAC_ASSERT(count <= 64);

    wholeBytes = count >> 3;
    leftoverBits = count & 7;
    leftoverDataMask = leftoverDataMaskTable[leftoverBits];

    /* Every case deliberately falls through to the next. */
    switch (wholeBytes) {
        default:
        case 8: crc = drflac_crc16_byte(crc, (drflac_uint8)((data & (((drflac_uint64)0xFF000000 << 32) << leftoverBits)) >> (56 + leftoverBits)));    /* Weird "<< 32" bitshift is required for C89 because it doesn't support 64-bit constants. Should be optimized out by a good compiler. */
        case 7: crc = drflac_crc16_byte(crc, (drflac_uint8)((data & (((drflac_uint64)0x00FF0000 << 32) << leftoverBits)) >> (48 + leftoverBits)));
        case 6: crc = drflac_crc16_byte(crc, (drflac_uint8)((data & (((drflac_uint64)0x0000FF00 << 32) << leftoverBits)) >> (40 + leftoverBits)));
        case 5: crc = drflac_crc16_byte(crc, (drflac_uint8)((data & (((drflac_uint64)0x000000FF << 32) << leftoverBits)) >> (32 + leftoverBits)));
        case 4: crc = drflac_crc16_byte(crc, (drflac_uint8)((data & (((drflac_uint64)0xFF000000      ) << leftoverBits)) >> (24 + leftoverBits)));
        case 3: crc = drflac_crc16_byte(crc, (drflac_uint8)((data & (((drflac_uint64)0x00FF0000      ) << leftoverBits)) >> (16 + leftoverBits)));
        case 2: crc = drflac_crc16_byte(crc, (drflac_uint8)((data & (((drflac_uint64)0x0000FF00      ) << leftoverBits)) >> ( 8 + leftoverBits)));
        case 1: crc = drflac_crc16_byte(crc, (drflac_uint8)((data & (((drflac_uint64)0x000000FF      ) << leftoverBits)) >> ( 0 + leftoverBits)));
        case 0: if (leftoverBits > 0) crc = (crc << leftoverBits) ^ drflac__crc16_table[(crc >> (16 - leftoverBits)) ^ (data & leftoverDataMask)];
    }
    return crc;
#endif
}


/* Dispatches to the variant that matches the width of the cache word. */
static DRFLAC_INLINE drflac_uint16 drflac_crc16(drflac_uint16 crc, drflac_cache_t data, drflac_uint32 count)
{
#ifdef DRFLAC_64BIT
    return drflac_crc16__64bit(crc, data, count);
#else
    return drflac_crc16__32bit(crc, data, count);
#endif
}
#endif
2190
2191
/* Big-endian to host conversion for one cache line; the width follows the build (64-bit when DRFLAC_64BIT is defined). */
#if defined(DRFLAC_64BIT)
    #define drflac__be2host__cache_line drflac__be2host_64
#else
    #define drflac__be2host__cache_line drflac__be2host_32
#endif
2197
/*
BIT READING ATTEMPT #2

Bits are read through a bit-shifted cache that is 32 or 64 bits wide. As bits are consumed the cache is shifted so that the
next valid bit always sits on the most significant bit. Borrowing terminology from CPU architecture, there are two levels:

  - L1 is a single unsigned integer, 32 or 64 bits wide depending on whether a 32- or 64-bit build is being compiled.
  - L2 is an array of "cache lines", each the same size as the L1. It is a buffer of about 4KB and is where the data
    returned by onRead() is placed.

In the macros below, `bs` is a pointer to the bit stream and `_bitCount` is a number of bits.
*/

/* L1 geometry. */
#define DRFLAC_CACHE_L1_SIZE_BYTES(bs)                      (sizeof((bs)->cache))
#define DRFLAC_CACHE_L1_SIZE_BITS(bs)                       (DRFLAC_CACHE_L1_SIZE_BYTES(bs) * 8)
#define DRFLAC_CACHE_L1_BITS_REMAINING(bs)                  (DRFLAC_CACHE_L1_SIZE_BITS(bs) - (bs)->consumedBits)

/* Extraction of the top _bitCount bits of the L1. The _SAFE variant wraps the shift amount so it never reaches the full L1 width. */
#define DRFLAC_CACHE_L1_SELECTION_MASK(_bitCount)           (~((~(drflac_cache_t)0) >> (_bitCount)))
#define DRFLAC_CACHE_L1_SELECTION_SHIFT(bs, _bitCount)      (DRFLAC_CACHE_L1_SIZE_BITS(bs) - (_bitCount))
#define DRFLAC_CACHE_L1_SELECT(bs, _bitCount)               (((bs)->cache) & DRFLAC_CACHE_L1_SELECTION_MASK(_bitCount))
#define DRFLAC_CACHE_L1_SELECT_AND_SHIFT(bs, _bitCount)     (DRFLAC_CACHE_L1_SELECT((bs), (_bitCount)) >>  DRFLAC_CACHE_L1_SELECTION_SHIFT((bs), (_bitCount)))
#define DRFLAC_CACHE_L1_SELECT_AND_SHIFT_SAFE(bs, _bitCount) (DRFLAC_CACHE_L1_SELECT((bs), (_bitCount)) >> (DRFLAC_CACHE_L1_SELECTION_SHIFT((bs), (_bitCount)) & (DRFLAC_CACHE_L1_SIZE_BITS(bs)-1)))

/* L2 geometry. */
#define DRFLAC_CACHE_L2_SIZE_BYTES(bs)                      (sizeof((bs)->cacheL2))
#define DRFLAC_CACHE_L2_LINE_COUNT(bs)                      (DRFLAC_CACHE_L2_SIZE_BYTES(bs) / sizeof((bs)->cacheL2[0]))
#define DRFLAC_CACHE_L2_LINES_REMAINING(bs)                 (DRFLAC_CACHE_L2_LINE_COUNT(bs) - (bs)->nextL2Line)
2218
2219
2220 #ifndef DR_FLAC_NO_CRC
2221 static DRFLAC_INLINE void drflac__reset_crc16(drflac_bs* bs)
2222 {
2223     bs->crc16 = 0;
2224     bs->crc16CacheIgnoredBytes = bs->consumedBits >> 3;
2225 }
2226
2227 static DRFLAC_INLINE void drflac__update_crc16(drflac_bs* bs)
2228 {
2229     if (bs->crc16CacheIgnoredBytes == 0) {
2230         bs->crc16 = drflac_crc16_cache(bs->crc16, bs->crc16Cache);
2231     } else {
2232         bs->crc16 = drflac_crc16_bytes(bs->crc16, bs->crc16Cache, DRFLAC_CACHE_L1_SIZE_BYTES(bs) - bs->crc16CacheIgnoredBytes);
2233         bs->crc16CacheIgnoredBytes = 0;
2234     }
2235 }
2236
2237 static DRFLAC_INLINE drflac_uint16 drflac__flush_crc16(drflac_bs* bs)
2238 {
2239     /* We should never be flushing in a situation where we are not aligned on a byte boundary. */
2240     DRFLAC_ASSERT((DRFLAC_CACHE_L1_BITS_REMAINING(bs) & 7) == 0);
2241
2242     /*
2243     The bits that were read from the L1 cache need to be accumulated. The number of bytes needing to be accumulated is determined
2244     by the number of bits that have been consumed.
2245     */
2246     if (DRFLAC_CACHE_L1_BITS_REMAINING(bs) == 0) {
2247         drflac__update_crc16(bs);
2248     } else {
2249         /* We only accumulate the consumed bits. */
2250         bs->crc16 = drflac_crc16_bytes(bs->crc16, bs->crc16Cache >> DRFLAC_CACHE_L1_BITS_REMAINING(bs), (bs->consumedBits >> 3) - bs->crc16CacheIgnoredBytes);
2251
2252         /*
2253         The bits that we just accumulated should never be accumulated again. We need to keep track of how many bytes were accumulated
2254         so we can handle that later.
2255         */
2256         bs->crc16CacheIgnoredBytes = bs->consumedBits >> 3;
2257     }
2258
2259     return bs->crc16;
2260 }
2261 #endif
2262
2263 static DRFLAC_INLINE drflac_bool32 drflac__reload_l1_cache_from_l2(drflac_bs* bs)
2264 {
2265     size_t bytesRead;
2266     size_t alignedL1LineCount;
2267
2268     /* Fast path. Try loading straight from L2. */
2269     if (bs->nextL2Line < DRFLAC_CACHE_L2_LINE_COUNT(bs)) {
2270         bs->cache = bs->cacheL2[bs->nextL2Line++];
2271         return DRFLAC_TRUE;
2272     }
2273
2274     /*
2275     If we get here it means we've run out of data in the L2 cache. We'll need to fetch more from the client, if there's
2276     any left.
2277     */
2278     if (bs->unalignedByteCount > 0) {
2279         return DRFLAC_FALSE;   /* If we have any unaligned bytes it means there's no more aligned bytes left in the client. */
2280     }
2281
2282     bytesRead = bs->onRead(bs->pUserData, bs->cacheL2, DRFLAC_CACHE_L2_SIZE_BYTES(bs));
2283
2284     bs->nextL2Line = 0;
2285     if (bytesRead == DRFLAC_CACHE_L2_SIZE_BYTES(bs)) {
2286         bs->cache = bs->cacheL2[bs->nextL2Line++];
2287         return DRFLAC_TRUE;
2288     }
2289
2290
2291     /*
2292     If we get here it means we were unable to retrieve enough data to fill the entire L2 cache. It probably
2293     means we've just reached the end of the file. We need to move the valid data down to the end of the buffer
2294     and adjust the index of the next line accordingly. Also keep in mind that the L2 cache must be aligned to
2295     the size of the L1 so we'll need to seek backwards by any misaligned bytes.
2296     */
2297     alignedL1LineCount = bytesRead / DRFLAC_CACHE_L1_SIZE_BYTES(bs);
2298
2299     /* We need to keep track of any unaligned bytes for later use. */
2300     bs->unalignedByteCount = bytesRead - (alignedL1LineCount * DRFLAC_CACHE_L1_SIZE_BYTES(bs));
2301     if (bs->unalignedByteCount > 0) {
2302         bs->unalignedCache = bs->cacheL2[alignedL1LineCount];
2303     }
2304
2305     if (alignedL1LineCount > 0) {
2306         size_t offset = DRFLAC_CACHE_L2_LINE_COUNT(bs) - alignedL1LineCount;
2307         size_t i;
2308         for (i = alignedL1LineCount; i > 0; --i) {
2309             bs->cacheL2[i-1 + offset] = bs->cacheL2[i-1];
2310         }
2311
2312         bs->nextL2Line = (drflac_uint32)offset;
2313         bs->cache = bs->cacheL2[bs->nextL2Line++];
2314         return DRFLAC_TRUE;
2315     } else {
2316         /* If we get into this branch it means we weren't able to load any L1-aligned data. */
2317         bs->nextL2Line = DRFLAC_CACHE_L2_LINE_COUNT(bs);
2318         return DRFLAC_FALSE;
2319     }
2320 }
2321
2322 static drflac_bool32 drflac__reload_cache(drflac_bs* bs)
2323 {
2324     size_t bytesRead;
2325
2326 #ifndef DR_FLAC_NO_CRC
2327     drflac__update_crc16(bs);
2328 #endif
2329
2330     /* Fast path. Try just moving the next value in the L2 cache to the L1 cache. */
2331     if (drflac__reload_l1_cache_from_l2(bs)) {
2332         bs->cache = drflac__be2host__cache_line(bs->cache);
2333         bs->consumedBits = 0;
2334 #ifndef DR_FLAC_NO_CRC
2335         bs->crc16Cache = bs->cache;
2336 #endif
2337         return DRFLAC_TRUE;
2338     }
2339
2340     /* Slow path. */
2341
2342     /*
2343     If we get here it means we have failed to load the L1 cache from the L2. Likely we've just reached the end of the stream and the last
2344     few bytes did not meet the alignment requirements for the L2 cache. In this case we need to fall back to a slower path and read the
2345     data from the unaligned cache.
2346     */
2347     bytesRead = bs->unalignedByteCount;
2348     if (bytesRead == 0) {
2349         bs->consumedBits = DRFLAC_CACHE_L1_SIZE_BITS(bs);   /* <-- The stream has been exhausted, so marked the bits as consumed. */
2350         return DRFLAC_FALSE;
2351     }
2352
2353     DRFLAC_ASSERT(bytesRead < DRFLAC_CACHE_L1_SIZE_BYTES(bs));
2354     bs->consumedBits = (drflac_uint32)(DRFLAC_CACHE_L1_SIZE_BYTES(bs) - bytesRead) * 8;
2355
2356     bs->cache = drflac__be2host__cache_line(bs->unalignedCache);
2357     bs->cache &= DRFLAC_CACHE_L1_SELECTION_MASK(DRFLAC_CACHE_L1_BITS_REMAINING(bs));    /* <-- Make sure the consumed bits are always set to zero. Other parts of the library depend on this property. */
2358     bs->unalignedByteCount = 0;     /* <-- At this point the unaligned bytes have been moved into the cache and we thus have no more unaligned bytes. */
2359
2360 #ifndef DR_FLAC_NO_CRC
2361     bs->crc16Cache = bs->cache >> bs->consumedBits;
2362     bs->crc16CacheIgnoredBytes = bs->consumedBits >> 3;
2363 #endif
2364     return DRFLAC_TRUE;
2365 }
2366
2367 static void drflac__reset_cache(drflac_bs* bs)
2368 {
2369     bs->nextL2Line   = DRFLAC_CACHE_L2_LINE_COUNT(bs);  /* <-- This clears the L2 cache. */
2370     bs->consumedBits = DRFLAC_CACHE_L1_SIZE_BITS(bs);   /* <-- This clears the L1 cache. */
2371     bs->cache = 0;
2372     bs->unalignedByteCount = 0;                         /* <-- This clears the trailing unaligned bytes. */
2373     bs->unalignedCache = 0;
2374
2375 #ifndef DR_FLAC_NO_CRC
2376     bs->crc16Cache = 0;
2377     bs->crc16CacheIgnoredBytes = 0;
2378 #endif
2379 }
2380
2381
2382 static DRFLAC_INLINE drflac_bool32 drflac__read_uint32(drflac_bs* bs, unsigned int bitCount, drflac_uint32* pResultOut)
2383 {
2384     DRFLAC_ASSERT(bs != NULL);
2385     DRFLAC_ASSERT(pResultOut != NULL);
2386     DRFLAC_ASSERT(bitCount > 0);
2387     DRFLAC_ASSERT(bitCount <= 32);
2388
2389     if (bs->consumedBits == DRFLAC_CACHE_L1_SIZE_BITS(bs)) {
2390         if (!drflac__reload_cache(bs)) {
2391             return DRFLAC_FALSE;
2392         }
2393     }
2394
2395     if (bitCount <= DRFLAC_CACHE_L1_BITS_REMAINING(bs)) {
2396         /*
2397         If we want to load all 32-bits from a 32-bit cache we need to do it slightly differently because we can't do
2398         a 32-bit shift on a 32-bit integer. This will never be the case on 64-bit caches, so we can have a slightly
2399         more optimal solution for this.
2400         */
2401 #ifdef DRFLAC_64BIT
2402         *pResultOut = (drflac_uint32)DRFLAC_CACHE_L1_SELECT_AND_SHIFT(bs, bitCount);
2403         bs->consumedBits += bitCount;
2404         bs->cache <<= bitCount;
2405 #else
2406         if (bitCount < DRFLAC_CACHE_L1_SIZE_BITS(bs)) {
2407             *pResultOut = (drflac_uint32)DRFLAC_CACHE_L1_SELECT_AND_SHIFT(bs, bitCount);
2408             bs->consumedBits += bitCount;
2409             bs->cache <<= bitCount;
2410         } else {
2411             /* Cannot shift by 32-bits, so need to do it differently. */
2412             *pResultOut = (drflac_uint32)bs->cache;
2413             bs->consumedBits = DRFLAC_CACHE_L1_SIZE_BITS(bs);
2414             bs->cache = 0;
2415         }
2416 #endif
2417
2418         return DRFLAC_TRUE;
2419     } else {
2420         /* It straddles the cached data. It will never cover more than the next chunk. We just read the number in two parts and combine them. */
2421         drflac_uint32 bitCountHi = DRFLAC_CACHE_L1_BITS_REMAINING(bs);
2422         drflac_uint32 bitCountLo = bitCount - bitCountHi;
2423         drflac_uint32 resultHi;
2424
2425         DRFLAC_ASSERT(bitCountHi > 0);
2426         DRFLAC_ASSERT(bitCountHi < 32);
2427         resultHi = (drflac_uint32)DRFLAC_CACHE_L1_SELECT_AND_SHIFT(bs, bitCountHi);
2428
2429         if (!drflac__reload_cache(bs)) {
2430             return DRFLAC_FALSE;
2431         }
2432
2433         *pResultOut = (resultHi << bitCountLo) | (drflac_uint32)DRFLAC_CACHE_L1_SELECT_AND_SHIFT(bs, bitCountLo);
2434         bs->consumedBits += bitCountLo;
2435         bs->cache <<= bitCountLo;
2436         return DRFLAC_TRUE;
2437     }
2438 }
2439
2440 static drflac_bool32 drflac__read_int32(drflac_bs* bs, unsigned int bitCount, drflac_int32* pResult)
2441 {
2442     drflac_uint32 result;
2443
2444     DRFLAC_ASSERT(bs != NULL);
2445     DRFLAC_ASSERT(pResult != NULL);
2446     DRFLAC_ASSERT(bitCount > 0);
2447     DRFLAC_ASSERT(bitCount <= 32);
2448
2449     if (!drflac__read_uint32(bs, bitCount, &result)) {
2450         return DRFLAC_FALSE;
2451     }
2452
2453     /* Do not attempt to shift by 32 as it's undefined. */
2454     if (bitCount < 32) {
2455         drflac_uint32 signbit;
2456         signbit = ((result >> (bitCount-1)) & 0x01);
2457         result |= (~signbit + 1) << bitCount;
2458     }
2459
2460     *pResult = (drflac_int32)result;
2461     return DRFLAC_TRUE;
2462 }
2463
2464 #ifdef DRFLAC_64BIT
2465 static drflac_bool32 drflac__read_uint64(drflac_bs* bs, unsigned int bitCount, drflac_uint64* pResultOut)
2466 {
2467     drflac_uint32 resultHi;
2468     drflac_uint32 resultLo;
2469
2470     DRFLAC_ASSERT(bitCount <= 64);
2471     DRFLAC_ASSERT(bitCount >  32);
2472
2473     if (!drflac__read_uint32(bs, bitCount - 32, &resultHi)) {
2474         return DRFLAC_FALSE;
2475     }
2476
2477     if (!drflac__read_uint32(bs, 32, &resultLo)) {
2478         return DRFLAC_FALSE;
2479     }
2480
2481     *pResultOut = (((drflac_uint64)resultHi) << 32) | ((drflac_uint64)resultLo);
2482     return DRFLAC_TRUE;
2483 }
2484 #endif
2485
2486 /* Function below is unused, but leaving it here in case I need to quickly add it again. */
2487 #if 0
2488 static drflac_bool32 drflac__read_int64(drflac_bs* bs, unsigned int bitCount, drflac_int64* pResultOut)
2489 {
2490     drflac_uint64 result;
2491     drflac_uint64 signbit;
2492
2493     DRFLAC_ASSERT(bitCount <= 64);
2494
2495     if (!drflac__read_uint64(bs, bitCount, &result)) {
2496         return DRFLAC_FALSE;
2497     }
2498
2499     signbit = ((result >> (bitCount-1)) & 0x01);
2500     result |= (~signbit + 1) << bitCount;
2501
2502     *pResultOut = (drflac_int64)result;
2503     return DRFLAC_TRUE;
2504 }
2505 #endif
2506
2507 static drflac_bool32 drflac__read_uint16(drflac_bs* bs, unsigned int bitCount, drflac_uint16* pResult)
2508 {
2509     drflac_uint32 result;
2510
2511     DRFLAC_ASSERT(bs != NULL);
2512     DRFLAC_ASSERT(pResult != NULL);
2513     DRFLAC_ASSERT(bitCount > 0);
2514     DRFLAC_ASSERT(bitCount <= 16);
2515
2516     if (!drflac__read_uint32(bs, bitCount, &result)) {
2517         return DRFLAC_FALSE;
2518     }
2519
2520     *pResult = (drflac_uint16)result;
2521     return DRFLAC_TRUE;
2522 }
2523
2524 #if 0
2525 static drflac_bool32 drflac__read_int16(drflac_bs* bs, unsigned int bitCount, drflac_int16* pResult)
2526 {
2527     drflac_int32 result;
2528
2529     DRFLAC_ASSERT(bs != NULL);
2530     DRFLAC_ASSERT(pResult != NULL);
2531     DRFLAC_ASSERT(bitCount > 0);
2532     DRFLAC_ASSERT(bitCount <= 16);
2533
2534     if (!drflac__read_int32(bs, bitCount, &result)) {
2535         return DRFLAC_FALSE;
2536     }
2537
2538     *pResult = (drflac_int16)result;
2539     return DRFLAC_TRUE;
2540 }
2541 #endif
2542
2543 static drflac_bool32 drflac__read_uint8(drflac_bs* bs, unsigned int bitCount, drflac_uint8* pResult)
2544 {
2545     drflac_uint32 result;
2546
2547     DRFLAC_ASSERT(bs != NULL);
2548     DRFLAC_ASSERT(pResult != NULL);
2549     DRFLAC_ASSERT(bitCount > 0);
2550     DRFLAC_ASSERT(bitCount <= 8);
2551
2552     if (!drflac__read_uint32(bs, bitCount, &result)) {
2553         return DRFLAC_FALSE;
2554     }
2555
2556     *pResult = (drflac_uint8)result;
2557     return DRFLAC_TRUE;
2558 }
2559
2560 static drflac_bool32 drflac__read_int8(drflac_bs* bs, unsigned int bitCount, drflac_int8* pResult)
2561 {
2562     drflac_int32 result;
2563
2564     DRFLAC_ASSERT(bs != NULL);
2565     DRFLAC_ASSERT(pResult != NULL);
2566     DRFLAC_ASSERT(bitCount > 0);
2567     DRFLAC_ASSERT(bitCount <= 8);
2568
2569     if (!drflac__read_int32(bs, bitCount, &result)) {
2570         return DRFLAC_FALSE;
2571     }
2572
2573     *pResult = (drflac_int8)result;
2574     return DRFLAC_TRUE;
2575 }
2576
2577
2578 static drflac_bool32 drflac__seek_bits(drflac_bs* bs, size_t bitsToSeek)
2579 {
2580     if (bitsToSeek <= DRFLAC_CACHE_L1_BITS_REMAINING(bs)) {
2581         bs->consumedBits += (drflac_uint32)bitsToSeek;
2582         bs->cache <<= bitsToSeek;
2583         return DRFLAC_TRUE;
2584     } else {
2585         /* It straddles the cached data. This function isn't called too frequently so I'm favouring simplicity here. */
2586         bitsToSeek       -= DRFLAC_CACHE_L1_BITS_REMAINING(bs);
2587         bs->consumedBits += DRFLAC_CACHE_L1_BITS_REMAINING(bs);
2588         bs->cache         = 0;
2589
2590         /* Simple case. Seek in groups of the same number as bits that fit within a cache line. */
2591 #ifdef DRFLAC_64BIT
2592         while (bitsToSeek >= DRFLAC_CACHE_L1_SIZE_BITS(bs)) {
2593             drflac_uint64 bin;
2594             if (!drflac__read_uint64(bs, DRFLAC_CACHE_L1_SIZE_BITS(bs), &bin)) {
2595                 return DRFLAC_FALSE;
2596             }
2597             bitsToSeek -= DRFLAC_CACHE_L1_SIZE_BITS(bs);
2598         }
2599 #else
2600         while (bitsToSeek >= DRFLAC_CACHE_L1_SIZE_BITS(bs)) {
2601             drflac_uint32 bin;
2602             if (!drflac__read_uint32(bs, DRFLAC_CACHE_L1_SIZE_BITS(bs), &bin)) {
2603                 return DRFLAC_FALSE;
2604             }
2605             bitsToSeek -= DRFLAC_CACHE_L1_SIZE_BITS(bs);
2606         }
2607 #endif
2608
2609         /* Whole leftover bytes. */
2610         while (bitsToSeek >= 8) {
2611             drflac_uint8 bin;
2612             if (!drflac__read_uint8(bs, 8, &bin)) {
2613                 return DRFLAC_FALSE;
2614             }
2615             bitsToSeek -= 8;
2616         }
2617
2618         /* Leftover bits. */
2619         if (bitsToSeek > 0) {
2620             drflac_uint8 bin;
2621             if (!drflac__read_uint8(bs, (drflac_uint32)bitsToSeek, &bin)) {
2622                 return DRFLAC_FALSE;
2623             }
2624             bitsToSeek = 0; /* <-- Necessary for the assert below. */
2625         }
2626
2627         DRFLAC_ASSERT(bitsToSeek == 0);
2628         return DRFLAC_TRUE;
2629     }
2630 }
2631
2632
2633 /* This function moves the bit streamer to the first bit after the sync code (bit 15 of the of the frame header). It will also update the CRC-16. */
2634 static drflac_bool32 drflac__find_and_seek_to_next_sync_code(drflac_bs* bs)
2635 {
2636     DRFLAC_ASSERT(bs != NULL);
2637
2638     /*
2639     The sync code is always aligned to 8 bits. This is convenient for us because it means we can do byte-aligned movements. The first
2640     thing to do is align to the next byte.
2641     */
2642     if (!drflac__seek_bits(bs, DRFLAC_CACHE_L1_BITS_REMAINING(bs) & 7)) {
2643         return DRFLAC_FALSE;
2644     }
2645
2646     for (;;) {
2647         drflac_uint8 hi;
2648
2649 #ifndef DR_FLAC_NO_CRC
2650         drflac__reset_crc16(bs);
2651 #endif
2652
2653         if (!drflac__read_uint8(bs, 8, &hi)) {
2654             return DRFLAC_FALSE;
2655         }
2656
2657         if (hi == 0xFF) {
2658             drflac_uint8 lo;
2659             if (!drflac__read_uint8(bs, 6, &lo)) {
2660                 return DRFLAC_FALSE;
2661             }
2662
2663             if (lo == 0x3E) {
2664                 return DRFLAC_TRUE;
2665             } else {
2666                 if (!drflac__seek_bits(bs, DRFLAC_CACHE_L1_BITS_REMAINING(bs) & 7)) {
2667                     return DRFLAC_FALSE;
2668                 }
2669             }
2670         }
2671     }
2672
2673     /* Should never get here. */
2674     /*return DRFLAC_FALSE;*/
2675 }
2676
2677
/* Selects which count-leading-zeros back ends get compiled in. More than one may be enabled at the same time. */

/* Hardware LZCNT/CLZ instruction. Only available when the compiler supports the intrinsic. */
#ifdef DRFLAC_HAS_LZCNT_INTRINSIC
#define DRFLAC_IMPLEMENT_CLZ_LZCNT
#endif

/* _BitScanReverse() based. Real MSVC (not Clang) from _MSC_VER 1400 onwards, x86 and x64 only. */
#if defined(_MSC_VER) && !defined(__clang__)
    #if _MSC_VER >= 1400 && (defined(DRFLAC_X64) || defined(DRFLAC_X86))
    #define DRFLAC_IMPLEMENT_CLZ_MSVC
    #endif
#endif

/* Inline BSR through "#pragma aux". Watcom targeting 386. */
#if defined(__WATCOMC__) && defined(__386__)
#define DRFLAC_IMPLEMENT_CLZ_WATCOM
#endif
2687
2688 static DRFLAC_INLINE drflac_uint32 drflac__clz_software(drflac_cache_t x)
2689 {
2690     drflac_uint32 n;
2691     static drflac_uint32 clz_table_4[] = {
2692         0,
2693         4,
2694         3, 3,
2695         2, 2, 2, 2,
2696         1, 1, 1, 1, 1, 1, 1, 1
2697     };
2698
2699     if (x == 0) {
2700         return sizeof(x)*8;
2701     }
2702
2703     n = clz_table_4[x >> (sizeof(x)*8 - 4)];
2704     if (n == 0) {
2705 #ifdef DRFLAC_64BIT
2706         if ((x & ((drflac_uint64)0xFFFFFFFF << 32)) == 0) { n  = 32; x <<= 32; }
2707         if ((x & ((drflac_uint64)0xFFFF0000 << 32)) == 0) { n += 16; x <<= 16; }
2708         if ((x & ((drflac_uint64)0xFF000000 << 32)) == 0) { n += 8;  x <<= 8;  }
2709         if ((x & ((drflac_uint64)0xF0000000 << 32)) == 0) { n += 4;  x <<= 4;  }
2710 #else
2711         if ((x & 0xFFFF0000) == 0) { n  = 16; x <<= 16; }
2712         if ((x & 0xFF000000) == 0) { n += 8;  x <<= 8;  }
2713         if ((x & 0xF0000000) == 0) { n += 4;  x <<= 4;  }
2714 #endif
2715         n += clz_table_4[x >> (sizeof(x)*8 - 4)];
2716     }
2717
2718     return n - 1;
2719 }
2720
2721 #ifdef DRFLAC_IMPLEMENT_CLZ_LZCNT
2722 static DRFLAC_INLINE drflac_bool32 drflac__is_lzcnt_supported(void)
2723 {
2724     /* Fast compile time check for ARM. */
2725 #if defined(DRFLAC_HAS_LZCNT_INTRINSIC) && defined(DRFLAC_ARM) && (defined(__ARM_ARCH) && __ARM_ARCH >= 5)
2726     return DRFLAC_TRUE;
2727 #else
2728     /* If the compiler itself does not support the intrinsic then we'll need to return false. */
2729     #ifdef DRFLAC_HAS_LZCNT_INTRINSIC
2730         return drflac__gIsLZCNTSupported;
2731     #else
2732         return DRFLAC_FALSE;
2733     #endif
2734 #endif
2735 }
2736
2737 static DRFLAC_INLINE drflac_uint32 drflac__clz_lzcnt(drflac_cache_t x)
2738 {
2739     /*
2740     It's critical for competitive decoding performance that this function be highly optimal. With MSVC we can use the __lzcnt64() and __lzcnt() intrinsics
2741     to achieve good performance, however on GCC and Clang it's a little bit more annoying. The __builtin_clzl() and __builtin_clzll() intrinsics leave
2742     it undefined as to the return value when `x` is 0. We need this to be well defined as returning 32 or 64, depending on whether or not it's a 32- or
2743     64-bit build. To work around this we would need to add a conditional to check for the x = 0 case, but this creates unnecessary inefficiency. To work
2744     around this problem I have written some inline assembly to emit the LZCNT (x86) or CLZ (ARM) instruction directly which removes the need to include
2745     the conditional. This has worked well in the past, but for some reason Clang's MSVC compatible driver, clang-cl, does not seem to be handling this
2746     in the same way as the normal Clang driver. It seems that `clang-cl` is just outputting the wrong results sometimes, maybe due to some register
2747     getting clobbered?
2748
2749     I'm not sure if this is a bug with dr_flac's inlined assembly (most likely), a bug in `clang-cl` or just a misunderstanding on my part with inline
2750     assembly rules for `clang-cl`. If somebody can identify an error in dr_flac's inlined assembly I'm happy to get that fixed.
2751
2752     Fortunately there is an easy workaround for this. Clang implements MSVC-specific intrinsics for compatibility. It also defines _MSC_VER for extra
2753     compatibility. We can therefore just check for _MSC_VER and use the MSVC intrinsic which, fortunately for us, Clang supports. It would still be nice
2754     to know how to fix the inlined assembly for correctness sake, however.
2755     */
2756
2757 #if defined(_MSC_VER) /*&& !defined(__clang__)*/    /* <-- Intentionally wanting Clang to use the MSVC __lzcnt64/__lzcnt intrinsics due to above ^. */
2758     #ifdef DRFLAC_64BIT
2759         return (drflac_uint32)__lzcnt64(x);
2760     #else
2761         return (drflac_uint32)__lzcnt(x);
2762     #endif
2763 #else
2764     #if defined(__GNUC__) || defined(__clang__)
2765         #if defined(DRFLAC_X64)
2766             {
2767                 drflac_uint64 r;
2768                 __asm__ __volatile__ (
2769                     "lzcnt{ %1, %0| %0, %1}" : "=r"(r) : "r"(x) : "cc"
2770                 );
2771
2772                 return (drflac_uint32)r;
2773             }
2774         #elif defined(DRFLAC_X86)
2775             {
2776                 drflac_uint32 r;
2777                 __asm__ __volatile__ (
2778                     "lzcnt{l %1, %0| %0, %1}" : "=r"(r) : "r"(x) : "cc"
2779                 );
2780
2781                 return r;
2782             }
2783         #elif defined(DRFLAC_ARM) && (defined(__ARM_ARCH) && __ARM_ARCH >= 5) && !defined(DRFLAC_64BIT)   /* <-- I haven't tested 64-bit inline assembly, so only enabling this for the 32-bit build for now. */
2784             {
2785                 unsigned int r;
2786                 __asm__ __volatile__ (
2787                 #if defined(DRFLAC_64BIT)
2788                     "clz %w[out], %w[in]" : [out]"=r"(r) : [in]"r"(x)   /* <-- This is untested. If someone in the community could test this, that would be appreciated! */
2789                 #else
2790                     "clz %[out], %[in]" : [out]"=r"(r) : [in]"r"(x)
2791                 #endif
2792                 );
2793
2794                 return r;
2795             }
2796         #else
2797             if (x == 0) {
2798                 return sizeof(x)*8;
2799             }
2800             #ifdef DRFLAC_64BIT
2801                 return (drflac_uint32)__builtin_clzll((drflac_uint64)x);
2802             #else
2803                 return (drflac_uint32)__builtin_clzl((drflac_uint32)x);
2804             #endif
2805         #endif
2806     #else
2807         /* Unsupported compiler. */
2808         #error "This compiler does not support the lzcnt intrinsic."
2809     #endif
2810 #endif
2811 }
2812 #endif
2813
2814 #ifdef DRFLAC_IMPLEMENT_CLZ_MSVC
2815 #include <intrin.h> /* For BitScanReverse(). */
2816
2817 static DRFLAC_INLINE drflac_uint32 drflac__clz_msvc(drflac_cache_t x)
2818 {
2819     drflac_uint32 n;
2820
2821     if (x == 0) {
2822         return sizeof(x)*8;
2823     }
2824
2825 #ifdef DRFLAC_64BIT
2826     _BitScanReverse64((unsigned long*)&n, x);
2827 #else
2828     _BitScanReverse((unsigned long*)&n, x);
2829 #endif
2830     return sizeof(x)*8 - n - 1;
2831 }
2832 #endif
2833
2834 #ifdef DRFLAC_IMPLEMENT_CLZ_WATCOM
/*
Count-leading-zeros for Watcom, implemented as inline assembly through "#pragma aux". The argument is passed in EAX
and the result is returned in EAX. BSR yields the index of the highest set bit and the XOR with 31 turns that index
into a leading-zero count for a 32-bit value. Zero must not be passed in; drflac__clz() checks for it before calling.
*/
static __inline drflac_uint32 drflac__clz_watcom (drflac_uint32);
#pragma aux drflac__clz_watcom = \
    "bsr eax, eax" \
    "xor eax, 31" \
    parm [eax] nomemory \
    value [eax] \
    modify exact [eax] nomemory;
2842 #endif
2843
2844 static DRFLAC_INLINE drflac_uint32 drflac__clz(drflac_cache_t x)
2845 {
2846 #ifdef DRFLAC_IMPLEMENT_CLZ_LZCNT
2847     if (drflac__is_lzcnt_supported()) {
2848         return drflac__clz_lzcnt(x);
2849     } else
2850 #endif
2851     {
2852 #ifdef DRFLAC_IMPLEMENT_CLZ_MSVC
2853         return drflac__clz_msvc(x);
2854 #elif defined(DRFLAC_IMPLEMENT_CLZ_WATCOM)
2855         return (x == 0) ? sizeof(x)*8 : drflac__clz_watcom(x);
2856 #else
2857         return drflac__clz_software(x);
2858 #endif
2859     }
2860 }
2861
2862
2863 static DRFLAC_INLINE drflac_bool32 drflac__seek_past_next_set_bit(drflac_bs* bs, unsigned int* pOffsetOut)
2864 {
2865     drflac_uint32 zeroCounter = 0;
2866     drflac_uint32 setBitOffsetPlus1;
2867
2868     while (bs->cache == 0) {
2869         zeroCounter += (drflac_uint32)DRFLAC_CACHE_L1_BITS_REMAINING(bs);
2870         if (!drflac__reload_cache(bs)) {
2871             return DRFLAC_FALSE;
2872         }
2873     }
2874
2875     setBitOffsetPlus1 = drflac__clz(bs->cache);
2876     setBitOffsetPlus1 += 1;
2877
2878     bs->consumedBits += setBitOffsetPlus1;
2879     bs->cache <<= setBitOffsetPlus1;
2880
2881     *pOffsetOut = zeroCounter + setBitOffsetPlus1 - 1;
2882     return DRFLAC_TRUE;
2883 }
2884
2885
2886
2887 static drflac_bool32 drflac__seek_to_byte(drflac_bs* bs, drflac_uint64 offsetFromStart)
2888 {
2889     DRFLAC_ASSERT(bs != NULL);
2890     DRFLAC_ASSERT(offsetFromStart > 0);
2891
2892     /*
2893     Seeking from the start is not quite as trivial as it sounds because the onSeek callback takes a signed 32-bit integer (which
2894     is intentional because it simplifies the implementation of the onSeek callbacks), however offsetFromStart is unsigned 64-bit.
2895     To resolve we just need to do an initial seek from the start, and then a series of offset seeks to make up the remainder.
2896     */
2897     if (offsetFromStart > 0x7FFFFFFF) {
2898         drflac_uint64 bytesRemaining = offsetFromStart;
2899         if (!bs->onSeek(bs->pUserData, 0x7FFFFFFF, drflac_seek_origin_start)) {
2900             return DRFLAC_FALSE;
2901         }
2902         bytesRemaining -= 0x7FFFFFFF;
2903
2904         while (bytesRemaining > 0x7FFFFFFF) {
2905             if (!bs->onSeek(bs->pUserData, 0x7FFFFFFF, drflac_seek_origin_current)) {
2906                 return DRFLAC_FALSE;
2907             }
2908             bytesRemaining -= 0x7FFFFFFF;
2909         }
2910
2911         if (bytesRemaining > 0) {
2912             if (!bs->onSeek(bs->pUserData, (int)bytesRemaining, drflac_seek_origin_current)) {
2913                 return DRFLAC_FALSE;
2914             }
2915         }
2916     } else {
2917         if (!bs->onSeek(bs->pUserData, (int)offsetFromStart, drflac_seek_origin_start)) {
2918             return DRFLAC_FALSE;
2919         }
2920     }
2921
2922     /* The cache should be reset to force a reload of fresh data from the client. */
2923     drflac__reset_cache(bs);
2924     return DRFLAC_TRUE;
2925 }
2926
2927
2928 static drflac_result drflac__read_utf8_coded_number(drflac_bs* bs, drflac_uint64* pNumberOut, drflac_uint8* pCRCOut)
2929 {
2930     drflac_uint8 crc;
2931     drflac_uint64 result;
2932     drflac_uint8 utf8[7] = {0};
2933     int byteCount;
2934     int i;
2935
2936     DRFLAC_ASSERT(bs != NULL);
2937     DRFLAC_ASSERT(pNumberOut != NULL);
2938     DRFLAC_ASSERT(pCRCOut != NULL);
2939
2940     crc = *pCRCOut;
2941
2942     if (!drflac__read_uint8(bs, 8, utf8)) {
2943         *pNumberOut = 0;
2944         return DRFLAC_AT_END;
2945     }
2946     crc = drflac_crc8(crc, utf8[0], 8);
2947
2948     if ((utf8[0] & 0x80) == 0) {
2949         *pNumberOut = utf8[0];
2950         *pCRCOut = crc;
2951         return DRFLAC_SUCCESS;
2952     }
2953
2954     /*byteCount = 1;*/
2955     if ((utf8[0] & 0xE0) == 0xC0) {
2956         byteCount = 2;
2957     } else if ((utf8[0] & 0xF0) == 0xE0) {
2958         byteCount = 3;
2959     } else if ((utf8[0] & 0xF8) == 0xF0) {
2960         byteCount = 4;
2961     } else if ((utf8[0] & 0xFC) == 0xF8) {
2962         byteCount = 5;
2963     } else if ((utf8[0] & 0xFE) == 0xFC) {
2964         byteCount = 6;
2965     } else if ((utf8[0] & 0xFF) == 0xFE) {
2966         byteCount = 7;
2967     } else {
2968         *pNumberOut = 0;
2969         return DRFLAC_CRC_MISMATCH;     /* Bad UTF-8 encoding. */
2970     }
2971
2972     /* Read extra bytes. */
2973     DRFLAC_ASSERT(byteCount > 1);
2974
2975     result = (drflac_uint64)(utf8[0] & (0xFF >> (byteCount + 1)));
2976     for (i = 1; i < byteCount; ++i) {
2977         if (!drflac__read_uint8(bs, 8, utf8 + i)) {
2978             *pNumberOut = 0;
2979             return DRFLAC_AT_END;
2980         }
2981         crc = drflac_crc8(crc, utf8[i], 8);
2982
2983         result = (result << 6) | (utf8[i] & 0x3F);
2984     }
2985
2986     *pNumberOut = result;
2987     *pCRCOut = crc;
2988     return DRFLAC_SUCCESS;
2989 }
2990
2991
2992
2993 /*
2994 The next two functions are responsible for calculating the prediction.
2995
2996 When the bits per sample is >16 we need to use 64-bit integer arithmetic because otherwise we'll run out of precision. It's
2997 safe to assume this will be slower on 32-bit platforms so we use a more optimal solution when the bits per sample is <=16.
2998 */
2999 static DRFLAC_INLINE drflac_int32 drflac__calculate_prediction_32(drflac_uint32 order, drflac_int32 shift, const drflac_int32* coefficients, drflac_int32* pDecodedSamples)
3000 {
3001     drflac_int32 prediction = 0;
3002
3003     DRFLAC_ASSERT(order <= 32);
3004
3005     /* 32-bit version. */
3006
3007     /* VC++ optimizes this to a single jmp. I've not yet verified this for other compilers. */
3008     switch (order)
3009     {
3010     case 32: prediction += coefficients[31] * pDecodedSamples[-32];
3011     case 31: prediction += coefficients[30] * pDecodedSamples[-31];
3012     case 30: prediction += coefficients[29] * pDecodedSamples[-30];
3013     case 29: prediction += coefficients[28] * pDecodedSamples[-29];
3014     case 28: prediction += coefficients[27] * pDecodedSamples[-28];
3015     case 27: prediction += coefficients[26] * pDecodedSamples[-27];
3016     case 26: prediction += coefficients[25] * pDecodedSamples[-26];
3017     case 25: prediction += coefficients[24] * pDecodedSamples[-25];
3018     case 24: prediction += coefficients[23] * pDecodedSamples[-24];
3019     case 23: prediction += coefficients[22] * pDecodedSamples[-23];
3020     case 22: prediction += coefficients[21] * pDecodedSamples[-22];
3021     case 21: prediction += coefficients[20] * pDecodedSamples[-21];
3022     case 20: prediction += coefficients[19] * pDecodedSamples[-20];
3023     case 19: prediction += coefficients[18] * pDecodedSamples[-19];
3024     case 18: prediction += coefficients[17] * pDecodedSamples[-18];
3025     case 17: prediction += coefficients[16] * pDecodedSamples[-17];
3026     case 16: prediction += coefficients[15] * pDecodedSamples[-16];
3027     case 15: prediction += coefficients[14] * pDecodedSamples[-15];
3028     case 14: prediction += coefficients[13] * pDecodedSamples[-14];
3029     case 13: prediction += coefficients[12] * pDecodedSamples[-13];
3030     case 12: prediction += coefficients[11] * pDecodedSamples[-12];
3031     case 11: prediction += coefficients[10] * pDecodedSamples[-11];
3032     case 10: prediction += coefficients[ 9] * pDecodedSamples[-10];
3033     case  9: prediction += coefficients[ 8] * pDecodedSamples[- 9];
3034     case  8: prediction += coefficients[ 7] * pDecodedSamples[- 8];
3035     case  7: prediction += coefficients[ 6] * pDecodedSamples[- 7];
3036     case  6: prediction += coefficients[ 5] * pDecodedSamples[- 6];
3037     case  5: prediction += coefficients[ 4] * pDecodedSamples[- 5];
3038     case  4: prediction += coefficients[ 3] * pDecodedSamples[- 4];
3039     case  3: prediction += coefficients[ 2] * pDecodedSamples[- 3];
3040     case  2: prediction += coefficients[ 1] * pDecodedSamples[- 2];
3041     case  1: prediction += coefficients[ 0] * pDecodedSamples[- 1];
3042     }
3043
3044     return (drflac_int32)(prediction >> shift);
3045 }
3046
3047 static DRFLAC_INLINE drflac_int32 drflac__calculate_prediction_64(drflac_uint32 order, drflac_int32 shift, const drflac_int32* coefficients, drflac_int32* pDecodedSamples)
3048 {
         /*
         Computes the linear prediction for the sample located at pDecodedSamples[0], accumulating in 64 bits.

         order           - Number of terms in the sum. Must not exceed 32 (asserted).
         shift           - Number of bits the accumulated sum is shifted right by before it is returned.
         coefficients    - coefficients[k] is multiplied with pDecodedSamples[-(k+1)] for k in [0, order).
         pDecodedSamples - The `order` samples immediately before this pointer are read. Nothing is written through it.

         Each sample is cast to drflac_int64 before it is multiplied, so every product and the running sum are 64-bit. The
         return value is the sum shifted right by `shift` and then cast to drflac_int32. An order of 0 yields a sum of 0 on
         both code paths.
         */
3049     drflac_int64 prediction;
3050
3051     DRFLAC_ASSERT(order <= 32);
3052
3053     /* 64-bit version. */
3054
3055     /* This method is faster on the 32-bit build when compiling with VC++. See note below. */
         /*
         NOTE(review): the orders below are tested in a non-numeric sequence (8, 7, 3, 6, 5, 4, 12, 2, 1, 10, 9, 11). Presumably
         this puts the most frequent orders first - confirm before reordering.
         */
3056 #ifndef DRFLAC_64BIT
3057     if (order == 8)
3058     {
3059         prediction  = coefficients[0] * (drflac_int64)pDecodedSamples[-1];
3060         prediction += coefficients[1] * (drflac_int64)pDecodedSamples[-2];
3061         prediction += coefficients[2] * (drflac_int64)pDecodedSamples[-3];
3062         prediction += coefficients[3] * (drflac_int64)pDecodedSamples[-4];
3063         prediction += coefficients[4] * (drflac_int64)pDecodedSamples[-5];
3064         prediction += coefficients[5] * (drflac_int64)pDecodedSamples[-6];
3065         prediction += coefficients[6] * (drflac_int64)pDecodedSamples[-7];
3066         prediction += coefficients[7] * (drflac_int64)pDecodedSamples[-8];
3067     }
3068     else if (order == 7)
3069     {
3070         prediction  = coefficients[0] * (drflac_int64)pDecodedSamples[-1];
3071         prediction += coefficients[1] * (drflac_int64)pDecodedSamples[-2];
3072         prediction += coefficients[2] * (drflac_int64)pDecodedSamples[-3];
3073         prediction += coefficients[3] * (drflac_int64)pDecodedSamples[-4];
3074         prediction += coefficients[4] * (drflac_int64)pDecodedSamples[-5];
3075         prediction += coefficients[5] * (drflac_int64)pDecodedSamples[-6];
3076         prediction += coefficients[6] * (drflac_int64)pDecodedSamples[-7];
3077     }
3078     else if (order == 3)
3079     {
3080         prediction  = coefficients[0] * (drflac_int64)pDecodedSamples[-1];
3081         prediction += coefficients[1] * (drflac_int64)pDecodedSamples[-2];
3082         prediction += coefficients[2] * (drflac_int64)pDecodedSamples[-3];
3083     }
3084     else if (order == 6)
3085     {
3086         prediction  = coefficients[0] * (drflac_int64)pDecodedSamples[-1];
3087         prediction += coefficients[1] * (drflac_int64)pDecodedSamples[-2];
3088         prediction += coefficients[2] * (drflac_int64)pDecodedSamples[-3];
3089         prediction += coefficients[3] * (drflac_int64)pDecodedSamples[-4];
3090         prediction += coefficients[4] * (drflac_int64)pDecodedSamples[-5];
3091         prediction += coefficients[5] * (drflac_int64)pDecodedSamples[-6];
3092     }
3093     else if (order == 5)
3094     {
3095         prediction  = coefficients[0] * (drflac_int64)pDecodedSamples[-1];
3096         prediction += coefficients[1] * (drflac_int64)pDecodedSamples[-2];
3097         prediction += coefficients[2] * (drflac_int64)pDecodedSamples[-3];
3098         prediction += coefficients[3] * (drflac_int64)pDecodedSamples[-4];
3099         prediction += coefficients[4] * (drflac_int64)pDecodedSamples[-5];
3100     }
3101     else if (order == 4)
3102     {
3103         prediction  = coefficients[0] * (drflac_int64)pDecodedSamples[-1];
3104         prediction += coefficients[1] * (drflac_int64)pDecodedSamples[-2];
3105         prediction += coefficients[2] * (drflac_int64)pDecodedSamples[-3];
3106         prediction += coefficients[3] * (drflac_int64)pDecodedSamples[-4];
3107     }
3108     else if (order == 12)
3109     {
3110         prediction  = coefficients[0]  * (drflac_int64)pDecodedSamples[-1];
3111         prediction += coefficients[1]  * (drflac_int64)pDecodedSamples[-2];
3112         prediction += coefficients[2]  * (drflac_int64)pDecodedSamples[-3];
3113         prediction += coefficients[3]  * (drflac_int64)pDecodedSamples[-4];
3114         prediction += coefficients[4]  * (drflac_int64)pDecodedSamples[-5];
3115         prediction += coefficients[5]  * (drflac_int64)pDecodedSamples[-6];
3116         prediction += coefficients[6]  * (drflac_int64)pDecodedSamples[-7];
3117         prediction += coefficients[7]  * (drflac_int64)pDecodedSamples[-8];
3118         prediction += coefficients[8]  * (drflac_int64)pDecodedSamples[-9];
3119         prediction += coefficients[9]  * (drflac_int64)pDecodedSamples[-10];
3120         prediction += coefficients[10] * (drflac_int64)pDecodedSamples[-11];
3121         prediction += coefficients[11] * (drflac_int64)pDecodedSamples[-12];
3122     }
3123     else if (order == 2)
3124     {
3125         prediction  = coefficients[0] * (drflac_int64)pDecodedSamples[-1];
3126         prediction += coefficients[1] * (drflac_int64)pDecodedSamples[-2];
3127     }
3128     else if (order == 1)
3129     {
3130         prediction = coefficients[0] * (drflac_int64)pDecodedSamples[-1];
3131     }
3132     else if (order == 10)
3133     {
3134         prediction  = coefficients[0]  * (drflac_int64)pDecodedSamples[-1];
3135         prediction += coefficients[1]  * (drflac_int64)pDecodedSamples[-2];
3136         prediction += coefficients[2]  * (drflac_int64)pDecodedSamples[-3];
3137         prediction += coefficients[3]  * (drflac_int64)pDecodedSamples[-4];
3138         prediction += coefficients[4]  * (drflac_int64)pDecodedSamples[-5];
3139         prediction += coefficients[5]  * (drflac_int64)pDecodedSamples[-6];
3140         prediction += coefficients[6]  * (drflac_int64)pDecodedSamples[-7];
3141         prediction += coefficients[7]  * (drflac_int64)pDecodedSamples[-8];
3142         prediction += coefficients[8]  * (drflac_int64)pDecodedSamples[-9];
3143         prediction += coefficients[9]  * (drflac_int64)pDecodedSamples[-10];
3144     }
3145     else if (order == 9)
3146     {
3147         prediction  = coefficients[0]  * (drflac_int64)pDecodedSamples[-1];
3148         prediction += coefficients[1]  * (drflac_int64)pDecodedSamples[-2];
3149         prediction += coefficients[2]  * (drflac_int64)pDecodedSamples[-3];
3150         prediction += coefficients[3]  * (drflac_int64)pDecodedSamples[-4];
3151         prediction += coefficients[4]  * (drflac_int64)pDecodedSamples[-5];
3152         prediction += coefficients[5]  * (drflac_int64)pDecodedSamples[-6];
3153         prediction += coefficients[6]  * (drflac_int64)pDecodedSamples[-7];
3154         prediction += coefficients[7]  * (drflac_int64)pDecodedSamples[-8];
3155         prediction += coefficients[8]  * (drflac_int64)pDecodedSamples[-9];
3156     }
3157     else if (order == 11)
3158     {
3159         prediction  = coefficients[0]  * (drflac_int64)pDecodedSamples[-1];
3160         prediction += coefficients[1]  * (drflac_int64)pDecodedSamples[-2];
3161         prediction += coefficients[2]  * (drflac_int64)pDecodedSamples[-3];
3162         prediction += coefficients[3]  * (drflac_int64)pDecodedSamples[-4];
3163         prediction += coefficients[4]  * (drflac_int64)pDecodedSamples[-5];
3164         prediction += coefficients[5]  * (drflac_int64)pDecodedSamples[-6];
3165         prediction += coefficients[6]  * (drflac_int64)pDecodedSamples[-7];
3166         prediction += coefficients[7]  * (drflac_int64)pDecodedSamples[-8];
3167         prediction += coefficients[8]  * (drflac_int64)pDecodedSamples[-9];
3168         prediction += coefficients[9]  * (drflac_int64)pDecodedSamples[-10];
3169         prediction += coefficients[10] * (drflac_int64)pDecodedSamples[-11];
3170     }
3171     else
3172     {
             /* Generic path for every order without an unrolled branch above. With an order of 0 the loop body never runs and the sum stays 0. */
3173         int j;
3174
3175         prediction = 0;
3176         for (j = 0; j < (int)order; ++j) {
3177             prediction += coefficients[j] * (drflac_int64)pDecodedSamples[-j-1];
3178         }
3179     }
3180 #endif
3181
3182     /*
3183     VC++ optimizes this to a single jmp instruction, but only the 64-bit build. The 32-bit build generates less efficient code for some
3184     reason. The ugly version above is faster so we'll just switch between the two depending on the target platform.
3185     */
3186 #ifdef DRFLAC_64BIT
3187     prediction = 0;
         /*
         None of the cases below ends in a break. Entering at `order` falls through every lower case, so exactly `order` products
         are added, from the oldest sample to the most recent one. An order of 0 matches no case and leaves the sum at 0.
         */
3188     switch (order)
3189     {
3190     case 32: prediction += coefficients[31] * (drflac_int64)pDecodedSamples[-32];
3191     case 31: prediction += coefficients[30] * (drflac_int64)pDecodedSamples[-31];
3192     case 30: prediction += coefficients[29] * (drflac_int64)pDecodedSamples[-30];
3193     case 29: prediction += coefficients[28] * (drflac_int64)pDecodedSamples[-29];
3194     case 28: prediction += coefficients[27] * (drflac_int64)pDecodedSamples[-28];
3195     case 27: prediction += coefficients[26] * (drflac_int64)pDecodedSamples[-27];
3196     case 26: prediction += coefficients[25] * (drflac_int64)pDecodedSamples[-26];
3197     case 25: prediction += coefficients[24] * (drflac_int64)pDecodedSamples[-25];
3198     case 24: prediction += coefficients[23] * (drflac_int64)pDecodedSamples[-24];
3199     case 23: prediction += coefficients[22] * (drflac_int64)pDecodedSamples[-23];
3200     case 22: prediction += coefficients[21] * (drflac_int64)pDecodedSamples[-22];
3201     case 21: prediction += coefficients[20] * (drflac_int64)pDecodedSamples[-21];
3202     case 20: prediction += coefficients[19] * (drflac_int64)pDecodedSamples[-20];
3203     case 19: prediction += coefficients[18] * (drflac_int64)pDecodedSamples[-19];
3204     case 18: prediction += coefficients[17] * (drflac_int64)pDecodedSamples[-18];
3205     case 17: prediction += coefficients[16] * (drflac_int64)pDecodedSamples[-17];
3206     case 16: prediction += coefficients[15] * (drflac_int64)pDecodedSamples[-16];
3207     case 15: prediction += coefficients[14] * (drflac_int64)pDecodedSamples[-15];
3208     case 14: prediction += coefficients[13] * (drflac_int64)pDecodedSamples[-14];
3209     case 13: prediction += coefficients[12] * (drflac_int64)pDecodedSamples[-13];
3210     case 12: prediction += coefficients[11] * (drflac_int64)pDecodedSamples[-12];
3211     case 11: prediction += coefficients[10] * (drflac_int64)pDecodedSamples[-11];
3212     case 10: prediction += coefficients[ 9] * (drflac_int64)pDecodedSamples[-10];
3213     case  9: prediction += coefficients[ 8] * (drflac_int64)pDecodedSamples[- 9];
3214     case  8: prediction += coefficients[ 7] * (drflac_int64)pDecodedSamples[- 8];
3215     case  7: prediction += coefficients[ 6] * (drflac_int64)pDecodedSamples[- 7];
3216     case  6: prediction += coefficients[ 5] * (drflac_int64)pDecodedSamples[- 6];
3217     case  5: prediction += coefficients[ 4] * (drflac_int64)pDecodedSamples[- 5];
3218     case  4: prediction += coefficients[ 3] * (drflac_int64)pDecodedSamples[- 4];
3219     case  3: prediction += coefficients[ 2] * (drflac_int64)pDecodedSamples[- 3];
3220     case  2: prediction += coefficients[ 1] * (drflac_int64)pDecodedSamples[- 2];
3221     case  1: prediction += coefficients[ 0] * (drflac_int64)pDecodedSamples[- 1];
3222     }
3223 #endif
3224
3225     return (drflac_int32)(prediction >> shift);
3226 }
3227
3228
3229 #if 0
3230 /*
3231 Reference decoder for Rice-coded residuals. It reads the stream one code at a time in the most direct way possible so that
3232 it stays easy to read. It is compiled out and should only be used as a reference.
3233 */
3234 static drflac_bool32 drflac__decode_samples_with_residual__rice__reference(drflac_bs* bs, drflac_uint32 bitsPerSample, drflac_uint32 count, drflac_uint8 riceParam, drflac_uint32 order, drflac_int32 shift, const drflac_int32* coefficients, drflac_int32* pSamplesOut)
3235 {
3236     drflac_uint32 iSample;
3237
3238     DRFLAC_ASSERT(bs != NULL);
3239     DRFLAC_ASSERT(pSamplesOut != NULL);
3240
3241     for (iSample = 0; iSample < count; ++iSample) {
3242         drflac_uint32 quotient = 0;
3243         drflac_uint32 residual;
3244         drflac_uint8  bit;
3245
3246         /* Unary part: count the zero bits that come before the terminating set bit. */
3247         for (;;) {
3248             if (!drflac__read_uint8(bs, 1, &bit)) {
3249                 return DRFLAC_FALSE;
3250             }
3251
3252             if (bit != 0) {
3253                 break;
3254             }
3255             quotient += 1;
3256         }
3257
3258         /* Binary part: the next riceParam bits. Nothing is read when riceParam is 0. */
3259         if (riceParam == 0) {
3260             residual = 0;
3261         } else if (!drflac__read_uint32(bs, riceParam, &residual)) {
3262             return DRFLAC_FALSE;
3263         }
3264
3265         /* Combine both parts. The lowest bit of the result selects the unfolding: odd gives ~(n >> 1), even gives n >> 1. */
3266         residual |= (quotient << riceParam);
3267         residual  = (residual & 0x01) ? ~(residual >> 1) : (residual >> 1);
3268
3269         /*
3270         The output sample is the residual plus the prediction made from the samples already written to pSamplesOut. The
3271         64-bit predictor is selected once bitsPerSample+shift reaches 32.
3272         */
3273         if (bitsPerSample+shift >= 32) {
3274             pSamplesOut[iSample] = residual + drflac__calculate_prediction_64(order, shift, coefficients, pSamplesOut + iSample);
3275         } else {
3276             pSamplesOut[iSample] = residual + drflac__calculate_prediction_32(order, shift, coefficients, pSamplesOut + iSample);
3277         }
3278     }
3279
3280     return DRFLAC_TRUE;
3281 }
3282 #endif
3283
3284 #if 0
3285 static drflac_bool32 drflac__read_rice_parts__reference(drflac_bs* bs, drflac_uint8 riceParam, drflac_uint32* pZeroCounterOut, drflac_uint32* pRiceParamPartOut)
3286 {
3287     /* Bit-at-a-time reader for the two parts of one Rice code. The outputs are written only when both reads succeed. */
3288     drflac_uint32 leadingZeros = 0;
3289     drflac_uint32 lowBits;
3290     drflac_uint8  bit;
3291
3292     /* Unary part: pull single bits until a set bit ends the run of zeros. */
3293     for (;;) {
3294         if (!drflac__read_uint8(bs, 1, &bit)) {
3295             return DRFLAC_FALSE;
3296         }
3297         if (bit != 0) {
3298             break;
3299         }
3300         leadingZeros += 1;
3301     }
3302
3303     /* Binary part: the next riceParam bits. Nothing is read when riceParam is 0. */
3304     if (riceParam == 0) {
3305         lowBits = 0;
3306     } else if (!drflac__read_uint32(bs, riceParam, &lowBits)) {
3307         return DRFLAC_FALSE;
3308     }
3309
3310     pZeroCounterOut[0]   = leadingZeros;
3311     pRiceParamPartOut[0] = lowBits;
3312
3313     return DRFLAC_TRUE;
3314 }
3315 #endif
3316
3317 #if 0
     /*
     Compiled-out reader for the two parts of one Rice code that works directly on bs->cache instead of reading bit by bit.

     riceParam must be greater than 0 (asserted). On success pZeroCounterOut[0] receives the number of zero bits in front of
     the first set bit and pRiceParamPartOut[0] receives the riceParam part that follows it. Returns DRFLAC_FALSE as soon as
     drflac__reload_cache() fails; the outputs are not written in that case.
     */
3318 static DRFLAC_INLINE drflac_bool32 drflac__read_rice_parts(drflac_bs* bs, drflac_uint8 riceParam, drflac_uint32* pZeroCounterOut, drflac_uint32* pRiceParamPartOut)
3319 {
3320     drflac_cache_t riceParamMask;
3321     drflac_uint32 zeroCounter;
3322     drflac_uint32 setBitOffsetPlus1;
3323     drflac_uint32 riceParamPart;
3324     drflac_uint32 riceLength;
3325
3326     DRFLAC_ASSERT(riceParam > 0);   /* <-- riceParam should never be 0. drflac__read_rice_parts__param_equals_zero() should be used instead for this case. */
3327
3328     riceParamMask = DRFLAC_CACHE_L1_SELECTION_MASK(riceParam);
3329
         /* An all-zero cache means the run of zeros continues past this cache line: count what is left of it and load the next one. */
3330     zeroCounter = 0;
3331     while (bs->cache == 0) {
3332         zeroCounter += (drflac_uint32)DRFLAC_CACHE_L1_BITS_REMAINING(bs);
3333         if (!drflac__reload_cache(bs)) {
3334             return DRFLAC_FALSE;
3335         }
3336     }
3337
         /* The leading zeros of the current cache finish the run. The 1 added afterwards accounts for the set bit that ends it. */
3338     setBitOffsetPlus1 = drflac__clz(bs->cache);
3339     zeroCounter += setBitOffsetPlus1;
3340     setBitOffsetPlus1 += 1;
3341
         /* Bits this code takes from the current cache line onwards: the leading zeros, the set bit and the riceParam bits. */
3342     riceLength = setBitOffsetPlus1 + riceParam;
3343     if (riceLength < DRFLAC_CACHE_L1_BITS_REMAINING(bs)) {
             /* Everything is available in the current cache line. */
3344         riceParamPart = (drflac_uint32)((bs->cache & (riceParamMask >> setBitOffsetPlus1)) >> DRFLAC_CACHE_L1_SELECTION_SHIFT(bs, riceLength));
3345
3346         bs->consumedBits += riceLength;
3347         bs->cache <<= riceLength;
3348     } else {
3349         drflac_uint32 bitCountLo;
3350         drflac_cache_t resultHi;
3351
3352         bs->consumedBits += riceLength;
3353         bs->cache <<= setBitOffsetPlus1 & (DRFLAC_CACHE_L1_SIZE_BITS(bs)-1);    /* <-- Equivalent to "if (setBitOffsetPlus1 < DRFLAC_CACHE_L1_SIZE_BITS(bs)) { bs->cache <<= setBitOffsetPlus1; }" */
3354
3355         /* It straddles the cached data. It will never cover more than the next chunk. We just read the number in two parts and combine them. */
             /* consumedBits was advanced past the end of the line above, so the excess is the number of bits still owed by the next line. */
3356         bitCountLo = bs->consumedBits - DRFLAC_CACHE_L1_SIZE_BITS(bs);
3357         resultHi = DRFLAC_CACHE_L1_SELECT_AND_SHIFT(bs, riceParam);  /* <-- Use DRFLAC_CACHE_L1_SELECT_AND_SHIFT_SAFE() if ever this function allows riceParam=0. */
3358
             /* Fast path: take the next line straight from the L2 cache, updating the CRC bookkeeping unless CRC support is compiled out. */
3359         if (bs->nextL2Line < DRFLAC_CACHE_L2_LINE_COUNT(bs)) {
3360 #ifndef DR_FLAC_NO_CRC
3361             drflac__update_crc16(bs);
3362 #endif
3363             bs->cache = drflac__be2host__cache_line(bs->cacheL2[bs->nextL2Line++]);
3364             bs->consumedBits = 0;
3365 #ifndef DR_FLAC_NO_CRC
3366             bs->crc16Cache = bs->cache;
3367 #endif
3368         } else {
3369             /* Slow path. We need to fetch more data from the client. */
3370             if (!drflac__reload_cache(bs)) {
3371                 return DRFLAC_FALSE;
3372             }
3373         }
3374
             /* Combine the part saved from the old line with the first bitCountLo bits of the new one, then consume those bits. */
3375         riceParamPart = (drflac_uint32)(resultHi | DRFLAC_CACHE_L1_SELECT_AND_SHIFT_SAFE(bs, bitCountLo));
3376
3377         bs->consumedBits += bitCountLo;
3378         bs->cache <<= bitCountLo;
3379     }
3380
3381     pZeroCounterOut[0] = zeroCounter;
3382     pRiceParamPartOut[0] = riceParamPart;
3383
3384     return DRFLAC_TRUE;
3385 }
3386 #endif
3387
3388 static DRFLAC_INLINE drflac_bool32 drflac__read_rice_parts_x1(drflac_bs* bs, drflac_uint8 riceParam, drflac_uint32* pZeroCounterOut, drflac_uint32* pRiceParamPartOut)
3389 {
         /*
         Reads the two parts of one Rice-coded value from the bit stream.

         On success pZeroCounterOut[0] holds the number of zero bits in front of the first set bit, and pRiceParamPartOut[0] holds
         the bits taken from the stream starting at that set bit (riceParam + 1 bits are consumed for it). The set bit is
         therefore still part of the returned value and is left for the caller to deal with.

         Returns DRFLAC_FALSE when drflac__reload_cache() fails. In that case the local cache copy is not written back to bs,
         and pZeroCounterOut[0] may already have been written.
         */
3390     drflac_uint32  riceParamPlus1 = riceParam + 1;
3391     /*drflac_cache_t riceParamPlus1Mask  = DRFLAC_CACHE_L1_SELECTION_MASK(riceParamPlus1);*/
3392     drflac_uint32  riceParamPlus1Shift = DRFLAC_CACHE_L1_SELECTION_SHIFT(bs, riceParamPlus1);
         /* Largest consumedBits value at which riceParam + 1 more bits still fit into the current cache line. */
3393     drflac_uint32  riceParamPlus1MaxConsumedBits = DRFLAC_CACHE_L1_SIZE_BITS(bs) - riceParamPlus1;
3394
3395     /*
3396     The idea here is to use local variables for the cache in an attempt to encourage the compiler to store them in registers. I have
3397     no idea how this will work in practice...
3398     */
3399     drflac_cache_t bs_cache = bs->cache;
3400     drflac_uint32  bs_consumedBits = bs->consumedBits;
3401
3402     /* The first thing to do is find the first set bit. Most likely a bit will be set in the current cache line. */
3403     drflac_uint32  lzcount = drflac__clz(bs_cache);
3404     if (lzcount < sizeof(bs_cache)*8) {
3405         pZeroCounterOut[0] = lzcount;
3406
3407         /*
3408         It is most likely that the riceParam part (which comes after the zero counter) is also on this cache line. When extracting
3409         this, we include the set bit from the unary coded part because it simplifies cache management. This bit will be handled
3410         outside of this function at a higher level.
3411         */
             /*
             This label is also the target of the goto at the end of the function, which jumps in here once the run of zeros has been
             followed into a later cache line. At that point lzcount refers to the most recently loaded line.
             */
3412     extract_rice_param_part:
3413         bs_cache       <<= lzcount;
3414         bs_consumedBits += lzcount;
3415
3416         if (bs_consumedBits <= riceParamPlus1MaxConsumedBits) {
3417             /* Getting here means the rice parameter part is wholly contained within the current cache line. */
3418             pRiceParamPartOut[0] = (drflac_uint32)(bs_cache >> riceParamPlus1Shift);
3419             bs_cache       <<= riceParamPlus1;
3420             bs_consumedBits += riceParamPlus1;
3421         } else {
3422             drflac_uint32 riceParamPartHi;
3423             drflac_uint32 riceParamPartLo;
3424             drflac_uint32 riceParamPartLoBitCount;
3425
3426             /*
3427             Getting here means the rice parameter part straddles the cache line. We need to read from the tail of the current cache
3428             line, reload the cache, and then combine it with the head of the next cache line.
3429             */
3430
3431             /* Grab the high part of the rice parameter part. */
3432             riceParamPartHi = (drflac_uint32)(bs_cache >> riceParamPlus1Shift);
3433
3434             /* Before reloading the cache we need to grab the size in bits of the low part. */
3435             riceParamPartLoBitCount = bs_consumedBits - riceParamPlus1MaxConsumedBits;
3436             DRFLAC_ASSERT(riceParamPartLoBitCount > 0 && riceParamPartLoBitCount < 32);
3437
3438             /* Now reload the cache. */
3439             if (bs->nextL2Line < DRFLAC_CACHE_L2_LINE_COUNT(bs)) {
3440             #ifndef DR_FLAC_NO_CRC
3441                 drflac__update_crc16(bs);
3442             #endif
3443                 bs_cache = drflac__be2host__cache_line(bs->cacheL2[bs->nextL2Line++]);
                     /* The low part is counted as consumed right away. The matching shift of bs_cache happens after the two parts are combined. */
3444                 bs_consumedBits = riceParamPartLoBitCount;
3445             #ifndef DR_FLAC_NO_CRC
3446                 bs->crc16Cache = bs_cache;
3447             #endif
3448             } else {
3449                 /* Slow path. We need to fetch more data from the client. */
3450                 if (!drflac__reload_cache(bs)) {
3451                     return DRFLAC_FALSE;
3452                 }
3453
                     /* drflac__reload_cache() works on bs directly, so the local copies are refreshed from it here. */
3454                 bs_cache = bs->cache;
3455                 bs_consumedBits = bs->consumedBits + riceParamPartLoBitCount;
3456             }
3457
3458             /* We should now have enough information to construct the rice parameter part. */
3459             riceParamPartLo = (drflac_uint32)(bs_cache >> (DRFLAC_CACHE_L1_SELECTION_SHIFT(bs, riceParamPartLoBitCount)));
3460             pRiceParamPartOut[0] = riceParamPartHi | riceParamPartLo;
3461
3462             bs_cache <<= riceParamPartLoBitCount;
3463         }
3464     } else {
3465         /*
3466         Getting here means there are no bits set on the cache line. This is a less optimal case because we just wasted a call
3467         to drflac__clz() and we need to reload the cache.
3468         */
             /* Everything that is left of the current cache line is counted as zeros. */
3469         drflac_uint32 zeroCounter = (drflac_uint32)(DRFLAC_CACHE_L1_SIZE_BITS(bs) - bs_consumedBits);
             /* Keep loading cache lines until one of them contains a set bit, adding the leading zeros of every line to the count. */
3470         for (;;) {
3471             if (bs->nextL2Line < DRFLAC_CACHE_L2_LINE_COUNT(bs)) {
3472             #ifndef DR_FLAC_NO_CRC
3473                 drflac__update_crc16(bs);
3474             #endif
3475                 bs_cache = drflac__be2host__cache_line(bs->cacheL2[bs->nextL2Line++]);
3476                 bs_consumedBits = 0;
3477             #ifndef DR_FLAC_NO_CRC
3478                 bs->crc16Cache = bs_cache;
3479             #endif
3480             } else {
3481                 /* Slow path. We need to fetch more data from the client. */
3482                 if (!drflac__reload_cache(bs)) {
3483                     return DRFLAC_FALSE;
3484                 }
3485
3486                 bs_cache = bs->cache;
3487                 bs_consumedBits = bs->consumedBits;
3488             }
3489
3490             lzcount = drflac__clz(bs_cache);
3491             zeroCounter += lzcount;
3492
3493             if (lzcount < sizeof(bs_cache)*8) {
3494                 break;
3495             }
3496         }
3497
3498         pZeroCounterOut[0] = zeroCounter;
3499         goto extract_rice_param_part;
3500     }
3501
3502     /* Make sure the cache is restored at the end of it all. */
3503     bs->cache = bs_cache;
3504     bs->consumedBits = bs_consumedBits;
3505
3506     return DRFLAC_TRUE;
3507 }
3508
3509 static DRFLAC_INLINE drflac_bool32 drflac__seek_rice_parts(drflac_bs* bs, drflac_uint8 riceParam)
3510 {
         /*
         Skips over one Rice-coded value without returning it. The control flow mirrors drflac__read_rice_parts_x1() with the
         output handling removed: the run of zero bits is skipped first, then the set bit together with the riceParam bits that
         follow it.

         Returns DRFLAC_FALSE when drflac__reload_cache() fails. In that case the local cache copy is not written back to bs.
         */
3511     drflac_uint32  riceParamPlus1 = riceParam + 1;
         /* Largest consumedBits value at which riceParam + 1 more bits still fit into the current cache line. */
3512     drflac_uint32  riceParamPlus1MaxConsumedBits = DRFLAC_CACHE_L1_SIZE_BITS(bs) - riceParamPlus1;
3513
3514     /*
3515     The idea here is to use local variables for the cache in an attempt to encourage the compiler to store them in registers. I have
3516     no idea how this will work in practice...
3517     */
3518     drflac_cache_t bs_cache = bs->cache;
3519     drflac_uint32  bs_consumedBits = bs->consumedBits;
3520
3521     /* The first thing to do is find the first set bit. Most likely a bit will be set in the current cache line. */
3522     drflac_uint32  lzcount = drflac__clz(bs_cache);
3523     if (lzcount < sizeof(bs_cache)*8) {
3524         /*
3525         It is most likely that the riceParam part (which comes after the zero counter) is also on this cache line. When skipping
3526         it, the set bit from the unary coded part is skipped together with it, which is why riceParam + 1 bits are consumed
3527         below.
3528         */
             /* This label is also the target of the goto at the end of the function, used once a later cache line holds the set bit. */
3529     extract_rice_param_part:
3530         bs_cache       <<= lzcount;
3531         bs_consumedBits += lzcount;
3532
3533         if (bs_consumedBits <= riceParamPlus1MaxConsumedBits) {
3534             /* Getting here means the rice parameter part is wholly contained within the current cache line. */
3535             bs_cache       <<= riceParamPlus1;
3536             bs_consumedBits += riceParamPlus1;
3537         } else {
3538             /*
3539             Getting here means the rice parameter part straddles the cache line. Nothing has to be extracted when seeking, so we only
3540             work out how many of its bits spill into the next cache line, reload the cache, and skip those bits.
3541             */
3542
3543             /* Before reloading the cache we need to grab the size in bits of the low part. */
3544             drflac_uint32 riceParamPartLoBitCount = bs_consumedBits - riceParamPlus1MaxConsumedBits;
3545             DRFLAC_ASSERT(riceParamPartLoBitCount > 0 && riceParamPartLoBitCount < 32);
3546
3547             /* Now reload the cache. */
3548             if (bs->nextL2Line < DRFLAC_CACHE_L2_LINE_COUNT(bs)) {
3549             #ifndef DR_FLAC_NO_CRC
3550                 drflac__update_crc16(bs);
3551             #endif
3552                 bs_cache = drflac__be2host__cache_line(bs->cacheL2[bs->nextL2Line++]);
                     /* The spilled bits are counted as consumed right away. The matching shift of bs_cache follows below. */
3553                 bs_consumedBits = riceParamPartLoBitCount;
3554             #ifndef DR_FLAC_NO_CRC
3555                 bs->crc16Cache = bs_cache;
3556             #endif
3557             } else {
3558                 /* Slow path. We need to fetch more data from the client. */
3559                 if (!drflac__reload_cache(bs)) {
3560                     return DRFLAC_FALSE;
3561                 }
3562
                     /* drflac__reload_cache() works on bs directly, so the local copies are refreshed from it here. */
3563                 bs_cache = bs->cache;
3564                 bs_consumedBits = bs->consumedBits + riceParamPartLoBitCount;
3565             }
3566
3567             bs_cache <<= riceParamPartLoBitCount;
3568         }
3569     } else {
3570         /*
3571         Getting here means there are no bits set on the cache line. This is a less optimal case because we just wasted a call
3572         to drflac__clz() and we need to reload the cache.
3573         */
             /* Keep loading cache lines until one of them contains a set bit. The number of zeros is not tracked when seeking. */
3574         for (;;) {
3575             if (bs->nextL2Line < DRFLAC_CACHE_L2_LINE_COUNT(bs)) {
3576             #ifndef DR_FLAC_NO_CRC
3577                 drflac__update_crc16(bs);
3578             #endif
3579                 bs_cache = drflac__be2host__cache_line(bs->cacheL2[bs->nextL2Line++]);
3580                 bs_consumedBits = 0;
3581             #ifndef DR_FLAC_NO_CRC
3582                 bs->crc16Cache = bs_cache;
3583             #endif
3584             } else {
3585                 /* Slow path. We need to fetch more data from the client. */
3586                 if (!drflac__reload_cache(bs)) {
3587                     return DRFLAC_FALSE;
3588                 }
3589
3590                 bs_cache = bs->cache;
3591                 bs_consumedBits = bs->consumedBits;
3592             }
3593
3594             lzcount = drflac__clz(bs_cache);
3595             if (lzcount < sizeof(bs_cache)*8) {
3596                 break;
3597             }
3598         }
3599
3600         goto extract_rice_param_part;
3601     }
3602
3603     /* Make sure the cache is restored at the end of it all. */
3604     bs->cache = bs_cache;
3605     bs->consumedBits = bs_consumedBits;
3606
3607     return DRFLAC_TRUE;
3608 }
3609
3610
3611 static drflac_bool32 drflac__decode_samples_with_residual__rice__scalar_zeroorder(drflac_bs* bs, drflac_uint32 bitsPerSample, drflac_uint32 count, drflac_uint8 riceParam, drflac_uint32 order, drflac_int32 shift, const drflac_int32* coefficients, drflac_int32* pSamplesOut)
3612 {
3613     /* No prediction is calculated here: every decoded Rice value is stored to pSamplesOut as-is. */
3614     const drflac_uint32 signFlip[2] = {0x00000000, 0xFFFFFFFF};
3615     const drflac_uint32 lowBitsMask = (drflac_uint32)~((~0UL) << riceParam);
3616     drflac_uint32 leadingZeros;
3617     drflac_uint32 lowBits;
3618     drflac_uint32 folded;
3619     drflac_uint32 iSample;
3620
3621     DRFLAC_ASSERT(bs != NULL);
3622     DRFLAC_ASSERT(pSamplesOut != NULL);
3623
3624     /* Not used by this variant. */
3625     (void)bitsPerSample;
3626     (void)order;
3627     (void)shift;
3628     (void)coefficients;
3629
3630     for (iSample = 0; iSample < count; iSample += 1) {
3631         /* Fetch the zero count and the low bits of the next code. */
3632         if (!drflac__read_rice_parts_x1(bs, riceParam, &leadingZeros, &lowBits)) {
3633             return DRFLAC_FALSE;
3634         }
3635
3636         /*
3637         drflac__read_rice_parts_x1() leaves the terminating set bit above the low bits, so it is masked off before the
3638         zero count is placed above the riceParam low bits.
3639         */
3640         folded = (lowBits & lowBitsMask) | (leadingZeros << riceParam);
3641
3642         /* Undo the sign folding: even values become n >> 1, odd values become ~(n >> 1). */
3643         pSamplesOut[iSample] = (folded >> 1) ^ signFlip[folded & 0x01];
3644     }
3645
3646     return DRFLAC_TRUE;
3647 }
3648
3649 static drflac_bool32 drflac__decode_samples_with_residual__rice__scalar(drflac_bs* bs, drflac_uint32 bitsPerSample, drflac_uint32 count, drflac_uint8 riceParam, drflac_uint32 order, drflac_int32 shift, const drflac_int32* coefficients, drflac_int32* pSamplesOut)
3650 {
3651     drflac_uint32 t[2] = {0x00000000, 0xFFFFFFFF};
3652     drflac_uint32 zeroCountPart0 = 0;
3653     drflac_uint32 zeroCountPart1 = 0;
3654     drflac_uint32 zeroCountPart2 = 0;
3655     drflac_uint32 zeroCountPart3 = 0;
3656     drflac_uint32 riceParamPart0 = 0;
3657     drflac_uint32 riceParamPart1 = 0;
3658     drflac_uint32 riceParamPart2 = 0;
3659     drflac_uint32 riceParamPart3 = 0;
3660     drflac_uint32 riceParamMask;
3661     const drflac_int32* pSamplesOutEnd;
3662     drflac_uint32 i;
3663
3664     DRFLAC_ASSERT(bs != NULL);
3665     DRFLAC_ASSERT(pSamplesOut != NULL);
3666
3667     if (order == 0) {
3668         return drflac__decode_samples_with_residual__rice__scalar_zeroorder(bs, bitsPerSample, count, riceParam, order, shift, coefficients, pSamplesOut);
3669     }
3670
3671     riceParamMask  = (drflac_uint32)~((~0UL) << riceParam);
3672     pSamplesOutEnd = pSamplesOut + (count & ~3);
3673
3674     if (bitsPerSample+shift > 32) {
3675         while (pSamplesOut < pSamplesOutEnd) {
3676             /*
3677             Rice extraction. It's faster to do this one at a time against local variables than it is to use the x4 version
3678             against an array. Not sure why, but perhaps it's making more efficient use of registers?
3679             */
3680             if (!drflac__read_rice_parts_x1(bs, riceParam, &zeroCountPart0, &riceParamPart0) ||
3681                 !drflac__read_rice_parts_x1(bs, riceParam, &zeroCountPart1, &riceParamPart1) ||
3682                 !drflac__read_rice_parts_x1(bs, riceParam, &zeroCountPart2, &riceParamPart2) ||
3683                 !drflac__read_rice_parts_x1(bs, riceParam, &zeroCountPart3, &riceParamPart3)) {
3684                 return DRFLAC_FALSE;
3685             }
3686
3687             riceParamPart0 &= riceParamMask;
3688             riceParamPart1 &= riceParamMask;
3689             riceParamPart2 &= riceParamMask;
3690             riceParamPart3 &= riceParamMask;
3691
3692             riceParamPart0 |= (zeroCountPart0 << riceParam);
3693             riceParamPart1 |= (zeroCountPart1 << riceParam);
3694             riceParamPart2 |= (zeroCountPart2 << riceParam);
3695             riceParamPart3 |= (zeroCountPart3 << riceParam);
3696
3697             riceParamPart0  = (riceParamPart0 >> 1) ^ t[riceParamPart0 & 0x01];
3698             riceParamPart1  = (riceParamPart1 >> 1) ^ t[riceParamPart1 & 0x01];
3699             riceParamPart2  = (riceParamPart2 >> 1) ^ t[riceParamPart2 & 0x01];
3700             riceParamPart3  = (riceParamPart3 >> 1) ^ t[riceParamPart3 & 0x01];
3701
3702             pSamplesOut[0] = riceParamPart0 + drflac__calculate_prediction_64(order, shift, coefficients, pSamplesOut + 0);
3703             pSamplesOut[1] = riceParamPart1 + drflac__calculate_prediction_64(order, shift, coefficients, pSamplesOut + 1);
3704             pSamplesOut[2] = riceParamPart2 + drflac__calculate_prediction_64(order, shift, coefficients, pSamplesOut + 2);
3705             pSamplesOut[3] = riceParamPart3 + drflac__calculate_prediction_64(order, shift, coefficients, pSamplesOut + 3);
3706
3707             pSamplesOut += 4;
3708         }
3709     } else {
3710         while (pSamplesOut < pSamplesOutEnd) {
3711             if (!drflac__read_rice_parts_x1(bs, riceParam, &zeroCountPart0, &riceParamPart0) ||
3712                 !drflac__read_rice_parts_x1(bs, riceParam, &zeroCountPart1, &riceParamPart1) ||
3713                 !drflac__read_rice_parts_x1(bs, riceParam, &zeroCountPart2, &riceParamPart2) ||
3714                 !drflac__read_rice_parts_x1(bs, riceParam, &zeroCountPart3, &riceParamPart3)) {
3715                 return DRFLAC_FALSE;
3716             }
3717
3718             riceParamPart0 &= riceParamMask;
3719             riceParamPart1 &= riceParamMask;
3720             riceParamPart2 &= riceParamMask;
3721             riceParamPart3 &= riceParamMask;
3722
3723             riceParamPart0 |= (zeroCountPart0 << riceParam);
3724             riceParamPart1 |= (zeroCountPart1 << riceParam);
3725             riceParamPart2 |= (zeroCountPart2 << riceParam);
3726             riceParamPart3 |= (zeroCountPart3 << riceParam);
3727
3728             riceParamPart0  = (riceParamPart0 >> 1) ^ t[riceParamPart0 & 0x01];
3729             riceParamPart1  = (riceParamPart1 >> 1) ^ t[riceParamPart1 & 0x01];
3730             riceParamPart2  = (riceParamPart2 >> 1) ^ t[riceParamPart2 & 0x01];
3731             riceParamPart3  = (riceParamPart3 >> 1) ^ t[riceParamPart3 & 0x01];
3732
3733             pSamplesOut[0] = riceParamPart0 + drflac__calculate_prediction_32(order, shift, coefficients, pSamplesOut + 0);
3734             pSamplesOut[1] = riceParamPart1 + drflac__calculate_prediction_32(order, shift, coefficients, pSamplesOut + 1);
3735             pSamplesOut[2] = riceParamPart2 + drflac__calculate_prediction_32(order, shift, coefficients, pSamplesOut + 2);
3736             pSamplesOut[3] = riceParamPart3 + drflac__calculate_prediction_32(order, shift, coefficients, pSamplesOut + 3);
3737
3738             pSamplesOut += 4;
3739         }
3740     }
3741
3742     i = (count & ~3);
3743     while (i < count) {
3744         /* Rice extraction. */
3745         if (!drflac__read_rice_parts_x1(bs, riceParam, &zeroCountPart0, &riceParamPart0)) {
3746             return DRFLAC_FALSE;
3747         }
3748
3749         /* Rice reconstruction. */
3750         riceParamPart0 &= riceParamMask;
3751         riceParamPart0 |= (zeroCountPart0 << riceParam);
3752         riceParamPart0  = (riceParamPart0 >> 1) ^ t[riceParamPart0 & 0x01];
3753         /*riceParamPart0  = (riceParamPart0 >> 1) ^ (~(riceParamPart0 & 0x01) + 1);*/
3754
3755         /* Sample reconstruction. */
3756         if (bitsPerSample+shift > 32) {
3757             pSamplesOut[0] = riceParamPart0 + drflac__calculate_prediction_64(order, shift, coefficients, pSamplesOut + 0);
3758         } else {
3759             pSamplesOut[0] = riceParamPart0 + drflac__calculate_prediction_32(order, shift, coefficients, pSamplesOut + 0);
3760         }
3761
3762         i += 1;
3763         pSamplesOut += 1;
3764     }
3765
3766     return DRFLAC_TRUE;
3767 }
3768
3769 #if defined(DRFLAC_SUPPORT_SSE2)
3770 static DRFLAC_INLINE __m128i drflac__mm_packs_interleaved_epi32(__m128i a, __m128i b)
3771 {
3772     __m128i r;
3773
3774     /* Pack. */
3775     r = _mm_packs_epi32(a, b);
3776
3777     /* a3a2 a1a0 b3b2 b1b0 -> a3a2 b3b2 a1a0 b1b0 */
3778     r = _mm_shuffle_epi32(r, _MM_SHUFFLE(3, 1, 2, 0));
3779
3780     /* a3a2 b3b2 a1a0 b1b0 -> a3b3 a2b2 a1b1 a0b0 */
3781     r = _mm_shufflehi_epi16(r, _MM_SHUFFLE(3, 1, 2, 0));
3782     r = _mm_shufflelo_epi16(r, _MM_SHUFFLE(3, 1, 2, 0));
3783
3784     return r;
3785 }
3786 #endif
3787
3788 #if defined(DRFLAC_SUPPORT_SSE41)
3789 static DRFLAC_INLINE __m128i drflac__mm_not_si128(__m128i a)
3790 {
3791     return _mm_xor_si128(a, _mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128()));
3792 }
3793
3794 static DRFLAC_INLINE __m128i drflac__mm_hadd_epi32(__m128i x)
3795 {
3796     __m128i x64 = _mm_add_epi32(x, _mm_shuffle_epi32(x, _MM_SHUFFLE(1, 0, 3, 2)));
3797     __m128i x32 = _mm_shufflelo_epi16(x64, _MM_SHUFFLE(1, 0, 3, 2));
3798     return _mm_add_epi32(x64, x32);
3799 }
3800
3801 static DRFLAC_INLINE __m128i drflac__mm_hadd_epi64(__m128i x)
3802 {
3803     return _mm_add_epi64(x, _mm_shuffle_epi32(x, _MM_SHUFFLE(1, 0, 3, 2)));
3804 }
3805
3806 static DRFLAC_INLINE __m128i drflac__mm_srai_epi64(__m128i x, int count)
3807 {
3808     /*
3809     To simplify this we are assuming count < 32. This restriction allows us to work on a low side and a high side. The low side
3810     is shifted with zero bits, whereas the right side is shifted with sign bits.
3811     */
3812     __m128i lo = _mm_srli_epi64(x, count);
3813     __m128i hi = _mm_srai_epi32(x, count);
3814
3815     hi = _mm_and_si128(hi, _mm_set_epi32(0xFFFFFFFF, 0, 0xFFFFFFFF, 0));    /* The high part needs to have the low part cleared. */
3816
3817     return _mm_or_si128(lo, hi);
3818 }
3819
3820 static drflac_bool32 drflac__decode_samples_with_residual__rice__sse41_32(drflac_bs* bs, drflac_uint32 count, drflac_uint8 riceParam, drflac_uint32 order, drflac_int32 shift, const drflac_int32* coefficients, drflac_int32* pSamplesOut)
3821 {
3822     int i;
3823     drflac_uint32 riceParamMask;
3824     drflac_int32* pDecodedSamples    = pSamplesOut;
3825     drflac_int32* pDecodedSamplesEnd = pSamplesOut + (count & ~3);
3826     drflac_uint32 zeroCountParts0 = 0;
3827     drflac_uint32 zeroCountParts1 = 0;
3828     drflac_uint32 zeroCountParts2 = 0;
3829     drflac_uint32 zeroCountParts3 = 0;
3830     drflac_uint32 riceParamParts0 = 0;
3831     drflac_uint32 riceParamParts1 = 0;
3832     drflac_uint32 riceParamParts2 = 0;
3833     drflac_uint32 riceParamParts3 = 0;
3834     __m128i coefficients128_0;
3835     __m128i coefficients128_4;
3836     __m128i coefficients128_8;
3837     __m128i samples128_0;
3838     __m128i samples128_4;
3839     __m128i samples128_8;
3840     __m128i riceParamMask128;
3841
3842     const drflac_uint32 t[2] = {0x00000000, 0xFFFFFFFF};
3843
3844     riceParamMask    = (drflac_uint32)~((~0UL) << riceParam);
3845     riceParamMask128 = _mm_set1_epi32(riceParamMask);
3846
3847     /* Pre-load. */
3848     coefficients128_0 = _mm_setzero_si128();
3849     coefficients128_4 = _mm_setzero_si128();
3850     coefficients128_8 = _mm_setzero_si128();
3851
3852     samples128_0 = _mm_setzero_si128();
3853     samples128_4 = _mm_setzero_si128();
3854     samples128_8 = _mm_setzero_si128();
3855
3856     /*
3857     Pre-loading the coefficients and prior samples is annoying because we need to ensure we don't try reading more than
3858     what's available in the input buffers. It would be convenient to use a fall-through switch to do this, but this results
3859     in strict aliasing warnings with GCC. To work around this I'm just doing something hacky. This feels a bit convoluted
3860     so I think there's opportunity for this to be simplified.
3861     */
3862 #if 1
3863     {
3864         int runningOrder = order;
3865
3866         /* 0 - 3. */
3867         if (runningOrder >= 4) {
3868             coefficients128_0 = _mm_loadu_si128((const __m128i*)(coefficients + 0));
3869             samples128_0      = _mm_loadu_si128((const __m128i*)(pSamplesOut  - 4));
3870             runningOrder -= 4;
3871         } else {
3872             switch (runningOrder) {
3873                 case 3: coefficients128_0 = _mm_set_epi32(0, coefficients[2], coefficients[1], coefficients[0]); samples128_0 = _mm_set_epi32(pSamplesOut[-1], pSamplesOut[-2], pSamplesOut[-3], 0); break;
3874                 case 2: coefficients128_0 = _mm_set_epi32(0, 0,               coefficients[1], coefficients[0]); samples128_0 = _mm_set_epi32(pSamplesOut[-1], pSamplesOut[-2], 0,               0); break;
3875                 case 1: coefficients128_0 = _mm_set_epi32(0, 0,               0,               coefficients[0]); samples128_0 = _mm_set_epi32(pSamplesOut[-1], 0,               0,               0); break;
3876             }
3877             runningOrder = 0;
3878         }
3879
3880         /* 4 - 7 */
3881         if (runningOrder >= 4) {
3882             coefficients128_4 = _mm_loadu_si128((const __m128i*)(coefficients + 4));
3883             samples128_4      = _mm_loadu_si128((const __m128i*)(pSamplesOut  - 8));
3884             runningOrder -= 4;
3885         } else {
3886             switch (runningOrder) {
3887                 case 3: coefficients128_4 = _mm_set_epi32(0, coefficients[6], coefficients[5], coefficients[4]); samples128_4 = _mm_set_epi32(pSamplesOut[-5], pSamplesOut[-6], pSamplesOut[-7], 0); break;
3888                 case 2: coefficients128_4 = _mm_set_epi32(0, 0,               coefficients[5], coefficients[4]); samples128_4 = _mm_set_epi32(pSamplesOut[-5], pSamplesOut[-6], 0,               0); break;
3889                 case 1: coefficients128_4 = _mm_set_epi32(0, 0,               0,               coefficients[4]); samples128_4 = _mm_set_epi32(pSamplesOut[-5], 0,               0,               0); break;
3890             }
3891             runningOrder = 0;
3892         }
3893
3894         /* 8 - 11 */
3895         if (runningOrder == 4) {
3896             coefficients128_8 = _mm_loadu_si128((const __m128i*)(coefficients + 8));
3897             samples128_8      = _mm_loadu_si128((const __m128i*)(pSamplesOut  - 12));
3898             runningOrder -= 4;
3899         } else {
3900             switch (runningOrder) {
3901                 case 3: coefficients128_8 = _mm_set_epi32(0, coefficients[10], coefficients[9], coefficients[8]); samples128_8 = _mm_set_epi32(pSamplesOut[-9], pSamplesOut[-10], pSamplesOut[-11], 0); break;
3902                 case 2: coefficients128_8 = _mm_set_epi32(0, 0,                coefficients[9], coefficients[8]); samples128_8 = _mm_set_epi32(pSamplesOut[-9], pSamplesOut[-10], 0,                0); break;
3903                 case 1: coefficients128_8 = _mm_set_epi32(0, 0,                0,               coefficients[8]); samples128_8 = _mm_set_epi32(pSamplesOut[-9], 0,                0,                0); break;
3904             }
3905             runningOrder = 0;
3906         }
3907
3908         /* Coefficients need to be shuffled for our streaming algorithm below to work. Samples are already in the correct order from the loading routine above. */
3909         coefficients128_0 = _mm_shuffle_epi32(coefficients128_0, _MM_SHUFFLE(0, 1, 2, 3));
3910         coefficients128_4 = _mm_shuffle_epi32(coefficients128_4, _MM_SHUFFLE(0, 1, 2, 3));
3911         coefficients128_8 = _mm_shuffle_epi32(coefficients128_8, _MM_SHUFFLE(0, 1, 2, 3));
3912     }
3913 #else
3914     /* This causes strict-aliasing warnings with GCC. */
3915     switch (order)
3916     {
3917     case 12: ((drflac_int32*)&coefficients128_8)[0] = coefficients[11]; ((drflac_int32*)&samples128_8)[0] = pDecodedSamples[-12];
3918     case 11: ((drflac_int32*)&coefficients128_8)[1] = coefficients[10]; ((drflac_int32*)&samples128_8)[1] = pDecodedSamples[-11];
3919     case 10: ((drflac_int32*)&coefficients128_8)[2] = coefficients[ 9]; ((drflac_int32*)&samples128_8)[2] = pDecodedSamples[-10];
3920     case 9:  ((drflac_int32*)&coefficients128_8)[3] = coefficients[ 8]; ((drflac_int32*)&samples128_8)[3] = pDecodedSamples[- 9];
3921     case 8:  ((drflac_int32*)&coefficients128_4)[0] = coefficients[ 7]; ((drflac_int32*)&samples128_4)[0] = pDecodedSamples[- 8];
3922     case 7:  ((drflac_int32*)&coefficients128_4)[1] = coefficients[ 6]; ((drflac_int32*)&samples128_4)[1] = pDecodedSamples[- 7];
3923     case 6:  ((drflac_int32*)&coefficients128_4)[2] = coefficients[ 5]; ((drflac_int32*)&samples128_4)[2] = pDecodedSamples[- 6];
3924     case 5:  ((drflac_int32*)&coefficients128_4)[3] = coefficients[ 4]; ((drflac_int32*)&samples128_4)[3] = pDecodedSamples[- 5];
3925     case 4:  ((drflac_int32*)&coefficients128_0)[0] = coefficients[ 3]; ((drflac_int32*)&samples128_0)[0] = pDecodedSamples[- 4];
3926     case 3:  ((drflac_int32*)&coefficients128_0)[1] = coefficients[ 2]; ((drflac_int32*)&samples128_0)[1] = pDecodedSamples[- 3];
3927     case 2:  ((drflac_int32*)&coefficients128_0)[2] = coefficients[ 1]; ((drflac_int32*)&samples128_0)[2] = pDecodedSamples[- 2];
3928     case 1:  ((drflac_int32*)&coefficients128_0)[3] = coefficients[ 0]; ((drflac_int32*)&samples128_0)[3] = pDecodedSamples[- 1];
3929     }
3930 #endif
3931
3932     /* For this version we are doing one sample at a time. */
3933     while (pDecodedSamples < pDecodedSamplesEnd) {
3934         __m128i prediction128;
3935         __m128i zeroCountPart128;
3936         __m128i riceParamPart128;
3937
3938         if (!drflac__read_rice_parts_x1(bs, riceParam, &zeroCountParts0, &riceParamParts0) ||
3939             !drflac__read_rice_parts_x1(bs, riceParam, &zeroCountParts1, &riceParamParts1) ||
3940             !drflac__read_rice_parts_x1(bs, riceParam, &zeroCountParts2, &riceParamParts2) ||
3941             !drflac__read_rice_parts_x1(bs, riceParam, &zeroCountParts3, &riceParamParts3)) {
3942             return DRFLAC_FALSE;
3943         }
3944
3945         zeroCountPart128 = _mm_set_epi32(zeroCountParts3, zeroCountParts2, zeroCountParts1, zeroCountParts0);
3946         riceParamPart128 = _mm_set_epi32(riceParamParts3, riceParamParts2, riceParamParts1, riceParamParts0);
3947
3948         riceParamPart128 = _mm_and_si128(riceParamPart128, riceParamMask128);
3949         riceParamPart128 = _mm_or_si128(riceParamPart128, _mm_slli_epi32(zeroCountPart128, riceParam));
3950         riceParamPart128 = _mm_xor_si128(_mm_srli_epi32(riceParamPart128, 1), _mm_add_epi32(drflac__mm_not_si128(_mm_and_si128(riceParamPart128, _mm_set1_epi32(0x01))), _mm_set1_epi32(0x01)));  /* <-- SSE2 compatible */
3951         /*riceParamPart128 = _mm_xor_si128(_mm_srli_epi32(riceParamPart128, 1), _mm_mullo_epi32(_mm_and_si128(riceParamPart128, _mm_set1_epi32(0x01)), _mm_set1_epi32(0xFFFFFFFF)));*/   /* <-- Only supported from SSE4.1 and is slower in my testing... */
3952
3953         if (order <= 4) {
3954             for (i = 0; i < 4; i += 1) {
3955                 prediction128 = _mm_mullo_epi32(coefficients128_0, samples128_0);
3956
3957                 /* Horizontal add and shift. */
3958                 prediction128 = drflac__mm_hadd_epi32(prediction128);
3959                 prediction128 = _mm_srai_epi32(prediction128, shift);
3960                 prediction128 = _mm_add_epi32(riceParamPart128, prediction128);
3961
3962                 samples128_0 = _mm_alignr_epi8(prediction128, samples128_0, 4);
3963                 riceParamPart128 = _mm_alignr_epi8(_mm_setzero_si128(), riceParamPart128, 4);
3964             }
3965         } else if (order <= 8) {
3966             for (i = 0; i < 4; i += 1) {
3967                 prediction128 =                              _mm_mullo_epi32(coefficients128_4, samples128_4);
3968                 prediction128 = _mm_add_epi32(prediction128, _mm_mullo_epi32(coefficients128_0, samples128_0));
3969
3970                 /* Horizontal add and shift. */
3971                 prediction128 = drflac__mm_hadd_epi32(prediction128);
3972                 prediction128 = _mm_srai_epi32(prediction128, shift);
3973                 prediction128 = _mm_add_epi32(riceParamPart128, prediction128);
3974
3975                 samples128_4 = _mm_alignr_epi8(samples128_0,  samples128_4, 4);
3976                 samples128_0 = _mm_alignr_epi8(prediction128, samples128_0, 4);
3977                 riceParamPart128 = _mm_alignr_epi8(_mm_setzero_si128(), riceParamPart128, 4);
3978             }
3979         } else {
3980             for (i = 0; i < 4; i += 1) {
3981                 prediction128 =                              _mm_mullo_epi32(coefficients128_8, samples128_8);
3982                 prediction128 = _mm_add_epi32(prediction128, _mm_mullo_epi32(coefficients128_4, samples128_4));
3983                 prediction128 = _mm_add_epi32(prediction128, _mm_mullo_epi32(coefficients128_0, samples128_0));
3984
3985                 /* Horizontal add and shift. */
3986                 prediction128 = drflac__mm_hadd_epi32(prediction128);
3987                 prediction128 = _mm_srai_epi32(prediction128, shift);
3988                 prediction128 = _mm_add_epi32(riceParamPart128, prediction128);
3989
3990                 samples128_8 = _mm_alignr_epi8(samples128_4,  samples128_8, 4);
3991                 samples128_4 = _mm_alignr_epi8(samples128_0,  samples128_4, 4);
3992                 samples128_0 = _mm_alignr_epi8(prediction128, samples128_0, 4);
3993                 riceParamPart128 = _mm_alignr_epi8(_mm_setzero_si128(), riceParamPart128, 4);
3994             }
3995         }
3996
3997         /* We store samples in groups of 4. */
3998         _mm_storeu_si128((__m128i*)pDecodedSamples, samples128_0);
3999         pDecodedSamples += 4;
4000     }
4001
4002     /* Make sure we process the last few samples. */
4003     i = (count & ~3);
4004     while (i < (int)count) {
4005         /* Rice extraction. */
4006         if (!drflac__read_rice_parts_x1(bs, riceParam, &zeroCountParts0, &riceParamParts0)) {
4007             return DRFLAC_FALSE;
4008         }
4009
4010         /* Rice reconstruction. */
4011         riceParamParts0 &= riceParamMask;
4012         riceParamParts0 |= (zeroCountParts0 << riceParam);
4013         riceParamParts0  = (riceParamParts0 >> 1) ^ t[riceParamParts0 & 0x01];
4014
4015         /* Sample reconstruction. */
4016         pDecodedSamples[0] = riceParamParts0 + drflac__calculate_prediction_32(order, shift, coefficients, pDecodedSamples);
4017
4018         i += 1;
4019         pDecodedSamples += 1;
4020     }
4021
4022     return DRFLAC_TRUE;
4023 }
4024
4025 static drflac_bool32 drflac__decode_samples_with_residual__rice__sse41_64(drflac_bs* bs, drflac_uint32 count, drflac_uint8 riceParam, drflac_uint32 order, drflac_int32 shift, const drflac_int32* coefficients, drflac_int32* pSamplesOut)
4026 {
4027     int i;
4028     drflac_uint32 riceParamMask;
4029     drflac_int32* pDecodedSamples    = pSamplesOut;
4030     drflac_int32* pDecodedSamplesEnd = pSamplesOut + (count & ~3);
4031     drflac_uint32 zeroCountParts0 = 0;
4032     drflac_uint32 zeroCountParts1 = 0;
4033     drflac_uint32 zeroCountParts2 = 0;
4034     drflac_uint32 zeroCountParts3 = 0;
4035     drflac_uint32 riceParamParts0 = 0;
4036     drflac_uint32 riceParamParts1 = 0;
4037     drflac_uint32 riceParamParts2 = 0;
4038     drflac_uint32 riceParamParts3 = 0;
4039     __m128i coefficients128_0;
4040     __m128i coefficients128_4;
4041     __m128i coefficients128_8;
4042     __m128i samples128_0;
4043     __m128i samples128_4;
4044     __m128i samples128_8;
4045     __m128i prediction128;
4046     __m128i riceParamMask128;
4047
4048     const drflac_uint32 t[2] = {0x00000000, 0xFFFFFFFF};
4049
4050     DRFLAC_ASSERT(order <= 12);
4051
4052     riceParamMask    = (drflac_uint32)~((~0UL) << riceParam);
4053     riceParamMask128 = _mm_set1_epi32(riceParamMask);
4054
4055     prediction128 = _mm_setzero_si128();
4056
4057     /* Pre-load. */
4058     coefficients128_0  = _mm_setzero_si128();
4059     coefficients128_4  = _mm_setzero_si128();
4060     coefficients128_8  = _mm_setzero_si128();
4061
4062     samples128_0  = _mm_setzero_si128();
4063     samples128_4  = _mm_setzero_si128();
4064     samples128_8  = _mm_setzero_si128();
4065
4066 #if 1
4067     {
4068         int runningOrder = order;
4069
4070         /* 0 - 3. */
4071         if (runningOrder >= 4) {
4072             coefficients128_0 = _mm_loadu_si128((const __m128i*)(coefficients + 0));
4073             samples128_0      = _mm_loadu_si128((const __m128i*)(pSamplesOut  - 4));
4074             runningOrder -= 4;
4075         } else {
4076             switch (runningOrder) {
4077                 case 3: coefficients128_0 = _mm_set_epi32(0, coefficients[2], coefficients[1], coefficients[0]); samples128_0 = _mm_set_epi32(pSamplesOut[-1], pSamplesOut[-2], pSamplesOut[-3], 0); break;
4078                 case 2: coefficients128_0 = _mm_set_epi32(0, 0,               coefficients[1], coefficients[0]); samples128_0 = _mm_set_epi32(pSamplesOut[-1], pSamplesOut[-2], 0,               0); break;
4079                 case 1: coefficients128_0 = _mm_set_epi32(0, 0,               0,               coefficients[0]); samples128_0 = _mm_set_epi32(pSamplesOut[-1], 0,               0,               0); break;
4080             }
4081             runningOrder = 0;
4082         }
4083
4084         /* 4 - 7 */
4085         if (runningOrder >= 4) {
4086             coefficients128_4 = _mm_loadu_si128((const __m128i*)(coefficients + 4));
4087             samples128_4      = _mm_loadu_si128((const __m128i*)(pSamplesOut  - 8));
4088             runningOrder -= 4;
4089         } else {
4090             switch (runningOrder) {
4091                 case 3: coefficients128_4 = _mm_set_epi32(0, coefficients[6], coefficients[5], coefficients[4]); samples128_4 = _mm_set_epi32(pSamplesOut[-5], pSamplesOut[-6], pSamplesOut[-7], 0); break;
4092                 case 2: coefficients128_4 = _mm_set_epi32(0, 0,               coefficients[5], coefficients[4]); samples128_4 = _mm_set_epi32(pSamplesOut[-5], pSamplesOut[-6], 0,               0); break;
4093                 case 1: coefficients128_4 = _mm_set_epi32(0, 0,               0,               coefficients[4]); samples128_4 = _mm_set_epi32(pSamplesOut[-5], 0,               0,               0); break;
4094             }
4095             runningOrder = 0;
4096         }
4097
4098         /* 8 - 11 */
4099         if (runningOrder == 4) {
4100             coefficients128_8 = _mm_loadu_si128((const __m128i*)(coefficients + 8));
4101             samples128_8      = _mm_loadu_si128((const __m128i*)(pSamplesOut  - 12));
4102             runningOrder -= 4;
4103         } else {
4104             switch (runningOrder) {
4105                 case 3: coefficients128_8 = _mm_set_epi32(0, coefficients[10], coefficients[9], coefficients[8]); samples128_8 = _mm_set_epi32(pSamplesOut[-9], pSamplesOut[-10], pSamplesOut[-11], 0); break;
4106                 case 2: coefficients128_8 = _mm_set_epi32(0, 0,                coefficients[9], coefficients[8]); samples128_8 = _mm_set_epi32(pSamplesOut[-9], pSamplesOut[-10], 0,                0); break;
4107                 case 1: coefficients128_8 = _mm_set_epi32(0, 0,                0,               coefficients[8]); samples128_8 = _mm_set_epi32(pSamplesOut[-9], 0,                0,                0); break;
4108             }
4109             runningOrder = 0;
4110         }
4111
4112         /* Coefficients need to be shuffled for our streaming algorithm below to work. Samples are already in the correct order from the loading routine above. */
4113         coefficients128_0 = _mm_shuffle_epi32(coefficients128_0, _MM_SHUFFLE(0, 1, 2, 3));
4114         coefficients128_4 = _mm_shuffle_epi32(coefficients128_4, _MM_SHUFFLE(0, 1, 2, 3));
4115         coefficients128_8 = _mm_shuffle_epi32(coefficients128_8, _MM_SHUFFLE(0, 1, 2, 3));
4116     }
4117 #else
4118     switch (order)
4119     {
4120     case 12: ((drflac_int32*)&coefficients128_8)[0] = coefficients[11]; ((drflac_int32*)&samples128_8)[0] = pDecodedSamples[-12];
4121     case 11: ((drflac_int32*)&coefficients128_8)[1] = coefficients[10]; ((drflac_int32*)&samples128_8)[1] = pDecodedSamples[-11];
4122     case 10: ((drflac_int32*)&coefficients128_8)[2] = coefficients[ 9]; ((drflac_int32*)&samples128_8)[2] = pDecodedSamples[-10];
4123     case 9:  ((drflac_int32*)&coefficients128_8)[3] = coefficients[ 8]; ((drflac_int32*)&samples128_8)[3] = pDecodedSamples[- 9];
4124     case 8:  ((drflac_int32*)&coefficients128_4)[0] = coefficients[ 7]; ((drflac_int32*)&samples128_4)[0] = pDecodedSamples[- 8];
4125     case 7:  ((drflac_int32*)&coefficients128_4)[1] = coefficients[ 6]; ((drflac_int32*)&samples128_4)[1] = pDecodedSamples[- 7];
4126     case 6:  ((drflac_int32*)&coefficients128_4)[2] = coefficients[ 5]; ((drflac_int32*)&samples128_4)[2] = pDecodedSamples[- 6];
4127     case 5:  ((drflac_int32*)&coefficients128_4)[3] = coefficients[ 4]; ((drflac_int32*)&samples128_4)[3] = pDecodedSamples[- 5];
4128     case 4:  ((drflac_int32*)&coefficients128_0)[0] = coefficients[ 3]; ((drflac_int32*)&samples128_0)[0] = pDecodedSamples[- 4];
4129     case 3:  ((drflac_int32*)&coefficients128_0)[1] = coefficients[ 2]; ((drflac_int32*)&samples128_0)[1] = pDecodedSamples[- 3];
4130     case 2:  ((drflac_int32*)&coefficients128_0)[2] = coefficients[ 1]; ((drflac_int32*)&samples128_0)[2] = pDecodedSamples[- 2];
4131     case 1:  ((drflac_int32*)&coefficients128_0)[3] = coefficients[ 0]; ((drflac_int32*)&samples128_0)[3] = pDecodedSamples[- 1];
4132     }
4133 #endif
4134
4135     /* For this version we are doing one sample at a time. */
4136     while (pDecodedSamples < pDecodedSamplesEnd) {
4137         __m128i zeroCountPart128;
4138         __m128i riceParamPart128;
4139
4140         if (!drflac__read_rice_parts_x1(bs, riceParam, &zeroCountParts0, &riceParamParts0) ||
4141             !drflac__read_rice_parts_x1(bs, riceParam, &zeroCountParts1, &riceParamParts1) ||
4142             !drflac__read_rice_parts_x1(bs, riceParam, &zeroCountParts2, &riceParamParts2) ||
4143             !drflac__read_rice_parts_x1(bs, riceParam, &zeroCountParts3, &riceParamParts3)) {
4144             return DRFLAC_FALSE;
4145         }
4146
4147         zeroCountPart128 = _mm_set_epi32(zeroCountParts3, zeroCountParts2, zeroCountParts1, zeroCountParts0);
4148         riceParamPart128 = _mm_set_epi32(riceParamParts3, riceParamParts2, riceParamParts1, riceParamParts0);
4149
4150         riceParamPart128 = _mm_and_si128(riceParamPart128, riceParamMask128);
4151         riceParamPart128 = _mm_or_si128(riceParamPart128, _mm_slli_epi32(zeroCountPart128, riceParam));
4152         riceParamPart128 = _mm_xor_si128(_mm_srli_epi32(riceParamPart128, 1), _mm_add_epi32(drflac__mm_not_si128(_mm_and_si128(riceParamPart128, _mm_set1_epi32(1))), _mm_set1_epi32(1)));
4153
4154         for (i = 0; i < 4; i += 1) {
4155             prediction128 = _mm_xor_si128(prediction128, prediction128);    /* Reset to 0. */
4156
4157             switch (order)
4158             {
4159             case 12:
4160             case 11: prediction128 = _mm_add_epi64(prediction128, _mm_mul_epi32(_mm_shuffle_epi32(coefficients128_8, _MM_SHUFFLE(1, 1, 0, 0)), _mm_shuffle_epi32(samples128_8, _MM_SHUFFLE(1, 1, 0, 0))));
4161             case 10:
4162             case  9: prediction128 = _mm_add_epi64(prediction128, _mm_mul_epi32(_mm_shuffle_epi32(coefficients128_8, _MM_SHUFFLE(3, 3, 2, 2)), _mm_shuffle_epi32(samples128_8, _MM_SHUFFLE(3, 3, 2, 2))));
4163             case  8:
4164             case  7: prediction128 = _mm_add_epi64(prediction128, _mm_mul_epi32(_mm_shuffle_epi32(coefficients128_4, _MM_SHUFFLE(1, 1, 0, 0)), _mm_shuffle_epi32(samples128_4, _MM_SHUFFLE(1, 1, 0, 0))));
4165             case  6:
4166             case  5: prediction128 = _mm_add_epi64(prediction128, _mm_mul_epi32(_mm_shuffle_epi32(coefficients128_4, _MM_SHUFFLE(3, 3, 2, 2)), _mm_shuffle_epi32(samples128_4, _MM_SHUFFLE(3, 3, 2, 2))));
4167             case  4:
4168             case  3: prediction128 = _mm_add_epi64(prediction128, _mm_mul_epi32(_mm_shuffle_epi32(coefficients128_0, _MM_SHUFFLE(1, 1, 0, 0)), _mm_shuffle_epi32(samples128_0, _MM_SHUFFLE(1, 1, 0, 0))));
4169             case  2:
4170             case  1: prediction128 = _mm_add_epi64(prediction128, _mm_mul_epi32(_mm_shuffle_epi32(coefficients128_0, _MM_SHUFFLE(3, 3, 2, 2)), _mm_shuffle_epi32(samples128_0, _MM_SHUFFLE(3, 3, 2, 2))));
4171             }
4172
4173             /* Horizontal add and shift. */
4174             prediction128 = drflac__mm_hadd_epi64(prediction128);
4175             prediction128 = drflac__mm_srai_epi64(prediction128, shift);
4176             prediction128 = _mm_add_epi32(riceParamPart128, prediction128);
4177
4178             /* Our value should be sitting in prediction128[0]. We need to combine this with our SSE samples. */
4179             samples128_8 = _mm_alignr_epi8(samples128_4,  samples128_8, 4);
4180             samples128_4 = _mm_alignr_epi8(samples128_0,  samples128_4, 4);
4181             samples128_0 = _mm_alignr_epi8(prediction128, samples128_0, 4);
4182
4183             /* Slide our rice parameter down so that the value in position 0 contains the next one to process. */
4184             riceParamPart128 = _mm_alignr_epi8(_mm_setzero_si128(), riceParamPart128, 4);
4185         }
4186
4187         /* We store samples in groups of 4. */
4188         _mm_storeu_si128((__m128i*)pDecodedSamples, samples128_0);
4189         pDecodedSamples += 4;
4190     }
4191
4192     /* Make sure we process the last few samples. */
4193     i = (count & ~3);
4194     while (i < (int)count) {
4195         /* Rice extraction. */
4196         if (!drflac__read_rice_parts_x1(bs, riceParam, &zeroCountParts0, &riceParamParts0)) {
4197             return DRFLAC_FALSE;
4198         }
4199
4200         /* Rice reconstruction. */
4201         riceParamParts0 &= riceParamMask;
4202         riceParamParts0 |= (zeroCountParts0 << riceParam);
4203         riceParamParts0  = (riceParamParts0 >> 1) ^ t[riceParamParts0 & 0x01];
4204
4205         /* Sample reconstruction. */
4206         pDecodedSamples[0] = riceParamParts0 + drflac__calculate_prediction_64(order, shift, coefficients, pDecodedSamples);
4207
4208         i += 1;
4209         pDecodedSamples += 1;
4210     }
4211
4212     return DRFLAC_TRUE;
4213 }
4214
4215 static drflac_bool32 drflac__decode_samples_with_residual__rice__sse41(drflac_bs* bs, drflac_uint32 bitsPerSample, drflac_uint32 count, drflac_uint8 riceParam, drflac_uint32 order, drflac_int32 shift, const drflac_int32* coefficients, drflac_int32* pSamplesOut)
4216 {
4217     DRFLAC_ASSERT(bs != NULL);
4218     DRFLAC_ASSERT(pSamplesOut != NULL);
4219
4220     /* In my testing the order is rarely > 12, so in this case I'm going to simplify the SSE implementation by only handling order <= 12. */
4221     if (order > 0 && order <= 12) {
4222         if (bitsPerSample+shift > 32) {
4223             return drflac__decode_samples_with_residual__rice__sse41_64(bs, count, riceParam, order, shift, coefficients, pSamplesOut);
4224         } else {
4225             return drflac__decode_samples_with_residual__rice__sse41_32(bs, count, riceParam, order, shift, coefficients, pSamplesOut);
4226         }
4227     } else {
4228         return drflac__decode_samples_with_residual__rice__scalar(bs, bitsPerSample, count, riceParam, order, shift, coefficients, pSamplesOut);
4229     }
4230 }
4231 #endif
4232
4233 #if defined(DRFLAC_SUPPORT_NEON)
4234 static DRFLAC_INLINE void drflac__vst2q_s32(drflac_int32* p, int32x4x2_t x)
4235 {
4236     vst1q_s32(p+0, x.val[0]);
4237     vst1q_s32(p+4, x.val[1]);
4238 }
4239
4240 static DRFLAC_INLINE void drflac__vst2q_u32(drflac_uint32* p, uint32x4x2_t x)
4241 {
4242     vst1q_u32(p+0, x.val[0]);
4243     vst1q_u32(p+4, x.val[1]);
4244 }
4245
4246 static DRFLAC_INLINE void drflac__vst2q_f32(float* p, float32x4x2_t x)
4247 {
4248     vst1q_f32(p+0, x.val[0]);
4249     vst1q_f32(p+4, x.val[1]);
4250 }
4251
4252 static DRFLAC_INLINE void drflac__vst2q_s16(drflac_int16* p, int16x4x2_t x)
4253 {
4254     vst1q_s16(p, vcombine_s16(x.val[0], x.val[1]));
4255 }
4256
4257 static DRFLAC_INLINE void drflac__vst2q_u16(drflac_uint16* p, uint16x4x2_t x)
4258 {
4259     vst1q_u16(p, vcombine_u16(x.val[0], x.val[1]));
4260 }
4261
4262 static DRFLAC_INLINE int32x4_t drflac__vdupq_n_s32x4(drflac_int32 x3, drflac_int32 x2, drflac_int32 x1, drflac_int32 x0)
4263 {
4264     drflac_int32 x[4];
4265     x[3] = x3;
4266     x[2] = x2;
4267     x[1] = x1;
4268     x[0] = x0;
4269     return vld1q_s32(x);
4270 }
4271
4272 static DRFLAC_INLINE int32x4_t drflac__valignrq_s32_1(int32x4_t a, int32x4_t b)
4273 {
4274     /* Equivalent to SSE's _mm_alignr_epi8(a, b, 4) */
4275
4276     /* Reference */
4277     /*return drflac__vdupq_n_s32x4(
4278         vgetq_lane_s32(a, 0),
4279         vgetq_lane_s32(b, 3),
4280         vgetq_lane_s32(b, 2),
4281         vgetq_lane_s32(b, 1)
4282     );*/
4283
4284     return vextq_s32(b, a, 1);
4285 }
4286
4287 static DRFLAC_INLINE uint32x4_t drflac__valignrq_u32_1(uint32x4_t a, uint32x4_t b)
4288 {
4289     /* Equivalent to SSE's _mm_alignr_epi8(a, b, 4) */
4290
4291     /* Reference */
4292     /*return drflac__vdupq_n_s32x4(
4293         vgetq_lane_s32(a, 0),
4294         vgetq_lane_s32(b, 3),
4295         vgetq_lane_s32(b, 2),
4296         vgetq_lane_s32(b, 1)
4297     );*/
4298
4299     return vextq_u32(b, a, 1);
4300 }
4301
4302 static DRFLAC_INLINE int32x2_t drflac__vhaddq_s32(int32x4_t x)
4303 {
4304     /* The sum must end up in position 0. */
4305
4306     /* Reference */
4307     /*return vdupq_n_s32(
4308         vgetq_lane_s32(x, 3) +
4309         vgetq_lane_s32(x, 2) +
4310         vgetq_lane_s32(x, 1) +
4311         vgetq_lane_s32(x, 0)
4312     );*/
4313
4314     int32x2_t r = vadd_s32(vget_high_s32(x), vget_low_s32(x));
4315     return vpadd_s32(r, r);
4316 }
4317
4318 static DRFLAC_INLINE int64x1_t drflac__vhaddq_s64(int64x2_t x)
4319 {
4320     return vadd_s64(vget_high_s64(x), vget_low_s64(x));
4321 }
4322
4323 static DRFLAC_INLINE int32x4_t drflac__vrevq_s32(int32x4_t x)
4324 {
4325     /* Reference */
4326     /*return drflac__vdupq_n_s32x4(
4327         vgetq_lane_s32(x, 0),
4328         vgetq_lane_s32(x, 1),
4329         vgetq_lane_s32(x, 2),
4330         vgetq_lane_s32(x, 3)
4331     );*/
4332
4333     return vrev64q_s32(vcombine_s32(vget_high_s32(x), vget_low_s32(x)));
4334 }
4335
4336 static DRFLAC_INLINE int32x4_t drflac__vnotq_s32(int32x4_t x)
4337 {
4338     return veorq_s32(x, vdupq_n_s32(0xFFFFFFFF));
4339 }
4340
4341 static DRFLAC_INLINE uint32x4_t drflac__vnotq_u32(uint32x4_t x)
4342 {
4343     return veorq_u32(x, vdupq_n_u32(0xFFFFFFFF));
4344 }
4345
4346 static drflac_bool32 drflac__decode_samples_with_residual__rice__neon_32(drflac_bs* bs, drflac_uint32 count, drflac_uint8 riceParam, drflac_uint32 order, drflac_int32 shift, const drflac_int32* coefficients, drflac_int32* pSamplesOut)
4347 {
4348     int i;
4349     drflac_uint32 riceParamMask;
4350     drflac_int32* pDecodedSamples    = pSamplesOut;
4351     drflac_int32* pDecodedSamplesEnd = pSamplesOut + (count & ~3);
4352     drflac_uint32 zeroCountParts[4];
4353     drflac_uint32 riceParamParts[4];
4354     int32x4_t coefficients128_0;
4355     int32x4_t coefficients128_4;
4356     int32x4_t coefficients128_8;
4357     int32x4_t samples128_0;
4358     int32x4_t samples128_4;
4359     int32x4_t samples128_8;
4360     uint32x4_t riceParamMask128;
4361     int32x4_t riceParam128;
4362     int32x2_t shift64;
4363     uint32x4_t one128;
4364
4365     const drflac_uint32 t[2] = {0x00000000, 0xFFFFFFFF};
4366
4367     riceParamMask    = ~((~0UL) << riceParam);
4368     riceParamMask128 = vdupq_n_u32(riceParamMask);
4369
4370     riceParam128 = vdupq_n_s32(riceParam);
4371     shift64 = vdup_n_s32(-shift); /* Negate the shift because we'll be doing a variable shift using vshlq_s32(). */
4372     one128 = vdupq_n_u32(1);
4373
4374     /*
4375     Pre-loading the coefficients and prior samples is annoying because we need to ensure we don't try reading more than
4376     what's available in the input buffers. It would be conenient to use a fall-through switch to do this, but this results
4377     in strict aliasing warnings with GCC. To work around this I'm just doing something hacky. This feels a bit convoluted
4378     so I think there's opportunity for this to be simplified.
4379     */
4380     {
4381         int runningOrder = order;
4382         drflac_int32 tempC[4] = {0, 0, 0, 0};
4383         drflac_int32 tempS[4] = {0, 0, 0, 0};
4384
4385         /* 0 - 3. */
4386         if (runningOrder >= 4) {
4387             coefficients128_0 = vld1q_s32(coefficients + 0);
4388             samples128_0      = vld1q_s32(pSamplesOut  - 4);
4389             runningOrder -= 4;
4390         } else {
4391             switch (runningOrder) {
4392                 case 3: tempC[2] = coefficients[2]; tempS[1] = pSamplesOut[-3]; /* fallthrough */
4393                 case 2: tempC[1] = coefficients[1]; tempS[2] = pSamplesOut[-2]; /* fallthrough */
4394                 case 1: tempC[0] = coefficients[0]; tempS[3] = pSamplesOut[-1]; /* fallthrough */
4395             }
4396
4397             coefficients128_0 = vld1q_s32(tempC);
4398             samples128_0      = vld1q_s32(tempS);
4399             runningOrder = 0;
4400         }
4401
4402         /* 4 - 7 */
4403         if (runningOrder >= 4) {
4404             coefficients128_4 = vld1q_s32(coefficients + 4);
4405             samples128_4      = vld1q_s32(pSamplesOut  - 8);
4406             runningOrder -= 4;
4407         } else {
4408             switch (runningOrder) {
4409                 case 3: tempC[2] = coefficients[6]; tempS[1] = pSamplesOut[-7]; /* fallthrough */
4410                 case 2: tempC[1] = coefficients[5]; tempS[2] = pSamplesOut[-6]; /* fallthrough */
4411                 case 1: tempC[0] = coefficients[4]; tempS[3] = pSamplesOut[-5]; /* fallthrough */
4412             }
4413
4414             coefficients128_4 = vld1q_s32(tempC);
4415             samples128_4      = vld1q_s32(tempS);
4416             runningOrder = 0;
4417         }
4418
4419         /* 8 - 11 */
4420         if (runningOrder == 4) {
4421             coefficients128_8 = vld1q_s32(coefficients + 8);
4422             samples128_8      = vld1q_s32(pSamplesOut  - 12);
4423             runningOrder -= 4;
4424         } else {
4425             switch (runningOrder) {
4426                 case 3: tempC[2] = coefficients[10]; tempS[1] = pSamplesOut[-11]; /* fallthrough */
4427                 case 2: tempC[1] = coefficients[ 9]; tempS[2] = pSamplesOut[-10]; /* fallthrough */
4428                 case 1: tempC[0] = coefficients[ 8]; tempS[3] = pSamplesOut[- 9]; /* fallthrough */
4429             }
4430
4431             coefficients128_8 = vld1q_s32(tempC);
4432             samples128_8      = vld1q_s32(tempS);
4433             runningOrder = 0;
4434         }
4435
4436         /* Coefficients need to be shuffled for our streaming algorithm below to work. Samples are already in the correct order from the loading routine above. */
4437         coefficients128_0 = drflac__vrevq_s32(coefficients128_0);
4438         coefficients128_4 = drflac__vrevq_s32(coefficients128_4);
4439         coefficients128_8 = drflac__vrevq_s32(coefficients128_8);
4440     }
4441
4442     /* For this version we are doing one sample at a time. */
4443     while (pDecodedSamples < pDecodedSamplesEnd) {
4444         int32x4_t prediction128;
4445         int32x2_t prediction64;
4446         uint32x4_t zeroCountPart128;
4447         uint32x4_t riceParamPart128;
4448
4449         if (!drflac__read_rice_parts_x1(bs, riceParam, &zeroCountParts[0], &riceParamParts[0]) ||
4450             !drflac__read_rice_parts_x1(bs, riceParam, &zeroCountParts[1], &riceParamParts[1]) ||
4451             !drflac__read_rice_parts_x1(bs, riceParam, &zeroCountParts[2], &riceParamParts[2]) ||
4452             !drflac__read_rice_parts_x1(bs, riceParam, &zeroCountParts[3], &riceParamParts[3])) {
4453             return DRFLAC_FALSE;
4454         }
4455
4456         zeroCountPart128 = vld1q_u32(zeroCountParts);
4457         riceParamPart128 = vld1q_u32(riceParamParts);
4458
4459         riceParamPart128 = vandq_u32(riceParamPart128, riceParamMask128);
4460         riceParamPart128 = vorrq_u32(riceParamPart128, vshlq_u32(zeroCountPart128, riceParam128));
4461         riceParamPart128 = veorq_u32(vshrq_n_u32(riceParamPart128, 1), vaddq_u32(drflac__vnotq_u32(vandq_u32(riceParamPart128, one128)), one128));
4462
4463         if (order <= 4) {
4464             for (i = 0; i < 4; i += 1) {
4465                 prediction128 = vmulq_s32(coefficients128_0, samples128_0);
4466
4467                 /* Horizontal add and shift. */
4468                 prediction64 = drflac__vhaddq_s32(prediction128);
4469                 prediction64 = vshl_s32(prediction64, shift64);
4470                 prediction64 = vadd_s32(prediction64, vget_low_s32(vreinterpretq_s32_u32(riceParamPart128)));
4471
4472                 samples128_0 = drflac__valignrq_s32_1(vcombine_s32(prediction64, vdup_n_s32(0)), samples128_0);
4473                 riceParamPart128 = drflac__valignrq_u32_1(vdupq_n_u32(0), riceParamPart128);
4474             }
4475         } else if (order <= 8) {
4476             for (i = 0; i < 4; i += 1) {
4477                 prediction128 =                vmulq_s32(coefficients128_4, samples128_4);
4478                 prediction128 = vmlaq_s32(prediction128, coefficients128_0, samples128_0);
4479
4480                 /* Horizontal add and shift. */
4481                 prediction64 = drflac__vhaddq_s32(prediction128);
4482                 prediction64 = vshl_s32(prediction64, shift64);
4483                 prediction64 = vadd_s32(prediction64, vget_low_s32(vreinterpretq_s32_u32(riceParamPart128)));
4484
4485                 samples128_4 = drflac__valignrq_s32_1(samples128_0, samples128_4);
4486                 samples128_0 = drflac__valignrq_s32_1(vcombine_s32(prediction64, vdup_n_s32(0)), samples128_0);
4487                 riceParamPart128 = drflac__valignrq_u32_1(vdupq_n_u32(0), riceParamPart128);
4488             }
4489         } else {
4490             for (i = 0; i < 4; i += 1) {
4491                 prediction128 =                vmulq_s32(coefficients128_8, samples128_8);
4492                 prediction128 = vmlaq_s32(prediction128, coefficients128_4, samples128_4);
4493                 prediction128 = vmlaq_s32(prediction128, coefficients128_0, samples128_0);
4494
4495                 /* Horizontal add and shift. */
4496                 prediction64 = drflac__vhaddq_s32(prediction128);
4497                 prediction64 = vshl_s32(prediction64, shift64);
4498                 prediction64 = vadd_s32(prediction64, vget_low_s32(vreinterpretq_s32_u32(riceParamPart128)));
4499
4500                 samples128_8 = drflac__valignrq_s32_1(samples128_4, samples128_8);
4501                 samples128_4 = drflac__valignrq_s32_1(samples128_0, samples128_4);
4502                 samples128_0 = drflac__valignrq_s32_1(vcombine_s32(prediction64, vdup_n_s32(0)), samples128_0);
4503                 riceParamPart128 = drflac__valignrq_u32_1(vdupq_n_u32(0), riceParamPart128);
4504             }
4505         }
4506
4507         /* We store samples in groups of 4. */
4508         vst1q_s32(pDecodedSamples, samples128_0);
4509         pDecodedSamples += 4;
4510     }
4511
4512     /* Make sure we process the last few samples. */
4513     i = (count & ~3);
4514     while (i < (int)count) {
4515         /* Rice extraction. */
4516         if (!drflac__read_rice_parts_x1(bs, riceParam, &zeroCountParts[0], &riceParamParts[0])) {
4517             return DRFLAC_FALSE;
4518         }
4519
4520         /* Rice reconstruction. */
4521         riceParamParts[0] &= riceParamMask;
4522         riceParamParts[0] |= (zeroCountParts[0] << riceParam);
4523         riceParamParts[0]  = (riceParamParts[0] >> 1) ^ t[riceParamParts[0] & 0x01];
4524
4525         /* Sample reconstruction. */
4526         pDecodedSamples[0] = riceParamParts[0] + drflac__calculate_prediction_32(order, shift, coefficients, pDecodedSamples);
4527
4528         i += 1;
4529         pDecodedSamples += 1;
4530     }
4531
4532     return DRFLAC_TRUE;
4533 }
4534
4535 static drflac_bool32 drflac__decode_samples_with_residual__rice__neon_64(drflac_bs* bs, drflac_uint32 count, drflac_uint8 riceParam, drflac_uint32 order, drflac_int32 shift, const drflac_int32* coefficients, drflac_int32* pSamplesOut)
4536 {
4537     int i;
4538     drflac_uint32 riceParamMask;
4539     drflac_int32* pDecodedSamples    = pSamplesOut;
4540     drflac_int32* pDecodedSamplesEnd = pSamplesOut + (count & ~3);
4541     drflac_uint32 zeroCountParts[4];
4542     drflac_uint32 riceParamParts[4];
4543     int32x4_t coefficients128_0;
4544     int32x4_t coefficients128_4;
4545     int32x4_t coefficients128_8;
4546     int32x4_t samples128_0;
4547     int32x4_t samples128_4;
4548     int32x4_t samples128_8;
4549     uint32x4_t riceParamMask128;
4550     int32x4_t riceParam128;
4551     int64x1_t shift64;
4552     uint32x4_t one128;
4553
4554     const drflac_uint32 t[2] = {0x00000000, 0xFFFFFFFF};
4555
4556     riceParamMask    = ~((~0UL) << riceParam);
4557     riceParamMask128 = vdupq_n_u32(riceParamMask);
4558
4559     riceParam128 = vdupq_n_s32(riceParam);
4560     shift64 = vdup_n_s64(-shift); /* Negate the shift because we'll be doing a variable shift using vshlq_s32(). */
4561     one128 = vdupq_n_u32(1);
4562
4563     /*
4564     Pre-loading the coefficients and prior samples is annoying because we need to ensure we don't try reading more than
4565     what's available in the input buffers. It would be conenient to use a fall-through switch to do this, but this results
4566     in strict aliasing warnings with GCC. To work around this I'm just doing something hacky. This feels a bit convoluted
4567     so I think there's opportunity for this to be simplified.
4568     */
4569     {
4570         int runningOrder = order;
4571         drflac_int32 tempC[4] = {0, 0, 0, 0};
4572         drflac_int32 tempS[4] = {0, 0, 0, 0};
4573
4574         /* 0 - 3. */
4575         if (runningOrder >= 4) {
4576             coefficients128_0 = vld1q_s32(coefficients + 0);
4577             samples128_0      = vld1q_s32(pSamplesOut  - 4);
4578             runningOrder -= 4;
4579         } else {
4580             switch (runningOrder) {
4581                 case 3: tempC[2] = coefficients[2]; tempS[1] = pSamplesOut[-3]; /* fallthrough */
4582                 case 2: tempC[1] = coefficients[1]; tempS[2] = pSamplesOut[-2]; /* fallthrough */
4583                 case 1: tempC[0] = coefficients[0]; tempS[3] = pSamplesOut[-1]; /* fallthrough */
4584             }
4585
4586             coefficients128_0 = vld1q_s32(tempC);
4587             samples128_0      = vld1q_s32(tempS);
4588             runningOrder = 0;
4589         }
4590
4591         /* 4 - 7 */
4592         if (runningOrder >= 4) {
4593             coefficients128_4 = vld1q_s32(coefficients + 4);
4594             samples128_4      = vld1q_s32(pSamplesOut  - 8);
4595             runningOrder -= 4;
4596         } else {
4597             switch (runningOrder) {
4598                 case 3: tempC[2] = coefficients[6]; tempS[1] = pSamplesOut[-7]; /* fallthrough */
4599                 case 2: tempC[1] = coefficients[5]; tempS[2] = pSamplesOut[-6]; /* fallthrough */
4600                 case 1: tempC[0] = coefficients[4]; tempS[3] = pSamplesOut[-5]; /* fallthrough */
4601             }
4602
4603             coefficients128_4 = vld1q_s32(tempC);
4604             samples128_4      = vld1q_s32(tempS);
4605             runningOrder = 0;
4606         }
4607
4608         /* 8 - 11 */
4609         if (runningOrder == 4) {
4610             coefficients128_8 = vld1q_s32(coefficients + 8);
4611             samples128_8      = vld1q_s32(pSamplesOut  - 12);
4612             runningOrder -= 4;
4613         } else {
4614             switch (runningOrder) {
4615                 case 3: tempC[2] = coefficients[10]; tempS[1] = pSamplesOut[-11]; /* fallthrough */
4616                 case 2: tempC[1] = coefficients[ 9]; tempS[2] = pSamplesOut[-10]; /* fallthrough */
4617                 case 1: tempC[0] = coefficients[ 8]; tempS[3] = pSamplesOut[- 9]; /* fallthrough */
4618             }
4619
4620             coefficients128_8 = vld1q_s32(tempC);
4621             samples128_8      = vld1q_s32(tempS);
4622             runningOrder = 0;
4623         }
4624
4625         /* Coefficients need to be shuffled for our streaming algorithm below to work. Samples are already in the correct order from the loading routine above. */
4626         coefficients128_0 = drflac__vrevq_s32(coefficients128_0);
4627         coefficients128_4 = drflac__vrevq_s32(coefficients128_4);
4628         coefficients128_8 = drflac__vrevq_s32(coefficients128_8);
4629     }
4630
4631     /* For this version we are doing one sample at a time. */
4632     while (pDecodedSamples < pDecodedSamplesEnd) {
4633         int64x2_t prediction128;
4634         uint32x4_t zeroCountPart128;
4635         uint32x4_t riceParamPart128;
4636
4637         if (!drflac__read_rice_parts_x1(bs, riceParam, &zeroCountParts[0], &riceParamParts[0]) ||
4638             !drflac__read_rice_parts_x1(bs, riceParam, &zeroCountParts[1], &riceParamParts[1]) ||
4639             !drflac__read_rice_parts_x1(bs, riceParam, &zeroCountParts[2], &riceParamParts[2]) ||
4640             !drflac__read_rice_parts_x1(bs, riceParam, &zeroCountParts[3], &riceParamParts[3])) {
4641             return DRFLAC_FALSE;
4642         }
4643
4644         zeroCountPart128 = vld1q_u32(zeroCountParts);
4645         riceParamPart128 = vld1q_u32(riceParamParts);
4646
4647         riceParamPart128 = vandq_u32(riceParamPart128, riceParamMask128);
4648         riceParamPart128 = vorrq_u32(riceParamPart128, vshlq_u32(zeroCountPart128, riceParam128));
4649         riceParamPart128 = veorq_u32(vshrq_n_u32(riceParamPart128, 1), vaddq_u32(drflac__vnotq_u32(vandq_u32(riceParamPart128, one128)), one128));
4650
4651         for (i = 0; i < 4; i += 1) {
4652             int64x1_t prediction64;
4653
4654             prediction128 = veorq_s64(prediction128, prediction128);    /* Reset to 0. */
4655             switch (order)
4656             {
4657             case 12:
4658             case 11: prediction128 = vaddq_s64(prediction128, vmull_s32(vget_low_s32(coefficients128_8), vget_low_s32(samples128_8)));
4659             case 10:
4660             case  9: prediction128 = vaddq_s64(prediction128, vmull_s32(vget_high_s32(coefficients128_8), vget_high_s32(samples128_8)));
4661             case  8:
4662             case  7: prediction128 = vaddq_s64(prediction128, vmull_s32(vget_low_s32(coefficients128_4), vget_low_s32(samples128_4)));
4663             case  6:
4664             case  5: prediction128 = vaddq_s64(prediction128, vmull_s32(vget_high_s32(coefficients128_4), vget_high_s32(samples128_4)));
4665             case  4:
4666             case  3: prediction128 = vaddq_s64(prediction128, vmull_s32(vget_low_s32(coefficients128_0), vget_low_s32(samples128_0)));
4667             case  2:
4668             case  1: prediction128 = vaddq_s64(prediction128, vmull_s32(vget_high_s32(coefficients128_0), vget_high_s32(samples128_0)));
4669             }
4670
4671             /* Horizontal add and shift. */
4672             prediction64 = drflac__vhaddq_s64(prediction128);
4673             prediction64 = vshl_s64(prediction64, shift64);
4674             prediction64 = vadd_s64(prediction64, vdup_n_s64(vgetq_lane_u32(riceParamPart128, 0)));
4675
4676             /* Our value should be sitting in prediction64[0]. We need to combine this with our SSE samples. */
4677             samples128_8 = drflac__valignrq_s32_1(samples128_4, samples128_8);
4678             samples128_4 = drflac__valignrq_s32_1(samples128_0, samples128_4);
4679             samples128_0 = drflac__valignrq_s32_1(vcombine_s32(vreinterpret_s32_s64(prediction64), vdup_n_s32(0)), samples128_0);
4680
4681             /* Slide our rice parameter down so that the value in position 0 contains the next one to process. */
4682             riceParamPart128 = drflac__valignrq_u32_1(vdupq_n_u32(0), riceParamPart128);
4683         }
4684
4685         /* We store samples in groups of 4. */
4686         vst1q_s32(pDecodedSamples, samples128_0);
4687         pDecodedSamples += 4;
4688     }
4689
4690     /* Make sure we process the last few samples. */
4691     i = (count & ~3);
4692     while (i < (int)count) {
4693         /* Rice extraction. */
4694         if (!drflac__read_rice_parts_x1(bs, riceParam, &zeroCountParts[0], &riceParamParts[0])) {
4695             return DRFLAC_FALSE;
4696         }
4697
4698         /* Rice reconstruction. */
4699         riceParamParts[0] &= riceParamMask;
4700         riceParamParts[0] |= (zeroCountParts[0] << riceParam);
4701         riceParamParts[0]  = (riceParamParts[0] >> 1) ^ t[riceParamParts[0] & 0x01];
4702
4703         /* Sample reconstruction. */
4704         pDecodedSamples[0] = riceParamParts[0] + drflac__calculate_prediction_64(order, shift, coefficients, pDecodedSamples);
4705
4706         i += 1;
4707         pDecodedSamples += 1;
4708     }
4709
4710     return DRFLAC_TRUE;
4711 }
4712
4713 static drflac_bool32 drflac__decode_samples_with_residual__rice__neon(drflac_bs* bs, drflac_uint32 bitsPerSample, drflac_uint32 count, drflac_uint8 riceParam, drflac_uint32 order, drflac_int32 shift, const drflac_int32* coefficients, drflac_int32* pSamplesOut)
4714 {
4715     DRFLAC_ASSERT(bs != NULL);
4716     DRFLAC_ASSERT(pSamplesOut != NULL);
4717
4718     /* In my testing the order is rarely > 12, so in this case I'm going to simplify the NEON implementation by only handling order <= 12. */
4719     if (order > 0 && order <= 12) {
4720         if (bitsPerSample+shift > 32) {
4721             return drflac__decode_samples_with_residual__rice__neon_64(bs, count, riceParam, order, shift, coefficients, pSamplesOut);
4722         } else {
4723             return drflac__decode_samples_with_residual__rice__neon_32(bs, count, riceParam, order, shift, coefficients, pSamplesOut);
4724         }
4725     } else {
4726         return drflac__decode_samples_with_residual__rice__scalar(bs, bitsPerSample, count, riceParam, order, shift, coefficients, pSamplesOut);
4727     }
4728 }
4729 #endif
4730
4731 static drflac_bool32 drflac__decode_samples_with_residual__rice(drflac_bs* bs, drflac_uint32 bitsPerSample, drflac_uint32 count, drflac_uint8 riceParam, drflac_uint32 order, drflac_int32 shift, const drflac_int32* coefficients, drflac_int32* pSamplesOut)
4732 {
4733 #if defined(DRFLAC_SUPPORT_SSE41)
4734     if (drflac__gIsSSE41Supported) {
4735         return drflac__decode_samples_with_residual__rice__sse41(bs, bitsPerSample, count, riceParam, order, shift, coefficients, pSamplesOut);
4736     } else
4737 #elif defined(DRFLAC_SUPPORT_NEON)
4738     if (drflac__gIsNEONSupported) {
4739         return drflac__decode_samples_with_residual__rice__neon(bs, bitsPerSample, count, riceParam, order, shift, coefficients, pSamplesOut);
4740     } else
4741 #endif
4742     {
4743         /* Scalar fallback. */
4744     #if 0
4745         return drflac__decode_samples_with_residual__rice__reference(bs, bitsPerSample, count, riceParam, order, shift, coefficients, pSamplesOut);
4746     #else
4747         return drflac__decode_samples_with_residual__rice__scalar(bs, bitsPerSample, count, riceParam, order, shift, coefficients, pSamplesOut);
4748     #endif
4749     }
4750 }
4751
4752 /* Reads and seeks past a string of residual values as Rice codes. The decoder should be sitting on the first bit of the Rice codes. */
4753 static drflac_bool32 drflac__read_and_seek_residual__rice(drflac_bs* bs, drflac_uint32 count, drflac_uint8 riceParam)
4754 {
4755     drflac_uint32 i;
4756
4757     DRFLAC_ASSERT(bs != NULL);
4758
4759     for (i = 0; i < count; ++i) {
4760         if (!drflac__seek_rice_parts(bs, riceParam)) {
4761             return DRFLAC_FALSE;
4762         }
4763     }
4764
4765     return DRFLAC_TRUE;
4766 }
4767
4768 static drflac_bool32 drflac__decode_samples_with_residual__unencoded(drflac_bs* bs, drflac_uint32 bitsPerSample, drflac_uint32 count, drflac_uint8 unencodedBitsPerSample, drflac_uint32 order, drflac_int32 shift, const drflac_int32* coefficients, drflac_int32* pSamplesOut)
4769 {
4770     drflac_uint32 i;
4771
4772     DRFLAC_ASSERT(bs != NULL);
4773     DRFLAC_ASSERT(unencodedBitsPerSample <= 31);    /* <-- unencodedBitsPerSample is a 5 bit number, so cannot exceed 31. */
4774     DRFLAC_ASSERT(pSamplesOut != NULL);
4775
4776     for (i = 0; i < count; ++i) {
4777         if (unencodedBitsPerSample > 0) {
4778             if (!drflac__read_int32(bs, unencodedBitsPerSample, pSamplesOut + i)) {
4779                 return DRFLAC_FALSE;
4780             }
4781         } else {
4782             pSamplesOut[i] = 0;
4783         }
4784
4785         if (bitsPerSample >= 24) {
4786             pSamplesOut[i] += drflac__calculate_prediction_64(order, shift, coefficients, pSamplesOut + i);
4787         } else {
4788             pSamplesOut[i] += drflac__calculate_prediction_32(order, shift, coefficients, pSamplesOut + i);
4789         }
4790     }
4791
4792     return DRFLAC_TRUE;
4793 }
4794
4795
4796 /*
4797 Reads and decodes the residual for the sub-frame the decoder is currently sitting on. This function should be called
4798 when the decoder is sitting at the very start of the RESIDUAL block. The first <order> residuals will be ignored. The
4799 <blockSize> and <order> parameters are used to determine how many residual values need to be decoded.
4800 */
4801 static drflac_bool32 drflac__decode_samples_with_residual(drflac_bs* bs, drflac_uint32 bitsPerSample, drflac_uint32 blockSize, drflac_uint32 order, drflac_int32 shift, const drflac_int32* coefficients, drflac_int32* pDecodedSamples)
4802 {
4803     drflac_uint8 residualMethod;
4804     drflac_uint8 partitionOrder;
4805     drflac_uint32 samplesInPartition;
4806     drflac_uint32 partitionsRemaining;
4807
4808     DRFLAC_ASSERT(bs != NULL);
4809     DRFLAC_ASSERT(blockSize != 0);
4810     DRFLAC_ASSERT(pDecodedSamples != NULL);       /* <-- Should we allow NULL, in which case we just seek past the residual rather than do a full decode? */
4811
4812     if (!drflac__read_uint8(bs, 2, &residualMethod)) {
4813         return DRFLAC_FALSE;
4814     }
4815
4816     if (residualMethod != DRFLAC_RESIDUAL_CODING_METHOD_PARTITIONED_RICE && residualMethod != DRFLAC_RESIDUAL_CODING_METHOD_PARTITIONED_RICE2) {
4817         return DRFLAC_FALSE;    /* Unknown or unsupported residual coding method. */
4818     }
4819
4820     /* Ignore the first <order> values. */
4821     pDecodedSamples += order;
4822
4823     if (!drflac__read_uint8(bs, 4, &partitionOrder)) {
4824         return DRFLAC_FALSE;
4825     }
4826
4827     /*
4828     From the FLAC spec:
4829       The Rice partition order in a Rice-coded residual section must be less than or equal to 8.
4830     */
4831     if (partitionOrder > 8) {
4832         return DRFLAC_FALSE;
4833     }
4834
4835     /* Validation check. */
4836     if ((blockSize / (1 << partitionOrder)) < order) {
4837         return DRFLAC_FALSE;
4838     }
4839
4840     samplesInPartition = (blockSize / (1 << partitionOrder)) - order;
4841     partitionsRemaining = (1 << partitionOrder);
4842     for (;;) {
4843         drflac_uint8 riceParam = 0;
4844         if (residualMethod == DRFLAC_RESIDUAL_CODING_METHOD_PARTITIONED_RICE) {
4845             if (!drflac__read_uint8(bs, 4, &riceParam)) {
4846                 return DRFLAC_FALSE;
4847             }
4848             if (riceParam == 15) {
4849                 riceParam = 0xFF;
4850             }
4851         } else if (residualMethod == DRFLAC_RESIDUAL_CODING_METHOD_PARTITIONED_RICE2) {
4852             if (!drflac__read_uint8(bs, 5, &riceParam)) {
4853                 return DRFLAC_FALSE;
4854             }
4855             if (riceParam == 31) {
4856                 riceParam = 0xFF;
4857             }
4858         }
4859
4860         if (riceParam != 0xFF) {
4861             if (!drflac__decode_samples_with_residual__rice(bs, bitsPerSample, samplesInPartition, riceParam, order, shift, coefficients, pDecodedSamples)) {
4862                 return DRFLAC_FALSE;
4863             }
4864         } else {
4865             drflac_uint8 unencodedBitsPerSample = 0;
4866             if (!drflac__read_uint8(bs, 5, &unencodedBitsPerSample)) {
4867                 return DRFLAC_FALSE;
4868             }
4869
4870             if (!drflac__decode_samples_with_residual__unencoded(bs, bitsPerSample, samplesInPartition, unencodedBitsPerSample, order, shift, coefficients, pDecodedSamples)) {
4871                 return DRFLAC_FALSE;
4872             }
4873         }
4874
4875         pDecodedSamples += samplesInPartition;
4876
4877         if (partitionsRemaining == 1) {
4878             break;
4879         }
4880
4881         partitionsRemaining -= 1;
4882
4883         if (partitionOrder != 0) {
4884             samplesInPartition = blockSize / (1 << partitionOrder);
4885         }
4886     }
4887
4888     return DRFLAC_TRUE;
4889 }
4890
4891 /*
4892 Reads and seeks past the residual for the sub-frame the decoder is currently sitting on. This function should be called
4893 when the decoder is sitting at the very start of the RESIDUAL block. The first <order> residuals will be set to 0. The
4894 <blockSize> and <order> parameters are used to determine how many residual values need to be decoded.
4895 */
4896 static drflac_bool32 drflac__read_and_seek_residual(drflac_bs* bs, drflac_uint32 blockSize, drflac_uint32 order)
4897 {
4898     drflac_uint8 residualMethod;
4899     drflac_uint8 partitionOrder;
4900     drflac_uint32 samplesInPartition;
4901     drflac_uint32 partitionsRemaining;
4902
4903     DRFLAC_ASSERT(bs != NULL);
4904     DRFLAC_ASSERT(blockSize != 0);
4905
4906     if (!drflac__read_uint8(bs, 2, &residualMethod)) {
4907         return DRFLAC_FALSE;
4908     }
4909
4910     if (residualMethod != DRFLAC_RESIDUAL_CODING_METHOD_PARTITIONED_RICE && residualMethod != DRFLAC_RESIDUAL_CODING_METHOD_PARTITIONED_RICE2) {
4911         return DRFLAC_FALSE;    /* Unknown or unsupported residual coding method. */
4912     }
4913
4914     if (!drflac__read_uint8(bs, 4, &partitionOrder)) {
4915         return DRFLAC_FALSE;
4916     }
4917
4918     /*
4919     From the FLAC spec:
4920       The Rice partition order in a Rice-coded residual section must be less than or equal to 8.
4921     */
4922     if (partitionOrder > 8) {
4923         return DRFLAC_FALSE;
4924     }
4925
4926     /* Validation check. */
4927     if ((blockSize / (1 << partitionOrder)) <= order) {
4928         return DRFLAC_FALSE;
4929     }
4930
4931     samplesInPartition = (blockSize / (1 << partitionOrder)) - order;
4932     partitionsRemaining = (1 << partitionOrder);
4933     for (;;)
4934     {
4935         drflac_uint8 riceParam = 0;
4936         if (residualMethod == DRFLAC_RESIDUAL_CODING_METHOD_PARTITIONED_RICE) {
4937             if (!drflac__read_uint8(bs, 4, &riceParam)) {
4938                 return DRFLAC_FALSE;
4939             }
4940             if (riceParam == 15) {
4941                 riceParam = 0xFF;
4942             }
4943         } else if (residualMethod == DRFLAC_RESIDUAL_CODING_METHOD_PARTITIONED_RICE2) {
4944             if (!drflac__read_uint8(bs, 5, &riceParam)) {
4945                 return DRFLAC_FALSE;
4946             }
4947             if (riceParam == 31) {
4948                 riceParam = 0xFF;
4949             }
4950         }
4951
4952         if (riceParam != 0xFF) {
4953             if (!drflac__read_and_seek_residual__rice(bs, samplesInPartition, riceParam)) {
4954                 return DRFLAC_FALSE;
4955             }
4956         } else {
4957             drflac_uint8 unencodedBitsPerSample = 0;
4958             if (!drflac__read_uint8(bs, 5, &unencodedBitsPerSample)) {
4959                 return DRFLAC_FALSE;
4960             }
4961
4962             if (!drflac__seek_bits(bs, unencodedBitsPerSample * samplesInPartition)) {
4963                 return DRFLAC_FALSE;
4964             }
4965         }
4966
4967
4968         if (partitionsRemaining == 1) {
4969             break;
4970         }
4971
4972         partitionsRemaining -= 1;
4973         samplesInPartition = blockSize / (1 << partitionOrder);
4974     }
4975
4976     return DRFLAC_TRUE;
4977 }
4978
4979
4980 static drflac_bool32 drflac__decode_samples__constant(drflac_bs* bs, drflac_uint32 blockSize, drflac_uint32 subframeBitsPerSample, drflac_int32* pDecodedSamples)
4981 {
4982     drflac_uint32 i;
4983
4984     /* Only a single sample needs to be decoded here. */
4985     drflac_int32 sample;
4986     if (!drflac__read_int32(bs, subframeBitsPerSample, &sample)) {
4987         return DRFLAC_FALSE;
4988     }
4989
4990     /*
4991     We don't really need to expand this, but it does simplify the process of reading samples. If this becomes a performance issue (unlikely)
4992     we'll want to look at a more efficient way.
4993     */
4994     for (i = 0; i < blockSize; ++i) {
4995         pDecodedSamples[i] = sample;
4996     }
4997
4998     return DRFLAC_TRUE;
4999 }
5000
5001 static drflac_bool32 drflac__decode_samples__verbatim(drflac_bs* bs, drflac_uint32 blockSize, drflac_uint32 subframeBitsPerSample, drflac_int32* pDecodedSamples)
5002 {
5003     drflac_uint32 i;
5004
5005     for (i = 0; i < blockSize; ++i) {
5006         drflac_int32 sample;
5007         if (!drflac__read_int32(bs, subframeBitsPerSample, &sample)) {
5008             return DRFLAC_FALSE;
5009         }
5010
5011         pDecodedSamples[i] = sample;
5012     }
5013
5014     return DRFLAC_TRUE;
5015 }
5016
5017 static drflac_bool32 drflac__decode_samples__fixed(drflac_bs* bs, drflac_uint32 blockSize, drflac_uint32 subframeBitsPerSample, drflac_uint8 lpcOrder, drflac_int32* pDecodedSamples)
5018 {
5019     drflac_uint32 i;
5020
5021     static drflac_int32 lpcCoefficientsTable[5][4] = {
5022         {0,  0, 0,  0},
5023         {1,  0, 0,  0},
5024         {2, -1, 0,  0},
5025         {3, -3, 1,  0},
5026         {4, -6, 4, -1}
5027     };
5028
5029     /* Warm up samples and coefficients. */
5030     for (i = 0; i < lpcOrder; ++i) {
5031         drflac_int32 sample;
5032         if (!drflac__read_int32(bs, subframeBitsPerSample, &sample)) {
5033             return DRFLAC_FALSE;
5034         }
5035
5036         pDecodedSamples[i] = sample;
5037     }
5038
5039     if (!drflac__decode_samples_with_residual(bs, subframeBitsPerSample, blockSize, lpcOrder, 0, lpcCoefficientsTable[lpcOrder], pDecodedSamples)) {
5040         return DRFLAC_FALSE;
5041     }
5042
5043     return DRFLAC_TRUE;
5044 }
5045
5046 static drflac_bool32 drflac__decode_samples__lpc(drflac_bs* bs, drflac_uint32 blockSize, drflac_uint32 bitsPerSample, drflac_uint8 lpcOrder, drflac_int32* pDecodedSamples)
5047 {
5048     drflac_uint8 i;
5049     drflac_uint8 lpcPrecision;
5050     drflac_int8 lpcShift;
5051     drflac_int32 coefficients[32];
5052
5053     /* Warm up samples. */
5054     for (i = 0; i < lpcOrder; ++i) {
5055         drflac_int32 sample;
5056         if (!drflac__read_int32(bs, bitsPerSample, &sample)) {
5057             return DRFLAC_FALSE;
5058         }
5059
5060         pDecodedSamples[i] = sample;
5061     }
5062
5063     if (!drflac__read_uint8(bs, 4, &lpcPrecision)) {
5064         return DRFLAC_FALSE;
5065     }
5066     if (lpcPrecision == 15) {
5067         return DRFLAC_FALSE;    /* Invalid. */
5068     }
5069     lpcPrecision += 1;
5070
5071     if (!drflac__read_int8(bs, 5, &lpcShift)) {
5072         return DRFLAC_FALSE;
5073     }
5074
5075     /*
5076     From the FLAC specification:
5077
5078         Quantized linear predictor coefficient shift needed in bits (NOTE: this number is signed two's-complement)
5079
5080     Emphasis on the "signed two's-complement". In practice there does not seem to be any encoders nor decoders supporting negative shifts. For now dr_flac is
5081     not going to support negative shifts as I don't have any reference files. However, when a reference file comes through I will consider adding support.
5082     */
5083     if (lpcShift < 0) {
5084         return DRFLAC_FALSE;
5085     }
5086
5087     DRFLAC_ZERO_MEMORY(coefficients, sizeof(coefficients));
5088     for (i = 0; i < lpcOrder; ++i) {
5089         if (!drflac__read_int32(bs, lpcPrecision, coefficients + i)) {
5090             return DRFLAC_FALSE;
5091         }
5092     }
5093
5094     if (!drflac__decode_samples_with_residual(bs, bitsPerSample, blockSize, lpcOrder, lpcShift, coefficients, pDecodedSamples)) {
5095         return DRFLAC_FALSE;
5096     }
5097
5098     return DRFLAC_TRUE;
5099 }
5100
5101
5102 static drflac_bool32 drflac__read_next_flac_frame_header(drflac_bs* bs, drflac_uint8 streaminfoBitsPerSample, drflac_frame_header* header)
5103 {
5104     const drflac_uint32 sampleRateTable[12]  = {0, 88200, 176400, 192000, 8000, 16000, 22050, 24000, 32000, 44100, 48000, 96000};
5105     const drflac_uint8 bitsPerSampleTable[8] = {0, 8, 12, (drflac_uint8)-1, 16, 20, 24, (drflac_uint8)-1};   /* -1 = reserved. */
5106
5107     DRFLAC_ASSERT(bs != NULL);
5108     DRFLAC_ASSERT(header != NULL);
5109
5110     /* Keep looping until we find a valid sync code. */
5111     for (;;) {
5112         drflac_uint8 crc8 = 0xCE; /* 0xCE = drflac_crc8(0, 0x3FFE, 14); */
5113         drflac_uint8 reserved = 0;
5114         drflac_uint8 blockingStrategy = 0;
5115         drflac_uint8 blockSize = 0;
5116         drflac_uint8 sampleRate = 0;
5117         drflac_uint8 channelAssignment = 0;
5118         drflac_uint8 bitsPerSample = 0;
5119         drflac_bool32 isVariableBlockSize;
5120
5121         if (!drflac__find_and_seek_to_next_sync_code(bs)) {
5122             return DRFLAC_FALSE;
5123         }
5124
5125         if (!drflac__read_uint8(bs, 1, &reserved)) {
5126             return DRFLAC_FALSE;
5127         }
5128         if (reserved == 1) {
5129             continue;
5130         }
5131         crc8 = drflac_crc8(crc8, reserved, 1);
5132
5133         if (!drflac__read_uint8(bs, 1, &blockingStrategy)) {
5134             return DRFLAC_FALSE;
5135         }
5136         crc8 = drflac_crc8(crc8, blockingStrategy, 1);
5137
5138         if (!drflac__read_uint8(bs, 4, &blockSize)) {
5139             return DRFLAC_FALSE;
5140         }
5141         if (blockSize == 0) {
5142             continue;
5143         }
5144         crc8 = drflac_crc8(crc8, blockSize, 4);
5145
5146         if (!drflac__read_uint8(bs, 4, &sampleRate)) {
5147             return DRFLAC_FALSE;
5148         }
5149         crc8 = drflac_crc8(crc8, sampleRate, 4);
5150
5151         if (!drflac__read_uint8(bs, 4, &channelAssignment)) {
5152             return DRFLAC_FALSE;
5153         }
5154         if (channelAssignment > 10) {
5155             continue;
5156         }
5157         crc8 = drflac_crc8(crc8, channelAssignment, 4);
5158
5159         if (!drflac__read_uint8(bs, 3, &bitsPerSample)) {
5160             return DRFLAC_FALSE;
5161         }
5162         if (bitsPerSample == 3 || bitsPerSample == 7) {
5163             continue;
5164         }
5165         crc8 = drflac_crc8(crc8, bitsPerSample, 3);
5166
5167
5168         if (!drflac__read_uint8(bs, 1, &reserved)) {
5169             return DRFLAC_FALSE;
5170         }
5171         if (reserved == 1) {
5172             continue;
5173         }
5174         crc8 = drflac_crc8(crc8, reserved, 1);
5175
5176
5177         isVariableBlockSize = blockingStrategy == 1;
5178         if (isVariableBlockSize) {
5179             drflac_uint64 pcmFrameNumber;
5180             drflac_result result = drflac__read_utf8_coded_number(bs, &pcmFrameNumber, &crc8);
5181             if (result != DRFLAC_SUCCESS) {
5182                 if (result == DRFLAC_AT_END) {
5183                     return DRFLAC_FALSE;
5184                 } else {
5185                     continue;
5186                 }
5187             }
5188             header->flacFrameNumber  = 0;
5189             header->pcmFrameNumber = pcmFrameNumber;
5190         } else {
5191             drflac_uint64 flacFrameNumber = 0;
5192             drflac_result result = drflac__read_utf8_coded_number(bs, &flacFrameNumber, &crc8);
5193             if (result != DRFLAC_SUCCESS) {
5194                 if (result == DRFLAC_AT_END) {
5195                     return DRFLAC_FALSE;
5196                 } else {
5197                     continue;
5198                 }
5199             }
5200             header->flacFrameNumber  = (drflac_uint32)flacFrameNumber;   /* <-- Safe cast. */
5201             header->pcmFrameNumber = 0;
5202         }
5203
5204
5205         DRFLAC_ASSERT(blockSize > 0);
5206         if (blockSize == 1) {
5207             header->blockSizeInPCMFrames = 192;
5208         } else if (blockSize <= 5) {
5209             DRFLAC_ASSERT(blockSize >= 2);
5210             header->blockSizeInPCMFrames = 576 * (1 << (blockSize - 2));
5211         } else if (blockSize == 6) {
5212             if (!drflac__read_uint16(bs, 8, &header->blockSizeInPCMFrames)) {
5213                 return DRFLAC_FALSE;
5214             }
5215             crc8 = drflac_crc8(crc8, header->blockSizeInPCMFrames, 8);
5216             header->blockSizeInPCMFrames += 1;
5217         } else if (blockSize == 7) {
5218             if (!drflac__read_uint16(bs, 16, &header->blockSizeInPCMFrames)) {
5219                 return DRFLAC_FALSE;
5220             }
5221             crc8 = drflac_crc8(crc8, header->blockSizeInPCMFrames, 16);
5222             header->blockSizeInPCMFrames += 1;
5223         } else {
5224             DRFLAC_ASSERT(blockSize >= 8);
5225             header->blockSizeInPCMFrames = 256 * (1 << (blockSize - 8));
5226         }
5227
5228
5229         if (sampleRate <= 11) {
5230             header->sampleRate = sampleRateTable[sampleRate];
5231         } else if (sampleRate == 12) {
5232             if (!drflac__read_uint32(bs, 8, &header->sampleRate)) {
5233                 return DRFLAC_FALSE;
5234             }
5235             crc8 = drflac_crc8(crc8, header->sampleRate, 8);
5236             header->sampleRate *= 1000;
5237         } else if (sampleRate == 13) {
5238             if (!drflac__read_uint32(bs, 16, &header->sampleRate)) {
5239                 return DRFLAC_FALSE;
5240             }
5241             crc8 = drflac_crc8(crc8, header->sampleRate, 16);
5242         } else if (sampleRate == 14) {
5243             if (!drflac__read_uint32(bs, 16, &header->sampleRate)) {
5244                 return DRFLAC_FALSE;
5245             }
5246             crc8 = drflac_crc8(crc8, header->sampleRate, 16);
5247             header->sampleRate *= 10;
5248         } else {
5249             continue;  /* Invalid. Assume an invalid block. */
5250         }
5251
5252
5253         header->channelAssignment = channelAssignment;
5254
5255         header->bitsPerSample = bitsPerSampleTable[bitsPerSample];
5256         if (header->bitsPerSample == 0) {
5257             header->bitsPerSample = streaminfoBitsPerSample;
5258         }
5259
5260         if (!drflac__read_uint8(bs, 8, &header->crc8)) {
5261             return DRFLAC_FALSE;
5262         }
5263
5264 #ifndef DR_FLAC_NO_CRC
5265         if (header->crc8 != crc8) {
5266             continue;    /* CRC mismatch. Loop back to the top and find the next sync code. */
5267         }
5268 #endif
5269         return DRFLAC_TRUE;
5270     }
5271 }
5272
5273 static drflac_bool32 drflac__read_subframe_header(drflac_bs* bs, drflac_subframe* pSubframe)
5274 {
5275     drflac_uint8 header;
5276     int type;
5277
5278     if (!drflac__read_uint8(bs, 8, &header)) {
5279         return DRFLAC_FALSE;
5280     }
5281
5282     /* First bit should always be 0. */
5283     if ((header & 0x80) != 0) {
5284         return DRFLAC_FALSE;
5285     }
5286
5287     type = (header & 0x7E) >> 1;
5288     if (type == 0) {
5289         pSubframe->subframeType = DRFLAC_SUBFRAME_CONSTANT;
5290     } else if (type == 1) {
5291         pSubframe->subframeType = DRFLAC_SUBFRAME_VERBATIM;
5292     } else {
5293         if ((type & 0x20) != 0) {
5294             pSubframe->subframeType = DRFLAC_SUBFRAME_LPC;
5295             pSubframe->lpcOrder = (drflac_uint8)(type & 0x1F) + 1;
5296         } else if ((type & 0x08) != 0) {
5297             pSubframe->subframeType = DRFLAC_SUBFRAME_FIXED;
5298             pSubframe->lpcOrder = (drflac_uint8)(type & 0x07);
5299             if (pSubframe->lpcOrder > 4) {
5300                 pSubframe->subframeType = DRFLAC_SUBFRAME_RESERVED;
5301                 pSubframe->lpcOrder = 0;
5302             }
5303         } else {
5304             pSubframe->subframeType = DRFLAC_SUBFRAME_RESERVED;
5305         }
5306     }
5307
5308     if (pSubframe->subframeType == DRFLAC_SUBFRAME_RESERVED) {
5309         return DRFLAC_FALSE;
5310     }
5311
5312     /* Wasted bits per sample. */
5313     pSubframe->wastedBitsPerSample = 0;
5314     if ((header & 0x01) == 1) {
5315         unsigned int wastedBitsPerSample;
5316         if (!drflac__seek_past_next_set_bit(bs, &wastedBitsPerSample)) {
5317             return DRFLAC_FALSE;
5318         }
5319         pSubframe->wastedBitsPerSample = (drflac_uint8)wastedBitsPerSample + 1;
5320     }
5321
5322     return DRFLAC_TRUE;
5323 }
5324
5325 static drflac_bool32 drflac__decode_subframe(drflac_bs* bs, drflac_frame* frame, int subframeIndex, drflac_int32* pDecodedSamplesOut)
5326 {
5327     drflac_subframe* pSubframe;
5328     drflac_uint32 subframeBitsPerSample;
5329
5330     DRFLAC_ASSERT(bs != NULL);
5331     DRFLAC_ASSERT(frame != NULL);
5332
5333     pSubframe = frame->subframes + subframeIndex;
5334     if (!drflac__read_subframe_header(bs, pSubframe)) {
5335         return DRFLAC_FALSE;
5336     }
5337
5338     /* Side channels require an extra bit per sample. Took a while to figure that one out... */
5339     subframeBitsPerSample = frame->header.bitsPerSample;
5340     if ((frame->header.channelAssignment == DRFLAC_CHANNEL_ASSIGNMENT_LEFT_SIDE || frame->header.channelAssignment == DRFLAC_CHANNEL_ASSIGNMENT_MID_SIDE) && subframeIndex == 1) {
5341         subframeBitsPerSample += 1;
5342     } else if (frame->header.channelAssignment == DRFLAC_CHANNEL_ASSIGNMENT_RIGHT_SIDE && subframeIndex == 0) {
5343         subframeBitsPerSample += 1;
5344     }
5345
5346     /* Need to handle wasted bits per sample. */
5347     if (pSubframe->wastedBitsPerSample >= subframeBitsPerSample) {
5348         return DRFLAC_FALSE;
5349     }
5350     subframeBitsPerSample -= pSubframe->wastedBitsPerSample;
5351
5352     pSubframe->pSamplesS32 = pDecodedSamplesOut;
5353
5354     switch (pSubframe->subframeType)
5355     {
5356         case DRFLAC_SUBFRAME_CONSTANT:
5357         {
5358             drflac__decode_samples__constant(bs, frame->header.blockSizeInPCMFrames, subframeBitsPerSample, pSubframe->pSamplesS32);
5359         } break;
5360
5361         case DRFLAC_SUBFRAME_VERBATIM:
5362         {
5363             drflac__decode_samples__verbatim(bs, frame->header.blockSizeInPCMFrames, subframeBitsPerSample, pSubframe->pSamplesS32);
5364         } break;
5365
5366         case DRFLAC_SUBFRAME_FIXED:
5367         {
5368             drflac__decode_samples__fixed(bs, frame->header.blockSizeInPCMFrames, subframeBitsPerSample, pSubframe->lpcOrder, pSubframe->pSamplesS32);
5369         } break;
5370
5371         case DRFLAC_SUBFRAME_LPC:
5372         {
5373             drflac__decode_samples__lpc(bs, frame->header.blockSizeInPCMFrames, subframeBitsPerSample, pSubframe->lpcOrder, pSubframe->pSamplesS32);
5374         } break;
5375
5376         default: return DRFLAC_FALSE;
5377     }
5378
5379     return DRFLAC_TRUE;
5380 }
5381
5382 static drflac_bool32 drflac__seek_subframe(drflac_bs* bs, drflac_frame* frame, int subframeIndex)
5383 {
5384     drflac_subframe* pSubframe;
5385     drflac_uint32 subframeBitsPerSample;
5386
5387     DRFLAC_ASSERT(bs != NULL);
5388     DRFLAC_ASSERT(frame != NULL);
5389
5390     pSubframe = frame->subframes + subframeIndex;
5391     if (!drflac__read_subframe_header(bs, pSubframe)) {
5392         return DRFLAC_FALSE;
5393     }
5394
5395     /* Side channels require an extra bit per sample. Took a while to figure that one out... */
5396     subframeBitsPerSample = frame->header.bitsPerSample;
5397     if ((frame->header.channelAssignment == DRFLAC_CHANNEL_ASSIGNMENT_LEFT_SIDE || frame->header.channelAssignment == DRFLAC_CHANNEL_ASSIGNMENT_MID_SIDE) && subframeIndex == 1) {
5398         subframeBitsPerSample += 1;
5399     } else if (frame->header.channelAssignment == DRFLAC_CHANNEL_ASSIGNMENT_RIGHT_SIDE && subframeIndex == 0) {
5400         subframeBitsPerSample += 1;
5401     }
5402
5403     /* Need to handle wasted bits per sample. */
5404     if (pSubframe->wastedBitsPerSample >= subframeBitsPerSample) {
5405         return DRFLAC_FALSE;
5406     }
5407     subframeBitsPerSample -= pSubframe->wastedBitsPerSample;
5408
5409     pSubframe->pSamplesS32 = NULL;
5410
5411     switch (pSubframe->subframeType)
5412     {
5413         case DRFLAC_SUBFRAME_CONSTANT:
5414         {
5415             if (!drflac__seek_bits(bs, subframeBitsPerSample)) {
5416                 return DRFLAC_FALSE;
5417             }
5418         } break;
5419
5420         case DRFLAC_SUBFRAME_VERBATIM:
5421         {
5422             unsigned int bitsToSeek = frame->header.blockSizeInPCMFrames * subframeBitsPerSample;
5423             if (!drflac__seek_bits(bs, bitsToSeek)) {
5424                 return DRFLAC_FALSE;
5425             }
5426         } break;
5427
5428         case DRFLAC_SUBFRAME_FIXED:
5429         {
5430             unsigned int bitsToSeek = pSubframe->lpcOrder * subframeBitsPerSample;
5431             if (!drflac__seek_bits(bs, bitsToSeek)) {
5432                 return DRFLAC_FALSE;
5433             }
5434
5435             if (!drflac__read_and_seek_residual(bs, frame->header.blockSizeInPCMFrames, pSubframe->lpcOrder)) {
5436                 return DRFLAC_FALSE;
5437             }
5438         } break;
5439
5440         case DRFLAC_SUBFRAME_LPC:
5441         {
5442             drflac_uint8 lpcPrecision;
5443
5444             unsigned int bitsToSeek = pSubframe->lpcOrder * subframeBitsPerSample;
5445             if (!drflac__seek_bits(bs, bitsToSeek)) {
5446                 return DRFLAC_FALSE;
5447             }
5448
5449             if (!drflac__read_uint8(bs, 4, &lpcPrecision)) {
5450                 return DRFLAC_FALSE;
5451             }
5452             if (lpcPrecision == 15) {
5453                 return DRFLAC_FALSE;    /* Invalid. */
5454             }
5455             lpcPrecision += 1;
5456
5457
5458             bitsToSeek = (pSubframe->lpcOrder * lpcPrecision) + 5;    /* +5 for shift. */
5459             if (!drflac__seek_bits(bs, bitsToSeek)) {
5460                 return DRFLAC_FALSE;
5461             }
5462
5463             if (!drflac__read_and_seek_residual(bs, frame->header.blockSizeInPCMFrames, pSubframe->lpcOrder)) {
5464                 return DRFLAC_FALSE;
5465             }
5466         } break;
5467
5468         default: return DRFLAC_FALSE;
5469     }
5470
5471     return DRFLAC_TRUE;
5472 }
5473
5474
5475 static DRFLAC_INLINE drflac_uint8 drflac__get_channel_count_from_channel_assignment(drflac_int8 channelAssignment)
5476 {
5477     drflac_uint8 lookup[] = {1, 2, 3, 4, 5, 6, 7, 8, 2, 2, 2};
5478
5479     DRFLAC_ASSERT(channelAssignment <= 10);
5480     return lookup[channelAssignment];
5481 }
5482
5483 static drflac_result drflac__decode_flac_frame(drflac* pFlac)
5484 {
5485     int channelCount;
5486     int i;
5487     drflac_uint8 paddingSizeInBits;
5488     drflac_uint16 desiredCRC16;
5489 #ifndef DR_FLAC_NO_CRC
5490     drflac_uint16 actualCRC16;
5491 #endif
5492
5493     /* This function should be called while the stream is sitting on the first byte after the frame header. */
5494     DRFLAC_ZERO_MEMORY(pFlac->currentFLACFrame.subframes, sizeof(pFlac->currentFLACFrame.subframes));
5495
5496     /* The frame block size must never be larger than the maximum block size defined by the FLAC stream. */
5497     if (pFlac->currentFLACFrame.header.blockSizeInPCMFrames > pFlac->maxBlockSizeInPCMFrames) {
5498         return DRFLAC_ERROR;
5499     }
5500
5501     /* The number of channels in the frame must match the channel count from the STREAMINFO block. */
5502     channelCount = drflac__get_channel_count_from_channel_assignment(pFlac->currentFLACFrame.header.channelAssignment);
5503     if (channelCount != (int)pFlac->channels) {
5504         return DRFLAC_ERROR;
5505     }
5506
5507     for (i = 0; i < channelCount; ++i) {
5508         if (!drflac__decode_subframe(&pFlac->bs, &pFlac->currentFLACFrame, i, pFlac->pDecodedSamples + (pFlac->currentFLACFrame.header.blockSizeInPCMFrames * i))) {
5509             return DRFLAC_ERROR;
5510         }
5511     }
5512
5513     paddingSizeInBits = (drflac_uint8)(DRFLAC_CACHE_L1_BITS_REMAINING(&pFlac->bs) & 7);
5514     if (paddingSizeInBits > 0) {
5515         drflac_uint8 padding = 0;
5516         if (!drflac__read_uint8(&pFlac->bs, paddingSizeInBits, &padding)) {
5517             return DRFLAC_AT_END;
5518         }
5519     }
5520
5521 #ifndef DR_FLAC_NO_CRC
5522     actualCRC16 = drflac__flush_crc16(&pFlac->bs);
5523 #endif
5524     if (!drflac__read_uint16(&pFlac->bs, 16, &desiredCRC16)) {
5525         return DRFLAC_AT_END;
5526     }
5527
5528 #ifndef DR_FLAC_NO_CRC
5529     if (actualCRC16 != desiredCRC16) {
5530         return DRFLAC_CRC_MISMATCH;    /* CRC mismatch. */
5531     }
5532 #endif
5533
5534     pFlac->currentFLACFrame.pcmFramesRemaining = pFlac->currentFLACFrame.header.blockSizeInPCMFrames;
5535
5536     return DRFLAC_SUCCESS;
5537 }
5538
5539 static drflac_result drflac__seek_flac_frame(drflac* pFlac)
5540 {
5541     int channelCount;
5542     int i;
5543     drflac_uint16 desiredCRC16;
5544 #ifndef DR_FLAC_NO_CRC
5545     drflac_uint16 actualCRC16;
5546 #endif
5547
5548     channelCount = drflac__get_channel_count_from_channel_assignment(pFlac->currentFLACFrame.header.channelAssignment);
5549     for (i = 0; i < channelCount; ++i) {
5550         if (!drflac__seek_subframe(&pFlac->bs, &pFlac->currentFLACFrame, i)) {
5551             return DRFLAC_ERROR;
5552         }
5553     }
5554
5555     /* Padding. */
5556     if (!drflac__seek_bits(&pFlac->bs, DRFLAC_CACHE_L1_BITS_REMAINING(&pFlac->bs) & 7)) {
5557         return DRFLAC_ERROR;
5558     }
5559
5560     /* CRC. */
5561 #ifndef DR_FLAC_NO_CRC
5562     actualCRC16 = drflac__flush_crc16(&pFlac->bs);
5563 #endif
5564     if (!drflac__read_uint16(&pFlac->bs, 16, &desiredCRC16)) {
5565         return DRFLAC_AT_END;
5566     }
5567
5568 #ifndef DR_FLAC_NO_CRC
5569     if (actualCRC16 != desiredCRC16) {
5570         return DRFLAC_CRC_MISMATCH;    /* CRC mismatch. */
5571     }
5572 #endif
5573
5574     return DRFLAC_SUCCESS;
5575 }
5576
5577 static drflac_bool32 drflac__read_and_decode_next_flac_frame(drflac* pFlac)
5578 {
5579     DRFLAC_ASSERT(pFlac != NULL);
5580
5581     for (;;) {
5582         drflac_result result;
5583
5584         if (!drflac__read_next_flac_frame_header(&pFlac->bs, pFlac->bitsPerSample, &pFlac->currentFLACFrame.header)) {
5585             return DRFLAC_FALSE;
5586         }
5587
5588         result = drflac__decode_flac_frame(pFlac);
5589         if (result != DRFLAC_SUCCESS) {
5590             if (result == DRFLAC_CRC_MISMATCH) {
5591                 continue;   /* CRC mismatch. Skip to the next frame. */
5592             } else {
5593                 return DRFLAC_FALSE;
5594             }
5595         }
5596
5597         return DRFLAC_TRUE;
5598     }
5599 }
5600
5601 static void drflac__get_pcm_frame_range_of_current_flac_frame(drflac* pFlac, drflac_uint64* pFirstPCMFrame, drflac_uint64* pLastPCMFrame)
5602 {
5603     drflac_uint64 firstPCMFrame;
5604     drflac_uint64 lastPCMFrame;
5605
5606     DRFLAC_ASSERT(pFlac != NULL);
5607
5608     firstPCMFrame = pFlac->currentFLACFrame.header.pcmFrameNumber;
5609     if (firstPCMFrame == 0) {
5610         firstPCMFrame = ((drflac_uint64)pFlac->currentFLACFrame.header.flacFrameNumber) * pFlac->maxBlockSizeInPCMFrames;
5611     }
5612
5613     lastPCMFrame = firstPCMFrame + pFlac->currentFLACFrame.header.blockSizeInPCMFrames;
5614     if (lastPCMFrame > 0) {
5615         lastPCMFrame -= 1; /* Needs to be zero based. */
5616     }
5617
5618     if (pFirstPCMFrame) {
5619         *pFirstPCMFrame = firstPCMFrame;
5620     }
5621     if (pLastPCMFrame) {
5622         *pLastPCMFrame = lastPCMFrame;
5623     }
5624 }
5625
5626 static drflac_bool32 drflac__seek_to_first_frame(drflac* pFlac)
5627 {
5628     drflac_bool32 result;
5629
5630     DRFLAC_ASSERT(pFlac != NULL);
5631
5632     result = drflac__seek_to_byte(&pFlac->bs, pFlac->firstFLACFramePosInBytes);
5633
5634     DRFLAC_ZERO_MEMORY(&pFlac->currentFLACFrame, sizeof(pFlac->currentFLACFrame));
5635     pFlac->currentPCMFrame = 0;
5636
5637     return result;
5638 }
5639
5640 static DRFLAC_INLINE drflac_result drflac__seek_to_next_flac_frame(drflac* pFlac)
5641 {
5642     /* This function should only ever be called while the decoder is sitting on the first byte past the FRAME_HEADER section. */
5643     DRFLAC_ASSERT(pFlac != NULL);
5644     return drflac__seek_flac_frame(pFlac);
5645 }
5646
5647
5648 static drflac_uint64 drflac__seek_forward_by_pcm_frames(drflac* pFlac, drflac_uint64 pcmFramesToSeek)
5649 {
5650     drflac_uint64 pcmFramesRead = 0;
5651     while (pcmFramesToSeek > 0) {
5652         if (pFlac->currentFLACFrame.pcmFramesRemaining == 0) {
5653             if (!drflac__read_and_decode_next_flac_frame(pFlac)) {
5654                 break;  /* Couldn't read the next frame, so just break from the loop and return. */
5655             }
5656         } else {
5657             if (pFlac->currentFLACFrame.pcmFramesRemaining > pcmFramesToSeek) {
5658                 pcmFramesRead   += pcmFramesToSeek;
5659                 pFlac->currentFLACFrame.pcmFramesRemaining -= (drflac_uint32)pcmFramesToSeek;   /* <-- Safe cast. Will always be < currentFrame.pcmFramesRemaining < 65536. */
5660                 pcmFramesToSeek  = 0;
5661             } else {
5662                 pcmFramesRead   += pFlac->currentFLACFrame.pcmFramesRemaining;
5663                 pcmFramesToSeek -= pFlac->currentFLACFrame.pcmFramesRemaining;
5664                 pFlac->currentFLACFrame.pcmFramesRemaining = 0;
5665             }
5666         }
5667     }
5668
5669     pFlac->currentPCMFrame += pcmFramesRead;
5670     return pcmFramesRead;
5671 }
5672
5673
5674 static drflac_bool32 drflac__seek_to_pcm_frame__brute_force(drflac* pFlac, drflac_uint64 pcmFrameIndex)
5675 {
5676     drflac_bool32 isMidFrame = DRFLAC_FALSE;
5677     drflac_uint64 runningPCMFrameCount;
5678
5679     DRFLAC_ASSERT(pFlac != NULL);
5680
5681     /* If we are seeking forward we start from the current position. Otherwise we need to start all the way from the start of the file. */
5682     if (pcmFrameIndex >= pFlac->currentPCMFrame) {
5683         /* Seeking forward. Need to seek from the current position. */
5684         runningPCMFrameCount = pFlac->currentPCMFrame;
5685
5686         /* The frame header for the first frame may not yet have been read. We need to do that if necessary. */
5687         if (pFlac->currentPCMFrame == 0 && pFlac->currentFLACFrame.pcmFramesRemaining == 0) {
5688             if (!drflac__read_next_flac_frame_header(&pFlac->bs, pFlac->bitsPerSample, &pFlac->currentFLACFrame.header)) {
5689                 return DRFLAC_FALSE;
5690             }
5691         } else {
5692             isMidFrame = DRFLAC_TRUE;
5693         }
5694     } else {
5695         /* Seeking backwards. Need to seek from the start of the file. */
5696         runningPCMFrameCount = 0;
5697
5698         /* Move back to the start. */
5699         if (!drflac__seek_to_first_frame(pFlac)) {
5700             return DRFLAC_FALSE;
5701         }
5702
5703         /* Decode the first frame in preparation for sample-exact seeking below. */
5704         if (!drflac__read_next_flac_frame_header(&pFlac->bs, pFlac->bitsPerSample, &pFlac->currentFLACFrame.header)) {
5705             return DRFLAC_FALSE;
5706         }
5707     }
5708
5709     /*
5710     We need to as quickly as possible find the frame that contains the target sample. To do this, we iterate over each frame and inspect its
5711     header. If based on the header we can determine that the frame contains the sample, we do a full decode of that frame.
5712     */
5713     for (;;) {
5714         drflac_uint64 pcmFrameCountInThisFLACFrame;
5715         drflac_uint64 firstPCMFrameInFLACFrame = 0;
5716         drflac_uint64 lastPCMFrameInFLACFrame = 0;
5717
5718         drflac__get_pcm_frame_range_of_current_flac_frame(pFlac, &firstPCMFrameInFLACFrame, &lastPCMFrameInFLACFrame);
5719
5720         pcmFrameCountInThisFLACFrame = (lastPCMFrameInFLACFrame - firstPCMFrameInFLACFrame) + 1;
5721         if (pcmFrameIndex < (runningPCMFrameCount + pcmFrameCountInThisFLACFrame)) {
5722             /*
5723             The sample should be in this frame. We need to fully decode it, however if it's an invalid frame (a CRC mismatch), we need to pretend
5724             it never existed and keep iterating.
5725             */
5726             drflac_uint64 pcmFramesToDecode = pcmFrameIndex - runningPCMFrameCount;
5727
5728             if (!isMidFrame) {
5729                 drflac_result result = drflac__decode_flac_frame(pFlac);
5730                 if (result == DRFLAC_SUCCESS) {
5731                     /* The frame is valid. We just need to skip over some samples to ensure it's sample-exact. */
5732                     return drflac__seek_forward_by_pcm_frames(pFlac, pcmFramesToDecode) == pcmFramesToDecode;  /* <-- If this fails, something bad has happened (it should never fail). */
5733                 } else {
5734                     if (result == DRFLAC_CRC_MISMATCH) {
5735                         goto next_iteration;   /* CRC mismatch. Pretend this frame never existed. */
5736                     } else {
5737                         return DRFLAC_FALSE;
5738                     }
5739                 }
5740             } else {
5741                 /* We started seeking mid-frame which means we need to skip the frame decoding part. */
5742                 return drflac__seek_forward_by_pcm_frames(pFlac, pcmFramesToDecode) == pcmFramesToDecode;
5743             }
5744         } else {
5745             /*
5746             It's not in this frame. We need to seek past the frame, but check if there was a CRC mismatch. If so, we pretend this
5747             frame never existed and leave the running sample count untouched.
5748             */
5749             if (!isMidFrame) {
5750                 drflac_result result = drflac__seek_to_next_flac_frame(pFlac);
5751                 if (result == DRFLAC_SUCCESS) {
5752                     runningPCMFrameCount += pcmFrameCountInThisFLACFrame;
5753                 } else {
5754                     if (result == DRFLAC_CRC_MISMATCH) {
5755                         goto next_iteration;   /* CRC mismatch. Pretend this frame never existed. */
5756                     } else {
5757                         return DRFLAC_FALSE;
5758                     }
5759                 }
5760             } else {
5761                 /*
5762                 We started seeking mid-frame which means we need to seek by reading to the end of the frame instead of with
5763                 drflac__seek_to_next_flac_frame() which only works if the decoder is sitting on the byte just after the frame header.
5764                 */
5765                 runningPCMFrameCount += pFlac->currentFLACFrame.pcmFramesRemaining;
5766                 pFlac->currentFLACFrame.pcmFramesRemaining = 0;
5767                 isMidFrame = DRFLAC_FALSE;
5768             }
5769
5770             /* If we are seeking to the end of the file and we've just hit it, we're done. */
5771             if (pcmFrameIndex == pFlac->totalPCMFrameCount && runningPCMFrameCount == pFlac->totalPCMFrameCount) {
5772                 return DRFLAC_TRUE;
5773             }
5774         }
5775
5776     next_iteration:
5777         /* Grab the next frame in preparation for the next iteration. */
5778         if (!drflac__read_next_flac_frame_header(&pFlac->bs, pFlac->bitsPerSample, &pFlac->currentFLACFrame.header)) {
5779             return DRFLAC_FALSE;
5780         }
5781     }
5782 }
5783
5784
5785 #if !defined(DR_FLAC_NO_CRC)
/*
Assumed ratio of compressed size to uncompressed size, used to pick the first byte offset to probe when binary searching for a PCM frame.
FLAC files are generally about 50%-70% the size of their uncompressed counterparts, so the middle of that range (0.6) is used as the
starting guess.
*/
#define DRFLAC_BINARY_SEARCH_APPROX_COMPRESSION_RATIO 0.6f
5792
5793 static drflac_bool32 drflac__seek_to_approximate_flac_frame_to_byte(drflac* pFlac, drflac_uint64 targetByte, drflac_uint64 rangeLo, drflac_uint64 rangeHi, drflac_uint64* pLastSuccessfulSeekOffset)
5794 {
5795     DRFLAC_ASSERT(pFlac != NULL);
5796     DRFLAC_ASSERT(pLastSuccessfulSeekOffset != NULL);
5797     DRFLAC_ASSERT(targetByte >= rangeLo);
5798     DRFLAC_ASSERT(targetByte <= rangeHi);
5799
5800     *pLastSuccessfulSeekOffset = pFlac->firstFLACFramePosInBytes;
5801
5802     for (;;) {
5803         /* After rangeLo == rangeHi == targetByte fails, we need to break out. */
5804         drflac_uint64 lastTargetByte = targetByte;
5805
5806         /* When seeking to a byte, failure probably means we've attempted to seek beyond the end of the stream. To counter this we just halve it each attempt. */
5807         if (!drflac__seek_to_byte(&pFlac->bs, targetByte)) {
5808             /* If we couldn't even seek to the first byte in the stream we have a problem. Just abandon the whole thing. */
5809             if (targetByte == 0) {
5810                 drflac__seek_to_first_frame(pFlac); /* Try to recover. */
5811                 return DRFLAC_FALSE;
5812             }
5813
5814             /* Halve the byte location and continue. */
5815             targetByte = rangeLo + ((rangeHi - rangeLo)/2);
5816             rangeHi = targetByte;
5817         } else {
5818             /* Getting here should mean that we have seeked to an appropriate byte. */
5819
5820             /* Clear the details of the FLAC frame so we don't misreport data. */
5821             DRFLAC_ZERO_MEMORY(&pFlac->currentFLACFrame, sizeof(pFlac->currentFLACFrame));
5822
5823             /*
5824             Now seek to the next FLAC frame. We need to decode the entire frame (not just the header) because it's possible for the header to incorrectly pass the
5825             CRC check and return bad data. We need to decode the entire frame to be more certain. Although this seems unlikely, this has happened to me in testing
5826             so it needs to stay this way for now.
5827             */
5828 #if 1
5829             if (!drflac__read_and_decode_next_flac_frame(pFlac)) {
5830                 /* Halve the byte location and continue. */
5831                 targetByte = rangeLo + ((rangeHi - rangeLo)/2);
5832                 rangeHi = targetByte;
5833             } else {
5834                 break;
5835             }
5836 #else
5837             if (!drflac__read_next_flac_frame_header(&pFlac->bs, pFlac->bitsPerSample, &pFlac->currentFLACFrame.header)) {
5838                 /* Halve the byte location and continue. */
5839                 targetByte = rangeLo + ((rangeHi - rangeLo)/2);
5840                 rangeHi = targetByte;
5841             } else {
5842                 break;
5843             }
5844 #endif
5845         }
5846
5847         /* We already tried this byte and there are no more to try, break out. */
5848         if(targetByte == lastTargetByte) {
5849             return DRFLAC_FALSE;
5850         }
5851     }
5852
5853     /* The current PCM frame needs to be updated based on the frame we just seeked to. */
5854     drflac__get_pcm_frame_range_of_current_flac_frame(pFlac, &pFlac->currentPCMFrame, NULL);
5855
5856     DRFLAC_ASSERT(targetByte <= rangeHi);
5857
5858     *pLastSuccessfulSeekOffset = targetByte;
5859     return DRFLAC_TRUE;
5860 }
5861
5862 static drflac_bool32 drflac__decode_flac_frame_and_seek_forward_by_pcm_frames(drflac* pFlac, drflac_uint64 offset)
5863 {
5864     /* This section of code would be used if we were only decoding the FLAC frame header when calling drflac__seek_to_approximate_flac_frame_to_byte(). */
5865 #if 0
5866     if (drflac__decode_flac_frame(pFlac) != DRFLAC_SUCCESS) {
5867         /* We failed to decode this frame which may be due to it being corrupt. We'll just use the next valid FLAC frame. */
5868         if (drflac__read_and_decode_next_flac_frame(pFlac) == DRFLAC_FALSE) {
5869             return DRFLAC_FALSE;
5870         }
5871     }
5872 #endif
5873
5874     return drflac__seek_forward_by_pcm_frames(pFlac, offset) == offset;
5875 }
5876
5877
5878 static drflac_bool32 drflac__seek_to_pcm_frame__binary_search_internal(drflac* pFlac, drflac_uint64 pcmFrameIndex, drflac_uint64 byteRangeLo, drflac_uint64 byteRangeHi)
5879 {
5880     /* This assumes pFlac->currentPCMFrame is sitting on byteRangeLo upon entry. */
5881
5882     drflac_uint64 targetByte;
5883     drflac_uint64 pcmRangeLo = pFlac->totalPCMFrameCount;
5884     drflac_uint64 pcmRangeHi = 0;
5885     drflac_uint64 lastSuccessfulSeekOffset = (drflac_uint64)-1;
5886     drflac_uint64 closestSeekOffsetBeforeTargetPCMFrame = byteRangeLo;
5887     drflac_uint32 seekForwardThreshold = (pFlac->maxBlockSizeInPCMFrames != 0) ? pFlac->maxBlockSizeInPCMFrames*2 : 4096;
5888
5889     targetByte = byteRangeLo + (drflac_uint64)(((drflac_int64)((pcmFrameIndex - pFlac->currentPCMFrame) * pFlac->channels * pFlac->bitsPerSample)/8.0f) * DRFLAC_BINARY_SEARCH_APPROX_COMPRESSION_RATIO);
5890     if (targetByte > byteRangeHi) {
5891         targetByte = byteRangeHi;
5892     }
5893
5894     for (;;) {
5895         if (drflac__seek_to_approximate_flac_frame_to_byte(pFlac, targetByte, byteRangeLo, byteRangeHi, &lastSuccessfulSeekOffset)) {
5896             /* We found a FLAC frame. We need to check if it contains the sample we're looking for. */
5897             drflac_uint64 newPCMRangeLo;
5898             drflac_uint64 newPCMRangeHi;
5899             drflac__get_pcm_frame_range_of_current_flac_frame(pFlac, &newPCMRangeLo, &newPCMRangeHi);
5900
5901             /* If we selected the same frame, it means we should be pretty close. Just decode the rest. */
5902             if (pcmRangeLo == newPCMRangeLo) {
5903                 if (!drflac__seek_to_approximate_flac_frame_to_byte(pFlac, closestSeekOffsetBeforeTargetPCMFrame, closestSeekOffsetBeforeTargetPCMFrame, byteRangeHi, &lastSuccessfulSeekOffset)) {
5904                     break;  /* Failed to seek to closest frame. */
5905                 }
5906
5907                 if (drflac__decode_flac_frame_and_seek_forward_by_pcm_frames(pFlac, pcmFrameIndex - pFlac->currentPCMFrame)) {
5908                     return DRFLAC_TRUE;
5909                 } else {
5910                     break;  /* Failed to seek forward. */
5911                 }
5912             }
5913
5914             pcmRangeLo = newPCMRangeLo;
5915             pcmRangeHi = newPCMRangeHi;
5916
5917             if (pcmRangeLo <= pcmFrameIndex && pcmRangeHi >= pcmFrameIndex) {
5918                 /* The target PCM frame is in this FLAC frame. */
5919                 if (drflac__decode_flac_frame_and_seek_forward_by_pcm_frames(pFlac, pcmFrameIndex - pFlac->currentPCMFrame) ) {
5920                     return DRFLAC_TRUE;
5921                 } else {
5922                     break;  /* Failed to seek to FLAC frame. */
5923                 }
5924             } else {
5925                 const float approxCompressionRatio = (drflac_int64)(lastSuccessfulSeekOffset - pFlac->firstFLACFramePosInBytes) / ((drflac_int64)(pcmRangeLo * pFlac->channels * pFlac->bitsPerSample)/8.0f);
5926
5927                 if (pcmRangeLo > pcmFrameIndex) {
5928                     /* We seeked too far forward. We need to move our target byte backward and try again. */
5929                     byteRangeHi = lastSuccessfulSeekOffset;
5930                     if (byteRangeLo > byteRangeHi) {
5931                         byteRangeLo = byteRangeHi;
5932                     }
5933
5934                     targetByte = byteRangeLo + ((byteRangeHi - byteRangeLo) / 2);
5935                     if (targetByte < byteRangeLo) {
5936                         targetByte = byteRangeLo;
5937                     }
5938                 } else /*if (pcmRangeHi < pcmFrameIndex)*/ {
5939                     /* We didn't seek far enough. We need to move our target byte forward and try again. */
5940
5941                     /* If we're close enough we can just seek forward. */
5942                     if ((pcmFrameIndex - pcmRangeLo) < seekForwardThreshold) {
5943                         if (drflac__decode_flac_frame_and_seek_forward_by_pcm_frames(pFlac, pcmFrameIndex - pFlac->currentPCMFrame)) {
5944                             return DRFLAC_TRUE;
5945                         } else {
5946                             break;  /* Failed to seek to FLAC frame. */
5947                         }
5948                     } else {
5949                         byteRangeLo = lastSuccessfulSeekOffset;
5950                         if (byteRangeHi < byteRangeLo) {
5951                             byteRangeHi = byteRangeLo;
5952                         }
5953
5954                         targetByte = lastSuccessfulSeekOffset + (drflac_uint64)(((drflac_int64)((pcmFrameIndex-pcmRangeLo) * pFlac->channels * pFlac->bitsPerSample)/8.0f) * approxCompressionRatio);
5955                         if (targetByte > byteRangeHi) {
5956                             targetByte = byteRangeHi;
5957                         }
5958
5959                         if (closestSeekOffsetBeforeTargetPCMFrame < lastSuccessfulSeekOffset) {
5960                             closestSeekOffsetBeforeTargetPCMFrame = lastSuccessfulSeekOffset;
5961                         }
5962                     }
5963                 }
5964             }
5965         } else {
5966             /* Getting here is really bad. We just recover as best we can, but moving to the first frame in the stream, and then abort. */
5967             break;
5968         }
5969     }
5970
5971     drflac__seek_to_first_frame(pFlac); /* <-- Try to recover. */
5972     return DRFLAC_FALSE;
5973 }
5974
5975 static drflac_bool32 drflac__seek_to_pcm_frame__binary_search(drflac* pFlac, drflac_uint64 pcmFrameIndex)
5976 {
5977     drflac_uint64 byteRangeLo;
5978     drflac_uint64 byteRangeHi;
5979     drflac_uint32 seekForwardThreshold = (pFlac->maxBlockSizeInPCMFrames != 0) ? pFlac->maxBlockSizeInPCMFrames*2 : 4096;
5980
5981     /* Our algorithm currently assumes the FLAC stream is currently sitting at the start. */
5982     if (drflac__seek_to_first_frame(pFlac) == DRFLAC_FALSE) {
5983         return DRFLAC_FALSE;
5984     }
5985
5986     /* If we're close enough to the start, just move to the start and seek forward. */
5987     if (pcmFrameIndex < seekForwardThreshold) {
5988         return drflac__seek_forward_by_pcm_frames(pFlac, pcmFrameIndex) == pcmFrameIndex;
5989     }
5990
5991     /*
5992     Our starting byte range is the byte position of the first FLAC frame and the approximate end of the file as if it were completely uncompressed. This ensures
5993     the entire file is included, even though most of the time it'll exceed the end of the actual stream. This is OK as the frame searching logic will handle it.
5994     */
5995     byteRangeLo = pFlac->firstFLACFramePosInBytes;
5996     byteRangeHi = pFlac->firstFLACFramePosInBytes + (drflac_uint64)((drflac_int64)(pFlac->totalPCMFrameCount * pFlac->channels * pFlac->bitsPerSample)/8.0f);
5997
5998     return drflac__seek_to_pcm_frame__binary_search_internal(pFlac, pcmFrameIndex, byteRangeLo, byteRangeHi);
5999 }
6000 #endif  /* !DR_FLAC_NO_CRC */
6001
6002 static drflac_bool32 drflac__seek_to_pcm_frame__seek_table(drflac* pFlac, drflac_uint64 pcmFrameIndex)
6003 {
6004     drflac_uint32 iClosestSeekpoint = 0;
6005     drflac_bool32 isMidFrame = DRFLAC_FALSE;
6006     drflac_uint64 runningPCMFrameCount;
6007     drflac_uint32 iSeekpoint;
6008
6009
6010     DRFLAC_ASSERT(pFlac != NULL);
6011
6012     if (pFlac->pSeekpoints == NULL || pFlac->seekpointCount == 0) {
6013         return DRFLAC_FALSE;
6014     }
6015
6016     for (iSeekpoint = 0; iSeekpoint < pFlac->seekpointCount; ++iSeekpoint) {
6017         if (pFlac->pSeekpoints[iSeekpoint].firstPCMFrame >= pcmFrameIndex) {
6018             break;
6019         }
6020
6021         iClosestSeekpoint = iSeekpoint;
6022     }
6023
6024     /* There's been cases where the seek table contains only zeros. We need to do some basic validation on the closest seekpoint. */
6025     if (pFlac->pSeekpoints[iClosestSeekpoint].pcmFrameCount == 0 || pFlac->pSeekpoints[iClosestSeekpoint].pcmFrameCount > pFlac->maxBlockSizeInPCMFrames) {
6026         return DRFLAC_FALSE;
6027     }
6028     if (pFlac->pSeekpoints[iClosestSeekpoint].firstPCMFrame > pFlac->totalPCMFrameCount && pFlac->totalPCMFrameCount > 0) {
6029         return DRFLAC_FALSE;
6030     }
6031
6032 #if !defined(DR_FLAC_NO_CRC)
6033     /* At this point we should know the closest seek point. We can use a binary search for this. We need to know the total sample count for this. */
6034     if (pFlac->totalPCMFrameCount > 0) {
6035         drflac_uint64 byteRangeLo;
6036         drflac_uint64 byteRangeHi;
6037
6038         byteRangeHi = pFlac->firstFLACFramePosInBytes + (drflac_uint64)((drflac_int64)(pFlac->totalPCMFrameCount * pFlac->channels * pFlac->bitsPerSample)/8.0f);
6039         byteRangeLo = pFlac->firstFLACFramePosInBytes + pFlac->pSeekpoints[iClosestSeekpoint].flacFrameOffset;
6040
6041         /*
6042         If our closest seek point is not the last one, we only need to search between it and the next one. The section below calculates an appropriate starting
6043         value for byteRangeHi which will clamp it appropriately.
6044
6045         Note that the next seekpoint must have an offset greater than the closest seekpoint because otherwise our binary search algorithm will break down. There
6046         have been cases where a seektable consists of seek points where every byte offset is set to 0 which causes problems. If this happens we need to abort.
6047         */
6048         if (iClosestSeekpoint < pFlac->seekpointCount-1) {
6049             drflac_uint32 iNextSeekpoint = iClosestSeekpoint + 1;
6050
6051             /* Basic validation on the seekpoints to ensure they're usable. */
6052             if (pFlac->pSeekpoints[iClosestSeekpoint].flacFrameOffset >= pFlac->pSeekpoints[iNextSeekpoint].flacFrameOffset || pFlac->pSeekpoints[iNextSeekpoint].pcmFrameCount == 0) {
6053                 return DRFLAC_FALSE;    /* The next seekpoint doesn't look right. The seek table cannot be trusted from here. Abort. */
6054             }
6055
6056             if (pFlac->pSeekpoints[iNextSeekpoint].firstPCMFrame != (((drflac_uint64)0xFFFFFFFF << 32) | 0xFFFFFFFF)) { /* Make sure it's not a placeholder seekpoint. */
6057                 byteRangeHi = pFlac->firstFLACFramePosInBytes + pFlac->pSeekpoints[iNextSeekpoint].flacFrameOffset - 1; /* byteRangeHi must be zero based. */
6058             }
6059         }
6060
6061         if (drflac__seek_to_byte(&pFlac->bs, pFlac->firstFLACFramePosInBytes + pFlac->pSeekpoints[iClosestSeekpoint].flacFrameOffset)) {
6062             if (drflac__read_next_flac_frame_header(&pFlac->bs, pFlac->bitsPerSample, &pFlac->currentFLACFrame.header)) {
6063                 drflac__get_pcm_frame_range_of_current_flac_frame(pFlac, &pFlac->currentPCMFrame, NULL);
6064
6065                 if (drflac__seek_to_pcm_frame__binary_search_internal(pFlac, pcmFrameIndex, byteRangeLo, byteRangeHi)) {
6066                     return DRFLAC_TRUE;
6067                 }
6068             }
6069         }
6070     }
6071 #endif  /* !DR_FLAC_NO_CRC */
6072
6073     /* Getting here means we need to use a slower algorithm because the binary search method failed or cannot be used. */
6074
6075     /*
6076     If we are seeking forward and the closest seekpoint is _before_ the current sample, we just seek forward from where we are. Otherwise we start seeking
6077     from the seekpoint's first sample.
6078     */
6079     if (pcmFrameIndex >= pFlac->currentPCMFrame && pFlac->pSeekpoints[iClosestSeekpoint].firstPCMFrame <= pFlac->currentPCMFrame) {
6080         /* Optimized case. Just seek forward from where we are. */
6081         runningPCMFrameCount = pFlac->currentPCMFrame;
6082
6083         /* The frame header for the first frame may not yet have been read. We need to do that if necessary. */
6084         if (pFlac->currentPCMFrame == 0 && pFlac->currentFLACFrame.pcmFramesRemaining == 0) {
6085             if (!drflac__read_next_flac_frame_header(&pFlac->bs, pFlac->bitsPerSample, &pFlac->currentFLACFrame.header)) {
6086                 return DRFLAC_FALSE;
6087             }
6088         } else {
6089             isMidFrame = DRFLAC_TRUE;
6090         }
6091     } else {
6092         /* Slower case. Seek to the start of the seekpoint and then seek forward from there. */
6093         runningPCMFrameCount = pFlac->pSeekpoints[iClosestSeekpoint].firstPCMFrame;
6094
6095         if (!drflac__seek_to_byte(&pFlac->bs, pFlac->firstFLACFramePosInBytes + pFlac->pSeekpoints[iClosestSeekpoint].flacFrameOffset)) {
6096             return DRFLAC_FALSE;
6097         }
6098
6099         /* Grab the frame the seekpoint is sitting on in preparation for the sample-exact seeking below. */
6100         if (!drflac__read_next_flac_frame_header(&pFlac->bs, pFlac->bitsPerSample, &pFlac->currentFLACFrame.header)) {
6101             return DRFLAC_FALSE;
6102         }
6103     }
6104
6105     for (;;) {
6106         drflac_uint64 pcmFrameCountInThisFLACFrame;
6107         drflac_uint64 firstPCMFrameInFLACFrame = 0;
6108         drflac_uint64 lastPCMFrameInFLACFrame = 0;
6109
6110         drflac__get_pcm_frame_range_of_current_flac_frame(pFlac, &firstPCMFrameInFLACFrame, &lastPCMFrameInFLACFrame);
6111
6112         pcmFrameCountInThisFLACFrame = (lastPCMFrameInFLACFrame - firstPCMFrameInFLACFrame) + 1;
6113         if (pcmFrameIndex < (runningPCMFrameCount + pcmFrameCountInThisFLACFrame)) {
6114             /*
6115             The sample should be in this frame. We need to fully decode it, but if it's an invalid frame (a CRC mismatch) we need to pretend
6116             it never existed and keep iterating.
6117             */
6118             drflac_uint64 pcmFramesToDecode = pcmFrameIndex - runningPCMFrameCount;
6119
6120             if (!isMidFrame) {
6121                 drflac_result result = drflac__decode_flac_frame(pFlac);
6122                 if (result == DRFLAC_SUCCESS) {
6123                     /* The frame is valid. We just need to skip over some samples to ensure it's sample-exact. */
6124                     return drflac__seek_forward_by_pcm_frames(pFlac, pcmFramesToDecode) == pcmFramesToDecode;  /* <-- If this fails, something bad has happened (it should never fail). */
6125                 } else {
6126                     if (result == DRFLAC_CRC_MISMATCH) {
6127                         goto next_iteration;   /* CRC mismatch. Pretend this frame never existed. */
6128                     } else {
6129                         return DRFLAC_FALSE;
6130                     }
6131                 }
6132             } else {
6133                 /* We started seeking mid-frame which means we need to skip the frame decoding part. */
6134                 return drflac__seek_forward_by_pcm_frames(pFlac, pcmFramesToDecode) == pcmFramesToDecode;
6135             }
6136         } else {
6137             /*
6138             It's not in this frame. We need to seek past the frame, but check if there was a CRC mismatch. If so, we pretend this
6139             frame never existed and leave the running sample count untouched.
6140             */
6141             if (!isMidFrame) {
6142                 drflac_result result = drflac__seek_to_next_flac_frame(pFlac);
6143                 if (result == DRFLAC_SUCCESS) {
6144                     runningPCMFrameCount += pcmFrameCountInThisFLACFrame;
6145                 } else {
6146                     if (result == DRFLAC_CRC_MISMATCH) {
6147                         goto next_iteration;   /* CRC mismatch. Pretend this frame never existed. */
6148                     } else {
6149                         return DRFLAC_FALSE;
6150                     }
6151                 }
6152             } else {
6153                 /*
6154                 We started seeking mid-frame which means we need to seek by reading to the end of the frame instead of with
6155                 drflac__seek_to_next_flac_frame() which only works if the decoder is sitting on the byte just after the frame header.
6156                 */
6157                 runningPCMFrameCount += pFlac->currentFLACFrame.pcmFramesRemaining;
6158                 pFlac->currentFLACFrame.pcmFramesRemaining = 0;
6159                 isMidFrame = DRFLAC_FALSE;
6160             }
6161
6162             /* If we are seeking to the end of the file and we've just hit it, we're done. */
6163             if (pcmFrameIndex == pFlac->totalPCMFrameCount && runningPCMFrameCount == pFlac->totalPCMFrameCount) {
6164                 return DRFLAC_TRUE;
6165             }
6166         }
6167
6168     next_iteration:
6169         /* Grab the next frame in preparation for the next iteration. */
6170         if (!drflac__read_next_flac_frame_header(&pFlac->bs, pFlac->bitsPerSample, &pFlac->currentFLACFrame.header)) {
6171             return DRFLAC_FALSE;
6172         }
6173     }
6174 }
6175
6176
6177 #ifndef DR_FLAC_NO_OGG
6178 typedef struct
6179 {
6180     drflac_uint8 capturePattern[4];  /* Should be "OggS" */
6181     drflac_uint8 structureVersion;   /* Always 0. */
6182     drflac_uint8 headerType;
6183     drflac_uint64 granulePosition;
6184     drflac_uint32 serialNumber;
6185     drflac_uint32 sequenceNumber;
6186     drflac_uint32 checksum;
6187     drflac_uint8 segmentCount;
6188     drflac_uint8 segmentTable[255];
6189 } drflac_ogg_page_header;
6190 #endif
6191
6192 typedef struct
6193 {
6194     drflac_read_proc onRead;
6195     drflac_seek_proc onSeek;
6196     drflac_meta_proc onMeta;
6197     drflac_container container;
6198     void* pUserData;
6199     void* pUserDataMD;
6200     drflac_uint32 sampleRate;
6201     drflac_uint8  channels;
6202     drflac_uint8  bitsPerSample;
6203     drflac_uint64 totalPCMFrameCount;
6204     drflac_uint16 maxBlockSizeInPCMFrames;
6205     drflac_uint64 runningFilePos;
6206     drflac_bool32 hasStreamInfoBlock;
6207     drflac_bool32 hasMetadataBlocks;
6208     drflac_bs bs;                           /* <-- A bit streamer is required for loading data during initialization. */
6209     drflac_frame_header firstFrameHeader;   /* <-- The header of the first frame that was read during relaxed initalization. Only set if there is no STREAMINFO block. */
6210
6211 #ifndef DR_FLAC_NO_OGG
6212     drflac_uint32 oggSerial;
6213     drflac_uint64 oggFirstBytePos;
6214     drflac_ogg_page_header oggBosHeader;
6215 #endif
6216 } drflac_init_info;
6217
6218 static DRFLAC_INLINE void drflac__decode_block_header(drflac_uint32 blockHeader, drflac_uint8* isLastBlock, drflac_uint8* blockType, drflac_uint32* blockSize)
6219 {
6220     blockHeader = drflac__be2host_32(blockHeader);
6221     *isLastBlock = (drflac_uint8)((blockHeader & 0x80000000UL) >> 31);
6222     *blockType   = (drflac_uint8)((blockHeader & 0x7F000000UL) >> 24);
6223     *blockSize   =                (blockHeader & 0x00FFFFFFUL);
6224 }
6225
6226 static DRFLAC_INLINE drflac_bool32 drflac__read_and_decode_block_header(drflac_read_proc onRead, void* pUserData, drflac_uint8* isLastBlock, drflac_uint8* blockType, drflac_uint32* blockSize)
6227 {
6228     drflac_uint32 blockHeader;
6229
6230     *blockSize = 0;
6231     if (onRead(pUserData, &blockHeader, 4) != 4) {
6232         return DRFLAC_FALSE;
6233     }
6234
6235     drflac__decode_block_header(blockHeader, isLastBlock, blockType, blockSize);
6236     return DRFLAC_TRUE;
6237 }
6238
6239 static drflac_bool32 drflac__read_streaminfo(drflac_read_proc onRead, void* pUserData, drflac_streaminfo* pStreamInfo)
6240 {
6241     drflac_uint32 blockSizes;
6242     drflac_uint64 frameSizes = 0;
6243     drflac_uint64 importantProps;
6244     drflac_uint8 md5[16];
6245
6246     /* min/max block size. */
6247     if (onRead(pUserData, &blockSizes, 4) != 4) {
6248         return DRFLAC_FALSE;
6249     }
6250
6251     /* min/max frame size. */
6252     if (onRead(pUserData, &frameSizes, 6) != 6) {
6253         return DRFLAC_FALSE;
6254     }
6255
6256     /* Sample rate, channels, bits per sample and total sample count. */
6257     if (onRead(pUserData, &importantProps, 8) != 8) {
6258         return DRFLAC_FALSE;
6259     }
6260
6261     /* MD5 */
6262     if (onRead(pUserData, md5, sizeof(md5)) != sizeof(md5)) {
6263         return DRFLAC_FALSE;
6264     }
6265
6266     blockSizes     = drflac__be2host_32(blockSizes);
6267     frameSizes     = drflac__be2host_64(frameSizes);
6268     importantProps = drflac__be2host_64(importantProps);
6269
6270     pStreamInfo->minBlockSizeInPCMFrames = (drflac_uint16)((blockSizes & 0xFFFF0000) >> 16);
6271     pStreamInfo->maxBlockSizeInPCMFrames = (drflac_uint16) (blockSizes & 0x0000FFFF);
6272     pStreamInfo->minFrameSizeInPCMFrames = (drflac_uint32)((frameSizes     &  (((drflac_uint64)0x00FFFFFF << 16) << 24)) >> 40);
6273     pStreamInfo->maxFrameSizeInPCMFrames = (drflac_uint32)((frameSizes     &  (((drflac_uint64)0x00FFFFFF << 16) <<  0)) >> 16);
6274     pStreamInfo->sampleRate              = (drflac_uint32)((importantProps &  (((drflac_uint64)0x000FFFFF << 16) << 28)) >> 44);
6275     pStreamInfo->channels                = (drflac_uint8 )((importantProps &  (((drflac_uint64)0x0000000E << 16) << 24)) >> 41) + 1;
6276     pStreamInfo->bitsPerSample           = (drflac_uint8 )((importantProps &  (((drflac_uint64)0x0000001F << 16) << 20)) >> 36) + 1;
6277     pStreamInfo->totalPCMFrameCount      =                ((importantProps & ((((drflac_uint64)0x0000000F << 16) << 16) | 0xFFFFFFFF)));
6278     DRFLAC_COPY_MEMORY(pStreamInfo->md5, md5, sizeof(md5));
6279
6280     return DRFLAC_TRUE;
6281 }
6282
6283
/* Default malloc callback. Forwards to DRFLAC_MALLOC and ignores the user data pointer. */
static void* drflac__malloc_default(size_t sz, void* pUserData)
{
    void* pBlock = DRFLAC_MALLOC(sz);

    (void)pUserData;    /* Unused. */
    return pBlock;
}
6289
/* Default realloc callback. Forwards to DRFLAC_REALLOC and ignores the user data pointer. */
static void* drflac__realloc_default(void* p, size_t sz, void* pUserData)
{
    void* pBlock = DRFLAC_REALLOC(p, sz);

    (void)pUserData;    /* Unused. */
    return pBlock;
}
6295
/* Default free callback. Forwards to DRFLAC_FREE and ignores the user data pointer. */
static void drflac__free_default(void* p, void* pUserData)
{
    DRFLAC_FREE(p);
    (void)pUserData;    /* Unused. */
}
6301
6302
6303 static void* drflac__malloc_from_callbacks(size_t sz, const drflac_allocation_callbacks* pAllocationCallbacks)
6304 {
6305     if (pAllocationCallbacks == NULL) {
6306         return NULL;
6307     }
6308
6309     if (pAllocationCallbacks->onMalloc != NULL) {
6310         return pAllocationCallbacks->onMalloc(sz, pAllocationCallbacks->pUserData);
6311     }
6312
6313     /* Try using realloc(). */
6314     if (pAllocationCallbacks->onRealloc != NULL) {
6315         return pAllocationCallbacks->onRealloc(NULL, sz, pAllocationCallbacks->pUserData);
6316     }
6317
6318     return NULL;
6319 }
6320
6321 static void* drflac__realloc_from_callbacks(void* p, size_t szNew, size_t szOld, const drflac_allocation_callbacks* pAllocationCallbacks)
6322 {
6323     if (pAllocationCallbacks == NULL) {
6324         return NULL;
6325     }
6326
6327     if (pAllocationCallbacks->onRealloc != NULL) {
6328         return pAllocationCallbacks->onRealloc(p, szNew, pAllocationCallbacks->pUserData);
6329     }
6330
6331     /* Try emulating realloc() in terms of malloc()/free(). */
6332     if (pAllocationCallbacks->onMalloc != NULL && pAllocationCallbacks->onFree != NULL) {
6333         void* p2;
6334
6335         p2 = pAllocationCallbacks->onMalloc(szNew, pAllocationCallbacks->pUserData);
6336         if (p2 == NULL) {
6337             return NULL;
6338         }
6339
6340         if (p != NULL) {
6341             DRFLAC_COPY_MEMORY(p2, p, szOld);
6342             pAllocationCallbacks->onFree(p, pAllocationCallbacks->pUserData);
6343         }
6344
6345         return p2;
6346     }
6347
6348     return NULL;
6349 }
6350
6351 static void drflac__free_from_callbacks(void* p, const drflac_allocation_callbacks* pAllocationCallbacks)
6352 {
6353     if (p == NULL || pAllocationCallbacks == NULL) {
6354         return;
6355     }
6356
6357     if (pAllocationCallbacks->onFree != NULL) {
6358         pAllocationCallbacks->onFree(p, pAllocationCallbacks->pUserData);
6359     }
6360 }
6361
6362
6363 static drflac_bool32 drflac__read_and_decode_metadata(drflac_read_proc onRead, drflac_seek_proc onSeek, drflac_meta_proc onMeta, void* pUserData, void* pUserDataMD, drflac_uint64* pFirstFramePos, drflac_uint64* pSeektablePos, drflac_uint32* pSeektableSize, drflac_allocation_callbacks* pAllocationCallbacks)
6364 {
6365     /*
6366     We want to keep track of the byte position in the stream of the seektable. At the time of calling this function we know that
6367     we'll be sitting on byte 42.
6368     */
6369     drflac_uint64 runningFilePos = 42;
6370     drflac_uint64 seektablePos   = 0;
6371     drflac_uint32 seektableSize  = 0;
6372
6373     for (;;) {
6374         drflac_metadata metadata;
6375         drflac_uint8 isLastBlock = 0;
6376         drflac_uint8 blockType;
6377         drflac_uint32 blockSize;
6378         if (drflac__read_and_decode_block_header(onRead, pUserData, &isLastBlock, &blockType, &blockSize) == DRFLAC_FALSE) {
6379             return DRFLAC_FALSE;
6380         }
6381         runningFilePos += 4;
6382
6383         metadata.type = blockType;
6384         metadata.pRawData = NULL;
6385         metadata.rawDataSize = 0;
6386
6387         switch (blockType)
6388         {
6389             case DRFLAC_METADATA_BLOCK_TYPE_APPLICATION:
6390             {
6391                 if (blockSize < 4) {
6392                     return DRFLAC_FALSE;
6393                 }
6394
6395                 if (onMeta) {
6396                     void* pRawData = drflac__malloc_from_callbacks(blockSize, pAllocationCallbacks);
6397                     if (pRawData == NULL) {
6398                         return DRFLAC_FALSE;
6399                     }
6400
6401                     if (onRead(pUserData, pRawData, blockSize) != blockSize) {
6402                         drflac__free_from_callbacks(pRawData, pAllocationCallbacks);
6403                         return DRFLAC_FALSE;
6404                     }
6405
6406                     metadata.pRawData = pRawData;
6407                     metadata.rawDataSize = blockSize;
6408                     metadata.data.application.id       = drflac__be2host_32(*(drflac_uint32*)pRawData);
6409                     metadata.data.application.pData    = (const void*)((drflac_uint8*)pRawData + sizeof(drflac_uint32));
6410                     metadata.data.application.dataSize = blockSize - sizeof(drflac_uint32);
6411                     onMeta(pUserDataMD, &metadata);
6412
6413                     drflac__free_from_callbacks(pRawData, pAllocationCallbacks);
6414                 }
6415             } break;
6416
6417             case DRFLAC_METADATA_BLOCK_TYPE_SEEKTABLE:
6418             {
6419                 seektablePos  = runningFilePos;
6420                 seektableSize = blockSize;
6421
6422                 if (onMeta) {
6423                     drflac_uint32 iSeekpoint;
6424                     void* pRawData;
6425
6426                     pRawData = drflac__malloc_from_callbacks(blockSize, pAllocationCallbacks);
6427                     if (pRawData == NULL) {
6428                         return DRFLAC_FALSE;
6429                     }
6430
6431                     if (onRead(pUserData, pRawData, blockSize) != blockSize) {
6432                         drflac__free_from_callbacks(pRawData, pAllocationCallbacks);
6433                         return DRFLAC_FALSE;
6434                     }
6435
6436                     metadata.pRawData = pRawData;
6437                     metadata.rawDataSize = blockSize;
6438                     metadata.data.seektable.seekpointCount = blockSize/sizeof(drflac_seekpoint);
6439                     metadata.data.seektable.pSeekpoints = (const drflac_seekpoint*)pRawData;
6440
6441                     /* Endian swap. */
6442                     for (iSeekpoint = 0; iSeekpoint < metadata.data.seektable.seekpointCount; ++iSeekpoint) {
6443                         drflac_seekpoint* pSeekpoint = (drflac_seekpoint*)pRawData + iSeekpoint;
6444                         pSeekpoint->firstPCMFrame   = drflac__be2host_64(pSeekpoint->firstPCMFrame);
6445                         pSeekpoint->flacFrameOffset = drflac__be2host_64(pSeekpoint->flacFrameOffset);
6446                         pSeekpoint->pcmFrameCount   = drflac__be2host_16(pSeekpoint->pcmFrameCount);
6447                     }
6448
6449                     onMeta(pUserDataMD, &metadata);
6450
6451                     drflac__free_from_callbacks(pRawData, pAllocationCallbacks);
6452                 }
6453             } break;
6454
6455             case DRFLAC_METADATA_BLOCK_TYPE_VORBIS_COMMENT:
6456             {
6457                 if (blockSize < 8) {
6458                     return DRFLAC_FALSE;
6459                 }
6460
6461                 if (onMeta) {
6462                     void* pRawData;
6463                     const char* pRunningData;
6464                     const char* pRunningDataEnd;
6465                     drflac_uint32 i;
6466
6467                     pRawData = drflac__malloc_from_callbacks(blockSize, pAllocationCallbacks);
6468                     if (pRawData == NULL) {
6469                         return DRFLAC_FALSE;
6470                     }
6471
6472                     if (onRead(pUserData, pRawData, blockSize) != blockSize) {
6473                         drflac__free_from_callbacks(pRawData, pAllocationCallbacks);
6474                         return DRFLAC_FALSE;
6475                     }
6476
6477                     metadata.pRawData = pRawData;
6478                     metadata.rawDataSize = blockSize;
6479
6480                     pRunningData    = (const char*)pRawData;
6481                     pRunningDataEnd = (const char*)pRawData + blockSize;
6482
6483                     metadata.data.vorbis_comment.vendorLength = drflac__le2host_32(*(const drflac_uint32*)pRunningData); pRunningData += 4;
6484
6485                     /* Need space for the rest of the block */
6486                     if ((pRunningDataEnd - pRunningData) - 4 < (drflac_int64)metadata.data.vorbis_comment.vendorLength) { /* <-- Note the order of operations to avoid overflow to a valid value */
6487                         drflac__free_from_callbacks(pRawData, pAllocationCallbacks);
6488                         return DRFLAC_FALSE;
6489                     }
6490                     metadata.data.vorbis_comment.vendor       = pRunningData;                                            pRunningData += metadata.data.vorbis_comment.vendorLength;
6491                     metadata.data.vorbis_comment.commentCount = drflac__le2host_32(*(const drflac_uint32*)pRunningData); pRunningData += 4;
6492
6493                     /* Need space for 'commentCount' comments after the block, which at minimum is a drflac_uint32 per comment */
6494                     if ((pRunningDataEnd - pRunningData) / sizeof(drflac_uint32) < metadata.data.vorbis_comment.commentCount) { /* <-- Note the order of operations to avoid overflow to a valid value */
6495                         drflac__free_from_callbacks(pRawData, pAllocationCallbacks);
6496                         return DRFLAC_FALSE;
6497                     }
6498                     metadata.data.vorbis_comment.pComments    = pRunningData;
6499
6500                     /* Check that the comments section is valid before passing it to the callback */
6501                     for (i = 0; i < metadata.data.vorbis_comment.commentCount; ++i) {
6502                         drflac_uint32 commentLength;
6503
6504                         if (pRunningDataEnd - pRunningData < 4) {
6505                             drflac__free_from_callbacks(pRawData, pAllocationCallbacks);
6506                             return DRFLAC_FALSE;
6507                         }
6508
6509                         commentLength = drflac__le2host_32(*(const drflac_uint32*)pRunningData); pRunningData += 4;
6510                         if (pRunningDataEnd - pRunningData < (drflac_int64)commentLength) { /* <-- Note the order of operations to avoid overflow to a valid value */
6511                             drflac__free_from_callbacks(pRawData, pAllocationCallbacks);
6512                             return DRFLAC_FALSE;
6513                         }
6514                         pRunningData += commentLength;
6515                     }
6516
6517                     onMeta(pUserDataMD, &metadata);
6518
6519                     drflac__free_from_callbacks(pRawData, pAllocationCallbacks);
6520                 }
6521             } break;
6522
6523             case DRFLAC_METADATA_BLOCK_TYPE_CUESHEET:
6524             {
6525                 if (blockSize < 396) {
6526                     return DRFLAC_FALSE;
6527                 }
6528
6529                 if (onMeta) {
6530                     void* pRawData;
6531                     const char* pRunningData;
6532                     const char* pRunningDataEnd;
6533                     drflac_uint8 iTrack;
6534                     drflac_uint8 iIndex;
6535
6536                     pRawData = drflac__malloc_from_callbacks(blockSize, pAllocationCallbacks);
6537                     if (pRawData == NULL) {
6538                         return DRFLAC_FALSE;
6539                     }
6540
6541                     if (onRead(pUserData, pRawData, blockSize) != blockSize) {
6542                         drflac__free_from_callbacks(pRawData, pAllocationCallbacks);
6543                         return DRFLAC_FALSE;
6544                     }
6545
6546                     metadata.pRawData = pRawData;
6547                     metadata.rawDataSize = blockSize;
6548
6549                     pRunningData    = (const char*)pRawData;
6550                     pRunningDataEnd = (const char*)pRawData + blockSize;
6551
6552                     DRFLAC_COPY_MEMORY(metadata.data.cuesheet.catalog, pRunningData, 128);                              pRunningData += 128;
6553                     metadata.data.cuesheet.leadInSampleCount = drflac__be2host_64(*(const drflac_uint64*)pRunningData); pRunningData += 8;
6554                     metadata.data.cuesheet.isCD              = (pRunningData[0] & 0x80) != 0;                           pRunningData += 259;
6555                     metadata.data.cuesheet.trackCount        = pRunningData[0];                                         pRunningData += 1;
6556                     metadata.data.cuesheet.pTrackData        = pRunningData;
6557
6558                     /* Check that the cuesheet tracks are valid before passing it to the callback */
6559                     for (iTrack = 0; iTrack < metadata.data.cuesheet.trackCount; ++iTrack) {
6560                         drflac_uint8 indexCount;
6561                         drflac_uint32 indexPointSize;
6562
6563                         if (pRunningDataEnd - pRunningData < 36) {
6564                             drflac__free_from_callbacks(pRawData, pAllocationCallbacks);
6565                             return DRFLAC_FALSE;
6566                         }
6567
6568                         /* Skip to the index point count */
6569                         pRunningData += 35;
6570                         indexCount = pRunningData[0]; pRunningData += 1;
6571                         indexPointSize = indexCount * sizeof(drflac_cuesheet_track_index);
6572                         if (pRunningDataEnd - pRunningData < (drflac_int64)indexPointSize) {
6573                             drflac__free_from_callbacks(pRawData, pAllocationCallbacks);
6574                             return DRFLAC_FALSE;
6575                         }
6576
6577                         /* Endian swap. */
6578                         for (iIndex = 0; iIndex < indexCount; ++iIndex) {
6579                             drflac_cuesheet_track_index* pTrack = (drflac_cuesheet_track_index*)pRunningData;
6580                             pRunningData += sizeof(drflac_cuesheet_track_index);
6581                             pTrack->offset = drflac__be2host_64(pTrack->offset);
6582                         }
6583                     }
6584
6585                     onMeta(pUserDataMD, &metadata);
6586
6587                     drflac__free_from_callbacks(pRawData, pAllocationCallbacks);
6588                 }
6589             } break;
6590
6591             case DRFLAC_METADATA_BLOCK_TYPE_PICTURE:
6592             {
6593                 if (blockSize < 32) {
6594                     return DRFLAC_FALSE;
6595                 }
6596
6597                 if (onMeta) {
6598                     void* pRawData;
6599                     const char* pRunningData;
6600                     const char* pRunningDataEnd;
6601
6602                     pRawData = drflac__malloc_from_callbacks(blockSize, pAllocationCallbacks);
6603                     if (pRawData == NULL) {
6604                         return DRFLAC_FALSE;
6605                     }
6606
6607                     if (onRead(pUserData, pRawData, blockSize) != blockSize) {
6608                         drflac__free_from_callbacks(pRawData, pAllocationCallbacks);
6609                         return DRFLAC_FALSE;
6610                     }
6611
6612                     metadata.pRawData = pRawData;
6613                     metadata.rawDataSize = blockSize;
6614
6615                     pRunningData    = (const char*)pRawData;
6616                     pRunningDataEnd = (const char*)pRawData + blockSize;
6617
6618                     metadata.data.picture.type       = drflac__be2host_32(*(const drflac_uint32*)pRunningData); pRunningData += 4;
6619                     metadata.data.picture.mimeLength = drflac__be2host_32(*(const drflac_uint32*)pRunningData); pRunningData += 4;
6620
6621                     /* Need space for the rest of the block */
6622                     if ((pRunningDataEnd - pRunningData) - 24 < (drflac_int64)metadata.data.picture.mimeLength) { /* <-- Note the order of operations to avoid overflow to a valid value */
6623                         drflac__free_from_callbacks(pRawData, pAllocationCallbacks);
6624                         return DRFLAC_FALSE;
6625                     }
6626                     metadata.data.picture.mime              = pRunningData;                                            pRunningData += metadata.data.picture.mimeLength;
6627                     metadata.data.picture.descriptionLength = drflac__be2host_32(*(const drflac_uint32*)pRunningData); pRunningData += 4;
6628
6629                     /* Need space for the rest of the block */
6630                     if ((pRunningDataEnd - pRunningData) - 20 < (drflac_int64)metadata.data.picture.descriptionLength) { /* <-- Note the order of operations to avoid overflow to a valid value */
6631                         drflac__free_from_callbacks(pRawData, pAllocationCallbacks);
6632                         return DRFLAC_FALSE;
6633                     }
6634                     metadata.data.picture.description     = pRunningData;                                            pRunningData += metadata.data.picture.descriptionLength;
6635                     metadata.data.picture.width           = drflac__be2host_32(*(const drflac_uint32*)pRunningData); pRunningData += 4;
6636                     metadata.data.picture.height          = drflac__be2host_32(*(const drflac_uint32*)pRunningData); pRunningData += 4;
6637                     metadata.data.picture.colorDepth      = drflac__be2host_32(*(const drflac_uint32*)pRunningData); pRunningData += 4;
6638                     metadata.data.picture.indexColorCount = drflac__be2host_32(*(const drflac_uint32*)pRunningData); pRunningData += 4;
6639                     metadata.data.picture.pictureDataSize = drflac__be2host_32(*(const drflac_uint32*)pRunningData); pRunningData += 4;
6640                     metadata.data.picture.pPictureData    = (const drflac_uint8*)pRunningData;
6641
6642                     /* Need space for the picture after the block */
6643                     if (pRunningDataEnd - pRunningData < (drflac_int64)metadata.data.picture.pictureDataSize) { /* <-- Note the order of operations to avoid overflow to a valid value */
6644                         drflac__free_from_callbacks(pRawData, pAllocationCallbacks);
6645                         return DRFLAC_FALSE;
6646                     }
6647
6648                     onMeta(pUserDataMD, &metadata);
6649
6650                     drflac__free_from_callbacks(pRawData, pAllocationCallbacks);
6651                 }
6652             } break;
6653
6654             case DRFLAC_METADATA_BLOCK_TYPE_PADDING:
6655             {
6656                 if (onMeta) {
6657                     metadata.data.padding.unused = 0;
6658
6659                     /* Padding doesn't have anything meaningful in it, so just skip over it, but make sure the caller is aware of it by firing the callback. */
6660                     if (!onSeek(pUserData, blockSize, drflac_seek_origin_current)) {
6661                         isLastBlock = DRFLAC_TRUE;  /* An error occurred while seeking. Attempt to recover by treating this as the last block which will in turn terminate the loop. */
6662                     } else {
6663                         onMeta(pUserDataMD, &metadata);
6664                     }
6665                 }
6666             } break;
6667
6668             case DRFLAC_METADATA_BLOCK_TYPE_INVALID:
6669             {
6670                 /* Invalid chunk. Just skip over this one. */
6671                 if (onMeta) {
6672                     if (!onSeek(pUserData, blockSize, drflac_seek_origin_current)) {
6673                         isLastBlock = DRFLAC_TRUE;  /* An error occurred while seeking. Attempt to recover by treating this as the last block which will in turn terminate the loop. */
6674                     }
6675                 }
6676             } break;
6677
6678             default:
6679             {
6680                 /*
6681                 It's an unknown chunk, but not necessarily invalid. There's a chance more metadata blocks might be defined later on, so we
6682                 can at the very least report the chunk to the application and let it look at the raw data.
6683                 */
6684                 if (onMeta) {
6685                     void* pRawData = drflac__malloc_from_callbacks(blockSize, pAllocationCallbacks);
6686                     if (pRawData == NULL) {
6687                         return DRFLAC_FALSE;
6688                     }
6689
6690                     if (onRead(pUserData, pRawData, blockSize) != blockSize) {
6691                         drflac__free_from_callbacks(pRawData, pAllocationCallbacks);
6692                         return DRFLAC_FALSE;
6693                     }
6694
6695                     metadata.pRawData = pRawData;
6696                     metadata.rawDataSize = blockSize;
6697                     onMeta(pUserDataMD, &metadata);
6698
6699                     drflac__free_from_callbacks(pRawData, pAllocationCallbacks);
6700                 }
6701             } break;
6702         }
6703
6704         /* If we're not handling metadata, just skip over the block. If we are, it will have been handled earlier in the switch statement above. */
6705         if (onMeta == NULL && blockSize > 0) {
6706             if (!onSeek(pUserData, blockSize, drflac_seek_origin_current)) {
6707                 isLastBlock = DRFLAC_TRUE;
6708             }
6709         }
6710
6711         runningFilePos += blockSize;
6712         if (isLastBlock) {
6713             break;
6714         }
6715     }
6716
6717     *pSeektablePos = seektablePos;
6718     *pSeektableSize = seektableSize;
6719     *pFirstFramePos = runningFilePos;
6720
6721     return DRFLAC_TRUE;
6722 }
6723
6724 static drflac_bool32 drflac__init_private__native(drflac_init_info* pInit, drflac_read_proc onRead, drflac_seek_proc onSeek, drflac_meta_proc onMeta, void* pUserData, void* pUserDataMD, drflac_bool32 relaxed)
6725 {
6726     /* Pre Condition: The bit stream should be sitting just past the 4-byte id header. */
6727
6728     drflac_uint8 isLastBlock;
6729     drflac_uint8 blockType;
6730     drflac_uint32 blockSize;
6731
6732     (void)onSeek;
6733
6734     pInit->container = drflac_container_native;
6735
6736     /* The first metadata block should be the STREAMINFO block. */
6737     if (!drflac__read_and_decode_block_header(onRead, pUserData, &isLastBlock, &blockType, &blockSize)) {
6738         return DRFLAC_FALSE;
6739     }
6740
6741     if (blockType != DRFLAC_METADATA_BLOCK_TYPE_STREAMINFO || blockSize != 34) {
6742         if (!relaxed) {
6743             /* We're opening in strict mode and the first block is not the STREAMINFO block. Error. */
6744             return DRFLAC_FALSE;
6745         } else {
6746             /*
6747             Relaxed mode. To open from here we need to just find the first frame and set the sample rate, etc. to whatever is defined
6748             for that frame.
6749             */
6750             pInit->hasStreamInfoBlock = DRFLAC_FALSE;
6751             pInit->hasMetadataBlocks  = DRFLAC_FALSE;
6752
6753             if (!drflac__read_next_flac_frame_header(&pInit->bs, 0, &pInit->firstFrameHeader)) {
6754                 return DRFLAC_FALSE;    /* Couldn't find a frame. */
6755             }
6756
6757             if (pInit->firstFrameHeader.bitsPerSample == 0) {
6758                 return DRFLAC_FALSE;    /* Failed to initialize because the first frame depends on the STREAMINFO block, which does not exist. */
6759             }
6760
6761             pInit->sampleRate              = pInit->firstFrameHeader.sampleRate;
6762             pInit->channels                = drflac__get_channel_count_from_channel_assignment(pInit->firstFrameHeader.channelAssignment);
6763             pInit->bitsPerSample           = pInit->firstFrameHeader.bitsPerSample;
6764             pInit->maxBlockSizeInPCMFrames = 65535;   /* <-- See notes here: https://xiph.org/flac/format.html#metadata_block_streaminfo */
6765             return DRFLAC_TRUE;
6766         }
6767     } else {
6768         drflac_streaminfo streaminfo;
6769         if (!drflac__read_streaminfo(onRead, pUserData, &streaminfo)) {
6770             return DRFLAC_FALSE;
6771         }
6772
6773         pInit->hasStreamInfoBlock      = DRFLAC_TRUE;
6774         pInit->sampleRate              = streaminfo.sampleRate;
6775         pInit->channels                = streaminfo.channels;
6776         pInit->bitsPerSample           = streaminfo.bitsPerSample;
6777         pInit->totalPCMFrameCount      = streaminfo.totalPCMFrameCount;
6778         pInit->maxBlockSizeInPCMFrames = streaminfo.maxBlockSizeInPCMFrames;    /* Don't care about the min block size - only the max (used for determining the size of the memory allocation). */
6779         pInit->hasMetadataBlocks       = !isLastBlock;
6780
6781         if (onMeta) {
6782             drflac_metadata metadata;
6783             metadata.type = DRFLAC_METADATA_BLOCK_TYPE_STREAMINFO;
6784             metadata.pRawData = NULL;
6785             metadata.rawDataSize = 0;
6786             metadata.data.streaminfo = streaminfo;
6787             onMeta(pUserDataMD, &metadata);
6788         }
6789
6790         return DRFLAC_TRUE;
6791     }
6792 }
6793
6794 #ifndef DR_FLAC_NO_OGG
/*
Capacity, in bytes, of the drflac_oggbs::pageData buffer. Note that 65307 == 27 + 255 + (255 * 255), which matches a 27-byte
fixed page header (see drflac_ogg__get_page_header_size()), plus a segment table of at most 255 one-byte entries, plus at
most 255 segments of at most 255 bytes each.
*/
6795 #define DRFLAC_OGG_MAX_PAGE_SIZE            65307
/* Initial value of the running page CRC once the 4-byte capture pattern has been consumed; see drflac_ogg__read_page_header(). */
6796 #define DRFLAC_OGG_CAPTURE_PATTERN_CRC32    1605413199  /* CRC-32 of "OggS". */
6797
/*
Selects how a page CRC mismatch is handled. It is the type of the 'recoveryMethod' parameter of drflac_oggbs__goto_next_page().
NOTE(review): the exact behaviour of each value is implemented in that function, outside this excerpt - the per-value notes
below are inferred from the names and should be confirmed there.
*/
6798 typedef enum
6799 {
6800     drflac_ogg_recover_on_crc_mismatch,     /* Presumably: try to carry on after a mismatch. */
6801     drflac_ogg_fail_on_crc_mismatch         /* Presumably: report failure on a mismatch. */
6802 } drflac_ogg_crc_mismatch_recovery;
6803
6804 #ifndef DR_FLAC_NO_CRC
/*
256-entry lookup table used by drflac_crc32_byte(), which processes one byte at a time, most significant bit first: the
table is indexed by the top byte of the running CRC XORed with the incoming byte, and the selected entry is XORed into the
CRC after it has been shifted left by 8 bits.

Entry 1 is 0x04C11DB7, which for a table of this form is the generator polynomial.
*/
6805 static drflac_uint32 drflac__crc32_table[] = {
6806     0x00000000L, 0x04C11DB7L, 0x09823B6EL, 0x0D4326D9L,
6807     0x130476DCL, 0x17C56B6BL, 0x1A864DB2L, 0x1E475005L,
6808     0x2608EDB8L, 0x22C9F00FL, 0x2F8AD6D6L, 0x2B4BCB61L,
6809     0x350C9B64L, 0x31CD86D3L, 0x3C8EA00AL, 0x384FBDBDL,
6810     0x4C11DB70L, 0x48D0C6C7L, 0x4593E01EL, 0x4152FDA9L,
6811     0x5F15ADACL, 0x5BD4B01BL, 0x569796C2L, 0x52568B75L,
6812     0x6A1936C8L, 0x6ED82B7FL, 0x639B0DA6L, 0x675A1011L,
6813     0x791D4014L, 0x7DDC5DA3L, 0x709F7B7AL, 0x745E66CDL,
6814     0x9823B6E0L, 0x9CE2AB57L, 0x91A18D8EL, 0x95609039L,
6815     0x8B27C03CL, 0x8FE6DD8BL, 0x82A5FB52L, 0x8664E6E5L,
6816     0xBE2B5B58L, 0xBAEA46EFL, 0xB7A96036L, 0xB3687D81L,
6817     0xAD2F2D84L, 0xA9EE3033L, 0xA4AD16EAL, 0xA06C0B5DL,
6818     0xD4326D90L, 0xD0F37027L, 0xDDB056FEL, 0xD9714B49L,
6819     0xC7361B4CL, 0xC3F706FBL, 0xCEB42022L, 0xCA753D95L,
6820     0xF23A8028L, 0xF6FB9D9FL, 0xFBB8BB46L, 0xFF79A6F1L,
6821     0xE13EF6F4L, 0xE5FFEB43L, 0xE8BCCD9AL, 0xEC7DD02DL,
6822     0x34867077L, 0x30476DC0L, 0x3D044B19L, 0x39C556AEL,
6823     0x278206ABL, 0x23431B1CL, 0x2E003DC5L, 0x2AC12072L,
6824     0x128E9DCFL, 0x164F8078L, 0x1B0CA6A1L, 0x1FCDBB16L,
6825     0x018AEB13L, 0x054BF6A4L, 0x0808D07DL, 0x0CC9CDCAL,
6826     0x7897AB07L, 0x7C56B6B0L, 0x71159069L, 0x75D48DDEL,
6827     0x6B93DDDBL, 0x6F52C06CL, 0x6211E6B5L, 0x66D0FB02L,
6828     0x5E9F46BFL, 0x5A5E5B08L, 0x571D7DD1L, 0x53DC6066L,
6829     0x4D9B3063L, 0x495A2DD4L, 0x44190B0DL, 0x40D816BAL,
6830     0xACA5C697L, 0xA864DB20L, 0xA527FDF9L, 0xA1E6E04EL,
6831     0xBFA1B04BL, 0xBB60ADFCL, 0xB6238B25L, 0xB2E29692L,
6832     0x8AAD2B2FL, 0x8E6C3698L, 0x832F1041L, 0x87EE0DF6L,
6833     0x99A95DF3L, 0x9D684044L, 0x902B669DL, 0x94EA7B2AL,
6834     0xE0B41DE7L, 0xE4750050L, 0xE9362689L, 0xEDF73B3EL,
6835     0xF3B06B3BL, 0xF771768CL, 0xFA325055L, 0xFEF34DE2L,
6836     0xC6BCF05FL, 0xC27DEDE8L, 0xCF3ECB31L, 0xCBFFD686L,
6837     0xD5B88683L, 0xD1799B34L, 0xDC3ABDEDL, 0xD8FBA05AL,
6838     0x690CE0EEL, 0x6DCDFD59L, 0x608EDB80L, 0x644FC637L,
6839     0x7A089632L, 0x7EC98B85L, 0x738AAD5CL, 0x774BB0EBL,
6840     0x4F040D56L, 0x4BC510E1L, 0x46863638L, 0x42472B8FL,
6841     0x5C007B8AL, 0x58C1663DL, 0x558240E4L, 0x51435D53L,
6842     0x251D3B9EL, 0x21DC2629L, 0x2C9F00F0L, 0x285E1D47L,
6843     0x36194D42L, 0x32D850F5L, 0x3F9B762CL, 0x3B5A6B9BL,
6844     0x0315D626L, 0x07D4CB91L, 0x0A97ED48L, 0x0E56F0FFL,
6845     0x1011A0FAL, 0x14D0BD4DL, 0x19939B94L, 0x1D528623L,
6846     0xF12F560EL, 0xF5EE4BB9L, 0xF8AD6D60L, 0xFC6C70D7L,
6847     0xE22B20D2L, 0xE6EA3D65L, 0xEBA91BBCL, 0xEF68060BL,
6848     0xD727BBB6L, 0xD3E6A601L, 0xDEA580D8L, 0xDA649D6FL,
6849     0xC423CD6AL, 0xC0E2D0DDL, 0xCDA1F604L, 0xC960EBB3L,
6850     0xBD3E8D7EL, 0xB9FF90C9L, 0xB4BCB610L, 0xB07DABA7L,
6851     0xAE3AFBA2L, 0xAAFBE615L, 0xA7B8C0CCL, 0xA379DD7BL,
6852     0x9B3660C6L, 0x9FF77D71L, 0x92B45BA8L, 0x9675461FL,
6853     0x8832161AL, 0x8CF30BADL, 0x81B02D74L, 0x857130C3L,
6854     0x5D8A9099L, 0x594B8D2EL, 0x5408ABF7L, 0x50C9B640L,
6855     0x4E8EE645L, 0x4A4FFBF2L, 0x470CDD2BL, 0x43CDC09CL,
6856     0x7B827D21L, 0x7F436096L, 0x7200464FL, 0x76C15BF8L,
6857     0x68860BFDL, 0x6C47164AL, 0x61043093L, 0x65C52D24L,
6858     0x119B4BE9L, 0x155A565EL, 0x18197087L, 0x1CD86D30L,
6859     0x029F3D35L, 0x065E2082L, 0x0B1D065BL, 0x0FDC1BECL,
6860     0x3793A651L, 0x3352BBE6L, 0x3E119D3FL, 0x3AD08088L,
6861     0x2497D08DL, 0x2056CD3AL, 0x2D15EBE3L, 0x29D4F654L,
6862     0xC5A92679L, 0xC1683BCEL, 0xCC2B1D17L, 0xC8EA00A0L,
6863     0xD6AD50A5L, 0xD26C4D12L, 0xDF2F6BCBL, 0xDBEE767CL,
6864     0xE3A1CBC1L, 0xE760D676L, 0xEA23F0AFL, 0xEEE2ED18L,
6865     0xF0A5BD1DL, 0xF464A0AAL, 0xF9278673L, 0xFDE69BC4L,
6866     0x89B8FD09L, 0x8D79E0BEL, 0x803AC667L, 0x84FBDBD0L,
6867     0x9ABC8BD5L, 0x9E7D9662L, 0x933EB0BBL, 0x97FFAD0CL,
6868     0xAFB010B1L, 0xAB710D06L, 0xA6322BDFL, 0xA2F33668L,
6869     0xBCB4666DL, 0xB8757BDAL, 0xB5365D03L, 0xB1F740B4L
6870 };
6871 #endif
6872
6873 static DRFLAC_INLINE drflac_uint32 drflac_crc32_byte(drflac_uint32 crc32, drflac_uint8 data)
6874 {
6875 #ifndef DR_FLAC_NO_CRC
6876     return (crc32 << 8) ^ drflac__crc32_table[(drflac_uint8)((crc32 >> 24) & 0xFF) ^ data];
6877 #else
6878     (void)data;
6879     return crc32;
6880 #endif
6881 }
6882
#if 0
/* Compiled out. Feeds a 32-bit value into the running CRC-32, most significant byte first. */
static DRFLAC_INLINE drflac_uint32 drflac_crc32_uint32(drflac_uint32 crc32, drflac_uint32 data)
{
    int shift;

    for (shift = 24; shift >= 0; shift -= 8) {
        crc32 = drflac_crc32_byte(crc32, (drflac_uint8)((data >> shift) & 0xFF));
    }

    return crc32;
}

/* Compiled out. Feeds a 64-bit value into the running CRC-32, upper 32 bits first. */
static DRFLAC_INLINE drflac_uint32 drflac_crc32_uint64(drflac_uint32 crc32, drflac_uint64 data)
{
    drflac_uint32 upperHalf = (drflac_uint32)((data >> 32) & 0xFFFFFFFF);
    drflac_uint32 lowerHalf = (drflac_uint32)((data >>  0) & 0xFFFFFFFF);

    crc32 = drflac_crc32_uint32(crc32, upperHalf);
    crc32 = drflac_crc32_uint32(crc32, lowerHalf);

    return crc32;
}
#endif
6900
6901 static DRFLAC_INLINE drflac_uint32 drflac_crc32_buffer(drflac_uint32 crc32, drflac_uint8* pData, drflac_uint32 dataSize)
6902 {
6903     /* This can be optimized. */
6904     drflac_uint32 i;
6905     for (i = 0; i < dataSize; ++i) {
6906         crc32 = drflac_crc32_byte(crc32, pData[i]);
6907     }
6908     return crc32;
6909 }
6910
6911
6912 static DRFLAC_INLINE drflac_bool32 drflac_ogg__is_capture_pattern(drflac_uint8 pattern[4])
6913 {
6914     return pattern[0] == 'O' && pattern[1] == 'g' && pattern[2] == 'g' && pattern[3] == 'S';
6915 }
6916
6917 static DRFLAC_INLINE drflac_uint32 drflac_ogg__get_page_header_size(drflac_ogg_page_header* pHeader)
6918 {
6919     return 27 + pHeader->segmentCount;
6920 }
6921
6922 static DRFLAC_INLINE drflac_uint32 drflac_ogg__get_page_body_size(drflac_ogg_page_header* pHeader)
6923 {
6924     drflac_uint32 pageBodySize = 0;
6925     int i;
6926
6927     for (i = 0; i < pHeader->segmentCount; ++i) {
6928         pageBodySize += pHeader->segmentTable[i];
6929     }
6930
6931     return pageBodySize;
6932 }
6933
6934 static drflac_result drflac_ogg__read_page_header_after_capture_pattern(drflac_read_proc onRead, void* pUserData, drflac_ogg_page_header* pHeader, drflac_uint32* pBytesRead, drflac_uint32* pCRC32)
6935 {
6936     drflac_uint8 data[23];
6937     drflac_uint32 i;
6938
6939     DRFLAC_ASSERT(*pCRC32 == DRFLAC_OGG_CAPTURE_PATTERN_CRC32);
6940
6941     if (onRead(pUserData, data, 23) != 23) {
6942         return DRFLAC_AT_END;
6943     }
6944     *pBytesRead += 23;
6945
6946     /*
6947     It's not actually used, but set the capture pattern to 'OggS' for completeness. Not doing this will cause static analysers to complain about
6948     us trying to access uninitialized data. We could alternatively just comment out this member of the drflac_ogg_page_header structure, but I
6949     like to have it map to the structure of the underlying data.
6950     */
6951     pHeader->capturePattern[0] = 'O';
6952     pHeader->capturePattern[1] = 'g';
6953     pHeader->capturePattern[2] = 'g';
6954     pHeader->capturePattern[3] = 'S';
6955
6956     pHeader->structureVersion = data[0];
6957     pHeader->headerType       = data[1];
6958     DRFLAC_COPY_MEMORY(&pHeader->granulePosition, &data[ 2], 8);
6959     DRFLAC_COPY_MEMORY(&pHeader->serialNumber,    &data[10], 4);
6960     DRFLAC_COPY_MEMORY(&pHeader->sequenceNumber,  &data[14], 4);
6961     DRFLAC_COPY_MEMORY(&pHeader->checksum,        &data[18], 4);
6962     pHeader->segmentCount     = data[22];
6963
6964     /* Calculate the CRC. Note that for the calculation the checksum part of the page needs to be set to 0. */
6965     data[18] = 0;
6966     data[19] = 0;
6967     data[20] = 0;
6968     data[21] = 0;
6969
6970     for (i = 0; i < 23; ++i) {
6971         *pCRC32 = drflac_crc32_byte(*pCRC32, data[i]);
6972     }
6973
6974
6975     if (onRead(pUserData, pHeader->segmentTable, pHeader->segmentCount) != pHeader->segmentCount) {
6976         return DRFLAC_AT_END;
6977     }
6978     *pBytesRead += pHeader->segmentCount;
6979
6980     for (i = 0; i < pHeader->segmentCount; ++i) {
6981         *pCRC32 = drflac_crc32_byte(*pCRC32, pHeader->segmentTable[i]);
6982     }
6983
6984     return DRFLAC_SUCCESS;
6985 }
6986
6987 static drflac_result drflac_ogg__read_page_header(drflac_read_proc onRead, void* pUserData, drflac_ogg_page_header* pHeader, drflac_uint32* pBytesRead, drflac_uint32* pCRC32)
6988 {
6989     drflac_uint8 id[4];
6990
6991     *pBytesRead = 0;
6992
6993     if (onRead(pUserData, id, 4) != 4) {
6994         return DRFLAC_AT_END;
6995     }
6996     *pBytesRead += 4;
6997
6998     /* We need to read byte-by-byte until we find the OggS capture pattern. */
6999     for (;;) {
7000         if (drflac_ogg__is_capture_pattern(id)) {
7001             drflac_result result;
7002
7003             *pCRC32 = DRFLAC_OGG_CAPTURE_PATTERN_CRC32;
7004
7005             result = drflac_ogg__read_page_header_after_capture_pattern(onRead, pUserData, pHeader, pBytesRead, pCRC32);
7006             if (result == DRFLAC_SUCCESS) {
7007                 return DRFLAC_SUCCESS;
7008             } else {
7009                 if (result == DRFLAC_CRC_MISMATCH) {
7010                     continue;
7011                 } else {
7012                     return result;
7013                 }
7014             }
7015         } else {
7016             /* The first 4 bytes did not equal the capture pattern. Read the next byte and try again. */
7017             id[0] = id[1];
7018             id[1] = id[2];
7019             id[2] = id[3];
7020             if (onRead(pUserData, &id[3], 1) != 1) {
7021                 return DRFLAC_AT_END;
7022             }
7023             *pBytesRead += 1;
7024         }
7025     }
7026 }
7027
7028
7029 /*
7030 The main part of the Ogg encapsulation is the conversion from the physical Ogg bitstream to the native FLAC bitstream. It works
7031 in three general stages: Ogg Physical Bitstream -> Ogg/FLAC Logical Bitstream -> FLAC Native Bitstream. dr_flac is designed
7032 in such a way that the core sections assume everything is delivered in native format. Therefore, for each encapsulation type
7033 dr_flac is supporting there needs to be a layer sitting on top of the onRead and onSeek callbacks that ensures the bits read from
7034 the physical Ogg bitstream are converted and delivered in native FLAC format.
7035 */
/*
State for reading FLAC data out of an Ogg physical bitstream. It wraps the caller's onRead/onSeek callbacks and tracks the
current position in the physical stream; see drflac_oggbs__read_physical() and drflac_oggbs__seek_physical().
*/
7036 typedef struct
7037 {
7038     drflac_read_proc onRead;                /* The original onRead callback from drflac_open() and family. */
7039     drflac_seek_proc onSeek;                /* The original onSeek callback from drflac_open() and family. */
7040     void* pUserData;                        /* The user data passed on onRead and onSeek. This is the user data that was passed on drflac_open() and family. */
7041     drflac_uint64 currentBytePos;           /* The position of the byte we are sitting on in the physical byte stream. Used for efficient seeking. Kept up to date by the physical read/seek helpers. */
7042     drflac_uint64 firstBytePos;             /* The position of the first byte in the physical bitstream. Points to the start of the "OggS" identifier of the FLAC bos page. */
7043     drflac_uint32 serialNumber;             /* The serial number of the FLAC audio pages. This is determined by the initial header page that was read during initialization. */
7044     drflac_ogg_page_header bosPageHeader;   /* Used for seeking. */
7045     drflac_ogg_page_header currentPageHeader;   /* NOTE(review): presumably the header of the page currently being read - usage is outside this excerpt; confirm. */
7046     drflac_uint32 bytesRemainingInPage;     /* NOTE(review): presumably the number of unread bytes left in the current page - confirm against the page reading code. */
7047     drflac_uint32 pageDataSize;             /* NOTE(review): presumably the number of valid bytes held in pageData - confirm against the page reading code. */
7048     drflac_uint8 pageData[DRFLAC_OGG_MAX_PAGE_SIZE];    /* Page buffer with a fixed capacity of DRFLAC_OGG_MAX_PAGE_SIZE bytes. */
7049 } drflac_oggbs; /* oggbs = Ogg Bitstream */
7050
7051 static size_t drflac_oggbs__read_physical(drflac_oggbs* oggbs, void* bufferOut, size_t bytesToRead)
7052 {
7053     size_t bytesActuallyRead = oggbs->onRead(oggbs->pUserData, bufferOut, bytesToRead);
7054     oggbs->currentBytePos += bytesActuallyRead;
7055
7056     return bytesActuallyRead;
7057 }
7058
7059 static drflac_bool32 drflac_oggbs__seek_physical(drflac_oggbs* oggbs, drflac_uint64 offset, drflac_seek_origin origin)
7060 {
7061     if (origin == drflac_seek_origin_start) {
7062         if (offset <= 0x7FFFFFFF) {
7063             if (!oggbs->onSeek(oggbs->pUserData, (int)offset, drflac_seek_origin_start)) {
7064                 return DRFLAC_FALSE;
7065             }
7066             oggbs->currentBytePos = offset;
7067
7068             return DRFLAC_TRUE;
7069         } else {
7070             if (!oggbs->onSeek(oggbs->pUserData, 0x7FFFFFFF, drflac_seek_origin_start)) {
7071                 return DRFLAC_FALSE;
7072             }
7073             oggbs->currentBytePos = offset;
7074
7075             return drflac_oggbs__seek_physical(oggbs, offset - 0x7FFFFFFF, drflac_seek_origin_current);
7076         }
7077     } else {
7078         while (offset > 0x7FFFFFFF) {
7079             if (!oggbs->onSeek(oggbs->pUserData, 0x7FFFFFFF, drflac_seek_origin_current)) {
7080                 return DRFLAC_FALSE;
7081             }
7082             oggbs->currentBytePos += 0x7FFFFFFF;
7083             offset -= 0x7FFFFFFF;
7084         }
7085
7086         if (!oggbs->onSeek(oggbs->pUserData, (int)offset, drflac_seek_origin_current)) {    /* <-- Safe cast thanks to the loop above. */
7087             return DRFLAC_FALSE;
7088         }
7089         oggbs->currentBytePos += offset;
7090
7091         return DRFLAC_TRUE;
7092     }
7093 }
7094
7095 static drflac_bool32 drflac_oggbs__goto_next_page(drflac_oggbs* oggbs, drflac_ogg_crc_mismatch_recovery recoveryMethod)
7096 {
7097     drflac_ogg_page_header header;
7098     for (;;) {
7099         drflac_uint32 crc32 = 0;
7100         drflac_uint32 bytesRead;
7101         drflac_uint32 pageBodySize;
7102 #ifndef DR_FLAC_NO_CRC
7103         drflac_uint32 actualCRC32;
7104 #endif
7105
7106         if (drflac_ogg__read_page_header(oggbs->onRead, oggbs->pUserData, &header, &bytesRead, &crc32) != DRFLAC_SUCCESS) {
7107             return DRFLAC_FALSE;
7108         }
7109         oggbs->currentBytePos += bytesRead;
7110
7111         pageBodySize = drflac_ogg__get_page_body_size(&header);
7112         if (pageBodySize > DRFLAC_OGG_MAX_PAGE_SIZE) {
7113             continue;   /* Invalid page size. Assume it's corrupted and just move to the next page. */
7114         }
7115
7116         if (header.serialNumber != oggbs->serialNumber) {
7117             /* It's not a FLAC page. Skip it. */
7118             if (pageBodySize > 0 && !drflac_oggbs__seek_physical(oggbs, pageBodySize, drflac_seek_origin_current)) {
7119                 return DRFLAC_FALSE;
7120             }
7121             continue;
7122         }
7123
7124
7125         /* We need to read the entire page and then do a CRC check on it. If there's a CRC mismatch we need to skip this page. */
7126         if (drflac_oggbs__read_physical(oggbs, oggbs->pageData, pageBodySize) != pageBodySize) {
7127             return DRFLAC_FALSE;
7128         }
7129         oggbs->pageDataSize = pageBodySize;
7130
7131 #ifndef DR_FLAC_NO_CRC
7132         actualCRC32 = drflac_crc32_buffer(crc32, oggbs->pageData, oggbs->pageDataSize);
7133         if (actualCRC32 != header.checksum) {
7134             if (recoveryMethod == drflac_ogg_recover_on_crc_mismatch) {
7135                 continue;   /* CRC mismatch. Skip this page. */
7136             } else {
7137                 /*
7138                 Even though we are failing on a CRC mismatch, we still want our stream to be in a good state. Therefore we
7139                 go to the next valid page to ensure we're in a good state, but return false to let the caller know that the
7140                 seek did not fully complete.
7141                 */
7142                 drflac_oggbs__goto_next_page(oggbs, drflac_ogg_recover_on_crc_mismatch);
7143                 return DRFLAC_FALSE;
7144             }
7145         }
7146 #else
7147         (void)recoveryMethod;   /* <-- Silence a warning. */
7148 #endif
7149
7150         oggbs->currentPageHeader = header;
7151         oggbs->bytesRemainingInPage = pageBodySize;
7152         return DRFLAC_TRUE;
7153     }
7154 }
7155
/* The functions below are unused at the moment, but I might be re-adding them later. */
7157 #if 0
/*
(Currently compiled out by the surrounding #if 0.)

Works out which entry of the current page's segment table the read position is sitting in, based on how many bytes of the
page body have been consumed so far.

    pBytesRemainingInSeg - Receives the number of bytes left in that segment.

Returns the index of the segment.
*/
static drflac_uint8 drflac_oggbs__get_current_segment_index(drflac_oggbs* oggbs, drflac_uint8* pBytesRemainingInSeg)
{
    drflac_uint32 bytesConsumedInPage = drflac_ogg__get_page_body_size(&oggbs->currentPageHeader) - oggbs->bytesRemainingInPage;
    drflac_uint8 iSeg = 0;
    drflac_uint32 iByte = 0;

    /* Walk whole segments until the next one would take us past the consumed byte count. */
    while (iByte < bytesConsumedInPage) {
        drflac_uint8 segmentSize = oggbs->currentPageHeader.segmentTable[iSeg];
        if (iByte + segmentSize > bytesConsumedInPage) {
            break;
        } else {
            iSeg += 1;
            iByte += segmentSize;
        }
    }

    /* iByte is the offset of the start of segment iSeg, so the difference is how far into the segment we are. */
    *pBytesRemainingInSeg = oggbs->currentPageHeader.segmentTable[iSeg] - (drflac_uint8)(bytesConsumedInPage - iByte);
    return iSeg;
}
7176
/*
(Currently compiled out by the surrounding #if 0.)

Skips forward in the physical stream to the start of the next packet, loading further pages as required.

Returns DRFLAC_TRUE once positioned at what is believed to be the next packet, or DRFLAC_FALSE if the next page could not
be loaded.

NOTE(review): drflac_oggbs__goto_next_page() is called here with a single argument, whereas the definition earlier in this
file takes a recovery method as well. This needs updating before the code is re-enabled. The result of the physical seek
is also not checked.
*/
static drflac_bool32 drflac_oggbs__seek_to_next_packet(drflac_oggbs* oggbs)
{
    /* The current packet ends when we get to the segment with a lacing value of < 255 which is not at the end of a page. */
    for (;;) {
        drflac_bool32 atEndOfPage = DRFLAC_FALSE;

        drflac_uint8 bytesRemainingInSeg;
        drflac_uint8 iFirstSeg = drflac_oggbs__get_current_segment_index(oggbs, &bytesRemainingInSeg);

        /* Start with what is left of the current segment and add every following full (255 byte) segment. */
        drflac_uint32 bytesToEndOfPacketOrPage = bytesRemainingInSeg;
        for (drflac_uint8 iSeg = iFirstSeg; iSeg < oggbs->currentPageHeader.segmentCount; ++iSeg) {
            drflac_uint8 segmentSize = oggbs->currentPageHeader.segmentTable[iSeg];
            if (segmentSize < 255) {
                if (iSeg == oggbs->currentPageHeader.segmentCount-1) {
                    atEndOfPage = DRFLAC_TRUE;
                }

                break;
            }

            bytesToEndOfPacketOrPage += segmentSize;
        }

        /*
        At this point we will have found either the packet or the end of the page. If were at the end of the page we'll
        want to load the next page and keep searching for the end of the packet.
        */
        drflac_oggbs__seek_physical(oggbs, bytesToEndOfPacketOrPage, drflac_seek_origin_current);
        oggbs->bytesRemainingInPage -= bytesToEndOfPacketOrPage;

        if (atEndOfPage) {
            /*
            We're potentially at the next packet, but we need to check the next page first to be sure because the packet may
            straddle pages.
            */
            if (!drflac_oggbs__goto_next_page(oggbs)) {
                return DRFLAC_FALSE;
            }

            /* If it's a fresh packet it most likely means we're at the next packet. */
            if ((oggbs->currentPageHeader.headerType & 0x01) == 0) {
                return DRFLAC_TRUE;
            }
        } else {
            /* We're at the next packet. */
            return DRFLAC_TRUE;
        }
    }
}
7226
/*
(Currently compiled out by the surrounding #if 0.)

Moves to the next FLAC frame. This simply forwards to drflac_oggbs__seek_to_next_packet() and returns its result.
*/
static drflac_bool32 drflac_oggbs__seek_to_next_frame(drflac_oggbs* oggbs)
{
    /* The bitstream should be sitting on the first byte just after the header of the frame. */

    /* What we're actually doing here is seeking to the start of the next packet. */
    return drflac_oggbs__seek_to_next_packet(oggbs);
}
7234 #endif
7235
7236 static size_t drflac__on_read_ogg(void* pUserData, void* bufferOut, size_t bytesToRead)
7237 {
7238     drflac_oggbs* oggbs = (drflac_oggbs*)pUserData;
7239     drflac_uint8* pRunningBufferOut = (drflac_uint8*)bufferOut;
7240     size_t bytesRead = 0;
7241
7242     DRFLAC_ASSERT(oggbs != NULL);
7243     DRFLAC_ASSERT(pRunningBufferOut != NULL);
7244
7245     /* Reading is done page-by-page. If we've run out of bytes in the page we need to move to the next one. */
7246     while (bytesRead < bytesToRead) {
7247         size_t bytesRemainingToRead = bytesToRead - bytesRead;
7248
7249         if (oggbs->bytesRemainingInPage >= bytesRemainingToRead) {
7250             DRFLAC_COPY_MEMORY(pRunningBufferOut, oggbs->pageData + (oggbs->pageDataSize - oggbs->bytesRemainingInPage), bytesRemainingToRead);
7251             bytesRead += bytesRemainingToRead;
7252             oggbs->bytesRemainingInPage -= (drflac_uint32)bytesRemainingToRead;
7253             break;
7254         }
7255
7256         /* If we get here it means some of the requested data is contained in the next pages. */
7257         if (oggbs->bytesRemainingInPage > 0) {
7258             DRFLAC_COPY_MEMORY(pRunningBufferOut, oggbs->pageData + (oggbs->pageDataSize - oggbs->bytesRemainingInPage), oggbs->bytesRemainingInPage);
7259             bytesRead += oggbs->bytesRemainingInPage;
7260             pRunningBufferOut += oggbs->bytesRemainingInPage;
7261             oggbs->bytesRemainingInPage = 0;
7262         }
7263
7264         DRFLAC_ASSERT(bytesRemainingToRead > 0);
7265         if (!drflac_oggbs__goto_next_page(oggbs, drflac_ogg_recover_on_crc_mismatch)) {
7266             break;  /* Failed to go to the next page. Might have simply hit the end of the stream. */
7267         }
7268     }
7269
7270     return bytesRead;
7271 }
7272
7273 static drflac_bool32 drflac__on_seek_ogg(void* pUserData, int offset, drflac_seek_origin origin)
7274 {
7275     drflac_oggbs* oggbs = (drflac_oggbs*)pUserData;
7276     int bytesSeeked = 0;
7277
7278     DRFLAC_ASSERT(oggbs != NULL);
7279     DRFLAC_ASSERT(offset >= 0);  /* <-- Never seek backwards. */
7280
7281     /* Seeking is always forward which makes things a lot simpler. */
7282     if (origin == drflac_seek_origin_start) {
7283         if (!drflac_oggbs__seek_physical(oggbs, (int)oggbs->firstBytePos, drflac_seek_origin_start)) {
7284             return DRFLAC_FALSE;
7285         }
7286
7287         if (!drflac_oggbs__goto_next_page(oggbs, drflac_ogg_fail_on_crc_mismatch)) {
7288             return DRFLAC_FALSE;
7289         }
7290
7291         return drflac__on_seek_ogg(pUserData, offset, drflac_seek_origin_current);
7292     }
7293
7294     DRFLAC_ASSERT(origin == drflac_seek_origin_current);
7295
7296     while (bytesSeeked < offset) {
7297         int bytesRemainingToSeek = offset - bytesSeeked;
7298         DRFLAC_ASSERT(bytesRemainingToSeek >= 0);
7299
7300         if (oggbs->bytesRemainingInPage >= (size_t)bytesRemainingToSeek) {
7301             bytesSeeked += bytesRemainingToSeek;
7302             (void)bytesSeeked;  /* <-- Silence a dead store warning emitted by Clang Static Analyzer. */
7303             oggbs->bytesRemainingInPage -= bytesRemainingToSeek;
7304             break;
7305         }
7306
7307         /* If we get here it means some of the requested data is contained in the next pages. */
7308         if (oggbs->bytesRemainingInPage > 0) {
7309             bytesSeeked += (int)oggbs->bytesRemainingInPage;
7310             oggbs->bytesRemainingInPage = 0;
7311         }
7312
7313         DRFLAC_ASSERT(bytesRemainingToSeek > 0);
7314         if (!drflac_oggbs__goto_next_page(oggbs, drflac_ogg_fail_on_crc_mismatch)) {
7315             /* Failed to go to the next page. We either hit the end of the stream or had a CRC mismatch. */
7316             return DRFLAC_FALSE;
7317         }
7318     }
7319
7320     return DRFLAC_TRUE;
7321 }
7322
7323
/*
Seeks an Ogg encapsulated stream to the PCM frame at pcmFrameIndex.

This works in two phases:

  1) Starting at the first FLAC frame, pages are loaded one after another and the granule position stored in each page
     header is compared against pcmFrameIndex in order to find a page to start decoding from.
  2) The physical stream is moved back to the start of that page and FLAC frames are walked with the native decoder until
     the frame containing pcmFrameIndex is found, which is then decoded and partially skipped to land on the exact frame.

Returns DRFLAC_TRUE on success and DRFLAC_FALSE otherwise. If the page search in phase 1 runs out of pages, an attempt is
made to move the physical stream back to where it was when this function was called.
*/
static drflac_bool32 drflac_ogg__seek_to_pcm_frame(drflac* pFlac, drflac_uint64 pcmFrameIndex)
{
    drflac_oggbs* oggbs = (drflac_oggbs*)pFlac->_oggbs;
    drflac_uint64 originalBytePos;
    drflac_uint64 runningGranulePosition;
    drflac_uint64 runningFrameBytePos;
    drflac_uint64 runningPCMFrameCount;

    DRFLAC_ASSERT(oggbs != NULL);

    originalBytePos = oggbs->currentBytePos;   /* For recovery. Points to the OggS identifier. */

    /* First seek to the first frame. */
    if (!drflac__seek_to_byte(&pFlac->bs, pFlac->firstFLACFramePosInBytes)) {
        return DRFLAC_FALSE;
    }
    oggbs->bytesRemainingInPage = 0;    /* Discard whatever is left of the buffered page so the loop below starts on a fresh one. */

    runningGranulePosition = 0;
    for (;;) {
        if (!drflac_oggbs__goto_next_page(oggbs, drflac_ogg_recover_on_crc_mismatch)) {
            /* Best-effort recovery. The result of this seek is intentionally not checked since we are failing anyway. */
            drflac_oggbs__seek_physical(oggbs, originalBytePos, drflac_seek_origin_start);
            return DRFLAC_FALSE;   /* Never did find that sample... */
        }

        /* currentBytePos is just past the body of the page that was loaded, so step back over the body and header to get the start of the page. */
        runningFrameBytePos = oggbs->currentBytePos - drflac_ogg__get_page_header_size(&oggbs->currentPageHeader) - oggbs->pageDataSize;
        if (oggbs->currentPageHeader.granulePosition >= pcmFrameIndex) {
            break; /* The sample is somewhere in the previous page. */
        }

        /*
        At this point we know the sample is not in the previous page. It could possibly be in this page. For simplicity we
        disregard any pages that do not begin a fresh packet.
        */
        if ((oggbs->currentPageHeader.headerType & 0x01) == 0) {    /* <-- Is it a fresh page? */
            if (oggbs->currentPageHeader.segmentTable[0] >= 2) {
                drflac_uint8 firstBytesInPage[2];
                firstBytesInPage[0] = oggbs->pageData[0];
                firstBytesInPage[1] = oggbs->pageData[1];

                if ((firstBytesInPage[0] == 0xFF) && (firstBytesInPage[1] & 0xFC) == 0xF8) {    /* <-- Does the page begin with a frame's sync code? */
                    runningGranulePosition = oggbs->currentPageHeader.granulePosition;
                }

                continue;
            }
        }
    }

    /*
    We found the page that is closest to the sample, so now we need to find it. The first thing to do is seek to the
    start of that page. In the loop above we checked that it was a fresh page which means this page is also the start of
    a new frame. This property means that after we've seeked to the page we can immediately start looping over frames until
    we find the one containing the target sample.
    */
    if (!drflac_oggbs__seek_physical(oggbs, runningFrameBytePos, drflac_seek_origin_start)) {
        return DRFLAC_FALSE;
    }
    if (!drflac_oggbs__goto_next_page(oggbs, drflac_ogg_recover_on_crc_mismatch)) {
        return DRFLAC_FALSE;
    }

    /*
    At this point we'll be sitting on the first byte of the frame header of the first frame in the page. We just keep
    looping over these frames until we find the one containing the sample we're after.
    */
    runningPCMFrameCount = runningGranulePosition;
    for (;;) {
        /*
        There are two ways to find the sample and seek past irrelevant frames:
          1) Use the native FLAC decoder.
          2) Use Ogg's framing system.

        Both of these options have their own pros and cons. Using the native FLAC decoder is slower because it needs to
        do a full decode of the frame. Using Ogg's framing system is faster, but more complicated and involves some code
        duplication for the decoding of frame headers.

        Another thing to consider is that using the Ogg framing system will perform direct seeking of the physical Ogg
        bitstream. This is important to consider because it means we cannot read data from the drflac_bs object using the
        standard drflac__*() APIs because that will read in extra data for its own internal caching which in turn breaks
        the positioning of the read pointer of the physical Ogg bitstream. Therefore, anything that would normally be read
        using the native FLAC decoding APIs, such as drflac__read_next_flac_frame_header(), need to be re-implemented so as to
        avoid the use of the drflac_bs object.

        Considering these issues, I have decided to use the slower native FLAC decoding method for the following reasons:
          1) Seeking is already partially accelerated using Ogg's paging system in the code block above.
          2) Seeking in an Ogg encapsulated FLAC stream is probably quite uncommon.
          3) Simplicity.
        */
        drflac_uint64 firstPCMFrameInFLACFrame = 0;
        drflac_uint64 lastPCMFrameInFLACFrame = 0;
        drflac_uint64 pcmFrameCountInThisFrame;

        if (!drflac__read_next_flac_frame_header(&pFlac->bs, pFlac->bitsPerSample, &pFlac->currentFLACFrame.header)) {
            return DRFLAC_FALSE;
        }

        drflac__get_pcm_frame_range_of_current_flac_frame(pFlac, &firstPCMFrameInFLACFrame, &lastPCMFrameInFLACFrame);

        /* The range is inclusive at both ends, hence the +1. */
        pcmFrameCountInThisFrame = (lastPCMFrameInFLACFrame - firstPCMFrameInFLACFrame) + 1;

        /* If we are seeking to the end of the file and we've just hit it, we're done. */
        if (pcmFrameIndex == pFlac->totalPCMFrameCount && (runningPCMFrameCount + pcmFrameCountInThisFrame) == pFlac->totalPCMFrameCount) {
            drflac_result result = drflac__decode_flac_frame(pFlac);
            if (result == DRFLAC_SUCCESS) {
                pFlac->currentPCMFrame = pcmFrameIndex;
                pFlac->currentFLACFrame.pcmFramesRemaining = 0;
                return DRFLAC_TRUE;
            } else {
                return DRFLAC_FALSE;
            }
        }

        if (pcmFrameIndex < (runningPCMFrameCount + pcmFrameCountInThisFrame)) {
            /*
            The sample should be in this FLAC frame. We need to fully decode it, however if it's an invalid frame (a CRC mismatch), we need to pretend
            it never existed and keep iterating.
            */
            drflac_result result = drflac__decode_flac_frame(pFlac);
            if (result == DRFLAC_SUCCESS) {
                /* The frame is valid. We just need to skip over some samples to ensure it's sample-exact. */
                drflac_uint64 pcmFramesToDecode = (size_t)(pcmFrameIndex - runningPCMFrameCount);    /* <-- Safe cast because the maximum number of samples in a frame is 65535. */
                if (pcmFramesToDecode == 0) {
                    return DRFLAC_TRUE;     /* The target is the very first PCM frame of this FLAC frame. Nothing to skip. */
                }

                pFlac->currentPCMFrame = runningPCMFrameCount;

                return drflac__seek_forward_by_pcm_frames(pFlac, pcmFramesToDecode) == pcmFramesToDecode;  /* <-- If this fails, something bad has happened (it should never fail). */
            } else {
                if (result == DRFLAC_CRC_MISMATCH) {
                    continue;   /* CRC mismatch. Pretend this frame never existed. */
                } else {
                    return DRFLAC_FALSE;
                }
            }
        } else {
            /*
            It's not in this frame. We need to seek past the frame, but check if there was a CRC mismatch. If so, we pretend this
            frame never existed and leave the running sample count untouched.
            */
            drflac_result result = drflac__seek_to_next_flac_frame(pFlac);
            if (result == DRFLAC_SUCCESS) {
                runningPCMFrameCount += pcmFrameCountInThisFrame;
            } else {
                if (result == DRFLAC_CRC_MISMATCH) {
                    continue;   /* CRC mismatch. Pretend this frame never existed. */
                } else {
                    return DRFLAC_FALSE;
                }
            }
        }
    }
}
7478
7479
7480
/*
Initialization for Ogg encapsulated streams. Expects the stream to be positioned just past the 4-byte "OggS" capture
pattern of the first page.

Beginning-of-stream pages are scanned one by one looking for a page whose body is the 51 byte FLAC header packet: packet
type 0x7F, the "FLAC" signature, a 1.x mapping version, the native "fLaC" signature and finally the STREAMINFO block. When
found, the stream properties and the Ogg bookkeeping (first byte position, serial number and bos page header) are stored
in pInit and, if onMeta is not NULL, the STREAMINFO block is reported through it.

    relaxed - Unused by this function.

Returns DRFLAC_TRUE if a FLAC stream was found. Returns DRFLAC_FALSE if a read or seek fails, if a page that is not a
beginning-of-stream page is reached before a FLAC header is found, or if a page that looks like a FLAC header turns out to
be malformed.
*/
static drflac_bool32 drflac__init_private__ogg(drflac_init_info* pInit, drflac_read_proc onRead, drflac_seek_proc onSeek, drflac_meta_proc onMeta, void* pUserData, void* pUserDataMD, drflac_bool32 relaxed)
{
    drflac_ogg_page_header header;
    drflac_uint32 crc32 = DRFLAC_OGG_CAPTURE_PATTERN_CRC32;
    drflac_uint32 bytesRead = 0;

    /* Pre Condition: The bit stream should be sitting just past the 4-byte OggS capture pattern. */
    (void)relaxed;

    pInit->container = drflac_container_ogg;
    pInit->oggFirstBytePos = 0;

    /*
    We'll get here if the first 4 bytes of the stream were the OggS capture pattern, however it doesn't necessarily mean the
    stream includes FLAC encoded audio. To check for this we need to scan the beginning-of-stream page markers and check if
    any match the FLAC specification. Important to keep in mind that the stream may be multiplexed.
    */
    if (drflac_ogg__read_page_header_after_capture_pattern(onRead, pUserData, &header, &bytesRead, &crc32) != DRFLAC_SUCCESS) {
        return DRFLAC_FALSE;
    }
    pInit->runningFilePos += bytesRead;

    for (;;) {
        int pageBodySize;

        /* Break if we're past the beginning of stream page. */
        if ((header.headerType & 0x02) == 0) {
            return DRFLAC_FALSE;
        }

        /* Check if it's a FLAC header. */
        pageBodySize = drflac_ogg__get_page_body_size(&header);
        if (pageBodySize == 51) {   /* 51 = the lacing value of the FLAC header packet. */
            /* It could be a FLAC page... */
            drflac_uint32 bytesRemainingInPage = pageBodySize;  /* Tracks how much of the body is left to skip if this turns out not to be FLAC. */
            drflac_uint8 packetType;

            if (onRead(pUserData, &packetType, 1) != 1) {
                return DRFLAC_FALSE;
            }

            bytesRemainingInPage -= 1;
            if (packetType == 0x7F) {
                /* Increasingly more likely to be a FLAC page... */
                drflac_uint8 sig[4];
                if (onRead(pUserData, sig, 4) != 4) {
                    return DRFLAC_FALSE;
                }

                bytesRemainingInPage -= 4;
                if (sig[0] == 'F' && sig[1] == 'L' && sig[2] == 'A' && sig[3] == 'C') {
                    /* Almost certainly a FLAC page... From here on every path either succeeds or fails outright, so bytesRemainingInPage is no longer maintained. */
                    drflac_uint8 mappingVersion[2];
                    if (onRead(pUserData, mappingVersion, 2) != 2) {
                        return DRFLAC_FALSE;
                    }

                    if (mappingVersion[0] != 1) {
                        return DRFLAC_FALSE;   /* Only supporting version 1.x of the Ogg mapping. */
                    }

                    /*
                    The next 2 bytes are the non-audio packets, not including this one. We don't care about this because we're going to
                    be handling it in a generic way based on the serial number and packet types.
                    */
                    if (!onSeek(pUserData, 2, drflac_seek_origin_current)) {
                        return DRFLAC_FALSE;
                    }

                    /* Expecting the native FLAC signature "fLaC". */
                    if (onRead(pUserData, sig, 4) != 4) {
                        return DRFLAC_FALSE;
                    }

                    if (sig[0] == 'f' && sig[1] == 'L' && sig[2] == 'a' && sig[3] == 'C') {
                        /* The remaining data in the page should be the STREAMINFO block. */
                        drflac_streaminfo streaminfo;
                        drflac_uint8 isLastBlock;
                        drflac_uint8 blockType;
                        drflac_uint32 blockSize;
                        if (!drflac__read_and_decode_block_header(onRead, pUserData, &isLastBlock, &blockType, &blockSize)) {
                            return DRFLAC_FALSE;
                        }

                        if (blockType != DRFLAC_METADATA_BLOCK_TYPE_STREAMINFO || blockSize != 34) {
                            return DRFLAC_FALSE;    /* Invalid block type. First block must be the STREAMINFO block. */
                        }

                        if (drflac__read_streaminfo(onRead, pUserData, &streaminfo)) {
                            /* Success! */
                            pInit->hasStreamInfoBlock      = DRFLAC_TRUE;
                            pInit->sampleRate              = streaminfo.sampleRate;
                            pInit->channels                = streaminfo.channels;
                            pInit->bitsPerSample           = streaminfo.bitsPerSample;
                            pInit->totalPCMFrameCount      = streaminfo.totalPCMFrameCount;
                            pInit->maxBlockSizeInPCMFrames = streaminfo.maxBlockSizeInPCMFrames;
                            pInit->hasMetadataBlocks       = !isLastBlock;

                            if (onMeta) {
                                drflac_metadata metadata;
                                metadata.type = DRFLAC_METADATA_BLOCK_TYPE_STREAMINFO;
                                metadata.pRawData = NULL;
                                metadata.rawDataSize = 0;
                                metadata.data.streaminfo = streaminfo;
                                onMeta(pUserDataMD, &metadata);
                            }

                            /* NOTE(review): 79 is presumably the 27-byte page header + 1 segment table entry + the 51 byte body - confirm against drflac_ogg__read_page_header(). */
                            pInit->runningFilePos  += pageBodySize;
                            pInit->oggFirstBytePos  = pInit->runningFilePos - 79;   /* Subtracting 79 will place us right on top of the "OggS" identifier of the FLAC bos page. */
                            pInit->oggSerial        = header.serialNumber;
                            pInit->oggBosHeader     = header;
                            break;
                        } else {
                            /* Failed to read STREAMINFO block. Aww, so close... */
                            return DRFLAC_FALSE;
                        }
                    } else {
                        /* Invalid file. */
                        return DRFLAC_FALSE;
                    }
                } else {
                    /* Not a FLAC header. Skip it. */
                    if (!onSeek(pUserData, bytesRemainingInPage, drflac_seek_origin_current)) {
                        return DRFLAC_FALSE;
                    }
                }
            } else {
                /* Not a FLAC header. Seek past the entire page and move on to the next. */
                if (!onSeek(pUserData, bytesRemainingInPage, drflac_seek_origin_current)) {
                    return DRFLAC_FALSE;
                }
            }
        } else {
            /* The body is the wrong size for a FLAC header packet. Skip the whole thing. */
            if (!onSeek(pUserData, pageBodySize, drflac_seek_origin_current)) {
                return DRFLAC_FALSE;
            }
        }

        /* Whichever way the page was rejected, its entire body has been consumed by now. */
        pInit->runningFilePos += pageBodySize;


        /* Read the header of the next page. */
        if (drflac_ogg__read_page_header(onRead, pUserData, &header, &bytesRead, &crc32) != DRFLAC_SUCCESS) {
            return DRFLAC_FALSE;
        }
        pInit->runningFilePos += bytesRead;
    }

    /*
    If we get here it means we found a FLAC audio stream. We should be sitting on the first byte of the header of the next page. The next
    packets in the FLAC logical stream contain the metadata. The only thing left to do in the initialization phase for Ogg is to create the
    Ogg bitstream object.
    */
    pInit->hasMetadataBlocks = DRFLAC_TRUE;    /* <-- Always have at least VORBIS_COMMENT metadata block. */
    return DRFLAC_TRUE;
}
7637 #endif
7638
7639 static drflac_bool32 drflac__init_private(drflac_init_info* pInit, drflac_read_proc onRead, drflac_seek_proc onSeek, drflac_meta_proc onMeta, drflac_container container, void* pUserData, void* pUserDataMD)
7640 {
7641     drflac_bool32 relaxed;
7642     drflac_uint8 id[4];
7643
7644     if (pInit == NULL || onRead == NULL || onSeek == NULL) {
7645         return DRFLAC_FALSE;
7646     }
7647
7648     DRFLAC_ZERO_MEMORY(pInit, sizeof(*pInit));
7649     pInit->onRead       = onRead;
7650     pInit->onSeek       = onSeek;
7651     pInit->onMeta       = onMeta;
7652     pInit->container    = container;
7653     pInit->pUserData    = pUserData;
7654     pInit->pUserDataMD  = pUserDataMD;
7655
7656     pInit->bs.onRead    = onRead;
7657     pInit->bs.onSeek    = onSeek;
7658     pInit->bs.pUserData = pUserData;
7659     drflac__reset_cache(&pInit->bs);
7660
7661
7662     /* If the container is explicitly defined then we can try opening in relaxed mode. */
7663     relaxed = container != drflac_container_unknown;
7664
7665     /* Skip over any ID3 tags. */
7666     for (;;) {
7667         if (onRead(pUserData, id, 4) != 4) {
7668             return DRFLAC_FALSE;    /* Ran out of data. */
7669         }
7670         pInit->runningFilePos += 4;
7671
7672         if (id[0] == 'I' && id[1] == 'D' && id[2] == '3') {
7673             drflac_uint8 header[6];
7674             drflac_uint8 flags;
7675             drflac_uint32 headerSize;
7676
7677             if (onRead(pUserData, header, 6) != 6) {
7678                 return DRFLAC_FALSE;    /* Ran out of data. */
7679             }
7680             pInit->runningFilePos += 6;
7681
7682             flags = header[1];
7683
7684             DRFLAC_COPY_MEMORY(&headerSize, header+2, 4);
7685             headerSize = drflac__unsynchsafe_32(drflac__be2host_32(headerSize));
7686             if (flags & 0x10) {
7687                 headerSize += 10;
7688             }
7689
7690             if (!onSeek(pUserData, headerSize, drflac_seek_origin_current)) {
7691                 return DRFLAC_FALSE;    /* Failed to seek past the tag. */
7692             }
7693             pInit->runningFilePos += headerSize;
7694         } else {
7695             break;
7696         }
7697     }
7698
7699     if (id[0] == 'f' && id[1] == 'L' && id[2] == 'a' && id[3] == 'C') {
7700         return drflac__init_private__native(pInit, onRead, onSeek, onMeta, pUserData, pUserDataMD, relaxed);
7701     }
7702 #ifndef DR_FLAC_NO_OGG
7703     if (id[0] == 'O' && id[1] == 'g' && id[2] == 'g' && id[3] == 'S') {
7704         return drflac__init_private__ogg(pInit, onRead, onSeek, onMeta, pUserData, pUserDataMD, relaxed);
7705     }
7706 #endif
7707
7708     /* If we get here it means we likely don't have a header. Try opening in relaxed mode, if applicable. */
7709     if (relaxed) {
7710         if (container == drflac_container_native) {
7711             return drflac__init_private__native(pInit, onRead, onSeek, onMeta, pUserData, pUserDataMD, relaxed);
7712         }
7713 #ifndef DR_FLAC_NO_OGG
7714         if (container == drflac_container_ogg) {
7715             return drflac__init_private__ogg(pInit, onRead, onSeek, onMeta, pUserData, pUserDataMD, relaxed);
7716         }
7717 #endif
7718     }
7719
7720     /* Unsupported container. */
7721     return DRFLAC_FALSE;
7722 }
7723
7724 static void drflac__init_from_info(drflac* pFlac, const drflac_init_info* pInit)
7725 {
7726     DRFLAC_ASSERT(pFlac != NULL);
7727     DRFLAC_ASSERT(pInit != NULL);
7728
7729     DRFLAC_ZERO_MEMORY(pFlac, sizeof(*pFlac));
7730     pFlac->bs                      = pInit->bs;
7731     pFlac->onMeta                  = pInit->onMeta;
7732     pFlac->pUserDataMD             = pInit->pUserDataMD;
7733     pFlac->maxBlockSizeInPCMFrames = pInit->maxBlockSizeInPCMFrames;
7734     pFlac->sampleRate              = pInit->sampleRate;
7735     pFlac->channels                = (drflac_uint8)pInit->channels;
7736     pFlac->bitsPerSample           = (drflac_uint8)pInit->bitsPerSample;
7737     pFlac->totalPCMFrameCount      = pInit->totalPCMFrameCount;
7738     pFlac->container               = pInit->container;
7739 }
7740
7741
7742 static drflac* drflac_open_with_metadata_private(drflac_read_proc onRead, drflac_seek_proc onSeek, drflac_meta_proc onMeta, drflac_container container, void* pUserData, void* pUserDataMD, const drflac_allocation_callbacks* pAllocationCallbacks)
7743 {
7744     drflac_init_info init;
7745     drflac_uint32 allocationSize;
7746     drflac_uint32 wholeSIMDVectorCountPerChannel;
7747     drflac_uint32 decodedSamplesAllocationSize;
7748 #ifndef DR_FLAC_NO_OGG
7749     drflac_oggbs oggbs;
7750 #endif
7751     drflac_uint64 firstFramePos;
7752     drflac_uint64 seektablePos;
7753     drflac_uint32 seektableSize;
7754     drflac_allocation_callbacks allocationCallbacks;
7755     drflac* pFlac;
7756
7757     /* CPU support first. */
7758     drflac__init_cpu_caps();
7759
7760     if (!drflac__init_private(&init, onRead, onSeek, onMeta, container, pUserData, pUserDataMD)) {
7761         return NULL;
7762     }
7763
7764     if (pAllocationCallbacks != NULL) {
7765         allocationCallbacks = *pAllocationCallbacks;
7766         if (allocationCallbacks.onFree == NULL || (allocationCallbacks.onMalloc == NULL && allocationCallbacks.onRealloc == NULL)) {
7767             return NULL;    /* Invalid allocation callbacks. */
7768         }
7769     } else {
7770         allocationCallbacks.pUserData = NULL;
7771         allocationCallbacks.onMalloc  = drflac__malloc_default;
7772         allocationCallbacks.onRealloc = drflac__realloc_default;
7773         allocationCallbacks.onFree    = drflac__free_default;
7774     }
7775
7776
7777     /*
7778     The size of the allocation for the drflac object needs to be large enough to fit the following:
7779       1) The main members of the drflac structure
7780       2) A block of memory large enough to store the decoded samples of the largest frame in the stream
7781       3) If the container is Ogg, a drflac_oggbs object
7782
7783     The complicated part of the allocation is making sure there's enough room the decoded samples, taking into consideration
7784     the different SIMD instruction sets.
7785     */
7786     allocationSize = sizeof(drflac);
7787
7788     /*
7789     The allocation size for decoded frames depends on the number of 32-bit integers that fit inside the largest SIMD vector
7790     we are supporting.
7791     */
7792     if ((init.maxBlockSizeInPCMFrames % (DRFLAC_MAX_SIMD_VECTOR_SIZE / sizeof(drflac_int32))) == 0) {
7793         wholeSIMDVectorCountPerChannel = (init.maxBlockSizeInPCMFrames / (DRFLAC_MAX_SIMD_VECTOR_SIZE / sizeof(drflac_int32)));
7794     } else {
7795         wholeSIMDVectorCountPerChannel = (init.maxBlockSizeInPCMFrames / (DRFLAC_MAX_SIMD_VECTOR_SIZE / sizeof(drflac_int32))) + 1;
7796     }
7797
7798     decodedSamplesAllocationSize = wholeSIMDVectorCountPerChannel * DRFLAC_MAX_SIMD_VECTOR_SIZE * init.channels;
7799
7800     allocationSize += decodedSamplesAllocationSize;
7801     allocationSize += DRFLAC_MAX_SIMD_VECTOR_SIZE;  /* Allocate extra bytes to ensure we have enough for alignment. */
7802
7803 #ifndef DR_FLAC_NO_OGG
7804     /* There's additional data required for Ogg streams. */
7805     if (init.container == drflac_container_ogg) {
7806         allocationSize += sizeof(drflac_oggbs);
7807     }
7808
7809     DRFLAC_ZERO_MEMORY(&oggbs, sizeof(oggbs));
7810     if (init.container == drflac_container_ogg) {
7811         oggbs.onRead = onRead;
7812         oggbs.onSeek = onSeek;
7813         oggbs.pUserData = pUserData;
7814         oggbs.currentBytePos = init.oggFirstBytePos;
7815         oggbs.firstBytePos = init.oggFirstBytePos;
7816         oggbs.serialNumber = init.oggSerial;
7817         oggbs.bosPageHeader = init.oggBosHeader;
7818         oggbs.bytesRemainingInPage = 0;
7819     }
7820 #endif
7821
7822     /*
7823     This part is a bit awkward. We need to load the seektable so that it can be referenced in-memory, but I want the drflac object to
7824     consist of only a single heap allocation. To this, the size of the seek table needs to be known, which we determine when reading
7825     and decoding the metadata.
7826     */
7827     firstFramePos = 42;   /* <-- We know we are at byte 42 at this point. */
7828     seektablePos  = 0;
7829     seektableSize = 0;
7830     if (init.hasMetadataBlocks) {
7831         drflac_read_proc onReadOverride = onRead;
7832         drflac_seek_proc onSeekOverride = onSeek;
7833         void* pUserDataOverride = pUserData;
7834
7835 #ifndef DR_FLAC_NO_OGG
7836         if (init.container == drflac_container_ogg) {
7837             onReadOverride = drflac__on_read_ogg;
7838             onSeekOverride = drflac__on_seek_ogg;
7839             pUserDataOverride = (void*)&oggbs;
7840         }
7841 #endif
7842
7843         if (!drflac__read_and_decode_metadata(onReadOverride, onSeekOverride, onMeta, pUserDataOverride, pUserDataMD, &firstFramePos, &seektablePos, &seektableSize, &allocationCallbacks)) {
7844             return NULL;
7845         }
7846
7847         allocationSize += seektableSize;
7848     }
7849
7850
7851     pFlac = (drflac*)drflac__malloc_from_callbacks(allocationSize, &allocationCallbacks);
7852     if (pFlac == NULL) {
7853         return NULL;
7854     }
7855
7856     drflac__init_from_info(pFlac, &init);
7857     pFlac->allocationCallbacks = allocationCallbacks;
7858     pFlac->pDecodedSamples = (drflac_int32*)drflac_align((size_t)pFlac->pExtraData, DRFLAC_MAX_SIMD_VECTOR_SIZE);
7859
7860 #ifndef DR_FLAC_NO_OGG
7861     if (init.container == drflac_container_ogg) {
7862         drflac_oggbs* pInternalOggbs = (drflac_oggbs*)((drflac_uint8*)pFlac->pDecodedSamples + decodedSamplesAllocationSize + seektableSize);
7863         *pInternalOggbs = oggbs;
7864
7865         /* The Ogg bistream needs to be layered on top of the original bitstream. */
7866         pFlac->bs.onRead = drflac__on_read_ogg;
7867         pFlac->bs.onSeek = drflac__on_seek_ogg;
7868         pFlac->bs.pUserData = (void*)pInternalOggbs;
7869         pFlac->_oggbs = (void*)pInternalOggbs;
7870     }
7871 #endif
7872
7873     pFlac->firstFLACFramePosInBytes = firstFramePos;
7874
7875     /* NOTE: Seektables are not currently compatible with Ogg encapsulation (Ogg has its own accelerated seeking system). I may change this later, so I'm leaving this here for now. */
7876 #ifndef DR_FLAC_NO_OGG
7877     if (init.container == drflac_container_ogg)
7878     {
7879         pFlac->pSeekpoints = NULL;
7880         pFlac->seekpointCount = 0;
7881     }
7882     else
7883 #endif
7884     {
7885         /* If we have a seektable we need to load it now, making sure we move back to where we were previously. */
7886         if (seektablePos != 0) {
7887             pFlac->seekpointCount = seektableSize / sizeof(*pFlac->pSeekpoints);
7888             pFlac->pSeekpoints = (drflac_seekpoint*)((drflac_uint8*)pFlac->pDecodedSamples + decodedSamplesAllocationSize);
7889
7890             DRFLAC_ASSERT(pFlac->bs.onSeek != NULL);
7891             DRFLAC_ASSERT(pFlac->bs.onRead != NULL);
7892
7893             /* Seek to the seektable, then just read directly into our seektable buffer. */
7894             if (pFlac->bs.onSeek(pFlac->bs.pUserData, (int)seektablePos, drflac_seek_origin_start)) {
7895                 if (pFlac->bs.onRead(pFlac->bs.pUserData, pFlac->pSeekpoints, seektableSize) == seektableSize) {
7896                     /* Endian swap. */
7897                     drflac_uint32 iSeekpoint;
7898                     for (iSeekpoint = 0; iSeekpoint < pFlac->seekpointCount; ++iSeekpoint) {
7899                         pFlac->pSeekpoints[iSeekpoint].firstPCMFrame   = drflac__be2host_64(pFlac->pSeekpoints[iSeekpoint].firstPCMFrame);
7900                         pFlac->pSeekpoints[iSeekpoint].flacFrameOffset = drflac__be2host_64(pFlac->pSeekpoints[iSeekpoint].flacFrameOffset);
7901                         pFlac->pSeekpoints[iSeekpoint].pcmFrameCount   = drflac__be2host_16(pFlac->pSeekpoints[iSeekpoint].pcmFrameCount);
7902                     }
7903                 } else {
7904                     /* Failed to read the seektable. Pretend we don't have one. */
7905                     pFlac->pSeekpoints = NULL;
7906                     pFlac->seekpointCount = 0;
7907                 }
7908
7909                 /* We need to seek back to where we were. If this fails it's a critical error. */
7910                 if (!pFlac->bs.onSeek(pFlac->bs.pUserData, (int)pFlac->firstFLACFramePosInBytes, drflac_seek_origin_start)) {
7911                     drflac__free_from_callbacks(pFlac, &allocationCallbacks);
7912                     return NULL;
7913                 }
7914             } else {
7915                 /* Failed to seek to the seektable. Ominous sign, but for now we can just pretend we don't have one. */
7916                 pFlac->pSeekpoints = NULL;
7917                 pFlac->seekpointCount = 0;
7918             }
7919         }
7920     }
7921
7922
7923     /*
7924     If we get here, but don't have a STREAMINFO block, it means we've opened the stream in relaxed mode and need to decode
7925     the first frame.
7926     */
7927     if (!init.hasStreamInfoBlock) {
7928         pFlac->currentFLACFrame.header = init.firstFrameHeader;
7929         for (;;) {
7930             drflac_result result = drflac__decode_flac_frame(pFlac);
7931             if (result == DRFLAC_SUCCESS) {
7932                 break;
7933             } else {
7934                 if (result == DRFLAC_CRC_MISMATCH) {
7935                     if (!drflac__read_next_flac_frame_header(&pFlac->bs, pFlac->bitsPerSample, &pFlac->currentFLACFrame.header)) {
7936                         drflac__free_from_callbacks(pFlac, &allocationCallbacks);
7937                         return NULL;
7938                     }
7939                     continue;
7940                 } else {
7941                     drflac__free_from_callbacks(pFlac, &allocationCallbacks);
7942                     return NULL;
7943                 }
7944             }
7945         }
7946     }
7947
7948     return pFlac;
7949 }
7950
7951
7952
7953 #ifndef DR_FLAC_NO_STDIO
7954 #include <stdio.h>
7955 #include <wchar.h>      /* For wcslen(), wcsrtombs() */
7956
/* drflac_result_from_errno() is only used for fopen() and wfopen() so putting it inside DR_FLAC_NO_STDIO for now. If something else needs this later we can move it out. */
7958 #include <errno.h>
7959 static drflac_result drflac_result_from_errno(int e)
7960 {
7961     switch (e)
7962     {
7963         case 0: return DRFLAC_SUCCESS;
7964     #ifdef EPERM
7965         case EPERM: return DRFLAC_INVALID_OPERATION;
7966     #endif
7967     #ifdef ENOENT
7968         case ENOENT: return DRFLAC_DOES_NOT_EXIST;
7969     #endif
7970     #ifdef ESRCH
7971         case ESRCH: return DRFLAC_DOES_NOT_EXIST;
7972     #endif
7973     #ifdef EINTR
7974         case EINTR: return DRFLAC_INTERRUPT;
7975     #endif
7976     #ifdef EIO
7977         case EIO: return DRFLAC_IO_ERROR;
7978     #endif
7979     #ifdef ENXIO
7980         case ENXIO: return DRFLAC_DOES_NOT_EXIST;
7981     #endif
7982     #ifdef E2BIG
7983         case E2BIG: return DRFLAC_INVALID_ARGS;
7984     #endif
7985     #ifdef ENOEXEC
7986         case ENOEXEC: return DRFLAC_INVALID_FILE;
7987     #endif
7988     #ifdef EBADF
7989         case EBADF: return DRFLAC_INVALID_FILE;
7990     #endif
7991     #ifdef ECHILD
7992         case ECHILD: return DRFLAC_ERROR;
7993     #endif
7994     #ifdef EAGAIN
7995         case EAGAIN: return DRFLAC_UNAVAILABLE;
7996     #endif
7997     #ifdef ENOMEM
7998         case ENOMEM: return DRFLAC_OUT_OF_MEMORY;
7999     #endif
8000     #ifdef EACCES
8001         case EACCES: return DRFLAC_ACCESS_DENIED;
8002     #endif
8003     #ifdef EFAULT
8004         case EFAULT: return DRFLAC_BAD_ADDRESS;
8005     #endif
8006     #ifdef ENOTBLK
8007         case ENOTBLK: return DRFLAC_ERROR;
8008     #endif
8009     #ifdef EBUSY
8010         case EBUSY: return DRFLAC_BUSY;
8011     #endif
8012     #ifdef EEXIST
8013         case EEXIST: return DRFLAC_ALREADY_EXISTS;
8014     #endif
8015     #ifdef EXDEV
8016         case EXDEV: return DRFLAC_ERROR;
8017     #endif
8018     #ifdef ENODEV
8019         case ENODEV: return DRFLAC_DOES_NOT_EXIST;
8020     #endif
8021     #ifdef ENOTDIR
8022         case ENOTDIR: return DRFLAC_NOT_DIRECTORY;
8023     #endif
8024     #ifdef EISDIR
8025         case EISDIR: return DRFLAC_IS_DIRECTORY;
8026     #endif
8027     #ifdef EINVAL
8028         case EINVAL: return DRFLAC_INVALID_ARGS;
8029     #endif
8030     #ifdef ENFILE
8031         case ENFILE: return DRFLAC_TOO_MANY_OPEN_FILES;
8032     #endif
8033     #ifdef EMFILE
8034         case EMFILE: return DRFLAC_TOO_MANY_OPEN_FILES;
8035     #endif
8036     #ifdef ENOTTY
8037         case ENOTTY: return DRFLAC_INVALID_OPERATION;
8038     #endif
8039     #ifdef ETXTBSY
8040         case ETXTBSY: return DRFLAC_BUSY;
8041     #endif
8042     #ifdef EFBIG
8043         case EFBIG: return DRFLAC_TOO_BIG;
8044     #endif
8045     #ifdef ENOSPC
8046         case ENOSPC: return DRFLAC_NO_SPACE;
8047     #endif
8048     #ifdef ESPIPE
8049         case ESPIPE: return DRFLAC_BAD_SEEK;
8050     #endif
8051     #ifdef EROFS
8052         case EROFS: return DRFLAC_ACCESS_DENIED;
8053     #endif
8054     #ifdef EMLINK
8055         case EMLINK: return DRFLAC_TOO_MANY_LINKS;
8056     #endif
8057     #ifdef EPIPE
8058         case EPIPE: return DRFLAC_BAD_PIPE;
8059     #endif
8060     #ifdef EDOM
8061         case EDOM: return DRFLAC_OUT_OF_RANGE;
8062     #endif
8063     #ifdef ERANGE
8064         case ERANGE: return DRFLAC_OUT_OF_RANGE;
8065     #endif
8066     #ifdef EDEADLK
8067         case EDEADLK: return DRFLAC_DEADLOCK;
8068     #endif
8069     #ifdef ENAMETOOLONG
8070         case ENAMETOOLONG: return DRFLAC_PATH_TOO_LONG;
8071     #endif
8072     #ifdef ENOLCK
8073         case ENOLCK: return DRFLAC_ERROR;
8074     #endif
8075     #ifdef ENOSYS
8076         case ENOSYS: return DRFLAC_NOT_IMPLEMENTED;
8077     #endif
8078     #ifdef ENOTEMPTY
8079         case ENOTEMPTY: return DRFLAC_DIRECTORY_NOT_EMPTY;
8080     #endif
8081     #ifdef ELOOP
8082         case ELOOP: return DRFLAC_TOO_MANY_LINKS;
8083     #endif
8084     #ifdef ENOMSG
8085         case ENOMSG: return DRFLAC_NO_MESSAGE;
8086     #endif
8087     #ifdef EIDRM
8088         case EIDRM: return DRFLAC_ERROR;
8089     #endif
8090     #ifdef ECHRNG
8091         case ECHRNG: return DRFLAC_ERROR;
8092     #endif
8093     #ifdef EL2NSYNC
8094         case EL2NSYNC: return DRFLAC_ERROR;
8095     #endif
8096     #ifdef EL3HLT
8097         case EL3HLT: return DRFLAC_ERROR;
8098     #endif
8099     #ifdef EL3RST
8100         case EL3RST: return DRFLAC_ERROR;
8101     #endif
8102     #ifdef ELNRNG
8103         case ELNRNG: return DRFLAC_OUT_OF_RANGE;
8104     #endif
8105     #ifdef EUNATCH
8106         case EUNATCH: return DRFLAC_ERROR;
8107     #endif
8108     #ifdef ENOCSI
8109         case ENOCSI: return DRFLAC_ERROR;
8110     #endif
8111     #ifdef EL2HLT
8112         case EL2HLT: return DRFLAC_ERROR;
8113     #endif
8114     #ifdef EBADE
8115         case EBADE: return DRFLAC_ERROR;
8116     #endif
8117     #ifdef EBADR
8118         case EBADR: return DRFLAC_ERROR;
8119     #endif
8120     #ifdef EXFULL
8121         case EXFULL: return DRFLAC_ERROR;
8122     #endif
8123     #ifdef ENOANO
8124         case ENOANO: return DRFLAC_ERROR;
8125     #endif
8126     #ifdef EBADRQC
8127         case EBADRQC: return DRFLAC_ERROR;
8128     #endif
8129     #ifdef EBADSLT
8130         case EBADSLT: return DRFLAC_ERROR;
8131     #endif
8132     #ifdef EBFONT
8133         case EBFONT: return DRFLAC_INVALID_FILE;
8134     #endif
8135     #ifdef ENOSTR
8136         case ENOSTR: return DRFLAC_ERROR;
8137     #endif
8138     #ifdef ENODATA
8139         case ENODATA: return DRFLAC_NO_DATA_AVAILABLE;
8140     #endif
8141     #ifdef ETIME
8142         case ETIME: return DRFLAC_TIMEOUT;
8143     #endif
8144     #ifdef ENOSR
8145         case ENOSR: return DRFLAC_NO_DATA_AVAILABLE;
8146     #endif
8147     #ifdef ENONET
8148         case ENONET: return DRFLAC_NO_NETWORK;
8149     #endif
8150     #ifdef ENOPKG
8151         case ENOPKG: return DRFLAC_ERROR;
8152     #endif
8153     #ifdef EREMOTE
8154         case EREMOTE: return DRFLAC_ERROR;
8155     #endif
8156     #ifdef ENOLINK
8157         case ENOLINK: return DRFLAC_ERROR;
8158     #endif
8159     #ifdef EADV
8160         case EADV: return DRFLAC_ERROR;
8161     #endif
8162     #ifdef ESRMNT
8163         case ESRMNT: return DRFLAC_ERROR;
8164     #endif
8165     #ifdef ECOMM
8166         case ECOMM: return DRFLAC_ERROR;
8167     #endif
8168     #ifdef EPROTO
8169         case EPROTO: return DRFLAC_ERROR;
8170     #endif
8171     #ifdef EMULTIHOP
8172         case EMULTIHOP: return DRFLAC_ERROR;
8173     #endif
8174     #ifdef EDOTDOT
8175         case EDOTDOT: return DRFLAC_ERROR;
8176     #endif
8177     #ifdef EBADMSG
8178         case EBADMSG: return DRFLAC_BAD_MESSAGE;
8179     #endif
8180     #ifdef EOVERFLOW
8181         case EOVERFLOW: return DRFLAC_TOO_BIG;
8182     #endif
8183     #ifdef ENOTUNIQ
8184         case ENOTUNIQ: return DRFLAC_NOT_UNIQUE;
8185     #endif
8186     #ifdef EBADFD
8187         case EBADFD: return DRFLAC_ERROR;
8188     #endif
8189     #ifdef EREMCHG
8190         case EREMCHG: return DRFLAC_ERROR;
8191     #endif
8192     #ifdef ELIBACC
8193         case ELIBACC: return DRFLAC_ACCESS_DENIED;
8194     #endif
8195     #ifdef ELIBBAD
8196         case ELIBBAD: return DRFLAC_INVALID_FILE;
8197     #endif
8198     #ifdef ELIBSCN
8199         case ELIBSCN: return DRFLAC_INVALID_FILE;
8200     #endif
8201     #ifdef ELIBMAX
8202         case ELIBMAX: return DRFLAC_ERROR;
8203     #endif
8204     #ifdef ELIBEXEC
8205         case ELIBEXEC: return DRFLAC_ERROR;
8206     #endif
8207     #ifdef EILSEQ
8208         case EILSEQ: return DRFLAC_INVALID_DATA;
8209     #endif
8210     #ifdef ERESTART
8211         case ERESTART: return DRFLAC_ERROR;
8212     #endif
8213     #ifdef ESTRPIPE
8214         case ESTRPIPE: return DRFLAC_ERROR;
8215     #endif
8216     #ifdef EUSERS
8217         case EUSERS: return DRFLAC_ERROR;
8218     #endif
8219     #ifdef ENOTSOCK
8220         case ENOTSOCK: return DRFLAC_NOT_SOCKET;
8221     #endif
8222     #ifdef EDESTADDRREQ
8223         case EDESTADDRREQ: return DRFLAC_NO_ADDRESS;
8224     #endif
8225     #ifdef EMSGSIZE
8226         case EMSGSIZE: return DRFLAC_TOO_BIG;
8227     #endif
8228     #ifdef EPROTOTYPE
8229         case EPROTOTYPE: return DRFLAC_BAD_PROTOCOL;
8230     #endif
8231     #ifdef ENOPROTOOPT
8232         case ENOPROTOOPT: return DRFLAC_PROTOCOL_UNAVAILABLE;
8233     #endif
8234     #ifdef EPROTONOSUPPORT
8235         case EPROTONOSUPPORT: return DRFLAC_PROTOCOL_NOT_SUPPORTED;
8236     #endif
8237     #ifdef ESOCKTNOSUPPORT
8238         case ESOCKTNOSUPPORT: return DRFLAC_SOCKET_NOT_SUPPORTED;
8239     #endif
8240     #ifdef EOPNOTSUPP
8241         case EOPNOTSUPP: return DRFLAC_INVALID_OPERATION;
8242     #endif
8243     #ifdef EPFNOSUPPORT
8244         case EPFNOSUPPORT: return DRFLAC_PROTOCOL_FAMILY_NOT_SUPPORTED;
8245     #endif
8246     #ifdef EAFNOSUPPORT
8247         case EAFNOSUPPORT: return DRFLAC_ADDRESS_FAMILY_NOT_SUPPORTED;
8248     #endif
8249     #ifdef EADDRINUSE
8250         case EADDRINUSE: return DRFLAC_ALREADY_IN_USE;
8251     #endif
8252     #ifdef EADDRNOTAVAIL
8253         case EADDRNOTAVAIL: return DRFLAC_ERROR;
8254     #endif
8255     #ifdef ENETDOWN
8256         case ENETDOWN: return DRFLAC_NO_NETWORK;
8257     #endif
8258     #ifdef ENETUNREACH
8259         case ENETUNREACH: return DRFLAC_NO_NETWORK;
8260     #endif
8261     #ifdef ENETRESET
8262         case ENETRESET: return DRFLAC_NO_NETWORK;
8263     #endif
8264     #ifdef ECONNABORTED
8265         case ECONNABORTED: return DRFLAC_NO_NETWORK;
8266     #endif
8267     #ifdef ECONNRESET
8268         case ECONNRESET: return DRFLAC_CONNECTION_RESET;
8269     #endif
8270     #ifdef ENOBUFS
8271         case ENOBUFS: return DRFLAC_NO_SPACE;
8272     #endif
8273     #ifdef EISCONN
8274         case EISCONN: return DRFLAC_ALREADY_CONNECTED;
8275     #endif
8276     #ifdef ENOTCONN
8277         case ENOTCONN: return DRFLAC_NOT_CONNECTED;
8278     #endif
8279     #ifdef ESHUTDOWN
8280         case ESHUTDOWN: return DRFLAC_ERROR;
8281     #endif
8282     #ifdef ETOOMANYREFS
8283         case ETOOMANYREFS: return DRFLAC_ERROR;
8284     #endif
8285     #ifdef ETIMEDOUT
8286         case ETIMEDOUT: return DRFLAC_TIMEOUT;
8287     #endif
8288     #ifdef ECONNREFUSED
8289         case ECONNREFUSED: return DRFLAC_CONNECTION_REFUSED;
8290     #endif
8291     #ifdef EHOSTDOWN
8292         case EHOSTDOWN: return DRFLAC_NO_HOST;
8293     #endif
8294     #ifdef EHOSTUNREACH
8295         case EHOSTUNREACH: return DRFLAC_NO_HOST;
8296     #endif
8297     #ifdef EALREADY
8298         case EALREADY: return DRFLAC_IN_PROGRESS;
8299     #endif
8300     #ifdef EINPROGRESS
8301         case EINPROGRESS: return DRFLAC_IN_PROGRESS;
8302     #endif
8303     #ifdef ESTALE
8304         case ESTALE: return DRFLAC_INVALID_FILE;
8305     #endif
8306     #ifdef EUCLEAN
8307         case EUCLEAN: return DRFLAC_ERROR;
8308     #endif
8309     #ifdef ENOTNAM
8310         case ENOTNAM: return DRFLAC_ERROR;
8311     #endif
8312     #ifdef ENAVAIL
8313         case ENAVAIL: return DRFLAC_ERROR;
8314     #endif
8315     #ifdef EISNAM
8316         case EISNAM: return DRFLAC_ERROR;
8317     #endif
8318     #ifdef EREMOTEIO
8319         case EREMOTEIO: return DRFLAC_IO_ERROR;
8320     #endif
8321     #ifdef EDQUOT
8322         case EDQUOT: return DRFLAC_NO_SPACE;
8323     #endif
8324     #ifdef ENOMEDIUM
8325         case ENOMEDIUM: return DRFLAC_DOES_NOT_EXIST;
8326     #endif
8327     #ifdef EMEDIUMTYPE
8328         case EMEDIUMTYPE: return DRFLAC_ERROR;
8329     #endif
8330     #ifdef ECANCELED
8331         case ECANCELED: return DRFLAC_CANCELLED;
8332     #endif
8333     #ifdef ENOKEY
8334         case ENOKEY: return DRFLAC_ERROR;
8335     #endif
8336     #ifdef EKEYEXPIRED
8337         case EKEYEXPIRED: return DRFLAC_ERROR;
8338     #endif
8339     #ifdef EKEYREVOKED
8340         case EKEYREVOKED: return DRFLAC_ERROR;
8341     #endif
8342     #ifdef EKEYREJECTED
8343         case EKEYREJECTED: return DRFLAC_ERROR;
8344     #endif
8345     #ifdef EOWNERDEAD
8346         case EOWNERDEAD: return DRFLAC_ERROR;
8347     #endif
8348     #ifdef ENOTRECOVERABLE
8349         case ENOTRECOVERABLE: return DRFLAC_ERROR;
8350     #endif
8351     #ifdef ERFKILL
8352         case ERFKILL: return DRFLAC_ERROR;
8353     #endif
8354     #ifdef EHWPOISON
8355         case EHWPOISON: return DRFLAC_ERROR;
8356     #endif
8357         default: return DRFLAC_ERROR;
8358     }
8359 }
8360
8361 static drflac_result drflac_fopen(FILE** ppFile, const char* pFilePath, const char* pOpenMode)
8362 {
8363 #if defined(_MSC_VER) && _MSC_VER >= 1400
8364     errno_t err;
8365 #endif
8366
8367     if (ppFile != NULL) {
8368         *ppFile = NULL;  /* Safety. */
8369     }
8370
8371     if (pFilePath == NULL || pOpenMode == NULL || ppFile == NULL) {
8372         return DRFLAC_INVALID_ARGS;
8373     }
8374
8375 #if defined(_MSC_VER) && _MSC_VER >= 1400
8376     err = fopen_s(ppFile, pFilePath, pOpenMode);
8377     if (err != 0) {
8378         return drflac_result_from_errno(err);
8379     }
8380 #else
8381 #if defined(_WIN32) || defined(__APPLE__)
8382     *ppFile = fopen(pFilePath, pOpenMode);
8383 #else
8384     #if defined(_FILE_OFFSET_BITS) && _FILE_OFFSET_BITS == 64 && defined(_LARGEFILE64_SOURCE)
8385         *ppFile = fopen64(pFilePath, pOpenMode);
8386     #else
8387         *ppFile = fopen(pFilePath, pOpenMode);
8388     #endif
8389 #endif
8390     if (*ppFile == NULL) {
8391         drflac_result result = drflac_result_from_errno(errno);
8392         if (result == DRFLAC_SUCCESS) {
8393             result = DRFLAC_ERROR;   /* Just a safety check to make sure we never ever return success when pFile == NULL. */
8394         }
8395
8396         return result;
8397     }
8398 #endif
8399
8400     return DRFLAC_SUCCESS;
8401 }
8402
8403 /*
8404 _wfopen() isn't always available in all compilation environments.
8405
8406     * Windows only.
8407     * MSVC seems to support it universally as far back as VC6 from what I can tell (haven't checked further back).
8408     * MinGW-64 (both 32- and 64-bit) seems to support it.
8409     * MinGW wraps it in !defined(__STRICT_ANSI__).
8410     * OpenWatcom wraps it in !defined(_NO_EXT_KEYS).
8411
8412 This can be reviewed as compatibility issues arise. The preference is to use _wfopen_s() and _wfopen() as opposed to the wcsrtombs()
8413 fallback, so if you notice your compiler not detecting this properly I'm happy to look at adding support.
8414 */
8415 #if defined(_WIN32)
8416     #if defined(_MSC_VER) || defined(__MINGW64__) || (!defined(__STRICT_ANSI__) && !defined(_NO_EXT_KEYS))
8417         #define DRFLAC_HAS_WFOPEN
8418     #endif
8419 #endif
8420
8421 static drflac_result drflac_wfopen(FILE** ppFile, const wchar_t* pFilePath, const wchar_t* pOpenMode, const drflac_allocation_callbacks* pAllocationCallbacks)
8422 {
8423     if (ppFile != NULL) {
8424         *ppFile = NULL;  /* Safety. */
8425     }
8426
8427     if (pFilePath == NULL || pOpenMode == NULL || ppFile == NULL) {
8428         return DRFLAC_INVALID_ARGS;
8429     }
8430
8431 #if defined(DRFLAC_HAS_WFOPEN)
8432     {
8433         /* Use _wfopen() on Windows. */
8434     #if defined(_MSC_VER) && _MSC_VER >= 1400
8435         errno_t err = _wfopen_s(ppFile, pFilePath, pOpenMode);
8436         if (err != 0) {
8437             return drflac_result_from_errno(err);
8438         }
8439     #else
8440         *ppFile = _wfopen(pFilePath, pOpenMode);
8441         if (*ppFile == NULL) {
8442             return drflac_result_from_errno(errno);
8443         }
8444     #endif
8445         (void)pAllocationCallbacks;
8446     }
8447 #else
8448     /*
8449     Use fopen() on anything other than Windows. Requires a conversion. This is annoying because fopen() is locale specific. The only real way I can
8450     think of to do this is with wcsrtombs(). Note that wcstombs() is apparently not thread-safe because it uses a static global mbstate_t object for
8451     maintaining state. I've checked this with -std=c89 and it works, but if somebody get's a compiler error I'll look into improving compatibility.
8452     */
8453     {
8454         mbstate_t mbs;
8455         size_t lenMB;
8456         const wchar_t* pFilePathTemp = pFilePath;
8457         char* pFilePathMB = NULL;
8458         char pOpenModeMB[32] = {0};
8459
8460         /* Get the length first. */
8461         DRFLAC_ZERO_OBJECT(&mbs);
8462         lenMB = wcsrtombs(NULL, &pFilePathTemp, 0, &mbs);
8463         if (lenMB == (size_t)-1) {
8464             return drflac_result_from_errno(errno);
8465         }
8466
8467         pFilePathMB = (char*)drflac__malloc_from_callbacks(lenMB + 1, pAllocationCallbacks);
8468         if (pFilePathMB == NULL) {
8469             return DRFLAC_OUT_OF_MEMORY;
8470         }
8471
8472         pFilePathTemp = pFilePath;
8473         DRFLAC_ZERO_OBJECT(&mbs);
8474         wcsrtombs(pFilePathMB, &pFilePathTemp, lenMB + 1, &mbs);
8475
8476         /* The open mode should always consist of ASCII characters so we should be able to do a trivial conversion. */
8477         {
8478             size_t i = 0;
8479             for (;;) {
8480                 if (pOpenMode[i] == 0) {
8481                     pOpenModeMB[i] = '\0';
8482                     break;
8483                 }
8484
8485                 pOpenModeMB[i] = (char)pOpenMode[i];
8486                 i += 1;
8487             }
8488         }
8489
8490         *ppFile = fopen(pFilePathMB, pOpenModeMB);
8491
8492         drflac__free_from_callbacks(pFilePathMB, pAllocationCallbacks);
8493     }
8494
8495     if (*ppFile == NULL) {
8496         return DRFLAC_ERROR;
8497     }
8498 #endif
8499
8500     return DRFLAC_SUCCESS;
8501 }
8502
/*
Read callback for stdio-backed streams. pUserData is the FILE* to read from.

Returns the number of bytes fread() actually delivered, which may be fewer than bytesToRead.
*/
static size_t drflac__on_read_stdio(void* pUserData, void* bufferOut, size_t bytesToRead)
{
    FILE* pFile = (FILE*)pUserData;
    size_t bytesRead;

    bytesRead = fread(bufferOut, 1, bytesToRead, pFile);

    return bytesRead;
}
8507
8508 static drflac_bool32 drflac__on_seek_stdio(void* pUserData, int offset, drflac_seek_origin origin)
8509 {
8510     DRFLAC_ASSERT(offset >= 0);  /* <-- Never seek backwards. */
8511
8512     return fseek((FILE*)pUserData, offset, (origin == drflac_seek_origin_current) ? SEEK_CUR : SEEK_SET) == 0;
8513 }
8514
8515
8516 DRFLAC_API drflac* drflac_open_file(const char* pFileName, const drflac_allocation_callbacks* pAllocationCallbacks)
8517 {
8518     drflac* pFlac;
8519     FILE* pFile;
8520
8521     if (drflac_fopen(&pFile, pFileName, "rb") != DRFLAC_SUCCESS) {
8522         return NULL;
8523     }
8524
8525     pFlac = drflac_open(drflac__on_read_stdio, drflac__on_seek_stdio, (void*)pFile, pAllocationCallbacks);
8526     if (pFlac == NULL) {
8527         fclose(pFile);
8528         return NULL;
8529     }
8530
8531     return pFlac;
8532 }
8533
8534 DRFLAC_API drflac* drflac_open_file_w(const wchar_t* pFileName, const drflac_allocation_callbacks* pAllocationCallbacks)
8535 {
8536     drflac* pFlac;
8537     FILE* pFile;
8538
8539     if (drflac_wfopen(&pFile, pFileName, L"rb", pAllocationCallbacks) != DRFLAC_SUCCESS) {
8540         return NULL;
8541     }
8542
8543     pFlac = drflac_open(drflac__on_read_stdio, drflac__on_seek_stdio, (void*)pFile, pAllocationCallbacks);
8544     if (pFlac == NULL) {
8545         fclose(pFile);
8546         return NULL;
8547     }
8548
8549     return pFlac;
8550 }
8551
8552 DRFLAC_API drflac* drflac_open_file_with_metadata(const char* pFileName, drflac_meta_proc onMeta, void* pUserData, const drflac_allocation_callbacks* pAllocationCallbacks)
8553 {
8554     drflac* pFlac;
8555     FILE* pFile;
8556
8557     if (drflac_fopen(&pFile, pFileName, "rb") != DRFLAC_SUCCESS) {
8558         return NULL;
8559     }
8560
8561     pFlac = drflac_open_with_metadata_private(drflac__on_read_stdio, drflac__on_seek_stdio, onMeta, drflac_container_unknown, (void*)pFile, pUserData, pAllocationCallbacks);
8562     if (pFlac == NULL) {
8563         fclose(pFile);
8564         return pFlac;
8565     }
8566
8567     return pFlac;
8568 }
8569
8570 DRFLAC_API drflac* drflac_open_file_with_metadata_w(const wchar_t* pFileName, drflac_meta_proc onMeta, void* pUserData, const drflac_allocation_callbacks* pAllocationCallbacks)
8571 {
8572     drflac* pFlac;
8573     FILE* pFile;
8574
8575     if (drflac_wfopen(&pFile, pFileName, L"rb", pAllocationCallbacks) != DRFLAC_SUCCESS) {
8576         return NULL;
8577     }
8578
8579     pFlac = drflac_open_with_metadata_private(drflac__on_read_stdio, drflac__on_seek_stdio, onMeta, drflac_container_unknown, (void*)pFile, pUserData, pAllocationCallbacks);
8580     if (pFlac == NULL) {
8581         fclose(pFile);
8582         return pFlac;
8583     }
8584
8585     return pFlac;
8586 }
8587 #endif  /* DR_FLAC_NO_STDIO */
8588
8589 static size_t drflac__on_read_memory(void* pUserData, void* bufferOut, size_t bytesToRead)
8590 {
8591     drflac__memory_stream* memoryStream = (drflac__memory_stream*)pUserData;
8592     size_t bytesRemaining;
8593
8594     DRFLAC_ASSERT(memoryStream != NULL);
8595     DRFLAC_ASSERT(memoryStream->dataSize >= memoryStream->currentReadPos);
8596
8597     bytesRemaining = memoryStream->dataSize - memoryStream->currentReadPos;
8598     if (bytesToRead > bytesRemaining) {
8599         bytesToRead = bytesRemaining;
8600     }
8601
8602     if (bytesToRead > 0) {
8603         DRFLAC_COPY_MEMORY(bufferOut, memoryStream->data + memoryStream->currentReadPos, bytesToRead);
8604         memoryStream->currentReadPos += bytesToRead;
8605     }
8606
8607     return bytesToRead;
8608 }
8609
8610 static drflac_bool32 drflac__on_seek_memory(void* pUserData, int offset, drflac_seek_origin origin)
8611 {
8612     drflac__memory_stream* memoryStream = (drflac__memory_stream*)pUserData;
8613
8614     DRFLAC_ASSERT(memoryStream != NULL);
8615     DRFLAC_ASSERT(offset >= 0); /* <-- Never seek backwards. */
8616
8617     if (offset > (drflac_int64)memoryStream->dataSize) {
8618         return DRFLAC_FALSE;
8619     }
8620
8621     if (origin == drflac_seek_origin_current) {
8622         if (memoryStream->currentReadPos + offset <= memoryStream->dataSize) {
8623             memoryStream->currentReadPos += offset;
8624         } else {
8625             return DRFLAC_FALSE;  /* Trying to seek too far forward. */
8626         }
8627     } else {
8628         if ((drflac_uint32)offset <= memoryStream->dataSize) {
8629             memoryStream->currentReadPos = offset;
8630         } else {
8631             return DRFLAC_FALSE;  /* Trying to seek too far forward. */
8632         }
8633     }
8634
8635     return DRFLAC_TRUE;
8636 }
8637
8638 DRFLAC_API drflac* drflac_open_memory(const void* pData, size_t dataSize, const drflac_allocation_callbacks* pAllocationCallbacks)
8639 {
8640     drflac__memory_stream memoryStream;
8641     drflac* pFlac;
8642
8643     memoryStream.data = (const drflac_uint8*)pData;
8644     memoryStream.dataSize = dataSize;
8645     memoryStream.currentReadPos = 0;
8646     pFlac = drflac_open(drflac__on_read_memory, drflac__on_seek_memory, &memoryStream, pAllocationCallbacks);
8647     if (pFlac == NULL) {
8648         return NULL;
8649     }
8650
8651     pFlac->memoryStream = memoryStream;
8652
8653     /* This is an awful hack... */
8654 #ifndef DR_FLAC_NO_OGG
8655     if (pFlac->container == drflac_container_ogg)
8656     {
8657         drflac_oggbs* oggbs = (drflac_oggbs*)pFlac->_oggbs;
8658         oggbs->pUserData = &pFlac->memoryStream;
8659     }
8660     else
8661 #endif
8662     {
8663         pFlac->bs.pUserData = &pFlac->memoryStream;
8664     }
8665
8666     return pFlac;
8667 }
8668
8669 DRFLAC_API drflac* drflac_open_memory_with_metadata(const void* pData, size_t dataSize, drflac_meta_proc onMeta, void* pUserData, const drflac_allocation_callbacks* pAllocationCallbacks)
8670 {
8671     drflac__memory_stream memoryStream;
8672     drflac* pFlac;
8673
8674     memoryStream.data = (const drflac_uint8*)pData;
8675     memoryStream.dataSize = dataSize;
8676     memoryStream.currentReadPos = 0;
8677     pFlac = drflac_open_with_metadata_private(drflac__on_read_memory, drflac__on_seek_memory, onMeta, drflac_container_unknown, &memoryStream, pUserData, pAllocationCallbacks);
8678     if (pFlac == NULL) {
8679         return NULL;
8680     }
8681
8682     pFlac->memoryStream = memoryStream;
8683
8684     /* This is an awful hack... */
8685 #ifndef DR_FLAC_NO_OGG
8686     if (pFlac->container == drflac_container_ogg)
8687     {
8688         drflac_oggbs* oggbs = (drflac_oggbs*)pFlac->_oggbs;
8689         oggbs->pUserData = &pFlac->memoryStream;
8690     }
8691     else
8692 #endif
8693     {
8694         pFlac->bs.pUserData = &pFlac->memoryStream;
8695     }
8696
8697     return pFlac;
8698 }
8699
8700
8701
8702 DRFLAC_API drflac* drflac_open(drflac_read_proc onRead, drflac_seek_proc onSeek, void* pUserData, const drflac_allocation_callbacks* pAllocationCallbacks)
8703 {
8704     return drflac_open_with_metadata_private(onRead, onSeek, NULL, drflac_container_unknown, pUserData, pUserData, pAllocationCallbacks);
8705 }
8706 DRFLAC_API drflac* drflac_open_relaxed(drflac_read_proc onRead, drflac_seek_proc onSeek, drflac_container container, void* pUserData, const drflac_allocation_callbacks* pAllocationCallbacks)
8707 {
8708     return drflac_open_with_metadata_private(onRead, onSeek, NULL, container, pUserData, pUserData, pAllocationCallbacks);
8709 }
8710
8711 DRFLAC_API drflac* drflac_open_with_metadata(drflac_read_proc onRead, drflac_seek_proc onSeek, drflac_meta_proc onMeta, void* pUserData, const drflac_allocation_callbacks* pAllocationCallbacks)
8712 {
8713     return drflac_open_with_metadata_private(onRead, onSeek, onMeta, drflac_container_unknown, pUserData, pUserData, pAllocationCallbacks);
8714 }
8715 DRFLAC_API drflac* drflac_open_with_metadata_relaxed(drflac_read_proc onRead, drflac_seek_proc onSeek, drflac_meta_proc onMeta, drflac_container container, void* pUserData, const drflac_allocation_callbacks* pAllocationCallbacks)
8716 {
8717     return drflac_open_with_metadata_private(onRead, onSeek, onMeta, container, pUserData, pUserData, pAllocationCallbacks);
8718 }
8719
8720 DRFLAC_API void drflac_close(drflac* pFlac)
8721 {
8722     if (pFlac == NULL) {
8723         return;
8724     }
8725
8726 #ifndef DR_FLAC_NO_STDIO
8727     /*
8728     If we opened the file with drflac_open_file() we will want to close the file handle. We can know whether or not drflac_open_file()
8729     was used by looking at the callbacks.
8730     */
8731     if (pFlac->bs.onRead == drflac__on_read_stdio) {
8732         fclose((FILE*)pFlac->bs.pUserData);
8733     }
8734
8735 #ifndef DR_FLAC_NO_OGG
8736     /* Need to clean up Ogg streams a bit differently due to the way the bit streaming is chained. */
8737     if (pFlac->container == drflac_container_ogg) {
8738         drflac_oggbs* oggbs = (drflac_oggbs*)pFlac->_oggbs;
8739         DRFLAC_ASSERT(pFlac->bs.onRead == drflac__on_read_ogg);
8740
8741         if (oggbs->onRead == drflac__on_read_stdio) {
8742             fclose((FILE*)oggbs->pUserData);
8743         }
8744     }
8745 #endif
8746 #endif
8747
8748     drflac__free_from_callbacks(pFlac, &pFlac->allocationCallbacks);
8749 }
8750
8751
#if 0
/*
Reference (disabled) left/side decoder: one frame per iteration, no unrolling.

Channel 0 holds the left samples and channel 1 the side samples. Each input is shifted up by the unused bits plus its subframe's
wasted bits, right is reconstructed as left - side, and the output is interleaved as left, right.
*/
static DRFLAC_INLINE void drflac_read_pcm_frames_s32__decode_left_side__reference(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int32* pOutputSamples)
{
    const drflac_uint32 leftShift = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
    const drflac_uint32 sideShift = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
    drflac_uint64 iFrame;

    for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
        const drflac_uint32 left = (drflac_uint32)pInputSamples0[iFrame] << leftShift;
        const drflac_uint32 side = (drflac_uint32)pInputSamples1[iFrame] << sideShift;

        pOutputSamples[iFrame*2+0] = (drflac_int32)left;
        pOutputSamples[iFrame*2+1] = (drflac_int32)(left - side);
    }
}
#endif
8766
8767 static DRFLAC_INLINE void drflac_read_pcm_frames_s32__decode_left_side__scalar(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int32* pOutputSamples)
8768 {
8769     drflac_uint64 i;
8770     drflac_uint64 frameCount4 = frameCount >> 2;
8771     const drflac_uint32* pInputSamples0U32 = (const drflac_uint32*)pInputSamples0;
8772     const drflac_uint32* pInputSamples1U32 = (const drflac_uint32*)pInputSamples1;
8773     drflac_uint32 shift0 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
8774     drflac_uint32 shift1 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
8775
8776     for (i = 0; i < frameCount4; ++i) {
8777         drflac_uint32 left0 = pInputSamples0U32[i*4+0] << shift0;
8778         drflac_uint32 left1 = pInputSamples0U32[i*4+1] << shift0;
8779         drflac_uint32 left2 = pInputSamples0U32[i*4+2] << shift0;
8780         drflac_uint32 left3 = pInputSamples0U32[i*4+3] << shift0;
8781
8782         drflac_uint32 side0 = pInputSamples1U32[i*4+0] << shift1;
8783         drflac_uint32 side1 = pInputSamples1U32[i*4+1] << shift1;
8784         drflac_uint32 side2 = pInputSamples1U32[i*4+2] << shift1;
8785         drflac_uint32 side3 = pInputSamples1U32[i*4+3] << shift1;
8786
8787         drflac_uint32 right0 = left0 - side0;
8788         drflac_uint32 right1 = left1 - side1;
8789         drflac_uint32 right2 = left2 - side2;
8790         drflac_uint32 right3 = left3 - side3;
8791
8792         pOutputSamples[i*8+0] = (drflac_int32)left0;
8793         pOutputSamples[i*8+1] = (drflac_int32)right0;
8794         pOutputSamples[i*8+2] = (drflac_int32)left1;
8795         pOutputSamples[i*8+3] = (drflac_int32)right1;
8796         pOutputSamples[i*8+4] = (drflac_int32)left2;
8797         pOutputSamples[i*8+5] = (drflac_int32)right2;
8798         pOutputSamples[i*8+6] = (drflac_int32)left3;
8799         pOutputSamples[i*8+7] = (drflac_int32)right3;
8800     }
8801
8802     for (i = (frameCount4 << 2); i < frameCount; ++i) {
8803         drflac_uint32 left  = pInputSamples0U32[i] << shift0;
8804         drflac_uint32 side  = pInputSamples1U32[i] << shift1;
8805         drflac_uint32 right = left - side;
8806
8807         pOutputSamples[i*2+0] = (drflac_int32)left;
8808         pOutputSamples[i*2+1] = (drflac_int32)right;
8809     }
8810 }
8811
#if defined(DRFLAC_SUPPORT_SSE2)
/*
SSE2 left/side decoder: four frames per vector iteration, scalar loop for the remainder.

Channel 0 holds the left samples and channel 1 the side samples. Each input is shifted up by the unused bits plus its subframe's
wasted bits, right is reconstructed as left - side, and the two are interleaved into the output as left, right. Asserts that the
stream is at most 24 bits per sample.
*/
static DRFLAC_INLINE void drflac_read_pcm_frames_s32__decode_left_side__sse2(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int32* pOutputSamples)
{
    const __m128i* pLeftVec = (const __m128i*)pInputSamples0;
    const __m128i* pSideVec = (const __m128i*)pInputSamples1;
    const drflac_uint32* pLeftIn = (const drflac_uint32*)pInputSamples0;
    const drflac_uint32* pSideIn = (const drflac_uint32*)pInputSamples1;
    const drflac_uint32 leftShift = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
    const drflac_uint32 sideShift = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
    const drflac_uint64 groupCount = frameCount >> 2;
    drflac_uint64 iGroup;
    drflac_uint64 iFrame;

    DRFLAC_ASSERT(pFlac->bitsPerSample <= 24);

    for (iGroup = 0; iGroup < groupCount; iGroup += 1) {
        const __m128i left  = _mm_slli_epi32(_mm_loadu_si128(pLeftVec + iGroup), leftShift);
        const __m128i side  = _mm_slli_epi32(_mm_loadu_si128(pSideVec + iGroup), sideShift);
        const __m128i right = _mm_sub_epi32(left, side);
        __m128i* pOutVec = (__m128i*)(pOutputSamples + iGroup*8);

        /* unpacklo/unpackhi interleave the lanes: L0 R0 L1 R1, then L2 R2 L3 R3. */
        _mm_storeu_si128(pOutVec + 0, _mm_unpacklo_epi32(left, right));
        _mm_storeu_si128(pOutVec + 1, _mm_unpackhi_epi32(left, right));
    }

    /* Up to three leftover frames. */
    for (iFrame = (groupCount << 2); iFrame < frameCount; iFrame += 1) {
        const drflac_uint32 left = pLeftIn[iFrame] << leftShift;
        const drflac_uint32 side = pSideIn[iFrame] << sideShift;

        pOutputSamples[iFrame*2+0] = (drflac_int32)left;
        pOutputSamples[iFrame*2+1] = (drflac_int32)(left - side);
    }
}
#endif
8843
#if defined(DRFLAC_SUPPORT_NEON)
/*
NEON left/side decoder: four frames per vector iteration, scalar loop for the remainder.

Channel 0 holds the left samples and channel 1 the side samples. Each input is shifted up by the unused bits plus its subframe's
wasted bits, right is reconstructed as left - side, and the two are interleaved into the output as left, right. Asserts that the
stream is at most 24 bits per sample.
*/
static DRFLAC_INLINE void drflac_read_pcm_frames_s32__decode_left_side__neon(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int32* pOutputSamples)
{
    const drflac_uint32* pLeftIn = (const drflac_uint32*)pInputSamples0;
    const drflac_uint32* pSideIn = (const drflac_uint32*)pInputSamples1;
    const drflac_uint32 leftShift = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
    const drflac_uint32 sideShift = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
    const drflac_uint64 groupCount = frameCount >> 2;
    int32x4_t leftShiftVec;
    int32x4_t sideShiftVec;
    drflac_uint64 iGroup;
    drflac_uint64 iFrame;

    DRFLAC_ASSERT(pFlac->bitsPerSample <= 24);

    /* Broadcast the per-channel shift amounts to all four lanes. */
    leftShiftVec = vdupq_n_s32(leftShift);
    sideShiftVec = vdupq_n_s32(sideShift);

    for (iGroup = 0; iGroup < groupCount; iGroup += 1) {
        const uint32x4_t left  = vshlq_u32(vld1q_u32(pLeftIn + iGroup*4), leftShiftVec);
        const uint32x4_t side  = vshlq_u32(vld1q_u32(pSideIn + iGroup*4), sideShiftVec);
        const uint32x4_t right = vsubq_u32(left, side);

        drflac__vst2q_u32((drflac_uint32*)pOutputSamples + iGroup*8, vzipq_u32(left, right));
    }

    /* Up to three leftover frames. */
    for (iFrame = (groupCount << 2); iFrame < frameCount; iFrame += 1) {
        const drflac_uint32 left = pLeftIn[iFrame] << leftShift;
        const drflac_uint32 side = pSideIn[iFrame] << sideShift;

        pOutputSamples[iFrame*2+0] = (drflac_int32)left;
        pOutputSamples[iFrame*2+1] = (drflac_int32)(left - side);
    }
}
#endif
8883
8884 static DRFLAC_INLINE void drflac_read_pcm_frames_s32__decode_left_side(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int32* pOutputSamples)
8885 {
8886 #if defined(DRFLAC_SUPPORT_SSE2)
8887     if (drflac__gIsSSE2Supported && pFlac->bitsPerSample <= 24) {
8888         drflac_read_pcm_frames_s32__decode_left_side__sse2(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
8889     } else
8890 #elif defined(DRFLAC_SUPPORT_NEON)
8891     if (drflac__gIsNEONSupported && pFlac->bitsPerSample <= 24) {
8892         drflac_read_pcm_frames_s32__decode_left_side__neon(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
8893     } else
8894 #endif
8895     {
8896         /* Scalar fallback. */
8897 #if 0
8898         drflac_read_pcm_frames_s32__decode_left_side__reference(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
8899 #else
8900         drflac_read_pcm_frames_s32__decode_left_side__scalar(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
8901 #endif
8902     }
8903 }
8904
8905
#if 0
/*
Reference (disabled) side/right decoder: one frame per iteration, no unrolling.

Channel 0 holds the side samples and channel 1 the right samples. Each input is shifted up by the unused bits plus its subframe's
wasted bits, left is reconstructed as right + side, and the output is interleaved as left, right.
*/
static DRFLAC_INLINE void drflac_read_pcm_frames_s32__decode_right_side__reference(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int32* pOutputSamples)
{
    const drflac_uint32 sideShift  = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
    const drflac_uint32 rightShift = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
    drflac_uint64 iFrame;

    for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
        const drflac_uint32 side  = (drflac_uint32)pInputSamples0[iFrame] << sideShift;
        const drflac_uint32 right = (drflac_uint32)pInputSamples1[iFrame] << rightShift;

        pOutputSamples[iFrame*2+0] = (drflac_int32)(right + side);
        pOutputSamples[iFrame*2+1] = (drflac_int32)right;
    }
}
#endif
8920
8921 static DRFLAC_INLINE void drflac_read_pcm_frames_s32__decode_right_side__scalar(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int32* pOutputSamples)
8922 {
8923     drflac_uint64 i;
8924     drflac_uint64 frameCount4 = frameCount >> 2;
8925     const drflac_uint32* pInputSamples0U32 = (const drflac_uint32*)pInputSamples0;
8926     const drflac_uint32* pInputSamples1U32 = (const drflac_uint32*)pInputSamples1;
8927     drflac_uint32 shift0 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
8928     drflac_uint32 shift1 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
8929
8930     for (i = 0; i < frameCount4; ++i) {
8931         drflac_uint32 side0  = pInputSamples0U32[i*4+0] << shift0;
8932         drflac_uint32 side1  = pInputSamples0U32[i*4+1] << shift0;
8933         drflac_uint32 side2  = pInputSamples0U32[i*4+2] << shift0;
8934         drflac_uint32 side3  = pInputSamples0U32[i*4+3] << shift0;
8935
8936         drflac_uint32 right0 = pInputSamples1U32[i*4+0] << shift1;
8937         drflac_uint32 right1 = pInputSamples1U32[i*4+1] << shift1;
8938         drflac_uint32 right2 = pInputSamples1U32[i*4+2] << shift1;
8939         drflac_uint32 right3 = pInputSamples1U32[i*4+3] << shift1;
8940
8941         drflac_uint32 left0 = right0 + side0;
8942         drflac_uint32 left1 = right1 + side1;
8943         drflac_uint32 left2 = right2 + side2;
8944         drflac_uint32 left3 = right3 + side3;
8945
8946         pOutputSamples[i*8+0] = (drflac_int32)left0;
8947         pOutputSamples[i*8+1] = (drflac_int32)right0;
8948         pOutputSamples[i*8+2] = (drflac_int32)left1;
8949         pOutputSamples[i*8+3] = (drflac_int32)right1;
8950         pOutputSamples[i*8+4] = (drflac_int32)left2;
8951         pOutputSamples[i*8+5] = (drflac_int32)right2;
8952         pOutputSamples[i*8+6] = (drflac_int32)left3;
8953         pOutputSamples[i*8+7] = (drflac_int32)right3;
8954     }
8955
8956     for (i = (frameCount4 << 2); i < frameCount; ++i) {
8957         drflac_uint32 side  = pInputSamples0U32[i] << shift0;
8958         drflac_uint32 right = pInputSamples1U32[i] << shift1;
8959         drflac_uint32 left  = right + side;
8960
8961         pOutputSamples[i*2+0] = (drflac_int32)left;
8962         pOutputSamples[i*2+1] = (drflac_int32)right;
8963     }
8964 }
8965
#if defined(DRFLAC_SUPPORT_SSE2)
/*
SSE2 side/right decoder: four frames per vector iteration, scalar loop for the remainder.

Channel 0 holds the side samples and channel 1 the right samples. Each input is shifted up by the unused bits plus its subframe's
wasted bits, left is reconstructed as right + side, and the two are interleaved into the output as left, right. Asserts that the
stream is at most 24 bits per sample.
*/
static DRFLAC_INLINE void drflac_read_pcm_frames_s32__decode_right_side__sse2(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int32* pOutputSamples)
{
    const __m128i* pSideVec  = (const __m128i*)pInputSamples0;
    const __m128i* pRightVec = (const __m128i*)pInputSamples1;
    const drflac_uint32* pSideIn  = (const drflac_uint32*)pInputSamples0;
    const drflac_uint32* pRightIn = (const drflac_uint32*)pInputSamples1;
    const drflac_uint32 sideShift  = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
    const drflac_uint32 rightShift = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
    const drflac_uint64 groupCount = frameCount >> 2;
    drflac_uint64 iGroup;
    drflac_uint64 iFrame;

    DRFLAC_ASSERT(pFlac->bitsPerSample <= 24);

    for (iGroup = 0; iGroup < groupCount; iGroup += 1) {
        const __m128i side  = _mm_slli_epi32(_mm_loadu_si128(pSideVec  + iGroup), sideShift);
        const __m128i right = _mm_slli_epi32(_mm_loadu_si128(pRightVec + iGroup), rightShift);
        const __m128i left  = _mm_add_epi32(right, side);
        __m128i* pOutVec = (__m128i*)(pOutputSamples + iGroup*8);

        /* unpacklo/unpackhi interleave the lanes: L0 R0 L1 R1, then L2 R2 L3 R3. */
        _mm_storeu_si128(pOutVec + 0, _mm_unpacklo_epi32(left, right));
        _mm_storeu_si128(pOutVec + 1, _mm_unpackhi_epi32(left, right));
    }

    /* Up to three leftover frames. */
    for (iFrame = (groupCount << 2); iFrame < frameCount; iFrame += 1) {
        const drflac_uint32 side  = pSideIn[iFrame]  << sideShift;
        const drflac_uint32 right = pRightIn[iFrame] << rightShift;

        pOutputSamples[iFrame*2+0] = (drflac_int32)(right + side);
        pOutputSamples[iFrame*2+1] = (drflac_int32)right;
    }
}
#endif
8997
#if defined(DRFLAC_SUPPORT_NEON)
/*
NEON side/right decoder: four frames per vector iteration, scalar loop for the remainder.

Channel 0 holds the side samples and channel 1 the right samples. Each input is shifted up by the unused bits plus its subframe's
wasted bits, left is reconstructed as right + side, and the two are interleaved into the output as left, right. Asserts that the
stream is at most 24 bits per sample.
*/
static DRFLAC_INLINE void drflac_read_pcm_frames_s32__decode_right_side__neon(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int32* pOutputSamples)
{
    const drflac_uint32* pSideIn  = (const drflac_uint32*)pInputSamples0;
    const drflac_uint32* pRightIn = (const drflac_uint32*)pInputSamples1;
    const drflac_uint32 sideShift  = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
    const drflac_uint32 rightShift = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
    const drflac_uint64 groupCount = frameCount >> 2;
    int32x4_t sideShiftVec;
    int32x4_t rightShiftVec;
    drflac_uint64 iGroup;
    drflac_uint64 iFrame;

    DRFLAC_ASSERT(pFlac->bitsPerSample <= 24);

    /* Broadcast the per-channel shift amounts to all four lanes. */
    sideShiftVec  = vdupq_n_s32(sideShift);
    rightShiftVec = vdupq_n_s32(rightShift);

    for (iGroup = 0; iGroup < groupCount; iGroup += 1) {
        const uint32x4_t side  = vshlq_u32(vld1q_u32(pSideIn  + iGroup*4), sideShiftVec);
        const uint32x4_t right = vshlq_u32(vld1q_u32(pRightIn + iGroup*4), rightShiftVec);
        const uint32x4_t left  = vaddq_u32(right, side);

        drflac__vst2q_u32((drflac_uint32*)pOutputSamples + iGroup*8, vzipq_u32(left, right));
    }

    /* Up to three leftover frames. */
    for (iFrame = (groupCount << 2); iFrame < frameCount; iFrame += 1) {
        const drflac_uint32 side  = pSideIn[iFrame]  << sideShift;
        const drflac_uint32 right = pRightIn[iFrame] << rightShift;

        pOutputSamples[iFrame*2+0] = (drflac_int32)(right + side);
        pOutputSamples[iFrame*2+1] = (drflac_int32)right;
    }
}
#endif
9037
9038 static DRFLAC_INLINE void drflac_read_pcm_frames_s32__decode_right_side(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int32* pOutputSamples)
9039 {
9040 #if defined(DRFLAC_SUPPORT_SSE2)
9041     if (drflac__gIsSSE2Supported && pFlac->bitsPerSample <= 24) {
9042         drflac_read_pcm_frames_s32__decode_right_side__sse2(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
9043     } else
9044 #elif defined(DRFLAC_SUPPORT_NEON)
9045     if (drflac__gIsNEONSupported && pFlac->bitsPerSample <= 24) {
9046         drflac_read_pcm_frames_s32__decode_right_side__neon(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
9047     } else
9048 #endif
9049     {
9050         /* Scalar fallback. */
9051 #if 0
9052         drflac_read_pcm_frames_s32__decode_right_side__reference(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
9053 #else
9054         drflac_read_pcm_frames_s32__decode_right_side__scalar(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
9055 #endif
9056     }
9057 }
9058
9059
#if 0
/*
Reference (disabled) mid/side decoder: one frame per iteration, no unrolling.

Channel 0 holds the mid samples and channel 1 the side samples, each shifted up by its subframe's wasted bits. Mid is doubled and
given the low bit of side, after which left = (mid + side) >> 1 and right = (mid - side) >> 1 using an arithmetic shift. Both
results are then shifted up by `unusedBitsPerSample` and interleaved into the output as left, right.
*/
static DRFLAC_INLINE void drflac_read_pcm_frames_s32__decode_mid_side__reference(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int32* pOutputSamples)
{
    drflac_uint64 i;
    /* Unsigned views of the inputs, so the left shifts below operate on unsigned values. */
    const drflac_uint32* pInputSamples0U32 = (const drflac_uint32*)pInputSamples0;
    const drflac_uint32* pInputSamples1U32 = (const drflac_uint32*)pInputSamples1;

    for (i = 0; i < frameCount; ++i) {
        drflac_uint32 mid  = pInputSamples0U32[i] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
        drflac_uint32 side = pInputSamples1U32[i] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;

        mid = (mid << 1) | (side & 0x01);

        pOutputSamples[i*2+0] = (drflac_int32)((drflac_uint32)((drflac_int32)(mid + side) >> 1) << unusedBitsPerSample);
        pOutputSamples[i*2+1] = (drflac_int32)((drflac_uint32)((drflac_int32)(mid - side) >> 1) << unusedBitsPerSample);
    }
}
#endif
9074
9075 static DRFLAC_INLINE void drflac_read_pcm_frames_s32__decode_mid_side__scalar(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int32* pOutputSamples)
9076 {
9077     drflac_uint64 i;
9078     drflac_uint64 frameCount4 = frameCount >> 2;
9079     const drflac_uint32* pInputSamples0U32 = (const drflac_uint32*)pInputSamples0;
9080     const drflac_uint32* pInputSamples1U32 = (const drflac_uint32*)pInputSamples1;
9081     drflac_int32 shift = unusedBitsPerSample;
9082
9083     if (shift > 0) {
9084         shift -= 1;
9085         for (i = 0; i < frameCount4; ++i) {
9086             drflac_uint32 temp0L;
9087             drflac_uint32 temp1L;
9088             drflac_uint32 temp2L;
9089             drflac_uint32 temp3L;
9090             drflac_uint32 temp0R;
9091             drflac_uint32 temp1R;
9092             drflac_uint32 temp2R;
9093             drflac_uint32 temp3R;
9094
9095             drflac_uint32 mid0  = pInputSamples0U32[i*4+0] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
9096             drflac_uint32 mid1  = pInputSamples0U32[i*4+1] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
9097             drflac_uint32 mid2  = pInputSamples0U32[i*4+2] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
9098             drflac_uint32 mid3  = pInputSamples0U32[i*4+3] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
9099
9100             drflac_uint32 side0 = pInputSamples1U32[i*4+0] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
9101             drflac_uint32 side1 = pInputSamples1U32[i*4+1] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
9102             drflac_uint32 side2 = pInputSamples1U32[i*4+2] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
9103             drflac_uint32 side3 = pInputSamples1U32[i*4+3] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
9104
9105             mid0 = (mid0 << 1) | (side0 & 0x01);
9106             mid1 = (mid1 << 1) | (side1 & 0x01);
9107             mid2 = (mid2 << 1) | (side2 & 0x01);
9108             mid3 = (mid3 << 1) | (side3 & 0x01);
9109
9110             temp0L = (mid0 + side0) << shift;
9111             temp1L = (mid1 + side1) << shift;
9112             temp2L = (mid2 + side2) << shift;
9113             temp3L = (mid3 + side3) << shift;
9114
9115             temp0R = (mid0 - side0) << shift;
9116             temp1R = (mid1 - side1) << shift;
9117             temp2R = (mid2 - side2) << shift;
9118             temp3R = (mid3 - side3) << shift;
9119
9120             pOutputSamples[i*8+0] = (drflac_int32)temp0L;
9121             pOutputSamples[i*8+1] = (drflac_int32)temp0R;
9122             pOutputSamples[i*8+2] = (drflac_int32)temp1L;
9123             pOutputSamples[i*8+3] = (drflac_int32)temp1R;
9124             pOutputSamples[i*8+4] = (drflac_int32)temp2L;
9125             pOutputSamples[i*8+5] = (drflac_int32)temp2R;
9126             pOutputSamples[i*8+6] = (drflac_int32)temp3L;
9127             pOutputSamples[i*8+7] = (drflac_int32)temp3R;
9128         }
9129     } else {
9130         for (i = 0; i < frameCount4; ++i) {
9131             drflac_uint32 temp0L;
9132             drflac_uint32 temp1L;
9133             drflac_uint32 temp2L;
9134             drflac_uint32 temp3L;
9135             drflac_uint32 temp0R;
9136             drflac_uint32 temp1R;
9137             drflac_uint32 temp2R;
9138             drflac_uint32 temp3R;
9139
9140             drflac_uint32 mid0  = pInputSamples0U32[i*4+0] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
9141             drflac_uint32 mid1  = pInputSamples0U32[i*4+1] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
9142             drflac_uint32 mid2  = pInputSamples0U32[i*4+2] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
9143             drflac_uint32 mid3  = pInputSamples0U32[i*4+3] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
9144
9145             drflac_uint32 side0 = pInputSamples1U32[i*4+0] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
9146             drflac_uint32 side1 = pInputSamples1U32[i*4+1] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
9147             drflac_uint32 side2 = pInputSamples1U32[i*4+2] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
9148             drflac_uint32 side3 = pInputSamples1U32[i*4+3] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
9149
9150             mid0 = (mid0 << 1) | (side0 & 0x01);
9151             mid1 = (mid1 << 1) | (side1 & 0x01);
9152             mid2 = (mid2 << 1) | (side2 & 0x01);
9153             mid3 = (mid3 << 1) | (side3 & 0x01);
9154
9155             temp0L = (drflac_uint32)((drflac_int32)(mid0 + side0) >> 1);
9156             temp1L = (drflac_uint32)((drflac_int32)(mid1 + side1) >> 1);
9157             temp2L = (drflac_uint32)((drflac_int32)(mid2 + side2) >> 1);
9158             temp3L = (drflac_uint32)((drflac_int32)(mid3 + side3) >> 1);
9159
9160             temp0R = (drflac_uint32)((drflac_int32)(mid0 - side0) >> 1);
9161             temp1R = (drflac_uint32)((drflac_int32)(mid1 - side1) >> 1);
9162             temp2R = (drflac_uint32)((drflac_int32)(mid2 - side2) >> 1);
9163             temp3R = (drflac_uint32)((drflac_int32)(mid3 - side3) >> 1);
9164
9165             pOutputSamples[i*8+0] = (drflac_int32)temp0L;
9166             pOutputSamples[i*8+1] = (drflac_int32)temp0R;
9167             pOutputSamples[i*8+2] = (drflac_int32)temp1L;
9168             pOutputSamples[i*8+3] = (drflac_int32)temp1R;
9169             pOutputSamples[i*8+4] = (drflac_int32)temp2L;
9170             pOutputSamples[i*8+5] = (drflac_int32)temp2R;
9171             pOutputSamples[i*8+6] = (drflac_int32)temp3L;
9172             pOutputSamples[i*8+7] = (drflac_int32)temp3R;
9173         }
9174     }
9175
9176     for (i = (frameCount4 << 2); i < frameCount; ++i) {
9177         drflac_uint32 mid  = pInputSamples0U32[i] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
9178         drflac_uint32 side = pInputSamples1U32[i] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
9179
9180         mid = (mid << 1) | (side & 0x01);
9181
9182         pOutputSamples[i*2+0] = (drflac_int32)((drflac_uint32)((drflac_int32)(mid + side) >> 1) << unusedBitsPerSample);
9183         pOutputSamples[i*2+1] = (drflac_int32)((drflac_uint32)((drflac_int32)(mid - side) >> 1) << unusedBitsPerSample);
9184     }
9185 }
9186
#if defined(DRFLAC_SUPPORT_SSE2)
/*
SSE2 mid/side decoder: four frames per vector iteration, scalar loop for the remainder.

Channel 0 holds the mid samples and channel 1 the side samples, each shifted up by its subframe's wasted bits. Mid is doubled and
given the low bit of side, after which left derives from (mid + side) and right from (mid - side). With no unused bits the
sum/difference is halved with an arithmetic shift. Otherwise it is shifted left by one less than `unusedBitsPerSample`. Output
is interleaved as left, right. Asserts that the stream is at most 24 bits per sample.
*/
static DRFLAC_INLINE void drflac_read_pcm_frames_s32__decode_mid_side__sse2(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int32* pOutputSamples)
{
    const __m128i* pMidVec  = (const __m128i*)pInputSamples0;
    const __m128i* pSideVec = (const __m128i*)pInputSamples1;
    const drflac_uint32* pMidIn  = (const drflac_uint32*)pInputSamples0;
    const drflac_uint32* pSideIn = (const drflac_uint32*)pInputSamples1;
    const drflac_uint32 midWasted  = pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
    const drflac_uint32 sideWasted = pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
    const drflac_uint64 groupCount = frameCount >> 2;
    const __m128i lsbMask = _mm_set1_epi32(0x01);
    drflac_uint64 iGroup;
    drflac_uint64 iFrame;

    DRFLAC_ASSERT(pFlac->bitsPerSample <= 24);

    if (unusedBitsPerSample != 0) {
        /* Unused bits present: shift left by (unused - 1) instead of halving and then shifting up by unused. */
        const drflac_int32 shiftLessOne = (drflac_int32)unusedBitsPerSample - 1;

        for (iGroup = 0; iGroup < groupCount; iGroup += 1) {
            const __m128i side   = _mm_slli_epi32(_mm_loadu_si128(pSideVec + iGroup), sideWasted);
            const __m128i midRaw = _mm_slli_epi32(_mm_loadu_si128(pMidVec  + iGroup), midWasted);
            const __m128i mid    = _mm_or_si128(_mm_slli_epi32(midRaw, 1), _mm_and_si128(side, lsbMask));
            const __m128i left   = _mm_slli_epi32(_mm_add_epi32(mid, side), shiftLessOne);
            const __m128i right  = _mm_slli_epi32(_mm_sub_epi32(mid, side), shiftLessOne);
            __m128i* pOutVec = (__m128i*)(pOutputSamples + iGroup*8);

            _mm_storeu_si128(pOutVec + 0, _mm_unpacklo_epi32(left, right));
            _mm_storeu_si128(pOutVec + 1, _mm_unpackhi_epi32(left, right));
        }

        /* Up to three leftover frames. */
        for (iFrame = (groupCount << 2); iFrame < frameCount; iFrame += 1) {
            const drflac_uint32 side = pSideIn[iFrame] << sideWasted;
            const drflac_uint32 mid  = ((pMidIn[iFrame] << midWasted) << 1) | (side & 0x01);

            pOutputSamples[iFrame*2+0] = (drflac_int32)((mid + side) << shiftLessOne);
            pOutputSamples[iFrame*2+1] = (drflac_int32)((mid - side) << shiftLessOne);
        }
    } else {
        /* No unused bits: halve the sum and difference with an arithmetic shift. */
        for (iGroup = 0; iGroup < groupCount; iGroup += 1) {
            const __m128i side   = _mm_slli_epi32(_mm_loadu_si128(pSideVec + iGroup), sideWasted);
            const __m128i midRaw = _mm_slli_epi32(_mm_loadu_si128(pMidVec  + iGroup), midWasted);
            const __m128i mid    = _mm_or_si128(_mm_slli_epi32(midRaw, 1), _mm_and_si128(side, lsbMask));
            const __m128i left   = _mm_srai_epi32(_mm_add_epi32(mid, side), 1);
            const __m128i right  = _mm_srai_epi32(_mm_sub_epi32(mid, side), 1);
            __m128i* pOutVec = (__m128i*)(pOutputSamples + iGroup*8);

            _mm_storeu_si128(pOutVec + 0, _mm_unpacklo_epi32(left, right));
            _mm_storeu_si128(pOutVec + 1, _mm_unpackhi_epi32(left, right));
        }

        /* Up to three leftover frames. */
        for (iFrame = (groupCount << 2); iFrame < frameCount; iFrame += 1) {
            const drflac_uint32 side = pSideIn[iFrame] << sideWasted;
            const drflac_uint32 mid  = ((pMidIn[iFrame] << midWasted) << 1) | (side & 0x01);

            pOutputSamples[iFrame*2+0] = (drflac_int32)(mid + side) >> 1;
            pOutputSamples[iFrame*2+1] = (drflac_int32)(mid - side) >> 1;
        }
    }
}
#endif
9258
9259 #if defined(DRFLAC_SUPPORT_NEON)
9260 static DRFLAC_INLINE void drflac_read_pcm_frames_s32__decode_mid_side__neon(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int32* pOutputSamples)
9261 {
9262     drflac_uint64 i;
9263     drflac_uint64 frameCount4 = frameCount >> 2;
9264     const drflac_uint32* pInputSamples0U32 = (const drflac_uint32*)pInputSamples0;
9265     const drflac_uint32* pInputSamples1U32 = (const drflac_uint32*)pInputSamples1;
9266     drflac_int32 shift = unusedBitsPerSample;
9267     int32x4_t  wbpsShift0_4; /* wbps = Wasted Bits Per Sample */
9268     int32x4_t  wbpsShift1_4; /* wbps = Wasted Bits Per Sample */
9269     uint32x4_t one4;
9270
9271     DRFLAC_ASSERT(pFlac->bitsPerSample <= 24);
9272
9273     wbpsShift0_4 = vdupq_n_s32(pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample);
9274     wbpsShift1_4 = vdupq_n_s32(pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample);
9275     one4         = vdupq_n_u32(1);
9276
9277     if (shift == 0) {
9278         for (i = 0; i < frameCount4; ++i) {
9279             uint32x4_t mid;
9280             uint32x4_t side;
9281             int32x4_t left;
9282             int32x4_t right;
9283
9284             mid   = vshlq_u32(vld1q_u32(pInputSamples0U32 + i*4), wbpsShift0_4);
9285             side  = vshlq_u32(vld1q_u32(pInputSamples1U32 + i*4), wbpsShift1_4);
9286
9287             mid   = vorrq_u32(vshlq_n_u32(mid, 1), vandq_u32(side, one4));
9288
9289             left  = vshrq_n_s32(vreinterpretq_s32_u32(vaddq_u32(mid, side)), 1);
9290             right = vshrq_n_s32(vreinterpretq_s32_u32(vsubq_u32(mid, side)), 1);
9291
9292             drflac__vst2q_s32(pOutputSamples + i*8, vzipq_s32(left, right));
9293         }
9294
9295         for (i = (frameCount4 << 2); i < frameCount; ++i) {
9296             drflac_uint32 mid  = pInputSamples0U32[i] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
9297             drflac_uint32 side = pInputSamples1U32[i] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
9298
9299             mid = (mid << 1) | (side & 0x01);
9300
9301             pOutputSamples[i*2+0] = (drflac_int32)(mid + side) >> 1;
9302             pOutputSamples[i*2+1] = (drflac_int32)(mid - side) >> 1;
9303         }
9304     } else {
9305         int32x4_t shift4;
9306
9307         shift -= 1;
9308         shift4 = vdupq_n_s32(shift);
9309
9310         for (i = 0; i < frameCount4; ++i) {
9311             uint32x4_t mid;
9312             uint32x4_t side;
9313             int32x4_t left;
9314             int32x4_t right;
9315
9316             mid   = vshlq_u32(vld1q_u32(pInputSamples0U32 + i*4), wbpsShift0_4);
9317             side  = vshlq_u32(vld1q_u32(pInputSamples1U32 + i*4), wbpsShift1_4);
9318
9319             mid   = vorrq_u32(vshlq_n_u32(mid, 1), vandq_u32(side, one4));
9320
9321             left  = vreinterpretq_s32_u32(vshlq_u32(vaddq_u32(mid, side), shift4));
9322             right = vreinterpretq_s32_u32(vshlq_u32(vsubq_u32(mid, side), shift4));
9323
9324             drflac__vst2q_s32(pOutputSamples + i*8, vzipq_s32(left, right));
9325         }
9326
9327         for (i = (frameCount4 << 2); i < frameCount; ++i) {
9328             drflac_uint32 mid  = pInputSamples0U32[i] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
9329             drflac_uint32 side = pInputSamples1U32[i] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
9330
9331             mid = (mid << 1) | (side & 0x01);
9332
9333             pOutputSamples[i*2+0] = (drflac_int32)((mid + side) << shift);
9334             pOutputSamples[i*2+1] = (drflac_int32)((mid - side) << shift);
9335         }
9336     }
9337 }
9338 #endif
9339
9340 static DRFLAC_INLINE void drflac_read_pcm_frames_s32__decode_mid_side(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int32* pOutputSamples)
9341 {
9342 #if defined(DRFLAC_SUPPORT_SSE2)
9343     if (drflac__gIsSSE2Supported && pFlac->bitsPerSample <= 24) {
9344         drflac_read_pcm_frames_s32__decode_mid_side__sse2(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
9345     } else
9346 #elif defined(DRFLAC_SUPPORT_NEON)
9347     if (drflac__gIsNEONSupported && pFlac->bitsPerSample <= 24) {
9348         drflac_read_pcm_frames_s32__decode_mid_side__neon(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
9349     } else
9350 #endif
9351     {
9352         /* Scalar fallback. */
9353 #if 0
9354         drflac_read_pcm_frames_s32__decode_mid_side__reference(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
9355 #else
9356         drflac_read_pcm_frames_s32__decode_mid_side__scalar(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
9357 #endif
9358     }
9359 }
9360
9361
9362 #if 0
9363 static DRFLAC_INLINE void drflac_read_pcm_frames_s32__decode_independent_stereo__reference(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int32* pOutputSamples)
9364 {
9365     for (drflac_uint64 i = 0; i < frameCount; ++i) {
9366         pOutputSamples[i*2+0] = (drflac_int32)((drflac_uint32)pInputSamples0[i] << (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample));
9367         pOutputSamples[i*2+1] = (drflac_int32)((drflac_uint32)pInputSamples1[i] << (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample));
9368     }
9369 }
9370 #endif
9371
9372 static DRFLAC_INLINE void drflac_read_pcm_frames_s32__decode_independent_stereo__scalar(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int32* pOutputSamples)
9373 {
9374     drflac_uint64 i;
9375     drflac_uint64 frameCount4 = frameCount >> 2;
9376     const drflac_uint32* pInputSamples0U32 = (const drflac_uint32*)pInputSamples0;
9377     const drflac_uint32* pInputSamples1U32 = (const drflac_uint32*)pInputSamples1;
9378     drflac_uint32 shift0 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
9379     drflac_uint32 shift1 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
9380
9381     for (i = 0; i < frameCount4; ++i) {
9382         drflac_uint32 tempL0 = pInputSamples0U32[i*4+0] << shift0;
9383         drflac_uint32 tempL1 = pInputSamples0U32[i*4+1] << shift0;
9384         drflac_uint32 tempL2 = pInputSamples0U32[i*4+2] << shift0;
9385         drflac_uint32 tempL3 = pInputSamples0U32[i*4+3] << shift0;
9386
9387         drflac_uint32 tempR0 = pInputSamples1U32[i*4+0] << shift1;
9388         drflac_uint32 tempR1 = pInputSamples1U32[i*4+1] << shift1;
9389         drflac_uint32 tempR2 = pInputSamples1U32[i*4+2] << shift1;
9390         drflac_uint32 tempR3 = pInputSamples1U32[i*4+3] << shift1;
9391
9392         pOutputSamples[i*8+0] = (drflac_int32)tempL0;
9393         pOutputSamples[i*8+1] = (drflac_int32)tempR0;
9394         pOutputSamples[i*8+2] = (drflac_int32)tempL1;
9395         pOutputSamples[i*8+3] = (drflac_int32)tempR1;
9396         pOutputSamples[i*8+4] = (drflac_int32)tempL2;
9397         pOutputSamples[i*8+5] = (drflac_int32)tempR2;
9398         pOutputSamples[i*8+6] = (drflac_int32)tempL3;
9399         pOutputSamples[i*8+7] = (drflac_int32)tempR3;
9400     }
9401
9402     for (i = (frameCount4 << 2); i < frameCount; ++i) {
9403         pOutputSamples[i*2+0] = (drflac_int32)(pInputSamples0U32[i] << shift0);
9404         pOutputSamples[i*2+1] = (drflac_int32)(pInputSamples1U32[i] << shift1);
9405     }
9406 }
9407
9408 #if defined(DRFLAC_SUPPORT_SSE2)
9409 static DRFLAC_INLINE void drflac_read_pcm_frames_s32__decode_independent_stereo__sse2(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int32* pOutputSamples)
9410 {
9411     drflac_uint64 i;
9412     drflac_uint64 frameCount4 = frameCount >> 2;
9413     const drflac_uint32* pInputSamples0U32 = (const drflac_uint32*)pInputSamples0;
9414     const drflac_uint32* pInputSamples1U32 = (const drflac_uint32*)pInputSamples1;
9415     drflac_uint32 shift0 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
9416     drflac_uint32 shift1 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
9417
9418     for (i = 0; i < frameCount4; ++i) {
9419         __m128i left  = _mm_slli_epi32(_mm_loadu_si128((const __m128i*)pInputSamples0 + i), shift0);
9420         __m128i right = _mm_slli_epi32(_mm_loadu_si128((const __m128i*)pInputSamples1 + i), shift1);
9421
9422         _mm_storeu_si128((__m128i*)(pOutputSamples + i*8 + 0), _mm_unpacklo_epi32(left, right));
9423         _mm_storeu_si128((__m128i*)(pOutputSamples + i*8 + 4), _mm_unpackhi_epi32(left, right));
9424     }
9425
9426     for (i = (frameCount4 << 2); i < frameCount; ++i) {
9427         pOutputSamples[i*2+0] = (drflac_int32)(pInputSamples0U32[i] << shift0);
9428         pOutputSamples[i*2+1] = (drflac_int32)(pInputSamples1U32[i] << shift1);
9429     }
9430 }
9431 #endif
9432
9433 #if defined(DRFLAC_SUPPORT_NEON)
9434 static DRFLAC_INLINE void drflac_read_pcm_frames_s32__decode_independent_stereo__neon(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int32* pOutputSamples)
9435 {
9436     drflac_uint64 i;
9437     drflac_uint64 frameCount4 = frameCount >> 2;
9438     const drflac_uint32* pInputSamples0U32 = (const drflac_uint32*)pInputSamples0;
9439     const drflac_uint32* pInputSamples1U32 = (const drflac_uint32*)pInputSamples1;
9440     drflac_uint32 shift0 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
9441     drflac_uint32 shift1 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
9442
9443     int32x4_t shift4_0 = vdupq_n_s32(shift0);
9444     int32x4_t shift4_1 = vdupq_n_s32(shift1);
9445
9446     for (i = 0; i < frameCount4; ++i) {
9447         int32x4_t left;
9448         int32x4_t right;
9449
9450         left  = vreinterpretq_s32_u32(vshlq_u32(vld1q_u32(pInputSamples0U32 + i*4), shift4_0));
9451         right = vreinterpretq_s32_u32(vshlq_u32(vld1q_u32(pInputSamples1U32 + i*4), shift4_1));
9452
9453         drflac__vst2q_s32(pOutputSamples + i*8, vzipq_s32(left, right));
9454     }
9455
9456     for (i = (frameCount4 << 2); i < frameCount; ++i) {
9457         pOutputSamples[i*2+0] = (drflac_int32)(pInputSamples0U32[i] << shift0);
9458         pOutputSamples[i*2+1] = (drflac_int32)(pInputSamples1U32[i] << shift1);
9459     }
9460 }
9461 #endif
9462
9463 static DRFLAC_INLINE void drflac_read_pcm_frames_s32__decode_independent_stereo(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int32* pOutputSamples)
9464 {
9465 #if defined(DRFLAC_SUPPORT_SSE2)
9466     if (drflac__gIsSSE2Supported && pFlac->bitsPerSample <= 24) {
9467         drflac_read_pcm_frames_s32__decode_independent_stereo__sse2(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
9468     } else
9469 #elif defined(DRFLAC_SUPPORT_NEON)
9470     if (drflac__gIsNEONSupported && pFlac->bitsPerSample <= 24) {
9471         drflac_read_pcm_frames_s32__decode_independent_stereo__neon(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
9472     } else
9473 #endif
9474     {
9475         /* Scalar fallback. */
9476 #if 0
9477         drflac_read_pcm_frames_s32__decode_independent_stereo__reference(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
9478 #else
9479         drflac_read_pcm_frames_s32__decode_independent_stereo__scalar(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
9480 #endif
9481     }
9482 }
9483
9484
9485 DRFLAC_API drflac_uint64 drflac_read_pcm_frames_s32(drflac* pFlac, drflac_uint64 framesToRead, drflac_int32* pBufferOut)
9486 {
9487     drflac_uint64 framesRead;
9488     drflac_uint32 unusedBitsPerSample;
9489
9490     if (pFlac == NULL || framesToRead == 0) {
9491         return 0;
9492     }
9493
9494     if (pBufferOut == NULL) {
9495         return drflac__seek_forward_by_pcm_frames(pFlac, framesToRead);
9496     }
9497
9498     DRFLAC_ASSERT(pFlac->bitsPerSample <= 32);
9499     unusedBitsPerSample = 32 - pFlac->bitsPerSample;
9500
9501     framesRead = 0;
9502     while (framesToRead > 0) {
9503         /* If we've run out of samples in this frame, go to the next. */
9504         if (pFlac->currentFLACFrame.pcmFramesRemaining == 0) {
9505             if (!drflac__read_and_decode_next_flac_frame(pFlac)) {
9506                 break;  /* Couldn't read the next frame, so just break from the loop and return. */
9507             }
9508         } else {
9509             unsigned int channelCount = drflac__get_channel_count_from_channel_assignment(pFlac->currentFLACFrame.header.channelAssignment);
9510             drflac_uint64 iFirstPCMFrame = pFlac->currentFLACFrame.header.blockSizeInPCMFrames - pFlac->currentFLACFrame.pcmFramesRemaining;
9511             drflac_uint64 frameCountThisIteration = framesToRead;
9512
9513             if (frameCountThisIteration > pFlac->currentFLACFrame.pcmFramesRemaining) {
9514                 frameCountThisIteration = pFlac->currentFLACFrame.pcmFramesRemaining;
9515             }
9516
9517             if (channelCount == 2) {
9518                 const drflac_int32* pDecodedSamples0 = pFlac->currentFLACFrame.subframes[0].pSamplesS32 + iFirstPCMFrame;
9519                 const drflac_int32* pDecodedSamples1 = pFlac->currentFLACFrame.subframes[1].pSamplesS32 + iFirstPCMFrame;
9520
9521                 switch (pFlac->currentFLACFrame.header.channelAssignment)
9522                 {
9523                     case DRFLAC_CHANNEL_ASSIGNMENT_LEFT_SIDE:
9524                     {
9525                         drflac_read_pcm_frames_s32__decode_left_side(pFlac, frameCountThisIteration, unusedBitsPerSample, pDecodedSamples0, pDecodedSamples1, pBufferOut);
9526                     } break;
9527
9528                     case DRFLAC_CHANNEL_ASSIGNMENT_RIGHT_SIDE:
9529                     {
9530                         drflac_read_pcm_frames_s32__decode_right_side(pFlac, frameCountThisIteration, unusedBitsPerSample, pDecodedSamples0, pDecodedSamples1, pBufferOut);
9531                     } break;
9532
9533                     case DRFLAC_CHANNEL_ASSIGNMENT_MID_SIDE:
9534                     {
9535                         drflac_read_pcm_frames_s32__decode_mid_side(pFlac, frameCountThisIteration, unusedBitsPerSample, pDecodedSamples0, pDecodedSamples1, pBufferOut);
9536                     } break;
9537
9538                     case DRFLAC_CHANNEL_ASSIGNMENT_INDEPENDENT:
9539                     default:
9540                     {
9541                         drflac_read_pcm_frames_s32__decode_independent_stereo(pFlac, frameCountThisIteration, unusedBitsPerSample, pDecodedSamples0, pDecodedSamples1, pBufferOut);
9542                     } break;
9543                 }
9544             } else {
9545                 /* Generic interleaving. */
9546                 drflac_uint64 i;
9547                 for (i = 0; i < frameCountThisIteration; ++i) {
9548                     unsigned int j;
9549                     for (j = 0; j < channelCount; ++j) {
9550                         pBufferOut[(i*channelCount)+j] = (drflac_int32)((drflac_uint32)(pFlac->currentFLACFrame.subframes[j].pSamplesS32[iFirstPCMFrame + i]) << (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[j].wastedBitsPerSample));
9551                     }
9552                 }
9553             }
9554
9555             framesRead                += frameCountThisIteration;
9556             pBufferOut                += frameCountThisIteration * channelCount;
9557             framesToRead              -= frameCountThisIteration;
9558             pFlac->currentPCMFrame    += frameCountThisIteration;
9559             pFlac->currentFLACFrame.pcmFramesRemaining -= (drflac_uint32)frameCountThisIteration;
9560         }
9561     }
9562
9563     return framesRead;
9564 }
9565
9566
9567 #if 0
9568 static DRFLAC_INLINE void drflac_read_pcm_frames_s16__decode_left_side__reference(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int16* pOutputSamples)
9569 {
9570     drflac_uint64 i;
9571     for (i = 0; i < frameCount; ++i) {
9572         drflac_uint32 left  = (drflac_uint32)pInputSamples0[i] << (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample);
9573         drflac_uint32 side  = (drflac_uint32)pInputSamples1[i] << (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample);
9574         drflac_uint32 right = left - side;
9575
9576         left  >>= 16;
9577         right >>= 16;
9578
9579         pOutputSamples[i*2+0] = (drflac_int16)left;
9580         pOutputSamples[i*2+1] = (drflac_int16)right;
9581     }
9582 }
9583 #endif
9584
9585 static DRFLAC_INLINE void drflac_read_pcm_frames_s16__decode_left_side__scalar(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int16* pOutputSamples)
9586 {
9587     drflac_uint64 i;
9588     drflac_uint64 frameCount4 = frameCount >> 2;
9589     const drflac_uint32* pInputSamples0U32 = (const drflac_uint32*)pInputSamples0;
9590     const drflac_uint32* pInputSamples1U32 = (const drflac_uint32*)pInputSamples1;
9591     drflac_uint32 shift0 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
9592     drflac_uint32 shift1 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
9593
9594     for (i = 0; i < frameCount4; ++i) {
9595         drflac_uint32 left0 = pInputSamples0U32[i*4+0] << shift0;
9596         drflac_uint32 left1 = pInputSamples0U32[i*4+1] << shift0;
9597         drflac_uint32 left2 = pInputSamples0U32[i*4+2] << shift0;
9598         drflac_uint32 left3 = pInputSamples0U32[i*4+3] << shift0;
9599
9600         drflac_uint32 side0 = pInputSamples1U32[i*4+0] << shift1;
9601         drflac_uint32 side1 = pInputSamples1U32[i*4+1] << shift1;
9602         drflac_uint32 side2 = pInputSamples1U32[i*4+2] << shift1;
9603         drflac_uint32 side3 = pInputSamples1U32[i*4+3] << shift1;
9604
9605         drflac_uint32 right0 = left0 - side0;
9606         drflac_uint32 right1 = left1 - side1;
9607         drflac_uint32 right2 = left2 - side2;
9608         drflac_uint32 right3 = left3 - side3;
9609
9610         left0  >>= 16;
9611         left1  >>= 16;
9612         left2  >>= 16;
9613         left3  >>= 16;
9614
9615         right0 >>= 16;
9616         right1 >>= 16;
9617         right2 >>= 16;
9618         right3 >>= 16;
9619
9620         pOutputSamples[i*8+0] = (drflac_int16)left0;
9621         pOutputSamples[i*8+1] = (drflac_int16)right0;
9622         pOutputSamples[i*8+2] = (drflac_int16)left1;
9623         pOutputSamples[i*8+3] = (drflac_int16)right1;
9624         pOutputSamples[i*8+4] = (drflac_int16)left2;
9625         pOutputSamples[i*8+5] = (drflac_int16)right2;
9626         pOutputSamples[i*8+6] = (drflac_int16)left3;
9627         pOutputSamples[i*8+7] = (drflac_int16)right3;
9628     }
9629
9630     for (i = (frameCount4 << 2); i < frameCount; ++i) {
9631         drflac_uint32 left  = pInputSamples0U32[i] << shift0;
9632         drflac_uint32 side  = pInputSamples1U32[i] << shift1;
9633         drflac_uint32 right = left - side;
9634
9635         left  >>= 16;
9636         right >>= 16;
9637
9638         pOutputSamples[i*2+0] = (drflac_int16)left;
9639         pOutputSamples[i*2+1] = (drflac_int16)right;
9640     }
9641 }
9642
9643 #if defined(DRFLAC_SUPPORT_SSE2)
9644 static DRFLAC_INLINE void drflac_read_pcm_frames_s16__decode_left_side__sse2(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int16* pOutputSamples)
9645 {
9646     drflac_uint64 i;
9647     drflac_uint64 frameCount4 = frameCount >> 2;
9648     const drflac_uint32* pInputSamples0U32 = (const drflac_uint32*)pInputSamples0;
9649     const drflac_uint32* pInputSamples1U32 = (const drflac_uint32*)pInputSamples1;
9650     drflac_uint32 shift0 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
9651     drflac_uint32 shift1 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
9652
9653     DRFLAC_ASSERT(pFlac->bitsPerSample <= 24);
9654
9655     for (i = 0; i < frameCount4; ++i) {
9656         __m128i left  = _mm_slli_epi32(_mm_loadu_si128((const __m128i*)pInputSamples0 + i), shift0);
9657         __m128i side  = _mm_slli_epi32(_mm_loadu_si128((const __m128i*)pInputSamples1 + i), shift1);
9658         __m128i right = _mm_sub_epi32(left, side);
9659
9660         left  = _mm_srai_epi32(left,  16);
9661         right = _mm_srai_epi32(right, 16);
9662
9663         _mm_storeu_si128((__m128i*)(pOutputSamples + i*8), drflac__mm_packs_interleaved_epi32(left, right));
9664     }
9665
9666     for (i = (frameCount4 << 2); i < frameCount; ++i) {
9667         drflac_uint32 left  = pInputSamples0U32[i] << shift0;
9668         drflac_uint32 side  = pInputSamples1U32[i] << shift1;
9669         drflac_uint32 right = left - side;
9670
9671         left  >>= 16;
9672         right >>= 16;
9673
9674         pOutputSamples[i*2+0] = (drflac_int16)left;
9675         pOutputSamples[i*2+1] = (drflac_int16)right;
9676     }
9677 }
9678 #endif
9679
9680 #if defined(DRFLAC_SUPPORT_NEON)
9681 static DRFLAC_INLINE void drflac_read_pcm_frames_s16__decode_left_side__neon(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int16* pOutputSamples)
9682 {
9683     drflac_uint64 i;
9684     drflac_uint64 frameCount4 = frameCount >> 2;
9685     const drflac_uint32* pInputSamples0U32 = (const drflac_uint32*)pInputSamples0;
9686     const drflac_uint32* pInputSamples1U32 = (const drflac_uint32*)pInputSamples1;
9687     drflac_uint32 shift0 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
9688     drflac_uint32 shift1 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
9689     int32x4_t shift0_4;
9690     int32x4_t shift1_4;
9691
9692     DRFLAC_ASSERT(pFlac->bitsPerSample <= 24);
9693
9694     shift0_4 = vdupq_n_s32(shift0);
9695     shift1_4 = vdupq_n_s32(shift1);
9696
9697     for (i = 0; i < frameCount4; ++i) {
9698         uint32x4_t left;
9699         uint32x4_t side;
9700         uint32x4_t right;
9701
9702         left  = vshlq_u32(vld1q_u32(pInputSamples0U32 + i*4), shift0_4);
9703         side  = vshlq_u32(vld1q_u32(pInputSamples1U32 + i*4), shift1_4);
9704         right = vsubq_u32(left, side);
9705
9706         left  = vshrq_n_u32(left,  16);
9707         right = vshrq_n_u32(right, 16);
9708
9709         drflac__vst2q_u16((drflac_uint16*)pOutputSamples + i*8, vzip_u16(vmovn_u32(left), vmovn_u32(right)));
9710     }
9711
9712     for (i = (frameCount4 << 2); i < frameCount; ++i) {
9713         drflac_uint32 left  = pInputSamples0U32[i] << shift0;
9714         drflac_uint32 side  = pInputSamples1U32[i] << shift1;
9715         drflac_uint32 right = left - side;
9716
9717         left  >>= 16;
9718         right >>= 16;
9719
9720         pOutputSamples[i*2+0] = (drflac_int16)left;
9721         pOutputSamples[i*2+1] = (drflac_int16)right;
9722     }
9723 }
9724 #endif
9725
9726 static DRFLAC_INLINE void drflac_read_pcm_frames_s16__decode_left_side(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int16* pOutputSamples)
9727 {
9728 #if defined(DRFLAC_SUPPORT_SSE2)
9729     if (drflac__gIsSSE2Supported && pFlac->bitsPerSample <= 24) {
9730         drflac_read_pcm_frames_s16__decode_left_side__sse2(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
9731     } else
9732 #elif defined(DRFLAC_SUPPORT_NEON)
9733     if (drflac__gIsNEONSupported && pFlac->bitsPerSample <= 24) {
9734         drflac_read_pcm_frames_s16__decode_left_side__neon(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
9735     } else
9736 #endif
9737     {
9738         /* Scalar fallback. */
9739 #if 0
9740         drflac_read_pcm_frames_s16__decode_left_side__reference(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
9741 #else
9742         drflac_read_pcm_frames_s16__decode_left_side__scalar(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
9743 #endif
9744     }
9745 }
9746
9747
9748 #if 0
9749 static DRFLAC_INLINE void drflac_read_pcm_frames_s16__decode_right_side__reference(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int16* pOutputSamples)
9750 {
9751     drflac_uint64 i;
9752     for (i = 0; i < frameCount; ++i) {
9753         drflac_uint32 side  = (drflac_uint32)pInputSamples0[i] << (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample);
9754         drflac_uint32 right = (drflac_uint32)pInputSamples1[i] << (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample);
9755         drflac_uint32 left  = right + side;
9756
9757         left  >>= 16;
9758         right >>= 16;
9759
9760         pOutputSamples[i*2+0] = (drflac_int16)left;
9761         pOutputSamples[i*2+1] = (drflac_int16)right;
9762     }
9763 }
9764 #endif
9765
9766 static DRFLAC_INLINE void drflac_read_pcm_frames_s16__decode_right_side__scalar(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int16* pOutputSamples)
9767 {
9768     drflac_uint64 i;
9769     drflac_uint64 frameCount4 = frameCount >> 2;
9770     const drflac_uint32* pInputSamples0U32 = (const drflac_uint32*)pInputSamples0;
9771     const drflac_uint32* pInputSamples1U32 = (const drflac_uint32*)pInputSamples1;
9772     drflac_uint32 shift0 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
9773     drflac_uint32 shift1 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
9774
9775     for (i = 0; i < frameCount4; ++i) {
9776         drflac_uint32 side0  = pInputSamples0U32[i*4+0] << shift0;
9777         drflac_uint32 side1  = pInputSamples0U32[i*4+1] << shift0;
9778         drflac_uint32 side2  = pInputSamples0U32[i*4+2] << shift0;
9779         drflac_uint32 side3  = pInputSamples0U32[i*4+3] << shift0;
9780
9781         drflac_uint32 right0 = pInputSamples1U32[i*4+0] << shift1;
9782         drflac_uint32 right1 = pInputSamples1U32[i*4+1] << shift1;
9783         drflac_uint32 right2 = pInputSamples1U32[i*4+2] << shift1;
9784         drflac_uint32 right3 = pInputSamples1U32[i*4+3] << shift1;
9785
9786         drflac_uint32 left0 = right0 + side0;
9787         drflac_uint32 left1 = right1 + side1;
9788         drflac_uint32 left2 = right2 + side2;
9789         drflac_uint32 left3 = right3 + side3;
9790
9791         left0  >>= 16;
9792         left1  >>= 16;
9793         left2  >>= 16;
9794         left3  >>= 16;
9795
9796         right0 >>= 16;
9797         right1 >>= 16;
9798         right2 >>= 16;
9799         right3 >>= 16;
9800
9801         pOutputSamples[i*8+0] = (drflac_int16)left0;
9802         pOutputSamples[i*8+1] = (drflac_int16)right0;
9803         pOutputSamples[i*8+2] = (drflac_int16)left1;
9804         pOutputSamples[i*8+3] = (drflac_int16)right1;
9805         pOutputSamples[i*8+4] = (drflac_int16)left2;
9806         pOutputSamples[i*8+5] = (drflac_int16)right2;
9807         pOutputSamples[i*8+6] = (drflac_int16)left3;
9808         pOutputSamples[i*8+7] = (drflac_int16)right3;
9809     }
9810
9811     for (i = (frameCount4 << 2); i < frameCount; ++i) {
9812         drflac_uint32 side  = pInputSamples0U32[i] << shift0;
9813         drflac_uint32 right = pInputSamples1U32[i] << shift1;
9814         drflac_uint32 left  = right + side;
9815
9816         left  >>= 16;
9817         right >>= 16;
9818
9819         pOutputSamples[i*2+0] = (drflac_int16)left;
9820         pOutputSamples[i*2+1] = (drflac_int16)right;
9821     }
9822 }
9823
9824 #if defined(DRFLAC_SUPPORT_SSE2)
9825 static DRFLAC_INLINE void drflac_read_pcm_frames_s16__decode_right_side__sse2(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int16* pOutputSamples)
9826 {
9827     drflac_uint64 i;
9828     drflac_uint64 frameCount4 = frameCount >> 2;
9829     const drflac_uint32* pInputSamples0U32 = (const drflac_uint32*)pInputSamples0;
9830     const drflac_uint32* pInputSamples1U32 = (const drflac_uint32*)pInputSamples1;
9831     drflac_uint32 shift0 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
9832     drflac_uint32 shift1 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
9833
9834     DRFLAC_ASSERT(pFlac->bitsPerSample <= 24);
9835
9836     for (i = 0; i < frameCount4; ++i) {
9837         __m128i side  = _mm_slli_epi32(_mm_loadu_si128((const __m128i*)pInputSamples0 + i), shift0);
9838         __m128i right = _mm_slli_epi32(_mm_loadu_si128((const __m128i*)pInputSamples1 + i), shift1);
9839         __m128i left  = _mm_add_epi32(right, side);
9840
9841         left  = _mm_srai_epi32(left,  16);
9842         right = _mm_srai_epi32(right, 16);
9843
9844         _mm_storeu_si128((__m128i*)(pOutputSamples + i*8), drflac__mm_packs_interleaved_epi32(left, right));
9845     }
9846
9847     for (i = (frameCount4 << 2); i < frameCount; ++i) {
9848         drflac_uint32 side  = pInputSamples0U32[i] << shift0;
9849         drflac_uint32 right = pInputSamples1U32[i] << shift1;
9850         drflac_uint32 left  = right + side;
9851
9852         left  >>= 16;
9853         right >>= 16;
9854
9855         pOutputSamples[i*2+0] = (drflac_int16)left;
9856         pOutputSamples[i*2+1] = (drflac_int16)right;
9857     }
9858 }
9859 #endif
9860
9861 #if defined(DRFLAC_SUPPORT_NEON)
9862 static DRFLAC_INLINE void drflac_read_pcm_frames_s16__decode_right_side__neon(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int16* pOutputSamples)
9863 {
9864     drflac_uint64 i;
9865     drflac_uint64 frameCount4 = frameCount >> 2;
9866     const drflac_uint32* pInputSamples0U32 = (const drflac_uint32*)pInputSamples0;
9867     const drflac_uint32* pInputSamples1U32 = (const drflac_uint32*)pInputSamples1;
9868     drflac_uint32 shift0 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
9869     drflac_uint32 shift1 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
9870     int32x4_t shift0_4;
9871     int32x4_t shift1_4;
9872
9873     DRFLAC_ASSERT(pFlac->bitsPerSample <= 24);
9874
9875     shift0_4 = vdupq_n_s32(shift0);
9876     shift1_4 = vdupq_n_s32(shift1);
9877
9878     for (i = 0; i < frameCount4; ++i) {
9879         uint32x4_t side;
9880         uint32x4_t right;
9881         uint32x4_t left;
9882
9883         side  = vshlq_u32(vld1q_u32(pInputSamples0U32 + i*4), shift0_4);
9884         right = vshlq_u32(vld1q_u32(pInputSamples1U32 + i*4), shift1_4);
9885         left  = vaddq_u32(right, side);
9886
9887         left  = vshrq_n_u32(left,  16);
9888         right = vshrq_n_u32(right, 16);
9889
9890         drflac__vst2q_u16((drflac_uint16*)pOutputSamples + i*8, vzip_u16(vmovn_u32(left), vmovn_u32(right)));
9891     }
9892
9893     for (i = (frameCount4 << 2); i < frameCount; ++i) {
9894         drflac_uint32 side  = pInputSamples0U32[i] << shift0;
9895         drflac_uint32 right = pInputSamples1U32[i] << shift1;
9896         drflac_uint32 left  = right + side;
9897
9898         left  >>= 16;
9899         right >>= 16;
9900
9901         pOutputSamples[i*2+0] = (drflac_int16)left;
9902         pOutputSamples[i*2+1] = (drflac_int16)right;
9903     }
9904 }
9905 #endif
9906
9907 static DRFLAC_INLINE void drflac_read_pcm_frames_s16__decode_right_side(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int16* pOutputSamples)
9908 {
9909 #if defined(DRFLAC_SUPPORT_SSE2)
9910     if (drflac__gIsSSE2Supported && pFlac->bitsPerSample <= 24) {
9911         drflac_read_pcm_frames_s16__decode_right_side__sse2(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
9912     } else
9913 #elif defined(DRFLAC_SUPPORT_NEON)
9914     if (drflac__gIsNEONSupported && pFlac->bitsPerSample <= 24) {
9915         drflac_read_pcm_frames_s16__decode_right_side__neon(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
9916     } else
9917 #endif
9918     {
9919         /* Scalar fallback. */
9920 #if 0
9921         drflac_read_pcm_frames_s16__decode_right_side__reference(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
9922 #else
9923         drflac_read_pcm_frames_s16__decode_right_side__scalar(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
9924 #endif
9925     }
9926 }
9927
9928
9929 #if 0
9930 static DRFLAC_INLINE void drflac_read_pcm_frames_s16__decode_mid_side__reference(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int16* pOutputSamples)
9931 {
9932     for (drflac_uint64 i = 0; i < frameCount; ++i) {
9933         drflac_uint32 mid  = (drflac_uint32)pInputSamples0[i] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
9934         drflac_uint32 side = (drflac_uint32)pInputSamples1[i] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
9935
9936         mid = (mid << 1) | (side & 0x01);
9937
9938         pOutputSamples[i*2+0] = (drflac_int16)(((drflac_uint32)((drflac_int32)(mid + side) >> 1) << unusedBitsPerSample) >> 16);
9939         pOutputSamples[i*2+1] = (drflac_int16)(((drflac_uint32)((drflac_int32)(mid - side) >> 1) << unusedBitsPerSample) >> 16);
9940     }
9941 }
9942 #endif
9943
9944 static DRFLAC_INLINE void drflac_read_pcm_frames_s16__decode_mid_side__scalar(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int16* pOutputSamples)
9945 {
9946     drflac_uint64 i;
9947     drflac_uint64 frameCount4 = frameCount >> 2;
9948     const drflac_uint32* pInputSamples0U32 = (const drflac_uint32*)pInputSamples0;
9949     const drflac_uint32* pInputSamples1U32 = (const drflac_uint32*)pInputSamples1;
9950     drflac_uint32 shift = unusedBitsPerSample;
9951
9952     if (shift > 0) {
9953         shift -= 1;
9954         for (i = 0; i < frameCount4; ++i) {
9955             drflac_uint32 temp0L;
9956             drflac_uint32 temp1L;
9957             drflac_uint32 temp2L;
9958             drflac_uint32 temp3L;
9959             drflac_uint32 temp0R;
9960             drflac_uint32 temp1R;
9961             drflac_uint32 temp2R;
9962             drflac_uint32 temp3R;
9963
9964             drflac_uint32 mid0  = pInputSamples0U32[i*4+0] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
9965             drflac_uint32 mid1  = pInputSamples0U32[i*4+1] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
9966             drflac_uint32 mid2  = pInputSamples0U32[i*4+2] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
9967             drflac_uint32 mid3  = pInputSamples0U32[i*4+3] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
9968
9969             drflac_uint32 side0 = pInputSamples1U32[i*4+0] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
9970             drflac_uint32 side1 = pInputSamples1U32[i*4+1] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
9971             drflac_uint32 side2 = pInputSamples1U32[i*4+2] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
9972             drflac_uint32 side3 = pInputSamples1U32[i*4+3] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
9973
9974             mid0 = (mid0 << 1) | (side0 & 0x01);
9975             mid1 = (mid1 << 1) | (side1 & 0x01);
9976             mid2 = (mid2 << 1) | (side2 & 0x01);
9977             mid3 = (mid3 << 1) | (side3 & 0x01);
9978
9979             temp0L = (mid0 + side0) << shift;
9980             temp1L = (mid1 + side1) << shift;
9981             temp2L = (mid2 + side2) << shift;
9982             temp3L = (mid3 + side3) << shift;
9983
9984             temp0R = (mid0 - side0) << shift;
9985             temp1R = (mid1 - side1) << shift;
9986             temp2R = (mid2 - side2) << shift;
9987             temp3R = (mid3 - side3) << shift;
9988
9989             temp0L >>= 16;
9990             temp1L >>= 16;
9991             temp2L >>= 16;
9992             temp3L >>= 16;
9993
9994             temp0R >>= 16;
9995             temp1R >>= 16;
9996             temp2R >>= 16;
9997             temp3R >>= 16;
9998
9999             pOutputSamples[i*8+0] = (drflac_int16)temp0L;
10000             pOutputSamples[i*8+1] = (drflac_int16)temp0R;
10001             pOutputSamples[i*8+2] = (drflac_int16)temp1L;
10002             pOutputSamples[i*8+3] = (drflac_int16)temp1R;
10003             pOutputSamples[i*8+4] = (drflac_int16)temp2L;
10004             pOutputSamples[i*8+5] = (drflac_int16)temp2R;
10005             pOutputSamples[i*8+6] = (drflac_int16)temp3L;
10006             pOutputSamples[i*8+7] = (drflac_int16)temp3R;
10007         }
10008     } else {
10009         for (i = 0; i < frameCount4; ++i) {
10010             drflac_uint32 temp0L;
10011             drflac_uint32 temp1L;
10012             drflac_uint32 temp2L;
10013             drflac_uint32 temp3L;
10014             drflac_uint32 temp0R;
10015             drflac_uint32 temp1R;
10016             drflac_uint32 temp2R;
10017             drflac_uint32 temp3R;
10018
10019             drflac_uint32 mid0  = pInputSamples0U32[i*4+0] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
10020             drflac_uint32 mid1  = pInputSamples0U32[i*4+1] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
10021             drflac_uint32 mid2  = pInputSamples0U32[i*4+2] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
10022             drflac_uint32 mid3  = pInputSamples0U32[i*4+3] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
10023
10024             drflac_uint32 side0 = pInputSamples1U32[i*4+0] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
10025             drflac_uint32 side1 = pInputSamples1U32[i*4+1] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
10026             drflac_uint32 side2 = pInputSamples1U32[i*4+2] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
10027             drflac_uint32 side3 = pInputSamples1U32[i*4+3] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
10028
10029             mid0 = (mid0 << 1) | (side0 & 0x01);
10030             mid1 = (mid1 << 1) | (side1 & 0x01);
10031             mid2 = (mid2 << 1) | (side2 & 0x01);
10032             mid3 = (mid3 << 1) | (side3 & 0x01);
10033
10034             temp0L = ((drflac_int32)(mid0 + side0) >> 1);
10035             temp1L = ((drflac_int32)(mid1 + side1) >> 1);
10036             temp2L = ((drflac_int32)(mid2 + side2) >> 1);
10037             temp3L = ((drflac_int32)(mid3 + side3) >> 1);
10038
10039             temp0R = ((drflac_int32)(mid0 - side0) >> 1);
10040             temp1R = ((drflac_int32)(mid1 - side1) >> 1);
10041             temp2R = ((drflac_int32)(mid2 - side2) >> 1);
10042             temp3R = ((drflac_int32)(mid3 - side3) >> 1);
10043
10044             temp0L >>= 16;
10045             temp1L >>= 16;
10046             temp2L >>= 16;
10047             temp3L >>= 16;
10048
10049             temp0R >>= 16;
10050             temp1R >>= 16;
10051             temp2R >>= 16;
10052             temp3R >>= 16;
10053
10054             pOutputSamples[i*8+0] = (drflac_int16)temp0L;
10055             pOutputSamples[i*8+1] = (drflac_int16)temp0R;
10056             pOutputSamples[i*8+2] = (drflac_int16)temp1L;
10057             pOutputSamples[i*8+3] = (drflac_int16)temp1R;
10058             pOutputSamples[i*8+4] = (drflac_int16)temp2L;
10059             pOutputSamples[i*8+5] = (drflac_int16)temp2R;
10060             pOutputSamples[i*8+6] = (drflac_int16)temp3L;
10061             pOutputSamples[i*8+7] = (drflac_int16)temp3R;
10062         }
10063     }
10064
10065     for (i = (frameCount4 << 2); i < frameCount; ++i) {
10066         drflac_uint32 mid  = pInputSamples0U32[i] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
10067         drflac_uint32 side = pInputSamples1U32[i] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
10068
10069         mid = (mid << 1) | (side & 0x01);
10070
10071         pOutputSamples[i*2+0] = (drflac_int16)(((drflac_uint32)((drflac_int32)(mid + side) >> 1) << unusedBitsPerSample) >> 16);
10072         pOutputSamples[i*2+1] = (drflac_int16)(((drflac_uint32)((drflac_int32)(mid - side) >> 1) << unusedBitsPerSample) >> 16);
10073     }
10074 }
10075
10076 #if defined(DRFLAC_SUPPORT_SSE2)
10077 static DRFLAC_INLINE void drflac_read_pcm_frames_s16__decode_mid_side__sse2(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int16* pOutputSamples)
10078 {
10079     drflac_uint64 i;
10080     drflac_uint64 frameCount4 = frameCount >> 2;
10081     const drflac_uint32* pInputSamples0U32 = (const drflac_uint32*)pInputSamples0;
10082     const drflac_uint32* pInputSamples1U32 = (const drflac_uint32*)pInputSamples1;
10083     drflac_uint32 shift = unusedBitsPerSample;
10084
10085     DRFLAC_ASSERT(pFlac->bitsPerSample <= 24);
10086
10087     if (shift == 0) {
10088         for (i = 0; i < frameCount4; ++i) {
10089             __m128i mid;
10090             __m128i side;
10091             __m128i left;
10092             __m128i right;
10093
10094             mid   = _mm_slli_epi32(_mm_loadu_si128((const __m128i*)pInputSamples0 + i), pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample);
10095             side  = _mm_slli_epi32(_mm_loadu_si128((const __m128i*)pInputSamples1 + i), pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample);
10096
10097             mid   = _mm_or_si128(_mm_slli_epi32(mid, 1), _mm_and_si128(side, _mm_set1_epi32(0x01)));
10098
10099             left  = _mm_srai_epi32(_mm_add_epi32(mid, side), 1);
10100             right = _mm_srai_epi32(_mm_sub_epi32(mid, side), 1);
10101
10102             left  = _mm_srai_epi32(left,  16);
10103             right = _mm_srai_epi32(right, 16);
10104
10105             _mm_storeu_si128((__m128i*)(pOutputSamples + i*8), drflac__mm_packs_interleaved_epi32(left, right));
10106         }
10107
10108         for (i = (frameCount4 << 2); i < frameCount; ++i) {
10109             drflac_uint32 mid  = pInputSamples0U32[i] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
10110             drflac_uint32 side = pInputSamples1U32[i] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
10111
10112             mid = (mid << 1) | (side & 0x01);
10113
10114             pOutputSamples[i*2+0] = (drflac_int16)(((drflac_int32)(mid + side) >> 1) >> 16);
10115             pOutputSamples[i*2+1] = (drflac_int16)(((drflac_int32)(mid - side) >> 1) >> 16);
10116         }
10117     } else {
10118         shift -= 1;
10119         for (i = 0; i < frameCount4; ++i) {
10120             __m128i mid;
10121             __m128i side;
10122             __m128i left;
10123             __m128i right;
10124
10125             mid   = _mm_slli_epi32(_mm_loadu_si128((const __m128i*)pInputSamples0 + i), pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample);
10126             side  = _mm_slli_epi32(_mm_loadu_si128((const __m128i*)pInputSamples1 + i), pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample);
10127
10128             mid   = _mm_or_si128(_mm_slli_epi32(mid, 1), _mm_and_si128(side, _mm_set1_epi32(0x01)));
10129
10130             left  = _mm_slli_epi32(_mm_add_epi32(mid, side), shift);
10131             right = _mm_slli_epi32(_mm_sub_epi32(mid, side), shift);
10132
10133             left  = _mm_srai_epi32(left,  16);
10134             right = _mm_srai_epi32(right, 16);
10135
10136             _mm_storeu_si128((__m128i*)(pOutputSamples + i*8), drflac__mm_packs_interleaved_epi32(left, right));
10137         }
10138
10139         for (i = (frameCount4 << 2); i < frameCount; ++i) {
10140             drflac_uint32 mid  = pInputSamples0U32[i] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
10141             drflac_uint32 side = pInputSamples1U32[i] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
10142
10143             mid = (mid << 1) | (side & 0x01);
10144
10145             pOutputSamples[i*2+0] = (drflac_int16)(((mid + side) << shift) >> 16);
10146             pOutputSamples[i*2+1] = (drflac_int16)(((mid - side) << shift) >> 16);
10147         }
10148     }
10149 }
10150 #endif
10151
10152 #if defined(DRFLAC_SUPPORT_NEON)
10153 static DRFLAC_INLINE void drflac_read_pcm_frames_s16__decode_mid_side__neon(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int16* pOutputSamples)
10154 {
10155     drflac_uint64 i;
10156     drflac_uint64 frameCount4 = frameCount >> 2;
10157     const drflac_uint32* pInputSamples0U32 = (const drflac_uint32*)pInputSamples0;
10158     const drflac_uint32* pInputSamples1U32 = (const drflac_uint32*)pInputSamples1;
10159     drflac_uint32 shift = unusedBitsPerSample;
10160     int32x4_t wbpsShift0_4; /* wbps = Wasted Bits Per Sample */
10161     int32x4_t wbpsShift1_4; /* wbps = Wasted Bits Per Sample */
10162
10163     DRFLAC_ASSERT(pFlac->bitsPerSample <= 24);
10164
10165     wbpsShift0_4 = vdupq_n_s32(pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample);
10166     wbpsShift1_4 = vdupq_n_s32(pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample);
10167
10168     if (shift == 0) {
10169         for (i = 0; i < frameCount4; ++i) {
10170             uint32x4_t mid;
10171             uint32x4_t side;
10172             int32x4_t left;
10173             int32x4_t right;
10174
10175             mid   = vshlq_u32(vld1q_u32(pInputSamples0U32 + i*4), wbpsShift0_4);
10176             side  = vshlq_u32(vld1q_u32(pInputSamples1U32 + i*4), wbpsShift1_4);
10177
10178             mid   = vorrq_u32(vshlq_n_u32(mid, 1), vandq_u32(side, vdupq_n_u32(1)));
10179
10180             left  = vshrq_n_s32(vreinterpretq_s32_u32(vaddq_u32(mid, side)), 1);
10181             right = vshrq_n_s32(vreinterpretq_s32_u32(vsubq_u32(mid, side)), 1);
10182
10183             left  = vshrq_n_s32(left,  16);
10184             right = vshrq_n_s32(right, 16);
10185
10186             drflac__vst2q_s16(pOutputSamples + i*8, vzip_s16(vmovn_s32(left), vmovn_s32(right)));
10187         }
10188
10189         for (i = (frameCount4 << 2); i < frameCount; ++i) {
10190             drflac_uint32 mid  = pInputSamples0U32[i] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
10191             drflac_uint32 side = pInputSamples1U32[i] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
10192
10193             mid = (mid << 1) | (side & 0x01);
10194
10195             pOutputSamples[i*2+0] = (drflac_int16)(((drflac_int32)(mid + side) >> 1) >> 16);
10196             pOutputSamples[i*2+1] = (drflac_int16)(((drflac_int32)(mid - side) >> 1) >> 16);
10197         }
10198     } else {
10199         int32x4_t shift4;
10200
10201         shift -= 1;
10202         shift4 = vdupq_n_s32(shift);
10203
10204         for (i = 0; i < frameCount4; ++i) {
10205             uint32x4_t mid;
10206             uint32x4_t side;
10207             int32x4_t left;
10208             int32x4_t right;
10209
10210             mid   = vshlq_u32(vld1q_u32(pInputSamples0U32 + i*4), wbpsShift0_4);
10211             side  = vshlq_u32(vld1q_u32(pInputSamples1U32 + i*4), wbpsShift1_4);
10212
10213             mid   = vorrq_u32(vshlq_n_u32(mid, 1), vandq_u32(side, vdupq_n_u32(1)));
10214
10215             left  = vreinterpretq_s32_u32(vshlq_u32(vaddq_u32(mid, side), shift4));
10216             right = vreinterpretq_s32_u32(vshlq_u32(vsubq_u32(mid, side), shift4));
10217
10218             left  = vshrq_n_s32(left,  16);
10219             right = vshrq_n_s32(right, 16);
10220
10221             drflac__vst2q_s16(pOutputSamples + i*8, vzip_s16(vmovn_s32(left), vmovn_s32(right)));
10222         }
10223
10224         for (i = (frameCount4 << 2); i < frameCount; ++i) {
10225             drflac_uint32 mid  = pInputSamples0U32[i] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
10226             drflac_uint32 side = pInputSamples1U32[i] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
10227
10228             mid = (mid << 1) | (side & 0x01);
10229
10230             pOutputSamples[i*2+0] = (drflac_int16)(((mid + side) << shift) >> 16);
10231             pOutputSamples[i*2+1] = (drflac_int16)(((mid - side) << shift) >> 16);
10232         }
10233     }
10234 }
10235 #endif
10236
10237 static DRFLAC_INLINE void drflac_read_pcm_frames_s16__decode_mid_side(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int16* pOutputSamples)
10238 {
10239 #if defined(DRFLAC_SUPPORT_SSE2)
10240     if (drflac__gIsSSE2Supported && pFlac->bitsPerSample <= 24) {
10241         drflac_read_pcm_frames_s16__decode_mid_side__sse2(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
10242     } else
10243 #elif defined(DRFLAC_SUPPORT_NEON)
10244     if (drflac__gIsNEONSupported && pFlac->bitsPerSample <= 24) {
10245         drflac_read_pcm_frames_s16__decode_mid_side__neon(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
10246     } else
10247 #endif
10248     {
10249         /* Scalar fallback. */
10250 #if 0
10251         drflac_read_pcm_frames_s16__decode_mid_side__reference(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
10252 #else
10253         drflac_read_pcm_frames_s16__decode_mid_side__scalar(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
10254 #endif
10255     }
10256 }
10257
10258
10259 #if 0
10260 static DRFLAC_INLINE void drflac_read_pcm_frames_s16__decode_independent_stereo__reference(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int16* pOutputSamples)
10261 {
10262     for (drflac_uint64 i = 0; i < frameCount; ++i) {
10263         pOutputSamples[i*2+0] = (drflac_int16)((drflac_int32)((drflac_uint32)pInputSamples0[i] << (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample)) >> 16);
10264         pOutputSamples[i*2+1] = (drflac_int16)((drflac_int32)((drflac_uint32)pInputSamples1[i] << (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample)) >> 16);
10265     }
10266 }
10267 #endif
10268
10269 static DRFLAC_INLINE void drflac_read_pcm_frames_s16__decode_independent_stereo__scalar(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int16* pOutputSamples)
10270 {
10271     drflac_uint64 i;
10272     drflac_uint64 frameCount4 = frameCount >> 2;
10273     const drflac_uint32* pInputSamples0U32 = (const drflac_uint32*)pInputSamples0;
10274     const drflac_uint32* pInputSamples1U32 = (const drflac_uint32*)pInputSamples1;
10275     drflac_uint32 shift0 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
10276     drflac_uint32 shift1 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
10277
10278     for (i = 0; i < frameCount4; ++i) {
10279         drflac_uint32 tempL0 = pInputSamples0U32[i*4+0] << shift0;
10280         drflac_uint32 tempL1 = pInputSamples0U32[i*4+1] << shift0;
10281         drflac_uint32 tempL2 = pInputSamples0U32[i*4+2] << shift0;
10282         drflac_uint32 tempL3 = pInputSamples0U32[i*4+3] << shift0;
10283
10284         drflac_uint32 tempR0 = pInputSamples1U32[i*4+0] << shift1;
10285         drflac_uint32 tempR1 = pInputSamples1U32[i*4+1] << shift1;
10286         drflac_uint32 tempR2 = pInputSamples1U32[i*4+2] << shift1;
10287         drflac_uint32 tempR3 = pInputSamples1U32[i*4+3] << shift1;
10288
10289         tempL0 >>= 16;
10290         tempL1 >>= 16;
10291         tempL2 >>= 16;
10292         tempL3 >>= 16;
10293
10294         tempR0 >>= 16;
10295         tempR1 >>= 16;
10296         tempR2 >>= 16;
10297         tempR3 >>= 16;
10298
10299         pOutputSamples[i*8+0] = (drflac_int16)tempL0;
10300         pOutputSamples[i*8+1] = (drflac_int16)tempR0;
10301         pOutputSamples[i*8+2] = (drflac_int16)tempL1;
10302         pOutputSamples[i*8+3] = (drflac_int16)tempR1;
10303         pOutputSamples[i*8+4] = (drflac_int16)tempL2;
10304         pOutputSamples[i*8+5] = (drflac_int16)tempR2;
10305         pOutputSamples[i*8+6] = (drflac_int16)tempL3;
10306         pOutputSamples[i*8+7] = (drflac_int16)tempR3;
10307     }
10308
10309     for (i = (frameCount4 << 2); i < frameCount; ++i) {
10310         pOutputSamples[i*2+0] = (drflac_int16)((pInputSamples0U32[i] << shift0) >> 16);
10311         pOutputSamples[i*2+1] = (drflac_int16)((pInputSamples1U32[i] << shift1) >> 16);
10312     }
10313 }
10314
10315 #if defined(DRFLAC_SUPPORT_SSE2)
10316 static DRFLAC_INLINE void drflac_read_pcm_frames_s16__decode_independent_stereo__sse2(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int16* pOutputSamples)
10317 {
10318     drflac_uint64 i;
10319     drflac_uint64 frameCount4 = frameCount >> 2;
10320     const drflac_uint32* pInputSamples0U32 = (const drflac_uint32*)pInputSamples0;
10321     const drflac_uint32* pInputSamples1U32 = (const drflac_uint32*)pInputSamples1;
10322     drflac_uint32 shift0 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
10323     drflac_uint32 shift1 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
10324
10325     for (i = 0; i < frameCount4; ++i) {
10326         __m128i left  = _mm_slli_epi32(_mm_loadu_si128((const __m128i*)pInputSamples0 + i), shift0);
10327         __m128i right = _mm_slli_epi32(_mm_loadu_si128((const __m128i*)pInputSamples1 + i), shift1);
10328
10329         left  = _mm_srai_epi32(left,  16);
10330         right = _mm_srai_epi32(right, 16);
10331
10332         /* At this point we have results. We can now pack and interleave these into a single __m128i object and then store the in the output buffer. */
10333         _mm_storeu_si128((__m128i*)(pOutputSamples + i*8), drflac__mm_packs_interleaved_epi32(left, right));
10334     }
10335
10336     for (i = (frameCount4 << 2); i < frameCount; ++i) {
10337         pOutputSamples[i*2+0] = (drflac_int16)((pInputSamples0U32[i] << shift0) >> 16);
10338         pOutputSamples[i*2+1] = (drflac_int16)((pInputSamples1U32[i] << shift1) >> 16);
10339     }
10340 }
10341 #endif
10342
10343 #if defined(DRFLAC_SUPPORT_NEON)
10344 static DRFLAC_INLINE void drflac_read_pcm_frames_s16__decode_independent_stereo__neon(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int16* pOutputSamples)
10345 {
10346     drflac_uint64 i;
10347     drflac_uint64 frameCount4 = frameCount >> 2;
10348     const drflac_uint32* pInputSamples0U32 = (const drflac_uint32*)pInputSamples0;
10349     const drflac_uint32* pInputSamples1U32 = (const drflac_uint32*)pInputSamples1;
10350     drflac_uint32 shift0 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
10351     drflac_uint32 shift1 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
10352
10353     int32x4_t shift0_4 = vdupq_n_s32(shift0);
10354     int32x4_t shift1_4 = vdupq_n_s32(shift1);
10355
10356     for (i = 0; i < frameCount4; ++i) {
10357         int32x4_t left;
10358         int32x4_t right;
10359
10360         left  = vreinterpretq_s32_u32(vshlq_u32(vld1q_u32(pInputSamples0U32 + i*4), shift0_4));
10361         right = vreinterpretq_s32_u32(vshlq_u32(vld1q_u32(pInputSamples1U32 + i*4), shift1_4));
10362
10363         left  = vshrq_n_s32(left,  16);
10364         right = vshrq_n_s32(right, 16);
10365
10366         drflac__vst2q_s16(pOutputSamples + i*8, vzip_s16(vmovn_s32(left), vmovn_s32(right)));
10367     }
10368
10369     for (i = (frameCount4 << 2); i < frameCount; ++i) {
10370         pOutputSamples[i*2+0] = (drflac_int16)((pInputSamples0U32[i] << shift0) >> 16);
10371         pOutputSamples[i*2+1] = (drflac_int16)((pInputSamples1U32[i] << shift1) >> 16);
10372     }
10373 }
10374 #endif
10375
10376 static DRFLAC_INLINE void drflac_read_pcm_frames_s16__decode_independent_stereo(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, drflac_int16* pOutputSamples)
10377 {
10378 #if defined(DRFLAC_SUPPORT_SSE2)
10379     if (drflac__gIsSSE2Supported && pFlac->bitsPerSample <= 24) {
10380         drflac_read_pcm_frames_s16__decode_independent_stereo__sse2(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
10381     } else
10382 #elif defined(DRFLAC_SUPPORT_NEON)
10383     if (drflac__gIsNEONSupported && pFlac->bitsPerSample <= 24) {
10384         drflac_read_pcm_frames_s16__decode_independent_stereo__neon(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
10385     } else
10386 #endif
10387     {
10388         /* Scalar fallback. */
10389 #if 0
10390         drflac_read_pcm_frames_s16__decode_independent_stereo__reference(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
10391 #else
10392         drflac_read_pcm_frames_s16__decode_independent_stereo__scalar(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
10393 #endif
10394     }
10395 }
10396
10397 DRFLAC_API drflac_uint64 drflac_read_pcm_frames_s16(drflac* pFlac, drflac_uint64 framesToRead, drflac_int16* pBufferOut)
10398 {
10399     drflac_uint64 framesRead;
10400     drflac_uint32 unusedBitsPerSample;
10401
10402     if (pFlac == NULL || framesToRead == 0) {
10403         return 0;
10404     }
10405
10406     if (pBufferOut == NULL) {
10407         return drflac__seek_forward_by_pcm_frames(pFlac, framesToRead);
10408     }
10409
10410     DRFLAC_ASSERT(pFlac->bitsPerSample <= 32);
10411     unusedBitsPerSample = 32 - pFlac->bitsPerSample;
10412
10413     framesRead = 0;
10414     while (framesToRead > 0) {
10415         /* If we've run out of samples in this frame, go to the next. */
10416         if (pFlac->currentFLACFrame.pcmFramesRemaining == 0) {
10417             if (!drflac__read_and_decode_next_flac_frame(pFlac)) {
10418                 break;  /* Couldn't read the next frame, so just break from the loop and return. */
10419             }
10420         } else {
10421             unsigned int channelCount = drflac__get_channel_count_from_channel_assignment(pFlac->currentFLACFrame.header.channelAssignment);
10422             drflac_uint64 iFirstPCMFrame = pFlac->currentFLACFrame.header.blockSizeInPCMFrames - pFlac->currentFLACFrame.pcmFramesRemaining;
10423             drflac_uint64 frameCountThisIteration = framesToRead;
10424
10425             if (frameCountThisIteration > pFlac->currentFLACFrame.pcmFramesRemaining) {
10426                 frameCountThisIteration = pFlac->currentFLACFrame.pcmFramesRemaining;
10427             }
10428
10429             if (channelCount == 2) {
10430                 const drflac_int32* pDecodedSamples0 = pFlac->currentFLACFrame.subframes[0].pSamplesS32 + iFirstPCMFrame;
10431                 const drflac_int32* pDecodedSamples1 = pFlac->currentFLACFrame.subframes[1].pSamplesS32 + iFirstPCMFrame;
10432
10433                 switch (pFlac->currentFLACFrame.header.channelAssignment)
10434                 {
10435                     case DRFLAC_CHANNEL_ASSIGNMENT_LEFT_SIDE:
10436                     {
10437                         drflac_read_pcm_frames_s16__decode_left_side(pFlac, frameCountThisIteration, unusedBitsPerSample, pDecodedSamples0, pDecodedSamples1, pBufferOut);
10438                     } break;
10439
10440                     case DRFLAC_CHANNEL_ASSIGNMENT_RIGHT_SIDE:
10441                     {
10442                         drflac_read_pcm_frames_s16__decode_right_side(pFlac, frameCountThisIteration, unusedBitsPerSample, pDecodedSamples0, pDecodedSamples1, pBufferOut);
10443                     } break;
10444
10445                     case DRFLAC_CHANNEL_ASSIGNMENT_MID_SIDE:
10446                     {
10447                         drflac_read_pcm_frames_s16__decode_mid_side(pFlac, frameCountThisIteration, unusedBitsPerSample, pDecodedSamples0, pDecodedSamples1, pBufferOut);
10448                     } break;
10449
10450                     case DRFLAC_CHANNEL_ASSIGNMENT_INDEPENDENT:
10451                     default:
10452                     {
10453                         drflac_read_pcm_frames_s16__decode_independent_stereo(pFlac, frameCountThisIteration, unusedBitsPerSample, pDecodedSamples0, pDecodedSamples1, pBufferOut);
10454                     } break;
10455                 }
10456             } else {
10457                 /* Generic interleaving. */
10458                 drflac_uint64 i;
10459                 for (i = 0; i < frameCountThisIteration; ++i) {
10460                     unsigned int j;
10461                     for (j = 0; j < channelCount; ++j) {
10462                         drflac_int32 sampleS32 = (drflac_int32)((drflac_uint32)(pFlac->currentFLACFrame.subframes[j].pSamplesS32[iFirstPCMFrame + i]) << (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[j].wastedBitsPerSample));
10463                         pBufferOut[(i*channelCount)+j] = (drflac_int16)(sampleS32 >> 16);
10464                     }
10465                 }
10466             }
10467
10468             framesRead                += frameCountThisIteration;
10469             pBufferOut                += frameCountThisIteration * channelCount;
10470             framesToRead              -= frameCountThisIteration;
10471             pFlac->currentPCMFrame    += frameCountThisIteration;
10472             pFlac->currentFLACFrame.pcmFramesRemaining -= (drflac_uint32)frameCountThisIteration;
10473         }
10474     }
10475
10476     return framesRead;
10477 }
10478
10479
#if 0
/*
Unoptimized reference implementation of the left/side -> interleaved f32 conversion (compiled out).

pInputSamples0 is treated as the left channel and pInputSamples1 as the side channel. Every sample is shifted up by
unusedBitsPerSample plus the wasted bit count of its subframe, the right channel is rebuilt as (left - side) using wrapping
unsigned arithmetic, and both values are reinterpreted as signed 32-bit and divided by 2^31. The output holds frameCount
L,R pairs.
*/
static DRFLAC_INLINE void drflac_read_pcm_frames_f32__decode_left_side__reference(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, float* pOutputSamples)
{
    const drflac_uint32 leftShift = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
    const drflac_uint32 sideShift = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
    drflac_uint64 iFrame;

    for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
        float* pFrameOut    = pOutputSamples + iFrame*2;
        drflac_uint32 left  = (drflac_uint32)pInputSamples0[iFrame] << leftShift;
        drflac_uint32 side  = (drflac_uint32)pInputSamples1[iFrame] << sideShift;
        drflac_uint32 right = left - side;

        pFrameOut[0] = (float)((drflac_int32)left  / 2147483648.0);
        pFrameOut[1] = (float)((drflac_int32)right / 2147483648.0);
    }
}
#endif
10494
10495 static DRFLAC_INLINE void drflac_read_pcm_frames_f32__decode_left_side__scalar(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, float* pOutputSamples)
10496 {
10497     drflac_uint64 i;
10498     drflac_uint64 frameCount4 = frameCount >> 2;
10499     const drflac_uint32* pInputSamples0U32 = (const drflac_uint32*)pInputSamples0;
10500     const drflac_uint32* pInputSamples1U32 = (const drflac_uint32*)pInputSamples1;
10501     drflac_uint32 shift0 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
10502     drflac_uint32 shift1 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
10503
10504     float factor = 1 / 2147483648.0;
10505
10506     for (i = 0; i < frameCount4; ++i) {
10507         drflac_uint32 left0 = pInputSamples0U32[i*4+0] << shift0;
10508         drflac_uint32 left1 = pInputSamples0U32[i*4+1] << shift0;
10509         drflac_uint32 left2 = pInputSamples0U32[i*4+2] << shift0;
10510         drflac_uint32 left3 = pInputSamples0U32[i*4+3] << shift0;
10511
10512         drflac_uint32 side0 = pInputSamples1U32[i*4+0] << shift1;
10513         drflac_uint32 side1 = pInputSamples1U32[i*4+1] << shift1;
10514         drflac_uint32 side2 = pInputSamples1U32[i*4+2] << shift1;
10515         drflac_uint32 side3 = pInputSamples1U32[i*4+3] << shift1;
10516
10517         drflac_uint32 right0 = left0 - side0;
10518         drflac_uint32 right1 = left1 - side1;
10519         drflac_uint32 right2 = left2 - side2;
10520         drflac_uint32 right3 = left3 - side3;
10521
10522         pOutputSamples[i*8+0] = (drflac_int32)left0  * factor;
10523         pOutputSamples[i*8+1] = (drflac_int32)right0 * factor;
10524         pOutputSamples[i*8+2] = (drflac_int32)left1  * factor;
10525         pOutputSamples[i*8+3] = (drflac_int32)right1 * factor;
10526         pOutputSamples[i*8+4] = (drflac_int32)left2  * factor;
10527         pOutputSamples[i*8+5] = (drflac_int32)right2 * factor;
10528         pOutputSamples[i*8+6] = (drflac_int32)left3  * factor;
10529         pOutputSamples[i*8+7] = (drflac_int32)right3 * factor;
10530     }
10531
10532     for (i = (frameCount4 << 2); i < frameCount; ++i) {
10533         drflac_uint32 left  = pInputSamples0U32[i] << shift0;
10534         drflac_uint32 side  = pInputSamples1U32[i] << shift1;
10535         drflac_uint32 right = left - side;
10536
10537         pOutputSamples[i*2+0] = (drflac_int32)left  * factor;
10538         pOutputSamples[i*2+1] = (drflac_int32)right * factor;
10539     }
10540 }
10541
#if defined(DRFLAC_SUPPORT_SSE2)
/*
SSE2 left/side -> interleaved f32 conversion. Asserts that pFlac->bitsPerSample <= 24.

Samples are shifted by (unusedBitsPerSample + wasted bits - 8) rather than the full amount used by the scalar path, and
the result is scaled by 1/2^23 instead of 1/2^31. Four frames are converted per vector iteration; the remaining frames
(fewer than four) are handled one at a time with the same shift and scale.
*/
static DRFLAC_INLINE void drflac_read_pcm_frames_f32__decode_left_side__sse2(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, float* pOutputSamples)
{
    const __m128i* pLeftIn128     = (const __m128i*)pInputSamples0;
    const __m128i* pSideIn128     = (const __m128i*)pInputSamples1;
    const drflac_uint32* pLeftIn  = (const drflac_uint32*)pInputSamples0;
    const drflac_uint32* pSideIn  = (const drflac_uint32*)pInputSamples1;
    const drflac_uint32 leftShift = (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample) - 8;
    const drflac_uint32 sideShift = (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample) - 8;
    const drflac_uint64 vectorCount = frameCount >> 2;
    drflac_uint64 iVector;
    drflac_uint64 iFrame;
    __m128 scale128;

    DRFLAC_ASSERT(pFlac->bitsPerSample <= 24);

    scale128 = _mm_set1_ps(1.0f / 8388608.0f);

    for (iVector = 0; iVector < vectorCount; ++iVector) {
        __m128i leftRaw = _mm_loadu_si128(pLeftIn128 + iVector);
        __m128i sideRaw = _mm_loadu_si128(pSideIn128 + iVector);
        __m128i left    = _mm_slli_epi32(leftRaw, leftShift);
        __m128i side    = _mm_slli_epi32(sideRaw, sideShift);
        __m128i right   = _mm_sub_epi32(left, side);
        __m128  leftF   = _mm_mul_ps(_mm_cvtepi32_ps(left),  scale128);
        __m128  rightF  = _mm_mul_ps(_mm_cvtepi32_ps(right), scale128);
        float*  pOut    = pOutputSamples + iVector*8;

        /* Interleave to L0 R0 L1 R1 | L2 R2 L3 R3. */
        _mm_storeu_ps(pOut + 0, _mm_unpacklo_ps(leftF, rightF));
        _mm_storeu_ps(pOut + 4, _mm_unpackhi_ps(leftF, rightF));
    }

    for (iFrame = (vectorCount << 2); iFrame < frameCount; ++iFrame) {
        drflac_uint32 left  = pLeftIn[iFrame] << leftShift;
        drflac_uint32 side  = pSideIn[iFrame] << sideShift;
        drflac_uint32 right = left - side;

        pOutputSamples[iFrame*2+0] = (drflac_int32)left  / 8388608.0f;
        pOutputSamples[iFrame*2+1] = (drflac_int32)right / 8388608.0f;
    }
}
#endif
10578
#if defined(DRFLAC_SUPPORT_NEON)
/*
NEON left/side -> interleaved f32 conversion. Asserts that pFlac->bitsPerSample <= 24.

Samples are shifted by (unusedBitsPerSample + wasted bits - 8) and scaled by 1/2^23. Four frames are converted per vector
iteration; the remaining frames (fewer than four) are handled one at a time with the same shift and scale.
*/
static DRFLAC_INLINE void drflac_read_pcm_frames_f32__decode_left_side__neon(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, float* pOutputSamples)
{
    const drflac_uint32* pLeftIn  = (const drflac_uint32*)pInputSamples0;
    const drflac_uint32* pSideIn  = (const drflac_uint32*)pInputSamples1;
    const drflac_uint32 leftShift = (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample) - 8;
    const drflac_uint32 sideShift = (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample) - 8;
    const drflac_uint64 vectorCount = frameCount >> 2;
    drflac_uint64 iVector;
    drflac_uint64 iFrame;
    float32x4_t scale4;
    int32x4_t leftShift4;
    int32x4_t sideShift4;

    DRFLAC_ASSERT(pFlac->bitsPerSample <= 24);

    scale4     = vdupq_n_f32(1.0f / 8388608.0f);
    leftShift4 = vdupq_n_s32(leftShift);
    sideShift4 = vdupq_n_s32(sideShift);

    for (iVector = 0; iVector < vectorCount; ++iVector) {
        uint32x4_t  left   = vshlq_u32(vld1q_u32(pLeftIn + iVector*4), leftShift4);
        uint32x4_t  side   = vshlq_u32(vld1q_u32(pSideIn + iVector*4), sideShift4);
        uint32x4_t  right  = vsubq_u32(left, side);
        float32x4_t leftF  = vmulq_f32(vcvtq_f32_s32(vreinterpretq_s32_u32(left)),  scale4);
        float32x4_t rightF = vmulq_f32(vcvtq_f32_s32(vreinterpretq_s32_u32(right)), scale4);

        drflac__vst2q_f32(pOutputSamples + iVector*8, vzipq_f32(leftF, rightF));
    }

    for (iFrame = (vectorCount << 2); iFrame < frameCount; ++iFrame) {
        drflac_uint32 left  = pLeftIn[iFrame] << leftShift;
        drflac_uint32 side  = pSideIn[iFrame] << sideShift;
        drflac_uint32 right = left - side;

        pOutputSamples[iFrame*2+0] = (drflac_int32)left  / 8388608.0f;
        pOutputSamples[iFrame*2+1] = (drflac_int32)right / 8388608.0f;
    }
}
#endif
10624
10625 static DRFLAC_INLINE void drflac_read_pcm_frames_f32__decode_left_side(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, float* pOutputSamples)
10626 {
10627 #if defined(DRFLAC_SUPPORT_SSE2)
10628     if (drflac__gIsSSE2Supported && pFlac->bitsPerSample <= 24) {
10629         drflac_read_pcm_frames_f32__decode_left_side__sse2(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
10630     } else
10631 #elif defined(DRFLAC_SUPPORT_NEON)
10632     if (drflac__gIsNEONSupported && pFlac->bitsPerSample <= 24) {
10633         drflac_read_pcm_frames_f32__decode_left_side__neon(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
10634     } else
10635 #endif
10636     {
10637         /* Scalar fallback. */
10638 #if 0
10639         drflac_read_pcm_frames_f32__decode_left_side__reference(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
10640 #else
10641         drflac_read_pcm_frames_f32__decode_left_side__scalar(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
10642 #endif
10643     }
10644 }
10645
10646
#if 0
/*
Unoptimized reference implementation of the side/right -> interleaved f32 conversion (compiled out).

pInputSamples0 is treated as the side channel and pInputSamples1 as the right channel. Every sample is shifted up by
unusedBitsPerSample plus the wasted bit count of its subframe, the left channel is rebuilt as (right + side) using wrapping
unsigned arithmetic, and both values are reinterpreted as signed 32-bit and divided by 2^31. The output holds frameCount
L,R pairs.
*/
static DRFLAC_INLINE void drflac_read_pcm_frames_f32__decode_right_side__reference(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, float* pOutputSamples)
{
    const drflac_uint32 sideShift  = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
    const drflac_uint32 rightShift = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
    drflac_uint64 iFrame;

    for (iFrame = 0; iFrame < frameCount; iFrame += 1) {
        float* pFrameOut    = pOutputSamples + iFrame*2;
        drflac_uint32 side  = (drflac_uint32)pInputSamples0[iFrame] << sideShift;
        drflac_uint32 right = (drflac_uint32)pInputSamples1[iFrame] << rightShift;
        drflac_uint32 left  = right + side;

        pFrameOut[0] = (float)((drflac_int32)left  / 2147483648.0);
        pFrameOut[1] = (float)((drflac_int32)right / 2147483648.0);
    }
}
#endif
10661
10662 static DRFLAC_INLINE void drflac_read_pcm_frames_f32__decode_right_side__scalar(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, float* pOutputSamples)
10663 {
10664     drflac_uint64 i;
10665     drflac_uint64 frameCount4 = frameCount >> 2;
10666     const drflac_uint32* pInputSamples0U32 = (const drflac_uint32*)pInputSamples0;
10667     const drflac_uint32* pInputSamples1U32 = (const drflac_uint32*)pInputSamples1;
10668     drflac_uint32 shift0 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
10669     drflac_uint32 shift1 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
10670     float factor = 1 / 2147483648.0;
10671
10672     for (i = 0; i < frameCount4; ++i) {
10673         drflac_uint32 side0  = pInputSamples0U32[i*4+0] << shift0;
10674         drflac_uint32 side1  = pInputSamples0U32[i*4+1] << shift0;
10675         drflac_uint32 side2  = pInputSamples0U32[i*4+2] << shift0;
10676         drflac_uint32 side3  = pInputSamples0U32[i*4+3] << shift0;
10677
10678         drflac_uint32 right0 = pInputSamples1U32[i*4+0] << shift1;
10679         drflac_uint32 right1 = pInputSamples1U32[i*4+1] << shift1;
10680         drflac_uint32 right2 = pInputSamples1U32[i*4+2] << shift1;
10681         drflac_uint32 right3 = pInputSamples1U32[i*4+3] << shift1;
10682
10683         drflac_uint32 left0 = right0 + side0;
10684         drflac_uint32 left1 = right1 + side1;
10685         drflac_uint32 left2 = right2 + side2;
10686         drflac_uint32 left3 = right3 + side3;
10687
10688         pOutputSamples[i*8+0] = (drflac_int32)left0  * factor;
10689         pOutputSamples[i*8+1] = (drflac_int32)right0 * factor;
10690         pOutputSamples[i*8+2] = (drflac_int32)left1  * factor;
10691         pOutputSamples[i*8+3] = (drflac_int32)right1 * factor;
10692         pOutputSamples[i*8+4] = (drflac_int32)left2  * factor;
10693         pOutputSamples[i*8+5] = (drflac_int32)right2 * factor;
10694         pOutputSamples[i*8+6] = (drflac_int32)left3  * factor;
10695         pOutputSamples[i*8+7] = (drflac_int32)right3 * factor;
10696     }
10697
10698     for (i = (frameCount4 << 2); i < frameCount; ++i) {
10699         drflac_uint32 side  = pInputSamples0U32[i] << shift0;
10700         drflac_uint32 right = pInputSamples1U32[i] << shift1;
10701         drflac_uint32 left  = right + side;
10702
10703         pOutputSamples[i*2+0] = (drflac_int32)left  * factor;
10704         pOutputSamples[i*2+1] = (drflac_int32)right * factor;
10705     }
10706 }
10707
#if defined(DRFLAC_SUPPORT_SSE2)
/*
SSE2 side/right -> interleaved f32 conversion. Asserts that pFlac->bitsPerSample <= 24.

Samples are shifted by (unusedBitsPerSample + wasted bits - 8) rather than the full amount used by the scalar path, and
the result is scaled by 1/2^23 instead of 1/2^31. Four frames are converted per vector iteration; the remaining frames
(fewer than four) are handled one at a time with the same shift and scale.
*/
static DRFLAC_INLINE void drflac_read_pcm_frames_f32__decode_right_side__sse2(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, float* pOutputSamples)
{
    const __m128i* pSideIn128      = (const __m128i*)pInputSamples0;
    const __m128i* pRightIn128     = (const __m128i*)pInputSamples1;
    const drflac_uint32* pSideIn   = (const drflac_uint32*)pInputSamples0;
    const drflac_uint32* pRightIn  = (const drflac_uint32*)pInputSamples1;
    const drflac_uint32 sideShift  = (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample) - 8;
    const drflac_uint32 rightShift = (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample) - 8;
    const drflac_uint64 vectorCount = frameCount >> 2;
    drflac_uint64 iVector;
    drflac_uint64 iFrame;
    __m128 scale128;

    DRFLAC_ASSERT(pFlac->bitsPerSample <= 24);

    scale128 = _mm_set1_ps(1.0f / 8388608.0f);

    for (iVector = 0; iVector < vectorCount; ++iVector) {
        __m128i sideRaw  = _mm_loadu_si128(pSideIn128  + iVector);
        __m128i rightRaw = _mm_loadu_si128(pRightIn128 + iVector);
        __m128i side     = _mm_slli_epi32(sideRaw,  sideShift);
        __m128i right    = _mm_slli_epi32(rightRaw, rightShift);
        __m128i left     = _mm_add_epi32(right, side);
        __m128  leftF    = _mm_mul_ps(_mm_cvtepi32_ps(left),  scale128);
        __m128  rightF   = _mm_mul_ps(_mm_cvtepi32_ps(right), scale128);
        float*  pOut     = pOutputSamples + iVector*8;

        /* Interleave to L0 R0 L1 R1 | L2 R2 L3 R3. */
        _mm_storeu_ps(pOut + 0, _mm_unpacklo_ps(leftF, rightF));
        _mm_storeu_ps(pOut + 4, _mm_unpackhi_ps(leftF, rightF));
    }

    for (iFrame = (vectorCount << 2); iFrame < frameCount; ++iFrame) {
        drflac_uint32 side  = pSideIn[iFrame]  << sideShift;
        drflac_uint32 right = pRightIn[iFrame] << rightShift;
        drflac_uint32 left  = right + side;

        pOutputSamples[iFrame*2+0] = (drflac_int32)left  / 8388608.0f;
        pOutputSamples[iFrame*2+1] = (drflac_int32)right / 8388608.0f;
    }
}
#endif
10744
#if defined(DRFLAC_SUPPORT_NEON)
/*
NEON side/right -> interleaved f32 conversion. Asserts that pFlac->bitsPerSample <= 24.

Samples are shifted by (unusedBitsPerSample + wasted bits - 8) and scaled by 1/2^23. Four frames are converted per vector
iteration; the remaining frames (fewer than four) are handled one at a time with the same shift and scale.
*/
static DRFLAC_INLINE void drflac_read_pcm_frames_f32__decode_right_side__neon(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, float* pOutputSamples)
{
    const drflac_uint32* pSideIn   = (const drflac_uint32*)pInputSamples0;
    const drflac_uint32* pRightIn  = (const drflac_uint32*)pInputSamples1;
    const drflac_uint32 sideShift  = (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample) - 8;
    const drflac_uint32 rightShift = (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample) - 8;
    const drflac_uint64 vectorCount = frameCount >> 2;
    drflac_uint64 iVector;
    drflac_uint64 iFrame;
    float32x4_t scale4;
    int32x4_t sideShift4;
    int32x4_t rightShift4;

    DRFLAC_ASSERT(pFlac->bitsPerSample <= 24);

    scale4      = vdupq_n_f32(1.0f / 8388608.0f);
    sideShift4  = vdupq_n_s32(sideShift);
    rightShift4 = vdupq_n_s32(rightShift);

    for (iVector = 0; iVector < vectorCount; ++iVector) {
        uint32x4_t  side   = vshlq_u32(vld1q_u32(pSideIn  + iVector*4), sideShift4);
        uint32x4_t  right  = vshlq_u32(vld1q_u32(pRightIn + iVector*4), rightShift4);
        uint32x4_t  left   = vaddq_u32(right, side);
        float32x4_t leftF  = vmulq_f32(vcvtq_f32_s32(vreinterpretq_s32_u32(left)),  scale4);
        float32x4_t rightF = vmulq_f32(vcvtq_f32_s32(vreinterpretq_s32_u32(right)), scale4);

        drflac__vst2q_f32(pOutputSamples + iVector*8, vzipq_f32(leftF, rightF));
    }

    for (iFrame = (vectorCount << 2); iFrame < frameCount; ++iFrame) {
        drflac_uint32 side  = pSideIn[iFrame]  << sideShift;
        drflac_uint32 right = pRightIn[iFrame] << rightShift;
        drflac_uint32 left  = right + side;

        pOutputSamples[iFrame*2+0] = (drflac_int32)left  / 8388608.0f;
        pOutputSamples[iFrame*2+1] = (drflac_int32)right / 8388608.0f;
    }
}
#endif
10790
10791 static DRFLAC_INLINE void drflac_read_pcm_frames_f32__decode_right_side(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, float* pOutputSamples)
10792 {
10793 #if defined(DRFLAC_SUPPORT_SSE2)
10794     if (drflac__gIsSSE2Supported && pFlac->bitsPerSample <= 24) {
10795         drflac_read_pcm_frames_f32__decode_right_side__sse2(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
10796     } else
10797 #elif defined(DRFLAC_SUPPORT_NEON)
10798     if (drflac__gIsNEONSupported && pFlac->bitsPerSample <= 24) {
10799         drflac_read_pcm_frames_f32__decode_right_side__neon(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
10800     } else
10801 #endif
10802     {
10803         /* Scalar fallback. */
10804 #if 0
10805         drflac_read_pcm_frames_f32__decode_right_side__reference(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
10806 #else
10807         drflac_read_pcm_frames_f32__decode_right_side__scalar(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
10808 #endif
10809     }
10810 }
10811
10812
#if 0
/*
Unoptimized reference implementation of the mid/side -> interleaved f32 conversion (compiled out).

pInputSamples0 is treated as the mid channel and pInputSamples1 as the side channel, each shifted up by the wasted bit
count of its subframe. mid is then doubled and given the lowest bit of side, after which left = (mid + side) >> 1 and
right = (mid - side) >> 1. Both are shifted up by unusedBitsPerSample and divided by 2^31.
*/
static DRFLAC_INLINE void drflac_read_pcm_frames_f32__decode_mid_side__reference(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, float* pOutputSamples)
{
    drflac_uint64 iFrame;

    for (iFrame = 0; iFrame < frameCount; ++iFrame) {
        drflac_uint32 mid  = (drflac_uint32)pInputSamples0[iFrame] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
        drflac_uint32 side = (drflac_uint32)pInputSamples1[iFrame] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
        drflac_int32 left;
        drflac_int32 right;

        mid = (mid << 1) | (side & 0x01);

        left  = ((drflac_int32)(mid + side) >> 1) << (unusedBitsPerSample);
        right = ((drflac_int32)(mid - side) >> 1) << (unusedBitsPerSample);

        pOutputSamples[iFrame*2+0] = (float)(left  / 2147483648.0);
        pOutputSamples[iFrame*2+1] = (float)(right / 2147483648.0);
    }
}
#endif
10827
10828 static DRFLAC_INLINE void drflac_read_pcm_frames_f32__decode_mid_side__scalar(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, float* pOutputSamples)
10829 {
10830     drflac_uint64 i;
10831     drflac_uint64 frameCount4 = frameCount >> 2;
10832     const drflac_uint32* pInputSamples0U32 = (const drflac_uint32*)pInputSamples0;
10833     const drflac_uint32* pInputSamples1U32 = (const drflac_uint32*)pInputSamples1;
10834     drflac_uint32 shift = unusedBitsPerSample;
10835     float factor = 1 / 2147483648.0;
10836
10837     if (shift > 0) {
10838         shift -= 1;
10839         for (i = 0; i < frameCount4; ++i) {
10840             drflac_uint32 temp0L;
10841             drflac_uint32 temp1L;
10842             drflac_uint32 temp2L;
10843             drflac_uint32 temp3L;
10844             drflac_uint32 temp0R;
10845             drflac_uint32 temp1R;
10846             drflac_uint32 temp2R;
10847             drflac_uint32 temp3R;
10848
10849             drflac_uint32 mid0  = pInputSamples0U32[i*4+0] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
10850             drflac_uint32 mid1  = pInputSamples0U32[i*4+1] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
10851             drflac_uint32 mid2  = pInputSamples0U32[i*4+2] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
10852             drflac_uint32 mid3  = pInputSamples0U32[i*4+3] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
10853
10854             drflac_uint32 side0 = pInputSamples1U32[i*4+0] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
10855             drflac_uint32 side1 = pInputSamples1U32[i*4+1] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
10856             drflac_uint32 side2 = pInputSamples1U32[i*4+2] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
10857             drflac_uint32 side3 = pInputSamples1U32[i*4+3] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
10858
10859             mid0 = (mid0 << 1) | (side0 & 0x01);
10860             mid1 = (mid1 << 1) | (side1 & 0x01);
10861             mid2 = (mid2 << 1) | (side2 & 0x01);
10862             mid3 = (mid3 << 1) | (side3 & 0x01);
10863
10864             temp0L = (mid0 + side0) << shift;
10865             temp1L = (mid1 + side1) << shift;
10866             temp2L = (mid2 + side2) << shift;
10867             temp3L = (mid3 + side3) << shift;
10868
10869             temp0R = (mid0 - side0) << shift;
10870             temp1R = (mid1 - side1) << shift;
10871             temp2R = (mid2 - side2) << shift;
10872             temp3R = (mid3 - side3) << shift;
10873
10874             pOutputSamples[i*8+0] = (drflac_int32)temp0L * factor;
10875             pOutputSamples[i*8+1] = (drflac_int32)temp0R * factor;
10876             pOutputSamples[i*8+2] = (drflac_int32)temp1L * factor;
10877             pOutputSamples[i*8+3] = (drflac_int32)temp1R * factor;
10878             pOutputSamples[i*8+4] = (drflac_int32)temp2L * factor;
10879             pOutputSamples[i*8+5] = (drflac_int32)temp2R * factor;
10880             pOutputSamples[i*8+6] = (drflac_int32)temp3L * factor;
10881             pOutputSamples[i*8+7] = (drflac_int32)temp3R * factor;
10882         }
10883     } else {
10884         for (i = 0; i < frameCount4; ++i) {
10885             drflac_uint32 temp0L;
10886             drflac_uint32 temp1L;
10887             drflac_uint32 temp2L;
10888             drflac_uint32 temp3L;
10889             drflac_uint32 temp0R;
10890             drflac_uint32 temp1R;
10891             drflac_uint32 temp2R;
10892             drflac_uint32 temp3R;
10893
10894             drflac_uint32 mid0  = pInputSamples0U32[i*4+0] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
10895             drflac_uint32 mid1  = pInputSamples0U32[i*4+1] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
10896             drflac_uint32 mid2  = pInputSamples0U32[i*4+2] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
10897             drflac_uint32 mid3  = pInputSamples0U32[i*4+3] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
10898
10899             drflac_uint32 side0 = pInputSamples1U32[i*4+0] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
10900             drflac_uint32 side1 = pInputSamples1U32[i*4+1] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
10901             drflac_uint32 side2 = pInputSamples1U32[i*4+2] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
10902             drflac_uint32 side3 = pInputSamples1U32[i*4+3] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
10903
10904             mid0 = (mid0 << 1) | (side0 & 0x01);
10905             mid1 = (mid1 << 1) | (side1 & 0x01);
10906             mid2 = (mid2 << 1) | (side2 & 0x01);
10907             mid3 = (mid3 << 1) | (side3 & 0x01);
10908
10909             temp0L = (drflac_uint32)((drflac_int32)(mid0 + side0) >> 1);
10910             temp1L = (drflac_uint32)((drflac_int32)(mid1 + side1) >> 1);
10911             temp2L = (drflac_uint32)((drflac_int32)(mid2 + side2) >> 1);
10912             temp3L = (drflac_uint32)((drflac_int32)(mid3 + side3) >> 1);
10913
10914             temp0R = (drflac_uint32)((drflac_int32)(mid0 - side0) >> 1);
10915             temp1R = (drflac_uint32)((drflac_int32)(mid1 - side1) >> 1);
10916             temp2R = (drflac_uint32)((drflac_int32)(mid2 - side2) >> 1);
10917             temp3R = (drflac_uint32)((drflac_int32)(mid3 - side3) >> 1);
10918
10919             pOutputSamples[i*8+0] = (drflac_int32)temp0L * factor;
10920             pOutputSamples[i*8+1] = (drflac_int32)temp0R * factor;
10921             pOutputSamples[i*8+2] = (drflac_int32)temp1L * factor;
10922             pOutputSamples[i*8+3] = (drflac_int32)temp1R * factor;
10923             pOutputSamples[i*8+4] = (drflac_int32)temp2L * factor;
10924             pOutputSamples[i*8+5] = (drflac_int32)temp2R * factor;
10925             pOutputSamples[i*8+6] = (drflac_int32)temp3L * factor;
10926             pOutputSamples[i*8+7] = (drflac_int32)temp3R * factor;
10927         }
10928     }
10929
10930     for (i = (frameCount4 << 2); i < frameCount; ++i) {
10931         drflac_uint32 mid  = pInputSamples0U32[i] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
10932         drflac_uint32 side = pInputSamples1U32[i] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
10933
10934         mid = (mid << 1) | (side & 0x01);
10935
10936         pOutputSamples[i*2+0] = (drflac_int32)((drflac_uint32)((drflac_int32)(mid + side) >> 1) << unusedBitsPerSample) * factor;
10937         pOutputSamples[i*2+1] = (drflac_int32)((drflac_uint32)((drflac_int32)(mid - side) >> 1) << unusedBitsPerSample) * factor;
10938     }
10939 }
10940
#if defined(DRFLAC_SUPPORT_SSE2)
/*
SSE2 mid/side -> interleaved f32 conversion. Asserts that pFlac->bitsPerSample <= 24.

Works on a scale 8 bits lower than the scalar path: the shift up is (unusedBitsPerSample - 8) and the output is scaled by
1/2^23. mid and side are first shifted up by the wasted bit count of their subframes, mid is doubled and given the lowest
bit of side, and left/right are derived from (mid + side) and (mid - side).

Two variants exist. When the shift is zero the sum and difference are halved with an arithmetic right shift. Otherwise the
halving is combined with the shift up into one left shift by (shift - 1). In both variants the frames left over after the
4-wide vector loop are handled one at a time with the same arithmetic.
*/
static DRFLAC_INLINE void drflac_read_pcm_frames_f32__decode_mid_side__sse2(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, float* pOutputSamples)
{
    const __m128i* pMidIn128       = (const __m128i*)pInputSamples0;
    const __m128i* pSideIn128      = (const __m128i*)pInputSamples1;
    const drflac_uint32* pMidIn    = (const drflac_uint32*)pInputSamples0;
    const drflac_uint32* pSideIn   = (const drflac_uint32*)pInputSamples1;
    const drflac_uint32 midWasted  = pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
    const drflac_uint32 sideWasted = pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
    const drflac_uint32 baseShift  = unusedBitsPerSample - 8;
    const drflac_uint64 vectorCount = frameCount >> 2;
    const float scale = 1.0f / 8388608.0f;
    drflac_uint64 iVector;
    drflac_uint64 iFrame;
    __m128 scale128;
    __m128i lowBitMask;

    DRFLAC_ASSERT(pFlac->bitsPerSample <= 24);

    scale128   = _mm_set1_ps(scale);
    lowBitMask = _mm_set1_epi32(0x01);

    if (baseShift == 0) {
        for (iVector = 0; iVector < vectorCount; ++iVector) {
            __m128i mid   = _mm_slli_epi32(_mm_loadu_si128(pMidIn128  + iVector), midWasted);
            __m128i side  = _mm_slli_epi32(_mm_loadu_si128(pSideIn128 + iVector), sideWasted);
            __m128i leftI;
            __m128i rightI;
            __m128  leftF;
            __m128  rightF;
            float*  pOut  = pOutputSamples + iVector*8;

            mid    = _mm_or_si128(_mm_slli_epi32(mid, 1), _mm_and_si128(side, lowBitMask));

            leftI  = _mm_srai_epi32(_mm_add_epi32(mid, side), 1);
            rightI = _mm_srai_epi32(_mm_sub_epi32(mid, side), 1);

            leftF  = _mm_mul_ps(_mm_cvtepi32_ps(leftI),  scale128);
            rightF = _mm_mul_ps(_mm_cvtepi32_ps(rightI), scale128);

            _mm_storeu_ps(pOut + 0, _mm_unpacklo_ps(leftF, rightF));
            _mm_storeu_ps(pOut + 4, _mm_unpackhi_ps(leftF, rightF));
        }

        for (iFrame = (vectorCount << 2); iFrame < frameCount; ++iFrame) {
            drflac_uint32 side = pSideIn[iFrame] << sideWasted;
            drflac_uint32 mid  = ((pMidIn[iFrame] << midWasted) << 1) | (side & 0x01);

            pOutputSamples[iFrame*2+0] = ((drflac_int32)(mid + side) >> 1) * scale;
            pOutputSamples[iFrame*2+1] = ((drflac_int32)(mid - side) >> 1) * scale;
        }
    } else {
        const drflac_uint32 combinedShift = baseShift - 1;

        for (iVector = 0; iVector < vectorCount; ++iVector) {
            __m128i mid   = _mm_slli_epi32(_mm_loadu_si128(pMidIn128  + iVector), midWasted);
            __m128i side  = _mm_slli_epi32(_mm_loadu_si128(pSideIn128 + iVector), sideWasted);
            __m128i leftI;
            __m128i rightI;
            __m128  leftF;
            __m128  rightF;
            float*  pOut  = pOutputSamples + iVector*8;

            mid    = _mm_or_si128(_mm_slli_epi32(mid, 1), _mm_and_si128(side, lowBitMask));

            leftI  = _mm_slli_epi32(_mm_add_epi32(mid, side), combinedShift);
            rightI = _mm_slli_epi32(_mm_sub_epi32(mid, side), combinedShift);

            leftF  = _mm_mul_ps(_mm_cvtepi32_ps(leftI),  scale128);
            rightF = _mm_mul_ps(_mm_cvtepi32_ps(rightI), scale128);

            _mm_storeu_ps(pOut + 0, _mm_unpacklo_ps(leftF, rightF));
            _mm_storeu_ps(pOut + 4, _mm_unpackhi_ps(leftF, rightF));
        }

        for (iFrame = (vectorCount << 2); iFrame < frameCount; ++iFrame) {
            drflac_uint32 side = pSideIn[iFrame] << sideWasted;
            drflac_uint32 mid  = ((pMidIn[iFrame] << midWasted) << 1) | (side & 0x01);

            pOutputSamples[iFrame*2+0] = (drflac_int32)((mid + side) << combinedShift) * scale;
            pOutputSamples[iFrame*2+1] = (drflac_int32)((mid - side) << combinedShift) * scale;
        }
    }
}
#endif
11027
11028 #if defined(DRFLAC_SUPPORT_NEON)
11029 static DRFLAC_INLINE void drflac_read_pcm_frames_f32__decode_mid_side__neon(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, float* pOutputSamples)
11030 {
11031     drflac_uint64 i;
11032     drflac_uint64 frameCount4 = frameCount >> 2;
11033     const drflac_uint32* pInputSamples0U32 = (const drflac_uint32*)pInputSamples0;
11034     const drflac_uint32* pInputSamples1U32 = (const drflac_uint32*)pInputSamples1;
11035     drflac_uint32 shift = unusedBitsPerSample - 8;
11036     float factor;
11037     float32x4_t factor4;
11038     int32x4_t shift4;
11039     int32x4_t wbps0_4;  /* Wasted Bits Per Sample */
11040     int32x4_t wbps1_4;  /* Wasted Bits Per Sample */
11041
11042     DRFLAC_ASSERT(pFlac->bitsPerSample <= 24);
11043
11044     factor  = 1.0f / 8388608.0f;
11045     factor4 = vdupq_n_f32(factor);
11046     wbps0_4 = vdupq_n_s32(pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample);
11047     wbps1_4 = vdupq_n_s32(pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample);
11048
11049     if (shift == 0) {
11050         for (i = 0; i < frameCount4; ++i) {
11051             int32x4_t lefti;
11052             int32x4_t righti;
11053             float32x4_t leftf;
11054             float32x4_t rightf;
11055
11056             uint32x4_t mid  = vshlq_u32(vld1q_u32(pInputSamples0U32 + i*4), wbps0_4);
11057             uint32x4_t side = vshlq_u32(vld1q_u32(pInputSamples1U32 + i*4), wbps1_4);
11058
11059             mid    = vorrq_u32(vshlq_n_u32(mid, 1), vandq_u32(side, vdupq_n_u32(1)));
11060
11061             lefti  = vshrq_n_s32(vreinterpretq_s32_u32(vaddq_u32(mid, side)), 1);
11062             righti = vshrq_n_s32(vreinterpretq_s32_u32(vsubq_u32(mid, side)), 1);
11063
11064             leftf  = vmulq_f32(vcvtq_f32_s32(lefti),  factor4);
11065             rightf = vmulq_f32(vcvtq_f32_s32(righti), factor4);
11066
11067             drflac__vst2q_f32(pOutputSamples + i*8, vzipq_f32(leftf, rightf));
11068         }
11069
11070         for (i = (frameCount4 << 2); i < frameCount; ++i) {
11071             drflac_uint32 mid  = pInputSamples0U32[i] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
11072             drflac_uint32 side = pInputSamples1U32[i] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
11073
11074             mid = (mid << 1) | (side & 0x01);
11075
11076             pOutputSamples[i*2+0] = ((drflac_int32)(mid + side) >> 1) * factor;
11077             pOutputSamples[i*2+1] = ((drflac_int32)(mid - side) >> 1) * factor;
11078         }
11079     } else {
11080         shift -= 1;
11081         shift4 = vdupq_n_s32(shift);
11082         for (i = 0; i < frameCount4; ++i) {
11083             uint32x4_t mid;
11084             uint32x4_t side;
11085             int32x4_t lefti;
11086             int32x4_t righti;
11087             float32x4_t leftf;
11088             float32x4_t rightf;
11089
11090             mid    = vshlq_u32(vld1q_u32(pInputSamples0U32 + i*4), wbps0_4);
11091             side   = vshlq_u32(vld1q_u32(pInputSamples1U32 + i*4), wbps1_4);
11092
11093             mid    = vorrq_u32(vshlq_n_u32(mid, 1), vandq_u32(side, vdupq_n_u32(1)));
11094
11095             lefti  = vreinterpretq_s32_u32(vshlq_u32(vaddq_u32(mid, side), shift4));
11096             righti = vreinterpretq_s32_u32(vshlq_u32(vsubq_u32(mid, side), shift4));
11097
11098             leftf  = vmulq_f32(vcvtq_f32_s32(lefti),  factor4);
11099             rightf = vmulq_f32(vcvtq_f32_s32(righti), factor4);
11100
11101             drflac__vst2q_f32(pOutputSamples + i*8, vzipq_f32(leftf, rightf));
11102         }
11103
11104         for (i = (frameCount4 << 2); i < frameCount; ++i) {
11105             drflac_uint32 mid  = pInputSamples0U32[i] << pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
11106             drflac_uint32 side = pInputSamples1U32[i] << pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
11107
11108             mid = (mid << 1) | (side & 0x01);
11109
11110             pOutputSamples[i*2+0] = (drflac_int32)((mid + side) << shift) * factor;
11111             pOutputSamples[i*2+1] = (drflac_int32)((mid - side) << shift) * factor;
11112         }
11113     }
11114 }
11115 #endif
11116
11117 static DRFLAC_INLINE void drflac_read_pcm_frames_f32__decode_mid_side(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, float* pOutputSamples)
11118 {
11119 #if defined(DRFLAC_SUPPORT_SSE2)
11120     if (drflac__gIsSSE2Supported && pFlac->bitsPerSample <= 24) {
11121         drflac_read_pcm_frames_f32__decode_mid_side__sse2(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
11122     } else
11123 #elif defined(DRFLAC_SUPPORT_NEON)
11124     if (drflac__gIsNEONSupported && pFlac->bitsPerSample <= 24) {
11125         drflac_read_pcm_frames_f32__decode_mid_side__neon(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
11126     } else
11127 #endif
11128     {
11129         /* Scalar fallback. */
11130 #if 0
11131         drflac_read_pcm_frames_f32__decode_mid_side__reference(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
11132 #else
11133         drflac_read_pcm_frames_f32__decode_mid_side__scalar(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
11134 #endif
11135     }
11136 }
11137
11138 #if 0
11139 static DRFLAC_INLINE void drflac_read_pcm_frames_f32__decode_independent_stereo__reference(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, float* pOutputSamples)
11140 {
11141     for (drflac_uint64 i = 0; i < frameCount; ++i) {
11142         pOutputSamples[i*2+0] = (float)((drflac_int32)((drflac_uint32)pInputSamples0[i] << (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample)) / 2147483648.0);
11143         pOutputSamples[i*2+1] = (float)((drflac_int32)((drflac_uint32)pInputSamples1[i] << (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample)) / 2147483648.0);
11144     }
11145 }
11146 #endif
11147
11148 static DRFLAC_INLINE void drflac_read_pcm_frames_f32__decode_independent_stereo__scalar(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, float* pOutputSamples)
11149 {
11150     drflac_uint64 i;
11151     drflac_uint64 frameCount4 = frameCount >> 2;
11152     const drflac_uint32* pInputSamples0U32 = (const drflac_uint32*)pInputSamples0;
11153     const drflac_uint32* pInputSamples1U32 = (const drflac_uint32*)pInputSamples1;
11154     drflac_uint32 shift0 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample;
11155     drflac_uint32 shift1 = unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample;
11156     float factor = 1 / 2147483648.0;
11157
11158     for (i = 0; i < frameCount4; ++i) {
11159         drflac_uint32 tempL0 = pInputSamples0U32[i*4+0] << shift0;
11160         drflac_uint32 tempL1 = pInputSamples0U32[i*4+1] << shift0;
11161         drflac_uint32 tempL2 = pInputSamples0U32[i*4+2] << shift0;
11162         drflac_uint32 tempL3 = pInputSamples0U32[i*4+3] << shift0;
11163
11164         drflac_uint32 tempR0 = pInputSamples1U32[i*4+0] << shift1;
11165         drflac_uint32 tempR1 = pInputSamples1U32[i*4+1] << shift1;
11166         drflac_uint32 tempR2 = pInputSamples1U32[i*4+2] << shift1;
11167         drflac_uint32 tempR3 = pInputSamples1U32[i*4+3] << shift1;
11168
11169         pOutputSamples[i*8+0] = (drflac_int32)tempL0 * factor;
11170         pOutputSamples[i*8+1] = (drflac_int32)tempR0 * factor;
11171         pOutputSamples[i*8+2] = (drflac_int32)tempL1 * factor;
11172         pOutputSamples[i*8+3] = (drflac_int32)tempR1 * factor;
11173         pOutputSamples[i*8+4] = (drflac_int32)tempL2 * factor;
11174         pOutputSamples[i*8+5] = (drflac_int32)tempR2 * factor;
11175         pOutputSamples[i*8+6] = (drflac_int32)tempL3 * factor;
11176         pOutputSamples[i*8+7] = (drflac_int32)tempR3 * factor;
11177     }
11178
11179     for (i = (frameCount4 << 2); i < frameCount; ++i) {
11180         pOutputSamples[i*2+0] = (drflac_int32)(pInputSamples0U32[i] << shift0) * factor;
11181         pOutputSamples[i*2+1] = (drflac_int32)(pInputSamples1U32[i] << shift1) * factor;
11182     }
11183 }
11184
11185 #if defined(DRFLAC_SUPPORT_SSE2)
11186 static DRFLAC_INLINE void drflac_read_pcm_frames_f32__decode_independent_stereo__sse2(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, float* pOutputSamples)
11187 {
11188     drflac_uint64 i;
11189     drflac_uint64 frameCount4 = frameCount >> 2;
11190     const drflac_uint32* pInputSamples0U32 = (const drflac_uint32*)pInputSamples0;
11191     const drflac_uint32* pInputSamples1U32 = (const drflac_uint32*)pInputSamples1;
11192     drflac_uint32 shift0 = (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample) - 8;
11193     drflac_uint32 shift1 = (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample) - 8;
11194
11195     float factor = 1.0f / 8388608.0f;
11196     __m128 factor128 = _mm_set1_ps(factor);
11197
11198     for (i = 0; i < frameCount4; ++i) {
11199         __m128i lefti;
11200         __m128i righti;
11201         __m128 leftf;
11202         __m128 rightf;
11203
11204         lefti  = _mm_slli_epi32(_mm_loadu_si128((const __m128i*)pInputSamples0 + i), shift0);
11205         righti = _mm_slli_epi32(_mm_loadu_si128((const __m128i*)pInputSamples1 + i), shift1);
11206
11207         leftf  = _mm_mul_ps(_mm_cvtepi32_ps(lefti),  factor128);
11208         rightf = _mm_mul_ps(_mm_cvtepi32_ps(righti), factor128);
11209
11210         _mm_storeu_ps(pOutputSamples + i*8 + 0, _mm_unpacklo_ps(leftf, rightf));
11211         _mm_storeu_ps(pOutputSamples + i*8 + 4, _mm_unpackhi_ps(leftf, rightf));
11212     }
11213
11214     for (i = (frameCount4 << 2); i < frameCount; ++i) {
11215         pOutputSamples[i*2+0] = (drflac_int32)(pInputSamples0U32[i] << shift0) * factor;
11216         pOutputSamples[i*2+1] = (drflac_int32)(pInputSamples1U32[i] << shift1) * factor;
11217     }
11218 }
11219 #endif
11220
11221 #if defined(DRFLAC_SUPPORT_NEON)
11222 static DRFLAC_INLINE void drflac_read_pcm_frames_f32__decode_independent_stereo__neon(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, float* pOutputSamples)
11223 {
11224     drflac_uint64 i;
11225     drflac_uint64 frameCount4 = frameCount >> 2;
11226     const drflac_uint32* pInputSamples0U32 = (const drflac_uint32*)pInputSamples0;
11227     const drflac_uint32* pInputSamples1U32 = (const drflac_uint32*)pInputSamples1;
11228     drflac_uint32 shift0 = (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[0].wastedBitsPerSample) - 8;
11229     drflac_uint32 shift1 = (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[1].wastedBitsPerSample) - 8;
11230
11231     float factor = 1.0f / 8388608.0f;
11232     float32x4_t factor4 = vdupq_n_f32(factor);
11233     int32x4_t shift0_4  = vdupq_n_s32(shift0);
11234     int32x4_t shift1_4  = vdupq_n_s32(shift1);
11235
11236     for (i = 0; i < frameCount4; ++i) {
11237         int32x4_t lefti;
11238         int32x4_t righti;
11239         float32x4_t leftf;
11240         float32x4_t rightf;
11241
11242         lefti  = vreinterpretq_s32_u32(vshlq_u32(vld1q_u32(pInputSamples0U32 + i*4), shift0_4));
11243         righti = vreinterpretq_s32_u32(vshlq_u32(vld1q_u32(pInputSamples1U32 + i*4), shift1_4));
11244
11245         leftf  = vmulq_f32(vcvtq_f32_s32(lefti),  factor4);
11246         rightf = vmulq_f32(vcvtq_f32_s32(righti), factor4);
11247
11248         drflac__vst2q_f32(pOutputSamples + i*8, vzipq_f32(leftf, rightf));
11249     }
11250
11251     for (i = (frameCount4 << 2); i < frameCount; ++i) {
11252         pOutputSamples[i*2+0] = (drflac_int32)(pInputSamples0U32[i] << shift0) * factor;
11253         pOutputSamples[i*2+1] = (drflac_int32)(pInputSamples1U32[i] << shift1) * factor;
11254     }
11255 }
11256 #endif
11257
11258 static DRFLAC_INLINE void drflac_read_pcm_frames_f32__decode_independent_stereo(drflac* pFlac, drflac_uint64 frameCount, drflac_uint32 unusedBitsPerSample, const drflac_int32* pInputSamples0, const drflac_int32* pInputSamples1, float* pOutputSamples)
11259 {
11260 #if defined(DRFLAC_SUPPORT_SSE2)
11261     if (drflac__gIsSSE2Supported && pFlac->bitsPerSample <= 24) {
11262         drflac_read_pcm_frames_f32__decode_independent_stereo__sse2(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
11263     } else
11264 #elif defined(DRFLAC_SUPPORT_NEON)
11265     if (drflac__gIsNEONSupported && pFlac->bitsPerSample <= 24) {
11266         drflac_read_pcm_frames_f32__decode_independent_stereo__neon(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
11267     } else
11268 #endif
11269     {
11270         /* Scalar fallback. */
11271 #if 0
11272         drflac_read_pcm_frames_f32__decode_independent_stereo__reference(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
11273 #else
11274         drflac_read_pcm_frames_f32__decode_independent_stereo__scalar(pFlac, frameCount, unusedBitsPerSample, pInputSamples0, pInputSamples1, pOutputSamples);
11275 #endif
11276     }
11277 }
11278
11279 DRFLAC_API drflac_uint64 drflac_read_pcm_frames_f32(drflac* pFlac, drflac_uint64 framesToRead, float* pBufferOut)
11280 {
11281     drflac_uint64 framesRead;
11282     drflac_uint32 unusedBitsPerSample;
11283
11284     if (pFlac == NULL || framesToRead == 0) {
11285         return 0;
11286     }
11287
11288     if (pBufferOut == NULL) {
11289         return drflac__seek_forward_by_pcm_frames(pFlac, framesToRead);
11290     }
11291
11292     DRFLAC_ASSERT(pFlac->bitsPerSample <= 32);
11293     unusedBitsPerSample = 32 - pFlac->bitsPerSample;
11294
11295     framesRead = 0;
11296     while (framesToRead > 0) {
11297         /* If we've run out of samples in this frame, go to the next. */
11298         if (pFlac->currentFLACFrame.pcmFramesRemaining == 0) {
11299             if (!drflac__read_and_decode_next_flac_frame(pFlac)) {
11300                 break;  /* Couldn't read the next frame, so just break from the loop and return. */
11301             }
11302         } else {
11303             unsigned int channelCount = drflac__get_channel_count_from_channel_assignment(pFlac->currentFLACFrame.header.channelAssignment);
11304             drflac_uint64 iFirstPCMFrame = pFlac->currentFLACFrame.header.blockSizeInPCMFrames - pFlac->currentFLACFrame.pcmFramesRemaining;
11305             drflac_uint64 frameCountThisIteration = framesToRead;
11306
11307             if (frameCountThisIteration > pFlac->currentFLACFrame.pcmFramesRemaining) {
11308                 frameCountThisIteration = pFlac->currentFLACFrame.pcmFramesRemaining;
11309             }
11310
11311             if (channelCount == 2) {
11312                 const drflac_int32* pDecodedSamples0 = pFlac->currentFLACFrame.subframes[0].pSamplesS32 + iFirstPCMFrame;
11313                 const drflac_int32* pDecodedSamples1 = pFlac->currentFLACFrame.subframes[1].pSamplesS32 + iFirstPCMFrame;
11314
11315                 switch (pFlac->currentFLACFrame.header.channelAssignment)
11316                 {
11317                     case DRFLAC_CHANNEL_ASSIGNMENT_LEFT_SIDE:
11318                     {
11319                         drflac_read_pcm_frames_f32__decode_left_side(pFlac, frameCountThisIteration, unusedBitsPerSample, pDecodedSamples0, pDecodedSamples1, pBufferOut);
11320                     } break;
11321
11322                     case DRFLAC_CHANNEL_ASSIGNMENT_RIGHT_SIDE:
11323                     {
11324                         drflac_read_pcm_frames_f32__decode_right_side(pFlac, frameCountThisIteration, unusedBitsPerSample, pDecodedSamples0, pDecodedSamples1, pBufferOut);
11325                     } break;
11326
11327                     case DRFLAC_CHANNEL_ASSIGNMENT_MID_SIDE:
11328                     {
11329                         drflac_read_pcm_frames_f32__decode_mid_side(pFlac, frameCountThisIteration, unusedBitsPerSample, pDecodedSamples0, pDecodedSamples1, pBufferOut);
11330                     } break;
11331
11332                     case DRFLAC_CHANNEL_ASSIGNMENT_INDEPENDENT:
11333                     default:
11334                     {
11335                         drflac_read_pcm_frames_f32__decode_independent_stereo(pFlac, frameCountThisIteration, unusedBitsPerSample, pDecodedSamples0, pDecodedSamples1, pBufferOut);
11336                     } break;
11337                 }
11338             } else {
11339                 /* Generic interleaving. */
11340                 drflac_uint64 i;
11341                 for (i = 0; i < frameCountThisIteration; ++i) {
11342                     unsigned int j;
11343                     for (j = 0; j < channelCount; ++j) {
11344                         drflac_int32 sampleS32 = (drflac_int32)((drflac_uint32)(pFlac->currentFLACFrame.subframes[j].pSamplesS32[iFirstPCMFrame + i]) << (unusedBitsPerSample + pFlac->currentFLACFrame.subframes[j].wastedBitsPerSample));
11345                         pBufferOut[(i*channelCount)+j] = (float)(sampleS32 / 2147483648.0);
11346                     }
11347                 }
11348             }
11349
11350             framesRead                += frameCountThisIteration;
11351             pBufferOut                += frameCountThisIteration * channelCount;
11352             framesToRead              -= frameCountThisIteration;
11353             pFlac->currentPCMFrame    += frameCountThisIteration;
11354             pFlac->currentFLACFrame.pcmFramesRemaining -= (unsigned int)frameCountThisIteration;
11355         }
11356     }
11357
11358     return framesRead;
11359 }
11360
11361
11362 DRFLAC_API drflac_bool32 drflac_seek_to_pcm_frame(drflac* pFlac, drflac_uint64 pcmFrameIndex)
11363 {
11364     if (pFlac == NULL) {
11365         return DRFLAC_FALSE;
11366     }
11367
11368     /* Don't do anything if we're already on the seek point. */
11369     if (pFlac->currentPCMFrame == pcmFrameIndex) {
11370         return DRFLAC_TRUE;
11371     }
11372
11373     /*
11374     If we don't know where the first frame begins then we can't seek. This will happen when the STREAMINFO block was not present
11375     when the decoder was opened.
11376     */
11377     if (pFlac->firstFLACFramePosInBytes == 0) {
11378         return DRFLAC_FALSE;
11379     }
11380
11381     if (pcmFrameIndex == 0) {
11382         pFlac->currentPCMFrame = 0;
11383         return drflac__seek_to_first_frame(pFlac);
11384     } else {
11385         drflac_bool32 wasSuccessful = DRFLAC_FALSE;
11386         drflac_uint64 originalPCMFrame = pFlac->currentPCMFrame;
11387
11388         /* Clamp the sample to the end. */
11389         if (pcmFrameIndex > pFlac->totalPCMFrameCount) {
11390             pcmFrameIndex = pFlac->totalPCMFrameCount;
11391         }
11392
11393         /* If the target sample and the current sample are in the same frame we just move the position forward. */
11394         if (pcmFrameIndex > pFlac->currentPCMFrame) {
11395             /* Forward. */
11396             drflac_uint32 offset = (drflac_uint32)(pcmFrameIndex - pFlac->currentPCMFrame);
11397             if (pFlac->currentFLACFrame.pcmFramesRemaining >  offset) {
11398                 pFlac->currentFLACFrame.pcmFramesRemaining -= offset;
11399                 pFlac->currentPCMFrame = pcmFrameIndex;
11400                 return DRFLAC_TRUE;
11401             }
11402         } else {
11403             /* Backward. */
11404             drflac_uint32 offsetAbs = (drflac_uint32)(pFlac->currentPCMFrame - pcmFrameIndex);
11405             drflac_uint32 currentFLACFramePCMFrameCount = pFlac->currentFLACFrame.header.blockSizeInPCMFrames;
11406             drflac_uint32 currentFLACFramePCMFramesConsumed = currentFLACFramePCMFrameCount - pFlac->currentFLACFrame.pcmFramesRemaining;
11407             if (currentFLACFramePCMFramesConsumed > offsetAbs) {
11408                 pFlac->currentFLACFrame.pcmFramesRemaining += offsetAbs;
11409                 pFlac->currentPCMFrame = pcmFrameIndex;
11410                 return DRFLAC_TRUE;
11411             }
11412         }
11413
11414         /*
11415         Different techniques depending on encapsulation. Using the native FLAC seektable with Ogg encapsulation is a bit awkward so
11416         we'll instead use Ogg's natural seeking facility.
11417         */
11418 #ifndef DR_FLAC_NO_OGG
11419         if (pFlac->container == drflac_container_ogg)
11420         {
11421             wasSuccessful = drflac_ogg__seek_to_pcm_frame(pFlac, pcmFrameIndex);
11422         }
11423         else
11424 #endif
11425         {
11426             /* First try seeking via the seek table. If this fails, fall back to a brute force seek which is much slower. */
11427             if (/*!wasSuccessful && */!pFlac->_noSeekTableSeek) {
11428                 wasSuccessful = drflac__seek_to_pcm_frame__seek_table(pFlac, pcmFrameIndex);
11429             }
11430
11431 #if !defined(DR_FLAC_NO_CRC)
11432             /* Fall back to binary search if seek table seeking fails. This requires the length of the stream to be known. */
11433             if (!wasSuccessful && !pFlac->_noBinarySearchSeek && pFlac->totalPCMFrameCount > 0) {
11434                 wasSuccessful = drflac__seek_to_pcm_frame__binary_search(pFlac, pcmFrameIndex);
11435             }
11436 #endif
11437
11438             /* Fall back to brute force if all else fails. */
11439             if (!wasSuccessful && !pFlac->_noBruteForceSeek) {
11440                 wasSuccessful = drflac__seek_to_pcm_frame__brute_force(pFlac, pcmFrameIndex);
11441             }
11442         }
11443
11444         if (wasSuccessful) {
11445             pFlac->currentPCMFrame = pcmFrameIndex;
11446         } else {
11447             /* Seek failed. Try putting the decoder back to it's original state. */
11448             if (drflac_seek_to_pcm_frame(pFlac, originalPCMFrame) == DRFLAC_FALSE) {
11449                 /* Failed to seek back to the original PCM frame. Fall back to 0. */
11450                 drflac_seek_to_pcm_frame(pFlac, 0);
11451             }
11452         }
11453
11454         return wasSuccessful;
11455     }
11456 }
11457
11458
11459
11460 /* High Level APIs */
11461
/*
DRFLAC_SIZE_MAX is the standard SIZE_MAX when that macro is available. Otherwise it falls back to an all-ones constant which is
64 bits wide when DRFLAC_64BIT is defined and 32 bits wide when it is not.
*/
#if defined(SIZE_MAX)
    #define DRFLAC_SIZE_MAX  SIZE_MAX
#elif defined(DRFLAC_64BIT)
    #define DRFLAC_SIZE_MAX  ((drflac_uint64)0xFFFFFFFFFFFFFFFF)
#else
    #define DRFLAC_SIZE_MAX  0xFFFFFFFF
#endif
11471
11472
/* Using a macro as the definition of the drflac__full_read_and_close_*() API family. Sue me. */
11474 #define DRFLAC_DEFINE_FULL_READ_AND_CLOSE(extension, type) \
11475 static type* drflac__full_read_and_close_ ## extension (drflac* pFlac, unsigned int* channelsOut, unsigned int* sampleRateOut, drflac_uint64* totalPCMFrameCountOut)\
11476 {                                                                                                                                                                   \
11477     type* pSampleData = NULL;                                                                                                                                       \
11478     drflac_uint64 totalPCMFrameCount;                                                                                                                               \
11479                                                                                                                                                                     \
11480     DRFLAC_ASSERT(pFlac != NULL);                                                                                                                                   \
11481                                                                                                                                                                     \
11482     totalPCMFrameCount = pFlac->totalPCMFrameCount;                                                                                                                 \
11483                                                                                                                                                                     \
11484     if (totalPCMFrameCount == 0) {                                                                                                                                  \
11485         type buffer[4096];                                                                                                                                          \
11486         drflac_uint64 pcmFramesRead;                                                                                                                                \
11487         size_t sampleDataBufferSize = sizeof(buffer);                                                                                                               \
11488                                                                                                                                                                     \
11489         pSampleData = (type*)drflac__malloc_from_callbacks(sampleDataBufferSize, &pFlac->allocationCallbacks);                                                      \
11490         if (pSampleData == NULL) {                                                                                                                                  \
11491             goto on_error;                                                                                                                                          \
11492         }                                                                                                                                                           \
11493                                                                                                                                                                     \
11494         while ((pcmFramesRead = (drflac_uint64)drflac_read_pcm_frames_##extension(pFlac, sizeof(buffer)/sizeof(buffer[0])/pFlac->channels, buffer)) > 0) {          \
11495             if (((totalPCMFrameCount + pcmFramesRead) * pFlac->channels * sizeof(type)) > sampleDataBufferSize) {                                                   \
11496                 type* pNewSampleData;                                                                                                                               \
11497                 size_t newSampleDataBufferSize;                                                                                                                     \
11498                                                                                                                                                                     \
11499                 newSampleDataBufferSize = sampleDataBufferSize * 2;                                                                                                 \
11500                 pNewSampleData = (type*)drflac__realloc_from_callbacks(pSampleData, newSampleDataBufferSize, sampleDataBufferSize, &pFlac->allocationCallbacks);    \
11501                 if (pNewSampleData == NULL) {                                                                                                                       \
11502                     drflac__free_from_callbacks(pSampleData, &pFlac->allocationCallbacks);                                                                          \
11503                     goto on_error;                                                                                                                                  \
11504                 }                                                                                                                                                   \
11505                                                                                                                                                                     \
11506                 sampleDataBufferSize = newSampleDataBufferSize;                                                                                                     \
11507                 pSampleData = pNewSampleData;                                                                                                                       \
11508             }                                                                                                                                                       \
11509                                                                                                                                                                     \
11510             DRFLAC_COPY_MEMORY(pSampleData + (totalPCMFrameCount*pFlac->channels), buffer, (size_t)(pcmFramesRead*pFlac->channels*sizeof(type)));                   \
11511             totalPCMFrameCount += pcmFramesRead;                                                                                                                    \
11512         }                                                                                                                                                           \
11513                                                                                                                                                                     \
11514         /* At this point everything should be decoded, but we just want to fill the unused part buffer with silence - need to                                       \
11515            protect those ears from random noise! */                                                                                                                 \
11516         DRFLAC_ZERO_MEMORY(pSampleData + (totalPCMFrameCount*pFlac->channels), (size_t)(sampleDataBufferSize - totalPCMFrameCount*pFlac->channels*sizeof(type)));   \
11517     } else {                                                                                                                                                        \
11518         drflac_uint64 dataSize = totalPCMFrameCount*pFlac->channels*sizeof(type);                                                                                   \
11519         if (dataSize > (drflac_uint64)DRFLAC_SIZE_MAX) {                                                                                                            \
11520             goto on_error;  /* The decoded data is too big. */                                                                                                      \
11521         }                                                                                                                                                           \
11522                                                                                                                                                                     \
11523         pSampleData = (type*)drflac__malloc_from_callbacks((size_t)dataSize, &pFlac->allocationCallbacks);    /* <-- Safe cast as per the check above. */           \
11524         if (pSampleData == NULL) {                                                                                                                                  \
11525             goto on_error;                                                                                                                                          \
11526         }                                                                                                                                                           \
11527                                                                                                                                                                     \
11528         totalPCMFrameCount = drflac_read_pcm_frames_##extension(pFlac, pFlac->totalPCMFrameCount, pSampleData);                                                     \
11529     }                                                                                                                                                               \
11530                                                                                                                                                                     \
11531     if (sampleRateOut) *sampleRateOut = pFlac->sampleRate;                                                                                                          \
11532     if (channelsOut) *channelsOut = pFlac->channels;                                                                                                                \
11533     if (totalPCMFrameCountOut) *totalPCMFrameCountOut = totalPCMFrameCount;                                                                                         \
11534                                                                                                                                                                     \
11535     drflac_close(pFlac);                                                                                                                                            \
11536     return pSampleData;                                                                                                                                             \
11537                                                                                                                                                                     \
11538 on_error:                                                                                                                                                           \
11539     drflac_close(pFlac);                                                                                                                                            \
11540     return NULL;                                                                                                                                                    \
11541 }
11542
/*
Expand the full-read-and-close helper once per supported output sample format. The first argument is the suffix pasted onto
drflac_read_pcm_frames_ inside the macro, the second is the C type of one output sample.

NOTE(review): the macro head is outside this part of the file; the expansions presumably define the
drflac__full_read_and_close_s32/s16/f32 helpers called by the functions below - confirm against the macro definition.
*/
DRFLAC_DEFINE_FULL_READ_AND_CLOSE(s32, drflac_int32)
DRFLAC_DEFINE_FULL_READ_AND_CLOSE(s16, drflac_int16)
DRFLAC_DEFINE_FULL_READ_AND_CLOSE(f32, float)
11546
11547 DRFLAC_API drflac_int32* drflac_open_and_read_pcm_frames_s32(drflac_read_proc onRead, drflac_seek_proc onSeek, void* pUserData, unsigned int* channelsOut, unsigned int* sampleRateOut, drflac_uint64* totalPCMFrameCountOut, const drflac_allocation_callbacks* pAllocationCallbacks)
11548 {
11549     drflac* pFlac;
11550
11551     if (channelsOut) {
11552         *channelsOut = 0;
11553     }
11554     if (sampleRateOut) {
11555         *sampleRateOut = 0;
11556     }
11557     if (totalPCMFrameCountOut) {
11558         *totalPCMFrameCountOut = 0;
11559     }
11560
11561     pFlac = drflac_open(onRead, onSeek, pUserData, pAllocationCallbacks);
11562     if (pFlac == NULL) {
11563         return NULL;
11564     }
11565
11566     return drflac__full_read_and_close_s32(pFlac, channelsOut, sampleRateOut, totalPCMFrameCountOut);
11567 }
11568
11569 DRFLAC_API drflac_int16* drflac_open_and_read_pcm_frames_s16(drflac_read_proc onRead, drflac_seek_proc onSeek, void* pUserData, unsigned int* channelsOut, unsigned int* sampleRateOut, drflac_uint64* totalPCMFrameCountOut, const drflac_allocation_callbacks* pAllocationCallbacks)
11570 {
11571     drflac* pFlac;
11572
11573     if (channelsOut) {
11574         *channelsOut = 0;
11575     }
11576     if (sampleRateOut) {
11577         *sampleRateOut = 0;
11578     }
11579     if (totalPCMFrameCountOut) {
11580         *totalPCMFrameCountOut = 0;
11581     }
11582
11583     pFlac = drflac_open(onRead, onSeek, pUserData, pAllocationCallbacks);
11584     if (pFlac == NULL) {
11585         return NULL;
11586     }
11587
11588     return drflac__full_read_and_close_s16(pFlac, channelsOut, sampleRateOut, totalPCMFrameCountOut);
11589 }
11590
11591 DRFLAC_API float* drflac_open_and_read_pcm_frames_f32(drflac_read_proc onRead, drflac_seek_proc onSeek, void* pUserData, unsigned int* channelsOut, unsigned int* sampleRateOut, drflac_uint64* totalPCMFrameCountOut, const drflac_allocation_callbacks* pAllocationCallbacks)
11592 {
11593     drflac* pFlac;
11594
11595     if (channelsOut) {
11596         *channelsOut = 0;
11597     }
11598     if (sampleRateOut) {
11599         *sampleRateOut = 0;
11600     }
11601     if (totalPCMFrameCountOut) {
11602         *totalPCMFrameCountOut = 0;
11603     }
11604
11605     pFlac = drflac_open(onRead, onSeek, pUserData, pAllocationCallbacks);
11606     if (pFlac == NULL) {
11607         return NULL;
11608     }
11609
11610     return drflac__full_read_and_close_f32(pFlac, channelsOut, sampleRateOut, totalPCMFrameCountOut);
11611 }
11612
11613 #ifndef DR_FLAC_NO_STDIO
11614 DRFLAC_API drflac_int32* drflac_open_file_and_read_pcm_frames_s32(const char* filename, unsigned int* channels, unsigned int* sampleRate, drflac_uint64* totalPCMFrameCount, const drflac_allocation_callbacks* pAllocationCallbacks)
11615 {
11616     drflac* pFlac;
11617
11618     if (sampleRate) {
11619         *sampleRate = 0;
11620     }
11621     if (channels) {
11622         *channels = 0;
11623     }
11624     if (totalPCMFrameCount) {
11625         *totalPCMFrameCount = 0;
11626     }
11627
11628     pFlac = drflac_open_file(filename, pAllocationCallbacks);
11629     if (pFlac == NULL) {
11630         return NULL;
11631     }
11632
11633     return drflac__full_read_and_close_s32(pFlac, channels, sampleRate, totalPCMFrameCount);
11634 }
11635
11636 DRFLAC_API drflac_int16* drflac_open_file_and_read_pcm_frames_s16(const char* filename, unsigned int* channels, unsigned int* sampleRate, drflac_uint64* totalPCMFrameCount, const drflac_allocation_callbacks* pAllocationCallbacks)
11637 {
11638     drflac* pFlac;
11639
11640     if (sampleRate) {
11641         *sampleRate = 0;
11642     }
11643     if (channels) {
11644         *channels = 0;
11645     }
11646     if (totalPCMFrameCount) {
11647         *totalPCMFrameCount = 0;
11648     }
11649
11650     pFlac = drflac_open_file(filename, pAllocationCallbacks);
11651     if (pFlac == NULL) {
11652         return NULL;
11653     }
11654
11655     return drflac__full_read_and_close_s16(pFlac, channels, sampleRate, totalPCMFrameCount);
11656 }
11657
11658 DRFLAC_API float* drflac_open_file_and_read_pcm_frames_f32(const char* filename, unsigned int* channels, unsigned int* sampleRate, drflac_uint64* totalPCMFrameCount, const drflac_allocation_callbacks* pAllocationCallbacks)
11659 {
11660     drflac* pFlac;
11661
11662     if (sampleRate) {
11663         *sampleRate = 0;
11664     }
11665     if (channels) {
11666         *channels = 0;
11667     }
11668     if (totalPCMFrameCount) {
11669         *totalPCMFrameCount = 0;
11670     }
11671
11672     pFlac = drflac_open_file(filename, pAllocationCallbacks);
11673     if (pFlac == NULL) {
11674         return NULL;
11675     }
11676
11677     return drflac__full_read_and_close_f32(pFlac, channels, sampleRate, totalPCMFrameCount);
11678 }
11679 #endif
11680
11681 DRFLAC_API drflac_int32* drflac_open_memory_and_read_pcm_frames_s32(const void* data, size_t dataSize, unsigned int* channels, unsigned int* sampleRate, drflac_uint64* totalPCMFrameCount, const drflac_allocation_callbacks* pAllocationCallbacks)
11682 {
11683     drflac* pFlac;
11684
11685     if (sampleRate) {
11686         *sampleRate = 0;
11687     }
11688     if (channels) {
11689         *channels = 0;
11690     }
11691     if (totalPCMFrameCount) {
11692         *totalPCMFrameCount = 0;
11693     }
11694
11695     pFlac = drflac_open_memory(data, dataSize, pAllocationCallbacks);
11696     if (pFlac == NULL) {
11697         return NULL;
11698     }
11699
11700     return drflac__full_read_and_close_s32(pFlac, channels, sampleRate, totalPCMFrameCount);
11701 }
11702
11703 DRFLAC_API drflac_int16* drflac_open_memory_and_read_pcm_frames_s16(const void* data, size_t dataSize, unsigned int* channels, unsigned int* sampleRate, drflac_uint64* totalPCMFrameCount, const drflac_allocation_callbacks* pAllocationCallbacks)
11704 {
11705     drflac* pFlac;
11706
11707     if (sampleRate) {
11708         *sampleRate = 0;
11709     }
11710     if (channels) {
11711         *channels = 0;
11712     }
11713     if (totalPCMFrameCount) {
11714         *totalPCMFrameCount = 0;
11715     }
11716
11717     pFlac = drflac_open_memory(data, dataSize, pAllocationCallbacks);
11718     if (pFlac == NULL) {
11719         return NULL;
11720     }
11721
11722     return drflac__full_read_and_close_s16(pFlac, channels, sampleRate, totalPCMFrameCount);
11723 }
11724
11725 DRFLAC_API float* drflac_open_memory_and_read_pcm_frames_f32(const void* data, size_t dataSize, unsigned int* channels, unsigned int* sampleRate, drflac_uint64* totalPCMFrameCount, const drflac_allocation_callbacks* pAllocationCallbacks)
11726 {
11727     drflac* pFlac;
11728
11729     if (sampleRate) {
11730         *sampleRate = 0;
11731     }
11732     if (channels) {
11733         *channels = 0;
11734     }
11735     if (totalPCMFrameCount) {
11736         *totalPCMFrameCount = 0;
11737     }
11738
11739     pFlac = drflac_open_memory(data, dataSize, pAllocationCallbacks);
11740     if (pFlac == NULL) {
11741         return NULL;
11742     }
11743
11744     return drflac__full_read_and_close_f32(pFlac, channels, sampleRate, totalPCMFrameCount);
11745 }
11746
11747
11748 DRFLAC_API void drflac_free(void* p, const drflac_allocation_callbacks* pAllocationCallbacks)
11749 {
11750     if (pAllocationCallbacks != NULL) {
11751         drflac__free_from_callbacks(p, pAllocationCallbacks);
11752     } else {
11753         drflac__free_default(p, NULL);
11754     }
11755 }
11756
11757
11758
11759
11760 DRFLAC_API void drflac_init_vorbis_comment_iterator(drflac_vorbis_comment_iterator* pIter, drflac_uint32 commentCount, const void* pComments)
11761 {
11762     if (pIter == NULL) {
11763         return;
11764     }
11765
11766     pIter->countRemaining = commentCount;
11767     pIter->pRunningData   = (const char*)pComments;
11768 }
11769
11770 DRFLAC_API const char* drflac_next_vorbis_comment(drflac_vorbis_comment_iterator* pIter, drflac_uint32* pCommentLengthOut)
11771 {
11772     drflac_int32 length;
11773     const char* pComment;
11774
11775     /* Safety. */
11776     if (pCommentLengthOut) {
11777         *pCommentLengthOut = 0;
11778     }
11779
11780     if (pIter == NULL || pIter->countRemaining == 0 || pIter->pRunningData == NULL) {
11781         return NULL;
11782     }
11783
11784     length = drflac__le2host_32(*(const drflac_uint32*)pIter->pRunningData);
11785     pIter->pRunningData += 4;
11786
11787     pComment = pIter->pRunningData;
11788     pIter->pRunningData += length;
11789     pIter->countRemaining -= 1;
11790
11791     if (pCommentLengthOut) {
11792         *pCommentLengthOut = length;
11793     }
11794
11795     return pComment;
11796 }
11797
11798
11799
11800
11801 DRFLAC_API void drflac_init_cuesheet_track_iterator(drflac_cuesheet_track_iterator* pIter, drflac_uint32 trackCount, const void* pTrackData)
11802 {
11803     if (pIter == NULL) {
11804         return;
11805     }
11806
11807     pIter->countRemaining = trackCount;
11808     pIter->pRunningData   = (const char*)pTrackData;
11809 }
11810
11811 DRFLAC_API drflac_bool32 drflac_next_cuesheet_track(drflac_cuesheet_track_iterator* pIter, drflac_cuesheet_track* pCuesheetTrack)
11812 {
11813     drflac_cuesheet_track cuesheetTrack;
11814     const char* pRunningData;
11815     drflac_uint64 offsetHi;
11816     drflac_uint64 offsetLo;
11817
11818     if (pIter == NULL || pIter->countRemaining == 0 || pIter->pRunningData == NULL) {
11819         return DRFLAC_FALSE;
11820     }
11821
11822     pRunningData = pIter->pRunningData;
11823
11824     offsetHi                   = drflac__be2host_32(*(const drflac_uint32*)pRunningData); pRunningData += 4;
11825     offsetLo                   = drflac__be2host_32(*(const drflac_uint32*)pRunningData); pRunningData += 4;
11826     cuesheetTrack.offset       = offsetLo | (offsetHi << 32);
11827     cuesheetTrack.trackNumber  = pRunningData[0];                                         pRunningData += 1;
11828     DRFLAC_COPY_MEMORY(cuesheetTrack.ISRC, pRunningData, sizeof(cuesheetTrack.ISRC));     pRunningData += 12;
11829     cuesheetTrack.isAudio      = (pRunningData[0] & 0x80) != 0;
11830     cuesheetTrack.preEmphasis  = (pRunningData[0] & 0x40) != 0;                           pRunningData += 14;
11831     cuesheetTrack.indexCount   = pRunningData[0];                                         pRunningData += 1;
11832     cuesheetTrack.pIndexPoints = (const drflac_cuesheet_track_index*)pRunningData;        pRunningData += cuesheetTrack.indexCount * sizeof(drflac_cuesheet_track_index);
11833
11834     pIter->pRunningData = pRunningData;
11835     pIter->countRemaining -= 1;
11836
11837     if (pCuesheetTrack) {
11838         *pCuesheetTrack = cuesheetTrack;
11839     }
11840
11841     return DRFLAC_TRUE;
11842 }
11843
11844 #if defined(__clang__) || (defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)))
11845     #pragma GCC diagnostic pop
11846 #endif
11847 #endif  /* dr_flac_c */
11848 #endif  /* DR_FLAC_IMPLEMENTATION */
11849
11850
11851 /*
11852 REVISION HISTORY
11853 ================
11854 v0.12.31 - 2021-08-16
11855   - Silence some warnings.
11856
11857 v0.12.30 - 2021-07-31
11858   - Fix platform detection for ARM64.
11859
11860 v0.12.29 - 2021-04-02
11861   - Fix a bug where the running PCM frame index is set to an invalid value when over-seeking.
11862   - Fix a decoding error due to an incorrect validation check.
11863
11864 v0.12.28 - 2021-02-21
11865   - Fix a warning due to referencing _MSC_VER when it is undefined.
11866
11867 v0.12.27 - 2021-01-31
11868   - Fix a static analysis warning.
11869
11870 v0.12.26 - 2021-01-17
11871   - Fix a compilation warning due to _BSD_SOURCE being deprecated.
11872
11873 v0.12.25 - 2020-12-26
11874   - Update documentation.
11875
11876 v0.12.24 - 2020-11-29
11877   - Fix ARM64/NEON detection when compiling with MSVC.
11878
11879 v0.12.23 - 2020-11-21
11880   - Fix compilation with OpenWatcom.
11881
11882 v0.12.22 - 2020-11-01
11883   - Fix an error with the previous release.
11884
11885 v0.12.21 - 2020-11-01
11886   - Fix a possible deadlock when seeking.
11887   - Improve compiler support for older versions of GCC.
11888
11889 v0.12.20 - 2020-09-08
11890   - Fix a compilation error on older compilers.
11891
11892 v0.12.19 - 2020-08-30
11893   - Fix a bug due to an undefined 32-bit shift.
11894
11895 v0.12.18 - 2020-08-14
11896   - Fix a crash when compiling with clang-cl.
11897
11898 v0.12.17 - 2020-08-02
11899   - Simplify sized types.
11900
11901 v0.12.16 - 2020-07-25
11902   - Fix a compilation warning.
11903
11904 v0.12.15 - 2020-07-06
11905   - Check for negative LPC shifts and return an error.
11906
11907 v0.12.14 - 2020-06-23
11908   - Add include guard for the implementation section.
11909
11910 v0.12.13 - 2020-05-16
11911   - Add compile-time and run-time version querying.
11912     - DRFLAC_VERSION_MINOR
11913     - DRFLAC_VERSION_MAJOR
11914     - DRFLAC_VERSION_REVISION
11915     - DRFLAC_VERSION_STRING
11916     - drflac_version()
11917     - drflac_version_string()
11918
11919 v0.12.12 - 2020-04-30
11920   - Fix compilation errors with VC6.
11921
11922 v0.12.11 - 2020-04-19
11923   - Fix some pedantic warnings.
11924   - Fix some undefined behaviour warnings.
11925
11926 v0.12.10 - 2020-04-10
11927   - Fix some bugs when trying to seek with an invalid seek table.
11928
11929 v0.12.9 - 2020-04-05
11930   - Fix warnings.
11931
11932 v0.12.8 - 2020-04-04
11933   - Add drflac_open_file_w() and drflac_open_file_with_metadata_w().
11934   - Fix some static analysis warnings.
11935   - Minor documentation updates.
11936
11937 v0.12.7 - 2020-03-14
11938   - Fix compilation errors with VC6.
11939
11940 v0.12.6 - 2020-03-07
11941   - Fix compilation error with Visual Studio .NET 2003.
11942
11943 v0.12.5 - 2020-01-30
11944   - Silence some static analysis warnings.
11945
11946 v0.12.4 - 2020-01-29
11947   - Silence some static analysis warnings.
11948
11949 v0.12.3 - 2019-12-02
11950   - Fix some warnings when compiling with GCC and the -Og flag.
11951   - Fix a crash in out-of-memory situations.
11952   - Fix potential integer overflow bug.
11953   - Fix some static analysis warnings.
11954   - Fix a possible crash when using custom memory allocators without a custom realloc() implementation.
11955   - Fix a bug with binary search seeking where the bits per sample is not a multiple of 8.
11956
11957 v0.12.2 - 2019-10-07
11958   - Internal code clean up.
11959
11960 v0.12.1 - 2019-09-29
11961   - Fix some Clang Static Analyzer warnings.
11962   - Fix an unused variable warning.
11963
11964 v0.12.0 - 2019-09-23
11965   - API CHANGE: Add support for user defined memory allocation routines. This system allows the program to specify their own memory allocation
11966     routines with a user data pointer for client-specific contextual data. This adds an extra parameter to the end of the following APIs:
11967     - drflac_open()
11968     - drflac_open_relaxed()
11969     - drflac_open_with_metadata()
11970     - drflac_open_with_metadata_relaxed()
11971     - drflac_open_file()
11972     - drflac_open_file_with_metadata()
11973     - drflac_open_memory()
11974     - drflac_open_memory_with_metadata()
11975     - drflac_open_and_read_pcm_frames_s32()
11976     - drflac_open_and_read_pcm_frames_s16()
11977     - drflac_open_and_read_pcm_frames_f32()
11978     - drflac_open_file_and_read_pcm_frames_s32()
11979     - drflac_open_file_and_read_pcm_frames_s16()
11980     - drflac_open_file_and_read_pcm_frames_f32()
11981     - drflac_open_memory_and_read_pcm_frames_s32()
11982     - drflac_open_memory_and_read_pcm_frames_s16()
11983     - drflac_open_memory_and_read_pcm_frames_f32()
    Set this extra parameter to NULL to use defaults which is the same as the previous behaviour. Setting this to NULL will use
11985     DRFLAC_MALLOC, DRFLAC_REALLOC and DRFLAC_FREE.
11986   - Remove deprecated APIs:
11987     - drflac_read_s32()
11988     - drflac_read_s16()
11989     - drflac_read_f32()
11990     - drflac_seek_to_sample()
11991     - drflac_open_and_decode_s32()
11992     - drflac_open_and_decode_s16()
11993     - drflac_open_and_decode_f32()
11994     - drflac_open_and_decode_file_s32()
11995     - drflac_open_and_decode_file_s16()
11996     - drflac_open_and_decode_file_f32()
11997     - drflac_open_and_decode_memory_s32()
11998     - drflac_open_and_decode_memory_s16()
11999     - drflac_open_and_decode_memory_f32()
12000   - Remove drflac.totalSampleCount which is now replaced with drflac.totalPCMFrameCount. You can emulate drflac.totalSampleCount
12001     by doing pFlac->totalPCMFrameCount*pFlac->channels.
12002   - Rename drflac.currentFrame to drflac.currentFLACFrame to remove ambiguity with PCM frames.
12003   - Fix errors when seeking to the end of a stream.
12004   - Optimizations to seeking.
12005   - SSE improvements and optimizations.
12006   - ARM NEON optimizations.
12007   - Optimizations to drflac_read_pcm_frames_s16().
12008   - Optimizations to drflac_read_pcm_frames_s32().
12009
12010 v0.11.10 - 2019-06-26
12011   - Fix a compiler error.
12012
12013 v0.11.9 - 2019-06-16
12014   - Silence some ThreadSanitizer warnings.
12015
12016 v0.11.8 - 2019-05-21
12017   - Fix warnings.
12018
12019 v0.11.7 - 2019-05-06
12020   - C89 fixes.
12021
12022 v0.11.6 - 2019-05-05
12023   - Add support for C89.
12024   - Fix a compiler warning when CRC is disabled.
12025   - Change license to choice of public domain or MIT-0.
12026
12027 v0.11.5 - 2019-04-19
12028   - Fix a compiler error with GCC.
12029
12030 v0.11.4 - 2019-04-17
12031   - Fix some warnings with GCC when compiling with -std=c99.
12032
12033 v0.11.3 - 2019-04-07
12034   - Silence warnings with GCC.
12035
12036 v0.11.2 - 2019-03-10
12037   - Fix a warning.
12038
12039 v0.11.1 - 2019-02-17
12040   - Fix a potential bug with seeking.
12041
12042 v0.11.0 - 2018-12-16
12043   - API CHANGE: Deprecated drflac_read_s32(), drflac_read_s16() and drflac_read_f32() and replaced them with
12044     drflac_read_pcm_frames_s32(), drflac_read_pcm_frames_s16() and drflac_read_pcm_frames_f32(). The new APIs take
12045     and return PCM frame counts instead of sample counts. To upgrade you will need to change the input count by
12046     dividing it by the channel count, and then do the same with the return value.
  - API CHANGE: Deprecated drflac_seek_to_sample() and replaced with drflac_seek_to_pcm_frame(). Same rules as
12048     the changes to drflac_read_*() apply.
12049   - API CHANGE: Deprecated drflac_open_and_decode_*() and replaced with drflac_open_*_and_read_*(). Same rules as
12050     the changes to drflac_read_*() apply.
12051   - Optimizations.
12052
12053 v0.10.0 - 2018-09-11
12054   - Remove the DR_FLAC_NO_WIN32_IO option and the Win32 file IO functionality. If you need to use Win32 file IO you
12055     need to do it yourself via the callback API.
12056   - Fix the clang build.
12057   - Fix undefined behavior.
  - Fix errors with CUESHEET metadata blocks.
12059   - Add an API for iterating over each cuesheet track in the CUESHEET metadata block. This works the same way as the
12060     Vorbis comment API.
12061   - Other miscellaneous bug fixes, mostly relating to invalid FLAC streams.
12062   - Minor optimizations.
12063
12064 v0.9.11 - 2018-08-29
12065   - Fix a bug with sample reconstruction.
12066
12067 v0.9.10 - 2018-08-07
12068   - Improve 64-bit detection.
12069
12070 v0.9.9 - 2018-08-05
12071   - Fix C++ build on older versions of GCC.
12072
12073 v0.9.8 - 2018-07-24
12074   - Fix compilation errors.
12075
12076 v0.9.7 - 2018-07-05
12077   - Fix a warning.
12078
12079 v0.9.6 - 2018-06-29
12080   - Fix some typos.
12081
12082 v0.9.5 - 2018-06-23
12083   - Fix some warnings.
12084
12085 v0.9.4 - 2018-06-14
12086   - Optimizations to seeking.
12087   - Clean up.
12088
12089 v0.9.3 - 2018-05-22
12090   - Bug fix.
12091
12092 v0.9.2 - 2018-05-12
12093   - Fix a compilation error due to a missing break statement.
12094
12095 v0.9.1 - 2018-04-29
12096   - Fix compilation error with Clang.
12097
12098 v0.9 - 2018-04-24
12099   - Fix Clang build.
12100   - Start using major.minor.revision versioning.
12101
12102 v0.8g - 2018-04-19
12103   - Fix build on non-x86/x64 architectures.
12104
12105 v0.8f - 2018-02-02
12106   - Stop pretending to support changing rate/channels mid stream.
12107
12108 v0.8e - 2018-02-01
12109   - Fix a crash when the block size of a frame is larger than the maximum block size defined by the FLAC stream.
  - Fix a crash when the Rice partition order is invalid.
12111
12112 v0.8d - 2017-09-22
12113   - Add support for decoding streams with ID3 tags. ID3 tags are just skipped.
12114
12115 v0.8c - 2017-09-07
12116   - Fix warning on non-x86/x64 architectures.
12117
12118 v0.8b - 2017-08-19
12119   - Fix build on non-x86/x64 architectures.
12120
12121 v0.8a - 2017-08-13
12122   - A small optimization for the Clang build.
12123
12124 v0.8 - 2017-08-12
12125   - API CHANGE: Rename dr_* types to drflac_*.
12126   - Optimizations. This brings dr_flac back to about the same class of efficiency as the reference implementation.
12127   - Add support for custom implementations of malloc(), realloc(), etc.
12128   - Add CRC checking to Ogg encapsulated streams.
12129   - Fix VC++ 6 build. This is only for the C++ compiler. The C compiler is not currently supported.
12130   - Bug fixes.
12131
12132 v0.7 - 2017-07-23
12133   - Add support for opening a stream without a header block. To do this, use drflac_open_relaxed() / drflac_open_with_metadata_relaxed().
12134
12135 v0.6 - 2017-07-22
12136   - Add support for recovering from invalid frames. With this change, dr_flac will simply skip over invalid frames as if they
12137     never existed. Frames are checked against their sync code, the CRC-8 of the frame header and the CRC-16 of the whole frame.
12138
12139 v0.5 - 2017-07-16
12140   - Fix typos.
12141   - Change drflac_bool* types to unsigned.
12142   - Add CRC checking. This makes dr_flac slower, but can be disabled with #define DR_FLAC_NO_CRC.
12143
12144 v0.4f - 2017-03-10
12145   - Fix a couple of bugs with the bitstreaming code.
12146
12147 v0.4e - 2017-02-17
12148   - Fix some warnings.
12149
12150 v0.4d - 2016-12-26
12151   - Add support for 32-bit floating-point PCM decoding.
12152   - Use drflac_int* and drflac_uint* sized types to improve compiler support.
12153   - Minor improvements to documentation.
12154
12155 v0.4c - 2016-12-26
12156   - Add support for signed 16-bit integer PCM decoding.
12157
12158 v0.4b - 2016-10-23
12159   - A minor change to drflac_bool8 and drflac_bool32 types.
12160
12161 v0.4a - 2016-10-11
12162   - Rename drBool32 to drflac_bool32 for styling consistency.
12163
12164 v0.4 - 2016-09-29
12165   - API/ABI CHANGE: Use fixed size 32-bit booleans instead of the built-in bool type.
12166   - API CHANGE: Rename drflac_open_and_decode*() to drflac_open_and_decode*_s32().
12167   - API CHANGE: Swap the order of "channels" and "sampleRate" parameters in drflac_open_and_decode*(). Rationale for this is to
12168     keep it consistent with drflac_audio.
12169
12170 v0.3f - 2016-09-21
12171   - Fix a warning with GCC.
12172
12173 v0.3e - 2016-09-18
12174   - Fixed a bug where GCC 4.3+ was not getting properly identified.
12175   - Fixed a few typos.
12176   - Changed date formats to ISO 8601 (YYYY-MM-DD).
12177
12178 v0.3d - 2016-06-11
12179   - Minor clean up.
12180
12181 v0.3c - 2016-05-28
12182   - Fixed compilation error.
12183
12184 v0.3b - 2016-05-16
12185   - Fixed Linux/GCC build.
12186   - Updated documentation.
12187
12188 v0.3a - 2016-05-15
12189   - Minor fixes to documentation.
12190
12191 v0.3 - 2016-05-11
12192   - Optimizations. Now at about parity with the reference implementation on 32-bit builds.
12193   - Lots of clean up.
12194
12195 v0.2b - 2016-05-10
12196   - Bug fixes.
12197
12198 v0.2a - 2016-05-10
12199   - Made drflac_open_and_decode() more robust.
12200   - Removed an unused debugging variable
12201
12202 v0.2 - 2016-05-09
12203   - Added support for Ogg encapsulation.
12204   - API CHANGE. Have the onSeek callback take a third argument which specifies whether or not the seek
12205     should be relative to the start or the current position. Also changes the seeking rules such that
12206     seeking offsets will never be negative.
12207   - Have drflac_open_and_decode() fail gracefully if the stream has an unknown total sample count.
12208
12209 v0.1b - 2016-05-07
12210   - Properly close the file handle in drflac_open_file() and family when the decoder fails to initialize.
12211   - Removed a stale comment.
12212
12213 v0.1a - 2016-05-05
12214   - Minor formatting changes.
12215   - Fixed a warning on the GCC build.
12216
12217 v0.1 - 2016-05-03
12218   - Initial versioned release.
12219 */
12220
12221 /*
12222 This software is available as a choice of the following licenses. Choose
12223 whichever you prefer.
12224
12225 ===============================================================================
12226 ALTERNATIVE 1 - Public Domain (www.unlicense.org)
12227 ===============================================================================
12228 This is free and unencumbered software released into the public domain.
12229
12230 Anyone is free to copy, modify, publish, use, compile, sell, or distribute this
12231 software, either in source code form or as a compiled binary, for any purpose,
12232 commercial or non-commercial, and by any means.
12233
12234 In jurisdictions that recognize copyright laws, the author or authors of this
12235 software dedicate any and all copyright interest in the software to the public
12236 domain. We make this dedication for the benefit of the public at large and to
12237 the detriment of our heirs and successors. We intend this dedication to be an
12238 overt act of relinquishment in perpetuity of all present and future rights to
12239 this software under copyright law.
12240
12241 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
12242 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
12243 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
12244 AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
12245 ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
12246 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
12247
12248 For more information, please refer to <http://unlicense.org/>
12249
12250 ===============================================================================
12251 ALTERNATIVE 2 - MIT No Attribution
12252 ===============================================================================
12253 Copyright 2020 David Reid
12254
12255 Permission is hereby granted, free of charge, to any person obtaining a copy of
12256 this software and associated documentation files (the "Software"), to deal in
12257 the Software without restriction, including without limitation the rights to
12258 use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
12259 of the Software, and to permit persons to whom the Software is furnished to do
12260 so.
12261
12262 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
12263 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
12264 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
12265 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
12266 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
12267 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
12268 SOFTWARE.
12269 */