common/SC_fftlib.cpp

   1 /*
   2 SC_fftlib.cpp
   3 An interface to abstract over different FFT libraries, for SuperCollider 3.
   4 (c) 2007-2008 Dan Stowell, incorporating code from
   5         SuperCollider 3 Copyright (c) 2002 James McCartney.
   6         All rights reserved.
   7
   8     This program is free software; you can redistribute it and/or modify
   9     it under the terms of the GNU General Public License as published by
  10     the Free Software Foundation; either version 2 of the License, or
  11     (at your option) any later version.
  12
  13     This program is distributed in the hope that it will be useful,
  14     but WITHOUT ANY WARRANTY; without even the implied warranty of
  15     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16     GNU General Public License for more details.
  17
  18     You should have received a copy of the GNU General Public License
  19     along with this program; if not, write to the Free Software
  20     Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301  USA
  21
  22 NOTE:
  23 vDSP uses a "SplitBuf" as an intermediate representation of the data.
  24 For speed we keep this global, although this makes the code non-thread-safe.
  25 (This is not new to this refactoring. Just worth noting.)
  26 */
  27
  28 #include "clz.h"
  29 #include <stdlib.h>
  30 #include <cmath>
  31 #include <stdio.h>
  32 #include <cstring>
  33 #include <cassert>
  34
  35 #include "SC_fftlib.h"
  36
  37 #ifdef NOVA_SIMD
  38 #include "simd_binary_arithmetic.hpp"
  39 #endif
  40
  41
  42 // We include vDSP even if not using for FFT, since we want to use some vectorised add/mul tricks
  43 #if defined(__APPLE__) && !defined(SC_IPHONE)
  44         #include "vecLib/vDSP.h"
  45 #elif defined(SC_IPHONE)
  46         #include <Accelerate/Accelerate.h>
  47 #endif
  48
  49 ////////////////////////////////////////////////////////////////////////////////////////////////////
  50 // Constants and structs
  51
  52 // Decisions here about which FFT library to use - vDSP only exists on Mac BTW.
  53 // We include the relevant libs but also ensure that one, and only one, of them is active...
  54 #if SC_FFT_NONE
  55         // "SC_FFT_NONE" allows compilation without FFT support; only expected to be used on very limited platforms
  56         #define SC_FFT_FFTW 0
  57         #define SC_FFT_VDSP 0
  58         #define SC_FFT_GREEN 0
  59 #elif SC_FFT_GREEN
  60         #define SC_FFT_FFTW 0
  61         #define SC_FFT_VDSP 0
  62         #define SC_FFT_GREEN 1
  63 #elif !SC_FFT_FFTW && defined(__APPLE__)
  64         #define SC_FFT_FFTW 0
  65         #define SC_FFT_VDSP 1
  66         #define SC_FFT_GREEN 0
  67 #else
  68 //#elif SC_FFT_FFTW
  69         #define SC_FFT_FFTW 1
  70         #define SC_FFT_VDSP 0
  71         #define SC_FFT_GREEN 0
  72         #include <fftw3.h>
  73 #endif
  74
  75 // Note that things like *fftWindow actually allow for other sizes, to be created on user request.
  76 #define SC_FFT_ABSOLUTE_MAXSIZE 2147483648
  77 #define SC_FFT_LOG2_ABSOLUTE_MAXSIZE 31
  78 #define SC_FFT_LOG2_ABSOLUTE_MAXSIZE_PLUS1 32
  79
  80 // This struct is a bit like FFTW's idea of a "plan": it represents an FFT operation that may be applied once or repeatedly.
  81 // It should be possible for indata and outdata to be the same, for quasi-in-place operation.
  82 typedef struct scfft {
  83         unsigned int nfull, nwin, log2nfull, log2nwin; // Lengths of full FFT frame, and the (possibly shorter) windowed data frame
  84         short wintype;
  85         float *indata, *outdata, *trbuf;
  86         float scalefac; // Used to rescale the data to unity gain
  87 } scfft;
  88
  89
  90 static float *fftWindow[2][SC_FFT_LOG2_ABSOLUTE_MAXSIZE_PLUS1];
  91
  92 #if SC_FFT_VDSP
  93         static FFTSetup fftSetup[SC_FFT_LOG2_ABSOLUTE_MAXSIZE_PLUS1]; // vDSP setups, one per FFT size
  94         static COMPLEX_SPLIT splitBuf; // Temp buf for holding rearranged data
  95 #endif
  96
  97 #if SC_FFT_GREEN
  98         extern "C" {
  99                 #include "fftlib.h"
 100                 static float *cosTable[SC_FFT_LOG2_ABSOLUTE_MAXSIZE_PLUS1];
 101         }
 102 #endif
 103
 104 #if SC_FFT_FFTW
 105 static fftwf_plan precompiledForwardPlans[SC_FFT_LOG2_ABSOLUTE_MAXSIZE_PLUS1];
 106 static fftwf_plan precompiledBackwardPlans[SC_FFT_LOG2_ABSOLUTE_MAXSIZE_PLUS1];
 107 static fftwf_plan precompiledForwardPlansInPlace[SC_FFT_LOG2_ABSOLUTE_MAXSIZE_PLUS1];
 108 static fftwf_plan precompiledBackwardPlansInPlace[SC_FFT_LOG2_ABSOLUTE_MAXSIZE_PLUS1];
 109 #endif
 110
 111 #define pi 3.1415926535898f
 112 #define twopi 6.28318530717952646f
 113
 114 ////////////////////////////////////////////////////////////////////////////////////////////////////
 115 // Functions
 116
 117 #if SC_FFT_GREEN
 118 static float* create_cosTable(int log2n);
 119 static float* create_cosTable(int log2n)
 120 {
 121         int size = 1 << log2n;
 122         int size2 = size / 4 + 1;
 123         float *win = (float*)malloc(size2 * sizeof(float));
 124         double winc = twopi / size;
 125         for (int i=0; i<size2; ++i) {
 126                 double w = i * winc;
 127                 win[i] = cos(w);
 128         }
 129         return win;
 130 }
 131 #endif
 132
 133 static float* scfft_create_fftwindow(int wintype, int log2n)
 134 {
 135         int size = 1 << log2n;
 136         unsigned short i;
 137 #if _POSIX_C_SOURCE >= 200112L || _XOPEN_SOURCE >= 600
 138         float *win;
 139         int error = posix_memalign((void**)&win, 64, size * sizeof(float)); // align by cacheline
 140         if (error)
 141                 win = NULL;
 142 #else
 143         float *win = (float*)malloc(size * sizeof(float));
 144 #endif
 145         double winc;
 146         switch(wintype) {
 147                 case kSineWindow:
 148                         winc = pi / size;
 149                         for (i=0; i<size; ++i) {
 150                                 double w = i * winc;
 151                                 win[i] = sin(w);
 152                         }
 153                         break;
 154                 case kHannWindow:
 155                         winc = twopi / size;
 156                         for (i=0; i<size; ++i) {
 157                                 double w = i * winc;
 158                                 win[i] = 0.5 - 0.5 * cos(w);
 159                         }
 160                         break;
 161         }
 162
 163         return win;
 164
 165 }
 166
 167 static void scfft_ensurewindow(unsigned short log2_fullsize, unsigned short log2_winsize, short wintype);
 168
 169 static bool scfft_global_initialization (void)
 170 {
 171         for (int wintype=0; wintype<2; ++wintype) {
 172                 for (int i=0; i< SC_FFT_LOG2_ABSOLUTE_MAXSIZE_PLUS1; ++i) {
 173                         fftWindow[wintype][i] = 0;
 174                 }
 175                 for (int i= SC_FFT_LOG2_MINSIZE; i < SC_FFT_LOG2_MAXSIZE+1; ++i) {
 176                         fftWindow[wintype][i] = scfft_create_fftwindow(wintype, i);
 177                 }
 178         }
 179 #if SC_FFT_GREEN
 180         for (int i=0; i< SC_FFT_LOG2_ABSOLUTE_MAXSIZE_PLUS1; ++i) {
 181                 cosTable[i] = 0;
 182         }
 183         for (int i= SC_FFT_LOG2_MINSIZE; i < SC_FFT_LOG2_MAXSIZE+1; ++i) {
 184                 cosTable[i] = create_cosTable(i);
 185         }
 186         printf("SC FFT global init: cosTable initialised.\n");
 187 #elif SC_FFT_VDSP
 188         // vDSP inits its twiddle factors
 189         for (int i= SC_FFT_LOG2_MINSIZE; i < SC_FFT_LOG2_MAXSIZE+1; ++i) {
 190                 fftSetup[i] = vDSP_create_fftsetup (i, FFT_RADIX2);
 191                 if(fftSetup[i] == NULL){
 192                         printf("FFT ERROR: Mac vDSP library could not allocate FFT setup for size %i\n", 1<<i);
 193                 }
 194         }
 195         // vDSP prepares its memory-aligned buffer for rearranging input data.
 196         // Note max size here - meaning max input buffer size is these two sizes added together.
 197         // vec_malloc used in API docs, but apparently that's deprecated and malloc is sufficient for aligned memory on OSX.
 198         splitBuf.realp = (float*) malloc ( SC_FFT_MAXSIZE * sizeof(float) / 2);
 199         splitBuf.imagp = (float*) malloc ( SC_FFT_MAXSIZE * sizeof(float) / 2);
 200         //printf("SC FFT global init: vDSP initialised.\n");
 201 #elif SC_FFT_FFTW
 202         size_t maxSize = 1<<SC_FFT_LOG2_MAXSIZE;
 203         float * buffer1 = (float*)fftwf_malloc((maxSize + 1) * sizeof(float));
 204         float * buffer2 = (float*)fftwf_malloc((maxSize + 1) * sizeof(float));
 205         for (int i= SC_FFT_LOG2_MINSIZE; i < SC_FFT_LOG2_MAXSIZE+1; ++i) {
 206                 size_t currentSize = 1<<i;
 207
 208                 precompiledForwardPlans[i]  = fftwf_plan_dft_r2c_1d(currentSize, buffer1, (fftwf_complex*) buffer2, FFTW_ESTIMATE);
 209                 precompiledBackwardPlans[i] = fftwf_plan_dft_c2r_1d(currentSize, (fftwf_complex*) buffer2, buffer1, FFTW_ESTIMATE);
 210
 211                 precompiledForwardPlansInPlace[i]  = fftwf_plan_dft_r2c_1d(currentSize, buffer1, (fftwf_complex*) buffer1, FFTW_ESTIMATE);
 212                 precompiledBackwardPlansInPlace[i] = fftwf_plan_dft_c2r_1d(currentSize, (fftwf_complex*) buffer1, buffer1, FFTW_ESTIMATE);
 213         }
 214         fftwf_free(buffer1);
 215         fftwf_free(buffer2);
 216 #endif
 217         return false;
 218 }
 219
 220 static bool dummy = scfft_global_initialization();
 221
 222 // You need to provide an intermediate "transform buffer". Size will depend on which underlying lib is being used.
 223 // "fullsize" is the number of samples in the input buffer (inc any padding), aka the number of "points" in the FFT.
 224 //   Often in an SC plugin you can get this number from buf->samples if you're grabbing an external buffer.
 225 // The input value is given in samples.
 226 // The return value is given in bytes.
 227 static size_t scfft_trbufsize(unsigned int fullsize)
 228 {
 229 #if SC_FFT_FFTW
 230         // Transform buf is two floats "too big" because of FFTWF's output ordering
 231         return (fullsize+2) * sizeof(float);
 232 #else
 233         // vDSP packs the nyquist in with the DC, so size is same as input buffer (plus zeropadding)
 234         // Green does this too
 235         return (fullsize  ) * sizeof(float);
 236 #endif
 237 }
 238
 239
 240 scfft * scfft_create(size_t fullsize, size_t winsize, SCFFT_WindowFunction wintype,
 241                                          float *indata, float *outdata, SCFFT_Direction forward, SCFFT_Allocator & alloc)
 242 {
 243         char * chunk = (char*) alloc.alloc(sizeof(scfft) + scfft_trbufsize(fullsize));
 244         if (!chunk)
 245                 return NULL;
 246
 247         scfft * f = (scfft*)chunk;
 248         float *trbuf = (float*)(chunk + sizeof(scfft));
 249
 250         f->nfull = fullsize;
 251         f->nwin  =  winsize;
 252         f->log2nfull = LOG2CEIL(fullsize);
 253         f->log2nwin  = LOG2CEIL( winsize);
 254         f->wintype = wintype;
 255         f->indata  = indata;
 256         f->outdata = outdata;
 257         f->trbuf   = trbuf;
 258
 259         // Buffer is larger than the range of sizes we provide for at startup; we can get ready just-in-time though
 260         if (fullsize > SC_FFT_MAXSIZE)
 261                 scfft_ensurewindow(f->log2nfull, f->log2nwin, wintype);
 262
 263         // The scale factors rescale the data to unity gain. The old Green lib did this itself, meaning scalefacs would here be 1...
 264         if(forward){
 265 #if SC_FFT_VDSP
 266                 f->scalefac = 0.5f;
 267 #else // forward FFTW and Green factor
 268                 f->scalefac = 1.f;
 269 #endif
 270         } else { // backward FFTW and VDSP factor
 271 #if SC_FFT_GREEN
 272                 f->scalefac = 1.f;
 273 #else  // fftw, vdsp
 274                 f->scalefac = 1.f / fullsize;
 275 #endif
 276         }
 277
 278         memset(trbuf, 0, scfft_trbufsize(fullsize));
 279
 280         return f;
 281 }
 282
 283 static int largest_log2n = SC_FFT_LOG2_MAXSIZE;
 284
 285 // check the global list of windows incs ours; create if not.
 286 // Note that expanding the table, if triggered, will cause a CPU hit as things are malloc'ed, realloc'ed, etc.
 287 void scfft_ensurewindow(unsigned short log2_fullsize, unsigned short log2_winsize, short wintype)
 288 {
 289         // Ensure we have enough space to do our calcs
 290         if(log2_fullsize > largest_log2n){
 291                 largest_log2n = log2_fullsize;
 292                 #if SC_FFT_VDSP
 293                         size_t newsize = (1 << largest_log2n) * sizeof(float) / 2;
 294                         splitBuf.realp = (float*) realloc (splitBuf.realp, newsize);
 295                         splitBuf.imagp = (float*) realloc (splitBuf.imagp, newsize);
 296                 #endif
 297         }
 298
 299         // Ensure our window has been created
 300         if((wintype != -1) && (fftWindow[wintype][log2_winsize] == 0)){
 301                 fftWindow[wintype][log2_winsize] = scfft_create_fftwindow(wintype, log2_winsize);
 302         }
 303
 304         // Ensure our FFT twiddle factors (or whatever) have been created
 305 #if SC_FFT_VDSP
 306         if(fftSetup[log2_fullsize] == 0)
 307                 fftSetup[log2_fullsize] = vDSP_create_fftsetup (log2_fullsize, FFT_RADIX2);
 308 #elif SC_FFT_GREEN
 309         if(cosTable[log2_fullsize] == 0)
 310                 cosTable[log2_fullsize] = create_cosTable(log2_fullsize);
 311 #endif
 312 }
 313
 314 // these do the main jobs.
 315 // Note: you DON"T need to call the windowing function yourself, it'll be applied by the _dofft and _doifft funcs.
 316 static void scfft_dowindowing(float *data, unsigned int winsize, unsigned int fullsize, unsigned short log2_winsize,
 317                                                           short wintype, float scalefac)
 318 {
 319         if (wintype != kRectWindow) {
 320                 float *win = fftWindow[wintype][log2_winsize];
 321                 if (!win) return;
 322 #if SC_FFT_VDSP
 323                 vDSP_vmul(data, 1, win, 1, data, 1, winsize);
 324 #elif defined (NOVA_SIMD)
 325                 using namespace nova;
 326                 if ((vec<float>::objects_per_cacheline & winsize == 0) && vec<float>::is_aligned(data))
 327                         times_vec_simd(data, data, win, winsize);
 328                 else
 329                         times_vec(data, data, win, winsize);
 330 #else
 331                 --win;
 332                 float *in = data - 1;
 333                 for (int i=0; i< winsize ; ++i) {
 334                         *++in *= *++win;
 335                 }
 336 #endif
 337         }
 338
 339         // scale factor is different for different libs. But the compiler switch here is about using vDSP's fast multiplication method.
 340 #if SC_FFT_VDSP
 341         vDSP_vsmul(data, 1, &scalefac, data, 1, winsize);
 342 #else
 343         for(int i=0; i<winsize; ++i){
 344                 data[i] *= scalefac;
 345         }
 346 #endif
 347
 348         // Zero-padding:
 349         memset(data + winsize, 0, (fullsize - winsize) * sizeof(float));
 350 }
 351
 352 void scfft_dofft(scfft * f)
 353 {
 354         // Data goes to transform buf
 355         memcpy(f->trbuf, f->indata, f->nwin * sizeof(float));
 356         scfft_dowindowing(f->trbuf, f->nwin, f->nfull, f->log2nwin, f->wintype, f->scalefac);
 357 #if SC_FFT_FFTW
 358         // forward transformation is in-place
 359         fftwf_execute_dft_r2c(precompiledForwardPlansInPlace[f->log2nfull], f->trbuf, (fftwf_complex*)f->trbuf);
 360
 361         // Rearrange output data onto public buffer
 362         memcpy(f->outdata, f->trbuf, f->nfull * sizeof(float));
 363         f->outdata[1] = f->trbuf[f->nfull]; // Pack nyquist val in
 364 #elif SC_FFT_VDSP
 365         // Perform even-odd split
 366         vDSP_ctoz((COMPLEX*) f->trbuf, 2, &splitBuf, 1, f->nfull >> 1);
 367         // Now the actual FFT
 368         vDSP_fft_zrip(fftSetup[f->log2nfull], &splitBuf, 1, f->log2nfull, FFT_FORWARD);
 369         // Copy the data to the public output buf, transforming it back out of "split" representation
 370         vDSP_ztoc(&splitBuf, 1, (DSPComplex*) f->outdata, 2, f->nfull >> 1);
 371 #elif SC_FFT_GREEN
 372         // Green FFT is in-place
 373         rffts(f->trbuf, f->log2nfull, 1, cosTable[f->log2nfull]);
 374         // Copy to public buffer
 375         memcpy(f->outdata, f->trbuf, f->nfull * sizeof(float));
 376 #endif
 377 }
 378
 379 void scfft_doifft(scfft * f)
 380 {
 381 #if SC_FFT_FFTW
 382         float *trbuf = f->trbuf;
 383         size_t bytesize = f->nfull * sizeof(float);
 384         memcpy(trbuf, f->indata, bytesize);
 385         trbuf[1] = 0.f;
 386         trbuf[f->nfull] = f->indata[1];  // Nyquist goes all the way down to the end of the line...
 387         trbuf[f->nfull+1] = 0.f;
 388
 389         fftwf_execute_dft_c2r(precompiledBackwardPlans[f->log2nfull], (fftwf_complex*)trbuf, f->outdata);
 390
 391 #elif SC_FFT_VDSP
 392         vDSP_ctoz((COMPLEX*) f->indata, 2, &splitBuf, 1, f->nfull >> 1);
 393         vDSP_fft_zrip(fftSetup[f->log2nfull], &splitBuf, 1, f->log2nfull, FFT_INVERSE);
 394         vDSP_ztoc(&splitBuf, 1, (DSPComplex*) f->outdata, 2, f->nfull >> 1);
 395 #elif SC_FFT_GREEN
 396         float *trbuf = f->trbuf;
 397         size_t bytesize = f->nfull * sizeof(float);
 398         memcpy(trbuf, f->indata, bytesize);
 399         // Green FFT is in-place
 400         riffts(trbuf, f->log2nfull, 1, cosTable[f->log2nfull]);
 401         // Copy to public buffer
 402         memcpy(f->outdata, trbuf, f->nwin * sizeof(float));
 403 #endif
 404         scfft_dowindowing(f->outdata, f->nwin, f->nfull, f->log2nwin, f->wintype, f->scalefac);
 405 }
 406
 407 void scfft_destroy(scfft *f, SCFFT_Allocator & alloc)
 408 {
 409         alloc.free(f);
 410 }