perf/ffbench.c

   1 // This small program computes a Fast Fourier Transform.  It tests
   2 // Valgrind's handling of FP operations.  It is representative of all
   3 // programs that do a lot of FP operations.
   4
   5 // Licensing: This program is closely based on the one of the same name from
   6 // http://www.fourmilab.ch/.  The front page of that site says:
   7 //
   8 //   "Except for a few clearly-marked exceptions, all the material on this
   9 //   site is in the public domain and may be used in any manner without
  10 //   permission, restriction, attribution, or compensation."
  11
  12 /*
  13
  14         Two-dimensional FFT benchmark
  15
  16         Designed and implemented by John Walker in April of 1989.
  17
  18         This  benchmark executes a specified number of passes (default
  19         20) through a loop in which each  iteration  performs  a  fast
  20         Fourier transform of a square matrix (default size 256x256) of
  21         complex numbers (default precision double),  followed  by  the
  22         inverse  transform.   After  all loop iterations are performed
  23         the results are checked against known correct values.
  24
  25         This benchmark is intended for use on C implementations  which
  26         define  "int"  as  32 bits or longer and permit allocation and
  27         direct addressing of arrays larger than one megabyte.
  28
  29         If CAPOUT is defined,  the  result  after  all  iterations  is
  30         written  as  a  CA  Lab  pattern  file.   This is intended for
  31         debugging in case horribly wrong results  are  obtained  on  a
  32         given machine.
  33
  34         Archival  timings  are  run  with the definitions below set as
  35         follows: Float = double, Asize = 256, Passes = 20, CAPOUT  not
  36         defined.
  37
  38         Time (seconds)              System
  39
  40             2393.93       Sun 3/260, SunOS 3.4, C, "-f68881 -O".
  41                           (John Walker).
  42
  43             1928          Macintosh IIx, MPW C 3.0, "-mc68020
  44                           -mc68881 -elems881 -m".  (Hugh Hoover).
  45
  46             1636.1        Sun 4/110, "cc -O3 -lm".  (Michael McClary).
  47                           The suspicion is that this is software
  48                           floating point.
  49
  50             1556.7        Macintosh II, A/UX, "cc -O -lm"
  51                           (Michael McClary).
  52
  53             1388.8        Sun 386i/250, SunOS 4.0.1 C
  54                           "-O /usr/lib/trig.il".  (James Carrington).
  55
  56             1331.93       Sun 3/60, SunOS 4.0.1, C,
  57                           "-O4 -f68881 /usr/lib/libm.il"
  58                           (Bob Elman).
  59
  60             1204.0        Apollo Domain DN4000, C, "-cpu 3000 -opt 4".
  61                           (Sam Crupi).
  62
  63             1174.66       Compaq 386/25, SCO Xenix 386 C.
  64                           (Peter Shieh).
  65
  66             1068          Compaq 386/25, SCO Xenix 386,
  67                           Metaware High C.  (Robert Wenig).
  68
  69             1064.0        Sun 3/80, SunOS 4.0.3 Beta C
  70                           "-O3 -f68881 /usr/lib/libm.il".  (James Carrington).
  71
  72             1061.4        Compaq 386/25, SCO Xenix, High C 1.4.
  73                           (James Carrington).
  74
  75             1059.79       Compaq 386/25, 387/25, High C 1.4,
  76                           DOS|Extender 2.2, 387 inline code
  77                           generation.  (Nathan Bender).
  78
  79              777.14       Compaq 386/25, IIT 3C87-25 (387 Compatible),
  80                           High C 1.5, DOS|Extender 2.2, 387 inline
  81                           code generation.  (Nathan Bender).
  82
  83              751          Compaq DeskPro 386/33, High C 1.5 + DOS|Extender,
  84                           387 code generation.  (James Carrington).
  85
  86              431.44       Compaq 386/25, Weitek 3167-25, DOS 3.31,
  87                           High C 1.4, DOS|Extender, Weitek code generation.
  88                           (Nathan Bender).
  89
  90              344.9        Compaq 486/25, Metaware High C 1.6, Phar Lap
  91                           DOS|Extender, in-line floating point.  (Nathan
  92                           Bender).
  93
  94              324.2        Data General Motorola 88000, 16 Mhz, Gnu C.
  95
  96              323.1        Sun 4/280, C, "-O4".  (Eric Hill).
  97
  98              254          Compaq SystemPro 486/33, High C 1.5 + DOS|Extender,
  99                           387 code generation.  (James Carrington).
 100
 101              242.8        Silicon Graphics Personal IRIS, MIPS R2000A,
 102                           12.5 Mhz, "-O3" (highest level optimisation).
 103                           (Mike Zentner).
 104
 105              233.0        Sun SPARCStation 1, C, "-O4", SunOS 4.0.3.
 106                           (Nathan Bender).
 107
 108              187.30       DEC PMAX 3100, MIPS 2000 chip.
 109                           (Robert Wenig).
 110
 111              120.46       Sun SparcStation 2, C, "-O4", SunOS 4.1.1.
 112                           (John Walker).
 113
 114              120.21       DEC 3MAX, MIPS 3000, "-O4".
 115
 116               98.0        Intel i860 experimental environment,
 117                           OS/2, data caching disabled.  (Kern
 118                           Sibbald).
 119
 120               34.9        Silicon Graphics Indigo², MIPS R4400,
 121                           175 Mhz, IRIX 5.2, "-O".
 122
 123               32.4        Pentium 133, Windows NT, Microsoft Visual
 124                           C++ 4.0.
 125
 126               17.25       Silicon Graphics Indigo², MIPS R4400,
 127                           175 Mhz, IRIX 6.5, "-O3".
 128
 129               14.10       Dell Dimension XPS R100, Pentium II 400 MHz,
 130                           Windows 98, Microsoft Visual C 5.0.
 131
 132               10.7        Hewlett-Packard Kayak XU 450Mhz Pentium II,
 133                           Microsoft Visual C++ 6.0, Windows NT 4.0sp3.  (Nathan Bender).
 134
 135                5.09       Sun Ultra 2, UltraSPARC V9, 300 MHz, gcc -O3.
 136
 137                0.846      Dell Inspiron 9100, Pentium 4, 3.4 GHz, gcc -O3.
 138
 139 */
 140
 141 #include <stdio.h>
 142 #include <stdlib.h>
 143 #include <math.h>
 144 #include <string.h>
 145
 146 /*  The  program  may  be  run  with  Float defined as either float or
 147     double.  With IEEE arithmetic, the same answers are generated  for
 148     either floating point mode.  */
 149
 150 #define Float    double            /* Floating point type used in FFT */
 151
 152 #define Asize    256               /* Array edge size */
 153 #define Passes   20                /* Number of FFT/Inverse passes */
 154
 155 #define max(a,b) ((a)>(b)?(a):(b))
 156 #define min(a,b) ((a)<=(b)?(a):(b))
 157
 158 /*
 159
 160         Multi-dimensional fast Fourier transform
 161
 162         Adapted from Press et al., "Numerical Recipes in C".
 163
 164 */
 165
 166 #define SWAP(a,b) tempr=(a); (a)=(b); (b)=tempr
 167
 168 static void fourn(data, nn, ndim, isign)
 169   Float data[];
 170   int nn[], ndim, isign;
 171 {
 172         register int i1, i2, i3;
 173         int i2rev, i3rev, ip1, ip2, ip3, ifp1, ifp2;
 174         int ibit, idim, k1, k2, n, nprev, nrem, ntot;
 175         Float tempi, tempr;
 176         double theta, wi, wpi, wpr, wr, wtemp;
 177
 178         ntot = 1;
 179         for (idim = 1; idim <= ndim; idim++)
 180            ntot *= nn[idim];
 181         nprev = 1;
 182         for (idim = ndim; idim >= 1; idim--) {
 183            n = nn[idim];
 184            nrem = ntot / (n * nprev);
 185            ip1 = nprev << 1;
 186            ip2 = ip1 * n;
 187            ip3 = ip2 * nrem;
 188            i2rev = 1;
 189            for (i2 = 1; i2 <= ip2; i2 += ip1) {
 190               if (i2 < i2rev) {
 191                  for (i1 = i2; i1 <= i2 + ip1 - 2; i1 += 2) {
 192                     for (i3 = i1; i3 <= ip3; i3 += ip2) {
 193                        i3rev = i2rev + i3 - i2;
 194                        SWAP(data[i3], data[i3rev]);
 195                        SWAP(data[i3 + 1], data[i3rev + 1]);
 196                     }
 197                  }
 198               }
 199               ibit = ip2 >> 1;
 200               while (ibit >= ip1 && i2rev > ibit) {
 201                  i2rev -= ibit;
 202                  ibit >>= 1;
 203               }
 204               i2rev += ibit;
 205            }
 206            ifp1 = ip1;
 207            while (ifp1 < ip2) {
 208               ifp2 = ifp1 << 1;
 209               theta = isign * 6.28318530717959 / (ifp2 / ip1);
 210               wtemp = sin(0.5 * theta);
 211               wpr = -2.0 * wtemp * wtemp;
 212               wpi = sin(theta);
 213               wr = 1.0;
 214               wi = 0.0;
 215               for (i3 = 1; i3 <= ifp1; i3 += ip1) {
 216                  for (i1 = i3; i1 <= i3 + ip1 - 2; i1 += 2) {
 217                     for (i2 = i1; i2 <= ip3; i2 += ifp2) {
 218                        k1 = i2;
 219                        k2 = k1 + ifp1;
 220                        tempr = wr * data[k2] - wi * data[k2 + 1];
 221                        tempi = wr * data[k2 + 1] + wi * data[k2];
 222                        data[k2] = data[k1] - tempr;
 223                        data[k2 + 1] = data[k1 + 1] - tempi;
 224                        data[k1] += tempr;
 225                        data[k1 + 1] += tempi;
 226                     }
 227                  }
 228                  wr = (wtemp = wr) * wpr - wi * wpi + wr;
 229                  wi = wi * wpr + wtemp * wpi + wi;
 230               }
 231               ifp1 = ifp2;
 232            }
 233            nprev *= n;
 234         }
 235 }
 236 #undef SWAP
 237
 238 int main()
 239 {
 240         int i, j, k, l, m, npasses = Passes, faedge;
 241         Float *fdata /* , *fd */ ;
 242         static int nsize[] = {0, 0, 0};
 243         long fanum, fasize;
 244         double mapbase, mapscale, /* x, */ rmin, rmax, imin, imax;
 245
 246         faedge = Asize;            /* FFT array edge size */
 247         fanum = faedge * faedge;   /* Elements in FFT array */
 248         fasize = ((fanum + 1) * 2 * sizeof(Float)); /* FFT array size */
 249         nsize[1] = nsize[2] = faedge;
 250
 251         fdata = (Float *) malloc(fasize);
 252         if (fdata == NULL) {
 253            fprintf(stdout, "Can't allocate data array.\n");
 254            exit(1);
 255         }
 256
 257         /*  Generate data array to process.  */
 258
 259 #define Re(x,y) fdata[1 + (faedge * (x) + (y)) * 2]
 260 #define Im(x,y) fdata[2 + (faedge * (x) + (y)) * 2]
 261
 262         memset(fdata, 0, fasize);
 263         for (i = 0; i < faedge; i++) {
 264            for (j = 0; j < faedge; j++) {
 265               if (((i & 15) == 8) || ((j & 15) == 8))
 266                  Re(i, j) = 128.0;
 267            }
 268         }
 269
 270         for (i = 0; i < npasses; i++) {
 271 /*printf("Pass %d\n", i);*/
 272            /* Transform image to frequency domain. */
 273            fourn(fdata, nsize, 2, 1);
 274
 275            /*  Back-transform to image. */
 276            fourn(fdata, nsize, 2, -1);
 277         }
 278
 279         {
 280            double r, ij, ar, ai;
 281            rmin = 1e10; rmax = -1e10;
 282            imin = 1e10; imax = -1e10;
 283            ar = 0;
 284            ai = 0;
 285
 286            for (i = 1; i <= fanum; i += 2) {
 287               r = fdata[i];
 288               ij = fdata[i + 1];
 289               ar += r;
 290               ai += ij;
 291               rmin = min(r, rmin);
 292               rmax = max(r, rmax);
 293               imin = min(ij, imin);
 294               imax = max(ij, imax);
 295            }
 296 #ifdef DEBUG
 297            printf("Real min %.4g, max %.4g.  Imaginary min %.4g, max %.4g.\n",
 298               rmin, rmax, imin, imax);
 299            printf("Average real %.4g, imaginary %.4g.\n",
 300               ar / fanum, ai / fanum);
 301 #endif
 302            mapbase = rmin;
 303            mapscale = 255 / (rmax - rmin);
 304         }
 305
 306         /* See if we got the right answers. */
 307
 308         m = 0;
 309         for (i = 0; i < faedge; i++) {
 310            for (j = 0; j < faedge; j++) {
 311               k = (Re(i, j) - mapbase) * mapscale;
 312               l = (((i & 15) == 8) || ((j & 15) == 8)) ? 255 : 0;
 313               if (k != l) {
 314                  m++;
 315                  fprintf(stdout,
 316                     "Wrong answer at (%d,%d)!  Expected %d, got %d.\n",
 317                     i, j, l, k);
 318               }
 319            }
 320         }
 321         if (m == 0) {
 322            fprintf(stdout, "%d passes.  No errors in results.\n", npasses);
 323         } else {
 324            fprintf(stdout, "%d passes.  %d errors in results.\n",
 325               npasses, m);
 326         }
 327
 328 #ifdef CAPOUT
 329
 330         /* Output the result of the transform as a CA Lab pattern
 331            file for debugging. */
 332
 333         {
 334 #define SCRX  322
 335 #define SCRY  200
 336 #define SCRN  (SCRX * SCRY)
 337            unsigned char patarr[SCRY][SCRX];
 338            FILE *fp;
 339
 340 /*  Map user external state numbers to internal state index  */
 341
 342 #define UtoI(x)     (((((x) >> 1) & 0x7F) | ((x) << 7)) & 0xFF)
 343
 344            /* Copy data from FFT buffer to map. */
 345
 346            memset(patarr, 0, sizeof patarr);
 347            l = (SCRX - faedge) / 2;
 348            m = (faedge > SCRY) ? 0 : ((SCRY - faedge) / 2);
 349            for (i = 1; i < faedge; i++) {
 350               for (j = 0; j < min(SCRY, faedge); j++) {
 351                  k = (Re(i, j) - mapbase) * mapscale;
 352                  patarr[j + m][i + l] = UtoI(k);
 353               }
 354            }
 355
 356            /* Dump pattern map to file. */
 357
 358            fp = fopen("fft.cap", "w");
 359            if (fp == NULL) {
 360               fprintf(stdout, "Cannot open output file.\n");
 361               exit(0);
 362            }
 363            putc(':', fp);
 364            putc(1, fp);
 365            fwrite(patarr, SCRN, 1, fp);
 366            putc(6, fp);
 367            fclose(fp);
 368         }
 369 #endif
 370
 371         return 0;
 372 }