sim/testsuite/bfin/fir.s

   1 # mach: bfin
   2
   3 // FIR FILTER COMPUTED DIRECTLY ON INPUT WITH NO
   4 //   INTERNAL STATE
   5 //   TWO OUTPUTS PER ITERATION
   6 // This program computes a FIR filter without maintaining a buffer of internal
   7 // state.
   8 // This example computes two output samples per inner loop. The following
   9 // diagram shows the alignment required for signal x and coefficients c:
  10 // x0 x1 x2 x3 x4 x5
  11 // c0 c1 c2 c3 c4      -> output z(0)=x0*c0 + x1*c1 + ...
  12 //    c0 c1 c2 c3 c4   ->        z(1)=x1*c0 + x2*c1 + ...
  13 //             L-1
  14 //               ---
  15 //      Z(k) =   \   c(n) * x(n+k)
  16 //               /
  17 //               ---
  18 //                     n=0
  19 // Naive, first stab at splitting this for dual MACs.
  20 //             L/2-1                     L/2-1
  21 //               ---                       ---
  22 //      R(k) =   \   (x(2n) * y(2n+k))  +  \   (x(2n+1) * y(2n+1+k))
  23 //               /                         /
  24 //               ---                       ---
  25 //                     n=0                       n=0
  26 // Alternate, better partitioning for the machine.
  27 //             L-1
  28 //               ---
  29 //      R(0) =   \   x(n) * y(n)
  30 //               /
  31 //               ---
  32 //               n=0
  33 //             L-1
  34 //               ---
  35 //      R(1) =   \   x(n) * y(n+1)
  36 //               /
  37 //               ---
  38 //              n=0
  39 //             L-1
  40 //               ---
  41 //      R(2) =   \   x(n) * y(n+2)
  42 //               /
  43 //               ---
  44 //              n=0
  45 //             L-1
  46 //               ---
  47 //      R(3) =   \   x(n) * y(n+3)
  48 //               /
  49 //               ---
  50 //               n=0
  51 //              .
  52 //              .
  53 //              .
  54 //              .
  55 // Okay, in this version the inner loop will compute R(2k) and R(2k+1) in parallel
  56 //             L-1
  57 //               ---
  58 //     R(2k) =   \   x(n) * y(n+2k)
  59 //               /
  60 //               ---
  61 //              n=0
  62 //             L-1
  63 //               ---
  64 //   R(2k+1) =   \   x(n) * y(n+2k+1)
  65 //               /
  66 //               ---
  67 //              n=0
  68 // Implementation
  69 // --------------
  70 // Sample pair x1 x0 is loaded into register R0, and coefficients c1 c0
  71 // is loaded into register R1:
  72 // +-------+ R0
  73 // | x1 x0 |
  74 // +-------+
  75 // +-------+ R1
  76 // | c1 c0 |  compute two MACs: z(0)+=x0*c0, and z(1)+=x1*c0
  77 // +-------+
  78 // Now load x2 into lo half of R0, and compute the next two MACs:
  79 // +-------+ R0
  80 // | x1 x2 |
  81 // +-------+
  82 // +-------+ R1
  83 // | c1 c0 |    compute z(0)+=x1*c1 and z(1)+=x2*c1 (c0 not used)
  84 // +-------+
  85 // Meanwhile, load coefficient pair c3 c2 into R1, and x3 into hi half of R0:
  86 // +-------+ R0
  87 // | x3 x2 |
  88 // +-------+
  89 // +-------+ R1
  90 // | c3 c2 |    compute z(0)+=x2*c2 and z(1)+=x3*c2 (c3 not used)
  91 // +-------+
  92 // Load x4 into low half of R0:
  93 // +-------+ R0
  94 // | x3 x4 |
  95 // +-------+
  96 // +-------+ R1
  97 // | c3 c2 |    compute z(0)+=x3*c3 and z(1)+=x4*c3 (c2 not used)
  98 // +-------+
  99 // This is a reference FIR function used to test:
 100 //void firf (float input[], float  output[], float coeffs[],
 101 //           long input_size, long coeffs_size)
 102 //{
 103 //  long i, k;
 104 //  for(i=0;    i< input_size; i++){
 105 //    output[i] = 0;
 106 //    for(k=0;  k < coeffs_size; k++)
 107 //      output[i] += input[k+i] * coeffs[k];
 108 // }
 109 //}
 110
  111 .include "testutils.inc"
  112         start
  113         // Register roles: P0 = input window base, I0 = sample pointer, I1 = coef pointer,
  114         // I2 = output pointer; A0/A1 accumulate the even/odd output of each pair.
  115         R0 = 0; R1 = 0; R2 = 0;
  116         P1 = 128 (X);   // Loop bounds: P1 = M outputs, P2 = L coeffs; each is halved (>> 1) at its LSETUP
  117         P2 = 64 (X);
  118         // NOTE(review): start/loadsym/pass are not defined here; presumably macros from testutils.inc.
  119         // P0 holds pointer to input data in one memory bank.  It advances
  120         // by two 16-bit samples (P0 += 4) at the end of each outer-loop pass.
  121         loadsym P0, input;
  122
  123         // Pointer to coeffs in alternate memory bank (loaded again at L$0 on every pass).
  124         loadsym I1, coef;
  125
  126         // Pointer to outputs in any memory bank.
  127         loadsym I2, output;
  128
  129         // Setup outer do-loop for M/2 iterations
  130         // (2 outputs are computed per pass)
  131
  132         LSETUP ( L$0 , L$0end ) LC0 = P1 >> 1;
  133         // Outer loop body: restart I1 at coef and I0 at the current input position P0.
  134 L$0:
  135         loadsym I1, coef;
  136         I0 = P0;
  137                 // Set-up inner do-loop for L/2 iterations
  138                 // (2 MACs per accumulator are computed per pass)
  139
  140         LSETUP ( L$1 , L$1end ) LC1 = P2 >> 1;
  141
  142                 // Load first two data elements in r0,
  143                 // and two coeffs into r1:
  144                 // (the second instruction also clears both accumulators)
  145         R0.L = W [ I0 ++ ];
  146         A1 = A0 = 0 || R0.H = W [ I0 ++ ] || R1 = [ I1 ++ ];
  147         // NOTE(review): MACs presumably use the R0/R1 values from before the same-line loads - confirm.
  148 L$1:
  149         A1 += R0.H * R1.L, A0 += R0.L * R1.L || R0.L = W [ I0 ++ ] || NOP;
  150 L$1end:
  151         A1 += R0.L * R1.H, A0 += R0.H * R1.H || R0.H = W [ I0 ++ ] || R1 = [ I1 ++ ];
  152
  153         // L$1 line:    do 2 MACs and load next data element into R0.L.
  154         // L$1end line: do 2 MACs, load next data element into R0.H,
  155         // and load next 2 coeffs into R1.
  156         // Pack the two results into one register: A1 -> R0.H, A0 -> R0.L.
  157         R0.H = A1, R0.L = A0;
  158
  159                 // advance data pointer by 2 16b elements
  160         P0 += 4;
  161         // L$0end is the last instruction of the outer loop (see LSETUP for LC0 above).
  162 L$0end:
  163         [ I2 ++ ] = R0; // store 2 outputs
  164         // NOTE(review): later passes read past the 128-sample input (L+2 reads per pass); confirm this is intended.
  165         // Check results: rewind I2 to the start of the output buffer.
  166         loadsym I2, output;
  167         // Only the first five 16-bit outputs are compared; DBGA presumably fails the test on mismatch.
  168         R0.L = W [ I2 ++ ];     DBGA ( R0.L , 0x0800 );
  169         R0.L = W [ I2 ++ ];     DBGA ( R0.L , 0x1000 );
  170         R0.L = W [ I2 ++ ];     DBGA ( R0.L , 0x2000 );
  171         R0.L = W [ I2 ++ ];     DBGA ( R0.L , 0x1000 );
  172         R0.L = W [ I2 ++ ];     DBGA ( R0.L , 0x0800 );
  173         pass
 174
  175         .data
  176 input:  // Input signal x(n), read 16 bits at a time via W [ I0 ++ ].
  177         .dw 0x0000
  178         .dw 0x0000
  179         .dw 0x0000
  180         .dw 0x0000
  181         .dw 0x4000      // x(4): the only nonzero input sample
  182         .dw 0x0000
  183         .dw 0x0000
  184         .dw 0x0000
  185         .dw 0x0000
  186         .dw 0x0000
  187         .space ((128-10)*2);    // must pad with zeros or uninitialized values: reserves (128-10)*2 bytes after the 10 explicit entries.
 188
  189         .data
  190 coef:   // Filter coefficients c(n), read as pairs via R1 = [ I1 ++ ].
  191         .dw 0x1000      // c(0)
  192         .dw 0x2000      // c(1)
  193         .dw 0x4000      // c(2)
  194         .dw 0x2000      // c(3)
  195         .dw 0x1000      // c(4)
  196         .dw 0x0000      // c(5)
  197         .space ((64-6)*2);      // must pad with zeros or uninitialized values: reserves (64-6)*2 bytes after the 6 explicit entries.
 198
  199         .data
  200 output: // Result buffer; each outer-loop pass stores one 32-bit word (two 16-bit outputs) via [ I2 ++ ].
  201         .space (128*4)  // 512 bytes reserved. NOTE(review): the visible loop runs 64 passes, storing 256 bytes.