ld: Move the .note.build-id section to near the start of the memory map.
[binutils-gdb.git] / sim / testsuite / bfin / fir.s
blob0ba4d2f3b9362d92778985fdc9e95f0e74646cb9
1 # mach: bfin
3 // FIR FILTER COMPUTED DIRECTLY ON INPUT WITH NO
4 // INTERNAL STATE
5 // TWO OUTPUTS PER ITERATION
6 // This program computes a FIR filter without maintaining a buffer of internal
7 // state.
8 // This example computes two output samples per inner loop. The following
9 // diagram shows the alignment required for signal x and coefficients c:
10 //   x0 x1 x2 x3 x4 x5
11 //   c0 c1 c2 c3 c4       -> output z(0)=x0*c0 + x1*c1 + ...
12 //      c0 c1 c2 c3 c4    -> output z(1)=x1*c0 + x2*c1 + ...
13 //          L-1
14 //          ---
15 //   Z(k) = \    c(n) * x(n+k)
16 //          /
17 //          ---
18 //          n=0
19 // Naive, first stab at splitting this for dual MACs.
20 //          L/2-1                        L/2-1
21 //          ---                          ---
22 //   R(k) = \    (x(2n) * y(2n+k))   +   \    (x(2n+1) * y(2n+1+k))
23 //          /                            /
24 //          ---                          ---
25 //          n=0                          n=0
26 // Alternate, better partitioning for the machine.
27 //          L-1
28 //          ---
29 //   R(0) = \    x(n) * y(n)
30 //          /
31 //          ---
32 //          n=0
33 //          L-1
34 //          ---
35 //   R(1) = \    x(n) * y(n+1)
36 //          /
37 //          ---
38 //          n=0
39 //          L-1
40 //          ---
41 //   R(2) = \    x(n) * y(n+2)
42 //          /
43 //          ---
44 //          n=0
45 //          L-1
46 //          ---
47 //   R(3) = \    x(n) * y(n+3)
48 //          /
49 //          ---
50 //          n=0
51 // .
52 // .
53 // .
54 // .
55 // Okay, in this version the inner loop will compute R(2k) and R(2k+1) in parallel
56 //           L-1
57 //           ---
58 //   R(2k) = \    x(n) * y(n+2k)
59 //           /
60 //           ---
61 //           n=0
62 //             L-1
63 //             ---
64 //   R(2k+1) = \    x(n) * y(n+2k+1)
65 //             /
66 //             ---
67 //             n=0
68 // Implementation
69 // --------------
70 // Sample pair x1 x0 is loaded into register R0, and coefficients c1 c0
71 // are loaded into register R1:
72 // +-------+ R0
73 // | x1 x0 |
74 // +-------+
75 // +-------+ R1
76 // | c1 c0 | compute two MACs: z(0)+=x0*c0, and z(1)+=x1*c0
77 // +-------+
78 // Now load x2 into lo half of R0, and compute the next two MACs:
79 // +-------+ R0
80 // | x1 x2 |
81 // +-------+
82 // +-------+ R1
83 // | c1 c0 | compute z(0)+=x1*c1 and z(1)+=x2*c1 (c0 not used)
84 // +-------+
85 // Meanwhile, load coefficient pair c3 c2 into R2, and x3 into hi half of R0:
86 // +-------+ R0
87 // | x3 x2 |
88 // +-------+
89 // +-------+ R2
90 // | c3 c2 | compute z(0)+=x2*c2 and z(1)+=x3*c2 (c3 not used)
91 // +-------+
92 // Load x4 into low half of R0:
93 // +-------+ R0
94 // | x3 x4 |
95 // +-------+
96 // +-------+ R2
97 // | c3 c2 | compute z(0)+=x3*c3 and z(1)+=x4*c3 (c2 not used)
98 // +-------+
99 // This is a reference FIR function used to test:
100 //void firf (float input[], float output[], float coeffs[],
101 // long input_size, long coeffs_size)
103 // long i, k;
104 // for(i=0; i< input_size; i++){
105 // output[i] = 0;
106 // for(k=0; k < coeffs_size; k++)
107 // output[i] += input[k+i] * coeffs[k];
108 // }
// Test program: runs the two-outputs-per-pass FIR kernel over `input` using
// the coefficients at `coef`, stores the results at `output`, then checks
// the first five 16-bit outputs.
// NOTE(review): `start`, `loadsym` and `pass` are not defined in this file;
// presumably they are macros provided by testutils.inc -- confirm there.
111 .include "testutils.inc"
112 start
115 R0 = 0; R1 = 0; R2 = 0;
116 P1 = 128 (X); // Loop bounds: P1 feeds the outer loop, P2 the inner loop; each LSETUP below shifts its bound right by 1
117 P2 = 64 (X);
119 // P0 holds pointer to input data in one memory
120 // bank. Advances by 4 bytes (two 16-bit elements) after each outer-loop pass
121 loadsym P0, input;
123 // Pointer to coeffs in alternate memory bank.
// (I1 is loaded again with the same symbol at the top of every outer pass, at L$0.)
124 loadsym I1, coef;
126 // Pointer to outputs in any memory bank.
127 loadsym I2, output;
129 // Setup outer do-loop for M/2 iterations
130 // (2 outputs are computed per pass); loop count is P1 >> 1
132 LSETUP ( L$0 , L$0end ) LC0 = P1 >> 1;
134 L$0:
// Start of one outer pass: rewind the coefficient pointer and start the
// sample pointer I0 from the current window position held in P0.
135 loadsym I1, coef;
136 I0 = P0;
137 // Set-up inner do-loop for L/2 iterations; loop count is P2 >> 1
138 // (each pass issues two dual-MAC instructions, i.e. 2 MACs per output)
140 LSETUP ( L$1 , L$1end ) LC1 = P2 >> 1;
142 // Load first two data elements in r0,
143 // and two coeffs into r1; the same instruction also clears both accumulators:
145 R0.L = W [ I0 ++ ];
146 A1 = A0 = 0 || R0.H = W [ I0 ++ ] || R1 = [ I1 ++ ];
148 L$1:
149 A1 += R0.H * R1.L, A0 += R0.L * R1.L || R0.L = W [ I0 ++ ] || NOP;
150 L$1end:
151 A1 += R0.L * R1.H, A0 += R0.H * R1.H || R0.H = W [ I0 ++ ] || R1 = [ I1 ++ ];
153 // Loop line 1 (at L$1): do 2 MACs and load next data element into R0.L.
154 // Loop line 2 (at L$1end): do 2 MACs, load next data element into R0.H,
155 // and load next 2 coeffs into R1
// After the inner loop: move the two accumulated sums into the halves of R0.
157 R0.H = A1, R0.L = A0;
159 // advance data pointer by 2 16b elements
160 P0 += 4;
162 L$0end:
163 [ I2 ++ ] = R0; // store 2 outputs (one 32-bit store per outer pass)
165 // Check results: re-point I2 at the start of the output buffer and compare
// the first five 16-bit outputs against the expected constants via DBGA.
// With the test data in this file (a single 0x4000 sample at input[4] and
// coefficients 0x1000 0x2000 0x4000 0x2000 0x1000) each expected value is
// one of those coefficients halved -- presumably because the MACs are
// fractional multiplies; TODO confirm the multiply mode.
166 loadsym I2, output;
168 R0.L = W [ I2 ++ ]; DBGA ( R0.L , 0x0800 );
169 R0.L = W [ I2 ++ ]; DBGA ( R0.L , 0x1000 );
170 R0.L = W [ I2 ++ ]; DBGA ( R0.L , 0x2000 );
171 R0.L = W [ I2 ++ ]; DBGA ( R0.L , 0x1000 );
172 R0.L = W [ I2 ++ ]; DBGA ( R0.L , 0x0800 );
173 pass
// Input signal: ten explicit 16-bit entries (a single 0x4000 sample at
// index 4, everything else zero) followed by reserved space.
// NOTE(review): going by the loop bounds in the code above (64 outer passes,
// each starting 4 bytes further on and reading 2 + 2*32 halfwords), later
// passes appear to read beyond the 128 halfwords reserved here. Only the
// first five outputs are checked -- confirm this is intended.
175 .data
176 input:
177 .dw 0x0000
178 .dw 0x0000
179 .dw 0x0000
180 .dw 0x0000
181 .dw 0x4000
182 .dw 0x0000
183 .dw 0x0000
184 .dw 0x0000
185 .dw 0x0000
186 .dw 0x0000
187 .space ((128-10)*2); // reserves the remaining 118 of 128 halfwords; must pad with zeros or uninitialized values.
// Filter coefficients: five non-zero taps (symmetric), one explicit zero,
// then reserved space up to 64 halfwords in total.
// NOTE(review): the kernel loads coefficients as 32-bit pairs, once before
// the inner loop and once per inner pass (32 passes), which looks like one
// pair more than the 32 pairs reserved here -- confirm.
189 .data
190 coef:
191 .dw 0x1000
192 .dw 0x2000
193 .dw 0x4000
194 .dw 0x2000
195 .dw 0x1000
196 .dw 0x0000
197 .space ((64-6)*2); // reserves the remaining 58 of 64 halfwords; must pad with zeros or uninitialized values.
// Output buffer: 128*4 bytes reserved. The code above performs one 32-bit
// store (two 16-bit results) per outer pass.
199 .data
200 output:
201 .space (128*4)