3 // FIR FILTER COMPUTED DIRECTLY ON INPUT WITH NO INTERNAL STATE
5 // TWO OUTPUTS PER ITERATION
6 // This program computes a FIR filter without maintaining a buffer of internal state.
8 // This example computes two output samples per inner loop. The following
9 // diagram shows the alignment required for signal x and coefficients c:
11 // c0 c1 c2 c3 c4 -> output z(0) = x0*c0 + x1*c1 + ...
12 // c0 c1 c2 c3 c4 -> z(1) = x1*c0 + x2*c1 + ...
15 // Z(k) = \sum_{n=0}^{L-1} c(n) * x(n+k)    (L = number of coefficients)
19 // Naive, first stab at splitting this for dual MACs.
22 // R(k) = \sum_{n} (x(2n) * y(2n+k)) + \sum_{n} (x(2n-1) * y(2n-1+k))
26 // Alternate, better partitioning for the machine.
29 // R(0) = \sum_{n} x(n) * y(n)
35 // R(1) = \sum_{n} x(n) * y(n+1)
41 // R(2) = \sum_{n} x(n) * y(n+2)
47 // R(3) = \sum_{n} x(n) * y(n+3)
55 // Okay, in this version the inner loop will compute R(2k) and R(2k+1) in parallel
58 // R(2k) = \sum_{n} x(n) * y(n+2k)
64 // R(2k+1) = \sum_{n} x(n) * y(n+2k+1)
70 // Sample pair x1 x0 is loaded into register R0, and coefficients c1 c0
71 // are loaded into register R1:
76 // | c1 c0 | compute two MACs: z(0)+=x0*c0, and z(1)+=x1*c0
78 // Now load x2 into lo half of R0, and compute the next two MACs:
83 // | c1 c0 | compute z(0)+=x1*c1 and z(1)+=x2*c1 (c0 not used)
85 // Meanwhile, load coefficient pair c3 c2 into R2, and x3 into hi half of R0:
90 // | c3 c2 | compute z(0)+=x2*c2 and z(1)+=x3*c2 (c3 not used)
92 // Load x4 into low half of R0:
97 // | c3 c2 | compute z(0)+=x3*c3 and z(1)+=x4*c3 (c2 not used)
99 // //This is a reference FIR function used to test: */
100 //void firf (float input[], float output[], float coeffs[],
101 // long input_size, long coeffs_size)
104 // for (i=0; i < input_size; i++){
106 // for (k=0; k < coeffs_size; k++)
107 // output[i] += input[k+i] * coeffs[k];
// Test setup (reviewer note): includes testutils.inc, clears R0, R1 and R2,
// sets P1 to 128, then uses LSETUP to program loop 0 (L$0 .. L$0end) with a
// count of P1 >> 1 and loop 1 (L$1 .. L$1end) with a count of P2 >> 1.
// Per the comments below, the outer loop yields two outputs per pass and the
// inner loop performs two MACs per pass, hence the halved counts.
// NOTE(review): the initialisation of P2 and of the I0/I1/I2 pointers, and the
// L$0/L$1 labels themselves, are not visible in this excerpt -- confirm them
// against the complete file.
111 .include "testutils.inc"
115 R0 = 0;
R1 = 0;
R2 = 0;
116 P1
= 128 (X
);
// Load loop bounds in
R5, R6, and divide by
2
119 // P0 holds pointer to input data in one memory
120 // bank. Increments by
2 after each inner-loop iter
123 // Pointer to coeffs in alternate memory bank.
126 // Pointer to outputs in any memory bank.
129 // Setup outer do-loop for M
/2 iterations
130 // (2 outputs are computed per pass
)
132 LSETUP
( L$
0 , L$
0end
) LC0
= P1
>> 1;
137 // Set-up inner do-loop for
L/2 iterations
138 // (2 MACs are computed per pass
)
140 LSETUP
( L$
1 , L$
1end
) LC1
= P2
>> 1;
// Dual-MAC arithmetic and result store (reviewer note):
//  * "A1 = A0 = 0" clears both accumulators; issued in parallel with it are a
//    16-bit load into R0.H through I0 and a 32-bit load into R1 through I1,
//    both post-incrementing.
//  * First MAC line:  A1 += R0.H * R1.L and A0 += R0.L * R1.L, while the next
//    16-bit word is loaded into R0.L through I0.
//  * Second MAC line: A1 += R0.L * R1.H and A0 += R0.H * R1.H, while the next
//    16-bit word is loaded into R0.H and the next 32-bit word into R1.
//  * "R0.H = A1, R0.L = A0" packs both accumulators into R0, which is then
//    written through I2 with post-increment.
// NOTE(review): the L$1/L$1end and L$0/L$0end labels that delimit the loops
// are not visible in this excerpt, so which of these instructions sit inside
// the inner loop must be confirmed against the complete file.
142 // Load first two data elements in
r0,
143 // and two coeffs into
r1:
146 A1
= A0
= 0 || R0.H
= W
[ I0
++ ] ||
R1 = [ I1
++ ];
149 A1
+= R0.H
* R1.
L, A0
+= R0.
L * R1.
L || R0.
L = W
[ I0
++ ] ||
NOP;
151 A1
+= R0.
L * R1.H
, A0
+= R0.H
* R1.H || R0.H
= W
[ I0
++ ] ||
R1 = [ I1
++ ];
153 // Line
1: do
2 MACs
and load next data element into RL0.
154 // Line
2: do
2 MACs
, load next data element into RH0
,
155 // and load next
2 coeffs
157 R0.H
= A1
, R0.
L = A0;
159 // advance data pointer by
2 16b elements
163 [ I2
++ ] = R0;
// store
2 outputs
// Result check (reviewer note): reads five consecutive 16-bit words through
// I2 (post-incrementing) into R0.L and hands each one to DBGA together with
// an expected value, in order: 0x0800, 0x1000, 0x2000, 0x1000, 0x0800.
// NOTE(review): DBGA is presumably the simulator's register-compare assertion,
// and the instruction that points I2 back at the start of the output buffer
// is not visible in this excerpt -- confirm both against the complete file.
168 R0.
L = W
[ I2
++ ]; DBGA
( R0.
L , 0x0800 );
169 R0.
L = W
[ I2
++ ]; DBGA
( R0.
L , 0x1000 );
170 R0.
L = W
[ I2
++ ]; DBGA
( R0.
L , 0x2000 );
171 R0.
L = W
[ I2
++ ]; DBGA
( R0.
L , 0x1000 );
172 R0.
L = W
[ I2
++ ]; DBGA
( R0.
L , 0x0800 );
// Table padding (reviewer note): reserves (128-10)*2 = 236 bytes and
// (64-6)*2 = 116 bytes.
// NOTE(review): presumably these pad a 128-entry and a 64-entry table of
// 16-bit words whose initialised entries (10 and 6 respectively) are not
// visible in this excerpt -- confirm against the complete file.
187 .space ((128-10)*2); // must pad with zeros or uninitialized values.
197 .space ((64-6)*2); // must pad with zeros or uninitialized values.