/* { dg-do compile } */
5 transpose_vmx (vector
signed short *input
, vector
signed short *output
)
7 vector
signed short v0
, v1
, v2
, v3
, v4
, v5
, v6
, v7
;
8 vector
signed short x0
, x1
, x2
, x3
, x4
, x5
, x6
, x7
;
10 /* Matrix transpose */
11 v0
= vec_mergeh (input
[0], input
[4]);
12 v1
= vec_mergel (input
[0], input
[4]);
13 v2
= vec_mergeh (input
[1], input
[5]);
14 v3
= vec_mergel (input
[1], input
[5]);
15 v4
= vec_mergeh (input
[2], input
[6]);
16 v5
= vec_mergel (input
[2], input
[6]);
17 v6
= vec_mergeh (input
[3], input
[7]);
18 v7
= vec_mergel (input
[3], input
[7]);
20 x0
= vec_mergeh (v0
, v4
);
21 x1
= vec_mergel (v0
, v4
);
22 x2
= vec_mergeh (v1
, v5
);
23 x3
= vec_mergel (v1
, v5
);
24 x4
= vec_mergeh (v2
, v6
);
25 x5
= vec_mergel (v2
, v6
);
26 x6
= vec_mergeh (v3
, v7
);
27 x7
= vec_mergel (v3
, v7
);
29 output
[0] = vec_mergeh (x0
, x4
);
30 output
[1] = vec_mergel (x0
, x4
);
31 output
[2] = vec_mergeh (x1
, x5
);
32 output
[3] = vec_mergel (x1
, x5
);
33 output
[4] = vec_mergeh (x2
, x6
);
34 output
[5] = vec_mergel (x2
, x6
);
35 output
[6] = vec_mergeh (x3
, x7
);
36 output
[7] = vec_mergel (x3
, x7
);
40 dct_vmx (vector
signed short *input
, vector
signed short *output
,
41 vector
signed short *postscale
)
43 vector
signed short mul0
, mul1
, mul2
, mul3
, mul4
, mul5
, mul6
, mul
;
44 vector
signed short v0
, v1
, v2
, v3
, v4
, v5
, v6
, v7
, v8
, v9
;
45 vector
signed short v20
, v21
, v22
, v23
, v24
, v25
, v26
, v27
, v31
;
47 vector
signed short in
[8], out
[8];
49 /* Load first eight rows of input data */
51 /* Load multiplication constants */
53 /* Splat multiplication constants */
54 mul0
= vec_splat(input
[8],0);
55 mul1
= vec_splat(input
[8],1);
56 mul2
= vec_splat(input
[8],2);
57 mul3
= vec_splat(input
[8],3);
58 mul4
= vec_splat(input
[8],4);
59 mul5
= vec_splat(input
[8],5);
60 mul6
= vec_splat(input
[8],6);
62 /* Perform DCT on the eight columns */
64 /*********** Stage 1 ***********/
66 v8
= vec_adds (input
[0], input
[7]);
67 v9
= vec_subs (input
[0], input
[7]);
68 v0
= vec_adds (input
[1], input
[6]);
69 v7
= vec_subs (input
[1], input
[6]);
70 v1
= vec_adds (input
[2], input
[5]);
71 v6
= vec_subs (input
[2], input
[5]);
72 v2
= vec_adds (input
[3], input
[4]);
73 v5
= vec_subs (input
[3], input
[4]);
75 /*********** Stage 2 ***********/
78 v3
= vec_adds (v8
, v2
); /* (V0+V7) + (V3+V4) */
79 v4
= vec_subs (v8
, v2
); /* (V0+V7) - (V3+V4) */
80 v2
= vec_adds (v0
, v1
); /* (V1+V6) + (V2+V5) */
81 v8
= vec_subs (v0
, v1
); /* (V1+V6) - (V2+V5) */
84 v0
= vec_subs (v7
, v6
); /* (V1-V6) - (V2-V5) */
85 v1
= vec_adds (v7
, v6
); /* (V1-V6) + (V2-V5) */
87 /*********** Stage 3 ***********/
90 in
[0] = vec_adds (v3
, v2
); /* y0 = v3 + v2 */
91 in
[4] = vec_subs (v3
, v2
); /* y4 = v3 - v2 */
92 in
[2] = vec_mradds (v8
, mul2
, v4
); /* y2 = v8 * a0 + v4 */
93 v6
= vec_mradds (v4
, mul2
, mul6
);
94 in
[6] = vec_subs (v6
, v8
); /* y6 = v4 * a0 - v8 */
97 v6
= vec_mradds (v0
, mul0
, v5
); /* v6 = v0 * (c4) + v5 */
98 v7
= vec_mradds (v0
, mul4
, v5
); /* v7 = v0 * (-c4) + v5 */
99 v2
= vec_mradds (v1
, mul4
, v9
); /* v2 = v1 * (-c4) + v9 */
100 v3
= vec_mradds (v1
, mul0
, v9
); /* v3 = v1 * (c4) + v9 */
102 /*********** Stage 4 ***********/
105 in
[1] = vec_mradds (v6
, mul3
, v3
); /* y1 = v6 * (a1) + v3 */
106 v23
= vec_mradds (v3
, mul3
, mul6
);
107 in
[7] = vec_subs (v23
, v6
); /* y7 = v3 * (a1) - v6 */
108 in
[5] = vec_mradds (v2
, mul1
, v7
); /* y5 = v2 * (a2) + v7 */
109 in
[3] = vec_mradds (v7
, mul5
, v2
); /* y3 = v7 * (-a2) + v2 */
111 transpose_vmx (in
, out
);
113 /* Perform DCT on the eight rows */
115 /*********** Stage 1 ***********/
117 v8
= vec_adds (out
[0], out
[7]);
118 v9
= vec_subs (out
[0], out
[7]);
119 v0
= vec_adds (out
[1], out
[6]);
120 v7
= vec_subs (out
[1], out
[6]);
121 v1
= vec_adds (out
[2], out
[5]);
122 v6
= vec_subs (out
[2], out
[5]);
123 v2
= vec_adds (out
[3], out
[4]);
124 v5
= vec_subs (out
[3], out
[4]);
126 /*********** Stage 2 ***********/
129 v3
= vec_adds (v8
, v2
); /* (V0+V7) + (V3+V4) */
130 v4
= vec_subs (v8
, v2
); /* (V0+V7) - (V3+V4) */
131 v2
= vec_adds (v0
, v1
); /* (V1+V6) + (V2+V5) */
132 v8
= vec_subs (v0
, v1
); /* (V1+V6) - (V2+V5) */
135 v0
= vec_subs (v7
, v6
); /* (V1-V6) - (V2-V5) */
136 v1
= vec_adds (v7
, v6
); /* (V1-V6) + (V2-V5) */
138 /*********** Stage 3 ***********/
141 v25
= vec_subs (v25
, v25
); /* reinit v25 = 0 */
143 v20
= vec_adds (v3
, v2
); /* y0 = v3 + v2 */
144 v24
= vec_subs (v3
, v2
); /* y4 = v3 - v2 */
145 v22
= vec_mradds (v8
, mul2
, v4
); /* y2 = v8 * a0 + v4 */
146 v6
= vec_mradds (v4
, mul2
, v25
);
147 v26
= vec_subs (v6
, v8
); /* y6 = v4 * a0 - v8 */
150 v6
= vec_mradds (v0
, mul0
, v5
); /* v6 = v0 * (c4) + v5 */
151 v7
= vec_mradds (v0
, mul4
, v5
); /* v7 = v0 * (-c4) + v5 */
152 v2
= vec_mradds (v1
, mul4
, v9
); /* v2 = v1 * (-c4) + v9 */
153 v3
= vec_mradds (v1
, mul0
, v9
); /* v3 = v1 * (c4) + v9 */
155 /*********** Stage 4 ***********/
158 v21
= vec_mradds (v6
, mul3
, v3
); /* y1 = v6 * (a1) + v3 */
159 v23
= vec_mradds (v3
, mul3
, v25
);
160 v27
= vec_subs (v23
, v6
); /* y7 = v3 * (a1) - v6 */
161 v25
= vec_mradds (v2
, mul1
, v7
); /* y5 = v2 * (a2) + v7 */
162 v23
= vec_mradds (v7
, mul5
, v2
); /* y3 = v7 * (-a2) + v2 */
164 /* Post-scale and store reults */
166 v31
= vec_subs (v31
, v31
); /* reinit v25 = 0 */
168 output
[0] = vec_mradds (postscale
[0], v20
, v31
);
169 output
[2] = vec_mradds (postscale
[2], v22
, v31
);
170 output
[4] = vec_mradds (postscale
[4], v24
, v31
);
171 output
[6] = vec_mradds (postscale
[6], v26
, v31
);
172 output
[1] = vec_mradds (postscale
[1], v21
, v31
);
173 output
[3] = vec_mradds (postscale
[3], v23
, v31
);
174 output
[5] = vec_mradds (postscale
[5], v25
, v31
);
175 output
[7] = vec_mradds (postscale
[7], v27
, v31
);