1 /* { dg-require-effective-target vect_int } */
2 /* { dg-require-effective-target vect_perm } */
3 /* { dg-additional-options "--param tree-reassoc-width=1" } */
/* Number of elements in each test array.  The loops below that step in
   groups of five run N / 5 times, i.e. VECTOR_BITS / 32 groups with this
   definition.  NOTE(review): VECTOR_BITS is defined outside this view, and
   this #define may sit inside a preprocessor conditional that is not
   visible here -- confirm against the full file.  */
39 #define N (VECTOR_BITS * 5 / 32)
/* Kernel under test: multiply groups of five values by the 5x5 coefficient
   matrix M00..M44 (defined outside this view).

   The loop runs N / 5 times.  In each iteration five results are stored
   through pOutput and then five more through pOutput2; result k is
   Mk0 * a + Mk1 * b + Mk2 * c + Mk3 * d + Mk4 * e.

   NOTE(review): this listing does not show the braces or the statements
   that assign a, b, c, d and e (the embedded original line numbers skip
   52-58 and 64-71).  Presumably they are loaded from pInput before the
   first group of stores and from pInput2 before the second -- confirm
   against the full file.  */
44 void foo (unsigned int *__restrict__ pInput
,
45 unsigned int *__restrict__ pOutput
,
46 unsigned int *__restrict__ pInput2
,
47 unsigned int *__restrict__ pOutput2
)
49 unsigned int i
, a
, b
, c
, d
, e
;
/* One iteration per group of five elements.  */
51 for (i
= 0; i
< N
/ 5; i
++)
/* First group: matrix rows 0..4 applied to (a, b, c, d, e), written to
   five consecutive slots of pOutput.  */
59 *pOutput
++ = M00
* a
+ M01
* b
+ M02
* c
+ M03
* d
+ M04
* e
;
60 *pOutput
++ = M10
* a
+ M11
* b
+ M12
* c
+ M13
* d
+ M14
* e
;
61 *pOutput
++ = M20
* a
+ M21
* b
+ M22
* c
+ M23
* d
+ M24
* e
;
62 *pOutput
++ = M30
* a
+ M31
* b
+ M32
* c
+ M33
* d
+ M34
* e
;
63 *pOutput
++ = M40
* a
+ M41
* b
+ M42
* c
+ M43
* d
+ M44
* e
;
/* Second group: the same five matrix rows, written to five consecutive
   slots of pOutput2.  The values of a..e used here are set by lines not
   shown in this listing.  */
72 *pOutput2
++ = M00
* a
+ M01
* b
+ M02
* c
+ M03
* d
+ M04
* e
;
73 *pOutput2
++ = M10
* a
+ M11
* b
+ M12
* c
+ M13
* d
+ M14
* e
;
74 *pOutput2
++ = M20
* a
+ M21
* b
+ M22
* c
+ M23
* d
+ M24
* e
;
75 *pOutput2
++ = M30
* a
+ M31
* b
+ M32
* c
+ M33
* d
+ M34
* e
;
76 *pOutput2
++ = M40
* a
+ M41
* b
+ M42
* c
+ M43
* d
+ M44
* e
;
/* Test driver: obtains the expected values, calls foo on the two
   input/output array pairs, and compares every element of output and
   output2 with check_results and check_results2.

   NOTE(review): this listing omits many original lines (braces, the
   statements that fill the arrays, the tails of the initializer lists, and
   the action taken on a mismatch).  check_results and check_results2 are
   each declared twice below; presumably the two sets of declarations live
   in alternative preprocessor branches that are not visible here --
   confirm against the full file.  */
81 int main (int argc
, const char* argv
[])
83 unsigned int input
[N
], output
[N
], i
, input2
[N
], output2
[N
];
/* Loop over all N elements; the statements of its body other than the
   empty asm are not visible in this listing.  The empty volatile asm is
   presumably there to keep the compiler from vectorizing this loop --
   TODO confirm.  */
87 for (i
= 0; i
< N
; i
++)
93 __asm__
volatile ("");
/* Expected values given as literal tables (the initializer lists are
   truncated in this listing).  */
97 unsigned int check_results
[N
]
98 = { 3208, 1334, 28764, 35679, 2789, 13028, 4754, 168364, 91254, 12399,
99 22848, 8174, 307964, 146829, 22009, 32668, 11594, 447564, 202404,
101 unsigned int check_results2
[N
]
102 = { 7136, 2702, 84604, 57909, 6633, 16956, 6122, 224204, 113484, 16243,
103 26776, 9542, 363804, 169059, 25853, 36596, 12962, 503404, 224634,
/* Expected values computed at run time instead, using the same row
   formulas that foo uses.  */
106 volatile unsigned int check_results
[N
];
107 volatile unsigned int check_results2
[N
];
/* One iteration per group of five elements.  */
109 for (i
= 0; i
< N
/ 5; i
++)
/* Load the five values of group i from the first input array.  */
111 unsigned int a
= input
[i
* 5];
112 unsigned int b
= input
[i
* 5 + 1];
113 unsigned int c
= input
[i
* 5 + 2];
114 unsigned int d
= input
[i
* 5 + 3];
115 unsigned int e
= input
[i
* 5 + 4];
/* Reference results for the first array: matrix rows 0..4.  */
117 check_results
[i
* 5] = M00
* a
+ M01
* b
+ M02
* c
+ M03
* d
+ M04
* e
;
118 check_results
[i
* 5 + 1] = (M10
* a
+ M11
* b
+ M12
* c
119 + M13
* d
+ M14
* e
);
120 check_results
[i
* 5 + 2] = (M20
* a
+ M21
* b
+ M22
* c
121 + M23
* d
+ M24
* e
);
122 check_results
[i
* 5 + 3] = (M30
* a
+ M31
* b
+ M32
* c
123 + M33
* d
+ M34
* e
);
124 check_results
[i
* 5 + 4] = (M40
* a
+ M41
* b
+ M42
* c
125 + M43
* d
+ M44
* e
);
/* Reload b..e from the second input array.  NOTE(review): the matching
   reload of a is not visible here (the embedded line numbers skip 126-127);
   presumably it reads input2[i * 5] -- confirm against the full file.  */
128 b
= input2
[i
* 5 + 1];
129 c
= input2
[i
* 5 + 2];
130 d
= input2
[i
* 5 + 3];
131 e
= input2
[i
* 5 + 4];
/* Reference results for the second array: the same matrix rows.  */
133 check_results2
[i
* 5] = M00
* a
+ M01
* b
+ M02
* c
+ M03
* d
+ M04
* e
;
134 check_results2
[i
* 5 + 1] = (M10
* a
+ M11
* b
+ M12
* c
135 + M13
* d
+ M14
* e
);
136 check_results2
[i
* 5 + 2] = (M20
* a
+ M21
* b
+ M22
* c
137 + M23
* d
+ M24
* e
);
138 check_results2
[i
* 5 + 3] = (M30
* a
+ M31
* b
+ M32
* c
139 + M33
* d
+ M34
* e
);
140 check_results2
[i
* 5 + 4] = (M40
* a
+ M41
* b
+ M42
* c
141 + M43
* d
+ M44
* e
);
/* Empty asm with a "memory" clobber: a compiler-level barrier that makes
   the compiler assume memory may have been read or written here.  */
143 asm volatile ("" ::: "memory");
/* Run the kernel under test on both input/output pairs.  */
147 foo (input
, output
, input2
, output2
);
/* Compare every element of both outputs with the expected values.  The
   statement executed on a mismatch is not visible in this listing.  */
150 for (i
= 0; i
< N
; i
++)
151 if (output
[i
] != check_results
[i
]
152 || output2
[i
] != check_results2
[i
])
158 /* Currently interleaving is not supported for a group-size of 5. */
160 /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
161 /* { dg-final { scan-tree-dump-times "gaps requires scalar epilogue loop" 0 "vect" } } */
162 /* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" {target { ! { vect_load_lanes && vect_strided5 } } } } } */