1 /* { dg-require-effective-target vect_int } */
22 #define N (VECTOR_BITS * 3 / 32 + 4)
27 /* SLP with load permutation and loop-based vectorization. */
28 void foo (int *__restrict__ pInput
, int *__restrict__ pOutput
,
29 int *__restrict__ pInput2
, int *__restrict__ pOutput2
)
33 for (i
= 0; i
< N
/ 3; i
++)
40 *pOutput
++ = M00
* a
+ M01
* b
+ M02
* c
;
41 *pOutput
++ = M10
* a
+ M11
* b
+ M12
* c
;
42 *pOutput
++ = M20
* a
+ M21
* b
+ M22
* c
;
44 /* Loop-based vectorization. */
45 *pOutput2
++ = K00
* d
;
49 int main (int argc
, const char* argv
[])
51 int input
[N
], output
[N
], i
;
52 int input2
[N
], output2
[N
];
56 for (i
= 0; i
< N
; i
++)
62 asm volatile ("" ::: "memory");
66 int check_results
[N
] = {1470, 395, 28271, 5958, 1655, 111653, 10446, 2915, 195035, 14934, 4175, 278417, 19422, 5435, 361799, 0};
67 int check_results2
[N
] = {0, 405, 810, 1215, 1620, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
69 volatile int check_results
[N
] = {};
70 volatile int check_results2
[N
] = {};
72 for (int i
= 0; i
< N
/ 3; i
++)
75 int b
= input
[i
* 3 + 1];
76 int c
= input
[i
* 3 + 2];
79 check_results
[i
* 3] = M00
* a
+ M01
* b
+ M02
* c
;
80 check_results
[i
* 3 + 1] = M10
* a
+ M11
* b
+ M12
* c
;
81 check_results
[i
* 3 + 2] = M20
* a
+ M21
* b
+ M22
* c
;
83 check_results2
[i
] = K00
* d
;
85 asm volatile ("" ::: "memory");
89 foo (input
, output
, input2
, output2
);
92 for (i
= 0; i
< N
; i
++)
93 if (output
[i
] != check_results
[i
] || output2
[i
] != check_results2
[i
])
99 /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target vect_perm } } } */
100 /* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { target { vect_perm3_int || vect_load_lanes } } } } */
101 /* { dg-final { scan-tree-dump "LOAD_LANES" "vect" { target vect_load_lanes } } } */
102 /* { dg-final { scan-tree-dump "STORE_LANES" "vect" { target vect_load_lanes } } } */