1 ; RUN: opt %loadPolly -polly-process-unprofitable -polly-remarks-minimal \
2 ; RUN: -polly-opt-isl -polly-pattern-matching-based-opts=true \
3 ; RUN: -polly-target-throughput-vector-fma=1 \
4 ; RUN: -polly-target-latency-vector-fma=1 \
5 ; RUN: -polly-target-vector-register-bitwidth=4096 \
6 ; RUN: -polly-target-1st-cache-level-associativity=3 -polly-print-ast -disable-output < %s | FileCheck %s
8 ; /* Test that Polly does not crash due to configurations that can lead to
9 ; incorrect tile size computations.
10 ; The parameters are set up such that Car in `getMacroKernelParams`
11 ; evaluates to 0. */
13 ; static const int N = 3000;
15 ; void f(int A[N][N], int B[N][N], int C[N][N]) {
16 ; for (int i = 0; i < N; i++) {
17 ; for (int j = 0; j < N; j++) {
19 ; for (int k = 0; k < N; k++) {
20 ; A[i][j] += B[i][k] * C[k][j];
26 ; CHECK: // 1st level tiling - Tiles
27 ; CHECK-NEXT: for (int c0 = 0; c0 <= 93; c0 += 1)
28 ; CHECK-NEXT: for (int c1 = 0; c1 <= 93; c1 += 1) {
29 ; CHECK-NEXT: // 1st level tiling - Points
30 ; CHECK-NEXT: for (int c2 = 0; c2 <= min(31, -32 * c0 + 2999); c2 += 1)
31 ; CHECK-NEXT: for (int c3 = 0; c3 <= min(31, -32 * c1 + 2999); c3 += 1)
32 ; CHECK-NEXT: Stmt_for_body3(32 * c0 + c2, 32 * c1 + c3);
34 ; CHECK-NEXT: // Register tiling - Tiles
35 ; CHECK-NEXT: for (int c0 = 0; c0 <= 23; c0 += 1)
36 ; CHECK-NEXT: for (int c1 = 0; c1 <= 2999; c1 += 1)
37 ; CHECK-NEXT: for (int c2 = 0; c2 <= 2999; c2 += 1) {
38 ; CHECK-NEXT: // Register tiling - Points
40 ; CHECK-NEXT: Stmt_for_body8(c1, 128 * c0, c2);
41 ; CHECK-NEXT: Stmt_for_body8(c1, 128 * c0 + 1, c2);
42 ; CHECK-NEXT: Stmt_for_body8(c1, 128 * c0 + 2, c2);
43 ; CHECK-NEXT: Stmt_for_body8(c1, 128 * c0 + 3, c2);
44 ; CHECK-NEXT: Stmt_for_body8(c1, 128 * c0 + 4, c2);
45 ; CHECK-NEXT: Stmt_for_body8(c1, 128 * c0 + 5, c2);
46 ; CHECK-NEXT: Stmt_for_body8(c1, 128 * c0 + 6, c2);
47 ; CHECK-NEXT: Stmt_for_body8(c1, 128 * c0 + 7, c2);
48 ; CHECK-NEXT: Stmt_for_body8(c1, 128 * c0 + 8, c2);
49 ; CHECK-NEXT: Stmt_for_body8(c1, 128 * c0 + 9, c2);
50 ; CHECK-NEXT: Stmt_for_body8(c1, 128 * c0 + 10, c2);
51 ; CHECK-NEXT: Stmt_for_body8(c1, 128 * c0 + 11, c2);
52 ; CHECK-NEXT: Stmt_for_body8(c1, 128 * c0 + 12, c2);
53 ; CHECK-NEXT: Stmt_for_body8(c1, 128 * c0 + 13, c2);
54 ; CHECK-NEXT: Stmt_for_body8(c1, 128 * c0 + 14, c2);
55 ; CHECK-NEXT: Stmt_for_body8(c1, 128 * c0 + 15, c2);
56 ; CHECK-NEXT: Stmt_for_body8(c1, 128 * c0 + 16, c2);
57 ; CHECK-NEXT: Stmt_for_body8(c1, 128 * c0 + 17, c2);
58 ; CHECK-NEXT: Stmt_for_body8(c1, 128 * c0 + 18, c2);
59 ; CHECK-NEXT: Stmt_for_body8(c1, 128 * c0 + 19, c2);
60 ; CHECK-NEXT: Stmt_for_body8(c1, 128 * c0 + 20, c2);
61 ; CHECK-NEXT: Stmt_for_body8(c1, 128 * c0 + 21, c2);
62 ; CHECK-NEXT: Stmt_for_body8(c1, 128 * c0 + 22, c2);
63 ; CHECK-NEXT: Stmt_for_body8(c1, 128 * c0 + 23, c2);
64 ; CHECK-NEXT: Stmt_for_body8(c1, 128 * c0 + 24, c2);
65 ; CHECK-NEXT: Stmt_for_body8(c1, 128 * c0 + 25, c2);
66 ; CHECK-NEXT: Stmt_for_body8(c1, 128 * c0 + 26, c2);
67 ; CHECK-NEXT: Stmt_for_body8(c1, 128 * c0 + 27, c2);
68 ; CHECK-NEXT: Stmt_for_body8(c1, 128 * c0 + 28, c2);
69 ; CHECK-NEXT: Stmt_for_body8(c1, 128 * c0 + 29, c2);
70 ; CHECK-NEXT: Stmt_for_body8(c1, 128 * c0 + 30, c2);
71 ; CHECK-NEXT: Stmt_for_body8(c1, 128 * c0 + 31, c2);
72 ; CHECK-NEXT: Stmt_for_body8(c1, 128 * c0 + 32, c2);
73 ; CHECK-NEXT: Stmt_for_body8(c1, 128 * c0 + 33, c2);
74 ; CHECK-NEXT: Stmt_for_body8(c1, 128 * c0 + 34, c2);
75 ; CHECK-NEXT: Stmt_for_body8(c1, 128 * c0 + 35, c2);
76 ; CHECK-NEXT: Stmt_for_body8(c1, 128 * c0 + 36, c2);
77 ; CHECK-NEXT: Stmt_for_body8(c1, 128 * c0 + 37, c2);
78 ; CHECK-NEXT: Stmt_for_body8(c1, 128 * c0 + 38, c2);
79 ; CHECK-NEXT: Stmt_for_body8(c1, 128 * c0 + 39, c2);
80 ; CHECK-NEXT: Stmt_for_body8(c1, 128 * c0 + 40, c2);
81 ; CHECK-NEXT: Stmt_for_body8(c1, 128 * c0 + 41, c2);
82 ; CHECK-NEXT: Stmt_for_body8(c1, 128 * c0 + 42, c2);
83 ; CHECK-NEXT: Stmt_for_body8(c1, 128 * c0 + 43, c2);
84 ; CHECK-NEXT: Stmt_for_body8(c1, 128 * c0 + 44, c2);
85 ; CHECK-NEXT: Stmt_for_body8(c1, 128 * c0 + 45, c2);
86 ; CHECK-NEXT: Stmt_for_body8(c1, 128 * c0 + 46, c2);
87 ; CHECK-NEXT: Stmt_for_body8(c1, 128 * c0 + 47, c2);
88 ; CHECK-NEXT: Stmt_for_body8(c1, 128 * c0 + 48, c2);
89 ; CHECK-NEXT: Stmt_for_body8(c1, 128 * c0 + 49, c2);
90 ; CHECK-NEXT: Stmt_for_body8(c1, 128 * c0 + 50, c2);
91 ; CHECK-NEXT: Stmt_for_body8(c1, 128 * c0 + 51, c2);
92 ; CHECK-NEXT: Stmt_for_body8(c1, 128 * c0 + 52, c2);
93 ; CHECK-NEXT: Stmt_for_body8(c1, 128 * c0 + 53, c2);
94 ; CHECK-NEXT: Stmt_for_body8(c1, 128 * c0 + 54, c2);
95 ; CHECK-NEXT: Stmt_for_body8(c1, 128 * c0 + 55, c2);
96 ; CHECK-NEXT: if (c0 <= 22) {
97 ; CHECK-NEXT: Stmt_for_body8(c1, 128 * c0 + 56, c2);
98 ; CHECK-NEXT: Stmt_for_body8(c1, 128 * c0 + 57, c2);
99 ; CHECK-NEXT: Stmt_for_body8(c1, 128 * c0 + 58, c2);
100 ; CHECK-NEXT: Stmt_for_body8(c1, 128 * c0 + 59, c2);
101 ; CHECK-NEXT: Stmt_for_body8(c1, 128 * c0 + 60, c2);
102 ; CHECK-NEXT: Stmt_for_body8(c1, 128 * c0 + 61, c2);
103 ; CHECK-NEXT: Stmt_for_body8(c1, 128 * c0 + 62, c2);
104 ; CHECK-NEXT: Stmt_for_body8(c1, 128 * c0 + 63, c2);
105 ; CHECK-NEXT: Stmt_for_body8(c1, 128 * c0 + 64, c2);
106 ; CHECK-NEXT: Stmt_for_body8(c1, 128 * c0 + 65, c2);
107 ; CHECK-NEXT: Stmt_for_body8(c1, 128 * c0 + 66, c2);
108 ; CHECK-NEXT: Stmt_for_body8(c1, 128 * c0 + 67, c2);
109 ; CHECK-NEXT: Stmt_for_body8(c1, 128 * c0 + 68, c2);
110 ; CHECK-NEXT: Stmt_for_body8(c1, 128 * c0 + 69, c2);
111 ; CHECK-NEXT: Stmt_for_body8(c1, 128 * c0 + 70, c2);
112 ; CHECK-NEXT: Stmt_for_body8(c1, 128 * c0 + 71, c2);
113 ; CHECK-NEXT: Stmt_for_body8(c1, 128 * c0 + 72, c2);
114 ; CHECK-NEXT: Stmt_for_body8(c1, 128 * c0 + 73, c2);
115 ; CHECK-NEXT: Stmt_for_body8(c1, 128 * c0 + 74, c2);
116 ; CHECK-NEXT: Stmt_for_body8(c1, 128 * c0 + 75, c2);
117 ; CHECK-NEXT: Stmt_for_body8(c1, 128 * c0 + 76, c2);
118 ; CHECK-NEXT: Stmt_for_body8(c1, 128 * c0 + 77, c2);
119 ; CHECK-NEXT: Stmt_for_body8(c1, 128 * c0 + 78, c2);
120 ; CHECK-NEXT: Stmt_for_body8(c1, 128 * c0 + 79, c2);
121 ; CHECK-NEXT: Stmt_for_body8(c1, 128 * c0 + 80, c2);
122 ; CHECK-NEXT: Stmt_for_body8(c1, 128 * c0 + 81, c2);
123 ; CHECK-NEXT: Stmt_for_body8(c1, 128 * c0 + 82, c2);
124 ; CHECK-NEXT: Stmt_for_body8(c1, 128 * c0 + 83, c2);
125 ; CHECK-NEXT: Stmt_for_body8(c1, 128 * c0 + 84, c2);
126 ; CHECK-NEXT: Stmt_for_body8(c1, 128 * c0 + 85, c2);
127 ; CHECK-NEXT: Stmt_for_body8(c1, 128 * c0 + 86, c2);
128 ; CHECK-NEXT: Stmt_for_body8(c1, 128 * c0 + 87, c2);
129 ; CHECK-NEXT: Stmt_for_body8(c1, 128 * c0 + 88, c2);
130 ; CHECK-NEXT: Stmt_for_body8(c1, 128 * c0 + 89, c2);
131 ; CHECK-NEXT: Stmt_for_body8(c1, 128 * c0 + 90, c2);
132 ; CHECK-NEXT: Stmt_for_body8(c1, 128 * c0 + 91, c2);
133 ; CHECK-NEXT: Stmt_for_body8(c1, 128 * c0 + 92, c2);
134 ; CHECK-NEXT: Stmt_for_body8(c1, 128 * c0 + 93, c2);
135 ; CHECK-NEXT: Stmt_for_body8(c1, 128 * c0 + 94, c2);
136 ; CHECK-NEXT: Stmt_for_body8(c1, 128 * c0 + 95, c2);
137 ; CHECK-NEXT: Stmt_for_body8(c1, 128 * c0 + 96, c2);
138 ; CHECK-NEXT: Stmt_for_body8(c1, 128 * c0 + 97, c2);
139 ; CHECK-NEXT: Stmt_for_body8(c1, 128 * c0 + 98, c2);
140 ; CHECK-NEXT: Stmt_for_body8(c1, 128 * c0 + 99, c2);
141 ; CHECK-NEXT: Stmt_for_body8(c1, 128 * c0 + 100, c2);
142 ; CHECK-NEXT: Stmt_for_body8(c1, 128 * c0 + 101, c2);
143 ; CHECK-NEXT: Stmt_for_body8(c1, 128 * c0 + 102, c2);
144 ; CHECK-NEXT: Stmt_for_body8(c1, 128 * c0 + 103, c2);
145 ; CHECK-NEXT: Stmt_for_body8(c1, 128 * c0 + 104, c2);
146 ; CHECK-NEXT: Stmt_for_body8(c1, 128 * c0 + 105, c2);
147 ; CHECK-NEXT: Stmt_for_body8(c1, 128 * c0 + 106, c2);
148 ; CHECK-NEXT: Stmt_for_body8(c1, 128 * c0 + 107, c2);
149 ; CHECK-NEXT: Stmt_for_body8(c1, 128 * c0 + 108, c2);
150 ; CHECK-NEXT: Stmt_for_body8(c1, 128 * c0 + 109, c2);
151 ; CHECK-NEXT: Stmt_for_body8(c1, 128 * c0 + 110, c2);
152 ; CHECK-NEXT: Stmt_for_body8(c1, 128 * c0 + 111, c2);
153 ; CHECK-NEXT: Stmt_for_body8(c1, 128 * c0 + 112, c2);
154 ; CHECK-NEXT: Stmt_for_body8(c1, 128 * c0 + 113, c2);
155 ; CHECK-NEXT: Stmt_for_body8(c1, 128 * c0 + 114, c2);
156 ; CHECK-NEXT: Stmt_for_body8(c1, 128 * c0 + 115, c2);
157 ; CHECK-NEXT: Stmt_for_body8(c1, 128 * c0 + 116, c2);
158 ; CHECK-NEXT: Stmt_for_body8(c1, 128 * c0 + 117, c2);
159 ; CHECK-NEXT: Stmt_for_body8(c1, 128 * c0 + 118, c2);
160 ; CHECK-NEXT: Stmt_for_body8(c1, 128 * c0 + 119, c2);
161 ; CHECK-NEXT: Stmt_for_body8(c1, 128 * c0 + 120, c2);
162 ; CHECK-NEXT: Stmt_for_body8(c1, 128 * c0 + 121, c2);
163 ; CHECK-NEXT: Stmt_for_body8(c1, 128 * c0 + 122, c2);
164 ; CHECK-NEXT: Stmt_for_body8(c1, 128 * c0 + 123, c2);
165 ; CHECK-NEXT: Stmt_for_body8(c1, 128 * c0 + 124, c2);
166 ; CHECK-NEXT: Stmt_for_body8(c1, 128 * c0 + 125, c2);
167 ; CHECK-NEXT: Stmt_for_body8(c1, 128 * c0 + 126, c2);
168 ; CHECK-NEXT: Stmt_for_body8(c1, 128 * c0 + 127, c2);
172 target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
174 define void @f(ptr %A, ptr %B, ptr %C) {
178 for.cond: ; preds = %for.inc24, %entry
179 %indvars.iv4 = phi i64 [ %indvars.iv.next5, %for.inc24 ], [ 0, %entry ]
180 %exitcond6 = icmp ne i64 %indvars.iv4, 3000
181 br i1 %exitcond6, label %for.body, label %for.end26
183 for.body: ; preds = %for.cond
186 for.cond1: ; preds = %for.inc21, %for.body
187 %indvars.iv1 = phi i64 [ %indvars.iv.next2, %for.inc21 ], [ 0, %for.body ]
188 %exitcond3 = icmp ne i64 %indvars.iv1, 3000
189 br i1 %exitcond3, label %for.body3, label %for.end23
191 for.body3: ; preds = %for.cond1
192 %arrayidx5 = getelementptr inbounds [3000 x i32], ptr %A, i64 %indvars.iv4, i64 %indvars.iv1
193 store i32 0, ptr %arrayidx5, align 4
196 for.cond6: ; preds = %for.inc, %for.body3
197 %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %for.body3 ]
198 %exitcond = icmp ne i64 %indvars.iv, 3000
199 br i1 %exitcond, label %for.body8, label %for.end
201 for.body8: ; preds = %for.cond6
202 %arrayidx12 = getelementptr inbounds [3000 x i32], ptr %B, i64 %indvars.iv4, i64 %indvars.iv
203 %tmp = load i32, ptr %arrayidx12, align 4
204 %arrayidx16 = getelementptr inbounds [3000 x i32], ptr %C, i64 %indvars.iv, i64 %indvars.iv1
205 %tmp7 = load i32, ptr %arrayidx16, align 4
206 %mul = mul nsw i32 %tmp, %tmp7
207 %arrayidx20 = getelementptr inbounds [3000 x i32], ptr %A, i64 %indvars.iv4, i64 %indvars.iv1
208 %tmp8 = load i32, ptr %arrayidx20, align 4
209 %add = add nsw i32 %tmp8, %mul
210 store i32 %add, ptr %arrayidx20, align 4
213 for.inc: ; preds = %for.body8
214 %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
217 for.end: ; preds = %for.cond6
220 for.inc21: ; preds = %for.end
221 %indvars.iv.next2 = add nuw nsw i64 %indvars.iv1, 1
224 for.end23: ; preds = %for.cond1
227 for.inc24: ; preds = %for.end23
228 %indvars.iv.next5 = add nuw nsw i64 %indvars.iv4, 1
231 for.end26: ; preds = %for.cond