// RUN: mlir-opt %s -affine-super-vectorize="virtual-vector-size=128 test-fastest-varying=0" -split-input-file | FileCheck %s
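// The pass strip-mines candidate loops by a virtual vector width of 128 and
// rewrites their memory accesses as vector.transfer_read/write ops. With
// test-fastest-varying=0, the test pattern only considers loops whose
// induction variable drives the fastest-varying (innermost) memref dimension.
// Each "// -----"-separated function below is checked independently
// (-split-input-file).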
// CHECK-DAG: #[[$map_id1:map[0-9a-zA-Z_]*]] = affine_map<(d0) -> (d0)>
// CHECK-DAG: #[[$map_proj_d0d1_0:map[0-9a-zA-Z_]*]] = affine_map<(d0, d1) -> (0)>
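// #map_id1 is the identity map produced by the affine.apply ops that rebuild
// loop indices, and #map_proj_d0d1_0 maps both indices to 0: used as a
// permutation_map on vector.transfer_read, it broadcasts a single
// loop-invariant element across the 128 lanes.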
// CHECK-LABEL: func @vec1d_1
func.func @vec1d_1(%A : memref<?x?xf32>, %B : memref<?x?x?xf32>) {
// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index
// CHECK-DAG: [[ARG_M:%[0-9a-zA-Z_]+]] = memref.dim %{{.*}}, %[[C0]] : memref<?x?xf32>
// CHECK-DAG: [[ARG_N:%[0-9a-zA-Z_]+]] = memref.dim %{{.*}}, %[[C1]] : memref<?x?xf32>
// CHECK-DAG: [[ARG_P:%[0-9a-zA-Z_]+]] = memref.dim %{{.*}}, %[[C2]] : memref<?x?x?xf32>
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %c2 = arith.constant 2 : index
  %M = memref.dim %A, %c0 : memref<?x?xf32>
  %N = memref.dim %A, %c1 : memref<?x?xf32>
  %P = memref.dim %B, %c2 : memref<?x?x?xf32>

// CHECK: for {{.*}} step 128
// CHECK-NEXT: %{{.*}} = affine.apply #[[$map_id1]](%[[C0]])
// CHECK-NEXT: %{{.*}} = affine.apply #[[$map_id1]](%[[C0]])
// CHECK-NEXT: %{{.*}} = arith.constant 0.0{{.*}}: f32
// CHECK-NEXT: {{.*}} = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}], %{{.*}} {permutation_map = #[[$map_proj_d0d1_0]]} : memref<?x?xf32>, vector<128xf32>
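// The load below is invariant in %i0 (both subscripts are %c0), so the loop is
// still strip-mined by 128 but the scalar load becomes a transfer_read whose
// permutation_map (d0, d1) -> (0) splats the single element into the vector.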
  affine.for %i0 = 0 to %M { // vectorized due to scalar -> vector
    %a0 = affine.load %A[%c0, %c0] : memref<?x?xf32>
  }
  return
}

// -----

// CHECK-LABEL: func @vec1d_2
func.func @vec1d_2(%A : memref<?x?xf32>, %B : memref<?x?x?xf32>) {
// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index
// CHECK-DAG: [[ARG_M:%[0-9a-zA-Z_]+]] = memref.dim %{{.*}}, %[[C0]] : memref<?x?xf32>
// CHECK-DAG: [[ARG_N:%[0-9a-zA-Z_]+]] = memref.dim %{{.*}}, %[[C1]] : memref<?x?xf32>
// CHECK-DAG: [[ARG_P:%[0-9a-zA-Z_]+]] = memref.dim %{{.*}}, %[[C2]] : memref<?x?x?xf32>
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %c2 = arith.constant 2 : index
  %M = memref.dim %A, %c0 : memref<?x?xf32>
  %N = memref.dim %A, %c1 : memref<?x?xf32>
  %P = memref.dim %B, %c2 : memref<?x?x?xf32>

// CHECK: for [[IV3:%[a-zA-Z0-9]+]] = 0 to [[ARG_M]] step 128
// CHECK-NEXT: %[[CST:.*]] = arith.constant 0.0{{.*}}: f32
// CHECK-NEXT: {{.*}} = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}], %[[CST]] : memref<?x?xf32>, vector<128xf32>
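// %i3 indexes the fastest-varying dimension of %A, so the loop is strip-mined
// by 128 and the load becomes a contiguous 1-D vector read, roughly (SSA names
// are illustrative only):
//   affine.for %i3 = 0 to %M step 128 {
//     %v = vector.transfer_read %A[%c0, %i3], %cst : memref<?x?xf32>, vector<128xf32>
//   }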
  affine.for %i3 = 0 to %M { // vectorized
    %a3 = affine.load %A[%c0, %i3] : memref<?x?xf32>
  }
  return
}

// -----

// CHECK-LABEL: func @vec1d_3
func.func @vec1d_3(%A : memref<?x?xf32>, %B : memref<?x?x?xf32>) {
// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index
// CHECK-DAG: [[ARG_M:%[0-9a-zA-Z_]+]] = memref.dim %arg0, %[[C0]] : memref<?x?xf32>
// CHECK-DAG: [[ARG_N:%[0-9a-zA-Z_]+]] = memref.dim %arg0, %[[C1]] : memref<?x?xf32>
// CHECK-DAG: [[ARG_P:%[0-9a-zA-Z_]+]] = memref.dim %arg1, %[[C2]] : memref<?x?x?xf32>
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %c2 = arith.constant 2 : index
  %M = memref.dim %A, %c0 : memref<?x?xf32>
  %N = memref.dim %A, %c1 : memref<?x?xf32>
  %P = memref.dim %B, %c2 : memref<?x?x?xf32>

// CHECK: for [[IV8:%[0-9a-zA-Z_]+]] = 0 to [[ARG_M]] step 128
// CHECK-NEXT: for [[IV9:%[0-9a-zA-Z_]*]] = 0 to [[ARG_N]] {
// CHECK-NEXT: %[[APP9_0:[0-9a-zA-Z_]+]] = affine.apply {{.*}}([[IV9]], [[IV8]])
// CHECK-NEXT: %[[APP9_1:[0-9a-zA-Z_]+]] = affine.apply {{.*}}([[IV9]], [[IV8]])
// CHECK-NEXT: %[[CST:.*]] = arith.constant 0.0{{.*}}: f32
// CHECK-NEXT: {{.*}} = vector.transfer_read %{{.*}}[%[[APP9_0]], %[[APP9_1]]], %[[CST]] : memref<?x?xf32>, vector<128xf32>
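// %i8 only appears in the fastest-varying subscript (%i8 + %i9), so the outer
// loop is vectorized; the two affine.apply ops above rebuild the load indices
// from ([[IV9]], [[IV8]]) before the 128-wide transfer_read.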
  affine.for %i8 = 0 to %M { // vectorized
    affine.for %i9 = 0 to %N {
      %a9 = affine.load %A[%i9, %i8 + %i9] : memref<?x?xf32>
    }
  }
  return
}

// -----

// CHECK-LABEL: func @vector_add_2d
func.func @vector_add_2d(%M : index, %N : index) -> f32 {
  %A = memref.alloc (%M, %N) : memref<?x?xf32, 0>
  %B = memref.alloc (%M, %N) : memref<?x?xf32, 0>
  %C = memref.alloc (%M, %N) : memref<?x?xf32, 0>
  %f1 = arith.constant 1.0 : f32
  %f2 = arith.constant 2.0 : f32
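  // The first two nests initialize %A and %B; the scalar constants %f1 and %f2
  // are expected to become dense splat vector constants. The third nest adds
  // the two arrays and re-adds %f1 and %f2 through a small diamond-shaped DAG,
  // all of which should be rewritten on vector<128xf32>.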
  affine.for %i0 = 0 to %M {
    affine.for %i1 = 0 to %N {
// CHECK: %[[C1:.*]] = arith.constant dense<1.000000e+00> : vector<128xf32>
// CHECK: vector.transfer_write %[[C1]], {{.*}} : vector<128xf32>, memref<?x?xf32>
      affine.store %f1, %A[%i0, %i1] : memref<?x?xf32, 0>
    }
  }
  affine.for %i2 = 0 to %M {
    affine.for %i3 = 0 to %N {
// CHECK: %[[C3:.*]] = arith.constant dense<2.000000e+00> : vector<128xf32>
// CHECK: vector.transfer_write %[[C3]], {{.*}} : vector<128xf32>, memref<?x?xf32>
      affine.store %f2, %B[%i2, %i3] : memref<?x?xf32, 0>
    }
  }
  affine.for %i4 = 0 to %M {
    affine.for %i5 = 0 to %N {
// CHECK: %[[SPLAT2:.*]] = arith.constant dense<2.000000e+00> : vector<128xf32>
// CHECK: %[[SPLAT1:.*]] = arith.constant dense<1.000000e+00> : vector<128xf32>
// CHECK: %[[A5:.*]] = vector.transfer_read %{{.*}}[{{.*}}], %{{[a-zA-Z0-9_]*}} : memref<?x?xf32>, vector<128xf32>
// CHECK: %[[B5:.*]] = vector.transfer_read %{{.*}}[{{.*}}], %{{[a-zA-Z0-9_]*}} : memref<?x?xf32>, vector<128xf32>
// CHECK: %[[S5:.*]] = arith.addf %[[A5]], %[[B5]] : vector<128xf32>
// CHECK: %[[S6:.*]] = arith.addf %[[S5]], %[[SPLAT1]] : vector<128xf32>
// CHECK: %[[S7:.*]] = arith.addf %[[S5]], %[[SPLAT2]] : vector<128xf32>
// CHECK: %[[S8:.*]] = arith.addf %[[S7]], %[[S6]] : vector<128xf32>
// CHECK: vector.transfer_write %[[S8]], {{.*}} : vector<128xf32>, memref<?x?xf32>
      %a5 = affine.load %A[%i4, %i5] : memref<?x?xf32, 0>
      %b5 = affine.load %B[%i4, %i5] : memref<?x?xf32, 0>
      %s5 = arith.addf %a5, %b5 : f32
      %s6 = arith.addf %s5, %f1 : f32
      %s7 = arith.addf %s5, %f2 : f32
      // diamond dependency.
      %s8 = arith.addf %s7, %s6 : f32
      affine.store %s8, %C[%i4, %i5] : memref<?x?xf32, 0>
    }
  }
  %c7 = arith.constant 7 : index
  %c42 = arith.constant 42 : index
  %res = affine.load %C[%c7, %c42] : memref<?x?xf32, 0>
  return %res : f32
}

// -----

// CHECK-LABEL: func @vec_constant_with_two_users
func.func @vec_constant_with_two_users(%M : index, %N : index) -> (f32, f32) {
  %A = memref.alloc (%M, %N) : memref<?x?xf32, 0>
  %B = memref.alloc (%M) : memref<?xf32, 0>
  %f1 = arith.constant 1.0 : f32
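  // %f1 feeds stores into two different memrefs; a single dense<1.0> splat is
  // expected to be materialized once (before the inner loop) and reused by
  // both vector.transfer_write ops.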
  affine.for %i0 = 0 to %M { // vectorized
// CHECK: %[[C1:.*]] = arith.constant dense<1.000000e+00> : vector<128xf32>
// CHECK-NEXT: affine.for
// CHECK-NEXT: vector.transfer_write %[[C1]], {{.*}} : vector<128xf32>, memref<?x?xf32>
    affine.for %i1 = 0 to %N {
      affine.store %f1, %A[%i1, %i0] : memref<?x?xf32, 0>
    }
// CHECK: vector.transfer_write %[[C1]], {{.*}} : vector<128xf32>, memref<?xf32>
    affine.store %f1, %B[%i0] : memref<?xf32, 0>
  }
  %c12 = arith.constant 12 : index
  %res1 = affine.load %A[%c12, %c12] : memref<?x?xf32, 0>
  %res2 = affine.load %B[%c12] : memref<?xf32, 0>
  return %res1, %res2 : f32, f32
}

// -----

// CHECK-LABEL: func @vec_block_arg
func.func @vec_block_arg(%A : memref<32x512xi32>) {
// CHECK: affine.for %[[IV0:[0-9a-zA-Z_]+]] = 0 to 512 step 128 {
// CHECK-NEXT: affine.for %[[IV1:[0-9a-zA-Z_]+]] = 0 to 32 {
// CHECK-NEXT: %[[BROADCAST:.*]] = vector.broadcast %[[IV1]] : index to vector<128xindex>
// CHECK-NEXT: %[[CAST:.*]] = arith.index_cast %[[BROADCAST]] : vector<128xindex> to vector<128xi32>
// CHECK-NEXT: vector.transfer_write %[[CAST]], {{.*}}[%[[IV1]], %[[IV0]]] : vector<128xi32>, memref<32x512xi32>
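// %j is used as a value rather than only as an index: it is invariant in the
// vectorized %i loop, so it is broadcast to vector<128xindex> and the
// index_cast is applied elementwise before the vector store.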
  affine.for %i = 0 to 512 { // vectorized
    affine.for %j = 0 to 32 {
      %idx = arith.index_cast %j : index to i32
      affine.store %idx, %A[%j, %i] : memref<32x512xi32>
    }
  }
  return
}

// -----

// CHECK-DAG: #[[$map0:map[0-9a-zA-Z_]*]] = affine_map<(d0, d1, d2) -> (d0 * 2 + d1 - 1)>
// CHECK-DAG: #[[$map1:map[0-9a-zA-Z_]*]] = affine_map<(d0, d1, d2) -> (d2)>
// CHECK-LABEL: func @vec_block_arg_2
func.func @vec_block_arg_2(%A : memref<?x512xindex>) {
  %c0 = arith.constant 0 : index
  %N = memref.dim %A, %c0 : memref<?x512xindex>
// CHECK: affine.for %[[IV0:[0-9a-zA-Z_]+]] = 0 to %{{.*}} {
// CHECK-NEXT: %[[BROADCAST1:.*]] = vector.broadcast %[[IV0]] : index to vector<128xindex>
// CHECK-NEXT: affine.for %[[IV1:[0-9a-zA-Z_]+]] = 0 to 512 step 128 {
// CHECK-NOT: vector.broadcast %[[IV1]]
// CHECK: affine.for %[[IV2:[0-9a-zA-Z_]+]] = 0 to 2 {
// CHECK-NEXT: %[[BROADCAST2:.*]] = vector.broadcast %[[IV2]] : index to vector<128xindex>
// CHECK-NEXT: %[[INDEX1:.*]] = affine.apply #[[$map0]](%[[IV0]], %[[IV2]], %[[IV1]])
// CHECK-NEXT: %[[INDEX2:.*]] = affine.apply #[[$map1]](%[[IV0]], %[[IV2]], %[[IV1]])
// CHECK: %[[LOAD:.*]] = vector.transfer_read %{{.*}}[%[[INDEX1]], %[[INDEX2]]], %{{.*}} : memref<?x512xindex>, vector<128xindex>
// CHECK-NEXT: arith.muli %[[BROADCAST1]], %[[LOAD]] : vector<128xindex>
// CHECK-NEXT: arith.addi %{{.*}}, %[[BROADCAST2]] : vector<128xindex>
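// %i0 and %i2 are used as scalar operands of the muli/addi, so each one is
// broadcast exactly once: %i0 outside the vectorized %i1 loop and %i2 inside
// it. %i1 itself only appears as a memory index, hence the CHECK-NOT above.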
  affine.for %i0 = 0 to %N {
    affine.for %i1 = 0 to 512 { // vectorized
      affine.for %i2 = 0 to 2 {
        %0 = affine.load %A[%i0 * 2 + %i2 - 1, %i1] : memref<?x512xindex>
        %mul = arith.muli %i0, %0 : index
        %add = arith.addi %mul, %i2 : index
      }
    }
  }
  return
}

// -----

// CHECK-LABEL: func @vec_rejected_1
func.func @vec_rejected_1(%A : memref<?x?xf32>, %B : memref<?x?x?xf32>) {
// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index
// CHECK-DAG: [[ARG_M:%[0-9a-zA-Z_]+]] = memref.dim %{{.*}}, %[[C0]] : memref<?x?xf32>
// CHECK-DAG: [[ARG_N:%[0-9a-zA-Z_]+]] = memref.dim %{{.*}}, %[[C1]] : memref<?x?xf32>
// CHECK-DAG: [[ARG_P:%[0-9a-zA-Z_]+]] = memref.dim %{{.*}}, %[[C2]] : memref<?x?x?xf32>
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %c2 = arith.constant 2 : index
  %M = memref.dim %A, %c0 : memref<?x?xf32>
  %N = memref.dim %A, %c1 : memref<?x?xf32>
  %P = memref.dim %B, %c2 : memref<?x?x?xf32>

// CHECK: for {{.*}} [[ARG_M]] {
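// %i1 appears in both subscripts, so the access walks the diagonal of %A and
// is not contiguous along the fastest-varying dimension; the loop keeps its
// scalar form (no "step 128" expected).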
  affine.for %i1 = 0 to %M { // not vectorized
    %a1 = affine.load %A[%i1, %i1] : memref<?x?xf32>
  }
  return
}

// -----

// CHECK-LABEL: func @vec_rejected_2
func.func @vec_rejected_2(%A : memref<?x?xf32>, %B : memref<?x?x?xf32>) {
// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index
// CHECK-DAG: [[ARG_M:%[0-9a-zA-Z_]+]] = memref.dim %{{.*}}, %[[C0]] : memref<?x?xf32>
// CHECK-DAG: [[ARG_N:%[0-9a-zA-Z_]+]] = memref.dim %{{.*}}, %[[C1]] : memref<?x?xf32>
// CHECK-DAG: [[ARG_P:%[0-9a-zA-Z_]+]] = memref.dim %{{.*}}, %[[C2]] : memref<?x?x?xf32>
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %c2 = arith.constant 2 : index
  %M = memref.dim %A, %c0 : memref<?x?xf32>
  %N = memref.dim %A, %c1 : memref<?x?xf32>
  %P = memref.dim %B, %c2 : memref<?x?x?xf32>

// CHECK: affine.for %{{.*}}{{[0-9a-zA-Z_]*}} = 0 to [[ARG_M]] {
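// %i2 only drives the second-fastest-varying subscript, so with
// test-fastest-varying=0 the pattern does not apply; as noted below it would
// vectorize with --test-fastest-varying=1.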
  affine.for %i2 = 0 to %M { // not vectorized, would vectorize with --test-fastest-varying=1
    %a2 = affine.load %A[%i2, %c0] : memref<?x?xf32>
  }
  return
}

// -----

// CHECK-LABEL: func @vec_rejected_3
func.func @vec_rejected_3(%A : memref<?x?xf32>, %B : memref<?x?x?xf32>) {
// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index
// CHECK-DAG: [[ARG_M:%[0-9a-zA-Z_]+]] = memref.dim %{{.*}}, %[[C0]] : memref<?x?xf32>
// CHECK-DAG: [[ARG_N:%[0-9a-zA-Z_]+]] = memref.dim %{{.*}}, %[[C1]] : memref<?x?xf32>
// CHECK-DAG: [[ARG_P:%[0-9a-zA-Z_]+]] = memref.dim %{{.*}}, %[[C2]] : memref<?x?x?xf32>
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %c2 = arith.constant 2 : index
  %M = memref.dim %A, %c0 : memref<?x?xf32>
  %N = memref.dim %A, %c1 : memref<?x?xf32>
  %P = memref.dim %B, %c2 : memref<?x?x?xf32>

// CHECK: for [[IV4:%[0-9a-zA-Z_]+]] = 0 to [[ARG_M]] step 128 {
// CHECK-NEXT: for [[IV5:%[0-9a-zA-Z_]*]] = 0 to [[ARG_N]] {
// CHECK-NEXT: %{{.*}} = arith.constant 0.0{{.*}}: f32
// CHECK-NEXT: {{.*}} = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}], %{{[a-zA-Z0-9_]*}} : memref<?x?xf32>, vector<128xf32>
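// Only the outer %i4 loop is vectorized: it drives the fastest-varying
// subscript of the load, while %i5 drives the slower one.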
  affine.for %i4 = 0 to %M { // vectorized
    affine.for %i5 = 0 to %N { // not vectorized, would vectorize with --test-fastest-varying=1
      %a5 = affine.load %A[%i5, %i4] : memref<?x?xf32>
    }
  }
  return
}

// -----

// CHECK-LABEL: func @vec_rejected_4
func.func @vec_rejected_4(%A : memref<?x?xf32>, %B : memref<?x?x?xf32>) {
// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index
// CHECK-DAG: [[ARG_M:%[0-9a-zA-Z_]+]] = memref.dim %{{.*}}, %[[C0]] : memref<?x?xf32>
// CHECK-DAG: [[ARG_N:%[0-9a-zA-Z_]+]] = memref.dim %{{.*}}, %[[C1]] : memref<?x?xf32>
// CHECK-DAG: [[ARG_P:%[0-9a-zA-Z_]+]] = memref.dim %{{.*}}, %[[C2]] : memref<?x?x?xf32>
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %c2 = arith.constant 2 : index
  %M = memref.dim %A, %c0 : memref<?x?xf32>
  %N = memref.dim %A, %c1 : memref<?x?xf32>
  %P = memref.dim %B, %c2 : memref<?x?x?xf32>

// CHECK: for [[IV6:%[0-9a-zA-Z_]*]] = 0 to [[ARG_M]] {
// CHECK-NEXT: for [[IV7:%[0-9a-zA-Z_]*]] = 0 to [[ARG_N]] {
  affine.for %i6 = 0 to %M { // not vectorized, would vectorize with --test-fastest-varying=1
    affine.for %i7 = 0 to %N { // not vectorized, can never vectorize
      %a7 = affine.load %A[%i6 + %i7, %i6] : memref<?x?xf32>
    }
  }
  return
}

// -----

// CHECK-LABEL: func @vec_rejected_5
func.func @vec_rejected_5(%A : memref<?x?xf32>, %B : memref<?x?x?xf32>) {
// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index
// CHECK-DAG: [[ARG_M:%[0-9a-zA-Z_]+]] = memref.dim %{{.*}}, %[[C0]] : memref<?x?xf32>
// CHECK-DAG: [[ARG_N:%[0-9a-zA-Z_]+]] = memref.dim %{{.*}}, %[[C1]] : memref<?x?xf32>
// CHECK-DAG: [[ARG_P:%[0-9a-zA-Z_]+]] = memref.dim %{{.*}}, %[[C2]] : memref<?x?x?xf32>
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %c2 = arith.constant 2 : index
  %M = memref.dim %A, %c0 : memref<?x?xf32>
  %N = memref.dim %A, %c1 : memref<?x?xf32>
  %P = memref.dim %B, %c2 : memref<?x?x?xf32>

// CHECK: for [[IV10:%[0-9a-zA-Z_]*]] = 0 to %{{[0-9a-zA-Z_]*}} {
// CHECK: for [[IV11:%[0-9a-zA-Z_]*]] = 0 to %{{[0-9a-zA-Z_]*}} {
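// Reading %A[%i10, %i11] while writing %A[%i11, %i10] would require a
// per-access transpose of the vectorized dimension, which the pattern does
// not handle, so neither loop is vectorized.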
  affine.for %i10 = 0 to %M { // not vectorized, need per load transposes
    affine.for %i11 = 0 to %N { // not vectorized, need per load transposes
      %a11 = affine.load %A[%i10, %i11] : memref<?x?xf32>
      affine.store %a11, %A[%i11, %i10] : memref<?x?xf32>
    }
  }
  return
}

// -----

// CHECK-LABEL: func @vec_rejected_6
func.func @vec_rejected_6(%A : memref<?x?xf32>, %B : memref<?x?x?xf32>) {
// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index
// CHECK-DAG: [[ARG_M:%[0-9a-zA-Z_]+]] = memref.dim %{{.*}}, %[[C0]] : memref<?x?xf32>
// CHECK-DAG: [[ARG_N:%[0-9a-zA-Z_]+]] = memref.dim %{{.*}}, %[[C1]] : memref<?x?xf32>
// CHECK-DAG: [[ARG_P:%[0-9a-zA-Z_]+]] = memref.dim %{{.*}}, %[[C2]] : memref<?x?x?xf32>
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %c2 = arith.constant 2 : index
  %M = memref.dim %A, %c0 : memref<?x?xf32>
  %N = memref.dim %A, %c1 : memref<?x?xf32>
  %P = memref.dim %B, %c2 : memref<?x?x?xf32>

// CHECK: for [[IV12:%[0-9a-zA-Z_]*]] = 0 to %{{[0-9a-zA-Z_]*}} {
// CHECK: for [[IV13:%[0-9a-zA-Z_]*]] = 0 to %{{[0-9a-zA-Z_]*}} {
// CHECK: for [[IV14:%[0-9a-zA-Z_]+]] = 0 to [[ARG_P]] step 128
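// %i14 is the only induction variable confined to the fastest-varying
// subscript (%i12 + %i14), so only the innermost loop is vectorized.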
  affine.for %i12 = 0 to %M { // not vectorized, can never vectorize
    affine.for %i13 = 0 to %N { // not vectorized, can never vectorize
      affine.for %i14 = 0 to %P { // vectorized
        %a14 = affine.load %B[%i13, %i12 + %i13, %i12 + %i14] : memref<?x?x?xf32>
      }
    }
  }
  return
}

// -----

// CHECK-LABEL: func @vec_rejected_7
func.func @vec_rejected_7(%A : memref<?x?xf32>, %B : memref<?x?x?xf32>) {
// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index
// CHECK-DAG: [[ARG_M:%[0-9a-zA-Z_]+]] = memref.dim %{{.*}}, %[[C0]] : memref<?x?xf32>
// CHECK-DAG: [[ARG_N:%[0-9a-zA-Z_]+]] = memref.dim %{{.*}}, %[[C1]] : memref<?x?xf32>
// CHECK-DAG: [[ARG_P:%[0-9a-zA-Z_]+]] = memref.dim %{{.*}}, %[[C2]] : memref<?x?x?xf32>
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %c2 = arith.constant 2 : index
  %M = memref.dim %A, %c0 : memref<?x?xf32>
  %N = memref.dim %A, %c1 : memref<?x?xf32>
  %P = memref.dim %B, %c2 : memref<?x?x?xf32>

// CHECK: affine.for %{{.*}}{{[0-9a-zA-Z_]*}} = 0 to %{{[0-9a-zA-Z_]*}} {
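// The element type of %a16 is already a vector (memref<?xvector<2xf32>>); the
// super-vectorizer only widens scalar loads and stores, so the loop is left
// untouched.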
  affine.for %i16 = 0 to %M { // not vectorized, can't vectorize a vector load
    %a16 = memref.alloc(%M) : memref<?xvector<2xf32>>
    %l16 = affine.load %a16[%i16] : memref<?xvector<2xf32>>
  }
  return
}

// -----

// CHECK-DAG: #[[$map_id1:map[0-9a-zA-Z_]*]] = affine_map<(d0) -> (d0)>
// CHECK-DAG: #[[$map_proj_d0d1_0:map[0-9a-zA-Z_]*]] = affine_map<(d0, d1) -> (0)>

// CHECK-LABEL: func @vec_rejected_8
func.func @vec_rejected_8(%A : memref<?x?xf32>, %B : memref<?x?x?xf32>) {
// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index
// CHECK-DAG: [[ARG_M:%[0-9a-zA-Z_]+]] = memref.dim %{{.*}}, %[[C0]] : memref<?x?xf32>
// CHECK-DAG: [[ARG_N:%[0-9a-zA-Z_]+]] = memref.dim %{{.*}}, %[[C1]] : memref<?x?xf32>
// CHECK-DAG: [[ARG_P:%[0-9a-zA-Z_]+]] = memref.dim %{{.*}}, %[[C2]] : memref<?x?x?xf32>
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %c2 = arith.constant 2 : index
  %M = memref.dim %A, %c0 : memref<?x?xf32>
  %N = memref.dim %A, %c1 : memref<?x?xf32>
  %P = memref.dim %B, %c2 : memref<?x?x?xf32>

// CHECK: affine.for %{{.*}}{{[0-9a-zA-Z_]*}} = 0 to %{{[0-9a-zA-Z_]*}} {
// CHECK: for [[IV18:%[a-zA-Z0-9]+]] = 0 to [[ARG_M]] step 128
// CHECK: %{{.*}} = affine.apply #[[$map_id1]](%{{.*}})
// CHECK: %{{.*}} = affine.apply #[[$map_id1]](%{{.*}})
// CHECK: %{{.*}} = arith.constant 0.0{{.*}}: f32
// CHECK: {{.*}} = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}], %{{.*}} {permutation_map = #[[$map_proj_d0d1_0]]} : memref<?x?xf32>, vector<128xf32>
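// Loops are matched in DFS post-order, so the inner %i18 loop (with its
// loop-invariant load) is vectorized first; once its body holds vector
// operations, the enclosing %i17 loop no longer matches the scalar 1-D
// pattern and stays untouched.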
  affine.for %i17 = 0 to %M { // not vectorized, the 1-D pattern that matched %i18 in DFS post-order prevents vectorizing %i17
    affine.for %i18 = 0 to %M { // vectorized due to scalar -> vector
      %a18 = affine.load %A[%c0, %c0] : memref<?x?xf32>
    }
  }
  return
}

// -----

// CHECK-DAG: #[[$map_id1:map[0-9a-zA-Z_]*]] = affine_map<(d0) -> (d0)>
// CHECK-DAG: #[[$map_proj_d0d1_0:map[0-9a-zA-Z_]*]] = affine_map<(d0, d1) -> (0)>

// CHECK-LABEL: func @vec_rejected_9
func.func @vec_rejected_9(%A : memref<?x?xf32>, %B : memref<?x?x?xf32>) {
// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index
// CHECK-DAG: [[ARG_M:%[0-9a-zA-Z_]+]] = memref.dim %{{.*}}, %[[C0]] : memref<?x?xf32>
// CHECK-DAG: [[ARG_N:%[0-9a-zA-Z_]+]] = memref.dim %{{.*}}, %[[C1]] : memref<?x?xf32>
// CHECK-DAG: [[ARG_P:%[0-9a-zA-Z_]+]] = memref.dim %{{.*}}, %[[C2]] : memref<?x?x?xf32>
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %c2 = arith.constant 2 : index
  %M = memref.dim %A, %c0 : memref<?x?xf32>
  %N = memref.dim %A, %c1 : memref<?x?xf32>
  %P = memref.dim %B, %c2 : memref<?x?x?xf32>

// CHECK: affine.for %{{.*}}{{[0-9a-zA-Z_]*}} = 0 to %{{[0-9a-zA-Z_]*}} {
// CHECK: for [[IV18:%[a-zA-Z0-9]+]] = 0 to [[ARG_M]] step 128
// CHECK: %{{.*}} = affine.apply #[[$map_id1]](%{{.*}})
// CHECK-NEXT: %{{.*}} = affine.apply #[[$map_id1]](%{{.*}})
// CHECK-NEXT: %{{.*}} = arith.constant 0.0{{.*}}: f32
// CHECK-NEXT: {{.*}} = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}], %{{.*}} {permutation_map = #[[$map_proj_d0d1_0]]} : memref<?x?xf32>, vector<128xf32>
  affine.for %i17 = 0 to %M { // not vectorized, the 1-D pattern that matched %i18 in DFS post-order prevents vectorizing %i17
    affine.for %i18 = 0 to %M { // vectorized due to scalar -> vector
      %a18 = affine.load %A[%c0, %c0] : memref<?x?xf32>
    }
  }
  return
}

// -----

#set0 = affine_set<(i) : (i >= 0)>

// CHECK-LABEL: func @vec_rejected_10
func.func @vec_rejected_10(%A : memref<?x?xf32>, %B : memref<?x?x?xf32>) {
// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index
// CHECK-DAG: [[ARG_M:%[0-9a-zA-Z_]+]] = memref.dim %{{.*}}, %[[C0]] : memref<?x?xf32>
// CHECK-DAG: [[ARG_N:%[0-9a-zA-Z_]+]] = memref.dim %{{.*}}, %[[C1]] : memref<?x?xf32>
// CHECK-DAG: [[ARG_P:%[0-9a-zA-Z_]+]] = memref.dim %{{.*}}, %[[C2]] : memref<?x?x?xf32>
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %c2 = arith.constant 2 : index
  %M = memref.dim %A, %c0 : memref<?x?xf32>
  %N = memref.dim %A, %c1 : memref<?x?xf32>
  %P = memref.dim %B, %c2 : memref<?x?x?xf32>

// CHECK: affine.for %{{.*}}{{[0-9a-zA-Z_]*}} = 0 to %{{[0-9a-zA-Z_]*}} {
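// The body is guarded by an affine.if; the vectorizer does not handle control
// flow inside a candidate loop, so %i15 is rejected.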
  affine.for %i15 = 0 to %M { // not vectorized due to condition below
    affine.if #set0(%i15) {
      %a15 = affine.load %A[%c0, %c0] : memref<?x?xf32>
    }
  }
  return
}

// -----

// CHECK-LABEL: func @vec_rejected_11
func.func @vec_rejected_11(%A : memref<?x?xf32>, %B : memref<?x?x?xf32>) {
// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index
// CHECK-DAG: [[ARG_M:%[0-9a-zA-Z_]+]] = memref.dim %{{.*}}, %[[C0]] : memref<?x?xf32>
// CHECK-DAG: [[ARG_N:%[0-9a-zA-Z_]+]] = memref.dim %{{.*}}, %[[C1]] : memref<?x?xf32>
// CHECK-DAG: [[ARG_P:%[0-9a-zA-Z_]+]] = memref.dim %{{.*}}, %[[C2]] : memref<?x?x?xf32>
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %c2 = arith.constant 2 : index
  %M = memref.dim %A, %c0 : memref<?x?xf32>
  %N = memref.dim %A, %c1 : memref<?x?xf32>
  %P = memref.dim %B, %c2 : memref<?x?x?xf32>

// CHECK: for [[IV10:%[0-9a-zA-Z_]*]] = 0 to %{{[0-9a-zA-Z_]*}} {
// CHECK: for [[IV11:%[0-9a-zA-Z_]*]] = 0 to %{{[0-9a-zA-Z_]*}} {
  // This is similar to vec_rejected_5, but the order of indices is different.
  affine.for %i10 = 0 to %M { // not vectorized
    affine.for %i11 = 0 to %N { // not vectorized
      %a11 = affine.load %A[%i11, %i10] : memref<?x?xf32>
      affine.store %a11, %A[%i10, %i11] : memref<?x?xf32>
    }
  }
  return
}

// -----

// This should not vectorize due to the sequential dependence in the loop.
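// Iteration %i writes %A[%i + 1], which iteration %i + 1 then reads: a
// loop-carried dependence of distance one, so executing 128 iterations at
// once would change the result.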
// CHECK-LABEL: @vec_rejected_sequential
func.func @vec_rejected_sequential(%A : memref<?xf32>) {
  %c0 = arith.constant 0 : index
  %N = memref.dim %A, %c0 : memref<?xf32>
  affine.for %i = 0 to %N {
    %a = affine.load %A[%i] : memref<?xf32>
    affine.store %a, %A[%i + 1] : memref<?xf32>
  }
  return
}

// -----

// CHECK-LABEL: @vec_no_load_store_ops
func.func @vec_no_load_store_ops(%a: f32, %b: f32) {
  %cst = arith.constant 0.000000e+00 : f32
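  // Even without loads or stores the loop is vectorized: the loop-invariant
  // arguments %a and %b are broadcast to vector<128xf32> and the addf is
  // rewritten on vectors.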
  affine.for %i = 0 to 128 {
    %add = arith.addf %a, %b : f32
  }
// CHECK-DAG: %[[bc1:.*]] = vector.broadcast
// CHECK-DAG: %[[bc0:.*]] = vector.broadcast
// CHECK: affine.for %{{.*}} = 0 to 128 step
// CHECK-NEXT: %[[add:.*]] = arith.addf %[[bc0]], %[[bc1]]
  return
}

// -----

// This should not be vectorized due to the unsupported block argument (%i).
// Support for operands with linear evolution is needed.
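// Unlike @vec_block_arg, the stored value here is the induction variable of
// the loop being vectorized itself; producing it would need a vector of the
// form [%i, %i+1, ..., %i+127] (a linear evolution), which is not implemented.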
// CHECK-LABEL: @vec_rejected_unsupported_block_arg
func.func @vec_rejected_unsupported_block_arg(%A : memref<512xi32>) {
  affine.for %i = 0 to 512 {
    %idx = arith.index_cast %i : index to i32
    affine.store %idx, %A[%i] : memref<512xi32>
  }
  return
}

// -----

// '%i' loop is vectorized, including the inner reduction over '%j'.
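// The vectorized dimension (%i) is not the reduction dimension, so the
// reduction keeps its structure: the scalar iter_args accumulator simply
// becomes a vector<128xf32> carrying 128 independent partial sums.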
func.func @vec_non_vecdim_reduction(%in: memref<128x256xf32>, %out: memref<256xf32>) {
  %cst = arith.constant 0.000000e+00 : f32
  affine.for %i = 0 to 256 {
    %final_red = affine.for %j = 0 to 128 iter_args(%red_iter = %cst) -> (f32) {
      %ld = affine.load %in[%j, %i] : memref<128x256xf32>
      %add = arith.addf %red_iter, %ld : f32
      affine.yield %add : f32
    }
    affine.store %final_red, %out[%i] : memref<256xf32>
  }
  return
}

// CHECK-LABEL: @vec_non_vecdim_reduction
// CHECK: affine.for %{{.*}} = 0 to 256 step 128 {
// CHECK: %[[vzero:.*]] = arith.constant dense<0.000000e+00> : vector<128xf32>
// CHECK: %[[final_red:.*]] = affine.for %{{.*}} = 0 to 128 iter_args(%[[red_iter:.*]] = %[[vzero]]) -> (vector<128xf32>) {
// CHECK: %[[ld:.*]] = vector.transfer_read %{{.*}} : memref<128x256xf32>, vector<128xf32>
// CHECK: %[[add:.*]] = arith.addf %[[red_iter]], %[[ld]] : vector<128xf32>
// CHECK: affine.yield %[[add]] : vector<128xf32>
// CHECK: vector.transfer_write %[[final_red]], %{{.*}} : vector<128xf32>, memref<256xf32>

// -----

// '%i' loop is vectorized, including the inner reductions over '%j'.

func.func @vec_non_vecdim_reductions(%in0: memref<128x256xf32>, %in1: memref<128x256xi32>,
                                     %out0: memref<256xf32>, %out1: memref<256xi32>) {
  %zero = arith.constant 0.000000e+00 : f32
  %one = arith.constant 1 : i32
  affine.for %i = 0 to 256 {
    %red0, %red1 = affine.for %j = 0 to 128
        iter_args(%red_iter0 = %zero, %red_iter1 = %one) -> (f32, i32) {
      %ld0 = affine.load %in0[%j, %i] : memref<128x256xf32>
      %add = arith.addf %red_iter0, %ld0 : f32
      %ld1 = affine.load %in1[%j, %i] : memref<128x256xi32>
      %mul = arith.muli %red_iter1, %ld1 : i32
      affine.yield %add, %mul : f32, i32
    }
    affine.store %red0, %out0[%i] : memref<256xf32>
    affine.store %red1, %out1[%i] : memref<256xi32>
  }
  return
}

// CHECK-LABEL: @vec_non_vecdim_reductions
// CHECK: affine.for %{{.*}} = 0 to 256 step 128 {
// CHECK: %[[vone:.*]] = arith.constant dense<1> : vector<128xi32>
// CHECK: %[[vzero:.*]] = arith.constant dense<0.000000e+00> : vector<128xf32>
// CHECK: %[[reds:.*]]:2 = affine.for %{{.*}} = 0 to 128
// CHECK-SAME: iter_args(%[[red_iter0:.*]] = %[[vzero]], %[[red_iter1:.*]] = %[[vone]]) -> (vector<128xf32>, vector<128xi32>) {
// CHECK: %[[ld0:.*]] = vector.transfer_read %{{.*}} : memref<128x256xf32>, vector<128xf32>
// CHECK: %[[add:.*]] = arith.addf %[[red_iter0]], %[[ld0]] : vector<128xf32>
// CHECK: %[[ld1:.*]] = vector.transfer_read %{{.*}} : memref<128x256xi32>, vector<128xi32>
// CHECK: %[[mul:.*]] = arith.muli %[[red_iter1]], %[[ld1]] : vector<128xi32>
// CHECK: affine.yield %[[add]], %[[mul]] : vector<128xf32>, vector<128xi32>
// CHECK: vector.transfer_write %[[reds]]#0, %{{.*}} : vector<128xf32>, memref<256xf32>
// CHECK: vector.transfer_write %[[reds]]#1, %{{.*}} : vector<128xi32>, memref<256xi32>

// -----

// '%i' loop is vectorized, including the inner last value computation over '%j'.
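// As with the reductions above, the loop-carried value is simply widened: the
// iter_args chain yields the loaded vector each iteration, so the value stored
// after the loop is the last row read.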
func.func @vec_no_vecdim_last_value(%in: memref<128x256xf32>, %out: memref<256xf32>) {
  %cst = arith.constant 0.000000e+00 : f32
  affine.for %i = 0 to 256 {
    %last_val = affine.for %j = 0 to 128 iter_args(%last_iter = %cst) -> (f32) {
      %ld = affine.load %in[%j, %i] : memref<128x256xf32>
      affine.yield %ld : f32
    }
    affine.store %last_val, %out[%i] : memref<256xf32>
  }
  return
}

// CHECK-LABEL: @vec_no_vecdim_last_value
// CHECK: affine.for %{{.*}} = 0 to 256 step 128 {
// CHECK: %[[vzero:.*]] = arith.constant dense<0.000000e+00> : vector<128xf32>
// CHECK: %[[last_val:.*]] = affine.for %{{.*}} = 0 to 128 iter_args(%[[last_iter:.*]] = %[[vzero]]) -> (vector<128xf32>) {
// CHECK: %[[ld:.*]] = vector.transfer_read %{{.*}} : memref<128x256xf32>, vector<128xf32>
// CHECK: affine.yield %[[ld]] : vector<128xf32>
// CHECK: vector.transfer_write %[[last_val]], %{{.*}} : vector<128xf32>, memref<256xf32>

// -----

// The inner reduction loop '%j' is not vectorized if we do not request
// reduction vectorization.
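// Here %j drives the fastest-varying dimension and carries the reduction;
// vectorizing it would have to combine the 128 lanes into a single scalar,
// which this pass only does when reduction vectorization is enabled.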
func.func @vec_vecdim_reduction_rejected(%in: memref<256x512xf32>, %out: memref<256xf32>) {
  %cst = arith.constant 0.000000e+00 : f32
  affine.for %i = 0 to 256 {
    %final_red = affine.for %j = 0 to 512 iter_args(%red_iter = %cst) -> (f32) {
      %ld = affine.load %in[%i, %j] : memref<256x512xf32>
      %add = arith.addf %red_iter, %ld : f32
      affine.yield %add : f32
    }
    affine.store %final_red, %out[%i] : memref<256xf32>
  }
  return
}

// CHECK-LABEL: @vec_vecdim_reduction_rejected