1 // RUN: mlir-opt %s -affine-loop-invariant-code-motion -split-input-file | FileCheck %s
3 func.func @nested_loops_both_having_invariant_code() {
4 %m = memref.alloc() : memref<10xf32>
5 %cf7 = arith.constant 7.0 : f32
6 %cf8 = arith.constant 8.0 : f32
8 affine.for %arg0 = 0 to 10 {
9 %v0 = arith.addf %cf7, %cf8 : f32
10 affine.for %arg1 = 0 to 10 {
11 affine.store %v0, %m[%arg0] : memref<10xf32>
15 // CHECK: memref.alloc() : memref<10xf32>
16 // CHECK-NEXT: %[[cst:.*]] = arith.constant 7.000000e+00 : f32
17 // CHECK-NEXT: %[[cst_0:.*]] = arith.constant 8.000000e+00 : f32
18 // CHECK-NEXT: arith.addf %[[cst]], %[[cst_0]] : f32
19 // CHECK-NEXT: affine.for %{{.*}} = 0 to 10 {
21 // CHECK-NEXT: affine.for %{{.*}} = 0 to 10 {
22 // CHECK-NEXT: affine.store
29 // Store-load forwarding can see through affine.apply ops since it relies on
30 // dependence information.
31 // CHECK-LABEL: func @store_affine_apply
32 func.func @store_affine_apply() -> memref<10xf32> {
33 %cf7 = arith.constant 7.0 : f32
34 %m = memref.alloc() : memref<10xf32>
35 affine.for %arg0 = 0 to 10 {
36 %t0 = affine.apply affine_map<(d1) -> (d1 + 1)>(%arg0)
37 affine.store %cf7, %m[%t0] : memref<10xf32>
39 return %m : memref<10xf32>
40 // CHECK: %[[cst:.*]] = arith.constant 7.000000e+00 : f32
41 // CHECK-NEXT: %[[VAR_0:.*]] = memref.alloc() : memref<10xf32>
42 // CHECK-NEXT: affine.for %{{.*}} = 0 to 10 {
43 // CHECK-NEXT: affine.apply
44 // CHECK-NEXT: affine.store %[[cst]]
46 // CHECK-NEXT: return %[[VAR_0]] : memref<10xf32>
51 func.func @nested_loops_code_invariant_to_both() {
52 %m = memref.alloc() : memref<10xf32>
53 %cf7 = arith.constant 7.0 : f32
54 %cf8 = arith.constant 8.0 : f32
56 affine.for %arg0 = 0 to 10 {
57 affine.for %arg1 = 0 to 10 {
58 %v0 = arith.addf %cf7, %cf8 : f32
62 // CHECK: memref.alloc() : memref<10xf32>
63 // CHECK-NEXT: %[[cst:.*]] = arith.constant 7.000000e+00 : f32
64 // CHECK-NEXT: %[[cst_0:.*]] = arith.constant 8.000000e+00 : f32
65 // CHECK-NEXT: arith.addf %[[cst]], %[[cst_0]] : f32
72 // CHECK-LABEL: func @nested_loops_inner_loops_invariant_to_outermost_loop
73 func.func @nested_loops_inner_loops_invariant_to_outermost_loop(%m : memref<10xindex>) {
74 affine.for %arg0 = 0 to 20 {
75 affine.for %arg1 = 0 to 30 {
76 %v0 = affine.for %arg2 = 0 to 10 iter_args (%prevAccum = %arg1) -> index {
77 %v1 = affine.load %m[%arg2] : memref<10xindex>
78 %newAccum = arith.addi %prevAccum, %v1 : index
79 affine.yield %newAccum : index
84 // CHECK: affine.for %{{.*}} = 0 to 30 {
85 // CHECK-NEXT: %{{.*}} = affine.for %{{.*}} = 0 to 10 iter_args(%{{.*}} = %{{.*}}) -> (index) {
86 // CHECK-NEXT: %{{.*}} = affine.load %{{.*}}[%{{.*}} : memref<10xindex>
87 // CHECK-NEXT: %{{.*}} = arith.addi %{{.*}}, %{{.*}} : index
88 // CHECK-NEXT: affine.yield %{{.*}} : index
91 // CHECK-NEXT: affine.for %{{.*}} = 0 to 20 {
99 func.func @single_loop_nothing_invariant() {
100 %m1 = memref.alloc() : memref<10xf32>
101 %m2 = memref.alloc() : memref<11xf32>
102 affine.for %arg0 = 0 to 10 {
103 %v0 = affine.load %m1[%arg0] : memref<10xf32>
104 %v1 = affine.load %m2[%arg0] : memref<11xf32>
105 %v2 = arith.addf %v0, %v1 : f32
106 affine.store %v2, %m1[%arg0] : memref<10xf32>
109 // CHECK: memref.alloc() : memref<10xf32>
110 // CHECK-NEXT: memref.alloc() : memref<11xf32>
111 // CHECK-NEXT: affine.for %{{.*}} = 0 to 10 {
112 // CHECK-NEXT: affine.load %{{.*}} : memref<10xf32>
113 // CHECK-NEXT: affine.load %{{.*}} : memref<11xf32>
114 // CHECK-NEXT: arith.addf
115 // CHECK-NEXT: affine.store %{{.*}} : memref<10xf32>
122 func.func @invariant_code_inside_affine_if() {
123 %m = memref.alloc() : memref<10xf32>
124 %cf8 = arith.constant 8.0 : f32
126 affine.for %arg0 = 0 to 10 {
127 %t0 = affine.apply affine_map<(d1) -> (d1 + 1)>(%arg0)
128 affine.if affine_set<(d0, d1) : (d1 - d0 >= 0)> (%arg0, %t0) {
129 %cf9 = arith.addf %cf8, %cf8 : f32
130 affine.store %cf9, %m[%arg0] : memref<10xf32>
135 // CHECK: memref.alloc() : memref<10xf32>
136 // CHECK-NEXT: %[[cst:.*]] = arith.constant 8.000000e+00 : f32
137 // CHECK-NEXT: affine.for %{{.*}} = 0 to 10 {
138 // CHECK-NEXT: affine.apply #map{{[0-9]*}}(%arg0)
139 // CHECK-NEXT: affine.if
140 // CHECK-NEXT: arith.addf %[[cst]], %[[cst]] : f32
141 // CHECK-NEXT: affine.store
150 func.func @dependent_stores() {
151 %m = memref.alloc() : memref<10xf32>
152 %cf7 = arith.constant 7.0 : f32
153 %cf8 = arith.constant 8.0 : f32
155 affine.for %arg0 = 0 to 10 {
156 %v0 = arith.addf %cf7, %cf8 : f32
157 affine.for %arg1 = 0 to 10 {
158 %v1 = arith.mulf %cf7, %cf7 : f32
159 affine.store %v1, %m[%arg1] : memref<10xf32>
160 affine.store %v0, %m[%arg0] : memref<10xf32>
164 // CHECK: memref.alloc() : memref<10xf32>
165 // CHECK-NEXT: %[[cst:.*]] = arith.constant 7.000000e+00 : f32
166 // CHECK-NEXT: %[[cst_0:.*]] = arith.constant 8.000000e+00 : f32
167 // CHECK-NEXT: arith.addf %[[cst]], %[[cst_0]] : f32
168 // CHECK-NEXT: %[[mul:.*]] = arith.mulf %[[cst]], %[[cst]] : f32
169 // CHECK-NEXT: affine.for %{{.*}} = 0 to 10 {
171 // CHECK-NEXT: affine.for %{{.*}} = 0 to 10 {
172 // CHECK-NEXT: affine.store %[[mul]]
173 // CHECK-NEXT: affine.store
180 func.func @independent_stores() {
181 %m = memref.alloc() : memref<10xf32>
182 %cf7 = arith.constant 7.0 : f32
183 %cf8 = arith.constant 8.0 : f32
185 affine.for %arg0 = 0 to 10 {
186 %v0 = arith.addf %cf7, %cf8 : f32
187 affine.for %arg1 = 0 to 10 {
188 %v1 = arith.mulf %cf7, %cf7 : f32
189 affine.store %v0, %m[%arg0] : memref<10xf32>
190 affine.store %v1, %m[%arg1] : memref<10xf32>
194 // CHECK: memref.alloc() : memref<10xf32>
195 // CHECK-NEXT: %[[cst:.*]] = arith.constant 7.000000e+00 : f32
196 // CHECK-NEXT: %[[cst_0:.*]] = arith.constant 8.000000e+00 : f32
197 // CHECK-NEXT: %[[add:.*]] = arith.addf %[[cst]], %[[cst_0]] : f32
198 // CHECK-NEXT: %[[mul:.*]] = arith.mulf %[[cst]], %[[cst]] : f32
199 // CHECK-NEXT: affine.for %{{.*}} = 0 to 10 {
200 // CHECK-NEXT: affine.for %{{.*}} = 0 to 10 {
201 // CHECK-NEXT: affine.store %[[add]]
202 // CHECK-NEXT: affine.store %[[mul]]
210 func.func @load_dependent_store() {
211 %m = memref.alloc() : memref<10xf32>
212 %cf7 = arith.constant 7.0 : f32
213 %cf8 = arith.constant 8.0 : f32
215 affine.for %arg0 = 0 to 10 {
216 %v0 = arith.addf %cf7, %cf8 : f32
217 affine.for %arg1 = 0 to 10 {
218 %v1 = arith.addf %cf7, %cf7 : f32
219 affine.store %v0, %m[%arg1] : memref<10xf32>
220 %v2 = affine.load %m[%arg0] : memref<10xf32>
224 // CHECK: memref.alloc() : memref<10xf32>
225 // CHECK-NEXT: %[[cst:.*]] = arith.constant 7.000000e+00 : f32
226 // CHECK-NEXT: %[[cst_0:.*]] = arith.constant 8.000000e+00 : f32
227 // CHECK-NEXT: arith.addf %[[cst]], %[[cst_0]] : f32
228 // CHECK-NEXT: arith.addf %[[cst]], %[[cst]] : f32
229 // CHECK-NEXT: affine.for %{{.*}} = 0 to 10 {
230 // CHECK-NEXT: affine.for
231 // CHECK-NEXT: affine.store
232 // CHECK-NEXT: affine.load
239 func.func @load_after_load() {
240 %m = memref.alloc() : memref<10xf32>
241 %cf7 = arith.constant 7.0 : f32
242 %cf8 = arith.constant 8.0 : f32
244 affine.for %arg0 = 0 to 10 {
245 %v0 = arith.addf %cf7, %cf8 : f32
246 affine.for %arg1 = 0 to 10 {
247 %v1 = arith.addf %cf7, %cf7 : f32
248 %v3 = affine.load %m[%arg1] : memref<10xf32>
249 %v2 = affine.load %m[%arg0] : memref<10xf32>
253 // CHECK: memref.alloc() : memref<10xf32>
254 // CHECK-NEXT: %[[cst:.*]] = arith.constant 7.000000e+00 : f32
255 // CHECK-NEXT: %[[cst_0:.*]] = arith.constant 8.000000e+00 : f32
256 // CHECK-NEXT: arith.addf %[[cst]], %[[cst_0]] : f32
257 // CHECK-NEXT: arith.addf %[[cst]], %[[cst]] : f32
258 // CHECK-NEXT: affine.for %{{.*}} = 0 to 10 {
259 // CHECK-NEXT: affine.load
261 // CHECK-NEXT: affine.for %{{.*}} = 0 to 10 {
262 // CHECK-NEXT: %{{.*}} = affine.load %{{.*}}[%{{.*}}] : memref<10xf32>
269 func.func @invariant_affine_if() {
270 %m = memref.alloc() : memref<10xf32>
271 %cf8 = arith.constant 8.0 : f32
272 affine.for %arg0 = 0 to 10 {
273 affine.for %arg1 = 0 to 10 {
274 affine.if affine_set<(d0, d1) : (d1 - d0 >= 0)> (%arg0, %arg0) {
275 %cf9 = arith.addf %cf8, %cf8 : f32
276 affine.store %cf9, %m[%arg0] : memref<10xf32>
282 // CHECK: memref.alloc() : memref<10xf32>
283 // CHECK-NEXT: %[[cst:.*]] = arith.constant 8.000000e+00 : f32
284 // CHECK-NEXT: affine.for %{{.*}} = 0 to 10 {
286 // CHECK-NEXT: affine.for %{{.*}} = 0 to 10 {
287 // CHECK-NEXT: affine.if
288 // CHECK-NEXT: arith.addf %[[cst]], %[[cst]] : f32
289 // CHECK-NEXT: affine.store
298 func.func @invariant_affine_if2() {
299 %m = memref.alloc() : memref<10xf32>
300 %cf8 = arith.constant 8.0 : f32
301 affine.for %arg0 = 0 to 10 {
302 affine.for %arg1 = 0 to 10 {
303 affine.if affine_set<(d0, d1) : (d1 - d0 >= 0)> (%arg0, %arg0) {
304 %cf9 = arith.addf %cf8, %cf8 : f32
305 affine.store %cf9, %m[%arg1] : memref<10xf32>
311 // CHECK: memref.alloc() : memref<10xf32>
312 // CHECK-NEXT: %[[cst:.*]] = arith.constant 8.000000e+00 : f32
313 // CHECK-NEXT: affine.for %{{.*}} = 0 to 10 {
314 // CHECK-NEXT: affine.for %{{.*}} = 0 to 10 {
315 // CHECK-NEXT: affine.if
316 // CHECK-NEXT: arith.addf %[[cst]], %[[cst]] : f32
317 // CHECK-NEXT: affine.store
327 func.func @invariant_affine_nested_if() {
328 %m = memref.alloc() : memref<10xf32>
329 %cf8 = arith.constant 8.0 : f32
330 affine.for %arg0 = 0 to 10 {
331 affine.for %arg1 = 0 to 10 {
332 affine.if affine_set<(d0, d1) : (d1 - d0 >= 0)> (%arg0, %arg0) {
333 %cf9 = arith.addf %cf8, %cf8 : f32
334 affine.store %cf9, %m[%arg0] : memref<10xf32>
335 affine.if affine_set<(d0, d1) : (d1 - d0 >= 0)> (%arg0, %arg0) {
336 affine.store %cf9, %m[%arg1] : memref<10xf32>
342 // CHECK: memref.alloc() : memref<10xf32>
343 // CHECK-NEXT: %[[cst:.*]] = arith.constant 8.000000e+00 : f32
344 // CHECK-NEXT: affine.for %[[arg0:.*]] = 0 to 10 {
345 // CHECK-NEXT: affine.for %[[arg1:.*]] = 0 to 10 {
346 // CHECK-NEXT: affine.if
347 // CHECK-NEXT: arith.addf %[[cst]], %[[cst]] : f32
348 // CHECK-NEXT: affine.store {{.*}}[%[[arg0]]] : memref<10xf32>
349 // CHECK-NEXT: affine.if
350 // CHECK-NEXT: affine.store {{.*}}[%[[arg1]]] : memref<10xf32>
361 func.func @invariant_affine_nested_if_else() {
362 %m = memref.alloc() : memref<10xf32>
363 %cf8 = arith.constant 8.0 : f32
364 affine.for %arg0 = 0 to 10 {
365 affine.for %arg1 = 0 to 10 {
366 affine.if affine_set<(d0, d1) : (d1 - d0 >= 0)> (%arg0, %arg0) {
367 %cf9 = arith.addf %cf8, %cf8 : f32
368 affine.store %cf9, %m[%arg0] : memref<10xf32>
369 affine.if affine_set<(d0, d1) : (d1 - d0 >= 0)> (%arg0, %arg0) {
370 affine.store %cf9, %m[%arg0] : memref<10xf32>
372 affine.store %cf9, %m[%arg1] : memref<10xf32>
378 // CHECK: memref.alloc() : memref<10xf32>
379 // CHECK-NEXT: %[[cst:.*]] = arith.constant 8.000000e+00 : f32
380 // CHECK-NEXT: affine.for %[[arg0:.*]] = 0 to 10 {
381 // CHECK-NEXT: affine.for %[[arg1:.*]] = 0 to 10 {
382 // CHECK-NEXT: affine.if
383 // CHECK-NEXT: arith.addf %[[cst]], %[[cst]] : f32
384 // CHECK-NEXT: affine.store {{.*}}[%[[arg0]]] : memref<10xf32>
385 // CHECK-NEXT: affine.if
386 // CHECK-NEXT: affine.store {{.*}}[%[[arg0]]] : memref<10xf32>
387 // CHECK-NEXT: } else {
388 // CHECK-NEXT: affine.store {{.*}}[%[[arg1]]] : memref<10xf32>
399 func.func @invariant_affine_nested_if_else2() {
400 %m = memref.alloc() : memref<10xf32>
401 %m2 = memref.alloc() : memref<10xf32>
402 %cf8 = arith.constant 8.0 : f32
403 affine.for %arg0 = 0 to 10 {
404 affine.for %arg1 = 0 to 10 {
405 affine.if affine_set<(d0, d1) : (d1 - d0 >= 0)> (%arg0, %arg0) {
406 %cf9 = arith.addf %cf8, %cf8 : f32
407 %tload1 = affine.load %m[%arg0] : memref<10xf32>
408 affine.if affine_set<(d0, d1) : (d1 - d0 >= 0)> (%arg0, %arg0) {
409 affine.store %cf9, %m2[%arg0] : memref<10xf32>
411 %tload2 = affine.load %m[%arg0] : memref<10xf32>
417 // CHECK: memref.alloc() : memref<10xf32>
418 // CHECK-NEXT: memref.alloc() : memref<10xf32>
419 // CHECK-NEXT: %[[cst:.*]] = arith.constant 8.000000e+00 : f32
420 // CHECK-NEXT: affine.for %[[arg0:.*]] = 0 to 10 {
422 // CHECK-NEXT: affine.for %{{.*}} = 0 to 10 {
423 // CHECK-NEXT: affine.if
424 // CHECK-NEXT: arith.addf %[[cst]], %[[cst]] : f32
425 // CHECK-NEXT: affine.load {{.*}}[%[[arg0]]] : memref<10xf32>
426 // CHECK-NEXT: affine.if
427 // CHECK-NEXT: affine.store {{.*}}[%[[arg0]]] : memref<10xf32>
428 // CHECK-NEXT: } else {
429 // CHECK-NEXT: affine.load {{.*}}[%[[arg0]]] : memref<10xf32>
439 func.func @invariant_affine_nested_if2() {
440 %m = memref.alloc() : memref<10xf32>
441 %cf8 = arith.constant 8.0 : f32
442 affine.for %arg0 = 0 to 10 {
443 affine.for %arg1 = 0 to 10 {
444 affine.if affine_set<(d0, d1) : (d1 - d0 >= 0)> (%arg0, %arg0) {
445 %cf9 = arith.addf %cf8, %cf8 : f32
446 %v1 = affine.load %m[%arg0] : memref<10xf32>
447 affine.if affine_set<(d0, d1) : (d1 - d0 >= 0)> (%arg0, %arg0) {
448 %v2 = affine.load %m[%arg0] : memref<10xf32>
454 // CHECK: memref.alloc() : memref<10xf32>
455 // CHECK-NEXT: %[[cst:.*]] = arith.constant 8.000000e+00 : f32
456 // CHECK-NEXT: affine.for %{{.*}} = 0 to 10 {
458 // CHECK-NEXT: affine.for %[[arg0:.*]] = 0 to 10 {
459 // CHECK-NEXT: affine.if
460 // CHECK-NEXT: arith.addf %[[cst]], %[[cst]] : f32
461 // CHECK-NEXT: affine.load {{.*}}[%[[arg0]]] : memref<10xf32>
462 // CHECK-NEXT: affine.if
463 // CHECK-NEXT: affine.load {{.*}}[%[[arg0]]] : memref<10xf32>
473 func.func @invariant_affine_for_inside_affine_if() {
474 %m = memref.alloc() : memref<10xf32>
475 %cf8 = arith.constant 8.0 : f32
476 affine.for %arg0 = 0 to 10 {
477 affine.for %arg1 = 0 to 10 {
478 affine.if affine_set<(d0, d1) : (d1 - d0 >= 0)> (%arg0, %arg0) {
479 %cf9 = arith.addf %cf8, %cf8 : f32
480 affine.store %cf9, %m[%arg0] : memref<10xf32>
481 affine.for %arg2 = 0 to 10 {
482 affine.store %cf9, %m[%arg2] : memref<10xf32>
488 // CHECK: memref.alloc() : memref<10xf32>
489 // CHECK-NEXT: %[[cst:.*]] = arith.constant 8.000000e+00 : f32
490 // CHECK-NEXT: affine.for %[[arg0:.*]] = 0 to 10 {
491 // CHECK-NEXT: affine.for %[[arg1:.*]] = 0 to 10 {
492 // CHECK-NEXT: affine.if
493 // CHECK-NEXT: arith.addf %[[cst]], %[[cst]] : f32
494 // CHECK-NEXT: affine.store {{.*}}[%[[arg0]]] : memref<10xf32>
495 // CHECK-NEXT: affine.for %[[arg2:.*]] = 0 to 10 {
496 // CHECK-NEXT: affine.store {{.*}}[%[[arg2]]] : memref<10xf32>
507 func.func @invariant_constant_and_load() {
508 %m = memref.alloc() : memref<100xf32>
509 %m2 = memref.alloc() : memref<100xf32>
510 affine.for %arg0 = 0 to 5 {
511 %c0 = arith.constant 0 : index
512 %v = affine.load %m2[%c0] : memref<100xf32>
513 affine.store %v, %m[%arg0] : memref<100xf32>
516 // CHECK: memref.alloc() : memref<100xf32>
517 // CHECK-NEXT: memref.alloc() : memref<100xf32>
518 // CHECK-NEXT: arith.constant 0 : index
519 // CHECK-NEXT: affine.load
520 // CHECK-NEXT: affine.for %{{.*}} = 0 to 5 {
521 // CHECK-NEXT: affine.store
529 func.func @nested_load_store_same_memref() {
530 %m = memref.alloc() : memref<10xf32>
531 %cst = arith.constant 8.0 : f32
532 %c0 = arith.constant 0 : index
533 affine.for %arg0 = 0 to 10 {
534 %v0 = affine.load %m[%c0] : memref<10xf32>
535 affine.for %arg1 = 0 to 10 {
536 affine.store %cst, %m[%arg1] : memref<10xf32>
540 // CHECK: memref.alloc() : memref<10xf32>
541 // CHECK-NEXT: %[[cst:.*]] = arith.constant 8.000000e+00 : f32
542 // CHECK-NEXT: arith.constant 0 : index
543 // CHECK-NEXT: affine.for %{{.*}} = 0 to 10 {
544 // CHECK-NEXT: affine.load
545 // CHECK-NEXT: affine.for
546 // CHECK-NEXT: affine.store %[[cst]]
554 func.func @nested_load_store_same_memref2() {
555 %m = memref.alloc() : memref<10xf32>
556 %cst = arith.constant 8.0 : f32
557 %c0 = arith.constant 0 : index
558 affine.for %arg0 = 0 to 10 {
559 affine.store %cst, %m[%c0] : memref<10xf32>
560 affine.for %arg1 = 0 to 10 {
561 %v0 = affine.load %m[%arg0] : memref<10xf32>
565 // CHECK: memref.alloc() : memref<10xf32>
566 // CHECK-NEXT: %[[cst:.*]] = arith.constant 8.000000e+00 : f32
567 // CHECK-NEXT: arith.constant 0 : index
568 // CHECK-NEXT: affine.for %{{.*}} = 0 to 10 {
570 // CHECK-NEXT: affine.for %{{.*}} = 0 to 10 {
571 // CHECK-NEXT: affine.store %[[cst]]
572 // CHECK-NEXT: affine.load
580 // CHECK-LABEL: func @do_not_hoist_dependent_side_effect_free_op
581 func.func @do_not_hoist_dependent_side_effect_free_op(%arg0: memref<10x512xf32>) {
582 %0 = memref.alloca() : memref<1xf32>
583 %cst = arith.constant 8.0 : f32
584 affine.for %i = 0 to 512 {
585 affine.for %j = 0 to 10 {
586 %5 = affine.load %arg0[%i, %j] : memref<10x512xf32>
587 %6 = affine.load %0[0] : memref<1xf32>
588 %add = arith.addf %5, %6 : f32
589 affine.store %add, %0[0] : memref<1xf32>
591 %3 = affine.load %0[0] : memref<1xf32>
592 %4 = arith.mulf %3, %cst : f32 // It shouldn't be hoisted.
598 // CHECK-NEXT: affine.for
599 // CHECK-NEXT: affine.load
600 // CHECK-NEXT: affine.load
601 // CHECK-NEXT: arith.addf
602 // CHECK-NEXT: affine.store
604 // CHECK-NEXT: affine.load
605 // CHECK-NEXT: arith.mulf
610 // CHECK-LABEL: func @vector_loop_nothing_invariant
611 func.func @vector_loop_nothing_invariant() {
612 %m1 = memref.alloc() : memref<40xf32>
613 %m2 = memref.alloc() : memref<40xf32>
614 affine.for %arg0 = 0 to 10 {
615 %v0 = affine.vector_load %m1[%arg0*4] : memref<40xf32>, vector<4xf32>
616 %v1 = affine.vector_load %m2[%arg0*4] : memref<40xf32>, vector<4xf32>
617 %v2 = arith.addf %v0, %v1 : vector<4xf32>
618 affine.vector_store %v2, %m1[%arg0*4] : memref<40xf32>, vector<4xf32>
624 // CHECK-NEXT: affine.vector_load
625 // CHECK-NEXT: affine.vector_load
626 // CHECK-NEXT: arith.addf
627 // CHECK-NEXT: affine.vector_store
632 // CHECK-LABEL: func @vector_loop_all_invariant
633 func.func @vector_loop_all_invariant() {
634 %m1 = memref.alloc() : memref<4xf32>
635 %m2 = memref.alloc() : memref<4xf32>
636 %m3 = memref.alloc() : memref<4xf32>
637 affine.for %arg0 = 0 to 10 {
638 %v0 = affine.vector_load %m1[0] : memref<4xf32>, vector<4xf32>
639 %v1 = affine.vector_load %m2[0] : memref<4xf32>, vector<4xf32>
640 %v2 = arith.addf %v0, %v1 : vector<4xf32>
641 affine.vector_store %v2, %m3[0] : memref<4xf32>, vector<4xf32>
646 // CHECK: memref.alloc()
647 // CHECK-NEXT: memref.alloc()
648 // CHECK-NEXT: memref.alloc()
649 // CHECK-NEXT: affine.vector_load
650 // CHECK-NEXT: affine.vector_load
651 // CHECK-NEXT: arith.addf
652 // CHECK-NEXT: affine.vector_store
653 // CHECK-NEXT: affine.for
657 #set = affine_set<(d0): (d0 - 10 >= 0)>
658 // CHECK-LABEL: func @affine_if_not_invariant(
659 func.func @affine_if_not_invariant(%buffer: memref<1024xf32>) -> f32 {
660 %sum_init_0 = arith.constant 0.0 : f32
661 %sum_init_1 = arith.constant 1.0 : f32
662 %res = affine.for %i = 0 to 10 step 2 iter_args(%sum_iter = %sum_init_0) -> f32 {
663 %t = affine.load %buffer[%i] : memref<1024xf32>
664 %sum_next = affine.if #set(%i) -> (f32) {
665 %new_sum = arith.addf %sum_iter, %t : f32
666 affine.yield %new_sum : f32
668 affine.yield %sum_iter : f32
670 %modified_sum = arith.addf %sum_next, %sum_init_1 : f32
671 affine.yield %modified_sum : f32
676 // CHECK: arith.constant 0.000000e+00 : f32
677 // CHECK-NEXT: arith.constant 1.000000e+00 : f32
678 // CHECK-NEXT: affine.for
679 // CHECK-NEXT: affine.load
680 // CHECK-NEXT: affine.if
681 // CHECK-NEXT: arith.addf
682 // CHECK-NEXT: affine.yield
683 // CHECK-NEXT: } else {
684 // CHECK-NEXT: affine.yield
686 // CHECK-NEXT: arith.addf
687 // CHECK-NEXT: affine.yield
692 // CHECK-LABEL: func @affine_for_not_invariant(
693 func.func @affine_for_not_invariant(%in : memref<30x512xf32, 1>,
694 %out : memref<30x1xf32, 1>) {
695 %sum_0 = arith.constant 0.0 : f32
696 %cst_0 = arith.constant 1.1 : f32
697 affine.for %j = 0 to 30 {
698 %sum = affine.for %i = 0 to 512 iter_args(%sum_iter = %sum_0) -> (f32) {
699 %t = affine.load %in[%j,%i] : memref<30x512xf32,1>
700 %sum_next = arith.addf %sum_iter, %t : f32
701 affine.yield %sum_next : f32
703 %mod_sum = arith.mulf %sum, %cst_0 : f32
704 affine.store %mod_sum, %out[%j, 0] : memref<30x1xf32, 1>
709 // CHECK: arith.constant 0.000000e+00 : f32
710 // CHECK-NEXT: arith.constant 1.100000e+00 : f32
711 // CHECK-NEXT: affine.for
712 // CHECK-NEXT: affine.for
713 // CHECK-NEXT: affine.load
714 // CHECK-NEXT: arith.addf
715 // CHECK-NEXT: affine.yield
717 // CHECK-NEXT: arith.mulf
718 // CHECK-NEXT: affine.store
722 // CHECK-LABEL: func @use_of_iter_operands_invariant
723 func.func @use_of_iter_operands_invariant(%m : memref<10xindex>) {
724 %sum_1 = arith.constant 0 : index
725 %v0 = affine.for %arg1 = 0 to 11 iter_args (%prevAccum = %sum_1) -> index {
726 %prod = arith.muli %sum_1, %sum_1 : index
727 %newAccum = arith.addi %prevAccum, %prod : index
728 affine.yield %newAccum : index
735 // CHECK-NEXT: affine.for
737 // CHECK-NEXT: affine.yield
741 // CHECK-LABEL: func @use_of_iter_args_not_invariant
742 func.func @use_of_iter_args_not_invariant(%m : memref<10xindex>) {
743 %sum_1 = arith.constant 0 : index
744 %v0 = affine.for %arg1 = 0 to 11 iter_args (%prevAccum = %sum_1) -> index {
745 %newAccum = arith.addi %prevAccum, %sum_1 : index
746 affine.yield %newAccum : index
751 // CHECK: arith.constant
752 // CHECK-NEXT: affine.for
753 // CHECK-NEXT: arith.addi
754 // CHECK-NEXT: affine.yield
756 #map = affine_map<(d0) -> (64, d0 * -64 + 1020)>
757 // CHECK-LABEL: func.func @affine_parallel
758 func.func @affine_parallel(%memref_8: memref<4090x2040xf32>, %x: index) {
759 %cst = arith.constant 0.000000e+00 : f32
760 affine.parallel (%arg3) = (0) to (32) {
761 affine.for %arg4 = 0 to 16 {
762 affine.parallel (%arg5, %arg6) = (0, 0) to (min(128, 122), min(64, %arg3 * -64 + 2040)) {
763 affine.for %arg7 = 0 to min #map(%arg4) {
764 affine.store %cst, %memref_8[%arg5 + 3968, %arg6 + %arg3 * 64] : memref<4090x2040xf32>
769 // CHECK: affine.parallel
770 // CHECK-NEXT: affine.for
771 // CHECK-NEXT: affine.parallel
772 // CHECK-NEXT: affine.store
773 // CHECK-NEXT: affine.for
775 %c0 = arith.constant 0 : index
776 %c1 = arith.constant 1 : index
777 %c32 = arith.constant 32 : index
778 scf.parallel (%arg3) = (%c0) to (%c32) step (%c1) {
779 affine.for %arg4 = 0 to 16 {
780 affine.parallel (%arg5, %arg6) = (0, 0) to (min(128, 122), min(64, %x * -64 + 2040)) {
781 affine.for %arg7 = 0 to min #map(%arg4) {
782 affine.store %cst, %memref_8[%arg5 + 3968, %arg6] : memref<4090x2040xf32>
787 // CHECK: scf.parallel
788 // CHECK-NEXT: affine.for
789 // CHECK-NEXT: affine.parallel
790 // CHECK-NEXT: affine.store
791 // CHECK-NEXT: affine.for
793 affine.for %arg3 = 0 to 32 {
794 affine.for %arg4 = 0 to 16 {
795 affine.parallel (%arg5, %arg6) = (0, 0) to (min(128, 122), min(64, %arg3 * -64 + 2040)) {
796 // Unknown region-holding op for this pass.
797 scf.for %arg7 = %c0 to %x step %c1 {
798 affine.store %cst, %memref_8[%arg5 + 3968, %arg6 + %arg3 * 64] : memref<4090x2040xf32>
804 // CHECK-NEXT: affine.for
805 // CHECK-NEXT: affine.parallel
806 // CHECK-NEXT: scf.for
807 // CHECK-NEXT: affine.store
814 // CHECK-LABEL: func.func @affine_invariant_use_after_dma
815 #map = affine_map<(d0) -> (d0 * 163840)>
816 func.func @affine_invariant_use_after_dma(%arg0: memref<10485760xi32>, %arg1: memref<1xi32>, %arg2: memref<10485760xi32>) {
817 %c320 = arith.constant 320 : index
818 %c0 = arith.constant 0 : index
819 %c1 = arith.constant 1 : index
820 %alloc = memref.alloc() {alignment = 16 : i64} : memref<0xi32, 2>
821 %alloc_0 = memref.alloc() : memref<1xi32, 2>
822 affine.for %arg3 = 0 to 64 {
823 %0 = affine.apply #map(%arg3)
824 %alloc_1 = memref.alloc() {alignment = 16 : i64} : memref<0xi32, 2>
825 %alloc_2 = memref.alloc() : memref<320xi32, 2>
826 affine.dma_start %arg0[%0], %alloc_2[%c0], %alloc_1[%c0], %c320 : memref<10485760xi32>, memref<320xi32, 2>, memref<0xi32, 2>
827 affine.dma_start %arg1[%c0], %alloc_0[%c0], %alloc[%c0], %c1 : memref<1xi32>, memref<1xi32, 2>, memref<0xi32, 2>
828 affine.dma_wait %alloc_1[%c0], %c320 : memref<0xi32, 2>
829 affine.dma_wait %alloc[%c0], %c1 : memref<0xi32, 2>
830 %1 = affine.apply #map(%arg3)
831 %alloc_3 = memref.alloc() {alignment = 16 : i64} : memref<0xi32, 2>
832 %alloc_4 = memref.alloc() : memref<320xi32, 2>
833 affine.for %arg4 = 0 to 320 {
834 %2 = affine.load %alloc_2[%arg4] : memref<320xi32, 2>
835 %3 = affine.load %alloc_0[0] : memref<1xi32, 2>
836 %4 = arith.addi %2, %3 : i32
837 %5 = arith.addi %4, %2 : i32
838 affine.store %5, %alloc_4[%arg4] : memref<320xi32, 2>
840 affine.dma_start %alloc_4[%c0], %arg2[%1], %alloc_3[%c0], %c320 : memref<320xi32, 2>, memref<10485760xi32>, memref<0xi32, 2>
841 affine.dma_wait %alloc_3[%c0], %c320 : memref<0xi32, 2>
845 // CHECK: %[[zero:.*]] = arith.constant 0 : index
846 // CHECK: %[[scalar_mem:.*]] = memref.alloc() : memref<1xi32, 2>
847 // CHECK: affine.dma_start %arg1[%[[zero]]], %alloc_0[%[[zero]]], %alloc[%[[zero]]], %c1
848 // CHECK: affine.load %[[scalar_mem]][0]
852 // CHECK-LABEL: func @affine_prefetch_invariant
853 func.func @affine_prefetch_invariant() {
854 %0 = memref.alloc() : memref<10x10xf32>
855 affine.for %i0 = 0 to 10 {
856 affine.for %i1 = 0 to 10 {
857 %1 = affine.load %0[%i0, %i1] : memref<10x10xf32>
858 affine.prefetch %0[%i0, %i0], write, locality<0>, data : memref<10x10xf32>
862 // CHECK: memref.alloc() : memref<10x10xf32>
863 // CHECK-NEXT: affine.for %{{.*}} = 0 to 10 {
864 // CHECK-NEXT: affine.prefetch
865 // CHECK-NEXT: affine.for %{{.*}} = 0 to 10 {
866 // CHECK-NEXT: %{{.*}} = affine.load %{{.*}}[%{{.*}} : memref<10x10xf32>
872 // Side-effecting ops shouldn't be hoisted.
874 // CHECK-LABEL: func @side_effecting_ops
875 func.func @side_effecting_ops() {
876 %cst = arith.constant 0.0 : f32
877 %m0 = memref.alloc(): memref<1x512x16x16xf32>
879 affine.for %arg783 = 0 to 14 {
880 affine.for %arg784 = 0 to 14 {
881 affine.parallel (%arg785) = (0) to (512) {
882 affine.for %arg786 = 0 to 1 {
883 affine.for %arg787 = 0 to 1 {
884 affine.for %arg788 = 0 to 1 {
885 %m1 = memref.alloc() : memref<1xf32, 3>
886 %m2 = memref.alloc() : memref<1xf32, 3>
887 affine.store %cst, %m1[0] : memref<1xf32, 3>
888 affine.store %cst, %m2[0] : memref<1xf32, 3>
889 %memref_2897, %asyncToken_2898 = gpu.alloc async [%0] () : memref<1x512x16x16xf32>
890 %2432 = gpu.memcpy async [%0] %memref_2897, %m0 : memref<1x512x16x16xf32>, memref<1x512x16x16xf32>
891 affine.for %arg789 = 0 to 16 {
892 affine.for %arg790 = 0 to 16 {
893 affine.store %cst, %memref_2897[0, %arg785 + %arg788, %arg789, %arg790] : memref<1x512x16x16xf32>
896 memref.dealloc %m2 : memref<1xf32, 3>
897 memref.dealloc %m1 : memref<1xf32, 3>
898 %2433 = gpu.memcpy async [%0] %m0, %memref_2897 : memref<1x512x16x16xf32>, memref<1x512x16x16xf32>
899 %2434 = gpu.dealloc async [%asyncToken_2898] %memref_2897 : memref<1x512x16x16xf32>
906 // CHECK: affine.for %{{.*}} = 0 to 1
907 // CHECK-NEXT: affine.for %{{.*}} = 0 to 1
908 // CHECK: memref.alloc
909 // CHECK: memref.alloc
911 // CHECK: affine.for %{{.*}} = 0 to 16
912 // CHECK: affine.for %{{.*}} = 0 to 16
913 // CHECK: memref.dealloc