// RUN: mlir-opt -allow-unregistered-dialect %s -pass-pipeline='builtin.module(func.func(affine-loop-fusion))' -split-input-file | FileCheck %s

// Part II of fusion tests in mlir/test/Transforms/loop-fusion-2.mlir.
// Part III of fusion tests in mlir/test/Transforms/loop-fusion-3.mlir.
// Part IV of fusion tests in mlir/test/Transforms/loop-fusion-4.mlir.

// TODO: Add more tests:
// *) Add nested fusion test cases when non-constant loop bound support is
//    added to the iteration domain in the dependence check.
// *) Add a test w/ floordiv/ceildiv/mod when supported in the dependence check.
// *) Add tests which check fused computation slice indexing and loop bounds.
// TODO: Test clean up: move memref allocs to func args.

// -----
// CHECK-LABEL: func @should_fuse_raw_dep_for_locality() {
func.func @should_fuse_raw_dep_for_locality() {
  %m = memref.alloc() : memref<10xf32>
  %cf7 = arith.constant 7.0 : f32

  affine.for %i0 = 0 to 10 {
    affine.store %cf7, %m[%i0] : memref<10xf32>
  }
  affine.for %i1 = 0 to 10 {
    %v0 = affine.load %m[%i1] : memref<10xf32>
  }
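  // Fusing the consumer loop '%i1' into the producer loop '%i0' places the
  // load right after the store, and '%m' is privatized to a single element.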
  // CHECK: affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[0] : memref<1xf32>
  // CHECK-NEXT: affine.load %{{.*}}[0] : memref<1xf32>
  return
}

// -----
// CHECK-LABEL: func @should_fuse_reduction_to_pointwise() {
func.func @should_fuse_reduction_to_pointwise() {
  %a = memref.alloc() : memref<10x10xf32>
  %b = memref.alloc() : memref<10xf32>
  %c = memref.alloc() : memref<10xf32>

  %cf7 = arith.constant 7.0 : f32

  affine.for %i0 = 0 to 10 {
    affine.for %i1 = 0 to 10 {
      %v0 = affine.load %b[%i0] : memref<10xf32>
      %v1 = affine.load %a[%i0, %i1] : memref<10x10xf32>
      %v3 = arith.addf %v0, %v1 : f32
      affine.store %v3, %b[%i0] : memref<10xf32>
    }
  }
  affine.for %i2 = 0 to 10 {
    %v4 = affine.load %b[%i2] : memref<10xf32>
    affine.store %v4, %c[%i2] : memref<10xf32>
  }

  // Should fuse the entire inner loop on %i1 from the source loop nest, as %i1
  // is not used in the access function of the store/load on %b.
  // CHECK: affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT: affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT: affine.load %{{.*}}[0] : memref<1xf32>
  // CHECK-NEXT: affine.load %{{.*}}[%{{.*}}, %{{.*}}] : memref<10x10xf32>
  // CHECK-NEXT: arith.addf %{{.*}}, %{{.*}} : f32
  // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[0] : memref<1xf32>
  // CHECK-NEXT: }
  // CHECK-NEXT: affine.load %{{.*}}[0] : memref<1xf32>
  // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32>
  return
}

// -----
// CHECK-DAG: [[$MAP_SHIFT_MINUS_ONE_R1:#map[0-9a-zA-Z_]*]] = affine_map<(d0) -> (d0 - 1)>
// CHECK-DAG: [[$MAP_SHIFT_D0_BY_ONE:#map[0-9a-zA-Z_]*]] = affine_map<(d0, d1) -> (d0 + 1)>
// CHECK-DAG: [[$MAP_SHIFT_D1_BY_ONE:#map[0-9a-zA-Z_]*]] = affine_map<(d0, d1) -> (d1 + 1)>

// CHECK-LABEL: func @should_fuse_loop_nests_with_shifts() {
func.func @should_fuse_loop_nests_with_shifts() {
  %a = memref.alloc() : memref<10x10xf32>
  %cf7 = arith.constant 7.0 : f32

  affine.for %i0 = 0 to 9 {
    affine.for %i1 = 0 to 9 {
      affine.store %cf7, %a[%i0 + 1, %i1 + 1] : memref<10x10xf32>
    }
  }
  affine.for %i2 = 1 to 10 {
    affine.for %i3 = 1 to 10 {
      %v0 = affine.load %a[%i2, %i3] : memref<10x10xf32>
    }
  }

  // Source slice affine apply sequence:
  // *) The first two affine applies map from the dst to the src iteration space.
  // *) The third affine apply is the access function around the src store.
  // *) The fourth affine apply shifts the store's access function by '-1',
  //    because of the offset induced by reducing the memref shape from 10x10
  //    to 9x9.
  // *) The fifth affine apply shifts the load's access function by '-1', for
  //    the same reason.
  // NOTE: Should create a private memref with a reduced shape; here it is
  // shrunk to memref<1x1xf32>.
  // CHECK: affine.for %{{.*}} = 1 to 10 {
  // CHECK-NEXT: affine.for %{{.*}} = 1 to 10 {
  // CHECK-NEXT: %[[I:.*]] = affine.apply [[$MAP_SHIFT_MINUS_ONE_R1]](%{{.*}})
  // CHECK-NEXT: %[[J:.*]] = affine.apply [[$MAP_SHIFT_MINUS_ONE_R1]](%{{.*}})
  // CHECK-NEXT: affine.apply [[$MAP_SHIFT_D0_BY_ONE]](%[[I]], %[[J]])
  // CHECK-NEXT: affine.apply [[$MAP_SHIFT_D1_BY_ONE]](%[[I]], %[[J]])
  // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[0, 0] : memref<1x1xf32>
  // CHECK-NEXT: affine.load %{{.*}}[0, 0] : memref<1x1xf32>
  // CHECK-NEXT: }
  // CHECK-NEXT: }
  // CHECK-NEXT: return
  return
}

// -----
// CHECK-LABEL: func @should_fuse_loop_nest() {
func.func @should_fuse_loop_nest() {
  %a = memref.alloc() : memref<10x10xf32>
  %b = memref.alloc() : memref<10x10xf32>
  %cf7 = arith.constant 7.0 : f32

  affine.for %i0 = 0 to 10 {
    affine.for %i1 = 0 to 10 {
      affine.store %cf7, %a[%i0, %i1] : memref<10x10xf32>
    }
  }
  affine.for %i2 = 0 to 10 {
    affine.for %i3 = 0 to 10 {
      %v0 = affine.load %a[%i3, %i2] : memref<10x10xf32>
      affine.store %v0, %b[%i2, %i3] : memref<10x10xf32>
    }
  }
  affine.for %i4 = 0 to 10 {
    affine.for %i5 = 0 to 10 {
      %v1 = affine.load %b[%i4, %i5] : memref<10x10xf32>
    }
  }
  // Expecting private memref for '%a' first, then private memref for '%b'.
  // CHECK-DAG: [[NEWA:%[0-9a-zA-Z_]+]] = memref.alloc() : memref<1x1xf32>
  // CHECK-DAG: [[NEWB:%[0-9a-zA-Z_]+]] = memref.alloc() : memref<1x1xf32>
  // CHECK: affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT: affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT: affine.store %{{.*}}, [[NEWA]][0, 0] : memref<1x1xf32>
  // CHECK-NEXT: affine.load [[NEWA]][0, 0] : memref<1x1xf32>
  // CHECK-NEXT: affine.store %{{.*}}, [[NEWB]][0, 0] : memref<1x1xf32>
  // CHECK-NEXT: affine.load [[NEWB]][0, 0] : memref<1x1xf32>
  // CHECK-NEXT: }
  // CHECK-NEXT: }
  // CHECK-NEXT: return
  return
}

// -----
// CHECK-LABEL: func @should_fuse_across_intermediate_loop_with_no_deps() {
func.func @should_fuse_across_intermediate_loop_with_no_deps() {
  %a = memref.alloc() : memref<10xf32>
  %b = memref.alloc() : memref<10xf32>
  %c = memref.alloc() : memref<10xf32>

  %cf7 = arith.constant 7.0 : f32

  affine.for %i0 = 0 to 10 {
    %v0 = affine.load %a[%i0] : memref<10xf32>
    affine.store %v0, %b[%i0] : memref<10xf32>
  }
  affine.for %i1 = 0 to 10 {
    affine.store %cf7, %c[%i1] : memref<10xf32>
  }
  affine.for %i2 = 0 to 10 {
    %v1 = affine.load %b[%i2] : memref<10xf32>
  }

  // Should fuse the first loop (past the second loop, with which it has no
  // dependences) into the third.
  // Note that fusion creates a private memref '%2' for the fused loop nest.
  // CHECK: affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK: affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT: affine.load %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[0] : memref<1xf32>
  // CHECK-NEXT: affine.load %{{.*}}[0] : memref<1xf32>
  // CHECK-NEXT: }
  // CHECK-NEXT: return
  return
}

// -----
// CHECK-LABEL: func @should_fuse_all_loops() {
func.func @should_fuse_all_loops() {
  %a = memref.alloc() : memref<10xf32>
  %b = memref.alloc() : memref<10xf32>
  %cf7 = arith.constant 7.0 : f32

  // Set up flow dependences from the first and second loops to the third.
  affine.for %i0 = 0 to 10 {
    affine.store %cf7, %a[%i0] : memref<10xf32>
  }
  affine.for %i1 = 0 to 10 {
    affine.store %cf7, %b[%i1] : memref<10xf32>
  }
  affine.for %i2 = 0 to 10 {
    %v0 = affine.load %a[%i2] : memref<10xf32>
    %v1 = affine.load %b[%i2] : memref<10xf32>
  }

  // Should fuse the first and second loops into the third.
  // Expecting private memref for '%a' first, then private memref for '%b'.
  // CHECK-DAG: [[NEWA:%[0-9a-zA-Z_]+]] = memref.alloc() : memref<1xf32>
  // CHECK-DAG: [[NEWB:%[0-9a-zA-Z_]+]] = memref.alloc() : memref<1xf32>
  // CHECK: affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT: affine.store %{{.*}}, [[NEWA]][0] : memref<1xf32>
  // CHECK-NEXT: affine.store %{{.*}}, [[NEWB]][0] : memref<1xf32>
  // CHECK-NEXT: affine.load [[NEWA]][0] : memref<1xf32>
  // CHECK-NEXT: affine.load [[NEWB]][0] : memref<1xf32>
  // CHECK-NEXT: }
  // CHECK-NEXT: return
  return
}

// -----
// CHECK-LABEL: func @should_fuse_first_and_second_loops() {
func.func @should_fuse_first_and_second_loops() {
  %a = memref.alloc() : memref<10xf32>
  %b = memref.alloc() : memref<10xf32>
  %c = memref.alloc() : memref<10xf32>

  %cf7 = arith.constant 7.0 : f32

  affine.for %i0 = 0 to 10 {
    affine.store %cf7, %a[%i0] : memref<10xf32>
  }
  affine.for %i1 = 0 to 10 {
    %v0 = affine.load %a[%i1] : memref<10xf32>
    affine.store %cf7, %b[%i1] : memref<10xf32>
  }
  affine.for %i2 = 0 to 10 {
    %v1 = affine.load %c[%i2] : memref<10xf32>
  }

  // Should fuse the first loop into the second (the last loop should not be
  // fused). Should create private memref '%2' for the fused loop nest.
  // CHECK: affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[0] : memref<1xf32>
  // CHECK-NEXT: affine.load %{{.*}}[0] : memref<1xf32>
  // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK: affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT: affine.load %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT: }
  // CHECK-NEXT: return
  return
}

// -----
// CHECK-LABEL: func @should_not_fuse_would_create_cycle() {
func.func @should_not_fuse_would_create_cycle() {
  %a = memref.alloc() : memref<10xf32>
  %b = memref.alloc() : memref<10xf32>
  %c = memref.alloc() : memref<10xf32>

  %cf7 = arith.constant 7.0 : f32

  // Set up the following dependences:
  // 1) loop0 -> loop1 on memref '%a'
  // 2) loop0 -> loop2 on memref '%b'
  // 3) loop1 -> loop2 on memref '%c'
  affine.for %i0 = 0 to 10 {
    %v0 = affine.load %a[%i0] : memref<10xf32>
    affine.store %cf7, %b[%i0] : memref<10xf32>
  }
  affine.for %i1 = 0 to 10 {
    affine.store %cf7, %a[%i1] : memref<10xf32>
    %v1 = affine.load %c[%i1] : memref<10xf32>
  }
  affine.for %i2 = 0 to 10 {
    %v2 = affine.load %b[%i2] : memref<10xf32>
    affine.store %cf7, %c[%i2] : memref<10xf32>
  }
  // Should not fuse: fusing the first loop into the last would create a cycle.
  // CHECK: affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT: affine.load %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK: affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT: affine.load %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK: affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT: affine.load %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT: }
  // CHECK-NEXT: return
  return
}

// -----
// CHECK-LABEL: func @should_fuse_producer_consumer() {
func.func @should_fuse_producer_consumer() {
  %m = memref.alloc() : memref<10xf32>
  %cf7 = arith.constant 7.0 : f32

  affine.for %i0 = 0 to 10 {
    affine.store %cf7, %m[%i0] : memref<10xf32>
  }
  affine.for %i1 = 0 to 10 {
    affine.store %cf7, %m[%i1] : memref<10xf32>
  }
  affine.for %i2 = 0 to 10 {
    %v1 = affine.load %m[%i2] : memref<10xf32>
  }
  // Fusing loop %i0 into %i2 would violate the WAW dependence between %i0 and
  // %i1, but it is OK to fuse %i1 into %i2.
  // TODO: When the fusion pass is run to a fixed-point, it should
  // fuse all three of these loop nests.
  // CHECK: memref.alloc() : memref<1xf32>
  // CHECK: affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[0] : memref<1xf32>
  // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[0] : memref<1xf32>
  // CHECK-NEXT: affine.load %{{.*}}[0] : memref<1xf32>
  // CHECK-NEXT: }
  // CHECK-NEXT: return
  return
}

// -----
// CHECK-LABEL: func @should_fuse_and_move_to_preserve_war_dep() {
func.func @should_fuse_and_move_to_preserve_war_dep() {
  %a = memref.alloc() : memref<10xf32>
  %b = memref.alloc() : memref<10xf32>
  %cf7 = arith.constant 7.0 : f32

  affine.for %i0 = 0 to 10 {
    %v0 = affine.load %a[%i0] : memref<10xf32>
    affine.store %v0, %b[%i0] : memref<10xf32>
  }
  affine.for %i1 = 0 to 10 {
    affine.store %cf7, %a[%i1] : memref<10xf32>
  }
  affine.for %i2 = 0 to 10 {
    %v1 = affine.load %b[%i2] : memref<10xf32>
  }
  // Loops '%i1' and '%i2' have no dependences. We can fuse a slice of '%i0'
  // into '%i2' if we move the fused loop nest before '%i1', which preserves
  // the WAR dependence from the load of '%a' in '%i0' to the store of '%a'
  // in loop '%i1'.
  // CHECK: affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT: affine.load %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[0] : memref<1xf32>
  // CHECK-NEXT: affine.load %{{.*}}[0] : memref<1xf32>
  // CHECK-NEXT: }
  // CHECK-NEXT: affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT: }
  // CHECK-NEXT: return
  return
}

// -----
// CHECK-LABEL: func @should_fuse_if_top_level_access() {
func.func @should_fuse_if_top_level_access() {
  %m = memref.alloc() : memref<10xf32>
  %cf7 = arith.constant 7.0 : f32

  affine.for %i0 = 0 to 10 {
    affine.store %cf7, %m[%i0] : memref<10xf32>
  }
  affine.for %i1 = 0 to 10 {
    %v0 = affine.load %m[%i1] : memref<10xf32>
  }

  %c0 = arith.constant 4 : index
  %v1 = affine.load %m[%c0] : memref<10xf32>
  // The top-level load from '%m' should prevent creating a private memref,
  // but the loop nests should be fused and '%i0' should be removed.
  // CHECK: %[[m:.*]] = memref.alloc() : memref<10xf32>
  // CHECK-NOT: memref.alloc

  // CHECK: affine.for %[[i1:.*]] = 0 to 10 {
  // CHECK-NEXT: affine.store %{{.*}}, %[[m]][%[[i1]]] : memref<10xf32>
  // CHECK-NEXT: affine.load %[[m]][%[[i1]]] : memref<10xf32>
  // CHECK: affine.load %[[m]][%{{.*}}] : memref<10xf32>
  return
}

// -----
// CHECK-LABEL: func @should_fuse_but_not_remove_src() {
func.func @should_fuse_but_not_remove_src() {
  %m = memref.alloc() : memref<100xf32>
  %cf7 = arith.constant 7.0 : f32

  affine.for %i0 = 0 to 100 {
    affine.store %cf7, %m[%i0] : memref<100xf32>
  }
  affine.for %i1 = 0 to 17 {
    %v0 = affine.load %m[%i1] : memref<100xf32>
  }
  %v1 = affine.load %m[99] : memref<100xf32>

  // Loops '%i0' and '%i1' should be fused, but '%i0' shouldn't be removed, to
  // preserve the dependence with the top-level access.
  // CHECK: affine.for %{{.*}} = 0 to 100 {
  // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<100xf32>
  // CHECK-NEXT: }
  // CHECK-NEXT: affine.for %{{.*}} = 0 to 17 {
  // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[0] : memref<1xf32>
  // CHECK-NEXT: affine.load %{{.*}}[0] : memref<1xf32>
  // CHECK-NEXT: }
  // CHECK-NEXT: affine.load %{{.*}}[99] : memref<100xf32>
  // CHECK-NEXT: return
  return
}

// -----
// CHECK-LABEL: func @should_fuse_no_top_level_access() {
func.func @should_fuse_no_top_level_access() {
  %m = memref.alloc() : memref<10xf32>
  %cf7 = arith.constant 7.0 : f32

  affine.for %i0 = 0 to 10 {
    affine.store %cf7, %m[%i0] : memref<10xf32>
  }
  affine.for %i1 = 0 to 10 {
    %v0 = affine.load %m[%i1] : memref<10xf32>
  }
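  // With no top-level access to '%m', both loops fuse and the store/load pair
  // is privatized to a single-element memref.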
  // CHECK: affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[0] : memref<1xf32>
  // CHECK-NEXT: affine.load %{{.*}}[0] : memref<1xf32>
  // CHECK-NEXT: }
  // CHECK-NEXT: return
  return
}

// -----
#set0 = affine_set<(d0) : (1 == 0)>

// CHECK-LABEL: func @should_not_fuse_if_op_at_top_level() {
func.func @should_not_fuse_if_op_at_top_level() {
  %m = memref.alloc() : memref<10xf32>
  %cf7 = arith.constant 7.0 : f32

  affine.for %i0 = 0 to 10 {
    affine.store %cf7, %m[%i0] : memref<10xf32>
  }
  affine.for %i1 = 0 to 10 {
    %v0 = affine.load %m[%i1] : memref<10xf32>
  }
  %c0 = arith.constant 4 : index
  affine.if #set0(%c0) {
  }
  // Top-level IfOp should prevent fusion.
  // CHECK: affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK: affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT: affine.load %{{.*}}[%{{.*}}] : memref<10xf32>
  return
}

// -----
#set0 = affine_set<(d0) : (1 == 0)>

// CHECK-LABEL: func @should_not_fuse_if_op_in_loop_nest() {
func.func @should_not_fuse_if_op_in_loop_nest() {
  %m = memref.alloc() : memref<10xf32>
  %cf7 = arith.constant 7.0 : f32
  %c4 = arith.constant 4 : index

  affine.for %i0 = 0 to 10 {
    affine.store %cf7, %m[%i0] : memref<10xf32>
  }
  affine.for %i1 = 0 to 10 {
    affine.if #set0(%c4) {
    }
    %v0 = affine.load %m[%i1] : memref<10xf32>
  }

  // IfOp in ForOp should prevent fusion.
  // CHECK: affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK: affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT: affine.if #set(%{{.*}}) {
  // CHECK-NEXT: }
  // CHECK-NEXT: affine.load %{{.*}}[%{{.*}}] : memref<10xf32>
  return
}

// -----
#set = affine_set<(d0) : (d0 - 1 >= 0)>

// CHECK-LABEL: func @should_fuse_if_op_in_loop_nest_not_sandwiched() -> memref<10xf32> {
func.func @should_fuse_if_op_in_loop_nest_not_sandwiched() -> memref<10xf32> {
  %a = memref.alloc() : memref<10xf32>
  %b = memref.alloc() : memref<10xf32>
  %cf7 = arith.constant 7.0 : f32

  affine.for %i0 = 0 to 10 {
    affine.store %cf7, %a[%i0] : memref<10xf32>
  }
  affine.for %i1 = 0 to 10 {
    %v0 = affine.load %a[%i1] : memref<10xf32>
    affine.store %v0, %b[%i1] : memref<10xf32>
  }
  affine.for %i2 = 0 to 10 {
    affine.if #set(%i2) {
      %v0 = affine.load %b[%i2] : memref<10xf32>
    }
  }

  // An IfOp in a ForOp should not prevent fusion if it does not lie in
  // between the source and dest ForOps.

  // CHECK: affine.for
  // CHECK-NEXT: affine.store
  // CHECK-NEXT: affine.load
  // CHECK-NEXT: affine.store
  // CHECK: affine.for
  // CHECK-NEXT: affine.if
  // CHECK-NEXT: affine.load
  // CHECK-NOT: affine.for
  return %a : memref<10xf32>
}

// -----
#set = affine_set<(d0) : (d0 - 1 >= 0)>

// CHECK-LABEL: func @should_not_fuse_if_op_in_loop_nest_between_src_and_dest() -> memref<10xf32> {
func.func @should_not_fuse_if_op_in_loop_nest_between_src_and_dest() -> memref<10xf32> {
  %a = memref.alloc() : memref<10xf32>
  %b = memref.alloc() : memref<10xf32>
  %cf7 = arith.constant 7.0 : f32

  affine.for %i0 = 0 to 10 {
    affine.store %cf7, %a[%i0] : memref<10xf32>
  }
  affine.for %i1 = 0 to 10 {
    affine.if #set(%i1) {
      affine.store %cf7, %a[%i1] : memref<10xf32>
    }
  }
  affine.for %i3 = 0 to 10 {
    %v0 = affine.load %a[%i3] : memref<10xf32>
    affine.store %v0, %b[%i3] : memref<10xf32>
  }
  return %b : memref<10xf32>

  // An IfOp in a ForOp that modifies the memref should prevent fusion if it
  // lies in between the source and dest ForOps.

  // CHECK: affine.for
  // CHECK-NEXT: affine.store
  // CHECK: affine.for
  // CHECK-NEXT: affine.if
  // CHECK-NEXT: affine.store
  // CHECK: affine.for
  // CHECK-NEXT: affine.load
  // CHECK-NEXT: affine.store
}

// -----
// CHECK-LABEL: func @permute_and_fuse() {
func.func @permute_and_fuse() {
  %m = memref.alloc() : memref<10x20x30xf32>

  %cf7 = arith.constant 7.0 : f32
  affine.for %i0 = 0 to 10 {
    affine.for %i1 = 0 to 20 {
      affine.for %i2 = 0 to 30 {
        affine.store %cf7, %m[%i0, %i1, %i2] : memref<10x20x30xf32>
      }
    }
  }
  affine.for %i3 = 0 to 30 {
    affine.for %i4 = 0 to 10 {
      affine.for %i5 = 0 to 20 {
        %v0 = affine.load %m[%i4, %i5, %i3] : memref<10x20x30xf32>
        "foo"(%v0) : (f32) -> ()
      }
    }
  }
  // CHECK: affine.for %{{.*}} = 0 to 30 {
  // CHECK-NEXT: affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT: affine.for %{{.*}} = 0 to 20 {
  // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[0, 0, 0] : memref<1x1x1xf32>
  // CHECK-NEXT: affine.load %{{.*}}[0, 0, 0] : memref<1x1x1xf32>
  // CHECK-NEXT: "foo"(%{{.*}}) : (f32) -> ()
  // CHECK-NEXT: }
  // CHECK-NEXT: }
  // CHECK-NEXT: }
  // CHECK-NEXT: return
  return
}

// -----
// CHECK-DAG: [[$MAP0:#map[0-9a-zA-Z_]*]] = affine_map<(d0, d1) -> (d0 * 4 + d1)>
// CHECK-DAG: [[$MAP1:#map[0-9a-zA-Z_]*]] = affine_map<(d0) -> (d0 floordiv 4)>
// CHECK-DAG: [[$MAP2:#map[0-9a-zA-Z_]*]] = affine_map<(d0) -> (d0 mod 4)>

// Reshape from a 64 x f32 to 16 x 4 x f32.
// CHECK-LABEL: func @fuse_reshape_64_16_4
func.func @fuse_reshape_64_16_4(%in : memref<64xf32>) {
  %out = memref.alloc() : memref<16x4xf32>

  affine.for %i0 = 0 to 64 {
    %v = affine.load %in[%i0] : memref<64xf32>
    affine.store %v, %out[%i0 floordiv 4, %i0 mod 4] : memref<16x4xf32>
  }

  affine.for %i1 = 0 to 16 {
    affine.for %i2 = 0 to 4 {
      %w = affine.load %out[%i1, %i2] : memref<16x4xf32>
      "foo"(%w) : (f32) -> ()
    }
  }
  // CHECK: affine.for %{{.*}} =
  // CHECK-NEXT: affine.for %{{.*}} =
  // CHECK-NOT: affine.for
  // CHECK: }
  // CHECK-NEXT: }
  // CHECK-NEXT: return
  return
}

// -----
// CHECK-DAG: [[$MAP0:#map[0-9a-zA-Z_]*]] = affine_map<(d0) -> (d0 floordiv 4)>
// CHECK-DAG: [[$MAP1:#map[0-9a-zA-Z_]*]] = affine_map<(d0) -> (d0 mod 4)>
// CHECK-DAG: [[$MAP2:#map[0-9a-zA-Z_]*]] = affine_map<(d0, d1) -> (d0 * 4 + d1)>

// Reshape a 16x4xf32 to 64xf32.
// CHECK-LABEL: func @fuse_reshape_16_4_64
func.func @fuse_reshape_16_4_64() {
  %in = memref.alloc() : memref<16x4xf32>
  %out = memref.alloc() : memref<64xf32>

  affine.for %i0 = 0 to 16 {
    affine.for %i1 = 0 to 4 {
      %v = affine.load %in[%i0, %i1] : memref<16x4xf32>
      affine.store %v, %out[4*%i0 + %i1] : memref<64xf32>
    }
  }

  affine.for %i2 = 0 to 64 {
    %w = affine.load %out[%i2] : memref<64xf32>
    "foo"(%w) : (f32) -> ()
  }
  // CHECK: affine.for %{{.*}} = 0 to 64 {
  // CHECK-NEXT: affine.apply [[$MAP0]](%{{.*}})
  // CHECK-NEXT: affine.apply [[$MAP1]](%{{.*}})
  // CHECK-NEXT: affine.load %{{.*}}[%{{.*}}, %{{.*}}] : memref<16x4xf32>
  // CHECK-NEXT: affine.apply [[$MAP2]](%{{.*}}, %{{.*}})
  // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[0] : memref<1xf32>
  // CHECK-NEXT: affine.load %{{.*}}[0] : memref<1xf32>
  // CHECK-NEXT: "foo"(%{{.*}}) : (f32) -> ()
  // CHECK-NEXT: }
  // CHECK-NEXT: return
  return
}

// -----
// All three loop nests below (the 6-d one and the two 2-d ones) are fused
// into a single 2-d loop nest.
func.func @R6_to_R2_reshape_square() -> memref<64x9xi32> {
  %in = memref.alloc() : memref<2x2x3x3x16x1xi32>
  %out = memref.alloc() : memref<64x9xi32>
  %live_out = memref.alloc() : memref<64x9xi32>

  affine.for %i0 = 0 to 2 {
    affine.for %i1 = 0 to 2 {
      affine.for %i2 = 0 to 3 {
        affine.for %i3 = 0 to 3 {
          affine.for %i4 = 0 to 16 {
            affine.for %i5 = 0 to 1 {
              %val = "foo"(%i0, %i1, %i2, %i3, %i4, %i5) : (index, index, index, index, index, index) -> i32
              affine.store %val, %in[%i0, %i1, %i2, %i3, %i4, %i5] : memref<2x2x3x3x16x1xi32>
            }
          }
        }
      }
    }
  }

  affine.for %ii = 0 to 64 {
    affine.for %jj = 0 to 9 {
      // Convert output coordinates to a linear index.
      %a0 = affine.apply affine_map<(d0, d1) -> (d0 * 9 + d1)> (%ii, %jj)
      %0 = affine.apply affine_map<(d0) -> (d0 floordiv (2 * 3 * 3 * 16 * 1))>(%a0)
      %1 = affine.apply affine_map<(d0) -> ((d0 mod 288) floordiv (3 * 3 * 16 * 1))>(%a0)
      %2 = affine.apply affine_map<(d0) -> (((d0 mod 288) mod 144) floordiv (3 * 16 * 1))>(%a0)
      %3 = affine.apply affine_map<(d0) -> ((((d0 mod 288) mod 144) mod 48) floordiv (16 * 1))>(%a0)
      %4 = affine.apply affine_map<(d0) -> ((((d0 mod 288) mod 144) mod 48) mod 16)>(%a0)
      %5 = affine.apply affine_map<(d0) -> (((((d0 mod 288) mod 144) mod 48) mod 16) mod 1)>(%a0)
      %v = affine.load %in[%0, %1, %2, %3, %4, %5] : memref<2x2x3x3x16x1xi32>
      affine.store %v, %out[%ii, %jj] : memref<64x9xi32>
    }
  }

  affine.for %i = 0 to 64 {
    affine.for %j = 0 to 9 {
      %a = affine.load %out[%i, %j] : memref<64x9xi32>
      %b = arith.muli %a, %a : i32
      affine.store %b, %live_out[%i, %j] : memref<64x9xi32>
    }
  }
  return %live_out : memref<64x9xi32>
}
// Everything above is fused to a single 2-d loop nest, and the 6-d tensor %in
// is eliminated if -memref-dataflow-opt is also supplied.

// CHECK-DAG: [[$MAP0:#map[0-9a-zA-Z_]*]] = affine_map<(d0, d1) -> ((d0 * 9 + d1) floordiv 288)>
// CHECK-DAG: [[$MAP1:#map[0-9a-zA-Z_]*]] = affine_map<(d0, d1) -> (((d0 * 9 + d1) mod 288) floordiv 144)>
// CHECK-DAG: [[$MAP2:#map[0-9a-zA-Z_]*]] = affine_map<(d0, d1) -> (((d0 * 9 + d1) mod 144) floordiv 48)>
// CHECK-DAG: [[$MAP3:#map[0-9a-zA-Z_]*]] = affine_map<(d0, d1) -> (((d0 * 9 + d1) mod 48) floordiv 16)>
// CHECK-DAG: [[$MAP4:#map[0-9a-zA-Z_]*]] = affine_map<(d0, d1) -> ((d0 * 9 + d1) mod 16)>
// CHECK-DAG: [[$MAP11:#map[0-9a-zA-Z_]*]] = affine_map<(d0, d1) -> (d0 * 9 + d1)>
// CHECK-DAG: [[$MAP12:#map[0-9a-zA-Z_]*]] = affine_map<(d0) -> (d0 floordiv 288)>
// CHECK-DAG: [[$MAP13:#map[0-9a-zA-Z_]*]] = affine_map<(d0) -> ((d0 mod 288) floordiv 144)>
// CHECK-DAG: [[$MAP14:#map[0-9a-zA-Z_]*]] = affine_map<(d0) -> ((d0 mod 144) floordiv 48)>
// CHECK-DAG: [[$MAP15:#map[0-9a-zA-Z_]*]] = affine_map<(d0) -> ((d0 mod 48) floordiv 16)>
// CHECK-DAG: [[$MAP16:#map[0-9a-zA-Z_]*]] = affine_map<(d0) -> (d0 mod 16)>
// CHECK-DAG: [[$MAP17:#map[0-9a-zA-Z_]*]] = affine_map<(d0) -> (0)>

// CHECK-LABEL: func @R6_to_R2_reshape
// CHECK: memref.alloc() : memref<1x2x3x3x16x1xi32>
// CHECK: memref.alloc() : memref<1x1xi32>
// CHECK: memref.alloc() : memref<64x9xi32>
// CHECK-NEXT: affine.for %{{.*}} = 0 to 64 {
// CHECK-NEXT: affine.for %{{.*}} = 0 to 9 {
// CHECK-NEXT: affine.apply [[$MAP0]](%{{.*}}, %{{.*}})
// CHECK-NEXT: affine.apply [[$MAP1]](%{{.*}}, %{{.*}})
// CHECK-NEXT: affine.apply [[$MAP2]](%{{.*}}, %{{.*}})
// CHECK-NEXT: affine.apply [[$MAP3]](%{{.*}}, %{{.*}})
// CHECK-NEXT: affine.apply [[$MAP4]](%{{.*}}, %{{.*}})
// CHECK-NEXT: "foo"(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (index, index, index, index, index, index) -> i32
// CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[0, ((%{{.*}} * 9 + %{{.*}}) mod 288) floordiv 144, ((%{{.*}} * 9 + %{{.*}}) mod 144) floordiv 48, ((%{{.*}} * 9 + %{{.*}}) mod 48) floordiv 16, (%{{.*}} * 9 + %{{.*}}) mod 16, 0] : memref<1x2x3x3x16x1xi32>
// CHECK-NEXT: affine.apply [[$MAP11]](%{{.*}}, %{{.*}})
// CHECK-NEXT: affine.apply [[$MAP12]](%{{.*}})
// CHECK-NEXT: affine.apply [[$MAP13]](%{{.*}})
// CHECK-NEXT: affine.apply [[$MAP14]](%{{.*}})
// CHECK-NEXT: affine.apply [[$MAP15]](%{{.*}})
// CHECK-NEXT: affine.apply [[$MAP16]](%{{.*}})
// CHECK-NEXT: affine.apply [[$MAP17]](%{{.*}})
// CHECK-NEXT: affine.load %{{.*}}[0, ((%{{.*}} * 9 + %{{.*}}) mod 288) floordiv 144, ((%{{.*}} * 9 + %{{.*}}) mod 144) floordiv 48, ((%{{.*}} * 9 + %{{.*}}) mod 48) floordiv 16, (%{{.*}} * 9 + %{{.*}}) mod 16, 0] : memref<1x2x3x3x16x1xi32>
// CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[0, 0] : memref<1x1xi32>
// CHECK-NEXT: affine.load %{{.*}}[0, 0] : memref<1x1xi32>
// CHECK-NEXT: arith.muli %{{.*}}, %{{.*}} : i32
// CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[%{{.*}}, %{{.*}}] : memref<64x9xi32>
// CHECK-NEXT: }
// CHECK-NEXT: }
// CHECK-NEXT: return %{{.*}} : memref<64x9xi32>

// -----
// CHECK-LABEL: func @fuse_symbolic_bounds
func.func @fuse_symbolic_bounds(%M : index, %N : index) {
  %N_plus_5 = affine.apply affine_map<(d0) -> (d0 + 5)>(%N)
  %m = memref.alloc(%M, %N_plus_5) : memref<? x ? x f32>

  %c0 = arith.constant 0.0 : f32
  %s = arith.constant 5 : index

  affine.for %i0 = 0 to %M {
    affine.for %i1 = 0 to affine_map<(d0) -> (d0 + 5)> (%N) {
      affine.store %c0, %m[%i0, %i1] : memref<? x ? x f32>
    }
  }

  affine.for %i2 = 0 to %M {
    affine.for %i3 = 0 to %N {
      %v = affine.load %m[%i2, %i3 + symbol(%s)] : memref<? x ? x f32>
    }
  }
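  // Tests that producer/consumer fusion works with symbolic (non-constant)
  // loop bounds such as the upper bound map (d0 + 5) applied to %N above.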
  return
}

// -----

// CHECK-LABEL: func @should_fuse_reduction_at_depth_of_one
func.func @should_fuse_reduction_at_depth_of_one() {
  %a = memref.alloc() : memref<10x100xf32>
  %b = memref.alloc() : memref<10xf32>

  affine.for %i0 = 0 to 10 {
    affine.for %i1 = 0 to 100 {
      %v0 = affine.load %b[%i0] : memref<10xf32>
      %v1 = affine.load %a[%i0, %i1] : memref<10x100xf32>
      %v2 = "maxf"(%v0, %v1) : (f32, f32) -> f32
      affine.store %v2, %b[%i0] : memref<10xf32>
    }
  }
  affine.for %i2 = 0 to 10 {
    affine.for %i3 = 0 to 100 {
      %v3 = affine.load %b[%i2] : memref<10xf32>
      %v4 = affine.load %a[%i2, %i3] : memref<10x100xf32>
      %v5 = arith.subf %v4, %v3 : f32
      affine.store %v5, %b[%i2] : memref<10xf32>
    }
  }
  // This test should fuse the src reduction loop at depth 1 in the destination
  // loop nest, which improves locality and enables subsequent passes to
  // decrease the reduction memref size and possibly place it in a faster
  // memory space.
  // CHECK: affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT: affine.for %{{.*}} = 0 to 100 {
  // CHECK-NEXT: affine.load %{{.*}}[0] : memref<1xf32>
  // CHECK-NEXT: affine.load %{{.*}}[%{{.*}}, %{{.*}}] : memref<10x100xf32>
  // CHECK-NEXT: "maxf"(%{{.*}}, %{{.*}}) : (f32, f32) -> f32
  // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[0] : memref<1xf32>
  // CHECK-NEXT: }
  // CHECK-NEXT: affine.for %{{.*}} = 0 to 100 {
  // CHECK-NEXT: affine.load %{{.*}}[0] : memref<1xf32>
  // CHECK-NEXT: affine.load %{{.*}}[%{{.*}}, %{{.*}}] : memref<10x100xf32>
  // CHECK-NEXT: arith.subf %{{.*}}, %{{.*}} : f32
  // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[0] : memref<1xf32>
  // CHECK-NEXT: }
  // CHECK-NEXT: }
  // CHECK-NEXT: return
  return
}

// -----
// CHECK-LABEL: func @should_fuse_at_src_depth1_and_dst_depth1
func.func @should_fuse_at_src_depth1_and_dst_depth1() {
  %a = memref.alloc() : memref<100x16xf32>
  %b = memref.alloc() : memref<100x16xf32>

  affine.for %i0 = 0 to 100 {
    affine.for %i1 = 0 to 16 {
      %v0 = affine.load %a[%i0, %i1] : memref<100x16xf32>
      "op0"(%v0) : (f32) -> ()
    }
    affine.for %i2 = 0 to 16 {
      %v1 = "op1"() : () -> (f32)
      affine.store %v1, %b[%i0, %i2] : memref<100x16xf32>
    }
  }

  affine.for %i3 = 0 to 100 {
    affine.for %i4 = 0 to 16 {
      %v2 = affine.load %b[%i3, %i4] : memref<100x16xf32>
      "op2"(%v2) : (f32) -> ()
    }
  }
  // We can slice iterations of the '%i0' and '%i1' loops in the source
  // loop nest, but slicing at depth 2 and inserting the slice in the
  // destination loop nest at depth 2 causes extra computation. Instead,
  // the fusion algorithm should detect that the source loop should be sliced
  // at depth 1 and the slice should be inserted at depth 1.
  // CHECK: affine.for %{{.*}} = 0 to 100 {
  // CHECK-NEXT: affine.for %{{.*}} = 0 to 16 {
  // CHECK-NEXT: affine.load %{{.*}}[%{{.*}}, %{{.*}}] : memref<100x16xf32>
  // CHECK-NEXT: "op0"(%{{.*}}) : (f32) -> ()
  // CHECK-NEXT: }
  // CHECK-NEXT: affine.for %{{.*}} = 0 to 16 {
  // CHECK-NEXT: %{{.*}} = "op1"() : () -> f32
  // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[0, %{{.*}}] : memref<1x16xf32>
  // CHECK-NEXT: }
  // CHECK-NEXT: affine.for %{{.*}} = 0 to 16 {
  // CHECK-NEXT: affine.load %{{.*}}[0, %{{.*}}] : memref<1x16xf32>
  // CHECK-NEXT: "op2"(%{{.*}}) : (f32) -> ()
  // CHECK-NEXT: }
  // CHECK-NEXT: }
  // CHECK-NEXT: return
  return
}

// -----
// CHECK: [[$MAP0:#map[0-9]*]] = affine_map<(d0, d1) -> (d0 * 10 + d1)>

// CHECK-LABEL: func @should_fuse_src_depth1_at_dst_depth2
func.func @should_fuse_src_depth1_at_dst_depth2() {
  %a = memref.alloc() : memref<100xf32>
  %c0 = arith.constant 0.0 : f32

  affine.for %i0 = 0 to 100 {
    affine.store %c0, %a[%i0] : memref<100xf32>
  }

  affine.for %i1 = 0 to 10 {
    affine.for %i2 = 0 to 10 {
      %a0 = affine.apply affine_map<(d0, d1) -> (d0 * 10 + d1)> (%i1, %i2)
      %v0 = affine.load %a[%a0] : memref<100xf32>
    }
  }
  // The source loop nest slice loop bound is a function of both destination
  // loop IVs, so we should slice at depth 1 and insert the slice at depth 2.
  // CHECK: affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT: affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT: affine.apply [[$MAP0]](%{{.*}}, %{{.*}})
  // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[0] : memref<1xf32>
  // CHECK-NEXT: affine.apply [[$MAP0]](%{{.*}}, %{{.*}})
  // CHECK-NEXT: affine.load %{{.*}}[0] : memref<1xf32>
  // CHECK-NEXT: }
  // CHECK-NEXT: }
  // CHECK-NEXT: return
  return
}

// -----
// CHECK-LABEL: func @fusion_at_depth0_not_currently_supported
func.func @fusion_at_depth0_not_currently_supported() {
  %0 = memref.alloc() : memref<10xf32>
  %c0 = arith.constant 0 : index
  %cst = arith.constant 0.000000e+00 : f32
  affine.for %i0 = 0 to 10 {
    affine.store %cst, %0[%i0] : memref<10xf32>
  }
  affine.for %i1 = 0 to 10 {
    %1 = affine.load %0[%c0] : memref<10xf32>
  }
  // NOTE: Should shrink the memref size to the single element accessed by the
  // load in the dst loop nest, and make the store in the slice store to that
  // same element.
  // CHECK-DAG: memref.alloc() : memref<1xf32>
  // CHECK: affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[0] : memref<1xf32>
  // CHECK-NEXT: affine.load %{{.*}}[0] : memref<1xf32>
  // CHECK-NEXT: }
  // CHECK-NEXT: return
  return
}

// -----
// CHECK-LABEL: func @should_fuse_deep_loop_nests
func.func @should_fuse_deep_loop_nests() {
  %0 = memref.alloc() : memref<2x2x3x3x16x10xf32, 2>
  %1 = memref.alloc() : memref<2x2x3x3x16x10xf32, 2>
  %2 = memref.alloc() : memref<3x3x3x3x16x10xf32, 2>
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %c1_0 = arith.constant 1 : index
  %cst = arith.constant 0.000000e+00 : f32
  affine.for %i0 = 0 to 2 {
    affine.for %i1 = 0 to 2 {
      affine.for %i2 = 0 to 3 {
        affine.for %i3 = 0 to 3 {
          affine.for %i4 = 0 to 16 {
            affine.for %i5 = 0 to 10 {
              %3 = affine.load %0[%i0, %i1, %i2, %i3, %i4, %i5]
                : memref<2x2x3x3x16x10xf32, 2>
            }
          }
          affine.for %i6 = 0 to 16 {
            affine.for %i7 = 0 to 10 {
              affine.store %cst, %1[%i0, %i1, %i2, %i3, %i6, %i7]
                : memref<2x2x3x3x16x10xf32, 2>
            }
          }
        }
      }
    }
  }
  affine.for %i8 = 0 to 3 {
    affine.for %i9 = 0 to 3 {
      affine.for %i10 = 0 to 2 {
        affine.for %i11 = 0 to 2 {
          affine.for %i12 = 0 to 3 {
            affine.for %i13 = 0 to 3 {
              affine.for %i14 = 0 to 2 {
                affine.for %i15 = 0 to 2 {
                  affine.for %i16 = 0 to 16 {
                    affine.for %i17 = 0 to 10 {
                      %5 = affine.load %0[%i14, %i15, %i12, %i13, %i16, %i17]
                        : memref<2x2x3x3x16x10xf32, 2>
                    }
                  }
                  affine.for %i18 = 0 to 16 {
                    affine.for %i19 = 0 to 10 {
                      %6 = affine.load %1[%i10, %i11, %i8, %i9, %i18, %i19]
                        : memref<2x2x3x3x16x10xf32, 2>
                    }
                  }
                }
              }
            }
          }
        }
      }
    }
  }
  // The first four loops of the source loop nest can be sliced with iteration
  // bounds which are a function of the first four loops of the destination
  // loop nest, where the destination loop nests have been interchanged.

  // CHECK-DAG: memref.alloc() : memref<1x1x1x1x16x10xf32, 2>
  // CHECK: affine.for %{{.*}} = 0 to 3 {
  // CHECK-NEXT: affine.for %{{.*}} = 0 to 3 {
  // CHECK-NEXT: affine.for %{{.*}} = 0 to 2 {
  // CHECK-NEXT: affine.for %{{.*}} = 0 to 2 {
  // CHECK-NEXT: affine.for %{{.*}} = 0 to 3 {
  // CHECK-NEXT: affine.for %{{.*}} = 0 to 3 {
  // CHECK-NEXT: affine.for %{{.*}} = 0 to 16 {
  // CHECK-NEXT: affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT: affine.load %{{.*}}[%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}] : memref<2x2x3x3x16x10xf32, 2>
  // CHECK-NEXT: }
  // CHECK-NEXT: }
  // CHECK-NEXT: affine.for %{{.*}} = 0 to 16 {
  // CHECK-NEXT: affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[0, 0, 0, 0, %{{.*}}, %{{.*}}] : memref<1x1x1x1x16x10xf32, 2>
  // CHECK-NEXT: }
  // CHECK-NEXT: }
  // CHECK-NEXT: affine.for %{{.*}} = 0 to 2 {
  // CHECK-NEXT: affine.for %{{.*}} = 0 to 2 {
  // CHECK-NEXT: affine.for %{{.*}} = 0 to 16 {
  // CHECK-NEXT: affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT: affine.load %{{.*}}[%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}] : memref<2x2x3x3x16x10xf32, 2>
  // CHECK-NEXT: }
  // CHECK-NEXT: }
  // CHECK-NEXT: affine.for %{{.*}} = 0 to 16 {
  // CHECK-NEXT: affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT: affine.load %{{.*}}[0, 0, 0, 0, %{{.*}}, %{{.*}}] : memref<1x1x1x1x16x10xf32, 2>
  // CHECK-NEXT: }
  // CHECK-NEXT: }
  // CHECK-NEXT: }
  // CHECK-NEXT: }
  // CHECK-NEXT: }
  // CHECK-NEXT: }
  // CHECK-NEXT: }
  // CHECK-NEXT: }
  // CHECK-NEXT: }
  // CHECK-NEXT: }
  // CHECK-NEXT: return
  return
}

// -----
// CHECK-LABEL: func @should_fuse_at_depth1_and_reduce_slice_trip_count
func.func @should_fuse_at_depth1_and_reduce_slice_trip_count() {
  %a = memref.alloc() : memref<4x256xf32>
  %b = memref.alloc() : memref<4x256xf32>

  %c0 = arith.constant 0 : index
  %cf0 = arith.constant 0.0 : f32

  affine.for %i0 = 0 to 4 {
    affine.for %i1 = 0 to 256 {
      %v0 = affine.load %b[%i0, %i1] : memref<4x256xf32>
    }
    affine.for %i2 = 0 to 256 {
      affine.store %cf0, %a[%i0, %i2] : memref<4x256xf32>
    }
  }

  affine.for %d0 = 0 to 4 {
    affine.for %d1 = 0 to 16 {
      %v1 = affine.load %a[%d0, %d1] : memref<4x256xf32>
    }
  }
  // The cost of fusing at depth 2 is greater than the cost of fusing at
  // depth 1 for two reasons:
  // 1) Inserting the unsliceable src loop %i1 at a higher depth removes
  //    redundant computation and reduces costs.
  // 2) Inserting the sliceable src loop %i2 at depth 1, we can still reduce
  //    its trip count to 16 (from 256), reducing costs.
  // NOTE: the size of the private memref created for the fused loop nest
  // is reduced from the original shape 4x256 to 4x16 because of the
  // data accessed by the load.
  // CHECK-DAG: memref.alloc() : memref<1x16xf32>
  // CHECK: affine.for %{{.*}} = 0 to 4 {
  // CHECK-NEXT: affine.for %{{.*}} = 0 to 256 {
  // CHECK-NEXT: affine.load %{{.*}}[%{{.*}}, %{{.*}}] : memref<4x256xf32>
  // CHECK-NEXT: }
  // CHECK-NEXT: affine.for %{{.*}} = 0 to 16 {
  // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[0, %{{.*}}] : memref<1x16xf32>
  // CHECK-NEXT: }
  // CHECK-NEXT: affine.for %{{.*}} = 0 to 16 {
  // CHECK-NEXT: affine.load %{{.*}}[0, %{{.*}}] : memref<1x16xf32>
  // CHECK-NEXT: }
  // CHECK-NEXT: }
  // CHECK-NEXT: return
  return
}

// -----
// CHECK-LABEL: func @should_fuse_at_depth1_with_trip_count_20
func.func @should_fuse_at_depth1_with_trip_count_20() {
  %a = memref.alloc() : memref<100xf32>
  %c0 = arith.constant 0 : index
  %cf0 = arith.constant 0.0 : f32

  affine.for %i0 = 0 to 100 {
    affine.store %cf0, %a[%i0] : memref<100xf32>
  }

  affine.for %i1 = 0 to 5 {
    affine.for %i2 = 0 to 10 {
      %v0 = affine.load %a[%i2] : memref<100xf32>
    }
    affine.for %i3 = 0 to 10 {
      affine.for %i4 = 0 to 20 {
        %v1 = affine.load %a[%i4] : memref<100xf32>
      }
    }
  }
  // NOTE: The size of the private memref created for fusion is shrunk to 20xf32.
  // CHECK-DAG: memref.alloc() : memref<20xf32>
  // CHECK: affine.for %{{.*}} = 0 to 5 {
  // CHECK-NEXT: affine.for %{{.*}} = 0 to 20 {
  // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<20xf32>
  // CHECK-NEXT: }
  // CHECK-NEXT: affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT: affine.load %{{.*}}[%{{.*}}] : memref<20xf32>
  // CHECK-NEXT: }
  // CHECK-NEXT: affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT: affine.for %{{.*}} = 0 to 20 {
  // CHECK-NEXT: affine.load %{{.*}}[%{{.*}}] : memref<20xf32>
  // CHECK-NEXT: }
  // CHECK-NEXT: }
  // CHECK-NEXT: }
  // CHECK-NEXT: return
  return
}

// -----
// CHECK-LABEL: func @should_fuse_at_depth1_with_trip_count_19
func.func @should_fuse_at_depth1_with_trip_count_19() {
  %a = memref.alloc() : memref<100xf32>
  %c0 = arith.constant 0 : index
  %cf0 = arith.constant 0.0 : f32

  affine.for %i0 = 0 to 100 {
    affine.store %cf0, %a[%i0] : memref<100xf32>
  }

  affine.for %i1 = 0 to 5 {
    affine.for %i2 = 0 to 19 {
      %v0 = affine.load %a[%i2] : memref<100xf32>
    }
    affine.for %i3 = 0 to 10 {
      affine.for %i4 = 0 to 10 {
        %v1 = affine.load %a[%i4] : memref<100xf32>
      }
    }
  }
  // NOTE: The size of the private memref created for fusion is shrunk to 19xf32.
  // CHECK-DAG: memref.alloc() : memref<19xf32>
  // CHECK: affine.for %{{.*}} = 0 to 5 {
  // CHECK-NEXT: affine.for %{{.*}} = 0 to 19 {
  // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<19xf32>
  // CHECK-NEXT: }
  // CHECK-NEXT: affine.for %{{.*}} = 0 to 19 {
  // CHECK-NEXT: affine.load %{{.*}}[%{{.*}}] : memref<19xf32>
  // CHECK-NEXT: }
  // CHECK-NEXT: affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT: affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT: affine.load %{{.*}}[%{{.*}}] : memref<19xf32>
  // CHECK-NEXT: }
  // CHECK-NEXT: }
  // CHECK-NEXT: }
  // CHECK-NEXT: return
  return
}

// -----
// CHECK-LABEL: func @should_fuse_with_private_memref() {
func.func @should_fuse_with_private_memref() {
  %m = memref.alloc() : memref<100xf32>
  %cf7 = arith.constant 7.0 : f32

  affine.for %i0 = 0 to 100 {
    affine.store %cf7, %m[%i0] : memref<100xf32>
  }
  affine.for %i1 = 0 to 17 {
    %v0 = affine.load %m[%i1] : memref<100xf32>
  }
  affine.for %i2 = 0 to 82 {
    %v1 = affine.load %m[%i2] : memref<100xf32>
  }
  // Should create a new private memref.
  // CHECK-DAG: memref.alloc() : memref<1xf32>
  // CHECK: affine.for %{{.*}} = 0 to 17 {
  // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[0] : memref<1xf32>
  // CHECK-NEXT: affine.load %{{.*}}[0] : memref<1xf32>
  // CHECK-NEXT: affine.load %{{.*}}[0] : memref<1xf32>
  // CHECK-NEXT: }
  // CHECK-NEXT: return
  return
}

// -----
// CHECK-LABEL: func @should_fuse_live_out_arg_but_preserve_src_loop(%{{.*}}: memref<10xf32>) {
func.func @should_fuse_live_out_arg_but_preserve_src_loop(%arg0: memref<10xf32>) {
  %cf7 = arith.constant 7.0 : f32

  affine.for %i0 = 0 to 10 {
    affine.store %cf7, %arg0[%i0] : memref<10xf32>
  }
  affine.for %i1 = 0 to 9 {
    %v0 = affine.load %arg0[%i1] : memref<10xf32>
  }
  // This tests that the loop nest '%i0' should not be removed after fusion
  // because it writes to memref argument '%arg0', and its read region
  // does not cover its write region (fusion would shrink the write region
  // in the fused loop nest, so the complete live-out data region would not
  // be written).
  // CHECK: affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT: affine.store %{{.*}} : memref<10xf32>
  // CHECK-NEXT: }
  // CHECK-NEXT: affine.for %{{.*}} = 0 to 9 {
  // CHECK-NEXT: affine.store %{{.*}} : memref<1xf32>
  // CHECK-NEXT: affine.load %{{.*}} : memref<1xf32>
  // CHECK-NEXT: }
  // CHECK-NEXT: return
  return
}

// -----
// CHECK-LABEL: func @should_fuse_live_out_arg(%{{.*}}: memref<10xf32>) {
func.func @should_fuse_live_out_arg(%arg0: memref<10xf32>) {
  %cf7 = arith.constant 7.0 : f32

  affine.for %i0 = 0 to 10 {
    affine.store %cf7, %arg0[%i0] : memref<10xf32>
  }
  affine.for %i1 = 0 to 10 {
    %v0 = affine.load %arg0[%i1] : memref<10xf32>
  }
  // The read/write regions for memref '%arg0' are the same for both
  // loops, so they should fuse.

  // CHECK: affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT: affine.load %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT: }
  // CHECK-NEXT: return
  return
}

// -----
// CHECK-LABEL: func @should_fuse_escaping_memref_but_preserve_src_loop() -> memref<10xf32>
func.func @should_fuse_escaping_memref_but_preserve_src_loop() -> memref<10xf32> {
  %cf7 = arith.constant 7.0 : f32
  %m = memref.alloc() : memref<10xf32>
  affine.for %i0 = 0 to 10 {
    affine.store %cf7, %m[%i0] : memref<10xf32>
  }
  affine.for %i1 = 0 to 9 {
    %v0 = affine.load %m[%i1] : memref<10xf32>
  }
  // This tests that the loop nest '%i0' should not be removed after fusion
  // because it writes to memref '%m', which is returned by the function, and
  // the '%i1' memory region does not cover the '%i0' memory region.

  // CHECK-DAG: memref.alloc() : memref<1xf32>
  // CHECK: affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT: affine.store %{{.*}} : memref<10xf32>
  // CHECK-NEXT: }
  // CHECK-NEXT: affine.for %{{.*}} = 0 to 9 {
  // CHECK-NEXT: affine.store %{{.*}} : memref<1xf32>
  // CHECK-NEXT: affine.load %{{.*}} : memref<1xf32>
  // CHECK-NEXT: }
  // CHECK-NEXT: return %{{.*}} : memref<10xf32>
  return %m : memref<10xf32>
}

// -----
// This should fuse with '%in' becoming a 1x1x1xi32.
func.func @R3_to_R2_reshape() {
  %in = memref.alloc() : memref<2x3x16xi32>

  %c0 = arith.constant 0 : index

  affine.for %i0 = 0 to 2 {
    affine.for %i1 = 0 to 3 {
      affine.for %i2 = 0 to 16 {
        %val = "foo"(%i0, %i1, %i2) : (index, index, index) -> i32
        affine.store %val, %in[%i0, %i1, %i2] : memref<2x3x16xi32>
      }
    }
  }

  affine.for %ii = 0 to 32 {
    affine.for %jj = 0 to 3 {
      %a0 = affine.apply affine_map<(d0, d1) -> (d0 * 3 + d1)> (%ii, %jj)
      %idx = affine.apply affine_map<(d0) -> (d0 floordiv (3 * 16))> (%a0)
      %v = affine.load %in[%idx, %jj, %c0]
        : memref<2x3x16xi32>
    }
  }
  return
}
// CHECK-DAG: [[$MAP0:#map[0-9a-zA-Z_]*]] = affine_map<(d0, d1) -> ((d0 * 3 + d1) floordiv 48)>
// CHECK-DAG: [[$MAP1:#map[0-9a-zA-Z_]*]] = affine_map<(d0, d1) -> (d0 * 3 + d1)>
// CHECK-DAG: [[$MAP2:#map[0-9a-zA-Z_]*]] = affine_map<(d0) -> (d0 floordiv 48)>

// CHECK-LABEL: func @R3_to_R2_reshape()
// CHECK-DAG: memref.alloc() : memref<1x1x1xi32>
// CHECK: affine.for %{{.*}} = 0 to 32 {
// CHECK-NEXT: affine.for %{{.*}} = 0 to 3 {
// CHECK-NEXT: affine.apply [[$MAP0]](%{{.*}}, %{{.*}})
// CHECK-NEXT: "foo"(%{{.*}}, %{{.*}}, %{{.*}}) : (index, index, index) -> i32
// CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[0, 0, 0] : memref<1x1x1xi32>
// CHECK-NEXT: affine.apply [[$MAP1]](%{{.*}}, %{{.*}})
// CHECK-NEXT: affine.apply [[$MAP2]](%{{.*}})
// CHECK-NEXT: affine.load %{{.*}}[0, 0, 0] : memref<1x1x1xi32>
// CHECK-NEXT: }
// CHECK-NEXT: }
// CHECK-NEXT: return

// -----
// CHECK-LABEL: func @should_fuse_multi_output_producer() {
func.func @should_fuse_multi_output_producer() {
  %a = memref.alloc() : memref<10xf32>
  %b = memref.alloc() : memref<10xf32>

  %cf7 = arith.constant 7.0 : f32

  affine.for %i0 = 0 to 10 {
    affine.store %cf7, %a[%i0] : memref<10xf32>
    affine.store %cf7, %b[%i0] : memref<10xf32>
  }
  affine.for %i1 = 0 to 10 {
    %v0 = affine.load %a[%i1] : memref<10xf32>
    %v1 = affine.load %b[%i1] : memref<10xf32>
  }
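  // Both producer memrefs '%a' and '%b' are consumed by '%i1'; fusion
  // privatizes each of them to a single element.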
  // CHECK: affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[0] : memref<1xf32>
  // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[0] : memref<1xf32>
  // CHECK-NEXT: affine.load %{{.*}}[0] : memref<1xf32>
  // CHECK-NEXT: affine.load %{{.*}}[0] : memref<1xf32>
  // CHECK-NEXT: }
  // CHECK-NEXT: return
  return
}

// -----
// CHECK-LABEL: func @fusion_preventing_deps_on_middle_loop() {
func.func @fusion_preventing_deps_on_middle_loop() {
  %a = memref.alloc() : memref<10xf32>
  %b = memref.alloc() : memref<10xf32>
  %c = memref.alloc() : memref<10xf32>

  %cf7 = arith.constant 7.0 : f32

  affine.for %i0 = 0 to 10 {
    %v0 = affine.load %a[%i0] : memref<10xf32>
    affine.store %v0, %b[%i0] : memref<10xf32>
  }
  affine.for %i1 = 0 to 10 {
    affine.store %cf7, %a[%i1] : memref<10xf32>
    %v1 = affine.load %c[%i1] : memref<10xf32>
  }
  affine.for %i2 = 0 to 10 {
    %v2 = affine.load %b[%i2] : memref<10xf32>
    affine.store %v2, %c[%i2] : memref<10xf32>
  }
  // Loops '%i0' and '%i2' cannot fuse along producer/consumer edge on memref
  // '%b', because of the WAR dep from '%i0' to '%i1' on memref '%a' and
  // because of the WAR dep from '%i1' to '%i2' on memref '%c'.
  // CHECK: affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT: affine.load %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT: }
  // CHECK-NEXT: affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT: affine.load %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT: }
  // CHECK-NEXT: affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT: affine.load %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT: }
  // CHECK-NEXT: return
  return
}

// -----
// CHECK-LABEL: func @should_fuse_and_move_to_preserve_war_dep() {
func.func @should_fuse_and_move_to_preserve_war_dep() {
  %a = memref.alloc() : memref<10xf32>
  %b = memref.alloc() : memref<10xf32>
  %c = memref.alloc() : memref<10xf32>

  %cf7 = arith.constant 7.0 : f32

  affine.for %i0 = 0 to 10 {
    %v0 = affine.load %b[%i0] : memref<10xf32>
    affine.store %v0, %a[%i0] : memref<10xf32>
  }
  affine.for %i1 = 0 to 3 {
    %v2 = affine.load %c[%i1] : memref<10xf32>
  }
  affine.for %i2 = 0 to 5 {
    affine.store %cf7, %b[%i2] : memref<10xf32>
  }
  affine.for %i3 = 0 to 10 {
    %v1 = affine.load %a[%i3] : memref<10xf32>
    affine.store %cf7, %c[%i3] : memref<10xf32>
  }

  // Dependence graph:
  //   %i0 -> %i2 (WAR on '%b')
  //   %i0 -> %i3 (RAW on '%a')
  //   %i1 -> %i3 (WAR on '%c')
  //
  // It is possible to fuse loop '%i0' into '%i3' and preserve dependences
  // if the fused loop nest is inserted between loops '%i1' and '%i2'.

  // CHECK-DAG: memref.alloc() : memref<1xf32>
  // CHECK: affine.for %{{.*}} = 0 to 3 {
  // CHECK-NEXT: affine.load %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT: }
  // CHECK-NEXT: affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT: affine.load %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[0] : memref<1xf32>
  // CHECK-NEXT: affine.load %{{.*}}[0] : memref<1xf32>
  // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT: }
  // CHECK-NEXT: affine.for %{{.*}} = 0 to 5 {
  // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT: }
  // CHECK-NEXT: return
  return
}

// -----
// CHECK-LABEL: func @fusion_preventing_dep_on_constant() {
func.func @fusion_preventing_dep_on_constant() {
  %a = memref.alloc() : memref<10xf32>
  %b = memref.alloc() : memref<10xf32>
  %c = memref.alloc() : memref<10xf32>

  %cf7 = arith.constant 7.0 : f32

  affine.for %i0 = 0 to 10 {
    %v0 = affine.load %b[%i0] : memref<10xf32>
    affine.store %cf7, %a[%i0] : memref<10xf32>
  }
  affine.for %i1 = 0 to 10 {
    affine.store %cf7, %b[%i1] : memref<10xf32>
  }
  %cf11 = arith.constant 11.0 : f32
  affine.for %i2 = 0 to 10 {
    %v2 = affine.load %a[%i2] : memref<10xf32>
    affine.store %cf11, %c[%i2] : memref<10xf32>
  }
  // Loops '%i0' and '%i2' cannot fuse along producer/consumer edge on memref
  // '%a', because of the WAR dep from '%i0' to '%i1' on memref '%b' and
  // because of the SSA value dep from '%cf11' def to use in '%i2'.
  // CHECK: affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT: affine.load %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT: }
  // CHECK-NEXT: affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT: }
  // CHECK-NEXT: %{{.*}} = arith.constant 1.100000e+01 : f32
  // CHECK-NEXT: affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT: affine.load %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT: }
  // CHECK-NEXT: return
  return
}

// -----
// CHECK-LABEL: func @should_fuse_and_preserve_dep_on_constant() {
func.func @should_fuse_and_preserve_dep_on_constant() {
  %a = memref.alloc() : memref<10xf32>
  %b = memref.alloc() : memref<10xf32>
  %c = memref.alloc() : memref<10xf32>

  %cf7 = arith.constant 7.0 : f32
  %cf11 = arith.constant 11.0 : f32
  affine.for %i0 = 0 to 10 {
    %v0 = affine.load %b[%i0] : memref<10xf32>
    affine.store %cf7, %a[%i0] : memref<10xf32>
  }
  affine.for %i1 = 0 to 10 {
    affine.store %cf7, %b[%i1] : memref<10xf32>
  }
  affine.for %i2 = 0 to 10 {
    %v2 = affine.load %a[%i2] : memref<10xf32>
    affine.store %cf11, %c[%i2] : memref<10xf32>
  }

  // Loops '%i0' and '%i2' can fuse along producer/consumer edge on memref
  // '%a', and preserve the WAR dep from '%i0' to '%i1' on memref '%b', and
  // the SSA value dep from '%cf11' def to use in '%i2'.

  // CHECK: arith.constant 1.100000e+01 : f32
  // CHECK-NEXT: affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT: affine.load %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[0] : memref<1xf32>
  // CHECK-NEXT: affine.load %{{.*}}[0] : memref<1xf32>
  // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT: }
  // CHECK-NEXT: affine.for %{{.*}} = 0 to 10 {
  // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32>
  // CHECK-NEXT: }
  // CHECK-NEXT: return
  return
}

// -----
// CHECK-LABEL: @producer_consumer_with_outmost_user
func.func @producer_consumer_with_outmost_user(%arg0 : f16) {
  %c0 = arith.constant 0 : index
  %src = memref.alloc() : memref<f16, 1>
  %dst = memref.alloc() : memref<f16>
  %tag = memref.alloc() : memref<1xi32>
  affine.for %arg1 = 4 to 6 {
    affine.for %arg2 = 0 to 1 {
      %0 = arith.addf %arg0, %arg0 : f16
      affine.store %0, %src[] : memref<f16, 1>
    }
    affine.for %arg3 = 0 to 1 {
      %0 = affine.load %src[] : memref<f16, 1>
    }
  }
  affine.dma_start %src[], %dst[], %tag[%c0], %c0 : memref<f16, 1>, memref<f16>, memref<1xi32>
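  // The store and load of '%src' fuse under '%arg1', while the top-level
  // dma_start use keeps '%src' from being replaced by a private memref.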
  // CHECK: %[[CST_INDEX:.*]] = arith.constant 0 : index
  // CHECK: %[[DMA_SRC:.*]] = memref.alloc() : memref<f16, 1>
  // CHECK: %[[DMA_DST:.*]] = memref.alloc() : memref<f16>
  // CHECK: %[[DMA_TAG:.*]] = memref.alloc() : memref<1xi32>
  // CHECK: affine.for %arg1 = 4 to 6
  // CHECK-NEXT: affine.for %arg2 = 0 to 1
  // CHECK-NEXT: %[[RESULT_ADD:.*]] = arith.addf %arg0, %arg0 : f16
  // CHECK-NEXT: affine.store %[[RESULT_ADD]], %[[DMA_SRC]][] : memref<f16, 1>
  // CHECK-NEXT: affine.load %[[DMA_SRC]][] : memref<f16, 1>
  // CHECK: affine.dma_start %[[DMA_SRC]][], %[[DMA_DST]][], %[[DMA_TAG]][%[[CST_INDEX]]], %[[CST_INDEX]] : memref<f16, 1>, memref<f16>, memref<1xi32>
  // CHECK-NEXT: return
  return
}

// Add further tests in mlir/test/Transforms/loop-fusion-4.mlir.