1 // RUN: mlir-opt -allow-unregistered-dialect %s -pass-pipeline='builtin.module(func.func(affine-loop-fusion{mode=producer}))' -split-input-file | FileCheck %s --check-prefix=PRODUCER-CONSUMER
2 // RUN: mlir-opt -allow-unregistered-dialect %s -pass-pipeline='builtin.module(func.func(affine-loop-fusion{fusion-maximal mode=sibling}))' -split-input-file | FileCheck %s --check-prefix=SIBLING-MAXIMAL
4 // Part I of fusion tests in mlir/test/Transforms/loop-fusion.mlir.
5 // Part II of fusion tests in mlir/test/Transforms/loop-fusion-2.mlir.
6 // Part III of fusion tests in mlir/test/Transforms/loop-fusion-3.mlir.
8 // Expects fusion of producer into consumer at depth 4 and subsequent removal of
10 // PRODUCER-CONSUMER-LABEL: func @unflatten4d
// Test input: a 4-deep producer nest fills the flat buffer %m, and a 4-deep
// consumer nest with identical bounds reads %m back and writes each value to
// the 4-d memref %arg1.
11 func.func @unflatten4d(%arg1: memref<7x8x9x10xf32>) {
12 %m = memref.alloc() : memref<5040xf32>
13 %cf7 = arith.constant 7.0 : f32
// Producer nest: stores the constant %cf7 into %m. The index
// 720*i0 + 90*i1 + 10*i2 + i3 is the row-major linearization of
// (i0, i1, i2, i3) over a 7x8x9x10 space (720 = 8*9*10, 90 = 9*10).
15 affine.for %i0 = 0 to 7 {
16 affine.for %i1 = 0 to 8 {
17 affine.for %i2 = 0 to 9 {
18 affine.for %i3 = 0 to 10 {
19 affine.store %cf7, %m[720 * %i0 + 90 * %i1 + 10 * %i2 + %i3] : memref<5040xf32>
// Consumer nest: loads %m with the same linearized index as the producer and
// stores the loaded value to %arg1 at the un-flattened position.
24 affine.for %i0 = 0 to 7 {
25 affine.for %i1 = 0 to 8 {
26 affine.for %i2 = 0 to 9 {
27 affine.for %i3 = 0 to 10 {
28 %v0 = affine.load %m[720 * %i0 + 90 * %i1 + 10 * %i2 + %i3] : memref<5040xf32>
29 affine.store %v0, %arg1[%i0, %i1, %i2, %i3] : memref<7x8x9x10xf32>
37 // PRODUCER-CONSUMER: affine.for
38 // PRODUCER-CONSUMER-NEXT: affine.for
39 // PRODUCER-CONSUMER-NEXT: affine.for
40 // PRODUCER-CONSUMER-NEXT: affine.for
41 // PRODUCER-CONSUMER-NOT: affine.for
42 // PRODUCER-CONSUMER: return
46 // Expects fusion of producer into consumer at depth 2 and subsequent removal of
48 // PRODUCER-CONSUMER-LABEL: func @unflatten2d_with_transpose
// Test input: a 2-deep producer nest fills the flat buffer %m, and a 2-deep
// consumer nest with swapped loop bounds reads %m back and writes each value
// to the 8x7 memref %arg1.
49 func.func @unflatten2d_with_transpose(%arg1: memref<8x7xf32>) {
50 %m = memref.alloc() : memref<56xf32>
51 %cf7 = arith.constant 7.0 : f32
// Producer nest (7 x 8 iterations): the outer induction variable carries the
// stride of 8 in the index into %m.
53 affine.for %i0 = 0 to 7 {
54 affine.for %i1 = 0 to 8 {
55 affine.store %cf7, %m[8 * %i0 + %i1] : memref<56xf32>
// Consumer nest (8 x 7 iterations): here the inner induction variable carries
// the stride of 8, so the loop order is transposed relative to the producer.
58 affine.for %i0 = 0 to 8 {
59 affine.for %i1 = 0 to 7 {
60 %v0 = affine.load %m[%i0 + 8 * %i1] : memref<56xf32>
61 affine.store %v0, %arg1[%i0, %i1] : memref<8x7xf32>
67 // PRODUCER-CONSUMER: affine.for
68 // PRODUCER-CONSUMER-NEXT: affine.for
69 // PRODUCER-CONSUMER-NOT: affine.for
70 // PRODUCER-CONSUMER: return
74 // Expects fusion of producer into consumer at depth 1 and source loop to not
75 // be removed due to difference in loop steps.
76 // PRODUCER-CONSUMER-LABEL: func @check_src_dst_step
// Test input: the producer loop copies %src into %m with the default step,
// while the consumer loop copies %m into %out with step 2. All three memrefs
// are function arguments.
77 func.func @check_src_dst_step(%m : memref<100xf32>,
78 %src: memref<100xf32>,
79 %out: memref<100xf32>) {
// Producer loop: visits every index in [0, 100).
80 affine.for %i0 = 0 to 100 {
81 %r1 = affine.load %src[%i0]: memref<100xf32>
82 affine.store %r1, %m[%i0] : memref<100xf32>
// Consumer loop: visits only every second index, so it reads a subset of the
// elements of %m written by the producer.
84 affine.for %i2 = 0 to 100 step 2 {
85 %r2 = affine.load %m[%i2] : memref<100xf32>
86 affine.store %r2, %out[%i2] : memref<100xf32>
91 // Check that fusion took place and that the source loop was not removed.
92 // Fusion is confirmed by checking that the load from the original source
93 // loop also appears in the fused loop.
95 // PRODUCER-CONSUMER: affine.for %[[idx_0:.*]] = 0 to 100 {
96 // PRODUCER-CONSUMER-NEXT: %[[result_0:.*]] = affine.load %[[arr1:.*]][%[[idx_0]]] : memref<100xf32>
97 // PRODUCER-CONSUMER-NEXT: affine.store %[[result_0]], %{{.*}}[%[[idx_0]]] : memref<100xf32>
98 // PRODUCER-CONSUMER-NEXT: }
99 // PRODUCER-CONSUMER: affine.for %[[idx_1:.*]] = 0 to 100 step 2 {
100 // PRODUCER-CONSUMER: affine.load %[[arr1]][%[[idx_1]]] : memref<100xf32>
101 // PRODUCER-CONSUMER: }
102 // PRODUCER-CONSUMER: return
106 // SIBLING-MAXIMAL-LABEL: func @reduce_add_non_maximal_f32_f32(
// Test input: two sibling loop nests that both read %arg0. Each nest runs a
// reduction over %arg5 carried through iter_args, combines the result with
// itself, and stores it to a different output memref (%arg1 vs. %arg2).
107 func.func @reduce_add_non_maximal_f32_f32(%arg0: memref<64x64xf32, 1>, %arg1 : memref<1x64xf32, 1>, %arg2 : memref<1x64xf32, 1>) {
108 %cst_0 = arith.constant 0.000000e+00 : f32
109 %cst_1 = arith.constant 1.000000e+00 : f32
// First nest: additive reduction over 64 iterations, seeded with %cst_0.
110 affine.for %arg3 = 0 to 1 {
111 affine.for %arg4 = 0 to 64 {
112 %accum = affine.for %arg5 = 0 to 64 iter_args (%prevAccum = %cst_0) -> f32 {
113 %4 = affine.load %arg0[%arg5, %arg4] : memref<64x64xf32, 1>
114 %5 = arith.addf %prevAccum, %4 : f32
115 affine.yield %5 : f32
117 %accum_dbl = arith.addf %accum, %accum : f32
118 affine.store %accum_dbl, %arg1[%arg3, %arg4] : memref<1x64xf32, 1>
// Second nest: multiplicative reduction, seeded with %cst_1.
121 affine.for %arg3 = 0 to 1 {
122 affine.for %arg4 = 0 to 64 {
123 // The following loop's trip count (32) does not match the trip count (64) of the corresponding loop in the first nest.
124 %accum = affine.for %arg5 = 0 to 32 iter_args (%prevAccum = %cst_1) -> f32 {
125 %4 = affine.load %arg0[%arg5, %arg4] : memref<64x64xf32, 1>
126 %5 = arith.mulf %prevAccum, %4 : f32
127 affine.yield %5 : f32
129 %accum_sqr = arith.mulf %accum, %accum : f32
130 affine.store %accum_sqr, %arg2[%arg3, %arg4] : memref<1x64xf32, 1>
135 // Test checks the loop structure is preserved after sibling fusion
136 // since the destination loop and source loop trip counts do not
138 // SIBLING-MAXIMAL: %[[cst_0:.*]] = arith.constant 0.000000e+00 : f32
139 // SIBLING-MAXIMAL-NEXT: %[[cst_1:.*]] = arith.constant 1.000000e+00 : f32
140 // SIBLING-MAXIMAL-NEXT: affine.for %[[idx_0:.*]] = 0 to 1 {
141 // SIBLING-MAXIMAL-NEXT: affine.for %[[idx_1:.*]] = 0 to 64 {
142 // SIBLING-MAXIMAL-NEXT: %[[result_1:.*]] = affine.for %[[idx_2:.*]] = 0 to 32 iter_args(%[[iter_0:.*]] = %[[cst_1]]) -> (f32) {
143 // SIBLING-MAXIMAL-NEXT: %[[result_0:.*]] = affine.for %[[idx_3:.*]] = 0 to 64 iter_args(%[[iter_1:.*]] = %[[cst_0]]) -> (f32) {
147 // SIBLING-MAXIMAL-LABEL: func @sibling_load_only
// Test input: two sibling loops with identical bounds that each load from
// %arg0 at the loop index. Neither loop stores to memory, and the loaded
// values are not used.
148 func.func @sibling_load_only(%arg0: memref<10xf32>) {
149 affine.for %arg1 = 0 to 10 {
150 %0 = affine.load %arg0[%arg1] : memref<10xf32>
152 affine.for %arg1 = 0 to 10 {
153 %0 = affine.load %arg0[%arg1] : memref<10xf32>
155 // SIBLING-MAXIMAL-NEXT: affine.for
156 // SIBLING-MAXIMAL-NEXT: affine.load
157 // SIBLING-MAXIMAL-NEXT: affine.load
163 // PRODUCER-CONSUMER-LABEL: func @fusion_for_multiple_blocks() {
// Test input: two producer/consumer loop pairs operating on the same buffer
// %m. In each pair the first loop stores %cf7 into %m and the second loop,
// with the same bounds, loads it back. The checks after each pair expect a
// single loop containing a store and a load on a memref<1xf32>.
164 func.func @fusion_for_multiple_blocks() {
166 %m = memref.alloc() : memref<10xf32>
167 %cf7 = arith.constant 7.0 : f32
// First producer/consumer pair.
169 affine.for %i0 = 0 to 10 {
170 affine.store %cf7, %m[%i0] : memref<10xf32>
172 affine.for %i1 = 0 to 10 {
173 %v0 = affine.load %m[%i1] : memref<10xf32>
175 // PRODUCER-CONSUMER: affine.for %{{.*}} = 0 to 10 {
176 // PRODUCER-CONSUMER-NEXT: affine.store %{{.*}}, %{{.*}}[0] : memref<1xf32>
177 // PRODUCER-CONSUMER-NEXT: affine.load %{{.*}}[0] : memref<1xf32>
178 // PRODUCER-CONSUMER-NEXT: }
// Second producer/consumer pair, identical in shape to the first.
181 affine.for %i0 = 0 to 10 {
182 affine.store %cf7, %m[%i0] : memref<10xf32>
184 affine.for %i1 = 0 to 10 {
185 %v0 = affine.load %m[%i1] : memref<10xf32>
187 // PRODUCER-CONSUMER: affine.for %{{.*}} = 0 to 10 {
188 // PRODUCER-CONSUMER-NEXT: affine.store %{{.*}}, %{{.*}}[0] : memref<1xf32>
189 // PRODUCER-CONSUMER-NEXT: affine.load %{{.*}}[0] : memref<1xf32>
190 // PRODUCER-CONSUMER-NEXT: }
196 // PRODUCER-CONSUMER-LABEL: @fuse_higher_dim_nest_into_lower_dim_nest
// Test input: a 4-deep producer nest writes %B, and a 3-deep consumer nest
// reads %B, so the producer has one more loop level than the consumer.
197 func.func @fuse_higher_dim_nest_into_lower_dim_nest() {
198 %A = memref.alloc() : memref<8x12x128x64xf32>
199 %B = memref.alloc() : memref<8x128x12x64xf32>
// Producer nest: copies %A into %B with the two middle indices swapped
// (%A is indexed [.., %arg207, %arg206, ..], %B is [.., %arg206, %arg207, ..]).
200 affine.for %arg205 = 0 to 8 {
201 affine.for %arg206 = 0 to 128 {
202 affine.for %arg207 = 0 to 12 {
203 affine.for %arg208 = 0 to 64 {
204 %a = affine.load %A[%arg205, %arg207, %arg206, %arg208] : memref<8x12x128x64xf32>
205 affine.store %a, %B[%arg205, %arg206, %arg207, %arg208] : memref<8x128x12x64xf32>
210 %C = memref.alloc() : memref<8x128x768xf16>
// Consumer nest: the innermost loop spans 768 = 12 * 64 iterations, and
// floordiv/mod by 64 split %arg207 into the last two indices of %B. Each
// loaded value is truncated to f16 and stored to %C.
211 affine.for %arg205 = 0 to 8 {
212 affine.for %arg206 = 0 to 128 {
213 affine.for %arg207 = 0 to 768 {
214 %b = affine.load %B[%arg205, %arg206, %arg207 floordiv 64, %arg207 mod 64] : memref<8x128x12x64xf32>
215 %c = arith.truncf %b : f32 to f16
216 affine.store %c, %C[%arg205, %arg206, %arg207] : memref<8x128x768xf16>
221 // Check that fusion happens into the innermost loop of the consumer.
222 // PRODUCER-CONSUMER: affine.for
223 // PRODUCER-CONSUMER-NEXT: affine.for %{{.*}} = 0 to 128
224 // PRODUCER-CONSUMER-NEXT: affine.for %{{.*}} = 0 to 768
225 // PRODUCER-CONSUMER-NOT: affine.for
226 // PRODUCER-CONSUMER: return