// RUN: mlir-opt %s -allow-unregistered-dialect -affine-parallelize | FileCheck %s
// RUN: mlir-opt %s -allow-unregistered-dialect -affine-parallelize='max-nested=1' | FileCheck --check-prefix=MAX-NESTED %s
// RUN: mlir-opt %s -allow-unregistered-dialect -affine-parallelize='parallel-reductions=1' | FileCheck --check-prefix=REDUCE %s
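
// Tests for the affine-parallelize pass, which rewrites parallelizable
// affine.for loops into affine.parallel ops. The MAX-NESTED prefix exercises
// the max-nested option (limit on the depth of converted nests) and REDUCE
// exercises parallel-reductions (conversion of iter_args reduction loops).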

// CHECK-LABEL: func @reduce_window_max() {
func.func @reduce_window_max() {
  %cst = arith.constant 0.000000e+00 : f32
  %0 = memref.alloc() : memref<1x8x8x64xf32>
  %1 = memref.alloc() : memref<1x18x18x64xf32>
  affine.for %arg0 = 0 to 1 {
    affine.for %arg1 = 0 to 8 {
      affine.for %arg2 = 0 to 8 {
        affine.for %arg3 = 0 to 64 {
          affine.store %cst, %0[%arg0, %arg1, %arg2, %arg3] : memref<1x8x8x64xf32>
        }
      }
    }
  }
  affine.for %arg0 = 0 to 1 {
    affine.for %arg1 = 0 to 8 {
      affine.for %arg2 = 0 to 8 {
        affine.for %arg3 = 0 to 64 {
          affine.for %arg4 = 0 to 1 {
            affine.for %arg5 = 0 to 3 {
              affine.for %arg6 = 0 to 3 {
                affine.for %arg7 = 0 to 1 {
                  %2 = affine.load %0[%arg0, %arg1, %arg2, %arg3] : memref<1x8x8x64xf32>
                  %3 = affine.load %1[%arg0 + %arg4, %arg1 * 2 + %arg5, %arg2 * 2 + %arg6, %arg3 + %arg7] : memref<1x18x18x64xf32>
                  %4 = arith.cmpf ogt, %2, %3 : f32
                  %5 = arith.select %4, %2, %3 : f32
                  affine.store %5, %0[%arg0, %arg1, %arg2, %arg3] : memref<1x8x8x64xf32>
                }
              }
            }
          }
        }
      }
    }
  }
  return
}

// CHECK: %[[cst:.*]] = arith.constant 0.000000e+00 : f32
// CHECK: %[[v0:.*]] = memref.alloc() : memref<1x8x8x64xf32>
// CHECK: %[[v1:.*]] = memref.alloc() : memref<1x18x18x64xf32>
// CHECK: affine.parallel (%[[arg0:.*]]) = (0) to (1) {
// CHECK:   affine.parallel (%[[arg1:.*]]) = (0) to (8) {
// CHECK:     affine.parallel (%[[arg2:.*]]) = (0) to (8) {
// CHECK:       affine.parallel (%[[arg3:.*]]) = (0) to (64) {
// CHECK:         affine.store %[[cst]], %[[v0]][%[[arg0]], %[[arg1]], %[[arg2]], %[[arg3]]] : memref<1x8x8x64xf32>
// CHECK:       }
// CHECK:     }
// CHECK:   }
// CHECK: }
// CHECK: affine.parallel (%[[a0:.*]]) = (0) to (1) {
// CHECK:   affine.parallel (%[[a1:.*]]) = (0) to (8) {
// CHECK:     affine.parallel (%[[a2:.*]]) = (0) to (8) {
// CHECK:       affine.parallel (%[[a3:.*]]) = (0) to (64) {
// CHECK:         affine.parallel (%[[a4:.*]]) = (0) to (1) {
// CHECK:           affine.for %[[a5:.*]] = 0 to 3 {
// CHECK:             affine.for %[[a6:.*]] = 0 to 3 {
// CHECK:               affine.parallel (%[[a7:.*]]) = (0) to (1) {
// CHECK:                 %[[lhs:.*]] = affine.load %[[v0]][%[[a0]], %[[a1]], %[[a2]], %[[a3]]] : memref<1x8x8x64xf32>
// CHECK:                 %[[rhs:.*]] = affine.load %[[v1]][%[[a0]] + %[[a4]], %[[a1]] * 2 + %[[a5]], %[[a2]] * 2 + %[[a6]], %[[a3]] + %[[a7]]] : memref<1x18x18x64xf32>
// CHECK:                 %[[res:.*]] = arith.cmpf ogt, %[[lhs]], %[[rhs]] : f32
// CHECK:                 %[[sel:.*]] = arith.select %[[res]], %[[lhs]], %[[rhs]] : f32
// CHECK:                 affine.store %[[sel]], %[[v0]][%[[a0]], %[[a1]], %[[a2]], %[[a3]]] : memref<1x8x8x64xf32>
// CHECK:               }
// CHECK:             }
// CHECK:           }
// CHECK:         }
// CHECK:       }
// CHECK:     }
// CHECK:   }
// CHECK: }
// CHECK: return
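
// The outer two loops of this matmul-style nest carry no dependences and are
// converted to affine.parallel; the innermost loop reduces into %2 through a
// load/store cycle and is left as a serial affine.for.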

// CHECK-LABEL: @loop_nest_3d_outer_two_parallel
func.func @loop_nest_3d_outer_two_parallel(%N : index) {
  %0 = memref.alloc() : memref<1024 x 1024 x vector<64xf32>>
  %1 = memref.alloc() : memref<1024 x 1024 x vector<64xf32>>
  %2 = memref.alloc() : memref<1024 x 1024 x vector<64xf32>>
  affine.for %i = 0 to %N {
    affine.for %j = 0 to %N {
      %7 = affine.load %2[%i, %j] : memref<1024x1024xvector<64xf32>>
      affine.for %k = 0 to %N {
        %5 = affine.load %0[%i, %k] : memref<1024x1024xvector<64xf32>>
        %6 = affine.load %1[%k, %j] : memref<1024x1024xvector<64xf32>>
        %8 = arith.mulf %5, %6 : vector<64xf32>
        %9 = arith.addf %7, %8 : vector<64xf32>
        affine.store %9, %2[%i, %j] : memref<1024x1024xvector<64xf32>>
      }
    }
  }
  return
}

// CHECK:      affine.parallel (%[[arg1:.*]]) = (0) to (symbol(%arg0)) {
// CHECK-NEXT:   affine.parallel (%[[arg2:.*]]) = (0) to (symbol(%arg0)) {
// CHECK:          affine.for %[[arg3:.*]] = 0 to %arg0 {
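
// An unregistered op must be assumed to have arbitrary side effects, so the
// pass conservatively leaves the enclosing loop serial.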

// CHECK-LABEL: unknown_op_conservative
func.func @unknown_op_conservative() {
  affine.for %i = 0 to 10 {
    // CHECK: affine.for %[[arg1:.*]] = 0 to 10 {
    "unknown"() : () -> ()
  }
  return
}

// CHECK-LABEL: non_affine_load
func.func @non_affine_load() {
  %0 = memref.alloc() : memref<100 x f32>
  affine.for %i = 0 to 100 {
    // CHECK: affine.for %{{.*}} = 0 to 100 {
    memref.load %0[%i] : memref<100 x f32>
  }
  return
}
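
// Loops with multi-result lower/upper bound maps translate directly into the
// max(...)/min(...) bound syntax of affine.parallel.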

// CHECK-LABEL: for_with_minmax
func.func @for_with_minmax(%m: memref<?xf32>, %lb0: index, %lb1: index,
                           %ub0: index, %ub1: index) {
  // CHECK: affine.parallel (%{{.*}}) = (max(%{{.*}}, %{{.*}})) to (min(%{{.*}}, %{{.*}}))
  affine.for %i = max affine_map<(d0, d1) -> (d0, d1)>(%lb0, %lb1)
          to min affine_map<(d0, d1) -> (d0, d1)>(%ub0, %ub1) {
    affine.load %m[%i] : memref<?xf32>
  }
  return
}
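
// Same as above, but the min/max loop is nested inside another
// parallelizable loop whose induction variable feeds its lower bound.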

// CHECK-LABEL: nested_for_with_minmax
func.func @nested_for_with_minmax(%m: memref<?xf32>, %lb0: index,
                                  %ub0: index, %ub1: index) {
  // CHECK: affine.parallel (%[[I:.*]]) =
  affine.for %j = 0 to 10 {
    // CHECK: affine.parallel (%{{.*}}) = (max(%{{.*}}, %[[I]])) to (min(%{{.*}}, %{{.*}}))
    affine.for %i = max affine_map<(d0, d1) -> (d0, d1)>(%lb0, %j)
            to min affine_map<(d0, d1) -> (d0, d1)>(%ub0, %ub1) {
      affine.load %m[%i] : memref<?xf32>
    }
  }
  return
}
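
// With max-nested=1, only the outermost of these two perfectly nested
// parallelizable loops is converted.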

// MAX-NESTED-LABEL: @max_nested
func.func @max_nested(%m: memref<?x?xf32>, %lb0: index, %lb1: index,
                      %ub0: index, %ub1: index) {
  // MAX-NESTED: affine.parallel
  affine.for %i = affine_map<(d0) -> (d0)>(%lb0) to affine_map<(d0) -> (d0)>(%ub0) {
    // MAX-NESTED: affine.for
    affine.for %j = affine_map<(d0) -> (d0)>(%lb1) to affine_map<(d0) -> (d0)>(%ub1) {
      affine.load %m[%i, %j] : memref<?x?xf32>
    }
  }
  return
}
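
// In this matmul-style nest only the outermost loop becomes parallel under
// max-nested=1; the remaining loops stay as serial affine.for ops.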

// MAX-NESTED-LABEL: @max_nested_1
func.func @max_nested_1(%arg0: memref<4096x4096xf32>, %arg1: memref<4096x4096xf32>, %arg2: memref<4096x4096xf32>) {
  %0 = memref.alloc() : memref<4096x4096xf32>
  // MAX-NESTED: affine.parallel
  affine.for %arg3 = 0 to 4096 {
    // MAX-NESTED-NEXT: affine.for
    affine.for %arg4 = 0 to 4096 {
      // MAX-NESTED-NEXT: affine.for
      affine.for %arg5 = 0 to 4096 {
        %1 = affine.load %arg0[%arg3, %arg5] : memref<4096x4096xf32>
        %2 = affine.load %arg1[%arg5, %arg4] : memref<4096x4096xf32>
        %3 = affine.load %0[%arg3, %arg4] : memref<4096x4096xf32>
        %4 = arith.mulf %1, %2 : f32
        %5 = arith.addf %3, %4 : f32
        affine.store %5, %0[%arg3, %arg4] : memref<4096x4096xf32>
      }
    }
  }
  return
}
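
// With parallel-reductions=1, a loop whose single iter_arg is combined by a
// supported operation (here addf) becomes an affine.parallel with a reduce
// clause; the initial value is folded into the result afterwards.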

// CHECK-LABEL: @iter_args
// REDUCE-LABEL: @iter_args
func.func @iter_args(%in: memref<10xf32>) {
  // REDUCE: %[[init:.*]] = arith.constant
  %cst = arith.constant 0.000000e+00 : f32
  // CHECK-NOT: affine.parallel
  // REDUCE: %[[reduced:.*]] = affine.parallel (%{{.*}}) = (0) to (10) reduce ("addf")
  %final_red = affine.for %i = 0 to 10 iter_args(%red_iter = %cst) -> (f32) {
    // REDUCE: %[[red_value:.*]] = affine.load
    %ld = affine.load %in[%i] : memref<10xf32>
    // REDUCE-NOT: arith.addf
    %add = arith.addf %red_iter, %ld : f32
    // REDUCE: affine.yield %[[red_value]]
    affine.yield %add : f32
  }
  // REDUCE: arith.addf %[[init]], %[[reduced]]
  return
}
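
// The reduction loop sits inside another parallelizable loop. Without
// parallel-reductions only the outer loop is converted; with it, the inner
// reduction becomes an affine.parallel as well.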

// CHECK-LABEL: @nested_iter_args
// REDUCE-LABEL: @nested_iter_args
func.func @nested_iter_args(%in: memref<20x10xf32>) {
  %cst = arith.constant 0.000000e+00 : f32
  // CHECK: affine.parallel
  affine.for %i = 0 to 20 {
    // CHECK-NOT: affine.parallel
    // REDUCE: affine.parallel
    // REDUCE: reduce ("addf")
    %final_red = affine.for %j = 0 to 10 iter_args(%red_iter = %cst) -> (f32) {
      %ld = affine.load %in[%i, %j] : memref<20x10xf32>
      %add = arith.addf %red_iter, %ld : f32
      affine.yield %add : f32
    }
  }
  return
}
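
// Both iter args are combined into a single value that is yielded for each
// of them. This butterfly pattern is not a simple reduction and should not
// be parallelized.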

// REDUCE-LABEL: @strange_butterfly
func.func @strange_butterfly() {
  %cst1 = arith.constant 0.0 : f32
  %cst2 = arith.constant 1.0 : f32
  // REDUCE-NOT: affine.parallel
  affine.for %i = 0 to 10 iter_args(%it1 = %cst1, %it2 = %cst2) -> (f32, f32) {
    %0 = arith.addf %it1, %it2 : f32
    affine.yield %0, %0 : f32, f32
  }
  return
}

// An iter arg is used more than once. This is not a simple reduction and
// should not be parallelized.
// REDUCE-LABEL: @repeated_use
func.func @repeated_use() {
  %cst1 = arith.constant 0.0 : f32
  // REDUCE-NOT: affine.parallel
  affine.for %i = 0 to 10 iter_args(%it1 = %cst1) -> (f32) {
    %0 = arith.addf %it1, %it1 : f32
    affine.yield %0 : f32
  }
  return
}

// An iter arg is used in the chain of operations defining the value being
// reduced; this is not a simple reduction and should not be parallelized.
// REDUCE-LABEL: @use_in_backward_slice
func.func @use_in_backward_slice() {
  %cst1 = arith.constant 0.0 : f32
  %cst2 = arith.constant 1.0 : f32
  // REDUCE-NOT: affine.parallel
  affine.for %i = 0 to 10 iter_args(%it1 = %cst1, %it2 = %cst2) -> (f32, f32) {
    %0 = "test.some_modification"(%it2) : (f32) -> f32
    %1 = arith.addf %it1, %0 : f32
    affine.yield %1, %1 : f32, f32
  }
  return
}
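
// Like @nested_for_with_minmax, but additionally checks that the bounds of
// the parallelized loop reference the right function arguments.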

// REDUCE-LABEL: @nested_min_max
// CHECK-LABEL: @nested_min_max
// CHECK: (%{{.*}}, %[[LB0:.*]]: index, %[[UB0:.*]]: index, %[[UB1:.*]]: index)
func.func @nested_min_max(%m: memref<?xf32>, %lb0: index,
                          %ub0: index, %ub1: index) {
  // CHECK: affine.parallel (%[[J:.*]]) =
  affine.for %j = 0 to 10 {
    // CHECK: affine.parallel (%{{.*}}) = (max(%[[LB0]], %[[J]]))
    // CHECK: to (min(%[[UB0]], %[[UB1]]))
    affine.for %i = max affine_map<(d0, d1) -> (d0, d1)>(%lb0, %j)
            to min affine_map<(d0, d1) -> (d0, d1)>(%ub0, %ub1) {
      affine.load %m[%i] : memref<?xf32>
    }
  }
  return
}

// Test in the presence of locally allocated memrefs.

// CHECK: func @local_alloc
func.func @local_alloc() {
  %cst = arith.constant 0.0 : f32
  affine.for %i = 0 to 100 {
    %m = memref.alloc() : memref<1xf32>
    %ma = memref.alloca() : memref<1xf32>
    affine.store %cst, %m[0] : memref<1xf32>
  }
  // CHECK: affine.parallel
  return
}
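
// The buffer is private to each iteration of the outer loop, even when
// accessed through a memref.reinterpret_cast view, so the outer loop and the
// inner loops with iteration-dependent stores are parallelized; the loop that
// repeatedly stores to the same element carries an output dependence and
// stays serial.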

// CHECK: func @local_alloc_cast
func.func @local_alloc_cast() {
  %cst = arith.constant 0.0 : f32
  affine.for %i = 0 to 100 {
    %m = memref.alloc() : memref<128xf32>
    affine.for %j = 0 to 128 {
      affine.store %cst, %m[%j] : memref<128xf32>
    }
    affine.for %j = 0 to 128 {
      affine.store %cst, %m[0] : memref<128xf32>
    }
    %r = memref.reinterpret_cast %m to offset: [0], sizes: [8, 16],
           strides: [16, 1] : memref<128xf32> to memref<8x16xf32>
    affine.for %j = 0 to 8 {
      affine.store %cst, %r[%j, %j] : memref<8x16xf32>
    }
  }
  // CHECK: affine.parallel
  // CHECK:   affine.parallel
  // CHECK:   affine.for
  // CHECK:   affine.parallel
  return
}

// CHECK-LABEL: @iter_arg_memrefs
func.func @iter_arg_memrefs(%in: memref<10xf32>) {
  %mi = memref.alloc() : memref<f32>
  // Loop-carried memrefs are treated as serializing the loop.
  // CHECK-NOT: affine.parallel
  %mo = affine.for %i = 0 to 10 iter_args(%m_arg = %mi) -> (memref<f32>) {
    affine.yield %m_arg : memref<f32>
  }
  return
}