// RUN: mlir-opt %s -pass-pipeline="builtin.module(func.func(convert-vector-to-scf))" -split-input-file -allow-unregistered-dialect | FileCheck %s
// RUN: mlir-opt %s -pass-pipeline="builtin.module(func.func(convert-vector-to-scf{full-unroll=true lower-scalable=true}))" -split-input-file -allow-unregistered-dialect | FileCheck %s --check-prefix=FULL-UNROLL
// RUN: mlir-opt %s "-convert-vector-to-scf=full-unroll target-rank=0" -split-input-file -allow-unregistered-dialect | FileCheck %s --check-prefix=TARGET-RANK-ZERO
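
// Editorial note on the RUN lines above: they exercise, in order, (1) the
// default progressive lowering (one vector dimension unpacked per step),
// (2) full unrolling with the scalable-vector lowering enabled, and (3) full
// unrolling all the way down to rank-0 (scalar-shaped) transfers.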

// CHECK-LABEL: func @vector_transfer_ops_0d(
func.func @vector_transfer_ops_0d(%M: memref<f32>) {
  %f0 = arith.constant 0.0 : f32

  // 0-d transfers are left untouched by vector-to-scf.
  // They are independently lowered to the proper memref.load/store.
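  //
  // As a rough sketch (an editorial illustration; the exact form depends on
  // that follow-up lowering), the two transfers eventually become:
  //   %v = memref.load %M[] : memref<f32>    // from the transfer_read
  //   memref.store %v, %M[] : memref<f32>    // from the transfer_write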
  // CHECK: vector.transfer_read {{.*}}: memref<f32>, vector<f32>
  %0 = vector.transfer_read %M[], %f0 {permutation_map = affine_map<()->()>} :
    memref<f32>, vector<f32>

  // CHECK: vector.transfer_write {{.*}}: vector<f32>, memref<f32>
  vector.transfer_write %0, %M[] {permutation_map = affine_map<()->()>} :
    vector<f32>, memref<f32>

  return
}

// -----

// CHECK-LABEL: func @materialize_read_1d() {
func.func @materialize_read_1d() {
  %f0 = arith.constant 0.0: f32
  %A = memref.alloc () : memref<7x42xf32>
  affine.for %i0 = 0 to 7 step 4 {
    affine.for %i1 = 0 to 42 step 4 {
      %f1 = vector.transfer_read %A[%i0, %i1], %f0 {permutation_map = affine_map<(d0, d1) -> (d0)>} : memref<7x42xf32>, vector<4xf32>
      %ip1 = affine.apply affine_map<(d0) -> (d0 + 1)> (%i1)
      %f2 = vector.transfer_read %A[%i0, %ip1], %f0 {permutation_map = affine_map<(d0, d1) -> (d0)>} : memref<7x42xf32>, vector<4xf32>
      %ip2 = affine.apply affine_map<(d0) -> (d0 + 2)> (%i1)
      %f3 = vector.transfer_read %A[%i0, %ip2], %f0 {permutation_map = affine_map<(d0, d1) -> (d0)>} : memref<7x42xf32>, vector<4xf32>
      %ip3 = affine.apply affine_map<(d0) -> (d0 + 3)> (%i1)
      %f4 = vector.transfer_read %A[%i0, %ip3], %f0 {permutation_map = affine_map<(d0, d1) -> (d0)>} : memref<7x42xf32>, vector<4xf32>
      // Both accesses in the load must be clipped, otherwise %i1 + 2 and %i1 + 3 will go out of bounds.
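      //
      // Roughly, each element access ends up guarded like this (an
      // illustrative sketch, not literal expected output):
      //   scf.if %index_is_in_bounds -> (vector<4xf32>) {
      //     %s = memref.load %A[...]
      //     %v = vector.insertelement %s, %acc[...]
      //     scf.yield %v : vector<4xf32>
      //   } else {
      //     scf.yield %acc : vector<4xf32>
      //   }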
      // CHECK: scf.if
      // CHECK-NEXT: memref.load
      // CHECK-NEXT: vector.insertelement
      // CHECK-NEXT: scf.yield
      // CHECK-NEXT: } else {
      // CHECK-NEXT: scf.yield
      // Add a dummy use to prevent dead code elimination from removing transfer
      // read ops.
      "dummy_use"(%f1, %f2, %f3, %f4) : (vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>) -> ()
    }
  }
  return
}

// -----

// CHECK-LABEL: func @materialize_read_1d_partially_specialized
func.func @materialize_read_1d_partially_specialized(%dyn1 : index, %dyn2 : index, %dyn4 : index) {
  %f0 = arith.constant 0.0: f32
  %A = memref.alloc (%dyn1, %dyn2, %dyn4) : memref<7x?x?x42x?xf32>
  affine.for %i0 = 0 to 7 {
    affine.for %i1 = 0 to %dyn1 {
      affine.for %i2 = 0 to %dyn2 {
        affine.for %i3 = 0 to 42 step 2 {
          affine.for %i4 = 0 to %dyn4 {
            %f1 = vector.transfer_read %A[%i0, %i1, %i2, %i3, %i4], %f0 {permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d3)>} : memref<7x?x?x42x?xf32>, vector<4xf32>
            %i3p1 = affine.apply affine_map<(d0) -> (d0 + 1)> (%i3)
            %f2 = vector.transfer_read %A[%i0, %i1, %i2, %i3p1, %i4], %f0 {permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d3)>} : memref<7x?x?x42x?xf32>, vector<4xf32>
            // Add a dummy use to prevent dead code elimination from removing
            // transfer read ops.
            "dummy_use"(%f1, %f2) : (vector<4xf32>, vector<4xf32>) -> ()
          }
        }
      }
    }
  }
  // CHECK: %[[tensor:[0-9a-zA-Z_]+]] = memref.alloc
  // CHECK-NOT: {{.*}} memref.dim %[[tensor]], %c0
  // CHECK-NOT: {{.*}} memref.dim %[[tensor]], %c3
  return
}

// -----

// CHECK: #[[$ADD:map.*]] = affine_map<(d0, d1) -> (d0 + d1)>

// CHECK-LABEL: func @materialize_read(%{{.*}}: index, %{{.*}}: index, %{{.*}}: index, %{{.*}}: index) {
func.func @materialize_read(%M: index, %N: index, %O: index, %P: index) {
  %f0 = arith.constant 0.0: f32
  // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
  // CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
  // CHECK-DAG: %[[C3:.*]] = arith.constant 3 : index
  // CHECK-DAG: %[[C4:.*]] = arith.constant 4 : index
  // CHECK-DAG: %[[C5:.*]] = arith.constant 5 : index
  // CHECK: %{{.*}} = memref.alloc(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : memref<?x?x?x?xf32>
  // CHECK-NEXT: affine.for %[[I0:.*]] = 0 to %{{.*}} step 3 {
  // CHECK-NEXT: affine.for %[[I1:.*]] = 0 to %{{.*}} {
  // CHECK-NEXT: affine.for %[[I2:.*]] = 0 to %{{.*}} {
  // CHECK-NEXT: affine.for %[[I3:.*]] = 0 to %{{.*}} step 5 {
  // CHECK: %[[ALLOC:.*]] = memref.alloca() : memref<vector<5x4x3xf32>>
  // CHECK: scf.for %[[I4:.*]] = %[[C0]] to %[[C5]] step %[[C1]] {
  // CHECK: %[[L3:.*]] = affine.apply #[[$ADD]](%[[I3]], %[[I4]])
  // CHECK: scf.for %[[I5:.*]] = %[[C0]] to %[[C4]] step %[[C1]] {
  // CHECK: %[[VEC:.*]] = scf.for %[[I6:.*]] = %[[C0]] to %[[C3]] step %[[C1]] {{.*}} -> (vector<3xf32>) {
  // CHECK: %[[L0:.*]] = affine.apply #[[$ADD]](%[[I0]], %[[I6]])
  // CHECK: scf.if {{.*}} -> (vector<3xf32>) {
  // CHECK-NEXT: %[[SCAL:.*]] = memref.load %{{.*}}[%[[L0]], %[[I1]], %[[I2]], %[[L3]]] : memref<?x?x?x?xf32>
  // CHECK-NEXT: %[[RVEC:.*]] = vector.insertelement %[[SCAL]], %{{.*}}[%[[I6]] : index] : vector<3xf32>
  // CHECK-NEXT: scf.yield
  // CHECK-NEXT: } else {
  // CHECK-NEXT: scf.yield
  // CHECK-NEXT: }
  // CHECK-NEXT: scf.yield
  // CHECK-NEXT: }
  // CHECK-NEXT: memref.store %[[VEC]], {{.*}} : memref<5x4xvector<3xf32>>
  // CHECK-NEXT: }
  // CHECK-NEXT: } else {
  // CHECK-NEXT: memref.store {{.*}} : memref<5xvector<4x3xf32>>
  // CHECK-NEXT: }
  // CHECK-NEXT: }
  // CHECK-NEXT: %[[LD:.*]] = memref.load %[[ALLOC]][] : memref<vector<5x4x3xf32>>
  // CHECK-NEXT: "dummy_use"(%[[LD]]) : (vector<5x4x3xf32>) -> ()
  // CHECK-NEXT: }
  // CHECK-NEXT: }
  // CHECK-NEXT: }
  // CHECK-NEXT: }
  // CHECK-NEXT: return
  // CHECK-NEXT: }
  // Check that I0 + I6 (of size 3) is read via the first memref index (L0 = I0 + I6)
  // and inserted into the result vector at lane I6.
  // Check that I3 + I4 (of size 5) is read via the last memref index (L3 = I3 + I4)
  // and stored via the first index of the temporary buffer (I4).
  // The other memref dimensions are simply accessed with I1 and I2, respectively.
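  // Schematically (a simplified sketch; the bounds checks and else-branches
  // are omitted), the lowered nest is:
  //   scf.for %I4 = 0 to 5 {            // L3 = I3 + I4 (last memref dim)
  //     scf.for %I5 = 0 to 4 {          // broadcast dim (the "0" in the map)
  //       scf.for %I6 = 0 to 3 {        // L0 = I0 + I6 (first memref dim)
  //         %s = memref.load %A[%L0, %I1, %I2, %L3] ...
  //       }
  //     }
  //   }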
  %A = memref.alloc (%M, %N, %O, %P) : memref<?x?x?x?xf32, 0>
  affine.for %i0 = 0 to %M step 3 {
    affine.for %i1 = 0 to %N {
      affine.for %i2 = 0 to %O {
        affine.for %i3 = 0 to %P step 5 {
          %f = vector.transfer_read %A[%i0, %i1, %i2, %i3], %f0 {permutation_map = affine_map<(d0, d1, d2, d3) -> (d3, 0, d0)>} : memref<?x?x?x?xf32>, vector<5x4x3xf32>
          // Add a dummy use to prevent dead code elimination from removing
          // transfer read ops.
          "dummy_use"(%f) : (vector<5x4x3xf32>) -> ()
        }
      }
    }
  }
  return
}

// -----

// CHECK: #[[$ADD:map.*]] = affine_map<(d0, d1) -> (d0 + d1)>

// CHECK-LABEL: func @materialize_write(%{{.*}}: index, %{{.*}}: index, %{{.*}}: index, %{{.*}}: index) {
func.func @materialize_write(%M: index, %N: index, %O: index, %P: index) {
  // CHECK-DAG: %{{.*}} = arith.constant dense<1.000000e+00> : vector<3x4x1x5xf32>
  // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
  // CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
  // CHECK-DAG: %[[C3:.*]] = arith.constant 3 : index
  // CHECK-DAG: %[[C4:.*]] = arith.constant 4 : index
  // CHECK: %{{.*}} = memref.alloc(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : memref<?x?x?x?xf32>
  // CHECK-NEXT: affine.for %[[I0:.*]] = 0 to %{{.*}} step 3 {
  // CHECK-NEXT: affine.for %[[I1:.*]] = 0 to %{{.*}} step 4 {
  // CHECK-NEXT: affine.for %[[I2:.*]] = 0 to %{{.*}} {
  // CHECK-NEXT: affine.for %[[I3:.*]] = 0 to %{{.*}} step 5 {
  // CHECK: %[[ALLOC:.*]] = memref.alloca() : memref<vector<3x4x1x5xf32>>
  // CHECK: memref.store %{{.*}}, %[[ALLOC]][] : memref<vector<3x4x1x5xf32>>
  // CHECK: %[[VECTOR_VIEW1:.*]] = vector.type_cast %[[ALLOC]] : memref<vector<3x4x1x5xf32>> to memref<3xvector<4x1x5xf32>>
  // CHECK: scf.for %[[I4:.*]] = %[[C0]] to %[[C3]] step %[[C1]] {
  // CHECK: %[[S3:.*]] = affine.apply #[[$ADD]](%[[I0]], %[[I4]])
  // CHECK: %[[VECTOR_VIEW2:.*]] = vector.type_cast %[[VECTOR_VIEW1]] : memref<3xvector<4x1x5xf32>> to memref<3x4xvector<1x5xf32>>
  // CHECK: scf.for %[[I5:.*]] = %[[C0]] to %[[C4]] step %[[C1]] {
  // CHECK: %[[S1:.*]] = affine.apply #[[$ADD]](%[[I1]], %[[I5]])
  // CHECK: %[[VECTOR_VIEW3:.*]] = vector.type_cast %[[VECTOR_VIEW2]] : memref<3x4xvector<1x5xf32>> to memref<3x4x1xvector<5xf32>>
  // CHECK: scf.for %[[I6:.*]] = %[[C0]] to %[[C1]] step %[[C1]] {
  // CHECK: %[[S0:.*]] = affine.apply #[[$ADD]](%[[I2]], %[[I6]])
  // CHECK: %[[VEC:.*]] = memref.load %[[VECTOR_VIEW3]][%[[I4]], %[[I5]], %[[I6]]] : memref<3x4x1xvector<5xf32>>
  // CHECK: vector.transfer_write %[[VEC]], %{{.*}}[%[[S3]], %[[S1]], %[[S0]], %[[I3]]] : vector<5xf32>, memref<?x?x?x?xf32>
  // CHECK: }
  // CHECK: }
  // CHECK: }
  // CHECK: }
  // CHECK: }
  // CHECK: }
  // CHECK: }
  // CHECK: return
  // CHECK: }
  // Check that I0 + I4 (of size 3) is read via the first index of the temporary
  // buffer (I4) and written via the first memref index (S3 = I0 + I4).
  // Check that I1 + I5 (of size 4) is read via the second index (I5) and written
  // via the second memref index (S1 = I1 + I5).
  // The third memref index is S0 = I2 + I6, where I6 iterates over the inserted
  // unit dim; the vector<5xf32> slices themselves are written along the last
  // memref dim at I3.
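  // Schematically (a simplified sketch, ignoring the vector.type_cast views
  // and the CHECK capture names):
  //   scf.for %I4 = 0 to 3 {            // S3 = I0 + I4
  //     scf.for %I5 = 0 to 4 {          // S1 = I1 + I5
  //       scf.for %I6 = 0 to 1 {        // S0 = I2 + I6 (unit dim)
  //         %v = memref.load %view[%I4, %I5, %I6] : memref<3x4x1xvector<5xf32>>
  //         vector.transfer_write %v, %A[%S3, %S1, %S0, %I3] ...
  //       }
  //     }
  //   }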
  %A = memref.alloc (%M, %N, %O, %P) : memref<?x?x?x?xf32, 0>
  %f1 = arith.constant dense<1.000000e+00> : vector<5x4x3xf32>
  affine.for %i0 = 0 to %M step 3 {
    affine.for %i1 = 0 to %N step 4 {
      affine.for %i2 = 0 to %O {
        affine.for %i3 = 0 to %P step 5 {
          vector.transfer_write %f1, %A[%i0, %i1, %i2, %i3] {permutation_map = affine_map<(d0, d1, d2, d3) -> (d3, d1, d0)>} : vector<5x4x3xf32>, memref<?x?x?x?xf32>
        }
      }
    }
  }
  return
}

// -----

// CHECK-DAG: #[[$MAP0:.*]] = affine_map<(d0)[s0] -> (d0 + s0)>

// FULL-UNROLL-DAG: #[[$MAP1:.*]] = affine_map<()[s0] -> (s0 + 1)>
// FULL-UNROLL-DAG: #[[$MAP2:.*]] = affine_map<()[s0] -> (s0 + 2)>

// CHECK-LABEL: transfer_read_progressive(
// CHECK-SAME: %[[A:[a-zA-Z0-9]+]]: memref<?x?xf32>,
// CHECK-SAME: %[[base:[a-zA-Z0-9]+]]: index

// FULL-UNROLL-LABEL: transfer_read_progressive(
// FULL-UNROLL-SAME: %[[A:[a-zA-Z0-9]+]]: memref<?x?xf32>,
// FULL-UNROLL-SAME: %[[base:[a-zA-Z0-9]+]]: index

func.func @transfer_read_progressive(%A : memref<?x?xf32>, %base: index) -> vector<3x15xf32> {
  %f7 = arith.constant 7.0: f32
  // CHECK-DAG: %[[C7:.*]] = arith.constant 7.000000e+00 : f32
  // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
  // CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
  // CHECK-DAG: %[[C3:.*]] = arith.constant 3 : index
  // CHECK-DAG: %[[splat:.*]] = arith.constant dense<7.000000e+00> : vector<15xf32>
  // CHECK-DAG: %[[alloc:.*]] = memref.alloca() : memref<vector<3x15xf32>>
  // CHECK: %[[alloc_casted:.*]] = vector.type_cast %[[alloc]] : memref<vector<3x15xf32>> to memref<3xvector<15xf32>>
  // CHECK: scf.for %[[I:.*]] = %[[C0]] to %[[C3]]
  // CHECK: %[[dim:.*]] = memref.dim %[[A]], %[[C0]] : memref<?x?xf32>
  // CHECK: %[[add:.*]] = affine.apply #[[$MAP0]](%[[I]])[%[[base]]]
  // CHECK: %[[cond1:.*]] = arith.cmpi sgt, %[[dim]], %[[add]] : index
  // CHECK: scf.if %[[cond1]] {
  // CHECK: %[[vec_1d:.*]] = vector.transfer_read %[[A]][%{{.*}}, %[[base]]], %[[C7]] : memref<?x?xf32>, vector<15xf32>
  // CHECK: memref.store %[[vec_1d]], %[[alloc_casted]][%[[I]]] : memref<3xvector<15xf32>>
  // CHECK: } else {
  // CHECK: memref.store %[[splat]], %[[alloc_casted]][%[[I]]] : memref<3xvector<15xf32>>
  // CHECK: }
  // CHECK: }
  // CHECK: %[[cst:.*]] = memref.load %[[alloc]][] : memref<vector<3x15xf32>>

  // FULL-UNROLL-DAG: %[[C7:.*]] = arith.constant 7.000000e+00 : f32
  // FULL-UNROLL-DAG: %[[VEC0:.*]] = arith.constant dense<7.000000e+00> : vector<3x15xf32>
  // FULL-UNROLL-DAG: %[[C0:.*]] = arith.constant 0 : index
  // FULL-UNROLL: %[[DIM:.*]] = memref.dim %[[A]], %[[C0]] : memref<?x?xf32>
  // FULL-UNROLL: cmpi sgt, %[[DIM]], %[[base]] : index
  // FULL-UNROLL: %[[VEC1:.*]] = scf.if %{{.*}} -> (vector<3x15xf32>) {
  // FULL-UNROLL: vector.transfer_read %[[A]][%[[base]], %[[base]]], %[[C7]] : memref<?x?xf32>, vector<15xf32>
  // FULL-UNROLL: vector.insert %{{.*}}, %[[VEC0]] [0] : vector<15xf32> into vector<3x15xf32>
  // FULL-UNROLL: scf.yield %{{.*}} : vector<3x15xf32>
  // FULL-UNROLL: } else {
  // FULL-UNROLL: scf.yield %{{.*}} : vector<3x15xf32>
  // FULL-UNROLL: }
  // FULL-UNROLL: affine.apply #[[$MAP1]]()[%[[base]]]
  // FULL-UNROLL: cmpi sgt, %{{.*}}, %{{.*}} : index
  // FULL-UNROLL: %[[VEC2:.*]] = scf.if %{{.*}} -> (vector<3x15xf32>) {
  // FULL-UNROLL: vector.transfer_read %[[A]][%{{.*}}, %[[base]]], %[[C7]] : memref<?x?xf32>, vector<15xf32>
  // FULL-UNROLL: vector.insert %{{.*}}, %[[VEC1]] [1] : vector<15xf32> into vector<3x15xf32>
  // FULL-UNROLL: scf.yield %{{.*}} : vector<3x15xf32>
  // FULL-UNROLL: } else {
  // FULL-UNROLL: scf.yield %{{.*}} : vector<3x15xf32>
  // FULL-UNROLL: }
  // FULL-UNROLL: affine.apply #[[$MAP2]]()[%[[base]]]
  // FULL-UNROLL: cmpi sgt, %{{.*}}, %{{.*}} : index
  // FULL-UNROLL: %[[VEC3:.*]] = scf.if %{{.*}} -> (vector<3x15xf32>) {
  // FULL-UNROLL: vector.transfer_read %[[A]][%{{.*}}, %[[base]]], %[[C7]] : memref<?x?xf32>, vector<15xf32>
  // FULL-UNROLL: vector.insert %{{.*}}, %[[VEC2]] [2] : vector<15xf32> into vector<3x15xf32>
  // FULL-UNROLL: scf.yield %{{.*}} : vector<3x15xf32>
  // FULL-UNROLL: } else {
  // FULL-UNROLL: scf.yield %{{.*}} : vector<3x15xf32>
  // FULL-UNROLL: }

  %f = vector.transfer_read %A[%base, %base], %f7 :
    memref<?x?xf32>, vector<3x15xf32>

  return %f: vector<3x15xf32>
}

// -----

// CHECK-DAG: #[[$MAP0:.*]] = affine_map<(d0)[s0] -> (d0 + s0)>

// FULL-UNROLL-DAG: #[[$MAP1:.*]] = affine_map<()[s0] -> (s0 + 1)>
// FULL-UNROLL-DAG: #[[$MAP2:.*]] = affine_map<()[s0] -> (s0 + 2)>

// CHECK-LABEL: transfer_write_progressive(
// CHECK-SAME: %[[A:[a-zA-Z0-9]+]]: memref<?x?xf32>,
// CHECK-SAME: %[[base:[a-zA-Z0-9]+]]: index,
// CHECK-SAME: %[[vec:[a-zA-Z0-9]+]]: vector<3x15xf32>
// FULL-UNROLL-LABEL: transfer_write_progressive(
// FULL-UNROLL-SAME: %[[A:[a-zA-Z0-9]+]]: memref<?x?xf32>,
// FULL-UNROLL-SAME: %[[base:[a-zA-Z0-9]+]]: index,
// FULL-UNROLL-SAME: %[[vec:[a-zA-Z0-9]+]]: vector<3x15xf32>
func.func @transfer_write_progressive(%A : memref<?x?xf32>, %base: index, %vec: vector<3x15xf32>) {
  // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
  // CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
  // CHECK-DAG: %[[C3:.*]] = arith.constant 3 : index
  // CHECK: %[[alloc:.*]] = memref.alloca() : memref<vector<3x15xf32>>
  // CHECK: memref.store %[[vec]], %[[alloc]][] : memref<vector<3x15xf32>>
  // CHECK: %[[vmemref:.*]] = vector.type_cast %[[alloc]] : memref<vector<3x15xf32>> to memref<3xvector<15xf32>>
  // CHECK: scf.for %[[I:.*]] = %[[C0]] to %[[C3]]
  // CHECK: %[[dim:.*]] = memref.dim %[[A]], %[[C0]] : memref<?x?xf32>
  // CHECK: %[[add:.*]] = affine.apply #[[$MAP0]](%[[I]])[%[[base]]]
  // CHECK: %[[cmp:.*]] = arith.cmpi sgt, %[[dim]], %[[add]] : index
  // CHECK: scf.if %[[cmp]] {
  // CHECK: %[[vec_1d:.*]] = memref.load %[[vmemref]][%[[I]]] : memref<3xvector<15xf32>>
  // CHECK: vector.transfer_write %[[vec_1d]], %[[A]][{{.*}}, %[[base]]] : vector<15xf32>, memref<?x?xf32>
  // CHECK: }
  // CHECK: }

  // FULL-UNROLL: %[[C0:.*]] = arith.constant 0 : index
  // FULL-UNROLL: %[[DIM:.*]] = memref.dim %[[A]], %[[C0]] : memref<?x?xf32>
  // FULL-UNROLL: %[[CMP0:.*]] = arith.cmpi sgt, %[[DIM]], %[[base]] : index
  // FULL-UNROLL: scf.if %[[CMP0]] {
  // FULL-UNROLL: %[[V0:.*]] = vector.extract %[[vec]][0] : vector<15xf32> from vector<3x15xf32>
  // FULL-UNROLL: vector.transfer_write %[[V0]], %[[A]][%[[base]], %[[base]]] : vector<15xf32>, memref<?x?xf32>
  // FULL-UNROLL: }
  // FULL-UNROLL: %[[I1:.*]] = affine.apply #[[$MAP1]]()[%[[base]]]
  // FULL-UNROLL: %[[CMP1:.*]] = arith.cmpi sgt, %{{.*}}, %[[I1]] : index
  // FULL-UNROLL: scf.if %[[CMP1]] {
  // FULL-UNROLL: %[[V1:.*]] = vector.extract %[[vec]][1] : vector<15xf32> from vector<3x15xf32>
  // FULL-UNROLL: vector.transfer_write %[[V1]], %[[A]][%{{.*}}, %[[base]]] : vector<15xf32>, memref<?x?xf32>
  // FULL-UNROLL: }
  // FULL-UNROLL: %[[I2:.*]] = affine.apply #[[$MAP2]]()[%[[base]]]
  // FULL-UNROLL: %[[CMP2:.*]] = arith.cmpi sgt, %{{.*}}, %[[I2]] : index
  // FULL-UNROLL: scf.if %[[CMP2]] {
  // FULL-UNROLL: %[[V2:.*]] = vector.extract %[[vec]][2] : vector<15xf32> from vector<3x15xf32>
  // FULL-UNROLL: vector.transfer_write %[[V2]], %[[A]][%{{.*}}, %[[base]]] : vector<15xf32>, memref<?x?xf32>
  // FULL-UNROLL: }

  vector.transfer_write %vec, %A[%base, %base] :
    vector<3x15xf32>, memref<?x?xf32>
  return
}

// -----

// CHECK-DAG: #[[$MAP0:.*]] = affine_map<(d0)[s0] -> (d0 + s0)>

// FULL-UNROLL-DAG: #[[$MAP1:.*]] = affine_map<()[s0] -> (s0 + 1)>
// FULL-UNROLL-DAG: #[[$MAP2:.*]] = affine_map<()[s0] -> (s0 + 2)>

// CHECK-LABEL: transfer_write_progressive_inbounds(
// CHECK-SAME: %[[A:[a-zA-Z0-9]+]]: memref<?x?xf32>,
// CHECK-SAME: %[[base:[a-zA-Z0-9]+]]: index,
// CHECK-SAME: %[[vec:[a-zA-Z0-9]+]]: vector<3x15xf32>
// FULL-UNROLL-LABEL: transfer_write_progressive_inbounds(
// FULL-UNROLL-SAME: %[[A:[a-zA-Z0-9]+]]: memref<?x?xf32>,
// FULL-UNROLL-SAME: %[[base:[a-zA-Z0-9]+]]: index,
// FULL-UNROLL-SAME: %[[vec:[a-zA-Z0-9]+]]: vector<3x15xf32>
func.func @transfer_write_progressive_inbounds(%A : memref<?x?xf32>, %base: index, %vec: vector<3x15xf32>) {
  // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
  // CHECK-DAG: %[[C3:.*]] = arith.constant 3 : index
  // CHECK: %[[alloc:.*]] = memref.alloca() : memref<vector<3x15xf32>>
  // CHECK-NEXT: memref.store %[[vec]], %[[alloc]][] : memref<vector<3x15xf32>>
  // CHECK-NEXT: %[[vmemref:.*]] = vector.type_cast %[[alloc]] : memref<vector<3x15xf32>> to memref<3xvector<15xf32>>
  // CHECK-NEXT: scf.for %[[I:.*]] = %[[C0]] to %[[C3]]
  // CHECK-NEXT: %[[add:.*]] = affine.apply #[[$MAP0]](%[[I]])[%[[base]]]
  // CHECK-NEXT: %[[vec_1d:.*]] = memref.load %[[vmemref]][%[[I]]] : memref<3xvector<15xf32>>
  // CHECK-NEXT: vector.transfer_write %[[vec_1d]], %[[A]][%[[add]], %[[base]]] {in_bounds = [true]} : vector<15xf32>, memref<?x?xf32>

  // FULL-UNROLL: %[[VEC0:.*]] = vector.extract %[[vec]][0] : vector<15xf32> from vector<3x15xf32>
  // FULL-UNROLL: vector.transfer_write %[[VEC0]], %[[A]][%[[base]], %[[base]]] {in_bounds = [true]} : vector<15xf32>, memref<?x?xf32>
  // FULL-UNROLL: %[[I1:.*]] = affine.apply #[[$MAP1]]()[%[[base]]]
  // FULL-UNROLL: %[[VEC1:.*]] = vector.extract %[[vec]][1] : vector<15xf32> from vector<3x15xf32>
  // FULL-UNROLL: vector.transfer_write %[[VEC1]], %[[A]][%[[I1]], %[[base]]] {in_bounds = [true]} : vector<15xf32>, memref<?x?xf32>
  // FULL-UNROLL: %[[I2:.*]] = affine.apply #[[$MAP2]]()[%[[base]]]
  // FULL-UNROLL: %[[VEC2:.*]] = vector.extract %[[vec]][2] : vector<15xf32> from vector<3x15xf32>
  // FULL-UNROLL: vector.transfer_write %[[VEC2]], %[[A]][%[[I2]], %[[base]]] {in_bounds = [true]} : vector<15xf32>, memref<?x?xf32>
  vector.transfer_write %vec, %A[%base, %base] {in_bounds = [true, true]} :
    vector<3x15xf32>, memref<?x?xf32>
  return
}

// -----

// FULL-UNROLL-LABEL: transfer_read_simple
func.func @transfer_read_simple(%A : memref<2x2xf32>) -> vector<2x2xf32> {
  %c0 = arith.constant 0 : index
  %f0 = arith.constant 0.0 : f32
  // FULL-UNROLL-DAG: %[[VC0:.*]] = arith.constant dense<0.000000e+00> : vector<2x2xf32>
  // FULL-UNROLL-DAG: %[[C0:.*]] = arith.constant 0 : index
  // FULL-UNROLL-DAG: %[[C1:.*]] = arith.constant 1 : index
  // FULL-UNROLL: %[[V0:.*]] = vector.transfer_read %{{.*}}[%[[C0]], %[[C0]]]
  // FULL-UNROLL: %[[RES0:.*]] = vector.insert %[[V0]], %[[VC0]] [0] : vector<2xf32> into vector<2x2xf32>
  // FULL-UNROLL: %[[V1:.*]] = vector.transfer_read %{{.*}}[%[[C1]], %[[C0]]]
  // FULL-UNROLL: %[[RES1:.*]] = vector.insert %[[V1]], %[[RES0]] [1] : vector<2xf32> into vector<2x2xf32>
  %0 = vector.transfer_read %A[%c0, %c0], %f0 : memref<2x2xf32>, vector<2x2xf32>
  return %0 : vector<2x2xf32>
}

func.func @transfer_read_minor_identity(%A : memref<?x?x?x?xf32>) -> vector<3x3xf32> {
  %c0 = arith.constant 0 : index
  %f0 = arith.constant 0.0 : f32
  %0 = vector.transfer_read %A[%c0, %c0, %c0, %c0], %f0
    { permutation_map = affine_map<(d0, d1, d2, d3) -> (d2, d3)> }
    : memref<?x?x?x?xf32>, vector<3x3xf32>
  return %0 : vector<3x3xf32>
}

// CHECK-LABEL: transfer_read_minor_identity(
// CHECK-SAME: %[[A:.*]]: memref<?x?x?x?xf32>) -> vector<3x3xf32>
// CHECK-DAG: %[[c0:.*]] = arith.constant 0 : index
// CHECK-DAG: %[[c1:.*]] = arith.constant 1 : index
// CHECK-DAG: %[[c2:.*]] = arith.constant 2 : index
// CHECK-DAG: %[[c3:.*]] = arith.constant 3 : index
// CHECK-DAG: %[[f0:.*]] = arith.constant 0.000000e+00 : f32
// CHECK-DAG: %[[cst0:.*]] = arith.constant dense<0.000000e+00> : vector<3xf32>
// CHECK: %[[m:.*]] = memref.alloca() : memref<vector<3x3xf32>>
// CHECK: %[[cast:.*]] = vector.type_cast %[[m]] : memref<vector<3x3xf32>> to memref<3xvector<3xf32>>
// CHECK: scf.for %[[arg1:.*]] = %[[c0]] to %[[c3]]
// CHECK: %[[d:.*]] = memref.dim %[[A]], %[[c2]] : memref<?x?x?x?xf32>
// CHECK: %[[cmp:.*]] = arith.cmpi sgt, %[[d]], %[[arg1]] : index
// CHECK: scf.if %[[cmp]] {
// CHECK: %[[tr:.*]] = vector.transfer_read %[[A]][%c0, %c0, %[[arg1]], %c0], %[[f0]] : memref<?x?x?x?xf32>, vector<3xf32>
// CHECK: memref.store %[[tr]], %[[cast]][%[[arg1]]] : memref<3xvector<3xf32>>
// CHECK: } else {
// CHECK: memref.store %[[cst0]], %[[cast]][%[[arg1]]] : memref<3xvector<3xf32>>
// CHECK: }
// CHECK: }
// CHECK: %[[ret:.*]] = memref.load %[[m]][] : memref<vector<3x3xf32>>
// CHECK: return %[[ret]] : vector<3x3xf32>

func.func @transfer_write_minor_identity(%A : vector<3x3xf32>, %B : memref<?x?x?x?xf32>) {
  %c0 = arith.constant 0 : index
  %f0 = arith.constant 0.0 : f32
  vector.transfer_write %A, %B[%c0, %c0, %c0, %c0]
    { permutation_map = affine_map<(d0, d1, d2, d3) -> (d2, d3)> }
    : vector<3x3xf32>, memref<?x?x?x?xf32>
  return
}

// CHECK-LABEL: transfer_write_minor_identity(
// CHECK-SAME: %[[A:.*]]: vector<3x3xf32>,
// CHECK-SAME: %[[B:.*]]: memref<?x?x?x?xf32>)
// CHECK-DAG: %[[c0:.*]] = arith.constant 0 : index
// CHECK-DAG: %[[c1:.*]] = arith.constant 1 : index
// CHECK-DAG: %[[c2:.*]] = arith.constant 2 : index
// CHECK-DAG: %[[c3:.*]] = arith.constant 3 : index
// CHECK: %[[m:.*]] = memref.alloca() : memref<vector<3x3xf32>>
// CHECK: memref.store %[[A]], %[[m]][] : memref<vector<3x3xf32>>
// CHECK: %[[cast:.*]] = vector.type_cast %[[m]] : memref<vector<3x3xf32>> to memref<3xvector<3xf32>>
// CHECK: scf.for %[[arg2:.*]] = %[[c0]] to %[[c3]]
// CHECK: %[[d:.*]] = memref.dim %[[B]], %[[c2]] : memref<?x?x?x?xf32>
// CHECK: %[[cmp:.*]] = arith.cmpi sgt, %[[d]], %[[arg2]] : index
// CHECK: scf.if %[[cmp]] {
// CHECK: %[[tmp:.*]] = memref.load %[[cast]][%[[arg2]]] : memref<3xvector<3xf32>>
// CHECK: vector.transfer_write %[[tmp]], %[[B]][%[[c0]], %[[c0]], %[[arg2]], %[[c0]]] : vector<3xf32>, memref<?x?x?x?xf32>
// CHECK: }
// CHECK: }
// CHECK: return

// -----

func.func @transfer_read_strided(%A : memref<8x4xf32, affine_map<(d0, d1) -> (d0 + d1 * 8)>>) -> vector<4xf32> {
  %c0 = arith.constant 0 : index
  %f0 = arith.constant 0.0 : f32
  %0 = vector.transfer_read %A[%c0, %c0], %f0
    : memref<8x4xf32, affine_map<(d0, d1) -> (d0 + d1 * 8)>>, vector<4xf32>
  return %0 : vector<4xf32>
}

// CHECK-LABEL: transfer_read_strided(
// CHECK: scf.for
// CHECK: memref.load

func.func @transfer_write_strided(%A : vector<4xf32>, %B : memref<8x4xf32, affine_map<(d0, d1) -> (d0 + d1 * 8)>>) {
  %c0 = arith.constant 0 : index
  vector.transfer_write %A, %B[%c0, %c0] :
    vector<4xf32>, memref<8x4xf32, affine_map<(d0, d1) -> (d0 + d1 * 8)>>
  return
}

// CHECK-LABEL: transfer_write_strided(
// CHECK: scf.for
// CHECK: memref.store

// -----

func.func private @fake_side_effecting_fun(%0: vector<2x2xf32>) -> ()

// CHECK-LABEL: transfer_read_within_async_execute
func.func @transfer_read_within_async_execute(%A : memref<2x2xf32>) -> !async.token {
  %c0 = arith.constant 0 : index
  %f0 = arith.constant 0.0 : f32

  // CHECK: async.execute
  // CHECK: scf.for
  %token = async.execute {
    %0 = vector.transfer_read %A[%c0, %c0], %f0 : memref<2x2xf32>, vector<2x2xf32>
    func.call @fake_side_effecting_fun(%0) : (vector<2x2xf32>) -> ()
    async.yield
  }
  return %token : !async.token
}

// -----

// CHECK-LABEL: transfer_read_with_tensor
func.func @transfer_read_with_tensor(%arg: tensor<f32>) -> vector<1xf32> {
  // CHECK: %[[EXTRACTED:.*]] = vector.transfer_read %{{.*}}[], %{{.*}} : tensor<f32>, vector<f32>
  // CHECK-NEXT: %[[RESULT:.*]] = vector.broadcast %[[EXTRACTED]] : vector<f32> to vector<1xf32>
  // CHECK-NEXT: return %[[RESULT]] : vector<1xf32>
  %f0 = arith.constant 0.0 : f32
  %0 = vector.transfer_read %arg[], %f0 {permutation_map = affine_map<()->(0)>} :
    tensor<f32>, vector<1xf32>
  return %0: vector<1xf32>
}

// -----

// CHECK-LABEL: transfer_write_scalable
func.func @transfer_write_scalable(%arg0: memref<?xf32, strided<[?], offset: ?>>, %arg1: f32) {
  %0 = llvm.mlir.constant(0 : i32) : i32
  %c0 = arith.constant 0 : index
  %dim = memref.dim %arg0, %c0 : memref<?xf32, strided<[?], offset: ?>>
  %1 = llvm.intr.stepvector : vector<[16]xi32>
  %2 = arith.index_cast %dim : index to i32
  %3 = llvm.mlir.undef : vector<[16]xi32>
  %4 = llvm.insertelement %2, %3[%0 : i32] : vector<[16]xi32>
  %5 = llvm.shufflevector %4, %3 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] : vector<[16]xi32>
  %6 = arith.cmpi slt, %1, %5 : vector<[16]xi32>
  %7 = llvm.mlir.undef : vector<[16]xf32>
  %8 = llvm.insertelement %arg1, %7[%0 : i32] : vector<[16]xf32>
  %9 = llvm.shufflevector %8, %7 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] : vector<[16]xf32>
  vector.transfer_write %9, %arg0[%c0], %6 {in_bounds = [true]} : vector<[16]xf32>, memref<?xf32, strided<[?], offset: ?>>
  return
}

// CHECK-SAME: %[[ARG_0:.*]]: memref<?xf32, strided<[?], offset: ?>>,
// CHECK-DAG: %[[C_0:.*]] = arith.constant 0 : index
// CHECK-DAG: %[[C_16:.*]] = arith.constant 16 : index
// CHECK-DAG: %[[STEP:.*]] = arith.constant 1 : index
// CHECK: %[[MASK_VEC:.*]] = arith.cmpi slt, %{{.*}}, %{{.*}} : vector<[16]xi32>
// CHECK: %[[VSCALE:.*]] = vector.vscale
// CHECK: %[[UB:.*]] = arith.muli %[[VSCALE]], %[[C_16]] : index
// CHECK: scf.for %[[IDX:.*]] = %[[C_0]] to %[[UB]] step %[[STEP]] {
// CHECK: %[[MASK_VAL:.*]] = vector.extractelement %[[MASK_VEC]][%[[IDX]] : index] : vector<[16]xi1>
// CHECK: scf.if %[[MASK_VAL]] {
// CHECK: %[[VAL_TO_STORE:.*]] = vector.extractelement %{{.*}}[%[[IDX]] : index] : vector<[16]xf32>
// CHECK: memref.store %[[VAL_TO_STORE]], %[[ARG_0]][%[[IDX]]] : memref<?xf32, strided<[?], offset: ?>>
// CHECK: }
// CHECK: }

// -----

func.func @vector_print_vector_0d(%arg0: vector<f32>) {
  vector.print %arg0 : vector<f32>
  return
}

// CHECK-LABEL: func.func @vector_print_vector_0d(
// CHECK-SAME: %[[VEC:.*]]: vector<f32>) {
// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
// CHECK: %[[FLAT_VEC:.*]] = vector.shape_cast %[[VEC]] : vector<f32> to vector<1xf32>
// CHECK: vector.print punctuation <open>
// CHECK: scf.for %[[IDX:.*]] = %[[C0]] to %[[C1]] step %[[C1]] {
// CHECK: %[[EL:.*]] = vector.extractelement %[[FLAT_VEC]]{{\[}}%[[IDX]] : index] : vector<1xf32>
// CHECK: vector.print %[[EL]] : f32 punctuation <no_punctuation>
// CHECK: %[[IS_NOT_LAST:.*]] = arith.cmpi ult, %[[IDX]], %[[C0]] : index
// CHECK: scf.if %[[IS_NOT_LAST]] {
// CHECK: vector.print punctuation <comma>
// CHECK: }
// CHECK: }
// CHECK: vector.print punctuation <close>
// CHECK: vector.print

// -----

func.func @vector_print_vector(%arg0: vector<2x2xf32>) {
  vector.print %arg0 : vector<2x2xf32>
  return
}

// CHECK-LABEL: func.func @vector_print_vector(
// CHECK-SAME: %[[VEC:.*]]: vector<2x2xf32>) {
// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index
// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
// CHECK: %[[FLAT_VEC:.*]] = vector.shape_cast %[[VEC]] : vector<2x2xf32> to vector<4xf32>
// CHECK: vector.print punctuation <open>
// CHECK: scf.for %[[I:.*]] = %[[C0]] to %[[C2]] step %[[C1]] {
// CHECK: vector.print punctuation <open>
// CHECK: scf.for %[[J:.*]] = %[[C0]] to %[[C2]] step %[[C1]] {
// CHECK: %[[OUTER_INDEX:.*]] = arith.muli %[[I]], %[[C2]] : index
// CHECK: %[[FLAT_INDEX:.*]] = arith.addi %[[J]], %[[OUTER_INDEX]] : index
// CHECK: %[[EL:.*]] = vector.extractelement %[[FLAT_VEC]]{{\[}}%[[FLAT_INDEX]] : index] : vector<4xf32>
// CHECK: vector.print %[[EL]] : f32 punctuation <no_punctuation>
// CHECK: %[[IS_NOT_LAST_J:.*]] = arith.cmpi ult, %[[J]], %[[C1]] : index
// CHECK: scf.if %[[IS_NOT_LAST_J]] {
// CHECK: vector.print punctuation <comma>
// CHECK: }
// CHECK: }
// CHECK: vector.print punctuation <close>
// CHECK: %[[IS_NOT_LAST_I:.*]] = arith.cmpi ult, %[[I]], %[[C1]] : index
// CHECK: scf.if %[[IS_NOT_LAST_I]] {
// CHECK: vector.print punctuation <comma>
// CHECK: }
// CHECK: }
// CHECK: vector.print punctuation <close>
// CHECK: vector.print

// -----

func.func @vector_print_scalable_vector(%arg0: vector<[4]xi32>) {
  vector.print %arg0 : vector<[4]xi32>
  return
}

// CHECK-LABEL: func.func @vector_print_scalable_vector(
// CHECK-SAME: %[[VEC:.*]]: vector<[4]xi32>) {
// CHECK: %[[C0:.*]] = arith.constant 0 : index
// CHECK: %[[C4:.*]] = arith.constant 4 : index
// CHECK: %[[C1:.*]] = arith.constant 1 : index
// CHECK: %[[VSCALE:.*]] = vector.vscale
// CHECK: %[[UPPER_BOUND:.*]] = arith.muli %[[VSCALE]], %[[C4]] : index
// CHECK: %[[LAST_INDEX:.*]] = arith.subi %[[UPPER_BOUND]], %[[C1]] : index
// CHECK: vector.print punctuation <open>
// CHECK: scf.for %[[IDX:.*]] = %[[C0]] to %[[UPPER_BOUND]] step %[[C1]] {
// CHECK: %[[EL:.*]] = vector.extractelement %[[VEC]]{{\[}}%[[IDX]] : index] : vector<[4]xi32>
// CHECK: vector.print %[[EL]] : i32 punctuation <no_punctuation>
// CHECK: %[[IS_NOT_LAST:.*]] = arith.cmpi ult, %[[IDX]], %[[LAST_INDEX]] : index
// CHECK: scf.if %[[IS_NOT_LAST]] {
// CHECK: vector.print punctuation <comma>
// CHECK: }
// CHECK: }
// CHECK: vector.print punctuation <close>
// CHECK: vector.print

// -----

func.func @transfer_read_array_of_scalable(%arg0: memref<3x?xf32>) -> vector<3x[4]xf32> {
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %cst = arith.constant 0.000000e+00 : f32
  %dim = memref.dim %arg0, %c1 : memref<3x?xf32>
  %mask = vector.create_mask %c1, %dim : vector<3x[4]xi1>
  %read = vector.transfer_read %arg0[%c0, %c0], %cst, %mask {in_bounds = [true, true]} : memref<3x?xf32>, vector<3x[4]xf32>
  return %read : vector<3x[4]xf32>
}

// CHECK-LABEL: func.func @transfer_read_array_of_scalable(
// CHECK-SAME: %[[ARG:.*]]: memref<3x?xf32>) -> vector<3x[4]xf32> {
// CHECK-DAG: %[[PADDING:.*]] = arith.constant 0.000000e+00 : f32
// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
// CHECK-DAG: %[[C3:.*]] = arith.constant 3 : index
// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
// CHECK: %[[ALLOCA_VEC:.*]] = memref.alloca() : memref<vector<3x[4]xf32>>
// CHECK: %[[ALLOCA_MASK:.*]] = memref.alloca() : memref<vector<3x[4]xi1>>
// CHECK: %[[DIM_SIZE:.*]] = memref.dim %[[ARG]], %[[C1]] : memref<3x?xf32>
// CHECK: %[[MASK:.*]] = vector.create_mask %[[C1]], %[[DIM_SIZE]] : vector<3x[4]xi1>
// CHECK: memref.store %[[MASK]], %[[ALLOCA_MASK]][] : memref<vector<3x[4]xi1>>
// CHECK: %[[UNPACK_VECTOR:.*]] = vector.type_cast %[[ALLOCA_VEC]] : memref<vector<3x[4]xf32>> to memref<3xvector<[4]xf32>>
// CHECK: %[[UNPACK_MASK:.*]] = vector.type_cast %[[ALLOCA_MASK]] : memref<vector<3x[4]xi1>> to memref<3xvector<[4]xi1>>
// CHECK: scf.for %[[VAL_11:.*]] = %[[C0]] to %[[C3]] step %[[C1]] {
// CHECK: %[[MASK_SLICE:.*]] = memref.load %[[UNPACK_MASK]]{{\[}}%[[VAL_11]]] : memref<3xvector<[4]xi1>>
// CHECK: %[[READ_SLICE:.*]] = vector.transfer_read %[[ARG]]{{\[}}%[[VAL_11]], %[[C0]]], %[[PADDING]], %[[MASK_SLICE]] {in_bounds = [true]} : memref<3x?xf32>, vector<[4]xf32>
// CHECK: memref.store %[[READ_SLICE]], %[[UNPACK_VECTOR]]{{\[}}%[[VAL_11]]] : memref<3xvector<[4]xf32>>
// CHECK: }
// CHECK: %[[RESULT:.*]] = memref.load %[[ALLOCA_VEC]][] : memref<vector<3x[4]xf32>>
// CHECK: return %[[RESULT]] : vector<3x[4]xf32>
// CHECK: }

// -----

func.func @transfer_write_array_of_scalable(%vec: vector<3x[4]xf32>, %arg0: memref<3x?xf32>) {
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %cst = arith.constant 0.000000e+00 : f32
  %dim = memref.dim %arg0, %c1 : memref<3x?xf32>
  %mask = vector.create_mask %c1, %dim : vector<3x[4]xi1>
  vector.transfer_write %vec, %arg0[%c0, %c0], %mask {in_bounds = [true, true]} : vector<3x[4]xf32>, memref<3x?xf32>
  return
}

// CHECK-LABEL: func.func @transfer_write_array_of_scalable(
// CHECK-SAME: %[[VEC:.*]]: vector<3x[4]xf32>,
// CHECK-SAME: %[[MEMREF:.*]]: memref<3x?xf32>) {
// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
// CHECK-DAG: %[[C3:.*]] = arith.constant 3 : index
// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
// CHECK: %[[ALLOCA_VEC:.*]] = memref.alloca() : memref<vector<3x[4]xf32>>
// CHECK: %[[ALLOCA_MASK:.*]] = memref.alloca() : memref<vector<3x[4]xi1>>
// CHECK: %[[DIM_SIZE:.*]] = memref.dim %[[MEMREF]], %[[C1]] : memref<3x?xf32>
// CHECK: %[[MASK:.*]] = vector.create_mask %[[C1]], %[[DIM_SIZE]] : vector<3x[4]xi1>
// CHECK: memref.store %[[MASK]], %[[ALLOCA_MASK]][] : memref<vector<3x[4]xi1>>
// CHECK: memref.store %[[VEC]], %[[ALLOCA_VEC]][] : memref<vector<3x[4]xf32>>
// CHECK: %[[UNPACK_VECTOR:.*]] = vector.type_cast %[[ALLOCA_VEC]] : memref<vector<3x[4]xf32>> to memref<3xvector<[4]xf32>>
// CHECK: %[[UNPACK_MASK:.*]] = vector.type_cast %[[ALLOCA_MASK]] : memref<vector<3x[4]xi1>> to memref<3xvector<[4]xi1>>
// CHECK: scf.for %[[VAL_11:.*]] = %[[C0]] to %[[C3]] step %[[C1]] {
// CHECK: %[[VECTOR_SLICE:.*]] = memref.load %[[UNPACK_VECTOR]]{{\[}}%[[VAL_11]]] : memref<3xvector<[4]xf32>>
// CHECK: %[[MASK_SLICE:.*]] = memref.load %[[UNPACK_MASK]]{{\[}}%[[VAL_11]]] : memref<3xvector<[4]xi1>>
// CHECK: vector.transfer_write %[[VECTOR_SLICE]], %[[MEMREF]]{{\[}}%[[VAL_11]], %[[C0]]], %[[MASK_SLICE]] {in_bounds = [true]} : vector<[4]xf32>, memref<3x?xf32>
// CHECK: }
// CHECK: return

// -----

/// The following two tests currently cannot be lowered by unpacking the
/// leading dim, because that dim is scalable.
/// It may be possible to special-case this via a dynamic dim in the future.
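///
/// A sketch of why (an editorial note, not upstream documentation): the
/// unpacking strategy used elsewhere in this file relies on vector.type_cast
/// producing a memref with a *static* leading size, e.g.
///   memref<vector<3x[4]xf32>> -> memref<3xvector<[4]xf32>>
/// With a leading scalable dim the unpacked size would be 4 x vscale, which
/// is only known at runtime, so no such static memref type exists.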

func.func @cannot_lower_transfer_write_with_leading_scalable(%vec: vector<[4]x4xf32>, %arg0: memref<?x4xf32>) {
  %c0 = arith.constant 0 : index
  %c4 = arith.constant 4 : index
  %cst = arith.constant 0.000000e+00 : f32
  %dim = memref.dim %arg0, %c0 : memref<?x4xf32>
  %mask = vector.create_mask %dim, %c4 : vector<[4]x4xi1>
  vector.transfer_write %vec, %arg0[%c0, %c0], %mask {in_bounds = [true, true]} : vector<[4]x4xf32>, memref<?x4xf32>
  return
}

// CHECK-LABEL: func.func @cannot_lower_transfer_write_with_leading_scalable(
// CHECK-SAME: %[[VEC:.*]]: vector<[4]x4xf32>,
// CHECK-SAME: %[[MEMREF:.*]]: memref<?x4xf32>)
// CHECK: vector.transfer_write %[[VEC]], %[[MEMREF]][%{{.*}}, %{{.*}}], %{{.*}} {in_bounds = [true, true]} : vector<[4]x4xf32>, memref<?x4xf32>

// -----

func.func @cannot_lower_transfer_read_with_leading_scalable(%arg0: memref<?x4xf32>) -> vector<[4]x4xf32> {
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %c4 = arith.constant 4 : index
  %cst = arith.constant 0.000000e+00 : f32
  %dim = memref.dim %arg0, %c0 : memref<?x4xf32>
  %mask = vector.create_mask %dim, %c4 : vector<[4]x4xi1>
  %read = vector.transfer_read %arg0[%c0, %c0], %cst, %mask {in_bounds = [true, true]} : memref<?x4xf32>, vector<[4]x4xf32>
  return %read : vector<[4]x4xf32>
}

// CHECK-LABEL: func.func @cannot_lower_transfer_read_with_leading_scalable(
// CHECK-SAME: %[[MEMREF:.*]]: memref<?x4xf32>)
// CHECK: %{{.*}} = vector.transfer_read %[[MEMREF]][%{{.*}}, %{{.*}}], %{{.*}}, %{{.*}} {in_bounds = [true, true]} : memref<?x4xf32>, vector<[4]x4xf32>

// -----

// Check that the `TransferOpConversion` generates valid indices for the LoadOp.

#map1 = affine_map<(d0, d1, d2, d3) -> (d0, 0, 0, d3)>
func.func @does_not_crash_on_unpack_one_dim(%subview: memref<1x1x1x1xi32>, %mask: vector<1x1xi1>) -> vector<1x1x1x1xi32> {
  %c0 = arith.constant 0 : index
  %c0_i32 = arith.constant 0 : i32
  %3 = vector.transfer_read %subview[%c0, %c0, %c0, %c0], %c0_i32, %mask {permutation_map = #map1}
    : memref<1x1x1x1xi32>, vector<1x1x1x1xi32>
  return %3 : vector<1x1x1x1xi32>
}

// CHECK-LABEL: func.func @does_not_crash_on_unpack_one_dim
// CHECK: %[[ALLOCA_0:.*]] = memref.alloca() : memref<vector<1x1xi1>>
// CHECK: %[[MASK:.*]] = vector.type_cast %[[ALLOCA_0]] : memref<vector<1x1xi1>> to memref<1xvector<1xi1>>
// CHECK: memref.load %[[MASK]][%{{.*}}] : memref<1xvector<1xi1>>

// -----

// Check that the `TransferOpConversion` generates valid indices for the StoreOp.
// This test is pulled from an integration test for ArmSVE.

func.func @add_arrays_of_scalable_vectors(%a: memref<1x2x?xf32>, %b: memref<1x2x?xf32>) -> vector<1x2x[4]xf32> {
  %c0 = arith.constant 0 : index
  %c2 = arith.constant 2 : index
  %cst = arith.constant 0.000000e+00 : f32
  %dim_a = memref.dim %a, %c2 : memref<1x2x?xf32>
  %mask_a = vector.create_mask %c2, %c2, %dim_a : vector<1x2x[4]xi1>
  %vector_a = vector.transfer_read %a[%c0, %c0, %c0], %cst, %mask_a {in_bounds = [true, true, true]} : memref<1x2x?xf32>, vector<1x2x[4]xf32>
  return %vector_a : vector<1x2x[4]xf32>
}

// CHECK-LABEL: func.func @add_arrays_of_scalable_vectors
// CHECK: memref.load

// -----

// FULL-UNROLL-LABEL: @cannot_fully_unroll_transfer_write_of_nd_scalable_vector
func.func @cannot_fully_unroll_transfer_write_of_nd_scalable_vector(%vec: vector<[4]x[4]xf32>, %memref: memref<?x?xf32>) {
  // FULL-UNROLL-NOT: vector.extract
  // FULL-UNROLL: vector.transfer_write {{.*}} : vector<[4]x[4]xf32>, memref<?x?xf32>
  // FULL-UNROLL-NOT: vector.extract
  %c0 = arith.constant 0 : index
  vector.transfer_write %vec, %memref[%c0, %c0] {in_bounds = [true, true]} : vector<[4]x[4]xf32>, memref<?x?xf32>
  return
}

// -----

// TARGET-RANK-ZERO-LABEL: func @unroll_transfer_write_target_rank_zero
func.func @unroll_transfer_write_target_rank_zero(%vec : vector<2xi32>) {
  %alloc = memref.alloc() : memref<4xi32>
  %c0 = arith.constant 0 : index
  vector.transfer_write %vec, %alloc[%c0] : vector<2xi32>, memref<4xi32>
  return
}

// TARGET-RANK-ZERO: %[[ALLOC:.*]] = memref.alloc() : memref<4xi32>
// TARGET-RANK-ZERO: %[[EXTRACTED1:.*]] = vector.extract {{.*}} : i32 from vector<2xi32>
// TARGET-RANK-ZERO: %[[BROADCASTED1:.*]] = vector.broadcast %[[EXTRACTED1]] : i32 to vector<i32>
// TARGET-RANK-ZERO: vector.transfer_write %[[BROADCASTED1]], %[[ALLOC]]{{.*}} : vector<i32>, memref<4xi32>
// TARGET-RANK-ZERO: %[[EXTRACTED2:.*]] = vector.extract {{.*}} : i32 from vector<2xi32>
// TARGET-RANK-ZERO: %[[BROADCASTED2:.*]] = vector.broadcast %[[EXTRACTED2]] : i32 to vector<i32>
// TARGET-RANK-ZERO: vector.transfer_write %[[BROADCASTED2]], %[[ALLOC]]{{.*}} : vector<i32>, memref<4xi32>

// -----

func.func @scalable_transpose_store_unmasked(%vec: vector<4x[4]xf32>, %dest: memref<?x?xf32>, %i: index, %j: index) {
  %transpose = vector.transpose %vec, [1, 0] : vector<4x[4]xf32> to vector<[4]x4xf32>
  vector.transfer_write %transpose, %dest[%i, %j] {in_bounds = [true, true]} : vector<[4]x4xf32>, memref<?x?xf32>
  return
}

// FULL-UNROLL: #[[$SLICE_MAP:.+]] = affine_map<(d0)[s0] -> (d0 + s0)>
// FULL-UNROLL-LABEL: func.func @scalable_transpose_store_unmasked(
// FULL-UNROLL-SAME: %[[VEC:.*]]: vector<4x[4]xf32>,
// FULL-UNROLL-SAME: %[[DEST:.*]]: memref<?x?xf32>,
// FULL-UNROLL-SAME: %[[I:.*]]: index,
// FULL-UNROLL-SAME: %[[J:.*]]: index)
// FULL-UNROLL-DAG: %[[C0:.*]] = arith.constant 0 : index
// FULL-UNROLL-DAG: %[[C1:.*]] = arith.constant 1 : index
// FULL-UNROLL-DAG: %[[C4:.*]] = arith.constant 4 : index
// FULL-UNROLL: %[[SLICE_0:.*]] = vector.extract %[[VEC]][0] : vector<[4]xf32> from vector<4x[4]xf32>
// FULL-UNROLL: %[[SLICE_1:.*]] = vector.extract %[[VEC]][1] : vector<[4]xf32> from vector<4x[4]xf32>
// FULL-UNROLL: %[[SLICE_2:.*]] = vector.extract %[[VEC]][2] : vector<[4]xf32> from vector<4x[4]xf32>
// FULL-UNROLL: %[[SLICE_3:.*]] = vector.extract %[[VEC]][3] : vector<[4]xf32> from vector<4x[4]xf32>
// FULL-UNROLL: %[[VSCALE:.*]] = vector.vscale
// FULL-UNROLL: %[[C4_VSCALE:.*]] = arith.muli %[[VSCALE]], %[[C4]] : index
// FULL-UNROLL: scf.for %[[VAL_13:.*]] = %[[C0]] to %[[C4_VSCALE]] step %[[C1]] {
// FULL-UNROLL: %[[SLICE_I:.*]] = affine.apply #[[$SLICE_MAP]](%[[VAL_13]]){{\[}}%[[I]]]
// FULL-UNROLL: %[[ELEM_0:.*]] = vector.extract %[[SLICE_0]]{{\[}}%[[VAL_13]]] : f32 from vector<[4]xf32>
// FULL-UNROLL: %[[ELEM_1:.*]] = vector.extract %[[SLICE_1]]{{\[}}%[[VAL_13]]] : f32 from vector<[4]xf32>
// FULL-UNROLL: %[[ELEM_2:.*]] = vector.extract %[[SLICE_2]]{{\[}}%[[VAL_13]]] : f32 from vector<[4]xf32>
// FULL-UNROLL: %[[ELEM_3:.*]] = vector.extract %[[SLICE_3]]{{\[}}%[[VAL_13]]] : f32 from vector<[4]xf32>
// FULL-UNROLL: %[[TRANSPOSE_SLICE:.*]] = vector.from_elements %[[ELEM_0]], %[[ELEM_1]], %[[ELEM_2]], %[[ELEM_3]] : vector<4xf32>
// FULL-UNROLL: vector.transfer_write %[[TRANSPOSE_SLICE]], %[[DEST]]{{\[}}%[[SLICE_I]], %[[J]]] {in_bounds = [true]} : vector<4xf32>, memref<?x?xf32>

// -----

func.func @scalable_transpose_store_dynamic_mask(%vec: vector<4x[4]xf32>, %dest: memref<?x?xf32>, %i: index, %j: index, %a: index, %b: index) {
  %transpose = vector.transpose %vec, [1, 0] : vector<4x[4]xf32> to vector<[4]x4xf32>
  %mask = vector.create_mask %a, %b : vector<[4]x4xi1>
  vector.transfer_write %transpose, %dest[%i, %j], %mask {in_bounds = [true, true]} : vector<[4]x4xf32>, memref<?x?xf32>
  return
}

// FULL-UNROLL-LABEL: func.func @scalable_transpose_store_dynamic_mask(
// FULL-UNROLL-SAME: %{{.*}}, %[[A:.*]]: index, %[[B:.*]]: index)
// FULL-UNROLL: %[[SLICE_MASK:.*]] = vector.create_mask %[[B]] : vector<4xi1>
// FULL-UNROLL: scf.for %{{.*}} to %[[A]]
// FULL-UNROLL: vector.transfer_write {{.*}}, %[[SLICE_MASK]]

// -----

func.func @scalable_transpose_store_constant_mask(%vec: vector<4x[4]xf32>, %dest: memref<?x?xf32>, %i: index, %j: index) {
  %transpose = vector.transpose %vec, [1, 0] : vector<4x[4]xf32> to vector<[4]x4xf32>
  %mask = vector.constant_mask [4, 3] : vector<[4]x4xi1>
  vector.transfer_write %transpose, %dest[%i, %j], %mask {in_bounds = [true, true]} : vector<[4]x4xf32>, memref<?x?xf32>
  return
}

// FULL-UNROLL-LABEL: func.func @scalable_transpose_store_constant_mask
// FULL-UNROLL: %[[C3:.*]] = arith.constant 3 : index
// FULL-UNROLL: %[[C4:.*]] = arith.constant 4 : index
// FULL-UNROLL: %[[VSCALE:.*]] = vector.vscale
// FULL-UNROLL: %[[C4_VSCALE:.*]] = arith.muli %[[VSCALE]], %[[C4]] : index
// FULL-UNROLL: %[[SLICE_MASK:.*]] = vector.create_mask %[[C3]] : vector<4xi1>
// FULL-UNROLL: scf.for %{{.*}} to %[[C4_VSCALE]]
// FULL-UNROLL: vector.transfer_write {{.*}}, %[[SLICE_MASK]]

// -----

/// Unsupported transpose.
func.func @negative_scalable_transpose_store_0(%vec: vector<[4]x4xf32>, %dest: memref<?x?xf32>, %i: index, %j: index) {
  %transpose = vector.transpose %vec, [1, 0] : vector<[4]x4xf32> to vector<4x[4]xf32>
  vector.transfer_write %transpose, %dest[%i, %j] {in_bounds = [true, true]} : vector<4x[4]xf32>, memref<?x?xf32>
  return
}

// FULL-UNROLL-LABEL: @negative_scalable_transpose_store_0
// FULL-UNROLL-NOT: scf.for

// -----

/// Non-identity permutation map (should be lowered first).
func.func @negative_scalable_transpose_store_1(%vec: vector<4x[4]xf32>, %dest: memref<?x?xf32>, %i: index, %j: index) {
  %transpose = vector.transpose %vec, [1, 0] : vector<4x[4]xf32> to vector<[4]x4xf32>
  vector.transfer_write %transpose, %dest[%i, %j] {in_bounds = [true, true], permutation_map = affine_map<(d0,d1) -> (d1, d0)> } : vector<[4]x4xf32>, memref<?x?xf32>
  return
}

// FULL-UNROLL-LABEL: @negative_scalable_transpose_store_1
// FULL-UNROLL-NOT: scf.for

// -----

/// Out-of-bounds dim.
func.func @negative_scalable_transpose_store_2(%vec: vector<4x[4]xf32>, %dest: memref<?x?xf32>, %i: index, %j: index) {
  %transpose = vector.transpose %vec, [1, 0] : vector<4x[4]xf32> to vector<[4]x4xf32>
  vector.transfer_write %transpose, %dest[%i, %j] {in_bounds = [false, true]} : vector<[4]x4xf32>, memref<?x?xf32>
  return
}

// FULL-UNROLL-LABEL: @negative_scalable_transpose_store_2
// FULL-UNROLL-NOT: scf.for

// -----

/// Source not a vector.transpose.
func.func @negative_scalable_transpose_store_3(%vec: vector<[4]x4xf32>, %dest: memref<?x?xf32>, %i: index, %j: index) {
  vector.transfer_write %vec, %dest[%i, %j] {in_bounds = [true, true]} : vector<[4]x4xf32>, memref<?x?xf32>
  return
}

// FULL-UNROLL-LABEL: @negative_scalable_transpose_store_3
// FULL-UNROLL-NOT: scf.for