// RUN: mlir-opt %s -pass-pipeline="builtin.module(func.func(convert-vector-to-scf))" -split-input-file -allow-unregistered-dialect | FileCheck %s
// RUN: mlir-opt %s -pass-pipeline="builtin.module(func.func(convert-vector-to-scf{full-unroll=true lower-scalable=true}))" -split-input-file -allow-unregistered-dialect | FileCheck %s --check-prefix=FULL-UNROLL
// RUN: mlir-opt %s "-convert-vector-to-scf=full-unroll target-rank=0" -split-input-file -allow-unregistered-dialect | FileCheck %s --check-prefix=TARGET-RANK-ZERO
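
// Editorial note on the RUN lines above: they exercise, in order, (1) the
// default progressive lowering (one vector dimension unpacked per step),
// (2) full unrolling with the scalable-vector lowering enabled, and (3) full
// unrolling all the way down to rank-0 (scalar-shaped) transfers.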

// CHECK-LABEL: func @vector_transfer_ops_0d(
func.func @vector_transfer_ops_0d(%M: memref<f32>) {
  %f0 = arith.constant 0.0 : f32

  // 0-d transfers are left untouched by vector-to-scf.
  // They are independently lowered to the proper memref.load/store.
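  //
  // As a rough sketch (an editorial illustration; the exact form depends on
  // that follow-up lowering), the two transfers eventually become:
  //   %v = memref.load %M[] : memref<f32>    // from the transfer_read
  //   memref.store %v, %M[] : memref<f32>    // from the transfer_write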
  // CHECK: vector.transfer_read {{.*}}: memref<f32>, vector<f32>
  %0 = vector.transfer_read %M[], %f0 {permutation_map = affine_map<()->()>} :
    memref<f32>, vector<f32>

  // CHECK: vector.transfer_write {{.*}}: vector<f32>, memref<f32>
  vector.transfer_write %0, %M[] {permutation_map = affine_map<()->()>} :
    vector<f32>, memref<f32>

  return
}

// -----

// CHECK-LABEL: func @materialize_read_1d() {
func.func @materialize_read_1d() {
  %f0 = arith.constant 0.0: f32
  %A = memref.alloc () : memref<7x42xf32>
  affine.for %i0 = 0 to 7 step 4 {
    affine.for %i1 = 0 to 42 step 4 {
      %f1 = vector.transfer_read %A[%i0, %i1], %f0 {permutation_map = affine_map<(d0, d1) -> (d0)>} : memref<7x42xf32>, vector<4xf32>
      %ip1 = affine.apply affine_map<(d0) -> (d0 + 1)> (%i1)
      %f2 = vector.transfer_read %A[%i0, %ip1], %f0 {permutation_map = affine_map<(d0, d1) -> (d0)>} : memref<7x42xf32>, vector<4xf32>
      %ip2 = affine.apply affine_map<(d0) -> (d0 + 2)> (%i1)
      %f3 = vector.transfer_read %A[%i0, %ip2], %f0 {permutation_map = affine_map<(d0, d1) -> (d0)>} : memref<7x42xf32>, vector<4xf32>
      %ip3 = affine.apply affine_map<(d0) -> (d0 + 3)> (%i1)
      %f4 = vector.transfer_read %A[%i0, %ip3], %f0 {permutation_map = affine_map<(d0, d1) -> (d0)>} : memref<7x42xf32>, vector<4xf32>
      // Both accesses in the load must be clipped, otherwise %i1 + 2 and %i1 + 3 will go out of bounds.
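      //
      // Roughly, each element access ends up guarded like this (an
      // illustrative sketch, not literal expected output):
      //   scf.if %index_is_in_bounds -> (vector<4xf32>) {
      //     %s = memref.load %A[...]
      //     %v = vector.insertelement %s, %acc[...]
      //     scf.yield %v : vector<4xf32>
      //   } else {
      //     scf.yield %acc : vector<4xf32>
      //   }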
      // CHECK: scf.if
      // CHECK-NEXT: memref.load
      // CHECK-NEXT: vector.insertelement
      // CHECK-NEXT: scf.yield
      // CHECK-NEXT: } else {
      // CHECK-NEXT: scf.yield
      // Add a dummy use to prevent dead code elimination from removing transfer
      // read ops.
      "dummy_use"(%f1, %f2, %f3, %f4) : (vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>) -> ()
    }
  }
  return
}

// -----

// CHECK-LABEL: func @materialize_read_1d_partially_specialized
func.func @materialize_read_1d_partially_specialized(%dyn1 : index, %dyn2 : index, %dyn4 : index) {
  %f0 = arith.constant 0.0: f32
  %A = memref.alloc (%dyn1, %dyn2, %dyn4) : memref<7x?x?x42x?xf32>
  affine.for %i0 = 0 to 7 {
    affine.for %i1 = 0 to %dyn1 {
      affine.for %i2 = 0 to %dyn2 {
        affine.for %i3 = 0 to 42 step 2 {
          affine.for %i4 = 0 to %dyn4 {
            %f1 = vector.transfer_read %A[%i0, %i1, %i2, %i3, %i4], %f0 {permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d3)>} : memref<7x?x?x42x?xf32>, vector<4xf32>
            %i3p1 = affine.apply affine_map<(d0) -> (d0 + 1)> (%i3)
            %f2 = vector.transfer_read %A[%i0, %i1, %i2, %i3p1, %i4], %f0 {permutation_map = affine_map<(d0, d1, d2, d3, d4) -> (d3)>} : memref<7x?x?x42x?xf32>, vector<4xf32>
            // Add a dummy use to prevent dead code elimination from removing
            // transfer read ops.
            "dummy_use"(%f1, %f2) : (vector<4xf32>, vector<4xf32>) -> ()
          }
        }
      }
    }
  }
  // CHECK: %[[tensor:[0-9a-zA-Z_]+]] = memref.alloc
  // CHECK-NOT: {{.*}} memref.dim %[[tensor]], %c0
  // CHECK-NOT: {{.*}} memref.dim %[[tensor]], %c3
  return
}

// -----

// CHECK: #[[$ADD:map.*]] = affine_map<(d0, d1) -> (d0 + d1)>

// CHECK-LABEL: func @materialize_read(%{{.*}}: index, %{{.*}}: index, %{{.*}}: index, %{{.*}}: index) {
func.func @materialize_read(%M: index, %N: index, %O: index, %P: index) {
  %f0 = arith.constant 0.0: f32
  // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
  // CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
  // CHECK-DAG: %[[C3:.*]] = arith.constant 3 : index
  // CHECK-DAG: %[[C4:.*]] = arith.constant 4 : index
  // CHECK-DAG: %[[C5:.*]] = arith.constant 5 : index
  // CHECK: %{{.*}} = memref.alloc(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : memref<?x?x?x?xf32>
  // CHECK-NEXT: affine.for %[[I0:.*]] = 0 to %{{.*}} step 3 {
  // CHECK-NEXT: affine.for %[[I1:.*]] = 0 to %{{.*}} {
  // CHECK-NEXT: affine.for %[[I2:.*]] = 0 to %{{.*}} {
  // CHECK-NEXT: affine.for %[[I3:.*]] = 0 to %{{.*}} step 5 {
  // CHECK: %[[ALLOC:.*]] = memref.alloca() : memref<vector<5x4x3xf32>>
  // CHECK: scf.for %[[I4:.*]] = %[[C0]] to %[[C5]] step %[[C1]] {
  // CHECK: %[[L3:.*]] = affine.apply #[[$ADD]](%[[I3]], %[[I4]])
  // CHECK: scf.for %[[I5:.*]] = %[[C0]] to %[[C4]] step %[[C1]] {
  // CHECK: %[[VEC:.*]] = scf.for %[[I6:.*]] = %[[C0]] to %[[C3]] step %[[C1]] {{.*}} -> (vector<3xf32>) {
  // CHECK: %[[L0:.*]] = affine.apply #[[$ADD]](%[[I0]], %[[I6]])
  // CHECK: scf.if {{.*}} -> (vector<3xf32>) {
  // CHECK-NEXT: %[[SCAL:.*]] = memref.load %{{.*}}[%[[L0]], %[[I1]], %[[I2]], %[[L3]]] : memref<?x?x?x?xf32>
  // CHECK-NEXT: %[[RVEC:.*]] = vector.insertelement %[[SCAL]], %{{.*}}[%[[I6]] : index] : vector<3xf32>
  // CHECK-NEXT: scf.yield
  // CHECK-NEXT: } else {
  // CHECK-NEXT: scf.yield
  // CHECK-NEXT: }
  // CHECK-NEXT: scf.yield
  // CHECK-NEXT: }
  // CHECK-NEXT: memref.store %[[VEC]], {{.*}} : memref<5x4xvector<3xf32>>
  // CHECK-NEXT: }
  // CHECK-NEXT: } else {
  // CHECK-NEXT: memref.store {{.*}} : memref<5xvector<4x3xf32>>
  // CHECK-NEXT: }
  // CHECK-NEXT: }
  // CHECK-NEXT: %[[LD:.*]] = memref.load %[[ALLOC]][] : memref<vector<5x4x3xf32>>
  // CHECK-NEXT: "dummy_use"(%[[LD]]) : (vector<5x4x3xf32>) -> ()
  // CHECK-NEXT: }
  // CHECK-NEXT: }
  // CHECK-NEXT: }
  // CHECK-NEXT: }
  // CHECK-NEXT: return
  // CHECK-NEXT: }
  // Check that I0 + I6 (of size 3) is read via the first memref index (L0 = I0 + I6)
  // and inserted into the result vector at lane I6.
  // Check that I3 + I4 (of size 5) is read via the last memref index (L3 = I3 + I4)
  // and stored via the first index of the temporary buffer (I4).
  // The other memref dimensions are simply accessed with I1 and I2, respectively.
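  // Schematically (a simplified sketch; the bounds checks and else-branches
  // are omitted), the lowered nest is:
  //   scf.for %I4 = 0 to 5 {            // L3 = I3 + I4 (last memref dim)
  //     scf.for %I5 = 0 to 4 {          // broadcast dim (the "0" in the map)
  //       scf.for %I6 = 0 to 3 {        // L0 = I0 + I6 (first memref dim)
  //         %s = memref.load %A[%L0, %I1, %I2, %L3] ...
  //       }
  //     }
  //   }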
  %A = memref.alloc (%M, %N, %O, %P) : memref<?x?x?x?xf32, 0>
  affine.for %i0 = 0 to %M step 3 {
    affine.for %i1 = 0 to %N {
      affine.for %i2 = 0 to %O {
        affine.for %i3 = 0 to %P step 5 {
          %f = vector.transfer_read %A[%i0, %i1, %i2, %i3], %f0 {permutation_map = affine_map<(d0, d1, d2, d3) -> (d3, 0, d0)>} : memref<?x?x?x?xf32>, vector<5x4x3xf32>
          // Add a dummy use to prevent dead code elimination from removing
          // transfer read ops.
          "dummy_use"(%f) : (vector<5x4x3xf32>) -> ()
        }
      }
    }
  }
  return
}

// -----

// CHECK: #[[$ADD:map.*]] = affine_map<(d0, d1) -> (d0 + d1)>

// CHECK-LABEL: func @materialize_write(%{{.*}}: index, %{{.*}}: index, %{{.*}}: index, %{{.*}}: index) {
func.func @materialize_write(%M: index, %N: index, %O: index, %P: index) {
  // CHECK-DAG: %{{.*}} = arith.constant dense<1.000000e+00> : vector<3x4x1x5xf32>
  // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
  // CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
  // CHECK-DAG: %[[C3:.*]] = arith.constant 3 : index
  // CHECK-DAG: %[[C4:.*]] = arith.constant 4 : index
  // CHECK: %{{.*}} = memref.alloc(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : memref<?x?x?x?xf32>
  // CHECK-NEXT: affine.for %[[I0:.*]] = 0 to %{{.*}} step 3 {
  // CHECK-NEXT: affine.for %[[I1:.*]] = 0 to %{{.*}} step 4 {
  // CHECK-NEXT: affine.for %[[I2:.*]] = 0 to %{{.*}} {
  // CHECK-NEXT: affine.for %[[I3:.*]] = 0 to %{{.*}} step 5 {
  // CHECK: %[[ALLOC:.*]] = memref.alloca() : memref<vector<3x4x1x5xf32>>
  // CHECK: memref.store %{{.*}}, %[[ALLOC]][] : memref<vector<3x4x1x5xf32>>
  // CHECK: %[[VECTOR_VIEW1:.*]] = vector.type_cast %[[ALLOC]] : memref<vector<3x4x1x5xf32>> to memref<3xvector<4x1x5xf32>>
  // CHECK: scf.for %[[I4:.*]] = %[[C0]] to %[[C3]] step %[[C1]] {
  // CHECK: %[[S3:.*]] = affine.apply #[[$ADD]](%[[I0]], %[[I4]])
  // CHECK: %[[VECTOR_VIEW2:.*]] = vector.type_cast %[[VECTOR_VIEW1]] : memref<3xvector<4x1x5xf32>> to memref<3x4xvector<1x5xf32>>
  // CHECK: scf.for %[[I5:.*]] = %[[C0]] to %[[C4]] step %[[C1]] {
  // CHECK: %[[S1:.*]] = affine.apply #[[$ADD]](%[[I1]], %[[I5]])
  // CHECK: %[[VECTOR_VIEW3:.*]] = vector.type_cast %[[VECTOR_VIEW2]] : memref<3x4xvector<1x5xf32>> to memref<3x4x1xvector<5xf32>>
  // CHECK: scf.for %[[I6:.*]] = %[[C0]] to %[[C1]] step %[[C1]] {
  // CHECK: %[[S0:.*]] = affine.apply #[[$ADD]](%[[I2]], %[[I6]])
  // CHECK: %[[VEC:.*]] = memref.load %[[VECTOR_VIEW3]][%[[I4]], %[[I5]], %[[I6]]] : memref<3x4x1xvector<5xf32>>
  // CHECK: vector.transfer_write %[[VEC]], %{{.*}}[%[[S3]], %[[S1]], %[[S0]], %[[I3]]] : vector<5xf32>, memref<?x?x?x?xf32>
  // CHECK: }
  // CHECK: }
  // CHECK: }
  // CHECK: }
  // CHECK: }
  // CHECK: }
  // CHECK: }
  // CHECK: return
  // CHECK: }
  // Check that I0 + I4 (of size 3) is read via the first index of the temporary
  // buffer (I4) and written via the first memref index (S3 = I0 + I4).
  // Check that I1 + I5 (of size 4) is read via the second index (I5) and written
  // via the second memref index (S1 = I1 + I5).
  // The third memref index is S0 = I2 + I6, where I6 iterates over the inserted
  // unit dim; the vector<5xf32> slices themselves are written along the last
  // memref dim at I3.
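  // Schematically (a simplified sketch, ignoring the vector.type_cast views
  // and the CHECK capture names):
  //   scf.for %I4 = 0 to 3 {            // S3 = I0 + I4
  //     scf.for %I5 = 0 to 4 {          // S1 = I1 + I5
  //       scf.for %I6 = 0 to 1 {        // S0 = I2 + I6 (unit dim)
  //         %v = memref.load %view[%I4, %I5, %I6] : memref<3x4x1xvector<5xf32>>
  //         vector.transfer_write %v, %A[%S3, %S1, %S0, %I3] ...
  //       }
  //     }
  //   }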
  %A = memref.alloc (%M, %N, %O, %P) : memref<?x?x?x?xf32, 0>
  %f1 = arith.constant dense<1.000000e+00> : vector<5x4x3xf32>
  affine.for %i0 = 0 to %M step 3 {
    affine.for %i1 = 0 to %N step 4 {
      affine.for %i2 = 0 to %O {
        affine.for %i3 = 0 to %P step 5 {
          vector.transfer_write %f1, %A[%i0, %i1, %i2, %i3] {permutation_map = affine_map<(d0, d1, d2, d3) -> (d3, d1, d0)>} : vector<5x4x3xf32>, memref<?x?x?x?xf32>
        }
      }
    }
  }
  return
}

// -----

// CHECK-DAG: #[[$MAP0:.*]] = affine_map<(d0)[s0] -> (d0 + s0)>

// FULL-UNROLL-DAG: #[[$MAP1:.*]] = affine_map<()[s0] -> (s0 + 1)>
// FULL-UNROLL-DAG: #[[$MAP2:.*]] = affine_map<()[s0] -> (s0 + 2)>

// CHECK-LABEL: transfer_read_progressive(
// CHECK-SAME: %[[A:[a-zA-Z0-9]+]]: memref<?x?xf32>,
// CHECK-SAME: %[[base:[a-zA-Z0-9]+]]: index

// FULL-UNROLL-LABEL: transfer_read_progressive(
// FULL-UNROLL-SAME: %[[A:[a-zA-Z0-9]+]]: memref<?x?xf32>,
// FULL-UNROLL-SAME: %[[base:[a-zA-Z0-9]+]]: index

func.func @transfer_read_progressive(%A : memref<?x?xf32>, %base: index) -> vector<3x15xf32> {
  %f7 = arith.constant 7.0: f32
  // CHECK-DAG: %[[C7:.*]] = arith.constant 7.000000e+00 : f32
  // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
  // CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
  // CHECK-DAG: %[[C3:.*]] = arith.constant 3 : index
  // CHECK-DAG: %[[splat:.*]] = arith.constant dense<7.000000e+00> : vector<15xf32>
  // CHECK-DAG: %[[alloc:.*]] = memref.alloca() : memref<vector<3x15xf32>>
  // CHECK: %[[alloc_casted:.*]] = vector.type_cast %[[alloc]] : memref<vector<3x15xf32>> to memref<3xvector<15xf32>>
  // CHECK: scf.for %[[I:.*]] = %[[C0]] to %[[C3]]
  // CHECK: %[[dim:.*]] = memref.dim %[[A]], %[[C0]] : memref<?x?xf32>
  // CHECK: %[[add:.*]] = affine.apply #[[$MAP0]](%[[I]])[%[[base]]]
  // CHECK: %[[cond1:.*]] = arith.cmpi sgt, %[[dim]], %[[add]] : index
  // CHECK: scf.if %[[cond1]] {
  // CHECK: %[[vec_1d:.*]] = vector.transfer_read %[[A]][%{{.*}}, %[[base]]], %[[C7]] : memref<?x?xf32>, vector<15xf32>
  // CHECK: memref.store %[[vec_1d]], %[[alloc_casted]][%[[I]]] : memref<3xvector<15xf32>>
  // CHECK: } else {
  // CHECK: memref.store %[[splat]], %[[alloc_casted]][%[[I]]] : memref<3xvector<15xf32>>
  // CHECK: }
  // CHECK: }
  // CHECK: %[[cst:.*]] = memref.load %[[alloc]][] : memref<vector<3x15xf32>>

  // FULL-UNROLL-DAG: %[[C7:.*]] = arith.constant 7.000000e+00 : f32
  // FULL-UNROLL-DAG: %[[VEC0:.*]] = arith.constant dense<7.000000e+00> : vector<3x15xf32>
  // FULL-UNROLL-DAG: %[[C0:.*]] = arith.constant 0 : index
  // FULL-UNROLL: %[[DIM:.*]] = memref.dim %[[A]], %[[C0]] : memref<?x?xf32>
  // FULL-UNROLL: cmpi sgt, %[[DIM]], %[[base]] : index
  // FULL-UNROLL: %[[VEC1:.*]] = scf.if %{{.*}} -> (vector<3x15xf32>) {
  // FULL-UNROLL: vector.transfer_read %[[A]][%[[base]], %[[base]]], %[[C7]] : memref<?x?xf32>, vector<15xf32>
  // FULL-UNROLL: vector.insert %{{.*}}, %[[VEC0]] [0] : vector<15xf32> into vector<3x15xf32>
  // FULL-UNROLL: scf.yield %{{.*}} : vector<3x15xf32>
  // FULL-UNROLL: } else {
  // FULL-UNROLL: scf.yield %{{.*}} : vector<3x15xf32>
  // FULL-UNROLL: }
  // FULL-UNROLL: affine.apply #[[$MAP1]]()[%[[base]]]
  // FULL-UNROLL: cmpi sgt, %{{.*}}, %{{.*}} : index
  // FULL-UNROLL: %[[VEC2:.*]] = scf.if %{{.*}} -> (vector<3x15xf32>) {
  // FULL-UNROLL: vector.transfer_read %[[A]][%{{.*}}, %[[base]]], %[[C7]] : memref<?x?xf32>, vector<15xf32>
  // FULL-UNROLL: vector.insert %{{.*}}, %[[VEC1]] [1] : vector<15xf32> into vector<3x15xf32>
  // FULL-UNROLL: scf.yield %{{.*}} : vector<3x15xf32>
  // FULL-UNROLL: } else {
  // FULL-UNROLL: scf.yield %{{.*}} : vector<3x15xf32>
  // FULL-UNROLL: }
  // FULL-UNROLL: affine.apply #[[$MAP2]]()[%[[base]]]
  // FULL-UNROLL: cmpi sgt, %{{.*}}, %{{.*}} : index
  // FULL-UNROLL: %[[VEC3:.*]] = scf.if %{{.*}} -> (vector<3x15xf32>) {
  // FULL-UNROLL: vector.transfer_read %[[A]][%{{.*}}, %[[base]]], %[[C7]] : memref<?x?xf32>, vector<15xf32>
  // FULL-UNROLL: vector.insert %{{.*}}, %[[VEC2]] [2] : vector<15xf32> into vector<3x15xf32>
  // FULL-UNROLL: scf.yield %{{.*}} : vector<3x15xf32>
  // FULL-UNROLL: } else {
  // FULL-UNROLL: scf.yield %{{.*}} : vector<3x15xf32>
  // FULL-UNROLL: }

  %f = vector.transfer_read %A[%base, %base], %f7 :
    memref<?x?xf32>, vector<3x15xf32>

  return %f: vector<3x15xf32>
}

// -----

// CHECK-DAG: #[[$MAP0:.*]] = affine_map<(d0)[s0] -> (d0 + s0)>

// FULL-UNROLL-DAG: #[[$MAP1:.*]] = affine_map<()[s0] -> (s0 + 1)>
// FULL-UNROLL-DAG: #[[$MAP2:.*]] = affine_map<()[s0] -> (s0 + 2)>

// CHECK-LABEL: transfer_write_progressive(
// CHECK-SAME: %[[A:[a-zA-Z0-9]+]]: memref<?x?xf32>,
// CHECK-SAME: %[[base:[a-zA-Z0-9]+]]: index,
// CHECK-SAME: %[[vec:[a-zA-Z0-9]+]]: vector<3x15xf32>
// FULL-UNROLL-LABEL: transfer_write_progressive(
// FULL-UNROLL-SAME: %[[A:[a-zA-Z0-9]+]]: memref<?x?xf32>,
// FULL-UNROLL-SAME: %[[base:[a-zA-Z0-9]+]]: index,
// FULL-UNROLL-SAME: %[[vec:[a-zA-Z0-9]+]]: vector<3x15xf32>
func.func @transfer_write_progressive(%A : memref<?x?xf32>, %base: index, %vec: vector<3x15xf32>) {
  // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
  // CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
  // CHECK-DAG: %[[C3:.*]] = arith.constant 3 : index
  // CHECK: %[[alloc:.*]] = memref.alloca() : memref<vector<3x15xf32>>
  // CHECK: memref.store %[[vec]], %[[alloc]][] : memref<vector<3x15xf32>>
  // CHECK: %[[vmemref:.*]] = vector.type_cast %[[alloc]] : memref<vector<3x15xf32>> to memref<3xvector<15xf32>>
  // CHECK: scf.for %[[I:.*]] = %[[C0]] to %[[C3]]
  // CHECK: %[[dim:.*]] = memref.dim %[[A]], %[[C0]] : memref<?x?xf32>
  // CHECK: %[[add:.*]] = affine.apply #[[$MAP0]](%[[I]])[%[[base]]]
  // CHECK: %[[cmp:.*]] = arith.cmpi sgt, %[[dim]], %[[add]] : index
  // CHECK: scf.if %[[cmp]] {
  // CHECK: %[[vec_1d:.*]] = memref.load %[[vmemref]][%[[I]]] : memref<3xvector<15xf32>>
  // CHECK: vector.transfer_write %[[vec_1d]], %[[A]][{{.*}}, %[[base]]] : vector<15xf32>, memref<?x?xf32>
  // CHECK: }
  // CHECK: }

  // FULL-UNROLL: %[[C0:.*]] = arith.constant 0 : index
  // FULL-UNROLL: %[[DIM:.*]] = memref.dim %[[A]], %[[C0]] : memref<?x?xf32>
  // FULL-UNROLL: %[[CMP0:.*]] = arith.cmpi sgt, %[[DIM]], %[[base]] : index
  // FULL-UNROLL: scf.if %[[CMP0]] {
  // FULL-UNROLL: %[[V0:.*]] = vector.extract %[[vec]][0] : vector<15xf32> from vector<3x15xf32>
  // FULL-UNROLL: vector.transfer_write %[[V0]], %[[A]][%[[base]], %[[base]]] : vector<15xf32>, memref<?x?xf32>
  // FULL-UNROLL: }
  // FULL-UNROLL: %[[I1:.*]] = affine.apply #[[$MAP1]]()[%[[base]]]
  // FULL-UNROLL: %[[CMP1:.*]] = arith.cmpi sgt, %{{.*}}, %[[I1]] : index
  // FULL-UNROLL: scf.if %[[CMP1]] {
  // FULL-UNROLL: %[[V1:.*]] = vector.extract %[[vec]][1] : vector<15xf32> from vector<3x15xf32>
  // FULL-UNROLL: vector.transfer_write %[[V1]], %[[A]][%{{.*}}, %[[base]]] : vector<15xf32>, memref<?x?xf32>
  // FULL-UNROLL: }
  // FULL-UNROLL: %[[I2:.*]] = affine.apply #[[$MAP2]]()[%[[base]]]
  // FULL-UNROLL: %[[CMP2:.*]] = arith.cmpi sgt, %{{.*}}, %[[I2]] : index
  // FULL-UNROLL: scf.if %[[CMP2]] {
  // FULL-UNROLL: %[[V2:.*]] = vector.extract %[[vec]][2] : vector<15xf32> from vector<3x15xf32>
  // FULL-UNROLL: vector.transfer_write %[[V2]], %[[A]][%{{.*}}, %[[base]]] : vector<15xf32>, memref<?x?xf32>
  // FULL-UNROLL: }

  vector.transfer_write %vec, %A[%base, %base] :
    vector<3x15xf32>, memref<?x?xf32>
  return
}

// -----

// CHECK-DAG: #[[$MAP0:.*]] = affine_map<(d0)[s0] -> (d0 + s0)>

// FULL-UNROLL-DAG: #[[$MAP1:.*]] = affine_map<()[s0] -> (s0 + 1)>
// FULL-UNROLL-DAG: #[[$MAP2:.*]] = affine_map<()[s0] -> (s0 + 2)>

// CHECK-LABEL: transfer_write_progressive_inbounds(
// CHECK-SAME: %[[A:[a-zA-Z0-9]+]]: memref<?x?xf32>,
// CHECK-SAME: %[[base:[a-zA-Z0-9]+]]: index,
// CHECK-SAME: %[[vec:[a-zA-Z0-9]+]]: vector<3x15xf32>
// FULL-UNROLL-LABEL: transfer_write_progressive_inbounds(
// FULL-UNROLL-SAME: %[[A:[a-zA-Z0-9]+]]: memref<?x?xf32>,
// FULL-UNROLL-SAME: %[[base:[a-zA-Z0-9]+]]: index,
// FULL-UNROLL-SAME: %[[vec:[a-zA-Z0-9]+]]: vector<3x15xf32>
func.func @transfer_write_progressive_inbounds(%A : memref<?x?xf32>, %base: index, %vec: vector<3x15xf32>) {
  // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
  // CHECK-DAG: %[[C3:.*]] = arith.constant 3 : index
  // CHECK: %[[alloc:.*]] = memref.alloca() : memref<vector<3x15xf32>>
  // CHECK-NEXT: memref.store %[[vec]], %[[alloc]][] : memref<vector<3x15xf32>>
  // CHECK-NEXT: %[[vmemref:.*]] = vector.type_cast %[[alloc]] : memref<vector<3x15xf32>> to memref<3xvector<15xf32>>
  // CHECK-NEXT: scf.for %[[I:.*]] = %[[C0]] to %[[C3]]
  // CHECK-NEXT: %[[add:.*]] = affine.apply #[[$MAP0]](%[[I]])[%[[base]]]
  // CHECK-NEXT: %[[vec_1d:.*]] = memref.load %[[vmemref]][%[[I]]] : memref<3xvector<15xf32>>
  // CHECK-NEXT: vector.transfer_write %[[vec_1d]], %[[A]][%[[add]], %[[base]]] {in_bounds = [true]} : vector<15xf32>, memref<?x?xf32>

  // FULL-UNROLL: %[[VEC0:.*]] = vector.extract %[[vec]][0] : vector<15xf32> from vector<3x15xf32>
  // FULL-UNROLL: vector.transfer_write %[[VEC0]], %[[A]][%[[base]], %[[base]]] {in_bounds = [true]} : vector<15xf32>, memref<?x?xf32>
  // FULL-UNROLL: %[[I1:.*]] = affine.apply #[[$MAP1]]()[%[[base]]]
  // FULL-UNROLL: %[[VEC1:.*]] = vector.extract %[[vec]][1] : vector<15xf32> from vector<3x15xf32>
  // FULL-UNROLL: vector.transfer_write %[[VEC1]], %[[A]][%[[I1]], %[[base]]] {in_bounds = [true]} : vector<15xf32>, memref<?x?xf32>
  // FULL-UNROLL: %[[I2:.*]] = affine.apply #[[$MAP2]]()[%[[base]]]
  // FULL-UNROLL: %[[VEC2:.*]] = vector.extract %[[vec]][2] : vector<15xf32> from vector<3x15xf32>
  // FULL-UNROLL: vector.transfer_write %[[VEC2]], %[[A]][%[[I2]], %[[base]]] {in_bounds = [true]} : vector<15xf32>, memref<?x?xf32>
  vector.transfer_write %vec, %A[%base, %base] {in_bounds = [true, true]} :
    vector<3x15xf32>, memref<?x?xf32>
  return
}

// -----

// FULL-UNROLL-LABEL: transfer_read_simple
func.func @transfer_read_simple(%A : memref<2x2xf32>) -> vector<2x2xf32> {
  %c0 = arith.constant 0 : index
  %f0 = arith.constant 0.0 : f32
  // FULL-UNROLL-DAG: %[[VC0:.*]] = arith.constant dense<0.000000e+00> : vector<2x2xf32>
  // FULL-UNROLL-DAG: %[[C0:.*]] = arith.constant 0 : index
  // FULL-UNROLL-DAG: %[[C1:.*]] = arith.constant 1 : index
  // FULL-UNROLL: %[[V0:.*]] = vector.transfer_read %{{.*}}[%[[C0]], %[[C0]]]
  // FULL-UNROLL: %[[RES0:.*]] = vector.insert %[[V0]], %[[VC0]] [0] : vector<2xf32> into vector<2x2xf32>
  // FULL-UNROLL: %[[V1:.*]] = vector.transfer_read %{{.*}}[%[[C1]], %[[C0]]]
  // FULL-UNROLL: %[[RES1:.*]] = vector.insert %[[V1]], %[[RES0]] [1] : vector<2xf32> into vector<2x2xf32>
  %0 = vector.transfer_read %A[%c0, %c0], %f0 : memref<2x2xf32>, vector<2x2xf32>
  return %0 : vector<2x2xf32>
}

func.func @transfer_read_minor_identity(%A : memref<?x?x?x?xf32>) -> vector<3x3xf32> {
  %c0 = arith.constant 0 : index
  %f0 = arith.constant 0.0 : f32
  %0 = vector.transfer_read %A[%c0, %c0, %c0, %c0], %f0
    { permutation_map = affine_map<(d0, d1, d2, d3) -> (d2, d3)> }
    : memref<?x?x?x?xf32>, vector<3x3xf32>
  return %0 : vector<3x3xf32>
}

// CHECK-LABEL: transfer_read_minor_identity(
// CHECK-SAME: %[[A:.*]]: memref<?x?x?x?xf32>) -> vector<3x3xf32>
// CHECK-DAG: %[[c0:.*]] = arith.constant 0 : index
// CHECK-DAG: %[[c1:.*]] = arith.constant 1 : index
// CHECK-DAG: %[[c2:.*]] = arith.constant 2 : index
// CHECK-DAG: %[[c3:.*]] = arith.constant 3 : index
// CHECK-DAG: %[[f0:.*]] = arith.constant 0.000000e+00 : f32
// CHECK-DAG: %[[cst0:.*]] = arith.constant dense<0.000000e+00> : vector<3xf32>
// CHECK: %[[m:.*]] = memref.alloca() : memref<vector<3x3xf32>>
// CHECK: %[[cast:.*]] = vector.type_cast %[[m]] : memref<vector<3x3xf32>> to memref<3xvector<3xf32>>
// CHECK: scf.for %[[arg1:.*]] = %[[c0]] to %[[c3]]
// CHECK: %[[d:.*]] = memref.dim %[[A]], %[[c2]] : memref<?x?x?x?xf32>
// CHECK: %[[cmp:.*]] = arith.cmpi sgt, %[[d]], %[[arg1]] : index
// CHECK: scf.if %[[cmp]] {
// CHECK: %[[tr:.*]] = vector.transfer_read %[[A]][%c0, %c0, %[[arg1]], %c0], %[[f0]] : memref<?x?x?x?xf32>, vector<3xf32>
// CHECK: memref.store %[[tr]], %[[cast]][%[[arg1]]] : memref<3xvector<3xf32>>
// CHECK: } else {
// CHECK: memref.store %[[cst0]], %[[cast]][%[[arg1]]] : memref<3xvector<3xf32>>
// CHECK: }
// CHECK: }
// CHECK: %[[ret:.*]] = memref.load %[[m]][] : memref<vector<3x3xf32>>
// CHECK: return %[[ret]] : vector<3x3xf32>

func.func @transfer_write_minor_identity(%A : vector<3x3xf32>, %B : memref<?x?x?x?xf32>) {
  %c0 = arith.constant 0 : index
  %f0 = arith.constant 0.0 : f32
  vector.transfer_write %A, %B[%c0, %c0, %c0, %c0]
    { permutation_map = affine_map<(d0, d1, d2, d3) -> (d2, d3)> }
    : vector<3x3xf32>, memref<?x?x?x?xf32>
  return
}

// CHECK-LABEL: transfer_write_minor_identity(
// CHECK-SAME: %[[A:.*]]: vector<3x3xf32>,
// CHECK-SAME: %[[B:.*]]: memref<?x?x?x?xf32>)
// CHECK-DAG: %[[c0:.*]] = arith.constant 0 : index
// CHECK-DAG: %[[c1:.*]] = arith.constant 1 : index
// CHECK-DAG: %[[c2:.*]] = arith.constant 2 : index
// CHECK-DAG: %[[c3:.*]] = arith.constant 3 : index
// CHECK: %[[m:.*]] = memref.alloca() : memref<vector<3x3xf32>>
// CHECK: memref.store %[[A]], %[[m]][] : memref<vector<3x3xf32>>
// CHECK: %[[cast:.*]] = vector.type_cast %[[m]] : memref<vector<3x3xf32>> to memref<3xvector<3xf32>>
// CHECK: scf.for %[[arg2:.*]] = %[[c0]] to %[[c3]]
// CHECK: %[[d:.*]] = memref.dim %[[B]], %[[c2]] : memref<?x?x?x?xf32>
// CHECK: %[[cmp:.*]] = arith.cmpi sgt, %[[d]], %[[arg2]] : index
// CHECK: scf.if %[[cmp]] {
// CHECK: %[[tmp:.*]] = memref.load %[[cast]][%[[arg2]]] : memref<3xvector<3xf32>>
// CHECK: vector.transfer_write %[[tmp]], %[[B]][%[[c0]], %[[c0]], %[[arg2]], %[[c0]]] : vector<3xf32>, memref<?x?x?x?xf32>
// CHECK: }
// CHECK: }
// CHECK: return

// -----

func.func @transfer_read_strided(%A : memref<8x4xf32, affine_map<(d0, d1) -> (d0 + d1 * 8)>>) -> vector<4xf32> {
  %c0 = arith.constant 0 : index
  %f0 = arith.constant 0.0 : f32
  %0 = vector.transfer_read %A[%c0, %c0], %f0
    : memref<8x4xf32, affine_map<(d0, d1) -> (d0 + d1 * 8)>>, vector<4xf32>
  return %0 : vector<4xf32>
}

// CHECK-LABEL: transfer_read_strided(
// CHECK: scf.for
// CHECK: memref.load

func.func @transfer_write_strided(%A : vector<4xf32>, %B : memref<8x4xf32, affine_map<(d0, d1) -> (d0 + d1 * 8)>>) {
  %c0 = arith.constant 0 : index
  vector.transfer_write %A, %B[%c0, %c0] :
    vector<4xf32>, memref<8x4xf32, affine_map<(d0, d1) -> (d0 + d1 * 8)>>
  return
}

// CHECK-LABEL: transfer_write_strided(
// CHECK: scf.for
// CHECK: memref.store

// -----

func.func private @fake_side_effecting_fun(%0: vector<2x2xf32>) -> ()

// CHECK-LABEL: transfer_read_within_async_execute
func.func @transfer_read_within_async_execute(%A : memref<2x2xf32>) -> !async.token {
  %c0 = arith.constant 0 : index
  %f0 = arith.constant 0.0 : f32

  // CHECK: async.execute
  // CHECK: scf.for
  %token = async.execute {
    %0 = vector.transfer_read %A[%c0, %c0], %f0 : memref<2x2xf32>, vector<2x2xf32>
    func.call @fake_side_effecting_fun(%0) : (vector<2x2xf32>) -> ()
    async.yield
  }
  return %token : !async.token
}

// -----

// CHECK-LABEL: transfer_read_with_tensor
func.func @transfer_read_with_tensor(%arg: tensor<f32>) -> vector<1xf32> {
  // CHECK: %[[EXTRACTED:.*]] = vector.transfer_read %{{.*}}[], %{{.*}} : tensor<f32>, vector<f32>
  // CHECK-NEXT: %[[RESULT:.*]] = vector.broadcast %[[EXTRACTED]] : vector<f32> to vector<1xf32>
  // CHECK-NEXT: return %[[RESULT]] : vector<1xf32>
  %f0 = arith.constant 0.0 : f32
  %0 = vector.transfer_read %arg[], %f0 {permutation_map = affine_map<()->(0)>} :
    tensor<f32>, vector<1xf32>
  return %0: vector<1xf32>
}

// -----

// CHECK-LABEL: transfer_write_scalable
func.func @transfer_write_scalable(%arg0: memref<?xf32, strided<[?], offset: ?>>, %arg1: f32) {
  %0 = llvm.mlir.constant(0 : i32) : i32
  %c0 = arith.constant 0 : index
  %dim = memref.dim %arg0, %c0 : memref<?xf32, strided<[?], offset: ?>>
  %1 = llvm.intr.stepvector : vector<[16]xi32>
  %2 = arith.index_cast %dim : index to i32
  %3 = llvm.mlir.undef : vector<[16]xi32>
  %4 = llvm.insertelement %2, %3[%0 : i32] : vector<[16]xi32>
  %5 = llvm.shufflevector %4, %3 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] : vector<[16]xi32>
  %6 = arith.cmpi slt, %1, %5 : vector<[16]xi32>
  %7 = llvm.mlir.undef : vector<[16]xf32>
  %8 = llvm.insertelement %arg1, %7[%0 : i32] : vector<[16]xf32>
  %9 = llvm.shufflevector %8, %7 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] : vector<[16]xf32>
  vector.transfer_write %9, %arg0[%c0], %6 {in_bounds = [true]} : vector<[16]xf32>, memref<?xf32, strided<[?], offset: ?>>
  return
}

// CHECK-SAME: %[[ARG_0:.*]]: memref<?xf32, strided<[?], offset: ?>>,
// CHECK-DAG: %[[C_0:.*]] = arith.constant 0 : index
// CHECK-DAG: %[[C_16:.*]] = arith.constant 16 : index
// CHECK-DAG: %[[STEP:.*]] = arith.constant 1 : index
// CHECK: %[[MASK_VEC:.*]] = arith.cmpi slt, %{{.*}}, %{{.*}} : vector<[16]xi32>
// CHECK: %[[VSCALE:.*]] = vector.vscale
// CHECK: %[[UB:.*]] = arith.muli %[[VSCALE]], %[[C_16]] : index
// CHECK: scf.for %[[IDX:.*]] = %[[C_0]] to %[[UB]] step %[[STEP]] {
// CHECK: %[[MASK_VAL:.*]] = vector.extractelement %[[MASK_VEC]][%[[IDX]] : index] : vector<[16]xi1>
// CHECK: scf.if %[[MASK_VAL]] {
// CHECK: %[[VAL_TO_STORE:.*]] = vector.extractelement %{{.*}}[%[[IDX]] : index] : vector<[16]xf32>
// CHECK: memref.store %[[VAL_TO_STORE]], %[[ARG_0]][%[[IDX]]] : memref<?xf32, strided<[?], offset: ?>>
// CHECK: }
// CHECK: }

// -----

func.func @vector_print_vector_0d(%arg0: vector<f32>) {
  vector.print %arg0 : vector<f32>
  return
}

// CHECK-LABEL: func.func @vector_print_vector_0d(
// CHECK-SAME: %[[VEC:.*]]: vector<f32>) {
// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
// CHECK: %[[FLAT_VEC:.*]] = vector.shape_cast %[[VEC]] : vector<f32> to vector<1xf32>
// CHECK: vector.print punctuation <open>
// CHECK: scf.for %[[IDX:.*]] = %[[C0]] to %[[C1]] step %[[C1]] {
// CHECK: %[[EL:.*]] = vector.extractelement %[[FLAT_VEC]]{{\[}}%[[IDX]] : index] : vector<1xf32>
// CHECK: vector.print %[[EL]] : f32 punctuation <no_punctuation>
// CHECK: %[[IS_NOT_LAST:.*]] = arith.cmpi ult, %[[IDX]], %[[C0]] : index
// CHECK: scf.if %[[IS_NOT_LAST]] {
// CHECK: vector.print punctuation <comma>
// CHECK: }
// CHECK: }
// CHECK: vector.print punctuation <close>
// CHECK: vector.print

// -----

func.func @vector_print_vector(%arg0: vector<2x2xf32>) {
  vector.print %arg0 : vector<2x2xf32>
  return
}

// CHECK-LABEL: func.func @vector_print_vector(
// CHECK-SAME: %[[VEC:.*]]: vector<2x2xf32>) {
// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index
// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
// CHECK: %[[FLAT_VEC:.*]] = vector.shape_cast %[[VEC]] : vector<2x2xf32> to vector<4xf32>
// CHECK: vector.print punctuation <open>
// CHECK: scf.for %[[I:.*]] = %[[C0]] to %[[C2]] step %[[C1]] {
// CHECK: vector.print punctuation <open>
// CHECK: scf.for %[[J:.*]] = %[[C0]] to %[[C2]] step %[[C1]] {
// CHECK: %[[OUTER_INDEX:.*]] = arith.muli %[[I]], %[[C2]] : index
// CHECK: %[[FLAT_INDEX:.*]] = arith.addi %[[J]], %[[OUTER_INDEX]] : index
// CHECK: %[[EL:.*]] = vector.extractelement %[[FLAT_VEC]]{{\[}}%[[FLAT_INDEX]] : index] : vector<4xf32>
// CHECK: vector.print %[[EL]] : f32 punctuation <no_punctuation>
// CHECK: %[[IS_NOT_LAST_J:.*]] = arith.cmpi ult, %[[J]], %[[C1]] : index
// CHECK: scf.if %[[IS_NOT_LAST_J]] {
// CHECK: vector.print punctuation <comma>
// CHECK: }
// CHECK: }
// CHECK: vector.print punctuation <close>
// CHECK: %[[IS_NOT_LAST_I:.*]] = arith.cmpi ult, %[[I]], %[[C1]] : index
// CHECK: scf.if %[[IS_NOT_LAST_I]] {
// CHECK: vector.print punctuation <comma>
// CHECK: }
// CHECK: }
// CHECK: vector.print punctuation <close>
// CHECK: vector.print

// -----

func.func @vector_print_scalable_vector(%arg0: vector<[4]xi32>) {
  vector.print %arg0 : vector<[4]xi32>
  return
}

// CHECK-LABEL: func.func @vector_print_scalable_vector(
// CHECK-SAME: %[[VEC:.*]]: vector<[4]xi32>) {
// CHECK: %[[C0:.*]] = arith.constant 0 : index
// CHECK: %[[C4:.*]] = arith.constant 4 : index
// CHECK: %[[C1:.*]] = arith.constant 1 : index
// CHECK: %[[VSCALE:.*]] = vector.vscale
// CHECK: %[[UPPER_BOUND:.*]] = arith.muli %[[VSCALE]], %[[C4]] : index
// CHECK: %[[LAST_INDEX:.*]] = arith.subi %[[UPPER_BOUND]], %[[C1]] : index
// CHECK: vector.print punctuation <open>
// CHECK: scf.for %[[IDX:.*]] = %[[C0]] to %[[UPPER_BOUND]] step %[[C1]] {
// CHECK: %[[EL:.*]] = vector.extractelement %[[VEC]]{{\[}}%[[IDX]] : index] : vector<[4]xi32>
// CHECK: vector.print %[[EL]] : i32 punctuation <no_punctuation>
// CHECK: %[[IS_NOT_LAST:.*]] = arith.cmpi ult, %[[IDX]], %[[LAST_INDEX]] : index
// CHECK: scf.if %[[IS_NOT_LAST]] {
// CHECK: vector.print punctuation <comma>
// CHECK: }
// CHECK: }
// CHECK: vector.print punctuation <close>
// CHECK: vector.print

// -----

func.func @transfer_read_array_of_scalable(%arg0: memref<3x?xf32>) -> vector<3x[4]xf32> {
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %cst = arith.constant 0.000000e+00 : f32
  %dim = memref.dim %arg0, %c1 : memref<3x?xf32>
  %mask = vector.create_mask %c1, %dim : vector<3x[4]xi1>
  %read = vector.transfer_read %arg0[%c0, %c0], %cst, %mask {in_bounds = [true, true]} : memref<3x?xf32>, vector<3x[4]xf32>
  return %read : vector<3x[4]xf32>
}

// CHECK-LABEL: func.func @transfer_read_array_of_scalable(
// CHECK-SAME: %[[ARG:.*]]: memref<3x?xf32>) -> vector<3x[4]xf32> {
// CHECK-DAG: %[[PADDING:.*]] = arith.constant 0.000000e+00 : f32
// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
// CHECK-DAG: %[[C3:.*]] = arith.constant 3 : index
// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
// CHECK: %[[ALLOCA_VEC:.*]] = memref.alloca() : memref<vector<3x[4]xf32>>
// CHECK: %[[ALLOCA_MASK:.*]] = memref.alloca() : memref<vector<3x[4]xi1>>
// CHECK: %[[DIM_SIZE:.*]] = memref.dim %[[ARG]], %[[C1]] : memref<3x?xf32>
// CHECK: %[[MASK:.*]] = vector.create_mask %[[C1]], %[[DIM_SIZE]] : vector<3x[4]xi1>
// CHECK: memref.store %[[MASK]], %[[ALLOCA_MASK]][] : memref<vector<3x[4]xi1>>
// CHECK: %[[UNPACK_VECTOR:.*]] = vector.type_cast %[[ALLOCA_VEC]] : memref<vector<3x[4]xf32>> to memref<3xvector<[4]xf32>>
// CHECK: %[[UNPACK_MASK:.*]] = vector.type_cast %[[ALLOCA_MASK]] : memref<vector<3x[4]xi1>> to memref<3xvector<[4]xi1>>
// CHECK: scf.for %[[VAL_11:.*]] = %[[C0]] to %[[C3]] step %[[C1]] {
// CHECK: %[[MASK_SLICE:.*]] = memref.load %[[UNPACK_MASK]]{{\[}}%[[VAL_11]]] : memref<3xvector<[4]xi1>>
// CHECK: %[[READ_SLICE:.*]] = vector.transfer_read %[[ARG]]{{\[}}%[[VAL_11]], %[[C0]]], %[[PADDING]], %[[MASK_SLICE]] {in_bounds = [true]} : memref<3x?xf32>, vector<[4]xf32>
// CHECK: memref.store %[[READ_SLICE]], %[[UNPACK_VECTOR]]{{\[}}%[[VAL_11]]] : memref<3xvector<[4]xf32>>
// CHECK: }
// CHECK: %[[RESULT:.*]] = memref.load %[[ALLOCA_VEC]][] : memref<vector<3x[4]xf32>>
// CHECK: return %[[RESULT]] : vector<3x[4]xf32>
// CHECK: }

// -----

func.func @transfer_write_array_of_scalable(%vec: vector<3x[4]xf32>, %arg0: memref<3x?xf32>) {
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %cst = arith.constant 0.000000e+00 : f32
  %dim = memref.dim %arg0, %c1 : memref<3x?xf32>
  %mask = vector.create_mask %c1, %dim : vector<3x[4]xi1>
  vector.transfer_write %vec, %arg0[%c0, %c0], %mask {in_bounds = [true, true]} : vector<3x[4]xf32>, memref<3x?xf32>
  return
}

// CHECK-LABEL: func.func @transfer_write_array_of_scalable(
// CHECK-SAME: %[[VEC:.*]]: vector<3x[4]xf32>,
// CHECK-SAME: %[[MEMREF:.*]]: memref<3x?xf32>) {
// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
// CHECK-DAG: %[[C3:.*]] = arith.constant 3 : index
// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
// CHECK: %[[ALLOCA_VEC:.*]] = memref.alloca() : memref<vector<3x[4]xf32>>
// CHECK: %[[ALLOCA_MASK:.*]] = memref.alloca() : memref<vector<3x[4]xi1>>
// CHECK: %[[DIM_SIZE:.*]] = memref.dim %[[MEMREF]], %[[C1]] : memref<3x?xf32>
// CHECK: %[[MASK:.*]] = vector.create_mask %[[C1]], %[[DIM_SIZE]] : vector<3x[4]xi1>
// CHECK: memref.store %[[MASK]], %[[ALLOCA_MASK]][] : memref<vector<3x[4]xi1>>
// CHECK: memref.store %[[VEC]], %[[ALLOCA_VEC]][] : memref<vector<3x[4]xf32>>
// CHECK: %[[UNPACK_VECTOR:.*]] = vector.type_cast %[[ALLOCA_VEC]] : memref<vector<3x[4]xf32>> to memref<3xvector<[4]xf32>>
// CHECK: %[[UNPACK_MASK:.*]] = vector.type_cast %[[ALLOCA_MASK]] : memref<vector<3x[4]xi1>> to memref<3xvector<[4]xi1>>
// CHECK: scf.for %[[VAL_11:.*]] = %[[C0]] to %[[C3]] step %[[C1]] {
// CHECK: %[[VECTOR_SLICE:.*]] = memref.load %[[UNPACK_VECTOR]]{{\[}}%[[VAL_11]]] : memref<3xvector<[4]xf32>>
// CHECK: %[[MASK_SLICE:.*]] = memref.load %[[UNPACK_MASK]]{{\[}}%[[VAL_11]]] : memref<3xvector<[4]xi1>>
// CHECK: vector.transfer_write %[[VECTOR_SLICE]], %[[MEMREF]]{{\[}}%[[VAL_11]], %[[C0]]], %[[MASK_SLICE]] {in_bounds = [true]} : vector<[4]xf32>, memref<3x?xf32>
// CHECK: }
// CHECK: return

// -----

/// The following two tests currently cannot be lowered by unpacking the
/// leading dim, because that dim is scalable.
/// It may be possible to special-case this via a dynamic dim in the future.
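///
/// A sketch of why (an editorial note, not upstream documentation): the
/// unpacking strategy used elsewhere in this file relies on vector.type_cast
/// producing a memref with a *static* leading size, e.g.
///   memref<vector<3x[4]xf32>> -> memref<3xvector<[4]xf32>>
/// With a leading scalable dim the unpacked size would be 4 x vscale, which
/// is only known at runtime, so no such static memref type exists.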

func.func @cannot_lower_transfer_write_with_leading_scalable(%vec: vector<[4]x4xf32>, %arg0: memref<?x4xf32>) {
  %c0 = arith.constant 0 : index
  %c4 = arith.constant 4 : index
  %cst = arith.constant 0.000000e+00 : f32
  %dim = memref.dim %arg0, %c0 : memref<?x4xf32>
  %mask = vector.create_mask %dim, %c4 : vector<[4]x4xi1>
  vector.transfer_write %vec, %arg0[%c0, %c0], %mask {in_bounds = [true, true]} : vector<[4]x4xf32>, memref<?x4xf32>
  return
}

// CHECK-LABEL: func.func @cannot_lower_transfer_write_with_leading_scalable(
// CHECK-SAME: %[[VEC:.*]]: vector<[4]x4xf32>,
// CHECK-SAME: %[[MEMREF:.*]]: memref<?x4xf32>)
// CHECK: vector.transfer_write %[[VEC]], %[[MEMREF]][%{{.*}}, %{{.*}}], %{{.*}} {in_bounds = [true, true]} : vector<[4]x4xf32>, memref<?x4xf32>

// -----

func.func @cannot_lower_transfer_read_with_leading_scalable(%arg0: memref<?x4xf32>) -> vector<[4]x4xf32> {
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %c4 = arith.constant 4 : index
  %cst = arith.constant 0.000000e+00 : f32
  %dim = memref.dim %arg0, %c0 : memref<?x4xf32>
  %mask = vector.create_mask %dim, %c4 : vector<[4]x4xi1>
  %read = vector.transfer_read %arg0[%c0, %c0], %cst, %mask {in_bounds = [true, true]} : memref<?x4xf32>, vector<[4]x4xf32>
  return %read : vector<[4]x4xf32>
}

// CHECK-LABEL: func.func @cannot_lower_transfer_read_with_leading_scalable(
// CHECK-SAME: %[[MEMREF:.*]]: memref<?x4xf32>)
// CHECK: %{{.*}} = vector.transfer_read %[[MEMREF]][%{{.*}}, %{{.*}}], %{{.*}}, %{{.*}} {in_bounds = [true, true]} : memref<?x4xf32>, vector<[4]x4xf32>

// -----

// Check that the `TransferOpConversion` generates valid indices for the LoadOp.

#map1 = affine_map<(d0, d1, d2, d3) -> (d0, 0, 0, d3)>
func.func @does_not_crash_on_unpack_one_dim(%subview: memref<1x1x1x1xi32>, %mask: vector<1x1xi1>) -> vector<1x1x1x1xi32> {
  %c0 = arith.constant 0 : index
  %c0_i32 = arith.constant 0 : i32
  %3 = vector.transfer_read %subview[%c0, %c0, %c0, %c0], %c0_i32, %mask {permutation_map = #map1}
    : memref<1x1x1x1xi32>, vector<1x1x1x1xi32>
  return %3 : vector<1x1x1x1xi32>
}

// CHECK-LABEL: func.func @does_not_crash_on_unpack_one_dim
// CHECK: %[[ALLOCA_0:.*]] = memref.alloca() : memref<vector<1x1xi1>>
// CHECK: %[[MASK:.*]] = vector.type_cast %[[ALLOCA_0]] : memref<vector<1x1xi1>> to memref<1xvector<1xi1>>
// CHECK: memref.load %[[MASK]][%{{.*}}] : memref<1xvector<1xi1>>

// -----

// Check that the `TransferOpConversion` generates valid indices for the StoreOp.
// This test is pulled from an integration test for ArmSVE.

func.func @add_arrays_of_scalable_vectors(%a: memref<1x2x?xf32>, %b: memref<1x2x?xf32>) -> vector<1x2x[4]xf32> {
  %c0 = arith.constant 0 : index
  %c2 = arith.constant 2 : index
  %cst = arith.constant 0.000000e+00 : f32
  %dim_a = memref.dim %a, %c2 : memref<1x2x?xf32>
  %mask_a = vector.create_mask %c2, %c2, %dim_a : vector<1x2x[4]xi1>
  %vector_a = vector.transfer_read %a[%c0, %c0, %c0], %cst, %mask_a {in_bounds = [true, true, true]} : memref<1x2x?xf32>, vector<1x2x[4]xf32>
  return %vector_a : vector<1x2x[4]xf32>
}

// CHECK-LABEL: func.func @add_arrays_of_scalable_vectors
// CHECK: memref.load

// -----

// FULL-UNROLL-LABEL: @cannot_fully_unroll_transfer_write_of_nd_scalable_vector
func.func @cannot_fully_unroll_transfer_write_of_nd_scalable_vector(%vec: vector<[4]x[4]xf32>, %memref: memref<?x?xf32>) {
  // FULL-UNROLL-NOT: vector.extract
  // FULL-UNROLL: vector.transfer_write {{.*}} : vector<[4]x[4]xf32>, memref<?x?xf32>
  // FULL-UNROLL-NOT: vector.extract
  %c0 = arith.constant 0 : index
  vector.transfer_write %vec, %memref[%c0, %c0] {in_bounds = [true, true]} : vector<[4]x[4]xf32>, memref<?x?xf32>
  return
}

// -----

// TARGET-RANK-ZERO-LABEL: func @unroll_transfer_write_target_rank_zero
func.func @unroll_transfer_write_target_rank_zero(%vec : vector<2xi32>) {
  %alloc = memref.alloc() : memref<4xi32>
  %c0 = arith.constant 0 : index
  vector.transfer_write %vec, %alloc[%c0] : vector<2xi32>, memref<4xi32>
  return
}

// TARGET-RANK-ZERO: %[[ALLOC:.*]] = memref.alloc() : memref<4xi32>
// TARGET-RANK-ZERO: %[[EXTRACTED1:.*]] = vector.extract {{.*}} : i32 from vector<2xi32>
// TARGET-RANK-ZERO: %[[BROADCASTED1:.*]] = vector.broadcast %[[EXTRACTED1]] : i32 to vector<i32>
// TARGET-RANK-ZERO: vector.transfer_write %[[BROADCASTED1]], %[[ALLOC]]{{.*}} : vector<i32>, memref<4xi32>
// TARGET-RANK-ZERO: %[[EXTRACTED2:.*]] = vector.extract {{.*}} : i32 from vector<2xi32>
// TARGET-RANK-ZERO: %[[BROADCASTED2:.*]] = vector.broadcast %[[EXTRACTED2]] : i32 to vector<i32>
// TARGET-RANK-ZERO: vector.transfer_write %[[BROADCASTED2]], %[[ALLOC]]{{.*}} : vector<i32>, memref<4xi32>

// -----

func.func @scalable_transpose_store_unmasked(%vec: vector<4x[4]xf32>, %dest: memref<?x?xf32>, %i: index, %j: index) {
  %transpose = vector.transpose %vec, [1, 0] : vector<4x[4]xf32> to vector<[4]x4xf32>
  vector.transfer_write %transpose, %dest[%i, %j] {in_bounds = [true, true]} : vector<[4]x4xf32>, memref<?x?xf32>
  return
}

// FULL-UNROLL: #[[$SLICE_MAP:.+]] = affine_map<(d0)[s0] -> (d0 + s0)>
// FULL-UNROLL-LABEL: func.func @scalable_transpose_store_unmasked(
// FULL-UNROLL-SAME: %[[VEC:.*]]: vector<4x[4]xf32>,
// FULL-UNROLL-SAME: %[[DEST:.*]]: memref<?x?xf32>,
// FULL-UNROLL-SAME: %[[I:.*]]: index,
// FULL-UNROLL-SAME: %[[J:.*]]: index)
// FULL-UNROLL-DAG: %[[C0:.*]] = arith.constant 0 : index
// FULL-UNROLL-DAG: %[[C1:.*]] = arith.constant 1 : index
// FULL-UNROLL-DAG: %[[C4:.*]] = arith.constant 4 : index
// FULL-UNROLL: %[[SLICE_0:.*]] = vector.extract %[[VEC]][0] : vector<[4]xf32> from vector<4x[4]xf32>
// FULL-UNROLL: %[[SLICE_1:.*]] = vector.extract %[[VEC]][1] : vector<[4]xf32> from vector<4x[4]xf32>
// FULL-UNROLL: %[[SLICE_2:.*]] = vector.extract %[[VEC]][2] : vector<[4]xf32> from vector<4x[4]xf32>
// FULL-UNROLL: %[[SLICE_3:.*]] = vector.extract %[[VEC]][3] : vector<[4]xf32> from vector<4x[4]xf32>
// FULL-UNROLL: %[[VSCALE:.*]] = vector.vscale
// FULL-UNROLL: %[[C4_VSCALE:.*]] = arith.muli %[[VSCALE]], %[[C4]] : index
// FULL-UNROLL: scf.for %[[VAL_13:.*]] = %[[C0]] to %[[C4_VSCALE]] step %[[C1]] {
// FULL-UNROLL: %[[SLICE_I:.*]] = affine.apply #[[$SLICE_MAP]](%[[VAL_13]]){{\[}}%[[I]]]
// FULL-UNROLL: %[[ELEM_0:.*]] = vector.extract %[[SLICE_0]]{{\[}}%[[VAL_13]]] : f32 from vector<[4]xf32>
// FULL-UNROLL: %[[ELEM_1:.*]] = vector.extract %[[SLICE_1]]{{\[}}%[[VAL_13]]] : f32 from vector<[4]xf32>
// FULL-UNROLL: %[[ELEM_2:.*]] = vector.extract %[[SLICE_2]]{{\[}}%[[VAL_13]]] : f32 from vector<[4]xf32>
// FULL-UNROLL: %[[ELEM_3:.*]] = vector.extract %[[SLICE_3]]{{\[}}%[[VAL_13]]] : f32 from vector<[4]xf32>
// FULL-UNROLL: %[[TRANSPOSE_SLICE:.*]] = vector.from_elements %[[ELEM_0]], %[[ELEM_1]], %[[ELEM_2]], %[[ELEM_3]] : vector<4xf32>
// FULL-UNROLL: vector.transfer_write %[[TRANSPOSE_SLICE]], %[[DEST]]{{\[}}%[[SLICE_I]], %[[J]]] {in_bounds = [true]} : vector<4xf32>, memref<?x?xf32>

// -----

func.func @scalable_transpose_store_dynamic_mask(%vec: vector<4x[4]xf32>, %dest: memref<?x?xf32>, %i: index, %j: index, %a: index, %b: index) {
  %transpose = vector.transpose %vec, [1, 0] : vector<4x[4]xf32> to vector<[4]x4xf32>
  %mask = vector.create_mask %a, %b : vector<[4]x4xi1>
  vector.transfer_write %transpose, %dest[%i, %j], %mask {in_bounds = [true, true]} : vector<[4]x4xf32>, memref<?x?xf32>
  return
}

// FULL-UNROLL-LABEL: func.func @scalable_transpose_store_dynamic_mask(
// FULL-UNROLL-SAME: %{{.*}}, %[[A:.*]]: index, %[[B:.*]]: index)
// FULL-UNROLL: %[[SLICE_MASK:.*]] = vector.create_mask %[[B]] : vector<4xi1>
// FULL-UNROLL: scf.for %{{.*}} to %[[A]]
// FULL-UNROLL: vector.transfer_write {{.*}}, %[[SLICE_MASK]]

// -----

func.func @scalable_transpose_store_constant_mask(%vec: vector<4x[4]xf32>, %dest: memref<?x?xf32>, %i: index, %j: index) {
  %transpose = vector.transpose %vec, [1, 0] : vector<4x[4]xf32> to vector<[4]x4xf32>
  %mask = vector.constant_mask [4, 3] : vector<[4]x4xi1>
  vector.transfer_write %transpose, %dest[%i, %j], %mask {in_bounds = [true, true]} : vector<[4]x4xf32>, memref<?x?xf32>
  return
}

// FULL-UNROLL-LABEL: func.func @scalable_transpose_store_constant_mask
// FULL-UNROLL: %[[C3:.*]] = arith.constant 3 : index
// FULL-UNROLL: %[[C4:.*]] = arith.constant 4 : index
// FULL-UNROLL: %[[VSCALE:.*]] = vector.vscale
// FULL-UNROLL: %[[C4_VSCALE:.*]] = arith.muli %[[VSCALE]], %[[C4]] : index
// FULL-UNROLL: %[[SLICE_MASK:.*]] = vector.create_mask %[[C3]] : vector<4xi1>
// FULL-UNROLL: scf.for %{{.*}} to %[[C4_VSCALE]]
// FULL-UNROLL: vector.transfer_write {{.*}}, %[[SLICE_MASK]]

// -----

/// Unsupported transpose.
func.func @negative_scalable_transpose_store_0(%vec: vector<[4]x4xf32>, %dest: memref<?x?xf32>, %i: index, %j: index) {
  %transpose = vector.transpose %vec, [1, 0] : vector<[4]x4xf32> to vector<4x[4]xf32>
  vector.transfer_write %transpose, %dest[%i, %j] {in_bounds = [true, true]} : vector<4x[4]xf32>, memref<?x?xf32>
  return
}

// FULL-UNROLL-LABEL: @negative_scalable_transpose_store_0
// FULL-UNROLL-NOT: scf.for

// -----

/// Non-identity permutation map (should be lowered first).
func.func @negative_scalable_transpose_store_1(%vec: vector<4x[4]xf32>, %dest: memref<?x?xf32>, %i: index, %j: index) {
  %transpose = vector.transpose %vec, [1, 0] : vector<4x[4]xf32> to vector<[4]x4xf32>
  vector.transfer_write %transpose, %dest[%i, %j] {in_bounds = [true, true], permutation_map = affine_map<(d0,d1) -> (d1, d0)> } : vector<[4]x4xf32>, memref<?x?xf32>
  return
}

// FULL-UNROLL-LABEL: @negative_scalable_transpose_store_1
// FULL-UNROLL-NOT: scf.for

// -----

/// Out-of-bounds dim.
func.func @negative_scalable_transpose_store_2(%vec: vector<4x[4]xf32>, %dest: memref<?x?xf32>, %i: index, %j: index) {
  %transpose = vector.transpose %vec, [1, 0] : vector<4x[4]xf32> to vector<[4]x4xf32>
  vector.transfer_write %transpose, %dest[%i, %j] {in_bounds = [false, true]} : vector<[4]x4xf32>, memref<?x?xf32>
  return
}

// FULL-UNROLL-LABEL: @negative_scalable_transpose_store_2
// FULL-UNROLL-NOT: scf.for

// -----

/// Source not a vector.transpose.
func.func @negative_scalable_transpose_store_3(%vec: vector<[4]x4xf32>, %dest: memref<?x?xf32>, %i: index, %j: index) {
  vector.transfer_write %vec, %dest[%i, %j] {in_bounds = [true, true]} : vector<[4]x4xf32>, memref<?x?xf32>
  return
}

// FULL-UNROLL-LABEL: @negative_scalable_transpose_store_3
// FULL-UNROLL-NOT: scf.for