mlir/test/Dialect/NVGPU/invalid.mlir

   1 // RUN: mlir-opt -split-input-file -verify-diagnostics %s
   2
   3 func.func @ldmatrix_address_space_f16_x4(%arg0: memref<128x128xf16, 2>) ->  vector<4x1xf16> {
   4   %c0  = arith.constant 0 : index
   5   // expected-error @below {{expected nvgpu.ldmatrix srcMemref must have a memory space attribute of IntegerAttr(3) or gpu::AddressSpaceAttr(Workgroup)}}
   6   %a = nvgpu.ldmatrix %arg0[%c0, %c0] {transpose = false, numTiles = 4 : i32} : memref<128x128xf16, 2> -> vector<4x1xf16>
   7   return %a : vector<4x1xf16>
   8 }
   9 // -----
  10
  11 func.func @ldmatrix_num_elements_f16_x4(%arg0: memref<128x128xf16, 3>) ->  vector<4x1xf16> {
  12   %c0  = arith.constant 0 : index
  13   // expected-error @+1 {{expected vector register shape[1] = 2}}
  14   %a = nvgpu.ldmatrix %arg0[%c0, %c0] {transpose = false, numTiles = 4 : i32} : memref<128x128xf16, 3> -> vector<4x1xf16>
  15   return %a : vector<4x1xf16>
  16 }
  17 // -----
  18
  19 func.func @ldmatrix_num_tiles_f16_x4(%arg0: memref<128x128xf16, 3>) ->  vector<2x2xf16> {
  20   %c0  = arith.constant 0 : index
  21   // expected-error @+1 {{expected vector register shape[0] and numTiles to match}}
  22   %a = nvgpu.ldmatrix %arg0[%c0, %c0] {transpose = false, numTiles = 4 : i32} : memref<128x128xf16, 3> -> vector<2x2xf16>
  23   return %a : vector<2x2xf16>
  24 }
  25 // -----
  26
  27 func.func @ldmatrix_num_tiles_f32_x4(%arg0: memref<128x128xf32, 3>) ->  vector<4x2xf32> {
  28   %c0  = arith.constant 0 : index
  29   // expected-error @+1 {{expected vector register shape[1] = 1}}
  30   %a = nvgpu.ldmatrix %arg0[%c0, %c0] {transpose = false, numTiles = 4 : i32} : memref<128x128xf32, 3> -> vector<4x2xf32>
  31   return %a : vector<4x2xf32>
  32 }
  33 // -----
  34
  35 func.func @ldmatrix_trans_f32_x4(%arg0: memref<128x128xf32, 3>) ->  vector<4x1xf32> {
  36   %c0  = arith.constant 0 : index
  37   // expected-error @+1 {{nvgpu.ldmatrix transpose works only at 16b granularity}}
  38   %a = nvgpu.ldmatrix %arg0[%c0, %c0] {transpose = true, numTiles = 4 : i32} : memref<128x128xf32, 3> -> vector<4x1xf32>
  39   return %a : vector<4x1xf32>
  40 }
  41 // -----
  42
  43 func.func @ldmatrix_trans_f32_x4(%arg0: memref<128x128xf32, 3>) ->  vector<4x1xf32> {
  44   %c0  = arith.constant 0 : index
  45   // expected-error @+1 {{results must be 2 dimensional vector}}
  46   %a = nvgpu.ldmatrix %arg0[%c0, %c0] {transpose = false, numTiles = 4 : i32} : memref<128x128xf32, 3> -> vector<4xf32>
  47   return %a : vector<4xf32>
  48 }
  49 // -----
  50
  51 func.func @ldmatrix_type_x4(%arg0: memref<128x128xf32, 3>) ->  vector<4x2xf16> {
  52   %c0  = arith.constant 0 : index
  53   // expected-error @+1 {{'nvgpu.ldmatrix' op failed to verify that srcMemref and res have same element type}}
  54   %a = nvgpu.ldmatrix %arg0[%c0, %c0] {transpose = false, numTiles = 4 : i32} : memref<128x128xf32, 3> -> vector<4x2xf16>
  55   return %a : vector<4x2xf16>
  56 }
  57 // -----
  58
  59 func.func @m16n8k16_fp16_vector_shape_a(%arg0: vector<4x4xf16>, %arg1: vector<2x2xf16>, %arg2: vector<2x2xf16>) -> vector<2x2xf16> {
  60   // expected-error @+1 {{expected 256 warp-wide matrix A elements}}
  61   %d = nvgpu.mma.sync (%arg0, %arg1, %arg2) {mmaShape = [16, 8, 16]} : (vector<4x4xf16>, vector<2x2xf16>, vector<2x2xf16>) -> vector<2x2xf16>
  62   return %d : vector<2x2xf16>
  63 }
  64 // -----
  65
  66 func.func @m16n8k16_fp16_vector_shape_b(%arg0: vector<4x2xf16>, %arg1: vector<2x4xf16>, %arg2: vector<2x2xf16>) -> vector<2x2xf16> {
  67   // expected-error @+1 {{expected 128 warp-wide matrix B elements}}
  68   %d = nvgpu.mma.sync (%arg0, %arg1, %arg2) {mmaShape = [16, 8, 16]} : (vector<4x2xf16>, vector<2x4xf16>, vector<2x2xf16>) -> vector<2x2xf16>
  69   return %d : vector<2x2xf16>
  70 }
  71 // -----
  72
  73 func.func @m16n8k16_fp16_vector_shape_c(%arg0: vector<4x2xf16>, %arg1: vector<2x2xf16>, %arg2: vector<2x4xf16>) -> vector<2x4xf16> {
  74   // expected-error @+1 {{expected 128 warp-wide matrix C elements}}
  75   %d = nvgpu.mma.sync (%arg0, %arg1, %arg2) {mmaShape = [16, 8, 16]} : (vector<4x2xf16>, vector<2x2xf16>, vector<2x4xf16>) -> vector<2x4xf16>
  76   return %d : vector<2x4xf16>
  77 }
  78 // -----
  79
  80 func.func @m16n8k16_fp16_vector_shape_a_extended(%arg0: vector<2x4xf16>, %arg1: vector<2x2xf16>, %arg2: vector<2x2xf16>) -> vector<2x2xf16> {
  81   // expected-error @+1 {{expected matrix A to be shaped (4 x 2)}}
  82   %d = nvgpu.mma.sync (%arg0, %arg1, %arg2) {mmaShape = [16, 8, 16]} : (vector<2x4xf16>, vector<2x2xf16>, vector<2x2xf16>) -> vector<2x2xf16>
  83   return %d : vector<2x2xf16>
  84 }
  85 // -----
  86
  87 func.func @m16n8k16_fp16_tf32Enabled(%arg0: vector<4x2xf16>, %arg1: vector<2x2xf16>, %arg2: vector<2x2xf16>) -> vector<2x2xf16> {
  88   // expected-error @+1 {{expected tf32 tensor cores only for F32 operands}}
  89   %d = nvgpu.mma.sync (%arg0, %arg1, %arg2) {mmaShape = [16, 8, 16], tf32Enabled} : (vector<4x2xf16>, vector<2x2xf16>, vector<2x2xf16>) -> vector<2x2xf16>
  90   return %d : vector<2x2xf16>
  91 }
  92 // -----
  93
  94 func.func @m16n8k8_fp32_vector_shape_a(%arg0: vector<4x2xf32>, %arg1: vector<2x1xf32>, %arg2: vector<2x2xf32>) -> vector<2x2xf32> {
  95   // expected-error @+1 {{expected 128 warp-wide matrix A elements}}
  96   %d = nvgpu.mma.sync (%arg0, %arg1, %arg2) {mmaShape = [16, 8, 8]} : (vector<4x2xf32>, vector<2x1xf32>, vector<2x2xf32>) -> vector<2x2xf32>
  97   return %d : vector<2x2xf32>
  98 }
  99 // -----
 100
 101 func.func @m16n8k8_fp32_vector_shape_a_extended(%arg0: vector<1x4xf32>, %arg1: vector<2x1xf32>, %arg2: vector<2x2xf32>) -> vector<2x2xf32> {
 102   // expected-error @+1 {{expected matrix A to be shaped (4 x 1)}}
 103   %d = nvgpu.mma.sync (%arg0, %arg1, %arg2) {mmaShape = [16, 8, 8]} : (vector<1x4xf32>, vector<2x1xf32>, vector<2x2xf32>) -> vector<2x2xf32>
 104   return %d : vector<2x2xf32>
 105 }
 106 // -----
 107
 108 func.func @m8n8k4_fp64_vector_shape_a(%arg0: vector<1x2xf64>, %arg1: vector<1x1xf64>, %arg2: vector<1x2xf64>) -> vector<1x2xf64> {
 109   // expected-error @+1 {{expected 32 warp-wide matrix A elements}}
 110   %d = nvgpu.mma.sync (%arg0, %arg1, %arg2) {mmaShape = [8, 8, 4]} : (vector<1x2xf64>, vector<1x1xf64>, vector<1x2xf64>) -> vector<1x2xf64>
 111   return %d : vector<1x2xf64>
 112 }
 113 // -----
 114
 115 func.func @m8n8k4_fp64_vector_shape_c_extended(%arg0: vector<1x1xf64>, %arg1: vector<1x1xf64>, %arg2: vector<2x1xf64>) -> vector<2x1xf64> {
 116   // expected-error @+1 {{expected matrix C to be shaped (1 x 2)}}
 117   %d = nvgpu.mma.sync (%arg0, %arg1, %arg2) {mmaShape = [8, 8, 4]} : (vector<1x1xf64>, vector<1x1xf64>, vector<2x1xf64>) -> vector<2x1xf64>
 118   return %d : vector<2x1xf64>
 119 }
 120 // -----
 121
 122 func.func @m16n8k32_int8_vector_shape_b(%arg0: vector<4x4xi8>, %arg1: vector<4x4xi8>, %arg2: vector<2x2xi32>) -> vector<2x2xi32> {
 123   // expected-error @+1 {{expected 256 warp-wide matrix B elements}}
 124   %d = nvgpu.mma.sync (%arg0, %arg1, %arg2) {mmaShape = [16, 8, 32]} : (vector<4x4xi8>, vector<4x4xi8>, vector<2x2xi32>) -> vector<2x2xi32>
 125   return %d : vector<2x2xi32>
 126 }
 127 // -----
 128
 129 func.func @m16n8k32_int32_datatype(%arg0: vector<4x4xi32>, %arg1: vector<2x4xi8>, %arg2: vector<2x2xi32>) -> vector<2x2xi32> {
 130   // expected-error @+1 {{op failed to verify that matrixA and matrixB have same element type}}
 131   %d = nvgpu.mma.sync (%arg0, %arg1, %arg2) {mmaShape = [16, 8, 32]} : (vector<4x4xi32>, vector<2x4xi8>, vector<2x2xi32>) -> vector<2x2xi32>
 132   return %d : vector<2x2xi32>
 133 }
 134 // -----
 135
 136 func.func @async_cp_memory_space(%dst : memref<16xf32>, %src : memref<16xf32>, %i : index) -> () {
 137   // expected-error @below {{destination memref must have a memory space attribute of IntegerAttr(3) or gpu::AddressSpaceAttr(Workgroup)}}
 138   nvgpu.device_async_copy %src[%i], %dst[%i], 16 : memref<16xf32> to memref<16xf32>
 139   return
 140 }
 141 // -----
 142
 143 func.func @async_cp_memref_type(%dst : memref<16xi32, 3>, %src : memref<16xf32>, %i : index) -> () {
 144   // expected-error @+1 {{source and destination must have the same element type}}
 145   nvgpu.device_async_copy %src[%i], %dst[%i], 16 : memref<16xf32> to memref<16xi32, 3>
 146   return
 147 }
 148 // -----
 149
 150 func.func @async_cp_num_src_indices(%dst : memref<16xf32, 3>, %src : memref<16x16xf32>, %i : index) -> () {
 151   // expected-error @+1 {{expected 2 source indices, got 1}}
 152   nvgpu.device_async_copy %src[%i], %dst[%i], 16 : memref<16x16xf32> to memref<16xf32, 3>
 153   return
 154 }
 155 // -----
 156
 157 func.func @async_cp_num_dst_indices(%dst : memref<16x16xf32, 3>, %src : memref<16xf32>, %i : index) -> () {
 158   // expected-error @+1 {{expected 2 destination indices, got 1}}
 159   nvgpu.device_async_copy %src[%i], %dst[%i], 16 : memref<16xf32> to memref<16x16xf32, 3>
 160   return
 161 }
 162 // -----
 163
 164 func.func @async_cp_num_src_stride(
 165   %dst : memref<200x100xf32, 3>,
 166   %src : memref<200x100xf32, affine_map<(d0, d1) -> (200*d0 + 2*d1)>>,
 167   %i : index) -> () {
 168   // expected-error @+1 {{source memref most minor dim must have unit stride}}
 169   nvgpu.device_async_copy %src[%i, %i], %dst[%i, %i], 16 :
 170     memref<200x100xf32, affine_map<(d0, d1) -> (200*d0 + 2*d1)>> to memref<200x100xf32, 3>
 171   return
 172 }
 173 // -----
 174
 175 func.func @async_cp_num_dst_stride(
 176   %dst : memref<200x100xf32, affine_map<(d0, d1) -> (200*d0 + 2*d1)>, 3>,
 177   %src : memref<200x100xf32>,
 178   %i : index) -> () {
 179   // expected-error @+1 {{destination memref most minor dim must have unit stride}}
 180   nvgpu.device_async_copy %src[%i, %i], %dst[%i, %i], 16 :
 181     memref<200x100xf32> to memref<200x100xf32, affine_map<(d0, d1) -> (200*d0 + 2*d1)>, 3>
 182   return
 183 }
 184 // -----
 185
 186 // 42 is never the answer!
 187 func.func @mma_sp_sync_f16_16816(%arg0: vector<2x2xf16>,
 188                                  %arg1: vector<2x2xf16>,
 189                                  %arg2: vector<2x2xf16>,
 190                                  %arg3: vector<2xi16>) -> vector<2x2xf16> {
 191   // expected-error @+1 {{'nvgpu.mma.sp.sync' op sparsity selector should be 0 or 1}}
 192   %d = nvgpu.mma.sp.sync(%arg0, %arg1, %arg2) metadata(%arg3) {mmaShape = [16, 8, 16], sparsitySelector = 42 : i32} :
 193        (vector<2x2xf16>, vector<2x2xf16>, vector<2x2xf16>) -> vector<2x2xf16>
 194   return %d : vector<2x2xf16>
 195 }
 196
 197 // -----
 198
 199 func.func @async_cp_zfill_f32_align1(
 200   %src: memref<128x128xf32>, %dst: memref<3x16x128xf32, 3>, %i : index, %srcElements : index) {
 201     // expected-error @+1 {{'nvgpu.device_async_copy' op bypassL1 does not satify alignment for 'memref<3x16x128xf32, 3>' with destination element 1. Unset bypassL1, or set destination element to 4}}
 202   %0 = nvgpu.device_async_copy %src[%i, %i], %dst[%i, %i, %i], 1, %srcElements {bypassL1} : memref<128x128xf32> to memref<3x16x128xf32, 3>
 203   return
 204 }
 205
 206 // -----
 207
 208 func.func @async_cp_size_invalid_f32(
 209   %src: memref<128x128xf32>, %dst: memref<3x16x128xf32, 3>, %i : index) {
 210     // expected-error @+1 {{Requested copy elements is 3 with width 32. But copy elements could be one of 1, 2, 4.}}
 211   %0 = nvgpu.device_async_copy %src[%i, %i], %dst[%i, %i, %i], 3: memref<128x128xf32> to memref<3x16x128xf32, 3>
 212   return
 213 }
 214
 215 // -----
 216
 217 func.func @async_cp_size_invalid_f16(
 218   %src: memref<128x128xf16>, %dst: memref<3x16x128xf16, 3>, %i : index) {
 219     // expected-error @+1 {{Requested copy elements is 3 with width 16. But copy elements could be one of 2, 4, 8.}}
 220   %0 = nvgpu.device_async_copy %src[%i, %i], %dst[%i, %i, %i], 3: memref<128x128xf16> to memref<3x16x128xf16, 3>
 221   return
 222 }
 223
 224 // -----
 225
 226 func.func @async_cp_size_invalid_f64(
 227   %src: memref<128x128xf64>, %dst: memref<3x16x128xf64, 3>, %i : index) {
 228     // expected-error @+1 {{Requested copy elements is 3 with width 64. But copy elements could be one of 1, 2.}}
 229   %0 = nvgpu.device_async_copy %src[%i, %i], %dst[%i, %i, %i], 3: memref<128x128xf64> to memref<3x16x128xf64, 3>
 230   return
 231 }
 232
 233 // -----
 234
 235 !tResult = !nvgpu.warpgroup.accumulator<fragmented = vector<128x128xf32>>
 236 !tDescA  = !nvgpu.warpgroup.descriptor<tensor = memref<128x64xf16, 3>>
 237 !tDescB  = !nvgpu.warpgroup.descriptor<tensor = memref<64x121xf16, 3>>
 238
 239 func.func @warpgroup_mma_wrong_input(%descA: !tDescA, %descB: !tDescB, %acc: !tResult) {
 240   // expected-error @+1 {{'nvgpu.warpgroup.mma' op 2nd dim matrix-B ( 121 ) != 2nd dim matrix-C ( 128 )}}
 241   %0 = nvgpu.warpgroup.mma %descA, %descB, %acc: !tDescA, !tDescB, !tResult -> !tResult
 242   return
 243 }
 244
 245 // -----
 246
 247 !tResult = !nvgpu.warpgroup.accumulator<fragmented = vector<128xf32>>
 248 !tDescA  = !nvgpu.warpgroup.descriptor<tensor = memref<128x64xf16, 3>>
 249 !tDescB  = !nvgpu.warpgroup.descriptor<tensor = memref<64x128xf16, 3>>
 250 func.func @warpgroup_mma_wrong_input(%descA: !tDescA, %descB: !tDescB, %acc: !tResult) {
 251   // expected-error @+1 {{'nvgpu.warpgroup.mma' op has matrices A, B, C and D, they must be 2 dimensional}}
 252   %0 = nvgpu.warpgroup.mma %descA, %descB, %acc: !tDescA, !tDescB, !tResult -> !tResult
 253   return
 254 }
 255
 256 // -----
 257 !tResult = !nvgpu.warpgroup.accumulator<fragmented = vector<128x128xf32>>
 258 !tDescA  = !nvgpu.warpgroup.descriptor<tensor = memref<128x64xf16, 3>>
 259 !tDescB  = !nvgpu.warpgroup.descriptor<tensor = memref<64x128xf32, 3>>
 260 func.func @warpgroup_mma_wrong_input(%descA: !tDescA, %descB: !tDescB, %acc: !tResult) {
 261   // expected-error @+1 {{'nvgpu.warpgroup.mma' op 'f32' += 'f16' * 'f32', it is not supported.}}
 262   %0 = nvgpu.warpgroup.mma %descA, %descB, %acc: !tDescA, !tDescB, !tResult -> !tResult
 263   return
 264 }
 265
 266 // -----
 267
 268 !tResult = !nvgpu.warpgroup.accumulator<fragmented = vector<128x128xf32>>
 269 !tDescA  = !nvgpu.warpgroup.descriptor<tensor = memref<128x64xf16, 3>>
 270 !tDescB  = !nvgpu.warpgroup.descriptor<tensor = memref<64x512xf16, 3>>
 271 func.func @warpgroup_mma_wrong_input(%descA: !tDescA, %descB: !tDescB, %acc: !tResult) {
 272   // expected-error @+1 {{'nvgpu.warpgroup.mma' op 2nd dim matrix-B ( 512 ) != 2nd dim matrix-C ( 128 )}}
 273   %0 = nvgpu.warpgroup.mma %descA, %descB, %acc: !tDescA, !tDescB, !tResult -> !tResult
 274   return
 275 }
 276
 277 // -----
 278
 279 !desc = !nvgpu.tensormap.descriptor<tensor = memref<32x32xf32,3>, swizzle=swizzle_32b, l2promo = none, oob = zero, interleave = none>
 280 !mbarrier = !nvgpu.mbarrier.group<memorySpace = #gpu.address_space<workgroup>>
 281 func.func @tma_load_1(%desc: !desc, %buffer1: memref<128xf32,3>, %buffer2: memref<32x32xf32,3>, %buffer3: memref<32x32xf32>, %mbarrier: !mbarrier) {
 282   %c0 = arith.constant 0 : index
 283   // Pass fine
 284   nvgpu.tma.async.load %desc[%c0, %c0], %mbarrier[%c0] to %buffer2 : !desc, !mbarrier -> memref<32x32xf32,3>
 285   // expected-error @+1 {{Maximum 5 coordinates are supported.}}
 286   nvgpu.tma.async.load %desc[%c0, %c0, %c0, %c0, %c0, %c0], %mbarrier[%c0] to %buffer2 : !desc, !mbarrier -> memref<32x32xf32,3>
 287   return
 288 }
 289 // -----
 290
 291 !desc = !nvgpu.tensormap.descriptor<tensor = memref<32x32xf32>, swizzle=swizzle_32b, l2promo = none, oob = zero, interleave = none>
 292 !mbarrier = !nvgpu.mbarrier.group<memorySpace = #gpu.address_space<workgroup>>
 293 func.func @tma_load_2(%desc: !desc,  %buffer1: memref<128xf32,3>, %buffer2: memref<32x32xf32,3>, %buffer3: memref<32x32xf32>, %mbarrier: !mbarrier) {
 294   %c0 = arith.constant 0 : index
 295   // expected-error @+1 {{the tensor map descriptor has incorrect address space, it must be shared memory address space.}}
 296   nvgpu.tma.async.load %desc[%c0, %c0], %mbarrier[%c0] to %buffer2 : !desc, !mbarrier -> memref<32x32xf32,3>
 297   return
 298 }
 299 // -----
 300
 301 !desc = !nvgpu.tensormap.descriptor<tensor = memref<32x32xf32,3>, swizzle=swizzle_32b, l2promo = none, oob = zero, interleave = none>
 302 !mbarrier = !nvgpu.mbarrier.group<memorySpace = #gpu.address_space<workgroup>>
 303 func.func @tma_load_3(%desc: !desc, %buffer1: memref<128xf32,3>, %buffer2: memref<32x32xf32,3>, %buffer3: memref<32x32xf32>, %mbarrier: !mbarrier) {
 304   %c0 = arith.constant 0 : index
 305   // expected-error @+1 {{the destination memref has incorrect address space, it must be shared memory address space}}
 306   nvgpu.tma.async.load %desc[%c0, %c0], %mbarrier[%c0] to %buffer3 : !desc, !mbarrier -> memref<32x32xf32>
 307   return
 308 }
 309 // -----
 310
 311 !desc = !nvgpu.tensormap.descriptor<tensor = memref<32x32xf32,3>, swizzle=swizzle_32b, l2promo = none, oob = zero, interleave = none>
 312 !mbarrier = !nvgpu.mbarrier.group<memorySpace = #gpu.address_space<workgroup>>
 313 func.func @tma_load_4(%desc: !desc,  %buffer1: memref<128xf32,3>, %buffer2: memref<32x32xf32,3>, %buffer3: memref<32x32xf32>, %mbarrier: !mbarrier) {
 314   %c0 = arith.constant 0 : index
 315   // expected-error @+1 {{the shape of tensor map descriptor and memref must have same rank}}
 316   nvgpu.tma.async.load %desc[%c0, %c0], %mbarrier[%c0] to %buffer1 : !desc, !mbarrier -> memref<128xf32,3>
 317   return
 318 }
 319
 320 // -----
 321
 322 !desc = !nvgpu.tensormap.descriptor<tensor = memref<64x128xf16,3>, swizzle=swizzle_32b, l2promo = none, oob = zero, interleave = none>
 323 func.func @tma_generate_descriptor_incorrect_last_dim(%b0 : index, %b1 : index, %mem : memref<*xf16>) {
 324   // expected-error @+1 {{the tensormap descriptor must have last dimension of 128 bytes but it is 256 bytes}}
 325   %descA = nvgpu.tma.create.descriptor %mem box[%b0, %b1] : memref<*xf16> -> !desc
 326   return
 327 }
 328 // -----
 329
 330
 331 !desc = !nvgpu.tensormap.descriptor<tensor = memref<64x128xf32,3>, swizzle=swizzle_32b, l2promo = none, oob = zero, interleave = none>
 332 !mbarrier = !nvgpu.mbarrier.group<memorySpace = #gpu.address_space<workgroup>>
 333 func.func @tma_generate_descriptor_incorrect_last_dim(%desc: !desc,  %buffer2: memref<64x128xf32,3>, %mbarrier: !mbarrier) {
 334   %c0 = arith.constant 0 : index
 335   // expected-error @+1 {{the tensormap descriptor must have last dimension of 128 bytes but it is 512 bytes}}
 336   nvgpu.tma.async.load %desc[%c0, %c0], %mbarrier[%c0] to %buffer2 : !desc, !mbarrier -> memref<64x128xf32,3>
 337   return
 338 }
 339 // -----
 340
 341 func.func @rcp_unsupported_rounding_0(%in : vector<16xf32>) {
 342   // expected-error @+1 {{'nvgpu.rcp' op has a limitation. #nvgpu<rcp_rounding_mode rn> or non-ftz is not supported yet.}}
 343   %out = nvgpu.rcp %in {rounding = rn, ftz} : vector<16xf32>
 344 }
 345 // -----
 346
 347 func.func @rcp_unsupported_rounding_1(%in : vector<16xf32>) {
 348   // expected-error @+1 {{'nvgpu.rcp' op has a limitation. #nvgpu<rcp_rounding_mode rz> or non-ftz is not supported yet.}}
 349   %out = nvgpu.rcp %in {rounding = rz} : vector<16xf32>
 350 }
 351 // -----
 352
 353 func.func @rcp_unsupported_ftz(%in : vector<16xf32>) {
 354   // expected-error @+1 {{'nvgpu.rcp' op has a limitation. #nvgpu<rcp_rounding_mode approx> or non-ftz is not supported yet.}}
 355   %out = nvgpu.rcp %in {rounding = approx} : vector<16xf32>
 356 }