mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-named.mlir

   1 // RUN: mlir-opt --verify-each --split-input-file -pass-pipeline="builtin.module(func.func(tosa-to-linalg-named))" %s -verify-diagnostics -o -| FileCheck %s
   2 // RUN: mlir-opt --verify-each --split-input-file -pass-pipeline="builtin.module(func.func(tosa-to-linalg-named{prefer-conv2d-kernel-layout-hwcf=true}))" %s -verify-diagnostics -o -| FileCheck --check-prefix="HWCF" %s
   3 // RUN: mlir-opt --verify-each --split-input-file -pass-pipeline="builtin.module(func.func(tosa-to-linalg-named,cse))" %s -verify-diagnostics -o -| FileCheck --check-prefix="CHECK-CSE" %s
   4
   5 // CHECK-LABEL: @matmul
   6 func.func @matmul(%arg0: tensor<1x5x3xf32>, %arg1: tensor<1x3x6xf32>) -> (tensor<1x5x6xf32>) {
       // Static f32 shapes: expects a linalg.batch_matmul that accumulates into a
       // tensor.empty() which has first been filled with a zero constant.
   7   // CHECK: [[C0:%.+]] = arith.constant 0
   8   // CHECK: [[INIT:%.+]] = tensor.empty()
   9   // CHECK: [[FILLED:%.+]] = linalg.fill ins([[C0]] : f32) outs([[INIT]] : tensor<1x5x6xf32>) -> tensor<1x5x6xf32>
  10   // CHECK: linalg.batch_matmul ins(%arg0, %arg1 : tensor<1x5x3xf32>, tensor<1x3x6xf32>) outs([[FILLED]] : tensor<1x5x6xf32>) -> tensor<1x5x6xf32>
  11   %0 = tosa.matmul %arg0, %arg1 : (tensor<1x5x3xf32>, tensor<1x3x6xf32>)  -> tensor<1x5x6xf32>
  12   return %0 : tensor<1x5x6xf32>
  13 }
  14
  15 // -----
  16
  17
  18 // CHECK-LABEL: @matmul_quantized
  19 func.func @matmul_quantized(%arg0: tensor<1x5x3xi8>, %arg1: tensor<1x3x6xi8>) -> (tensor<1x5x6xi32>) {
       // Quantized i8 -> i32 case: constants 1 and 2 (the a_zp / b_zp values given in
       // quantization_info below) are expected as the two extra scalar operands of
       // linalg.quantized_batch_matmul, after the zero-filled i32 accumulator.
  20   // CHECK: [[C0:%.+]] = arith.constant 0
  21   // CHECK: [[INIT:%.+]] = tensor.empty()
  22   // CHECK: [[FILLED:%.+]] = linalg.fill ins([[C0]] : i32) outs([[INIT]] : tensor<1x5x6xi32>) -> tensor<1x5x6xi32>
  23   // CHECK: [[ONE:%.+]] = arith.constant 1
  24   // CHECK: [[TWO:%.+]] = arith.constant 2
  25   // CHECK: linalg.quantized_batch_matmul ins(%arg0, %arg1, [[ONE]], [[TWO]] : tensor<1x5x3xi8>, tensor<1x3x6xi8>, i32, i32) outs([[FILLED]] : tensor<1x5x6xi32>) -> tensor<1x5x6xi32>
  26   %0 = tosa.matmul %arg0, %arg1 {quantization_info = #tosa.matmul_quant<a_zp = 1, b_zp = 2>} : (tensor<1x5x3xi8>, tensor<1x3x6xi8>) -> tensor<1x5x6xi32>
  27   return %0 : tensor<1x5x6xi32>
  28 }
  29
  30 // -----
  31
  32 // CHECK-LABEL: @matmul_dyn_batch
  33 func.func @matmul_dyn_batch(%arg0: tensor<?x5x3xf32>, %arg1: tensor<?x3x6xf32>) -> (tensor<?x5x6xf32>) {
       // Dynamic batch dimension: its size is expected to be read from dim 0 of
       // %arg0 with tensor.dim and passed to tensor.empty for the accumulator.
  34   // CHECK: %[[C0:.+]] = arith.constant 0
  35   // CHECK: %[[DIM:.+]] = tensor.dim %arg0, %[[C0]]
  36   // CHECK: %[[C0_0:.+]] = arith.constant 0
  37   // CHECK: %[[INIT:.+]] = tensor.empty(%[[DIM]])
  38   // CHECK: %[[FILLED:.+]] = linalg.fill ins(%[[C0_0]] : f32) outs(%[[INIT]] : tensor<?x5x6xf32>) -> tensor<?x5x6xf32>
  39   // CHECK: linalg.batch_matmul ins(%arg0, %arg1 : tensor<?x5x3xf32>, tensor<?x3x6xf32>) outs(%[[FILLED]] : tensor<?x5x6xf32>) -> tensor<?x5x6xf32>
  40   %0 = tosa.matmul %arg0, %arg1 : (tensor<?x5x3xf32>, tensor<?x3x6xf32>) -> tensor<?x5x6xf32>
  41   return %0 : tensor<?x5x6xf32>
  42 }
  43
  44 // -----
  45
  46 // CHECK-LABEL: @matmul_dyn_independent_dim
  47 func.func @matmul_dyn_independent_dim(%arg0: tensor<1x5x3xf32>, %arg1: tensor<1x3x?xf32>) -> (tensor<1x5x?xf32>) {
       // Only the last dimension of %arg1 (and of the result) is dynamic: its size
       // is expected to come from tensor.dim on %arg1 at index 2.
  48   // CHECK: %[[C2:.+]] = arith.constant 2
  49   // CHECK: %[[DIM:.+]] = tensor.dim %arg1, %[[C2]]
  50   // CHECK: %[[C0:.+]] = arith.constant 0
  51   // CHECK: %[[INIT:.+]] = tensor.empty(%[[DIM]])
  52   // CHECK: %[[FILLED:.+]] = linalg.fill ins(%[[C0]] : f32) outs(%[[INIT]] : tensor<1x5x?xf32>) -> tensor<1x5x?xf32>
  53   // CHECK: linalg.batch_matmul ins(%arg0, %arg1 : tensor<1x5x3xf32>, tensor<1x3x?xf32>) outs(%[[FILLED]] : tensor<1x5x?xf32>) -> tensor<1x5x?xf32>
  54   %0 = tosa.matmul %arg0, %arg1 : (tensor<1x5x3xf32>, tensor<1x3x?xf32>) -> tensor<1x5x?xf32>
  55   return %0 : tensor<1x5x?xf32>
  56 }
  57
  58 // -----
  59
  60 // CHECK-LABEL: @matmul_dyn_independent_dim
  61 func.func @matmul_dyn_independent_dim(%arg0: tensor<1x5x?xf32>, %arg1: tensor<1x?x6xf32>) -> (tensor<1x5x6xf32>) {
       // Here the dynamic dimension is the one shared by both operands (last dim of
       // %arg0, middle dim of %arg1). The result type is fully static, so the
       // accumulator is expected to be a tensor.empty() with no dynamic sizes.
       // NOTE(review): this test reuses the function name of the preceding test in
       // this file; a distinct name would make failures easier to attribute -
       // confirm nothing depends on the name before renaming.
  62   // CHECK: %[[C0:.+]] = arith.constant 0
  63   // CHECK: %[[INIT:.+]] = tensor.empty()
  64   // CHECK: %[[FILLED:.+]] = linalg.fill ins(%[[C0]] : f32) outs(%[[INIT]] : tensor<1x5x6xf32>) -> tensor<1x5x6xf32>
  65   // CHECK: linalg.batch_matmul ins(%arg0, %arg1 : tensor<1x5x?xf32>, tensor<1x?x6xf32>) outs(%[[FILLED]] : tensor<1x5x6xf32>) -> tensor<1x5x6xf32>
  66   %0 = tosa.matmul %arg0, %arg1 : (tensor<1x5x?xf32>, tensor<1x?x6xf32>) -> tensor<1x5x6xf32>
  67   return %0 : tensor<1x5x6xf32>
  68 }
  69
  70 // -----
  71
  72 // CHECK-LABEL: @matmul_dyn_output
  73 func.func @matmul_dyn_output(%arg0: tensor<1x1x8xf32>, %arg1: tensor<1x8x1xf32>) -> tensor<?x1x1xf32> {
       // Both inputs are fully static but the declared result has a dynamic batch
       // dimension; the size for tensor.empty is expected to be taken from dim 0 of
       // %arg0 via tensor.dim.
  74   // CHECK: %[[C0:.+]] = arith.constant 0 : index
  75   // CHECK: %[[DIM0:.+]] = tensor.dim %arg0, %[[C0]] : tensor<1x1x8xf32>
  76   // CHECK: %[[CST:.+]] = arith.constant 0.000000e+00 : f32
  77   // CHECK: %[[INIT:.+]] = tensor.empty(%[[DIM0]]) : tensor<?x1x1xf32>
  78   // CHECK: %[[FILLED:.+]] = linalg.fill ins(%[[CST]] : f32) outs(%[[INIT]] : tensor<?x1x1xf32>) -> tensor<?x1x1xf32>
  79   // CHECK: linalg.batch_matmul ins(%arg0, %arg1 : tensor<1x1x8xf32>, tensor<1x8x1xf32>) outs(%[[FILLED]] : tensor<?x1x1xf32>) -> tensor<?x1x1xf32>
  80   %0 = tosa.matmul %arg0, %arg1 : (tensor<1x1x8xf32>, tensor<1x8x1xf32>) -> tensor<?x1x1xf32>
  81   return %0 : tensor<?x1x1xf32>
  82 }
  83
  84 // -----
  85
  86 // CHECK: #[[$MAP0:.+]] = affine_map<(d0, d1) -> (d1)>
  87 // CHECK: #[[$MAP1:.+]] = affine_map<(d0, d1) -> (d0, d1)>
  88
  89 // CHECK-LABEL: @fully_connected
  90 func.func @fully_connected(%arg0: tensor<5x3xf32>, %arg1: tensor<6x3xf32>, %arg2: tensor<6xf32>) -> (tensor<5x6xf32>) {
       // Expected lowering: the 6x3 weight (%arg1) is transposed to 3x6 with
       // linalg.transpose, the bias (%arg2) is broadcast into a 5x6 tensor by a
       // linalg.generic, and linalg.matmul accumulates into that broadcast bias.
  91   // CHECK: %[[PERM:.+]] = arith.constant dense<[1, 0]> : tensor<2xi64>
  92   // CHECK: %[[TRANSPOSEDINIT:.+]] = tensor.empty() : tensor<3x6xf32>
  93   // CHECK: %[[TRANSPOSED:.+]] = linalg.transpose ins(%arg1 : tensor<6x3xf32>) outs(%[[TRANSPOSEDINIT]] : tensor<3x6xf32>) permutation = [1, 0]
  94   // CHECK: %[[INIT:.+]] = tensor.empty() : tensor<5x6xf32>
  95
  96   // CHECK: %[[BROADCAST:.+]] = linalg.generic {indexing_maps = [#[[$MAP0]], #[[$MAP1]]], iterator_types = ["parallel", "parallel"]} ins(%arg2 : tensor<6xf32>) outs(%[[INIT]] : tensor<5x6xf32>) {
  97   // CHECK: ^bb0(%[[IN:.+]]: f32, %[[OUT:.+]]: f32):
  98   // CHECK:   linalg.yield %[[IN]] : f32
  99   // CHECK: } -> tensor<5x6xf32>
 100
 101   // CHECK: linalg.matmul ins(%arg0, %[[TRANSPOSED]] : tensor<5x3xf32>, tensor<3x6xf32>) outs(%[[BROADCAST]] : tensor<5x6xf32>) -> tensor<5x6xf32>
 102
 103   %0 = tosa.fully_connected %arg0, %arg1, %arg2 : (tensor<5x3xf32>, tensor<6x3xf32>, tensor<6xf32>) -> tensor<5x6xf32>
 104   return %0 : tensor<5x6xf32>
 105 }
 106
 107 // -----
 108
 109 // CHECK: #[[$MAP0:.+]] = affine_map<(d0, d1) -> (d1)>
 110 // CHECK: #[[$MAP1:.+]] = affine_map<(d0, d1) -> (d0, d1)>
 111
 112 // CHECK-LABEL: @quantized_fully_connected
 113 func.func @quantized_fully_connected(%arg0: tensor<5x3xi8>, %arg1: tensor<6x3xi8>, %arg2: tensor<6xi32>) -> (tensor<5x6xi32>) {
       // Quantized variant: same transpose + bias-broadcast structure as the float
       // case, but the final op is linalg.quantized_matmul, which takes constants
       // 1 and 2 (input_zp / weight_zp from quantization_info below) as extra
       // scalar operands.
 114   // CHECK: %[[PERM:.+]] = arith.constant dense<[1, 0]> : tensor<2xi64>
 115   // CHECK: %[[TRANSPOSE:.+]] =  linalg.transpose ins(%arg1 : tensor<6x3xi8>) outs(%[[TRANSPOSEDINIT:.+]] : tensor<3x6xi8>) permutation = [1, 0]
 116   // CHECK: %[[INIT:.+]] = tensor.empty() : tensor<5x6xi32>
 117
 118   // CHECK: %[[BROADCAST:.+]] = linalg.generic {indexing_maps = [#[[$MAP0]], #[[$MAP1]]], iterator_types = ["parallel", "parallel"]} ins(%arg2 : tensor<6xi32>) outs(%[[INIT]] : tensor<5x6xi32>) {
 119   // CHECK: ^bb0(%[[IN:.+]]: i32, %[[OUT:.+]]: i32):
 120   // CHECK:   linalg.yield %[[IN]] : i32
 121   // CHECK: } -> tensor<5x6xi32>
 122
 123   // CHECK: %[[C1:.+]] = arith.constant 1 : i32
 124   // CHECK: %[[C2:.+]] = arith.constant 2 : i32
 125   // CHECK: linalg.quantized_matmul ins(%arg0, %[[TRANSPOSE]], %[[C1]], %[[C2]] : tensor<5x3xi8>, tensor<3x6xi8>, i32, i32) outs(%[[BROADCAST]] : tensor<5x6xi32>) -> tensor<5x6xi32>
 126
 127   %0 = tosa.fully_connected %arg0, %arg1, %arg2 {quantization_info = #tosa.conv_quant<input_zp = 1, weight_zp = 2>} : (tensor<5x3xi8>, tensor<6x3xi8>, tensor<6xi32>) -> tensor<5x6xi32>
 128   return %0 : tensor<5x6xi32>
 129 }
 130
 131 // -----
 132
 133 // CHECK: #[[$MAP0:.+]] = affine_map<(d0, d1) -> (d1)>
 134 // CHECK: #[[$MAP1:.+]] = affine_map<(d0, d1) -> (d0, d1)>
 135
 136 // CHECK-LABEL: @fully_connected_dyn
 137 func.func @fully_connected_dyn(%arg0: tensor<?x3xf32>, %arg1: tensor<6x3xf32>, %arg2: tensor<6xf32>) -> (tensor<?x6xf32>) {
       // Dynamic row count: dim 0 of %arg0 is read with tensor.dim and used to size
       // the accumulator; the rest mirrors the static fully_connected lowering.
       // The index constant is referenced through its captured name rather than a
       // printed SSA name so the test does not depend on value numbering.
 138   // CHECK: %[[C0:.+]] = arith.constant 0 : index
 139   // CHECK: %[[DIM0:.+]] = tensor.dim %arg0, %[[C0]] : tensor<?x3xf32>
 140   // CHECK: %[[PERM:.+]] = arith.constant dense<[1, 0]> : tensor<2xi64>
 141   // CHECK: %[[TRANSPOSED:.+]] = linalg.transpose ins(%arg1 : tensor<6x3xf32>) outs(%[[TRANSPOSEDINIT:.+]] : tensor<3x6xf32>) permutation = [1, 0]
 142   // CHECK: %[[INIT:.+]] = tensor.empty(%[[DIM0]]) : tensor<?x6xf32>
 143
 144   // CHECK: %[[BROADCAST:.+]] = linalg.generic {indexing_maps = [#[[$MAP0]], #[[$MAP1]]], iterator_types = ["parallel", "parallel"]} ins(%arg2 : tensor<6xf32>) outs(%[[INIT]] : tensor<?x6xf32>) {
 145   // CHECK: ^bb0(%[[IN:.+]]: f32, %[[OUT:.+]]: f32):
 146   // CHECK:   linalg.yield %[[IN]] : f32
 147   // CHECK: } -> tensor<?x6xf32>
 148
 149   // CHECK: linalg.matmul ins(%arg0, %[[TRANSPOSED]] : tensor<?x3xf32>, tensor<3x6xf32>) outs(%[[BROADCAST]] : tensor<?x6xf32>) -> tensor<?x6xf32>
 150
 151   %0 = tosa.fully_connected %arg0, %arg1, %arg2 : (tensor<?x3xf32>, tensor<6x3xf32>, tensor<6xf32>) -> tensor<?x6xf32>
 152   return %0 : tensor<?x6xf32>
 153 }
 154
 155 // -----
 156
 157 // CHECK-LABEL: @max_pool
 158 func.func @max_pool(%arg0: tensor<1x6x34x62xf32>) -> () {
       // Unpadded 3x3 f32 max pool: the accumulator is expected to be filled with
       // the constant -3.40282347E+38 and the window passed as an empty 3x3 tensor.
 159   // CHECK-DAG: [[CONST:%.+]] = arith.constant -3.40282347E+38
 160   // CHECK-DAG: [[INIT:%.+]] = tensor.empty()
 161   // CHECK-DAG: [[FILL:%.+]] = linalg.fill ins([[CONST]]{{.*}}outs([[INIT]]
 162   // CHECK-DAG: [[KERNEL:%.+]] = tensor.empty()
 163   // CHECK: linalg.pooling_nhwc_max {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%arg0, [[KERNEL]] : tensor<1x6x34x62xf32>, tensor<3x3xf32>) outs([[FILL]] : tensor<1x4x32x62xf32>)
 164   %0 = tosa.max_pool2d %arg0 {pad = array<i64: 0, 0, 0, 0>, kernel = array<i64: 3, 3>, stride = array<i64: 1, 1>} : (tensor<1x6x34x62xf32>) -> tensor<1x4x32x62xf32>
 165   return
 166 }
 167
 168 // CHECK-LABEL: @max_pool_padded
 169 func.func @max_pool_padded(%arg0: tensor<1x6x34x62xf32>) -> () {
       // The last entry of the pad attribute (1) is expected to appear as high
       // padding on dimension 2 of a tensor.pad, growing that dimension from 34 to
       // 35; the pad value is the same -3.40282347E+38 constant used for the fill.
 170   // CHECK-DAG: [[CONST:%.+]] = arith.constant -3.40282347E+38 : f32
 171   // CHECK-DAG: [[PAD:%.+]] = tensor.pad %arg0 low[0, 0, 0, 0] high[0, 0, 1, 0]
 172   // CHECK-DAG:   tensor.yield [[CONST]]
 173   // CHECK-DAG: [[INITVAL:%.+]] = arith.constant -3.40282347E+38 : f32
 174   // CHECK-DAG: [[INIT:%.+]] = tensor.empty()
 175   // CHECK-DAG: [[FILL:%.+]] = linalg.fill ins([[INITVAL]]{{.*}}outs([[INIT]]
 176   // CHECK-DAG: [[KERNEL:%.+]] = tensor.empty()
 177   // CHECK: linalg.pooling_nhwc_max {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins([[PAD]], [[KERNEL]] : tensor<1x6x35x62xf32>, tensor<3x3xf32>) outs([[FILL]] : tensor<1x4x33x62xf32>)
 178   %0 = tosa.max_pool2d %arg0 {pad = array<i64: 0, 0, 0, 1>, kernel = array<i64: 3, 3>, stride = array<i64: 1, 1>} : (tensor<1x6x34x62xf32>) -> tensor<1x4x33x62xf32>
 179   return
 180 }
 181
 182 // CHECK-LABEL: @max_pool_dyn
 183 func.func @max_pool_dyn(%arg0: tensor<?x6x34x62xf32>) -> () {
       // Dynamic batch: the batch size is expected to be read from dim 0 of %arg0
       // and passed to the tensor.empty that backs the pooling accumulator.
 184   // CHECK: %[[C0:.+]] = arith.constant 0
 185   // CHECK: %[[BATCH:.+]] = tensor.dim %arg0, %[[C0]]
 186   // CHECK: %[[CONST:.+]] = arith.constant -3.40282347E+38
 187   // CHECK: %[[INIT:.+]] = tensor.empty(%[[BATCH]])
 188   // CHECK: %[[FILL:.+]] = linalg.fill ins(%[[CONST]]{{.*}}outs(%[[INIT]]
 189   // CHECK: %[[KERNEL:.+]] = tensor.empty()
 190   // CHECK: linalg.pooling_nhwc_max {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%arg0, %[[KERNEL]] : tensor<?x6x34x62xf32>, tensor<3x3xf32>) outs(%[[FILL]] : tensor<?x4x32x62xf32>)
 191   %0 = tosa.max_pool2d %arg0 {pad = array<i64: 0, 0, 0, 0>, kernel = array<i64: 3, 3>, stride = array<i64: 1, 1>} : (tensor<?x6x34x62xf32>) -> tensor<?x4x32x62xf32>
 192   return
 193 }
 194
 195 // CHECK-LABEL: @max_pool_i8
 196 func.func @max_pool_i8(%arg0: tensor<1x6x34x62xi8>) -> () {
       // i8 element type: a constant -128 (the smallest i8 value) is expected before
       // the pooling op - presumably the accumulator's initial value, as in the f32
       // tests; only its presence is verified here.
 197   // CHECK: arith.constant -128
 198   // CHECK: linalg.pooling_nhwc_max
 199   %0 = tosa.max_pool2d %arg0 {pad = array<i64: 0, 0, 0, 0>, kernel = array<i64: 3, 3>, stride = array<i64: 1, 1>} : (tensor<1x6x34x62xi8>) -> tensor<1x4x32x62xi8>
 200   return
 201 }
 202
 203 // CHECK-LABEL: @max_pool_i16
 204 func.func @max_pool_i16(%arg0: tensor<1x6x34x62xi16>) -> () {
       // i16 element type: a constant -32768 (the smallest i16 value) is expected
       // before the pooling op - presumably the accumulator's initial value; only
       // its presence is verified here.
 205   // CHECK: arith.constant -32768
 206   // CHECK: linalg.pooling_nhwc_max
 207   %0 = tosa.max_pool2d %arg0 {pad = array<i64: 0, 0, 0, 0>, kernel = array<i64: 3, 3>, stride = array<i64: 1, 1>} : (tensor<1x6x34x62xi16>) -> tensor<1x4x32x62xi16>
 208   return
 209 }
 210
 211 // CHECK-LABEL: @max_pool_i32
 212 func.func @max_pool_i32(%arg0: tensor<1x6x34x62xi32>) -> () {
       // i32 element type: a constant -2147483648 (the smallest i32 value) is
       // expected before the pooling op - presumably the accumulator's initial
       // value; only its presence is verified here.
 213   // CHECK: arith.constant -2147483648
 214   // CHECK: linalg.pooling_nhwc_max
 215   %0 = tosa.max_pool2d %arg0 {pad = array<i64: 0, 0, 0, 0>, kernel = array<i64: 3, 3>, stride = array<i64: 1, 1>} : (tensor<1x6x34x62xi32>) -> tensor<1x4x32x62xi32>
 216   return
 217 }
 218
 219 // CHECK-CSE-LABEL: @max_pool_all_dynamic
 220 func.func @max_pool_all_dynamic(%arg0: tensor<?x?x?x?xf32>) -> tensor<?x?x?x?xf32> {
       // All four dimensions are dynamic, so every output size has to be computed
       // from the input. This function is checked only under the CSE run line, which
       // is why index constants such as C0/C1/C2 are shared between computations.
       // Every operand is referenced through a captured name; no check depends on
       // the printed SSA numbering.
 221   // Batch size
 222   // CHECK-CSE: %[[C0:.+]] = arith.constant 0 : index
 223   // CHECK-CSE: %[[BATCH:.+]] = tensor.dim %arg0, %[[C0]] : tensor<?x?x?x?xf32>
 224
 225   // Compute output height
 226   // CHECK-CSE: %[[C1:.+]] = arith.constant 1 : index
 227   // CHECK-CSE: %[[IH:.+]] = tensor.dim %arg0, %[[C1]] : tensor<?x?x?x?xf32>
 228   // CHECK-CSE: %[[C2:.+]] = arith.constant 2 : index
 229   // CHECK-CSE: %[[PADDED_BEFORE:.+]] = arith.addi %[[IH]], %[[C0]] : index
 230   // CHECK-CSE: %[[PADDED_AFTER:.+]] = arith.addi %[[PADDED_BEFORE]], %[[C0]] : index
 231   // CHECK-CSE: %[[SUB_ONE:.+]] = arith.subi %[[C2]], %[[C1]] : index
 232   // CHECK-CSE: %[[DILATED:.+]] = arith.muli %[[C1]], %[[SUB_ONE]] : index
 233   // CHECK-CSE: %[[ADD_ONE:.+]] = arith.addi %[[DILATED]], %[[C1]] : index
 234   // CHECK-CSE: %[[SUBTRACT:.+]] = arith.subi %[[PADDED_AFTER]], %[[ADD_ONE]] : index
 235   // CHECK-CSE: %[[DIVIDE:.+]] = arith.divui %[[SUBTRACT]], %[[C1]] : index
 236   // CHECK-CSE: %[[HEIGHT:.+]] = arith.addi %[[DIVIDE]], %[[C1]] : index
 237
 238   // Compute output width
 239   // CHECK-CSE: %[[IW:.+]] = tensor.dim %arg0, %[[C2]] : tensor<?x?x?x?xf32>
 240   // CHECK-CSE: %[[C5:.+]] = arith.constant 5 : index
 241   // CHECK-CSE: %[[PADDED_BEFORE:.+]] = arith.addi %[[IW]], %[[C2]] : index
 242   // CHECK-CSE: %[[PADDED_AFTER:.+]] = arith.addi %[[PADDED_BEFORE]], %[[C2]] : index
 243   // CHECK-CSE: %[[SUB_ONE:.+]] = arith.subi %[[C5]], %[[C1]] : index
 244   // CHECK-CSE: %[[DILATED:.+]] = arith.muli %[[C1]], %[[SUB_ONE]] : index
 245   // CHECK-CSE: %[[ADD_ONE:.+]] = arith.addi %[[DILATED]], %[[C1]] : index
 246   // CHECK-CSE: %[[SUBTRACT:.+]] = arith.subi %[[PADDED_AFTER]], %[[ADD_ONE]] : index
 247   // CHECK-CSE: %[[DIVIDE:.+]] = arith.divui %[[SUBTRACT]], %[[C1]] : index
 248   // CHECK-CSE: %[[WIDTH:.+]] = arith.addi %[[DIVIDE]], %[[C1]] : index
 249
 250   // Channel size
 251   // CHECK-CSE: %[[C3:.+]] = arith.constant 3 : index
 252   // CHECK-CSE: %[[CHANNEL:.+]] = tensor.dim %arg0, %[[C3]] : tensor<?x?x?x?xf32>
 253
 254   // Pad the input
 255   // CHECK-CSE: %[[FLOAT_MIN:.+]] = arith.constant -3.40282347E+38 : f32
 256   // CHECK-CSE: %[[PADDED:.+]] = tensor.pad %arg0 low[0, 0, 2, 0] high[0, 0, 2, 0] {
 257   // CHECK-CSE:   tensor.yield %[[FLOAT_MIN]] : f32
 258
 259   // Allocate the output and fill with minimum value
 260   // CHECK-CSE: %[[INIT:.+]] = tensor.empty(%[[BATCH]], %[[HEIGHT]], %[[WIDTH]], %[[CHANNEL]]) : tensor<?x?x?x?xf32>
 261   // CHECK-CSE: %[[FILL:.+]] = linalg.fill ins(%[[FLOAT_MIN]] : f32) outs(%[[INIT]] : tensor<?x?x?x?xf32>) -> tensor<?x?x?x?xf32>
 262   // CHECK-CSE: %[[FAKE_WINDOW:.+]] = tensor.empty() : tensor<2x5xf32>
 263
 264   // Compute max pool
 265   // CHECK-CSE: %[[OUT:.+]] = linalg.pooling_nhwc_max {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%[[PADDED]], %[[FAKE_WINDOW]] : tensor<?x?x?x?xf32>, tensor<2x5xf32>) outs(%[[FILL]] : tensor<?x?x?x?xf32>) -> tensor<?x?x?x?xf32>
 266   // CHECK-CSE: return %[[OUT]]
 267
 268   %0 = tosa.max_pool2d %arg0 {kernel = array<i64: 2, 5>, pad = array<i64: 0, 0, 2, 2>, stride = array<i64: 1, 1>} : (tensor<?x?x?x?xf32>) -> tensor<?x?x?x?xf32>
 269   return %0 : tensor<?x?x?x?xf32>
 270 }
 271
 272 // -----
 273
 274 // CHECK-LABEL: @avg_pool_f32
 275 func.func @avg_pool_f32(%arg0: tensor<1x6x34x62xf32>) -> (tensor<1x5x33x62xf32>) {
       // f32 average pool with padding 1 on every side and a 4x4 kernel. Expected
       // structure: pad with zero, sum-pool into a zero-filled tensor, then a
       // linalg.generic that divides each sum by the number of window elements that
       // did not fall into the padding.
 276   // Apply padding to the input:
 277   // CHECK: %[[F0:.+]] = arith.constant 0.000000e+00 : f32
 278   // CHECK: %[[PAD:.+]] = tensor.pad %arg0 low[0, 1, 1, 0] high[0, 1, 1, 0]
 279   // CHECK:   tensor.yield %[[F0]] : f32
 280
 281   // Fill the pooling target:
 282   // CHECK: %[[F0:.+]] = arith.constant 0.000000e+00 : f32
 283   // CHECK: %[[EMPTY:.+]] = tensor.empty() : tensor<1x5x33x62xf32>
 284   // CHECK: %[[FILL:.+]] = linalg.fill ins(%[[F0]] : f32) outs(%[[EMPTY]] : tensor<1x5x33x62xf32>)
 285
 286   // Compute the sum pooling:
 287   // CHECK: %[[KERNEL:.+]] = tensor.empty() : tensor<4x4xf32>
 288   // CHECK: %[[POOL:.+]] = linalg.pooling_nhwc_sum
 289   // CHECK-SAME: dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>}
 290   // CHECK-SAME: ins(%[[PAD]], %[[KERNEL]] : tensor<1x8x36x62xf32>, tensor<4x4xf32>)
 291   // CHECK-SAME: outs(%[[FILL]] : tensor<1x5x33x62xf32>)
 292
 293   // Compute dimension based constants:
 294   // CHECK: %[[I1:.+]] = arith.constant 1 : index
 295   // CHECK: %[[DIM1:.+]] = tensor.dim %[[POOL]], %[[I1]]
 296   // CHECK: %[[I2:.+]] = arith.constant 2 : index
 297   // CHECK: %[[DIM2:.+]] = tensor.dim %[[POOL]], %[[I2]]
 298   // CHECK: %[[ONE:.+]] = arith.constant 1 : index
 299   // CHECK: %[[HEIGHT:.+]] = arith.subi %[[DIM1]], %[[ONE]] : index
 300   // CHECK: %[[WIDTH:.+]] = arith.subi %[[DIM2]], %[[ONE]] : index
 301
 302   // Divide the sum pooling by the number of summed values.
 303   // CHECK: %[[EMPTY:.+]] = tensor.empty() : tensor<1x5x33x62xf32>
 304   // CHECK: %[[GENERIC:.+]] = linalg.generic
 305   // CHECK-SAME: indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]}
 306   // CHECK-SAME: ins(%[[POOL]] : tensor<1x5x33x62xf32>)
 307   // CHECK-SAME: outs(%[[EMPTY]] : tensor<1x5x33x62xf32>)
 308   // CHECK: ^bb0(%[[IN:.+]]: f32, %{{.+}}: f32)
 309   // CHECK:   %[[ZERO:.+]] = arith.constant 0
 310
 311   // Compute how much of the height does not include padding:
 312   // CHECK:   %[[STRIDE:.+]] = arith.constant 1
 313   // CHECK:   %[[KSIZE:.+]] = arith.constant 4
 314   // CHECK:   %[[START:.+]] = linalg.index 1
 315   // CHECK:   %[[END:.+]] = arith.subi %[[HEIGHT]], %[[START]]
 316   // CHECK:   %[[SRC_START:.+]] = arith.muli %[[START]], %[[STRIDE]]
 317   // CHECK:   %[[SRC_END:.+]] = arith.muli %[[END]], %[[STRIDE]]
 318   // CHECK:   %[[PAD_START:.+]] = arith.constant 1
 319   // CHECK:   %[[START_SUB:.+]] = arith.subi %[[SRC_START]], %[[PAD_START]]
 320   // CHECK:   %[[OFFSET:.+]] = arith.minsi %[[START_SUB]], %[[ZERO]]
 321   // CHECK:   %[[START_OFFSET:.+]] = arith.addi %[[KSIZE]], %[[OFFSET]]
 322   // CHECK:   %[[PAD_END:.+]] = arith.constant 1
 323   // CHECK:   %[[END_SUB:.+]] = arith.subi %[[SRC_END]], %[[PAD_END]]
 324   // CHECK:   %[[OFFSET:.+]] = arith.minsi %[[END_SUB]], %[[ZERO]]
 325   // CHECK:   %[[END_OFFSET:.+]] = arith.addi %[[START_OFFSET]], %[[OFFSET]]
 326   // CHECK:   %[[KHEIGHT:.+]] = arith.maxsi %[[ONE]], %[[END_OFFSET]]
 327
 328   // Compute how much of the width does not include padding:
 329   // CHECK:   %[[STRIDE:.+]] = arith.constant 1
 330   // CHECK:   %[[KSIZE:.+]] = arith.constant 4
 331   // CHECK:   %[[START:.+]] = linalg.index 2
 332   // CHECK:   %[[END:.+]] = arith.subi %[[WIDTH]], %[[START]]
 333   // CHECK:   %[[SRC_START:.+]] = arith.muli %[[START]], %[[STRIDE]]
 334   // CHECK:   %[[SRC_END:.+]] = arith.muli %[[END]], %[[STRIDE]]
 335   // CHECK:   %[[PAD_START:.+]] = arith.constant 1
 336   // CHECK:   %[[START_SUB:.+]] = arith.subi %[[SRC_START]], %[[PAD_START]]
 337   // CHECK:   %[[OFFSET:.+]] = arith.minsi %[[START_SUB]], %[[ZERO]]
 338   // CHECK:   %[[START_OFFSET:.+]] = arith.addi %[[KSIZE]], %[[OFFSET]]
 339   // CHECK:   %[[PAD_END:.+]] = arith.constant 1
 340   // CHECK:   %[[END_SUB:.+]] = arith.subi %[[SRC_END]], %[[PAD_END]]
 341   // CHECK:   %[[OFFSET:.+]] = arith.minsi %[[END_SUB]], %[[ZERO]]
 342   // CHECK:   %[[END_OFFSET:.+]] = arith.addi %[[START_OFFSET]], %[[OFFSET]]
 343   // CHECK:   %[[KWIDTH:.+]] = arith.maxsi %[[ONE]], %[[END_OFFSET]]
 344
 345   // Divide the summed value by the number of values summed.
 346   // CHECK:   %[[COUNT:.+]] = arith.muli %[[KHEIGHT]], %[[KWIDTH]]
 347   // CHECK:   %[[CAST:.+]] = arith.index_cast %[[COUNT]]
 348   // CHECK:   %[[FLT:.+]] = arith.sitofp %[[CAST]]
 349   // CHECK:   %[[DIV:.+]] = arith.divf %[[IN]], %[[FLT]]
 350   // CHECK:   linalg.yield %[[DIV]]
 351   %0 = tosa.avg_pool2d %arg0 {acc_type = f32, pad = array<i64: 1, 1, 1, 1>, kernel = array<i64: 4, 4>, stride = array<i64: 1, 1>} : (tensor<1x6x34x62xf32>) -> tensor<1x5x33x62xf32>
 352   return %0 : tensor<1x5x33x62xf32>
 353 }
 354
 355 // -----
 356
 357 // CHECK-LABEL: @avg_pool_f16_f32acc
 358 // CHECK-SAME: (%[[ARG0:[0-9a-zA-Z_]*]]:
 359 func.func @avg_pool_f16_f32acc(%arg0: tensor<1x6x34x62xf16>) -> (tensor<1x5x33x62xf16>) {
       // f16 input with an f32 accumulator (acc_type = f32): the input is padded in
       // f16, summed into an f32 tensor, divided in f32, and truncated back to f16
       // just before the yield. The function argument is referenced through the
       // ARG0 capture taken from the signature above.
 360   // Apply padding to the input:
 361   // CHECK: %[[F0:.+]] = arith.constant 0.000000e+00 : f16
 362   // CHECK: %[[PAD:.+]] = tensor.pad %[[ARG0]] low[0, 1, 1, 0] high[0, 1, 1, 0]
 363   // CHECK:   tensor.yield %[[F0]] : f16
 364
 365   // Fill the pooling target:
 366   // CHECK: %[[F0:.+]] = arith.constant 0.000000e+00 : f32
 367   // CHECK: %[[EMPTY:.+]] = tensor.empty() : tensor<1x5x33x62xf32>
 368   // CHECK: %[[FILL:.+]] = linalg.fill ins(%[[F0]] : f32) outs(%[[EMPTY]] : tensor<1x5x33x62xf32>)
 369
 370   // Compute the sum pooling:
 371   // CHECK: %[[KERNEL:.+]] = tensor.empty() : tensor<4x4xf32>
 372   // CHECK: %[[POOL:.+]] = linalg.pooling_nhwc_sum
 373   // CHECK-SAME: dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>}
 374   // CHECK-SAME: ins(%[[PAD]], %[[KERNEL]] : tensor<1x8x36x62xf16>, tensor<4x4xf32>)
 375   // CHECK-SAME: outs(%[[FILL]] : tensor<1x5x33x62xf32>)
 376
 377   // Compute dimension based constants:
 378   // CHECK: %[[I1:.+]] = arith.constant 1 : index
 379   // CHECK: %[[DIM1:.+]] = tensor.dim %[[POOL]], %[[I1]]
 380   // CHECK: %[[I2:.+]] = arith.constant 2 : index
 381   // CHECK: %[[DIM2:.+]] = tensor.dim %[[POOL]], %[[I2]]
 382   // CHECK: %[[ONE:.+]] = arith.constant 1 : index
 383   // CHECK: %[[HEIGHT:.+]] = arith.subi %[[DIM1]], %[[ONE]] : index
 384   // CHECK: %[[WIDTH:.+]] = arith.subi %[[DIM2]], %[[ONE]] : index
 385
 386   // Divide the sum pooling by the number of summed values.
 387   // CHECK: %[[EMPTY:.+]] = tensor.empty() : tensor<1x5x33x62xf16>
 388   // CHECK: %[[GENERIC:.+]] = linalg.generic
 389   // CHECK-SAME: indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]}
 390   // CHECK-SAME: ins(%[[POOL]] : tensor<1x5x33x62xf32>)
 391   // CHECK-SAME: outs(%[[EMPTY]] : tensor<1x5x33x62xf16>)
 392   // CHECK: ^bb0(%[[IN:.+]]: f32, %{{.+}}: f16)
 393   // CHECK:   %[[ZERO:.+]] = arith.constant 0
 394
 395   // Compute how much of the height does not include padding:
 396   // CHECK:   %[[STRIDE:.+]] = arith.constant 1
 397   // CHECK:   %[[KSIZE:.+]] = arith.constant 4
 398   // CHECK:   %[[START:.+]] = linalg.index 1
 399   // CHECK:   %[[END:.+]] = arith.subi %[[HEIGHT]], %[[START]]
 400   // CHECK:   %[[SRC_START:.+]] = arith.muli %[[START]], %[[STRIDE]]
 401   // CHECK:   %[[SRC_END:.+]] = arith.muli %[[END]], %[[STRIDE]]
 402   // CHECK:   %[[PAD_START:.+]] = arith.constant 1
 403   // CHECK:   %[[START_SUB:.+]] = arith.subi %[[SRC_START]], %[[PAD_START]]
 404   // CHECK:   %[[OFFSET:.+]] = arith.minsi %[[START_SUB]], %[[ZERO]]
 405   // CHECK:   %[[START_OFFSET:.+]] = arith.addi %[[KSIZE]], %[[OFFSET]]
 406   // CHECK:   %[[PAD_END:.+]] = arith.constant 1
 407   // CHECK:   %[[END_SUB:.+]] = arith.subi %[[SRC_END]], %[[PAD_END]]
 408   // CHECK:   %[[OFFSET:.+]] = arith.minsi %[[END_SUB]], %[[ZERO]]
 409   // CHECK:   %[[END_OFFSET:.+]] = arith.addi %[[START_OFFSET]], %[[OFFSET]]
 410   // CHECK:   %[[KHEIGHT:.+]] = arith.maxsi %[[ONE]], %[[END_OFFSET]]
 411
 412   // Compute how much of the width does not include padding:
 413   // CHECK:   %[[STRIDE:.+]] = arith.constant 1
 414   // CHECK:   %[[KSIZE:.+]] = arith.constant 4
 415   // CHECK:   %[[START:.+]] = linalg.index 2
 416   // CHECK:   %[[END:.+]] = arith.subi %[[WIDTH]], %[[START]]
 417   // CHECK:   %[[SRC_START:.+]] = arith.muli %[[START]], %[[STRIDE]]
 418   // CHECK:   %[[SRC_END:.+]] = arith.muli %[[END]], %[[STRIDE]]
 419   // CHECK:   %[[PAD_START:.+]] = arith.constant 1
 420   // CHECK:   %[[START_SUB:.+]] = arith.subi %[[SRC_START]], %[[PAD_START]]
 421   // CHECK:   %[[OFFSET:.+]] = arith.minsi %[[START_SUB]], %[[ZERO]]
 422   // CHECK:   %[[START_OFFSET:.+]] = arith.addi %[[KSIZE]], %[[OFFSET]]
 423   // CHECK:   %[[PAD_END:.+]] = arith.constant 1
 424   // CHECK:   %[[END_SUB:.+]] = arith.subi %[[SRC_END]], %[[PAD_END]]
 425   // CHECK:   %[[OFFSET:.+]] = arith.minsi %[[END_SUB]], %[[ZERO]]
 426   // CHECK:   %[[END_OFFSET:.+]] = arith.addi %[[START_OFFSET]], %[[OFFSET]]
 427   // CHECK:   %[[KWIDTH:.+]] = arith.maxsi %[[ONE]], %[[END_OFFSET]]
 428
 429   // Divide the summed value by the number of values summed.
 430   // CHECK:   %[[COUNT:.+]] = arith.muli %[[KHEIGHT]], %[[KWIDTH]]
 431   // CHECK:   %[[CAST:.+]] = arith.index_cast %[[COUNT]]
 432   // CHECK:   %[[FLT:.+]] = arith.sitofp %[[CAST]]
 433   // CHECK:   %[[DIV:.+]] = arith.divf %[[IN]], %[[FLT]]
 434   // CHECK:   %[[TRUNC:.+]] = arith.truncf %[[DIV]]
 435   // CHECK:   linalg.yield %[[TRUNC]]
 436   %0 = tosa.avg_pool2d %arg0 {acc_type = f32, pad = array<i64: 1, 1, 1, 1>, kernel = array<i64: 4, 4>, stride = array<i64: 1, 1>} : (tensor<1x6x34x62xf16>) -> tensor<1x5x33x62xf16>
 437   return %0 : tensor<1x5x33x62xf16>
 438 }
 439
 440 // -----
 441
 442 // CHECK-LABEL: @avg_pool_i8
 443 func.func @avg_pool_i8(%arg0: tensor<1x6x34x62xi8>) -> (tensor<1x5x33x62xi8>) {
       // i8 input with an i32 accumulator. Only the body of the final
       // linalg.generic is verified: the division by the element count is expected
       // to be done with a computed multiplier and shift fed to tosa.apply_scale,
       // followed by a clamp to [-128, 127] and a truncation.
 444   // CHECK: %[[GENERIC:.+]] = linalg.generic
 445   // CHECK-SAME: indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]}
 446   // CHECK-SAME: ins(%[[POOL:.+]] : tensor<1x5x33x62xi32>)
 447   // CHECK-SAME: outs(%[[EMPTY:.+]] : tensor<1x5x33x62xi8>)
 448   // CHECK: ^bb0(%[[IN:.+]]: i32, %{{.+}}: i8)
 449
 450   // Only different behavior is how the division is performed.
 451   // First we compute the mul and shift values for average pool:
 452   // CHECK: %[[COUNT:.+]] = arith.muli %{{[0-9]+}}, %{{[0-9]+}}
 453   // CHECK: %[[ICAST:.+]] = arith.index_cast %[[COUNT]]
 454   // CHECK: %[[C1:.+]] = arith.constant 1
 455   // CHECK: %[[C32:.+]] = arith.constant 32
 456   // CHECK: %[[ISUB:.+]] = arith.subi %[[ICAST]], %[[C1]]
 457   // CHECK: %[[CTLZ:.+]] = math.ctlz %[[ISUB]]
 458   // CHECK: %[[SUB:.+]] = arith.subi %[[C32]], %[[CTLZ]]
 459   // CHECK: %[[EXT:.+]] = arith.extui %[[SUB]]
 460   // CHECK: %[[CBIG:.+]] = arith.constant 1073741825
 461   // CHECK: %[[SHL:.+]] = arith.shli %[[CBIG]], %[[EXT]]
 462   // CHECK: %[[IEXT:.+]] = arith.extui %[[ICAST]]
 463   // CHECK: %[[DIV:.+]] = arith.divui %[[SHL]], %[[IEXT]]
 464   // CHECK: %[[TRUNC_MUL:.+]] = arith.trunci %[[DIV]]
 465   // CHECK: %[[TRUNC_SHIFT:.+]] = arith.trunci %[[SUB]]
 466   // CHECK: %[[C30:.+]] = arith.constant 30
 467   // CHECK: %[[SHIFT:.+]] = arith.addi %[[TRUNC_SHIFT]], %[[C30]] : i8
 468   // CHECK: %[[SCALED:.+]] = tosa.apply_scale %[[IN]], %[[TRUNC_MUL]], %[[SHIFT]] {double_round = false}
 469
 470   // Perform the normalization.
 471   // CHECK: %[[CMIN:.+]] = arith.constant -128
 472   // CHECK: %[[CMAX:.+]] = arith.constant 127
 473   // CHECK: %[[LOW:.+]] = arith.maxsi %[[CMIN]], %[[SCALED]]
 474   // CHECK: %[[CLAMP:.+]] = arith.minsi %[[CMAX]], %[[LOW]]
 475   // CHECK: %[[TRUNC:.+]] = arith.trunci %[[CLAMP]]
 476   // CHECK: linalg.yield %[[TRUNC]]
 477   %0 = tosa.avg_pool2d %arg0 {acc_type = i32, pad = array<i64: 1, 1, 1, 1>, kernel = array<i64: 4, 4>, stride = array<i64: 1, 1>} : (tensor<1x6x34x62xi8>) -> tensor<1x5x33x62xi8>
 478   return %0 : tensor<1x5x33x62xi8>
 479 }
 480
 481 // -----
 482
 483 // CHECK-LABEL: @avg_pool_dyn
 484 func.func @avg_pool_dyn(%arg0: tensor<?x6x34x62xf32>) -> (tensor<?x5x33x62xf32>) {
       // Dynamic batch: the batch size read from dim 0 of %arg0 is expected to size
       // both tensor.empty ops (the sum-pooling accumulator and the output of the
       // final linalg.generic). The generic's body is not verified here.
 485   // CHECK: %[[C0:.+]] = arith.constant 0 : index
 486   // CHECK: %[[BATCH:.+]] = tensor.dim %arg0, %[[C0]]
 487   // CHECK: %[[F0:.+]] = arith.constant 0.000000e+00 : f32
 488   // CHECK: %[[PADDED:.+]] = tensor.pad %arg0 low[0, 1, 1, 0] high[0, 1, 1, 0]
 489   // CHECK:   tensor.yield %[[F0]]
 490   // CHECK: %[[F0:.+]] = arith.constant 0.000000e+00 : f32
 491   // CHECK: %[[EMPTY:.+]] = tensor.empty(%[[BATCH]]) : tensor<?x5x33x62xf32>
 492   // CHECK: %[[FILL:.+]] = linalg.fill ins(%[[F0]] : f32) outs(%[[EMPTY]] : tensor<?x5x33x62xf32>)
 493   // CHECK: %[[KERNEL:.+]] = tensor.empty() : tensor<4x4xf32>
 494   // CHECK: %[[POOL:.+]] = linalg.pooling_nhwc_sum
 495   // CHECK-SAME: dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>
 496   // CHECK-SAME: ins(%[[PADDED]], %[[KERNEL]] : tensor<?x8x36x62xf32>, tensor<4x4xf32>)
 497   // CHECK-SAME: outs(%[[FILL]] : tensor<?x5x33x62xf32>) -> tensor<?x5x33x62xf32>
 498   // CHECK: %[[EMPTY:.+]] = tensor.empty(%[[BATCH]]) : tensor<?x5x33x62xf32>
 499   // CHECK: %[[GENERIC:.+]] = linalg.generic
 500   %0 = tosa.avg_pool2d %arg0 {acc_type = f32, pad = array<i64: 1, 1, 1, 1>, kernel = array<i64: 4, 4>, stride = array<i64: 1, 1>} : (tensor<?x6x34x62xf32>) -> tensor<?x5x33x62xf32>
 501   return %0 : tensor<?x5x33x62xf32>
 502 }
 503
 504 // -----
 505
 506 // CHECK: #[[$MAP1:.+]] = affine_map<(d0, d1, d2, d3) -> (0)>
 507 // CHECK: #[[$MAP2:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
 508
 509 // CHECK-LABEL: @conv2d_scalar_bias_f32
 510 func.func @conv2d_scalar_bias_f32(%input: tensor<1x49x42x27xf32>, %weights: tensor<28x3x3x27xf32>, %bias: tensor<1xf32>) -> () {
       // The bias has a single element (tensor<1xf32>): it is expected to be
       // broadcast over the whole 1x45x40x28 output by a linalg.generic whose input
       // map sends every output index to element 0. %arg2 below is the printed name
       // of the %bias argument.
 511   // CHECK: %[[INIT:.+]] = tensor.empty() : tensor<1x45x40x28xf32>
 512   // CHECK: %[[BROADCAST:.+]] = linalg.generic {indexing_maps = [#[[$MAP1]], #[[$MAP2]]], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2 : tensor<1xf32>) outs(%[[INIT]] : tensor<1x45x40x28xf32>) {
 513   %0 = tosa.conv2d %input, %weights, %bias {pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>, dilation = array<i64: 2, 1>} : (tensor<1x49x42x27xf32>, tensor<28x3x3x27xf32>, tensor<1xf32>) -> tensor<1x45x40x28xf32>
 514   return
 515 }
 516
 517 // -----
 518
 519 // CHECK: #[[$MAP1:.+]] = affine_map<(d0, d1, d2, d3) -> (d3)>
 520 // CHECK: #[[$MAP2:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
 521
 522 // CHECK-LABEL: @conv2d_i8
 523 func.func @conv2d_i8(%input: tensor<1x49x42x27xi8>, %weights: tensor<28x1x1x27xi8>, %bias: tensor<28xi8>) -> () {
       // Quantized i8 convolution, verified under two run lines: the default prefix
       // expects the FHWC kernel layout to be used directly, the HWCF prefix expects
       // the weights to be transposed first. The i8 bias is sign-extended while
       // being broadcast into the i32 accumulator. The two zero-point operands are
       // matched by pattern instead of by their auto-generated SSA names, so the
       // test does not depend on how the printer names constants.
 524   // HWCF: %[[TRANSPOSE_DIMS:.+]] = arith.constant dense<[1, 2, 3, 0]> : tensor<4xi32>
 525   // HWCF: %[[TRANSPOSE:.+]] = linalg.transpose ins(%arg1 : tensor<28x1x1x27xi8>) outs(%[[TRANSPOSEDINIT:.+]] : tensor<1x1x27x28xi8>) permutation = [1, 2, 3, 0]
 526   // CHECK: %[[INIT:.+]] = tensor.empty() : tensor<1x45x40x28xi32>
 527   // CHECK: %[[BROADCAST:.+]] = linalg.generic {indexing_maps = [#[[$MAP1]], #[[$MAP2]]], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2 : tensor<28xi8>) outs(%[[INIT]] : tensor<1x45x40x28xi32>) {
 528   // CHECK:   arith.extsi
 529   // CHECK:   linalg.yield
 530   // CHECK: } -> tensor<1x45x40x28xi32>
 531   // CHECK: linalg.conv_2d_nhwc_fhwc_q {dilations = dense<[2, 1]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%arg0, %arg1, %{{[a-zA-Z0-9_]+}}, %{{[a-zA-Z0-9_]+}} : tensor<1x49x42x27xi8>, tensor<28x1x1x27xi8>, i32, i32) outs(%[[BROADCAST]] : tensor<1x45x40x28xi32>) -> tensor<1x45x40x28xi32>
 532   // HWCF: linalg.conv_2d_nhwc_hwcf_q {dilations = dense<[2, 1]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%arg0, %[[TRANSPOSE]], %{{[a-zA-Z0-9_]+}}, %{{[a-zA-Z0-9_]+}} : tensor<1x49x42x27xi8>, tensor<1x1x27x28xi8>, i32, i32) outs(%{{[a-zA-Z0-9_]*}} : tensor<1x45x40x28xi32>) -> tensor<1x45x40x28xi32>
 533
 534   %0 = tosa.conv2d %input, %weights, %bias {dilation = array<i64: 2, 1>, pad = array<i64: 0, 0, 0, 0>, quantization_info = #tosa.conv_quant<input_zp = 0, weight_zp = 0>, stride = array<i64: 1, 1>} : (tensor<1x49x42x27xi8>, tensor<28x1x1x27xi8>, tensor<28xi8>) -> tensor<1x45x40x28xi32>
 535   return
 536 }
 537
 538 // -----
 539
 540 // CHECK: #[[$MAP1:.+]] = affine_map<(d0, d1, d2, d3) -> (d3)>
 541 // CHECK: #[[$MAP2:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
 542
 543 // CHECK-LABEL: @conv2d_f32
     // Floating-point tosa.conv2d: the bias is broadcast into the result-shaped
     // tensor and that broadcast is the accumulator (outs) of the named conv. Under
     // the hwcf kernel-layout option the kernel is first transposed with permutation
     // [1, 2, 3, 0]. The accumulator is referenced through the captured BROADCAST
     // value rather than a printed SSA number.
 544 func.func @conv2d_f32(%input: tensor<1x49x42x27xf32>, %weights: tensor<28x3x3x27xf32>, %bias: tensor<28xf32>) -> () {
 545   // HWCF: %[[TRANSPOSE_DIMS:.+]] = arith.constant dense<[1, 2, 3, 0]> : tensor<4xi32>
 546   // HWCF: %[[TRANSPOSE:.+]] = linalg.transpose ins(%arg1 : tensor<28x3x3x27xf32>) outs(%[[TRANSPOSEDINIT:.+]] : tensor<3x3x27x28xf32>) permutation = [1, 2, 3, 0]
 547
 548   // CHECK: %[[INIT:.+]] = tensor.empty() : tensor<1x45x40x28xf32>
 549   // CHECK: %[[BROADCAST:.+]] = linalg.generic {indexing_maps = [#[[$MAP1]], #[[$MAP2]]], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2 : tensor<28xf32>) outs(%[[INIT]] : tensor<1x45x40x28xf32>) {
 550   // CHECK:   linalg.yield
 551   // CHECK: } -> tensor<1x45x40x28xf32>
 552   // CHECK: linalg.conv_2d_nhwc_fhwc {dilations = dense<[2, 1]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%arg0, %arg1 : tensor<1x49x42x27xf32>, tensor<28x3x3x27xf32>) outs(%[[BROADCAST]] : tensor<1x45x40x28xf32>) -> tensor<1x45x40x28xf32>
 553
 554   // HWCF: linalg.conv_2d_nhwc_hwcf {dilations = dense<[2, 1]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%arg0, %[[TRANSPOSE]] : tensor<1x49x42x27xf32>, tensor<3x3x27x28xf32>) outs(%{{[a-zA-Z0-9_]*}} : tensor<1x45x40x28xf32>
 555   %0 = tosa.conv2d %input, %weights, %bias {pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>, dilation = array<i64: 2, 1>} : (tensor<1x49x42x27xf32>, tensor<28x3x3x27xf32>, tensor<28xf32>) -> tensor<1x45x40x28xf32>
 556   return
 557 }
 558
 559 // -----
 560
 561 // CHECK: #[[$MAP1:.+]] = affine_map<(d0, d1, d2, d3) -> (d3)>
 562 // CHECK: #[[$MAP2:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
 563
 564 // CHECK-LABEL: @conv2d_dyn
     // tosa.conv2d with a dynamic batch dimension: the batch extent is read from the
     // input with tensor.dim and sizes the result-shaped tensor.empty; the bias is
     // broadcast into it and the broadcast seeds the named conv. Affine maps and SSA
     // values are matched through captures instead of their printed names.
 565 func.func @conv2d_dyn(%input: tensor<?x49x42x27xf32>, %weights: tensor<28x3x3x27xf32>, %bias: tensor<28xf32>) -> () {
 566   // CHECK: %[[C0:.+]] = arith.constant 0 : index
 567   // CHECK: %[[BATCH:.+]] = tensor.dim %arg0, %[[C0]] : tensor<?x49x42x27xf32>
 568   // CHECK: %[[INIT:.+]] = tensor.empty(%[[BATCH]]) : tensor<?x45x40x28xf32>
 569   // CHECK: %[[BROADCAST:.+]] = linalg.generic {indexing_maps = [#[[$MAP1]], #[[$MAP2]]], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2 : tensor<28xf32>) outs(%[[INIT]] : tensor<?x45x40x28xf32>) {
 570   // CHECK: ^bb0(%[[IN:.+]]: f32, %{{.+}}: f32):
 571   // CHECK:   linalg.yield %[[IN]] : f32
 572   // CHECK: } -> tensor<?x45x40x28xf32>
 573   // CHECK: linalg.conv_2d_nhwc_fhwc {dilations = dense<[2, 1]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%arg0, %arg1 : tensor<?x49x42x27xf32>, tensor<28x3x3x27xf32>) outs(%[[BROADCAST]] : tensor<?x45x40x28xf32>) -> tensor<?x45x40x28xf32>
 574   %0 = tosa.conv2d %input, %weights, %bias {pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>, dilation = array<i64: 2, 1>} : (tensor<?x49x42x27xf32>, tensor<28x3x3x27xf32>, tensor<28xf32>) -> tensor<?x45x40x28xf32>
 575   return
 576 }
 577
 578 // -----
 579
 580 // CHECK: #[[$MAP1:.+]] = affine_map<(d0, d1, d2, d3) -> (d3)>
 581 // CHECK: #[[$MAP2:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
 582
 583 // CHECK-LABEL: @conv2d_dyn_w_h
     // tosa.conv2d with dynamic height and width. The expectations below spell out
     // the index arithmetic for each output spatial extent:
     //   ((in + pad_lo + pad_hi) - (dilation * (kernel - 1) + 1)) / stride + 1
     // and then require those extents to size the result-shaped tensor.empty. The
     // conv accumulator is referenced through the captured BROADCAST value rather
     // than a printed SSA number.
 584 func.func @conv2d_dyn_w_h(%input: tensor<1x?x?x27xf32>, %weights: tensor<28x3x3x27xf32>, %bias: tensor<28xf32>) -> () {
 585   // Computing output height
 586   // CHECK: %[[C1:.+]] = arith.constant 1
 587   // CHECK: %[[H:.+]] = tensor.dim %arg0, %[[C1]]
 588   // CHECK: %[[C1_0:.+]] = arith.constant 1
 589   // CHECK: %[[KH:.+]] = tensor.dim %arg1, %[[C1_0]]
 590   // CHECK: %[[ONE:.+]] = arith.constant 1 : index
 591   // CHECK: %[[PAD_0:.+]] = arith.constant 0 : index
 592   // CHECK: %[[ADD_PAD_0:.+]] = arith.addi %[[H]], %[[PAD_0]] : index
 593   // CHECK: %[[PAD_1:.+]] = arith.constant 0 : index
 594   // CHECK: %[[ADD_PAD_1:.+]] = arith.addi %[[ADD_PAD_0]], %[[PAD_1]] : index
 595   // CHECK: %[[SUB_ONE:.+]] = arith.subi %[[KH]], %[[ONE]] : index
 596   // CHECK: %[[DIL_H:.+]] = arith.constant 2 : index
 597   // CHECK: %[[DILATED:.+]] = arith.muli %[[DIL_H]], %[[SUB_ONE]] : index
 598   // CHECK: %[[ADD_ONE:.+]] = arith.addi %[[DILATED]], %[[ONE]] : index
 599   // CHECK: %[[SUBTRACTED:.+]] = arith.subi %[[ADD_PAD_1]], %[[ADD_ONE]] : index
 600   // CHECK: %[[STRIDE_H:.+]] = arith.constant 1 : index
 601   // CHECK: %[[DIVIDED:.+]] = arith.divui %[[SUBTRACTED]], %[[STRIDE_H]] : index
 602   // CHECK: %[[H_OUT:.+]] = arith.addi %[[DIVIDED]], %[[ONE]] : index
 603
 604   // Computing output width
 605   // CHECK: %[[C2:.+]] = arith.constant 2
 606   // CHECK: %[[W:.+]] = tensor.dim %arg0, %[[C2]]
 607   // CHECK: %[[C2_0:.+]] = arith.constant 2
 608   // CHECK: %[[KW:.+]] = tensor.dim %arg1, %[[C2_0]]
 609   // CHECK: %[[ONE_0:.+]] = arith.constant 1 : index
 610   // CHECK: %[[PAD_2:.+]] = arith.constant 0 : index
 611   // CHECK: %[[ADD_PAD_2:.+]] = arith.addi %[[W]], %[[PAD_2]] : index
 612   // CHECK: %[[PAD_3:.+]] = arith.constant 0 : index
 613   // CHECK: %[[ADD_PAD_3:.+]] = arith.addi %[[ADD_PAD_2]], %[[PAD_3]] : index
 614   // CHECK: %[[SUB_ONE_0:.+]] = arith.subi %[[KW]], %[[ONE_0]] : index
 615   // CHECK: %[[DIL_W:.+]] = arith.constant 1 : index
 616   // CHECK: %[[DILATED_0:.+]] = arith.muli %[[DIL_W]], %[[SUB_ONE_0]] : index
 617   // CHECK: %[[ADD_ONE_0:.+]] = arith.addi %[[DILATED_0]], %[[ONE_0]] : index
 618   // CHECK: %[[SUBTRACTED_0:.+]] = arith.subi %[[ADD_PAD_3]], %[[ADD_ONE_0]] : index
 619   // CHECK: %[[STRIDE_W:.+]] = arith.constant 1 : index
 620   // CHECK: %[[DIVIDED_0:.+]] = arith.divui %[[SUBTRACTED_0]], %[[STRIDE_W]] : index
 621   // CHECK: %[[W_OUT:.+]] = arith.addi %[[DIVIDED_0]], %[[ONE_0]] : index
 622
 623   // Running convolution
 624   // CHECK: %[[INIT:.+]] = tensor.empty(%[[H_OUT]], %[[W_OUT]]) : tensor<1x?x?x28xf32>
 625   // CHECK: %[[BROADCAST:.+]] = linalg.generic {indexing_maps = [#[[$MAP1]], #[[$MAP2]]], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2 : tensor<28xf32>) outs(%[[INIT]] : tensor<1x?x?x28xf32>) {
 626   // CHECK:   linalg.yield
 627   // CHECK: } -> tensor<1x?x?x28xf32>
 628   // CHECK: linalg.conv_2d_nhwc_fhwc {dilations = dense<[2, 1]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%arg0, %arg1 : tensor<1x?x?x27xf32>, tensor<28x3x3x27xf32>) outs(%[[BROADCAST]] : tensor<1x?x?x28xf32>) -> tensor<1x?x?x28xf32>
 629
 630   %0 = tosa.conv2d %input, %weights, %bias {pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>, dilation = array<i64: 2, 1>} : (tensor<1x?x?x27xf32>, tensor<28x3x3x27xf32>, tensor<28xf32>) -> tensor<1x?x?x28xf32>
 631   return
 632 }
 633
 634 // -----
 635
 636 // CHECK: #[[$MAP1:.+]] = affine_map<(d0, d1, d2, d3) -> (d3)>
 637 // CHECK: #[[$MAP2:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
 638
     // CHECK-LABEL: @conv2d_dyn_output
     // tosa.conv2d whose input is fully static but whose result has a dynamic batch
     // dimension. The batch extent is expected to be read from the input with
     // tensor.dim and used to size the result-shaped tensor.empty; the bias is then
     // broadcast into that tensor and the broadcast seeds the named conv, as in the
     // other conv2d cases in this file.
     // NOTE(review): these expectations mirror @conv2d_dyn and have not been run
     // against mlir-opt here; confirm with the lit suite.
 639 func.func @conv2d_dyn_output(%input: tensor<2x6x5x4xf32>, %weights: tensor<4x3x3x4xf32>, %bias: tensor<4xf32>) {
 640   // CHECK: %[[C0:.+]] = arith.constant 0 : index
 641   // CHECK: %[[DIM0:.+]] = tensor.dim %arg0, %[[C0]] : tensor<2x6x5x4xf32>
 642   // CHECK: %[[INIT:.+]] = tensor.empty(%[[DIM0]]) : tensor<?x4x3x4xf32>
 643   // CHECK: %[[BROADCAST:.+]] = linalg.generic {indexing_maps = [#[[$MAP1]], #[[$MAP2]]], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2 : tensor<4xf32>) outs(%[[INIT]] : tensor<?x4x3x4xf32>) {
 644   // CHECK:   linalg.yield
 645   // CHECK: } -> tensor<?x4x3x4xf32>
 646
 647   // CHECK: linalg.conv_2d_nhwc_fhwc {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%arg0, %arg1 : tensor<2x6x5x4xf32>, tensor<4x3x3x4xf32>) outs(%[[BROADCAST]] : tensor<?x4x3x4xf32>) -> tensor<?x4x3x4xf32>
 652
 653   %0 = tosa.conv2d %input, %weights, %bias {dilation = array<i64: 1, 1>, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>} : (tensor<2x6x5x4xf32>, tensor<4x3x3x4xf32>, tensor<4xf32>) -> tensor<?x4x3x4xf32>
 654   return
 655 }
 656
 657 // -----
 658
 659 // CHECK-LABEL: @conv2d_padded_f32
     // tosa.conv2d with pad = [1, 1, 1, 1]: the input is expected to be wrapped in a
     // tensor.pad (one element low and high on the two middle dims) whose body yields
     // the zero constant, followed by the named conv.
 660 func.func @conv2d_padded_f32(%input: tensor<1x47x40x28xf32>, %weights: tensor<28x3x3x28xf32>, %bias: tensor<28xf32>) -> () {
 661   // CHECK: %[[C0:.+]] = arith.constant 0
 662   // CHECK: tensor.pad %arg0 low[0, 1, 1, 0] high[0, 1, 1, 0]
 663   // CHECK:   tensor.yield %[[C0]]
 664   // CHECK: linalg.conv_2d_nhwc_fhwc
 665   %0 = tosa.conv2d %input, %weights, %bias {pad = array<i64: 1, 1, 1, 1>, stride = array<i64: 1, 1>, dilation = array<i64: 2, 1>} : (tensor<1x47x40x28xf32>, tensor<28x3x3x28xf32>, tensor<28xf32>) -> tensor<1x45x40x28xf32>
 666   return
 667 }
 668
 669 // -----
 670
 671 // CHECK-LABEL: @conv2d_quant
     // Quantized tosa.conv2d with padding: the tensor.pad body is expected to yield
     // the input zero point (-22, taken from quantization_info) instead of zero, and
     // the quantized named conv is expected afterwards.
 672 func.func @conv2d_quant(%arg0 : tensor<1x12x12x1xi8>, %arg1 : tensor<1024x3x3x1xi8>, %arg2 : tensor<1024xi32>) -> () {
 673   // CHECK:   %[[C22:.+]] = arith.constant -22
 674   // CHECK: tensor.pad %arg0 low[0, 1, 1, 0] high[0, 1, 1, 0]
 675   // CHECK:   tensor.yield %[[C22]]
 676   // CHECK: linalg.conv_2d_nhwc_fhwc_q
 677   %0 = tosa.conv2d %arg0, %arg1, %arg2 {dilation = array<i64: 1, 1>, pad = array<i64: 1, 1, 1, 1>, quantization_info = #tosa.conv_quant<input_zp = -22, weight_zp = 42>, stride = array<i64: 1, 1>} : (tensor<1x12x12x1xi8>, tensor<1024x3x3x1xi8>, tensor<1024xi32>) -> tensor<1x12x12x1024xi32>
 678   return
 679 }
 680
 681 // -----
 682
 683 // CHECK: #[[$MAP0:.*]] = affine_map<(d0, d1, d2, d3) -> (d3)>
 684 // CHECK: #[[$MAP1:.*]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
 685
 686 // CHECK-LABEL: @depthwise_conv
     // tosa.depthwise_conv2d is expected to become linalg.depthwise_conv_2d_nhwc_hwcm
     // accumulating into a filled 5-D tensor (1x5x5x3x11). The last two dims are then
     // collapsed (3x11 -> 33) and the bias is added by a separate linalg.generic that
     // writes into a second tensor.empty.
 687 func.func @depthwise_conv(%arg0 : tensor<1x7x5x3xf32>, %arg1 : tensor<3x1x3x11xf32>, %arg2 : tensor<33xf32>) -> () {
 688   // CHECK: [[INIT:%.+]] = tensor.empty()
 689   // CHECK: [[CST0:%.+]] = arith.constant 0
 690   // CHECK: [[FILL:%.+]] = linalg.fill ins([[CST0]]{{.*}}outs([[INIT]]
 691   // CHECK: [[OUT:%.+]] = tensor.empty()
 692   // CHECK: [[DEPTH:%.+]] = linalg.depthwise_conv_2d_nhwc_hwcm {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%arg0, %arg1 : tensor<1x7x5x3xf32>, tensor<3x1x3x11xf32>) outs([[FILL]] : tensor<1x5x5x3x11xf32>)
 693   // CHECK: [[COLLAPSED:%.+]] = tensor.collapse_shape [[DEPTH]] {{\[}}[0], [1], [2], [3, 4]]
 694   // CHECK: [[BIAS:%.+]] = linalg.generic {indexing_maps = [#[[$MAP0]], #[[$MAP1]], #[[$MAP1]]], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, [[COLLAPSED]] : tensor<33xf32>, tensor<1x5x5x33xf32>) outs([[OUT]] : tensor<1x5x5x33xf32>) {
 695   // CHECK: ^bb0(%[[ARG3:[0-9a-zA-Z_]+]]: f32, %[[ARG4:[0-9a-zA-Z_]+]]: f32, %[[ARG5:[0-9a-zA-Z_]+]]: f32):
 696   // CHECK:   [[ADD:%.+]] = arith.addf %[[ARG3]], %[[ARG4]] : f32
 697   // CHECK:   linalg.yield [[ADD]] : f32
 698   // CHECK: } -> tensor<1x5x5x33xf32>
 699   %2 = tosa.depthwise_conv2d %arg0, %arg1, %arg2 { pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>, dilation = array<i64: 1, 1> } : (tensor<1x7x5x3xf32>, tensor<3x1x3x11xf32>, tensor<33xf32>)  -> tensor<1x5x5x33xf32>
 700   return
 701 }
 702
 703 // -----
 704
 705 // CHECK: #[[$MAP0:.*]] = affine_map<(d0, d1, d2, d3) -> (0)>
 706 // CHECK: #[[$MAP1:.*]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
 707
 708 // CHECK-LABEL: @depthwise_conv_scalar_bias
     // tosa.depthwise_conv2d with a single-element bias (tensor<1xf32>): the bias-add
     // linalg.generic is expected to read the bias through the constant map
     // (d0, d1, d2, d3) -> (0) and add it to the collapsed conv result.
 709 func.func @depthwise_conv_scalar_bias(%arg0 : tensor<1x7x5x3xf32>, %arg1 : tensor<3x1x3x11xf32>, %arg2 : tensor<1xf32>) -> () {
 710   // CHECK: [[BIAS:%.+]] = linalg.generic {indexing_maps = [#[[$MAP0]], #[[$MAP1]], #[[$MAP1]]], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, %{{.*}} : tensor<1xf32>, tensor<1x5x5x33xf32>) outs(%{{.*}} : tensor<1x5x5x33xf32>) {
 711   // CHECK: ^bb0(%[[ARG3:[0-9a-zA-Z_]+]]: f32, %[[ARG4:[0-9a-zA-Z_]+]]: f32, %{{.*}}: f32):
 712   // CHECK:   [[ADD:%.+]] = arith.addf %[[ARG3]], %[[ARG4]] : f32
 713   // CHECK:   linalg.yield [[ADD]] : f32
 714   // CHECK: } -> tensor<1x5x5x33xf32>
 715   %2 = tosa.depthwise_conv2d %arg0, %arg1, %arg2 { pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>, dilation = array<i64: 1, 1> } : (tensor<1x7x5x3xf32>, tensor<3x1x3x11xf32>, tensor<1xf32>)  -> tensor<1x5x5x33xf32>
 716   return
 717 }
 718
 719 // -----
 720
 721 // CHECK: #[[$MAP0:.*]] = affine_map<(d0, d1, d2, d3) -> (d3)>
 722 // CHECK: #[[$MAP1:.*]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
 723
 724 // CHECK-LABEL: @depthwise_conv_dyn
     // tosa.depthwise_conv2d with a dynamic batch dimension: the batch extent is read
     // with tensor.dim and is expected to size both tensor.empty ops (the filled conv
     // accumulator and the bias-add output).
 725 func.func @depthwise_conv_dyn(%arg0 : tensor<?x7x5x3xf32>, %arg1 : tensor<3x1x3x11xf32>, %arg2 : tensor<33xf32>) -> () {
 726   // CHECK: %[[C0:.+]] = arith.constant 0
 727   // CHECK: %[[BATCH:.+]] = tensor.dim %arg0, %[[C0]]
 728   // CHECK: %[[INIT:.+]] = tensor.empty(%[[BATCH]])
 729   // CHECK: %[[CST0:.+]] = arith.constant 0
 730   // CHECK: %[[FILL:.+]] = linalg.fill
 731   // CHECK: %[[OUT:.+]] = tensor.empty(%[[BATCH]])
 732   // CHECK: %[[DEPTH:.+]] = linalg.depthwise_conv_2d_nhwc_hwcm {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%arg0, %arg1 : tensor<?x7x5x3xf32>, tensor<3x1x3x11xf32>) outs(%[[FILL]] : tensor<?x5x5x3x11xf32>)
 733   // CHECK: %[[COLLAPSED:.+]] = tensor.collapse_shape %[[DEPTH]] {{\[}}[0], [1], [2], [3, 4]]
 734   // CHECK: %[[BIAS:.+]] = linalg.generic {indexing_maps = [#[[$MAP0]], #[[$MAP1]], #[[$MAP1]]], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, %[[COLLAPSED]] : tensor<33xf32>, tensor<?x5x5x33xf32>) outs(%[[OUT]] : tensor<?x5x5x33xf32>) {
 735   // CHECK: ^bb0(%[[ARG3:[0-9a-zA-Z_]+]]: f32, %[[ARG4:[0-9a-zA-Z_]+]]: f32, %[[ARG5:[0-9a-zA-Z_]+]]: f32):
 736   // CHECK:   %[[ADD:.+]] = arith.addf %[[ARG3]], %[[ARG4]] : f32
 737   // CHECK:   linalg.yield %[[ADD]] : f32
 738   // CHECK: } -> tensor<?x5x5x33xf32>
 739   %2 = tosa.depthwise_conv2d %arg0, %arg1, %arg2 { pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>, dilation = array<i64: 1, 1> } : (tensor<?x7x5x3xf32>, tensor<3x1x3x11xf32>, tensor<33xf32>)  -> tensor<?x5x5x33xf32>
 740   return
 741 }
 742
 743 // -----
 744
 745 // CHECK: #[[$MAP0:.*]] = affine_map<(d0, d1, d2, d3) -> (d3)>
 746 // CHECK: #[[$MAP1:.*]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
 747
 748 // CHECK-LABEL: @depthwise_conv_strides
     // tosa.depthwise_conv2d with stride = [2, 2]: the named op is expected to carry
     // strides = dense<2>, producing a 5x5 spatial result from the 11x9 input.
 749 func.func @depthwise_conv_strides(%arg0 : tensor<1x11x9x3xf32>, %arg1 : tensor<3x1x3x11xf32>, %arg2 : tensor<33xf32>) -> () {
 750   // CHECK: [[INIT:%.+]] = tensor.empty()
 751   // CHECK: [[CST0:%.+]] = arith.constant 0
 752   // CHECK: [[FILL:%.+]] = linalg.fill ins([[CST0]]{{.*}}outs([[INIT]]
 753   // CHECK: [[OUT:%.+]] = tensor.empty()
 754   // CHECK: [[DEPTH:%.+]] = linalg.depthwise_conv_2d_nhwc_hwcm {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%arg0, %arg1 : tensor<1x11x9x3xf32>, tensor<3x1x3x11xf32>) outs([[FILL]] : tensor<1x5x5x3x11xf32>)
 755   // CHECK: [[COLLAPSED:%.+]] = tensor.collapse_shape [[DEPTH]] {{\[}}[0], [1], [2], [3, 4]]
 756   // CHECK: [[BIAS:%.+]] = linalg.generic {indexing_maps = [#[[$MAP0]], #[[$MAP1]], #[[$MAP1]]], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, [[COLLAPSED]] : tensor<33xf32>, tensor<1x5x5x33xf32>) outs([[OUT]] : tensor<1x5x5x33xf32>) {
 757   // CHECK: ^bb0(%[[ARG3:[0-9a-zA-Z_]+]]: f32, %[[ARG4:[0-9a-zA-Z_]+]]: f32, %[[ARG5:[0-9a-zA-Z_]+]]: f32):
 758   // CHECK:   [[ADD:%.+]] = arith.addf %[[ARG3]], %[[ARG4]] : f32
 759   // CHECK:   linalg.yield [[ADD]] : f32
 760   // CHECK: } -> tensor<1x5x5x33xf32>
 761   %2 = tosa.depthwise_conv2d %arg0, %arg1, %arg2 { pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 2, 2>, dilation = array<i64: 1, 1> } : (tensor<1x11x9x3xf32>, tensor<3x1x3x11xf32>, tensor<33xf32>)  -> tensor<1x5x5x33xf32>
 762   return
 763 }
 764
 765 // -----
 766
 767 // CHECK: #[[$MAP0:.*]] = affine_map<(d0, d1, d2, d3) -> (d3)>
 768 // CHECK: #[[$MAP1:.*]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
 769
 770 // CHECK-LABEL: @depthwise_conv_quant
     // Quantized tosa.depthwise_conv2d with padding: the input is padded with the
     // input zero point (-128); both zero points (-128 and 42) are then passed as
     // extra i32 operands to the quantized named op, and the i32 bias is added with
     // arith.addi after collapsing the last two result dims (4x128 -> 512).
 771 func.func @depthwise_conv_quant(%arg0 : tensor<1x12x12x4xi8>, %arg1 : tensor<3x3x4x128xi8>, %arg2 : tensor<512xi32>) -> () {
 772   // CHECK: [[PADV:%.+]] = arith.constant -128
 773   // CHECK: [[PAD:%.+]] = tensor.pad %arg0 low[0, 1, 1, 0] high[0, 1, 1, 0]
 774   // CHECK:   tensor.yield [[PADV]]
 775
 776   // CHECK: [[INIT:%.+]] = tensor.empty()
 777   // CHECK: [[CST0:%.+]] = arith.constant 0
 778   // CHECK: [[FILL:%.+]] = linalg.fill ins([[CST0]]{{.*}}outs([[INIT]]
 779   // CHECK: [[OUT:%.+]] = tensor.empty()
 780   // CHECK: [[C128:%.+]] = arith.constant -128
 781   // CHECK: [[C42:%.+]] = arith.constant 42
 782   // CHECK: [[DEPTH:%.+]] = linalg.depthwise_conv_2d_nhwc_hwcm_q {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins([[PAD]], %arg1, [[C128]], [[C42]] : tensor<1x14x14x4xi8>, tensor<3x3x4x128xi8>, i32, i32) outs([[FILL]] : tensor<1x12x12x4x128xi32>)
 783   // CHECK: [[COLLAPSED:%.+]] = tensor.collapse_shape [[DEPTH]] {{\[}}[0], [1], [2], [3, 4]]
 784   // CHECK: [[BIAS:%.+]] = linalg.generic {indexing_maps = [#[[$MAP0]], #[[$MAP1]], #[[$MAP1]]], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, [[COLLAPSED]] : tensor<512xi32>, tensor<1x12x12x512xi32>) outs([[OUT]] : tensor<1x12x12x512xi32>) {
 785   // CHECK: ^bb0(%[[ARG3:[0-9a-zA-Z_]+]]: i32, %[[ARG4:[0-9a-zA-Z_]+]]: i32, %[[ARG5:[0-9a-zA-Z_]+]]: i32):
 786   // CHECK:   [[ADD:%.+]] = arith.addi %[[ARG3]], %[[ARG4]] : i32
 787   // CHECK:   linalg.yield [[ADD]] : i32
 788   // CHECK: } -> tensor<1x12x12x512xi32>
 789   %0 = tosa.depthwise_conv2d %arg0, %arg1, %arg2 {pad = array<i64: 1, 1, 1, 1>, quantization_info = #tosa.conv_quant<input_zp = -128, weight_zp = 42>, stride = array<i64: 1, 1>, dilation = array<i64: 1, 1> } : (tensor<1x12x12x4xi8>, tensor<3x3x4x128xi8>, tensor<512xi32>) -> tensor<1x12x12x512xi32>
 790   return
 791 }
 792
 793 // -----
 794
 795 // CHECK: #[[$MAP0:.*]] = affine_map<(d0, d1, d2, d3) -> (d3)>
 796 // CHECK: #[[$MAP1:.*]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
 797
 798 // CHECK-LABEL: @depthwise_conv_quant_dilations
     // Quantized tosa.depthwise_conv2d with dilation = [2, 2] and no padding: the
     // named op is expected to carry dilations = dense<2> and read %arg0 directly,
     // producing a 10x10 spatial result from the 14x14 input.
 799 func.func @depthwise_conv_quant_dilations(%arg0 : tensor<1x14x14x4xi8>, %arg1 : tensor<3x3x4x128xi8>, %arg2 : tensor<512xi32>) -> () {
 800   // CHECK: [[INIT:%.+]] = tensor.empty()
 801   // CHECK: [[CST0:%.+]] = arith.constant 0
 802   // CHECK: [[FILL:%.+]] = linalg.fill ins([[CST0]]{{.*}}outs([[INIT]]
 803   // CHECK: [[OUT:%.+]] = tensor.empty()
 804   // CHECK: [[C128:%.+]] = arith.constant -128
 805   // CHECK: [[C42:%.+]] = arith.constant 42
 806   // CHECK: [[DEPTH:%.+]] = linalg.depthwise_conv_2d_nhwc_hwcm_q {dilations = dense<2> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%arg0, %arg1, [[C128]], [[C42]] : tensor<1x14x14x4xi8>, tensor<3x3x4x128xi8>, i32, i32) outs([[FILL]] : tensor<1x10x10x4x128xi32>)
 807   // CHECK: [[COLLAPSED:%.+]] = tensor.collapse_shape [[DEPTH]] {{\[}}[0], [1], [2], [3, 4]]
 808   // CHECK: [[BIAS:%.+]] = linalg.generic {indexing_maps = [#[[$MAP0]], #[[$MAP1]], #[[$MAP1]]], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, [[COLLAPSED]] : tensor<512xi32>, tensor<1x10x10x512xi32>) outs([[OUT]] : tensor<1x10x10x512xi32>) {
 809   // CHECK: ^bb0(%[[ARG3:[0-9a-zA-Z_]+]]: i32, %[[ARG4:[0-9a-zA-Z_]+]]: i32, %[[ARG5:[0-9a-zA-Z_]+]]: i32):
 810   // CHECK:   [[ADD:%.+]] = arith.addi %[[ARG3]], %[[ARG4]] : i32
 811   // CHECK:   linalg.yield [[ADD]] : i32
 812   // CHECK: } -> tensor<1x10x10x512xi32>
 813   %0 = tosa.depthwise_conv2d %arg0, %arg1, %arg2 {pad = array<i64: 0, 0, 0, 0>, quantization_info = #tosa.conv_quant<input_zp = -128, weight_zp = 42>, stride = array<i64: 1, 1>, dilation = array<i64: 2, 2> } : (tensor<1x14x14x4xi8>, tensor<3x3x4x128xi8>, tensor<512xi32>)  -> tensor<1x10x10x512xi32>
 814   return
 815 }
 816
     // -----

 817 // CHECK-LABEL: @depthwise_conv2d_dyn_w_h
     // tosa.depthwise_conv2d with dynamic height and width, asymmetric padding
     // (pad = [1, 2, 3, 4]), dilation [2, 1] and stride [1, 2]. Expects index
     // arithmetic for the dynamic extents, a tensor.pad with low[0, 1, 3, 0] /
     // high[0, 2, 4, 0], the named depthwise conv on the padded input, and a collapse
     // of the last two result dims. The pad value is matched with a wildcard so the
     // test does not depend on an auto-generated SSA name, and the case is placed in
     // its own split-input-file chunk like the other cases in this file.
 818 func.func @depthwise_conv2d_dyn_w_h(%arg0: tensor<2x?x?x3xf32>, %arg1: tensor<3x6x3x5xf32>, %arg2: tensor<15xf32>) {
 819   // CHECK: arith.addi
 820   // CHECK: arith.subi
 821   // CHECK: arith.muli
 822   // CHECK: arith.divui
 823   // CHECK: %[[PADDED:.+]] = tensor.pad %arg0 low[0, 1, 3, 0] high[0, 2, 4, 0] {
 824   // CHECK: ^bb0(%[[ARG3:[0-9a-zA-Z_]+]]: index, %[[ARG4:[0-9a-zA-Z_]+]]: index, %[[ARG5:[0-9a-zA-Z_]+]]: index, %[[ARG6:[0-9a-zA-Z_]+]]: index):
 825   // CHECK: tensor.yield %{{[a-zA-Z0-9_]+}} : f32
 826   // CHECK:  } : tensor<2x?x?x3xf32> to tensor<2x?x?x3xf32>
 827   // CHECK: %[[CONV:.+]] = linalg.depthwise_conv_2d_nhwc_hwcm {dilations = dense<[2, 1]> : tensor<2xi64>, strides = dense<[1, 2]> : tensor<2xi64>} ins(%[[PADDED]], %arg1 : tensor<2x?x?x3xf32>, tensor<3x6x3x5xf32>) outs(%{{.*}} : tensor<2x?x?x3x5xf32>) -> tensor<2x?x?x3x5xf32>
 828   // CHECK: %[[COLLAPSED:.+]] = tensor.collapse_shape %[[CONV]] {{\[}}[0], [1], [2], [3, 4]]
 829   %0 = tosa.depthwise_conv2d %arg0, %arg1, %arg2 {pad = array<i64: 1, 2, 3, 4>, dilation = array<i64: 2, 1>, stride = array<i64: 1, 2>} : (tensor<2x?x?x3xf32>, tensor<3x6x3x5xf32>, tensor<15xf32>) -> tensor<2x?x?x15xf32>
 830   return
 831 }
 832
 833 // -----
 834
 835 // CHECK: #[[$MAP1:.+]] = affine_map<(d0, d1, d2, d3, d4) -> (d4)>
 836 // CHECK: #[[$MAP2:.+]] = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3, d4)>
 837
 838 // CHECK-LABEL: @conv3d_f32
     // Floating-point tosa.conv3d: the kernel is transposed with permutation
     // [1, 2, 3, 4, 0], the bias is broadcast into the result-shaped tensor.empty,
     // and that broadcast seeds linalg.conv_3d_ndhwc_dhwcf. The broadcast's output
     // operand is referenced through the captured INIT value rather than a printed
     // SSA number.
 839 func.func @conv3d_f32(%input: tensor<1x49x48x47x27xf32>, %weights: tensor<28x3x4x5x27xf32>, %bias: tensor<28xf32>) -> () {
 840   // CHECK-DAG:  %[[PERMS:.+]] = arith.constant dense<[1, 2, 3, 4, 0]>
 841   // CHECK-DAG:  %[[TRANSPOSE:.+]] = linalg.transpose ins(%arg1 : tensor<28x3x4x5x27xf32>) outs(%[[TRANSPOSEDINIT:.+]] : tensor<3x4x5x27x28xf32>) permutation = [1, 2, 3, 4, 0]
 842   // CHECK-DAG:  %[[INIT:.+]] = tensor.empty() : tensor<1x47x45x43x28xf32>
 843   // CHECK:      %[[BROADCAST:.+]] = linalg.generic
 844   // CHECK-SAME: {indexing_maps = [#[[$MAP1]], #[[$MAP2]]], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel"]}
 845   // CHECK-SAME: ins(%arg2 : tensor<28xf32>) outs(%[[INIT]] : tensor<1x47x45x43x28xf32>) {
 846   // CHECK:      ^bb0(%[[IN:.+]]: f32, %[[OUT:.+]]: f32):
 847   // CHECK:        linalg.yield %[[IN]] : f32
 848   // CHECK:      } -> tensor<1x47x45x43x28xf32>
 849   // CHECK:      linalg.conv_3d_ndhwc_dhwcf
 850   // CHECK-SAME: {dilations = dense<1> : tensor<3xi64>, strides = dense<1> : tensor<3xi64>}
 851   // CHECK-SAME: ins(%arg0, %[[TRANSPOSE]] : tensor<1x49x48x47x27xf32>, tensor<3x4x5x27x28xf32>)
 852   // CHECK-SAME: outs(%[[BROADCAST]] : tensor<1x47x45x43x28xf32>) -> tensor<1x47x45x43x28xf32>
 853   %0 = tosa.conv3d %input, %weights, %bias {pad = array<i64: 0, 0, 0, 0, 0, 0>, stride = array<i64: 1, 1, 1>, dilation = array<i64: 1, 1, 1>} : (tensor<1x49x48x47x27xf32>, tensor<28x3x4x5x27xf32>, tensor<28xf32>)  -> tensor<1x47x45x43x28xf32>
 854   return
 855 }
 856
 857 // -----
 858
 859 // CHECK: #[[$MAP1:.+]] = affine_map<(d0, d1, d2, d3, d4) -> (0)>
 860 // CHECK: #[[$MAP2:.+]] = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3, d4)>
 861
 862 // CHECK-LABEL: @conv3d_scalar_bias_f32
     // tosa.conv3d with a single-element bias (tensor<1xf32>): the broadcast
     // linalg.generic is expected to read the bias through the constant map
     // (d0, d1, d2, d3, d4) -> (0).
 863 func.func @conv3d_scalar_bias_f32(%input: tensor<1x49x48x47x27xf32>, %weights: tensor<28x3x4x5x27xf32>, %bias: tensor<1xf32>) -> () {
 864   // CHECK:  %[[INIT:.+]] = tensor.empty() : tensor<1x47x45x43x28xf32>
 865   // CHECK:      %[[BROADCAST:.+]] = linalg.generic
 866   // CHECK-SAME: {indexing_maps = [#[[$MAP1]], #[[$MAP2]]], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel"]}
 867   %0 = tosa.conv3d %input, %weights, %bias {pad = array<i64: 0, 0, 0, 0, 0, 0>, stride = array<i64: 1, 1, 1>, dilation = array<i64: 1, 1, 1>} : (tensor<1x49x48x47x27xf32>, tensor<28x3x4x5x27xf32>, tensor<1xf32>)  -> tensor<1x47x45x43x28xf32>
 868   return
 869 }
 870
 871 // -----
 872
 873 // CHECK: #[[$MAP1:.+]] = affine_map<(d0, d1, d2, d3, d4) -> (d4)>
 874 // CHECK: #[[$MAP2:.+]] = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3, d4)>
 875
 876 // CHECK-LABEL: @conv3d_i8
     // Quantized tosa.conv3d: the kernel is transposed with permutation
     // [1, 2, 3, 4, 0], the i32 bias is broadcast into the result-shaped tensor.empty,
     // and the zero points from quantization_info (-128 and 42) are passed as i32
     // operands to linalg.conv_3d_ndhwc_dhwcf_q, which accumulates into the broadcast.
 877 func.func @conv3d_i8(%input: tensor<1x49x48x47x27xi8>, %weights: tensor<28x3x4x5x27xi8>, %bias: tensor<28xi32>) -> () {
 878   // CHECK-DAG:  %[[PERMS:.+]] = arith.constant dense<[1, 2, 3, 4, 0]>
 879   // CHECK-DAG:  %[[TRANSPOSE:.+]] = linalg.transpose ins(%arg1 : tensor<28x3x4x5x27xi8>) outs(%[[TRANSPOSEDINIT:.+]] : tensor<3x4x5x27x28xi8>) permutation = [1, 2, 3, 4, 0]
 880   // CHECK-DAG:  %[[INIT:.+]] = tensor.empty() : tensor<1x47x45x43x28xi32>
 881   // CHECK:      %[[BROADCAST:.+]] = linalg.generic
 882   // CHECK-SAME: {indexing_maps = [#[[$MAP1]], #[[$MAP2]]], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel"]}
 883   // CHECK-SAME: ins(%arg2 : tensor<28xi32>)
 884   // CHECK-SAME: outs(%[[INIT]] : tensor<1x47x45x43x28xi32>) {
 885   // CHECK:      ^bb0(%[[IN:.+]]: i32, %[[OUT:.+]]: i32):
 886   // CHECK:        linalg.yield %[[IN]] : i32
 887   // CHECK:      } -> tensor<1x47x45x43x28xi32>
 888   // CHECK:      %[[IZP:.+]] = arith.constant -128 : i32
 889   // CHECK:      %[[FZP:.+]] = arith.constant 42 : i32
 890   // CHECK:      linalg.conv_3d_ndhwc_dhwcf_q
 891   // CHECK-SAME: {dilations = dense<1> : tensor<3xi64>, strides = dense<1> : tensor<3xi64>}
 892   // CHECK-SAME: ins(%arg0, %[[TRANSPOSE]], %[[IZP]], %[[FZP]] : tensor<1x49x48x47x27xi8>, tensor<3x4x5x27x28xi8>, i32, i32)
 893   // CHECK-SAME: outs(%[[BROADCAST]] : tensor<1x47x45x43x28xi32>) -> tensor<1x47x45x43x28xi32>
 894
 895   %0 = tosa.conv3d %input, %weights, %bias {pad = array<i64: 0, 0, 0, 0, 0, 0>, quantization_info = #tosa.conv_quant<input_zp = -128, weight_zp = 42>, stride = array<i64: 1, 1, 1>, dilation = array<i64: 1, 1, 1>} : (tensor<1x49x48x47x27xi8>, tensor<28x3x4x5x27xi8>, tensor<28xi32>)  -> tensor<1x47x45x43x28xi32>
 896   return
 897 }
 898
 899 // -----
 900
 901 // CHECK-LABEL: @test_transpose
 902 // CHECK-SAME: (%[[ARG0:.+]]: tensor<1x2x3xi32>)
     // Fully static tosa.transpose with constant permutation [1, 2, 0]: expected to
     // become a linalg.transpose writing into a tensor.empty of the permuted shape.
 903 func.func @test_transpose(%arg0: tensor<1x2x3xi32>) -> () {
 904   %0 = arith.constant dense<[1, 2, 0]> : tensor<3xi32>
 905   // CHECK: %[[INIT:.+]] = tensor.empty() : tensor<2x3x1xi32>
 906   // CHECK: %[[TRANSPOSE:.+]] = linalg.transpose ins(%[[ARG0]] : tensor<1x2x3xi32>) outs(%[[INIT]] : tensor<2x3x1xi32>) permutation = [1, 2, 0]
 907   %1 = tosa.transpose %arg0, %0 : (tensor<1x2x3xi32>, tensor<3xi32>)  -> tensor<2x3x1xi32>
 908   return
 909 }
 910
 911 // -----
 912
 913 // CHECK-LABEL: @test_transpose_dyn
 914 // CHECK-SAME: (%[[ARG0:.+]]: tensor<1x?x3x4xi32>)
     // tosa.transpose with one dynamic input dimension (dim 1): its extent is read
     // with tensor.dim and is expected to size the tensor.empty that the
     // linalg.transpose writes into.
 915 func.func @test_transpose_dyn(%arg0: tensor<1x?x3x4xi32>) -> () {
 916   %0 = arith.constant dense<[1, 3, 0, 2]> : tensor<4xi32>
 917   // CHECK: %[[C1:.+]] = arith.constant 1
 918   // CHECK: %[[DIM:.+]] = tensor.dim %[[ARG0]], %[[C1]]
 919   // CHECK: %[[INIT:.+]] = tensor.empty(%[[DIM]]) : tensor<?x4x1x3xi32>
 920   // CHECK: %[[TRANSPOSE:.+]] = linalg.transpose ins(%[[ARG0]] : tensor<1x?x3x4xi32>) outs(%[[INIT]] : tensor<?x4x1x3xi32>) permutation = [1, 3, 0, 2]
 921   %1 = tosa.transpose %arg0, %0 : (tensor<1x?x3x4xi32>, tensor<4xi32>)  -> tensor<?x4x1x3xi32>
 922   return
 923 }
 924
 925 // -----
 926
 927 // CHECK-LABEL: @test_transpose_dyn_multiple_2d
 928 // CHECK-SAME: (%[[ARG0:.+]]: tensor<?x?xf32>)
     // 2-D tosa.transpose with both dimensions dynamic: both extents are read with
     // tensor.dim (matched in any order) and are expected to be passed to
     // tensor.empty in permuted order (dim 1, then dim 0).
 929 func.func @test_transpose_dyn_multiple_2d(%arg0: tensor<?x?xf32>) -> () {
 930   %0 = arith.constant dense<[1, 0]> : tensor<2xi32>
 931   // CHECK-DAG: %[[C0:.+]] = arith.constant 0
 932   // CHECK-DAG: %[[DIM0:.+]] = tensor.dim %[[ARG0]], %[[C0]]
 933   // CHECK-DAG: %[[C1:.+]] = arith.constant 1
 934   // CHECK-DAG: %[[DIM1:.+]] = tensor.dim %[[ARG0]], %[[C1]]
 935   // CHECK: %[[INIT:.+]] = tensor.empty(%[[DIM1]], %[[DIM0]])
 936   // CHECK: %[[TRANSPOSE:.+]] = linalg.transpose ins(%[[ARG0]] : tensor<?x?xf32>) outs(%[[INIT]] : tensor<?x?xf32>) permutation = [1, 0]
 937   %1 = tosa.transpose %arg0, %0 : (tensor<?x?xf32>, tensor<2xi32>)  -> tensor<?x?xf32>
 938   return
 939 }
 940
 941 // -----
 942
 943 // CHECK-LABEL: @test_transpose_dyn_multiple_3d
 944 // CHECK-SAME: (%[[ARG0:.+]]: tensor<?x?x?xf32>)
     // 3-D tosa.transpose with all dimensions dynamic and permutation [2, 0, 1]: the
     // three extents are read with tensor.dim (matched in any order) and are expected
     // to be passed to tensor.empty as (dim 2, dim 0, dim 1). The op under test is
     // written in the generic quoted form.
 945 func.func @test_transpose_dyn_multiple_3d(%arg0: tensor<?x?x?xf32>) {
 946   %0 = arith.constant dense<[2, 0, 1]> : tensor<3xi32>
 947   // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
 948   // CHECK-DAG: %[[DIM0:.*]] = tensor.dim %[[ARG0]], %[[C0]] : tensor<?x?x?xf32>
 949   // CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
 950   // CHECK-DAG: %[[DIM1:.*]] = tensor.dim %[[ARG0]], %[[C1]] : tensor<?x?x?xf32>
 951   // CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index
 952   // CHECK-DAG: %[[DIM2:.*]] = tensor.dim %[[ARG0]], %[[C2]] : tensor<?x?x?xf32>
 953   // CHECK: %[[INIT:.*]] = tensor.empty(%[[DIM2]], %[[DIM0]], %[[DIM1]]) : tensor<?x?x?xf32>
 954   // CHECK: %[[TRANSPOSE:.*]] = linalg.transpose ins(%[[ARG0]] : tensor<?x?x?xf32>) outs(%[[INIT]] : tensor<?x?x?xf32>) permutation = [2, 0, 1]
 955   %1 = "tosa.transpose"(%arg0, %0) : (tensor<?x?x?xf32>, tensor<3xi32>) -> tensor<?x?x?xf32>
 956   return
 957 }