mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_reductions.mlir

   1 // RUN: mlir-opt %s \
   2 // RUN:   --linalg-generalize-named-ops --linalg-fuse-elementwise-ops \
   3 // RUN:   --sparsification --sparse-tensor-conversion \
   4 // RUN:   --convert-vector-to-scf --convert-scf-to-std \
   5 // RUN:   --func-bufferize --tensor-constant-bufferize --tensor-bufferize \
   6 // RUN:   --std-bufferize --finalizing-bufferize --lower-affine \
   7 // RUN:   --convert-vector-to-llvm --convert-memref-to-llvm \
   8 // RUN:   --convert-std-to-llvm --reconcile-unrealized-casts | \
   9 // RUN: mlir-cpu-runner \
  10 // RUN:  -e entry -entry-point-result=void  \
  11 // RUN:  -shared-libs=%mlir_integration_test_dir/libmlir_c_runner_utils%shlibext | \
  12 // RUN: FileCheck %s
  13 //
  14 // Do the same run, but now with SIMDization as well. This should not change the outcome.
  15 //
  16 // RUN: mlir-opt %s \
  17 // RUN:   --linalg-generalize-named-ops --linalg-fuse-elementwise-ops \
  18 // RUN:   --sparsification="vectorization-strategy=2 vl=8" --sparse-tensor-conversion \
  19 // RUN:   --convert-vector-to-scf --convert-scf-to-std \
  20 // RUN:   --func-bufferize --tensor-constant-bufferize --tensor-bufferize \
  21 // RUN:   --std-bufferize --finalizing-bufferize --lower-affine \
  22 // RUN:   --convert-vector-to-llvm --convert-memref-to-llvm \
  23 // RUN:   --convert-std-to-llvm --reconcile-unrealized-casts | \
  24 // RUN: mlir-cpu-runner \
  25 // RUN:  -e entry -entry-point-result=void  \
  26 // RUN:  -shared-libs=%mlir_integration_test_dir/libmlir_c_runner_utils%shlibext | \
  27 // RUN: FileCheck %s
  28
  29 #SV = #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>
  30 #DV = #sparse_tensor.encoding<{ dimLevelType = [ "dense"      ] }>
     // The two encodings above are both one-dimensional and differ only in the
     // level type of that dimension: "compressed" for #SV, "dense" for #DV.
  31
     // Trait shared by the linalg.generic kernels below: one input indexed by i,
     // one output with an empty result map (a scalar), and a single "reduction"
     // iterator.
  32 #trait_reduction = {
  33   indexing_maps = [
  34     affine_map<(i) -> (i)>,  // a
  35     affine_map<(i) -> ()>    // x (scalar out)
  36   ],
  37   iterator_types = ["reduction"],
  38   doc = "x += OPER_i a(i)"
  39 }
  40
  41 // An example of vector reductions.
  42 module {
  43
  44   func @sum_reduction_i32(%arga: tensor<32xi32, #SV>,
  45                           %argx: tensor<i32>) -> tensor<i32> {
         // Integer sum reduction: folds the elements of the #SV-encoded vector
         // %arga into the rank-0 tensor %argx with arith.addi and returns the
         // resulting rank-0 tensor.
  46     %sum = linalg.generic #trait_reduction
  47       ins(%arga: tensor<32xi32, #SV>)
  48       outs(%argx: tensor<i32>) {
  49         ^bb0(%elem: i32, %acc: i32):
               // %acc is the running value taken from the output operand.
  50           %next = arith.addi %acc, %elem : i32
  51           linalg.yield %next : i32
  52     } -> tensor<i32>
  53     return %sum : tensor<i32>
  54   }
  55
  56   func @sum_reduction_f32(%arga: tensor<32xf32, #SV>,
  57                           %argx: tensor<f32>) -> tensor<f32> {
         // Floating-point sum reduction: folds the elements of the #SV-encoded
         // vector %arga into the rank-0 tensor %argx with arith.addf and returns
         // the resulting rank-0 tensor.
  58     %sum = linalg.generic #trait_reduction
  59       ins(%arga: tensor<32xf32, #SV>)
  60       outs(%argx: tensor<f32>) {
  61         ^bb0(%elem: f32, %acc: f32):
               // %acc is the running value taken from the output operand.
  62           %next = arith.addf %acc, %elem : f32
  63           linalg.yield %next : f32
  64     } -> tensor<f32>
  65     return %sum : tensor<f32>
  66   }
  67
  68   func @prod_reduction_i32(%arga: tensor<32xi32, #DV>,
  69                            %argx: tensor<i32>) -> tensor<i32> {
         // Integer product reduction: folds the elements of the #DV-encoded
         // vector %arga into the rank-0 tensor %argx with arith.muli and returns
         // the resulting rank-0 tensor.
  70     %prod = linalg.generic #trait_reduction
  71       ins(%arga: tensor<32xi32, #DV>)
  72       outs(%argx: tensor<i32>) {
  73         ^bb0(%elem: i32, %acc: i32):
               // %acc is the running value taken from the output operand.
  74           %next = arith.muli %acc, %elem : i32
  75           linalg.yield %next : i32
  76     } -> tensor<i32>
  77     return %prod : tensor<i32>
  78   }
  79
  80   func @prod_reduction_f32(%arga: tensor<32xf32, #DV>,
  81                            %argx: tensor<f32>) -> tensor<f32> {
         // Floating-point product reduction: folds the elements of the
         // #DV-encoded vector %arga into the rank-0 tensor %argx with arith.mulf
         // and returns the resulting rank-0 tensor.
  82     %prod = linalg.generic #trait_reduction
  83       ins(%arga: tensor<32xf32, #DV>)
  84       outs(%argx: tensor<f32>) {
  85         ^bb0(%elem: f32, %acc: f32):
               // %acc is the running value taken from the output operand.
  86           %next = arith.mulf %acc, %elem : f32
  87           linalg.yield %next : f32
  88     } -> tensor<f32>
  89     return %prod : tensor<f32>
  90   }
  91
  92   func @and_reduction_i32(%arga: tensor<32xi32, #DV>,
  93                           %argx: tensor<i32>) -> tensor<i32> {
         // Bitwise-AND reduction: folds the elements of the #DV-encoded vector
         // %arga into the rank-0 tensor %argx with arith.andi and returns the
         // resulting rank-0 tensor.
  94     %conj = linalg.generic #trait_reduction
  95       ins(%arga: tensor<32xi32, #DV>)
  96       outs(%argx: tensor<i32>) {
  97         ^bb0(%elem: i32, %acc: i32):
               // %acc is the running value taken from the output operand.
  98           %next = arith.andi %acc, %elem : i32
  99           linalg.yield %next : i32
 100     } -> tensor<i32>
 101     return %conj : tensor<i32>
 102   }
 103
 104   func @or_reduction_i32(%arga: tensor<32xi32, #SV>,
 105                          %argx: tensor<i32>) -> tensor<i32> {
         // Bitwise-OR reduction: folds the elements of the #SV-encoded vector
         // %arga into the rank-0 tensor %argx with arith.ori and returns the
         // resulting rank-0 tensor.
 106     %disj = linalg.generic #trait_reduction
 107       ins(%arga: tensor<32xi32, #SV>)
 108       outs(%argx: tensor<i32>) {
 109         ^bb0(%elem: i32, %acc: i32):
               // %acc is the running value taken from the output operand.
 110           %next = arith.ori %acc, %elem : i32
 111           linalg.yield %next : i32
 112     } -> tensor<i32>
 113     return %disj : tensor<i32>
 114   }
 115
 116   func @xor_reduction_i32(%arga: tensor<32xi32, #SV>,
 117                           %argx: tensor<i32>) -> tensor<i32> {
         // Bitwise-XOR reduction: folds the elements of the #SV-encoded vector
         // %arga into the rank-0 tensor %argx with arith.xori and returns the
         // resulting rank-0 tensor.
 118     %parity = linalg.generic #trait_reduction
 119       ins(%arga: tensor<32xi32, #SV>)
 120       outs(%argx: tensor<i32>) {
 121         ^bb0(%elem: i32, %acc: i32):
               // %acc is the running value taken from the output operand.
 122           %next = arith.xori %acc, %elem : i32
 123           linalg.yield %next : i32
 124     } -> tensor<i32>
 125     return %parity : tensor<i32>
 126   }
 127
 128   func @dump_i32(%arg0 : memref<i32>) {
         // Loads the single i32 held by the rank-0 memref and prints it.
 129     %scalar = memref.load %arg0[] : memref<i32>
 130     vector.print %scalar : i32
 131     return
 132   }
 133
 134   func @dump_f32(%arg0 : memref<f32>) {
         // Loads the single f32 held by the rank-0 memref and prints it.
 135     %scalar = memref.load %arg0[] : memref<f32>
 136     vector.print %scalar : f32
 137     return
 138   }
 139
 140   func @entry() {
         // Test driver: builds the inputs, calls the seven reduction kernels,
         // prints each scalar result, and finally releases the converted tensors
         // and deallocates the result buffers.
         //
         // %ri and %rf are the rank-0 tensors passed as the %argx operand of the
         // i32 and f32 kernels respectively.
 141     %ri = arith.constant dense< 7   > : tensor<i32>
 142     %rf = arith.constant dense< 2.0 > : tensor<f32>
 143
         // Mostly-zero inputs; these are converted to the #SV encoding below.
 144     %c_0_i32 = arith.constant dense<[
 145       0, 2, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 4, 0, 0, 0,
 146       0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 0
 147     ]> : tensor<32xi32>
 148
 149     %c_0_f32 = arith.constant dense<[
 150       0.0, 1.0, 0.0, 0.0, 4.0, 0.0, 0.0, 0.0,
 151       0.0, 0.0, 3.0, 0.0, 0.0, 0.0, 0.0, 0.0,
 152       0.0, 0.0, 0.0, 0.0, 2.5, 0.0, 0.0, 0.0,
 153       2.0, 0.0, 0.0, 0.0, 0.0, 4.0, 0.0, 9.0
 154     ]> : tensor<32xf32>
 155
         // Mostly-one inputs; these are converted to the #DV encoding below.
 156     %c_1_i32 = arith.constant dense<[
 157       1, 1, 7, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 158       1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 7, 3
 159     ]> : tensor<32xi32>
 160
 161     %c_1_f32 = arith.constant dense<[
 162       1.0, 1.0, 1.0, 3.5, 1.0, 1.0, 1.0, 1.0,
 163       1.0, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0,
 164       1.0, 1.0, 1.0, 1.0, 3.0, 1.0, 1.0, 1.0,
 165       1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 4.0
 166     ]> : tensor<32xf32>
 167
 168     // Convert constants to annotated tensors.
 169     %sparse_input_i32 = sparse_tensor.convert %c_0_i32
 170       : tensor<32xi32> to tensor<32xi32, #SV>
 171     %sparse_input_f32 = sparse_tensor.convert %c_0_f32
 172       : tensor<32xf32> to tensor<32xf32, #SV>
 173     %dense_input_i32 = sparse_tensor.convert %c_1_i32
 174       : tensor<32xi32> to tensor<32xi32, #DV>
 175     %dense_input_f32 = sparse_tensor.convert %c_1_f32
 176       : tensor<32xf32> to tensor<32xf32, #DV>
 177
 178     // Call the kernels.
         // Sum, or and xor run on the #SV inputs; product and and run on the #DV
         // inputs. Every call receives %ri or %rf as its second operand.
 179     %0 = call @sum_reduction_i32(%sparse_input_i32, %ri)
 180        : (tensor<32xi32, #SV>, tensor<i32>) -> tensor<i32>
 181     %1 = call @sum_reduction_f32(%sparse_input_f32, %rf)
 182        : (tensor<32xf32, #SV>, tensor<f32>) -> tensor<f32>
 183     %2 = call @prod_reduction_i32(%dense_input_i32, %ri)
 184        : (tensor<32xi32, #DV>, tensor<i32>) -> tensor<i32>
 185     %3 = call @prod_reduction_f32(%dense_input_f32, %rf)
 186        : (tensor<32xf32, #DV>, tensor<f32>) -> tensor<f32>
 187     %4 = call @and_reduction_i32(%dense_input_i32, %ri)
 188        : (tensor<32xi32, #DV>, tensor<i32>) -> tensor<i32>
 189     %5 = call @or_reduction_i32(%sparse_input_i32, %ri)
 190        : (tensor<32xi32, #SV>, tensor<i32>) -> tensor<i32>
 191     %6 = call @xor_reduction_i32(%sparse_input_i32, %ri)
 192        : (tensor<32xi32, #SV>, tensor<i32>) -> tensor<i32>
 193
         // Expected output, one line per dump call in the order used below:
         //   sum  i32 = 7 + (2 + 1 + 4 + 3 + 9)              -> 26
         //   sum  f32 = 2.0 + (1 + 4 + 3 + 2.5 + 2 + 4 + 9)  -> 27.5
         //   prod i32 = 7 * (7 * 3 * 7 * 3)                  -> 3087
         //   prod f32 = 2.0 * (3.5 * 2 * 3 * 4)              -> 168
         //   and  i32 = 7 & (AND of 1s, 3s and 7s)           -> 1
         //   or   i32 = 7 | 2 | 1 | 4 | 3 | 9                -> 15
         //   xor  i32 = 7 ^ 2 ^ 1 ^ 4 ^ 3 ^ 9                -> 10
 194     // Verify results.
 195     //
 196     // CHECK: 26
 197     // CHECK: 27.5
 198     // CHECK: 3087
 199     // CHECK: 168
 200     // CHECK: 1
 201     // CHECK: 15
 202     // CHECK: 10
 203     //
 204     %m0 = bufferization.to_memref %0 : memref<i32>
 205     call @dump_i32(%m0) : (memref<i32>) -> ()
 206     %m1 = bufferization.to_memref %1 : memref<f32>
 207     call @dump_f32(%m1) : (memref<f32>) -> ()
 208     %m2 = bufferization.to_memref %2 : memref<i32>
 209     call @dump_i32(%m2) : (memref<i32>) -> ()
 210     %m3 = bufferization.to_memref %3 : memref<f32>
 211     call @dump_f32(%m3) : (memref<f32>) -> ()
 212     %m4 = bufferization.to_memref %4 : memref<i32>
 213     call @dump_i32(%m4) : (memref<i32>) -> ()
 214     %m5 = bufferization.to_memref %5 : memref<i32>
 215     call @dump_i32(%m5) : (memref<i32>) -> ()
 216     %m6 = bufferization.to_memref %6 : memref<i32>
 217     call @dump_i32(%m6) : (memref<i32>) -> ()
 218
 219     // Release the resources.
         // The four converted tensors go through sparse_tensor.release; the seven
         // result buffers obtained via bufferization.to_memref go through
         // memref.dealloc.
 220     sparse_tensor.release %sparse_input_i32 : tensor<32xi32, #SV>
 221     sparse_tensor.release %sparse_input_f32 : tensor<32xf32, #SV>
 222     sparse_tensor.release %dense_input_i32  : tensor<32xi32, #DV>
 223     sparse_tensor.release %dense_input_f32  : tensor<32xf32, #DV>
 224     memref.dealloc %m0 : memref<i32>
 225     memref.dealloc %m1 : memref<f32>
 226     memref.dealloc %m2 : memref<i32>
 227     memref.dealloc %m3 : memref<f32>
 228     memref.dealloc %m4 : memref<i32>
 229     memref.dealloc %m5 : memref<i32>
 230     memref.dealloc %m6 : memref<i32>
 231
 232     return
 233   }
 234 }