mlir/test/Integration/Dialect/Async/CPU/microbench-linalg-async-parallel-for.mlir

   1 // RUN:   mlir-opt %s                                                          \
   2 // RUN:               -convert-linalg-to-parallel-loops                        \
   3 // RUN:               -async-parallel-for                                      \
   4 // RUN:               -async-to-async-runtime                                  \
   5 // RUN:               -async-runtime-ref-counting                              \
   6 // RUN:               -async-runtime-ref-counting-opt                          \
   7 // RUN:               -convert-async-to-llvm                                   \
   8 // RUN:               -convert-scf-to-cf                                       \
   9 // RUN:               -arith-expand                                            \
  10 // RUN:               -memref-expand                                           \
  11 // RUN:               -convert-vector-to-llvm                                  \
  12 // RUN:               -finalize-memref-to-llvm                                 \
  13 // RUN:               -convert-func-to-llvm                                    \
  14 // RUN:               -convert-arith-to-llvm                                   \
  15 // RUN:               -convert-cf-to-llvm                                      \
  16 // RUN:               -reconcile-unrealized-casts                              \
  17 // RUN: | mlir-cpu-runner                                                      \
  18 // RUN: -e entry -entry-point-result=void -O3                                  \
  19 // RUN: -shared-libs=%mlir_runner_utils  \
  20 // RUN: -shared-libs=%mlir_c_runner_utils\
  21 // RUN: -shared-libs=%mlir_async_runtime \
  22 // RUN: | FileCheck %s --dump-input=always
  23
  24 // RUN:   mlir-opt %s                                                          \
  25 // RUN:               -convert-linalg-to-loops                                 \
  26 // RUN:               -convert-scf-to-cf                                       \
  27 // RUN:               -convert-vector-to-llvm                                  \
  28 // RUN:               -finalize-memref-to-llvm                                 \
  29 // RUN:               -convert-func-to-llvm                                    \
  30 // RUN:               -convert-arith-to-llvm                                   \
  31 // RUN:               -convert-cf-to-llvm                                      \
  32 // RUN:               -reconcile-unrealized-casts                              \
  33 // RUN: | mlir-cpu-runner                                                      \
  34 // RUN: -e entry -entry-point-result=void -O3                                  \
  35 // RUN: -shared-libs=%mlir_runner_utils  \
  36 // RUN: -shared-libs=%mlir_c_runner_utils\
  37 // RUN: -shared-libs=%mlir_async_runtime \
  38 // RUN: | FileCheck %s --dump-input=always
  39
  40 #map0 = affine_map<(i, j) -> (i, j)>
  41
  42 func.func @linalg_generic(
  43     %lhs: memref<?x?xf32>, %rhs: memref<?x?xf32>,
  44     %sum: memref<?x?xf32>) {
  45   // Element-wise addition over a 2-D iteration space: every operand is
  46   // indexed with the identity map #map0 and both loops are parallel.
  47   linalg.generic {indexing_maps = [#map0, #map0, #map0],
  48                   iterator_types = ["parallel", "parallel"]}
  49     ins(%lhs, %rhs : memref<?x?xf32>, memref<?x?xf32>)
  50     outs(%sum : memref<?x?xf32>) {
  51   ^bb0(%a: f32, %b: f32, %unused_out: f32):
  52     // The incoming element of %sum is not read; only %a + %b is yielded.
  53     %added = arith.addf %a, %b : f32
  54     linalg.yield %added : f32
  55   }
  56
  57   return
  58 }
  59
  60 func.func @entry() {
       // Test driver: first verifies @linalg_generic on a 1x10 input, then times
       // %cN back-to-back calls on 1024x1024 buffers and prints the elapsed time
       // as the difference of two @rtclock readings.
  61   %f1 = arith.constant 1.0 : f32
  63   %c0 = arith.constant 0 : index
  64   %c1 = arith.constant 1 : index
  65   %cN = arith.constant 50 : index
  66
  67   //
  68   // Sanity check for the function under test.
  69   //
  70
  71   %LHS10 = memref.alloc() {alignment = 64} : memref<1x10xf32>
  72   %RHS10 = memref.alloc() {alignment = 64} : memref<1x10xf32>
  73   %DST10 = memref.alloc() {alignment = 64} : memref<1x10xf32>
  74
  75   linalg.fill ins(%f1 : f32) outs(%LHS10 : memref<1x10xf32>)
  76   linalg.fill ins(%f1 : f32) outs(%RHS10 : memref<1x10xf32>)
  77
       // @linalg_generic takes dynamically shaped memrefs, so erase the static
       // shape before the call.
  78   %LHS = memref.cast %LHS10 : memref<1x10xf32> to memref<?x?xf32>
  79   %RHS = memref.cast %RHS10 : memref<1x10xf32> to memref<?x?xf32>
  80   %DST = memref.cast %DST10 : memref<1x10xf32> to memref<?x?xf32>
  81
  82   call @linalg_generic(%LHS, %RHS, %DST)
  83     : (memref<?x?xf32>, memref<?x?xf32>, memref<?x?xf32>) -> ()
  84
       // Both inputs were filled with 1.0, so each of the ten results is 2.
  85   // CHECK: [2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
  86   %U = memref.cast %DST10 :  memref<1x10xf32> to memref<*xf32>
  87   call @printMemrefF32(%U): (memref<*xf32>) -> ()
  88
  89   memref.dealloc %LHS10: memref<1x10xf32>
  90   memref.dealloc %RHS10: memref<1x10xf32>
  91   memref.dealloc %DST10: memref<1x10xf32>
  92
  93   //
  94   // Allocate data for microbenchmarks.
  95   //
  96
  97   %LHS1024 = memref.alloc() {alignment = 64} : memref<1024x1024xf32>
  98   %RHS1024 = memref.alloc() {alignment = 64} : memref<1024x1024xf32>
  99   %DST1024 = memref.alloc() {alignment = 64} : memref<1024x1024xf32>
       // Give the inputs defined contents: freshly allocated memory holds
       // arbitrary bit patterns, and the benchmark should not add those.
       // The output buffer is only written, so it needs no initialization.
       linalg.fill ins(%f1 : f32) outs(%LHS1024 : memref<1024x1024xf32>)
       linalg.fill ins(%f1 : f32) outs(%RHS1024 : memref<1024x1024xf32>)
 100
 101   %LHS0 = memref.cast %LHS1024 : memref<1024x1024xf32> to memref<?x?xf32>
 102   %RHS0 = memref.cast %RHS1024 : memref<1024x1024xf32> to memref<?x?xf32>
 103   %DST0 = memref.cast %DST1024 : memref<1024x1024xf32> to memref<?x?xf32>
 104
 105   //
 106   // Warm up.
 107   //
 108
 109   call @linalg_generic(%LHS0, %RHS0, %DST0)
 110     : (memref<?x?xf32>, memref<?x?xf32>, memref<?x?xf32>) -> ()
 111
 112   //
 113   // Measure execution time.
 114   //
 115
       // Only the loop sits between the two clock reads; allocation, fills and
       // the warm-up call above are excluded from the measurement.
 116   %t0 = call @rtclock() : () -> f64
 117   scf.for %i = %c0 to %cN step %c1 {
 118     func.call @linalg_generic(%LHS0, %RHS0, %DST0)
 119       : (memref<?x?xf32>, memref<?x?xf32>, memref<?x?xf32>) -> ()
 120   }
 121   %t1 = call @rtclock() : () -> f64
 122   %t1024 = arith.subf %t1, %t0 : f64
 123
 124   // Print timings.
 125   vector.print %t1024 : f64
 126
 127   // Free.
 128   memref.dealloc %LHS1024: memref<1024x1024xf32>
 129   memref.dealloc %RHS1024: memref<1024x1024xf32>
 130   memref.dealloc %DST1024: memref<1024x1024xf32>
 131
 132   return
 133 }
 134
     // External declaration of the timer that @entry reads before and after the
     // timed loop. No body is given here; presumably it is resolved from one of
     // the libraries passed via -shared-libs at the top of this file (confirm).
 135 func.func private @rtclock() -> f64
 136
     // External declaration of the printer that @entry uses to emit the
     // sanity-check result. It takes an unranked f32 memref and carries the
     // llvm.emit_c_interface attribute.
 137 func.func private @printMemrefF32(memref<*xf32>)
 138   attributes { llvm.emit_c_interface }