mlir/test/mlir-cpu-runner/sgemm-naive-codegen.mlir

   1 // RUN: mlir-opt -convert-linalg-to-loops -lower-affine -convert-scf-to-std -convert-arith-to-llvm -convert-vector-to-llvm -convert-memref-to-llvm -convert-std-to-llvm -reconcile-unrealized-casts %s | mlir-cpu-runner -O3 -e main -entry-point-result=void -shared-libs=%mlir_runner_utils_dir/libmlir_c_runner_utils%shlibext | FileCheck %s
   2
   3 func @main() {  // Times five naive 16x16 SGEMM runs, prints C[0,0], and reports flop count / elapsed time.
   4   %A = memref.alloc() : memref<16x16xf32>
   5   %B = memref.alloc() : memref<16x16xf32>
   6   %C = memref.alloc() : memref<16x16xf32>
   7   // A and B are filled with 1.0 once; C is re-filled before every multiply.
   8   %cf1 = arith.constant 1.00000e+00 : f32
   9
  10   linalg.fill(%cf1, %A) : f32, memref<16x16xf32>
  11   linalg.fill(%cf1, %B) : f32, memref<16x16xf32>
  12   // Repetition count used for the flop total; must equal the trip count of the timed loop below.
  13   %reps = arith.constant 5 : index
  14   // Timed region: each iteration resets C to 1.0 and accumulates A*B into it.
  15   %t_start = call @rtclock() : () -> f64
  16   affine.for %arg0 = 0 to 5 {
  17     linalg.fill(%cf1, %C) : f32, memref<16x16xf32>
  18     call @sgemm_naive(%A, %B, %C) : (memref<16x16xf32>, memref<16x16xf32>, memref<16x16xf32>) -> ()
  19   }
  20   %t_end = call @rtclock() : () -> f64
  21   %t = arith.subf %t_end, %t_start : f64
  22   // C[0,0] = 1 + sum over 16 products of 1*1 = 17, the value the CHECK line expects.
  23   %res = affine.load %C[0, 0]: memref<16x16xf32>
  24   vector.print %res: f32
  25
  26   %c0 = arith.constant 0 : index
  27   %c1 = arith.constant 1 : index
  28   %c2 = arith.constant 2 : index
  29   // M and N come from C's shape, K from A's second dimension.
  30   %M = memref.dim %C, %c0 : memref<16x16xf32>
  31   %N = memref.dim %C, %c1 : memref<16x16xf32>
  32   %K = memref.dim %A, %c1 : memref<16x16xf32>
  33
  34   %f1 = arith.muli %M, %N : index
  35   %f2 = arith.muli %f1, %K : index
  36   // Flops: reps * 2*M*N*K, converted through i64 (5 * 8192 = 40960 overflows a signed i16).
  37   // 2*M*N*K.
  38   %f3 = arith.muli %c2, %f2 : index
  39   %num_flops = arith.muli %reps, %f3 : index
  40   %num_flops_i = arith.index_cast %num_flops : index to i64
  41   %num_flops_f = arith.sitofp %num_flops_i : i64 to f64
  42   %flops = arith.divf %num_flops_f, %t : f64
  43   call @print_flops(%flops) : (f64) -> ()
  44
  45   memref.dealloc %A : memref<16x16xf32>
  46   memref.dealloc %B : memref<16x16xf32>
  47   memref.dealloc %C : memref<16x16xf32>
  48   return
  49 }
  50 // CHECK: 17
  51
  52 func @sgemm_naive(%arg0: memref<16x16xf32>, %arg1: memref<16x16xf32>, %arg2: memref<16x16xf32>) {  // arg2 += arg0 * arg1 on 16x16 f32.
  53   %zero = arith.constant 0 : index
  54   affine.for %i = 0 to 16 {
  55     affine.for %j = 0 to 16 {
  56       %init = affine.load %arg2[%i, %j] : memref<16x16xf32>  // current output element seeds the sum
  57       %acc = memref.alloc() : memref<1xf32>  // one-element scratch buffer holding the running sum
  58       affine.store %init, %acc[%zero] : memref<1xf32>
  59       affine.for %k = 0 to 16 {
  60         %partial = affine.load %acc[0] : memref<1xf32>
  61         %a = affine.load %arg0[%i, %k] : memref<16x16xf32>
  62         %b = affine.load %arg1[%k, %j] : memref<16x16xf32>
  63         %prod = arith.mulf %a, %b : f32
  64         %next = arith.addf %prod, %partial : f32
  65         affine.store %next, %acc[0] : memref<1xf32>
  66       }
  67       %sum = affine.load %acc[%zero] : memref<1xf32>  // finished dot product plus the seed
  68       affine.store %sum, %arg2[%i, %j] : memref<16x16xf32>
  69       memref.dealloc %acc : memref<1xf32>
  70     }
  71   }
  72   return
  73 }
  74
  75 func private @print_flops(f64)  // Body-less declaration; @main passes it the flop-count / elapsed-time quotient. Presumably resolved from the -shared-libs library on the RUN line -- confirm.
  76 func private @rtclock() -> f64  // Body-less declaration returning an f64 clock reading; @main subtracts two readings. Units are not visible in this file.