mlir/test/Integration/Dialect/Linalg/CPU/benchmark_matmul.mlir

   1 // RUN: export M=24 && export K=64 && export N=192 && export ITERS=10 && \
   2 // RUN: cat %s | sed 's@${M}@'"$M"'@g'| sed 's@${K}@'"$K"'@g' | sed 's@${N}@'"$N"'@g'| sed 's@${ITERS}@'"$ITERS"'@g'| \
   3 // RUN: mlir-opt -test-linalg-codegen-strategy="anchor-func=matmul anchor-op=linalg.matmul register-tile-sizes=12,32,16 vectorize" | \
   4 // RUN: mlir-opt -test-linalg-codegen-strategy="anchor-func=matmul anchor-op=linalg.fill register-tile-sizes=4,32 vectorize" | \
   5 // RUN: mlir-opt -test-linalg-codegen-strategy="anchor-func=matmul anchor-op=linalg.copy register-tile-sizes=4,32 vectorize" | \
   6
   7 // RUN: mlir-opt -canonicalize -convert-vector-to-scf -lower-affine -convert-linalg-to-loops | \
   8 // RUN: mlir-opt -canonicalize -convert-scf-to-std -convert-vector-to-llvm -convert-memref-to-llvm -convert-std-to-llvm -reconcile-unrealized-casts | \
   9 // RUN: mlir-cpu-runner -O3 -e main -entry-point-result=void \
  10 // Activate to dump assembly
  11 // R_UN:   -dump-object-file -object-filename=/tmp/a.o \
  12 // RUN:   -shared-libs=%mlir_integration_test_dir/libmlir_runner_utils%shlibext \
  13 // RUN:   -shared-libs=%mlir_integration_test_dir/libmlir_c_runner_utils%shlibext | \
  14 // Use tee to both print to stderr and FileCheck
  15 // RUN: tee -a /dev/stderr | FileCheck %s
  16
  17 // Element and buffer type aliases; the M/K/N placeholders are replaced by the sed commands in the test header.
  18 !elem_type_a = type f32
  19 !elem_type_b = type f32
  20 !elem_type_c = type f32
  21 !row_major_A = type memref<${M}x${K}x!elem_type_a>
  22 !row_major_B = type memref<${K}x${N}x!elem_type_b>
  23 !row_major_C = type memref<${M}x${N}x!elem_type_c>
  24
  25 // Kernel under test; the anchor-func=matmul strategy passes in the test header target this function.
  26 func @matmul(%lhs: !row_major_A, %rhs: !row_major_B, %acc: !row_major_C)
  27 // TODO: the attributes below are still switched on by hand when needed.
  28 // attributes { passthrough = [["target-cpu", "skylake-avx512"], ["prefer-vector-width", "512"]]}
  29 {
  30   linalg.matmul ins(%lhs, %rhs : !row_major_A, !row_major_B) outs(%acc : !row_major_C)
  31   return
  32 }
  33
  34 // Prints the rate obtained by dividing the floating-point operation count of
  35 // %iters matmul calls (2*M*N*K each) by %total_time.
  36 func @print_perf(%iters: index, %total_time: f64) {
  37   %dim_m = arith.constant ${M} : index
  38   %dim_n = arith.constant ${N} : index
  39   %dim_k = arith.constant ${K} : index
  40   %two = arith.constant 2 : index
  41   // Work of a single matmul call: 2*M*N*K.
  42   %m_by_n = arith.muli %dim_m, %dim_n : index
  43   %volume = arith.muli %m_by_n, %dim_k : index
  44   %work_per_call = arith.muli %two, %volume : index
  45   // Scale by the iteration count, then go index -> i64 -> f64 for the division.
  46   %work = arith.muli %iters, %work_per_call : index
  47   %work_i64 = arith.index_cast %work : index to i64
  48   %work_f64 = arith.sitofp %work_i64 : i64 to f64
  49   %rate = arith.divf %work_f64, %total_time : f64
  50   vector.print %rate : f64
  51   return
  52 }
  53
  54 // Benchmark driver: warm-up run, timed run, throughput print, then a comparison against a reference product.
  55 func @main() {
  56   // One constant per element-type alias, so every fill still type-checks if the aliases diverge.
  57   %one_a = arith.constant 1.0 : !elem_type_a
  58   %one_b = arith.constant 1.0 : !elem_type_b
  59   %zero_c = arith.constant 0.0 : !elem_type_c
  60
  61   %A = memref.alloc() : !row_major_A
  62   %B = memref.alloc() : !row_major_B
  63   %C = memref.alloc() : !row_major_C
  64
  65   linalg.fill(%one_a, %A) : !elem_type_a, !row_major_A
  66   linalg.fill(%one_b, %B) : !elem_type_b, !row_major_B
  67   linalg.fill(%zero_c, %C) : !elem_type_c, !row_major_C
  68
  69   %c0 = arith.constant 0: index
  70   %c1 = arith.constant 1: index
  71   %iters = arith.constant ${ITERS}: index
  72
  73   /// Run and dump performance for matmul.
  74   /// Preheating run:
  75   scf.for %arg0 = %c0 to %iters step %c1 {
  76     linalg.fill(%zero_c, %C) : !elem_type_c, !row_major_C
  77     call @matmul(%A, %B, %C) : (!row_major_A, !row_major_B, !row_major_C) -> ()
  78   }
  79   %t_start_matmul = call @rtclock() : () -> f64
  80   scf.for %arg0 = %c0 to %iters step %c1 {
  81     // linalg.matmul writes %C in place, so it has to be reset to zero every time.
  82     // This accounts for about a 10-15% perf hit on small sizes.
  83     // Once linalg on tensors is ready, fusing fill at the register level will
  84     // be easy.
  85     linalg.fill(%zero_c, %C) : !elem_type_c, !row_major_C
  86     call @matmul(%A, %B, %C) : (!row_major_A, !row_major_B, !row_major_C) -> ()
  87   }
  88   %t_end_matmul = call @rtclock() : () -> f64
  89   %tmatmul = arith.subf %t_end_matmul, %t_start_matmul: f64
  90   call @print_perf(%iters, %tmatmul) : (index, f64) -> ()
  91   // Compare %C with a reference product computed directly here; the value printed for %errors must be 0.
  92   // CHECK: {{^0$}}
  93   %C_ref = memref.alloc() : !row_major_C
  94   linalg.fill(%zero_c, %C_ref) : !elem_type_c, !row_major_C
  95   linalg.matmul ins(%A, %B : !row_major_A, !row_major_B)
  96     outs(%C_ref: !row_major_C)
  97   %act = memref.cast %C : !row_major_C to memref<*xf32>
  98   %exp = memref.cast %C_ref : !row_major_C to memref<*xf32>
  99   %errors = call @verifyMemRefF32(%act, %exp) : (memref<*xf32>, memref<*xf32>) -> i64
 100   vector.print %errors : i64
 101   memref.dealloc %C_ref : !row_major_C
 102   memref.dealloc %A : !row_major_A
 103   memref.dealloc %B : !row_major_B
 104   memref.dealloc %C : !row_major_C
 105
 106   return
 107 }
 108 // External helpers; presumably resolved from the runner utility libraries passed via -shared-libs (confirm).
 109 func private @rtclock() -> f64
 110 func private @verifyMemRefF32(memref<*xf32>, memref<*xf32>) -> i64 attributes { llvm.emit_c_interface }
 111
 112 // TODO: init with random, run and check output.
 113 // func private @fill_random_f32(memref<*xf32>)