// RUN: export M=24 && export K=64 && export N=192 && export ITERS=10 && \
// RUN: cat %s | sed 's@${M}@'"$M"'@g'| sed 's@${K}@'"$K"'@g' | sed 's@${N}@'"$N"'@g'| sed 's@${ITERS}@'"$ITERS"'@g'| \
// RUN: mlir-opt -test-linalg-codegen-strategy="anchor-func=matmul anchor-op=linalg.matmul register-tile-sizes=12,32,16 vectorize" | \
// RUN: mlir-opt -test-linalg-codegen-strategy="anchor-func=matmul anchor-op=linalg.fill register-tile-sizes=4,32 vectorize" | \
// RUN: mlir-opt -test-linalg-codegen-strategy="anchor-func=matmul anchor-op=linalg.copy register-tile-sizes=4,32 vectorize" | \
// RUN: mlir-opt -canonicalize -convert-vector-to-scf -lower-affine -convert-linalg-to-loops | \
// RUN: mlir-opt -canonicalize -convert-scf-to-std -convert-vector-to-llvm -convert-memref-to-llvm -convert-std-to-llvm -reconcile-unrealized-casts | \
// RUN: mlir-cpu-runner -O3 -e main -entry-point-result=void \
// Activate to dump assembly.
// R_UN: -dump-object-file -object-filename=/tmp/a.o \
// RUN: -shared-libs=%mlir_integration_test_dir/libmlir_runner_utils%shlibext \
// RUN: -shared-libs=%mlir_integration_test_dir/libmlir_c_runner_utils%shlibext | \
// Use tee to both print to stderr and FileCheck.
// RUN: tee -a /dev/stderr | FileCheck %s
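
// Note: the sed invocations in the RUN lines above substitute the ${M}, ${K},
// ${N} and ${ITERS} placeholders below with concrete values before the
// codegen-strategy, lowering, and JIT-execution steps run.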

!elem_type_a = type f32
!elem_type_b = type f32
!elem_type_c = type f32
!row_major_A = type memref<${M}x${K}x!elem_type_a>
!row_major_B = type memref<${K}x${N}x!elem_type_b>
!row_major_C = type memref<${M}x${N}x!elem_type_c>
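
// A is ${M}x${K}, B is ${K}x${N}, and C is ${M}x${N}; all three are row-major
// buffers of f32 elements.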

func @matmul(%a: !row_major_A, %b: !row_major_B, %c: !row_major_C)
// TODO: activate manually for now.
// attributes { passthrough = [["target-cpu", "skylake-avx512"], ["prefer-vector-width", "512"]]}
{
  linalg.matmul ins(%a, %b : !row_major_A, !row_major_B)
                outs(%c: !row_major_C)
  return
}

func @print_perf(%iters: index, %total_time: f64) {
  %c2 = arith.constant 2 : index
  %cM = arith.constant ${M} : index
  %cN = arith.constant ${N} : index
  %cK = arith.constant ${K} : index

  %mn = arith.muli %cM, %cN : index
  %mnk = arith.muli %mn, %cK : index
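
  // One iteration of the matmul performs M*N*K multiply-add pairs, i.e.
  // 2*M*N*K flops.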
  %flops_per_iter = arith.muli %c2, %mnk : index
  %flops = arith.muli %iters, %flops_per_iter : index
  %flops_i64 = arith.index_cast %flops : index to i64
  %flops_f = arith.sitofp %flops_i64 : i64 to f64
  %flops_per_s = arith.divf %flops_f, %total_time : f64
  vector.print %flops_per_s : f64
  return
}

func @main() {
  %v0 = arith.constant 0.0 : !elem_type_a
  %v1 = arith.constant 1.0 : !elem_type_a

  %A = memref.alloc() : !row_major_A
  %B = memref.alloc() : !row_major_B
  %C = memref.alloc() : !row_major_C

  linalg.fill(%v1, %A) : !elem_type_a, !row_major_A
  linalg.fill(%v1, %B) : !elem_type_b, !row_major_B
  linalg.fill(%v0, %C) : !elem_type_c, !row_major_C

  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %iters = arith.constant ${ITERS} : index

  /// Run and dump performance for matmul.
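  // Warm-up loop: run the kernel ITERS times before starting the clock so the
  // timed loop below is not skewed by first-run effects (e.g. page faults and
  // cold caches).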
  scf.for %arg0 = %c0 to %iters step %c1 {
    %z = arith.constant 0.0 : !elem_type_c
    linalg.fill(%z, %C) : !elem_type_c, !row_major_C
    call @matmul(%A, %B, %C) : (!row_major_A, !row_major_B, !row_major_C) -> ()
  }

  %t_start_matmul = call @rtclock() : () -> f64
  scf.for %arg0 = %c0 to %iters step %c1 {
    // linalg.matmul writes %C in place, so it needs to be reset to zero on
    // every iteration. This accounts for about a 10-15% perf hit on small
    // sizes. Once linalg on tensors is ready, fusing the fill into the matmul
    // at the register level will eliminate most of this overhead.
    %z = arith.constant 0.0 : !elem_type_c
    linalg.fill(%z, %C) : !elem_type_c, !row_major_C
    call @matmul(%A, %B, %C) : (!row_major_A, !row_major_B, !row_major_C) -> ()
  }
  %t_end_matmul = call @rtclock() : () -> f64
  %tmatmul = arith.subf %t_end_matmul, %t_start_matmul : f64
  call @print_perf(%iters, %tmatmul) : (index, f64) -> ()
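
  // Verify the result against an untransformed linalg.matmul: verifyMemRefF32
  // returns the number of mismatching elements, which is printed below; 0
  // means the vectorized kernel matches the reference.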
  %C_ref = memref.alloc() : !row_major_C
  linalg.fill(%v0, %C_ref) : !elem_type_c, !row_major_C
  linalg.matmul ins(%A, %B : !row_major_A, !row_major_B)
                outs(%C_ref: !row_major_C)
  %act = memref.cast %C : !row_major_C to memref<*xf32>
  %exp = memref.cast %C_ref : !row_major_C to memref<*xf32>
  %errors = call @verifyMemRefF32(%act, %exp) : (memref<*xf32>, memref<*xf32>) -> i64
  vector.print %errors : i64
  memref.dealloc %C_ref : !row_major_C

  memref.dealloc %A : !row_major_A
  memref.dealloc %B : !row_major_B
  memref.dealloc %C : !row_major_C

  return
}
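
// External functions resolved at JIT time from the runner-utils shared
// libraries passed via -shared-libs in the RUN lines above.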
func private @rtclock() -> f64
func private @verifyMemRefF32(memref<*xf32>, memref<*xf32>) -> i64 attributes { llvm.emit_c_interface }

// TODO: init with random, run and check output.
// func private @fill_random_f32(memref<*xf32>)