// RUN: mlir-opt %s \
// RUN: -convert-linalg-to-parallel-loops \
// RUN: -async-parallel-for \
// RUN: -async-to-async-runtime \
// RUN: -async-runtime-ref-counting \
// RUN: -async-runtime-ref-counting-opt \
// RUN: -convert-async-to-llvm \
// RUN: -convert-scf-to-cf \
// RUN: -arith-expand \
// RUN: -memref-expand \
// RUN: -convert-vector-to-llvm \
// RUN: -finalize-memref-to-llvm \
// RUN: -convert-func-to-llvm \
// RUN: -convert-arith-to-llvm \
// RUN: -convert-cf-to-llvm \
// RUN: -reconcile-unrealized-casts \
// RUN: | mlir-cpu-runner \
// RUN: -e entry -entry-point-result=void -O3 \
// RUN: -shared-libs=%mlir_runner_utils \
// RUN: -shared-libs=%mlir_c_runner_utils \
// RUN: -shared-libs=%mlir_async_runtime \
// RUN: | FileCheck %s --dump-input=always
// Reference run: lower the same IR through a sequential (non-async) pipeline;
// the printed output must satisfy the same FileCheck expectations.
// RUN: mlir-opt %s \
// RUN: -convert-linalg-to-loops \
// RUN: -convert-scf-to-cf \
// RUN: -convert-vector-to-llvm \
// RUN: -finalize-memref-to-llvm \
// RUN: -convert-func-to-llvm \
// RUN: -convert-arith-to-llvm \
// RUN: -convert-cf-to-llvm \
// RUN: -reconcile-unrealized-casts \
// RUN: | mlir-cpu-runner \
// RUN: -e entry -entry-point-result=void -O3 \
// RUN: -shared-libs=%mlir_runner_utils \
// RUN: -shared-libs=%mlir_c_runner_utils \
// RUN: -shared-libs=%mlir_async_runtime \
// RUN: | FileCheck %s --dump-input=always
// Identity indexing map over a 2-D iteration space; used for all three
// operands of linalg.generic so each buffer is read/written at (i, j).
#map0 = affine_map<(d0, d1) -> (d0, d1)>
// Element-wise addition of two dynamically-shaped 2-D f32 buffers:
//   %sum[i, j] = %lhs[i, j] + %rhs[i, j]
// Expressed as a linalg.generic with identity indexing maps and two
// "parallel" iterator types, so the RUN pipelines above can lower it to
// (async) parallel loops.
// NOTE(review): the linalg.generic op header, the linalg.yield terminator,
// and the function's return/closing brace were missing from the damaged
// source; they are reconstructed here from the visible attributes and region.
func.func @linalg_generic(%lhs: memref<?x?xf32>,
                          %rhs: memref<?x?xf32>,
                          %sum: memref<?x?xf32>) {
  linalg.generic {
    indexing_maps = [#map0, #map0, #map0],
    iterator_types = ["parallel", "parallel"]
  }
  ins(%lhs, %rhs : memref<?x?xf32>, memref<?x?xf32>)
  outs(%sum : memref<?x?xf32>)
  {
    ^bb0(%lhs_in: f32, %rhs_in: f32, %sum_out: f32):
      %0 = arith.addf %lhs_in, %rhs_in : f32
      linalg.yield %0 : f32
  }
  return
}
// Entry point. First sanity-checks @linalg_generic on a 1x10 buffer of ones
// (FileCheck verifies every element is 2), then times %cN calls over
// 1024x1024 buffers and prints the elapsed wall time.
// NOTE(review): the `func.func @entry()` header, the scf.for closing brace,
// and the trailing return/brace were missing from the damaged source and are
// reconstructed here; the visible statements are preserved as-is.
func.func @entry() {
  %f1 = arith.constant 1.0 : f32
  %f4 = arith.constant 4.0 : f32
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  // Number of timed benchmark iterations.
  %cN = arith.constant 50 : index

  //
  // Sanity check for the function under test.
  //

  %LHS10 = memref.alloc() {alignment = 64} : memref<1x10xf32>
  %RHS10 = memref.alloc() {alignment = 64} : memref<1x10xf32>
  %DST10 = memref.alloc() {alignment = 64} : memref<1x10xf32>

  linalg.fill ins(%f1 : f32) outs(%LHS10 : memref<1x10xf32>)
  linalg.fill ins(%f1 : f32) outs(%RHS10 : memref<1x10xf32>)

  %LHS = memref.cast %LHS10 : memref<1x10xf32> to memref<?x?xf32>
  %RHS = memref.cast %RHS10 : memref<1x10xf32> to memref<?x?xf32>
  %DST = memref.cast %DST10 : memref<1x10xf32> to memref<?x?xf32>

  call @linalg_generic(%LHS, %RHS, %DST)
    : (memref<?x?xf32>, memref<?x?xf32>, memref<?x?xf32>) -> ()

  // 1.0 + 1.0 == 2.0 in every element.
  // CHECK: [2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
  %U = memref.cast %DST10 : memref<1x10xf32> to memref<*xf32>
  call @printMemrefF32(%U): (memref<*xf32>) -> ()

  memref.dealloc %LHS10: memref<1x10xf32>
  memref.dealloc %RHS10: memref<1x10xf32>
  memref.dealloc %DST10: memref<1x10xf32>

  //
  // Allocate data for microbenchmarks.
  //

  %LHS1024 = memref.alloc() {alignment = 64} : memref<1024x1024xf32>
  %RHS1024 = memref.alloc() {alignment = 64} : memref<1024x1024xf32>
  %DST1024 = memref.alloc() {alignment = 64} : memref<1024x1024xf32>

  %LHS0 = memref.cast %LHS1024 : memref<1024x1024xf32> to memref<?x?xf32>
  %RHS0 = memref.cast %RHS1024 : memref<1024x1024xf32> to memref<?x?xf32>
  %DST0 = memref.cast %DST1024 : memref<1024x1024xf32> to memref<?x?xf32>

  // Warm-up call before taking timestamps, so one-time lowering/runtime
  // initialization cost is not measured.
  call @linalg_generic(%LHS0, %RHS0, %DST0)
    : (memref<?x?xf32>, memref<?x?xf32>, memref<?x?xf32>) -> ()

  //
  // Measure execution time.
  //

  %t0 = call @rtclock() : () -> f64
  scf.for %i = %c0 to %cN step %c1 {
    func.call @linalg_generic(%LHS0, %RHS0, %DST0)
      : (memref<?x?xf32>, memref<?x?xf32>, memref<?x?xf32>) -> ()
  }
  %t1 = call @rtclock() : () -> f64
  %t1024 = arith.subf %t1, %t0 : f64

  // Total elapsed time for %cN iterations over the 1024x1024 buffers.
  vector.print %t1024 : f64

  memref.dealloc %LHS1024: memref<1024x1024xf32>
  memref.dealloc %RHS1024: memref<1024x1024xf32>
  memref.dealloc %DST1024: memref<1024x1024xf32>

  return
}
// Clock source from the runner-utils shared library; used above as start/end
// timestamps whose difference is printed (units presumably seconds — see
// mlir_c_runner_utils for the exact contract).
func.func private @rtclock() -> f64

// Prints an unranked f32 memref (runner-utils library). The C-interface
// attribute makes the call pass the memref descriptor across the LLVM ABI.
func.func private @printMemrefF32(memref<*xf32>)
  attributes { llvm.emit_c_interface }