2 // RUN: -convert-linalg-to-parallel-loops \
3 // RUN: -async-parallel-for \
4 // RUN: -async-to-async-runtime \
5 // RUN: -async-runtime-ref-counting \
6 // RUN: -async-runtime-ref-counting-opt \
7 // RUN: -convert-async-to-llvm \
8 // RUN: -convert-scf-to-std \
9 // RUN: -arith-expand \
11 // RUN: -convert-vector-to-llvm \
12 // RUN: -convert-memref-to-llvm \
13 // RUN: -convert-std-to-llvm \
14 // RUN: -reconcile-unrealized-casts \
15 // RUN: | mlir-cpu-runner \
16 // RUN: -e entry -entry-point-result=void -O3 \
17 // RUN: -shared-libs=%mlir_integration_test_dir/libmlir_runner_utils%shlibext \
18 // RUN: -shared-libs=%mlir_integration_test_dir/libmlir_c_runner_utils%shlibext\
19 // RUN: -shared-libs=%mlir_integration_test_dir/libmlir_async_runtime%shlibext \
20 // RUN: | FileCheck %s --dump-input=always
23 // RUN: -convert-linalg-to-loops \
24 // RUN: -convert-scf-to-std \
25 // RUN: -convert-vector-to-llvm \
26 // RUN: -convert-memref-to-llvm \
27 // RUN: -convert-std-to-llvm \
28 // RUN: -reconcile-unrealized-casts \
29 // RUN: | mlir-cpu-runner \
30 // RUN: -e entry -entry-point-result=void -O3 \
31 // RUN: -shared-libs=%mlir_integration_test_dir/libmlir_runner_utils%shlibext \
32 // RUN: -shared-libs=%mlir_integration_test_dir/libmlir_c_runner_utils%shlibext\
33 // RUN: -shared-libs=%mlir_integration_test_dir/libmlir_async_runtime%shlibext \
34 // RUN: | FileCheck %s --dump-input=always
// Identity indexing map over a 2-D iteration space: (d0, d1) -> (d0, d1).
36 #map0 = affine_map<(d0, d1) -> (d0, d1)>
// Function under test. Takes three dynamically shaped 2-D f32 memrefs: %lhs
// and %rhs are passed as inputs (ins) and %sum as the output (outs). All
// three operands are indexed through #map0 and both iteration dimensions are
// marked parallel.
38 func @linalg_generic(%lhs: memref<?x?xf32>,
39 %rhs: memref<?x?xf32>,
40 %sum: memref<?x?xf32>) {
42 indexing_maps = [#map0, #map0, #map0],
43 iterator_types = ["parallel", "parallel"]
45 ins(%lhs, %rhs : memref<?x?xf32>, memref<?x?xf32>)
46 outs(%sum : memref<?x?xf32>)
48 ^bb0(%lhs_in: f32, %rhs_in: f32, %sum_out: f32):
// Add the two input elements; %sum_out is not read by the visible code.
// NOTE(review): the terminator that yields %0 is not among the visible
// lines - presumably %0 becomes the corresponding element of %sum; confirm.
49 %0 = arith.addf %lhs_in, %rhs_in : f32
// Constants: %f1 is the fill value for the sanity check, %c0/%c1/%cN are the
// lower bound, step and upper bound (50) of the timed loop.
// NOTE(review): %f4 is not referenced by any visible line - confirm whether
// it is still needed.
57 %f1 = arith.constant 1.0 : f32
58 %f4 = arith.constant 4.0 : f32
59 %c0 = arith.constant 0 : index
60 %c1 = arith.constant 1 : index
61 %cN = arith.constant 50 : index
64 // Sanity check for the function under test.
// Three statically shaped 1x10 buffers, each allocated with alignment = 64.
67 %LHS10 = memref.alloc() {alignment = 64} : memref<1x10xf32>
68 %RHS10 = memref.alloc() {alignment = 64} : memref<1x10xf32>
69 %DST10 = memref.alloc() {alignment = 64} : memref<1x10xf32>
// Fill both inputs with 1.0, so every output element should be 1.0 + 1.0.
71 linalg.fill(%f1, %LHS10) : f32, memref<1x10xf32>
72 linalg.fill(%f1, %RHS10) : f32, memref<1x10xf32>
// Cast to the dynamically shaped type expected by @linalg_generic.
74 %LHS = memref.cast %LHS10 : memref<1x10xf32> to memref<?x?xf32>
75 %RHS = memref.cast %RHS10 : memref<1x10xf32> to memref<?x?xf32>
76 %DST = memref.cast %DST10 : memref<1x10xf32> to memref<?x?xf32>
78 call @linalg_generic(%LHS, %RHS, %DST)
79 : (memref<?x?xf32>, memref<?x?xf32>, memref<?x?xf32>) -> ()
// Print the destination buffer as an unranked memref; the directive on the
// next line expects ten elements equal to 2 in the printed output.
81 // CHECK: [2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
82 %U = memref.cast %DST10 : memref<1x10xf32> to memref<*xf32>
83 call @print_memref_f32(%U): (memref<*xf32>) -> ()
// Release the sanity-check buffers.
85 memref.dealloc %LHS10: memref<1x10xf32>
86 memref.dealloc %RHS10: memref<1x10xf32>
87 memref.dealloc %DST10: memref<1x10xf32>
90 // Allocate data for microbenchmarks.
// Three 1024x1024 f32 buffers, then cast to the dynamically shaped type.
// NOTE(review): no visible line initializes these buffers; the benchmark
// only reports elapsed time, so presumably their contents do not matter.
93 %LHS1024 = memref.alloc() {alignment = 64} : memref<1024x1024xf32>
94 %RHS1024 = memref.alloc() {alignment = 64} : memref<1024x1024xf32>
95 %DST1024 = memref.alloc() {alignment = 64} : memref<1024x1024xf32>
97 %LHS0 = memref.cast %LHS1024 : memref<1024x1024xf32> to memref<?x?xf32>
98 %RHS0 = memref.cast %RHS1024 : memref<1024x1024xf32> to memref<?x?xf32>
99 %DST0 = memref.cast %DST1024 : memref<1024x1024xf32> to memref<?x?xf32>
// One call issued before the first clock read, so it is not timed.
// NOTE(review): presumably a warm-up run - confirm.
105 call @linalg_generic(%LHS0, %RHS0, %DST0)
106 : (memref<?x?xf32>, memref<?x?xf32>, memref<?x?xf32>) -> ()
109 // Measure execution time.
// Read the clock, then call the function once per iteration of a loop
// running from %c0 to %cN (50) in steps of %c1.
112 %t0 = call @rtclock() : () -> f64
113 scf.for %i = %c0 to %cN step %c1 {
114 call @linalg_generic(%LHS0, %RHS0, %DST0)
115 : (memref<?x?xf32>, memref<?x?xf32>, memref<?x?xf32>) -> ()
// NOTE(review): the loop's closing brace is not among the visible lines;
// the comments below assume %t1 is read after the loop has finished.
117 %t1 = call @rtclock() : () -> f64
// Elapsed time across all timed iterations (not a per-iteration average).
118 %t1024 = arith.subf %t1, %t0 : f64
// Print the elapsed time. Its unit is whatever @rtclock returns, which is
// only declared, not defined, in this file.
121 vector.print %t1024 : f64
// Release the benchmark buffers.
124 memref.dealloc %LHS1024: memref<1024x1024xf32>
125 memref.dealloc %RHS1024: memref<1024x1024xf32>
126 memref.dealloc %DST1024: memref<1024x1024xf32>
// External declaration (no body in this file) of the clock used for timing;
// takes no arguments and returns an f64.
// NOTE(review): presumably resolved from one of the shared libraries passed
// to the runner - confirm which one provides it.
131 func private @rtclock() -> f64
// External declaration of the printer used by the sanity check; takes an
// unranked f32 memref. The llvm.emit_c_interface attribute is set on it.
133 func private @print_memref_f32(memref<*xf32>)
134 attributes { llvm.emit_c_interface }