1 # RUN: env SUPPORT_LIB=%mlir_cuda_runtime \
2 # RUN: %PYTHON %s | FileCheck %s
4 # ===----------------------------------------------------------------------===//
5 # Chapter 0 : Hello World
6 # ===----------------------------------------------------------------------===//
8 # This program demonstrates Hello World:
9 # 1. Build MLIR function with arguments
10 # 2. Build MLIR GPU kernel
11 # 3. Print from a GPU thread
12 # 4. Pass arguments, JIT compile and run the MLIR function
14 # ===----------------------------------------------------------------------===//
from mlir.dialects import gpu
from tools.nvdsl import *
21 # 1. The decorator generates a MLIR func.func.
22 # Everything inside the Python function becomes the body of the func.
23 # The decorator also translates `alpha` to an `index` type.
# 2. The decorator generates a MLIR gpu.launch.
# Everything inside the Python function becomes the body of the gpu.launch.
# This allows for late outlining of the GPU kernel, enabling optimizations
# like constant folding from host to device.
@NVDSL.mlir_gpu_launch(grid=(1, 1, 1), block=(4, 1, 1))
def kernel():
    # NOTE(review): `alpha` is a free variable; it is expected to be bound by an
    # enclosing `NVDSL.mlir_func`-decorated function whose header is not visible
    # in this chunk — confirm against the full file.
    # Query this thread's x index within the block.
    tidx = gpu.thread_id(gpu.Dimension.x)
    # + operator generates arith.addi
    myValue = alpha + tidx
    # Print from a GPU thread
    gpu.printf("GPU thread %llu has %llu\n", [tidx, myValue])

# 3. Call the GPU kernel
kernel()
43 # 4. The `mlir_func` decorator JIT compiles the IR and executes the MLIR function.
47 # CHECK: GPU thread 0 has 100
48 # CHECK: GPU thread 1 has 101
49 # CHECK: GPU thread 2 has 102
50 # CHECK: GPU thread 3 has 103