mlir/test/Target/LLVMIR/gpu.mlir

   1 // RUN: mlir-translate -mlir-to-llvmir -split-input-file %s | FileCheck %s
   2
   3 // Checking the translation of the `gpu.binary` & `gpu.launch_func` ops.
   4 module attributes {gpu.container_module} {
   5   // CHECK: [[ARGS_TY:%.*]] = type { i32, i32 }
   6   // CHECK: @kernel_module_bin_cst = internal constant [4 x i8] c"BLOB", align 8
   7   // CHECK: @kernel_module_kernel_kernel_name = private unnamed_addr constant [7 x i8] c"kernel\00", align 1
   8   gpu.binary @kernel_module  [#gpu.object<#nvvm.target, "BLOB">]
   9   llvm.func @foo() {
  10     // CHECK: [[ARGS:%.*]] = alloca %{{.*}}, align 8
  11     // CHECK: [[ARGS_ARRAY:%.*]] = alloca ptr, i64 2, align 8
  12     // CHECK: [[ARG0:%.*]] = getelementptr inbounds nuw [[ARGS_TY]], ptr [[ARGS]], i32 0, i32 0
  13     // CHECK: store i32 32, ptr [[ARG0]], align 4
  14     // CHECK: %{{.*}} = getelementptr ptr, ptr [[ARGS_ARRAY]], i32 0
  15     // CHECK: store ptr [[ARG0]], ptr %{{.*}}, align 8
  16     // CHECK: [[ARG1:%.*]] = getelementptr inbounds nuw [[ARGS_TY]], ptr [[ARGS]], i32 0, i32 1
  17     // CHECK: store i32 32, ptr [[ARG1]], align 4
  18     // CHECK: %{{.*}} = getelementptr ptr, ptr [[ARGS_ARRAY]], i32 1
  19     // CHECK: store ptr [[ARG1]], ptr %{{.*}}, align 8
  20     // CHECK: [[MODULE:%.*]] = call ptr @mgpuModuleLoad(ptr @kernel_module_bin_cst, i64 4)
  21     // CHECK: [[FUNC:%.*]] = call ptr @mgpuModuleGetFunction(ptr [[MODULE]], ptr @kernel_module_kernel_kernel_name)
  22     // CHECK: [[STREAM:%.*]] = call ptr @mgpuStreamCreate()
  23     // CHECK: call void @mgpuLaunchKernel(ptr [[FUNC]], i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i32 256, ptr [[STREAM]], ptr [[ARGS_ARRAY]], ptr null, i64 2)
  24     // CHECK: call void @mgpuStreamSynchronize(ptr [[STREAM]])
  25     // CHECK: call void @mgpuStreamDestroy(ptr [[STREAM]])
  26     // CHECK: call void @mgpuModuleUnload(ptr [[MODULE]])
  27     %0 = llvm.mlir.constant(8 : index) : i64
  28     %1 = llvm.mlir.constant(32 : i32) : i32
  29     %2 = llvm.mlir.constant(256 : i32) : i32
  30     gpu.launch_func @kernel_module::@kernel blocks in (%0, %0, %0) threads in (%0, %0, %0) : i64 dynamic_shared_memory_size %2 args(%1 : i32, %1 : i32)
  31     llvm.return
  32   }
  33 }
  34
  35 // -----
  36
  37 // Checking that the index selector `#gpu.select_object<1>` picks the second object (payload "1").
  38 module {
  39   // CHECK: @kernel_module_bin_cst = internal constant [1 x i8] c"1", align 8
  40   gpu.binary @kernel_module <#gpu.select_object<1>> [#gpu.object<#nvvm.target, "0">, #gpu.object<#nvvm.target, "1">]
  41 }
  42
  43 // -----
  44
  45 // Checking that the target selector `#rocdl.target` picks the second object, whose target matches (payload "AMDGPU").
  46 module {
  47   // CHECK: @kernel_module_bin_cst = internal constant [6 x i8] c"AMDGPU", align 8
  48   gpu.binary @kernel_module <#gpu.select_object<#rocdl.target>> [#gpu.object<#nvvm.target, "NVPTX">, #gpu.object<#rocdl.target, "AMDGPU">]
  49 }
  50
  51 // -----
  52
  53 // Checking that a `#spirv.target_env` selector picks the second object, whose target matches (payload "BLOB").
  54 module {
  55   // CHECK: @kernel_module_bin_cst = internal constant [4 x i8] c"BLOB", align 8
  56   gpu.binary @kernel_module <#gpu.select_object<#spirv.target_env<#spirv.vce<v1.0, [Addresses, Int64, Kernel], []>, api=OpenCL, #spirv.resource_limits<>>>> [#gpu.object<#nvvm.target, "NVPTX">, #gpu.object<#spirv.target_env<#spirv.vce<v1.0, [Addresses, Int64, Kernel], []>, api=OpenCL, #spirv.resource_limits<>>, "BLOB">]
  57 }
  58
  59 // -----
  60 // Checking the translation of `gpu.launch_func` with an async dependency (the stream `%1` created in `@foo`).
  61 module attributes {gpu.container_module} {
  62   // CHECK: @kernel_module_bin_cst = internal constant [4 x i8] c"BLOB", align 8
  63   gpu.binary @kernel_module  [#gpu.object<#rocdl.target, "BLOB">]
  64   llvm.func @foo() {
  65     %0 = llvm.mlir.constant(8 : index) : i64
  66     // CHECK: = call ptr @mgpuStreamCreate()
  67     // CHECK-NEXT: = alloca {{.*}}, align 8
  68     // CHECK-NEXT: [[ARGS:%.*]] = alloca ptr, i64 0, align 8
  69     // CHECK-NEXT: [[MODULE:%.*]] = call ptr @mgpuModuleLoad(ptr @kernel_module_bin_cst, i64 4)
  70     // CHECK-NEXT: [[FUNC:%.*]] = call ptr @mgpuModuleGetFunction(ptr [[MODULE]], ptr @kernel_module_kernel_kernel_name)
  71     // CHECK-NEXT: call void @mgpuLaunchKernel(ptr [[FUNC]], i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i32 0, ptr {{.*}}, ptr [[ARGS]], ptr null, i64 0)
  72     // CHECK-NEXT: call void @mgpuModuleUnload(ptr [[MODULE]])
  73     // CHECK-NEXT: call void @mgpuStreamSynchronize(ptr %{{.*}})
  74     // CHECK-NEXT: call void @mgpuStreamDestroy(ptr %{{.*}})
  75     %1 = llvm.call @mgpuStreamCreate() : () -> !llvm.ptr
  76     gpu.launch_func <%1 : !llvm.ptr> @kernel_module::@kernel blocks in (%0, %0, %0) threads in (%0, %0, %0) : i64
  77     llvm.call @mgpuStreamSynchronize(%1) : (!llvm.ptr) -> ()
  78     llvm.call @mgpuStreamDestroy(%1) : (!llvm.ptr) -> ()
  79     llvm.return
  80   }
  81   llvm.func @mgpuStreamCreate() -> !llvm.ptr
  82   llvm.func @mgpuStreamSynchronize(!llvm.ptr)
  83   llvm.func @mgpuStreamDestroy(!llvm.ptr)
  84 }
  85
  86 // -----
  87
  88 // Checking the translation of `gpu.launch_func` with the clusters/blocks/threads syntax to `mgpuLaunchClusterKernel`.
  89 module attributes {gpu.container_module} {
  90   // CHECK: @kernel_module_bin_cst = internal constant [4 x i8] c"BLOB", align 8
  91   gpu.binary @kernel_module  [#gpu.object<#nvvm.target, "BLOB">]
  92   llvm.func @foo() {
  93   // CHECK: [[S2:%.*]] = alloca ptr, i64 0, align 8
  94   // CHECK: [[S3:%.*]] = call ptr @mgpuModuleLoad(ptr @kernel_module_bin_cst, i64 4)
  95   // CHECK: [[S4:%.*]] = call ptr @mgpuModuleGetFunction(ptr [[S3]], ptr @kernel_module_kernel_kernel_name)
  96   // CHECK: [[S5:%.*]] = call ptr @mgpuStreamCreate()
  97   // CHECK: call void @mgpuLaunchClusterKernel(ptr [[S4]], i64 2, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i32 0, ptr [[S5]], ptr [[S2]], ptr null)
  98     %0 = llvm.mlir.constant(1 : index) : i64
  99     %1 = llvm.mlir.constant(2 : index) : i64
 100     gpu.launch_func @kernel_module::@kernel clusters in (%1, %0, %0) blocks in (%0, %0, %0) threads in (%0, %0, %0) : i64
 101     llvm.return
 102   }
 103 }
 104
 105 // -----
 106
 107 // Checking that the `section` property of the object is propagated to the emitted binary global.
 108 module attributes {gpu.container_module} {
 109   // CHECK: @cuda_device_mod_bin_cst = internal constant [4 x i8] c"BLOB", section "__nv_rel_fatbin", align 8
 110   gpu.binary @cuda_device_mod  [#gpu.object<#nvvm.target, properties = {section = "__nv_rel_fatbin"}, "BLOB">]
 111 }