Run DCE after a LoopFlatten test to reduce spurious output [nfc]
[llvm-project.git] / openmp / libomptarget / DeviceRTL / src / Kernel.cpp
blobf7d8ff8e565c185617f35e0188a2a588e694bd22
1 //===--- Kernel.cpp - OpenMP device kernel interface -------------- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file contains the kernel entry points for the device.
11 //===----------------------------------------------------------------------===//
13 #include "Allocator.h"
14 #include "Debug.h"
15 #include "Environment.h"
16 #include "Interface.h"
17 #include "Mapping.h"
18 #include "State.h"
19 #include "Synchronization.h"
20 #include "Types.h"
22 #include "llvm/Frontend/OpenMP/OMPDeviceConstants.h"
24 using namespace ompx;
26 #pragma omp begin declare target device_type(nohost)
28 static void
29 inititializeRuntime(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment,
30 KernelLaunchEnvironmentTy &KernelLaunchEnvironment) {
31 // Order is important here.
32 synchronize::init(IsSPMD);
33 mapping::init(IsSPMD);
34 state::init(IsSPMD, KernelEnvironment, KernelLaunchEnvironment);
35 allocator::init(IsSPMD, KernelEnvironment);
38 /// Simple generic state machine for worker threads.
39 static void genericStateMachine(IdentTy *Ident) {
40 uint32_t TId = mapping::getThreadIdInBlock();
42 do {
43 ParallelRegionFnTy WorkFn = nullptr;
45 // Wait for the signal that we have a new work function.
46 synchronize::threads(atomic::seq_cst);
48 // Retrieve the work function from the runtime.
49 bool IsActive = __kmpc_kernel_parallel(&WorkFn);
51 // If there is nothing more to do, break out of the state machine by
52 // returning to the caller.
53 if (!WorkFn)
54 return;
56 if (IsActive) {
57 ASSERT(!mapping::isSPMDMode(), nullptr);
58 ((void (*)(uint32_t, uint32_t))WorkFn)(0, TId);
59 __kmpc_kernel_end_parallel();
62 synchronize::threads(atomic::seq_cst);
64 } while (true);
67 extern "C" {
69 /// Initialization
70 ///
71 /// \param Ident Source location identification, can be NULL.
72 ///
73 int32_t __kmpc_target_init(KernelEnvironmentTy &KernelEnvironment,
74 KernelLaunchEnvironmentTy &KernelLaunchEnvironment) {
75 ConfigurationEnvironmentTy &Configuration = KernelEnvironment.Configuration;
76 bool IsSPMD = Configuration.ExecMode &
77 llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_SPMD;
78 bool UseGenericStateMachine = Configuration.UseGenericStateMachine;
79 if (IsSPMD) {
80 inititializeRuntime(/* IsSPMD */ true, KernelEnvironment,
81 KernelLaunchEnvironment);
82 synchronize::threadsAligned(atomic::relaxed);
83 } else {
84 inititializeRuntime(/* IsSPMD */ false, KernelEnvironment,
85 KernelLaunchEnvironment);
86 // No need to wait since only the main threads will execute user
87 // code and workers will run into a barrier right away.
90 if (IsSPMD) {
91 state::assumeInitialState(IsSPMD);
93 // Synchronize to ensure the assertions above are in an aligned region.
94 // The barrier is eliminated later.
95 synchronize::threadsAligned(atomic::relaxed);
96 return -1;
99 if (mapping::isInitialThreadInLevel0(IsSPMD))
100 return -1;
102 // Enter the generic state machine if enabled and if this thread can possibly
103 // be an active worker thread.
105 // The latter check is important for NVIDIA Pascal (but not Volta) and AMD
106 // GPU. In those cases, a single thread can apparently satisfy a barrier on
107 // behalf of all threads in the same warp. Thus, it would not be safe for
108 // other threads in the main thread's warp to reach the first
109 // synchronize::threads call in genericStateMachine before the main thread
110 // reaches its corresponding synchronize::threads call: that would permit all
111 // active worker threads to proceed before the main thread has actually set
112 // state::ParallelRegionFn, and then they would immediately quit without
113 // doing any work. mapping::getMaxTeamThreads() does not include any of the
114 // main thread's warp, so none of its threads can ever be active worker
115 // threads.
116 if (UseGenericStateMachine &&
117 mapping::getThreadIdInBlock() < mapping::getMaxTeamThreads(IsSPMD))
118 genericStateMachine(KernelEnvironment.Ident);
120 return mapping::getThreadIdInBlock();
123 /// De-Initialization
125 /// In non-SPMD, this function releases the workers trapped in a state machine
126 /// and also any memory dynamically allocated by the runtime.
128 /// \param Ident Source location identification, can be NULL.
130 void __kmpc_target_deinit() {
131 bool IsSPMD = mapping::isSPMDMode();
132 if (IsSPMD)
133 return;
135 if (mapping::isInitialThreadInLevel0(IsSPMD)) {
136 // Signal the workers to exit the state machine and exit the kernel.
137 state::ParallelRegionFn = nullptr;
138 } else if (!state::getKernelEnvironment()
139 .Configuration.UseGenericStateMachine) {
140 // Retrieve the work function just to ensure we always call
141 // __kmpc_kernel_parallel even if a custom state machine is used.
142 // TODO: this is not super pretty. The problem is we create the call to
143 // __kmpc_kernel_parallel in the openmp-opt pass but while we optimize it
144 // is not there yet. Thus, we assume we never reach it from
145 // __kmpc_target_deinit. That allows us to remove the store in there to
146 // ParallelRegionFn, which leads to bad results later on.
147 ParallelRegionFnTy WorkFn = nullptr;
148 __kmpc_kernel_parallel(&WorkFn);
149 ASSERT(WorkFn == nullptr, nullptr);
153 int8_t __kmpc_is_spmd_exec_mode() { return mapping::isSPMDMode(); }
156 #pragma omp end declare target