//===---- omptargeti.h - OpenMP GPU initialization --------------- CUDA -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains the declarations of all library macros, types,
// and functions.
//
//===----------------------------------------------------------------------===//

////////////////////////////////////////////////////////////////////////////////
// Task Descriptor
////////////////////////////////////////////////////////////////////////////////

INLINE omp_sched_t omptarget_nvptx_TaskDescr::GetRuntimeSched() const {
  // sched is stored as 0..3 in the flags; omp_sched_t starts from 1..4, so add
  // 1 here
  uint8_t rc = (items.flags & TaskDescr_SchedMask) + 1;
  return (omp_sched_t)rc;
}

INLINE void omptarget_nvptx_TaskDescr::SetRuntimeSched(omp_sched_t sched) {
  // sched starts from 1..4; encode it as 0..3, so subtract 1 here
  uint8_t val = ((uint8_t)sched) - 1;
  // clear current sched
  items.flags &= ~TaskDescr_SchedMask;
  // set new sched
  items.flags |= val;
}
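
// Illustrative round trip of the encoding above (a sketch, not part of the
// runtime; it assumes the standard OpenMP values where omp_sched_static == 1
// and a descriptor 'td' that is otherwise initialized):
//
//   td.SetRuntimeSched(omp_sched_static); // stores 1 - 1 == 0 in the
//                                         // TaskDescr_SchedMask bits
//   omp_sched_t s = td.GetRuntimeSched(); // reads 0 back, returns
//                                         // (omp_sched_t)(0 + 1), i.e.
//                                         // omp_sched_static again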

INLINE void omptarget_nvptx_TaskDescr::InitLevelZeroTaskDescr() {
  // default sched is static,
  // dyn is off (unused now anyway, but may need to sample from host?)
  items.flags = 0;
  items.threadId = 0;         // is master
  items.runtimeChunkSize = 1; // preferred static chunking with chunk 1
}

// This is called when all threads are started together in SPMD mode.
// OMP directives include target parallel, target distribute parallel for, etc.
INLINE void omptarget_nvptx_TaskDescr::InitLevelOneTaskDescr(
    omptarget_nvptx_TaskDescr *parentTaskDescr) {
  // default sched is static,
  // dyn is off (unused now anyway, but may need to sample from host?)
  items.flags =
      TaskDescr_InPar | TaskDescr_IsParConstr; // set flag to parallel
  items.threadId =
      __kmpc_get_hardware_thread_id_in_block(); // get ids from cuda (only
                                                // called for 1st level)
  items.runtimeChunkSize = 1; // preferred static chunking with chunk 1
  prev = parentTaskDescr;
}

INLINE void omptarget_nvptx_TaskDescr::CopyData(
    omptarget_nvptx_TaskDescr *sourceTaskDescr) {
  items = sourceTaskDescr->items;
}

INLINE void
omptarget_nvptx_TaskDescr::Copy(omptarget_nvptx_TaskDescr *sourceTaskDescr) {
  CopyData(sourceTaskDescr);
  prev = sourceTaskDescr->prev;
}

INLINE void omptarget_nvptx_TaskDescr::CopyParent(
    omptarget_nvptx_TaskDescr *parentTaskDescr) {
  CopyData(parentTaskDescr);
  prev = parentTaskDescr;
}

INLINE void omptarget_nvptx_TaskDescr::CopyForExplicitTask(
    omptarget_nvptx_TaskDescr *parentTaskDescr) {
  CopyParent(parentTaskDescr);
  items.flags = items.flags & ~TaskDescr_IsParConstr;
  ASSERT0(LT_FUSSY, IsTaskConstruct(), "expected task");
}

INLINE void omptarget_nvptx_TaskDescr::CopyToWorkDescr(
    omptarget_nvptx_TaskDescr *masterTaskDescr) {
  CopyParent(masterTaskDescr);
  // overwrite specific items;
  items.flags |=
      TaskDescr_InPar | TaskDescr_IsParConstr; // set flag to parallel
}

INLINE void omptarget_nvptx_TaskDescr::CopyFromWorkDescr(
    omptarget_nvptx_TaskDescr *workTaskDescr) {
  Copy(workTaskDescr);
  // overwrite specific items;
  //
  // The threadID should be __kmpc_get_hardware_thread_id_in_block() %
  // GetMasterThreadID(). This is so that the serial master (first lane in the
  // master warp) gets a threadId of 0. However, we know that this function is
  // always called in a parallel region where only workers are active. The
  // serial master thread never enters this region. When a parallel region is
  // executed serially, the threadId is set to 0 elsewhere and the
  // kmpc_serialized_* functions are called, which never activate this region.
  items.threadId =
      __kmpc_get_hardware_thread_id_in_block(); // get ids from cuda (only
                                                // called for 1st level)
}
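
// Worked example of the reasoning above (assumed numbers, not taken from this
// file): in a 128-thread block where the serial master sits in the last warp,
// GetMasterThreadID() would be 96; workers have hardware ids 0..95, so
// id % 96 == id and the modulo can be dropped, while the master (id 96) never
// reaches this copy.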

INLINE void omptarget_nvptx_TaskDescr::CopyConvergentParent(
    omptarget_nvptx_TaskDescr *parentTaskDescr, uint16_t tid, uint16_t tnum) {
  CopyParent(parentTaskDescr);
  items.flags |= TaskDescr_InParL2P; // In L2+ parallelism
  items.threadId = tid;
}

INLINE void omptarget_nvptx_TaskDescr::SaveLoopData() {
  loopData.loopUpperBound =
      omptarget_nvptx_threadPrivateContext->LoopUpperBound(items.threadId);
  loopData.nextLowerBound =
      omptarget_nvptx_threadPrivateContext->NextLowerBound(items.threadId);
  loopData.schedule =
      omptarget_nvptx_threadPrivateContext->ScheduleType(items.threadId);
  loopData.chunk = omptarget_nvptx_threadPrivateContext->Chunk(items.threadId);
  loopData.stride =
      omptarget_nvptx_threadPrivateContext->Stride(items.threadId);
}

INLINE void omptarget_nvptx_TaskDescr::RestoreLoopData() const {
  omptarget_nvptx_threadPrivateContext->Chunk(items.threadId) = loopData.chunk;
  omptarget_nvptx_threadPrivateContext->LoopUpperBound(items.threadId) =
      loopData.loopUpperBound;
  omptarget_nvptx_threadPrivateContext->NextLowerBound(items.threadId) =
      loopData.nextLowerBound;
  omptarget_nvptx_threadPrivateContext->Stride(items.threadId) =
      loopData.stride;
  omptarget_nvptx_threadPrivateContext->ScheduleType(items.threadId) =
      loopData.schedule;
}
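
// Sketch of how SaveLoopData/RestoreLoopData are meant to pair up
// (hypothetical caller, not part of this file): the per-thread dispatch slots
// in omptarget_nvptx_threadPrivateContext are shared across nesting levels, so
// a descriptor stashes them before an inner construct reuses the slots and
// puts them back afterwards.
//
//   taskDescr->SaveLoopData();    // stash bounds, stride, chunk, schedule
//   // ... inner construct overwrites the per-thread loop slots ...
//   taskDescr->RestoreLoopData(); // outer loop resumes with its own state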

////////////////////////////////////////////////////////////////////////////////
// Thread Private Context
////////////////////////////////////////////////////////////////////////////////

INLINE omptarget_nvptx_TaskDescr *
omptarget_nvptx_ThreadPrivateContext::GetTopLevelTaskDescr(int tid) const {
  ASSERT0(
      LT_FUSSY, tid < MAX_THREADS_PER_TEAM,
      "Getting top level, tid is larger than allocated data structure size");
  return topTaskDescr[tid];
}

INLINE void
omptarget_nvptx_ThreadPrivateContext::InitThreadPrivateContext(int tid) {
  // levelOneTaskDescr is init when starting the parallel region
  // top task descr is NULL (team master version will be fixed separately)
  topTaskDescr[tid] = NULL;
  // the following don't need to be init here; they are init when using dynamic
  // scheduling: current_Event, events_Number, chunk, num_Iterations, schedule
}

////////////////////////////////////////////////////////////////////////////////
// Team Descriptor
////////////////////////////////////////////////////////////////////////////////

INLINE void omptarget_nvptx_TeamDescr::InitTeamDescr() {
  levelZeroTaskDescr.InitLevelZeroTaskDescr();
}

////////////////////////////////////////////////////////////////////////////////
// Get private data structure for thread
////////////////////////////////////////////////////////////////////////////////

// Utility routines for CUDA threads
INLINE omptarget_nvptx_TeamDescr &getMyTeamDescriptor() {
  return omptarget_nvptx_threadPrivateContext->TeamContext();
}

INLINE omptarget_nvptx_WorkDescr &getMyWorkDescriptor() {
  omptarget_nvptx_TeamDescr &currTeamDescr = getMyTeamDescriptor();
  return currTeamDescr.WorkDescr();
}

INLINE omptarget_nvptx_TaskDescr *getMyTopTaskDescriptor(int threadId) {
  return omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId);
}

INLINE omptarget_nvptx_TaskDescr *
getMyTopTaskDescriptor(bool isSPMDExecutionMode) {
  return getMyTopTaskDescriptor(GetLogicalThreadIdInBlock());
}

////////////////////////////////////////////////////////////////////////////////
// Memory management runtime functions.
////////////////////////////////////////////////////////////////////////////////

INLINE void omptarget_nvptx_SimpleMemoryManager::Release() {
  ASSERT0(LT_FUSSY, usedSlotIdx < MAX_SM,
          "SlotIdx is too big or uninitialized.");
  ASSERT0(LT_FUSSY, usedMemIdx < OMP_STATE_COUNT,
          "MemIdx is too big or uninitialized.");
  MemDataTy &MD = MemData[usedSlotIdx];
  __kmpc_atomic_exchange((unsigned *)&MD.keys[usedMemIdx], 0u);
}

INLINE const void *
omptarget_nvptx_SimpleMemoryManager::Acquire(const void *buf, size_t size) {
  ASSERT0(LT_FUSSY, usedSlotIdx < MAX_SM,
          "SlotIdx is too big or uninitialized.");
  const unsigned sm = usedSlotIdx;
  MemDataTy &MD = MemData[sm];
  unsigned i = hash(GetBlockIdInKernel());
  // Linearly probe the keys for this SM until an unclaimed slot (key == 0) is
  // found; the CAS marks it as claimed.
  while (__kmpc_atomic_cas((unsigned *)&MD.keys[i], 0u, 1u) != 0) {
    i = hash(i + 1);
  }
  usedMemIdx = i;
  return static_cast<const char *>(buf) + (sm * OMP_STATE_COUNT + i) * size;
}
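
// Usage sketch (hypothetical caller; the manager instance and buffer layout
// are assumptions, not defined in this file): Acquire claims one of the
// OMP_STATE_COUNT per-SM slots by probing MD.keys with an atomic CAS and
// returns the matching sub-buffer of 'buf'; Release clears the claimed key so
// another block can reuse that slot.
//
//   const void *state = simpleMemoryManager.Acquire(stateBuffer, stateSize);
//   // ... use the returned region ...
//   simpleMemoryManager.Release();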