//===---- omptarget.h - OpenMP GPU initialization ---------------- CUDA -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains the declarations of all library macros, types,
// and functions.
//
//===----------------------------------------------------------------------===//
#ifndef OMPTARGET_H
#define OMPTARGET_H

// std includes
#include <stdint.h>
#include <stdlib.h>

#include "common/allocator.h"
#include "common/debug.h" // debug
#include "common/state-queue.h"
#include "common/support.h"
#include "interface.h" // interfaces with omp, compiler, and user
#include "target_impl.h"

#define OMPTARGET_NVPTX_VERSION 1.1
// used by the library for the interface with the app
#define DISPATCH_FINISHED 0
#define DISPATCH_NOTFINISHED 1

// used by dynamic scheduling
#define NOT_FINISHED 1

#define BARRIER_COUNTER 0
#define ORDERED_COUNTER 1
// Worker slot type which is initialized with the default worker slot
// size of 4*32 bytes.
struct __kmpc_data_sharing_slot {
  __kmpc_data_sharing_slot *Next;
  __kmpc_data_sharing_slot *Prev;
  void *PrevSlotStackPtr;
  void *DataEnd;
  char Data[DS_Worker_Warp_Slot_Size];
};
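// Illustrative only: a minimal sketch (not part of the original interface) of
// how a slot's remaining capacity can be derived from Data/DataEnd, assuming
// DataEnd points one past the last usable byte, as set up in
// omptarget_nvptx_TeamDescr::GetPreallocatedSlotAddr() below. The helper name
// is hypothetical.
INLINE size_t ExampleSlotBytesLeft(const __kmpc_data_sharing_slot *Slot,
                                   const void *StackPtr) {
  // Bytes remaining between the current stack pointer and the end of the
  // slot's data area.
  return (size_t)((const char *)Slot->DataEnd - (const char *)StackPtr);
}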
////////////////////////////////////////////////////////////////////////////////
// task ICV and (implicit & explicit) task state

class omptarget_nvptx_TaskDescr {
public:
  // methods for flags
  INLINE omp_sched_t GetRuntimeSched() const;
  INLINE void SetRuntimeSched(omp_sched_t sched);
  INLINE int InParallelRegion() const { return items.flags & TaskDescr_InPar; }
  INLINE int InL2OrHigherParallelRegion() const {
    return items.flags & TaskDescr_InParL2P;
  }
  INLINE int IsParallelConstruct() const {
    return items.flags & TaskDescr_IsParConstr;
  }
  INLINE int IsTaskConstruct() const { return !IsParallelConstruct(); }
  // methods for other fields
  INLINE uint16_t &ThreadId() { return items.threadId; }
  INLINE uint64_t &RuntimeChunkSize() { return items.runtimeChunkSize; }
  INLINE omptarget_nvptx_TaskDescr *GetPrevTaskDescr() const { return prev; }
  INLINE void SetPrevTaskDescr(omptarget_nvptx_TaskDescr *taskDescr) {
    prev = taskDescr;
  }
  INLINE void InitLevelZeroTaskDescr();
  INLINE void InitLevelOneTaskDescr(omptarget_nvptx_TaskDescr *parentTaskDescr);
  INLINE void Copy(omptarget_nvptx_TaskDescr *sourceTaskDescr);
  INLINE void CopyData(omptarget_nvptx_TaskDescr *sourceTaskDescr);
  INLINE void CopyParent(omptarget_nvptx_TaskDescr *parentTaskDescr);
  INLINE void CopyForExplicitTask(omptarget_nvptx_TaskDescr *parentTaskDescr);
  INLINE void CopyToWorkDescr(omptarget_nvptx_TaskDescr *masterTaskDescr);
  INLINE void CopyFromWorkDescr(omptarget_nvptx_TaskDescr *workTaskDescr);
  INLINE void CopyConvergentParent(omptarget_nvptx_TaskDescr *parentTaskDescr,
                                   uint16_t tid, uint16_t tnum);
  INLINE void SaveLoopData();
  INLINE void RestoreLoopData() const;
private:
  // bits for flags: (6 used, 2 free)
  //   3 bits (SchedMask) for runtime schedule
  //   1 bit (InPar) if this thread has encountered one or more parallel regions
  //   1 bit (IsParConstr) if ICV for a parallel region (false = explicit task)
  //   1 bit (InParL2+) if this thread has encountered an L2 or higher parallel
  //   region
  static const uint8_t TaskDescr_SchedMask = (0x1 | 0x2 | 0x4);
  static const uint8_t TaskDescr_InPar = 0x10;
  static const uint8_t TaskDescr_IsParConstr = 0x20;
  static const uint8_t TaskDescr_InParL2P = 0x40;

  struct SavedLoopDescr_items {
    int64_t loopUpperBound;
    int64_t nextLowerBound;
    kmp_sched_t schedule;
  } loopData;

  struct TaskDescr_items {
    uint8_t flags; // 6 bits used (see flags above)
    uint16_t threadId;         // thread id
    uint64_t runtimeChunkSize; // runtime chunk size
  } items;

  omptarget_nvptx_TaskDescr *prev;
};
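// Illustrative only: a minimal sketch (not the library's implementation, which
// lives in common/omptargeti.h) of how a 3-bit field selected by a mask like
// TaskDescr_SchedMask can carry an omp_sched_t value. The encoding shown
// (storing sched - omp_sched_static in the low bits) and the helper names are
// assumptions for this example; omp_sched_static is the standard OpenMP
// enumerator assumed to come from interface.h.
INLINE uint8_t ExampleEncodeSched(uint8_t flags, omp_sched_t sched) {
  const uint8_t Mask = 0x1 | 0x2 | 0x4; // same bits as TaskDescr_SchedMask
  return (uint8_t)((flags & ~Mask) |
                   (((uint8_t)sched - (uint8_t)omp_sched_static) & Mask));
}
INLINE omp_sched_t ExampleDecodeSched(uint8_t flags) {
  const uint8_t Mask = 0x1 | 0x2 | 0x4;
  return (omp_sched_t)((flags & Mask) + (uint8_t)omp_sched_static);
}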
typedef struct omptarget_nvptx_ExplicitTaskDescr {
  omptarget_nvptx_TaskDescr
      taskDescr;              // omptarget_nvptx task description (must be first)
  kmp_TaskDescr kmpTaskDescr; // kmp task description (must be last)
} omptarget_nvptx_ExplicitTaskDescr;
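// Illustrative only: because taskDescr is required to be the first member, a
// pointer to an omptarget_nvptx_ExplicitTaskDescr and a pointer to its
// embedded omptarget_nvptx_TaskDescr refer to the same address, so one can be
// recovered from the other. A hypothetical sketch of that conversion (not a
// function of this header):
INLINE omptarget_nvptx_ExplicitTaskDescr *
ExampleTaskToExplicitTask(omptarget_nvptx_TaskDescr *taskDescr) {
  // Valid only for descriptors that actually live inside an
  // omptarget_nvptx_ExplicitTaskDescr, since taskDescr is its first member.
  return reinterpret_cast<omptarget_nvptx_ExplicitTaskDescr *>(taskDescr);
}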
////////////////////////////////////////////////////////////////////////////////
// Descriptor of a parallel region (worksharing in general)

class omptarget_nvptx_WorkDescr {
public:
  INLINE omptarget_nvptx_TaskDescr *WorkTaskDescr() { return &masterTaskICV; }

private:
  omptarget_nvptx_TaskDescr masterTaskICV;
};
////////////////////////////////////////////////////////////////////////////////

class omptarget_nvptx_TeamDescr {
public:
  INLINE omptarget_nvptx_TaskDescr *LevelZeroTaskDescr() {
    return &levelZeroTaskDescr;
  }
  INLINE omptarget_nvptx_WorkDescr &WorkDescr() {
    return workDescrForActiveParallel;
  }

  INLINE void InitTeamDescr();

  INLINE __kmpc_data_sharing_slot *GetPreallocatedSlotAddr(int wid) {
    worker_rootS[wid].DataEnd =
        &worker_rootS[wid].Data[0] + DS_Worker_Warp_Slot_Size;
    // We currently do not have a next slot.
    worker_rootS[wid].Next = 0;
    worker_rootS[wid].Prev = 0;
    worker_rootS[wid].PrevSlotStackPtr = 0;
    return (__kmpc_data_sharing_slot *)&worker_rootS[wid];
  }

private:
  omptarget_nvptx_TaskDescr
      levelZeroTaskDescr; // ICV for the team master initial thread
  omptarget_nvptx_WorkDescr
      workDescrForActiveParallel; // one, ONLY for the active parallel region

  __kmpc_data_sharing_slot worker_rootS[DS_Max_Warp_Number];
};
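// Illustrative only: a hypothetical usage sketch (not part of this header)
// showing what GetPreallocatedSlotAddr() establishes for a warp's slot; the
// helper name and variable names are assumptions.
INLINE size_t ExampleWarpSlotCapacity(omptarget_nvptx_TeamDescr &teamDescr,
                                      int warpId) {
  // GetPreallocatedSlotAddr() clears the slot's links and sets DataEnd to one
  // past the last byte of Data before handing the slot back.
  __kmpc_data_sharing_slot *slot = teamDescr.GetPreallocatedSlotAddr(warpId);
  // This therefore recovers DS_Worker_Warp_Slot_Size.
  return (size_t)((char *)slot->DataEnd - &slot->Data[0]);
}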
////////////////////////////////////////////////////////////////////////////////
// thread private data (struct of arrays for better coalescing)
// tid refers here to the global thread id
// do not support multiple concurrent kernels at this time
class omptarget_nvptx_ThreadPrivateContext {
public:
  INLINE omptarget_nvptx_TaskDescr *Level1TaskDescr(int tid) {
    return &levelOneTaskDescr[tid];
  }
  INLINE void SetTopLevelTaskDescr(int tid,
                                   omptarget_nvptx_TaskDescr *taskICV) {
    topTaskDescr[tid] = taskICV;
  }
  INLINE omptarget_nvptx_TaskDescr *GetTopLevelTaskDescr(int tid) const;
  // schedule (for dispatch)
  INLINE kmp_sched_t &ScheduleType(int tid) { return schedule[tid]; }
  INLINE int64_t &Chunk(int tid) { return chunk[tid]; }
  INLINE int64_t &LoopUpperBound(int tid) { return loopUpperBound[tid]; }
  INLINE int64_t &NextLowerBound(int tid) { return nextLowerBound[tid]; }
  INLINE int64_t &Stride(int tid) { return stride[tid]; }

  INLINE omptarget_nvptx_TeamDescr &TeamContext() { return teamContext; }

  INLINE void InitThreadPrivateContext(int tid);
  INLINE uint64_t &Cnt() { return cnt; }
private:
  // team context for this team
  omptarget_nvptx_TeamDescr teamContext;
  // task ICV for implicit threads in the only parallel region
  omptarget_nvptx_TaskDescr levelOneTaskDescr[MAX_THREADS_PER_TEAM];
  // pointer where to find the current task ICV (top of the stack)
  omptarget_nvptx_TaskDescr *topTaskDescr[MAX_THREADS_PER_TEAM];
  // schedule (for dispatch)
  kmp_sched_t schedule[MAX_THREADS_PER_TEAM]; // remember schedule type for #for
  int64_t chunk[MAX_THREADS_PER_TEAM];
  int64_t loopUpperBound[MAX_THREADS_PER_TEAM];
  // state for dispatch with dyn/guided OR static (never use both at a time)
  int64_t nextLowerBound[MAX_THREADS_PER_TEAM];
  int64_t stride[MAX_THREADS_PER_TEAM];
  uint64_t cnt;
};
/// Memory manager for statically allocated memory.
class omptarget_nvptx_SimpleMemoryManager {
private:
  struct {
    volatile unsigned keys[OMP_STATE_COUNT];
  } MemData[MAX_SM] ALIGN(128);

  INLINE static uint32_t hash(unsigned key) {
    return key & (OMP_STATE_COUNT - 1);
  }

public:
  INLINE void Release();
  INLINE const void *Acquire(const void *buf, size_t size);
};
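// Illustrative only: hash() above maps a key into [0, OMP_STATE_COUNT) with a
// bitwise AND, which is equivalent to key % OMP_STATE_COUNT only when
// OMP_STATE_COUNT is a power of two. A hypothetical standalone sketch of the
// same idea (not a function of this header):
INLINE uint32_t ExamplePow2Bucket(uint32_t key, uint32_t numBuckets) {
  // Assumes numBuckets is a power of two, so (numBuckets - 1) is an all-ones
  // mask selecting the low bits of key.
  return key & (numBuckets - 1);
}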
////////////////////////////////////////////////////////////////////////////////

////////////////////////////////////////////////////////////////////////////////
// global data tables
////////////////////////////////////////////////////////////////////////////////

extern omptarget_nvptx_SimpleMemoryManager omptarget_nvptx_simpleMemoryManager;
extern uint32_t EXTERN_SHARED(usedMemIdx);
extern uint32_t EXTERN_SHARED(usedSlotIdx);
#if _OPENMP
extern uint8_t parallelLevel[MAX_THREADS_PER_TEAM / WARPSIZE];
#pragma omp allocate(parallelLevel) allocator(omp_pteam_mem_alloc)
#else
extern uint8_t EXTERN_SHARED(parallelLevel)[MAX_THREADS_PER_TEAM / WARPSIZE];
#endif
extern uint16_t EXTERN_SHARED(threadLimit);
extern uint16_t EXTERN_SHARED(threadsInTeam);
extern uint16_t EXTERN_SHARED(nThreads);
extern omptarget_nvptx_ThreadPrivateContext *
    EXTERN_SHARED(omptarget_nvptx_threadPrivateContext);

extern int8_t EXTERN_SHARED(execution_param);
extern void *EXTERN_SHARED(ReductionScratchpadPtr);
////////////////////////////////////////////////////////////////////////////////
// work function (outlined parallel/simd functions) and arguments.
// needed for L1 parallelism only.
////////////////////////////////////////////////////////////////////////////////

typedef void *omptarget_nvptx_WorkFn;
extern omptarget_nvptx_WorkFn EXTERN_SHARED(omptarget_nvptx_workFn);
////////////////////////////////////////////////////////////////////////////////
// get private data structures
////////////////////////////////////////////////////////////////////////////////

INLINE omptarget_nvptx_TeamDescr &getMyTeamDescriptor();
INLINE omptarget_nvptx_WorkDescr &getMyWorkDescriptor();
INLINE omptarget_nvptx_TaskDescr *
getMyTopTaskDescriptor(bool isSPMDExecutionMode);
INLINE omptarget_nvptx_TaskDescr *getMyTopTaskDescriptor(int globalThreadId);
////////////////////////////////////////////////////////////////////////////////
// inlined implementation
////////////////////////////////////////////////////////////////////////////////

INLINE uint32_t __kmpc_impl_ffs(uint32_t x) { return __builtin_ffs(x); }
INLINE uint32_t __kmpc_impl_popc(uint32_t x) { return __builtin_popcount(x); }
INLINE uint32_t __kmpc_impl_ffs(uint64_t x) { return __builtin_ffsl(x); }
INLINE uint32_t __kmpc_impl_popc(uint64_t x) { return __builtin_popcountl(x); }
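// Note: like the GCC/Clang builtins they wrap, the __kmpc_impl_ffs overloads
// return one plus the index of the least significant set bit of x, or 0 when
// x is 0; the __kmpc_impl_popc overloads return the number of set bits.
//
// Illustrative only: a hypothetical use of these wrappers to pick the lane id
// of the first active thread in a mask (an assumption about typical usage, not
// a function of this header).
INLINE uint32_t ExampleFirstActiveLane(uint32_t activeMask) {
  // __kmpc_impl_ffs() is 1-based, so subtract 1 to get a 0-based lane index;
  // only meaningful when at least one bit of activeMask is set.
  return __kmpc_impl_ffs(activeMask) - 1;
}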
#include "common/omptargeti.h"

#endif // OMPTARGET_H