//===----- Workshare.cpp - OpenMP workshare implementation ------ C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains the implementation of the KMPC interface
// for the loop construct plus other worksharing constructs that use the same
// interface as loops.
//
//===----------------------------------------------------------------------===//

#include "Interface.h"
#include "Synchronization.h"

struct DynamicScheduleTracker {
  int64_t Chunk;
  int64_t LoopUpperBound;
  int64_t NextLowerBound;
  int64_t Stride;
  kmp_sched_t ScheduleType;
  DynamicScheduleTracker *NextDST;
};

// used by the library for the interface with the app
#define DISPATCH_FINISHED 0
#define DISPATCH_NOTFINISHED 1

// used by dynamic scheduling
#define FINISHED 0
#define NOT_FINISHED 1
#define LAST_CHUNK 2

#pragma omp begin declare target device_type(nohost)

// TODO: This variable is a hack inherited from the old runtime.
static uint64_t SHARED(Cnt);

template <typename T, typename ST> struct omptarget_nvptx_LoopSupport {
  ////////////////////////////////////////////////////////////////////////////////
  // Loop with static scheduling with chunk

  // Generic implementation of OMP loop scheduling with static policy
  /*! \brief Calculate initial bounds for static loop and stride
   *  @param[in] loc location in code of the call (not used here)
   *  @param[in] global_tid global thread id
   *  @param[in] schedtype type of scheduling (see omptarget-nvptx.h)
   *  @param[in] plastiter pointer to last iteration
   *  @param[in,out] pointer to loop lower bound; it will contain the value of
   *  the lower bound of the first chunk
   *  @param[in,out] pointer to loop upper bound; it will contain the value of
   *  the upper bound of the first chunk
   *  @param[in,out] pointer to loop stride; it will contain the value of the
   *  stride between two successive chunks executed by the same thread
   *  @param[in] loop increment bump
   *  @param[in] chunk size
   */

  // helper function for static chunk
  static void ForStaticChunk(int &last, T &lb, T &ub, ST &stride, ST chunk,
                             T entityId, T numberOfEntities) {
    // Each thread executes multiple chunks, all of the same size, except
    // possibly the last one.
    // Distance between two successive chunks executed by the same thread.
    stride = numberOfEntities * chunk;
    lb = lb + entityId * chunk;
    T inputUb = ub;
    ub = lb + chunk - 1; // Clang uses i <= ub
    // Say ub' is the beginning of the last chunk. Then whoever has a
    // lower bound plus a multiple of the increment equal to ub' is
    // the one that executes the last chunk.
    T beginningLastChunk = inputUb - (inputUb % chunk);
    last = ((beginningLastChunk - lb) % stride) == 0;
  }
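
  // Illustrative example (not from the original source): with lb=0, ub=9,
  // chunk=2 and numberOfEntities=2, the stride is 2*2=4. Entity 0 gets the
  // first chunk [0,1] and will later also cover [4,5] and [8,9]; entity 1
  // gets [2,3] and later [6,7]. The last chunk starts at 9-(9%2)=8, which is
  // a whole number of strides away from entity 0's lower bound, so only
  // entity 0 reports last=1.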

  ////////////////////////////////////////////////////////////////////////////////
  // Loop with static scheduling without chunk

  // helper function for static no chunk
  static void ForStaticNoChunk(int &last, T &lb, T &ub, ST &stride, ST &chunk,
                               T entityId, T numberOfEntities) {
    // No chunk size specified. Each thread or warp gets at most one
    // chunk; chunks are all of almost equal size.
    T loopSize = ub - lb + 1;

    chunk = loopSize / numberOfEntities;
    T leftOver = loopSize - chunk * numberOfEntities;

    if (entityId < leftOver) {
      chunk++;
      lb = lb + entityId * chunk;
    } else {
      lb = lb + entityId * chunk + leftOver;
    }

    T inputUb = ub;
    ub = lb + chunk - 1; // Clang uses i <= ub
    last = lb <= inputUb && inputUb <= ub;
    stride = loopSize; // make sure we only do 1 chunk per warp
  }
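
  // Illustrative example (not from the original source): with lb=0, ub=9 and
  // numberOfEntities=3, loopSize=10, chunk=10/3=3 and leftOver=1. Entity 0
  // (id < leftOver) grows its chunk to 4 and covers [0,3]; entity 1 covers
  // [4,6]; entity 2 covers [7,9] and reports last=1 because the original
  // upper bound falls inside its range. The stride of loopSize guarantees a
  // single chunk per entity.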

  ////////////////////////////////////////////////////////////////////////////////
  // Support for Static Init

  static void for_static_init(int32_t, int32_t schedtype, int32_t *plastiter,
                              T *plower, T *pupper, ST *pstride, ST chunk,
                              bool IsSPMDExecutionMode) {
    int32_t gtid = omp_get_thread_num();
    int numberOfActiveOMPThreads = omp_get_num_threads();

    // All warps that are in excess of the maximum requested do
    // not execute the loop.
    ASSERT0(LT_FUSSY, gtid < numberOfActiveOMPThreads,
            "current thread is not needed here; error");

    // copy
    int lastiter = 0;
    T lb = *plower;
    T ub = *pupper;
    ST stride = *pstride;

    // init
    switch (SCHEDULE_WITHOUT_MODIFIERS(schedtype)) {
    case kmp_sched_static_chunk: {
      if (chunk > 0) {
        ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid,
                       numberOfActiveOMPThreads);
        break;
      }
    } // note: if chunk <=0, use nochunk
    case kmp_sched_static_balanced_chunk: {
      if (chunk > 0) {
        // round up to make sure the chunk is enough to cover all iterations
        T tripCount = ub - lb + 1; // +1 because ub is inclusive
        T span = (tripCount + numberOfActiveOMPThreads - 1) /
                 numberOfActiveOMPThreads;
        // perform chunk adjustment
        chunk = (span + chunk - 1) & ~(chunk - 1);

        ASSERT0(LT_FUSSY, ub >= lb, "ub must be >= lb.");
        T oldUb = ub;
        ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid,
                       numberOfActiveOMPThreads);
        if (ub > oldUb)
          ub = oldUb;
        break;
      }
    } // note: if chunk <=0, use nochunk
    case kmp_sched_static_nochunk: {
      ForStaticNoChunk(lastiter, lb, ub, stride, chunk, gtid,
                       numberOfActiveOMPThreads);
      break;
    }
    case kmp_sched_distr_static_chunk: {
      if (chunk > 0) {
        ForStaticChunk(lastiter, lb, ub, stride, chunk, omp_get_team_num(),
                       omp_get_num_teams());
        break;
      }
    } // note: if chunk <=0, use nochunk
    case kmp_sched_distr_static_nochunk: {
      ForStaticNoChunk(lastiter, lb, ub, stride, chunk, omp_get_team_num(),
                       omp_get_num_teams());
      break;
    }
    case kmp_sched_distr_static_chunk_sched_static_chunkone: {
      ForStaticChunk(lastiter, lb, ub, stride, chunk,
                     numberOfActiveOMPThreads * omp_get_team_num() + gtid,
                     omp_get_num_teams() * numberOfActiveOMPThreads);
      break;
    }
    default: {
      // ASSERT(LT_FUSSY, 0, "unknown schedtype %d", (int)schedtype);
      ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid,
                     numberOfActiveOMPThreads);
      break;
    }
    }

    // copy back
    *plastiter = lastiter;
    *plower = lb;
    *pupper = ub;
    *pstride = stride;
  }
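
  // Illustrative note (not from the original source): the balanced-chunk
  // adjustment `chunk = (span + chunk - 1) & ~(chunk - 1)` rounds span up to
  // a multiple of chunk and relies on chunk being a power of two. For
  // example, with a trip count of 100, 8 threads and chunk=4, span is
  // (100+7)/8 = 13 and the adjusted chunk becomes (13+3) & ~3 = 16.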

  ////////////////////////////////////////////////////////////////////////////////
  // Support for dispatch Init

  static int OrderedSchedule(kmp_sched_t schedule) {
    return schedule >= kmp_sched_ordered_first &&
           schedule <= kmp_sched_ordered_last;
  }

  static void dispatch_init(IdentTy *loc, int32_t threadId,
                            kmp_sched_t schedule, T lb, T ub, ST st, ST chunk,
                            DynamicScheduleTracker *DST) {
    int tid = mapping::getThreadIdInBlock();
    T tnum = omp_get_num_threads();
    T tripCount = ub - lb + 1; // +1 because ub is inclusive
    ASSERT0(LT_FUSSY, threadId < tnum,
            "current thread is not needed here; error");

    /* Currently just ignore the monotonic and non-monotonic modifiers
     * (the compiler isn't producing them yet anyway). When it is, we'll
     * want to look at them somewhere here and use that information to add
     * to our schedule choice. We shouldn't need to pass them on; they
     * merely affect which schedule we can legally choose for various
     * dynamic cases (in particular, whether or not a stealing scheme is
     * legal).
     */
    schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);

    // Process schedule.
    if (tnum == 1 || tripCount <= 1 || OrderedSchedule(schedule)) {
      if (OrderedSchedule(schedule))
        __kmpc_barrier(loc, threadId);
      schedule = kmp_sched_static_chunk;
      chunk = tripCount; // one thread gets the whole loop
    } else if (schedule == kmp_sched_runtime) {
      // process runtime
      omp_sched_t rtSched;
      int ChunkInt;
      omp_get_schedule(&rtSched, &ChunkInt);
      chunk = ChunkInt;
      switch (rtSched) {
      case omp_sched_static: {
        if (chunk > 0)
          schedule = kmp_sched_static_chunk;
        else
          schedule = kmp_sched_static_nochunk;
        break;
      }
      case omp_sched_auto: {
        schedule = kmp_sched_static_chunk;
        chunk = 1;
        break;
      }
      case omp_sched_dynamic:
      case omp_sched_guided: {
        schedule = kmp_sched_dynamic;
        break;
      }
      }
    } else if (schedule == kmp_sched_auto) {
      schedule = kmp_sched_static_chunk;
      chunk = 1;
    } else {
      // ASSERT(LT_FUSSY,
      //        schedule == kmp_sched_dynamic || schedule == kmp_sched_guided,
      //        "unknown schedule %d & chunk %lld\n", (int)schedule,
      //        (long long)chunk);
    }

    // init schedules
    if (schedule == kmp_sched_static_chunk) {
      ASSERT0(LT_FUSSY, chunk > 0, "bad chunk value");
      // save sched state
      DST->ScheduleType = schedule;
      // save ub
      DST->LoopUpperBound = ub;
      // compute static chunk
      ST stride;
      int lastiter = 0;
      ForStaticChunk(lastiter, lb, ub, stride, chunk, threadId, tnum);
      // save computed params
      DST->Chunk = chunk;
      DST->NextLowerBound = lb;
      DST->Stride = stride;
    } else if (schedule == kmp_sched_static_balanced_chunk) {
      ASSERT0(LT_FUSSY, chunk > 0, "bad chunk value");
      // save sched state
      DST->ScheduleType = schedule;
      // save ub
      DST->LoopUpperBound = ub;
      // compute static chunk
      ST stride;
      int lastiter = 0;
      // round up to make sure the chunk is enough to cover all iterations
      T span = (tripCount + tnum - 1) / tnum;
      // perform chunk adjustment
      chunk = (span + chunk - 1) & ~(chunk - 1);

      T oldUb = ub;
      ForStaticChunk(lastiter, lb, ub, stride, chunk, threadId, tnum);
      ASSERT0(LT_FUSSY, ub >= lb, "ub must be >= lb.");
      if (ub > oldUb)
        ub = oldUb;
      // save computed params
      DST->Chunk = chunk;
      DST->NextLowerBound = lb;
      DST->Stride = stride;
    } else if (schedule == kmp_sched_static_nochunk) {
      ASSERT0(LT_FUSSY, chunk == 0, "bad chunk value");
      // save sched state
      DST->ScheduleType = schedule;
      // save ub
      DST->LoopUpperBound = ub;
      // compute static chunk
      ST stride;
      int lastiter = 0;
      ForStaticNoChunk(lastiter, lb, ub, stride, chunk, threadId, tnum);
      // save computed params
      DST->Chunk = chunk;
      DST->NextLowerBound = lb;
      DST->Stride = stride;
    } else if (schedule == kmp_sched_dynamic || schedule == kmp_sched_guided) {
      // save data
      DST->ScheduleType = schedule;
      if (chunk < 1)
        chunk = 1;
      DST->Chunk = chunk;
      DST->LoopUpperBound = ub;
      DST->NextLowerBound = lb;
      __kmpc_barrier(loc, threadId);
      if (tid == 0) {
        Cnt = 0;
        fence::team(atomic::seq_cst);
      }
      __kmpc_barrier(loc, threadId);
    }
  }
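
  // Illustrative example (not from the original source): for
  // schedule(dynamic, 4) over iterations 0..99, every thread stores Chunk=4,
  // NextLowerBound=0 and LoopUpperBound=99 in its DST, and thread 0 resets
  // the shared counter Cnt between the two barriers; dispatch_next then hands
  // out chunks through Cnt.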

  ////////////////////////////////////////////////////////////////////////////////
  // Support for dispatch next

  // Warp-aggregated ticket counter: one atomic increment of Cnt per warp,
  // from which each active lane derives its own chunk index.
  static uint64_t NextIter() {
    __kmpc_impl_lanemask_t active = mapping::activemask();
    uint32_t leader = utils::ffs(active) - 1;
    uint32_t change = utils::popc(active);
    __kmpc_impl_lanemask_t lane_mask_lt = mapping::lanemaskLT();
    unsigned int rank = utils::popc(active & lane_mask_lt);
    uint64_t warp_res = 0;
    if (rank == 0) {
      // The lowest active lane reserves `change` tickets for the whole warp.
      warp_res = atomic::add(&Cnt, change, atomic::seq_cst);
    }
    warp_res = utils::shuffle(active, warp_res, leader);
    return warp_res + rank;
  }

  static int DynamicNextChunk(T &lb, T &ub, T chunkSize, T loopLowerBound,
                              T loopUpperBound) {
    T N = NextIter();
    lb = loopLowerBound + N * chunkSize;
    ub = lb + chunkSize - 1; // Clang uses i <= ub

    // 3 result cases:
    //  a. lb and ub < loopUpperBound --> NOT_FINISHED
    //  b. lb < loopUpperBound and ub >= loopUpperBound: last chunk -->
    //     LAST_CHUNK
    //  c. lb and ub >= loopUpperBound: empty chunk --> FINISHED
    // a.
    if (lb <= loopUpperBound && ub < loopUpperBound) {
      return NOT_FINISHED;
    }
    // b.
    if (lb <= loopUpperBound) {
      ub = loopUpperBound;
      return LAST_CHUNK;
    }
    // c. if we are here, we are in case 'c'
    lb = loopUpperBound + 2;
    ub = loopUpperBound + 1;
    return FINISHED;
  }
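
  // Illustrative example (not from the original source): with chunkSize=4
  // over iterations 0..9, ticket N=2 yields lb=8/ub=11, which is clipped to
  // ub=9 and reported as LAST_CHUNK; ticket N=3 yields lb=12, past the upper
  // bound, and is reported as FINISHED with an empty (lb > ub) range.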

  static int dispatch_next(IdentTy *loc, int32_t gtid, int32_t *plast,
                           T *plower, T *pupper, ST *pstride,
                           DynamicScheduleTracker *DST) {
    // ID of a thread in its own warp;
    // automatically selects thread or warp ID based on selected implementation
    ASSERT0(LT_FUSSY, gtid < omp_get_num_threads(),
            "current thread is not needed here; error");
    // retrieve schedule
    kmp_sched_t schedule = DST->ScheduleType;

    if (schedule == kmp_sched_static_chunk ||
        schedule == kmp_sched_static_nochunk) {
      T myLb = DST->NextLowerBound;
      T ub = DST->LoopUpperBound;
      // finished?
      if (myLb > ub) {
        return DISPATCH_FINISHED;
      }
      // not finished, save current bounds
      ST chunk = DST->Chunk;
      *plower = myLb;
      T myUb = myLb + chunk - 1; // Clang uses i <= ub
      if (myUb > ub)
        myUb = ub;
      *pupper = myUb;
      *plast = (int32_t)(myUb == ub);

      // increment next lower bound by the stride
      ST stride = DST->Stride;
      DST->NextLowerBound = myLb + stride;
      return DISPATCH_NOTFINISHED;
    }

    ASSERT0(LT_FUSSY,
            schedule == kmp_sched_dynamic || schedule == kmp_sched_guided,
            "bad sched");
    T myLb, myUb;
    int finished = DynamicNextChunk(myLb, myUb, DST->Chunk,
                                    DST->NextLowerBound, DST->LoopUpperBound);

    if (finished == FINISHED)
      return DISPATCH_FINISHED;

    // not finished (either not finished or last chunk)
    *plast = (int32_t)(finished == LAST_CHUNK);
    *plower = myLb;
    *pupper = myUb;
    *pstride = 1;

    return DISPATCH_NOTFINISHED;
  }

  static void dispatch_fini() {
    // nothing
  }

  ////////////////////////////////////////////////////////////////////////////////
  // end of template class that encapsulates all the helper functions
  ////////////////////////////////////////////////////////////////////////////////
};

////////////////////////////////////////////////////////////////////////////////
// KMP interface implementation (dyn loops)
////////////////////////////////////////////////////////////////////////////////

// TODO: This is a stopgap. We probably want to expand the dispatch API to take
//       a DST pointer which can then be allocated properly without malloc.
static DynamicScheduleTracker *THREAD_LOCAL(ThreadDSTPtr);

// Create a new DST, link the current one, and define the new as current.
static DynamicScheduleTracker *pushDST() {
  DynamicScheduleTracker *NewDST = static_cast<DynamicScheduleTracker *>(
      memory::allocGlobal(sizeof(DynamicScheduleTracker), "new DST"));
  *NewDST = DynamicScheduleTracker({0});
  NewDST->NextDST = ThreadDSTPtr;
  ThreadDSTPtr = NewDST;
  return ThreadDSTPtr;
}

// Return the current DST.
static DynamicScheduleTracker *peekDST() { return ThreadDSTPtr; }

// Pop the current DST and restore the last one.
static void popDST() {
  DynamicScheduleTracker *OldDST = ThreadDSTPtr->NextDST;
  memory::freeGlobal(ThreadDSTPtr, "remove DST");
  ThreadDSTPtr = OldDST;
}

extern "C" {

// init
void __kmpc_dispatch_init_4(IdentTy *loc, int32_t tid, int32_t schedule,
                            int32_t lb, int32_t ub, int32_t st, int32_t chunk) {
  DynamicScheduleTracker *DST = pushDST();
  omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_init(
      loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk, DST);
}

void __kmpc_dispatch_init_4u(IdentTy *loc, int32_t tid, int32_t schedule,
                             uint32_t lb, uint32_t ub, int32_t st,
                             int32_t chunk) {
  DynamicScheduleTracker *DST = pushDST();
  omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_init(
      loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk, DST);
}

void __kmpc_dispatch_init_8(IdentTy *loc, int32_t tid, int32_t schedule,
                            int64_t lb, int64_t ub, int64_t st, int64_t chunk) {
  DynamicScheduleTracker *DST = pushDST();
  omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_init(
      loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk, DST);
}

void __kmpc_dispatch_init_8u(IdentTy *loc, int32_t tid, int32_t schedule,
                             uint64_t lb, uint64_t ub, int64_t st,
                             int64_t chunk) {
  DynamicScheduleTracker *DST = pushDST();
  omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_init(
      loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk, DST);
}

int __kmpc_dispatch_next_4(IdentTy *loc, int32_t tid, int32_t *p_last,
                           int32_t *p_lb, int32_t *p_ub, int32_t *p_st) {
  DynamicScheduleTracker *DST = peekDST();
  return omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_next(
      loc, tid, p_last, p_lb, p_ub, p_st, DST);
}

int __kmpc_dispatch_next_4u(IdentTy *loc, int32_t tid, int32_t *p_last,
                            uint32_t *p_lb, uint32_t *p_ub, int32_t *p_st) {
  DynamicScheduleTracker *DST = peekDST();
  return omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_next(
      loc, tid, p_last, p_lb, p_ub, p_st, DST);
}

int __kmpc_dispatch_next_8(IdentTy *loc, int32_t tid, int32_t *p_last,
                           int64_t *p_lb, int64_t *p_ub, int64_t *p_st) {
  DynamicScheduleTracker *DST = peekDST();
  return omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_next(
      loc, tid, p_last, p_lb, p_ub, p_st, DST);
}

int __kmpc_dispatch_next_8u(IdentTy *loc, int32_t tid, int32_t *p_last,
                            uint64_t *p_lb, uint64_t *p_ub, int64_t *p_st) {
  DynamicScheduleTracker *DST = peekDST();
  return omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_next(
      loc, tid, p_last, p_lb, p_ub, p_st, DST);
}

void __kmpc_dispatch_fini_4(IdentTy *loc, int32_t tid) {
  omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_fini();
  popDST();
}

void __kmpc_dispatch_fini_4u(IdentTy *loc, int32_t tid) {
  omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_fini();
  popDST();
}

void __kmpc_dispatch_fini_8(IdentTy *loc, int32_t tid) {
  omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_fini();
  popDST();
}

void __kmpc_dispatch_fini_8u(IdentTy *loc, int32_t tid) {
  omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_fini();
  popDST();
}
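
// Illustrative sketch (not from the original source) of how compiler-generated
// device code is expected to drive this interface for a loop such as
// `#pragma omp for schedule(dynamic, 4)` over i = 0..N-1:
//
//   __kmpc_dispatch_init_4(loc, tid, /*schedule=*/kmp_sched_dynamic,
//                          /*lb=*/0, /*ub=*/N - 1, /*st=*/1, /*chunk=*/4);
//   int32_t last, lb, ub, st;
//   while (__kmpc_dispatch_next_4(loc, tid, &last, &lb, &ub, &st) ==
//          DISPATCH_NOTFINISHED)
//     for (int32_t i = lb; i <= ub; ++i)
//       ; // loop body
//   __kmpc_dispatch_fini_4(loc, tid);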

////////////////////////////////////////////////////////////////////////////////
// KMP interface implementation (static loops)
////////////////////////////////////////////////////////////////////////////////

void __kmpc_for_static_init_4(IdentTy *loc, int32_t global_tid,
                              int32_t schedtype, int32_t *plastiter,
                              int32_t *plower, int32_t *pupper,
                              int32_t *pstride, int32_t incr, int32_t chunk) {
  omptarget_nvptx_LoopSupport<int32_t, int32_t>::for_static_init(
      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
      mapping::isSPMDMode());
}

void __kmpc_for_static_init_4u(IdentTy *loc, int32_t global_tid,
                               int32_t schedtype, int32_t *plastiter,
                               uint32_t *plower, uint32_t *pupper,
                               int32_t *pstride, int32_t incr, int32_t chunk) {
  omptarget_nvptx_LoopSupport<uint32_t, int32_t>::for_static_init(
      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
      mapping::isSPMDMode());
}

void __kmpc_for_static_init_8(IdentTy *loc, int32_t global_tid,
                              int32_t schedtype, int32_t *plastiter,
                              int64_t *plower, int64_t *pupper,
                              int64_t *pstride, int64_t incr, int64_t chunk) {
  omptarget_nvptx_LoopSupport<int64_t, int64_t>::for_static_init(
      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
      mapping::isSPMDMode());
}

void __kmpc_for_static_init_8u(IdentTy *loc, int32_t global_tid,
                               int32_t schedtype, int32_t *plastiter,
                               uint64_t *plower, uint64_t *pupper,
                               int64_t *pstride, int64_t incr, int64_t chunk) {
  omptarget_nvptx_LoopSupport<uint64_t, int64_t>::for_static_init(
      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
      mapping::isSPMDMode());
}
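
// Illustrative sketch (not from the original source) of the static
// counterpart, e.g. `#pragma omp for schedule(static, 2)` over i = 0..N-1:
//
//   int32_t last = 0, lb = 0, ub = N - 1, stride = 1;
//   __kmpc_for_static_init_4(loc, tid, kmp_sched_static_chunk, &last, &lb,
//                            &ub, &stride, /*incr=*/1, /*chunk=*/2);
//   // lb/ub now describe this thread's first chunk; subsequent chunks start
//   // at lb + stride, lb + 2*stride, ... up to the original upper bound.
//   __kmpc_for_static_fini(loc, tid);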

void __kmpc_distribute_static_init_4(IdentTy *loc, int32_t global_tid,
                                     int32_t schedtype, int32_t *plastiter,
                                     int32_t *plower, int32_t *pupper,
                                     int32_t *pstride, int32_t incr,
                                     int32_t chunk) {
  omptarget_nvptx_LoopSupport<int32_t, int32_t>::for_static_init(
      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
      mapping::isSPMDMode());
}

void __kmpc_distribute_static_init_4u(IdentTy *loc, int32_t global_tid,
                                      int32_t schedtype, int32_t *plastiter,
                                      uint32_t *plower, uint32_t *pupper,
                                      int32_t *pstride, int32_t incr,
                                      int32_t chunk) {
  omptarget_nvptx_LoopSupport<uint32_t, int32_t>::for_static_init(
      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
      mapping::isSPMDMode());
}

void __kmpc_distribute_static_init_8(IdentTy *loc, int32_t global_tid,
                                     int32_t schedtype, int32_t *plastiter,
                                     int64_t *plower, int64_t *pupper,
                                     int64_t *pstride, int64_t incr,
                                     int64_t chunk) {
  omptarget_nvptx_LoopSupport<int64_t, int64_t>::for_static_init(
      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
      mapping::isSPMDMode());
}

void __kmpc_distribute_static_init_8u(IdentTy *loc, int32_t global_tid,
                                      int32_t schedtype, int32_t *plastiter,
                                      uint64_t *plower, uint64_t *pupper,
                                      int64_t *pstride, int64_t incr,
                                      int64_t chunk) {
  omptarget_nvptx_LoopSupport<uint64_t, int64_t>::for_static_init(
      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
      mapping::isSPMDMode());
}

void __kmpc_for_static_fini(IdentTy *loc, int32_t global_tid) {}

void __kmpc_distribute_static_fini(IdentTy *loc, int32_t global_tid) {}
}

#pragma omp end declare target