//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef _LIBCPP___PSTL_BACKENDS_LIBDISPATCH_H
#define _LIBCPP___PSTL_BACKENDS_LIBDISPATCH_H

#include <__algorithm/inplace_merge.h>
#include <__algorithm/lower_bound.h>
#include <__algorithm/max.h>
#include <__algorithm/merge.h>
#include <__algorithm/upper_bound.h>
#include <__atomic/atomic.h>
#include <__config>
#include <__cstddef/ptrdiff_t.h>
#include <__exception/terminate.h>
#include <__iterator/iterator_traits.h>
#include <__iterator/move_iterator.h>
#include <__memory/allocator.h>
#include <__memory/construct_at.h>
#include <__memory/unique_ptr.h>
#include <__numeric/reduce.h>
#include <__pstl/backend_fwd.h>
#include <__pstl/cpu_algos/any_of.h>
#include <__pstl/cpu_algos/cpu_traits.h>
#include <__pstl/cpu_algos/fill.h>
#include <__pstl/cpu_algos/find_if.h>
#include <__pstl/cpu_algos/for_each.h>
#include <__pstl/cpu_algos/merge.h>
#include <__pstl/cpu_algos/stable_sort.h>
#include <__pstl/cpu_algos/transform.h>
#include <__pstl/cpu_algos/transform_reduce.h>
#include <__utility/empty.h>
#include <__utility/exception_guard.h>
#include <__utility/move.h>
#include <__utility/pair.h>
#include <optional>

#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
#  pragma GCC system_header
#endif

_LIBCPP_PUSH_MACROS
#include <__undef_macros>

#if _LIBCPP_STD_VER >= 17

_LIBCPP_BEGIN_NAMESPACE_STD
namespace __pstl {

namespace __libdispatch {
// ::dispatch_apply is marked as __attribute__((nothrow)) because it doesn't let exceptions propagate, and neither do
// we.
// TODO: Do we want to add [[_Clang::__callback__(__func, __context, __)]]?
_LIBCPP_EXPORTED_FROM_ABI void
__dispatch_apply(size_t __chunk_count, void* __context, void (*__func)(void* __context, size_t __chunk)) noexcept;

template <class _Func>
_LIBCPP_HIDE_FROM_ABI void __dispatch_apply(size_t __chunk_count, _Func __func) noexcept {
  __libdispatch::__dispatch_apply(__chunk_count, &__func, [](void* __context, size_t __chunk) {
    (*static_cast<_Func*>(__context))(__chunk);
  });
}
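
// Illustrative only: the template overload above lets any callable be used as the per-chunk
// callback without manual type erasure, e.g.
//
//   __libdispatch::__dispatch_apply(4, [](size_t __chunk) { /* work on chunk __chunk */ });
//
// The callable lives on the caller's stack and is passed by address as the __context of the
// C-style callback declared above, which is safe because ::dispatch_apply runs synchronously.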

struct __chunk_partitions {
  ptrdiff_t __chunk_count_; // includes the first chunk
  ptrdiff_t __chunk_size_;
  ptrdiff_t __first_chunk_size_;
};

[[__gnu__::__const__]] _LIBCPP_EXPORTED_FROM_ABI __chunk_partitions __partition_chunks(ptrdiff_t __size) noexcept;
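
// NOTE: A __chunk_partitions describes how a range of __size elements is split: one leading chunk
// of __first_chunk_size_ elements followed by (__chunk_count_ - 1) chunks of __chunk_size_
// elements, so __first_chunk_size_ + (__chunk_count_ - 1) * __chunk_size_ == __size. As a purely
// illustrative example (the actual values are an implementation detail of the backend), a size of
// 10 might be partitioned as {__chunk_count_ = 3, __chunk_size_ = 3, __first_chunk_size_ = 4}.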

template <class _RandomAccessIterator, class _Functor>
_LIBCPP_HIDE_FROM_ABI optional<__empty>
__dispatch_parallel_for(__chunk_partitions __partitions, _RandomAccessIterator __first, _Functor __func) {
  // Perform the chunked execution.
  __libdispatch::__dispatch_apply(__partitions.__chunk_count_, [&](size_t __chunk) {
    auto __this_chunk_size = __chunk == 0 ? __partitions.__first_chunk_size_ : __partitions.__chunk_size_;
    auto __index           = __chunk == 0 ? 0
                                          : (__chunk * __partitions.__chunk_size_) +
                                                (__partitions.__first_chunk_size_ - __partitions.__chunk_size_);
    __func(__first + __index, __first + __index + __this_chunk_size);
  });

  return __empty{};
}

} // namespace __libdispatch

template <>
struct __cpu_traits<__libdispatch_backend_tag> {
  template <class _RandomAccessIterator, class _Functor>
  _LIBCPP_HIDE_FROM_ABI static optional<__empty>
  __for_each(_RandomAccessIterator __first, _RandomAccessIterator __last, _Functor __func) {
    return __libdispatch::__dispatch_parallel_for(
        __libdispatch::__partition_chunks(__last - __first), std::move(__first), std::move(__func));
  }

  template <class _RandomAccessIterator1, class _RandomAccessIterator2, class _RandomAccessIteratorOut>
  struct __merge_range {
    __merge_range(_RandomAccessIterator1 __mid1, _RandomAccessIterator2 __mid2, _RandomAccessIteratorOut __result)
        : __mid1_(__mid1), __mid2_(__mid2), __result_(__result) {}

    _RandomAccessIterator1 __mid1_;
    _RandomAccessIterator2 __mid2_;
    _RandomAccessIteratorOut __result_;
  };

  template <typename _RandomAccessIterator1,
            typename _RandomAccessIterator2,
            typename _RandomAccessIterator3,
            typename _Compare,
            typename _LeafMerge>
  _LIBCPP_HIDE_FROM_ABI static optional<__empty>
  __merge(_RandomAccessIterator1 __first1,
          _RandomAccessIterator1 __last1,
          _RandomAccessIterator2 __first2,
          _RandomAccessIterator2 __last2,
          _RandomAccessIterator3 __result,
          _Compare __comp,
          _LeafMerge __leaf_merge) noexcept {
    __libdispatch::__chunk_partitions __partitions =
        __libdispatch::__partition_chunks(std::max<ptrdiff_t>(__last1 - __first1, __last2 - __first2));

    if (__partitions.__chunk_count_ == 0)
      return __empty{};

    if (__partitions.__chunk_count_ == 1) {
      __leaf_merge(__first1, __last1, __first2, __last2, __result, __comp);
      return __empty{};
    }

    using __merge_range_t = __merge_range<_RandomAccessIterator1, _RandomAccessIterator2, _RandomAccessIterator3>;
    auto const __n_ranges = __partitions.__chunk_count_ + 1;

    // TODO: use __uninitialized_buffer
    auto __destroy = [=](__merge_range_t* __ptr) {
      std::destroy_n(__ptr, __n_ranges);
      std::allocator<__merge_range_t>().deallocate(__ptr, __n_ranges);
    };

    unique_ptr<__merge_range_t[], decltype(__destroy)> __ranges(
        [&]() -> __merge_range_t* {
#  if _LIBCPP_HAS_EXCEPTIONS
          try {
#  endif
            return std::allocator<__merge_range_t>().allocate(__n_ranges);
#  if _LIBCPP_HAS_EXCEPTIONS
          } catch (const std::bad_alloc&) {
            return nullptr;
          }
#  endif
        }(),
        __destroy);

    if (__ranges == nullptr)
      return nullopt;

    // TODO: Improve the case where the smaller range is merged into just a few (or even one) chunks of the larger case
    __merge_range_t* __r = __ranges.get();
    std::__construct_at(__r++, __first1, __first2, __result);

    bool __iterate_first_range = __last1 - __first1 > __last2 - __first2;

    auto __compute_chunk = [&](size_t __chunk_size) -> __merge_range_t {
      auto [__mid1, __mid2] = [&] {
        if (__iterate_first_range) {
          auto __m1 = __first1 + __chunk_size;
          auto __m2 = std::lower_bound(__first2, __last2, __m1[-1], __comp);
          return std::make_pair(__m1, __m2);
        } else {
          auto __m2 = __first2 + __chunk_size;
          auto __m1 = std::lower_bound(__first1, __last1, __m2[-1], __comp);
          return std::make_pair(__m1, __m2);
        }
      }();

      __result += (__mid1 - __first1) + (__mid2 - __first2);
      __first1 = __mid1;
      __first2 = __mid2;
      return {std::move(__mid1), std::move(__mid2), __result};
    };
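
    // NOTE: __compute_chunk steps a fixed number of elements through the longer input range and
    // uses std::lower_bound to find the matching split point in the shorter one, so that every
    // element which must precede the cut lands in the same output chunk. This is what makes the
    // per-chunk __leaf_merge calls below independent and safe to run in parallel.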

    // handle first chunk
    std::__construct_at(__r++, __compute_chunk(__partitions.__first_chunk_size_));

    // handle 2 -> N - 1 chunks
    for (ptrdiff_t __i = 0; __i != __partitions.__chunk_count_ - 2; ++__i)
      std::__construct_at(__r++, __compute_chunk(__partitions.__chunk_size_));

    // handle last chunk
    std::__construct_at(__r, __last1, __last2, __result);

    __libdispatch::__dispatch_apply(__partitions.__chunk_count_, [&](size_t __index) {
      auto __first_iters = __ranges[__index];
      auto __last_iters  = __ranges[__index + 1];
      __leaf_merge(
          __first_iters.__mid1_,
          __last_iters.__mid1_,
          __first_iters.__mid2_,
          __last_iters.__mid2_,
          __first_iters.__result_,
          __comp);
    });

    return __empty{};
  }

  template <class _RandomAccessIterator, class _Transform, class _Value, class _Combiner, class _Reduction>
  _LIBCPP_HIDE_FROM_ABI static optional<_Value> __transform_reduce(
      _RandomAccessIterator __first,
      _RandomAccessIterator __last,
      _Transform __transform,
      _Value __init,
      _Combiner __combiner,
      _Reduction __reduction) {
    if (__first == __last)
      return __init;

    auto __partitions = __libdispatch::__partition_chunks(__last - __first);

    auto __destroy = [__count = __partitions.__chunk_count_](_Value* __ptr) {
      std::destroy_n(__ptr, __count);
      std::allocator<_Value>().deallocate(__ptr, __count);
    };

    // TODO: use __uninitialized_buffer
    // TODO: allocate one element per worker instead of one element per chunk
    unique_ptr<_Value[], decltype(__destroy)> __values(
        std::allocator<_Value>().allocate(__partitions.__chunk_count_), __destroy);

    // __dispatch_apply is noexcept
    __libdispatch::__dispatch_apply(__partitions.__chunk_count_, [&](size_t __chunk) {
      auto __this_chunk_size = __chunk == 0 ? __partitions.__first_chunk_size_ : __partitions.__chunk_size_;
      auto __index           = __chunk == 0 ? 0
                                            : (__chunk * __partitions.__chunk_size_) +
                                                  (__partitions.__first_chunk_size_ - __partitions.__chunk_size_);
      if (__this_chunk_size != 1) {
        std::__construct_at(
            __values.get() + __chunk,
            __reduction(__first + __index + 2,
                        __first + __index + __this_chunk_size,
                        __combiner(__transform(__first + __index), __transform(__first + __index + 1))));
      } else {
        std::__construct_at(__values.get() + __chunk, __transform(__first + __index));
      }
    });
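
    // NOTE: Each chunk's partial result is seeded by combining the first two transformed elements,
    // after which __reduction folds the remaining [__index + 2, __index + __this_chunk_size)
    // elements into that seed; a single-element chunk has nothing to combine, hence the else
    // branch above.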

    return std::reduce(
        std::make_move_iterator(__values.get()),
        std::make_move_iterator(__values.get() + __partitions.__chunk_count_),
        std::move(__init),
        __combiner);
  }

  template <class _RandomAccessIterator, class _Comp, class _LeafSort>
  _LIBCPP_HIDE_FROM_ABI static optional<__empty>
  __stable_sort(_RandomAccessIterator __first, _RandomAccessIterator __last, _Comp __comp, _LeafSort __leaf_sort) {
    const auto __size = __last - __first;
    auto __partitions = __libdispatch::__partition_chunks(__size);

    if (__partitions.__chunk_count_ == 0)
      return __empty{};

    if (__partitions.__chunk_count_ == 1) {
      __leaf_sort(__first, __last, __comp);
      return __empty{};
    }

    using _Value = __iter_value_type<_RandomAccessIterator>;

    auto __destroy = [__size](_Value* __ptr) {
      std::destroy_n(__ptr, __size);
      std::allocator<_Value>().deallocate(__ptr, __size);
    };

    // TODO: use __uninitialized_buffer
    unique_ptr<_Value[], decltype(__destroy)> __values(std::allocator<_Value>().allocate(__size), __destroy);

    // Initialize all elements to a moved-from state
    // TODO: Don't do this - this can be done in the first merge - see https://llvm.org/PR63928
    std::__construct_at(__values.get(), std::move(*__first));
    for (__iter_diff_t<_RandomAccessIterator> __i = 1; __i != __size; ++__i) {
      std::__construct_at(__values.get() + __i, std::move(__values.get()[__i - 1]));
    }
    *__first = std::move(__values.get()[__size - 1]);
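
    // NOTE: The loop above daisy-chains a single value through the buffer: __values[0] is
    // move-constructed from *__first, each subsequent element from its predecessor, and the
    // original value finally lands back in *__first. This leaves every buffer element in a valid
    // moved-from state without requiring _Value to be default-constructible.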

    __libdispatch::__dispatch_parallel_for(
        __partitions,
        __first,
        [&__leaf_sort, &__comp](_RandomAccessIterator __chunk_first, _RandomAccessIterator __chunk_last) {
          __leaf_sort(std::move(__chunk_first), std::move(__chunk_last), __comp);
        });

    bool __objects_are_in_buffer = false;
    do {
      const auto __old_chunk_size = __partitions.__chunk_size_;
      if (__partitions.__chunk_count_ % 2 == 1) {
        auto __inplace_merge_chunks = [&__comp, &__partitions](auto __first_chunk_begin) {
          std::inplace_merge(
              __first_chunk_begin,
              __first_chunk_begin + __partitions.__first_chunk_size_,
              __first_chunk_begin + __partitions.__first_chunk_size_ + __partitions.__chunk_size_,
              __comp);
        };
        if (__objects_are_in_buffer)
          __inplace_merge_chunks(__values.get());
        else
          __inplace_merge_chunks(__first);
        __partitions.__first_chunk_size_ += 2 * __partitions.__chunk_size_;
      } else {
        __partitions.__first_chunk_size_ += __partitions.__chunk_size_;
      }
      __partitions.__chunk_size_ *= 2;
      __partitions.__chunk_count_ /= 2;

      auto __merge_chunks = [__partitions, __old_chunk_size, &__comp](auto __from_first, auto __to_first) {
        __libdispatch::__dispatch_parallel_for(
            __partitions,
            __from_first,
            [__old_chunk_size, &__from_first, &__to_first, &__comp](auto __chunk_first, auto __chunk_last) {
              std::merge(std::make_move_iterator(__chunk_first),
                         std::make_move_iterator(__chunk_last - __old_chunk_size),
                         std::make_move_iterator(__chunk_last - __old_chunk_size),
                         std::make_move_iterator(__chunk_last),
                         __to_first + (__chunk_first - __from_first),
                         __comp);
            });
      };

      if (__objects_are_in_buffer)
        __merge_chunks(__values.get(), __first);
      else
        __merge_chunks(__first, __values.get());
      __objects_are_in_buffer = !__objects_are_in_buffer;
    } while (__partitions.__chunk_count_ > 1);
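
    // NOTE: The loop above is a bottom-up merge sort over the pre-sorted chunks: each pass merges
    // pairs of adjacent chunks in parallel, doubling __chunk_size_ and halving __chunk_count_,
    // ping-ponging the elements between the original range and the temporary buffer. When the
    // chunk count is odd, the extra chunk is first folded into the leading chunk with
    // std::inplace_merge so that the pairing works out.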

    if (__objects_are_in_buffer) {
      std::move(__values.get(), __values.get() + __size, __first);
    }

    return __empty{};
  }

  _LIBCPP_HIDE_FROM_ABI static void __cancel_execution() {}

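  // NOTE: 64 presumably matches the common cache line size, so that data laid out one lane per
  // thread does not suffer from false sharing. This rationale is an assumption; it is not
  // documented in this header.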
  static constexpr size_t __lane_size = 64;
};

// Mandatory implementations of the computational basis
template <class _ExecutionPolicy>
struct __find_if<__libdispatch_backend_tag, _ExecutionPolicy>
    : __cpu_parallel_find_if<__libdispatch_backend_tag, _ExecutionPolicy> {};

template <class _ExecutionPolicy>
struct __for_each<__libdispatch_backend_tag, _ExecutionPolicy>
    : __cpu_parallel_for_each<__libdispatch_backend_tag, _ExecutionPolicy> {};

template <class _ExecutionPolicy>
struct __merge<__libdispatch_backend_tag, _ExecutionPolicy>
    : __cpu_parallel_merge<__libdispatch_backend_tag, _ExecutionPolicy> {};

template <class _ExecutionPolicy>
struct __stable_sort<__libdispatch_backend_tag, _ExecutionPolicy>
    : __cpu_parallel_stable_sort<__libdispatch_backend_tag, _ExecutionPolicy> {};

template <class _ExecutionPolicy>
struct __transform<__libdispatch_backend_tag, _ExecutionPolicy>
    : __cpu_parallel_transform<__libdispatch_backend_tag, _ExecutionPolicy> {};

template <class _ExecutionPolicy>
struct __transform_binary<__libdispatch_backend_tag, _ExecutionPolicy>
    : __cpu_parallel_transform_binary<__libdispatch_backend_tag, _ExecutionPolicy> {};

template <class _ExecutionPolicy>
struct __transform_reduce<__libdispatch_backend_tag, _ExecutionPolicy>
    : __cpu_parallel_transform_reduce<__libdispatch_backend_tag, _ExecutionPolicy> {};

template <class _ExecutionPolicy>
struct __transform_reduce_binary<__libdispatch_backend_tag, _ExecutionPolicy>
    : __cpu_parallel_transform_reduce_binary<__libdispatch_backend_tag, _ExecutionPolicy> {};

// Not mandatory, but better optimized
template <class _ExecutionPolicy>
struct __any_of<__libdispatch_backend_tag, _ExecutionPolicy>
    : __cpu_parallel_any_of<__libdispatch_backend_tag, _ExecutionPolicy> {};

template <class _ExecutionPolicy>
struct __fill<__libdispatch_backend_tag, _ExecutionPolicy>
    : __cpu_parallel_fill<__libdispatch_backend_tag, _ExecutionPolicy> {};

} // namespace __pstl
_LIBCPP_END_NAMESPACE_STD

#endif // _LIBCPP_STD_VER >= 17

_LIBCPP_POP_MACROS

#endif // _LIBCPP___PSTL_BACKENDS_LIBDISPATCH_H