1 //===----------------------------------------------------------------------===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
9 #ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKENDS_LIBDISPATCH_H
10 #define _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKENDS_LIBDISPATCH_H
12 #include <__algorithm/inplace_merge.h>
13 #include <__algorithm/lower_bound.h>
14 #include <__algorithm/max.h>
15 #include <__algorithm/merge.h>
16 #include <__algorithm/upper_bound.h>
17 #include <__atomic/atomic.h>
19 #include <__exception/terminate.h>
20 #include <__iterator/iterator_traits.h>
21 #include <__iterator/move_iterator.h>
22 #include <__memory/allocator.h>
23 #include <__memory/construct_at.h>
24 #include <__memory/unique_ptr.h>
25 #include <__numeric/reduce.h>
26 #include <__utility/empty.h>
27 #include <__utility/exception_guard.h>
28 #include <__utility/move.h>
29 #include <__utility/pair.h>
35 #include <__undef_macros>
37 #if !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
39 _LIBCPP_BEGIN_NAMESPACE_STD
41 namespace __par_backend
{
42 inline namespace __libdispatch
{
// Runs __func(__context, __chunk) for every __chunk in [0, __chunk_count), dispatching the chunks
// onto libdispatch worker threads and returning only once all chunks have completed.
//
// ::dispatch_apply is marked as __attribute__((nothrow)) because it doesn't let exceptions propagate, and neither do
// we.
// TODO: Do we want to add [[_Clang::__callback__(__func, __context, __chunk)]]?
_LIBCPP_EXPORTED_FROM_ABI void
__dispatch_apply(size_t __chunk_count, void* __context, void (*__func)(void* __context, size_t __chunk)) noexcept;
50 template <class _Func
>
51 _LIBCPP_HIDE_FROM_ABI
void __dispatch_apply(size_t __chunk_count
, _Func __func
) noexcept
{
52 __libdispatch::__dispatch_apply(__chunk_count
, &__func
, [](void* __context
, size_t __chunk
) {
53 (*static_cast<_Func
*>(__context
))(__chunk
);
// Describes how a range is split into chunks for parallel dispatch: every chunk has size
// __chunk_size_ except the first, which absorbs the remainder (so __first_chunk_size_ >= __chunk_size_).
struct __chunk_partitions {
  ptrdiff_t __chunk_count_; // includes the first chunk
  ptrdiff_t __chunk_size_;
  ptrdiff_t __first_chunk_size_;
};
// Computes the chunk partitioning for a range of __size elements. Marked [[gnu::const]] since the
// result depends only on the argument, allowing repeated calls to be folded.
[[__gnu__::__const__]] _LIBCPP_EXPORTED_FROM_ABI __chunk_partitions __partition_chunks(ptrdiff_t __size) noexcept;
65 template <class _RandomAccessIterator
, class _Functor
>
66 _LIBCPP_HIDE_FROM_ABI optional
<__empty
>
67 __dispatch_parallel_for(__chunk_partitions __partitions
, _RandomAccessIterator __first
, _Functor __func
) {
68 // Perform the chunked execution.
69 __libdispatch::__dispatch_apply(__partitions
.__chunk_count_
, [&](size_t __chunk
) {
70 auto __this_chunk_size
= __chunk
== 0 ? __partitions
.__first_chunk_size_
: __partitions
.__chunk_size_
;
74 : (__chunk
* __partitions
.__chunk_size_
) + (__partitions
.__first_chunk_size_
- __partitions
.__chunk_size_
);
75 __func(__first
+ __index
, __first
+ __index
+ __this_chunk_size
);
81 template <class _RandomAccessIterator
, class _Functor
>
82 _LIBCPP_HIDE_FROM_ABI optional
<__empty
>
83 __parallel_for(_RandomAccessIterator __first
, _RandomAccessIterator __last
, _Functor __func
) {
84 return __libdispatch::__dispatch_parallel_for(
85 __libdispatch::__partition_chunks(__last
- __first
), std::move(__first
), std::move(__func
));
// A split point used by __parallel_merge: iterators into both input ranges plus the output position
// where merging the elements up to (__mid1_, __mid2_) should write.
template <class _RandomAccessIterator1, class _RandomAccessIterator2, class _RandomAccessIteratorOut>
struct __merge_range {
  __merge_range(_RandomAccessIterator1 __mid1, _RandomAccessIterator2 __mid2, _RandomAccessIteratorOut __result)
      : __mid1_(__mid1), __mid2_(__mid2), __result_(__result) {}

  _RandomAccessIterator1 __mid1_;
  _RandomAccessIterator2 __mid2_;
  _RandomAccessIteratorOut __result_;
};
98 template <typename _RandomAccessIterator1
,
99 typename _RandomAccessIterator2
,
100 typename _RandomAccessIterator3
,
103 _LIBCPP_HIDE_FROM_ABI optional
<__empty
> __parallel_merge(
104 _RandomAccessIterator1 __first1
,
105 _RandomAccessIterator1 __last1
,
106 _RandomAccessIterator2 __first2
,
107 _RandomAccessIterator2 __last2
,
108 _RandomAccessIterator3 __result
,
110 _LeafMerge __leaf_merge
) noexcept
{
111 __chunk_partitions __partitions
=
112 __libdispatch::__partition_chunks(std::max
<ptrdiff_t>(__last1
- __first1
, __last2
- __first2
));
114 if (__partitions
.__chunk_count_
== 0)
117 if (__partitions
.__chunk_count_
== 1) {
118 __leaf_merge(__first1
, __last1
, __first2
, __last2
, __result
, __comp
);
122 using __merge_range_t
= __merge_range
<_RandomAccessIterator1
, _RandomAccessIterator2
, _RandomAccessIterator3
>;
123 auto const __n_ranges
= __partitions
.__chunk_count_
+ 1;
125 // TODO: use __uninitialized_buffer
126 auto __destroy
= [=](__merge_range_t
* __ptr
) {
127 std::destroy_n(__ptr
, __n_ranges
);
128 std::allocator
<__merge_range_t
>().deallocate(__ptr
, __n_ranges
);
131 unique_ptr
<__merge_range_t
[], decltype(__destroy
)> __ranges(
132 [&]() -> __merge_range_t
* {
133 # ifndef _LIBCPP_HAS_NO_EXCEPTIONS
136 return std::allocator
<__merge_range_t
>().allocate(__n_ranges
);
137 # ifndef _LIBCPP_HAS_NO_EXCEPTIONS
138 } catch (const std::bad_alloc
&) {
148 // TODO: Improve the case where the smaller range is merged into just a few (or even one) chunks of the larger case
149 __merge_range_t
* __r
= __ranges
.get();
150 std::__construct_at(__r
++, __first1
, __first2
, __result
);
152 bool __iterate_first_range
= __last1
- __first1
> __last2
- __first2
;
154 auto __compute_chunk
= [&](size_t __chunk_size
) -> __merge_range_t
{
155 auto [__mid1
, __mid2
] = [&] {
156 if (__iterate_first_range
) {
157 auto __m1
= __first1
+ __chunk_size
;
158 auto __m2
= std::lower_bound(__first2
, __last2
, __m1
[-1], __comp
);
159 return std::make_pair(__m1
, __m2
);
161 auto __m2
= __first2
+ __chunk_size
;
162 auto __m1
= std::lower_bound(__first1
, __last1
, __m2
[-1], __comp
);
163 return std::make_pair(__m1
, __m2
);
167 __result
+= (__mid1
- __first1
) + (__mid2
- __first2
);
170 return {std::move(__mid1
), std::move(__mid2
), __result
};
173 // handle first chunk
174 std::__construct_at(__r
++, __compute_chunk(__partitions
.__first_chunk_size_
));
176 // handle 2 -> N - 1 chunks
177 for (ptrdiff_t __i
= 0; __i
!= __partitions
.__chunk_count_
- 2; ++__i
)
178 std::__construct_at(__r
++, __compute_chunk(__partitions
.__chunk_size_
));
181 std::__construct_at(__r
, __last1
, __last2
, __result
);
183 __libdispatch::__dispatch_apply(__partitions
.__chunk_count_
, [&](size_t __index
) {
184 auto __first_iters
= __ranges
[__index
];
185 auto __last_iters
= __ranges
[__index
+ 1];
187 __first_iters
.__mid1_
,
188 __last_iters
.__mid1_
,
189 __first_iters
.__mid2_
,
190 __last_iters
.__mid2_
,
191 __first_iters
.__result_
,
198 template <class _RandomAccessIterator
, class _Transform
, class _Value
, class _Combiner
, class _Reduction
>
199 _LIBCPP_HIDE_FROM_ABI optional
<_Value
> __parallel_transform_reduce(
200 _RandomAccessIterator __first
,
201 _RandomAccessIterator __last
,
202 _Transform __transform
,
204 _Combiner __combiner
,
205 _Reduction __reduction
) {
206 if (__first
== __last
)
209 auto __partitions
= __libdispatch::__partition_chunks(__last
- __first
);
211 auto __destroy
= [__count
= __partitions
.__chunk_count_
](_Value
* __ptr
) {
212 std::destroy_n(__ptr
, __count
);
213 std::allocator
<_Value
>().deallocate(__ptr
, __count
);
216 // TODO: use __uninitialized_buffer
217 // TODO: allocate one element per worker instead of one element per chunk
218 unique_ptr
<_Value
[], decltype(__destroy
)> __values(
219 std::allocator
<_Value
>().allocate(__partitions
.__chunk_count_
), __destroy
);
221 // __dispatch_apply is noexcept
222 __libdispatch::__dispatch_apply(__partitions
.__chunk_count_
, [&](size_t __chunk
) {
223 auto __this_chunk_size
= __chunk
== 0 ? __partitions
.__first_chunk_size_
: __partitions
.__chunk_size_
;
227 : (__chunk
* __partitions
.__chunk_size_
) + (__partitions
.__first_chunk_size_
- __partitions
.__chunk_size_
);
228 if (__this_chunk_size
!= 1) {
230 __values
.get() + __chunk
,
231 __reduction(__first
+ __index
+ 2,
232 __first
+ __index
+ __this_chunk_size
,
233 __combiner(__transform(__first
+ __index
), __transform(__first
+ __index
+ 1))));
235 std::__construct_at(__values
.get() + __chunk
, __transform(__first
+ __index
));
240 std::make_move_iterator(__values
.get()),
241 std::make_move_iterator(__values
.get() + __partitions
.__chunk_count_
),
246 template <class _RandomAccessIterator
, class _Comp
, class _LeafSort
>
247 _LIBCPP_HIDE_FROM_ABI optional
<__empty
> __parallel_stable_sort(
248 _RandomAccessIterator __first
, _RandomAccessIterator __last
, _Comp __comp
, _LeafSort __leaf_sort
) {
249 const auto __size
= __last
- __first
;
250 auto __partitions
= __libdispatch::__partition_chunks(__size
);
252 if (__partitions
.__chunk_count_
== 0)
255 if (__partitions
.__chunk_count_
== 1) {
256 __leaf_sort(__first
, __last
, __comp
);
260 using _Value
= __iter_value_type
<_RandomAccessIterator
>;
262 auto __destroy
= [__size
](_Value
* __ptr
) {
263 std::destroy_n(__ptr
, __size
);
264 std::allocator
<_Value
>().deallocate(__ptr
, __size
);
267 // TODO: use __uninitialized_buffer
268 unique_ptr
<_Value
[], decltype(__destroy
)> __values(std::allocator
<_Value
>().allocate(__size
), __destroy
);
270 // Initialize all elements to a moved-from state
271 // TODO: Don't do this - this can be done in the first merge - see https://llvm.org/PR63928
272 std::__construct_at(__values
.get(), std::move(*__first
));
273 for (__iter_diff_t
<_RandomAccessIterator
> __i
= 1; __i
!= __size
; ++__i
) {
274 std::__construct_at(__values
.get() + __i
, std::move(__values
.get()[__i
- 1]));
276 *__first
= std::move(__values
.get()[__size
- 1]);
278 __libdispatch::__dispatch_parallel_for(
281 [&__leaf_sort
, &__comp
](_RandomAccessIterator __chunk_first
, _RandomAccessIterator __chunk_last
) {
282 __leaf_sort(std::move(__chunk_first
), std::move(__chunk_last
), __comp
);
285 bool __objects_are_in_buffer
= false;
287 const auto __old_chunk_size
= __partitions
.__chunk_size_
;
288 if (__partitions
.__chunk_count_
% 2 == 1) {
289 auto __inplace_merge_chunks
= [&__comp
, &__partitions
](auto __first_chunk_begin
) {
292 __first_chunk_begin
+ __partitions
.__first_chunk_size_
,
293 __first_chunk_begin
+ __partitions
.__first_chunk_size_
+ __partitions
.__chunk_size_
,
296 if (__objects_are_in_buffer
)
297 __inplace_merge_chunks(__values
.get());
299 __inplace_merge_chunks(__first
);
300 __partitions
.__first_chunk_size_
+= 2 * __partitions
.__chunk_size_
;
302 __partitions
.__first_chunk_size_
+= __partitions
.__chunk_size_
;
305 __partitions
.__chunk_size_
*= 2;
306 __partitions
.__chunk_count_
/= 2;
308 auto __merge_chunks
= [__partitions
, __old_chunk_size
, &__comp
](auto __from_first
, auto __to_first
) {
309 __libdispatch::__dispatch_parallel_for(
312 [__old_chunk_size
, &__from_first
, &__to_first
, &__comp
](auto __chunk_first
, auto __chunk_last
) {
313 std::merge(std::make_move_iterator(__chunk_first
),
314 std::make_move_iterator(__chunk_last
- __old_chunk_size
),
315 std::make_move_iterator(__chunk_last
- __old_chunk_size
),
316 std::make_move_iterator(__chunk_last
),
317 __to_first
+ (__chunk_first
- __from_first
),
322 if (__objects_are_in_buffer
)
323 __merge_chunks(__values
.get(), __first
);
325 __merge_chunks(__first
, __values
.get());
326 __objects_are_in_buffer
= !__objects_are_in_buffer
;
327 } while (__partitions
.__chunk_count_
> 1);
329 if (__objects_are_in_buffer
) {
330 std::move(__values
.get(), __values
.get() + __size
, __first
);
// Intentionally a no-op for this backend: there is no backend-global state to tear down here.
_LIBCPP_HIDE_FROM_ABI inline void __cancel_execution() {}
338 } // namespace __libdispatch
339 } // namespace __par_backend
341 _LIBCPP_END_NAMESPACE_STD
343 #endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
347 #endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKENDS_LIBDISPATCH_H