//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef _LIBCPP___PSTL_BACKENDS_LIBDISPATCH_H
#define _LIBCPP___PSTL_BACKENDS_LIBDISPATCH_H
#include <__cxx03/__algorithm/inplace_merge.h>
#include <__cxx03/__algorithm/lower_bound.h>
#include <__cxx03/__algorithm/max.h>
#include <__cxx03/__algorithm/merge.h>
#include <__cxx03/__algorithm/upper_bound.h>
#include <__cxx03/__atomic/atomic.h>
#include <__cxx03/__config>
#include <__cxx03/__exception/terminate.h>
#include <__cxx03/__iterator/iterator_traits.h>
#include <__cxx03/__iterator/move_iterator.h>
#include <__cxx03/__memory/allocator.h>
#include <__cxx03/__memory/construct_at.h>
#include <__cxx03/__memory/unique_ptr.h>
#include <__cxx03/__numeric/reduce.h>
#include <__cxx03/__pstl/backend_fwd.h>
#include <__cxx03/__pstl/cpu_algos/any_of.h>
#include <__cxx03/__pstl/cpu_algos/cpu_traits.h>
#include <__cxx03/__pstl/cpu_algos/fill.h>
#include <__cxx03/__pstl/cpu_algos/find_if.h>
#include <__cxx03/__pstl/cpu_algos/for_each.h>
#include <__cxx03/__pstl/cpu_algos/merge.h>
#include <__cxx03/__pstl/cpu_algos/stable_sort.h>
#include <__cxx03/__pstl/cpu_algos/transform.h>
#include <__cxx03/__pstl/cpu_algos/transform_reduce.h>
#include <__cxx03/__utility/empty.h>
#include <__cxx03/__utility/exception_guard.h>
#include <__cxx03/__utility/move.h>
#include <__cxx03/__utility/pair.h>
#include <__cxx03/cstddef>
#include <__cxx03/new>
#include <__cxx03/optional>

_LIBCPP_PUSH_MACROS
#include <__cxx03/__undef_macros>

_LIBCPP_BEGIN_NAMESPACE_STD
namespace __pstl {

namespace __libdispatch {
// ::dispatch_apply is marked as __attribute__((nothrow)) because it doesn't let exceptions propagate, and neither do
// we.
// TODO: Do we want to add [[_Clang::__callback__(__func, __context, __)]]?
_LIBCPP_EXPORTED_FROM_ABI void
__dispatch_apply(size_t __chunk_count, void* __context, void (*__func)(void* __context, size_t __chunk)) noexcept;
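
// Note: the wrapper below type-erases the callable: it passes the callable's
// address as the libdispatch context and a capture-less lambda (usable as a
// plain function pointer) that casts the context back and invokes it.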
template <class _Func>
_LIBCPP_HIDE_FROM_ABI void __dispatch_apply(size_t __chunk_count, _Func __func) noexcept {
  __libdispatch::__dispatch_apply(__chunk_count, &__func, [](void* __context, size_t __chunk) {
    (*static_cast<_Func*>(__context))(__chunk);
  });
}

struct __chunk_partitions {
  ptrdiff_t __chunk_count_; // includes the first chunk
  ptrdiff_t __chunk_size_;
  ptrdiff_t __first_chunk_size_;
};

[[__gnu__::__const__]] _LIBCPP_EXPORTED_FROM_ABI __chunk_partitions __partition_chunks(ptrdiff_t __size) noexcept;
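
// Note: chunk 0 covers [0, __first_chunk_size_) and every later chunk __chunk
// has size __chunk_size_, so the offset of chunk __chunk > 0 is
// __chunk * __chunk_size_ + (__first_chunk_size_ - __chunk_size_), i.e. the
// remainder of the division into chunks is absorbed entirely by the first one.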
template <class _RandomAccessIterator, class _Functor>
_LIBCPP_HIDE_FROM_ABI optional<__empty>
__dispatch_parallel_for(__chunk_partitions __partitions, _RandomAccessIterator __first, _Functor __func) {
  // Perform the chunked execution.
  __libdispatch::__dispatch_apply(__partitions.__chunk_count_, [&](size_t __chunk) {
    auto __this_chunk_size = __chunk == 0 ? __partitions.__first_chunk_size_ : __partitions.__chunk_size_;
    auto __index =
        __chunk == 0
            ? 0
            : (__chunk * __partitions.__chunk_size_) + (__partitions.__first_chunk_size_ - __partitions.__chunk_size_);
    __func(__first + __index, __first + __index + __this_chunk_size);
  });

  return __empty{};
}
} // namespace __libdispatch
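
// Specialization of the __cpu_traits customization point consumed by the
// generic CPU-backend algorithms in <__pstl/cpu_algos/*> included above; it
// implements the basis operations on top of __libdispatch::__dispatch_apply.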
template <>
struct __cpu_traits<__libdispatch_backend_tag> {
  template <class _RandomAccessIterator, class _Functor>
  _LIBCPP_HIDE_FROM_ABI static optional<__empty>
  __for_each(_RandomAccessIterator __first, _RandomAccessIterator __last, _Functor __func) {
    return __libdispatch::__dispatch_parallel_for(
        __libdispatch::__partition_chunks(__last - __first), std::move(__first), std::move(__func));
  }

  template <class _RandomAccessIterator1, class _RandomAccessIterator2, class _RandomAccessIteratorOut>
  struct __merge_range {
    __merge_range(_RandomAccessIterator1 __mid1, _RandomAccessIterator2 __mid2, _RandomAccessIteratorOut __result)
        : __mid1_(__mid1), __mid2_(__mid2), __result_(__result) {}

    _RandomAccessIterator1 __mid1_;
    _RandomAccessIterator2 __mid2_;
    _RandomAccessIteratorOut __result_;
  };

  template <typename _RandomAccessIterator1,
            typename _RandomAccessIterator2,
            typename _RandomAccessIterator3,
            typename _Compare,
            typename _LeafMerge>
  _LIBCPP_HIDE_FROM_ABI static optional<__empty>
  __merge(_RandomAccessIterator1 __first1,
          _RandomAccessIterator1 __last1,
          _RandomAccessIterator2 __first2,
          _RandomAccessIterator2 __last2,
          _RandomAccessIterator3 __result,
          _Compare __comp,
          _LeafMerge __leaf_merge) noexcept {
    __libdispatch::__chunk_partitions __partitions =
        __libdispatch::__partition_chunks(std::max<ptrdiff_t>(__last1 - __first1, __last2 - __first2));

    if (__partitions.__chunk_count_ == 0)
      return __empty{};

    if (__partitions.__chunk_count_ == 1) {
      __leaf_merge(__first1, __last1, __first2, __last2, __result, __comp);
      return __empty{};
    }

    using __merge_range_t = __merge_range<_RandomAccessIterator1, _RandomAccessIterator2, _RandomAccessIterator3>;
    auto const __n_ranges = __partitions.__chunk_count_ + 1;

    // TODO: use __uninitialized_buffer
    auto __destroy = [=](__merge_range_t* __ptr) {
      std::destroy_n(__ptr, __n_ranges);
      std::allocator<__merge_range_t>().deallocate(__ptr, __n_ranges);
    };
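
    // Note: the allocation is wrapped in an immediately-invoked lambda so that
    // std::bad_alloc is translated into a null pointer (and then nullopt)
    // rather than escaping this noexcept function.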
    unique_ptr<__merge_range_t[], decltype(__destroy)> __ranges(
        [&]() -> __merge_range_t* {
#ifndef _LIBCPP_HAS_NO_EXCEPTIONS
          try {
#endif
            return std::allocator<__merge_range_t>().allocate(__n_ranges);
#ifndef _LIBCPP_HAS_NO_EXCEPTIONS
          } catch (const std::bad_alloc&) {
            return nullptr;
          }
#endif
        }(),
        __destroy);

    if (__ranges == nullptr)
      return nullopt;

    // TODO: Improve the case where the smaller range is merged into just a few (or even one) chunks of the larger case
    __merge_range_t* __r = __ranges.get();
    std::__construct_at(__r++, __first1, __first2, __result);

    bool __iterate_first_range = __last1 - __first1 > __last2 - __first2;

    auto __compute_chunk = [&](size_t __chunk_size) -> __merge_range_t {
      auto [__mid1, __mid2] = [&] {
        if (__iterate_first_range) {
          auto __m1 = __first1 + __chunk_size;
          auto __m2 = std::lower_bound(__first2, __last2, __m1[-1], __comp);
          return std::make_pair(__m1, __m2);
        } else {
          auto __m2 = __first2 + __chunk_size;
          auto __m1 = std::lower_bound(__first1, __last1, __m2[-1], __comp);
          return std::make_pair(__m1, __m2);
        }
      }();

      __result += (__mid1 - __first1) + (__mid2 - __first2);
      __first1 = __mid1;
      __first2 = __mid2;
      return {std::move(__mid1), std::move(__mid2), __result};
    };
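
    // Note: the longer input is cut into fixed-size chunks and std::lower_bound
    // finds the matching split point in the shorter input, so each consecutive
    // pair of (__mid1_, __mid2_, __result_) triples delimits an independent
    // sub-merge that can run in parallel.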

    // handle first chunk
    std::__construct_at(__r++, __compute_chunk(__partitions.__first_chunk_size_));

    // handle 2 -> N - 1 chunks
    for (ptrdiff_t __i = 0; __i != __partitions.__chunk_count_ - 2; ++__i)
      std::__construct_at(__r++, __compute_chunk(__partitions.__chunk_size_));

    // handle last chunk
    std::__construct_at(__r, __last1, __last2, __result);

    __libdispatch::__dispatch_apply(__partitions.__chunk_count_, [&](size_t __index) {
      auto __first_iters = __ranges[__index];
      auto __last_iters  = __ranges[__index + 1];
      __leaf_merge(__first_iters.__mid1_,
                   __last_iters.__mid1_,
                   __first_iters.__mid2_,
                   __last_iters.__mid2_,
                   __first_iters.__result_,
                   __comp);
    });

    return __empty{};
  }
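
  // __transform_reduce below reduces each chunk in parallel into one value per
  // chunk, then folds those per-chunk values sequentially with std::reduce.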

  template <class _RandomAccessIterator, class _Transform, class _Value, class _Combiner, class _Reduction>
  _LIBCPP_HIDE_FROM_ABI static optional<_Value> __transform_reduce(
      _RandomAccessIterator __first,
      _RandomAccessIterator __last,
      _Transform __transform,
      _Value __init,
      _Combiner __combiner,
      _Reduction __reduction) {
    if (__first == __last)
      return __init;

    auto __partitions = __libdispatch::__partition_chunks(__last - __first);

    auto __destroy = [__count = __partitions.__chunk_count_](_Value* __ptr) {
      std::destroy_n(__ptr, __count);
      std::allocator<_Value>().deallocate(__ptr, __count);
    };

    // TODO: use __uninitialized_buffer
    // TODO: allocate one element per worker instead of one element per chunk
    unique_ptr<_Value[], decltype(__destroy)> __values(
        std::allocator<_Value>().allocate(__partitions.__chunk_count_), __destroy);
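
    // Each chunk reduces to a single _Value: the first two transformed elements
    // seed the accumulator via __combiner, and __reduction folds in the rest; a
    // one-element chunk is simply __transform of that element.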
    // __dispatch_apply is noexcept
    __libdispatch::__dispatch_apply(__partitions.__chunk_count_, [&](size_t __chunk) {
      auto __this_chunk_size = __chunk == 0 ? __partitions.__first_chunk_size_ : __partitions.__chunk_size_;
      auto __index           = __chunk == 0 ? 0
                                            : (__chunk * __partitions.__chunk_size_) +
                                                  (__partitions.__first_chunk_size_ - __partitions.__chunk_size_);
      if (__this_chunk_size != 1) {
        std::__construct_at(
            __values.get() + __chunk,
            __reduction(__first + __index + 2,
                        __first + __index + __this_chunk_size,
                        __combiner(__transform(__first + __index), __transform(__first + __index + 1))));
      } else {
        std::__construct_at(__values.get() + __chunk, __transform(__first + __index));
      }
    });

    return std::reduce(
        std::make_move_iterator(__values.get()),
        std::make_move_iterator(__values.get() + __partitions.__chunk_count_),
        std::move(__init),
        __combiner);
  }

  template <class _RandomAccessIterator, class _Comp, class _LeafSort>
  _LIBCPP_HIDE_FROM_ABI static optional<__empty>
  __stable_sort(_RandomAccessIterator __first, _RandomAccessIterator __last, _Comp __comp, _LeafSort __leaf_sort) {
    const auto __size = __last - __first;
    auto __partitions = __libdispatch::__partition_chunks(__size);

    if (__partitions.__chunk_count_ == 0)
      return __empty{};

    if (__partitions.__chunk_count_ == 1) {
      __leaf_sort(__first, __last, __comp);
      return __empty{};
    }

    using _Value = __iter_value_type<_RandomAccessIterator>;

    auto __destroy = [__size](_Value* __ptr) {
      std::destroy_n(__ptr, __size);
      std::allocator<_Value>().deallocate(__ptr, __size);
    };

    // TODO: use __uninitialized_buffer
    unique_ptr<_Value[], decltype(__destroy)> __values(std::allocator<_Value>().allocate(__size), __destroy);

    // Initialize all elements to a moved-from state
    // TODO: Don't do this - this can be done in the first merge - see https://llvm.org/PR63928
    std::__construct_at(__values.get(), std::move(*__first));
    for (__iter_diff_t<_RandomAccessIterator> __i = 1; __i != __size; ++__i) {
      std::__construct_at(__values.get() + __i, std::move(__values.get()[__i - 1]));
    }
    *__first = std::move(__values.get()[__size - 1]);
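
    // Note: the chain above moves *__first through every buffer slot and back,
    // leaving each buffer element constructed (in a moved-from state) without
    // losing the original value of *__first.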

    __libdispatch::__dispatch_parallel_for(
        __partitions,
        __first,
        [&__leaf_sort, &__comp](_RandomAccessIterator __chunk_first, _RandomAccessIterator __chunk_last) {
          __leaf_sort(std::move(__chunk_first), std::move(__chunk_last), __comp);
        });
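
    // Bottom-up merge sort: each pass merges pairs of adjacent sorted chunks,
    // ping-ponging the elements between the input range and the buffer;
    // __objects_are_in_buffer tracks which side currently holds them.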
    bool __objects_are_in_buffer = false;
    do {
      const auto __old_chunk_size = __partitions.__chunk_size_;
      if (__partitions.__chunk_count_ % 2 == 1) {
        auto __inplace_merge_chunks = [&__comp, &__partitions](auto __first_chunk_begin) {
          std::inplace_merge(
              __first_chunk_begin,
              __first_chunk_begin + __partitions.__first_chunk_size_,
              __first_chunk_begin + __partitions.__first_chunk_size_ + __partitions.__chunk_size_,
              __comp);
        };
        if (__objects_are_in_buffer)
          __inplace_merge_chunks(__values.get());
        else
          __inplace_merge_chunks(__first);
        __partitions.__first_chunk_size_ += 2 * __partitions.__chunk_size_;
      } else {
        __partitions.__first_chunk_size_ += __partitions.__chunk_size_;
      }

      __partitions.__chunk_size_ *= 2;
      __partitions.__chunk_count_ /= 2;

      auto __merge_chunks = [__partitions, __old_chunk_size, &__comp](auto __from_first, auto __to_first) {
        __libdispatch::__dispatch_parallel_for(
            __partitions,
            __from_first,
            [__old_chunk_size, &__from_first, &__to_first, &__comp](auto __chunk_first, auto __chunk_last) {
              std::merge(std::make_move_iterator(__chunk_first),
                         std::make_move_iterator(__chunk_last - __old_chunk_size),
                         std::make_move_iterator(__chunk_last - __old_chunk_size),
                         std::make_move_iterator(__chunk_last),
                         __to_first + (__chunk_first - __from_first),
                         __comp);
            });
      };

      if (__objects_are_in_buffer)
        __merge_chunks(__values.get(), __first);
      else
        __merge_chunks(__first, __values.get());
      __objects_are_in_buffer = !__objects_are_in_buffer;
    } while (__partitions.__chunk_count_ > 1);

    if (__objects_are_in_buffer) {
      // The last merge pass left the elements in the buffer; move them back.
      std::move(__values.get(), __values.get() + __size, __first);
    }

    return __empty{};
  }

  _LIBCPP_HIDE_FROM_ABI static void __cancel_execution() {}

  static constexpr size_t __lane_size = 64;
};

// Mandatory implementations of the computational basis
template <class _ExecutionPolicy>
struct __find_if<__libdispatch_backend_tag, _ExecutionPolicy>
    : __cpu_parallel_find_if<__libdispatch_backend_tag, _ExecutionPolicy> {};

template <class _ExecutionPolicy>
struct __for_each<__libdispatch_backend_tag, _ExecutionPolicy>
    : __cpu_parallel_for_each<__libdispatch_backend_tag, _ExecutionPolicy> {};

template <class _ExecutionPolicy>
struct __merge<__libdispatch_backend_tag, _ExecutionPolicy>
    : __cpu_parallel_merge<__libdispatch_backend_tag, _ExecutionPolicy> {};

template <class _ExecutionPolicy>
struct __stable_sort<__libdispatch_backend_tag, _ExecutionPolicy>
    : __cpu_parallel_stable_sort<__libdispatch_backend_tag, _ExecutionPolicy> {};

template <class _ExecutionPolicy>
struct __transform<__libdispatch_backend_tag, _ExecutionPolicy>
    : __cpu_parallel_transform<__libdispatch_backend_tag, _ExecutionPolicy> {};

template <class _ExecutionPolicy>
struct __transform_binary<__libdispatch_backend_tag, _ExecutionPolicy>
    : __cpu_parallel_transform_binary<__libdispatch_backend_tag, _ExecutionPolicy> {};

template <class _ExecutionPolicy>
struct __transform_reduce<__libdispatch_backend_tag, _ExecutionPolicy>
    : __cpu_parallel_transform_reduce<__libdispatch_backend_tag, _ExecutionPolicy> {};

template <class _ExecutionPolicy>
struct __transform_reduce_binary<__libdispatch_backend_tag, _ExecutionPolicy>
    : __cpu_parallel_transform_reduce_binary<__libdispatch_backend_tag, _ExecutionPolicy> {};

// Not mandatory, but better optimized
template <class _ExecutionPolicy>
struct __any_of<__libdispatch_backend_tag, _ExecutionPolicy>
    : __cpu_parallel_any_of<__libdispatch_backend_tag, _ExecutionPolicy> {};

template <class _ExecutionPolicy>
struct __fill<__libdispatch_backend_tag, _ExecutionPolicy>
    : __cpu_parallel_fill<__libdispatch_backend_tag, _ExecutionPolicy> {};

} // namespace __pstl
_LIBCPP_END_NAMESPACE_STD

_LIBCPP_POP_MACROS

#endif // _LIBCPP___PSTL_BACKENDS_LIBDISPATCH_H