1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
10 #ifndef INCLUDED_COMPHELPER_PARALLELSORT_HXX
11 #define INCLUDED_COMPHELPER_PARALLELSORT_HXX
13 #include <comphelper/threadpool.hxx>
14 #include <tools/cpuid.hxx>
28 const size_t nThreadCountGlobal
= std::thread::hardware_concurrency();
29 const bool bHyperThreadingActive
= cpuid::hasHyperThreading();
30 static comphelper::ThreadPool
& rTPool(comphelper::ThreadPool::getSharedOptimalPool());
32 static thread_local
std::mt19937 aGenerator
{ std::random_device
{}() };
34 #define PARALLELSORT_ENABLEPZ 0
41 #if PARALLELSORT_ENABLEPZ
42 ProfileZone(const char* pTag
)
44 , maStart(std::chrono::steady_clock::now())
61 ProfileZone(const char* /*pTag*/)
68 // Avoid loplugin:staticmethods, loplugin:staticaccess errors
74 #if PARALLELSORT_ENABLEPZ
76 void showTimeElapsed()
78 auto end
= std::chrono::steady_clock::now();
80 = std::chrono::duration_cast
<std::chrono::milliseconds
>(end
- maStart
).count();
81 std::cout
<< maTag
<< " : " << elapsed
<< " ms" << std::endl
<< std::flush
;
85 std::chrono::steady_clock::time_point maStart
;
95 class Executor final
: public comphelper::ThreadTask
98 Executor(const std::shared_ptr
<comphelper::ThreadTaskTag
>& rTag
,
99 std::function
<void()> aFunc
)
100 : comphelper::ThreadTask(rTag
)
101 , maFunc(std::move(aFunc
))
105 virtual void doWork() override
{ maFunc(); }
108 const std::function
<void()> maFunc
;
112 ParallelRunner() { maTag
= comphelper::ThreadPool::createThreadTaskTag(); }
114 void enqueue(std::function
<void()> aFunc
)
116 rTPool
.pushTask(std::make_unique
<Executor
>(maTag
, aFunc
));
119 void wait() { rTPool
.waitUntilDone(maTag
, false); }
122 std::shared_ptr
<comphelper::ThreadTaskTag
> maTag
;
125 constexpr size_t nMaxTreeArraySize
= 64;
127 size_t lcl_tree_array_size(size_t nNum
)
130 for (nPow2
= 1; nPow2
<= nNum
; nPow2
<<= 1)
132 return std::clamp((nPow2
>> 1), size_t(1), nMaxTreeArraySize
);
135 template <class RandItr
> struct Sampler
137 using ValueType
= typename
std::iterator_traits
<RandItr
>::value_type
;
139 static void sample(RandItr aBegin
, RandItr aEnd
, ValueType
* pSamples
, size_t nSamples
,
140 size_t /*nParallelism*/)
142 ProfileZone
aZone("\tsample()");
143 assert(aBegin
<= aEnd
);
144 size_t nLen
= static_cast<std::size_t>(aEnd
- aBegin
);
145 assert(std::mt19937::max() >= nLen
);
147 for (size_t nIdx
= 0; nIdx
< nSamples
; ++nIdx
)
149 size_t nSel
= aGenerator() % nLen
--;
151 swap(*(aBegin
+ nSel
), *(aBegin
+ nLen
));
152 pSamples
[nIdx
] = *(aBegin
+ nLen
);
157 template <class RandItr
, class Compare
> class Binner
159 using ValueType
= typename
std::iterator_traits
<RandItr
>::value_type
;
161 const size_t mnTreeArraySize
;
162 const size_t mnDividers
;
163 constexpr static size_t mnMaxStaticSize
= 1024 * 50;
164 uint8_t maLabels
[mnMaxStaticSize
];
165 ValueType maDividers
[nMaxTreeArraySize
];
166 std::unique_ptr
<uint8_t[]> pLabels
;
167 size_t maSepBinEnds
[nMaxTreeArraySize
* nMaxTreeArraySize
];
171 size_t maBinEnds
[nMaxTreeArraySize
];
173 Binner(const ValueType
* pSamples
, size_t nSamples
, size_t nBins
, bool bThreaded
)
174 : mnTreeArraySize(lcl_tree_array_size(nBins
))
175 , mnDividers(mnTreeArraySize
- 1)
176 , mbThreaded(bThreaded
)
178 assert((nSamples
% mnTreeArraySize
) == 0);
179 assert(mnTreeArraySize
<= nMaxTreeArraySize
);
180 std::fill(maBinEnds
, maBinEnds
+ mnTreeArraySize
, 0);
181 std::fill(maSepBinEnds
, maSepBinEnds
+ mnTreeArraySize
* mnTreeArraySize
, 0);
182 fillTreeArray(1, pSamples
, pSamples
+ nSamples
);
185 void fillTreeArray(size_t nPos
, const ValueType
* pLow
, const ValueType
* pHigh
)
187 assert(pLow
<= pHigh
);
188 const ValueType
* pMid
= pLow
+ (pHigh
- pLow
) / 2;
189 maDividers
[nPos
] = *pMid
;
191 if (2 * nPos
< mnDividers
) // So that 2*nPos < mnTreeArraySize
193 fillTreeArray(2 * nPos
, pLow
, pMid
);
194 fillTreeArray(2 * nPos
+ 1, pMid
+ 1, pHigh
);
198 constexpr inline size_t findBin(const ValueType
& rVal
, Compare
& aComp
)
201 while (nIdx
<= mnDividers
)
202 nIdx
= ((nIdx
<< 1) + aComp(maDividers
[nIdx
], rVal
));
203 return (nIdx
- mnTreeArraySize
);
206 void label(const RandItr aBegin
, const RandItr aEnd
, Compare
& aComp
)
208 ProfileZone
aZoneSetup("\tlabel():setup");
209 size_t nLen
= static_cast<std::size_t>(aEnd
- aBegin
);
210 if (nLen
> mnMaxStaticSize
)
211 pLabels
= std::make_unique
<uint8_t[]>(nLen
);
212 uint8_t* pLabelsRaw
= (nLen
> mnMaxStaticSize
) ? pLabels
.get() : maLabels
;
214 ProfileZone
aZoneFindBins("\tFindBins()");
217 ParallelRunner aPRunner
;
218 const size_t nBins
= mnTreeArraySize
;
219 for (size_t nTIdx
= 0; nTIdx
< nBins
; ++nTIdx
)
221 aPRunner
.enqueue([this, nTIdx
, nBins
, nLen
, aBegin
, pLabelsRaw
, &aComp
] {
222 ProfileZone
aZoneIn("\t\tFindBinsThreaded()");
223 size_t nBinEndsStartIdx
= nTIdx
* mnTreeArraySize
;
224 size_t* pBinEnds
= maSepBinEnds
+ nBinEndsStartIdx
;
225 size_t aBinEndsF
[nMaxTreeArraySize
] = { 0 };
226 for (size_t nIdx
= nTIdx
; nIdx
< nLen
; nIdx
+= nBins
)
228 size_t nBinIdx
= findBin(*(aBegin
+ nIdx
), aComp
);
229 pLabelsRaw
[nIdx
] = static_cast<uint8_t>(nBinIdx
);
230 ++aBinEndsF
[nBinIdx
];
233 for (size_t nIdx
= 0; nIdx
< mnTreeArraySize
; ++nIdx
)
234 pBinEnds
[nIdx
] = aBinEndsF
[nIdx
];
240 // Populate maBinEnds from maSepBinEnds
241 for (size_t nTIdx
= 0; nTIdx
< mnTreeArraySize
; ++nTIdx
)
243 for (size_t nSepIdx
= 0; nSepIdx
< mnTreeArraySize
; ++nSepIdx
)
244 maBinEnds
[nTIdx
] += maSepBinEnds
[nSepIdx
* mnTreeArraySize
+ nTIdx
];
249 uint8_t* pLabel
= pLabelsRaw
;
250 for (RandItr aItr
= aBegin
; aItr
!= aEnd
; ++aItr
)
252 size_t nBinIdx
= findBin(*aItr
, aComp
);
254 ++maBinEnds
[nBinIdx
];
258 aZoneFindBins
.stop();
261 // Store each bin's starting position in maBinEnds array for now.
262 for (size_t nIdx
= 0; nIdx
< mnTreeArraySize
; ++nIdx
)
264 size_t nSize
= maBinEnds
[nIdx
];
265 maBinEnds
[nIdx
] = nSum
;
269 // Now maBinEnds has end positions of each bin.
272 void bin(const RandItr aBegin
, const RandItr aEnd
, ValueType
* pOut
)
274 ProfileZone
aZone("\tbin()");
275 const size_t nLen
= static_cast<std::size_t>(aEnd
- aBegin
);
276 uint8_t* pLabelsRaw
= (nLen
> mnMaxStaticSize
) ? pLabels
.get() : maLabels
;
278 for (nIdx
= 0; nIdx
< nLen
; ++nIdx
)
280 pOut
[maBinEnds
[pLabelsRaw
[nIdx
]]++] = *(aBegin
+ nIdx
);
285 template <class RandItr
, class Compare
= std::less
<>>
286 void s3sort(const RandItr aBegin
, const RandItr aEnd
, Compare aComp
= Compare(),
287 bool bThreaded
= true)
289 static size_t nThreadCount
= nThreadCountGlobal
;
291 constexpr size_t nBaseCaseSize
= 1024;
292 const std::size_t nLen
= static_cast<std::size_t>(aEnd
- aBegin
);
293 if (nLen
< nBaseCaseSize
)
295 std::stable_sort(aBegin
, aEnd
, aComp
);
299 using ValueType
= typename
std::iterator_traits
<RandItr
>::value_type
;
300 auto pOut
= std::make_unique
<ValueType
[]>(nLen
);
302 const size_t nBins
= lcl_tree_array_size(nThreadCount
);
304 const size_t nOverSamplingFactor
= std::max(1.0, std::sqrt(static_cast<double>(nLen
) / 64));
305 const size_t nSamples
= nOverSamplingFactor
* nBins
;
306 auto aSamples
= std::make_unique
<ValueType
[]>(nSamples
);
307 ProfileZone
aZoneSampleAnsSort("SampleAndSort");
308 // Select samples and sort them
309 Sampler
<RandItr
>::sample(aBegin
, aEnd
, aSamples
.get(), nSamples
, nBins
);
310 std::sort(aSamples
.get(), aSamples
.get() + nSamples
, aComp
);
311 aZoneSampleAnsSort
.stop();
313 if (!aComp(aSamples
[0], aSamples
[nSamples
- 1]))
315 // All samples are equal, fallback to standard sort.
316 std::sort(aBegin
, aEnd
, aComp
);
320 ProfileZone
aZoneBinner("Binner");
321 // Create and populate bins using pOut from input iterators.
322 Binner
<RandItr
, Compare
> aBinner(aSamples
.get(), nSamples
, nBins
, bThreaded
);
323 aBinner
.label(aBegin
, aEnd
, aComp
);
324 aBinner
.bin(aBegin
, aEnd
, pOut
.get());
327 ProfileZone
aZoneSortBins("SortBins");
328 ValueType
* pOutRaw
= pOut
.get();
331 ParallelRunner aPRunner
;
332 // Sort the bins separately.
333 for (size_t nBinIdx
= 0, nBinStart
= 0; nBinIdx
< nBins
; ++nBinIdx
)
335 size_t nBinEnd
= aBinner
.maBinEnds
[nBinIdx
];
336 aPRunner
.enqueue([pOutRaw
, nBinStart
, nBinEnd
, &aComp
] {
337 std::sort(pOutRaw
+ nBinStart
, pOutRaw
+ nBinEnd
, aComp
);
347 for (size_t nBinIdx
= 0, nBinStart
= 0; nBinIdx
< nBins
; ++nBinIdx
)
349 auto nBinEnd
= aBinner
.maBinEnds
[nBinIdx
];
350 std::sort(pOutRaw
+ nBinStart
, pOutRaw
+ nBinEnd
, aComp
);
355 aZoneSortBins
.stop();
357 // Move the sorted array to the array specified by input iterators.
358 std::move(pOutRaw
, pOutRaw
+ nLen
, aBegin
);
361 } // anonymous namespace
363 template <class RandItr
, class Compare
= std::less
<>>
364 void parallelSort(const RandItr aBegin
, const RandItr aEnd
, Compare aComp
= Compare())
366 assert(aBegin
<= aEnd
);
367 s3sort(aBegin
, aEnd
, aComp
);
370 } // namespace comphelper
372 #endif // INCLUDED_COMPHELPER_PARALLELSORT_HXX
374 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */