1 /******************************************************************************
2 * Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved.
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions are met:
6 * * Redistributions of source code must retain the above copyright
7 * notice, this list of conditions and the following disclaimer.
8 * * Redistributions in binary form must reproduce the above copyright
9 * notice, this list of conditions and the following disclaimer in the
10 * documentation and/or other materials provided with the distribution.
11 * * Neither the name of the NVIDIA CORPORATION nor the
12 * names of its contributors may be used to endorse or promote products
13 * derived from this software without specific prior written permission.
15 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
16 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18 * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
19 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 ******************************************************************************/
28 /******************************************************************************
30 * Code and text by Sean Baxter, NVIDIA Research
31 * See http://nvlabs.github.io/moderngpu for repository and documentation.
33 ******************************************************************************/
37 #include "mgpudevice.cuh"
38 #include "util/mgpucontext.h"
42 ////////////////////////////////////////////////////////////////////////////////
45 // Reduce input and return variable in device memory or host memory, or both.
46 // Provide a non-null pointer to retrieve data.
47 template<typename InputIt, typename T, typename Op>
48 MGPU_HOST void Reduce(InputIt data_global, int count, T identity, Op op,
49 T* reduce_global, T* reduce_host, CudaContext& context);
51 // T = std::iterator_traits<InputIt>::value_type.
52 // Reduce with identity = 0 and op = mgpu::plus<T>.
53 // Returns the value in host memory.
54 template<typename InputIt>
55 MGPU_HOST typename std::iterator_traits<InputIt>::value_type
56 Reduce(InputIt data_global, int count, CudaContext& context);
59 ////////////////////////////////////////////////////////////////////////////////
62 // Scan inputs in device memory.
63 // MgpuScanType may be:
64 // MgpuScanTypeExc (exclusive) or
65 // MgpuScanTypeInc (inclusive).
66 // Return the total in device memory, host memory, or both.
67 template<MgpuScanType Type, typename DataIt, typename T, typename Op,
69 MGPU_HOST void Scan(DataIt data_global, int count, T identity, Op op,
70 T* reduce_global, T* reduce_host, DestIt dest_global,
71 CudaContext& context);
73 // Exclusive scan with identity = 0 and op = mgpu::plus<T>.
74 // Returns the total in host memory.
75 template<typename InputIt, typename TotalType>
76 MGPU_HOST void ScanExc(InputIt data_global, int count, TotalType* total,
77 CudaContext& context);
79 // Like above, but don't return the total.
80 template<typename InputIt>
81 MGPU_HOST void ScanExc(InputIt data_global, int count, CudaContext& context);
83 ////////////////////////////////////////////////////////////////////////////////
84 // kernels/bulkremove.cuh
86 // Compact the elements in source_global by removing elements identified by
87 // indices_global. indices_global must be unique, sorted, and range between 0
88 // and sourceCount - 1. The number of outputs is sourceCount - indicesCount.
90 // IndicesIt should resolve to an integer type. iterators like step_iterator
93 // If sourceCount = 10, indicesCount = 6, and indices = (1, 3, 4, 5, 7, 8), then
94 // dest = A0 A2 A6 A9. (All indices between 0 and sourceCount - 1 except those
95 // in indices_global).
96 template<typename InputIt, typename IndicesIt, typename OutputIt>
97 MGPU_HOST void BulkRemove(InputIt source_global, int sourceCount,
98 IndicesIt indices_global, int indicesCount, OutputIt dest_global,
99 CudaContext& context);
102 ////////////////////////////////////////////////////////////////////////////////
103 // kernels/bulkinsert.cuh
105 // Combine aCount elements in a_global with bCount elements in b_global.
106 // Each element a_global[i] is inserted before position indices_global[i] and
107 // stored to dest_global. The insertion indices are relative to the B array,
108 // not the output. Indices must be sorted but not necessarily unique.
110 // If aCount = 5, bCount = 3, and indices = (1, 1, 2, 3, 3), the output is:
111 // B0 A0 A1 B1 A2 B2 A3 A4.
112 template<typename InputIt1, typename IndicesIt, typename InputIt2,
114 MGPU_HOST void BulkInsert(InputIt1 a_global, IndicesIt indices_global,
115 int aCount, InputIt2 b_global, int bCount, OutputIt dest_global,
116 CudaContext& context);
119 ////////////////////////////////////////////////////////////////////////////////
122 // MergeKeys merges two arrays of sorted inputs with C++-comparison semantics.
123 // aCount items from aKeys_global and bCount items from bKeys_global are merged
124 // into aCount + bCount items in keys_global.
125 // Comp is a comparator type supporting strict weak ordering.
126 // If !comp(b, a), then a is placed before b in the output.
127 template<typename KeysIt1, typename KeysIt2, typename KeysIt3, typename Comp>
128 MGPU_HOST void MergeKeys(KeysIt1 aKeys_global, int aCount, KeysIt2 bKeys_global,
129 int bCount, KeysIt3 keys_global, Comp comp, CudaContext& context);
131 // MergeKeys specialized with Comp = mgpu::less<T>.
132 template<typename KeysIt1, typename KeysIt2, typename KeysIt3>
133 MGPU_HOST void MergeKeys(KeysIt1 aKeys_global, int aCount, KeysIt2 bKeys_global,
134 int bCount, KeysIt3 keys_global, CudaContext& context);
136 // MergePairs merges two arrays of sorted inputs by key and copies values.
137 // If !comp(bKey, aKey), then aKey is placed before bKey in the output, and
138 // the corresponding aData is placed before bData. This corresponds to *_by_key
139 // functions in Thrust.
140 template<typename KeysIt1, typename KeysIt2, typename KeysIt3, typename ValsIt1,
141 typename ValsIt2, typename ValsIt3, typename Comp>
142 MGPU_HOST void MergePairs(KeysIt1 aKeys_global, ValsIt1 aVals_global,
143 int aCount, KeysIt2 bKeys_global, ValsIt2 bVals_global, int bCount,
144 KeysIt3 keys_global, ValsIt3 vals_global, Comp comp, CudaContext& context);
146 // MergePairs specialized with Comp = mgpu::less<T>.
147 template<typename KeysIt1, typename KeysIt2, typename KeysIt3, typename ValsIt1,
148 typename ValsIt2, typename ValsIt3>
149 MGPU_HOST void MergePairs(KeysIt1 aKeys_global, ValsIt1 aVals_global,
150 int aCount, KeysIt2 bKeys_global, ValsIt2 bVals_global, int bCount,
151 KeysIt3 keys_global, ValsIt3 vals_global, CudaContext& context);
154 ////////////////////////////////////////////////////////////////////////////////
155 // kernels/mergesort.cuh
157 // MergesortKeys sorts data_global using comparator Comp.
158 // If !comp(b, a), then a comes before b in the output. The data is sorted
160 template<typename T, typename Comp>
161 MGPU_HOST void MergesortKeys(T* data_global, int count, Comp comp,
162 CudaContext& context);
164 // MergesortKeys specialized with Comp = mgpu::less<T>.
166 MGPU_HOST void MergesortKeys(T* data_global, int count, CudaContext& context);
168 // MergesortPairs sorts data by key, copying data. This corresponds to
169 // sort_by_key in Thrust.
170 template<typename KeyType, typename ValType, typename Comp>
171 MGPU_HOST void MergesortPairs(KeyType* keys_global, ValType* values_global,
172 int count, Comp comp, CudaContext& context);
174 // MergesortPairs specialized with Comp = mgpu::less<KeyType>.
175 template<typename KeyType, typename ValType>
176 MGPU_HOST void MergesortPairs(KeyType* keys_global, ValType* values_global,
177 int count, CudaContext& context);
179 // MergesortIndices is like MergesortPairs where values_global is treated as
180 // if initialized with integers (0 ... count - 1).
181 template<typename KeyType, typename Comp>
182 MGPU_HOST void MergesortIndices(KeyType* keys_global, int* values_global,
183 int count, Comp comp, CudaContext& context);
185 // MergesortIndices specialized with Comp = mgpu::less<KeyType>.
186 template<typename KeyType>
187 MGPU_HOST void MergesortIndices(KeyType* keys_global, int* values_global,
188 int count, CudaContext& context);
191 ////////////////////////////////////////////////////////////////////////////////
192 // kernels/segmentedsort.cuh
194 // Mergesort count items in-place in data_global. Keys are compared with Comp
195 // (as they are in MergesortKeys), however keys remain inside the segments
196 // defined by flags_global.
198 // flags_global is a bitfield cast to uint*. Each bit in flags_global is a
199 // segment head flag. Only keys between segment head flags (inclusive on the
200 // left and exclusive on the right) may be exchanged. The first element is
201 // assumed to start a segment, regardless of the value of bit 0.
203 // Passing verbose=true causes the function to print mergepass statistics to the
204 // console. This may be helpful for developers to understand the performance
205 // characteristics of the function and how effectively it early-exits merge
207 template<typename T, typename Comp>
208 MGPU_HOST void SegSortKeysFromFlags(T* data_global, int count,
209 const uint* flags_global, CudaContext& context, Comp comp,
210 bool verbose = false);
212 // SegSortKeysFromFlags specialized with Comp = mgpu::less<T>.
214 MGPU_HOST void SegSortKeysFromFlags(T* data_global, int count,
215 const uint* flags_global, CudaContext& context, bool verbose = false);
217 // Segmented sort using head flags and supporting value exchange.
218 template<bool Stable, typename KeyType, typename ValType, typename Comp>
219 MGPU_HOST void SegSortPairsFromFlags(KeyType* keys_global,
220 ValType* values_global, const uint* flags_global, int count,
221 CudaContext& context, Comp comp, bool verbose = false);
223 // SegSortPairsFromFlags specialized with Comp = mgpu::less<T>.
224 template<bool Stable, typename KeyType, typename ValType>
225 MGPU_HOST void SegSortPairsFromFlags(KeyType* keys_global,
226 ValType* values_global, const uint* flags_global, int count,
227 CudaContext& context, bool verbose = false);
229 // Segmented sort using segment indices rather than head flags. indices_global
230 // is a sorted and unique list of indicesCount segment start locations. These
231 // indices correspond to the set bits in the flags_global field. A segment
232 // head index for position 0 may be omitted.
233 template<typename T, typename Comp>
234 MGPU_HOST void SegSortKeysFromIndices(T* data_global, int count,
235 const int* indices_global, int indicesCount, CudaContext& context,
236 Comp comp, bool verbose = false);
238 // SegSortKeysFromIndices specialized with Comp = mgpu::less<T>.
240 MGPU_HOST void SegSortKeysFromIndices(T* data_global, int count,
241 const int* indices_global, int indicesCount, CudaContext& context,
242 bool verbose = false);
244 // Segmented sort using segment indices and supporting value exchange.
245 template<typename KeyType, typename ValType, typename Comp>
246 MGPU_HOST void SegSortPairsFromIndices(KeyType* keys_global,
247 ValType* values_global, int count, const int* indices_global,
248 int indicesCount, CudaContext& context, Comp comp, bool verbose = false);
250 // SegSortPairsFromIndices specialized with Comp = mgpu::less<KeyType>.
251 template<typename KeyType, typename ValType>
252 MGPU_HOST void SegSortPairsFromIndices(KeyType* keys_global,
253 ValType* values_global, int count, const int* indices_global,
254 int indicesCount, CudaContext& context, bool verbose = false);
257 ////////////////////////////////////////////////////////////////////////////////
258 // kernels/localitysort.cuh
260 // LocalitySortKeys is a version of MergesortKeys optimized for non-uniformly
261 // random input arrays. If the keys begin close to their sorted destinations,
262 // this function may exploit the structure to early-exit merge passes.
264 // Passing verbose=true causes the function to print mergepass statistics to the
265 // console. This may be helpful for developers to understand the performance
266 // characteristics of the function and how effectively it early-exits merge
268 template<typename T, typename Comp>
269 MGPU_HOST void LocalitySortKeys(T* data_global, int count, CudaContext& context,
270 Comp comp, bool verbose = false);
272 // LocalitySortKeys specialized with Comp = mgpu::less<T>.
274 MGPU_HOST void LocalitySortKeys(T* data_global, int count, CudaContext& context,
275 bool verbose = false);
277 // Locality sort supporting value exchange.
278 template<typename KeyType, typename ValType, typename Comp>
279 MGPU_HOST void LocalitySortPairs(KeyType* keys_global, ValType* values_global,
280 int count, CudaContext& context, Comp comp, bool verbose = false);
282 // LocalitySortPairs specialized with Comp = mpgu::less<T>.
283 template<typename KeyType, typename ValType>
284 MGPU_HOST void LocalitySortPairs(KeyType* keys_global, ValType* values_global,
285 int count, CudaContext& context, bool verbose = false);
288 ////////////////////////////////////////////////////////////////////////////////
289 // kernels/sortedsearch.cuh
291 // Vectorized sorted search. This is the most general form of the function.
292 // Executes two simultaneous linear searches on two sorted inputs.
296 // lower-bound search of A into B.
297 // upper-bound search of B into A.
299 // upper-bound search of A into B.
300 // lower-bound search of B into A.
302 // MgpuSearchTypeNone - no output for this input.
303 // MgpuSearchTypeIndex - return search indices as integers.
304 // MgpuSearchTypeMatch - return 0 (no match) or 1 (match).
305 // For TypeA, returns 1 if there is at least 1 matching element in B
307 // For TypeB, returns 1 if there is at least 1 matching element in A
309 // MgpuSearchTypeIndexMatch - return search indices as integers. Most
310 // significant bit is match bit.
311 // aMatchCount, bMatchCount - If Type is Match or IndexMatch, return the total
312 // number of elements in A or B with matches in B or A, if the pointer is
313 // not null. This generates a synchronous cudaMemcpyDeviceToHost call that
314 // callers using streams should be aware of.
315 template<MgpuBounds Bounds, MgpuSearchType TypeA, MgpuSearchType TypeB,
316 typename InputIt1, typename InputIt2, typename OutputIt1,
317 typename OutputIt2, typename Comp>
318 MGPU_HOST void SortedSearch(InputIt1 a_global, int aCount, InputIt2 b_global,
319 int bCount, OutputIt1 aIndices_global, OutputIt2 bIndices_global,
320 Comp comp, CudaContext& context, int* aMatchCount = 0,
321 int* bMatchCount = 0);
323 // SortedSearch specialized with Comp = mgpu::less<T>.
324 template<MgpuBounds Bounds, MgpuSearchType TypeA, MgpuSearchType TypeB,
325 typename InputIt1, typename InputIt2, typename OutputIt1,
327 MGPU_HOST void SortedSearch(InputIt1 a_global, int aCount, InputIt2 b_global,
328 int bCount, OutputIt1 aIndices_global, OutputIt2 bIndices_global,
329 CudaContext& context, int* aMatchCount = 0, int* bMatchCount = 0);
331 // SortedSearch specialized with
332 // TypeA = MgpuSearchTypeIndex
333 // TypeB = MgpuSearchTypeNone
334 // aMatchCount = bMatchCount = 0.
335 template<MgpuBounds Bounds, typename InputIt1, typename InputIt2,
336 typename OutputIt, typename Comp>
337 MGPU_HOST void SortedSearch(InputIt1 a_global, int aCount, InputIt2 b_global,
338 int bCount, OutputIt aIndices_global, Comp comp, CudaContext& context);
340 // SortedSearch specialized with Comp = mgpu::less<T>.
341 template<MgpuBounds Bounds, typename InputIt1, typename InputIt2,
343 MGPU_HOST void SortedSearch(InputIt1 a_global, int aCount, InputIt2 b_global,
344 int bCount, OutputIt aIndices_global, CudaContext& context);
346 // SortedEqualityCount returns the difference between upper-bound (computed by
347 // this function) and lower-bound (passed as an argument). That is, it computes
348 // the number of occurences of a key in B that match each key in A.
350 // The provided operator must have a method:
351 // int operator()(int lb, int ub) const;
352 // It returns the count given the lower- and upper-bound indices.
354 // An object SortedEqualityOp is provided:
355 // struct SortedEqualityOp {
356 // MGPU_HOST_DEVICE int operator()(int lb, int ub) const {
360 template<typename InputIt1, typename InputIt2, typename InputIt3,
361 typename OutputIt, typename Comp, typename Op>
362 MGPU_HOST void SortedEqualityCount(InputIt1 a_global, int aCount,
363 InputIt2 b_global, int bCount, InputIt3 lb_global, OutputIt counts_global,
364 Comp comp, Op op, CudaContext& context);
366 // Specialization of SortedEqualityCount with Comp = mgpu::less<T>.
367 template<typename InputIt1, typename InputIt2, typename InputIt3,
368 typename OutputIt, typename Op>
369 MGPU_HOST void SortedEqualityCount(InputIt1 a_global, int aCount,
370 InputIt2 b_global, int bCount, InputIt3 lb_global, OutputIt counts_global,
371 Op op, CudaContext& context);
373 ////////////////////////////////////////////////////////////////////////////////
374 // kernels/loadbalance.cuh
376 // LoadBalanceSearch is a special vectorized sorted search. Consider bCount
377 // objects that generate a variable number of work items, with aCount work
378 // items in total. The caller computes an exclusive scan of the work-item counts
381 // indices_global has aCount outputs. indices_global[i] is the index of the
382 // object that generated the i'th work item.
384 // work-item counts: 2, 5, 3, 0, 1.
385 // scan counts: 0, 2, 7, 10, 10. aCount = 11.
387 // LoadBalanceSearch computes the upper-bound of counting_iterator<int>(0) with
388 // the scan of the work-item counts and subtracts 1:
389 // LBS: 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 4.
391 // This is equivalent to expanding the index of each object by the object's
395 template<typename InputIt>
396 MGPU_HOST void LoadBalanceSearch(int aCount, InputIt b_global, int bCount,
397 int* indices_global, CudaContext& context);
399 ////////////////////////////////////////////////////////////////////////////////
400 // kernels/intervalmove.cuh
402 // IntervalExpand duplicates intervalCount items in values_global.
403 // indices_global is an intervalCount-sized array filled with the scan of item
404 // expand counts. moveCount is the total number of outputs (sum of expand
408 // values = 0, 1, 2, 3, 4, 5, 6, 7, 8
409 // counts = 1, 2, 1, 0, 4, 2, 3, 0, 2
410 // indices = 0, 1, 3, 4, 4, 8, 10, 13, 13 (moveCount = 15).
411 // Expand values[i] by counts[i]:
412 // output = 0, 1, 1, 2, 4, 4, 4, 4, 5, 5, 6, 6, 6, 8, 8
413 template<typename IndicesIt, typename ValuesIt, typename OutputIt>
414 MGPU_HOST void IntervalExpand(int moveCount, IndicesIt indices_global,
415 ValuesIt values_global, int intervalCount, OutputIt output_global,
416 CudaContext& context);
418 // IntervalMove is a load-balanced and vectorized device memcpy.
419 // It copies intervalCount variable-length intervals from user-defined sources
420 // to user-defined destinations. If destination intervals overlap, results are
425 // 0: 3 9 1 9 8 5 10 2 5 2
426 // 10: 8 6 5 2 4 0 8 2 5 6
427 // Scan of interval counts (indices_global):
428 // 0: 0 3 12 13 22 30 35 45 47 52
429 // 10: 54 62 68 73 75 79 79 87 89 94 (moveCount = 100).
430 // Interval gather (gather_global):
431 // 0: 75 86 17 2 67 24 37 11 95 35
432 // 10: 52 18 47 0 13 75 78 60 62 29
433 // Interval scatter (scatter_global):
434 // 0: 10 80 99 27 41 71 15 0 36 13
435 // 10: 89 49 66 97 76 76 2 25 61 55
437 // This vectorizes into 20 independent memcpy operations which are load-balanced
439 // move 0: (75, 78)->(10, 13) move 10: (52, 60)->(10, 18)
440 // move 1: (86, 95)->(80, 89) move 11: (18, 24)->(49, 55)
441 // move 2: (17, 18)->(99,100) move 12: (47, 52)->(66, 71)
442 // move 3: ( 2, 11)->(27, 36) move 13: ( 0, 2)->(97, 99)
443 // move 4: (67, 75)->(41, 49) move 14: (13, 17)->(76, 80)
444 // move 5: (24, 29)->(71, 76) move 15: (75, 75)->(76, 76)
445 // move 6: (37, 47)->(15, 25) move 16: (78, 86)->( 2, 10)
446 // move 7: (11, 13)->( 0, 3) move 17: (60, 62)->(25, 27)
447 // move 8: (95,100)->(36, 41) move 18: (62, 67)->(61, 66)
448 // move 9: (35, 37)->(13, 15) move 19: (29, 35)->(55, 61)
449 template<typename GatherIt, typename ScatterIt, typename IndicesIt,
450 typename InputIt, typename OutputIt>
451 MGPU_HOST void IntervalMove(int moveCount, GatherIt gather_global,
452 ScatterIt scatter_global, IndicesIt indices_global, int intervalCount,
453 InputIt input_global, OutputIt output_global, CudaContext& context);
455 // IntervalGather is a specialization of IntervalMove that stores output data
456 // sequentially into output_global. For the example above, IntervalGather treats
457 // scatter_global the same as indices_global.
458 template<typename GatherIt, typename IndicesIt, typename InputIt,
460 MGPU_HOST void IntervalGather(int moveCount, GatherIt gather_global,
461 IndicesIt indices_global, int intervalCount, InputIt input_global,
462 OutputIt output_global, CudaContext& context);
464 // IntervalScatter is a specialization of IntervalMove that loads input data
465 // sequentially from input_global. For the example above, IntervalScatter treats
466 // gather_global the same as indices_global.
467 template<typename ScatterIt, typename IndicesIt, typename InputIt,
469 MGPU_HOST void IntervalScatter(int moveCount, ScatterIt scatter_global,
470 IndicesIt indices_global, int intervalCount, InputIt input_global,
471 OutputIt output_global, CudaContext& context);
474 ////////////////////////////////////////////////////////////////////////////////
477 // RelationalJoin is a sort-merge join that returns indices into one of the four
482 // MgpuJoinKindOuter.
484 // A = 100, 101, 103, 103
485 // B = 100, 100, 102, 103
487 // ai, bi a[ai], b[bi]
488 // 0: (0, 0) - (100, 100) // cross-product expansion for key 100
489 // 1: (0, 1) - (100, 100)
490 // 2: (1, -) - (101, ---) // left-join for key 101
491 // 3: (-, 2) - (---, 102) // right-join for key 102
492 // 4: (3, 3) - (103, 103) // cross-product expansion for key 103
494 // MgpuJoinKindLeft drops the right-join on line 3.
495 // MgpuJoinKindRight drops the left-join on line 2.
496 // MgpuJoinKindInner drops both the left- and right-joins.
498 // The caller passes MGPU_MEM(int) pointers to hold indices. Memory is allocated
499 // by the join function using the allocator associated with the context. It
500 // returns the number of outputs.
502 // RelationalJoin performs one cudaMemcpyDeviceToHost to retrieve the size of
503 // the output array. This is a synchronous operation and may prevent queueing
504 // for callers using streams.
505 template<MgpuJoinKind Kind, typename InputIt1, typename InputIt2,
507 MGPU_HOST int RelationalJoin(InputIt1 a_global, int aCount, InputIt2 b_global,
508 int bCount, MGPU_MEM(int)* ppAJoinIndices, MGPU_MEM(int)* ppBJoinIndices,
509 Comp comp, CudaContext& context);
511 // Specialization of RelationJoil with Comp = mgpu::less<T>.
512 template<MgpuJoinKind Kind, typename InputIt1, typename InputIt2>
513 MGPU_HOST int RelationalJoin(InputIt1 a_global, int aCount, InputIt2 b_global,
514 int bCount, MGPU_MEM(int)* ppAJoinIndices, MGPU_MEM(int)* ppBJoinIndices,
515 CudaContext& context);
518 ////////////////////////////////////////////////////////////////////////////////
521 // SetOpKeys implements multiset operations with C++ set_* semantics.
523 // MgpuSetOpIntersection - like std::set_intersection
524 // MgpuSetOpUnion - like std::set_union
525 // MgpuSetOpDiff - like std::set_difference
526 // MgpuSetOpSymDiff - like std::set_symmetric_difference
528 // Setting Duplicates to false increases performance for inputs with no
529 // duplicate keys in either array.
531 // The caller passes MGPU_MEM(T) pointers to hold outputs. Memory is allocated
532 // by the multiset function using the allocator associated with the context. It
533 // returns the number of outputs.
535 // SetOpKeys performs one cudaMemcpyDeviceToHost to retrieve the size of
536 // the output array. This is a synchronous operation and may prevent queueing
537 // for callers using streams.
539 // If compact = true, SetOpKeys pre-allocates an output buffer is large as the
540 // sum of the input arrays. Partials results are computed into this temporary
541 // array before being moved into the final array. It consumes more space but
542 // results in higher performance.
543 template<MgpuSetOp Op, bool Duplicates, typename It1, typename It2,
544 typename T, typename Comp>
545 MGPU_HOST int SetOpKeys(It1 a_global, int aCount, It2 b_global, int bCount,
546 MGPU_MEM(T)* ppKeys_global, Comp comp, CudaContext& context,
547 bool compact = true);
549 // Specialization of SetOpKeys with Comp = mgpu::less<T>.
550 template<MgpuSetOp Op, bool Duplicates, typename It1, typename It2, typename T>
551 MGPU_HOST int SetOpKeys(It1 a_global, int aCount, It2 b_global, int bCount,
552 MGPU_MEM(T)* ppKeys_global, CudaContext& context, bool compact = true);
554 // SetOpPairs runs multiset operations by key and supports value exchange.
555 template<MgpuSetOp Op, bool Duplicates, typename KeysIt1, typename KeysIt2,
556 typename ValsIt1, typename ValsIt2, typename KeyType, typename ValType,
558 MGPU_HOST int SetOpPairs(KeysIt1 aKeys_global, ValsIt1 aVals_global, int aCount,
559 KeysIt2 bKeys_global, ValsIt2 bVals_global, int bCount,
560 MGPU_MEM(KeyType)* ppKeys_global, MGPU_MEM(ValType)* ppVals_global,
561 Comp comp, CudaContext& context);
563 // Specialization of SetOpPairs with Comp = mgpu::less<T>.
564 template<MgpuSetOp Op, bool Duplicates, typename KeysIt1, typename KeysIt2,
565 typename ValsIt1, typename ValsIt2, typename KeyType, typename ValType>
566 MGPU_HOST int SetOpPairs(KeysIt1 aKeys_global, ValsIt1 aVals_global, int aCount,
567 KeysIt2 bKeys_global, ValsIt2 bVals_global, int bCount,
568 MGPU_MEM(KeyType)* ppKeys_global, MGPU_MEM(ValType)* ppVals_global,
569 CudaContext& context);
571 ////////////////////////////////////////////////////////////////////////////////
572 // kernels/segreducecsr.cuh
574 // SegReducePreprocessData is defined in segreduce.cuh. It includes:
575 // - limits for CSR->tiles
576 // - packed thread codes for each thread in the reduction
577 // - (optional) CSR2 array of filtered segment offsets
578 struct SegReducePreprocessData;
580 // SegReduceCsr runs a segmented reduction given an input and a sorted list of
581 // segment start offsets. This implementation requires operators support
582 // commutative (a + b = b + a) and associative (a + (b + c) = (a + b) + c)
585 // In the segmented reduction, reduce-by-key, and Spmv documentation, "segment"
586 // and "row" are used interchangably. A
589 // InputIt data_global - Data value input.
590 // int count - Size of input array data_global.
591 // CsrIt csr_global - List of integers for start of each segment.
592 // The first entry must be 0 (indicating that the
593 // first segment starts at offset 0).
594 // Equivalent to exc-scan of segment sizes.
595 // If supportEmpty is false: must be ascending.
596 // If supportEmpty is true: must be non-descending.
597 // int numSegments - Size of segment list csr_global. Must be >= 1.
598 // bool supportEmpty - Basic seg-reduce code does not support empty
600 // Set supportEmpty = true to add pre- and post-
601 // processing to support empty segments.
602 // OutputIt dest_global - Output array for segmented reduction. Allocate
603 // numSegments elements. Should be same data type as
604 // InputIt and identity.
605 // T identity - Identity for reduction operation. Eg, use 0 for
606 // addition or 1 for multiplication.
607 // Op op - Reduction operator. Model on std::plus<>. MGPU
608 // provides operators mgpu::plus<>, minus<>,
609 // multiplies<>, modulus<>, bit_or<> bit_and<>,
610 // bit_xor<>, maximum<>, and minimum<>.
611 // CudaContext& context - MGPU context support object. All kernels are
612 // launched on the associated stream.
613 template<typename InputIt, typename CsrIt, typename OutputIt, typename T,
615 MGPU_HOST void SegReduceCsr(InputIt data_global, int count, CsrIt csr_global,
616 int numSegments, bool supportEmpty, OutputIt dest_global, T identity, Op op,
617 CudaContext& context);
619 // IndirectReduceCsr is like SegReduceCsr but with one level of source
620 // indirection. The start of each segment/row i in data_global starts at
621 // sources_global[i].
622 // SourcesIt sources_global - List of integers for source data of each segment.
623 // Must be numSegments in size.
624 template<typename InputIt, typename CsrIt, typename SourcesIt,
625 typename OutputIt, typename T, typename Op>
626 MGPU_HOST void IndirectReduceCsr(InputIt data_global, int count,
627 CsrIt csr_global, SourcesIt sources_global, int numSegments,
628 bool supportEmpty, OutputIt dest_global, T identity, Op op,
629 CudaContext& context);
631 // SegReduceCsrPreprocess accelerates multiple seg-reduce calls on different
632 // data with the same segment geometry. Partitioning and CSR->CSR2 transform is
633 // off-loaded to a preprocessing pass. The actual reduction is evaluated by
635 template<typename T, typename CsrIt>
636 MGPU_HOST void SegReduceCsrPreprocess(int count, CsrIt csr_global,
637 int numSegments, bool supportEmpty,
638 std::auto_ptr<SegReducePreprocessData>* ppData, CudaContext& context);
640 template<typename InputIt, typename DestIt, typename T, typename Op>
641 MGPU_HOST void SegReduceApply(const SegReducePreprocessData& preprocess,
642 InputIt data_global, T identity, Op op, DestIt dest_global,
643 CudaContext& context);
645 ////////////////////////////////////////////////////////////////////////////////
646 // kernels/reducebykey.csr
648 typedef SegReducePreprocessData ReduceByKeyPreprocessData;
650 // ReduceByKey runs a segmented reduction given a data input and a matching set
651 // of keys. This implementation requires operators support commutative
652 // (a + b = b + a) and associative (a + (b + c) = (a + b) + c) evaluation.
653 // It roughly matches the behavior of thrust::reduce_by_key.
655 // KeysIt keys_global - Key identifier for the segment.
656 // InputIt data_global - Data value input.
657 // int count - Size of input arrays keys_global and
659 // ValType identity - Identity for reduction operation. Eg, use 0 for
660 // addition or 1 for multiplication.
661 // Op op - Reduction operator. Model on std::plus<>. MGPU
662 // provides operators mgpu::plus<>, minus<>,
663 // multiplies<>, modulus<>, bit_or<> bit_and<>,
664 // bit_xor<>, maximum<>, and minimum<>.
665 // Comp comp - Operator for comparing adjacent adjacent keys.
666 // Must return true if two adjacent keys are in the
667 // same segment. Use mgpu::equal_to<KeyType>() by
669 // KeyType* keysDest_global - If this pointer is not null, return the first
670 // key from each segment. Must be sized to at least
671 // the number of segments.
672 // DestIt dest_global - Holds the reduced data. Must be sized to at least
673 // the number of segments.
674 // int* count_host - The number of segments, returned in host memory.
676 // int* count_global - The number of segments, returned in device memory.
677 // This avoids a D->H synchronization. May be null.
678 // CudaContext& context - MGPU context support object.
679 template<typename KeysIt, typename InputIt, typename DestIt,
680 typename KeyType, typename ValType, typename Op, typename Comp>
681 MGPU_HOST void ReduceByKey(KeysIt keys_global, InputIt data_global, int count,
682 ValType identity, Op op, Comp comp, KeyType* keysDest_global,
683 DestIt dest_global, int* count_host, int* count_global,
684 CudaContext& context);
686 // ReduceByKeyPreprocess accelerates multiple reduce-by-key calls on different
687 // data with the same segment geometry. The actual reduction is evaluated by
689 // Note that the caller must explicitly specify the ValType argument. Kernel
690 // tunings are based on the value type, not the key type.
691 template<typename ValType, typename KeyType, typename KeysIt, typename Comp>
692 MGPU_HOST void ReduceByKeyPreprocess(int count, KeysIt keys_global,
693 KeyType* keysDest_global, Comp comp, int* count_host, int* count_global,
694 std::auto_ptr<ReduceByKeyPreprocessData>* ppData, CudaContext& context);
696 template<typename InputIt, typename DestIt, typename T, typename Op>
697 MGPU_HOST void ReduceByKeyApply(const ReduceByKeyPreprocessData& preprocess,
698 InputIt data_global, T identity, Op op, DestIt dest_global,
699 CudaContext& context);
701 ////////////////////////////////////////////////////////////////////////////////
702 // kernels/spmvcsr.cuh
704 typedef SegReducePreprocessData SpmvPreprocessData;
706 // SpmvCsr[Unary|Binary] evaluates the product of a sparse matrix (CSR format)
707 // with a dense vector.
708 // SpmvCsrIndirect[Unary|Binary] uses indirection to lookup the start of each
709 // matrix_global and cols_global on a per-row basis.
711 // Unary methods reduce on the right-hand side vector values.
712 // Binary methods reduce the product of the left-hand side matrix value with the
713 // right-hand side vector values.
715 // MatrixIt matrix_global - Left-hand side data for binary Spmv. There are nz
716 // non-zero matrix elements. These are loaded and
717 // combined with the vector values with mulOp.
718 // ColsIt cols_global - Row identifiers for the right-hand side of the
719 // matrix/value products. If element i is the k'th
720 // non-zero in row j, the product is formed as
721 // matrix_global[i] * vec_global[cols_global[i]]
722 // for direct indexing, or,
723 // m = source_global[j] + k
724 // matrix_global[m] * vec_global[cols_global[m]].
725 // int nz - Number of non-zeros in LHS matrix. Size of
726 // matrix_global and cols_global.
727 // CsrIt csr_global - List of integers for start of each row.
728 // The first entry must be 0 (indicating that the
729 // first row starts at offset 0).
730 // Equivalent to exc-scan of row sizes.
731 // If supportEmpty is false: must be ascending.
732 // If supportEmpty is true: must be non-descending.
733 // SourcesIt sources_global - An indirection array to source each row's data.
734 // The size of each row i is
735 // size_i = csr_global[i + 1] - csr_global[i].
736 // The starting offset for both the data and column
738 // offset_i = sources_global[i].
739 // The direct Spmv methods (i.e. those not taking
740 // a sources_global parameter) can be thought of as
741 // indirect methods with sources_global = csr_global.
742 // int numRows - Size of segment list csr_global. Must be >= 1.
743 // VecIt vec_global - Input array. Size is the width of the matrix.
744 // For unary Spmv, these values are reduced.
745 // For binary Spmv, the products of the matrix and
746 // vector values are reduced.
747 // bool supportEmpty - Basic seg-reduce code does not support empty rows.
748 // Set supportEmpty = true to add pre- and post-
749 // processing to support empty rows.
750 // DestIt dest_global - Output array. Must be numRows in size.
751 // T identity - Identity for reduction operation. Eg, use 0 for
752 // addition or 1 for multiplication.
753 // MulOp mulOp - Reduction operator for combining matrix value with
754 // vector value. Only defined for binary Spmv.
755 // Use mgpu::multiplies<T>() for default behavior.
756 // AddOp addOp - Reduction operator for reducing vector values
757 // (unary Spmv) or matrix-vector products (binary
758 // Spmv). Use mgpu::plus<T>() for default behavior.
759 // CudaContext& context - MGPU context support object. All kernels are
760 // launched on the associated stream.
761 template<typename ColsIt, typename CsrIt, typename VecIt, typename DestIt,
762 typename T, typename AddOp>
763 MGPU_HOST void SpmvCsrUnary(ColsIt cols_global, int nz, CsrIt csr_global,
764 int numRows, VecIt vec_global, bool supportEmpty, DestIt dest_global,
765 T identity, AddOp addOp, CudaContext& context);
767 template<typename MatrixIt, typename ColsIt, typename CsrIt, typename VecIt,
768 typename DestIt, typename T, typename MulOp, typename AddOp>
769 MGPU_HOST void SpmvCsrBinary(MatrixIt matrix_global, ColsIt cols_global,
770 int nz, CsrIt csr_global, int numRows, VecIt vec_global,
771 bool supportEmpty, DestIt dest_global, T identity, MulOp mulOp, AddOp addOp,
772 CudaContext& context);
774 template<typename ColsIt, typename CsrIt, typename SourcesIt, typename VecIt,
775 typename DestIt, typename T, typename AddOp>
776 MGPU_HOST void SpmvCsrIndirectUnary(ColsIt cols_global, int nz,
777 CsrIt csr_global, SourcesIt sources_global, int numRows, VecIt vec_global,
778 bool supportEmpty, DestIt dest_global, T identity, AddOp addOp,
779 CudaContext& context);
781 template<typename MatrixIt, typename ColsIt, typename CsrIt, typename SourcesIt,
782 typename VecIt, typename DestIt, typename T, typename MulOp, typename AddOp>
783 MGPU_HOST void SpmvCsrIndirectBinary(MatrixIt matrix_global, ColsIt cols_global,
784 int nz, CsrIt csr_global, SourcesIt sources_global, int numRows,
785 VecIt vec_global, bool supportEmpty, DestIt dest_global, T identity,
786 MulOp mulOp, AddOp addOp, CudaContext& context);
788 // SpmvPreprocess[Unary|Binary] accelerates multiple Spmv calls on different
789 // matrix/vector pairs with the same matrix geometry. The actual reduction is
790 // evaluated Spmv[Unary|Binary]Apply.
791 template<typename T, typename CsrIt>
792 MGPU_HOST void SpmvPreprocessUnary(int nz, CsrIt csr_global, int numRows,
793 bool supportEmpty, std::auto_ptr<SpmvPreprocessData>* ppData,
794 CudaContext& context);
796 template<typename T, typename CsrIt>
797 MGPU_HOST void SpmvPreprocessBinary(int nz, CsrIt csr_global, int numRows,
798 bool supportEmpty, std::auto_ptr<SpmvPreprocessData>* ppData,
799 CudaContext& context);
801 template<typename ColsIt, typename VecIt, typename DestIt, typename T,
802 typename MulOp, typename AddOp>
803 MGPU_HOST void SpmvUnaryApply(const SpmvPreprocessData& preprocess,
804 ColsIt cols_global, VecIt vec_global, DestIt dest_global, T identity,
805 AddOp addOp, CudaContext& context);
807 template<typename MatrixIt, typename ColsIt, typename VecIt, typename DestIt,
808 typename T, typename MulOp, typename AddOp>
809 MGPU_HOST void SpmvBinaryApply(const SpmvPreprocessData& preprocess,
810 MatrixIt matrix_global, ColsIt cols_global, VecIt vec_global,
811 DestIt dest_global, T identity, MulOp mulOp, AddOp addOp,
812 CudaContext& context);