libhoomd/extern/mgpuhost.cuh

   1 /******************************************************************************
   2  * Copyright (c) 2013, NVIDIA CORPORATION.  All rights reserved.
   3  *
   4  * Redistribution and use in source and binary forms, with or without
   5  * modification, are permitted provided that the following conditions are met:
   6  *     * Redistributions of source code must retain the above copyright
   7  *       notice, this list of conditions and the following disclaimer.
   8  *     * Redistributions in binary form must reproduce the above copyright
   9  *       notice, this list of conditions and the following disclaimer in the
  10  *       documentation and/or other materials provided with the distribution.
  11  *     * Neither the name of the NVIDIA CORPORATION nor the
  12  *       names of its contributors may be used to endorse or promote products
  13  *       derived from this software without specific prior written permission.
  14  *
  15  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  16  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  18  * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
  19  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  20  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  21  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  22  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  23  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  24  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  25  *
  26  ******************************************************************************/
  27
  28 /******************************************************************************
  29  *
  30  * Code and text by Sean Baxter, NVIDIA Research
  31  * See http://nvlabs.github.io/moderngpu for repository and documentation.
  32  *
  33  ******************************************************************************/
  34
  35 #pragma once
  36
  37 #include "mgpudevice.cuh"
  38 #include "util/mgpucontext.h"
  39
  40 namespace mgpu {
  41
  42 ////////////////////////////////////////////////////////////////////////////////
  43 // kernels/reduce.cuh
  44
  45 // Reduce input and return variable in device memory or host memory, or both.
  46 // Provide a non-null pointer to retrieve data.
  47 template<typename InputIt, typename T, typename Op>
  48 MGPU_HOST void Reduce(InputIt data_global, int count, T identity, Op op,
  49         T* reduce_global, T* reduce_host, CudaContext& context);
  50
  51 // T = std::iterator_traits<InputIt>::value_type.
  52 // Reduce with identity = 0 and op = mgpu::plus<T>.
  53 // Returns the value in host memory.
  54 template<typename InputIt>
  55 MGPU_HOST typename std::iterator_traits<InputIt>::value_type
  56 Reduce(InputIt data_global, int count, CudaContext& context);
  57
  58
  59 ////////////////////////////////////////////////////////////////////////////////
  60 // kernels/scan.cuh
  61
  62 // Scan inputs in device memory.
  63 // MgpuScanType may be:
  64 //              MgpuScanTypeExc (exclusive) or
  65 //              MgpuScanTypeInc (inclusive).
  66 // Return the total in device memory, host memory, or both.
  67 template<MgpuScanType Type, typename DataIt, typename T, typename Op,
  68         typename DestIt>
  69 MGPU_HOST void Scan(DataIt data_global, int count, T identity, Op op,
  70         T* reduce_global, T* reduce_host, DestIt dest_global,
  71         CudaContext& context);
  72
  73 // Exclusive scan with identity = 0 and op = mgpu::plus<T>.
  74 // Returns the total in host memory.
  75 template<typename InputIt, typename TotalType>
  76 MGPU_HOST void ScanExc(InputIt data_global, int count, TotalType* total,
  77         CudaContext& context);
  78
  79 // Like above, but don't return the total.
  80 template<typename InputIt>
  81 MGPU_HOST void ScanExc(InputIt data_global, int count, CudaContext& context);
  82
  83 ////////////////////////////////////////////////////////////////////////////////
  84 // kernels/bulkremove.cuh
  85
  86 // Compact the elements in source_global by removing elements identified by
  87 // indices_global. indices_global must be unique, sorted, and range between 0
  88 // and sourceCount - 1. The number of outputs is sourceCount - indicesCount.
  89
  90 // IndicesIt should resolve to an integer type. iterators like step_iterator
  91 // are supported.
  92
  93 // If sourceCount = 10, indicesCount = 6, and indices = (1, 3, 4, 5, 7, 8), then
  94 // dest = A0 A2 A6 A9. (All indices between 0 and sourceCount - 1 except those
  95 // in indices_global).
  96 template<typename InputIt, typename IndicesIt, typename OutputIt>
  97 MGPU_HOST void BulkRemove(InputIt source_global, int sourceCount,
  98         IndicesIt indices_global, int indicesCount, OutputIt dest_global,
  99         CudaContext& context);
 100
 101
 102 ////////////////////////////////////////////////////////////////////////////////
 103 // kernels/bulkinsert.cuh
 104
 105 // Combine aCount elements in a_global with bCount elements in b_global.
 106 // Each element a_global[i] is inserted before position indices_global[i] and
 107 // stored to dest_global. The insertion indices are relative to the B array,
 108 // not the output. Indices must be sorted but not necessarily unique.
 109
 110 // If aCount = 5, bCount = 3, and indices = (1, 1, 2, 3, 3), the output is:
 111 // B0 A0 A1 B1 A2 B2 A3 A4.
 112 template<typename InputIt1, typename IndicesIt, typename InputIt2,
 113         typename OutputIt>
 114 MGPU_HOST void BulkInsert(InputIt1 a_global, IndicesIt indices_global,
 115         int aCount, InputIt2 b_global, int bCount, OutputIt dest_global,
 116         CudaContext& context);
 117
 118
 119 ////////////////////////////////////////////////////////////////////////////////
 120 // kernels/merge.cuh
 121
 122 // MergeKeys merges two arrays of sorted inputs with C++-comparison semantics.
 123 // aCount items from aKeys_global and bCount items from bKeys_global are merged
 124 // into aCount + bCount items in keys_global.
 125 // Comp is a comparator type supporting strict weak ordering.
 126 // If !comp(b, a), then a is placed before b in the output.
 127 template<typename KeysIt1, typename KeysIt2, typename KeysIt3, typename Comp>
 128 MGPU_HOST void MergeKeys(KeysIt1 aKeys_global, int aCount, KeysIt2 bKeys_global,
 129         int bCount, KeysIt3 keys_global, Comp comp, CudaContext& context);
 130
 131 // MergeKeys specialized with Comp = mgpu::less<T>.
 132 template<typename KeysIt1, typename KeysIt2, typename KeysIt3>
 133 MGPU_HOST void MergeKeys(KeysIt1 aKeys_global, int aCount, KeysIt2 bKeys_global,
 134         int bCount, KeysIt3 keys_global, CudaContext& context);
 135
 136 // MergePairs merges two arrays of sorted inputs by key and copies values.
 137 // If !comp(bKey, aKey), then aKey is placed before bKey in the output, and
 138 // the corresponding aData is placed before bData. This corresponds to *_by_key
 139 // functions in Thrust.
 140 template<typename KeysIt1, typename KeysIt2, typename KeysIt3, typename ValsIt1,
 141         typename ValsIt2, typename ValsIt3, typename Comp>
 142 MGPU_HOST void MergePairs(KeysIt1 aKeys_global, ValsIt1 aVals_global,
 143         int aCount, KeysIt2 bKeys_global, ValsIt2 bVals_global, int bCount,
 144         KeysIt3 keys_global, ValsIt3 vals_global, Comp comp, CudaContext& context);
 145
 146 // MergePairs specialized with Comp = mgpu::less<T>.
 147 template<typename KeysIt1, typename KeysIt2, typename KeysIt3, typename ValsIt1,
 148         typename ValsIt2, typename ValsIt3>
 149 MGPU_HOST void MergePairs(KeysIt1 aKeys_global, ValsIt1 aVals_global,
 150         int aCount, KeysIt2 bKeys_global, ValsIt2 bVals_global, int bCount,
 151         KeysIt3 keys_global, ValsIt3 vals_global, CudaContext& context);
 152
 153
 154 ////////////////////////////////////////////////////////////////////////////////
 155 // kernels/mergesort.cuh
 156
 157 // MergesortKeys sorts data_global using comparator Comp.
 158 // If !comp(b, a), then a comes before b in the output. The data is sorted
 159 // in-place.
 160 template<typename T, typename Comp>
 161 MGPU_HOST void MergesortKeys(T* data_global, int count, Comp comp,
 162         CudaContext& context);
 163
 164 // MergesortKeys specialized with Comp = mgpu::less<T>.
 165 template<typename T>
 166 MGPU_HOST void MergesortKeys(T* data_global, int count, CudaContext& context);
 167
 168 // MergesortPairs sorts data by key, copying data. This corresponds to
 169 // sort_by_key in Thrust.
 170 template<typename KeyType, typename ValType, typename Comp>
 171 MGPU_HOST void MergesortPairs(KeyType* keys_global, ValType* values_global,
 172         int count, Comp comp, CudaContext& context);
 173
 174 // MergesortPairs specialized with Comp = mgpu::less<KeyType>.
 175 template<typename KeyType, typename ValType>
 176 MGPU_HOST void MergesortPairs(KeyType* keys_global, ValType* values_global,
 177         int count, CudaContext& context);
 178
 179 // MergesortIndices is like MergesortPairs where values_global is treated as
 180 // if initialized with integers (0 ... count - 1).
 181 template<typename KeyType, typename Comp>
 182 MGPU_HOST void MergesortIndices(KeyType* keys_global, int* values_global,
 183         int count, Comp comp, CudaContext& context);
 184
 185 // MergesortIndices specialized with Comp = mgpu::less<KeyType>.
 186 template<typename KeyType>
 187 MGPU_HOST void MergesortIndices(KeyType* keys_global, int* values_global,
 188         int count, CudaContext& context);
 189
 190
 191 ////////////////////////////////////////////////////////////////////////////////
 192 // kernels/segmentedsort.cuh
 193
 194 // Mergesort count items in-place in data_global. Keys are compared with Comp
 195 // (as they are in MergesortKeys), however keys remain inside the segments
 196 // defined by flags_global.
 197
 198 // flags_global is a bitfield cast to uint*. Each bit in flags_global is a
 199 // segment head flag. Only keys between segment head flags (inclusive on the
 200 // left and exclusive on the right) may be exchanged. The first element is
 201 // assumed to start a segment, regardless of the value of bit 0.
 202
 203 // Passing verbose=true causes the function to print mergepass statistics to the
 204 // console. This may be helpful for developers to understand the performance
 205 // characteristics of the function and how effectively it early-exits merge
 206 // operations.
 207 template<typename T, typename Comp>
 208 MGPU_HOST void SegSortKeysFromFlags(T* data_global, int count,
 209         const uint* flags_global, CudaContext& context, Comp comp,
 210         bool verbose = false);
 211
 212 // SegSortKeysFromFlags specialized with Comp = mgpu::less<T>.
 213 template<typename T>
 214 MGPU_HOST void SegSortKeysFromFlags(T* data_global, int count,
 215         const uint* flags_global, CudaContext& context, bool verbose = false);
 216
 217 // Segmented sort using head flags and supporting value exchange.
 218 template<bool Stable, typename KeyType, typename ValType, typename Comp>
 219 MGPU_HOST void SegSortPairsFromFlags(KeyType* keys_global,
 220         ValType* values_global, const uint* flags_global, int count,
 221         CudaContext& context, Comp comp, bool verbose = false);
 222
 223 // SegSortPairsFromFlags specialized with Comp = mgpu::less<T>.
 224 template<bool Stable, typename KeyType, typename ValType>
 225 MGPU_HOST void SegSortPairsFromFlags(KeyType* keys_global,
 226         ValType* values_global, const uint* flags_global, int count,
 227         CudaContext& context, bool verbose = false);
 228
 229 // Segmented sort using segment indices rather than head flags. indices_global
 230 // is a sorted and unique list of indicesCount segment start locations. These
 231 // indices correspond to the set bits in the flags_global field. A segment
 232 // head index for position 0 may be omitted.
 233 template<typename T, typename Comp>
 234 MGPU_HOST void SegSortKeysFromIndices(T* data_global, int count,
 235         const int* indices_global, int indicesCount, CudaContext& context,
 236         Comp comp, bool verbose = false);
 237
 238 // SegSortKeysFromIndices specialized with Comp = mgpu::less<T>.
 239 template<typename T>
 240 MGPU_HOST void SegSortKeysFromIndices(T* data_global, int count,
 241         const int* indices_global, int indicesCount, CudaContext& context,
 242         bool verbose = false);
 243
 244 // Segmented sort using segment indices and supporting value exchange.
 245 template<typename KeyType, typename ValType, typename Comp>
 246 MGPU_HOST void SegSortPairsFromIndices(KeyType* keys_global,
 247         ValType* values_global, int count, const int* indices_global,
 248         int indicesCount, CudaContext& context, Comp comp, bool verbose = false);
 249
 250 // SegSortPairsFromIndices specialized with Comp = mgpu::less<KeyType>.
 251 template<typename KeyType, typename ValType>
 252 MGPU_HOST void SegSortPairsFromIndices(KeyType* keys_global,
 253         ValType* values_global, int count, const int* indices_global,
 254         int indicesCount, CudaContext& context, bool verbose = false);
 255
 256
 257 ////////////////////////////////////////////////////////////////////////////////
 258 // kernels/localitysort.cuh
 259
 260 // LocalitySortKeys is a version of MergesortKeys optimized for non-uniformly
 261 // random input arrays. If the keys begin close to their sorted destinations,
 262 // this function may exploit the structure to early-exit merge passes.
 263
 264 // Passing verbose=true causes the function to print mergepass statistics to the
 265 // console. This may be helpful for developers to understand the performance
 266 // characteristics of the function and how effectively it early-exits merge
 267 // operations.
 268 template<typename T, typename Comp>
 269 MGPU_HOST void LocalitySortKeys(T* data_global, int count, CudaContext& context,
 270         Comp comp, bool verbose = false);
 271
 272 // LocalitySortKeys specialized with Comp = mgpu::less<T>.
 273 template<typename T>
 274 MGPU_HOST void LocalitySortKeys(T* data_global, int count, CudaContext& context,
 275         bool verbose = false);
 276
 277 // Locality sort supporting value exchange.
 278 template<typename KeyType, typename ValType, typename Comp>
 279 MGPU_HOST void LocalitySortPairs(KeyType* keys_global, ValType* values_global,
 280         int count, CudaContext& context, Comp comp, bool verbose = false);
 281
 282 // LocalitySortPairs specialized with Comp = mpgu::less<T>.
 283 template<typename KeyType, typename ValType>
 284 MGPU_HOST void LocalitySortPairs(KeyType* keys_global, ValType* values_global,
 285         int count, CudaContext& context, bool verbose = false);
 286
 287
 288 ////////////////////////////////////////////////////////////////////////////////
 289 // kernels/sortedsearch.cuh
 290
 291 // Vectorized sorted search. This is the most general form of the function.
 292 // Executes two simultaneous linear searches on two sorted inputs.
 293
 294 // Bounds:
 295 //              MgpuBoundsLower -
 296 //                      lower-bound search of A into B.
 297 //                      upper-bound search of B into A.
 298 //              MgpuBoundsUpper -
 299 //                      upper-bound search of A into B.
 300 //                      lower-bound search of B into A.
 301 // Type[A|B]:
 302 //              MgpuSearchTypeNone - no output for this input.
 303 //              MgpuSearchTypeIndex - return search indices as integers.
 304 //              MgpuSearchTypeMatch - return 0 (no match) or 1 (match).
 305 //                      For TypeA, returns 1 if there is at least 1 matching element in B
 306 //                              for element in A.
 307 //                      For TypeB, returns 1 if there is at least 1 matching element in A
 308 //                              for element in B.
 309 //              MgpuSearchTypeIndexMatch - return search indices as integers. Most
 310 //                      significant bit is match bit.
 311 //      aMatchCount, bMatchCount - If Type is Match or IndexMatch, return the total
 312 //              number of elements in A or B with matches in B or A, if the pointer is
 313 //              not null. This generates a synchronous cudaMemcpyDeviceToHost call that
 314 //              callers using streams should be aware of.
 315 template<MgpuBounds Bounds, MgpuSearchType TypeA, MgpuSearchType TypeB,
 316         typename InputIt1, typename InputIt2, typename OutputIt1,
 317         typename OutputIt2, typename Comp>
 318 MGPU_HOST void SortedSearch(InputIt1 a_global, int aCount, InputIt2 b_global,
 319         int bCount, OutputIt1 aIndices_global, OutputIt2 bIndices_global,
 320         Comp comp, CudaContext& context, int* aMatchCount = 0,
 321         int* bMatchCount = 0);
 322
 323 // SortedSearch specialized with Comp = mgpu::less<T>.
 324 template<MgpuBounds Bounds, MgpuSearchType TypeA, MgpuSearchType TypeB,
 325         typename InputIt1, typename InputIt2, typename OutputIt1,
 326         typename OutputIt2>
 327 MGPU_HOST void SortedSearch(InputIt1 a_global, int aCount, InputIt2 b_global,
 328         int bCount, OutputIt1 aIndices_global, OutputIt2 bIndices_global,
 329         CudaContext& context, int* aMatchCount = 0, int* bMatchCount = 0);
 330
 331 // SortedSearch specialized with
 332 // TypeA = MgpuSearchTypeIndex
 333 // TypeB = MgpuSearchTypeNone
 334 // aMatchCount = bMatchCount = 0.
 335 template<MgpuBounds Bounds, typename InputIt1, typename InputIt2,
 336         typename OutputIt, typename Comp>
 337 MGPU_HOST void SortedSearch(InputIt1 a_global, int aCount, InputIt2 b_global,
 338         int bCount, OutputIt aIndices_global, Comp comp, CudaContext& context);
 339
 340 // SortedSearch specialized with Comp = mgpu::less<T>.
 341 template<MgpuBounds Bounds, typename InputIt1, typename InputIt2,
 342         typename OutputIt>
 343 MGPU_HOST void SortedSearch(InputIt1 a_global, int aCount, InputIt2 b_global,
 344         int bCount, OutputIt aIndices_global, CudaContext& context);
 345
 346 // SortedEqualityCount returns the difference between upper-bound (computed by
 347 // this function) and lower-bound (passed as an argument). That is, it computes
 348 // the number of occurences of a key in B that match each key in A.
 349
 350 // The provided operator must have a method:
 351 //              int operator()(int lb, int ub) const;
 352 // It returns the count given the lower- and upper-bound indices.
 353 //
 354 // An object SortedEqualityOp is provided:
 355 //              struct SortedEqualityOp {
 356 //                      MGPU_HOST_DEVICE int operator()(int lb, int ub) const {
 357 //                              return ub - lb;
 358 //                      }
 359 //              };
 360 template<typename InputIt1, typename InputIt2, typename InputIt3,
 361         typename OutputIt, typename Comp, typename Op>
 362 MGPU_HOST void SortedEqualityCount(InputIt1 a_global, int aCount,
 363         InputIt2 b_global, int bCount, InputIt3 lb_global, OutputIt counts_global,
 364         Comp comp, Op op, CudaContext& context);
 365
 366 // Specialization of SortedEqualityCount with Comp = mgpu::less<T>.
 367 template<typename InputIt1, typename InputIt2, typename InputIt3,
 368         typename OutputIt, typename Op>
 369 MGPU_HOST void SortedEqualityCount(InputIt1 a_global, int aCount,
 370         InputIt2 b_global, int bCount, InputIt3 lb_global, OutputIt counts_global,
 371         Op op, CudaContext& context);
 372
 373 ////////////////////////////////////////////////////////////////////////////////
 374 // kernels/loadbalance.cuh
 375
 376 // LoadBalanceSearch is a special vectorized sorted search. Consider bCount
 377 // objects that generate a variable number of work items, with aCount work
 378 // items in total. The caller computes an exclusive scan of the work-item counts
 379 // into b_global.
 380
 381 // indices_global has aCount outputs. indices_global[i] is the index of the
 382 // object that generated the i'th work item.
 383 // Eg:
 384 // work-item counts:  2,  5,  3,  0,  1.
 385 // scan counts:       0,  2,  7, 10, 10.   aCount = 11.
 386 //
 387 // LoadBalanceSearch computes the upper-bound of counting_iterator<int>(0) with
 388 // the scan of the work-item counts and subtracts 1:
 389 // LBS: 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 4.
 390
 391 // This is equivalent to expanding the index of each object by the object's
 392 // work-item count.
 393
 394
 395 template<typename InputIt>
 396 MGPU_HOST void LoadBalanceSearch(int aCount, InputIt b_global, int bCount,
 397         int* indices_global, CudaContext& context);
 398
 399 ////////////////////////////////////////////////////////////////////////////////
 400 // kernels/intervalmove.cuh
 401
 402 // IntervalExpand duplicates intervalCount items in values_global.
 403 // indices_global is an intervalCount-sized array filled with the scan of item
 404 // expand counts. moveCount is the total number of outputs (sum of expand
 405 // counts).
 406
 407 // Eg:
 408 //              values  =  0,  1,  2,  3,  4,  5,  6,  7,  8
 409 //              counts  =  1,  2,  1,  0,  4,  2,  3,  0,  2
 410 //              indices =  0,  1,  3,  4,  4,  8, 10, 13, 13 (moveCount = 15).
 411 // Expand values[i] by counts[i]:
 412 // output  =  0, 1, 1, 2, 4, 4, 4, 4, 5, 5, 6, 6, 6, 8, 8
 413 template<typename IndicesIt, typename ValuesIt, typename OutputIt>
 414 MGPU_HOST void IntervalExpand(int moveCount, IndicesIt indices_global,
 415         ValuesIt values_global, int intervalCount, OutputIt output_global,
 416         CudaContext& context);
 417
 418 // IntervalMove is a load-balanced and vectorized device memcpy.
 419 // It copies intervalCount variable-length intervals from user-defined sources
 420 // to user-defined destinations. If destination intervals overlap, results are
 421 // undefined.
 422
 423 // Eg:
 424 // Interval counts:
 425 //    0:     3    9    1    9    8    5   10    2    5    2
 426 //   10:     8    6    5    2    4    0    8    2    5    6
 427 // Scan of interval counts (indices_global):
 428 //    0:     0    3   12   13   22   30   35   45   47   52
 429 //   10:    54   62   68   73   75   79   79   87   89   94  (moveCount = 100).
 430 // Interval gather (gather_global):
 431 //    0:    75   86   17    2   67   24   37   11   95   35
 432 //   10:    52   18   47    0   13   75   78   60   62   29
 433 // Interval scatter (scatter_global):
 434 //    0:    10   80   99   27   41   71   15    0   36   13
 435 //   10:    89   49   66   97   76   76    2   25   61   55
 436
 437 // This vectorizes into 20 independent memcpy operations which are load-balanced
 438 // across CTAs:
 439 // move 0: (75, 78)->(10, 13)       move 10: (52, 60)->(10, 18)
 440 // move 1: (86, 95)->(80, 89)       move 11: (18, 24)->(49, 55)
 441 // move 2: (17, 18)->(99,100)       move 12: (47, 52)->(66, 71)
 442 // move 3: ( 2, 11)->(27, 36)       move 13: ( 0,  2)->(97, 99)
 443 // move 4: (67, 75)->(41, 49)       move 14: (13, 17)->(76, 80)
 444 // move 5: (24, 29)->(71, 76)       move 15: (75, 75)->(76, 76)
 445 // move 6: (37, 47)->(15, 25)       move 16: (78, 86)->( 2, 10)
 446 // move 7: (11, 13)->( 0,  3)       move 17: (60, 62)->(25, 27)
 447 // move 8: (95,100)->(36, 41)       move 18: (62, 67)->(61, 66)
 448 // move 9: (35, 37)->(13, 15)       move 19: (29, 35)->(55, 61)
 449 template<typename GatherIt, typename ScatterIt, typename IndicesIt,
 450         typename InputIt, typename OutputIt>
 451 MGPU_HOST void IntervalMove(int moveCount, GatherIt gather_global,
 452         ScatterIt scatter_global, IndicesIt indices_global, int intervalCount,
 453         InputIt input_global, OutputIt output_global, CudaContext& context);
 454
 455 // IntervalGather is a specialization of IntervalMove that stores output data
 456 // sequentially into output_global. For the example above, IntervalGather treats
 457 // scatter_global the same as indices_global.
 458 template<typename GatherIt, typename IndicesIt, typename InputIt,
 459         typename OutputIt>
 460 MGPU_HOST void IntervalGather(int moveCount, GatherIt gather_global,
 461         IndicesIt indices_global, int intervalCount, InputIt input_global,
 462         OutputIt output_global, CudaContext& context);
 463
 464 // IntervalScatter is a specialization of IntervalMove that loads input data
 465 // sequentially from input_global. For the example above, IntervalScatter treats
 466 // gather_global the same as indices_global.
 467 template<typename ScatterIt, typename IndicesIt, typename InputIt,
 468         typename OutputIt>
 469 MGPU_HOST void IntervalScatter(int moveCount, ScatterIt scatter_global,
 470         IndicesIt indices_global, int intervalCount, InputIt input_global,
 471         OutputIt output_global, CudaContext& context);
 472
 473
 474 ////////////////////////////////////////////////////////////////////////////////
 475 // kernels/join.cuh
 476
 477 // RelationalJoin is a sort-merge join that returns indices into one of the four
 478 // relational joins:
 479 //              MgpuJoinKindInner
 480 //              MgpuJoinKindLeft
 481 //              MgpuJoinKindRight
 482 //              MgpuJoinKindOuter.
 483
 484 // A =  100, 101, 103, 103
 485 // B =  100, 100, 102, 103
 486 // Outer join:
 487 //     ai, bi   a[ai], b[bi]
 488 // 0:  (0, 0) -  (100, 100)    // cross-product expansion for key 100
 489 // 1:  (0, 1) -  (100, 100)
 490 // 2:  (1, -) -  (101, ---)    // left-join for key 101
 491 // 3:  (-, 2) -  (---, 102)    // right-join for key 102
 492 // 4:  (3, 3) -  (103, 103)    // cross-product expansion for key 103
 493
 494 // MgpuJoinKindLeft drops the right-join on line 3.
 495 // MgpuJoinKindRight drops the left-join on line 2.
 496 // MgpuJoinKindInner drops both the left- and right-joins.
 497
 498 // The caller passes MGPU_MEM(int) pointers to hold indices. Memory is allocated
 499 // by the join function using the allocator associated with the context. It
 500 // returns the number of outputs.
 501
 502 // RelationalJoin performs one cudaMemcpyDeviceToHost to retrieve the size of
 503 // the output array. This is a synchronous operation and may prevent queueing
 504 // for callers using streams.
 505 template<MgpuJoinKind Kind, typename InputIt1, typename InputIt2,
 506         typename Comp>
 507 MGPU_HOST int RelationalJoin(InputIt1 a_global, int aCount, InputIt2 b_global,
 508         int bCount, MGPU_MEM(int)* ppAJoinIndices, MGPU_MEM(int)* ppBJoinIndices,
 509         Comp comp, CudaContext& context);
 510
 511 // Specialization of RelationJoil with Comp = mgpu::less<T>.
 512 template<MgpuJoinKind Kind, typename InputIt1, typename InputIt2>
 513 MGPU_HOST int RelationalJoin(InputIt1 a_global, int aCount, InputIt2 b_global,
 514         int bCount, MGPU_MEM(int)* ppAJoinIndices, MGPU_MEM(int)* ppBJoinIndices,
 515         CudaContext& context);
 516
 517
 518 ////////////////////////////////////////////////////////////////////////////////
 519 // kernels/sets.cuh
 520
 521 // SetOpKeys implements multiset operations with C++ set_* semantics.
 522 // MgpuSetOp may be:
 523 //              MgpuSetOpIntersection -         like std::set_intersection
 524 //              MgpuSetOpUnion -                        like std::set_union
 525 //              MgpuSetOpDiff -                         like std::set_difference
 526 //              MgpuSetOpSymDiff -                      like std::set_symmetric_difference
 527
 528 // Setting Duplicates to false increases performance for inputs with no
 529 // duplicate keys in either array.
 530
 531 // The caller passes MGPU_MEM(T) pointers to hold outputs. Memory is allocated
 532 // by the multiset function using the allocator associated with the context. It
 533 // returns the number of outputs.
 534
 535 // SetOpKeys performs one cudaMemcpyDeviceToHost to retrieve the size of
 536 // the output array. This is a synchronous operation and may prevent queueing
 537 // for callers using streams.
 538
 539 // If compact = true, SetOpKeys pre-allocates an output buffer is large as the
 540 // sum of the input arrays. Partials results are computed into this temporary
 541 // array before being moved into the final array. It consumes more space but
 542 // results in higher performance.
 543 template<MgpuSetOp Op, bool Duplicates, typename It1, typename It2,
 544         typename T, typename Comp>
 545 MGPU_HOST int SetOpKeys(It1 a_global, int aCount, It2 b_global, int bCount,
 546         MGPU_MEM(T)* ppKeys_global, Comp comp, CudaContext& context,
 547         bool compact = true);
 548
 549 // Specialization of SetOpKeys with Comp = mgpu::less<T>.
 550 template<MgpuSetOp Op, bool Duplicates, typename It1, typename It2, typename T>
 551 MGPU_HOST int SetOpKeys(It1 a_global, int aCount, It2 b_global, int bCount,
 552         MGPU_MEM(T)* ppKeys_global, CudaContext& context, bool compact = true);
 553
 554 // SetOpPairs runs multiset operations by key and supports value exchange.
 555 template<MgpuSetOp Op, bool Duplicates, typename KeysIt1, typename KeysIt2,
 556         typename ValsIt1, typename ValsIt2, typename KeyType, typename ValType,
 557         typename Comp>
 558 MGPU_HOST int SetOpPairs(KeysIt1 aKeys_global, ValsIt1 aVals_global, int aCount,
 559         KeysIt2 bKeys_global, ValsIt2 bVals_global, int bCount,
 560         MGPU_MEM(KeyType)* ppKeys_global, MGPU_MEM(ValType)* ppVals_global,
 561         Comp comp, CudaContext& context);
 562
 563 // Specialization of SetOpPairs with Comp = mgpu::less<T>.
 564 template<MgpuSetOp Op, bool Duplicates, typename KeysIt1, typename KeysIt2,
 565         typename ValsIt1, typename ValsIt2, typename KeyType, typename ValType>
 566 MGPU_HOST int SetOpPairs(KeysIt1 aKeys_global, ValsIt1 aVals_global, int aCount,
 567         KeysIt2 bKeys_global, ValsIt2 bVals_global, int bCount,
 568         MGPU_MEM(KeyType)* ppKeys_global, MGPU_MEM(ValType)* ppVals_global,
 569         CudaContext& context);
 570
 571 ////////////////////////////////////////////////////////////////////////////////
 572 // kernels/segreducecsr.cuh
 573
 574 // SegReducePreprocessData is defined in segreduce.cuh. It includes:
 575 // -    limits for CSR->tiles
 576 // -    packed thread codes for each thread in the reduction
 577 // -    (optional) CSR2 array of filtered segment offsets
 578 struct SegReducePreprocessData;
 579
 580 // SegReduceCsr runs a segmented reduction given an input and a sorted list of
 581 // segment start offsets. This implementation requires operators support
 582 // commutative (a + b = b + a) and associative (a + (b + c) = (a + b) + c)
 583 // evaluation.
 584
 585 // In the segmented reduction, reduce-by-key, and Spmv documentation, "segment"
 586 // and "row" are used interchangably. A
 587 //
 588
 589 // InputIt data_global          - Data value input.
 590 // int count                            - Size of input array data_global.
 591 // CsrIt csr_global                     - List of integers for start of each segment.
 592 //                                                        The first entry must be 0 (indicating that the
 593 //                                                        first segment starts at offset 0).
 594 //                                                        Equivalent to exc-scan of segment sizes.
 595 //                                                        If supportEmpty is false: must be ascending.
 596 //                                                        If supportEmpty is true: must be non-descending.
 597 // int numSegments                      - Size of segment list csr_global. Must be >= 1.
 598 // bool supportEmpty            - Basic seg-reduce code does not support empty
 599 //                                                        segments.
 600 //                                                        Set supportEmpty = true to add pre- and post-
 601 //                                                        processing to support empty segments.
 602 // OutputIt dest_global         - Output array for segmented reduction. Allocate
 603 //                                                        numSegments elements. Should be same data type as
 604 //                                                        InputIt and identity.
 605 // T identity                           - Identity for reduction operation. Eg, use 0 for
 606 //                                                        addition or 1 for multiplication.
 607 // Op op                                        - Reduction operator. Model on std::plus<>. MGPU
 608 //                                                        provides operators mgpu::plus<>, minus<>,
 609 //                                                        multiplies<>, modulus<>, bit_or<> bit_and<>,
 610 //                                                        bit_xor<>, maximum<>, and minimum<>.
 611 // CudaContext& context         - MGPU context support object. All kernels are
 612 //                                                        launched on the associated stream.
 613 template<typename InputIt, typename CsrIt, typename OutputIt, typename T,
 614         typename Op>
 615 MGPU_HOST void SegReduceCsr(InputIt data_global, int count, CsrIt csr_global,
 616         int numSegments, bool supportEmpty, OutputIt dest_global, T identity, Op op,
 617         CudaContext& context);
 618
 619 // IndirectReduceCsr is like SegReduceCsr but with one level of source
 620 // indirection. The start of each segment/row i in data_global starts at
 621 // sources_global[i].
 622 // SourcesIt sources_global     - List of integers for source data of each segment.
 623 //                                                        Must be numSegments in size.
 624 template<typename InputIt, typename CsrIt, typename SourcesIt,
 625         typename OutputIt, typename T, typename Op>
 626 MGPU_HOST void IndirectReduceCsr(InputIt data_global, int count,
 627         CsrIt csr_global, SourcesIt sources_global, int numSegments,
 628         bool supportEmpty, OutputIt dest_global, T identity, Op op,
 629         CudaContext& context);
 630
 631 // SegReduceCsrPreprocess accelerates multiple seg-reduce calls on different
 632 // data with the same segment geometry. Partitioning and CSR->CSR2 transform is
 633 // off-loaded to a preprocessing pass. The actual reduction is evaluated by
 634 // SegReduceApply.
 635 template<typename T, typename CsrIt>
 636 MGPU_HOST void SegReduceCsrPreprocess(int count, CsrIt csr_global,
 637         int numSegments, bool supportEmpty,
 638         std::auto_ptr<SegReducePreprocessData>* ppData, CudaContext& context);
 639
 640 template<typename InputIt, typename DestIt, typename T, typename Op>
 641 MGPU_HOST void SegReduceApply(const SegReducePreprocessData& preprocess,
 642         InputIt data_global, T identity, Op op, DestIt dest_global,
 643         CudaContext& context);
 644
 645 ////////////////////////////////////////////////////////////////////////////////
 646 // kernels/reducebykey.csr
 647
 648 typedef SegReducePreprocessData ReduceByKeyPreprocessData;
 649
 650 // ReduceByKey runs a segmented reduction given a data input and a matching set
 651 // of keys. This implementation requires operators support commutative
 652 // (a + b = b + a) and associative (a + (b + c) = (a + b) + c) evaluation.
 653 // It roughly matches the behavior of thrust::reduce_by_key.
 654
 655 // KeysIt keys_global           - Key identifier for the segment.
 656 // InputIt data_global          - Data value input.
 657 // int count                            - Size of input arrays keys_global and
 658 //                                                        data_global.
 659 // ValType identity                     - Identity for reduction operation. Eg, use 0 for
 660 //                                                        addition or 1 for multiplication.
 661 // Op op                                        - Reduction operator. Model on std::plus<>. MGPU
 662 //                                                        provides operators mgpu::plus<>, minus<>,
 663 //                                                        multiplies<>, modulus<>, bit_or<> bit_and<>,
 664 //                                                        bit_xor<>, maximum<>, and minimum<>.
 665 // Comp comp                            - Operator for comparing adjacent adjacent keys.
 666 //                                                        Must return true if two adjacent keys are in the
 667 //                                                        same segment. Use mgpu::equal_to<KeyType>() by
 668 //                                                        default.
 669 // KeyType* keysDest_global     - If this pointer is not null, return the first
 670 //                                                        key from each segment. Must be sized to at least
 671 //                                                        the number of segments.
 672 // DestIt dest_global           - Holds the reduced data. Must be sized to at least
 673 //                                                        the number of segments.
 674 // int* count_host                      - The number of segments, returned in host memory.
 675 //                                                        May be null.
 676 // int* count_global            - The number of segments, returned in device memory.
 677 //                                                        This avoids a D->H synchronization. May be null.
 678 // CudaContext& context         - MGPU context support object.
 679 template<typename KeysIt, typename InputIt, typename DestIt,
 680         typename KeyType, typename ValType, typename Op, typename Comp>
 681 MGPU_HOST void ReduceByKey(KeysIt keys_global, InputIt data_global, int count,
 682         ValType identity, Op op, Comp comp, KeyType* keysDest_global,
 683         DestIt dest_global, int* count_host, int* count_global,
 684         CudaContext& context);
 685
 686 // ReduceByKeyPreprocess accelerates multiple reduce-by-key calls on different
 687 // data with the same segment geometry. The actual reduction is evaluated by
 688 // ReduceByKeyApply.
 689 // Note that the caller must explicitly specify the ValType argument. Kernel
 690 // tunings are based on the value type, not the key type.
 691 template<typename ValType, typename KeyType, typename KeysIt, typename Comp>
 692 MGPU_HOST void ReduceByKeyPreprocess(int count, KeysIt keys_global,
 693         KeyType* keysDest_global, Comp comp, int* count_host, int* count_global,
 694         std::auto_ptr<ReduceByKeyPreprocessData>* ppData, CudaContext& context);
 695
 696 template<typename InputIt, typename DestIt, typename T, typename Op>
 697 MGPU_HOST void ReduceByKeyApply(const ReduceByKeyPreprocessData& preprocess,
 698         InputIt data_global, T identity, Op op, DestIt dest_global,
 699         CudaContext& context);
 700
 701 ////////////////////////////////////////////////////////////////////////////////
 702 // kernels/spmvcsr.cuh
 703
 704 typedef SegReducePreprocessData SpmvPreprocessData;
 705
 706 // SpmvCsr[Unary|Binary] evaluates the product of a sparse matrix (CSR format)
 707 // with a dense vector.
 708 // SpmvCsrIndirect[Unary|Binary] uses indirection to lookup the start of each
 709 // matrix_global and cols_global on a per-row basis.
 710
 711 // Unary methods reduce on the right-hand side vector values.
 712 // Binary methods reduce the product of the left-hand side matrix value with the
 713 // right-hand side vector values.
 714
 715 // MatrixIt matrix_global       - Left-hand side data for binary Spmv. There are nz
 716 //                                                        non-zero matrix elements. These are loaded and
 717 //                                                        combined with the vector values with mulOp.
 718 // ColsIt cols_global           - Row identifiers for the right-hand side of the
 719 //                                                        matrix/value products. If element i is the k'th
 720 //                                                        non-zero in row j, the product is formed as
 721 //                                                            matrix_global[i] * vec_global[cols_global[i]]
 722 //                                                        for direct indexing, or,
 723 //                                                            m = source_global[j] + k
 724 //                                                            matrix_global[m] * vec_global[cols_global[m]].
 725 // int nz                                       - Number of non-zeros in LHS matrix. Size of
 726 //                                                        matrix_global and cols_global.
 727 // CsrIt csr_global                     - List of integers for start of each row.
 728 //                                                        The first entry must be 0 (indicating that the
 729 //                                                        first row starts at offset 0).
 730 //                                                        Equivalent to exc-scan of row sizes.
 731 //                                                        If supportEmpty is false: must be ascending.
 732 //                                                        If supportEmpty is true: must be non-descending.
 733 // SourcesIt sources_global     - An indirection array to source each row's data.
 734 //                                                        The size of each row i is
 735 //                                                                 size_i = csr_global[i + 1] - csr_global[i].
 736 //                                                        The starting offset for both the data and column
 737 //                                                        identifiers is
 738 //                                                                 offset_i = sources_global[i].
 739 //                                                        The direct Spmv methods (i.e. those not taking
 740 //                                                        a sources_global parameter) can be thought of as
 741 //                                                        indirect methods with sources_global = csr_global.
 742 // int numRows                          - Size of segment list csr_global. Must be >= 1.
 743 // VecIt vec_global                     - Input array. Size is the width of the matrix.
 744 //                                                        For unary Spmv, these values are reduced.
 745 //                                                        For binary Spmv, the products of the matrix and
 746 //                                                        vector values are reduced.
 747 // bool supportEmpty            - Basic seg-reduce code does not support empty rows.
 748 //                                                        Set supportEmpty = true to add pre- and post-
 749 //                                                        processing to support empty rows.
 750 // DestIt dest_global           - Output array. Must be numRows in size.
 751 // T identity                           - Identity for reduction operation. Eg, use 0 for
 752 //                                                        addition or 1 for multiplication.
 753 // MulOp mulOp                          - Reduction operator for combining matrix value with
 754 //                                                        vector value. Only defined for binary Spmv.
 755 //                                                        Use mgpu::multiplies<T>() for default behavior.
 756 // AddOp addOp                          - Reduction operator for reducing vector values
 757 //                                                    (unary Spmv) or matrix-vector products (binary
 758 //                                                        Spmv). Use mgpu::plus<T>() for default behavior.
 759 // CudaContext& context         - MGPU context support object. All kernels are
 760 //                                                        launched on the associated stream.
 761 template<typename ColsIt, typename CsrIt, typename VecIt, typename DestIt,
 762         typename T, typename AddOp>
 763 MGPU_HOST void SpmvCsrUnary(ColsIt cols_global, int nz, CsrIt csr_global,
 764         int numRows, VecIt vec_global, bool supportEmpty, DestIt dest_global,
 765         T identity, AddOp addOp, CudaContext& context);
 766
 767 template<typename MatrixIt, typename ColsIt, typename CsrIt, typename VecIt,
 768         typename DestIt, typename T, typename MulOp, typename AddOp>
 769 MGPU_HOST void SpmvCsrBinary(MatrixIt matrix_global, ColsIt cols_global,
 770         int nz, CsrIt csr_global, int numRows, VecIt vec_global,
 771         bool supportEmpty, DestIt dest_global, T identity, MulOp mulOp, AddOp addOp,
 772         CudaContext& context);
 773
 774 template<typename ColsIt, typename CsrIt, typename SourcesIt, typename VecIt,
 775         typename DestIt, typename T, typename AddOp>
 776 MGPU_HOST void SpmvCsrIndirectUnary(ColsIt cols_global, int nz,
 777         CsrIt csr_global, SourcesIt sources_global, int numRows, VecIt vec_global,
 778         bool supportEmpty, DestIt dest_global, T identity, AddOp addOp,
 779         CudaContext& context);
 780
 781 template<typename MatrixIt, typename ColsIt, typename CsrIt, typename SourcesIt,
 782         typename VecIt, typename DestIt, typename T, typename MulOp, typename AddOp>
 783 MGPU_HOST void SpmvCsrIndirectBinary(MatrixIt matrix_global, ColsIt cols_global,
 784         int nz, CsrIt csr_global, SourcesIt sources_global, int numRows,
 785         VecIt vec_global, bool supportEmpty, DestIt dest_global, T identity,
 786         MulOp mulOp, AddOp addOp, CudaContext& context);
 787
 788 // SpmvPreprocess[Unary|Binary] accelerates multiple Spmv calls on different
 789 // matrix/vector pairs with the same matrix geometry. The actual reduction is
 790 // evaluated Spmv[Unary|Binary]Apply.
 791 template<typename T, typename CsrIt>
 792 MGPU_HOST void SpmvPreprocessUnary(int nz, CsrIt csr_global, int numRows,
 793         bool supportEmpty, std::auto_ptr<SpmvPreprocessData>* ppData,
 794         CudaContext& context);
 795
 796 template<typename T, typename CsrIt>
 797 MGPU_HOST void SpmvPreprocessBinary(int nz, CsrIt csr_global, int numRows,
 798         bool supportEmpty, std::auto_ptr<SpmvPreprocessData>* ppData,
 799         CudaContext& context);
 800
 801 template<typename ColsIt, typename VecIt, typename DestIt, typename T,
 802         typename MulOp, typename AddOp>
 803 MGPU_HOST void SpmvUnaryApply(const SpmvPreprocessData& preprocess,
 804         ColsIt cols_global, VecIt vec_global, DestIt dest_global, T identity,
 805         AddOp addOp, CudaContext& context);
 806
 807 template<typename MatrixIt, typename ColsIt, typename VecIt, typename DestIt,
 808         typename T, typename MulOp, typename AddOp>
 809 MGPU_HOST void SpmvBinaryApply(const SpmvPreprocessData& preprocess,
 810         MatrixIt matrix_global, ColsIt cols_global, VecIt vec_global,
 811         DestIt dest_global, T identity, MulOp mulOp, AddOp addOp,
 812         CudaContext& context);
 813
 814 } // namespace mgpu
 815