/*
 *  lv2_gpu_sort.cu -- sort lev2 with GPU
 *
 *  Copyright (C) 2015, The University of Hong Kong.
 *
 *  This program is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU General Public License
 *  as published by the Free Software Foundation; either version 3
 *  of the License, or (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 *
 *  Change: this file was generated by extracting ("fishing") code from
 *  Chi Man LIU's original implementation.
 */
#include "lv2_gpu_sort.h"

#include <cassert>
#include <cstdint>
#include <cstdio>

#include "b40c/radix_sort/enactor.cuh"
#include "b40c/util/multiple_buffering.cuh"
// NOTE(review): fragment -- the enclosing function's opening line(s) were
// lost from this listing; what remains queries free/total GPU memory and
// reports the free amount on stderr.
// NOTE(review): cudaMemGetInfo is called inside assert(), so with NDEBUG
// the query is compiled out and free_gpu_mem is printed uninitialized --
// the call should be hoisted out of the assert.
46 size_t free_gpu_mem, total_gpu_mem;
47 assert(cudaMemGetInfo(&free_gpu_mem, &total_gpu_mem) == cudaSuccess);
// NOTE(review): %lld expects (unsigned) long long but free_gpu_mem is
// size_t; prefer %zu (or an explicit cast) to avoid undefined behavior on
// platforms where the widths differ.
48 fprintf(stderr, "Free GPU mem: %lld\n", free_gpu_mem);
// Gather kernel used between radix-sort passes:
//   new_val[i] = val[ index[i] & 0x1FFFFFFF ]
//
// Launch layout: 1-D grid, one thread per element; the tail guard makes any
// grid size that covers num_elements safe.
//
// The low 29 bits of index[i] select the source slot; the top 3 bits are
// presumably flag bits packed by the producer of `index` -- TODO confirm
// against the code that builds these indices.
__global__ void permutation_kernel( uint32_t* index, uint32_t* val, uint32_t* new_val, int num_elements ) {
    // Use blockDim.x rather than a THREADS_PER_BLOCK macro so the kernel
    // stays correct even if launched with a different block size.
    int tid = blockIdx.x * blockDim.x + threadIdx.x;

    if (tid < num_elements) {
        new_val[tid] = val[index[tid] & 0x1FFFFFFF];
    }
}
// Sorts the lv2 substrings on the GPU with b40c LSD radix sort, one 32-bit
// word per pass, and writes the resulting order back into `permutation`.
//
// lv2_substrings : host array, word-major layout -- word w of substring i is
//                  at lv2_substrings[w * width + i] (so each pass copies one
//                  contiguous slab of lv2_num_items words).
// permutation    : host array of lv2_num_items indices; seeded into the sort
//                  as values, and overwritten on return with the sorted order.
// uint32_ts_per_substring : number of 32-bit words per substring (= passes).
// width          : element stride between consecutive words of a substring.
// lv2_num_items  : number of substrings; assumed to fit in int for the kernel
//                  launch (the 29-bit index mask in permutation_kernel
//                  suggests < 2^29 items -- TODO confirm).
//
// NOTE: CUDA calls are kept OUT of assert() so they still execute when the
// file is compiled with NDEBUG -- a call placed inside assert() disappears
// entirely in release builds, which would silently skip the allocs/copies.
void lv2_gpu_sort(uint32_t *lv2_substrings, uint32_t *permutation, int uint32_ts_per_substring, int64_t width, int64_t lv2_num_items){
    uint32_t *d_keys, *d_values;
    cudaError_t err;

    err = cudaMalloc( (void**) &d_keys, sizeof(uint32_t) * lv2_num_items );
    assert(err == cudaSuccess);
    err = cudaMalloc( (void**) &d_values, sizeof(uint32_t) * lv2_num_items );
    assert(err == cudaSuccess);

    b40c::radix_sort::Enactor enactor;
    b40c::util::DoubleBuffer<uint32_t, uint32_t> ss(d_keys, d_values);

    // Seed the value buffer with the caller-supplied permutation.
    err = cudaMemcpy( ss.d_values[ss.selector], permutation, sizeof(uint32_t) * lv2_num_items, cudaMemcpyHostToDevice );
    assert(err == cudaSuccess);

    // LSD radix sort over words: the last word is sorted first, word 0 last.
    for (int iteration = uint32_ts_per_substring - 1; iteration >= 0; --iteration ) { // TODO uint32_ts_per_suffix ?
        if ( iteration == uint32_ts_per_substring - 1 ) { // TODO uint32_ts_per_suffix
            // First pass: the values still hold the input permutation, so the
            // raw key words can be copied straight into the active buffer.
            err = cudaMemcpy( ss.d_keys[ss.selector], lv2_substrings + ( iteration * width ),
                              sizeof(uint32_t) * lv2_num_items, cudaMemcpyHostToDevice );
            assert(err == cudaSuccess);
        } else {
            // Later passes: stage the raw words in the inactive key buffer,
            // then gather them through the current permutation so each key
            // lines up with its (already partially sorted) value.
            err = cudaMemcpy( ss.d_keys[1-ss.selector], lv2_substrings + ( iteration * width ),
                              sizeof(uint32_t) * lv2_num_items, cudaMemcpyHostToDevice );
            assert(err == cudaSuccess);
            int num_blocks = ( lv2_num_items + THREADS_PER_BLOCK - 1 ) / THREADS_PER_BLOCK;
            permutation_kernel<<<num_blocks, THREADS_PER_BLOCK>>>( ss.d_values[ss.selector], ss.d_keys[1-ss.selector],
                                                                   ss.d_keys[ss.selector], lv2_num_items );
            // Kernel launches do not return errors directly; pick up launch
            // failures (bad config, etc.) here.
            err = cudaGetLastError();
            assert(err == cudaSuccess);
        }
        // Stable per-word sort; b40c flips ss.selector to track which buffer
        // holds the results.
        err = enactor.Sort<b40c::radix_sort::LARGE_SIZE>( ss, lv2_num_items );
        assert(err == cudaSuccess);
    }

    // free device memory EXCEPT the buffer holding the final permutation
    if (ss.d_keys[ss.selector]) cudaFree(ss.d_keys[ss.selector]);
    if (ss.d_keys[1-ss.selector]) cudaFree(ss.d_keys[1-ss.selector]);
    if (ss.d_values[1-ss.selector]) cudaFree(ss.d_values[1-ss.selector]);
    ///////////////////// END GPU SORT ////////////////////////////
    // Copy the sorted permutation back. sizeof(uint32_t) -- not sizeof(int) --
    // for consistency with the allocation and H2D copies above.
    err = cudaMemcpy( permutation, ss.d_values[ss.selector], sizeof(uint32_t) * lv2_num_items, cudaMemcpyDeviceToHost );
    assert(err == cudaSuccess);
    if (ss.d_values[ss.selector]) cudaFree(ss.d_values[ss.selector]);
}