modified: myjupyterlab.sh
[GalaxyCodeBases.git] / BGI / BASE / src / 2bwt / readbwt.cpp
blobaf99bd545e0bb5be2975dcb0a6b8532fe2bdaf01
1 /*
3 readbwt.cpp Build index of short reads.
5 This program builds index of short reads for use of assembler BASE and some other programs.
7 # Copyright (C) 2015, The University of Hong Kong.
9 # This program is free software; you can redistribute it and/or
10 # modify it under the terms of the GNU General Public License
11 # as published by the Free Software Foundation; either version 3
12 # of the License, or (at your option) any later version.
14 # This program is distributed in the hope that it will be useful,
15 # but WITHOUT ANY WARRANTY; without even the implied warranty of
16 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 # GNU General Public License for more details.
19 # You should have received a copy of the GNU General Public License
20 # along with this program; if not, write to the Free Software
21 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 03110-1301, USA.
23 Date : 1st Jan 2014
24 Author : Chi Man LIU
25 Change : Developped this product to build BWT of short reads.
26 This is the implementation of CX1 method.
28 Date : 1st Jan 2015
29 Author : Binghang LIU
30 Change : Add the implementation of CX1 method without GPU.
31 Add the loading of SOAPdenovo read libarary files, add qual_cutoff to set quality threshold.
32 Add a new output file for pair end reads with different insert sizes.
36 #include <omp.h>
37 #include <pthread.h>
38 #include <assert.h>
39 #include <stdarg.h>
40 #include <sys/sysinfo.h> // memory size.
41 #include <vector>
42 #include <algorithm>
43 #include <string>
45 // for sorting
46 #include <zlib.h>
47 #include "lv2_cpu_sort.h"
48 #include "lv2_gpu_sort.h"
50 using namespace std;
52 typedef unsigned int word;
54 #define NUM_CPU_THREADS 24
55 //#define THREADS_PER_BLOCK 256 // GPU
57 // do not modify below
58 #define BITS_PER_WORD 32 //used for quality.
59 #define BITS_PER_CHAR 2
61 #define BITS_PER_QUAL 1 //used for quality.
63 #define CHARS_PER_WORD 16
64 #define CHAR_MASK 0x3
66 #define PREFIX_LENGTH 8
67 #define SPECIAL_HANDLE_SUFFIX_LENGTH 3 // suffixes with length <= 3 will be handled specially
68 #define SPECIAL_HANDLE_SUFFIX_NUM_BUCKETS 125 // 5^3
69 #define NUM_BUCKETS 390625
70 #define PREFIX_MASK 0xFFFF
72 char dna_map[256]; // map from ACGT to 0123
73 char dna_char[5] = {'A','C','G','T','$'};
74 static char tabs[2][1024];
75 word* g_packed_reads;
76 word* packed_quals;
77 word* packed_reads_p;
79 int words_per_qual;
80 int last_shift;
81 int last_shift_qual;
82 int number_cpu_threads = 24;
84 #define SENTINEL_INT 4 // '$'
86 //**********************************define structures start
87 struct global_data_t;
88 struct section_data_t {
89 int section_id;
90 struct global_data_t *parent;
91 int64 section_start, section_end; // start and end read ID ('end' is exlusive)
92 string bwt_buffer[SPECIAL_HANDLE_SUFFIX_NUM_BUCKETS];
93 int64 *offset_pos;
94 int64 offset_diff_base;
97 struct suffix_fetch_data_t {
98 int thread_id;
99 struct global_data_t *parent;
100 int bucket_start, bucket_end;
101 int64 index_start;
104 struct global_data_t {
105 int node_bucket_start, node_bucket_end; // buckets to be processed on this node
106 int num_cpu_threads;
107 int64 max_num_reads;
108 int64 num_reads;
109 word *packed_reads;
110 uint64_t *cpu_sort_space;
112 int read_len;
113 int words_per_read;
114 int words_per_suffix;
115 int offset_num_bits; // number of bits to represent 0...read_len
116 int offset_mask; // -(1 << offset_num_bits)
118 int64 max_lv1_entries;
119 int64 max_lv2_entries;
121 int cur_special_handle_prefix;
122 int64 num_lv2_to_output;
123 bool need_ridt;
125 int *buckets, *section_buckets[ NUM_CPU_THREADS ]; // buckets (aggregated for all read; for each section). TODO int64?
126 pthread_t threads[ NUM_CPU_THREADS ];
127 struct section_data_t locals[ NUM_CPU_THREADS ];
128 struct suffix_fetch_data_t suffix_fetches[ NUM_CPU_THREADS - 1 ];
129 int lv1_bucket_start, lv1_bucket_end; // 'end' is exclusive
130 int lv2_bucket_start, lv2_bucket_end; // 'end' is exclusive
131 int64 lv2_offset_start_index;
132 int64 lv2_suffixes_row_width;
133 int *lv1_offsets;
134 word *sort_indexes; //permutation.
135 word *sort_indexes_to_output;
136 unsigned int *lv2_read_id;
137 unsigned int *lv2_read_id_to_output;
138 word *lv2_suffixes;
139 word *lv2_suffixes_to_output;
140 FILE *out_file;
141 FILE *map_out_file;
142 // for large diff
143 vector<int64> v_large_diff;
144 pthread_spinlock_t large_diff_lock;
147 typedef struct lib_info
149 int sd;
150 int qual;
151 int max_ins;
152 int min_ins;
153 int avg_ins;
154 int rd_len_cutoff; //read length cutoff
155 int reverse;
156 int asm_flag;
157 int map_len;
158 int pair_num_cut;
159 int rank;
161 //whether last read is read1 in pair
162 int paired; // 0 -- single; 1 -- read1; 2 -- read2;
164 //type1
165 char ** a1_fname;
166 char ** a2_fname;
167 int num_a1_file;
168 int num_a2_file;
170 //type2
171 char ** q1_fname;
172 char ** q2_fname;
173 int num_q1_file;
174 int num_q2_file;
176 //type3
177 char ** p_fname;
178 int num_p_file; //fasta only
180 //type4 &5
181 char ** s_a_fname;
182 int num_s_a_file;
183 char ** s_q_fname;
184 int num_s_q_file;
186 char ** b_fname; //the name of the bam file
187 int num_b_file; //the number of the bam file
188 } LIB_INFO;
189 LIB_INFO* lib_array;
191 //***************** define structure finished.
193 //***************** Initialize start.
194 //initial dna_map, here N is treated as G.
195 void init_dna_map() {
196 for (int i='A';i<='Z';++i) dna_map[i] = 2; // G
197 for (int i='a';i<='z';++i) dna_map[i] = 2; // G
198 dna_map['A'] = dna_map['a'] = 0;
199 dna_map['C'] = dna_map['c'] = 1;
200 dna_map['G'] = dna_map['g'] = 2;
201 dna_map['T'] = dna_map['t'] = 3;
202 dna_map['N'] = dna_map['n'] = 2; // G
205 static boolean splitColumn ( char * line )
207 int len = strlen ( line );
208 int i = 0, j;
209 int tabs_n = 0;
211 while ( i < len )
213 if ( line[i] >= 32 && line[i] <= 126 && line[i] != '=' )
215 j = 0;
216 while ( i < len && line[i] >= 32 && line[i] <= 126 && line[i] != '=' )
218 tabs[tabs_n][j++] = line[i];
219 i++;
222 tabs[tabs_n][j] = '\0';
223 tabs_n++;
225 if ( tabs_n == 2 )
226 return 1;
228 i++;
231 if ( tabs_n == 2 )
232 return 1;
233 return 0;
236 static int cmp_lib ( const void * a, const void * b )
238 LIB_INFO * A, *B;
239 A = ( LIB_INFO * ) a;
240 B = ( LIB_INFO * ) b;
242 if ( A->avg_ins > B->avg_ins )
243 return 1;
244 if ( A->avg_ins == B->avg_ins )
245 return 0;
246 return -1;
249 void scan_libInfo ( char * libfile, int *lib_num, int *read_len)
251 char line[1024], ch;
252 int i, j, index;
253 boolean flag;
254 boolean * pe;
255 FILE* fp = fopen ( libfile, "r" );
256 int num_libs = 0;
257 int maxReadLen = 0;
259 while ( fgets ( line, 1024, fp ) )
261 ch = line[5];
262 line[5] = '\0';
263 if ( strcmp ( line, "[LIB]" ) == 0 )
264 num_libs++;
266 if ( !num_libs )
268 line[5] = ch;
269 flag = splitColumn ( line );
271 if ( !flag )
272 continue;
274 if ( strcmp ( tabs[0], "max_rd_len" ) == 0 || strcmp ( tabs[0], "max_read_len" ) == 0)
275 maxReadLen = atoi ( tabs[1] );
279 if ( num_libs == 0 )
281 fprintf ( stderr, "Config file error: no [LIB] in file\n" );
282 exit ( -1 );
284 *lib_num = num_libs;
285 *read_len = maxReadLen;
287 //count file numbers of each type
288 lib_array = ( LIB_INFO * ) calloc (1, num_libs * sizeof ( LIB_INFO ) );
289 pe = ( boolean * ) calloc (1, num_libs * sizeof ( boolean ) );
291 for ( i = 0; i < num_libs; i++ )
293 lib_array[i].qual = 74;
294 lib_array[i].sd = 0;
295 lib_array[i].avg_ins = 0;
296 lib_array[i].asm_flag = 3;
297 lib_array[i].rank = 0;
298 lib_array[i].pair_num_cut = 0;
299 lib_array[i].rd_len_cutoff = 0;
300 lib_array[i].map_len = 0;
301 lib_array[i].num_s_a_file = 0;
302 lib_array[i].num_s_q_file = 0;
303 lib_array[i].num_p_file = 0;
304 lib_array[i].num_a1_file = 0;
305 lib_array[i].num_a2_file = 0;
306 lib_array[i].num_q1_file = 0;
307 lib_array[i].num_q2_file = 0;
308 lib_array[i].num_b_file = 0; //init
309 pe[i] = false;
312 rewind ( fp );
313 i = -1;
315 while ( fgets ( line, 1024, fp ) )
317 ch = line[5];
318 line[5] = '\0';
320 if ( strcmp ( line, "[LIB]" ) == 0 )
322 i++;
323 continue;
326 line[5] = ch;
327 flag = splitColumn ( line );
329 if ( !flag )
331 continue;
334 if ( strcmp ( tabs[0], "f1" ) == 0 )
336 lib_array[i].num_a1_file++;
337 pe[i] = true;
339 else if ( strcmp ( tabs[0], "q1" ) == 0 )
341 lib_array[i].num_q1_file++;
342 pe[i] = true;
344 else if ( strcmp ( tabs[0], "f2" ) == 0 )
346 lib_array[i].num_a2_file++;
347 pe[i] = true;
349 else if ( strcmp ( tabs[0], "q2" ) == 0 )
351 lib_array[i].num_q2_file++;
352 pe[i] = true;
354 else if ( strcmp ( tabs[0], "f" ) == 0 )
356 lib_array[i].num_s_a_file++;
358 else if ( strcmp ( tabs[0], "q" ) == 0 )
360 lib_array[i].num_s_q_file++;
362 else if ( strcmp ( tabs[0], "p" ) == 0 )
364 lib_array[i].num_p_file++;
365 pe[i] = true;
367 else if ( strcmp ( tabs[0], "b" ) == 0 ) // the bam file
369 lib_array[i].num_b_file++;
370 pe[i] = true;
374 //allocate memory for filenames
375 for ( i = 0; i < num_libs; i++ )
377 if ( lib_array[i].num_a2_file != lib_array[i].num_a1_file )
379 fprintf ( stderr, "Config file error: the number of mark \"f1\" is not the same as \"f2\"!\n" );
380 exit ( -1 );
383 if ( lib_array[i].num_q2_file != lib_array[i].num_q1_file )
385 fprintf ( stderr, "Config file error: the number of mark \"q1\" is not the same as \"q2\"!\n" );
386 exit ( -1 );
389 if ( lib_array[i].num_s_a_file )
391 lib_array[i].s_a_fname = ( char ** ) calloc (1, lib_array[i].num_s_a_file * sizeof ( char * ) );
393 for ( j = 0; j < lib_array[i].num_s_a_file; j++ )
395 lib_array[i].s_a_fname[j] = ( char * ) calloc (1, 1024 * sizeof ( char ) );
399 if ( lib_array[i].num_s_q_file )
401 lib_array[i].s_q_fname = ( char ** ) calloc (1, lib_array[i].num_s_q_file * sizeof ( char * ) );
403 for ( j = 0; j < lib_array[i].num_s_q_file; j++ )
405 lib_array[i].s_q_fname[j] = ( char * ) calloc (1, 1024 * sizeof ( char ) );
409 if ( lib_array[i].num_p_file )
411 lib_array[i].p_fname = ( char ** ) calloc (1, lib_array[i].num_p_file * sizeof ( char * ) );
413 for ( j = 0; j < lib_array[i].num_p_file; j++ )
415 lib_array[i].p_fname[j] = ( char * ) calloc (1, 1024 * sizeof ( char ) );
419 if ( lib_array[i].num_a1_file )
421 lib_array[i].a1_fname = ( char ** ) calloc (1, lib_array[i].num_a1_file * sizeof ( char * ) );
423 for ( j = 0; j < lib_array[i].num_a1_file; j++ )
425 lib_array[i].a1_fname[j] = ( char * ) calloc (1, 1024 * sizeof ( char ) );
429 if ( lib_array[i].num_a2_file )
431 lib_array[i].a2_fname = ( char ** ) calloc (1, lib_array[i].num_a2_file * sizeof ( char * ) );
433 for ( j = 0; j < lib_array[i].num_a2_file; j++ )
435 lib_array[i].a2_fname[j] = ( char * ) calloc (1, 1024 * sizeof ( char ) );
439 if ( lib_array[i].num_q1_file )
441 lib_array[i].q1_fname = ( char ** ) calloc (1, lib_array[i].num_q1_file * sizeof ( char * ) );
443 for ( j = 0; j < lib_array[i].num_q1_file; j++ )
445 lib_array[i].q1_fname[j] = ( char * ) calloc (1, 1024 * sizeof ( char ) );
449 if ( lib_array[i].num_q2_file )
451 lib_array[i].q2_fname = ( char ** ) calloc (1, lib_array[i].num_q2_file * sizeof ( char * ) );
453 for ( j = 0; j < lib_array[i].num_q2_file; j++ )
455 lib_array[i].q2_fname[j] = ( char * ) calloc (1, 1024 * sizeof ( char ) );
459 if ( lib_array[i].num_b_file ) //allot memory for bam file name
461 lib_array[i].b_fname = ( char ** ) calloc (1, lib_array[i].num_b_file * sizeof ( char * ) );
463 for ( j = 0; j < lib_array[i].num_b_file; j++ )
464 { lib_array[i].b_fname[j] = ( char * ) calloc (1, 1024 * sizeof ( char ) ); }
468 // get file names
469 for ( i = 0; i < num_libs; i++ )
471 lib_array[i].num_s_a_file = 0;
472 lib_array[i].num_s_q_file = 0;
473 lib_array[i].num_p_file = 0;
474 lib_array[i].num_a1_file = 0;
475 lib_array[i].num_a2_file = 0;
476 lib_array[i].num_q1_file = 0;
477 lib_array[i].num_q2_file = 0;
478 lib_array[i].num_b_file = 0; //init
481 rewind ( fp );
482 i = -1;
484 while ( fgets ( line, 1024, fp ) )
486 ch = line[5];
487 line[5] = '\0';
489 if ( strcmp ( line, "[LIB]" ) == 0 )
491 i++;
492 continue;
495 line[5] = ch;
496 flag = splitColumn ( line );
498 if ( !flag )
500 continue;
503 if ( strcmp ( tabs[0], "f1" ) == 0 )
505 index = lib_array[i].num_a1_file++;
506 strcpy ( lib_array[i].a1_fname[index], tabs[1] );
508 else if ( strcmp ( tabs[0], "q1" ) == 0 )
510 index = lib_array[i].num_q1_file++;
511 strcpy ( lib_array[i].q1_fname[index], tabs[1] );
513 else if ( strcmp ( tabs[0], "f2" ) == 0 )
515 index = lib_array[i].num_a2_file++;
516 strcpy ( lib_array[i].a2_fname[index], tabs[1] );
518 if ( strcmp ( lib_array[i].a2_fname[index], lib_array[i].a1_fname[index] ) == 0 )
520 fprintf ( stderr, "Config file error: f2 file is the same as f1 file\n" );
521 fprintf ( stderr, "f1=%s\n", lib_array[i].a1_fname[index] );
522 fprintf ( stderr, "f2=%s\n", lib_array[i].a2_fname[index] );
523 exit ( -1 );
526 else if ( strcmp ( tabs[0], "q2" ) == 0 )
528 index = lib_array[i].num_q2_file++;
529 strcpy ( lib_array[i].q2_fname[index], tabs[1] );
531 if ( strcmp ( lib_array[i].q2_fname[index], lib_array[i].q1_fname[index] ) == 0 )
533 fprintf ( stderr, "Config file error: q2 file is the same as q1 file\n" );
534 fprintf ( stderr, "q1=%s\n", lib_array[i].q1_fname[index] );
535 fprintf ( stderr, "q2=%s\n", lib_array[i].q2_fname[index] );
536 exit ( -1 );
539 else if ( strcmp ( tabs[0], "f" ) == 0 )
541 index = lib_array[i].num_s_a_file++;
542 strcpy ( lib_array[i].s_a_fname[index], tabs[1] );
544 else if ( strcmp ( tabs[0], "q" ) == 0 )
546 index = lib_array[i].num_s_q_file++;
547 strcpy ( lib_array[i].s_q_fname[index], tabs[1] );
549 else if ( strcmp ( tabs[0], "p" ) == 0 )
551 index = lib_array[i].num_p_file++;
552 strcpy ( lib_array[i].p_fname[index], tabs[1] );
554 else if ( strcmp ( tabs[0], "b" ) == 0 )
556 //bam file
557 index = lib_array[i].num_b_file++;
558 strcpy ( lib_array[i].b_fname[index], tabs[1] );
560 else if ( strcmp ( tabs[0], "min_ins" ) == 0 )
562 lib_array[i].min_ins = atoi ( tabs[1] );
564 else if ( strcmp ( tabs[0], "max_ins" ) == 0 )
566 lib_array[i].max_ins = atoi ( tabs[1] );
568 else if ( strcmp ( tabs[0], "avg_ins" ) == 0 )
570 lib_array[i].avg_ins = atoi ( tabs[1] );
572 else if ( strcmp ( tabs[0], "sd" ) == 0)
574 lib_array[i].sd = atoi(tabs[1]);
576 else if( strcmp ( tabs[0], "qual_cutoff" ) == 0 )
578 lib_array[i].qual = atoi(tabs[1]);
580 else if ( strcmp ( tabs[0], "rd_len_cutoff" ) == 0 )
582 lib_array[i].rd_len_cutoff = atoi ( tabs[1] );
584 else if ( strcmp ( tabs[0], "reverse_seq" ) == 0 )
586 lib_array[i].reverse = atoi ( tabs[1] );
588 else if ( strcmp ( tabs[0], "asm_flags" ) == 0 )
590 lib_array[i].asm_flag = atoi ( tabs[1] );
592 else if ( strcmp ( tabs[0], "rank" ) == 0 )
594 lib_array[i].rank = atoi ( tabs[1] );
596 else if ( strcmp ( tabs[0], "pair_num_cutoff" ) == 0 )
598 lib_array[i].pair_num_cut = atoi ( tabs[1] );
600 else if ( strcmp ( tabs[0], "rd_len_cutoff" ) == 0 )
602 lib_array[i].rd_len_cutoff = atoi ( tabs[1] );
604 else if ( strcmp ( tabs[0], "map_len" ) == 0 )
606 lib_array[i].map_len = atoi ( tabs[1] );
610 for ( i = 0; i < num_libs; i++ )
612 if ( pe[i] && lib_array[i].avg_ins == 0 )
614 fprintf ( stderr, "Config file error: PE reads need avg_ins in [LIB] %d\n", i + 1 );
615 exit ( -1 );
618 //check sd.
619 if( pe[i] && lib_array[i].sd == 0)
621 if(lib_array[i].max_ins > 0 || lib_array[i].min_ins > 0)
623 if( lib_array[i].max_ins > 0 && lib_array[i].min_ins > 0 )
624 lib_array[i].sd = lib_array[i].max_ins - lib_array[i].avg_ins > lib_array[i].min_ins - lib_array[i].avg_ins ? (lib_array[i].max_ins - lib_array[i].avg_ins)/3 : (lib_array[i].min_ins - lib_array[i].avg_ins)/3;
625 else if( lib_array[i].max_ins > 0 )
626 lib_array[i].sd = (lib_array[i].max_ins - lib_array[i].avg_ins)/3;
627 else
628 lib_array[i].sd = (lib_array[i].min_ins - lib_array[i].avg_ins)/3;
629 }else{
630 lib_array[i].sd = lib_array[i].avg_ins * 0.05;
636 fclose ( fp );
637 qsort ( &lib_array[0], num_libs, sizeof ( LIB_INFO ), cmp_lib );
640 void free_libs (LIB_INFO* lib_array, int num_libs)
642 if ( !lib_array )
644 return;
647 int i, j;
648 fprintf ( stderr, "LIB(s) information:\n" );
650 for ( i = 0; i < num_libs; i++ )
652 fprintf ( stderr, " [LIB] %d, avg_ins %d, reverse %d.\n", i, lib_array[i].avg_ins, lib_array[i].reverse );
654 if ( lib_array[i].num_s_a_file )
656 //printf("%d single fasta files\n",lib_array[i].num_s_a_file);
657 for ( j = 0; j < lib_array[i].num_s_a_file; j++ )
659 free ( ( void * ) lib_array[i].s_a_fname[j] );
662 free ( ( void * ) lib_array[i].s_a_fname );
665 if ( lib_array[i].num_s_q_file )
667 //printf("%d single fastq files\n",lib_array[i].num_s_q_file);
668 for ( j = 0; j < lib_array[i].num_s_q_file; j++ )
670 free ( ( void * ) lib_array[i].s_q_fname[j] );
673 free ( ( void * ) lib_array[i].s_q_fname );
676 if ( lib_array[i].num_p_file )
678 //printf("%d paired fasta files\n",lib_array[i].num_p_file);
679 for ( j = 0; j < lib_array[i].num_p_file; j++ )
681 free ( ( void * ) lib_array[i].p_fname[j] );
684 free ( ( void * ) lib_array[i].p_fname );
687 if ( lib_array[i].num_a1_file )
689 //printf("%d read1 fasta files\n",lib_array[i].num_a1_file);
690 for ( j = 0; j < lib_array[i].num_a1_file; j++ )
692 free ( ( void * ) lib_array[i].a1_fname[j] );
695 free ( ( void * ) lib_array[i].a1_fname );
698 if ( lib_array[i].num_a2_file )
700 //printf("%d read2 fasta files\n",lib_array[i].num_a2_file);
701 for ( j = 0; j < lib_array[i].num_a2_file; j++ )
703 free ( ( void * ) lib_array[i].a2_fname[j] );
706 free ( ( void * ) lib_array[i].a2_fname );
709 if ( lib_array[i].num_q1_file )
711 //printf("%d read1 fastq files\n",lib_array[i].num_q1_file);
712 for ( j = 0; j < lib_array[i].num_q1_file; j++ )
714 free ( ( void * ) lib_array[i].q1_fname[j] );
717 free ( ( void * ) lib_array[i].q1_fname );
720 if ( lib_array[i].num_q2_file )
722 //printf("%d read2 fastq files\n",lib_array[i].num_q2_file);
723 for ( j = 0; j < lib_array[i].num_q2_file; j++ )
725 free ( ( void * ) lib_array[i].q2_fname[j] );
728 free ( ( void * ) lib_array[i].q2_fname );
731 if ( lib_array[i].num_b_file )
733 //free the bam file name
734 //printf("%d bam files\n",lib_array[i].num_b_file);
735 for ( j = 0; j < lib_array[i].num_b_file; j++ )
736 { free ( ( void * ) lib_array[i].b_fname[j] ); }
738 free ( ( void * ) lib_array[i].b_fname );
742 num_libs = 0;
743 free ( ( void * ) lib_array );
745 //***************** Initialize finished.
747 //***************** for debugging only start
748 void debug_prefix( FILE* file, word prefix ) {
749 for (int i = PREFIX_LENGTH - 1; i >= 0; --i) {
750 int c = prefix % 5;
751 prefix /= 5;
752 fprintf( file, "%c", dna_char[c] );
754 fprintf( file, "^r" );
757 void log(const char* format, ...) { //add const to char* can remove warning.
758 va_list args;
759 va_start( args, format );
760 vfprintf( stderr, format, args );
761 va_end( args );
762 fflush( stderr );
765 // for debugging only
766 void debug_word( FILE* file, word w ) {
767 for (int i = CHARS_PER_WORD - 1; i >= 0; --i) {
768 int c = (w>>(BITS_PER_CHAR*i)) & CHAR_MASK;
769 fprintf( file, "%c", dna_char[c] );
773 // for debugging only
774 void dump_special_prefix(int key) {
775 for (int i = 0; i < SPECIAL_HANDLE_SUFFIX_LENGTH; ++i) {
776 log("%c", dna_char[key / (SPECIAL_HANDLE_SUFFIX_NUM_BUCKETS / 5)]);
777 key = key * 5 % SPECIAL_HANDLE_SUFFIX_NUM_BUCKETS;
779 log("\n");
782 // for debug only
783 void dump_read(word *read_p, int words_per_read, int read_len) {
784 for (int i = 0; i < words_per_read; ++i) {
785 for (int j = 0; j < CHARS_PER_WORD; ++j) {
786 log("%c", dna_char[(read_p[i] >> (CHARS_PER_WORD - 1 - j) * BITS_PER_CHAR) & 3]);
787 if (i * CHARS_PER_WORD + j == read_len - 1) {
788 log("_");
792 log("\n");
795 void debug_suffix( word* start, struct global_data_t *_data ) {
796 struct global_data_t &globals = *_data;
797 for (int j=0; j<globals.words_per_read; ++j) {
798 word w = *(start + j * globals.lv2_suffixes_row_width);
799 for (int i = CHARS_PER_WORD - 1; i >= 0; --i) {
800 int c = (w>>(BITS_PER_CHAR*i)) & CHAR_MASK;
801 fprintf(stderr, "%c", dna_char[c]);
805 //***************** for debugging only end
807 inline void* MallocAndCheck(size_t size_in_byte,
808 const char *malloc_from_which_file = __FILE__,
809 int malloc_from_which_line = __LINE__) {
810 void *ptr = malloc(size_in_byte);
811 if (ptr == NULL && size_in_byte != 0) {
812 log( "[ERROR] Ran out of memory while applying %llubytes\n", (unsigned long long)size_in_byte);
813 log( "In file: %s, line %d\n", malloc_from_which_file, malloc_from_which_line);
814 log( "There may be errors as follows:\n");
815 log( "1) Not enough memory.\n");
816 log( "2) The ARRAY may be overrode.\n");
817 log( "3) The wild pointers.\n");
818 exit(-1);
821 return ptr;
824 inline bool is_valid_special_prefix(int prefix_key) {
825 // TODO hard
826 if (prefix_key % 5 != 4) { return false; }
827 for (int i = 0, start = 0; i < SPECIAL_HANDLE_SUFFIX_LENGTH; ++i) {
828 if (prefix_key % 5 != 4) {
829 start = 1;
830 } else if (start) {
831 return false;
833 prefix_key /= 5;
835 return prefix_key == 0;
838 void* preprocess_fill_buckets( void *_data ) {
839 struct section_data_t &locals = *( (struct section_data_t*)_data );
840 struct global_data_t &globals = *(locals.parent);
841 int section_id = locals.section_id;
842 int *buckets = globals.section_buckets[ section_id ];
844 int read_len = globals.read_len;
845 int words_per_read = globals.words_per_read;
846 int w_shift = BITS_PER_WORD - BITS_PER_CHAR;
848 for ( int64 r = locals.section_start; r < locals.section_end; ++r ) {
849 word *w_p = globals.packed_reads + ( words_per_read * r );
850 word key = 0;
851 word w = *(w_p++);
852 for ( int i = 0; i < PREFIX_LENGTH - 1; ++i ) {
853 key = key * 5 + ( w >> w_shift ); // replaced by this
854 w <<= BITS_PER_CHAR;
856 for ( int i = PREFIX_LENGTH - 1; i < read_len; ++i ) {
857 if ( i % CHARS_PER_WORD == 0 ) // TODO optimize
858 w = *(w_p++);
859 key = ( key * 5 + ( w >> w_shift ) ) % NUM_BUCKETS; // replaced by this
860 w <<= BITS_PER_CHAR;
861 buckets[key]++;
863 int special_key = (key * 5 + SENTINEL_INT) % (SPECIAL_HANDLE_SUFFIX_NUM_BUCKETS * 5);
865 for ( int i = PREFIX_LENGTH - 1; i >= SPECIAL_HANDLE_SUFFIX_LENGTH; --i ) {
866 key = ( key * 5 + SENTINEL_INT ) % NUM_BUCKETS; // replaced by this
867 buckets[key]++;
870 // special handle buckets
871 for ( int i = 0; i < SPECIAL_HANDLE_SUFFIX_LENGTH; ++i ) {
872 int index = special_key % SPECIAL_HANDLE_SUFFIX_NUM_BUCKETS;
873 assert(is_valid_special_prefix(index));
874 assert(special_key / SPECIAL_HANDLE_SUFFIX_NUM_BUCKETS != 4);
875 locals.bwt_buffer[index].push_back(dna_char[ special_key / SPECIAL_HANDLE_SUFFIX_NUM_BUCKETS ]);
876 special_key = (special_key * 5 + SENTINEL_INT) % (SPECIAL_HANDLE_SUFFIX_NUM_BUCKETS * 5);
880 return NULL;
883 void* lv1_fetch_offsets( void *_data) {
884 struct section_data_t &locals = *( (struct section_data_t*)_data );
885 struct global_data_t &globals = *(locals.parent);
887 const int bucket_start = globals.lv1_bucket_start;
888 const int bucket_end = globals.lv1_bucket_end;
890 const int section_start = locals.section_start;
891 const int section_end = locals.section_end;
893 int read_len = globals.read_len;
894 int words_per_read = globals.words_per_read;
895 int w_shift = BITS_PER_WORD - BITS_PER_CHAR;
896 int64 *offset_pos = locals.offset_pos;
898 int id_shift = globals.offset_num_bits;
899 int64 *recent_offsets = (int64*) malloc( NUM_BUCKETS * sizeof( int64 ) ); // for calculating differences
900 int64 recent_base = ( locals.section_start << id_shift ); // offset of first read first char
901 for ( int b = bucket_start; b < bucket_end; ++b ) {
902 recent_offsets[b] = recent_base;
904 locals.offset_diff_base = recent_base;
906 // scan all reads and record offsets
907 for ( int64 r = section_start; r < section_end; ++r ) {
908 word *w_p = globals.packed_reads + ( words_per_read * r );
909 word key = 0;
910 word w = *(w_p++);
911 // int prev_char = SENTINEL_INT;
912 for ( int i = 0; i < PREFIX_LENGTH - 1; ++i ) {
913 // key = ((key<<BITS_PER_CHAR) | (w>>w_shift));
914 key = key * 5 + ( w >> w_shift ); // replaced by this
915 w <<= BITS_PER_CHAR;
917 for ( int i = PREFIX_LENGTH - 1; i < read_len; ++i ) {
918 if ( i % CHARS_PER_WORD == 0 ) // TODO optimize
919 w = *(w_p++);
920 // key = (((key<<BITS_PER_CHAR)&PREFIX_MASK)|(w>>w_shift));
921 key = ( key * 5 + ( w >> w_shift ) ) % NUM_BUCKETS; // replaced by this
922 w <<= BITS_PER_CHAR;
923 // check if key within range
924 // TODO 0x80000000 hardcode
925 // this condition is equivalent to bucket_start <= key < bucket_end
926 if ( (( key - bucket_start ) ^ ( key - bucket_end )) & 0x80000000 ) {
927 // read ID = r, within-read offset = i - PREFIX_LENGTH + 1
928 int64 true_offset = ( r << id_shift ) | ( i - PREFIX_LENGTH + 1 );
929 int64 diff_offset = true_offset - recent_offsets[ key ];
931 if ( diff_offset < 0 ) { // TODO remove this assertion
932 log("ERROR: offset diff negative (%lld %lld)! ABORT.\n", recent_offsets[key], true_offset);
933 exit(1);
935 if ( diff_offset > 2147483647 ) {
936 pthread_spin_lock(&globals.large_diff_lock);
937 diff_offset = -1 - (int64)globals.v_large_diff.size();
938 globals.v_large_diff.push_back(true_offset);
939 pthread_spin_unlock(&globals.large_diff_lock);
941 globals.lv1_offsets[ offset_pos[key]++ ] = diff_offset;
942 recent_offsets[ key ] = true_offset;
945 for ( int i = PREFIX_LENGTH - 1; i >= SPECIAL_HANDLE_SUFFIX_LENGTH; --i ) {
946 key = ( key * 5 + SENTINEL_INT ) % NUM_BUCKETS; // replaced by this
947 // check if key within range
948 // TODO 0x80000000 hardcode
949 // this condition is equivalent to bucket_start <= key < bucket_end
950 if ( (( key - bucket_start ) ^ ( key - bucket_end )) & 0x80000000 ) {
951 // read ID = r, within-read offset = read_len - i
952 int64 true_offset = ( r << id_shift ) | ( read_len - i );
953 int64 diff_offset = true_offset - recent_offsets[ key ];
955 if ( diff_offset < 0 ) { // TODO remove this assertion
956 log("ERROR: offset diff negative (%lld %lld)! ABORT.\n", recent_offsets[key], true_offset);
957 exit(1);
959 if ( diff_offset > 2147483647 ) {
960 pthread_spin_lock(&globals.large_diff_lock);
961 diff_offset = -1 - (int64)globals.v_large_diff.size();
962 globals.v_large_diff.push_back(true_offset);
963 pthread_spin_unlock(&globals.large_diff_lock);
965 globals.lv1_offsets[ offset_pos[key]++ ] = diff_offset;
966 recent_offsets[ key ] = true_offset;
972 free( recent_offsets );
974 return NULL;
977 void* lv2_fetch_suffixes( void *_data ){
978 struct suffix_fetch_data_t &fetch = *( (struct suffix_fetch_data_t*)_data );
979 struct global_data_t &globals = *(fetch.parent);
981 int read_len = globals.read_len;
982 int offset_num_bits = 1;
983 while ((1 << offset_num_bits) - 1 < read_len) {
984 ++offset_num_bits;
986 int words_per_read = globals.words_per_read;
987 int words_per_suffix = globals.words_per_suffix;
988 const int bucket_start = fetch.bucket_start;
989 const int bucket_end = fetch.bucket_end;
990 word *packed_reads = globals.packed_reads;
992 int id_shift = globals.offset_num_bits; // offsets are in the form [read_id][read_offset]; id_shift = # bits for read_offset
993 int read_offset_mask = globals.offset_mask;
995 word *suffix_p = globals.lv2_suffixes + fetch.index_start;
996 int num_suffixes = fetch.index_start;
998 int *offset_p = globals.lv1_offsets + globals.lv2_offset_start_index + fetch.index_start;
999 for ( int b = bucket_start; b < bucket_end; ++b ) {
1001 for ( int section = 0; section < globals.num_cpu_threads; ++section ) {
1003 int64 offset = globals.locals[section].offset_diff_base; // reset per local bucket
1004 int64 num_in_section_bucket = globals.section_buckets[section][b];
1005 while ( num_in_section_bucket-- ) {
1006 if (*offset_p < 0) {
1007 offset = globals.v_large_diff[-(*offset_p++) - 1];
1008 } else {
1009 offset += *(offset_p++); // remember we are using difference encoding
1011 // offset in the form [read_id][read_offset]
1012 int64 read_id = offset >> id_shift;
1013 int read_offset = offset & read_offset_mask;
1014 word *read_p = packed_reads + ( words_per_read * read_id );
1016 // find previous character (the char to be stored in BWT)
1017 int prev_char;
1018 if ( !read_offset ) { // special case
1019 prev_char = SENTINEL_INT;
1020 globals.lv2_read_id[ num_suffixes ] = read_id;
1021 } else {
1022 int prev_which_word = ( read_offset - 1 ) / CHARS_PER_WORD;
1023 int prev_word_offset = ( read_offset - 1 ) % CHARS_PER_WORD;
1024 word w = *(read_p + prev_which_word);
1025 prev_char = ( w >> ( BITS_PER_WORD - BITS_PER_CHAR - prev_word_offset * 2 ) ) & CHAR_MASK;
1028 // copy words of the suffix to the suffix pool
1029 int which_word = read_offset / CHARS_PER_WORD;
1030 int word_offset = read_offset % CHARS_PER_WORD;
1031 word *dest = suffix_p;
1032 word *src = read_p + which_word;
1033 if ( !word_offset ) { // special case (word aligned), easy
1034 while ( which_word < words_per_read ) {
1035 *dest = *src; // write out
1036 dest += globals.lv2_suffixes_row_width;
1037 src++;
1038 which_word++;
1040 } else { // not word-aligned
1041 int bit_shift = read_offset * 2;
1042 word s = *src;
1043 word d = s << bit_shift;
1044 which_word++;
1045 while ( which_word < words_per_read ) {
1046 s = *(++src);
1047 d |= s >> (BITS_PER_WORD - bit_shift);
1048 *dest = d; // write out
1049 dest += globals.lv2_suffixes_row_width;
1050 d = s << bit_shift;
1051 which_word++;
1053 // write last word
1054 *dest = d | ( 0xFFFFFFFF >> ( BITS_PER_WORD - bit_shift ) ); // bit padding by '1's. TODO hard
1056 *(suffix_p + (globals.lv2_suffixes_row_width * (words_per_suffix - 1) ) ) &= ~globals.offset_mask; // TODO hard
1057 *(suffix_p + (globals.lv2_suffixes_row_width * (words_per_suffix - 1) ) ) |= read_offset;
1058 suffix_p++;
1059 globals.sort_indexes[ num_suffixes ] = ( prev_char << 29 ) | num_suffixes; // TODO hard
1060 num_suffixes++;
1065 return NULL;
1068 int get_base5_prefix(word item) {
1069 item >>= (CHARS_PER_WORD - SPECIAL_HANDLE_SUFFIX_LENGTH) * BITS_PER_CHAR;
1070 static int base4_mask = (1 << SPECIAL_HANDLE_SUFFIX_LENGTH * BITS_PER_CHAR) - 1;
1071 int ret = 0;
1072 for (int i = 0; i < SPECIAL_HANDLE_SUFFIX_LENGTH; ++i) {
1073 ret = ret * 5 + (item >> (SPECIAL_HANDLE_SUFFIX_LENGTH - 1) * BITS_PER_CHAR);
1074 item = item * 4 & base4_mask;
1076 return ret;
1079 inline unsigned int mirror(unsigned int v) {
1080 // swap consecutive pairs
1081 v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2);
1082 // swap nibbles ...
1083 v = ((v >> 4) & 0x0F0F0F0F) | ((v & 0x0F0F0F0F) << 4);
1084 // swap bytes
1085 v = ((v >> 8) & 0x00FF00FF) | ((v & 0x00FF00FF) << 8);
1086 // swap 2-byte long pairs
1087 v = ( v >> 16 ) | ( v << 16);
1088 return v;
1091 inline void reverse_one_read(word *read_p, int words_per_read, int last_shift) {
1092 // shift all words
1093 if (last_shift > 0) {
1094 for (int i = words_per_read - 1; i > 0; --i) {
1095 read_p[i] >>= last_shift;
1096 read_p[i] |= read_p[i - 1] << (BITS_PER_WORD - last_shift);
1098 read_p[0] >>= last_shift;
1099 read_p[0] |= -(1 << (BITS_PER_WORD - last_shift));
1101 // mirror all words
1102 for (int i = 0; i < words_per_read; ++i) {
1103 read_p[i] = mirror(read_p[i]);
1107 void reverse_all_reads(global_data_t &globals) {
1108 omp_set_num_threads(globals.num_cpu_threads);
1109 int last_shift = globals.read_len * BITS_PER_CHAR % BITS_PER_WORD;
1110 if (last_shift != 0) last_shift = BITS_PER_WORD - last_shift;
1112 #pragma omp for
1113 for (int64 i = 0; i < globals.num_reads; ++i) {
1114 reverse_one_read(globals.packed_reads + globals.words_per_read * i, globals.words_per_read, last_shift);
1117 int64 mid = globals.num_reads / 2;
1119 #pragma omp for
1120 for (int64 i = 0; i < mid; ++i) {
1121 int64 i2 = globals.num_reads - 1 - i;
1122 for (int j = 0; j < globals.words_per_read; ++j) {
1123 swap(globals.packed_reads[i * globals.words_per_read + j], globals.packed_reads[(i2 + 1) * globals.words_per_read - 1 - j]);
1127 if (globals.num_reads % 2 == 1) {
1128 for (int j = 0; j < globals.words_per_read / 2; ++j) {
1129 swap(globals.packed_reads[mid * globals.words_per_read + j], globals.packed_reads[(mid + 1) * globals.words_per_read - 1 - j]);
1134 void init_global_data(global_data_t &globals, int read_len, int64 max_host_mem, int number_cpu_threads) {
1135 globals.read_len = read_len;
1136 globals.words_per_read = ( read_len + ( CHARS_PER_WORD - 1 ) ) / CHARS_PER_WORD;
1137 globals.num_reads = 0;
1138 globals.max_num_reads = max_host_mem * 0.8 / (globals.words_per_read * sizeof(word));
1139 globals.num_cpu_threads = number_cpu_threads;
1140 globals.max_lv2_entries = 0;
1142 globals.offset_num_bits = 1;
1143 while ((1 << globals.offset_num_bits) - 1 < read_len) {
1144 ++globals.offset_num_bits;
1146 globals.offset_mask = (1 << globals.offset_num_bits) - 1;
1147 globals.words_per_suffix = (read_len * BITS_PER_CHAR + globals.offset_num_bits + BITS_PER_WORD - 1) / BITS_PER_WORD;
1149 log("Max host mem: %lld, max number of reads can be loaded: %lld\n", max_host_mem, globals.max_num_reads);
1150 log("Words per read: %d, words per suffix: %d\n", globals.words_per_read, globals.words_per_suffix);
1153 void set_global_bucket(global_data_t &globals)
1155 for ( int t = 0; t < globals.num_cpu_threads; ++t ) {
1156 globals.locals[t].parent = &globals;
1157 globals.locals[t].section_id = t;
1159 for ( int t = 0; t < globals.num_cpu_threads - 1; ++t ) {
1160 globals.suffix_fetches[t].parent = &globals;
1161 globals.suffix_fetches[t].thread_id = t;
1164 globals.buckets = NULL;
1165 for ( int t = 0; t < globals.num_cpu_threads; ++t ) {
1166 globals.section_buckets[t] = NULL;
1167 globals.locals[t].offset_pos = NULL;
1170 // realloc
1171 int64 mem_packed_reads = ( globals.num_reads + 32 ) * globals.words_per_read * sizeof(word); // +32 for safety
1172 globals.packed_reads = (word*) realloc( globals.packed_reads, mem_packed_reads );
1173 if ( ! globals.packed_reads ) {
1174 fprintf( stderr, "realloc error, quit\n" );
1175 exit(1);
1177 log("Mem allocated for packed reads: %lld bytes\n", mem_packed_reads );
1179 { // distribute reads among threads
1180 int64 each = globals.num_reads / globals.num_cpu_threads;
1181 for ( int t = 0; t < globals.num_cpu_threads - 1; ++t ) {
1182 globals.locals[t].section_start = t * each;
1183 globals.locals[t].section_end = (t + 1) * each;
1185 globals.locals[ globals.num_cpu_threads - 1 ].section_start = (globals.num_cpu_threads-1) * each;
1186 globals.locals[ globals.num_cpu_threads - 1 ].section_end = globals.num_reads;
1191 void set_global_after_reading_initial(global_data_t &globals, int64 max_host_mem, int64 free_gpu_mem) {
1192 for ( int t = 0; t < globals.num_cpu_threads; ++t ) {
1193 globals.locals[t].parent = &globals;
1194 globals.locals[t].section_id = t;
1196 for ( int t = 0; t < globals.num_cpu_threads - 1; ++t ) {
1197 globals.suffix_fetches[t].parent = &globals;
1198 globals.suffix_fetches[t].thread_id = t;
1201 globals.buckets = NULL;
1202 for ( int t = 0; t < globals.num_cpu_threads; ++t ) {
1203 globals.section_buckets[t] = NULL;
1204 globals.locals[t].offset_pos = NULL;
1207 // realloc
1208 int64 mem_packed_reads = ( globals.num_reads + 32 ) * globals.words_per_read * sizeof(word); // +32 for safety
1209 globals.packed_reads = (word*) realloc( globals.packed_reads, mem_packed_reads );
1210 if ( ! globals.packed_reads ) {
1211 fprintf( stderr, "realloc error, quit\n" );
1212 exit(1);
1214 log("Mem allocated for packed reads: %lld bytes\n", mem_packed_reads );
1216 { // distribute reads among threads
1217 int64 each = globals.num_reads / globals.num_cpu_threads;
1218 for ( int t = 0; t < globals.num_cpu_threads - 1; ++t ) {
1219 globals.locals[t].section_start = t * each;
1220 globals.locals[t].section_end = (t + 1) * each;
1222 globals.locals[ globals.num_cpu_threads - 1 ].section_start = (globals.num_cpu_threads-1) * each;
1223 globals.locals[ globals.num_cpu_threads - 1 ].section_end = globals.num_reads;
1226 // preprocessing - compute memory limits
1228 free_gpu_mem -= 1073741824; // remain 1G for b40c
1229 const int GPU_BYTES_PER_ENTRY = 16; // key, value, and x2 for radix sort buffer
1230 int64 max_lv2_entries = free_gpu_mem / GPU_BYTES_PER_ENTRY;
1232 if (max_lv2_entries >= (1 << 29)) {
1233 max_lv2_entries = (1 << 29) - 1; // TODO hard
1235 globals.max_lv2_entries = max_lv2_entries;
1236 log( "Free GPU MEM = %lld\n", free_gpu_mem);
1238 int64 lv2_bytes_per_entry = ( globals.words_per_suffix + 1 + 1) * sizeof(word); // read_id_table, permutation
1239 int64 mem_lv2 = lv2_bytes_per_entry * max_lv2_entries * 2; // *2 for double buffering
1241 log( "max host mem: %lld, mem_packed_reads %lld, mem_lv2 %lld, global.num_reads %lld", max_host_mem, mem_packed_reads, mem_lv2, globals.num_reads * sizeof(char) * SPECIAL_HANDLE_SUFFIX_LENGTH * 2);
1242 int64 avail_host_mem = max_host_mem - mem_packed_reads - mem_lv2 - globals.num_reads * sizeof(char) * SPECIAL_HANDLE_SUFFIX_LENGTH * 2;
1244 if(avail_host_mem < 1024*1024)
1246 log( "Please set max_host_mem larger than: %lld\n", mem_packed_reads + mem_lv2 + globals.num_reads * sizeof(char) * SPECIAL_HANDLE_SUFFIX_LENGTH * 2 + 1024*1024);
1247 exit(2);
1249 const int LV1_BYTES_PER_ENTRY = 4; // 32-bit adjacent-difference representation
1250 int64 max_lv1_entries = avail_host_mem / LV1_BYTES_PER_ENTRY;
1251 globals.max_lv1_entries = max_lv1_entries;
1253 log( "Avail host MEM = %lld\n", avail_host_mem );
1254 log( "Max # of lv.1 entries = %lld\n", max_lv1_entries );
1255 log( "Max # of lv.2 entries = %lld\n", max_lv2_entries );
1256 log( "\n" );
1258 #ifdef DISABLE_GPU
1259 globals.cpu_sort_space = (uint64_t*) MallocAndCheck(sizeof(uint64_t) * globals.max_lv2_entries, __FILE__, __LINE__); // as CPU memory is used to simulate GPU
1260 assert(globals.cpu_sort_space != NULL);
1261 #endif
1263 // allocate memory for lv.1 and lv.2 arrays
1264 globals.lv1_offsets = (int*) malloc( max_lv1_entries * sizeof(int) );
1265 globals.lv2_suffixes = (word*) malloc( max_lv2_entries * globals.words_per_suffix * sizeof(word) );
1266 globals.lv2_suffixes_row_width = max_lv2_entries;
1267 globals.lv2_read_id = (unsigned int*) malloc( max_lv2_entries * sizeof(unsigned int) );
1268 globals.sort_indexes = (word*) malloc( max_lv2_entries * sizeof(word) );
1270 globals.lv2_suffixes_to_output = (word*) malloc( max_lv2_entries * globals.words_per_suffix * sizeof(word) );
1271 globals.lv2_read_id_to_output = (unsigned int*) malloc( max_lv2_entries * sizeof(unsigned int) );
1272 globals.sort_indexes_to_output = (word*) malloc( max_lv2_entries * sizeof(word) );
1274 pthread_spin_init(&globals.large_diff_lock, 0);
1277 void set_global_after_lv0(global_data_t &globals, int64 max_host_mem, int64 free_gpu_mem, uint64_t max_bucket_size)
1279 int64 max_lv2_entries;
1281 #ifdef DISABLE_GPU
1282 if(max_bucket_size*1.2 < 2*1024*1024)
1283 max_lv2_entries = 2*1024*1024;
1284 else
1285 max_lv2_entries = max_bucket_size * 1.2;
1287 globals.max_lv2_entries = max_lv2_entries;
1288 globals.cpu_sort_space = (uint64_t*) MallocAndCheck(sizeof(uint64_t) * globals.max_lv2_entries, __FILE__, __LINE__); // as CPU memory is used to simulate GPU
1289 assert(globals.cpu_sort_space != NULL);
1290 #else
1291 const int GPU_BYTES_PER_ENTRY = 16; // key, value, and x2 for radix sort buffer
1292 max_lv2_entries = (free_gpu_mem - 1073741824) / GPU_BYTES_PER_ENTRY;
1293 if (max_lv2_entries >= (1 << 29)) {
1294 max_lv2_entries = (1 << 29) - 1;
1296 if(max_lv2_entries < (int64_t)max_bucket_size)
1298 log ("Too many items for GPU sorting %lld > %lld! Please try larger GPU memory or use CPU version instead.", (int64_t)max_bucket_size, max_lv2_entries);
1299 exit(1);
1301 globals.max_lv2_entries = max_lv2_entries;
1302 #endif
1303 int64 lv2_bytes_per_entry = ( globals.words_per_suffix + 1 + 1) * sizeof(word); // read_id_table, permutation
1304 int64 mem_lv2 = lv2_bytes_per_entry * max_lv2_entries * 2; // *2 for double buffering
1306 int64 mem_packed_reads = ( globals.num_reads + 32 ) * globals.words_per_read * sizeof(word); // +32 for safety
1308 log( "max host mem: %lld, mem_packed_reads %lld, mem_lv2 %lld, global.num_reads %lld", max_host_mem, mem_packed_reads, mem_lv2, globals.num_reads * sizeof(char) * SPECIAL_HANDLE_SUFFIX_LENGTH * 2);
1309 int64 avail_host_mem = max_host_mem - mem_packed_reads - mem_lv2 - globals.num_reads * sizeof(char) * SPECIAL_HANDLE_SUFFIX_LENGTH * 2;
1311 if(avail_host_mem < 1024*1024)
1313 log( "Please set max_host_mem larger than: %lld\n", mem_packed_reads + mem_lv2 + globals.num_reads * sizeof(char) * SPECIAL_HANDLE_SUFFIX_LENGTH * 2 + 1024*1024);
1314 exit(2);
1317 const int LV1_BYTES_PER_ENTRY = 4; // 32-bit adjacent-difference representation
1318 int64 max_lv1_entries = avail_host_mem / LV1_BYTES_PER_ENTRY;
1319 globals.max_lv1_entries = max_lv1_entries;
1321 log( "Avail host MEM = %lld\n", avail_host_mem );
1322 log( "Max # of lv.1 entries = %lld\n", max_lv1_entries );
1323 log( "Max # of lv.2 entries = %lld\n", max_lv2_entries );
1324 log( "\n" );
1326 globals.lv1_offsets = (int*) malloc( max_lv1_entries * sizeof(int) );
1327 globals.lv2_suffixes = (word*) malloc( max_lv2_entries * globals.words_per_suffix * sizeof(word) );
1328 globals.lv2_suffixes_row_width = max_lv2_entries;
1329 globals.lv2_read_id = (unsigned int*) malloc( max_lv2_entries * sizeof(unsigned int) );
1330 globals.sort_indexes = (word*) malloc( max_lv2_entries * sizeof(word) );
1332 globals.lv2_suffixes_to_output = (word*) malloc( max_lv2_entries * globals.words_per_suffix * sizeof(word) );
1333 globals.lv2_read_id_to_output = (unsigned int*) malloc( max_lv2_entries * sizeof(unsigned int) );
1334 globals.sort_indexes_to_output = (word*) malloc( max_lv2_entries * sizeof(word) );
1336 pthread_spin_init(&globals.large_diff_lock, 0);
1339 uint64_t lv0_fill_buckets(global_data_t &globals) {
1340 // preprocessing - fill buckets
1341 if (globals.buckets == NULL)
1342 globals.buckets = (int*) malloc( NUM_BUCKETS * sizeof(int) );
1343 for ( int t = 0; t < globals.num_cpu_threads; ++t ) {
1344 if (globals.section_buckets[t] == NULL)
1345 globals.section_buckets[t] = (int*) malloc( NUM_BUCKETS * sizeof(int) );
1346 memset(globals.section_buckets[t], 0, sizeof(int) * NUM_BUCKETS);
1348 log( "Preprocessing: filling buckets... " );
1349 for ( int t = 1; t < globals.num_cpu_threads; ++t ) {
1350 pthread_create( &globals.threads[t], NULL, preprocess_fill_buckets, &globals.locals[t] );
1352 preprocess_fill_buckets( &globals.locals[0] );
1353 for ( int t = 1; t < globals.num_cpu_threads; ++t ) {
1354 pthread_join( globals.threads[t], NULL );
1356 log( " Done\n" );
1358 // sum up local buckets
1359 memset( globals.buckets, 0, NUM_BUCKETS * sizeof(int) );
1360 for ( int t = 0; t < globals.num_cpu_threads; ++t ) {
1361 for ( int b = 0; b < NUM_BUCKETS; ++b )
1362 globals.buckets[b] += globals.section_buckets[t][b];
1363 if (globals.locals[t].offset_pos == NULL)
1364 globals.locals[t].offset_pos = (int64*) malloc( NUM_BUCKETS * sizeof( int64 ) );
1365 memset(globals.locals[t].offset_pos, 0, sizeof(int64) * NUM_BUCKETS);
1368 int64_t max_buckets=0;
1369 for (int i = 0; i < NUM_BUCKETS; ++i)
1371 if(globals.max_lv2_entries == 0)
1373 if(globals.buckets[i] > max_buckets)
1374 max_buckets = globals.buckets[i];
1375 }else{
1376 if (globals.buckets[i] > globals.max_lv2_entries) {
1377 fprintf(stderr, "Too many items for GPU sorting! Bucket: %d, number of items: %d\n", i, globals.buckets[i]);
1378 exit(1);
1382 if(globals.max_lv2_entries == 0)
1383 log ("Maximum Bucket size: %lld\n", max_buckets);
1385 return max_buckets;
1388 void* lv2_output_thread(void *_data) {
1389 global_data_t &globals = *((global_data_t*)_data);
1390 log( "Writing output... \n" );
1391 for (int i = 0; i < globals.num_lv2_to_output; ++i) {
1392 int c = globals.sort_indexes_to_output[i] >> 29; // TODO hard
1393 int cur_prefix = get_base5_prefix( globals.lv2_suffixes_to_output[globals.sort_indexes_to_output[i] & ((1 << 29) - 1)]);
1394 assert(cur_prefix < SPECIAL_HANDLE_SUFFIX_NUM_BUCKETS);
1395 while (cur_prefix > globals.cur_special_handle_prefix) {
1396 dump_special_prefix(globals.cur_special_handle_prefix);
1397 for ( int t = 0; t < globals.num_cpu_threads; ++t ) {
1398 fwrite( globals.locals[t].bwt_buffer[globals.cur_special_handle_prefix].c_str(), sizeof(char),
1399 globals.locals[t].bwt_buffer[globals.cur_special_handle_prefix].size(), globals.out_file );
1400 globals.locals[t].bwt_buffer[globals.cur_special_handle_prefix].clear();
1403 ++globals.cur_special_handle_prefix;
1404 while (globals.cur_special_handle_prefix < SPECIAL_HANDLE_SUFFIX_NUM_BUCKETS) {
1405 if (is_valid_special_prefix(globals.cur_special_handle_prefix)) {
1406 break;
1408 ++globals.cur_special_handle_prefix;
1411 fprintf(globals.out_file, "%c", dna_char[ c ] );
1412 if ( c == SENTINEL_INT && globals.need_ridt ) {
1413 // TODO offsets are accumulated....
1414 unsigned int id = globals.lv2_read_id_to_output[ globals.sort_indexes_to_output[i] & 0x1FFFFFFF ]; // TODO hard
1415 fwrite( &id, sizeof(unsigned int), 1, globals.map_out_file );
1418 return NULL;
1421 void build_bwt(global_data_t &globals, const char *output_prefix, bool need_ridt = false) {
1423 for (int64 i = globals.num_reads - 10; i < globals.num_reads; ++i) {
1424 dump_read(globals.packed_reads + i * globals.words_per_read, globals.words_per_read, globals.read_len);
1427 pthread_t output_thread;
1428 bool output_started = false;
1430 if(need_ridt)
1431 globals.out_file = fopen64( (string(output_prefix) + ".f.bwt.ascii").c_str(), "w" );
1432 else
1433 globals.out_file = fopen64( (string(output_prefix) + ".r.bwt.ascii").c_str(), "w" );
1434 globals.need_ridt = need_ridt;
1436 if (need_ridt) {
1437 globals.map_out_file = fopen64( (string(output_prefix) + ".ridt").c_str(), "wb" );
1438 fwrite( &globals.num_reads, sizeof(unsigned int), 1, globals.map_out_file );
1440 // level 1 loop - pick consecutive buckets
1441 int lv1_round = 0;
1442 globals.cur_special_handle_prefix = 0;
1443 while (!is_valid_special_prefix(globals.cur_special_handle_prefix)) {
1444 ++globals.cur_special_handle_prefix;
1446 globals.lv1_bucket_start = 0;
1447 while ( globals.lv1_bucket_start < NUM_BUCKETS ) {
1448 lv1_round++;
1449 globals.v_large_diff.clear();
1450 int64 num_lv1 = globals.buckets[ globals.lv1_bucket_start ];
1451 int temp_end = globals.lv1_bucket_start + 1; // remember that 'end' is exclusive
1452 while ( temp_end < NUM_BUCKETS && num_lv1 + globals.buckets[ temp_end ] <= globals.max_lv1_entries ) {
1453 num_lv1 += globals.buckets[ temp_end++ ];
1455 globals.lv1_bucket_end = temp_end; // remember that 'end' is exclusive
1457 log( "Lv.1 Round %d: buckets %d to %d, num items = %lld\n", lv1_round,
1458 globals.lv1_bucket_start, globals.lv1_bucket_end-1, num_lv1 );
1459 // lv 1 bucket start & end found! fetch the offsets now
1461 // compute bucket positions for the offset arrays
1462 { // t = 0
1463 int64 *pos = globals.locals[0].offset_pos;
1464 pos[globals.lv1_bucket_start] = 0;
1465 for ( int i = globals.lv1_bucket_start+1; i < globals.lv1_bucket_end; ++i ) {
1466 pos[i] = pos[i-1] + globals.buckets[i-1];
1470 for ( int t = 1; t < globals.num_cpu_threads; ++t ) { // t > 0
1471 int64 *pos = globals.locals[t].offset_pos + globals.lv1_bucket_start;
1472 int64 *pre_pos = globals.locals[t-1].offset_pos + globals.lv1_bucket_start;
1473 int *pre_buckets = globals.section_buckets[t-1] + globals.lv1_bucket_start;
1474 for ( int i = globals.lv1_bucket_start; i < globals.lv1_bucket_end; ++i ) {
1475 *(pos++) = *(pre_pos++) + *(pre_buckets++);
1479 log( " Fetching offsets... " );
1480 for ( int t = 1; t < globals.num_cpu_threads; ++t ) {
1481 pthread_create( &globals.threads[t], NULL, lv1_fetch_offsets, &globals.locals[t] );
1483 lv1_fetch_offsets( &globals.locals[0] );
1484 for ( int t = 1; t < globals.num_cpu_threads; ++t ) {
1485 pthread_join( globals.threads[t], NULL );
1487 log( "Done. Number of large diff: %lld\n", globals.v_large_diff.size());
1489 // level 2 loop
1490 int lv2_round = 0;
1491 globals.lv2_offset_start_index = 0;
1492 globals.lv2_bucket_start = globals.lv1_bucket_start;
1493 while ( globals.lv2_bucket_start < globals.lv1_bucket_end ) {
1494 lv2_round++;
1495 int64 num_lv2 = globals.buckets[ globals.lv2_bucket_start ];
1496 if (num_lv2 > globals.max_lv2_entries) { // TROUBLESOME CASES
1497 fprintf( stderr, "ERROR: Single bucket too large for lv.2: %lld (", num_lv2 );
1498 debug_prefix( stderr, globals.lv2_bucket_start );
1499 fprintf( stderr, ")\n");
1500 globals.lv2_bucket_start++;
1501 globals.lv2_offset_start_index += num_lv2;
1502 continue;
1504 int temp_end2 = globals.lv2_bucket_start + 1; // remember that 'end' is exclusive
1505 while ( temp_end2 < globals.lv1_bucket_end && num_lv2 + globals.buckets[ temp_end2 ] <= globals.max_lv2_entries ) {
1506 num_lv2 += globals.buckets[ temp_end2++ ];
1508 globals.lv2_bucket_end = temp_end2; // remember that 'end' is exclusive
1510 if (num_lv2 == 0) { // TROUBLESOME CASES
1511 fprintf( stderr, "ERROR: Single bucket too large for lv.2: %d (", globals.buckets[temp_end2] );
1512 debug_prefix( stderr, temp_end2 );
1513 fprintf( stderr, ")\n");
1514 globals.lv2_bucket_start = temp_end2 + 1;
1515 globals.lv2_offset_start_index += globals.buckets[temp_end2];
1516 continue;
1520 log(" Lv.2 Round %d: buckets %d to %d, num items = %lld\n", lv2_round,
1521 globals.lv2_bucket_start, globals.lv2_bucket_end-1, num_lv2 );
1522 // lv 2 bucket start & end found! fetch the suffixes now
1524 // distribute to threads
1525 int64 each = num_lv2 / ( globals.num_cpu_threads - 1 );
1526 each = !each ? 1 : each; // min is 1
1527 int b_start = globals.lv2_bucket_start;
1528 int64 cumulate_num_items = 0;
1529 for (int t = 0; t < (globals.num_cpu_threads-1) - 1; ++t) {
1530 globals.suffix_fetches[t].bucket_start = b_start;
1531 globals.suffix_fetches[t].index_start = cumulate_num_items;
1532 int64 num_items = 0;
1533 while ( num_items < each && b_start < globals.lv2_bucket_end ) {
1534 num_items += globals.buckets[ b_start++ ];
1536 globals.suffix_fetches[t].bucket_end = b_start;
1537 cumulate_num_items += num_items;
1539 globals.suffix_fetches[ (globals.num_cpu_threads-1) - 1 ].bucket_start = b_start;
1540 globals.suffix_fetches[ (globals.num_cpu_threads-1) - 1 ].index_start = cumulate_num_items;
1541 globals.suffix_fetches[ (globals.num_cpu_threads-1) -1 ].bucket_end = globals.lv2_bucket_end;
1543 memset( globals.lv2_suffixes, 0xFF, globals.lv2_suffixes_row_width * globals.words_per_suffix * sizeof(word) ); // important: must fill all '1' bits
1544 log( " -> Fetching suffixes... " );
1546 for ( int t = 0; t < globals.num_cpu_threads-1; ++t ) {
1547 pthread_create( &globals.threads[t], NULL, lv2_fetch_suffixes, &globals.suffix_fetches[t] );
1549 for ( int t = 0; t < globals.num_cpu_threads-1; ++t ) {
1550 pthread_join( globals.threads[t], NULL );
1553 // suffixes fetched! now sort them with GPU
1554 /////////////////////////////////// GPU SORTING ////////////////////////////////
1555 log( "Sorting suffixes with CPU ... " );
1556 #ifdef DISABLE_GPU
1557 omp_set_num_threads(globals.num_cpu_threads - 1);
1558 lv2_cpu_sort(globals.lv2_suffixes, globals.sort_indexes, globals.cpu_sort_space, globals.words_per_suffix, globals.lv2_suffixes_row_width, num_lv2);
1559 omp_set_num_threads(globals.num_cpu_threads);
1560 #else
1561 lv2_gpu_sort(globals.lv2_suffixes, globals.sort_indexes, globals.words_per_suffix, globals.lv2_suffixes_row_width, num_lv2);
1562 #endif
1563 /////// output here ///////
1564 if (output_started) {
1565 pthread_join(output_thread, NULL);
1567 swap(globals.sort_indexes, globals.sort_indexes_to_output);
1568 swap(globals.lv2_suffixes, globals.lv2_suffixes_to_output);
1569 swap(globals.lv2_read_id, globals.lv2_read_id_to_output);
1570 globals.num_lv2_to_output = num_lv2;
1571 pthread_create(&output_thread, NULL, lv2_output_thread, (void*)&globals);
1572 output_started = true;
1575 globals.lv2_bucket_start = globals.lv2_bucket_end;
1576 globals.lv2_offset_start_index += num_lv2;
1577 log( "Round ends.\n" );
1580 globals.lv1_bucket_start = globals.lv1_bucket_end;
1581 log( "Lv.1 Round %d ends\n\n", lv1_round );
1583 // END level 1 loop
1585 if (output_started) {
1586 pthread_join(output_thread, NULL);
1588 // print last BWT segment (those chars corresponding to '$')
1589 while (globals.cur_special_handle_prefix < SPECIAL_HANDLE_SUFFIX_NUM_BUCKETS) {
1590 if (is_valid_special_prefix(globals.cur_special_handle_prefix)) {
1591 dump_special_prefix(globals.cur_special_handle_prefix);
1592 for ( int t = 0; t < globals.num_cpu_threads; ++t ) {
1593 fwrite( globals.locals[t].bwt_buffer[globals.cur_special_handle_prefix].c_str(), sizeof(char),
1594 globals.locals[t].bwt_buffer[globals.cur_special_handle_prefix].size(), globals.out_file );
1595 globals.locals[t].bwt_buffer[globals.cur_special_handle_prefix].clear();
1598 ++globals.cur_special_handle_prefix;
1601 fclose( globals.out_file );
1602 if (need_ridt)
1603 fclose( globals.map_out_file );
1606 void read_input_file( struct global_data_t &globals, char* filename ) {
1607 /////////////////// READ INPUT (FASTQ) /////////////////////
1608 int read_len = globals.read_len;
1609 int words_per_read = globals.words_per_read;
1610 word *packed_reads = NULL;
1611 int64 num_reads = 0;
1613 #define READ_BUFFER_SIZE 4096
1614 gzFile in_file = gzopen( filename, "r" );
1615 int num_bytes = 0;
1616 char buffer[ READ_BUFFER_SIZE + 10 ];
1617 packed_reads = (word*) malloc( globals.max_num_reads * words_per_read * sizeof(word) );
1618 // memset( packed_reads, 0xFF, globals.max_num_reads * words_per_read * sizeof(word) );
1619 word *packed_reads_p = packed_reads;
1620 int last_shift = read_len % CHARS_PER_WORD ? ( CHARS_PER_WORD - read_len % CHARS_PER_WORD ) * BITS_PER_CHAR : 0;
1622 #define REFILL_BUFFER( _in_file, _buffer, _num_bytes ) \
1623 do { \
1624 (_num_bytes) = gzread((_in_file),(_buffer),READ_BUFFER_SIZE); \
1625 (_buffer)[(_num_bytes)] = 0; \
1626 } while (0)
1628 REFILL_BUFFER( in_file, buffer, num_bytes );
1629 char *p = buffer;
1630 word *read_start_ptr;
1631 int is_FASTQ = ( buffer[0] == '@' );
1632 char read_terminator = is_FASTQ ? '+' : '>';
1633 while ( num_bytes ) {
1634 read_start_ptr = packed_reads_p;
1635 // consume read name @xxxxxxx
1636 p = (char*) memchr( p, '\n', num_bytes - (p-buffer) );
1637 while (!p) {
1638 REFILL_BUFFER( in_file, buffer, num_bytes );
1639 if ( !num_bytes ) break; // end of file
1640 p = (char*) memchr( buffer, '\n', num_bytes );
1642 if ( !num_bytes ) break; // end of file
1643 if (! *(++p) ) {
1644 REFILL_BUFFER( in_file, buffer, num_bytes );
1645 p = buffer;
1647 // consume read sequence ACGT
1648 word w = 0;
1649 int index = 0;
1650 while ( *p != read_terminator ) { // FASTQ: + FASTA: >
1651 if ( *p >= 'A' ) {
1652 if ( index % CHARS_PER_WORD == 0 ) {
1653 if ( index ) {
1654 *packed_reads_p = w;
1655 packed_reads_p++;
1656 w = 0;
1659 w = (w << BITS_PER_CHAR) | dna_map[(unsigned char)( *p) ]; // order: MSB to LSB
1660 index++;
1662 p++;
1663 if (!(*p)) {
1664 REFILL_BUFFER( in_file, buffer, num_bytes );
1665 if ( !num_bytes ) break;
1666 p = buffer;
1669 // write last word
1670 *packed_reads_p = w << last_shift;
1671 *packed_reads_p |= ( 0xFFFFFFFF >> ( BITS_PER_WORD - last_shift ) ); // TODO hard. add padding
1672 packed_reads_p++;
1674 if ( is_FASTQ ) {
1675 // consume + sign
1676 p = (char*) memchr( p, '\n', num_bytes - (p-buffer) );
1677 while (!p) {
1678 REFILL_BUFFER( in_file, buffer, num_bytes );
1679 p = (char*) memchr( buffer, '\n', num_bytes );
1681 p++;
1682 // consume base quality
1683 p = (char*) memchr( p, '\n', num_bytes - (p-buffer) );
1684 while (!p) {
1685 REFILL_BUFFER( in_file, buffer, num_bytes );
1686 if ( !num_bytes ) break; // end of file
1687 p = (char*) memchr( buffer, '\n', num_bytes );
1689 p++;
1690 } // end FASTQ only
1692 if (index == globals.read_len) {
1693 ++num_reads;
1694 } else { // trouble...
1695 log( "ERROR: read length of read #%lld is %d\n", num_reads, index);
1696 packed_reads_p = read_start_ptr;
1699 if ( !num_bytes ) break; // end of file
1702 gzclose( in_file );
1704 globals.packed_reads = packed_reads;
1705 globals.num_reads = num_reads;
1707 ////////////////////// END READING INPUT //////////////////////
1710 void read_single_file(FILE *fq, char* filename, int64* read_num, int read_len, int threshold)
1712 /////////////////// READ INPUT (FASTQ) /////////////////////
1713 int64 num_reads = 0;
1715 #define READ_BUFFER_SIZE 4096
1716 gzFile in_file = gzopen( filename, "r" );
1717 //#define GZBUFFER_SIZE 65536
1718 // gzbuffer( infile, GZBUFFER_SIZE ); // needs zlib 1.2.4
1719 int num_bytes = 0;
1720 char buffer[ READ_BUFFER_SIZE + 10 ];
1722 //memset( packed_quals, 0, 1 * words_per_qual * sizeof(word) );
1724 #define REFILL_BUFFER( _in_file, _buffer, _num_bytes ) \
1725 do { \
1726 (_num_bytes) = gzread((_in_file),(_buffer),READ_BUFFER_SIZE); \
1727 (_buffer)[(_num_bytes)] = 0; \
1728 } while (0)
1730 REFILL_BUFFER( in_file, buffer, num_bytes );
1731 char *p = buffer;
1732 word *read_start_ptr;
1733 int is_FASTQ = ( buffer[0] == '@' );
1734 char read_terminator = is_FASTQ ? '+' : '>';
1735 while ( num_bytes ) {
1736 read_start_ptr = packed_reads_p;
1737 // consume read name @xxxxxxx
1738 p = (char*) memchr( p, '\n', num_bytes - (p-buffer) );
1739 while (!p) {
1740 REFILL_BUFFER( in_file, buffer, num_bytes );
1741 if ( !num_bytes ) break; // end of file
1742 p = (char*) memchr( buffer, '\n', num_bytes );
1744 if ( !num_bytes ) break; // end of file
1745 if (! *(++p) ) {
1746 REFILL_BUFFER( in_file, buffer, num_bytes );
1747 p = buffer;
1749 // consume read sequence ACGT
1750 word w = 0;
1751 int index = 0;
1752 while ( *p != read_terminator ) { // FASTQ: + FASTA: >
1753 if ( *p >= 'A' ) {
1754 if ( index % CHARS_PER_WORD == 0 ) {
1755 if ( index ) {
1756 *packed_reads_p = w;
1757 packed_reads_p++;
1758 w = 0;
1761 w = (w << BITS_PER_CHAR) | dna_map[(unsigned char)( *p )]; // order: MSB to LSB
1762 index++;
1764 p++;
1765 if (!(*p)) {
1766 REFILL_BUFFER( in_file, buffer, num_bytes );
1767 if ( !num_bytes ) break;
1768 p = buffer;
1771 // write last word
1772 *packed_reads_p = w << last_shift;
1773 *packed_reads_p |= ( 0xFFFFFFFF >> ( BITS_PER_WORD - last_shift ) ); // TODO hard. add padding
1774 packed_reads_p++;
1776 if ( is_FASTQ ) {
1777 word *packed_quals_p = packed_quals;
1778 // consume + sign
1779 p = (char*) memchr( p, '\n', num_bytes - (p-buffer) );
1780 while (!p) {
1781 REFILL_BUFFER( in_file, buffer, num_bytes );
1782 p = (char*) memchr( buffer, '\n', num_bytes );
1784 p++;
1785 if (!(*p)) {
1786 REFILL_BUFFER( in_file, buffer, num_bytes );
1787 if ( !num_bytes ) break;
1788 p = buffer;
1790 // consume base quality
1791 w = 0;
1792 index = 0;
1793 while( *p != '@' && *p != '\n')
1795 //fprintf(stderr, ";%c,%d", *p, w);
1796 if(index % BITS_PER_WORD == 0){
1797 if(index){
1798 *packed_quals_p = w;
1799 packed_quals_p++;
1800 w = 0;
1803 w = (w << BITS_PER_QUAL) | (*p >= threshold);
1804 index++;
1805 p++;
1806 if (!(*p)) {
1807 REFILL_BUFFER( in_file, buffer, num_bytes );
1808 if ( !num_bytes ) break;
1809 p = buffer;
1812 *packed_quals_p = w << last_shift_qual;
1813 //*packed_quals_p |= (0xFFFFFFFF >> ( BITS_PER_QUAL - last_shift_qual ));
1814 packed_quals_p ++;
1815 fwrite( packed_quals, sizeof(word), 1 * words_per_qual, fq);
1816 //memset( packed_quals, 0, 1 * words_per_qual * sizeof(word) );
1817 } // end FASTQ only
1819 if (index == read_len) {
1820 ++num_reads;
1821 } else { // trouble...
1822 log( "SINGLE READ ERROR: read length of read #%lld is %d\n", num_reads, index);
1823 packed_reads_p = read_start_ptr;
1826 if ( !num_bytes ) break; // end of file
1829 gzclose( in_file );
1831 (*read_num) = (*read_num) + num_reads;
1832 ////////////////////// END READING INPUT //////////////////////
1835 void read_pair_file(FILE *fq, char* file1, char* file2, int64* read_num, int read_len, int threshold)
1837 /////////////////// READ INPUT (FASTQ) /////////////////////
1838 int64 num_reads = 0;
1840 #define READ_BUFFER_SIZE 4096
1841 gzFile in_file = gzopen( file1, "r" );
1842 gzFile in_file2 = gzopen( file2, "r" );
1843 //#define GZBUFFER_SIZE 65536
1844 // gzbuffer( infile, GZBUFFER_SIZE ); // needs zlib 1.2.4
1845 int num_bytes = 0, num_bytes2 = 0;
1846 char buffer[ READ_BUFFER_SIZE + 10 ];
1847 char buffer2[ READ_BUFFER_SIZE + 10 ];
1849 // memset( packed_quals, 0, 1 * words_per_qual * sizeof(word) );
1850 #define REFILL_BUFFER( _in_file, _buffer, _num_bytes ) \
1851 do { \
1852 (_num_bytes) = gzread((_in_file),(_buffer),READ_BUFFER_SIZE); \
1853 (_buffer)[(_num_bytes)] = 0; \
1854 } while (0)
1856 REFILL_BUFFER( in_file, buffer, num_bytes );
1857 REFILL_BUFFER( in_file2, buffer2, num_bytes2 );
1858 char *p = buffer;
1859 char *p2 = buffer2;
1860 word *read_start_ptr;
1861 int is_FASTQ = ( buffer[0] == '@' );
1862 char read_terminator = is_FASTQ ? '+' : '>';
1863 while ( num_bytes && num_bytes2) {
1864 read_start_ptr = packed_reads_p;
1865 // consume read name @xxxxxxx
1866 p = (char*) memchr( p, '\n', num_bytes - (p-buffer) );
1867 p2 = (char*) memchr( p2, '\n', num_bytes2 - (p2-buffer2) );
1868 /*while (!p || !p2) {
1869 REFILL_BUFFER( in_file, buffer, num_bytes );
1870 REFILL_BUFFER( in_file2, buffer2, num_bytes2 );
1871 if ( !num_bytes || !num_bytes2) break; // end of file
1872 p = (char*) memchr( buffer, '\n', num_bytes );
1873 p2 = (char*) memchr( buffer2, '\n', num_bytes2 );
1875 while(!p)
1877 REFILL_BUFFER( in_file, buffer, num_bytes );
1878 if ( !num_bytes)
1879 break;
1880 p = (char*) memchr( buffer, '\n', num_bytes );
1882 while(!p2)
1884 REFILL_BUFFER( in_file2, buffer2, num_bytes2 );
1885 if( !num_bytes2)
1886 break;
1887 p2 = (char*) memchr( buffer2, '\n', num_bytes2 );
1889 if ( !num_bytes || !num_bytes2) break; // end of file
1890 // consume read sequence ACGT
1891 word w = 0;
1892 int index = 0;
1893 //fprintf(stderr, "R1:");
1894 //if(*p == '@' || *p == '>')
1896 while(*p != '\n')
1898 p++;
1899 if (!(*p)) {
1900 REFILL_BUFFER( in_file, buffer, num_bytes );
1901 if ( !num_bytes ) break;
1902 p= buffer;
1905 p++;
1906 if (!(*p)) {
1907 REFILL_BUFFER( in_file, buffer, num_bytes );
1908 if ( !num_bytes ) break;
1909 p= buffer;
1913 while ( *p != read_terminator ) { // FASTQ: + FASTA: >
1914 //fprintf(stderr, "%c", *p);
1915 if ( *p >= 'A') {
1916 if ( index % CHARS_PER_WORD == 0 ) {
1917 if ( index ) {
1918 *packed_reads_p = w;
1919 packed_reads_p++;
1920 w = 0;
1923 w = (w << BITS_PER_CHAR) | dna_map[(unsigned char)(*p) ]; // order: MSB to LSB
1924 index++;
1926 p++;
1927 if (!(*p)) {
1928 REFILL_BUFFER( in_file, buffer, num_bytes );
1929 if ( !num_bytes ) break;
1930 p = buffer;
1934 if (index != read_len) {
1935 log( "ERROR: read length of read1 #%lld is %d\n", num_reads, index);
1936 p -= index;
1937 int c=0;
1938 while(c < index)
1940 log( " %d,%c", c, *p);
1941 p++;
1942 c++;
1944 log( "\n");
1945 exit(1);
1947 //fprintf(stderr, "\nR2:");
1948 // write last word
1949 *packed_reads_p = w << last_shift;
1950 *packed_reads_p |= ( 0xFFFFFFFF >> ( BITS_PER_WORD - last_shift ) ); // TODO hard. add padding
1951 packed_reads_p++;
1953 w = 0;
1954 index = 0;
1955 //if(*(p2) == '@' || *(p2) == '>')
1957 while(*p2 != '\n')
1959 p2++;
1960 if (!(*p2)) {
1961 REFILL_BUFFER( in_file2, buffer2, num_bytes2 );
1962 if ( !num_bytes2 ) break;
1963 p2 = buffer2;
1966 p2++;
1968 if (!(*p2)) {
1969 REFILL_BUFFER( in_file2, buffer2, num_bytes2 );
1970 if ( !num_bytes2 ) break;
1971 p2 = buffer2;
1975 while ( *p2 != read_terminator ) { // FASTQ: + FASTA: >
1976 //fprintf(stderr, "%c", *p2);
1977 if ( *p2 >= 'A') {
1978 if ( index % CHARS_PER_WORD == 0 ) {
1979 if ( index ) {
1980 *packed_reads_p = w;
1981 packed_reads_p++;
1982 w = 0;
1985 w = (w << BITS_PER_CHAR) | dna_map[(unsigned char) (*p2) ]; // order: MSB to LSB
1986 index++;
1988 p2++;
1989 if (!(*p2)) {
1990 REFILL_BUFFER( in_file2, buffer2, num_bytes2 );
1991 if ( !num_bytes2 ) break;
1992 p2 = buffer2;
1995 //fprintf(stderr, "\nq1:");
1996 // write last word
1997 *packed_reads_p = w << last_shift;
1998 *packed_reads_p |= ( 0xFFFFFFFF >> ( BITS_PER_WORD - last_shift ) ); // TODO hard. add padding
1999 packed_reads_p++;
2001 if (index != read_len) {
2002 log( "READ ERROR: read length of read2 #%lld is %d\n", num_reads, index);
2003 p2 -= index;
2004 int c=0;
2005 while(c < index)
2007 log( " %d,%c", c, *p2);
2008 p2++;
2009 c++;
2011 log( "\n");
2012 exit(1);
2014 if ( is_FASTQ ) {
2015 word *packed_quals_p1 = packed_quals;
2016 word *packed_quals_p2 = packed_quals;
2017 // consume + sign
2018 p = (char*) memchr( p, '\n', num_bytes - (p-buffer) );
2019 while (!p) {
2020 REFILL_BUFFER( in_file, buffer, num_bytes );
2021 p = (char*) memchr( buffer, '\n', num_bytes );
2023 //p++;
2024 while( *p != '\n')
2026 p++;
2027 if(!(*p)){
2028 REFILL_BUFFER( in_file, buffer, num_bytes );
2029 if ( !num_bytes ) break;
2030 p = buffer;
2033 p++;
2035 if(!(*p)){
2036 REFILL_BUFFER( in_file, buffer, num_bytes );
2037 if ( !num_bytes ) break;
2038 p = buffer;
2040 // consume base quality
2041 w = 0;
2042 index = 0;
2043 while( *p != '\n')
2045 //fprintf(stderr, "%c", *p);
2046 if(index % BITS_PER_WORD == 0){
2047 if(index){
2048 *packed_quals_p1 = w;
2049 packed_quals_p1++;
2050 w = 0;
2053 w = (w << BITS_PER_QUAL) | (*p >= threshold);
2054 index++;
2055 p++;
2056 if (!(*p)) {
2057 REFILL_BUFFER( in_file, buffer, num_bytes );
2058 if ( !num_bytes ) break;
2059 p = buffer;
2062 p++;
2063 //fprintf(stderr, "\nq2:");
2064 *packed_quals_p1 = w << last_shift_qual;
2065 //*packed_quals_p1 |= (0xFFFFFFFF >> ( BITS_PER_QUAL - last_shift_qual ));
2066 packed_quals_p1 ++;
2067 fwrite( packed_quals, sizeof(word), 1 * words_per_qual, fq);
2068 //memset( packed_quals, 0, 1 * words_per_qual * sizeof(word) );
2069 // consume + sign
2070 p2 = (char*) memchr( p2, '\n', num_bytes2 - (p2-buffer2) );
2071 while (!p2) {
2072 REFILL_BUFFER( in_file2, buffer2, num_bytes2 );
2073 p2 = (char*) memchr( buffer2, '\n', num_bytes2 );
2075 //p2++;
2076 while( *p2 != '\n')
2078 p2++;
2079 if (!(*p2)) {
2080 REFILL_BUFFER( in_file2, buffer2, num_bytes2 );
2081 if ( !num_bytes2 ) break;
2082 p2 = buffer2;
2085 p2++;
2087 if (!(*p2)) {
2088 REFILL_BUFFER( in_file2, buffer2, num_bytes2 );
2089 if ( !num_bytes2 ) break;
2090 p2 = buffer2;
2092 // consume base quality
2093 w = 0;
2094 index = 0;
2095 while( *p2 != '\n')
2097 //fprintf(stderr, "%c", *p2);
2098 if(index % BITS_PER_WORD == 0){
2099 if(index){
2100 *packed_quals_p2 = w;
2101 packed_quals_p2++;
2102 w = 0;
2105 w = (w << BITS_PER_QUAL) | (*p2 >= threshold);
2106 index++;
2107 p2++;
2108 if (!(*p2)) {
2109 REFILL_BUFFER( in_file2, buffer2, num_bytes2 );
2110 if ( !num_bytes2 ) break;
2111 p2 = buffer2;
2114 p2++;
2115 //fprintf(stderr, "\n");
2116 *packed_quals_p2 = w << last_shift_qual;
2117 //*packed_quals_p2 |= (0xFFFFFFFF >> ( BITS_PER_QUAL - last_shift_qual ));
2118 packed_quals_p2 ++;
2119 fwrite( packed_quals, sizeof(word), 1 * words_per_qual, fq);
2120 //memset( packed_quals, 0, 1 * words_per_qual * sizeof(word) );
2121 } // end FASTQ only
2122 if (index == read_len) {
2123 num_reads += 2;
2124 } else { // trouble...
2125 log( "READ ERROR: read length of read #%lld is %d\n", num_reads, index);
2126 p2 -= index;
2127 int c=0;
2128 while(c < index)
2130 log( " %d,%c", c, *p2);
2131 p2++;
2132 c++;
2134 log( "\n");
2135 exit(1);
2136 packed_reads_p = read_start_ptr;
2139 if ( !num_bytes ) break; // end of file
2142 gzclose( in_file );
2144 (*read_num) = (*read_num) + num_reads;
2145 ////////////////////// END READING INPUT //////////////////////
2148 void lib2read( struct global_data_t &globals, char* lib_file, char* prefix)
2150 int num_libs=0, i, j, read_length = globals.read_len;
2151 int64 start_num=0, g_num_reads=0;
2153 g_packed_reads = (word*) malloc( globals.max_num_reads * globals.words_per_read * sizeof(word) );
2154 //memset( g_packed_reads, 0, globals.max_num_reads * globals.words_per_read * sizeof(word) );
2155 last_shift = globals.read_len % CHARS_PER_WORD ? ( CHARS_PER_WORD - globals.read_len % CHARS_PER_WORD ) * BITS_PER_CHAR : 0;
2156 packed_reads_p = g_packed_reads;
2157 g_num_reads=0;
2159 words_per_qual = ( globals.read_len + ( BITS_PER_WORD - 1 ) ) / BITS_PER_WORD;
2160 packed_quals = (word*) malloc( 1 * words_per_qual * sizeof(word) );
2161 last_shift_qual = globals.read_len % BITS_PER_WORD ? ( BITS_PER_WORD - globals.read_len % BITS_PER_WORD) * BITS_PER_QUAL : 0;
2163 log("Starting load libraries...\n");
2164 scan_libInfo(lib_file, &num_libs, &read_length);
2166 if(read_length != globals.read_len)
2168 log( "Ignore max read length in read library file\n");
2169 read_length = globals.read_len;
2172 char name[255];
2173 sprintf(name, "%s.qual.bit", prefix);
2174 FILE *fq = fopen(name, "w"); //output base quality as 0/1.
2176 sprintf(name, "%s.index.peGrade", prefix);
2177 FILE *fg = fopen(name, "w"); //output insert size grade.
2179 log( "Totally there are %d libs\n", num_libs);
2180 for( i=0; i< num_libs; i++ )
2182 if(lib_array[i].asm_flag != 1 && lib_array[i].asm_flag != 3)
2183 continue;
2184 if(lib_array[i].avg_ins == 0)
2185 continue;
2186 log( "lib %d has %d paired fastqs\n", i, lib_array[i].num_q1_file);
2187 start_num = g_num_reads;
2188 for( j = 0; j < lib_array[i].num_q1_file; j++ )
2190 log( "Loading file: %s ", lib_array[i].q1_fname[j]);
2191 read_pair_file(fq, lib_array[i].q1_fname[j], lib_array[i].q2_fname[j], &g_num_reads, globals.read_len, lib_array[i].qual);
2192 log( ", Using %lld reads\n", g_num_reads);
2194 for( j = 0; j < lib_array[i].num_a1_file; j++ )
2196 log( "Loading file: %s ", lib_array[i].a1_fname[j]);
2197 read_pair_file(fq, lib_array[i].a1_fname[j], lib_array[i].a2_fname[j], &g_num_reads, globals.read_len, lib_array[i].qual);
2198 log( ", Using %lld reads\n", g_num_reads);
2200 for( j = 0; j < lib_array[i].num_p_file; j++)
2202 log( "Loading file: %s ", lib_array[i].p_fname[j]);
2203 read_single_file(fq, lib_array[i].p_fname[j], &g_num_reads, globals.read_len, lib_array[i].qual);
2204 log( ", Using %lld reads\n", g_num_reads);
2207 if(g_num_reads > start_num)
2208 fprintf(fg, "%lld %lld %d %d\n", start_num, g_num_reads, lib_array[i].avg_ins, lib_array[i].sd);
2211 log( "Starting Single-End libraries to reads...\n");
2212 for( i=0; i< num_libs; i++ )
2214 if(lib_array[i].asm_flag != 1 && lib_array[i].asm_flag != 3)
2215 continue;
2216 log( "lib %d has %d single fastqs\n", i, lib_array[i].num_s_q_file);
2217 for( j = 0; j < lib_array[i].num_s_q_file; j++)
2219 read_single_file(fq, lib_array[i].s_q_fname[j], &g_num_reads, globals.read_len, lib_array[i].qual);
2221 for( j = 0; j < lib_array[i].num_s_a_file; j++)
2222 read_single_file(fq, lib_array[i].s_a_fname[j], &g_num_reads, globals.read_len, lib_array[i].qual);
2225 globals.packed_reads = g_packed_reads;
2226 globals.num_reads = g_num_reads;
2228 fclose(fq);
2229 fclose(fg);
2230 free_libs(lib_array, num_libs);
2231 free( packed_quals );
2232 log("Loading library finished.\n");
2235 int main( int argc, char** argv ) {
2236 struct sysinfo s_info;
2237 sysinfo(&s_info);
2239 if ( argc <3 ) {
2240 #ifdef DISABLE_GPU
2241 log( "Usage:\n %s read_library read_len output_prefix [thread_num] [max_host_mem] 2> bwt.log\n", argv[0] );
2242 log( "Input:\n 1) read_library has the same format to that of SOAPdenovo, except to set qual_cutoff to define high quality threshold\n 2) read length must be set and reads only with fixed length could be used.\n");
2243 #else
2244 log( "Usage:\n %s read_library read_len output_prefix [thread_num] [max_host_mem] [GPU_mem] 2> bwt.log\n", argv[0] );
2245 log( "Input:\n 1) read_library has the same format to that of SOAPdenovo, except to set qual_cutoff to define high quality threshold\n 2) read length must be set and reads only with fixed length could be used.\n");
2246 #endif
2247 log( "Output:\n output_prefix.f.bwt.ascii, output_prefix.r.bwt.ascii, output_prefix.ridt\n" );
2248 log("\nMachine Information:\n"
2249 "RAM: total %lld / free %lld\n"
2250 "Swap: total %lld / free %lld\n"
2251 "Number of CPU = %d\n",
2252 s_info.totalram, s_info.freeram,
2253 s_info.totalswap, s_info.freeswap,
2254 sysconf(_SC_NPROCESSORS_ONLN)
2256 #ifdef DISABLE_GPU
2257 exit(1);
2258 #else
2259 get_gpu_mem();
2260 exit(1);
2261 #endif
2264 init_dna_map();
2266 int read_len = atoi(argv[2]);
2267 char *output_prefix = argv[3];
2268 uint64_t gpu_mem = 0; // = atoll(argv[4]);
2269 uint64_t max_host_mem = s_info.totalram;
2270 int cpu_count = sysconf(_SC_NPROCESSORS_ONLN);
2272 if(argc > 4)
2274 number_cpu_threads = atoi(argv[4]);
2275 if(number_cpu_threads > cpu_count || number_cpu_threads == 0 || number_cpu_threads > NUM_CPU_THREADS)
2276 number_cpu_threads = cpu_count < NUM_CPU_THREADS ? cpu_count : NUM_CPU_THREADS;
2277 log("Using thread number: %d\n", number_cpu_threads);
2278 }else{
2279 number_cpu_threads = cpu_count < NUM_CPU_THREADS ? cpu_count : NUM_CPU_THREADS;
2280 log("Using thread number: %d\n", number_cpu_threads);
2283 if(argc > 5)
2285 uint64_t set_host_mem = atoll(argv[5]);
2286 if(set_host_mem > max_host_mem || set_host_mem == 0)
2288 log("Use maximum host memory %lld\n", max_host_mem);
2289 }else{
2290 max_host_mem = set_host_mem;
2292 }else{
2293 log("Using total RAM %lld\n", max_host_mem);
2296 #ifdef DISABLE_GPU
2297 gpu_mem = 0;
2298 #else
2299 size_t free_gpu_mem = get_gpu_mem();
2300 if(gpu_mem < 100 || gpu_mem > free_gpu_mem)
2302 gpu_mem = free_gpu_mem;
2304 #endif
2306 if(argc > 6)
2308 #ifdef DISABLE_GPU
2309 gpu_mem = atoll(argv[6]);
2310 #else
2311 uint64_t set_gpu_mem = atoll(argv[6]);
2312 if(set_gpu_mem < 100 || set_gpu_mem > gpu_mem)
2314 log("Use Maximum GPU memory %lld.\n", gpu_mem);
2315 }else{
2316 gpu_mem = set_gpu_mem;
2317 log("Use Set GPU memory %lld.\n", gpu_mem);
2319 #endif
2320 }else{
2321 #ifdef DISABLE_GPU
2322 log("Use maximum bucket size.\n");
2323 #else
2324 log("Use Maximum GPU memory %lld.\n", gpu_mem);
2325 #endif
2328 struct global_data_t globals;
2330 init_global_data(globals, read_len, max_host_mem, number_cpu_threads);
2332 // read input
2333 lib2read( globals, argv[1], output_prefix);
2334 log("Number of reads: %lld\n", globals.num_reads);
2336 //building bwt.
2337 //set_global_after_reading(globals, max_host_mem);
2338 set_global_bucket(globals);
2339 //level 1 fill buckets.
2340 uint64_t max_bucket_size = lv0_fill_buckets(globals);
2342 //set level2 memeory.
2343 set_global_after_lv0(globals, max_host_mem, gpu_mem, max_bucket_size);
2345 build_bwt(globals, output_prefix, true);
2347 log("Reverse all reads..");
2348 reverse_all_reads(globals);
2349 log("Done\n");
2351 lv0_fill_buckets(globals);
2352 build_bwt(globals, output_prefix, false);
2354 log("All finished.\n");
2355 // free memory
2357 free( globals.packed_reads );
2358 free( globals.buckets );
2359 free( globals.lv1_offsets );
2360 free( globals.lv2_suffixes );
2361 free( globals.lv2_read_id );
2362 free( globals.sort_indexes );
2364 #ifdef DISABLE_GPU
2365 free(globals.cpu_sort_space);
2366 #endif
2368 pthread_spin_destroy(&globals.large_diff_lock);
2370 for ( int t = 0; t < globals.num_cpu_threads; ++t ) {
2371 free( globals.section_buckets[t] );
2372 free( globals.locals[t].offset_pos );
2375 return 0;