BGI/BASE/src/2bwt/readbwt.cpp

   1 /*
   2
   3    readbwt.cpp        Build index of short reads.
   4
   5    This program builds index of short reads for use of assembler BASE and some other programs.
   6
   7 #    Copyright (C) 2015, The University of Hong Kong.
   8 #
   9 #    This program is free software; you can redistribute it and/or
  10 #    modify it under the terms of the GNU General Public License
  11 #    as published by the Free Software Foundation; either version 3
  12 #    of the License, or (at your option) any later version.
  13 #
  14 #    This program is distributed in the hope that it will be useful,
  15 #    but WITHOUT ANY WARRANTY; without even the implied warranty of
  16 #    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  17 #    GNU General Public License for more details.
  18 #
  19 #    You should have received a copy of the GNU General Public License
  20 #    along with this program; if not, write to the Free Software
  21 #    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  03110-1301, USA.
  22
  23    Date   : 1st Jan 2014
  24    Author : Chi Man LIU
  25    Change : Developped this product to build BWT of short reads.
  26             This is the implementation of CX1 method.
  27
  28    Date   : 1st Jan 2015
  29    Author : Binghang LIU
  30    Change : Add the implementation of CX1 method without GPU.
  31                         Add the loading of SOAPdenovo read libarary files, add qual_cutoff to set quality threshold.
  32                         Add a new output file for pair end reads with different insert sizes.
  33 */
  34
  35
  36 #include <omp.h>
  37 #include <pthread.h>
  38 #include <assert.h>
  39 #include <stdarg.h>
  40 #include <sys/sysinfo.h> // memory size.
  41 #include <vector>
  42 #include <algorithm>
  43 #include <string>
  44
  45 // for sorting
  46 #include <zlib.h>
  47 #include "lv2_cpu_sort.h"
  48 #include "lv2_gpu_sort.h"
  49
  50 using namespace std;
  51
  52 typedef unsigned int word;
  53
  54 #define NUM_CPU_THREADS 24
  55 //#define THREADS_PER_BLOCK 256 // GPU
  56
  57 // do not modify below
  58 #define BITS_PER_WORD 32 //used for quality.
  59 #define BITS_PER_CHAR 2
  60
  61 #define BITS_PER_QUAL 1 //used for quality.
  62
  63 #define CHARS_PER_WORD 16
  64 #define CHAR_MASK 0x3
  65
  66 #define PREFIX_LENGTH 8
  67 #define SPECIAL_HANDLE_SUFFIX_LENGTH 3 // suffixes with length <= 3 will be handled specially
  68 #define SPECIAL_HANDLE_SUFFIX_NUM_BUCKETS 125 // 5^3
  69 #define NUM_BUCKETS 390625
  70 #define PREFIX_MASK 0xFFFF
  71
  72 char dna_map[256]; // map from ACGT to 0123
  73 char dna_char[5] = {'A','C','G','T','$'};
  74 static char tabs[2][1024];
  75 word* g_packed_reads;
  76 word* packed_quals;
  77 word* packed_reads_p;
  78
  79 int words_per_qual;
  80 int last_shift;
  81 int last_shift_qual;
  82 int number_cpu_threads = 24;
  83
  84 #define SENTINEL_INT 4 // '$'
  85
  86 //**********************************define structures start
  87 struct global_data_t;
  88 struct section_data_t {
  89     int section_id;
  90     struct global_data_t *parent;
  91     int64 section_start, section_end; // start and end read ID ('end' is exlusive)
  92     string bwt_buffer[SPECIAL_HANDLE_SUFFIX_NUM_BUCKETS];
  93     int64 *offset_pos;
  94     int64 offset_diff_base;
  95 };
  96
  97 struct suffix_fetch_data_t {
  98     int thread_id;
  99     struct global_data_t *parent;
 100     int bucket_start, bucket_end;
 101     int64 index_start;
 102 };
 103
 104 struct global_data_t {
 105     int node_bucket_start, node_bucket_end; // buckets to be processed on this node
 106     int num_cpu_threads;
 107     int64 max_num_reads;
 108     int64 num_reads;
 109     word *packed_reads;
 110     uint64_t *cpu_sort_space;
 111
 112     int read_len;
 113     int words_per_read;
 114     int words_per_suffix;
 115     int offset_num_bits; // number of bits to represent 0...read_len
 116     int offset_mask; // -(1 << offset_num_bits)
 117
 118     int64 max_lv1_entries;
 119     int64 max_lv2_entries;
 120
 121     int cur_special_handle_prefix;
 122     int64 num_lv2_to_output;
 123     bool need_ridt;
 124
 125     int *buckets, *section_buckets[ NUM_CPU_THREADS ]; // buckets (aggregated for all read; for each section). TODO int64?
 126     pthread_t threads[ NUM_CPU_THREADS ];
 127     struct section_data_t locals[ NUM_CPU_THREADS ];
 128     struct suffix_fetch_data_t suffix_fetches[ NUM_CPU_THREADS - 1 ];
 129     int lv1_bucket_start, lv1_bucket_end; // 'end' is exclusive
 130     int lv2_bucket_start, lv2_bucket_end; // 'end' is exclusive
 131     int64 lv2_offset_start_index;
 132     int64 lv2_suffixes_row_width;
 133     int *lv1_offsets;
 134     word *sort_indexes; //permutation.
 135     word *sort_indexes_to_output;
 136     unsigned int *lv2_read_id;
 137     unsigned int *lv2_read_id_to_output;
 138     word *lv2_suffixes;
 139     word *lv2_suffixes_to_output;
 140     FILE *out_file;
 141     FILE *map_out_file;
 142     // for large diff
 143     vector<int64> v_large_diff;
 144     pthread_spinlock_t large_diff_lock;
 145 };
 146
 147 typedef struct lib_info
 148 {
 149     int sd;
 150     int qual;
 151     int max_ins;
 152     int min_ins;
 153     int avg_ins;
 154     int rd_len_cutoff;  //read length cutoff
 155     int reverse;
 156     int asm_flag;
 157     int map_len;
 158     int pair_num_cut;
 159     int rank;
 160
 161     //whether last read is read1 in pair
 162     int paired;  // 0 -- single; 1 -- read1; 2 -- read2;
 163
 164     //type1
 165     char ** a1_fname;
 166     char ** a2_fname;
 167     int num_a1_file;
 168     int num_a2_file;
 169
 170     //type2
 171     char ** q1_fname;
 172     char ** q2_fname;
 173     int num_q1_file;
 174     int num_q2_file;
 175
 176     //type3
 177     char ** p_fname;
 178     int num_p_file;  //fasta only
 179
 180     //type4 &5
 181     char ** s_a_fname;
 182     int num_s_a_file;
 183     char ** s_q_fname;
 184     int num_s_q_file;
 185
 186     char ** b_fname; //the name of the bam file
 187     int num_b_file; //the number of the bam file
 188 } LIB_INFO;
 189 LIB_INFO* lib_array;
 190
 191 //***************** define structure finished.
 192
 193 //***************** Initialize start.
 194 //initial dna_map, here N is treated as G.
 195 void init_dna_map() {
 196     for (int i='A';i<='Z';++i) dna_map[i] = 2; // G
 197     for (int i='a';i<='z';++i) dna_map[i] = 2; // G
 198     dna_map['A'] = dna_map['a'] = 0;
 199     dna_map['C'] = dna_map['c'] = 1;
 200     dna_map['G'] = dna_map['g'] = 2;
 201     dna_map['T'] = dna_map['t'] = 3;
 202     dna_map['N'] = dna_map['n'] = 2; // G
 203 }
 204
 205 static boolean splitColumn ( char * line )
 206 {
 207         int len = strlen ( line );
 208         int i = 0, j;
 209         int tabs_n = 0;
 210
 211         while ( i < len )
 212         {
 213                 if ( line[i] >= 32 && line[i] <= 126 && line[i] != '=' )
 214                 {
 215                         j = 0;
 216                         while ( i < len && line[i] >= 32 && line[i] <= 126 && line[i] != '=' )
 217                         {
 218                                 tabs[tabs_n][j++] = line[i];
 219                                 i++;
 220                         }
 221
 222                         tabs[tabs_n][j] = '\0';
 223                         tabs_n++;
 224
 225                         if ( tabs_n == 2 )
 226                                 return 1;
 227                 }
 228                 i++;
 229         }
 230
 231         if ( tabs_n == 2 )
 232                 return 1;
 233     return 0;
 234 }
 235
 236 static int cmp_lib ( const void * a, const void * b )
 237 {
 238         LIB_INFO * A, *B;
 239         A = ( LIB_INFO * ) a;
 240         B = ( LIB_INFO * ) b;
 241
 242         if ( A->avg_ins > B->avg_ins )
 243                 return 1;
 244         if ( A->avg_ins == B->avg_ins )
 245                 return 0;
 246     return -1;
 247 }
 248
 249 void scan_libInfo ( char * libfile, int *lib_num, int *read_len)
 250 {
 251     char line[1024], ch;
 252     int i, j, index;
 253     boolean flag;
 254     boolean * pe;
 255     FILE* fp = fopen ( libfile, "r" );
 256     int num_libs = 0;
 257     int maxReadLen = 0;
 258
 259         while ( fgets ( line, 1024, fp ) )
 260         {
 261                 ch = line[5];
 262                 line[5] = '\0';
 263                 if ( strcmp ( line, "[LIB]" ) == 0 )
 264                         num_libs++;
 265
 266                 if ( !num_libs )
 267                 {
 268                         line[5] = ch;
 269                         flag = splitColumn ( line );
 270
 271                         if ( !flag )
 272                                 continue;
 273
 274                         if ( strcmp ( tabs[0], "max_rd_len" ) == 0 || strcmp ( tabs[0], "max_read_len" ) == 0)
 275                                 maxReadLen = atoi ( tabs[1] );
 276                 }
 277         }
 278
 279         if ( num_libs == 0 )
 280         {
 281                 fprintf ( stderr, "Config file error: no [LIB] in file\n" );
 282                 exit ( -1 );
 283         }
 284     *lib_num = num_libs;
 285     *read_len = maxReadLen;
 286
 287         //count file numbers of each type
 288     lib_array = ( LIB_INFO * ) calloc (1, num_libs * sizeof ( LIB_INFO ) );
 289         pe = ( boolean * ) calloc (1, num_libs * sizeof ( boolean ) );
 290
 291         for ( i = 0; i < num_libs; i++ )
 292         {
 293         lib_array[i].qual = 74;
 294         lib_array[i].sd = 0;
 295         lib_array[i].avg_ins = 0;
 296                 lib_array[i].asm_flag = 3;
 297                 lib_array[i].rank = 0;
 298                 lib_array[i].pair_num_cut = 0;
 299                 lib_array[i].rd_len_cutoff = 0;
 300                 lib_array[i].map_len = 0;
 301                 lib_array[i].num_s_a_file = 0;
 302                 lib_array[i].num_s_q_file = 0;
 303                 lib_array[i].num_p_file = 0;
 304                 lib_array[i].num_a1_file = 0;
 305                 lib_array[i].num_a2_file = 0;
 306                 lib_array[i].num_q1_file = 0;
 307                 lib_array[i].num_q2_file = 0;
 308                 lib_array[i].num_b_file = 0;    //init
 309                 pe[i] = false;
 310         }
 311
 312         rewind ( fp );
 313         i = -1;
 314
 315         while ( fgets ( line, 1024, fp ) )
 316         {
 317                 ch = line[5];
 318                 line[5] = '\0';
 319
 320                 if ( strcmp ( line, "[LIB]" ) == 0 )
 321                 {
 322                         i++;
 323                         continue;
 324                 }
 325
 326                 line[5] = ch;
 327                 flag = splitColumn ( line );
 328
 329                 if ( !flag )
 330                 {
 331                         continue;
 332                 }
 333
 334                 if ( strcmp ( tabs[0], "f1" ) == 0 )
 335                 {
 336                         lib_array[i].num_a1_file++;
 337                         pe[i] = true;
 338                 }
 339                 else if ( strcmp ( tabs[0], "q1" ) == 0 )
 340                 {
 341                         lib_array[i].num_q1_file++;
 342                         pe[i] = true;
 343                 }
 344                 else if ( strcmp ( tabs[0], "f2" ) == 0 )
 345                 {
 346                         lib_array[i].num_a2_file++;
 347                         pe[i] = true;
 348                 }
 349                 else if ( strcmp ( tabs[0], "q2" ) == 0 )
 350                 {
 351                         lib_array[i].num_q2_file++;
 352                         pe[i] = true;
 353                 }
 354                 else if ( strcmp ( tabs[0], "f" ) == 0 )
 355                 {
 356                         lib_array[i].num_s_a_file++;
 357                 }
 358                 else if ( strcmp ( tabs[0], "q" ) == 0 )
 359                 {
 360                         lib_array[i].num_s_q_file++;
 361                 }
 362                 else if ( strcmp ( tabs[0], "p" ) == 0 )
 363                 {
 364                         lib_array[i].num_p_file++;
 365                         pe[i] = true;
 366                 }
 367                 else if ( strcmp ( tabs[0], "b" ) == 0 ) // the bam file
 368                 {
 369                         lib_array[i].num_b_file++;
 370                         pe[i] = true;
 371                 }
 372         }
 373
 374         //allocate memory for filenames
 375         for ( i = 0; i < num_libs; i++ )
 376         {
 377                 if ( lib_array[i].num_a2_file != lib_array[i].num_a1_file )
 378                 {
 379                         fprintf ( stderr, "Config file error: the number of mark \"f1\" is not the same as \"f2\"!\n" );
 380                         exit ( -1 );
 381                 }
 382
 383                 if ( lib_array[i].num_q2_file != lib_array[i].num_q1_file )
 384                 {
 385                         fprintf ( stderr, "Config file error: the number of mark \"q1\" is not the same as \"q2\"!\n" );
 386                         exit ( -1 );
 387                 }
 388
 389                 if ( lib_array[i].num_s_a_file )
 390                 {
 391                         lib_array[i].s_a_fname = ( char ** ) calloc (1, lib_array[i].num_s_a_file * sizeof ( char * ) );
 392
 393                         for ( j = 0; j < lib_array[i].num_s_a_file; j++ )
 394                         {
 395                                 lib_array[i].s_a_fname[j] = ( char * ) calloc (1, 1024 * sizeof ( char ) );
 396                         }
 397                 }
 398
 399                 if ( lib_array[i].num_s_q_file )
 400                 {
 401                         lib_array[i].s_q_fname = ( char ** ) calloc (1, lib_array[i].num_s_q_file * sizeof ( char * ) );
 402
 403                         for ( j = 0; j < lib_array[i].num_s_q_file; j++ )
 404                         {
 405                                 lib_array[i].s_q_fname[j] = ( char * ) calloc (1, 1024 * sizeof ( char ) );
 406                         }
 407                 }
 408
 409                 if ( lib_array[i].num_p_file )
 410                 {
 411                         lib_array[i].p_fname = ( char ** ) calloc (1, lib_array[i].num_p_file * sizeof ( char * ) );
 412
 413                         for ( j = 0; j < lib_array[i].num_p_file; j++ )
 414                         {
 415                                 lib_array[i].p_fname[j] = ( char * ) calloc (1, 1024 * sizeof ( char ) );
 416                         }
 417                 }
 418
 419                 if ( lib_array[i].num_a1_file )
 420                 {
 421                         lib_array[i].a1_fname = ( char ** ) calloc (1, lib_array[i].num_a1_file * sizeof ( char * ) );
 422
 423                         for ( j = 0; j < lib_array[i].num_a1_file; j++ )
 424                         {
 425                                 lib_array[i].a1_fname[j] = ( char * ) calloc (1, 1024 * sizeof ( char ) );
 426                         }
 427                 }
 428
 429                 if ( lib_array[i].num_a2_file )
 430                 {
 431                         lib_array[i].a2_fname = ( char ** ) calloc (1, lib_array[i].num_a2_file * sizeof ( char * ) );
 432
 433                         for ( j = 0; j < lib_array[i].num_a2_file; j++ )
 434                         {
 435                                 lib_array[i].a2_fname[j] = ( char * ) calloc (1, 1024 * sizeof ( char ) );
 436                         }
 437                 }
 438
 439                 if ( lib_array[i].num_q1_file )
 440                 {
 441                         lib_array[i].q1_fname = ( char ** ) calloc (1, lib_array[i].num_q1_file * sizeof ( char * ) );
 442
 443                         for ( j = 0; j < lib_array[i].num_q1_file; j++ )
 444                         {
 445                                 lib_array[i].q1_fname[j] = ( char * ) calloc (1, 1024 * sizeof ( char ) );
 446                         }
 447                 }
 448
 449                 if ( lib_array[i].num_q2_file )
 450                 {
 451                         lib_array[i].q2_fname = ( char ** ) calloc (1, lib_array[i].num_q2_file * sizeof ( char * ) );
 452
 453                         for ( j = 0; j < lib_array[i].num_q2_file; j++ )
 454                         {
 455                                 lib_array[i].q2_fname[j] = ( char * ) calloc (1, 1024 * sizeof ( char ) );
 456                         }
 457                 }
 458
 459                 if ( lib_array[i].num_b_file )  //allot memory for bam file name
 460                 {
 461                         lib_array[i].b_fname = ( char ** ) calloc (1, lib_array[i].num_b_file * sizeof ( char * ) );
 462
 463                         for ( j = 0; j < lib_array[i].num_b_file; j++ )
 464                                 { lib_array[i].b_fname[j] = ( char * ) calloc (1, 1024 * sizeof ( char ) ); }
 465                 }
 466         }
 467
 468         // get file names
 469         for ( i = 0; i < num_libs; i++ )
 470         {
 471                 lib_array[i].num_s_a_file = 0;
 472                 lib_array[i].num_s_q_file = 0;
 473                 lib_array[i].num_p_file = 0;
 474                 lib_array[i].num_a1_file = 0;
 475                 lib_array[i].num_a2_file = 0;
 476                 lib_array[i].num_q1_file = 0;
 477                 lib_array[i].num_q2_file = 0;
 478                 lib_array[i].num_b_file = 0;    //init
 479         }
 480
 481         rewind ( fp );
 482         i = -1;
 483
 484         while ( fgets ( line, 1024, fp ) )
 485         {
 486                 ch = line[5];
 487                 line[5] = '\0';
 488
 489                 if ( strcmp ( line, "[LIB]" ) == 0 )
 490                 {
 491                         i++;
 492                         continue;
 493                 }
 494
 495                 line[5] = ch;
 496                 flag = splitColumn ( line );
 497
 498                 if ( !flag )
 499                 {
 500                         continue;
 501                 }
 502
 503                 if ( strcmp ( tabs[0], "f1" ) == 0 )
 504                 {
 505                         index = lib_array[i].num_a1_file++;
 506                         strcpy ( lib_array[i].a1_fname[index], tabs[1] );
 507                 }
 508                 else if ( strcmp ( tabs[0], "q1" ) == 0 )
 509                 {
 510                         index = lib_array[i].num_q1_file++;
 511                         strcpy ( lib_array[i].q1_fname[index], tabs[1] );
 512                 }
 513                 else if ( strcmp ( tabs[0], "f2" ) == 0 )
 514                 {
 515                         index = lib_array[i].num_a2_file++;
 516                         strcpy ( lib_array[i].a2_fname[index], tabs[1] );
 517
 518                         if ( strcmp ( lib_array[i].a2_fname[index], lib_array[i].a1_fname[index] ) == 0 )
 519                         {
 520                                 fprintf ( stderr, "Config file error: f2 file is the same as f1 file\n" );
 521                                 fprintf ( stderr, "f1=%s\n", lib_array[i].a1_fname[index] );
 522                                 fprintf ( stderr, "f2=%s\n", lib_array[i].a2_fname[index] );
 523                                 exit ( -1 );
 524                         }
 525                 }
 526                 else if ( strcmp ( tabs[0], "q2" ) == 0 )
 527                 {
 528                         index = lib_array[i].num_q2_file++;
 529                         strcpy ( lib_array[i].q2_fname[index], tabs[1] );
 530
 531                         if ( strcmp ( lib_array[i].q2_fname[index], lib_array[i].q1_fname[index] ) == 0 )
 532                         {
 533                                 fprintf ( stderr, "Config file error: q2 file is the same as q1 file\n" );
 534                                 fprintf ( stderr, "q1=%s\n", lib_array[i].q1_fname[index] );
 535                                 fprintf ( stderr, "q2=%s\n", lib_array[i].q2_fname[index] );
 536                                 exit ( -1 );
 537                         }
 538                 }
 539                 else if ( strcmp ( tabs[0], "f" ) == 0 )
 540                 {
 541                         index = lib_array[i].num_s_a_file++;
 542                         strcpy ( lib_array[i].s_a_fname[index], tabs[1] );
 543                 }
 544                 else if ( strcmp ( tabs[0], "q" ) == 0 )
 545                 {
 546                         index = lib_array[i].num_s_q_file++;
 547                         strcpy ( lib_array[i].s_q_fname[index], tabs[1] );
 548                 }
 549                 else if ( strcmp ( tabs[0], "p" ) == 0 )
 550                 {
 551                         index = lib_array[i].num_p_file++;
 552                         strcpy ( lib_array[i].p_fname[index], tabs[1] );
 553                 }
 554                 else if ( strcmp ( tabs[0], "b" ) == 0 )
 555                 {
 556                         //bam file
 557                         index = lib_array[i].num_b_file++;
 558                         strcpy ( lib_array[i].b_fname[index], tabs[1] );
 559                 }
 560                 else if ( strcmp ( tabs[0], "min_ins" ) == 0 )
 561                 {
 562                         lib_array[i].min_ins = atoi ( tabs[1] );
 563                 }
 564                 else if ( strcmp ( tabs[0], "max_ins" ) == 0 )
 565                 {
 566                         lib_array[i].max_ins = atoi ( tabs[1] );
 567                 }
 568                 else if ( strcmp ( tabs[0], "avg_ins" ) == 0 )
 569                 {
 570                         lib_array[i].avg_ins = atoi ( tabs[1] );
 571                 }
 572         else if ( strcmp ( tabs[0], "sd" ) == 0)
 573         {
 574             lib_array[i].sd = atoi(tabs[1]);
 575         }
 576         else if( strcmp ( tabs[0], "qual_cutoff" ) == 0 )
 577         {
 578             lib_array[i].qual = atoi(tabs[1]);
 579         }
 580                 else if ( strcmp ( tabs[0], "rd_len_cutoff" ) == 0 )
 581                 {
 582                         lib_array[i].rd_len_cutoff = atoi ( tabs[1] );
 583                 }
 584                 else if ( strcmp ( tabs[0], "reverse_seq" ) == 0 )
 585                 {
 586                         lib_array[i].reverse = atoi ( tabs[1] );
 587                 }
 588                 else if ( strcmp ( tabs[0], "asm_flags" ) == 0 )
 589                 {
 590                         lib_array[i].asm_flag = atoi ( tabs[1] );
 591                 }
 592                 else if ( strcmp ( tabs[0], "rank" ) == 0 )
 593                 {
 594                         lib_array[i].rank = atoi ( tabs[1] );
 595                 }
 596                 else if ( strcmp ( tabs[0], "pair_num_cutoff" ) == 0 )
 597                 {
 598                         lib_array[i].pair_num_cut = atoi ( tabs[1] );
 599                 }
 600                 else if ( strcmp ( tabs[0], "rd_len_cutoff" ) == 0 )
 601                 {
 602                         lib_array[i].rd_len_cutoff = atoi ( tabs[1] );
 603                 }
 604                 else if ( strcmp ( tabs[0], "map_len" ) == 0 )
 605                 {
 606                         lib_array[i].map_len = atoi ( tabs[1] );
 607                 }
 608         }
 609
 610         for ( i = 0; i < num_libs; i++ )
 611         {
 612                 if ( pe[i] && lib_array[i].avg_ins == 0 )
 613                 {
 614                         fprintf ( stderr, "Config file error: PE reads need avg_ins in [LIB] %d\n", i + 1 );
 615                         exit ( -1 );
 616                 }
 617
 618         //check sd.
 619         if( pe[i] && lib_array[i].sd == 0)
 620         {
 621             if(lib_array[i].max_ins > 0 || lib_array[i].min_ins > 0)
 622             {
 623                 if( lib_array[i].max_ins > 0 && lib_array[i].min_ins > 0 )
 624                     lib_array[i].sd = lib_array[i].max_ins - lib_array[i].avg_ins > lib_array[i].min_ins - lib_array[i].avg_ins ? (lib_array[i].max_ins - lib_array[i].avg_ins)/3 : (lib_array[i].min_ins - lib_array[i].avg_ins)/3;
 625                 else if( lib_array[i].max_ins > 0 )
 626                     lib_array[i].sd = (lib_array[i].max_ins - lib_array[i].avg_ins)/3;
 627                 else
 628                     lib_array[i].sd = (lib_array[i].min_ins - lib_array[i].avg_ins)/3;
 629             }else{
 630                 lib_array[i].sd = lib_array[i].avg_ins * 0.05;
 631             }
 632         }
 633
 634         }
 635
 636         fclose ( fp );
 637         qsort ( &lib_array[0], num_libs, sizeof ( LIB_INFO ), cmp_lib );
 638 }
 639
 640 void free_libs (LIB_INFO* lib_array, int num_libs)
 641 {
 642         if ( !lib_array )
 643         {
 644                 return;
 645         }
 646
 647         int i, j;
 648         fprintf ( stderr, "LIB(s) information:\n" );
 649
 650         for ( i = 0; i < num_libs; i++ )
 651         {
 652                 fprintf ( stderr, " [LIB] %d, avg_ins %d, reverse %d.\n", i, lib_array[i].avg_ins, lib_array[i].reverse );
 653
 654                 if ( lib_array[i].num_s_a_file )
 655                 {
 656                         //printf("%d single fasta files\n",lib_array[i].num_s_a_file);
 657                         for ( j = 0; j < lib_array[i].num_s_a_file; j++ )
 658                         {
 659                                 free ( ( void * ) lib_array[i].s_a_fname[j] );
 660                         }
 661
 662                         free ( ( void * ) lib_array[i].s_a_fname );
 663                 }
 664
 665                 if ( lib_array[i].num_s_q_file )
 666                 {
 667                         //printf("%d single fastq files\n",lib_array[i].num_s_q_file);
 668                         for ( j = 0; j < lib_array[i].num_s_q_file; j++ )
 669                         {
 670                                 free ( ( void * ) lib_array[i].s_q_fname[j] );
 671                         }
 672
 673                         free ( ( void * ) lib_array[i].s_q_fname );
 674                 }
 675
 676                 if ( lib_array[i].num_p_file )
 677                 {
 678                         //printf("%d paired fasta files\n",lib_array[i].num_p_file);
 679                         for ( j = 0; j < lib_array[i].num_p_file; j++ )
 680                         {
 681                                 free ( ( void * ) lib_array[i].p_fname[j] );
 682                         }
 683
 684                         free ( ( void * ) lib_array[i].p_fname );
 685                 }
 686
 687                 if ( lib_array[i].num_a1_file )
 688                 {
 689                         //printf("%d read1 fasta files\n",lib_array[i].num_a1_file);
 690                         for ( j = 0; j < lib_array[i].num_a1_file; j++ )
 691                         {
 692                                 free ( ( void * ) lib_array[i].a1_fname[j] );
 693                         }
 694
 695                         free ( ( void * ) lib_array[i].a1_fname );
 696                 }
 697
 698                 if ( lib_array[i].num_a2_file )
 699                 {
 700                         //printf("%d read2 fasta files\n",lib_array[i].num_a2_file);
 701                         for ( j = 0; j < lib_array[i].num_a2_file; j++ )
 702                         {
 703                                 free ( ( void * ) lib_array[i].a2_fname[j] );
 704                         }
 705
 706                         free ( ( void * ) lib_array[i].a2_fname );
 707                 }
 708
 709                 if ( lib_array[i].num_q1_file )
 710                 {
 711                         //printf("%d read1 fastq files\n",lib_array[i].num_q1_file);
 712                         for ( j = 0; j < lib_array[i].num_q1_file; j++ )
 713                         {
 714                                 free ( ( void * ) lib_array[i].q1_fname[j] );
 715                         }
 716
 717                         free ( ( void * ) lib_array[i].q1_fname );
 718                 }
 719
 720                 if ( lib_array[i].num_q2_file )
 721                 {
 722                         //printf("%d read2 fastq files\n",lib_array[i].num_q2_file);
 723                         for ( j = 0; j < lib_array[i].num_q2_file; j++ )
 724                         {
 725                                 free ( ( void * ) lib_array[i].q2_fname[j] );
 726                         }
 727
 728                         free ( ( void * ) lib_array[i].q2_fname );
 729                 }
 730
 731                 if ( lib_array[i].num_b_file )
 732                 {
 733                         //free the bam file name
 734                         //printf("%d bam files\n",lib_array[i].num_b_file);
 735                         for ( j = 0; j < lib_array[i].num_b_file; j++ )
 736                                 { free ( ( void * ) lib_array[i].b_fname[j] ); }
 737
 738                         free ( ( void * ) lib_array[i].b_fname );
 739                 }
 740         }
 741
 742         num_libs = 0;
 743         free ( ( void * ) lib_array );
 744 }
 745 //***************** Initialize finished.
 746
 747 //***************** for debugging only start
 748 void debug_prefix( FILE* file, word prefix ) {
 749     for (int i = PREFIX_LENGTH - 1; i >= 0; --i) {
 750         int c = prefix % 5;
 751         prefix /= 5;
 752         fprintf( file, "%c", dna_char[c] );
 753     }
 754     fprintf( file, "^r" );
 755 }
 756
 757 void log(const char* format, ...) {  //add const to char* can remove warning.
 758     va_list args;
 759     va_start( args, format );
 760     vfprintf( stderr, format, args );
 761     va_end( args );
 762     fflush( stderr );
 763 }
 764
 765 // for debugging only
 766 void debug_word( FILE* file, word w ) {
 767     for (int i = CHARS_PER_WORD - 1; i >= 0; --i) {
 768         int c = (w>>(BITS_PER_CHAR*i)) & CHAR_MASK;
 769         fprintf( file, "%c", dna_char[c] );
 770     }
 771 }
 772
 773 // for debugging only
 774 void dump_special_prefix(int key) {
 775     for (int i = 0; i < SPECIAL_HANDLE_SUFFIX_LENGTH; ++i) {
 776         log("%c", dna_char[key / (SPECIAL_HANDLE_SUFFIX_NUM_BUCKETS / 5)]);
 777         key = key * 5 % SPECIAL_HANDLE_SUFFIX_NUM_BUCKETS;
 778     }
 779     log("\n");
 780 }
 781
 782 // for debug only
 783 void dump_read(word *read_p, int words_per_read, int read_len) {
 784     for (int i = 0; i < words_per_read; ++i) {
 785         for (int j = 0; j < CHARS_PER_WORD; ++j) {
 786             log("%c", dna_char[(read_p[i] >> (CHARS_PER_WORD - 1 - j) * BITS_PER_CHAR) & 3]);
 787             if (i * CHARS_PER_WORD + j == read_len - 1) {
 788                 log("_");
 789             }
 790         }
 791     }
 792     log("\n");
 793 }
 794
 795 void debug_suffix( word* start, struct global_data_t *_data ) {
 796     struct global_data_t &globals = *_data;
 797     for (int j=0; j<globals.words_per_read; ++j) {
 798         word w = *(start + j * globals.lv2_suffixes_row_width);
 799         for (int i = CHARS_PER_WORD - 1; i >= 0; --i) {
 800             int c = (w>>(BITS_PER_CHAR*i)) & CHAR_MASK;
 801             fprintf(stderr, "%c", dna_char[c]);
 802         }
 803     }
 804 }
 805 //***************** for debugging only end
 806
 807 inline void* MallocAndCheck(size_t size_in_byte,
 808                             const char *malloc_from_which_file = __FILE__,
 809                             int malloc_from_which_line = __LINE__) {
 810     void *ptr = malloc(size_in_byte);
 811     if (ptr == NULL && size_in_byte != 0) {
 812         log( "[ERROR] Ran out of memory while applying %llubytes\n", (unsigned long long)size_in_byte);
 813         log( "In file: %s, line %d\n", malloc_from_which_file, malloc_from_which_line);
 814         log( "There may be errors as follows:\n");
 815         log( "1) Not enough memory.\n");
 816         log( "2) The ARRAY may be overrode.\n");
 817         log( "3) The wild pointers.\n");
 818         exit(-1);
 819     }
 820
 821     return ptr;
 822 }
 823
 824 inline bool is_valid_special_prefix(int prefix_key) {
 825     // TODO hard
 826     if (prefix_key % 5 != 4) { return false; }
 827     for (int i = 0, start = 0; i < SPECIAL_HANDLE_SUFFIX_LENGTH; ++i) {
 828         if (prefix_key % 5 != 4) {
 829             start = 1;
 830         } else if (start) {
 831             return false;
 832         }
 833         prefix_key /= 5;
 834     }
 835     return prefix_key == 0;
 836 }
 837
 838 void* preprocess_fill_buckets( void *_data ) {
 839     struct section_data_t &locals = *( (struct section_data_t*)_data );
 840     struct global_data_t &globals = *(locals.parent);
 841     int section_id = locals.section_id;
 842     int *buckets = globals.section_buckets[ section_id ];
 843
 844     int read_len = globals.read_len;
 845     int words_per_read = globals.words_per_read;
 846     int w_shift = BITS_PER_WORD - BITS_PER_CHAR;
 847
 848     for ( int64 r = locals.section_start; r < locals.section_end; ++r )  {
 849         word *w_p = globals.packed_reads + ( words_per_read * r );
 850         word key = 0;
 851         word w = *(w_p++);
 852         for ( int i = 0; i < PREFIX_LENGTH - 1; ++i ) {
 853             key = key * 5 + ( w >> w_shift ); // replaced by this
 854             w <<= BITS_PER_CHAR;
 855         }
 856         for ( int i = PREFIX_LENGTH - 1; i < read_len; ++i ) {
 857             if ( i % CHARS_PER_WORD == 0 ) // TODO optimize
 858                 w = *(w_p++);
 859             key = ( key * 5 + ( w >> w_shift ) ) % NUM_BUCKETS; // replaced by this
 860             w <<= BITS_PER_CHAR;
 861             buckets[key]++;
 862         }
 863         int special_key = (key * 5 + SENTINEL_INT) % (SPECIAL_HANDLE_SUFFIX_NUM_BUCKETS * 5);
 864
 865         for ( int i = PREFIX_LENGTH - 1; i >= SPECIAL_HANDLE_SUFFIX_LENGTH; --i ) {
 866             key = ( key * 5 + SENTINEL_INT ) % NUM_BUCKETS; // replaced by this
 867             buckets[key]++;
 868         }
 869
 870         // special handle buckets
 871         for ( int i = 0; i < SPECIAL_HANDLE_SUFFIX_LENGTH; ++i ) {
 872             int index = special_key % SPECIAL_HANDLE_SUFFIX_NUM_BUCKETS;
 873             assert(is_valid_special_prefix(index));
 874             assert(special_key / SPECIAL_HANDLE_SUFFIX_NUM_BUCKETS != 4);
 875             locals.bwt_buffer[index].push_back(dna_char[ special_key / SPECIAL_HANDLE_SUFFIX_NUM_BUCKETS ]);
 876             special_key = (special_key * 5 + SENTINEL_INT) % (SPECIAL_HANDLE_SUFFIX_NUM_BUCKETS * 5);
 877         }
 878     }
 879
 880     return NULL;
 881 }
 882
 883 void* lv1_fetch_offsets( void *_data) {
 884     struct section_data_t &locals = *( (struct section_data_t*)_data );
 885     struct global_data_t &globals = *(locals.parent);
 886
 887     const int bucket_start = globals.lv1_bucket_start;
 888     const int bucket_end = globals.lv1_bucket_end;
 889
 890     const int section_start = locals.section_start;
 891     const int section_end = locals.section_end;
 892
 893     int read_len = globals.read_len;
 894     int words_per_read = globals.words_per_read;
 895     int w_shift = BITS_PER_WORD - BITS_PER_CHAR;
 896     int64 *offset_pos = locals.offset_pos;
 897
 898     int id_shift = globals.offset_num_bits;
 899     int64 *recent_offsets = (int64*) malloc( NUM_BUCKETS * sizeof( int64 ) ); // for calculating differences
 900     int64 recent_base = ( locals.section_start << id_shift ); // offset of first read first char
 901     for ( int b = bucket_start; b < bucket_end; ++b ) {
 902         recent_offsets[b] = recent_base;
 903     }
 904     locals.offset_diff_base = recent_base;
 905
 906     // scan all reads and record offsets
 907     for ( int64 r = section_start; r < section_end; ++r )  {
 908         word *w_p = globals.packed_reads + ( words_per_read * r );
 909         word key = 0;
 910         word w = *(w_p++);
 911         //        int prev_char = SENTINEL_INT;
 912         for ( int i = 0; i < PREFIX_LENGTH - 1; ++i ) {
 913             //            key = ((key<<BITS_PER_CHAR) | (w>>w_shift));
 914             key = key * 5 + ( w >> w_shift ); // replaced by this
 915             w <<= BITS_PER_CHAR;
 916         }
 917         for ( int i = PREFIX_LENGTH - 1; i < read_len; ++i ) {
 918             if ( i % CHARS_PER_WORD == 0 ) // TODO optimize
 919                 w = *(w_p++);
 920             //            key = (((key<<BITS_PER_CHAR)&PREFIX_MASK)|(w>>w_shift));
 921             key = ( key * 5 + ( w >> w_shift ) ) % NUM_BUCKETS; // replaced by this
 922             w <<= BITS_PER_CHAR;
 923             // check if key within range
 924             // TODO 0x80000000 hardcode
 925             // this condition is equivalent to bucket_start <= key < bucket_end
 926             if ( (( key - bucket_start ) ^ ( key - bucket_end )) & 0x80000000 ) {
 927                 // read ID = r, within-read offset = i - PREFIX_LENGTH + 1
 928                 int64 true_offset = ( r << id_shift ) | ( i - PREFIX_LENGTH + 1 );
 929                 int64 diff_offset = true_offset - recent_offsets[ key ];
 930
 931                 if ( diff_offset < 0 ) { // TODO remove this assertion
 932                     log("ERROR: offset diff negative (%lld %lld)! ABORT.\n", recent_offsets[key], true_offset);
 933                     exit(1);
 934                 }
 935                 if ( diff_offset > 2147483647 ) {
 936                     pthread_spin_lock(&globals.large_diff_lock);
 937                     diff_offset = -1 - (int64)globals.v_large_diff.size();
 938                     globals.v_large_diff.push_back(true_offset);
 939                     pthread_spin_unlock(&globals.large_diff_lock);
 940                 }
 941                 globals.lv1_offsets[ offset_pos[key]++ ] = diff_offset;
 942                 recent_offsets[ key ] = true_offset;
 943             }
 944         }
 945         for ( int i = PREFIX_LENGTH - 1; i >= SPECIAL_HANDLE_SUFFIX_LENGTH; --i ) {
 946             key = ( key * 5 + SENTINEL_INT ) % NUM_BUCKETS; // replaced by this
 947             // check if key within range
 948             // TODO 0x80000000 hardcode
 949             // this condition is equivalent to bucket_start <= key < bucket_end
 950             if ( (( key - bucket_start ) ^ ( key - bucket_end )) & 0x80000000 ) {
 951                 // read ID = r, within-read offset = read_len - i
 952                 int64 true_offset = ( r << id_shift ) | ( read_len - i );
 953                 int64 diff_offset = true_offset - recent_offsets[ key ];
 954
 955                 if ( diff_offset < 0 ) { // TODO remove this assertion
 956                     log("ERROR: offset diff negative (%lld %lld)! ABORT.\n", recent_offsets[key], true_offset);
 957                     exit(1);
 958                 }
 959                 if ( diff_offset > 2147483647 ) {
 960                     pthread_spin_lock(&globals.large_diff_lock);
 961                     diff_offset = -1 - (int64)globals.v_large_diff.size();
 962                     globals.v_large_diff.push_back(true_offset);
 963                     pthread_spin_unlock(&globals.large_diff_lock);
 964                 }
 965                 globals.lv1_offsets[ offset_pos[key]++ ] = diff_offset;
 966                 recent_offsets[ key ] = true_offset;
 967             }
 968         }
 969
 970     }
 971
 972     free( recent_offsets );
 973
 974     return NULL;
 975 }
 976
 977 void* lv2_fetch_suffixes( void *_data ){
 978     struct suffix_fetch_data_t &fetch = *( (struct suffix_fetch_data_t*)_data );
 979     struct global_data_t &globals = *(fetch.parent);
 980
 981     int read_len = globals.read_len;
 982     int offset_num_bits = 1;
 983     while ((1 << offset_num_bits) - 1 < read_len) {
 984         ++offset_num_bits;
 985     }
 986     int words_per_read = globals.words_per_read;
 987     int words_per_suffix = globals.words_per_suffix;
 988     const int bucket_start = fetch.bucket_start;
 989     const int bucket_end = fetch.bucket_end;
 990     word *packed_reads = globals.packed_reads;
 991
 992     int id_shift = globals.offset_num_bits; // offsets are in the form [read_id][read_offset]; id_shift = # bits for read_offset
 993     int read_offset_mask = globals.offset_mask;
 994
 995     word *suffix_p = globals.lv2_suffixes + fetch.index_start;
 996     int num_suffixes = fetch.index_start;
 997
 998     int *offset_p = globals.lv1_offsets + globals.lv2_offset_start_index + fetch.index_start;
 999     for ( int b = bucket_start; b < bucket_end; ++b ) {
1000
1001         for ( int section = 0; section < globals.num_cpu_threads; ++section ) {
1002
1003             int64 offset = globals.locals[section].offset_diff_base; // reset per local bucket
1004             int64 num_in_section_bucket = globals.section_buckets[section][b];
1005             while ( num_in_section_bucket-- ) {
1006                 if (*offset_p < 0) {
1007                     offset = globals.v_large_diff[-(*offset_p++) - 1];
1008                 } else {
1009                     offset += *(offset_p++); // remember we are using difference encoding
1010                 }
1011                 // offset in the form [read_id][read_offset]
1012                 int64 read_id = offset >> id_shift;
1013                 int read_offset = offset & read_offset_mask;
1014                 word *read_p = packed_reads + ( words_per_read * read_id );
1015
1016                 // find previous character (the char to be stored in BWT)
1017                 int prev_char;
1018                 if ( !read_offset ) { // special case
1019                     prev_char = SENTINEL_INT;
1020                     globals.lv2_read_id[ num_suffixes ] = read_id;
1021                 } else {
1022                     int prev_which_word = ( read_offset - 1 ) / CHARS_PER_WORD;
1023                     int prev_word_offset = ( read_offset - 1 ) % CHARS_PER_WORD;
1024                     word w = *(read_p + prev_which_word);
1025                     prev_char = ( w >> ( BITS_PER_WORD - BITS_PER_CHAR - prev_word_offset * 2 ) ) & CHAR_MASK;
1026                 }
1027
1028                 // copy words of the suffix to the suffix pool
1029                 int which_word = read_offset / CHARS_PER_WORD;
1030                 int word_offset = read_offset % CHARS_PER_WORD;
1031                 word *dest = suffix_p;
1032                 word *src = read_p + which_word;
1033                 if ( !word_offset ) { // special case (word aligned), easy
1034                     while ( which_word < words_per_read ) {
1035                         *dest = *src; // write out
1036                         dest += globals.lv2_suffixes_row_width;
1037                         src++;
1038                         which_word++;
1039                     }
1040                 } else { // not word-aligned
1041                     int bit_shift = read_offset * 2;
1042                     word s = *src;
1043                     word d = s << bit_shift;
1044                     which_word++;
1045                     while ( which_word < words_per_read ) {
1046                         s = *(++src);
1047                         d |= s >> (BITS_PER_WORD - bit_shift);
1048                         *dest = d; // write out
1049                         dest += globals.lv2_suffixes_row_width;
1050                         d = s << bit_shift;
1051                         which_word++;
1052                     }
1053                     // write last word
1054                     *dest = d | ( 0xFFFFFFFF >> ( BITS_PER_WORD - bit_shift ) ); // bit padding by '1's. TODO hard
1055                 }
1056                 *(suffix_p + (globals.lv2_suffixes_row_width * (words_per_suffix - 1) ) ) &= ~globals.offset_mask; // TODO hard
1057                 *(suffix_p + (globals.lv2_suffixes_row_width * (words_per_suffix - 1) ) ) |= read_offset;
1058                 suffix_p++;
1059                 globals.sort_indexes[ num_suffixes ] = ( prev_char << 29 ) | num_suffixes; // TODO hard
1060                 num_suffixes++;
1061             }
1062         }
1063     }
1064
1065     return NULL;
1066 }
1067
1068 int get_base5_prefix(word item) {
1069     item >>= (CHARS_PER_WORD - SPECIAL_HANDLE_SUFFIX_LENGTH) * BITS_PER_CHAR;
1070     static int base4_mask = (1 << SPECIAL_HANDLE_SUFFIX_LENGTH * BITS_PER_CHAR) - 1;
1071     int ret = 0;
1072     for (int i = 0; i < SPECIAL_HANDLE_SUFFIX_LENGTH; ++i) {
1073         ret = ret * 5 + (item >> (SPECIAL_HANDLE_SUFFIX_LENGTH - 1) * BITS_PER_CHAR);
1074         item = item * 4 & base4_mask;
1075     }
1076     return ret;
1077 }
1078
1079 inline unsigned int mirror(unsigned int v) {
1080     // swap consecutive pairs
1081     v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2);
1082     // swap nibbles ...
1083     v = ((v >> 4) & 0x0F0F0F0F) | ((v & 0x0F0F0F0F) << 4);
1084     // swap bytes
1085     v = ((v >> 8) & 0x00FF00FF) | ((v & 0x00FF00FF) << 8);
1086     // swap 2-byte long pairs
1087     v = ( v >> 16             ) | ( v               << 16);
1088     return v;
1089 }
1090
1091 inline void reverse_one_read(word *read_p, int words_per_read, int last_shift) {
1092     // shift all words
1093     if (last_shift > 0) {
1094         for (int i = words_per_read - 1; i > 0; --i) {
1095             read_p[i] >>= last_shift;
1096             read_p[i] |= read_p[i - 1] << (BITS_PER_WORD - last_shift);
1097         }
1098         read_p[0] >>= last_shift;
1099         read_p[0] |= -(1 << (BITS_PER_WORD - last_shift));
1100     }
1101     // mirror all words
1102     for (int i = 0; i < words_per_read; ++i) {
1103         read_p[i] = mirror(read_p[i]);
1104     }
1105 }
1106
1107 void reverse_all_reads(global_data_t &globals) {
1108     omp_set_num_threads(globals.num_cpu_threads);
1109     int last_shift = globals.read_len * BITS_PER_CHAR % BITS_PER_WORD;
1110     if (last_shift != 0) last_shift = BITS_PER_WORD - last_shift;
1111
1112 #pragma omp for
1113     for (int64 i = 0; i < globals.num_reads; ++i) {
1114         reverse_one_read(globals.packed_reads + globals.words_per_read * i, globals.words_per_read, last_shift);
1115     }
1116
1117     int64 mid = globals.num_reads / 2;
1118
1119 #pragma omp for
1120     for (int64 i = 0; i < mid; ++i) {
1121         int64 i2 = globals.num_reads - 1 - i;
1122         for (int j = 0; j < globals.words_per_read; ++j) {
1123             swap(globals.packed_reads[i * globals.words_per_read + j], globals.packed_reads[(i2 + 1) * globals.words_per_read - 1 - j]);
1124         }
1125     }
1126
1127     if (globals.num_reads % 2 == 1) {
1128         for (int j = 0; j < globals.words_per_read / 2; ++j) {
1129             swap(globals.packed_reads[mid * globals.words_per_read + j], globals.packed_reads[(mid + 1) * globals.words_per_read - 1 - j]);
1130         }
1131     }
1132 }
1133
1134 void init_global_data(global_data_t &globals, int read_len, int64 max_host_mem, int number_cpu_threads) {
1135     globals.read_len = read_len;
1136     globals.words_per_read = ( read_len + ( CHARS_PER_WORD - 1 ) ) / CHARS_PER_WORD;
1137     globals.num_reads = 0;
1138     globals.max_num_reads = max_host_mem * 0.8 / (globals.words_per_read * sizeof(word));
1139     globals.num_cpu_threads = number_cpu_threads;
1140     globals.max_lv2_entries = 0;
1141
1142     globals.offset_num_bits = 1;
1143     while ((1 << globals.offset_num_bits) - 1 < read_len) {
1144         ++globals.offset_num_bits;
1145     }
1146     globals.offset_mask = (1 << globals.offset_num_bits) - 1;
1147     globals.words_per_suffix = (read_len * BITS_PER_CHAR + globals.offset_num_bits + BITS_PER_WORD - 1) / BITS_PER_WORD;
1148
1149     log("Max host mem: %lld, max number of reads can be loaded: %lld\n", max_host_mem, globals.max_num_reads);
1150     log("Words per read: %d, words per suffix: %d\n", globals.words_per_read, globals.words_per_suffix);
1151 }
1152
1153 void set_global_bucket(global_data_t &globals)
1154 {
1155     for ( int t = 0; t < globals.num_cpu_threads; ++t ) {
1156         globals.locals[t].parent = &globals;
1157         globals.locals[t].section_id = t;
1158     }
1159     for ( int t = 0; t < globals.num_cpu_threads - 1; ++t ) {
1160         globals.suffix_fetches[t].parent = &globals;
1161         globals.suffix_fetches[t].thread_id = t;
1162     }
1163
1164     globals.buckets = NULL;
1165     for ( int t = 0; t < globals.num_cpu_threads; ++t ) {
1166         globals.section_buckets[t] = NULL;
1167         globals.locals[t].offset_pos = NULL;
1168     }
1169
1170     // realloc
1171     int64 mem_packed_reads = ( globals.num_reads + 32 ) * globals.words_per_read * sizeof(word); // +32 for safety
1172     globals.packed_reads = (word*) realloc( globals.packed_reads, mem_packed_reads );
1173     if ( ! globals.packed_reads ) {
1174         fprintf( stderr, "realloc error, quit\n" );
1175         exit(1);
1176     }
1177     log("Mem allocated for packed reads: %lld bytes\n", mem_packed_reads );
1178
1179     { // distribute reads among threads
1180         int64 each = globals.num_reads / globals.num_cpu_threads;
1181         for ( int t = 0; t < globals.num_cpu_threads - 1; ++t ) {
1182             globals.locals[t].section_start = t * each;
1183             globals.locals[t].section_end = (t + 1) * each;
1184         }
1185         globals.locals[ globals.num_cpu_threads - 1 ].section_start = (globals.num_cpu_threads-1) * each;
1186         globals.locals[ globals.num_cpu_threads - 1 ].section_end = globals.num_reads;
1187     }
1188 }
1189
1190
1191 void set_global_after_reading_initial(global_data_t &globals, int64 max_host_mem, int64 free_gpu_mem) {
1192     for ( int t = 0; t < globals.num_cpu_threads; ++t ) {
1193         globals.locals[t].parent = &globals;
1194         globals.locals[t].section_id = t;
1195     }
1196     for ( int t = 0; t < globals.num_cpu_threads - 1; ++t ) {
1197         globals.suffix_fetches[t].parent = &globals;
1198         globals.suffix_fetches[t].thread_id = t;
1199     }
1200
1201     globals.buckets = NULL;
1202     for ( int t = 0; t < globals.num_cpu_threads; ++t ) {
1203         globals.section_buckets[t] = NULL;
1204         globals.locals[t].offset_pos = NULL;
1205     }
1206
1207     // realloc
1208     int64 mem_packed_reads = ( globals.num_reads + 32 ) * globals.words_per_read * sizeof(word); // +32 for safety
1209     globals.packed_reads = (word*) realloc( globals.packed_reads, mem_packed_reads );
1210     if ( ! globals.packed_reads ) {
1211         fprintf( stderr, "realloc error, quit\n" );
1212         exit(1);
1213     }
1214     log("Mem allocated for packed reads: %lld bytes\n", mem_packed_reads );
1215
1216     { // distribute reads among threads
1217         int64 each = globals.num_reads / globals.num_cpu_threads;
1218         for ( int t = 0; t < globals.num_cpu_threads - 1; ++t ) {
1219             globals.locals[t].section_start = t * each;
1220             globals.locals[t].section_end = (t + 1) * each;
1221         }
1222         globals.locals[ globals.num_cpu_threads - 1 ].section_start = (globals.num_cpu_threads-1) * each;
1223         globals.locals[ globals.num_cpu_threads - 1 ].section_end = globals.num_reads;
1224     }
1225
1226     // preprocessing - compute memory limits
1227
1228     free_gpu_mem -= 1073741824; // remain 1G for b40c
1229     const int GPU_BYTES_PER_ENTRY = 16; // key, value, and x2 for radix sort buffer
1230     int64 max_lv2_entries = free_gpu_mem / GPU_BYTES_PER_ENTRY;
1231
1232     if (max_lv2_entries >= (1 << 29)) {
1233         max_lv2_entries = (1 << 29) - 1; // TODO hard
1234     }
1235     globals.max_lv2_entries = max_lv2_entries;
1236     log( "Free GPU MEM = %lld\n", free_gpu_mem);
1237
1238     int64 lv2_bytes_per_entry = ( globals.words_per_suffix + 1 + 1) * sizeof(word); // read_id_table, permutation
1239     int64 mem_lv2 = lv2_bytes_per_entry * max_lv2_entries * 2; // *2 for double buffering
1240
1241     log( "max host mem: %lld, mem_packed_reads %lld, mem_lv2 %lld, global.num_reads %lld", max_host_mem, mem_packed_reads, mem_lv2, globals.num_reads * sizeof(char) * SPECIAL_HANDLE_SUFFIX_LENGTH * 2);
1242     int64 avail_host_mem = max_host_mem - mem_packed_reads - mem_lv2 - globals.num_reads * sizeof(char) * SPECIAL_HANDLE_SUFFIX_LENGTH * 2;
1243
1244     if(avail_host_mem < 1024*1024)
1245     {
1246         log( "Please set max_host_mem larger than: %lld\n", mem_packed_reads + mem_lv2 + globals.num_reads * sizeof(char) * SPECIAL_HANDLE_SUFFIX_LENGTH * 2 + 1024*1024);
1247         exit(2);
1248     }
1249     const int LV1_BYTES_PER_ENTRY = 4; // 32-bit adjacent-difference representation
1250     int64 max_lv1_entries = avail_host_mem / LV1_BYTES_PER_ENTRY;
1251     globals.max_lv1_entries = max_lv1_entries;
1252
1253     log( "Avail host MEM = %lld\n", avail_host_mem );
1254     log( "Max # of lv.1 entries = %lld\n", max_lv1_entries );
1255     log( "Max # of lv.2 entries = %lld\n", max_lv2_entries );
1256     log( "\n" );
1257
1258 #ifdef DISABLE_GPU
1259     globals.cpu_sort_space = (uint64_t*) MallocAndCheck(sizeof(uint64_t) * globals.max_lv2_entries, __FILE__, __LINE__); // as CPU memory is used to simulate GPU
1260     assert(globals.cpu_sort_space != NULL);
1261 #endif
1262
1263     // allocate memory for lv.1 and lv.2 arrays
1264     globals.lv1_offsets = (int*) malloc( max_lv1_entries * sizeof(int) );
1265     globals.lv2_suffixes = (word*) malloc( max_lv2_entries * globals.words_per_suffix * sizeof(word) );
1266     globals.lv2_suffixes_row_width = max_lv2_entries;
1267     globals.lv2_read_id = (unsigned int*) malloc( max_lv2_entries * sizeof(unsigned int) );
1268     globals.sort_indexes = (word*) malloc( max_lv2_entries * sizeof(word) );
1269
1270     globals.lv2_suffixes_to_output = (word*) malloc( max_lv2_entries * globals.words_per_suffix * sizeof(word) );
1271     globals.lv2_read_id_to_output = (unsigned int*) malloc( max_lv2_entries * sizeof(unsigned int) );
1272     globals.sort_indexes_to_output = (word*) malloc( max_lv2_entries * sizeof(word) );
1273
1274     pthread_spin_init(&globals.large_diff_lock, 0);
1275 }
1276
1277 void set_global_after_lv0(global_data_t &globals, int64 max_host_mem, int64 free_gpu_mem, uint64_t max_bucket_size)
1278 {
1279     int64 max_lv2_entries;
1280
1281 #ifdef DISABLE_GPU
1282     if(max_bucket_size*1.2 < 2*1024*1024)
1283         max_lv2_entries = 2*1024*1024;
1284     else
1285         max_lv2_entries = max_bucket_size * 1.2;
1286
1287     globals.max_lv2_entries = max_lv2_entries;
1288     globals.cpu_sort_space = (uint64_t*) MallocAndCheck(sizeof(uint64_t) * globals.max_lv2_entries, __FILE__, __LINE__); // as CPU memory is used to simulate GPU
1289     assert(globals.cpu_sort_space != NULL);
1290 #else
1291     const int GPU_BYTES_PER_ENTRY = 16; // key, value, and x2 for radix sort buffer
1292     max_lv2_entries = (free_gpu_mem - 1073741824) / GPU_BYTES_PER_ENTRY;
1293     if (max_lv2_entries >= (1 << 29)) {
1294         max_lv2_entries = (1 << 29) - 1;
1295     }
1296     if(max_lv2_entries < (int64_t)max_bucket_size)
1297     {
1298         log ("Too many items for GPU sorting %lld > %lld! Please try larger GPU memory or use CPU version instead.", (int64_t)max_bucket_size, max_lv2_entries);
1299         exit(1);
1300     }
1301     globals.max_lv2_entries = max_lv2_entries;
1302 #endif
1303     int64 lv2_bytes_per_entry = ( globals.words_per_suffix + 1 + 1) * sizeof(word); // read_id_table, permutation
1304     int64 mem_lv2 = lv2_bytes_per_entry * max_lv2_entries * 2; // *2 for double buffering
1305
1306     int64 mem_packed_reads = ( globals.num_reads + 32 ) * globals.words_per_read * sizeof(word); // +32 for safety
1307
1308     log( "max host mem: %lld, mem_packed_reads %lld, mem_lv2 %lld, global.num_reads %lld", max_host_mem, mem_packed_reads, mem_lv2, globals.num_reads * sizeof(char) * SPECIAL_HANDLE_SUFFIX_LENGTH * 2);
1309     int64 avail_host_mem = max_host_mem - mem_packed_reads - mem_lv2 - globals.num_reads * sizeof(char) * SPECIAL_HANDLE_SUFFIX_LENGTH * 2;
1310
1311     if(avail_host_mem < 1024*1024)
1312     {
1313         log( "Please set max_host_mem larger than: %lld\n", mem_packed_reads + mem_lv2 + globals.num_reads * sizeof(char) * SPECIAL_HANDLE_SUFFIX_LENGTH * 2 + 1024*1024);
1314         exit(2);
1315     }
1316
1317     const int LV1_BYTES_PER_ENTRY = 4; // 32-bit adjacent-difference representation
1318     int64 max_lv1_entries = avail_host_mem / LV1_BYTES_PER_ENTRY;
1319     globals.max_lv1_entries = max_lv1_entries;
1320
1321     log( "Avail host MEM = %lld\n", avail_host_mem );
1322     log( "Max # of lv.1 entries = %lld\n", max_lv1_entries );
1323     log( "Max # of lv.2 entries = %lld\n", max_lv2_entries );
1324     log( "\n" );
1325
1326     globals.lv1_offsets = (int*) malloc( max_lv1_entries * sizeof(int) );
1327     globals.lv2_suffixes = (word*) malloc( max_lv2_entries * globals.words_per_suffix * sizeof(word) );
1328     globals.lv2_suffixes_row_width = max_lv2_entries;
1329     globals.lv2_read_id = (unsigned int*) malloc( max_lv2_entries * sizeof(unsigned int) );
1330     globals.sort_indexes = (word*) malloc( max_lv2_entries * sizeof(word) );
1331
1332     globals.lv2_suffixes_to_output = (word*) malloc( max_lv2_entries * globals.words_per_suffix * sizeof(word) );
1333     globals.lv2_read_id_to_output = (unsigned int*) malloc( max_lv2_entries * sizeof(unsigned int) );
1334     globals.sort_indexes_to_output = (word*) malloc( max_lv2_entries * sizeof(word) );
1335
1336     pthread_spin_init(&globals.large_diff_lock, 0);
1337 }
1338
1339 uint64_t lv0_fill_buckets(global_data_t &globals) {
1340     // preprocessing - fill buckets
1341     if (globals.buckets == NULL)
1342         globals.buckets = (int*) malloc( NUM_BUCKETS * sizeof(int) );
1343     for ( int t = 0; t < globals.num_cpu_threads; ++t ) {
1344         if (globals.section_buckets[t] == NULL)
1345             globals.section_buckets[t] = (int*) malloc( NUM_BUCKETS * sizeof(int) );
1346         memset(globals.section_buckets[t], 0, sizeof(int) * NUM_BUCKETS);
1347     }
1348     log( "Preprocessing: filling buckets... " );
1349     for ( int t = 1; t < globals.num_cpu_threads; ++t ) {
1350         pthread_create( &globals.threads[t], NULL, preprocess_fill_buckets, &globals.locals[t] );
1351     }
1352     preprocess_fill_buckets( &globals.locals[0] );
1353     for ( int t = 1; t < globals.num_cpu_threads; ++t ) {
1354         pthread_join( globals.threads[t], NULL );
1355     }
1356     log( " Done\n" );
1357
1358     // sum up local buckets
1359     memset( globals.buckets, 0, NUM_BUCKETS * sizeof(int) );
1360     for ( int t = 0; t < globals.num_cpu_threads; ++t ) {
1361         for ( int b = 0; b < NUM_BUCKETS; ++b )
1362             globals.buckets[b] += globals.section_buckets[t][b];
1363         if (globals.locals[t].offset_pos == NULL)
1364             globals.locals[t].offset_pos = (int64*) malloc( NUM_BUCKETS * sizeof( int64 ) );
1365         memset(globals.locals[t].offset_pos, 0, sizeof(int64) * NUM_BUCKETS);
1366     }
1367
1368     int64_t max_buckets=0;
1369     for (int i = 0; i < NUM_BUCKETS; ++i)
1370     {
1371         if(globals.max_lv2_entries == 0)
1372         {
1373             if(globals.buckets[i] > max_buckets)
1374                 max_buckets = globals.buckets[i];
1375         }else{
1376             if (globals.buckets[i] > globals.max_lv2_entries) {
1377                 fprintf(stderr, "Too many items for GPU sorting! Bucket: %d, number of items: %d\n", i, globals.buckets[i]);
1378                 exit(1);
1379             }
1380         }
1381     }
1382     if(globals.max_lv2_entries == 0)
1383         log ("Maximum Bucket size: %lld\n", max_buckets);
1384
1385     return max_buckets;
1386 }
1387
1388 void* lv2_output_thread(void *_data) {
1389     global_data_t &globals = *((global_data_t*)_data);
1390     log( "Writing output... \n" );
1391     for (int i = 0; i < globals.num_lv2_to_output; ++i) {
1392         int c = globals.sort_indexes_to_output[i] >> 29; // TODO hard
1393         int cur_prefix = get_base5_prefix( globals.lv2_suffixes_to_output[globals.sort_indexes_to_output[i] & ((1 << 29) - 1)]);
1394         assert(cur_prefix < SPECIAL_HANDLE_SUFFIX_NUM_BUCKETS);
1395         while (cur_prefix > globals.cur_special_handle_prefix) {
1396             dump_special_prefix(globals.cur_special_handle_prefix);
1397             for ( int t = 0; t < globals.num_cpu_threads; ++t ) {
1398                 fwrite( globals.locals[t].bwt_buffer[globals.cur_special_handle_prefix].c_str(), sizeof(char),
1399                         globals.locals[t].bwt_buffer[globals.cur_special_handle_prefix].size(), globals.out_file );
1400                 globals.locals[t].bwt_buffer[globals.cur_special_handle_prefix].clear();
1401             }
1402
1403             ++globals.cur_special_handle_prefix;
1404             while (globals.cur_special_handle_prefix < SPECIAL_HANDLE_SUFFIX_NUM_BUCKETS) {
1405                 if (is_valid_special_prefix(globals.cur_special_handle_prefix)) {
1406                     break;
1407                 }
1408                 ++globals.cur_special_handle_prefix;
1409             }
1410         }
1411         fprintf(globals.out_file, "%c", dna_char[ c ] );
1412         if ( c == SENTINEL_INT && globals.need_ridt ) {
1413             // TODO offsets are accumulated....
1414             unsigned int id = globals.lv2_read_id_to_output[ globals.sort_indexes_to_output[i] & 0x1FFFFFFF ]; // TODO hard
1415             fwrite( &id, sizeof(unsigned int), 1, globals.map_out_file );
1416         }
1417     }
1418     return NULL;
1419 }
1420
1421 void build_bwt(global_data_t &globals, const char *output_prefix, bool need_ridt = false) {
1422
1423     for (int64 i = globals.num_reads - 10; i < globals.num_reads; ++i) {
1424         dump_read(globals.packed_reads + i * globals.words_per_read, globals.words_per_read, globals.read_len);
1425     }
1426
1427     pthread_t output_thread;
1428     bool output_started = false;
1429
1430     if(need_ridt)
1431         globals.out_file = fopen64( (string(output_prefix) + ".f.bwt.ascii").c_str(), "w" );
1432     else
1433         globals.out_file = fopen64( (string(output_prefix) + ".r.bwt.ascii").c_str(), "w" );
1434     globals.need_ridt = need_ridt;
1435
1436     if (need_ridt) {
1437         globals.map_out_file = fopen64( (string(output_prefix) + ".ridt").c_str(), "wb" );
1438         fwrite( &globals.num_reads, sizeof(unsigned int), 1, globals.map_out_file );
1439     }
1440     // level 1 loop - pick consecutive buckets
1441     int lv1_round = 0;
1442     globals.cur_special_handle_prefix = 0;
1443     while (!is_valid_special_prefix(globals.cur_special_handle_prefix)) {
1444         ++globals.cur_special_handle_prefix;
1445     }
1446     globals.lv1_bucket_start = 0;
1447     while ( globals.lv1_bucket_start < NUM_BUCKETS ) {
1448         lv1_round++;
1449         globals.v_large_diff.clear();
1450         int64 num_lv1 = globals.buckets[ globals.lv1_bucket_start ];
1451         int temp_end = globals.lv1_bucket_start + 1; // remember that 'end' is exclusive
1452         while ( temp_end < NUM_BUCKETS && num_lv1 + globals.buckets[ temp_end ] <= globals.max_lv1_entries ) {
1453             num_lv1 += globals.buckets[ temp_end++ ];
1454         }
1455         globals.lv1_bucket_end = temp_end; // remember that 'end' is exclusive
1456
1457         log( "Lv.1 Round %d: buckets %d to %d, num items = %lld\n", lv1_round,
1458              globals.lv1_bucket_start, globals.lv1_bucket_end-1, num_lv1 );
1459         // lv 1 bucket start & end found! fetch the offsets now
1460
1461         // compute bucket positions for the offset arrays
1462         { // t = 0
1463             int64 *pos = globals.locals[0].offset_pos;
1464             pos[globals.lv1_bucket_start] = 0;
1465             for ( int i = globals.lv1_bucket_start+1; i < globals.lv1_bucket_end; ++i ) {
1466                 pos[i] = pos[i-1] + globals.buckets[i-1];
1467             }
1468         }
1469
1470         for ( int t = 1; t < globals.num_cpu_threads; ++t ) { // t > 0
1471             int64 *pos = globals.locals[t].offset_pos + globals.lv1_bucket_start;
1472             int64 *pre_pos = globals.locals[t-1].offset_pos + globals.lv1_bucket_start;
1473             int *pre_buckets = globals.section_buckets[t-1] + globals.lv1_bucket_start;
1474             for ( int i = globals.lv1_bucket_start; i < globals.lv1_bucket_end; ++i ) {
1475                 *(pos++) = *(pre_pos++) + *(pre_buckets++);
1476             }
1477         }
1478
1479         log( "  Fetching offsets... " );
1480         for ( int t = 1; t < globals.num_cpu_threads; ++t ) {
1481             pthread_create( &globals.threads[t], NULL, lv1_fetch_offsets, &globals.locals[t] );
1482         }
1483         lv1_fetch_offsets( &globals.locals[0] );
1484         for ( int t = 1; t < globals.num_cpu_threads; ++t ) {
1485             pthread_join( globals.threads[t], NULL );
1486         }
1487         log( "Done. Number of large diff: %lld\n", globals.v_large_diff.size());
1488
1489         // level 2 loop
1490         int lv2_round = 0;
1491         globals.lv2_offset_start_index = 0;
1492         globals.lv2_bucket_start = globals.lv1_bucket_start;
1493         while ( globals.lv2_bucket_start < globals.lv1_bucket_end ) {
1494             lv2_round++;
1495             int64 num_lv2 = globals.buckets[ globals.lv2_bucket_start ];
1496             if (num_lv2 > globals.max_lv2_entries) { // TROUBLESOME CASES
1497                 fprintf( stderr, "ERROR: Single bucket too large for lv.2: %lld (", num_lv2 );
1498                 debug_prefix( stderr, globals.lv2_bucket_start );
1499                 fprintf( stderr, ")\n");
1500                 globals.lv2_bucket_start++;
1501                 globals.lv2_offset_start_index += num_lv2;
1502                 continue;
1503             }
1504             int temp_end2 = globals.lv2_bucket_start + 1; // remember that 'end' is exclusive
1505             while ( temp_end2 < globals.lv1_bucket_end && num_lv2 + globals.buckets[ temp_end2 ] <= globals.max_lv2_entries ) {
1506                 num_lv2 += globals.buckets[ temp_end2++ ];
1507             }
1508             globals.lv2_bucket_end = temp_end2; // remember that 'end' is exclusive
1509
1510             if (num_lv2 == 0) { // TROUBLESOME CASES
1511                 fprintf( stderr, "ERROR: Single bucket too large for lv.2: %d (", globals.buckets[temp_end2] );
1512                 debug_prefix( stderr, temp_end2 );
1513                 fprintf( stderr, ")\n");
1514                 globals.lv2_bucket_start = temp_end2 + 1;
1515                 globals.lv2_offset_start_index += globals.buckets[temp_end2];
1516                 continue;
1517             }
1518
1519             {
1520                 log("  Lv.2 Round %d: buckets %d to %d, num items = %lld\n", lv2_round,
1521                     globals.lv2_bucket_start, globals.lv2_bucket_end-1, num_lv2 );
1522                 // lv 2 bucket start & end found! fetch the suffixes now
1523
1524                 // distribute to threads
1525                 int64 each = num_lv2 / ( globals.num_cpu_threads - 1 );
1526                 each = !each ? 1 : each; // min is 1
1527                 int b_start = globals.lv2_bucket_start;
1528                 int64 cumulate_num_items = 0;
1529                 for (int t = 0; t < (globals.num_cpu_threads-1) - 1; ++t) {
1530                     globals.suffix_fetches[t].bucket_start = b_start;
1531                     globals.suffix_fetches[t].index_start = cumulate_num_items;
1532                     int64 num_items = 0;
1533                     while ( num_items < each && b_start < globals.lv2_bucket_end ) {
1534                         num_items += globals.buckets[ b_start++ ];
1535                     }
1536                     globals.suffix_fetches[t].bucket_end = b_start;
1537                     cumulate_num_items += num_items;
1538                 }
1539                 globals.suffix_fetches[ (globals.num_cpu_threads-1) - 1 ].bucket_start = b_start;
1540                 globals.suffix_fetches[ (globals.num_cpu_threads-1) - 1 ].index_start = cumulate_num_items;
1541                 globals.suffix_fetches[ (globals.num_cpu_threads-1) -1 ].bucket_end = globals.lv2_bucket_end;
1542
1543                 memset( globals.lv2_suffixes, 0xFF, globals.lv2_suffixes_row_width * globals.words_per_suffix * sizeof(word) ); // important: must fill all '1' bits
1544                 log( "   -> Fetching suffixes... " );
1545
1546                 for ( int t = 0; t < globals.num_cpu_threads-1; ++t ) {
1547                     pthread_create( &globals.threads[t], NULL, lv2_fetch_suffixes, &globals.suffix_fetches[t] );
1548                 }
1549                 for ( int t = 0; t < globals.num_cpu_threads-1; ++t ) {
1550                     pthread_join( globals.threads[t], NULL );
1551                 }
1552
1553                 // suffixes fetched! now sort them with GPU
1554                 /////////////////////////////////// GPU SORTING ////////////////////////////////
1555                 log( "Sorting suffixes with CPU ... " );
1556 #ifdef DISABLE_GPU
1557                                 omp_set_num_threads(globals.num_cpu_threads - 1);
1558                                 lv2_cpu_sort(globals.lv2_suffixes, globals.sort_indexes, globals.cpu_sort_space, globals.words_per_suffix, globals.lv2_suffixes_row_width, num_lv2);
1559                                 omp_set_num_threads(globals.num_cpu_threads);
1560 #else
1561                                 lv2_gpu_sort(globals.lv2_suffixes, globals.sort_indexes, globals.words_per_suffix, globals.lv2_suffixes_row_width, num_lv2);
1562 #endif
1563                 /////// output here ///////
1564                 if (output_started) {
1565                     pthread_join(output_thread, NULL);
1566                 }
1567                 swap(globals.sort_indexes, globals.sort_indexes_to_output);
1568                 swap(globals.lv2_suffixes, globals.lv2_suffixes_to_output);
1569                 swap(globals.lv2_read_id, globals.lv2_read_id_to_output);
1570                 globals.num_lv2_to_output = num_lv2;
1571                 pthread_create(&output_thread, NULL, lv2_output_thread, (void*)&globals);
1572                 output_started = true;
1573             }
1574
1575             globals.lv2_bucket_start = globals.lv2_bucket_end;
1576             globals.lv2_offset_start_index += num_lv2;
1577             log( "Round ends.\n" );
1578         }
1579
1580         globals.lv1_bucket_start = globals.lv1_bucket_end;
1581         log( "Lv.1 Round %d ends\n\n", lv1_round );
1582     }
1583     // END level 1 loop
1584
1585     if (output_started) {
1586         pthread_join(output_thread, NULL);
1587     }
1588     // print last BWT segment (those chars corresponding to '$')
1589     while (globals.cur_special_handle_prefix < SPECIAL_HANDLE_SUFFIX_NUM_BUCKETS) {
1590         if (is_valid_special_prefix(globals.cur_special_handle_prefix)) {
1591             dump_special_prefix(globals.cur_special_handle_prefix);
1592             for ( int t = 0; t < globals.num_cpu_threads; ++t ) {
1593                 fwrite( globals.locals[t].bwt_buffer[globals.cur_special_handle_prefix].c_str(), sizeof(char),
1594                         globals.locals[t].bwt_buffer[globals.cur_special_handle_prefix].size(), globals.out_file );
1595                 globals.locals[t].bwt_buffer[globals.cur_special_handle_prefix].clear();
1596             }
1597         }
1598         ++globals.cur_special_handle_prefix;
1599     }
1600
1601     fclose( globals.out_file );
1602     if (need_ridt)
1603         fclose( globals.map_out_file );
1604 }
1605
1606 void read_input_file( struct global_data_t &globals, char* filename ) {
1607     /////////////////// READ INPUT (FASTQ) /////////////////////
1608     int read_len = globals.read_len;
1609     int words_per_read = globals.words_per_read;
1610     word *packed_reads = NULL;
1611     int64 num_reads = 0;
1612
1613 #define READ_BUFFER_SIZE 4096
1614     gzFile in_file = gzopen( filename, "r" );
1615     int num_bytes = 0;
1616     char buffer[ READ_BUFFER_SIZE + 10 ];
1617     packed_reads = (word*) malloc( globals.max_num_reads * words_per_read * sizeof(word) );
1618     // memset( packed_reads, 0xFF, globals.max_num_reads * words_per_read * sizeof(word) );
1619     word *packed_reads_p = packed_reads;
1620     int last_shift = read_len % CHARS_PER_WORD ? ( CHARS_PER_WORD - read_len % CHARS_PER_WORD ) * BITS_PER_CHAR : 0;
1621
1622 #define REFILL_BUFFER( _in_file, _buffer, _num_bytes )                  \
1623     do {                                                                \
1624         (_num_bytes) = gzread((_in_file),(_buffer),READ_BUFFER_SIZE);   \
1625         (_buffer)[(_num_bytes)] = 0;                                    \
1626     } while (0)
1627
1628     REFILL_BUFFER( in_file, buffer, num_bytes );
1629     char *p = buffer;
1630     word *read_start_ptr;
1631     int is_FASTQ = ( buffer[0] == '@' );
1632     char read_terminator = is_FASTQ ? '+' : '>';
1633     while ( num_bytes ) {
1634         read_start_ptr = packed_reads_p;
1635         // consume read name @xxxxxxx
1636         p = (char*) memchr( p, '\n', num_bytes - (p-buffer) );
1637         while (!p) {
1638             REFILL_BUFFER( in_file, buffer, num_bytes );
1639             if ( !num_bytes ) break; // end of file
1640             p = (char*) memchr( buffer, '\n', num_bytes );
1641         }
1642         if ( !num_bytes ) break; // end of file
1643         if (! *(++p) ) {
1644             REFILL_BUFFER( in_file, buffer, num_bytes );
1645             p = buffer;
1646         }
1647         // consume read sequence ACGT
1648         word w = 0;
1649         int index = 0;
1650         while ( *p != read_terminator ) { // FASTQ: +  FASTA:  >
1651             if ( *p >= 'A' ) {
1652                 if ( index % CHARS_PER_WORD == 0 ) {
1653                     if ( index ) {
1654                         *packed_reads_p = w;
1655                         packed_reads_p++;
1656                         w = 0;
1657                     }
1658                 }
1659                 w = (w << BITS_PER_CHAR) | dna_map[(unsigned char)( *p) ]; // order: MSB to LSB
1660                 index++;
1661             }
1662             p++;
1663             if (!(*p)) {
1664                 REFILL_BUFFER( in_file, buffer, num_bytes );
1665                 if ( !num_bytes ) break;
1666                 p = buffer;
1667             }
1668         }
1669         // write last word
1670         *packed_reads_p = w << last_shift;
1671         *packed_reads_p |= ( 0xFFFFFFFF >> ( BITS_PER_WORD - last_shift ) ); // TODO hard. add padding
1672         packed_reads_p++;
1673
1674         if ( is_FASTQ ) {
1675             // consume + sign
1676             p = (char*) memchr( p, '\n', num_bytes - (p-buffer) );
1677             while (!p) {
1678                 REFILL_BUFFER( in_file, buffer, num_bytes );
1679                 p = (char*) memchr( buffer, '\n', num_bytes );
1680             }
1681             p++;
1682             // consume base quality
1683             p = (char*) memchr( p, '\n', num_bytes - (p-buffer) );
1684             while (!p) {
1685                 REFILL_BUFFER( in_file, buffer, num_bytes );
1686                 if ( !num_bytes ) break; // end of file
1687                 p = (char*) memchr( buffer, '\n', num_bytes );
1688             }
1689             p++;
1690         } // end FASTQ only
1691
1692         if (index == globals.read_len) {
1693             ++num_reads;
1694         } else { // trouble...
1695             log( "ERROR: read length of read #%lld is %d\n", num_reads, index);
1696             packed_reads_p = read_start_ptr;
1697         }
1698
1699         if ( !num_bytes ) break; // end of file
1700     }
1701
1702     gzclose( in_file );
1703
1704     globals.packed_reads = packed_reads;
1705     globals.num_reads = num_reads;
1706
1707     ////////////////////// END READING INPUT //////////////////////
1708 }
1709
1710 void read_single_file(FILE *fq,  char* filename,  int64* read_num, int read_len, int threshold)
1711 {
1712     /////////////////// READ INPUT (FASTQ) /////////////////////
1713     int64 num_reads = 0;
1714
1715 #define READ_BUFFER_SIZE 4096
1716     gzFile in_file = gzopen( filename, "r" );
1717     //#define GZBUFFER_SIZE 65536
1718     //    gzbuffer( infile, GZBUFFER_SIZE ); // needs zlib 1.2.4
1719     int num_bytes = 0;
1720     char buffer[ READ_BUFFER_SIZE + 10 ];
1721
1722     //memset( packed_quals, 0, 1 * words_per_qual * sizeof(word) );
1723
1724 #define REFILL_BUFFER( _in_file, _buffer, _num_bytes )                  \
1725     do {                                                                \
1726         (_num_bytes) = gzread((_in_file),(_buffer),READ_BUFFER_SIZE);   \
1727         (_buffer)[(_num_bytes)] = 0;                                    \
1728     } while (0)
1729
1730     REFILL_BUFFER( in_file, buffer, num_bytes );
1731     char *p = buffer;
1732     word *read_start_ptr;
1733     int is_FASTQ = ( buffer[0] == '@' );
1734     char read_terminator = is_FASTQ ? '+' : '>';
1735     while ( num_bytes ) {
1736         read_start_ptr = packed_reads_p;
1737         // consume read name @xxxxxxx
1738         p = (char*) memchr( p, '\n', num_bytes - (p-buffer) );
1739         while (!p) {
1740             REFILL_BUFFER( in_file, buffer, num_bytes );
1741             if ( !num_bytes ) break; // end of file
1742             p = (char*) memchr( buffer, '\n', num_bytes );
1743         }
1744         if ( !num_bytes ) break; // end of file
1745         if (! *(++p) ) {
1746             REFILL_BUFFER( in_file, buffer, num_bytes );
1747             p = buffer;
1748         }
1749         // consume read sequence ACGT
1750         word w = 0;
1751         int index = 0;
1752         while ( *p != read_terminator ) { // FASTQ: +  FASTA:  >
1753             if ( *p >= 'A' ) {
1754                 if ( index % CHARS_PER_WORD == 0 ) {
1755                     if ( index ) {
1756                         *packed_reads_p = w;
1757                         packed_reads_p++;
1758                         w = 0;
1759                     }
1760                 }
1761                 w = (w << BITS_PER_CHAR) | dna_map[(unsigned char)( *p )]; // order: MSB to LSB
1762                 index++;
1763             }
1764             p++;
1765             if (!(*p)) {
1766                 REFILL_BUFFER( in_file, buffer, num_bytes );
1767                 if ( !num_bytes ) break;
1768                 p = buffer;
1769             }
1770         }
1771         // write last word
1772         *packed_reads_p = w << last_shift;
1773         *packed_reads_p |= ( 0xFFFFFFFF >> ( BITS_PER_WORD - last_shift ) ); // TODO hard. add padding
1774         packed_reads_p++;
1775
1776         if ( is_FASTQ ) {
1777             word *packed_quals_p = packed_quals;
1778             // consume + sign
1779             p = (char*) memchr( p, '\n', num_bytes - (p-buffer) );
1780             while (!p) {
1781                 REFILL_BUFFER( in_file, buffer, num_bytes );
1782                 p = (char*) memchr( buffer, '\n', num_bytes );
1783             }
1784             p++;
1785             if (!(*p)) {
1786                 REFILL_BUFFER( in_file, buffer, num_bytes );
1787                 if ( !num_bytes ) break;
1788                 p = buffer;
1789             }
1790             // consume base quality
1791             w = 0;
1792             index = 0;
1793             while( *p != '@' && *p != '\n')
1794             {
1795                 //fprintf(stderr, ";%c,%d", *p, w);
1796                 if(index % BITS_PER_WORD == 0){
1797                     if(index){
1798                         *packed_quals_p = w;
1799                         packed_quals_p++;
1800                         w = 0;
1801                     }
1802                 }
1803                 w = (w << BITS_PER_QUAL) | (*p >= threshold);
1804                 index++;
1805                 p++;
1806                 if (!(*p)) {
1807                     REFILL_BUFFER( in_file, buffer, num_bytes );
1808                     if ( !num_bytes ) break;
1809                     p = buffer;
1810                 }
1811             }
1812             *packed_quals_p = w << last_shift_qual;
1813             //*packed_quals_p |= (0xFFFFFFFF >> ( BITS_PER_QUAL - last_shift_qual ));
1814             packed_quals_p ++;
1815             fwrite( packed_quals, sizeof(word), 1 * words_per_qual, fq);
1816             //memset( packed_quals, 0, 1 * words_per_qual * sizeof(word) );
1817         } // end FASTQ only
1818
1819         if (index == read_len) {
1820             ++num_reads;
1821         } else { // trouble...
1822             log( "SINGLE READ ERROR: read length of read #%lld is %d\n", num_reads, index);
1823             packed_reads_p = read_start_ptr;
1824         }
1825
1826         if ( !num_bytes ) break; // end of file
1827     }
1828
1829     gzclose( in_file );
1830
1831     (*read_num) = (*read_num) + num_reads;
1832     ////////////////////// END READING INPUT //////////////////////
1833 }
1834
1835 void read_pair_file(FILE *fq, char* file1, char* file2,  int64* read_num, int read_len, int threshold)
1836 {
1837     /////////////////// READ INPUT (FASTQ) /////////////////////
1838     int64 num_reads = 0;
1839
1840 #define READ_BUFFER_SIZE 4096
1841     gzFile in_file = gzopen( file1, "r" );
1842     gzFile in_file2 = gzopen( file2, "r" );
1843     //#define GZBUFFER_SIZE 65536
1844     //    gzbuffer( infile, GZBUFFER_SIZE ); // needs zlib 1.2.4
1845     int num_bytes = 0, num_bytes2 = 0;
1846     char buffer[ READ_BUFFER_SIZE + 10 ];
1847     char buffer2[ READ_BUFFER_SIZE + 10 ];
1848
1849    // memset( packed_quals, 0, 1 * words_per_qual * sizeof(word) );
1850 #define REFILL_BUFFER( _in_file, _buffer, _num_bytes )                  \
1851     do {                                                                \
1852         (_num_bytes) = gzread((_in_file),(_buffer),READ_BUFFER_SIZE);   \
1853         (_buffer)[(_num_bytes)] = 0;                                    \
1854     } while (0)
1855
1856     REFILL_BUFFER( in_file, buffer, num_bytes );
1857     REFILL_BUFFER( in_file2, buffer2, num_bytes2 );
1858     char *p = buffer;
1859     char *p2 = buffer2;
1860     word *read_start_ptr;
1861     int is_FASTQ = ( buffer[0] == '@' );
1862     char read_terminator = is_FASTQ ? '+' : '>';
1863     while ( num_bytes && num_bytes2) {
1864         read_start_ptr = packed_reads_p;
1865         // consume read name @xxxxxxx
1866         p = (char*) memchr( p, '\n', num_bytes - (p-buffer) );
1867         p2 = (char*) memchr( p2, '\n', num_bytes2 - (p2-buffer2) );
1868         /*while (!p || !p2) {
1869             REFILL_BUFFER( in_file, buffer, num_bytes );
1870             REFILL_BUFFER( in_file2, buffer2, num_bytes2 );
1871             if ( !num_bytes || !num_bytes2) break; // end of file
1872             p = (char*) memchr( buffer, '\n', num_bytes );
1873             p2 = (char*) memchr( buffer2, '\n', num_bytes2 );
1874         }*/
1875         while(!p)
1876         {
1877             REFILL_BUFFER( in_file, buffer, num_bytes );
1878             if ( !num_bytes)
1879                 break;
1880             p = (char*) memchr( buffer, '\n', num_bytes );
1881         }
1882         while(!p2)
1883         {
1884             REFILL_BUFFER( in_file2, buffer2, num_bytes2 );
1885             if( !num_bytes2)
1886                 break;
1887             p2 = (char*) memchr( buffer2, '\n', num_bytes2 );
1888         }
1889         if ( !num_bytes || !num_bytes2) break; // end of file
1890         // consume read sequence ACGT
1891         word w = 0;
1892         int index = 0;
1893         //fprintf(stderr, "R1:");
1894         //if(*p == '@' || *p == '>')
1895         {
1896             while(*p != '\n')
1897             {
1898                 p++;
1899                 if (!(*p)) {
1900                     REFILL_BUFFER( in_file, buffer, num_bytes );
1901                     if ( !num_bytes ) break;
1902                     p= buffer;
1903                 }
1904             }
1905             p++;
1906             if (!(*p)) {
1907                 REFILL_BUFFER( in_file, buffer, num_bytes );
1908                 if ( !num_bytes ) break;
1909                 p= buffer;
1910             }
1911
1912         }
1913         while ( *p != read_terminator ) { // FASTQ: +  FASTA:  >
1914             //fprintf(stderr, "%c", *p);
1915             if ( *p >= 'A') {
1916                 if ( index % CHARS_PER_WORD == 0 ) {
1917                     if ( index ) {
1918                         *packed_reads_p = w;
1919                         packed_reads_p++;
1920                         w = 0;
1921                     }
1922                 }
1923                 w = (w << BITS_PER_CHAR) | dna_map[(unsigned char)(*p) ]; // order: MSB to LSB
1924                 index++;
1925             }
1926             p++;
1927             if (!(*p)) {
1928                 REFILL_BUFFER( in_file, buffer, num_bytes );
1929                 if ( !num_bytes ) break;
1930                 p = buffer;
1931             }
1932         }
1933
1934         if (index != read_len) {
1935             log( "ERROR: read length of read1 #%lld is %d\n", num_reads, index);
1936             p -= index;
1937             int c=0;
1938             while(c < index)
1939             {
1940                 log( " %d,%c", c, *p);
1941                 p++;
1942                 c++;
1943             }
1944             log( "\n");
1945             exit(1);
1946         }
1947         //fprintf(stderr, "\nR2:");
1948         // write last word
1949         *packed_reads_p = w << last_shift;
1950         *packed_reads_p |= ( 0xFFFFFFFF >> ( BITS_PER_WORD - last_shift ) ); // TODO hard. add padding
1951         packed_reads_p++;
1952
1953         w = 0;
1954         index = 0;
1955         //if(*(p2) == '@' || *(p2) == '>')
1956         {
1957             while(*p2 != '\n')
1958             {
1959                 p2++;
1960                 if (!(*p2)) {
1961                     REFILL_BUFFER( in_file2, buffer2, num_bytes2 );
1962                     if ( !num_bytes2 ) break;
1963                     p2 = buffer2;
1964                 }
1965             }
1966             p2++;
1967
1968             if (!(*p2)) {
1969                 REFILL_BUFFER( in_file2, buffer2, num_bytes2 );
1970                 if ( !num_bytes2 ) break;
1971                 p2 = buffer2;
1972             }
1973         }
1974
1975         while ( *p2 != read_terminator ) { // FASTQ: +  FASTA:  >
1976             //fprintf(stderr, "%c", *p2);
1977             if ( *p2 >= 'A') {
1978                 if ( index % CHARS_PER_WORD == 0 ) {
1979                     if ( index ) {
1980                         *packed_reads_p = w;
1981                         packed_reads_p++;
1982                         w = 0;
1983                     }
1984                 }
1985                 w = (w << BITS_PER_CHAR) | dna_map[(unsigned char) (*p2) ]; // order: MSB to LSB
1986                 index++;
1987             }
1988             p2++;
1989             if (!(*p2)) {
1990                 REFILL_BUFFER( in_file2, buffer2, num_bytes2 );
1991                 if ( !num_bytes2 ) break;
1992                 p2 = buffer2;
1993             }
1994         }
1995         //fprintf(stderr, "\nq1:");
1996         // write last word
1997         *packed_reads_p = w << last_shift;
1998         *packed_reads_p |= ( 0xFFFFFFFF >> ( BITS_PER_WORD - last_shift ) ); // TODO hard. add padding
1999         packed_reads_p++;
2000
2001         if (index != read_len) {
2002             log( "READ ERROR: read length of read2 #%lld is %d\n", num_reads, index);
2003             p2 -= index;
2004             int c=0;
2005             while(c < index)
2006             {
2007                 log( " %d,%c", c, *p2);
2008                 p2++;
2009                 c++;
2010             }
2011             log( "\n");
2012             exit(1);
2013         }
2014         if ( is_FASTQ ) {
2015             word *packed_quals_p1 = packed_quals;
2016             word *packed_quals_p2 = packed_quals;
2017             // consume + sign
2018             p = (char*) memchr( p, '\n', num_bytes - (p-buffer) );
2019             while (!p) {
2020                 REFILL_BUFFER( in_file, buffer, num_bytes );
2021                 p = (char*) memchr( buffer, '\n', num_bytes );
2022             }
2023             //p++;
2024             while( *p != '\n')
2025             {
2026                p++;
2027                if(!(*p)){
2028                    REFILL_BUFFER( in_file, buffer, num_bytes );
2029                    if ( !num_bytes ) break;
2030                    p = buffer;
2031                }
2032             }
2033             p++;
2034
2035             if(!(*p)){
2036                REFILL_BUFFER( in_file, buffer, num_bytes );
2037                if ( !num_bytes ) break;
2038                p = buffer;
2039             }
2040             // consume base quality
2041             w = 0;
2042             index = 0;
2043             while( *p != '\n')
2044             {
2045                 //fprintf(stderr, "%c", *p);
2046                 if(index % BITS_PER_WORD == 0){
2047                     if(index){
2048                         *packed_quals_p1 = w;
2049                         packed_quals_p1++;
2050                         w = 0;
2051                     }
2052                 }
2053                 w = (w << BITS_PER_QUAL) | (*p >= threshold);
2054                 index++;
2055                 p++;
2056                 if (!(*p)) {
2057                     REFILL_BUFFER( in_file, buffer, num_bytes );
2058                     if ( !num_bytes ) break;
2059                     p = buffer;
2060                 }
2061             }
2062             p++;
2063             //fprintf(stderr, "\nq2:");
2064             *packed_quals_p1 = w << last_shift_qual;
2065             //*packed_quals_p1 |= (0xFFFFFFFF >> ( BITS_PER_QUAL - last_shift_qual ));
2066             packed_quals_p1 ++;
2067             fwrite( packed_quals, sizeof(word), 1 * words_per_qual, fq);
2068             //memset( packed_quals, 0, 1 * words_per_qual * sizeof(word) );
2069             // consume + sign
2070             p2 = (char*) memchr( p2, '\n', num_bytes2 - (p2-buffer2) );
2071             while (!p2) {
2072                 REFILL_BUFFER( in_file2, buffer2, num_bytes2 );
2073                 p2 = (char*) memchr( buffer2, '\n', num_bytes2 );
2074             }
2075             //p2++;
2076             while( *p2 != '\n')
2077             {
2078                 p2++;
2079                 if (!(*p2)) {
2080                    REFILL_BUFFER( in_file2, buffer2, num_bytes2 );
2081                     if ( !num_bytes2 ) break;
2082                     p2 = buffer2;
2083                 }
2084             }
2085             p2++;
2086
2087             if (!(*p2)) {
2088                 REFILL_BUFFER( in_file2, buffer2, num_bytes2 );
2089                 if ( !num_bytes2 ) break;
2090                 p2 = buffer2;
2091             }
2092             // consume base quality
2093             w = 0;
2094             index = 0;
2095             while( *p2 != '\n')
2096             {
2097                 //fprintf(stderr, "%c", *p2);
2098                 if(index % BITS_PER_WORD == 0){
2099                     if(index){
2100                         *packed_quals_p2 = w;
2101                         packed_quals_p2++;
2102                         w = 0;
2103                     }
2104                 }
2105                 w = (w << BITS_PER_QUAL) | (*p2 >= threshold);
2106                 index++;
2107                 p2++;
2108                 if (!(*p2)) {
2109                     REFILL_BUFFER( in_file2, buffer2, num_bytes2 );
2110                     if ( !num_bytes2 ) break;
2111                     p2 = buffer2;
2112                 }
2113             }
2114             p2++;
2115             //fprintf(stderr, "\n");
2116             *packed_quals_p2 = w << last_shift_qual;
2117             //*packed_quals_p2 |= (0xFFFFFFFF >> ( BITS_PER_QUAL - last_shift_qual ));
2118             packed_quals_p2 ++;
2119             fwrite( packed_quals, sizeof(word), 1 * words_per_qual, fq);
2120             //memset( packed_quals, 0, 1 * words_per_qual * sizeof(word) );
2121         } // end FASTQ only
2122         if (index == read_len) {
2123             num_reads += 2;
2124         } else { // trouble...
2125             log( "READ ERROR: read length of read #%lld is %d\n", num_reads, index);
2126             p2 -= index;
2127             int c=0;
2128             while(c < index)
2129             {
2130                 log( " %d,%c", c, *p2);
2131                 p2++;
2132                 c++;
2133             }
2134             log( "\n");
2135             exit(1);
2136             packed_reads_p = read_start_ptr;
2137         }
2138
2139         if ( !num_bytes ) break; // end of file
2140     }
2141
2142     gzclose( in_file );
2143
2144     (*read_num) = (*read_num) + num_reads;
2145     ////////////////////// END READING INPUT //////////////////////
2146 }
2147
2148 void lib2read( struct global_data_t &globals, char* lib_file, char* prefix)
2149 {
2150     int num_libs=0, i, j, read_length = globals.read_len;
2151     int64 start_num=0, g_num_reads=0;
2152
2153     g_packed_reads = (word*) malloc( globals.max_num_reads * globals.words_per_read * sizeof(word) );
2154     //memset( g_packed_reads, 0, globals.max_num_reads * globals.words_per_read * sizeof(word) );
2155     last_shift = globals.read_len % CHARS_PER_WORD ? ( CHARS_PER_WORD - globals.read_len % CHARS_PER_WORD ) * BITS_PER_CHAR : 0;
2156     packed_reads_p = g_packed_reads;
2157     g_num_reads=0;
2158
2159     words_per_qual = ( globals.read_len + ( BITS_PER_WORD - 1 ) ) / BITS_PER_WORD;
2160     packed_quals = (word*) malloc( 1 * words_per_qual * sizeof(word) );
2161     last_shift_qual = globals.read_len % BITS_PER_WORD ? ( BITS_PER_WORD - globals.read_len % BITS_PER_WORD) * BITS_PER_QUAL : 0;
2162
2163     log("Starting load libraries...\n");
2164     scan_libInfo(lib_file, &num_libs, &read_length);
2165
2166     if(read_length != globals.read_len)
2167     {
2168         log( "Ignore max read length in read library file\n");
2169         read_length = globals.read_len;
2170     }
2171
2172     char name[255];
2173     sprintf(name, "%s.qual.bit", prefix);
2174     FILE *fq = fopen(name, "w"); //output base quality as 0/1.
2175
2176     sprintf(name, "%s.index.peGrade", prefix);
2177     FILE *fg = fopen(name, "w"); //output insert size grade.
2178
2179     log( "Totally there are %d libs\n", num_libs);
2180     for( i=0; i< num_libs; i++ )
2181     {
2182         if(lib_array[i].asm_flag != 1 && lib_array[i].asm_flag != 3)
2183             continue;
2184         if(lib_array[i].avg_ins == 0)
2185             continue;
2186         log( "lib %d has %d paired fastqs\n", i, lib_array[i].num_q1_file);
2187         start_num = g_num_reads;
2188         for( j = 0; j < lib_array[i].num_q1_file; j++ )
2189         {
2190             log( "Loading file: %s ", lib_array[i].q1_fname[j]);
2191             read_pair_file(fq, lib_array[i].q1_fname[j], lib_array[i].q2_fname[j], &g_num_reads, globals.read_len, lib_array[i].qual);
2192             log( ", Using %lld reads\n", g_num_reads);
2193         }
2194         for( j = 0; j < lib_array[i].num_a1_file; j++ )
2195         {
2196             log( "Loading file: %s ", lib_array[i].a1_fname[j]);
2197             read_pair_file(fq, lib_array[i].a1_fname[j], lib_array[i].a2_fname[j], &g_num_reads, globals.read_len, lib_array[i].qual);
2198             log( ", Using %lld reads\n", g_num_reads);
2199         }
2200         for( j = 0; j < lib_array[i].num_p_file; j++)
2201         {
2202             log( "Loading file: %s ", lib_array[i].p_fname[j]);
2203             read_single_file(fq, lib_array[i].p_fname[j], &g_num_reads, globals.read_len, lib_array[i].qual);
2204             log( ", Using %lld reads\n", g_num_reads);
2205         }
2206
2207         if(g_num_reads > start_num)
2208             fprintf(fg, "%lld %lld %d %d\n", start_num, g_num_reads, lib_array[i].avg_ins, lib_array[i].sd);
2209     }
2210
2211     log( "Starting Single-End libraries to reads...\n");
2212     for( i=0; i< num_libs; i++ )
2213     {
2214         if(lib_array[i].asm_flag != 1 && lib_array[i].asm_flag != 3)
2215             continue;
2216         log( "lib %d has %d single fastqs\n", i, lib_array[i].num_s_q_file);
2217         for( j = 0; j < lib_array[i].num_s_q_file; j++)
2218         {
2219             read_single_file(fq, lib_array[i].s_q_fname[j], &g_num_reads, globals.read_len, lib_array[i].qual);
2220         }
2221         for( j = 0; j < lib_array[i].num_s_a_file; j++)
2222             read_single_file(fq, lib_array[i].s_a_fname[j], &g_num_reads, globals.read_len, lib_array[i].qual);
2223     }
2224
2225     globals.packed_reads = g_packed_reads;
2226     globals.num_reads = g_num_reads;
2227
2228     fclose(fq);
2229     fclose(fg);
2230     free_libs(lib_array, num_libs);
2231     free( packed_quals );
2232     log("Loading library finished.\n");
2233 }
2234
2235 int main( int argc, char** argv ) {
2236     struct sysinfo s_info;
2237     sysinfo(&s_info);
2238
2239     if ( argc <3 ) {
2240 #ifdef DISABLE_GPU
2241         log( "Usage:\n  %s read_library read_len output_prefix [thread_num] [max_host_mem] 2> bwt.log\n", argv[0] );
2242         log( "Input:\n 1) read_library has the same format to that of SOAPdenovo, except to set qual_cutoff to define high quality threshold\n 2) read length must be set and reads only with fixed length could be used.\n");
2243 #else
2244         log( "Usage:\n  %s read_library read_len output_prefix [thread_num] [max_host_mem] [GPU_mem] 2> bwt.log\n", argv[0] );
2245         log( "Input:\n 1) read_library has the same format to that of SOAPdenovo, except to set qual_cutoff to define high quality threshold\n 2) read length must be set and reads only with fixed length could be used.\n");
2246 #endif
2247         log( "Output:\n output_prefix.f.bwt.ascii, output_prefix.r.bwt.ascii, output_prefix.ridt\n" );
2248         log("\nMachine Information:\n"
2249                "RAM: total %lld / free %lld\n"
2250                "Swap: total %lld / free %lld\n"
2251                "Number of CPU = %d\n",
2252                s_info.totalram, s_info.freeram,
2253                s_info.totalswap, s_info.freeswap,
2254                sysconf(_SC_NPROCESSORS_ONLN)
2255               );
2256 #ifdef DISABLE_GPU
2257         exit(1);
2258 #else
2259         get_gpu_mem();
2260         exit(1);
2261 #endif
2262     }
2263
2264     init_dna_map();
2265
2266     int read_len = atoi(argv[2]);
2267     char *output_prefix = argv[3];
2268     uint64_t gpu_mem = 0; // = atoll(argv[4]);
2269     uint64_t max_host_mem = s_info.totalram;
2270     int cpu_count = sysconf(_SC_NPROCESSORS_ONLN);
2271
2272     if(argc > 4)
2273     {
2274         number_cpu_threads = atoi(argv[4]);
2275         if(number_cpu_threads > cpu_count || number_cpu_threads == 0 || number_cpu_threads > NUM_CPU_THREADS)
2276             number_cpu_threads = cpu_count < NUM_CPU_THREADS ? cpu_count : NUM_CPU_THREADS;
2277         log("Using thread number: %d\n", number_cpu_threads);
2278     }else{
2279         number_cpu_threads = cpu_count < NUM_CPU_THREADS ? cpu_count : NUM_CPU_THREADS;
2280         log("Using thread number: %d\n", number_cpu_threads);
2281     }
2282
2283     if(argc > 5)
2284     {
2285         uint64_t set_host_mem = atoll(argv[5]);
2286         if(set_host_mem > max_host_mem || set_host_mem == 0)
2287         {
2288             log("Use maximum host memory %lld\n", max_host_mem);
2289         }else{
2290             max_host_mem = set_host_mem;
2291         }
2292     }else{
2293         log("Using total RAM %lld\n", max_host_mem);
2294     }
2295
2296 #ifdef DISABLE_GPU
2297     gpu_mem = 0;
2298 #else
2299     size_t free_gpu_mem = get_gpu_mem();
2300     if(gpu_mem < 100 || gpu_mem > free_gpu_mem)
2301     {
2302         gpu_mem = free_gpu_mem;
2303     }
2304 #endif
2305
2306     if(argc > 6)
2307     {
2308 #ifdef DISABLE_GPU
2309         gpu_mem = atoll(argv[6]);
2310 #else
2311         uint64_t set_gpu_mem = atoll(argv[6]);
2312         if(set_gpu_mem < 100 || set_gpu_mem > gpu_mem)
2313         {
2314             log("Use Maximum GPU memory %lld.\n", gpu_mem);
2315         }else{
2316             gpu_mem = set_gpu_mem;
2317             log("Use Set GPU memory %lld.\n", gpu_mem);
2318         }
2319 #endif
2320     }else{
2321 #ifdef DISABLE_GPU
2322         log("Use maximum bucket size.\n");
2323 #else
2324         log("Use Maximum GPU memory %lld.\n", gpu_mem);
2325 #endif
2326     }
2327
2328     struct global_data_t globals;
2329
2330     init_global_data(globals, read_len, max_host_mem, number_cpu_threads);
2331
2332     // read input
2333     lib2read( globals, argv[1], output_prefix);
2334     log("Number of reads: %lld\n", globals.num_reads);
2335
2336     //building bwt.
2337     //set_global_after_reading(globals, max_host_mem);
2338     set_global_bucket(globals);
2339     //level 1 fill buckets.
2340     uint64_t max_bucket_size = lv0_fill_buckets(globals);
2341
2342     //set level2 memeory.
2343     set_global_after_lv0(globals, max_host_mem, gpu_mem, max_bucket_size);
2344
2345     build_bwt(globals, output_prefix, true);
2346
2347     log("Reverse all reads..");
2348     reverse_all_reads(globals);
2349     log("Done\n");
2350
2351     lv0_fill_buckets(globals);
2352     build_bwt(globals, output_prefix, false);
2353
2354     log("All finished.\n");
2355     // free memory
2356
2357     free( globals.packed_reads );
2358     free( globals.buckets );
2359     free( globals.lv1_offsets );
2360     free( globals.lv2_suffixes );
2361     free( globals.lv2_read_id );
2362     free( globals.sort_indexes );
2363
2364 #ifdef DISABLE_GPU
2365     free(globals.cpu_sort_space);
2366 #endif
2367
2368     pthread_spin_destroy(&globals.large_diff_lock);
2369
2370     for ( int t = 0; t < globals.num_cpu_threads; ++t ) {
2371         free( globals.section_buckets[t] );
2372         free( globals.locals[t].offset_pos );
2373     }
2374
2375     return 0;
2376
2377 }