4 * Copyright (c) 2008-2012 BGI-Shenzhen <soap at genomics dot org dot cn>.
6 * This file is part of SOAPdenovo.
8 * SOAPdenovo is free software: you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation, either version 3 of the License, or
11 * (at your option) any later version.
13 * SOAPdenovo is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
18 * You should have received a copy of the GNU General Public License
19 * along with SOAPdenovo. If not, see <http://www.gnu.org/licenses/>.
38 #include "sam_header.h"
48 static int state
= -3;
49 static int readstate
= 0;
51 static samfile_t
* openFile4readb ( const char * fname
);
53 void read1seqbam ( char * src_seq
, char * src_name
, int max_read_len
, samfile_t
* in
, int * type
);
56 /*************************************************
62 1. seq_s: input read sequence
64 1. seq_s: read sequence without N
65 2. bad_flag: indicat whether the sequence is vaild
68 *************************************************/
69 void filter_N ( string
& seq_s
, int & bad_flag
)
73 if ( seq_s
.size() > max_rd_len
)
75 max_rd_len
= seq_s
.size();
78 if ( seq_s
.size() < min_rd_len
)
80 min_rd_len
= seq_s
.size();
85 if ( seq_s
[seq_s
.size() - 1] == '\n' || seq_s
[seq_s
.size() - 1] == '\r' )
87 seq_s
.resize ( seq_s
.size() - 1 );
90 int seq_sz
= seq_s
.size();
91 int nN
= seq_sz
, isN
= -1;
93 for ( int i
= 0; i
< seq_sz
; ++i
)
95 if ( seq_s
[i
] == '-' || seq_s
[i
] == 'N' )
97 if ( i
<= seq_sz
/ 2 )
110 if ( nN
== seq_sz
&& isN
== -1 ) {bad_flag
= 0; return;}
112 if ( ( nN
- isN
) <= seq_sz
/ 2 )
117 if ( bad_flag
== 1 ) { return; }
119 seq_s
= seq_s
.substr ( isN
+ 1, nN
- isN
);
124 /*************************************************
128 When the asm_flags equals 1 or 3, adds the filename into vector filenames.
130 1. filenames: filenames vector
131 2. lib_file: read lib config file
133 1. filenames: filenames vector
136 *************************************************/
137 void read_lib ( vector
<string
> &filenames
, char * lib_file
)
139 ifstream
lib_in ( lib_file
);
141 int read_stat
= 0; // 1: begin a lib, 2:asm_flags=1 or 3
145 while ( getline ( lib_in
, str
) )
147 if ( read_stat
== 0 ) //not start a lib
149 found
= str
.find ( "[LIB]" );
151 if ( found
== string::npos
)
161 else if ( read_stat
== 1 ) //start reading a lib fetch asm flags
164 found
= str
.find ( "asm_flags" );
166 if ( found
== string::npos
)
172 found
= str
.find ( "=" );
173 str
= str
.substr ( found
+ 1, str
.size() - found
);
175 if ( str
.size() == 0 )
177 fprintf ( stderr
, "ERROR: please check asm_flags in lib file\n" );
181 asm_flags
= atoi ( str
.c_str() );
183 if ( asm_flags
== 1 || asm_flags
== 3 )
189 read_stat
= 0; // next lib
193 else if ( read_stat
== 2 ) // reading file
195 found
= str
.find_first_of ( "fqpb" );
197 if ( found
== 0 ) //f1 f2 q1 q2 p b
199 found
= str
.find ( "=" );
201 if ( found
> 2 ) {continue;} // the "=" should be the second or thrid poistion
203 str
= str
.substr ( found
+ 1, str
.size() - found
);
204 filenames
.push_back ( str
);
208 found
= str
.find ( "[LIB]" );
210 if ( found
== string::npos
)
225 /*************************************************
229 Initializes the io status, makes the io threads ready to work.
236 *************************************************/
237 void sendIOWorkSignal()
239 if ( io_ready
== 2 ) { return ; } //finish io job
247 /*************************************************
251 This is the main io thread working rountine.
252 Two buffer "read_buf0" and "read_buf0" are applied to implement AIO.
253 It makes the pointer "seq_t" always pointing to the filled buffer.
255 1. arg: args for io threads
260 *************************************************/
262 void * run_io_thread_main ( void * arg
)
264 io_para_main
* paras
;
265 paras
= ( io_para_main
* ) arg
;
269 fprintf ( stderr
, "ERROR: the argument passed to main io thread is NULL!\n" );
273 int read_buf_sz
= paras
->read_buf_sz
;
274 vector
<string
> in_filenames_vt
= * ( paras
->in_filenames_vt
);
276 for(int i=0;i<in_filenames_vt.size();i++){
277 printf("%s\n", in_filenames_vt[i].c_str());
282 int read_buf_len
= 1024;
283 FILE * fp
= NULL
; //for normal and gzip reading
284 samfile_t
* fp3
= NULL
; //for bam reading
285 int filetype
= 0; //0 normal 1 gzip 2 bam
288 if ( in_filenames_vt
.size() >= 1 )
292 found
= in_filenames_vt
[file_num
].find_last_of ( "." );
294 if ( found
== string::npos
)
296 //fp = fopen(in_filenames_vt[file_num].c_str(),"r"); //normal
297 fp
= ( FILE * ) open_file_robust ( "plain", in_filenames_vt
[file_num
].c_str(), "r" );
302 temp
= in_filenames_vt
[file_num
].substr ( found
);
304 if ( temp
.compare ( ".gz" ) == 0 ) //gzip
306 //temp = "gzip -dc ";
307 //temp.append(in_filenames_vt[file_num]);
308 //fp = popen(temp.c_str(),"r");
309 fp
= ( FILE * ) open_file_robust ( "gz", in_filenames_vt
[file_num
].c_str(), "r" );
312 else if ( temp
.compare ( ".bam" ) == 0 ) //bam
314 //fp3 = openFile4readb(in_filenames_vt[file_num].c_str());
315 fp3
= ( samfile_t
* ) open_file_robust ( "bam", in_filenames_vt
[file_num
].c_str(), "r" );
320 //fp = fopen(in_filenames_vt[file_num].c_str(),"r"); //normal
321 fp
= ( FILE * ) open_file_robust ( "plain", in_filenames_vt
[file_num
].c_str(), "r" );
328 fprintf ( stderr
, "ERROR: can't open file %s \n", in_filenames_vt
[file_num
].c_str() );
334 fprintf ( stderr
, "ERROR: input filenames vector is empty! please check the reads config file,option \"asm_flags\" is requried!\n" );
338 //fprintf(stderr,"processing file %d %s \n",file_num,in_filenames_vt[file_num].c_str());
339 fprintf ( stderr
, "Import reads from file:\n %s\n", in_filenames_vt
[file_num
].c_str() );
343 while ( ( io_stat0
) && ( io_stat1
) )
348 if ( ! ( io_stat0
) ) //fill buf0
350 io_stat0
= 1;// reading reads stat
354 if ( filetype
== 0 || filetype
== 1 ) //normal or gzip reading ...
356 while ( i
< read_buf_sz
)
358 if ( fgets ( line
, read_buf_len
, fp
) != NULL
)
373 read_buf0
[i
].clear();
374 read_buf0
[i
].append ( line
);
385 else if ( filetype
== 2 ) //bam reading
390 while ( i
< read_buf_sz
&& readstate
>= 0 ) //readstate
392 read1seqbam ( line
, src_name
, read_buf_len
, fp3
, &type
);
396 read_buf0
[i
].clear();
397 read_buf0
[i
].append ( line
);
398 //cout<<"line:"<<line<<endl;
399 //cout<<"buf0:"<<read_buf0[i]<<endl;
406 fprintf ( stderr
, "ERROR: filetype is not support or filename has a wrong suffix!\n" );
413 while ( io_ready
!= 0 ) {usleep ( 1 );}; //wait for main send work sign
415 if ( i
== read_buf_sz
)
419 else if ( i
!= read_buf_sz
&& file_num
< in_filenames_vt
.size() - 1 ) //still has file unread
427 else if ( filetype
== 1 )
431 else if ( filetype
== 2 )
439 //open a new file ...
442 found
= in_filenames_vt
[file_num
].find_last_of ( "." );
444 if ( found
== string::npos
)
446 //fp = fopen(in_filenames_vt[file_num].c_str(),"r"); //normal
447 fp
= ( FILE * ) open_file_robust ( "plain", in_filenames_vt
[file_num
].c_str(), "r" );
452 temp
= in_filenames_vt
[file_num
].substr ( found
);
454 if ( temp
.compare ( ".gz" ) == 0 ) //gzip
456 //temp = "gzip -dc ";
457 //temp.append(in_filenames_vt[file_num]);
458 //fp = popen(temp.c_str(),"r");
459 fp
= ( FILE * ) open_file_robust ( "gz", in_filenames_vt
[file_num
].c_str(), "r" );
462 else if ( temp
.compare ( ".bam" ) == 0 ) //bam
464 //fp3 = openFile4readb(in_filenames_vt[file_num].c_str());
465 fp3
= ( samfile_t
* ) open_file_robust ( "bam", in_filenames_vt
[file_num
].c_str(), "r" );
470 //fp = fopen(in_filenames_vt[file_num].c_str(),"r"); //normal
471 fp
= ( FILE * ) open_file_robust ( "plain", in_filenames_vt
[file_num
].c_str(), "r" );
478 fprintf ( stderr
, "ERROR: can't open file %s \n", in_filenames_vt
[file_num
].c_str() );
482 //fprintf(stderr, "processing file %d %s \n",file_num,in_filenames_vt[file_num].c_str());
483 fprintf ( stderr
, "Import reads from file:\n %s\n", in_filenames_vt
[file_num
].c_str() );
491 read_num
= read_num0
;
495 //printf("Io thread's job is finished! all reads: %llu \n",reads_all_num);
502 else if ( filetype
== 1 )
506 else if ( filetype
== 2 )
520 if ( ! ( io_stat1
) ) //fill buf1
522 io_stat1
= 1; //reading...
526 if ( filetype
== 0 || filetype
== 1 ) //normal or gzip reading ...
528 while ( i
< read_buf_sz
&& fp
)
530 if ( fgets ( line
, read_buf_len
, fp
) != NULL
)
545 read_buf1
[i
].clear();
546 read_buf1
[i
].append ( line
);
557 else if ( filetype
== 2 ) //bam reading
562 while ( i
< read_buf_sz
&& readstate
>= 0 ) //readstate
564 read1seqbam ( line
, src_name
, read_buf_len
, fp3
, &type
);
568 read_buf1
[i
].clear();
569 read_buf1
[i
].append ( line
);
576 fprintf ( stderr
, "ERROR: filetype is not support or filename has a wrong suffix!\n" );
583 while ( io_ready
!= 0 ) {usleep ( 1 );}; //wait for main send work sign
585 if ( i
== read_buf_sz
&& ( fp
|| fp3
) )
589 else if ( i
!= read_buf_sz
&& file_num
< in_filenames_vt
.size() - 1 ) //still has file unread
597 else if ( filetype
== 1 )
601 else if ( filetype
== 2 )
609 //open a new file ...
612 found
= in_filenames_vt
[file_num
].find_last_of ( "." );
614 if ( found
== string::npos
)
616 //fp = fopen(in_filenames_vt[file_num].c_str(),"r"); //normal
617 fp
= ( FILE * ) open_file_robust ( "plain", in_filenames_vt
[file_num
].c_str(), "r" );
622 temp
= in_filenames_vt
[file_num
].substr ( found
);
624 if ( temp
.compare ( ".gz" ) == 0 ) //gzip
626 //temp = "gzip -dc ";
627 //temp.append(in_filenames_vt[file_num]);
628 //fp = popen(temp.c_str(),"r");
629 fp
= ( FILE * ) open_file_robust ( "gz", in_filenames_vt
[file_num
].c_str(), "r" );
632 else if ( temp
.compare ( ".bam" ) == 0 ) //bam
634 //fp3 = openFile4readb(in_filenames_vt[file_num].c_str());
635 fp3
= ( samfile_t
* ) open_file_robust ( "bam", in_filenames_vt
[file_num
].c_str(), "r" );
640 //fp = fopen(in_filenames_vt[file_num].c_str(),"r"); //normal
641 fp
= ( FILE * ) open_file_robust ( "plain", in_filenames_vt
[file_num
].c_str(), "r" );
648 fprintf ( stderr
, "ERRPR: can't open file %s \n", in_filenames_vt
[file_num
].c_str() );
652 //fprintf(stderr,"processing file %d %s \n",file_num,in_filenames_vt[file_num].c_str());
653 fprintf ( stderr
, "Import reads from file:\n %s\n", in_filenames_vt
[file_num
].c_str() );
661 read_num
= read_num1
;
665 //fprintf(stderr,"Io thread's job is finished! all reads: %llu \n",reads_all_num);
670 else if ( filetype
== 1 )
674 else if ( filetype
== 2 )
693 /*************************************************
697 Reads sequence from bam and write it into *src_seq.
699 1. max_read_len: max read length
702 1. src_seq: read sequence
703 2. src_name: read name
704 3. type: record the "state" situation
707 *************************************************/
708 void read1seqbam ( char * src_seq
, char * src_name
, int max_read_len
, samfile_t
* in
, int * type
) //read one sequence from bam file
710 bam1_t
* b
= bam_init1 ();
717 unsigned int flag1
= 0;
721 if ( ( readstate
= samread ( in
, b
) ) >= 0 )
723 if ( !__g_skip_aln ( in
->header
, b
) )
725 line1
= bam_format1_core ( in
->header
, b
, in
->type
>> 2 & 3 );
728 //printf("%s\n", line2);
729 seq1
= strtok ( line1
, "\t" );
731 for ( i
= 0; i
< 10; i
++ )
735 sscanf ( seq1
, "%s", src_name
);
739 flag1
= atoi ( seq1
);
741 if ( flag1
& 0x0200 ) //whether it's good or not
743 //state(1st read state, 2nd read state) : -3(init), -2(0), -1(1), 0(0, 0), 1(0, 1), 2(1, 0), 3(1, 1)
777 else if ( i
== 9 ) //the sequence
779 //printf("%s\n", seq1);
780 len
= strlen ( seq1
);
782 if ( len
+ n
> max_read_len
)
783 { len
= max_read_len
- n
; }
785 for ( j
= 0; j
< len
; j
++ )
787 if ( seq1
[j
] >= 'a' && seq1
[j
] <= 'z' )
789 src_seq
[n
++] = ( char ) ( seq1
[j
] - 'a' + 'A' );
791 else if ( seq1
[j
] >= 'A' && seq1
[j
] <= 'Z' )
793 src_seq
[n
++] = seq1
[j
];
794 // after pre-process all the symbles would be a,g,c,t,n in lower or upper case.
796 else if ( seq1
[j
] == '.' )
799 } // after pre-process all the symbles would be a,g,c,t,n in lower or upper case.
808 if ( 0 == state
|| 1 == state
|| 2 == state
)
816 seq1
= strtok ( NULL
, "\t" );
825 /*************************************************
829 Opens a bam file for reads.
831 1. fname bam file name
836 *************************************************/
837 static samfile_t
* openFile4readb ( const char * fname
) //open file to read bam file
842 if ( ( in
= ( samfile_t
* ) samopen ( fname
, "rb", fn_list
) ) == 0 )
844 fprintf ( stderr
, "ERROR: Cannot open %s. Now exit to system...\n", fname
);
849 if ( in
->header
== 0 )
851 fprintf ( stderr
, "ERROR: Cannot read the header.\n" );
862 /*************************************************
866 It opens a file in a "failed-try-again" way.
867 It supports plain , gzip and bam format.
869 1. filetype the value enumerate here: "plain", "gz", "bam"
870 2. path the file path
871 3. mode read/write (rw) mode for plain type file, NOTE: gz or bam files are read only
875 A file pointer with void* type
876 *************************************************/
877 void * open_file_robust ( const char * filetype
, const char * path
, const char * mode
)
880 const int max_times
= 10;
881 const int max_sleep
= 60;
887 if ( strcmp ( filetype
, "plain" ) == 0 )
889 if ( access ( path
, 0 ) == 0 )
891 fp
= fopen ( path
, mode
);
894 else if ( strcmp ( filetype
, "gz" ) == 0 )
897 sprintf ( tmp
, "gzip -dc %s", path
);
899 if ( access ( path
, 0 ) == 0 )
901 fp
= popen ( tmp
, "r" );
903 if(fp && feof((FILE*)fp)){ //it's useless for "file not found but popen success" bug
909 else if ( strcmp ( filetype
, "bam" ) == 0 )
911 if ( access ( path
, 0 ) == 0 )
913 fp
= openFile4readb ( path
);
919 //fprintf(stderr,"%llx \n",fp);
924 fprintf ( stderr
, "ERROR: open file %s failed!\n", path
);
925 fprintf ( stderr
, "try opening it again after %d seconds\n", cur_sleep
);
930 if ( cur_sleep
>= max_sleep
)
932 cur_sleep
= max_sleep
;
935 if ( cur_times
> max_times
)
937 fprintf ( stderr
, "ERROR: can't open file %s , now exit system !!!", path
);