modified: nfig1.py
[GalaxyCodeBases.git] / BGI / SOAPdenovo2 / sparsePregraph / io_func.cpp
blob6832bcea9499377898a621026908d73b1a0559fa
1 /*
2 * io_func.cpp
4 * Copyright (c) 2008-2012 BGI-Shenzhen <soap at genomics dot org dot cn>.
6 * This file is part of SOAPdenovo.
8 * SOAPdenovo is free software: you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation, either version 3 of the License, or
11 * (at your option) any later version.
13 * SOAPdenovo is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
18 * You should have received a copy of the GNU General Public License
19 * along with SOAPdenovo. If not, see <http://www.gnu.org/licenses/>.
22 #include "io_func.h"
23 #include "stdinc.h"
25 #include <math.h>
26 #include <unistd.h>
28 #include "bam.h"
29 #include "faidx.h"
30 #include "knetfile.h"
31 #include "sam_view.h"
32 #include "xcurses.h"
33 #include "zlib.h"
34 #include "bgzf.h"
35 #include "glf.h"
36 #include "kstring.h"
37 #include "razf.h"
38 #include "sam_header.h"
39 #include "zconf.h"
41 #include "sam.h"
42 #include "errno.h"
47 //for bam reading ...
48 static int state = -3;
49 static int readstate = 0;
51 static samfile_t * openFile4readb ( const char * fname );
53 void read1seqbam ( char * src_seq, char * src_name, int max_read_len, samfile_t * in, int * type );
56 /*************************************************
57 Function:
58 filter_N
59 Description:
60 filters N on reads.
61 Input:
62 1. seq_s: input read sequence
63 Output:
64 1. seq_s: read sequence without N
65 2. bad_flag: indicat whether the sequence is vaild
66 Return:
67 None.
68 *************************************************/
69 void filter_N ( string & seq_s, int & bad_flag )
71 //global max_rd_len
72 //global min_rd_len
73 if ( seq_s.size() > max_rd_len )
75 max_rd_len = seq_s.size();
78 if ( seq_s.size() < min_rd_len )
80 min_rd_len = seq_s.size();
83 bad_flag = 0;
85 if ( seq_s[seq_s.size() - 1] == '\n' || seq_s[seq_s.size() - 1] == '\r' )
87 seq_s.resize ( seq_s.size() - 1 );
90 int seq_sz = seq_s.size();
91 int nN = seq_sz, isN = -1;
93 for ( int i = 0; i < seq_sz; ++i )
95 if ( seq_s[i] == '-' || seq_s[i] == 'N' )
97 if ( i <= seq_sz / 2 )
99 isN = i;
100 continue;
102 else
104 nN = i - 1;
105 break;
110 if ( nN == seq_sz && isN == -1 ) {bad_flag = 0; return;}
112 if ( ( nN - isN ) <= seq_sz / 2 )
114 bad_flag = 1;
117 if ( bad_flag == 1 ) { return; }
119 seq_s = seq_s.substr ( isN + 1, nN - isN );
124 /*************************************************
125 Function:
126 read_lib
127 Description:
128 When the asm_flags equals 1 or 3, adds the filename into vector filenames.
129 Input:
130 1. filenames: filenames vector
131 2. lib_file: read lib config file
132 Output:
133 1. filenames: filenames vector
134 Return:
135 None.
136 *************************************************/
137 void read_lib ( vector<string> &filenames, char * lib_file )
139 ifstream lib_in ( lib_file );
140 string str;
141 int read_stat = 0; // 1: begin a lib, 2:asm_flags=1 or 3
142 size_t found;
143 int asm_flags;
145 while ( getline ( lib_in, str ) )
147 if ( read_stat == 0 ) //not start a lib
149 found = str.find ( "[LIB]" );
151 if ( found == string::npos )
153 continue;
155 else
157 read_stat = 1;
158 asm_flags = 0;
161 else if ( read_stat == 1 ) //start reading a lib fetch asm flags
163 //split by "="
164 found = str.find ( "asm_flags" );
166 if ( found == string::npos )
168 continue;
170 else
172 found = str.find ( "=" );
173 str = str.substr ( found + 1, str.size() - found );
175 if ( str.size() == 0 )
177 fprintf ( stderr, "ERROR: please check asm_flags in lib file\n" );
178 exit ( -1 );
181 asm_flags = atoi ( str.c_str() );
183 if ( asm_flags == 1 || asm_flags == 3 )
185 read_stat = 2;
187 else
189 read_stat = 0; // next lib
193 else if ( read_stat == 2 ) // reading file
195 found = str.find_first_of ( "fqpb" );
197 if ( found == 0 ) //f1 f2 q1 q2 p b
199 found = str.find ( "=" );
201 if ( found > 2 ) {continue;} // the "=" should be the second or thrid poistion
203 str = str.substr ( found + 1, str.size() - found );
204 filenames.push_back ( str );
206 else
208 found = str.find ( "[LIB]" );
210 if ( found == string::npos )
212 continue;
214 else
216 read_stat = 1;
217 asm_flags = 0;
225 /*************************************************
226 Function:
227 sendIOWorkSignal
228 Description:
229 Initializes the io status, makes the io threads ready to work.
230 Input:
231 None
232 Output:
233 None
234 Return:
235 None
236 *************************************************/
237 void sendIOWorkSignal()
239 if ( io_ready == 2 ) { return ; } //finish io job
241 io_stat0 = 0;
242 io_stat1 = 0;
243 io_ready = 0;
247 /*************************************************
248 Function:
249 run_io_thread_main
250 Description:
251 This is the main io thread working rountine.
252 Two buffer "read_buf0" and "read_buf0" are applied to implement AIO.
253 It makes the pointer "seq_t" always pointing to the filled buffer.
254 Input:
255 1. arg: args for io threads
256 Output:
257 None.
258 Return:
259 None.
260 *************************************************/
262 void * run_io_thread_main ( void * arg )
264 io_para_main * paras;
265 paras = ( io_para_main * ) arg;
267 if ( !paras )
269 fprintf ( stderr, "ERROR: the argument passed to main io thread is NULL!\n" );
270 exit ( -1 );
273 int read_buf_sz = paras->read_buf_sz;
274 vector<string> in_filenames_vt = * ( paras->in_filenames_vt );
276 for(int i=0;i<in_filenames_vt.size();i++){
277 printf("%s\n", in_filenames_vt[i].c_str());
279 int read_num0 = 0;
280 int read_num1 = 0;
281 char line[1024];
282 int read_buf_len = 1024;
283 FILE * fp = NULL; //for normal and gzip reading
284 samfile_t * fp3 = NULL; //for bam reading
285 int filetype = 0; //0 normal 1 gzip 2 bam
286 int file_num = 0;
288 if ( in_filenames_vt.size() >= 1 )
290 string temp;
291 size_t found;
292 found = in_filenames_vt[file_num].find_last_of ( "." );
294 if ( found == string::npos )
296 //fp = fopen(in_filenames_vt[file_num].c_str(),"r"); //normal
297 fp = ( FILE * ) open_file_robust ( "plain", in_filenames_vt[file_num].c_str(), "r" );
298 filetype = 0;
300 else
302 temp = in_filenames_vt[file_num].substr ( found );
304 if ( temp.compare ( ".gz" ) == 0 ) //gzip
306 //temp = "gzip -dc ";
307 //temp.append(in_filenames_vt[file_num]);
308 //fp = popen(temp.c_str(),"r");
309 fp = ( FILE * ) open_file_robust ( "gz", in_filenames_vt[file_num].c_str(), "r" );
310 filetype = 1;
312 else if ( temp.compare ( ".bam" ) == 0 ) //bam
314 //fp3 = openFile4readb(in_filenames_vt[file_num].c_str());
315 fp3 = ( samfile_t * ) open_file_robust ( "bam", in_filenames_vt[file_num].c_str(), "r" );
316 filetype = 2;
318 else
320 //fp = fopen(in_filenames_vt[file_num].c_str(),"r"); //normal
321 fp = ( FILE * ) open_file_robust ( "plain", in_filenames_vt[file_num].c_str(), "r" );
322 filetype = 0;
326 if ( !fp && !fp3 )
328 fprintf ( stderr, "ERROR: can't open file %s \n", in_filenames_vt[file_num].c_str() );
329 exit ( 1 );
332 else
334 fprintf ( stderr, "ERROR: input filenames vector is empty! please check the reads config file,option \"asm_flags\" is requried!\n" );
335 exit ( 1 );
338 //fprintf(stderr,"processing file %d %s \n",file_num,in_filenames_vt[file_num].c_str());
339 fprintf ( stderr, "Import reads from file:\n %s\n", in_filenames_vt[file_num].c_str() );
341 while ( 1 )
343 while ( ( io_stat0 ) && ( io_stat1 ) )
345 usleep ( 1 );
348 if ( ! ( io_stat0 ) ) //fill buf0
350 io_stat0 = 1;// reading reads stat
351 int ready = 0;
352 int i = 0;
354 if ( filetype == 0 || filetype == 1 ) //normal or gzip reading ...
356 while ( i < read_buf_sz )
358 if ( fgets ( line, read_buf_len, fp ) != NULL )
360 switch ( line[0] )
362 case '@':
363 case '>':
364 ready = 1;
365 break;
366 case '+':
367 ready = 0;
368 break;
369 default:
371 if ( ready )
373 read_buf0[i].clear();
374 read_buf0[i].append ( line );
375 i++;
379 else
381 break;
385 else if ( filetype == 2 ) //bam reading
387 int type = 0;
388 char src_name[128];
390 while ( i < read_buf_sz && readstate >= 0 ) //readstate
392 read1seqbam ( line, src_name, read_buf_len, fp3, &type );
394 if ( type != -1 )
396 read_buf0[i].clear();
397 read_buf0[i].append ( line );
398 //cout<<"line:"<<line<<endl;
399 //cout<<"buf0:"<<read_buf0[i]<<endl;
400 i++;
404 else
406 fprintf ( stderr, "ERROR: filetype is not support or filename has a wrong suffix!\n" );
407 exit ( -1 );
410 read_num0 = i;
411 reads_all_num += i;
413 while ( io_ready != 0 ) {usleep ( 1 );}; //wait for main send work sign
415 if ( i == read_buf_sz )
417 io_stat0 = 2;
419 else if ( i != read_buf_sz && file_num < in_filenames_vt.size() - 1 ) //still has file unread
421 io_stat0 = 2;
423 if ( filetype == 0 )
425 fclose ( fp );
427 else if ( filetype == 1 )
429 pclose ( fp );
431 else if ( filetype == 2 )
433 samclose ( fp3 );
434 state = -3;
435 readstate = 0;
438 file_num++;
439 //open a new file ...
440 string temp;
441 size_t found;
442 found = in_filenames_vt[file_num].find_last_of ( "." );
444 if ( found == string::npos )
446 //fp = fopen(in_filenames_vt[file_num].c_str(),"r"); //normal
447 fp = ( FILE * ) open_file_robust ( "plain", in_filenames_vt[file_num].c_str(), "r" );
448 filetype = 0;
450 else
452 temp = in_filenames_vt[file_num].substr ( found );
454 if ( temp.compare ( ".gz" ) == 0 ) //gzip
456 //temp = "gzip -dc ";
457 //temp.append(in_filenames_vt[file_num]);
458 //fp = popen(temp.c_str(),"r");
459 fp = ( FILE * ) open_file_robust ( "gz", in_filenames_vt[file_num].c_str(), "r" );
460 filetype = 1;
462 else if ( temp.compare ( ".bam" ) == 0 ) //bam
464 //fp3 = openFile4readb(in_filenames_vt[file_num].c_str());
465 fp3 = ( samfile_t * ) open_file_robust ( "bam", in_filenames_vt[file_num].c_str(), "r" );
466 filetype = 2;
468 else
470 //fp = fopen(in_filenames_vt[file_num].c_str(),"r"); //normal
471 fp = ( FILE * ) open_file_robust ( "plain", in_filenames_vt[file_num].c_str(), "r" );
472 filetype = 0;
476 if ( !fp && !fp3 )
478 fprintf ( stderr, "ERROR: can't open file %s \n", in_filenames_vt[file_num].c_str() );
479 exit ( 1 );
482 //fprintf(stderr, "processing file %d %s \n",file_num,in_filenames_vt[file_num].c_str());
483 fprintf ( stderr, "Import reads from file:\n %s\n", in_filenames_vt[file_num].c_str() );
485 else
487 io_stat0 = 3;
490 seq_t = read_buf0;
491 read_num = read_num0;
493 if ( io_stat0 == 3 )
495 //printf("Io thread's job is finished! all reads: %llu \n",reads_all_num);
497 //close the file...
498 if ( filetype == 0 )
500 fclose ( fp );
502 else if ( filetype == 1 )
504 pclose ( fp );
506 else if ( filetype == 2 )
508 samclose ( fp3 );
509 state = -3;
510 readstate = 0;
513 io_ready = 2;
514 break;
517 io_ready = 1;
520 if ( ! ( io_stat1 ) ) //fill buf1
522 io_stat1 = 1; //reading...
523 int ready = 0;
524 int i = 0;
526 if ( filetype == 0 || filetype == 1 ) //normal or gzip reading ...
528 while ( i < read_buf_sz && fp )
530 if ( fgets ( line, read_buf_len, fp ) != NULL )
532 switch ( line[0] )
534 case '@':
535 case '>':
536 ready = 1;
537 break;
538 case '+':
539 ready = 0;
540 break;
541 default:
543 if ( ready )
545 read_buf1[i].clear();
546 read_buf1[i].append ( line );
547 i++;
551 else
553 break;
557 else if ( filetype == 2 ) //bam reading
559 int type = 0;
560 char src_name[128];
562 while ( i < read_buf_sz && readstate >= 0 ) //readstate
564 read1seqbam ( line, src_name, read_buf_len, fp3, &type );
566 if ( type != -1 )
568 read_buf1[i].clear();
569 read_buf1[i].append ( line );
570 i++;
574 else
576 fprintf ( stderr, "ERROR: filetype is not support or filename has a wrong suffix!\n" );
577 exit ( 1 );
580 read_num1 = i;
581 reads_all_num += i;
583 while ( io_ready != 0 ) {usleep ( 1 );}; //wait for main send work sign
585 if ( i == read_buf_sz && ( fp || fp3 ) )
587 io_stat1 = 2;
589 else if ( i != read_buf_sz && file_num < in_filenames_vt.size() - 1 ) //still has file unread
591 io_stat1 = 2;
593 if ( filetype == 0 )
595 fclose ( fp );
597 else if ( filetype == 1 )
599 pclose ( fp );
601 else if ( filetype == 2 )
603 samclose ( fp3 );
604 state = -3;
605 readstate = 0;
608 file_num++;
609 //open a new file ...
610 string temp;
611 size_t found;
612 found = in_filenames_vt[file_num].find_last_of ( "." );
614 if ( found == string::npos )
616 //fp = fopen(in_filenames_vt[file_num].c_str(),"r"); //normal
617 fp = ( FILE * ) open_file_robust ( "plain", in_filenames_vt[file_num].c_str(), "r" );
618 filetype = 0;
620 else
622 temp = in_filenames_vt[file_num].substr ( found );
624 if ( temp.compare ( ".gz" ) == 0 ) //gzip
626 //temp = "gzip -dc ";
627 //temp.append(in_filenames_vt[file_num]);
628 //fp = popen(temp.c_str(),"r");
629 fp = ( FILE * ) open_file_robust ( "gz", in_filenames_vt[file_num].c_str(), "r" );
630 filetype = 1;
632 else if ( temp.compare ( ".bam" ) == 0 ) //bam
634 //fp3 = openFile4readb(in_filenames_vt[file_num].c_str());
635 fp3 = ( samfile_t * ) open_file_robust ( "bam", in_filenames_vt[file_num].c_str(), "r" );
636 filetype = 2;
638 else
640 //fp = fopen(in_filenames_vt[file_num].c_str(),"r"); //normal
641 fp = ( FILE * ) open_file_robust ( "plain", in_filenames_vt[file_num].c_str(), "r" );
642 filetype = 0;
646 if ( !fp && !fp3 )
648 fprintf ( stderr, "ERRPR: can't open file %s \n", in_filenames_vt[file_num].c_str() );
649 exit ( 1 );
652 //fprintf(stderr,"processing file %d %s \n",file_num,in_filenames_vt[file_num].c_str());
653 fprintf ( stderr, "Import reads from file:\n %s\n", in_filenames_vt[file_num].c_str() );
655 else
657 io_stat1 = 3;
660 seq_t = read_buf1;
661 read_num = read_num1;
663 if ( io_stat1 == 3 )
665 //fprintf(stderr,"Io thread's job is finished! all reads: %llu \n",reads_all_num);
666 if ( filetype == 0 )
668 fclose ( fp );
670 else if ( filetype == 1 )
672 pclose ( fp );
674 else if ( filetype == 2 )
676 samclose ( fp3 );
677 state = -3;
678 readstate = 0;
681 io_ready = 2;
682 break;
685 io_ready = 1;
689 return NULL;
693 /*************************************************
694 Function:
695 read1seqbam
696 Description:
697 Reads sequence from bam and write it into *src_seq.
698 Input:
699 1. max_read_len: max read length
700 2. in: sam file
701 Output:
702 1. src_seq: read sequence
703 2. src_name: read name
704 3. type: record the "state" situation
705 Return:
706 None.
707 *************************************************/
708 void read1seqbam ( char * src_seq, char * src_name, int max_read_len, samfile_t * in, int * type ) //read one sequence from bam file
710 bam1_t * b = bam_init1 ();
711 char c;
712 char * line1 = NULL;
713 int n = 0;
714 int len;
715 int i, j;
716 char * seq1;
717 unsigned int flag1 = 0;
718 *type = 0;
719 readstate = 0;
721 if ( ( readstate = samread ( in, b ) ) >= 0 )
723 if ( !__g_skip_aln ( in->header, b ) )
725 line1 = bam_format1_core ( in->header, b, in->type >> 2 & 3 );
728 //printf("%s\n", line2);
729 seq1 = strtok ( line1, "\t" );
731 for ( i = 0; i < 10; i++ )
733 if ( i == 0 )
735 sscanf ( seq1, "%s", src_name );
737 else if ( i == 1 )
739 flag1 = atoi ( seq1 );
741 if ( flag1 & 0x0200 ) //whether it's good or not
743 //state(1st read state, 2nd read state) : -3(init), -2(0), -1(1), 0(0, 0), 1(0, 1), 2(1, 0), 3(1, 1)
744 switch ( state )
746 case -3:
747 state = -2;
748 break;
749 case -2:
750 state = 0;
751 break;
752 case -1:
753 state = 2;
754 break;
755 default:
756 state = -3;
759 else
761 switch ( state )
763 case -3:
764 state = -1;
765 break;
766 case -2:
767 state = 1;
768 break;
769 case -1:
770 state = 3;
771 break;
772 default:
773 state = -3;
777 else if ( i == 9 ) //the sequence
779 //printf("%s\n", seq1);
780 len = strlen ( seq1 );
782 if ( len + n > max_read_len )
783 { len = max_read_len - n; }
785 for ( j = 0; j < len; j++ )
787 if ( seq1[j] >= 'a' && seq1[j] <= 'z' )
789 src_seq[n++] = ( char ) ( seq1[j] - 'a' + 'A' );
791 else if ( seq1[j] >= 'A' && seq1[j] <= 'Z' )
793 src_seq[n++] = seq1[j];
794 // after pre-process all the symbles would be a,g,c,t,n in lower or upper case.
796 else if ( seq1[j] == '.' )
798 src_seq[n++] = 'A';
799 } // after pre-process all the symbles would be a,g,c,t,n in lower or upper case.
802 if ( 3 == state )
804 state = -3;
806 else
808 if ( 0 == state || 1 == state || 2 == state )
810 state = -3;
811 *type = -1;
816 seq1 = strtok ( NULL, "\t" );
820 free ( line1 );
821 bam_destroy1 ( b );
822 src_seq[n++] = '\0';
825 /*************************************************
826 Function:
827 openFile4readb
828 Description:
829 Opens a bam file for reads.
830 Input:
831 1. fname bam file name
832 Output:
833 None.
834 Return:
835 a samfile pointer
836 *************************************************/
837 static samfile_t * openFile4readb ( const char * fname ) //open file to read bam file
839 samfile_t * in;
840 char * fn_list = 0;
842 if ( ( in = ( samfile_t * ) samopen ( fname, "rb", fn_list ) ) == 0 )
844 fprintf ( stderr, "ERROR: Cannot open %s. Now exit to system...\n", fname );
845 return NULL;
846 //exit (-1);
849 if ( in->header == 0 )
851 fprintf ( stderr, "ERROR: Cannot read the header.\n" );
852 return NULL;
853 //exit (-1);
856 return ( in );
862 /*************************************************
863 Function:
864 open_file_robust
865 Description:
866 It opens a file in a "failed-try-again" way.
867 It supports plain , gzip and bam format.
868 Input:
869 1. filetype the value enumerate here: "plain", "gz", "bam"
870 2. path the file path
871 3. mode read/write (rw) mode for plain type file, NOTE: gz or bam files are read only
872 Output:
873 None
874 Return:
875 A file pointer with void* type
876 *************************************************/
877 void * open_file_robust ( const char * filetype, const char * path, const char * mode )
879 void * fp = NULL;
880 const int max_times = 10;
881 const int max_sleep = 60;
882 int cur_times = 1;
883 int cur_sleep = 1;
885 while ( !fp )
887 if ( strcmp ( filetype, "plain" ) == 0 )
889 if ( access ( path, 0 ) == 0 )
891 fp = fopen ( path, mode );
894 else if ( strcmp ( filetype, "gz" ) == 0 )
896 char tmp[256];
897 sprintf ( tmp, "gzip -dc %s", path );
899 if ( access ( path, 0 ) == 0 )
901 fp = popen ( tmp, "r" );
903 if(fp && feof((FILE*)fp)){ //it's useless for "file not found but popen success" bug
904 pclose((FILE*)fp);
905 fp = NULL;
909 else if ( strcmp ( filetype, "bam" ) == 0 )
911 if ( access ( path, 0 ) == 0 )
913 fp = openFile4readb ( path );
917 if ( fp )
919 //fprintf(stderr,"%llx \n",fp);
920 return fp;
922 else
924 fprintf ( stderr, "ERROR: open file %s failed!\n", path );
925 fprintf ( stderr, "try opening it again after %d seconds\n", cur_sleep );
926 sleep ( cur_sleep );
927 cur_times ++;
928 cur_sleep *= 2;
930 if ( cur_sleep >= max_sleep )
932 cur_sleep = max_sleep;
935 if ( cur_times > max_times )
937 fprintf ( stderr, "ERROR: can't open file %s , now exit system !!!", path );
938 exit ( -1 );
939 return NULL;