modified: nfig1.py
[GalaxyCodeBases.git] / BGI / SOAPdenovo2 / standardPregraph / lib.c
blob95e4eb753f8d49add71c47ec7e4ef617b9dc5ca6
1 /*
2 * lib.c
4 * Copyright (c) 2008-2012 BGI-Shenzhen <soap at genomics dot org dot cn>.
6 * This file is part of SOAPdenovo.
8 * SOAPdenovo is free software: you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation, either version 3 of the License, or
11 * (at your option) any later version.
13 * SOAPdenovo is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
18 * You should have received a copy of the GNU General Public License
19 * along with SOAPdenovo. If not, see <http://www.gnu.org/licenses/>.
23 #include "stdinc.h"
24 #include "newhash.h"
25 #include "kmerhash.h"
26 #include "extfunc.h"
27 #include "extvab.h"
29 static char tabs[2][1024]; //for splitColumn()
31 /*************************************************
32 Function:
33 getMaxLongReadLen
34 Description:
35 Get the max length for long reads (asm_flags=4) in lib.
36 Input:
37 1. num_libs: number of lib
38 Output:
39 None.
40 Return:
41 Max read length.
42 *************************************************/
43 int getMaxLongReadLen ( int num_libs )
45 int i;
46 int maxLong = 0;
47 boolean Has = 0;
49 for ( i = 0; i < num_libs; i++ )
51 if ( lib_array[i].asm_flag != 4 )
53 continue;
56 Has = 1;
57 maxLong = maxLong < lib_array[i].rd_len_cutoff ? lib_array[i].rd_len_cutoff : maxLong;
60 if ( !Has )
62 return maxLong;
64 else
66 return maxLong > 0 ? maxLong : maxReadLen;
70 static boolean splitColumn ( char * line )
72 int len = strlen ( line );
73 int i = 0, j;
74 int tabs_n = 0;
76 while ( i < len )
78 if ( line[i] >= 32 && line[i] <= 126 && line[i] != '=' )
80 j = 0;
82 while ( i < len && line[i] >= 32 && line[i] <= 126 && line[i] != '=' )
84 tabs[tabs_n][j++] = line[i];
85 i++;
88 tabs[tabs_n][j] = '\0';
89 tabs_n++;
91 if ( tabs_n == 2 )
93 return 1;
97 i++;
100 if ( tabs_n == 2 )
102 return 1;
104 else
106 return 0;
110 static int cmp_lib ( const void * a, const void * b )
112 LIB_INFO * A, *B;
113 A = ( LIB_INFO * ) a;
114 B = ( LIB_INFO * ) b;
116 if ( A->avg_ins > B->avg_ins )
118 return 1;
120 else if ( A->avg_ins == B->avg_ins )
122 return 0;
124 else
126 return -1;
130 void scan_libInfo ( char * libfile )
132 FILE * fp;
133 char line[1024], ch;
134 int i, j, index;
135 int libCounter;
136 boolean flag;
137 boolean * pe;
138 fp = ckopen ( libfile, "r" );
139 num_libs = 0;
141 while ( fgets ( line, 1024, fp ) )
143 ch = line[5];
144 line[5] = '\0';
146 if ( strcmp ( line, "[LIB]" ) == 0 )
148 num_libs++;
151 if ( !num_libs )
153 line[5] = ch;
154 flag = splitColumn ( line );
156 if ( !flag )
158 continue;
161 if ( strcmp ( tabs[0], "max_rd_len" ) == 0 )
163 maxReadLen = atoi ( tabs[1] );
168 if ( num_libs == 0 )
170 fprintf ( stderr, "Config file error: no [LIB] in file\n" );
171 exit ( -1 );
174 //count file numbers of each type
175 lib_array = ( LIB_INFO * ) ckalloc ( num_libs * sizeof ( LIB_INFO ) );
176 pe = ( boolean * ) ckalloc ( num_libs * sizeof ( boolean ) );
178 for ( i = 0; i < num_libs; i++ )
180 lib_array[i].asm_flag = 3;
181 lib_array[i].rank = 0;
182 lib_array[i].pair_num_cut = 0;
183 lib_array[i].rd_len_cutoff = 0;
184 lib_array[i].map_len = 0;
185 lib_array[i].num_s_a_file = 0;
186 lib_array[i].num_s_q_file = 0;
187 lib_array[i].num_p_file = 0;
188 lib_array[i].num_a1_file = 0;
189 lib_array[i].num_a2_file = 0;
190 lib_array[i].num_q1_file = 0;
191 lib_array[i].num_q2_file = 0;
192 lib_array[i].num_b_file = 0; //init
193 pe[i] = false;
196 libCounter = -1;
197 rewind ( fp );
198 i = -1;
200 while ( fgets ( line, 1024, fp ) )
202 ch = line[5];
203 line[5] = '\0';
205 if ( strcmp ( line, "[LIB]" ) == 0 )
207 i++;
208 continue;
211 line[5] = ch;
212 flag = splitColumn ( line );
214 if ( !flag )
216 continue;
219 if ( strcmp ( tabs[0], "f1" ) == 0 )
221 lib_array[i].num_a1_file++;
222 pe[i] = true;
224 else if ( strcmp ( tabs[0], "q1" ) == 0 )
226 lib_array[i].num_q1_file++;
227 pe[i] = true;
229 else if ( strcmp ( tabs[0], "f2" ) == 0 )
231 lib_array[i].num_a2_file++;
232 pe[i] = true;
234 else if ( strcmp ( tabs[0], "q2" ) == 0 )
236 lib_array[i].num_q2_file++;
237 pe[i] = true;
239 else if ( strcmp ( tabs[0], "f" ) == 0 )
241 lib_array[i].num_s_a_file++;
243 else if ( strcmp ( tabs[0], "q" ) == 0 )
245 lib_array[i].num_s_q_file++;
247 else if ( strcmp ( tabs[0], "p" ) == 0 )
249 lib_array[i].num_p_file++;
250 pe[i] = true;
252 else if ( strcmp ( tabs[0], "b" ) == 0 ) // the bam file
254 lib_array[i].num_b_file++;
255 pe[i] = true;
259 //allocate memory for filenames
260 for ( i = 0; i < num_libs; i++ )
262 if ( lib_array[i].num_a2_file != lib_array[i].num_a1_file )
264 fprintf ( stderr, "Config file error: the number of mark \"f1\" is not the same as \"f2\"!\n" );
265 exit ( -1 );
268 if ( lib_array[i].num_q2_file != lib_array[i].num_q1_file )
270 fprintf ( stderr, "Config file error: the number of mark \"q1\" is not the same as \"q2\"!\n" );
271 exit ( -1 );
274 if ( lib_array[i].num_s_a_file )
276 lib_array[i].s_a_fname = ( char ** ) ckalloc ( lib_array[i].num_s_a_file * sizeof ( char * ) );
278 for ( j = 0; j < lib_array[i].num_s_a_file; j++ )
280 lib_array[i].s_a_fname[j] = ( char * ) ckalloc ( 1024 * sizeof ( char ) );
284 if ( lib_array[i].num_s_q_file )
286 lib_array[i].s_q_fname = ( char ** ) ckalloc ( lib_array[i].num_s_q_file * sizeof ( char * ) );
288 for ( j = 0; j < lib_array[i].num_s_q_file; j++ )
290 lib_array[i].s_q_fname[j] = ( char * ) ckalloc ( 1024 * sizeof ( char ) );
294 if ( lib_array[i].num_p_file )
296 lib_array[i].p_fname = ( char ** ) ckalloc ( lib_array[i].num_p_file * sizeof ( char * ) );
298 for ( j = 0; j < lib_array[i].num_p_file; j++ )
300 lib_array[i].p_fname[j] = ( char * ) ckalloc ( 1024 * sizeof ( char ) );
304 if ( lib_array[i].num_a1_file )
306 lib_array[i].a1_fname = ( char ** ) ckalloc ( lib_array[i].num_a1_file * sizeof ( char * ) );
308 for ( j = 0; j < lib_array[i].num_a1_file; j++ )
310 lib_array[i].a1_fname[j] = ( char * ) ckalloc ( 1024 * sizeof ( char ) );
314 if ( lib_array[i].num_a2_file )
316 lib_array[i].a2_fname = ( char ** ) ckalloc ( lib_array[i].num_a2_file * sizeof ( char * ) );
318 for ( j = 0; j < lib_array[i].num_a2_file; j++ )
320 lib_array[i].a2_fname[j] = ( char * ) ckalloc ( 1024 * sizeof ( char ) );
324 if ( lib_array[i].num_q1_file )
326 lib_array[i].q1_fname = ( char ** ) ckalloc ( lib_array[i].num_q1_file * sizeof ( char * ) );
328 for ( j = 0; j < lib_array[i].num_q1_file; j++ )
330 lib_array[i].q1_fname[j] = ( char * ) ckalloc ( 1024 * sizeof ( char ) );
334 if ( lib_array[i].num_q2_file )
336 lib_array[i].q2_fname = ( char ** ) ckalloc ( lib_array[i].num_q2_file * sizeof ( char * ) );
338 for ( j = 0; j < lib_array[i].num_q2_file; j++ )
340 lib_array[i].q2_fname[j] = ( char * ) ckalloc ( 1024 * sizeof ( char ) );
344 if ( lib_array[i].num_b_file ) //allot memory for bam file name
346 lib_array[i].b_fname = ( char ** ) ckalloc ( lib_array[i].num_b_file * sizeof ( char * ) );
348 for ( j = 0; j < lib_array[i].num_b_file; j++ )
349 { lib_array[i].b_fname[j] = ( char * ) ckalloc ( 1024 * sizeof ( char ) ); }
353 // get file names
354 for ( i = 0; i < num_libs; i++ )
356 lib_array[i].curr_type = 1;
357 lib_array[i].curr_index = 0;
358 lib_array[i].fp1 = NULL;
359 lib_array[i].fp2 = NULL;
360 lib_array[i].num_s_a_file = 0;
361 lib_array[i].num_s_q_file = 0;
362 lib_array[i].num_p_file = 0;
363 lib_array[i].num_a1_file = 0;
364 lib_array[i].num_a2_file = 0;
365 lib_array[i].num_q1_file = 0;
366 lib_array[i].num_q2_file = 0;
367 lib_array[i].num_b_file = 0; //init
368 lib_array[i].fp3 = NULL;
371 libCounter = -1;
372 rewind ( fp );
373 i = -1;
375 while ( fgets ( line, 1024, fp ) )
377 ch = line[5];
378 line[5] = '\0';
380 if ( strcmp ( line, "[LIB]" ) == 0 )
382 i++;
383 continue;
386 line[5] = ch;
387 flag = splitColumn ( line );
389 if ( !flag )
391 continue;
394 if ( strcmp ( tabs[0], "f1" ) == 0 )
396 index = lib_array[i].num_a1_file++;
397 strcpy ( lib_array[i].a1_fname[index], tabs[1] );
399 else if ( strcmp ( tabs[0], "q1" ) == 0 )
401 index = lib_array[i].num_q1_file++;
402 strcpy ( lib_array[i].q1_fname[index], tabs[1] );
404 else if ( strcmp ( tabs[0], "f2" ) == 0 )
406 index = lib_array[i].num_a2_file++;
407 strcpy ( lib_array[i].a2_fname[index], tabs[1] );
409 if ( strcmp ( lib_array[i].a2_fname[index], lib_array[i].a1_fname[index] ) == 0 )
411 fprintf ( stderr, "Config file error: f2 file is the same as f1 file\n" );
412 fprintf ( stderr, "f1=%s\n", lib_array[i].a1_fname[index] );
413 fprintf ( stderr, "f2=%s\n", lib_array[i].a2_fname[index] );
414 exit ( -1 );
417 else if ( strcmp ( tabs[0], "q2" ) == 0 )
419 index = lib_array[i].num_q2_file++;
420 strcpy ( lib_array[i].q2_fname[index], tabs[1] );
422 if ( strcmp ( lib_array[i].q2_fname[index], lib_array[i].q1_fname[index] ) == 0 )
424 fprintf ( stderr, "Config file error: q2 file is the same as q1 file\n" );
425 fprintf ( stderr, "q1=%s\n", lib_array[i].q1_fname[index] );
426 fprintf ( stderr, "q2=%s\n", lib_array[i].q2_fname[index] );
427 exit ( -1 );
430 else if ( strcmp ( tabs[0], "f" ) == 0 )
432 index = lib_array[i].num_s_a_file++;
433 strcpy ( lib_array[i].s_a_fname[index], tabs[1] );
435 else if ( strcmp ( tabs[0], "q" ) == 0 )
437 index = lib_array[i].num_s_q_file++;
438 strcpy ( lib_array[i].s_q_fname[index], tabs[1] );
440 else if ( strcmp ( tabs[0], "p" ) == 0 )
442 index = lib_array[i].num_p_file++;
443 strcpy ( lib_array[i].p_fname[index], tabs[1] );
445 else if ( strcmp ( tabs[0], "b" ) == 0 )
447 //bam file
448 index = lib_array[i].num_b_file++;
449 strcpy ( lib_array[i].b_fname[index], tabs[1] );
451 else if ( strcmp ( tabs[0], "min_ins" ) == 0 )
453 lib_array[i].min_ins = atoi ( tabs[1] );
455 else if ( strcmp ( tabs[0], "max_ins" ) == 0 )
457 lib_array[i].max_ins = atoi ( tabs[1] );
459 else if ( strcmp ( tabs[0], "avg_ins" ) == 0 )
461 lib_array[i].avg_ins = atoi ( tabs[1] );
463 else if ( strcmp ( tabs[0], "rd_len_cutoff" ) == 0 )
465 lib_array[i].rd_len_cutoff = atoi ( tabs[1] );
467 else if ( strcmp ( tabs[0], "reverse_seq" ) == 0 )
469 lib_array[i].reverse = atoi ( tabs[1] );
471 else if ( strcmp ( tabs[0], "asm_flags" ) == 0 )
473 lib_array[i].asm_flag = atoi ( tabs[1] );
475 else if ( strcmp ( tabs[0], "rank" ) == 0 )
477 lib_array[i].rank = atoi ( tabs[1] );
479 else if ( strcmp ( tabs[0], "pair_num_cutoff" ) == 0 )
481 lib_array[i].pair_num_cut = atoi ( tabs[1] );
483 else if ( strcmp ( tabs[0], "rd_len_cutoff" ) == 0 )
485 lib_array[i].rd_len_cutoff = atoi ( tabs[1] );
487 else if ( strcmp ( tabs[0], "map_len" ) == 0 )
489 lib_array[i].map_len = atoi ( tabs[1] );
493 for ( i = 0; i < num_libs; i++ )
495 if ( pe[i] && lib_array[i].avg_ins == 0 )
497 fprintf ( stderr, "Config file error: PE reads need avg_ins in [LIB] %d\n", i + 1 );
498 exit ( -1 );
502 fclose ( fp );
503 qsort ( &lib_array[0], num_libs, sizeof ( LIB_INFO ), cmp_lib );
506 void free_libs ()
508 if ( !lib_array )
510 return;
513 int i, j;
514 fprintf ( stderr, "LIB(s) information:\n" );
516 for ( i = 0; i < num_libs; i++ )
518 fprintf ( stderr, " [LIB] %d, avg_ins %d, reverse %d.\n", i, lib_array[i].avg_ins, lib_array[i].reverse );
520 if ( lib_array[i].num_s_a_file )
522 //printf("%d single fasta files\n",lib_array[i].num_s_a_file);
523 for ( j = 0; j < lib_array[i].num_s_a_file; j++ )
525 free ( ( void * ) lib_array[i].s_a_fname[j] );
528 free ( ( void * ) lib_array[i].s_a_fname );
531 if ( lib_array[i].num_s_q_file )
533 //printf("%d single fastq files\n",lib_array[i].num_s_q_file);
534 for ( j = 0; j < lib_array[i].num_s_q_file; j++ )
536 free ( ( void * ) lib_array[i].s_q_fname[j] );
539 free ( ( void * ) lib_array[i].s_q_fname );
542 if ( lib_array[i].num_p_file )
544 //printf("%d paired fasta files\n",lib_array[i].num_p_file);
545 for ( j = 0; j < lib_array[i].num_p_file; j++ )
547 free ( ( void * ) lib_array[i].p_fname[j] );
550 free ( ( void * ) lib_array[i].p_fname );
553 if ( lib_array[i].num_a1_file )
555 //printf("%d read1 fasta files\n",lib_array[i].num_a1_file);
556 for ( j = 0; j < lib_array[i].num_a1_file; j++ )
558 free ( ( void * ) lib_array[i].a1_fname[j] );
561 free ( ( void * ) lib_array[i].a1_fname );
564 if ( lib_array[i].num_a2_file )
566 //printf("%d read2 fasta files\n",lib_array[i].num_a2_file);
567 for ( j = 0; j < lib_array[i].num_a2_file; j++ )
569 free ( ( void * ) lib_array[i].a2_fname[j] );
572 free ( ( void * ) lib_array[i].a2_fname );
575 if ( lib_array[i].num_q1_file )
577 //printf("%d read1 fastq files\n",lib_array[i].num_q1_file);
578 for ( j = 0; j < lib_array[i].num_q1_file; j++ )
580 free ( ( void * ) lib_array[i].q1_fname[j] );
583 free ( ( void * ) lib_array[i].q1_fname );
586 if ( lib_array[i].num_q2_file )
588 //printf("%d read2 fastq files\n",lib_array[i].num_q2_file);
589 for ( j = 0; j < lib_array[i].num_q2_file; j++ )
591 free ( ( void * ) lib_array[i].q2_fname[j] );
594 free ( ( void * ) lib_array[i].q2_fname );
597 if ( lib_array[i].num_b_file )
599 //free the bam file name
600 //printf("%d bam files\n",lib_array[i].num_b_file);
601 for ( j = 0; j < lib_array[i].num_b_file; j++ )
602 { free ( ( void * ) lib_array[i].b_fname[j] ); }
604 free ( ( void * ) lib_array[i].b_fname );
608 num_libs = 0;
609 free ( ( void * ) lib_array );
612 void alloc_pe_mem ( int gradsCounter )
614 if ( gradsCounter )
616 pes = ( PE_INFO * ) ckalloc ( gradsCounter * sizeof ( PE_INFO ) );
620 void free_pe_mem ()
622 if ( pes )
624 free ( ( void * ) pes );
625 pes = NULL;