modified: makefile
[GalaxyCodeBases.git] / BGI / SOAPdenovo2 / standardPregraph / loadGraph.c
blobf9dafdd2ab1abcf0640e7b7ecddd03b7b79439c0
1 /*
2 * loadGraph.c
4 * Copyright (c) 2008-2012 BGI-Shenzhen <soap at genomics dot org dot cn>.
6 * This file is part of SOAPdenovo.
8 * SOAPdenovo is free software: you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation, either version 3 of the License, or
11 * (at your option) any later version.
13 * SOAPdenovo is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
18 * You should have received a copy of the GNU General Public License
19 * along with SOAPdenovo. If not, see <http://www.gnu.org/licenses/>.
23 #include "stdinc.h"
24 #include "newhash.h"
25 #include "kmerhash.h"
26 #include "extfunc.h"
27 #include "extvab.h"
29 #define preARCBLOCKSIZE 100000
31 static unsigned int loadArcs ( char * graphfile );
32 static void loadContig ( char * graphfile );
33 static int maskRepeatByArc ( unsigned avg_weight );
36 void loadUpdatedVertex (char *graphfile)
38 char name[256], line[256];
39 FILE *fp;
40 Kmer word, bal_word;
41 int num_kmer, i;
42 char ch;
44 sprintf (name, "%s.updated.vertex", graphfile);
45 fp = ckopen (name, "r");
47 while (fgets (line, sizeof (line), fp) != NULL)
49 if (line[0] == 'V')
51 sscanf (line + 6, "%d %c %d", &num_kmer, &ch, &overlaplen);
52 printf ("there're %d kmers in vertex file\n", num_kmer);
53 break;
57 vt_array = (VERTEX *) ckalloc ((2 * num_kmer) * sizeof (VERTEX));
59 for (i = 0; i < num_kmer; i++)
61 fscanf (fp, "%llx ", &word);
62 vt_array[i].kmer = word;
65 fclose (fp);
67 for (i = 0; i < num_kmer; i++)
69 bal_word = reverseComplement (vt_array[i].kmer, overlaplen);
70 vt_array[i + num_kmer].kmer = bal_word;
73 num_vt = num_kmer;
74 }*/
76 int uniqueLenSearch ( unsigned int * len_array, unsigned int * flag_array, int num, unsigned int target )
78 int mid, low, high;
79 low = 1;
80 high = num;
82 while ( low <= high )
84 mid = ( low + high ) / 2;
86 if ( len_array[mid] == target )
88 break;
90 else if ( target > len_array[mid] )
92 low = mid + 1;
94 else
96 high = mid - 1;
100 if ( low > high )
102 return -1;
105 //locate the first same length unflaged
106 return flag_array[mid]++;
109 int lengthSearch ( unsigned int * len_array, unsigned int * flag_array, int num, unsigned int target )
111 int mid, low, high, i;
112 low = 1;
113 high = num;
115 while ( low <= high )
117 mid = ( low + high ) / 2;
119 if ( len_array[mid] == target )
121 break;
123 else if ( target > len_array[mid] )
125 low = mid + 1;
127 else
129 high = mid - 1;
133 if ( low > high )
135 return -1;
138 //locate the first same length unflaged
139 if ( !flag_array[mid] )
141 for ( i = mid - 1; i > 0; i-- )
143 if ( len_array[i] != len_array[mid] || flag_array[i] )
145 break;
149 flag_array[i + 1] = 1;
150 return i + 1;
152 else
154 for ( i = mid + 1; i <= num; i++ )
156 if ( !flag_array[i] )
158 break;
162 flag_array[i] = 1;
163 return i;
167 void quick_sort_int ( unsigned int * length_array, int low, int high )
169 int i, j;
170 unsigned int pivot;
172 if ( low < high )
174 pivot = length_array[low];
175 i = low;
176 j = high;
178 while ( i < j )
180 while ( i < j && length_array[j] >= pivot )
182 j--;
185 if ( i < j )
187 length_array[i++] = length_array[j];
190 while ( i < j && length_array[i] <= pivot )
192 i++;
195 if ( i < j )
197 length_array[j--] = length_array[i];
201 length_array[i] = pivot;
202 quick_sort_int ( length_array, low, i - 1 );
203 quick_sort_int ( length_array, i + 1, high );
207 static int maskRepeatByArc ( unsigned avg_weight )
209 unsigned int i, bal_i;
210 int counter = 0;
211 int arc_num;
212 unsigned int arc_weight1, arc_weight2;
213 preARC * arc;
215 for ( i = 1; i <= num_ctg; ++i )
217 if ( contig_array[i].mask == 1 )
219 if ( isSmallerThanTwin ( i ) )
221 ++i;
224 continue;
227 bal_i = getTwinCtg ( i );
228 arc = contig_array[bal_i].arcs;
229 arc_weight1 = maxArcWeight ( arc );
230 arc = contig_array[i].arcs;
231 arc_weight2 = maxArcWeight ( arc );
233 if ( arc_weight1 + arc_weight2 >= 4 * avg_weight )
235 contig_array[i].mask = 1;
236 contig_array[bal_i].mask = 1;
238 if ( i == bal_i ) { counter += 1; }
239 else { counter += 2; }
242 if ( isSmallerThanTwin ( i ) )
244 ++i;
248 return counter;
251 /*************************************************
252 Function:
253 loadUpdatedEdges
254 Description:
255 Loads contig information and masks some contigs according to setting.
256 Input:
257 1. graphfile: prefix of graph file
258 Output:
259 None.
260 Return:
261 None.
262 *************************************************/
263 void loadUpdatedEdges ( char * graphfile )
265 char c, name[256], line[1024];
266 int bal_ed, cvg;
267 FILE * fp, *out_fp;
268 Kmer from_kmer, to_kmer;
269 unsigned int num_ctgge, length, index = 0, num_kmer;
270 unsigned int i = 0, j;
271 int newIndex;
272 unsigned int * length_array, *flag_array, diff_len;
273 char * outfile = graphfile;
274 long long cvgSum = 0;
275 long long counter = 0;
276 unsigned int avg_arc_wt;
277 int ctg_short_cutoff;
278 float high_cvg_cutoff1, high_cvg_cutoff2, low_cvg_cutoff;
279 int cut_len;
280 //get overlaplen from *.preGraphBasic
281 sprintf ( name, "%s.preGraphBasic", graphfile );
282 fp = ckopen ( name, "r" );
284 while ( fgets ( line, sizeof ( line ), fp ) != NULL )
286 if ( line[0] == 'V' )
288 sscanf ( line + 6, "%d %c %d", &num_kmer, &c, &overlaplen );
289 fprintf ( stderr, "Kmer size: %d\n", overlaplen );
290 break;
294 cut_len = COMPATIBLE_MODE == 0 ? overlaplen : 0;
296 if ( ctg_short == 0 )
298 ctg_short = overlaplen + 2;
301 ctg_short_cutoff = 2 * overlaplen + 2 < 100 ? 100 : 0;
302 fclose ( fp );
303 sprintf ( name, "%s.updated.edge", graphfile );
304 fp = ckopen ( name, "r" );
305 sprintf ( name, "%s.newContigIndex", outfile );
306 out_fp = ckopen ( name, "w" );
308 while ( fgets ( line, sizeof ( line ), fp ) != NULL )
310 if ( line[0] == 'E' )
312 sscanf ( line + 5, "%d", &num_ctgge );
313 fprintf ( stderr, "There are %d edge(s) in edge file.\n", num_ctgge );
314 break;
318 index_array = ( unsigned int * ) ckalloc ( ( num_ctgge + 1 ) * sizeof ( unsigned int ) );
319 length_array = ( unsigned int * ) ckalloc ( ( num_ctgge + 1 ) * sizeof ( unsigned int ) );
320 flag_array = ( unsigned int * ) ckalloc ( ( num_ctgge + 1 ) * sizeof ( unsigned int ) );
322 while ( fgets ( line, sizeof ( line ), fp ) != NULL )
324 if ( line[0] == '>' )
326 sscanf ( line + 7, "%d", &length );
327 index_array[++index] = length;
328 length_array[++i] = length;
332 num_ctg = index;
333 orig2new = 1;
334 qsort ( & ( length_array[1] ), num_ctg, sizeof ( length_array[0] ), cmp_int );
335 //extract unique length
336 diff_len = 0;
338 for ( i = 1; i <= num_ctg; i++ )
340 for ( j = i + 1; j <= num_ctg; j++ )
341 if ( length_array[j] != length_array[i] )
343 break;
346 length_array[++diff_len] = length_array[i];
347 flag_array[diff_len] = i;
348 i = j - 1;
351 contig_array = ( CONTIG * ) ckalloc ( ( num_ctg + 1 ) * sizeof ( CONTIG ) );
352 //load edges
353 index = 0;
354 rewind ( fp );
356 while ( fgets ( line, sizeof ( line ), fp ) != NULL )
358 if ( line[0] == '>' )
360 sscanf ( line, ">length %u,%d,%d", &length, &bal_ed, &cvg );
361 newIndex = uniqueLenSearch ( length_array, flag_array, diff_len, length );
362 index_array[++index] = newIndex;
364 if ( length != 0 ) { contig_array[newIndex].length = length - cut_len; }
365 else { contig_array[newIndex].length = 0; }
367 contig_array[newIndex].bal_edge = bal_ed + 1;
368 contig_array[newIndex].downwardConnect = NULL;
369 contig_array[newIndex].mask = 0;
370 contig_array[newIndex].flag = 0;
371 contig_array[newIndex].arcs = NULL;
372 contig_array[newIndex].seq = NULL;
373 contig_array[newIndex].multi = 0;
374 contig_array[newIndex].inSubGraph = 0;
375 contig_array[newIndex].bubbleInScaff = 0;
376 contig_array[newIndex].cvg = cvg / 10;
378 if ( cvg && length > 100 )
380 counter += length - cut_len;
381 cvgSum += cvg * ( length - cut_len );
384 fprintf ( out_fp, "%d %d %d\n", index, newIndex, contig_array[newIndex].bal_edge );
388 if ( counter )
390 cvgAvg = cvgSum / counter / 10 > 2 ? cvgSum / counter / 10 : 3;
393 //mark repeats
394 int bal_i;
396 if ( maskRep )
398 high_cvg_cutoff1 = cvg_high * cvgAvg;
399 high_cvg_cutoff2 = cvg_high * cvgAvg * 0.8;
400 low_cvg_cutoff = cvg_low * cvgAvg;
401 counter = 0;
402 fprintf ( stderr, "Mask contigs with coverage lower than %.1f or higher than %.1f, and strict length %d.\n", low_cvg_cutoff, high_cvg_cutoff1, ctg_short_cutoff );
404 for ( i = 1; i <= num_ctg; i++ )
406 bal_i = getTwinCtg ( i );
408 if ( ( contig_array[i].cvg + contig_array[bal_i].cvg ) > 2 * high_cvg_cutoff1 )
410 contig_array[i].mask = 1;
411 contig_array[bal_i].mask = 1;
413 if ( i == bal_i ) { counter += 1; }
414 else { counter += 2; }
416 else if ( contig_array[i].length < ctg_short_cutoff && ( contig_array[i].cvg > high_cvg_cutoff2 || contig_array[bal_i].cvg > high_cvg_cutoff2 || ( contig_array[i].cvg < low_cvg_cutoff && contig_array[bal_i].cvg < low_cvg_cutoff ) ) )
418 contig_array[i].mask = 1;
419 contig_array[bal_i].mask = 1;
421 if ( i == bal_i ) { counter += 1; }
422 else { counter += 2; }
424 else if ( cvgAvg < 50 && ( contig_array[i].cvg >= 63 || contig_array[bal_i].cvg >= 63 ) )
426 contig_array[i].mask = 1;
427 contig_array[bal_i].mask = 1;
429 if ( i == bal_i ) { counter += 1; }
430 else { counter += 2; }
433 if ( isSmallerThanTwin ( i ) )
435 i++;
439 fprintf ( stderr, "Average contig coverage is %d, %lld contig(s) masked.\n", cvgAvg, counter );
442 counter = 0;
444 for ( i = 1; i <= num_ctg; i++ )
446 if ( contig_array[i].mask )
448 continue;
451 bal_i = getTwinCtg ( i );
453 if ( contig_array[i].length < ctg_short )
455 contig_array[i].mask = 1;
456 contig_array[bal_i].mask = 1;
458 if ( i == bal_i ) { counter += 1; }
459 else { counter += 2; }
462 if ( isSmallerThanTwin ( i ) )
464 i++;
468 fprintf ( stderr, "Mask contigs shorter than %d, %lld contig(s) masked.\n", ctg_short, counter );
469 avg_arc_wt = loadArcs ( graphfile );
470 counter = 0;
471 //counter = maskRepeatByArc(avg_arc_wt);
472 //printf ("Mask contigs with multi arcs, %d contig masked\n", counter);
473 //tipsCount();
474 loadContig ( graphfile );
475 fprintf ( stderr, "Done loading updated edges.\n" );
476 free ( ( void * ) length_array );
477 free ( ( void * ) flag_array );
478 fclose ( fp );
479 fclose ( out_fp );
482 static void add1Arc ( unsigned int from_ed, unsigned int to_ed, unsigned int weight )
484 preARC * parc;
485 unsigned int from_c = index_array[from_ed];
486 unsigned int to_c = index_array[to_ed];
487 parc = allocatePreArc ( to_c );
488 parc->multiplicity = weight;
489 parc->next = contig_array[from_c].arcs;
490 contig_array[from_c].arcs = parc;
493 /*************************************************
494 Function:
495 loadArcs
496 Description:
497 Loads arc information of contigs and calculates the average weight of arcs.
498 Input:
499 1. graphfile: prefix of graph file
500 Output:
501 None.
502 Return:
503 The average weight of arcs.
504 *************************************************/
505 static unsigned int loadArcs ( char * graphfile )
507 FILE * fp;
508 char name[256], line[1024];
509 unsigned int target, weight;
510 unsigned int from_ed;
511 char * seg;
512 unsigned int avg_weight = 0, weight_sum = 0, arc_num = 0;
513 sprintf ( name, "%s.Arc", graphfile );
514 fp = ckopen ( name, "r" );
515 createPreArcMemManager ();
516 arcCounter = 0;
518 while ( fgets ( line, sizeof ( line ), fp ) != NULL )
520 seg = strtok ( line, " " );
521 from_ed = atoi ( seg );
523 //printf("%d\n",from_ed);
524 while ( ( seg = strtok ( NULL, " " ) ) != NULL )
526 target = atoi ( seg );
527 seg = strtok ( NULL, " " );
528 weight = atoi ( seg );
529 add1Arc ( from_ed, target, weight );
531 if ( !contig_array[index_array[from_ed]].mask && !contig_array[index_array[target]].mask )
533 weight_sum += weight;
534 ++arc_num;
539 if ( arc_num )
541 avg_weight = weight_sum / arc_num;
544 fprintf ( stderr, "%lld arc(s) loaded, average weight is %u.\n", arcCounter, avg_weight );
545 fclose ( fp );
546 return avg_weight;
549 /*************************************************
550 Function:
551 loadContig
552 Description:
553 Loads contigs sequence.
554 Input:
555 1. graphfile: prefix of graph file
556 Output:
557 None.
558 Return:
559 None.
560 *************************************************/
561 void loadContig ( char * graphfile )
563 char c, name[256], line[1024], *tightSeq = NULL;
564 FILE * fp;
565 int n = 0, length, index = -1, edgeno;
566 unsigned int i;
567 unsigned int newIndex;
568 sprintf ( name, "%s.contig", graphfile );
569 fp = ckopen ( name, "r" );
571 while ( fgets ( line, sizeof ( line ), fp ) != NULL )
573 if ( line[0] == '>' )
575 if ( index >= 0 )
577 newIndex = index_array[edgeno];
578 contig_array[newIndex].seq = tightSeq;
581 n = 0;
582 index++;
583 sscanf ( line + 1, "%d %s %d", &edgeno, name, &length );
584 //printf("contig %d, length %d\n",edgeno,length);
585 tightSeq = ( char * ) ckalloc ( ( length / 4 + 1 ) * sizeof ( char ) );
587 else
589 for ( i = 0; i < strlen ( line ); i++ )
591 if ( line[i] >= 'a' && line[i] <= 'z' )
593 c = base2int ( line[i] - 'a' + 'A' );
594 writeChar2tightString ( c, tightSeq, n++ );
596 else if ( line[i] >= 'A' && line[i] <= 'Z' )
598 c = base2int ( line[i] );
599 writeChar2tightString ( c, tightSeq, n++ );
605 if ( index >= 0 )
607 newIndex = index_array[edgeno];
608 contig_array[newIndex].seq = tightSeq;
611 fprintf ( stderr, "%d contig(s) loaded.\n", index + 1 );
612 fclose ( fp );
613 //printf("the %dth contig with index 107\n",index);
616 void freeContig_array ()
618 if ( !contig_array )
620 return;
623 unsigned int i;
625 for ( i = 1; i <= num_ctg; i++ )
627 if ( contig_array[i].seq )
629 free ( ( void * ) contig_array[i].seq );
632 if ( contig_array[i].closeReads )
634 freeStack ( contig_array[i].closeReads );
638 free ( ( void * ) contig_array );
639 contig_array = NULL;
643 void loadCvg(char *graphfile)
645 char name[256],line[1024];
646 FILE *fp;
647 int cvg;
648 unsigned int newIndex,edgeno,bal_ctg;
650 sprintf(name,"%s.contigCVG",graphfile);
651 fp = fopen(name,"r");
652 if(!fp){
653 printf("contig coverage file %s is not found!\n",name);
654 return;
657 while(fgets(line,sizeof(line),fp)!=NULL){
658 if(line[0]=='>'){
659 sscanf(line+1,"%d %d",&edgeno,&cvg);
660 newIndex = index_array[edgeno];
661 cvg = cvg <= 255 ? cvg:255;
662 contig_array[newIndex].multi = cvg;
663 bal_ctg = getTwinCtg(newIndex);
664 contig_array[bal_ctg].multi= cvg;
667 fclose(fp);