4 * Copyright (c) 2008-2012 BGI-Shenzhen <soap at genomics dot org dot cn>.
6 * This file is part of SOAPdenovo.
8 * SOAPdenovo is free software: you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation, either version 3 of the License, or
11 * (at your option) any later version.
13 * SOAPdenovo is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
18 * You should have received a copy of the GNU General Public License
19 * along with SOAPdenovo. If not, see <http://www.gnu.org/licenses/>.
29 #define preARCBLOCKSIZE 100000
31 static unsigned int loadArcs ( char * graphfile
);
32 static void loadContig ( char * graphfile
);
33 static int maskRepeatByArc ( unsigned avg_weight
);
36 void loadUpdatedVertex (char *graphfile)
38 char name[256], line[256];
44 sprintf (name, "%s.updated.vertex", graphfile);
45 fp = ckopen (name, "r");
47 while (fgets (line, sizeof (line), fp) != NULL)
51 sscanf (line + 6, "%d %c %d", &num_kmer, &ch, &overlaplen);
52 printf ("there're %d kmers in vertex file\n", num_kmer);
57 vt_array = (VERTEX *) ckalloc ((2 * num_kmer) * sizeof (VERTEX));
59 for (i = 0; i < num_kmer; i++)
61 fscanf (fp, "%llx ", &word);
62 vt_array[i].kmer = word;
67 for (i = 0; i < num_kmer; i++)
69 bal_word = reverseComplement (vt_array[i].kmer, overlaplen);
70 vt_array[i + num_kmer].kmer = bal_word;
76 int uniqueLenSearch ( unsigned int * len_array
, unsigned int * flag_array
, int num
, unsigned int target
)
84 mid
= ( low
+ high
) / 2;
86 if ( len_array
[mid
] == target
)
90 else if ( target
> len_array
[mid
] )
105 //locate the first same length unflaged
106 return flag_array
[mid
]++;
109 int lengthSearch ( unsigned int * len_array
, unsigned int * flag_array
, int num
, unsigned int target
)
111 int mid
, low
, high
, i
;
115 while ( low
<= high
)
117 mid
= ( low
+ high
) / 2;
119 if ( len_array
[mid
] == target
)
123 else if ( target
> len_array
[mid
] )
138 //locate the first same length unflaged
139 if ( !flag_array
[mid
] )
141 for ( i
= mid
- 1; i
> 0; i
-- )
143 if ( len_array
[i
] != len_array
[mid
] || flag_array
[i
] )
149 flag_array
[i
+ 1] = 1;
154 for ( i
= mid
+ 1; i
<= num
; i
++ )
156 if ( !flag_array
[i
] )
167 void quick_sort_int ( unsigned int * length_array
, int low
, int high
)
174 pivot
= length_array
[low
];
180 while ( i
< j
&& length_array
[j
] >= pivot
)
187 length_array
[i
++] = length_array
[j
];
190 while ( i
< j
&& length_array
[i
] <= pivot
)
197 length_array
[j
--] = length_array
[i
];
201 length_array
[i
] = pivot
;
202 quick_sort_int ( length_array
, low
, i
- 1 );
203 quick_sort_int ( length_array
, i
+ 1, high
);
207 static int maskRepeatByArc ( unsigned avg_weight
)
209 unsigned int i
, bal_i
;
212 unsigned int arc_weight1
, arc_weight2
;
215 for ( i
= 1; i
<= num_ctg
; ++i
)
217 if ( contig_array
[i
].mask
== 1 )
219 if ( isSmallerThanTwin ( i
) )
227 bal_i
= getTwinCtg ( i
);
228 arc
= contig_array
[bal_i
].arcs
;
229 arc_weight1
= maxArcWeight ( arc
);
230 arc
= contig_array
[i
].arcs
;
231 arc_weight2
= maxArcWeight ( arc
);
233 if ( arc_weight1
+ arc_weight2
>= 4 * avg_weight
)
235 contig_array
[i
].mask
= 1;
236 contig_array
[bal_i
].mask
= 1;
238 if ( i
== bal_i
) { counter
+= 1; }
239 else { counter
+= 2; }
242 if ( isSmallerThanTwin ( i
) )
251 /*************************************************
255 Loads contig information and masks some contigs according to setting.
257 1. graphfile: prefix of graph file
262 *************************************************/
263 void loadUpdatedEdges ( char * graphfile
)
265 char c
, name
[256], line
[1024];
268 Kmer from_kmer
, to_kmer
;
269 unsigned int num_ctgge
, length
, index
= 0, num_kmer
;
270 unsigned int i
= 0, j
;
272 unsigned int * length_array
, *flag_array
, diff_len
;
273 char * outfile
= graphfile
;
274 long long cvgSum
= 0;
275 long long counter
= 0;
276 unsigned int avg_arc_wt
;
277 int ctg_short_cutoff
;
278 float high_cvg_cutoff1
, high_cvg_cutoff2
, low_cvg_cutoff
;
280 //get overlaplen from *.preGraphBasic
281 sprintf ( name
, "%s.preGraphBasic", graphfile
);
282 fp
= ckopen ( name
, "r" );
284 while ( fgets ( line
, sizeof ( line
), fp
) != NULL
)
286 if ( line
[0] == 'V' )
288 sscanf ( line
+ 6, "%d %c %d", &num_kmer
, &c
, &overlaplen
);
289 fprintf ( stderr
, "Kmer size: %d\n", overlaplen
);
294 cut_len
= COMPATIBLE_MODE
== 0 ? overlaplen
: 0;
296 if ( ctg_short
== 0 )
298 ctg_short
= overlaplen
+ 2;
301 ctg_short_cutoff
= 2 * overlaplen
+ 2 < 100 ? 100 : 0;
303 sprintf ( name
, "%s.updated.edge", graphfile
);
304 fp
= ckopen ( name
, "r" );
305 sprintf ( name
, "%s.newContigIndex", outfile
);
306 out_fp
= ckopen ( name
, "w" );
308 while ( fgets ( line
, sizeof ( line
), fp
) != NULL
)
310 if ( line
[0] == 'E' )
312 sscanf ( line
+ 5, "%d", &num_ctgge
);
313 fprintf ( stderr
, "There are %d edge(s) in edge file.\n", num_ctgge
);
318 index_array
= ( unsigned int * ) ckalloc ( ( num_ctgge
+ 1 ) * sizeof ( unsigned int ) );
319 length_array
= ( unsigned int * ) ckalloc ( ( num_ctgge
+ 1 ) * sizeof ( unsigned int ) );
320 flag_array
= ( unsigned int * ) ckalloc ( ( num_ctgge
+ 1 ) * sizeof ( unsigned int ) );
322 while ( fgets ( line
, sizeof ( line
), fp
) != NULL
)
324 if ( line
[0] == '>' )
326 sscanf ( line
+ 7, "%d", &length
);
327 index_array
[++index
] = length
;
328 length_array
[++i
] = length
;
334 qsort ( & ( length_array
[1] ), num_ctg
, sizeof ( length_array
[0] ), cmp_int
);
335 //extract unique length
338 for ( i
= 1; i
<= num_ctg
; i
++ )
340 for ( j
= i
+ 1; j
<= num_ctg
; j
++ )
341 if ( length_array
[j
] != length_array
[i
] )
346 length_array
[++diff_len
] = length_array
[i
];
347 flag_array
[diff_len
] = i
;
351 contig_array
= ( CONTIG
* ) ckalloc ( ( num_ctg
+ 1 ) * sizeof ( CONTIG
) );
356 while ( fgets ( line
, sizeof ( line
), fp
) != NULL
)
358 if ( line
[0] == '>' )
360 sscanf ( line
, ">length %u,%d,%d", &length
, &bal_ed
, &cvg
);
361 newIndex
= uniqueLenSearch ( length_array
, flag_array
, diff_len
, length
);
362 index_array
[++index
] = newIndex
;
364 if ( length
!= 0 ) { contig_array
[newIndex
].length
= length
- cut_len
; }
365 else { contig_array
[newIndex
].length
= 0; }
367 contig_array
[newIndex
].bal_edge
= bal_ed
+ 1;
368 contig_array
[newIndex
].downwardConnect
= NULL
;
369 contig_array
[newIndex
].mask
= 0;
370 contig_array
[newIndex
].flag
= 0;
371 contig_array
[newIndex
].arcs
= NULL
;
372 contig_array
[newIndex
].seq
= NULL
;
373 contig_array
[newIndex
].multi
= 0;
374 contig_array
[newIndex
].inSubGraph
= 0;
375 contig_array
[newIndex
].bubbleInScaff
= 0;
376 contig_array
[newIndex
].cvg
= cvg
/ 10;
378 if ( cvg
&& length
> 100 )
380 counter
+= length
- cut_len
;
381 cvgSum
+= cvg
* ( length
- cut_len
);
384 fprintf ( out_fp
, "%d %d %d\n", index
, newIndex
, contig_array
[newIndex
].bal_edge
);
390 cvgAvg
= cvgSum
/ counter
/ 10 > 2 ? cvgSum
/ counter
/ 10 : 3;
398 high_cvg_cutoff1
= cvg_high
* cvgAvg
;
399 high_cvg_cutoff2
= cvg_high
* cvgAvg
* 0.8;
400 low_cvg_cutoff
= cvg_low
* cvgAvg
;
402 fprintf ( stderr
, "Mask contigs with coverage lower than %.1f or higher than %.1f, and strict length %d.\n", low_cvg_cutoff
, high_cvg_cutoff1
, ctg_short_cutoff
);
404 for ( i
= 1; i
<= num_ctg
; i
++ )
406 bal_i
= getTwinCtg ( i
);
408 if ( ( contig_array
[i
].cvg
+ contig_array
[bal_i
].cvg
) > 2 * high_cvg_cutoff1
)
410 contig_array
[i
].mask
= 1;
411 contig_array
[bal_i
].mask
= 1;
413 if ( i
== bal_i
) { counter
+= 1; }
414 else { counter
+= 2; }
416 else if ( contig_array
[i
].length
< ctg_short_cutoff
&& ( contig_array
[i
].cvg
> high_cvg_cutoff2
|| contig_array
[bal_i
].cvg
> high_cvg_cutoff2
|| ( contig_array
[i
].cvg
< low_cvg_cutoff
&& contig_array
[bal_i
].cvg
< low_cvg_cutoff
) ) )
418 contig_array
[i
].mask
= 1;
419 contig_array
[bal_i
].mask
= 1;
421 if ( i
== bal_i
) { counter
+= 1; }
422 else { counter
+= 2; }
424 else if ( cvgAvg
< 50 && ( contig_array
[i
].cvg
>= 63 || contig_array
[bal_i
].cvg
>= 63 ) )
426 contig_array
[i
].mask
= 1;
427 contig_array
[bal_i
].mask
= 1;
429 if ( i
== bal_i
) { counter
+= 1; }
430 else { counter
+= 2; }
433 if ( isSmallerThanTwin ( i
) )
439 fprintf ( stderr
, "Average contig coverage is %d, %lld contig(s) masked.\n", cvgAvg
, counter
);
444 for ( i
= 1; i
<= num_ctg
; i
++ )
446 if ( contig_array
[i
].mask
)
451 bal_i
= getTwinCtg ( i
);
453 if ( contig_array
[i
].length
< ctg_short
)
455 contig_array
[i
].mask
= 1;
456 contig_array
[bal_i
].mask
= 1;
458 if ( i
== bal_i
) { counter
+= 1; }
459 else { counter
+= 2; }
462 if ( isSmallerThanTwin ( i
) )
468 fprintf ( stderr
, "Mask contigs shorter than %d, %lld contig(s) masked.\n", ctg_short
, counter
);
469 avg_arc_wt
= loadArcs ( graphfile
);
471 //counter = maskRepeatByArc(avg_arc_wt);
472 //printf ("Mask contigs with multi arcs, %d contig masked\n", counter);
474 loadContig ( graphfile
);
475 fprintf ( stderr
, "Done loading updated edges.\n" );
476 free ( ( void * ) length_array
);
477 free ( ( void * ) flag_array
);
482 static void add1Arc ( unsigned int from_ed
, unsigned int to_ed
, unsigned int weight
)
485 unsigned int from_c
= index_array
[from_ed
];
486 unsigned int to_c
= index_array
[to_ed
];
487 parc
= allocatePreArc ( to_c
);
488 parc
->multiplicity
= weight
;
489 parc
->next
= contig_array
[from_c
].arcs
;
490 contig_array
[from_c
].arcs
= parc
;
493 /*************************************************
497 Loads arc information of contigs and calculates the average weight of arcs.
499 1. graphfile: prefix of graph file
503 The average weight of arcs.
504 *************************************************/
505 static unsigned int loadArcs ( char * graphfile
)
508 char name
[256], line
[1024];
509 unsigned int target
, weight
;
510 unsigned int from_ed
;
512 unsigned int avg_weight
= 0, weight_sum
= 0, arc_num
= 0;
513 sprintf ( name
, "%s.Arc", graphfile
);
514 fp
= ckopen ( name
, "r" );
515 createPreArcMemManager ();
518 while ( fgets ( line
, sizeof ( line
), fp
) != NULL
)
520 seg
= strtok ( line
, " " );
521 from_ed
= atoi ( seg
);
523 //printf("%d\n",from_ed);
524 while ( ( seg
= strtok ( NULL
, " " ) ) != NULL
)
526 target
= atoi ( seg
);
527 seg
= strtok ( NULL
, " " );
528 weight
= atoi ( seg
);
529 add1Arc ( from_ed
, target
, weight
);
531 if ( !contig_array
[index_array
[from_ed
]].mask
&& !contig_array
[index_array
[target
]].mask
)
533 weight_sum
+= weight
;
541 avg_weight
= weight_sum
/ arc_num
;
544 fprintf ( stderr
, "%lld arc(s) loaded, average weight is %u.\n", arcCounter
, avg_weight
);
549 /*************************************************
553 Loads contigs sequence.
555 1. graphfile: prefix of graph file
560 *************************************************/
561 void loadContig ( char * graphfile
)
563 char c
, name
[256], line
[1024], *tightSeq
= NULL
;
565 int n
= 0, length
, index
= -1, edgeno
;
567 unsigned int newIndex
;
568 sprintf ( name
, "%s.contig", graphfile
);
569 fp
= ckopen ( name
, "r" );
571 while ( fgets ( line
, sizeof ( line
), fp
) != NULL
)
573 if ( line
[0] == '>' )
577 newIndex
= index_array
[edgeno
];
578 contig_array
[newIndex
].seq
= tightSeq
;
583 sscanf ( line
+ 1, "%d %s %d", &edgeno
, name
, &length
);
584 //printf("contig %d, length %d\n",edgeno,length);
585 tightSeq
= ( char * ) ckalloc ( ( length
/ 4 + 1 ) * sizeof ( char ) );
589 for ( i
= 0; i
< strlen ( line
); i
++ )
591 if ( line
[i
] >= 'a' && line
[i
] <= 'z' )
593 c
= base2int ( line
[i
] - 'a' + 'A' );
594 writeChar2tightString ( c
, tightSeq
, n
++ );
596 else if ( line
[i
] >= 'A' && line
[i
] <= 'Z' )
598 c
= base2int ( line
[i
] );
599 writeChar2tightString ( c
, tightSeq
, n
++ );
607 newIndex
= index_array
[edgeno
];
608 contig_array
[newIndex
].seq
= tightSeq
;
611 fprintf ( stderr
, "%d contig(s) loaded.\n", index
+ 1 );
613 //printf("the %dth contig with index 107\n",index);
616 void freeContig_array ()
625 for ( i
= 1; i
<= num_ctg
; i
++ )
627 if ( contig_array
[i
].seq
)
629 free ( ( void * ) contig_array
[i
].seq
);
632 if ( contig_array
[i
].closeReads
)
634 freeStack ( contig_array
[i
].closeReads
);
638 free ( ( void * ) contig_array
);
643 void loadCvg(char *graphfile)
645 char name[256],line[1024];
648 unsigned int newIndex,edgeno,bal_ctg;
650 sprintf(name,"%s.contigCVG",graphfile);
651 fp = fopen(name,"r");
653 printf("contig coverage file %s is not found!\n",name);
657 while(fgets(line,sizeof(line),fp)!=NULL){
659 sscanf(line+1,"%d %d",&edgeno,&cvg);
660 newIndex = index_array[edgeno];
661 cvg = cvg <= 255 ? cvg:255;
662 contig_array[newIndex].multi = cvg;
663 bal_ctg = getTwinCtg(newIndex);
664 contig_array[bal_ctg].multi= cvg;