4 * Copyright (c) 2008-2012 BGI-Shenzhen <soap at genomics dot org dot cn>.
6 * This file is part of SOAPdenovo.
8 * SOAPdenovo is free software: you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation, either version 3 of the License, or
11 * (at your option) any later version.
13 * SOAPdenovo is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
18 * You should have received a copy of the GNU General Public License
19 * along with SOAPdenovo. If not, see <http://www.gnu.org/licenses/>.
28 static char * kmerSeq
;
29 static unsigned int * flag_array
;
30 void output_graph ( char * outfile
)
34 unsigned int i
, bal_i
;
35 sprintf ( name
, "%s.edge.gvz", outfile
);
36 fp
= ckopen ( name
, "w" );
37 fprintf ( fp
, "digraph G{\n" );
38 fprintf ( fp
, "\tsize=\"512,512\";\n" );
40 for ( i
= num_ed
; i
> 0; i
-- )
42 if ( edge_array
[i
].deleted
)
52 bal_i
= getTwinEdge ( i
);
54 arcCount(bal_i,&arcNum);
58 fprintf ( fp
, "\tV%d -> V%d[label =\"%d(%d)\"];\n", edge_array
[i
].from_vt
, edge_array
[i
].to_vt
, i
, edge_array
[i
].length
);
61 fprintf ( fp
, "}\n" );
65 static void output_1contig ( int id
, EDGE
* edge
, FILE * fp
, boolean tip
)
69 fprintf ( fp
, ">%d length %d cvg_%.1f_tip_%d\n", id
, edge
->length
+ overlaplen
, ( double ) edge
->cvg
/ 10, tip
);
70 //fprintf(fp,">%d\n",id);
71 kmer
= vt_array
[edge
->from_vt
].kmer
;
72 printKmerSeq ( fp
, kmer
);
74 for ( i
= 0; i
< edge
->length
; i
++ )
76 fprintf ( fp
, "%c", int2base ( ( int ) getCharInTightString ( edge
->seq
, i
) ) );
78 if ( ( i
+ overlaplen
+ 1 ) % 100 == 0 )
84 if ( ( edge
->length
+ overlaplen
) % 100 != 0 )
90 int cmp_int ( const void * a
, const void * b
)
110 int cmp_edge ( const void * a
, const void * b
)
116 if ( A
->length
> B
->length
)
120 else if ( A
->length
== B
->length
)
130 /*************************************************
134 1. Sorts the edges by the length.
135 2. Makes statistic of the contig.
136 3. Outputs the info of the contig.
138 1. ed_array: edges array
139 2. ed_num: the count of the edge
140 3. outfile: the output file prefix
141 4. cut_len: cutoff length
146 *************************************************/
147 void output_contig ( EDGE
* ed_array
, unsigned int ed_num
, char * outfile
, int cut_len
)
150 FILE * fp
, *fp_contig
;
151 int flag
, count
, len_c
;
153 unsigned int i
, j
, diff_len
= 0;
154 long long sum
= 0, N90
, N50
;
155 unsigned int * length_array
;
157 sprintf ( temp
, "%s.contig", outfile
);
158 fp
= ckopen ( temp
, "w" );
159 unsigned int * all_length_arr
= ( unsigned int * ) ckalloc ( ( ed_num
+ 1 ) * sizeof ( unsigned int ) );
160 index_array
= ( unsigned int * ) ckalloc ( ( ed_num
+ 1 ) * sizeof ( unsigned int ) );
161 flag_array
= ( unsigned int * ) ckalloc ( ( ed_num
+ 1 ) * sizeof ( unsigned int ) );
163 for ( i
= 1; i
<= ed_num
; ++i
)
165 index_array
[i
] = ed_array
[i
].length
;
166 all_length_arr
[i
] = ed_array
[i
].length
;
169 qsort ( &all_length_arr
[1], ed_num
, sizeof ( all_length_arr
[0] ), cmp_int
);
171 for ( i
= 1; i
<= ed_num
; ++i
)
173 for ( j
= i
+ 1; j
<= ed_num
; ++j
)
175 if ( all_length_arr
[i
] != all_length_arr
[j
] )
179 all_length_arr
[++diff_len
] = all_length_arr
[i
];
180 flag_array
[diff_len
] = i
;
184 for ( i
= 1; i
<= ed_num
; ++i
)
186 index_array
[i
] = uniqueLenSearch ( all_length_arr
, flag_array
, diff_len
, index_array
[i
] );
189 for ( i
= 1; i
<= ed_num
; ++i
)
191 flag_array
[index_array
[i
]] = i
;
194 free ( ( void * ) all_length_arr
);
195 length_array
= ( unsigned int * ) ckalloc ( ed_num
* sizeof ( unsigned int ) );
196 kmerSeq
= ( char * ) ckalloc ( overlaplen
* sizeof ( char ) );
197 //first scan for number counting
200 for ( i
= 1; i
<= ed_num
; i
++ )
202 if ( ( ed_array
[i
].length
+ overlaplen
) >= len_bar
)
204 length_array
[len_c
++] = ed_array
[i
].length
+ overlaplen
;
207 if ( ed_array
[i
].length
< 1 || ed_array
[i
].deleted
)
214 if ( EdSmallerThanTwin ( i
) )
222 for ( signI
= len_c
- 1; signI
>= 0; signI
-- )
224 sum
+= length_array
[signI
];
227 qsort ( length_array
, len_c
, sizeof ( length_array
[0] ), cmp_int
);
231 fprintf ( stderr
, "\nThere are %d contig(s) longer than %d, sum up %lld bp, with average length %lld.\n", len_c
, len_bar
, sum
, sum
/ len_c
);
232 fprintf ( stderr
, "The longest length is %d bp, ", length_array
[len_c
- 1] );
236 fprintf ( stderr
, "No contig was constructed!\n" );
243 for ( signI
= len_c
- 1; signI
>= 0; signI
-- )
245 sum
+= length_array
[signI
];
247 if ( !flag
&& sum
>= N50
)
249 fprintf ( stderr
, "contig N50 is %d bp,", length_array
[signI
] );
255 fprintf ( stderr
, "contig N90 is %d bp.\n", length_array
[signI
] );
260 for ( i
= 1; i
<= ed_num
; i
++ )
264 if ( ed_array
[j
].deleted
|| ed_array
[j
].length
< 1 )
269 if ( ed_array
[j
].arcs
&& ed_array
[getTwinEdge ( j
)].arcs
)
278 output_1contig ( i
, & ( ed_array
[j
] ), fp
, tip
);
280 if ( EdSmallerThanTwin ( j
) )
287 free ( ( void * ) kmerSeq
);
288 free ( ( void * ) length_array
);
289 fprintf ( stderr
, "%d contig(s) longer than %d output.\n", count
, cut_len
);
290 sprintf ( temp
, "%s.ContigIndex", outfile
);
291 fp_contig
= ckopen ( temp
, "w" );
292 fprintf ( fp_contig
, "Edge_num %d %d\n", ed_num
, count
);
293 fprintf ( fp_contig
, "index\tlength\treverseComplement\n" );
295 for ( i
= 1; i
<= num_ed
; i
++ )
298 fprintf ( fp_contig
, "%d\t%d\t", i
, edge_array
[j
].length
+ overlaplen
);
300 if ( EdSmallerThanTwin ( j
) )
302 fprintf ( fp_contig
, "1\n" );
305 else if ( EdLargerThanTwin ( j
) )
307 fprintf ( fp_contig
, "-1\n" );
311 fprintf ( fp_contig
, "0\n" );
315 fclose ( fp_contig
);
318 /*************************************************
322 Outputs the info of the edges.
324 1. outfile: the output file prefix
329 *************************************************/
331 void output_updated_edges ( char * outfile
)
335 unsigned int i
, j
, validCounter
= 0;
337 sprintf ( name
, "%s.updated.edge", outfile
);
338 fp
= ckopen ( name
, "w" );
340 for ( i
= 1; i
<= num_ed
; i
++ )
345 fprintf ( fp
, "EDGEs %d\n", validCounter
);
348 for ( i
= 1; i
<= num_ed
; i
++ )
351 edge
= &edge_array
[j
];
353 if ( edge
->length
!= 0 )
354 { fprintf ( fp
, ">length %d,", edge
->length
+ overlaplen
); }
355 else { fprintf ( fp
, ">length %d,", edge
->length
); }
357 if ( EdSmallerThanTwin ( j
) )
359 fprintf ( fp
, "1," );
361 else if ( EdLargerThanTwin ( j
) )
363 fprintf ( fp
, "-1," );
367 fprintf ( fp
, "0," );
370 fprintf ( fp
, "%d,", edge
->cvg
);
371 print_kmer ( fp
, vt_array
[edge
->from_vt
].kmer
, ',' );
372 print_kmer ( fp
, vt_array
[edge
->to_vt
].kmer
, ',' );
373 fprintf ( fp
, "\n" );
379 /*************************************************
383 Outputs the info of arcs.
385 1. outfile: the output file prefix
390 *************************************************/
391 void output_heavyArcs ( char * outfile
)
397 sprintf ( name
, "%s.Arc", outfile
);
398 outfp
= ckopen ( name
, "w" );
400 for ( i
= 1; i
<= num_ed
; i
++ )
402 parc
= edge_array
[flag_array
[i
]].arcs
;
410 fprintf ( outfp
, "%u", i
);
414 fprintf ( outfp
, " %u %u", index_array
[parc
->to_ed
], parc
->multiplicity
);
416 if ( ( ++j
) % 10 == 0 )
418 fprintf ( outfp
, "\n%u", i
);
424 fprintf ( outfp
, "\n" );
428 free ( ( void * ) index_array
);
429 free ( ( void * ) flag_array
);