modified: makefile
[GalaxyCodeBases.git] / BGI / SOAPdenovo2 / standardPregraph / output_contig.c
blobde5d0c563af0a5f00b29036e2d3fb8dca83f9f37
1 /*
2 * output_contig.c
4 * Copyright (c) 2008-2012 BGI-Shenzhen <soap at genomics dot org dot cn>.
6 * This file is part of SOAPdenovo.
8 * SOAPdenovo is free software: you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation, either version 3 of the License, or
11 * (at your option) any later version.
13 * SOAPdenovo is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
18 * You should have received a copy of the GNU General Public License
19 * along with SOAPdenovo. If not, see <http://www.gnu.org/licenses/>.
23 #include "stdinc.h"
24 #include "newhash.h"
25 #include "kmerhash.h"
26 #include "extfunc.h"
27 #include "extvab.h"
28 static char * kmerSeq;
29 static unsigned int * flag_array;
30 void output_graph ( char * outfile )
32 char name[256];
33 FILE * fp;
34 unsigned int i, bal_i;
35 sprintf ( name, "%s.edge.gvz", outfile );
36 fp = ckopen ( name, "w" );
37 fprintf ( fp, "digraph G{\n" );
38 fprintf ( fp, "\tsize=\"512,512\";\n" );
40 for ( i = num_ed; i > 0; i-- )
42 if ( edge_array[i].deleted )
44 continue;
48 arcCount(i,&arcNum);
49 if(arcNum<1)
50 continue;
52 bal_i = getTwinEdge ( i );
54 arcCount(bal_i,&arcNum);
55 if(arcNum<1)
56 continue;
58 fprintf ( fp, "\tV%d -> V%d[label =\"%d(%d)\"];\n", edge_array[i].from_vt, edge_array[i].to_vt, i, edge_array[i].length );
61 fprintf ( fp, "}\n" );
62 fclose ( fp );
65 static void output_1contig ( int id, EDGE * edge, FILE * fp, boolean tip )
67 int i;
68 Kmer kmer;
69 fprintf ( fp, ">%d length %d cvg_%.1f_tip_%d\n", id, edge->length + overlaplen, ( double ) edge->cvg / 10, tip );
70 //fprintf(fp,">%d\n",id);
71 kmer = vt_array[edge->from_vt].kmer;
72 printKmerSeq ( fp, kmer );
74 for ( i = 0; i < edge->length; i++ )
76 fprintf ( fp, "%c", int2base ( ( int ) getCharInTightString ( edge->seq, i ) ) );
78 if ( ( i + overlaplen + 1 ) % 100 == 0 )
80 fprintf ( fp, "\n" );
84 if ( ( edge->length + overlaplen ) % 100 != 0 )
86 fprintf ( fp, "\n" );
90 int cmp_int ( const void * a, const void * b )
92 int * A, *B;
93 A = ( int * ) a;
94 B = ( int * ) b;
96 if ( *A > *B )
98 return 1;
100 else if ( *A == *B )
102 return 0;
104 else
106 return -1;
110 int cmp_edge ( const void * a, const void * b )
112 EDGE * A, *B;
113 A = ( EDGE * ) a;
114 B = ( EDGE * ) b;
116 if ( A->length > B->length )
118 return 1;
120 else if ( A->length == B->length )
122 return 0;
124 else
126 return -1;
130 /*************************************************
131 Function:
132 output_contig
133 Description:
134 1. Sorts the edges by the length.
135 2. Makes statistic of the contig.
136 3. Outputs the info of the contig.
137 Input:
138 1. ed_array: edges array
139 2. ed_num: the count of the edge
140 3. outfile: the output file prefix
141 4. cut_len: cutoff length
142 Output:
143 None.
144 Return:
145 None.
146 *************************************************/
147 void output_contig ( EDGE * ed_array, unsigned int ed_num, char * outfile, int cut_len )
149 char temp[256];
150 FILE * fp, *fp_contig;
151 int flag, count, len_c;
152 int signI;
153 unsigned int i, j, diff_len = 0;
154 long long sum = 0, N90, N50;
155 unsigned int * length_array;
156 boolean tip;
157 sprintf ( temp, "%s.contig", outfile );
158 fp = ckopen ( temp, "w" );
159 unsigned int * all_length_arr = ( unsigned int * ) ckalloc ( ( ed_num + 1 ) * sizeof ( unsigned int ) );
160 index_array = ( unsigned int * ) ckalloc ( ( ed_num + 1 ) * sizeof ( unsigned int ) );
161 flag_array = ( unsigned int * ) ckalloc ( ( ed_num + 1 ) * sizeof ( unsigned int ) );
163 for ( i = 1; i <= ed_num; ++i )
165 index_array[i] = ed_array[i].length;
166 all_length_arr[i] = ed_array[i].length;
169 qsort ( &all_length_arr[1], ed_num, sizeof ( all_length_arr[0] ), cmp_int );
171 for ( i = 1; i <= ed_num; ++i )
173 for ( j = i + 1; j <= ed_num; ++j )
175 if ( all_length_arr[i] != all_length_arr[j] )
176 { break; }
179 all_length_arr[++diff_len] = all_length_arr[i];
180 flag_array[diff_len] = i;
181 i = j - 1;
184 for ( i = 1; i <= ed_num; ++i )
186 index_array[i] = uniqueLenSearch ( all_length_arr, flag_array, diff_len, index_array[i] );
189 for ( i = 1; i <= ed_num; ++i )
191 flag_array[index_array[i]] = i;
194 free ( ( void * ) all_length_arr );
195 length_array = ( unsigned int * ) ckalloc ( ed_num * sizeof ( unsigned int ) );
196 kmerSeq = ( char * ) ckalloc ( overlaplen * sizeof ( char ) );
197 //first scan for number counting
198 count = len_c = 0;
200 for ( i = 1; i <= ed_num; i++ )
202 if ( ( ed_array[i].length + overlaplen ) >= len_bar )
204 length_array[len_c++] = ed_array[i].length + overlaplen;
207 if ( ed_array[i].length < 1 || ed_array[i].deleted )
209 continue;
212 count++;
214 if ( EdSmallerThanTwin ( i ) )
216 i++;
220 sum = 0;
222 for ( signI = len_c - 1; signI >= 0; signI-- )
224 sum += length_array[signI];
227 qsort ( length_array, len_c, sizeof ( length_array[0] ), cmp_int );
229 if ( len_c > 0 )
231 fprintf ( stderr, "\nThere are %d contig(s) longer than %d, sum up %lld bp, with average length %lld.\n", len_c, len_bar, sum, sum / len_c );
232 fprintf ( stderr, "The longest length is %d bp, ", length_array[len_c - 1] );
234 else
236 fprintf ( stderr, "No contig was constructed!\n" );
239 N50 = sum * 0.5;
240 N90 = sum * 0.9;
241 sum = flag = 0;
243 for ( signI = len_c - 1; signI >= 0; signI-- )
245 sum += length_array[signI];
247 if ( !flag && sum >= N50 )
249 fprintf ( stderr, "contig N50 is %d bp,", length_array[signI] );
250 flag = 1;
253 if ( sum >= N90 )
255 fprintf ( stderr, "contig N90 is %d bp.\n", length_array[signI] );
256 break;
260 for ( i = 1; i <= ed_num; i++ )
262 j = flag_array[i];
264 if ( ed_array[j].deleted || ed_array[j].length < 1 )
266 continue;
269 if ( ed_array[j].arcs && ed_array[getTwinEdge ( j )].arcs )
271 tip = 0;
273 else
275 tip = 1;
278 output_1contig ( i, & ( ed_array[j] ), fp, tip );
280 if ( EdSmallerThanTwin ( j ) )
282 i++;
286 fclose ( fp );
287 free ( ( void * ) kmerSeq );
288 free ( ( void * ) length_array );
289 fprintf ( stderr, "%d contig(s) longer than %d output.\n", count, cut_len );
290 sprintf ( temp, "%s.ContigIndex", outfile );
291 fp_contig = ckopen ( temp, "w" );
292 fprintf ( fp_contig, "Edge_num %d %d\n", ed_num, count );
293 fprintf ( fp_contig, "index\tlength\treverseComplement\n" );
295 for ( i = 1; i <= num_ed; i++ )
297 j = flag_array[i];
298 fprintf ( fp_contig, "%d\t%d\t", i, edge_array[j].length + overlaplen );
300 if ( EdSmallerThanTwin ( j ) )
302 fprintf ( fp_contig, "1\n" );
303 ++i;
305 else if ( EdLargerThanTwin ( j ) )
307 fprintf ( fp_contig, "-1\n" );
309 else
311 fprintf ( fp_contig, "0\n" );
315 fclose ( fp_contig );
318 /*************************************************
319 Function:
320 output_updated_edges
321 Description:
322 Outputs the info of the edges.
323 Input:
324 1. outfile: the output file prefix
325 Output:
326 None.
327 Return:
328 None.
329 *************************************************/
331 void output_updated_edges ( char * outfile )
333 FILE * fp;
334 char name[256];
335 unsigned int i, j, validCounter = 0;
336 EDGE * edge;
337 sprintf ( name, "%s.updated.edge", outfile );
338 fp = ckopen ( name, "w" );
340 for ( i = 1; i <= num_ed; i++ )
342 validCounter++;
345 fprintf ( fp, "EDGEs %d\n", validCounter );
346 validCounter = 0;
348 for ( i = 1; i <= num_ed; i++ )
350 j = flag_array[i];
351 edge = &edge_array[j];
353 if ( edge->length != 0 )
354 { fprintf ( fp, ">length %d,", edge->length + overlaplen ); }
355 else { fprintf ( fp, ">length %d,", edge->length ); }
357 if ( EdSmallerThanTwin ( j ) )
359 fprintf ( fp, "1," );
361 else if ( EdLargerThanTwin ( j ) )
363 fprintf ( fp, "-1," );
365 else
367 fprintf ( fp, "0," );
370 fprintf ( fp, "%d,", edge->cvg );
371 print_kmer ( fp, vt_array[edge->from_vt].kmer, ',' );
372 print_kmer ( fp, vt_array[edge->to_vt].kmer, ',' );
373 fprintf ( fp, "\n" );
376 fclose ( fp );
379 /*************************************************
380 Function:
381 output_heavyArcs
382 Description:
383 Outputs the info of arcs.
384 Input:
385 1. outfile: the output file prefix
386 Output:
387 None.
388 Return:
389 None.
390 *************************************************/
391 void output_heavyArcs ( char * outfile )
393 unsigned int i, j;
394 char name[256];
395 FILE * outfp;
396 ARC * parc;
397 sprintf ( name, "%s.Arc", outfile );
398 outfp = ckopen ( name, "w" );
400 for ( i = 1; i <= num_ed; i++ )
402 parc = edge_array[flag_array[i]].arcs;
404 if ( !parc )
406 continue;
409 j = 0;
410 fprintf ( outfp, "%u", i );
412 while ( parc )
414 fprintf ( outfp, " %u %u", index_array[parc->to_ed], parc->multiplicity );
416 if ( ( ++j ) % 10 == 0 )
418 fprintf ( outfp, "\n%u", i );
421 parc = parc->next;
424 fprintf ( outfp, "\n" );
427 fclose ( outfp );
428 free ( ( void * ) index_array );
429 free ( ( void * ) flag_array );