modified: makefile
[GalaxyCodeBases.git] / BGI / SOAPdenovo2 / standardPregraph / contig.c
blobf1ce2290b8b2c1eb71a618c71ddbd307b482935c
1 /*
2 * contig.c
4 * Copyright (c) 2008-2012 BGI-Shenzhen <soap at genomics dot org dot cn>.
6 * This file is part of SOAPdenovo.
8 * SOAPdenovo is free software: you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation, either version 3 of the License, or
11 * (at your option) any later version.
13 * SOAPdenovo is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
18 * You should have received a copy of the GNU General Public License
19 * along with SOAPdenovo. If not, see <http://www.gnu.org/licenses/>.
23 #include "stdinc.h"
24 #include "newhash.h"
25 #include "kmerhash.h"
26 #include "extfunc.h"
27 #include "extvab.h"
28 static void initenv ( int argc, char ** argv );
29 static void display_contig_usage ();
30 char shortrdsfile[256], graphfile[256];
31 static boolean repeatSolve; //whether solve repeat or not
32 //static boolean keepReadFile = 0; //whether keep tmp selected reads file or not
33 static boolean iter = 0; //whether use multikmer or not
34 static boolean cleanBubble = 0; //whether merge clean bubble before iterate
35 static int M = 1; //merge bubble level
36 static int maxk = 0; //max kmer of multikmer
38 /*************************************************
39 Function:
40 call_heavygraph
41 Description:
42 The main function for contig step . its processes are as below:
43 1. Solve repeat
44 2. Merge bubble(clean bubble is optional for multikmer)
45 3. Remove weak edge and low coverage edge
46 4. Cut tips
47 5. Iterate multikmer(optional)
48 Input:
49 @see display_contig_usage ()
50 Output:
51 The files below:
52 1. *.contig
53 2. *.ContigIndex
54 3. *.update.edge
55 4. *.Arc
56 5. *.read [optional]
57 6. *.preGraphBasic [optional]
58 Return:
59 None.
60 *************************************************/
61 int call_heavygraph ( int argc, char ** argv )
63 time_t start_t, stop_t, time_bef, time_aft;
64 time ( &start_t );
65 boolean ret;
66 fprintf ( stderr, "\n********************\n" );
67 fprintf ( stderr, "Contig\n" );
68 fprintf ( stderr, "********************\n\n" );
69 initenv ( argc, argv );
70 loadVertex ( graphfile );
71 loadEdge ( graphfile );
73 if ( repeatSolve )
75 ret = loadPathBin ( graphfile );
78 swapedge();
79 sortedge();
80 freshArc();
82 if ( repeatSolve )
84 time ( &time_bef );
86 // ret = loadPathBin (graphfile);
87 if ( ret )
89 solveReps ();
91 else
93 fprintf ( stderr, "Repeat solving can't be done...\n" );
96 time ( &time_aft );
97 fprintf ( stderr, "Time spent on solving repeat: %ds.\n", ( int ) ( time_aft - time_bef ) );
100 //edgecvg_bar(edge_array,num_ed,graphfile,100);
102 if ( !iter && M > 0 )
104 time ( &time_bef );
105 bubblePinch ( 0.90, graphfile, M, 0, 1 );
106 time ( &time_aft );
107 fprintf ( stderr, "Time spent on pinching bubbles: %ds.\n", ( int ) ( time_aft - time_bef ) );
110 if ( iter && cleanBubble && M > 0 )
112 time ( &time_bef );
113 clean = 1;
114 long long oldpinCounter = 0;
115 long long min = 10;
116 int times = 0;
118 while ( min >= 10 )
120 times++;
122 if ( times >= 4 ) { break; }
124 bubblePinch ( 0.90, graphfile, M, 1, 0 );
125 min = pinCounter;
126 fprintf ( stderr, "%lld clean bubbles merged.\n", pinCounter );
129 time ( &time_aft );
130 fprintf ( stderr, "Time spent on pinching clean bubbles: %ds.\n", ( int ) ( time_aft - time_bef ) );
131 clean = 0;
134 if ( deLowEdge )
136 removeWeakEdges ( 2 * overlaplen, 1 );
137 removeLowCovEdges ( 2 * overlaplen, deLowEdge, !iter );
140 cutTipsInGraph ( 0, 0, !iter );
142 if ( iter )
144 Iterate ( shortrdsfile, graphfile, maxk, M ); //keepReadFile,
146 if ( M > 0 )
148 time ( &time_bef );
149 bubblePinch ( 0.90, graphfile, M, 1, 0 );
150 time ( &time_aft );
151 fprintf ( stderr, "Time spent on pinching bubbles: %ds.\n", ( int ) ( time_aft - time_bef ) );
154 freshpreGraphBasic ( iter, maxk, graphfile );
157 //output_graph(graphfile);
158 output_contig ( edge_array, num_ed, graphfile, overlaplen + 1 );
159 output_updated_edges ( graphfile );
160 output_heavyArcs ( graphfile );
162 if ( vt_array )
164 free ( ( void * ) vt_array );
165 vt_array = NULL;
168 if ( edge_array )
170 free_edge_array ( edge_array, num_ed_limit );
171 edge_array = NULL;
174 destroyArcMem ();
175 time ( &stop_t );
176 fprintf ( stderr, "\nTime spent on constructing contig: %dm.\n\n", ( int ) ( stop_t - start_t ) / 60 );
177 return 0;
181 /*****************************************************************************
182 * Parse command line switches
183 *****************************************************************************/
184 void initenv ( int argc, char ** argv )
186 int copt;
187 int inpseq, outseq;
188 extern char * optarg;
189 char temp[100];
190 inpseq = outseq = repeatSolve = iter = cleanBubble = 0;//keepReadFile =
191 optind = 1;
192 fprintf ( stderr, "Parameters: contig " );
194 while ( ( copt = getopt ( argc, argv, "g:M:D:Rs:m:p:e:E" ) ) != EOF ) // r
196 switch ( copt )
198 case 'M':
199 fprintf ( stderr, "-M %s ", optarg );
200 sscanf ( optarg, "%s", temp );
201 M = atoi ( temp );
202 break;
203 case 'D':
204 fprintf ( stderr, "-D %s ", optarg );
205 sscanf ( optarg, "%s", temp );
206 deLowEdge = atoi ( temp ) >= 0 ? atoi ( temp ) : 0;
207 break;
208 case 'g':
209 fprintf ( stderr, "-g %s ", optarg );
210 inGraph = 1;
211 sscanf ( optarg, "%s", graphfile );
212 break;
213 case 'R':
214 repeatSolve = 1;
215 fprintf ( stderr, "-R " );
216 break;
217 case 's':
218 fprintf ( stderr, "-s %s ", optarg );
219 inpseq = 1;
220 sscanf ( optarg, "%s", shortrdsfile );
221 break;
222 case 'm':
223 fprintf ( stderr, "-m %s ", optarg );
224 iter = 1;
225 sscanf ( optarg, "%s", temp );
226 maxk = atoi ( temp );
227 break;
229 case 'r':
230 keepReadFile = 1;
231 fprintf(stderr, "-r ");
232 break;
234 case 'e':
235 fprintf ( stderr, "-e %s ", optarg );
236 sscanf ( optarg, "%s", temp );
237 arcfilter = atoi ( temp );
238 break;
239 case 'p':
240 fprintf ( stderr, "-p %s ", optarg );
241 sscanf ( optarg, "%s", temp );
242 thrd_num = atoi ( temp );
243 break;
244 case 'E':
245 cleanBubble = 1;
246 fprintf ( stderr, "-E " );
247 break;
248 default:
250 if ( ( iter && inpseq == 0 ) || inGraph == 0 )
252 display_contig_usage ();
253 exit ( -1 );
258 fprintf ( stderr, "\n\n" );
260 if ( iter )
262 if ( maxk % 2 == 0 )
264 maxk++;
265 fprintf ( stderr, "Max K should be an odd number, change to %d.\n", maxk );
268 if ( maxk < 13 )
270 maxk = 13;
271 fprintf ( stderr, "Max K should not be less than 13, change to %d.\n", maxk );
274 #ifdef MER127
275 else if ( maxk > 127 )
277 maxk = 127;
278 fprintf ( stderr, "Max K should not be greater than 127, change to %d.\n", maxk );
281 #else
282 else if ( maxk > 63 )
284 maxk = 63;
285 fprintf ( stderr, "Max K should not be greater than 63, change to %d.\n", maxk );
288 #endif
290 if ( maxk <= overlaplen )
292 fprintf ( stderr, "Max K %d is not greater than overlaplen %d.\n", maxk, overlaplen );
293 display_contig_usage ();
294 exit ( -1 );
298 if ( ( iter && inpseq == 0 ) || inGraph == 0 )
300 display_contig_usage ();
301 exit ( -1 );
305 static void display_contig_usage ()
307 fprintf ( stderr, "\ncontig -g InputGraph [-R] [-M mergeLevel -D EdgeCovCutoff] [-s readsInfoFile -m maxkmer -p n_cpu -r]\n" );
308 fprintf ( stderr, " -g <string> inputGraph: prefix of input graph file names\n" );
309 fprintf ( stderr, " -R (optional) resolve repeats using information generated in pregraph step, works only if -R is set in pregraph step too, [NO]\n" );
310 fprintf ( stderr, " -M <int> mergeLevel(min 0, max 3): the strength of merging similar sequences during contiging, [1]\n" );
311 fprintf ( stderr, " -D <int> EdgeCovCutoff: edges shorter than (2*K+1) with coverage no larger than EdgeCovCutoff will be deleted, [1]\n" );
312 fprintf ( stderr, " -e <int> arcWeight: two edges, between which the arc's weight is larger than arcWeight, will be linerized, [0]\n" );
313 fprintf ( stderr, " -m <int> max k when using multi-kmer, and the parameters below are used along with multi-kmer, [NO]\n" );
314 fprintf ( stderr, " -s <string> readsInfoFile:The file contains information of solexa reads(It's necessary when using multi-kmer)\n" );
315 fprintf ( stderr, " -p <int> number of cpu, [8]\n" );
316 fprintf ( stderr, " -E (optional) merge clean bubble before iterate, works only if -M is set when using multi-kmer, [NO]\n" );
317 // fprintf (stderr," -r (optional) keep available read(*.read)\n");