modified: makefile
[GalaxyCodeBases.git] / BGI / SOAPdenovo2 / standardPregraph / main.c
blob98170e4c2ddc99a006a15d25da10d9281dff341e
1 /*
2 * main.c
4 * Copyright (c) 2008-2012 BGI-Shenzhen <soap at genomics dot org dot cn>.
6 * This file is part of SOAPdenovo.
8 * SOAPdenovo is free software: you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation, either version 3 of the License, or
11 * (at your option) any later version.
13 * SOAPdenovo is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
18 * You should have received a copy of the GNU General Public License
19 * along with SOAPdenovo. If not, see <http://www.gnu.org/licenses/>.
23 #include "stdinc.h"
24 #include "newhash.h"
25 #include "kmerhash.h"
26 #include "extfunc.h"
27 #include "global.h"
29 extern int call_pregraph ( int arc, char ** argv );
30 extern int call_pregraph_sparse(int arc, char ** argv);
31 extern int call_heavygraph ( int arc, char ** argv );
32 extern int call_map2contig ( int arc, char ** argv );
33 extern int call_scaffold ( int arc, char ** argv );
34 extern int call_align ( int arc, char ** argv );
36 static void display_usage ();
37 static void display_all_usage ();
38 static void pipeline ( int argc, char ** argv );
40 /*************************************************
41 Function:
42 main
43 Description:
44 The main function. It includes four modules:
45 1.pregraph
46 2.contig
47 3.map
48 4.scaffold
49 Input:
50 @see display_all_usage ()
51 Output:
52 Below files:
53 1. *.contig
54 2. *.scafSeq
55 etc.
56 Return:
57 None.
58 *************************************************/
59 int main ( int argc, char ** argv )
61 crc32c_Init();
62 fprintf ( stderr, "\nVersion 2.04: released on July 13th, 2012\nCompile %s\t%s\n", __DATE__, __TIME__ );
63 argc--;
64 argv++;
66 if ( argc == 0 )
68 display_usage ();
69 return 0;
72 if ( strcmp ( "pregraph", argv[0] ) == 0 )
74 call_pregraph ( argc, argv );
76 else if(strcmp ( "sparse_pregraph", argv[0] ) == 0 ){
77 call_pregraph_sparse ( argc, argv );
80 else if ( strcmp ( "contig", argv[0] ) == 0 )
82 call_heavygraph ( argc, argv );
84 else if ( strcmp ( "map", argv[0] ) == 0 )
86 call_align ( argc, argv );
88 //call_map2contig(argc,argv);
89 else if ( strcmp ( "scaff", argv[0] ) == 0 )
91 call_scaffold ( argc, argv );
93 else if ( strcmp ( "all", argv[0] ) == 0 )
95 pipeline ( argc, argv );
97 else
99 display_usage ();
102 return 0;
105 static void display_usage ()
107 fprintf ( stderr, "\nUsage: SOAPdenovo <command> [option]\n" );
108 fprintf ( stderr, " pregraph construct kmer-graph\n" );
109 fprintf ( stderr, " sparse_pregraph construct sparse kmer-graph\n");
110 fprintf ( stderr, " contig eliminate errors and output contigs\n" );
111 fprintf ( stderr, " map map reads to contigs\n" );
112 fprintf ( stderr, " scaff construct scaffolds\n" );
113 fprintf ( stderr, " all do pregraph-contig-map-scaff in turn\n" );
116 static void pipeline ( int argc, char ** argv )
118 char * options[32];
119 unsigned char getK, getRfile, getOfile, getD, getDD, getL, getR, getP, getF, getf, getk, getu, getG, getc, getC, getb, getB, getN, getw, getV;
120 unsigned char getm, getE; //getr,
121 char readfile[256], outfile[256];
122 char temp[128];
123 char * name;
124 int kmer = 0, cutoff_len = 0, ncpu = 0, lowK = 0, lowC = 0, kmer_small = 0, gap_diff = 0, genome_size = 0;
125 float min_cvg = 0.0, max_cvg = 0.0, insert_size_bound = 0.0, bubble_coverage = 0.0;
126 char kmer_s[16], len_s[16], ncpu_s[16], M_s[16], lowK_s[16], lowC_s[16], kmer_small_s[16], gap_diff_s[16], min_cvg_s[16], max_cvg_s[16], insert_size_bound_s[16], bubble_coverage_s[16], genome_size_s[16];
127 int i, copt, index, M = 1;
128 int maxk;
129 char maxk_s[16];
130 char arcfilter_s[16];
131 extern char * optarg;
132 time_t start_t, stop_t;
133 time ( &start_t );
134 getK = getRfile = getOfile = getD = getDD = getL = getR = getP = getF = getf = getk = getu = getG = getc = getC = getb = getB = getN = getw = getm = getE = getV = 0;
136 while ( ( copt = getopt ( argc, argv, "a:s:o:K:M:L:p:G:d:D:RuFk:fc:C:b:B:N:wm:e:EV" ) ) != EOF ) //r
138 switch ( copt )
140 case 's':
141 getRfile = 1;
142 sscanf ( optarg, "%s", readfile );
143 break;
144 case 'o':
145 getOfile = 1;
146 sscanf ( optarg, "%s", outfile );
147 break;
148 case 'K':
149 getK = 1;
150 sscanf ( optarg, "%s", temp );
151 kmer = atoi ( temp );
152 break;
153 case 'G':
154 getG = 1;
155 sscanf ( optarg, "%s", temp );
156 gap_diff = atoi ( temp );
157 break;
158 case 'M':
159 sscanf ( optarg, "%s", temp );
160 M = atoi ( temp );
161 break;
162 case 'p':
163 getP = 1;
164 sscanf ( optarg, "%s", temp );
165 ncpu = atoi ( temp );
166 break;
167 case 'L':
168 getL = 1;
169 sscanf ( optarg, "%s", temp );
170 cutoff_len = atoi ( temp );
171 break;
172 case 'R':
173 getR = 1;
174 break;
175 case 'u':
176 getu = 1;
177 maskRep = 0;
178 break;
179 case 'd':
180 getD = 1;
181 sscanf ( optarg, "%s", temp );
182 lowK = atoi ( temp );
183 break;
184 case 'D':
185 getDD = 1;
186 sscanf ( optarg, "%s", temp );
187 lowC = atoi ( temp );
188 break;
189 case 'a':
190 initKmerSetSize = atoi ( optarg );
191 break;
192 case 'F':
193 getF = 1;
194 break;
195 case 'k':
196 getk = 1;
197 sscanf ( optarg, "%s", temp );
198 kmer_small = atoi ( temp );
199 break;
200 case 'f':
201 getf = 1;
202 break;
203 case 'c':
204 getc = 1;
205 sscanf ( optarg, "%s", temp );
206 min_cvg = atof ( temp );
207 break;
208 case 'C':
209 getC = 1;
210 sscanf ( optarg, "%s", temp );
211 max_cvg = atof ( temp );
212 break;
213 case 'b':
214 getb = 1;
215 sscanf ( optarg, "%s", temp );
216 insert_size_bound = atof ( temp );
217 break;
218 case 'B':
219 getB = 1;
220 sscanf ( optarg, "%s", temp );
221 bubble_coverage = atof ( temp );
222 break;
223 case 'N':
224 getN = 1;
225 sscanf ( optarg, "%s", temp );
226 genome_size = atoi ( temp );
227 break;
228 case 'w':
229 getw = 1;
230 break;
231 case 'm':
232 getm = 1;
233 sscanf ( optarg, "%s", temp );
234 maxk = atoi ( temp );
235 break;
237 case 'r':
238 getr = 1;
239 break;
241 case 'e':
242 sscanf ( optarg, "%s", temp );
243 arcfilter = atoi ( temp );
244 break;
245 case 'E':
246 getE = 1;
247 break;
248 case 'V':
249 getV = 1;
250 break;
251 default:
253 if ( getRfile == 0 || getOfile == 0 )
255 display_all_usage ();
256 exit ( -1 );
261 if ( getRfile == 0 || getOfile == 0 )
263 display_all_usage ();
264 exit ( -1 );
267 if ( thrd_num < 1 )
269 thrd_num = 1;
272 // getK = getRfile = getOfile = getD = getL = getR = 0;
273 name = "pregraph";
274 index = 0;
275 options[index++] = name;
276 options[index++] = "-s";
277 options[index++] = readfile;
279 if ( getK )
281 options[index++] = "-K";
282 sprintf ( kmer_s, "%d", kmer );
283 options[index++] = kmer_s;
286 if ( getP )
288 options[index++] = "-p";
289 sprintf ( ncpu_s, "%d", ncpu );
290 options[index++] = ncpu_s;
293 if ( getD )
295 options[index++] = "-d";
296 sprintf ( lowK_s, "%d", lowK );
297 options[index++] = lowK_s;
300 if ( getR )
302 options[index++] = "-R";
305 options[index++] = "-o";
306 options[index++] = outfile;
308 for (i = 0; i < index; i++)
310 fprintf (stderr,"%s ", options[i]);
313 fprintf (stderr,"\n");
315 call_pregraph ( index, options );
316 name = "contig";
317 index = 0;
318 options[index++] = name;
319 options[index++] = "-g";
320 options[index++] = outfile;
321 options[index++] = "-M";
322 sprintf ( M_s, "%d", M );
323 options[index++] = M_s;
325 if ( getR )
327 options[index++] = "-R";
330 if ( getDD )
332 options[index++] = "-D";
333 sprintf ( lowC_s, "%d", lowC );
334 options[index++] = lowC_s;
337 if ( getRfile )
339 options[index++] = "-s";
340 options[index++] = readfile;
343 if ( getP )
345 options[index++] = "-p";
346 sprintf ( ncpu_s, "%d", ncpu );
347 options[index++] = ncpu_s;
350 if ( getm )
352 options[index++] = "-m";
353 sprintf ( maxk_s, "%d", maxk );
354 options[index++] = maxk_s;
358 if(getr){
359 options[index++] = "-r";
362 if ( getE )
364 options[index++] = "-E";
367 if ( arcfilter )
369 options[index++] = "-e";
370 sprintf ( arcfilter_s, "%d", arcfilter );
371 options[index++] = arcfilter_s;
375 for (i = 0; i < index; i++)
377 fprintf (stderr,"%s ", options[i]);
380 fprintf (stderr,"\n");
382 call_heavygraph ( index, options );
383 name = "map";
384 index = 0;
385 options[index++] = name;
386 options[index++] = "-s";
387 options[index++] = readfile;
388 options[index++] = "-g";
389 options[index++] = outfile;
391 if ( getP )
393 options[index++] = "-p";
394 sprintf ( ncpu_s, "%d", ncpu );
395 options[index++] = ncpu_s;
398 if ( getK )
400 options[index++] = "-K";
401 sprintf ( kmer_s, "%d", kmer );
402 options[index++] = kmer_s;
405 if ( getk )
407 options[index++] = "-k";
408 sprintf ( kmer_small_s, "%d", kmer_small );
409 options[index++] = kmer_small_s;
412 if ( getf )
414 options[index++] = "-f";
418 for (i = 0; i < index; i++)
420 fprintf (stderr,"%s ", options[i]);
423 fprintf (stderr,"\n");
425 call_align ( index, options );
426 name = "scaff";
427 index = 0;
428 options[index++] = name;
429 options[index++] = "-g";
430 options[index++] = outfile;
432 if ( getF )
434 options[index++] = "-F";
437 if ( getP )
439 options[index++] = "-p";
440 sprintf ( ncpu_s, "%d", ncpu );
441 options[index++] = ncpu_s;
444 if ( getL )
446 options[index++] = "-L";
447 sprintf ( len_s, "%d", cutoff_len );
448 options[index++] = len_s;
451 if ( getG )
453 options[index++] = "-G";
454 sprintf ( gap_diff_s, "%d", gap_diff );
455 options[index++] = gap_diff_s;
458 if ( getu )
460 options[index++] = "-u";
463 if ( getc )
465 options[index++] = "-c";
466 sprintf ( min_cvg_s, "%f", min_cvg );
467 options[index++] = min_cvg_s;
470 if ( getC )
472 options[index++] = "-C";
473 sprintf ( max_cvg_s, "%f", max_cvg );
474 options[index++] = max_cvg_s;
477 if ( getb )
479 options[index++] = "-b";
480 sprintf ( insert_size_bound_s, "%f", insert_size_bound );
481 options[index++] = insert_size_bound_s;
484 if ( getB )
486 options[index++] = "-B";
487 sprintf ( bubble_coverage_s, "%f", bubble_coverage );
488 options[index++] = bubble_coverage_s;
491 if ( getN )
493 options[index++] = "-N";
494 sprintf ( genome_size_s, "%d", genome_size );
495 options[index++] = genome_size_s;
498 if ( getw )
500 options[index++] = "-w";
503 if ( getV )
505 options[index++] = "-V";
509 for (i = 0; i < index; i++)
511 fprintf (stderr,"%s ", options[i]);
514 fprintf (stderr,"\n");
516 call_scaffold ( index, options );
517 time ( &stop_t );
518 fprintf ( stderr, "Time for the whole pipeline: %dm.\n", ( int ) ( stop_t - start_t ) / 60 );
521 static void display_all_usage ()
523 // fprintf (stderr,"\nSOAPdenovo all -s configFile -o outputGraph [-R -f -F -u -w] [-K kmer -p n_cpu -a initMemoryAssumption -d KmerFreqCutOff -D EdgeCovCutoff -M mergeLevel -k kmer_R2C, -G gapLenDiff -L minContigLen -c minContigCvg -C maxContigCvg -b insertSizeUpperBound -B bubbleCoverage -N genomeSize]\n");
524 fprintf ( stderr, "\nSOAPdenovo all -s configFile -o outputGraph [-R -F -u -w] [-K kmer -p n_cpu -a initMemoryAssumption -d KmerFreqCutOff -D EdgeCovCutoff -M mergeLevel -k kmer_R2C, -G gapLenDiff -L minContigLen -c minContigCvg -C maxContigCvg -b insertSizeUpperBound -B bubbleCoverage -N genomeSize]\n" );
525 fprintf ( stderr, " -s <string> configFile: the config file of solexa reads\n" );
526 fprintf ( stderr, " -o <string> outputGraph: prefix of output graph file name\n" );
527 #ifdef MER127
528 fprintf ( stderr, " -K <int> kmer(min 13, max 127): kmer size, [23]\n" );
529 #else
530 fprintf ( stderr, " -K <int> kmer(min 13, max 63): kmer size, [23]\n" );
531 #endif
532 fprintf ( stderr, " -p <int> n_cpu: number of cpu for use, [8]\n" );
533 fprintf ( stderr, " -a <int> initMemoryAssumption: memory assumption initialized to avoid further reallocation, unit G, [0]\n" );
534 fprintf ( stderr, " -d <int> kmerFreqCutoff: kmers with frequency no larger than KmerFreqCutoff will be deleted, [0]\n" );
535 fprintf ( stderr, " -R (optional) resolve repeats by reads, [NO]\n" );
536 fprintf ( stderr, " -D <int> edgeCovCutoff: edges with coverage no larger than EdgeCovCutoff will be deleted, [1]\n" );
537 fprintf ( stderr, " -M <int> mergeLevel(min 0, max 3): the strength of merging similar sequences during contiging, [1]\n" );
538 fprintf ( stderr, " -e <int> arcWeight: two edges, between which the arc's weight is larger than arcWeight, will be linerized, [0]\n" );
539 #ifdef MER127
540 fprintf ( stderr, " -m <int> maxKmer (max 127): maximum kmer size used for multi-kmer, [NO]\n" );
541 #else
542 fprintf ( stderr, " -m <int> maxKmer (max 63): maximum kmer size used for multi-kmer, [NO]\n" );
543 #endif
544 fprintf ( stderr, " -E (optional) merge clean bubble before iterate, works only if -M is set when using multi-kmer, [NO]\n" );
545 // printf (" -O (optional)\toutput contig of each kmer when iterating\n");
546 // fprintf (stderr," -f (optional) output gap related reads in map step for using SRkgf to fill gaps, [NO]\n");
547 #ifdef MER127
548 fprintf ( stderr, " -k <int> kmer_R2C(min 13, max 127): kmer size used for mapping reads to contigs, [K]\n" );
549 #else
550 fprintf ( stderr, " -k <int> kmer_R2C(min 13, max 63): kmer size used for mapping reads to contigs, [K]\n" );
551 #endif
552 fprintf ( stderr, " -F (optional) fill gaps in scaffolds, [NO]\n" );
553 fprintf ( stderr, " -u (optional) un-mask contigs with high/low coverage before scaffolding, [mask]\n" );
554 fprintf ( stderr, " -w (optional) keep contigs weakly connected to other contigs in scaffold, [NO]\n" );
555 fprintf ( stderr, " -G <int> gapLenDiff: allowed length difference between estimated and filled gap, [50]\n" );
556 fprintf ( stderr, " -L <int> minContigLen: shortest contig for scaffolding, [K+2]\n" );
557 fprintf ( stderr, " -c <float> minContigCvg: minimum contig coverage (c*avgCvg), contigs shorter than 100bp with coverage smaller than c*avgCvg will be masked before scaffolding unless -u is set, [0.1]\n" );
558 fprintf ( stderr, " -C <float> maxContigCvg: maximum contig coverage (C*avgCvg), contigs with coverage larger than C*avgCvg or contigs shorter than 100bp with coverage larger than 0.8*C*avgCvg will be masked before scaffolding unless -u is set, [2]\n" );
559 fprintf ( stderr, " -b <float> insertSizeUpperBound: (b*avg_ins) will be used as upper bound of insert size for large insert size ( > 1000) when handling pair-end connections between contigs if b is set to larger than 1, [1.5]\n" );
560 fprintf ( stderr, " -B <float> bubbleCoverage: remove contig with lower cvoerage in bubble structure if both contigs' coverage are smaller than bubbleCoverage*avgCvg, [0.6]\n" );
561 fprintf ( stderr, " -N <int> genomeSize: genome size for statistics, [0]\n" );
562 fprintf ( stderr, " -V (optional) output information for Hawkeye to visualize the assembly, [NO]\n" );