4 * Copyright (c) 2008-2012 BGI-Shenzhen <soap at genomics dot org dot cn>.
6 * This file is part of SOAPdenovo.
8 * SOAPdenovo is free software: you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation, either version 3 of the License, or
11 * (at your option) any later version.
13 * SOAPdenovo is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
18 * You should have received a copy of the GNU General Public License
19 * along with SOAPdenovo. If not, see <http://www.gnu.org/licenses/>.
/*
 * Stage entry points. They are declared extern here; their definitions are
 * not part of the visible portion of this file. Each takes an argc/argv style
 * argument vector (the count parameter is spelled `arc` in these prototypes).
 */
extern int call_pregraph ( int arc, char ** argv );
extern int call_pregraph_sparse ( int arc, char ** argv );
extern int call_heavygraph ( int arc, char ** argv );
extern int call_map2contig ( int arc, char ** argv );
extern int call_scaffold ( int arc, char ** argv );
extern int call_align ( int arc, char ** argv );

/* File-local helpers, defined further down in this file. */
static void display_usage ();
static void display_all_usage ();
static void pipeline ( int argc, char ** argv );
40 /*************************************************
44 The main function. It includes four modules:
50 @see display_all_usage ()
58 *************************************************/
/*
 * Program entry point. As far as this view shows, it prints a version and
 * compile-time banner to stderr and then compares argv[0] with the sub-command
 * names "pregraph", "sparse_pregraph", "contig", "map", "scaff" and "all",
 * handing argc/argv to the matching handler.
 *
 * NOTE(review): several lines of this function are not visible here (braces,
 * any argc/argv adjustment, the fallback branch and the return statement).
 * argv[0] is presumably the sub-command after the program name has been
 * skipped -- confirm against the full file.
 */
59 int main ( int argc
, char ** argv
)
/* Banner: release string plus the compiler-provided build date and time. */
62 fprintf ( stderr
, "\nVersion 2.04: released on July 13th, 2012\nCompile %s\t%s\n", __DATE__
, __TIME__
);
/* Sub-command dispatch: exact string match on argv[0]. */
72 if ( strcmp ( "pregraph", argv
[0] ) == 0 )
74 call_pregraph ( argc
, argv
);
76 else if(strcmp ( "sparse_pregraph", argv
[0] ) == 0 ){
77 call_pregraph_sparse ( argc
, argv
);
/* "contig" is served by call_heavygraph. */
80 else if ( strcmp ( "contig", argv
[0] ) == 0 )
82 call_heavygraph ( argc
, argv
);
/* "map" is served by call_align; the call_map2contig call below is disabled. */
84 else if ( strcmp ( "map", argv
[0] ) == 0 )
86 call_align ( argc
, argv
);
88 //call_map2contig(argc,argv);
89 else if ( strcmp ( "scaff", argv
[0] ) == 0 )
91 call_scaffold ( argc
, argv
);
/* "all" runs the stages back to back through pipeline(). */
93 else if ( strcmp ( "all", argv
[0] ) == 0 )
95 pipeline ( argc
, argv
);
/*
 * Print the top-level usage summary (one line per sub-command) to stderr.
 * The text is kept in a table and written line by line; none of the lines
 * contains a conversion specifier, so they are emitted verbatim.
 */
static void display_usage ()
{
	static const char * const usage_lines[] =
	{
		"\nUsage: SOAPdenovo <command> [option]\n",
		" pregraph construct kmer-graph\n",
		" sparse_pregraph construct sparse kmer-graph\n",
		" contig eliminate errors and output contigs\n",
		" map map reads to contigs\n",
		" scaff construct scaffolds\n",
		" all do pregraph-contig-map-scaff in turn\n"
	};
	const size_t line_count = sizeof usage_lines / sizeof usage_lines[0];
	size_t pos;

	for ( pos = 0; pos < line_count; pos++ )
	{
		fputs ( usage_lines[pos], stderr );
	}
}
/*
 * Driver for the "all" sub-command.
 *
 * Parses the combined option string with getopt, then assembles an argument
 * vector in options[] (counted by index) for each stage and calls, in this
 * order, call_pregraph, call_heavygraph, call_align and call_scaffold. Before
 * each call the assembled arguments are echoed to stderr. At the end the
 * elapsed time (stop_t - start_t, in minutes) is reported on stderr.
 *
 * NOTE(review): many lines of this function are not visible in this view:
 * case labels, if-guards, braces, the declarations of temp, options, name,
 * maxk, maxk_s and arcfilter, and the places where start_t/stop_t and index
 * are set. Comments below describe only what the visible lines show.
 */
116 static void pipeline ( int argc
, char ** argv
)
/*
 * Flags named after the command-line options. All of them are reset to 0
 * just before the getopt loop; the places where they are set are not visible.
 */
119 unsigned char getK
, getRfile
, getOfile
, getD
, getDD
, getL
, getR
, getP
, getF
, getf
, getk
, getu
, getG
, getc
, getC
, getb
, getB
, getN
, getw
, getV
;
120 unsigned char getm
, getE
; //getr,
/* readfile is later passed as the "-s" argument, outfile as "-o" / "-g". */
121 char readfile
[256], outfile
[256];
/* Numeric settings, filled from option arguments via atoi/atof below. */
124 int kmer
= 0, cutoff_len
= 0, ncpu
= 0, lowK
= 0, lowC
= 0, kmer_small
= 0, gap_diff
= 0, genome_size
= 0;
125 float min_cvg
= 0.0, max_cvg
= 0.0, insert_size_bound
= 0.0, bubble_coverage
= 0.0;
/* Text renderings of the settings above, written with sprintf per stage. */
126 char kmer_s
[16], len_s
[16], ncpu_s
[16], M_s
[16], lowK_s
[16], lowC_s
[16], kmer_small_s
[16], gap_diff_s
[16], min_cvg_s
[16], max_cvg_s
[16], insert_size_bound_s
[16], bubble_coverage_s
[16], genome_size_s
[16];
/* M starts at 1 and is forwarded to the contig stage as "-M". */
127 int i
, copt
, index
, M
= 1;
130 char arcfilter_s
[16];
131 extern char * optarg
;
132 time_t start_t
, stop_t
;
/* Clear every option flag before parsing. */
134 getK
= getRfile
= getOfile
= getD
= getDD
= getL
= getR
= getP
= getF
= getf
= getk
= getu
= getG
= getc
= getC
= getb
= getB
= getN
= getw
= getm
= getE
= getV
= 0;
/*
 * Option parsing loop. In the option string a letter followed by ':' takes an
 * argument (delivered in optarg); the others are plain switches.
 */
136 while ( ( copt
= getopt ( argc
, argv
, "a:s:o:K:M:L:p:G:d:D:RuFk:fc:C:b:B:N:wm:e:EV" ) ) != EOF
) //r
/*
 * NOTE(review): "%s" has no field width, so an option argument of 256 or more
 * non-blank characters would overrun readfile / outfile. "%s" also stops at
 * the first whitespace character.
 */
142 sscanf ( optarg
, "%s", readfile
);
146 sscanf ( optarg
, "%s", outfile
);
/*
 * The remaining option arguments are copied into temp and then converted with
 * atoi / atof. temp's declaration and size are not visible in this view.
 */
150 sscanf ( optarg
, "%s", temp
);
151 kmer
= atoi ( temp
);
155 sscanf ( optarg
, "%s", temp
);
156 gap_diff
= atoi ( temp
);
/* NOTE(review): the conversion that consumes this temp value is not visible here. */
159 sscanf ( optarg
, "%s", temp
);
164 sscanf ( optarg
, "%s", temp
);
165 ncpu
= atoi ( temp
);
169 sscanf ( optarg
, "%s", temp
);
170 cutoff_len
= atoi ( temp
);
181 sscanf ( optarg
, "%s", temp
);
182 lowK
= atoi ( temp
);
186 sscanf ( optarg
, "%s", temp
);
187 lowC
= atoi ( temp
);
/* Converted straight from optarg; initKmerSetSize is not declared in this function. */
190 initKmerSetSize
= atoi ( optarg
);
197 sscanf ( optarg
, "%s", temp
);
198 kmer_small
= atoi ( temp
);
205 sscanf ( optarg
, "%s", temp
);
206 min_cvg
= atof ( temp
);
210 sscanf ( optarg
, "%s", temp
);
211 max_cvg
= atof ( temp
);
215 sscanf ( optarg
, "%s", temp
);
216 insert_size_bound
= atof ( temp
);
220 sscanf ( optarg
, "%s", temp
);
221 bubble_coverage
= atof ( temp
);
225 sscanf ( optarg
, "%s", temp
);
226 genome_size
= atoi ( temp
);
233 sscanf ( optarg
, "%s", temp
);
234 maxk
= atoi ( temp
);
242 sscanf ( optarg
, "%s", temp
);
243 arcfilter
= atoi ( temp
);
/*
 * The full usage text is shown when either flag is still 0. The same check
 * appears twice; the surrounding structure is not visible here.
 */
253 if ( getRfile
== 0 || getOfile
== 0 )
255 display_all_usage ();
261 if ( getRfile
== 0 || getOfile
== 0 )
263 display_all_usage ();
272 // getK = getRfile = getOfile = getD = getL = getR = 0;
/*
 * Stage 1 -- argument vector for call_pregraph: name, "-s" readfile, then
 * "-K", "-p", "-d", "-R" and "-o" outfile. The guards that decide which
 * optional entries are added are not visible in this view.
 */
275 options
[index
++] = name
;
276 options
[index
++] = "-s";
277 options
[index
++] = readfile
;
281 options
[index
++] = "-K";
282 sprintf ( kmer_s
, "%d", kmer
);
283 options
[index
++] = kmer_s
;
288 options
[index
++] = "-p";
289 sprintf ( ncpu_s
, "%d", ncpu
);
290 options
[index
++] = ncpu_s
;
295 options
[index
++] = "-d";
296 sprintf ( lowK_s
, "%d", lowK
);
297 options
[index
++] = lowK_s
;
302 options
[index
++] = "-R";
305 options
[index
++] = "-o";
306 options
[index
++] = outfile
;
/* Echo the assembled arguments to stderr before running the stage. */
308 for (i = 0; i < index; i++)
310 fprintf (stderr,"%s ", options[i]);
313 fprintf (stderr,"\n");
315 call_pregraph ( index
, options
);
/*
 * Stage 2 -- argument vector for call_heavygraph: name, "-g" outfile, "-M",
 * then "-R", "-D", "-s", "-p", "-m", "-r", "-E" and "-e" entries.
 * presumably index is reset before this point -- not visible here; verify.
 */
318 options
[index
++] = name
;
319 options
[index
++] = "-g";
320 options
[index
++] = outfile
;
321 options
[index
++] = "-M";
322 sprintf ( M_s
, "%d", M
);
323 options
[index
++] = M_s
;
327 options
[index
++] = "-R";
332 options
[index
++] = "-D";
333 sprintf ( lowC_s
, "%d", lowC
);
334 options
[index
++] = lowC_s
;
339 options
[index
++] = "-s";
340 options
[index
++] = readfile
;
345 options
[index
++] = "-p";
346 sprintf ( ncpu_s
, "%d", ncpu
);
347 options
[index
++] = ncpu_s
;
352 options
[index
++] = "-m";
353 sprintf ( maxk_s
, "%d", maxk
);
354 options
[index
++] = maxk_s
;
359 options[index++] = "-r";
364 options
[index
++] = "-E";
369 options
[index
++] = "-e";
370 sprintf ( arcfilter_s
, "%d", arcfilter
);
371 options
[index
++] = arcfilter_s
;
/* Echo, then run the contig stage. */
375 for (i = 0; i < index; i++)
377 fprintf (stderr,"%s ", options[i]);
380 fprintf (stderr,"\n");
382 call_heavygraph ( index
, options
);
/*
 * Stage 3 -- argument vector for call_align: name, "-s" readfile, "-g"
 * outfile, then "-p", "-K", "-k" and "-f" entries.
 */
385 options
[index
++] = name
;
386 options
[index
++] = "-s";
387 options
[index
++] = readfile
;
388 options
[index
++] = "-g";
389 options
[index
++] = outfile
;
393 options
[index
++] = "-p";
394 sprintf ( ncpu_s
, "%d", ncpu
);
395 options
[index
++] = ncpu_s
;
400 options
[index
++] = "-K";
401 sprintf ( kmer_s
, "%d", kmer
);
402 options
[index
++] = kmer_s
;
407 options
[index
++] = "-k";
408 sprintf ( kmer_small_s
, "%d", kmer_small
);
409 options
[index
++] = kmer_small_s
;
414 options
[index
++] = "-f";
/* Echo, then run the mapping stage. */
418 for (i = 0; i < index; i++)
420 fprintf (stderr,"%s ", options[i]);
423 fprintf (stderr,"\n");
425 call_align ( index
, options
);
/*
 * Stage 4 -- argument vector for call_scaffold: name, "-g" outfile, then
 * "-F", "-p", "-L", "-G", "-u", "-c", "-C", "-b", "-B", "-N", "-w" and "-V"
 * entries.
 */
428 options
[index
++] = name
;
429 options
[index
++] = "-g";
430 options
[index
++] = outfile
;
434 options
[index
++] = "-F";
439 options
[index
++] = "-p";
440 sprintf ( ncpu_s
, "%d", ncpu
);
441 options
[index
++] = ncpu_s
;
446 options
[index
++] = "-L";
447 sprintf ( len_s
, "%d", cutoff_len
);
448 options
[index
++] = len_s
;
453 options
[index
++] = "-G";
454 sprintf ( gap_diff_s
, "%d", gap_diff
);
455 options
[index
++] = gap_diff_s
;
460 options
[index
++] = "-u";
465 options
[index
++] = "-c";
/*
 * NOTE(review): "%f" always prints six decimals and the destination buffers
 * hold 16 bytes, so a value with nine or more integer digits would not fit.
 * This applies to the four float settings formatted below.
 */
466 sprintf ( min_cvg_s
, "%f", min_cvg
);
467 options
[index
++] = min_cvg_s
;
472 options
[index
++] = "-C";
473 sprintf ( max_cvg_s
, "%f", max_cvg
);
474 options
[index
++] = max_cvg_s
;
479 options
[index
++] = "-b";
480 sprintf ( insert_size_bound_s
, "%f", insert_size_bound
);
481 options
[index
++] = insert_size_bound_s
;
486 options
[index
++] = "-B";
487 sprintf ( bubble_coverage_s
, "%f", bubble_coverage
);
488 options
[index
++] = bubble_coverage_s
;
493 options
[index
++] = "-N";
494 sprintf ( genome_size_s
, "%d", genome_size
);
495 options
[index
++] = genome_size_s
;
500 options
[index
++] = "-w";
505 options
[index
++] = "-V";
/* Echo, then run the scaffolding stage. */
509 for (i = 0; i < index; i++)
511 fprintf (stderr,"%s ", options[i]);
514 fprintf (stderr,"\n");
516 call_scaffold ( index
, options
);
/* Elapsed time: the difference of the two time_t stamps, cast to int, divided by 60. */
518 fprintf ( stderr
, "Time for the whole pipeline: %dm.\n", ( int ) ( stop_t
- start_t
) / 60 );
/*
 * Print the synopsis and the per-option help text of the "all" sub-command
 * to stderr, one fprintf call per help line.
 *
 * NOTE(review): the -K, -m and -k entries each appear twice, differing only
 * in the stated maximum (127 vs 63). Presumably a conditional-compilation
 * directive that is not visible in this view selects one of each pair --
 * confirm against the full file.
 */
521 static void display_all_usage ()
523 // fprintf (stderr,"\nSOAPdenovo all -s configFile -o outputGraph [-R -f -F -u -w] [-K kmer -p n_cpu -a initMemoryAssumption -d KmerFreqCutOff -D EdgeCovCutoff -M mergeLevel -k kmer_R2C, -G gapLenDiff -L minContigLen -c minContigCvg -C maxContigCvg -b insertSizeUpperBound -B bubbleCoverage -N genomeSize]\n");
/* Synopsis line; the disabled variant above additionally listed -f. */
524 fprintf ( stderr
, "\nSOAPdenovo all -s configFile -o outputGraph [-R -F -u -w] [-K kmer -p n_cpu -a initMemoryAssumption -d KmerFreqCutOff -D EdgeCovCutoff -M mergeLevel -k kmer_R2C, -G gapLenDiff -L minContigLen -c minContigCvg -C maxContigCvg -b insertSizeUpperBound -B bubbleCoverage -N genomeSize]\n" );
525 fprintf ( stderr
, " -s <string> configFile: the config file of solexa reads\n" );
526 fprintf ( stderr
, " -o <string> outputGraph: prefix of output graph file name\n" );
/* -K: two alternatives (max 127 / max 63), see the note in the header comment. */
528 fprintf ( stderr
, " -K <int> kmer(min 13, max 127): kmer size, [23]\n" );
530 fprintf ( stderr
, " -K <int> kmer(min 13, max 63): kmer size, [23]\n" );
532 fprintf ( stderr
, " -p <int> n_cpu: number of cpu for use, [8]\n" );
533 fprintf ( stderr
, " -a <int> initMemoryAssumption: memory assumption initialized to avoid further reallocation, unit G, [0]\n" );
534 fprintf ( stderr
, " -d <int> kmerFreqCutoff: kmers with frequency no larger than KmerFreqCutoff will be deleted, [0]\n" );
535 fprintf ( stderr
, " -R (optional) resolve repeats by reads, [NO]\n" );
536 fprintf ( stderr
, " -D <int> edgeCovCutoff: edges with coverage no larger than EdgeCovCutoff will be deleted, [1]\n" );
537 fprintf ( stderr
, " -M <int> mergeLevel(min 0, max 3): the strength of merging similar sequences during contiging, [1]\n" );
538 fprintf ( stderr
, " -e <int> arcWeight: two edges, between which the arc's weight is larger than arcWeight, will be linerized, [0]\n" );
/* -m: two alternatives (max 127 / max 63). */
540 fprintf ( stderr
, " -m <int> maxKmer (max 127): maximum kmer size used for multi-kmer, [NO]\n" );
542 fprintf ( stderr
, " -m <int> maxKmer (max 63): maximum kmer size used for multi-kmer, [NO]\n" );
544 fprintf ( stderr
, " -E (optional) merge clean bubble before iterate, works only if -M is set when using multi-kmer, [NO]\n" );
545 // printf (" -O (optional)\toutput contig of each kmer when iterating\n");
546 // fprintf (stderr," -f (optional) output gap related reads in map step for using SRkgf to fill gaps, [NO]\n");
/* -k: two alternatives (max 127 / max 63). */
548 fprintf ( stderr
, " -k <int> kmer_R2C(min 13, max 127): kmer size used for mapping reads to contigs, [K]\n" );
550 fprintf ( stderr
, " -k <int> kmer_R2C(min 13, max 63): kmer size used for mapping reads to contigs, [K]\n" );
552 fprintf ( stderr
, " -F (optional) fill gaps in scaffolds, [NO]\n" );
553 fprintf ( stderr
, " -u (optional) un-mask contigs with high/low coverage before scaffolding, [mask]\n" );
554 fprintf ( stderr
, " -w (optional) keep contigs weakly connected to other contigs in scaffold, [NO]\n" );
555 fprintf ( stderr
, " -G <int> gapLenDiff: allowed length difference between estimated and filled gap, [50]\n" );
556 fprintf ( stderr
, " -L <int> minContigLen: shortest contig for scaffolding, [K+2]\n" );
557 fprintf ( stderr
, " -c <float> minContigCvg: minimum contig coverage (c*avgCvg), contigs shorter than 100bp with coverage smaller than c*avgCvg will be masked before scaffolding unless -u is set, [0.1]\n" );
558 fprintf ( stderr
, " -C <float> maxContigCvg: maximum contig coverage (C*avgCvg), contigs with coverage larger than C*avgCvg or contigs shorter than 100bp with coverage larger than 0.8*C*avgCvg will be masked before scaffolding unless -u is set, [2]\n" );
559 fprintf ( stderr
, " -b <float> insertSizeUpperBound: (b*avg_ins) will be used as upper bound of insert size for large insert size ( > 1000) when handling pair-end connections between contigs if b is set to larger than 1, [1.5]\n" );
560 fprintf ( stderr
, " -B <float> bubbleCoverage: remove contig with lower cvoerage in bubble structure if both contigs' coverage are smaller than bubbleCoverage*avgCvg, [0.6]\n" );
561 fprintf ( stderr
, " -N <int> genomeSize: genome size for statistics, [0]\n" );
562 fprintf ( stderr
, " -V (optional) output information for Hawkeye to visualize the assembly, [NO]\n" );