4 * Copyright (c) 2008-2012 BGI-Shenzhen <soap at genomics dot org dot cn>.
6 * This file is part of SOAPdenovo.
8 * SOAPdenovo is free software: you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation, either version 3 of the License, or
11 * (at your option) any later version.
13 * SOAPdenovo is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
18 * You should have received a copy of the GNU General Public License
19 * along with SOAPdenovo. If not, see <http://www.gnu.org/licenses/>.
// File-scope state and forward declarations used by the pregraph step.
30 static char shortrdsfile
[256]; // name of the reads config file, see the -s option
31 static char graphfile
[256]; // the output prefix name, see the -o option
32 static int cutTips
= 1; // whether to remove single tips; a single tip is a tip starting from a kmer whose coverage is 1
// Parses the pregraph command-line options with getopt().
34 static void initenv ( int argc
, char ** argv
);
// Prints the pregraph usage text to stderr.
35 static void display_pregraph_usage ();
38 /*************************************************
42 The main function for the pregraph step. Its processes are as follows:
43 1. Builds the kmer hash sets and removes the low coverage kmers.
44 2. Removes the tips whose length is no greater than 2*K.
45 3. Builds edges by combining linear kmers.
46 4. Maps the reads back to edges and builds preArcs (the connections between edges).
48 @see display_pregraph_usage ()
56 *.markOnEdge (optional)
60 *************************************************/
// Entry point of the pregraph step.
// Prints a banner to stderr, hands argc/argv to initenv(), checks the kmer
// size held in overlaplen, then calls prlRead2HashTable(), kmer2edges(),
// prlRead2edge() and output_vertex() in that order, releases KmerSets and
// KmerSetsPatch with free_Sets(), and reports the time spent on each stage
// to stderr.
// argc, argv: forwarded unchanged to initenv().
62 int call_pregraph ( int argc
, char ** argv
)
// start_t/stop_t feed the overall time report; time_bef/time_aft feed the per-stage reports.
64 time_t start_t
, stop_t
, time_bef
, time_aft
;
// Banner.
66 fprintf ( stderr
, "\n********************\n" );
67 fprintf ( stderr
, "Pregraph\n" );
68 fprintf ( stderr
, "********************\n\n" );
// Option parsing; fills overlaplen, thrd_num, shortrdsfile, graphfile, etc.
69 initenv ( argc
, argv
);
// Kmer size checks: a message is printed when overlaplen (K) is even ...
71 if ( overlaplen
% 2 == 0 )
74 fprintf ( stderr
, "K should be an odd number.\n" );
// ... and when it is below 13.
77 if ( overlaplen
< 13 )
80 fprintf ( stderr
, "K should not be less than 13.\n" );
// NOTE(review): two upper-bound checks (127 and 63) follow on the same
// if/else chain; presumably they are alternatives selected per build and
// only one is compiled in -- confirm.
84 else if ( overlaplen
> 127 )
87 fprintf ( stderr
, "K should not be greater than 127.\n" );
91 else if ( overlaplen
> 63 )
94 fprintf ( stderr
, "K should not be greater than 63.\n" );
// Stage reported below as "pre-graph construction"; receives the reads config file and the output prefix.
99 prlRead2HashTable ( shortrdsfile
, graphfile
);
101 fprintf ( stderr
, "Time spent on pre-graph construction: %ds.\n\n", ( int ) ( time_aft
- time_bef
) );
102 // printf ("deLowKmer %d, deLowEdge %d\n", deLowKmer, deLowEdge);
103 // fprintf (stderr,"DeLowKmer %d\n", deLowKmer);
105 //analyzeTips(hash_table, graphfile);
// This branch is taken only when deLowKmer is 0 and cutTips is non-zero.
// NOTE(review): the same "removing tips" timing message appears twice
// below; presumably each belongs to a different branch -- confirm.
106 if ( !deLowKmer
&& cutTips
)
112 fprintf ( stderr
, "Time spent on removing tips: %ds.\n\n", ( int ) ( time_aft
- time_bef
) );
119 fprintf ( stderr
, "Time spent on removing tips: %ds.\n\n", ( int ) ( time_aft
- time_bef
) );
123 // combine each linear part into an edge
125 kmer2edges ( graphfile
);
127 fprintf ( stderr
, "Time spent on constructing edges: %ds.\n\n", ( int ) ( time_aft
- time_bef
) );
128 // map the reads to edges one by one
130 prlRead2edge ( shortrdsfile
, graphfile
);
132 fprintf ( stderr
, "Time spent on aligning reads: %ds.\n\n", ( int ) ( time_aft
- time_bef
) );
// output_vertex() is given the output prefix; afterwards both kmer set
// collections are released, with thrd_num passed alongside each.
133 output_vertex ( graphfile
);
134 free_Sets ( KmerSets
, thrd_num
);
135 free_Sets ( KmerSetsPatch
, thrd_num
);
// Overall time: the timestamp difference is cast to int first, then divided by 60, and printed with an "m" suffix.
137 fprintf ( stderr
, "Overall time spent on constructing pre-graph: %dm.\n\n", ( int ) ( stop_t
- start_t
) / 60 );
// Parses the pregraph command line with getopt() (option string
// "a:s:o:K:p:d:R") and echoes each recognised option to stderr.
//   -s <arg>  copied into shortrdsfile
//   -o <arg>  copied into graphfile
//   -K <arg>  converted with atoi() and stored in overlaplen
//   -p <arg>  converted with atoi() and stored in thrd_num
//   -d <arg>  converted with atoi() and stored in deLowKmer; negative values become 0
//   -a <arg>  converted with atoi() and stored in initKmerSetSize
//   -R        takes no argument
// display_pregraph_usage() is called when inpseq or outseq is 0.
// argc, argv: the argument vector handed to getopt().
142 void initenv ( int argc
, char ** argv
)
// optarg is set by getopt() to the argument of the current option.
146 extern char * optarg
;
150 fprintf ( stderr
, "Parameters: pregraph " );
// NOTE(review): POSIX getopt() returns -1 when the options are exhausted;
// the loop compares against EOF instead.
152 while ( ( copt
= getopt ( argc
, argv
, "a:s:o:K:p:d:R" ) ) != EOF
)
154 //printf("get option\n");
// -s: reads config file name.
158 fprintf ( stderr
, "-s %s ", optarg
);
// NOTE(review): "%s" has no field width, so an argument of 256 characters
// or more overruns shortrdsfile[256]; "%s" also stops at the first whitespace.
160 sscanf ( optarg
, "%s", shortrdsfile
);
// -o: output prefix.
163 fprintf ( stderr
, "-o %s ", optarg
);
// NOTE(review): same unbounded "%s" into graphfile[256].
165 sscanf ( optarg
, "%s", graphfile
);
// -K: kmer size.
168 fprintf ( stderr
, "-K %s ", optarg
);
// NOTE(review): unbounded "%s" into temp as well -- check that temp is
// large enough; atoi() does not report non-numeric input.
169 sscanf ( optarg
, "%s", temp
);
170 overlaplen
= atoi ( temp
);
// -p: thread count.
173 fprintf ( stderr
, "-p %s ", optarg
);
174 sscanf ( optarg
, "%s", temp
);
175 thrd_num
= atoi ( temp
);
// -R: flag without argument.
179 fprintf ( stderr
, "-R " );
// -d: kmer frequency cutoff; a negative value is replaced by 0.
182 fprintf ( stderr
, "-d %s ", optarg
);
183 sscanf ( optarg
, "%s", temp
);
184 deLowKmer
= atoi ( temp
) >= 0 ? atoi ( temp
) : 0;
// -a: initial kmer set size, read directly from optarg.
192 fprintf ( stderr
, "-a %s ", optarg
);
193 initKmerSetSize
= atoi ( optarg
);
// Show the usage text when either inpseq or outseq is still 0
// (presumably these are set when -s and -o are seen -- confirm).
197 if ( inpseq
== 0 || outseq
== 0 )
199 display_pregraph_usage ();
// Terminates the echoed "Parameters:" line.
205 fprintf ( stderr
, "\n\n" );
// Same inpseq/outseq check as above, made again at this point.
207 if ( inpseq
== 0 || outseq
== 0 )
209 //printf("need more\n");
210 display_pregraph_usage ();
// Prints the command-line help of the pregraph step to stderr: a synopsis
// line followed by one line per option (-s, -o, -K, -p, -a, -R, -d).
215 static void display_pregraph_usage ()
// Synopsis.
217 fprintf ( stderr
, "\npregraph -s configFile -o outputGraph [-R] [-K kmer -p n_cpu -a initMemoryAssumption -d KmerFreqCutoff]\n" );
218 fprintf ( stderr
, " -s <string> configFile: the config file of solexa reads\n" );
219 fprintf ( stderr
, " -o <string> outputGraph: prefix of output graph file name\n" );
// NOTE(review): two -K help lines with different maxima (127 and 63)
// follow; presumably only one is compiled in per build -- confirm.
221 fprintf ( stderr
, " -K <int> kmer(min 13, max 127): kmer size, [23]\n" );
223 fprintf ( stderr
, " -K <int> kmer(min 13, max 63): kmer size, [23]\n" );
225 fprintf ( stderr
, " -p <int> n_cpu: number of cpu for use, [8]\n" );
226 fprintf ( stderr
, " -a <int> initMemoryAssumption: memory assumption initialized to avoid further reallocation, unit GB, [0]\n" );
227 fprintf ( stderr
, " -R (optional) output extra information for resolving repeats in contig step, [NO]\n" );
228 fprintf ( stderr
, " -d <int> KmerFreqCutoff: kmers with frequency no larger than KmerFreqCutoff will be deleted, [0]\n" );