limit fstBC to 30bp in Python3 ver.
[GalaxyCodeBases.git] / BGI / SOAPdenovo2 / standardPregraph / pregraph.c
blobc4151b3f1a82ea5b52c04c721e7301f57b268951
1 /*
2 * pregraph.c
4 * Copyright (c) 2008-2012 BGI-Shenzhen <soap at genomics dot org dot cn>.
6 * This file is part of SOAPdenovo.
8 * SOAPdenovo is free software: you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation, either version 3 of the License, or
11 * (at your option) any later version.
13 * SOAPdenovo is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
18 * You should have received a copy of the GNU General Public License
19 * along with SOAPdenovo. If not, see <http://www.gnu.org/licenses/>.
23 #include "stdinc.h"
24 #include "newhash.h"
25 #include "kmerhash.h"
26 #include "extfunc.h"
27 #include "extvab.h"
/* Reads config file name, set by the -s option. */
static char shortrdsfile[256];
/* Output prefix name, set by the -o option. */
static char graphfile[256];
/* Whether to remove single tips, i.e. tips starting from a kmer whose coverage is 1. */
static int cutTips = 1;

/* Parses the pregraph command line into the settings above and the shared globals. */
static void initenv ( int argc, char ** argv );
/* Prints the pregraph usage text to stderr; "( void )" makes this a real prototype. */
static void display_pregraph_usage ( void );
38 /*************************************************
39 Function:
40 call_pregraph
41 Description:
42 The main function for pregraph step . its processes are as below:
43 1. Builds the kmer hash sets and remove the low coverage kmers.
44 2. Removes the tips which length are no greater than 2*K.
45 3. Builds edges by combining linear kmers.
46 4. Maps the reads back to edges and build preArcs (the connection between edges).
47 Input:
48 @see display_pregraph_usage ()
49 Output:
50 Below files:
51 *.kmerFreq
52 *.edge.gz
53 *.vertex
54 *.preArc
55 *.preGraphBasic
56 *.markOnEdge (optional)
57 *.path (optional)
58 Return:
59 Zero always
60 *************************************************/
62 int call_pregraph ( int argc, char ** argv )
64 time_t start_t, stop_t, time_bef, time_aft;
65 time ( &start_t );
66 fprintf ( stderr, "\n********************\n" );
67 fprintf ( stderr, "Pregraph\n" );
68 fprintf ( stderr, "********************\n\n" );
69 initenv ( argc, argv );
71 if ( overlaplen % 2 == 0 )
73 overlaplen++;
74 fprintf ( stderr, "K should be an odd number.\n" );
77 if ( overlaplen < 13 )
79 overlaplen = 13;
80 fprintf ( stderr, "K should not be less than 13.\n" );
83 #ifdef MER127
84 else if ( overlaplen > 127 )
86 overlaplen = 127;
87 fprintf ( stderr, "K should not be greater than 127.\n" );
90 #else
91 else if ( overlaplen > 63 )
93 overlaplen = 63;
94 fprintf ( stderr, "K should not be greater than 63.\n" );
97 #endif
98 time ( &time_bef );
99 prlRead2HashTable ( shortrdsfile, graphfile );
100 time ( &time_aft );
101 fprintf ( stderr, "Time spent on pre-graph construction: %ds.\n\n", ( int ) ( time_aft - time_bef ) );
102 // printf ("deLowKmer %d, deLowEdge %d\n", deLowKmer, deLowEdge);
103 // fprintf (stderr,"DeLowKmer %d\n", deLowKmer);
105 //analyzeTips(hash_table, graphfile);
106 if ( !deLowKmer && cutTips )
108 time ( &time_bef );
109 removeSingleTips ();
110 removeMinorTips ();
111 time ( &time_aft );
112 fprintf ( stderr, "Time spent on removing tips: %ds.\n\n", ( int ) ( time_aft - time_bef ) );
114 else
116 time ( &time_bef );
117 removeMinorTips ();
118 time ( &time_aft );
119 fprintf ( stderr, "Time spent on removing tips: %ds.\n\n", ( int ) ( time_aft - time_bef ) );
122 initKmerSetSize = 0;
123 //combine each linear part to an edge
124 time ( &time_bef );
125 kmer2edges ( graphfile );
126 time ( &time_aft );
127 fprintf ( stderr, "Time spent on constructing edges: %ds.\n\n", ( int ) ( time_aft - time_bef ) );
128 //map read to edge one by one
129 time ( &time_bef );
130 prlRead2edge ( shortrdsfile, graphfile );
131 time ( &time_aft );
132 fprintf ( stderr, "Time spent on aligning reads: %ds.\n\n", ( int ) ( time_aft - time_bef ) );
133 output_vertex ( graphfile );
134 free_Sets ( KmerSets, thrd_num );
135 free_Sets ( KmerSetsPatch, thrd_num );
136 time ( &stop_t );
137 fprintf ( stderr, "Overall time spent on constructing pre-graph: %dm.\n\n", ( int ) ( stop_t - start_t ) / 60 );
138 return 0;
142 void initenv ( int argc, char ** argv )
144 int copt;
145 int inpseq, outseq;
146 extern char * optarg;
147 char temp[100];
148 optind = 1;
149 inpseq = outseq = 0;
150 fprintf ( stderr, "Parameters: pregraph " );
152 while ( ( copt = getopt ( argc, argv, "a:s:o:K:p:d:R" ) ) != EOF )
154 //printf("get option\n");
155 switch ( copt )
157 case 's':
158 fprintf ( stderr, "-s %s ", optarg );
159 inpseq = 1;
160 sscanf ( optarg, "%s", shortrdsfile );
161 break;
162 case 'o':
163 fprintf ( stderr, "-o %s ", optarg );
164 outseq = 1;
165 sscanf ( optarg, "%s", graphfile );
166 break;
167 case 'K':
168 fprintf ( stderr, "-K %s ", optarg );
169 sscanf ( optarg, "%s", temp );
170 overlaplen = atoi ( temp );
171 break;
172 case 'p':
173 fprintf ( stderr, "-p %s ", optarg );
174 sscanf ( optarg, "%s", temp );
175 thrd_num = atoi ( temp );
176 break;
177 case 'R':
178 repsTie = 1;
179 fprintf ( stderr, "-R " );
180 break;
181 case 'd':
182 fprintf ( stderr, "-d %s ", optarg );
183 sscanf ( optarg, "%s", temp );
184 deLowKmer = atoi ( temp ) >= 0 ? atoi ( temp ) : 0;
185 break;
187 case 'D':
188 deLowEdge = 1;
189 break;
191 case 'a':
192 fprintf ( stderr, "-a %s ", optarg );
193 initKmerSetSize = atoi ( optarg );
194 break;
195 default:
197 if ( inpseq == 0 || outseq == 0 )
199 display_pregraph_usage ();
200 exit ( -1 );
205 fprintf ( stderr, "\n\n" );
207 if ( inpseq == 0 || outseq == 0 )
209 //printf("need more\n");
210 display_pregraph_usage ();
211 exit ( -1 );
/*************************************************
 Function:
    display_pregraph_usage
 Description:
    Prints the pregraph command-line synopsis, followed by one line per
    option, to stderr. The documented maximum kmer size depends on whether
    MER127 is defined at compile time.
 Input:
    None.
 Output:
    None.
 Return:
    None.
 *************************************************/
static void display_pregraph_usage ( void )
{
    /* The texts are constant, so fputs is used and no format string is parsed. */
    fputs ( "\npregraph -s configFile -o outputGraph [-R] [-K kmer -p n_cpu -a initMemoryAssumption -d KmerFreqCutoff]\n", stderr );
    fputs ( " -s <string> configFile: the config file of solexa reads\n", stderr );
    fputs ( " -o <string> outputGraph: prefix of output graph file name\n", stderr );
#ifdef MER127
    fputs ( " -K <int> kmer(min 13, max 127): kmer size, [23]\n", stderr );
#else
    fputs ( " -K <int> kmer(min 13, max 63): kmer size, [23]\n", stderr );
#endif
    fputs ( " -p <int> n_cpu: number of cpu for use, [8]\n", stderr );
    fputs ( " -a <int> initMemoryAssumption: memory assumption initialized to avoid further reallocation, unit GB, [0]\n", stderr );
    fputs ( " -R (optional) output extra information for resolving repeats in contig step, [NO]\n", stderr );
    fputs ( " -d <int> KmerFreqCutoff: kmers with frequency no larger than KmerFreqCutoff will be deleted, [0]\n", stderr );
}