modified: makefile
[GalaxyCodeBases.git] / BGI / SOAPdenovo2 / standardPregraph / map.c
blobc762ec3e0d34677091199e1c6c20a69e69b7a2bc
1 /*
2 * map.c
4 * Copyright (c) 2008-2012 BGI-Shenzhen <soap at genomics dot org dot cn>.
6 * This file is part of SOAPdenovo.
8 * SOAPdenovo is free software: you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation, either version 3 of the License, or
11 * (at your option) any later version.
13 * SOAPdenovo is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
18 * You should have received a copy of the GNU General Public License
19 * along with SOAPdenovo. If not, see <http://www.gnu.org/licenses/>.
23 #include "stdinc.h"
24 #include "newhash.h"
25 #include "kmerhash.h"
26 #include "extfunc.h"
27 #include "extvab.h"
29 static void initenv ( int argc, char ** argv );
30 static char shortrdsfile[256];
31 static char graphfile[256];
33 static void display_map_usage ();
35 /*************************************************
36 Function:
37 getMinOverlap
38 Description:
39 Parses file *.preGraphBasic, and get vertex number,
40 K size, MaxReadLen, MinReadLen, MaxNameLen.
41 Input:
42 1. gfile the graph file prefix
43 Output:
44 None.
45 Return:
46 The kmer size.
47 *************************************************/
48 static int getMinOverlap ( char * gfile )
50 char name[256], ch;
51 FILE * fp;
52 int num_kmer, overlaplen = 23;
53 char line[1024];
54 sprintf ( name, "%s.preGraphBasic", gfile );
55 fp = fopen ( name, "r" );
57 if ( !fp )
59 return overlaplen;
62 while ( fgets ( line, sizeof ( line ), fp ) != NULL )
64 if ( line[0] == 'V' )
66 sscanf ( line + 6, "%d %c %d", &num_kmer, &ch, &overlaplen );
68 else if ( line[0] == 'M' )
70 sscanf ( line, "MaxReadLen %d MinReadLen %d MaxNameLen %d", &maxReadLen, &minReadLen, &maxNameLen );
74 fclose ( fp );
75 return overlaplen;
78 /*************************************************
79 Function:
80 call_align
81 Description:
82 This is the main function for map step. It includes the following steps:
83 1.Chops contig (>=k+2) into kmer sets, marks the kmer occured twice or more as deleted,
84 records the unique kmers related contig's id and the position of the kmer on the contig.
85 2.Maps reads to edges.
86 Input:
87 @see display_map_usage()
88 Output:
89 Below files:
90 1. *.readOnCtg.gz
91 2. *.readInGap.gz
92 Return:
93 0 if exits normally.
94 *************************************************/
96 int call_align ( int argc, char ** argv )
98 time_t start_t, stop_t, time_bef, time_aft;
99 time ( &start_t );
100 fprintf ( stderr, "\n********************\n" );
101 fprintf ( stderr, "Map\n" );
102 fprintf ( stderr, "********************\n\n" );
103 initenv ( argc, argv );
104 overlaplen = getMinOverlap ( graphfile );
105 #ifdef MER127
107 if ( smallKmer > 12 && smallKmer < 128 && smallKmer % 2 == 1 )
109 deltaKmer = overlaplen - smallKmer;
110 overlaplen = smallKmer;
113 #else
115 if ( smallKmer > 12 && smallKmer < 64 && smallKmer % 2 == 1 )
117 deltaKmer = overlaplen - smallKmer;
118 overlaplen = smallKmer;
121 #endif
122 fprintf ( stderr, "Kmer size: %d.\n", overlaplen );
123 time ( &time_bef );
124 ctg_short = overlaplen + 2;
125 fprintf ( stderr, "Contig length cutoff: %d.\n", ctg_short );
126 prlContig2nodes ( graphfile, ctg_short );
127 time ( &time_aft );
128 fprintf ( stderr, "Time spent on graph construction: %ds.\n\n", ( int ) ( time_aft - time_bef ) );
129 //map long read (asm_flags=4) to edge one by one
130 time ( &time_bef );
131 prlLongRead2Ctg ( shortrdsfile, graphfile );
132 time ( &time_aft );
133 fprintf ( stderr, "Time spent on aligning long reads: %ds.\n\n", ( int ) ( time_aft - time_bef ) );
134 //map read to edge one by one
135 time ( &time_bef );
136 prlRead2Ctg ( shortrdsfile, graphfile );
137 time ( &time_aft );
138 fprintf ( stderr, "Time spent on aligning reads: %ds.\n\n", ( int ) ( time_aft - time_bef ) );
139 free_Sets ( KmerSets, thrd_num );
140 time ( &stop_t );
141 fprintf ( stderr, "Overall time spent on alignment: %dm.\n\n", ( int ) ( stop_t - start_t ) / 60 );
142 return 0;
148 /*****************************************************************************
149 * Parse command line switches
150 *****************************************************************************/
152 void initenv ( int argc, char ** argv )
154 int copt;
155 int inpseq, outseq;
156 extern char * optarg;
157 char temp[100];
158 optind = 1;
159 inpseq = outseq = 0;
160 fprintf ( stderr, "Parameters: map " );
162 while ( ( copt = getopt ( argc, argv, "s:g:K:p:k:f" ) ) != EOF )
164 //printf("get option\n");
165 switch ( copt )
167 case 's':
168 fprintf ( stderr, "-s %s ", optarg );
169 inpseq = 1;
170 sscanf ( optarg, "%s", shortrdsfile );
171 break;
172 case 'g':
173 fprintf ( stderr, "-g %s ", optarg );
174 outseq = 1;
175 sscanf ( optarg, "%s", graphfile );
176 break;
177 case 'K':
178 fprintf ( stderr, "-K %s ", optarg );
179 sscanf ( optarg, "%s", temp );
180 overlaplen = atoi ( temp );
181 break;
182 case 'p':
183 fprintf ( stderr, "-p %s ", optarg );
184 sscanf ( optarg, "%s", temp );
185 thrd_num = atoi ( temp );
186 break;
187 case 'k':
188 fprintf ( stderr, "-k %s ", optarg );
189 sscanf ( optarg, "%s", temp );
190 smallKmer = atoi ( temp );
191 break;
192 case 'f':
193 fill = 1;
194 fprintf ( stderr, "-f " );
195 break;
196 default:
198 if ( inpseq == 0 || outseq == 0 )
200 display_map_usage ();
201 exit ( 1 );
206 fprintf ( stderr, "\n\n" );
208 if ( inpseq == 0 || outseq == 0 )
210 display_map_usage ();
211 exit ( 1 );
215 static void display_map_usage ()
217 fprintf ( stderr, "\nmap -s configFile -g inputGraph [-f] [-p n_cpu -k kmer_R2C]\n" );
218 fprintf ( stderr, " -s <string> configFile: the config file of solexa reads\n" );
219 fprintf ( stderr, " -g <string> inputGraph: prefix of input graph file names\n" );
220 fprintf ( stderr, " -f (optional) output gap related reads in map step for using SRkgf to fill gap, [NO]\n" );
221 fprintf ( stderr, " -p <int> n_cpu: number of cpu for use, [8]\n" );
222 #ifdef MER127
223 fprintf ( stderr, " -k <int> kmer_R2C(min 13, max 127): kmer size used for mapping read to contig, [K]\n" );
224 #else
225 fprintf ( stderr, " -k <int> kmer_R2C(min 13, max 63): kmer size used for mapping read to contig, [K]\n" );
226 #endif