modified: makefile
[GalaxyCodeBases.git] / BGI / SOAPdenovo2 / standardPregraph / read2scaf.c
blob00e98279e0fa99cf2974cbd73d8635189566676d
1 /*
2 * read2scaf.c
4 * Copyright (c) 2008-2012 BGI-Shenzhen <soap at genomics dot org dot cn>.
6 * This file is part of SOAPdenovo.
8 * SOAPdenovo is free software: you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation, either version 3 of the License, or
11 * (at your option) any later version.
13 * SOAPdenovo is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
18 * You should have received a copy of the GNU General Public License
19 * along with SOAPdenovo. If not, see <http://www.gnu.org/licenses/>.
23 #include "stdinc.h"
24 #include "newhash.h"
25 #include "kmerhash.h"
26 #include "extfunc.h"
27 #include "extvab.h"
29 static int Ncounter;
30 static int allGaps;
32 // for multi threads
33 static int scafBufSize = 100;
34 static STACK ** ctgStackBuffer;
35 static int scafCounter;
36 static int scafInBuf;
38 static void convertIndex ()
40 int * length_array = ( int * ) ckalloc ( ( num_ctg + 1 ) * sizeof ( int ) );
41 unsigned int i;
43 for ( i = 1; i <= num_ctg; i++ )
45 length_array[i] = 0;
48 for ( i = 1; i <= num_ctg; i++ )
50 if ( index_array[i] > 0 )
52 length_array[index_array[i]] = i;
56 for ( i = 1; i <= num_ctg; i++ )
58 index_array[i] = length_array[i];
59 } //contig i with new index: index_array[i]
61 free ( ( void * ) length_array );
64 static void reverseStack ( STACK * dStack, STACK * sStack )
66 CTGinSCAF * actg, *ctgPt;
67 emptyStack ( dStack );
69 while ( ( actg = ( CTGinSCAF * ) stackPop ( sStack ) ) != NULL )
71 ctgPt = ( CTGinSCAF * ) stackPush ( dStack );
72 ctgPt->ctgID = actg->ctgID;
73 ctgPt->start = actg->start;
74 ctgPt->end = actg->end;
77 stackBackup ( dStack );
80 static void initStackBuf ( STACK ** ctgStackBuffer, int scafBufSize )
82 int i;
84 for ( i = 0; i < scafBufSize; i++ )
86 ctgStackBuffer[i] = ( STACK * ) createStack ( 100, sizeof ( CTGinSCAF ) );
89 static void freeStackBuf ( STACK ** ctgStackBuffer, int scafBufSize )
91 int i;
93 for ( i = 0; i < scafBufSize; i++ )
95 freeStack ( ctgStackBuffer[i] );
99 static void mapCtg2Scaf ( int scafInBuf )
101 int i, scafID;
102 CTGinSCAF * actg;
103 STACK * ctgsStack;
104 unsigned int ctg, bal_ctg;
106 for ( i = 0; i < scafInBuf; i++ )
108 scafID = scafCounter + i + 1;
109 ctgsStack = ctgStackBuffer[i];
111 while ( ( actg = stackPop ( ctgsStack ) ) != NULL )
113 ctg = actg->ctgID;
114 bal_ctg = getTwinCtg ( ctg );
116 if ( contig_array[ctg].from_vt != 0 )
118 contig_array[ctg].multi = 1;
119 contig_array[bal_ctg].multi = 1;
120 continue;
123 contig_array[ctg].from_vt = scafID;
124 contig_array[ctg].to_vt = actg->start;
125 contig_array[ctg].flag = 0; //ctg and scaf on the same strand
126 contig_array[bal_ctg].from_vt = scafID;
127 contig_array[bal_ctg].to_vt = actg->start;
128 contig_array[bal_ctg].flag = 1;
133 static void locateContigOnscaff ( char * graphfile )
135 FILE * fp;
136 char line[1024];
137 CTGinSCAF * actg;
138 STACK * ctgStack, *aStack;
139 int index = 0, counter, overallLen;
140 int starter, prev_start, gapN, scafLen;
141 unsigned int ctg, prev_ctg = 0;
143 for ( ctg = 1; ctg <= num_ctg; ctg++ )
145 contig_array[ctg].from_vt = 0;
146 contig_array[ctg].multi = 0;
149 ctgStack = ( STACK * ) createStack ( 1000, sizeof ( CTGinSCAF ) );
150 sprintf ( line, "%s.scaf_gap", graphfile );
151 fp = ckopen ( line, "r" );
152 ctgStackBuffer = ( STACK ** ) ckalloc ( scafBufSize * sizeof ( STACK * ) );
153 initStackBuf ( ctgStackBuffer, scafBufSize );
154 Ncounter = scafCounter = scafInBuf = allGaps = 0;
156 while ( fgets ( line, sizeof ( line ), fp ) != NULL )
158 if ( line[0] == '>' )
160 if ( index )
162 aStack = ctgStackBuffer[scafInBuf++];
163 reverseStack ( aStack, ctgStack );
165 if ( scafInBuf == scafBufSize )
167 mapCtg2Scaf ( scafInBuf );
168 scafCounter += scafInBuf;
169 scafInBuf = 0;
172 if ( index % 1000 == 0 )
174 fprintf ( stderr, "Processed %d scaffolds.\n", index );
178 //read next scaff
179 scafLen = prev_ctg = 0;
180 emptyStack ( ctgStack );
181 sscanf ( line + 9, "%d %d %d", &index, &counter, &overallLen );
182 //fprintf(stderr,">%d\n",index);
183 continue;
186 if ( line[0] == 'G' ) // gap appears
188 continue;
191 if ( line[0] >= '0' && line[0] <= '9' ) // a contig line
193 sscanf ( line, "%d %d", &ctg, &starter );
194 actg = ( CTGinSCAF * ) stackPush ( ctgStack );
195 actg->ctgID = ctg;
197 if ( !prev_ctg )
199 actg->start = scafLen;
200 actg->end = actg->start + overlaplen + contig_array[ctg].length - 1;
202 else
204 gapN = starter - prev_start - ( int ) contig_array[prev_ctg].length;
205 gapN = gapN < 1 ? 1 : gapN;
206 actg->start = scafLen + gapN;
207 actg->end = actg->start + contig_array[ctg].length - 1;
210 //fprintf(stderr,"%d\t%d\n",actg->start,actg->end);
211 scafLen = actg->end + 1;
212 prev_ctg = ctg;
213 prev_start = starter;
217 if ( index )
219 aStack = ctgStackBuffer[scafInBuf++];
220 reverseStack ( aStack, ctgStack );
221 mapCtg2Scaf ( scafInBuf );
224 gapN = 0;
226 for ( ctg = 1; ctg <= num_ctg; ctg++ )
228 if ( contig_array[ctg].from_vt == 0 || contig_array[ctg].multi == 1 )
230 continue;
233 gapN++;
236 fprintf ( stderr, "\nDone with %d scaffolds, %d contigs in Scaffolld\n", index, gapN );
238 if(readSeqInGap)
239 freeDarray(readSeqInGap);
241 fclose ( fp );
242 freeStack ( ctgStack );
243 freeStackBuf ( ctgStackBuffer, scafBufSize );
244 free ( ( void * ) ctgStackBuffer );
247 static boolean contigElligible ( unsigned int contigno )
249 unsigned int ctg = index_array[contigno];
251 if ( contig_array[ctg].from_vt == 0 || contig_array[ctg].multi == 1 )
253 return 0;
255 else
257 return 1;
260 static void output1read ( FILE * fo, long long readno, unsigned int contigno, int pos )
262 unsigned int ctg = index_array[contigno];
263 int posOnScaf;
264 char orien;
265 pos = pos < 0 ? 0 : pos;
267 if ( contig_array[ctg].flag == 0 )
269 posOnScaf = contig_array[ctg].to_vt + pos - overlaplen;
270 orien = '+';
272 else
274 posOnScaf = contig_array[ctg].to_vt + contig_array[ctg].length - pos;
275 orien = '-';
279 if(readno==676)
280 printf("Read %lld in region from %d, extend %d, pos %d, orien %c\n",
281 readno,contig_array[ctg].to_vt,contig_array[ctg].length,posOnScaf,orien);
283 fprintf ( fo, "%lld\t%d\t%d\t%c\n", readno, contig_array[ctg].from_vt, posOnScaf, orien );
286 void locateReadOnScaf ( char * graphfile )
288 char name[1024], line[1024];
289 FILE * fp, *fo;
290 long long readno, counter = 0, pre_readno = 0;
291 unsigned int contigno, pre_contigno;
292 int pre_pos, pos;
293 locateContigOnscaff ( graphfile );
294 sprintf ( name, "%s.readOnContig", graphfile );
295 fp = ckopen ( name, "r" );
296 sprintf ( name, "%s.readOnScaf", graphfile );
297 fo = ckopen ( name, "w" );
299 if ( !orig2new )
301 convertIndex ();
302 orig2new = 1;
305 fgets ( line, 1024, fp );
307 while ( fgets ( line, 1024, fp ) != NULL )
309 sscanf ( line, "%lld %d %d", &readno, &contigno, &pos );
311 if ( ( readno % 2 == 0 ) && ( pre_readno == readno - 1 ) // they are a pair of reads
312 && contigElligible ( pre_contigno ) && contigElligible ( contigno ) )
314 output1read ( fo, pre_readno, pre_contigno, pre_pos );
315 output1read ( fo, readno, contigno, pos );
316 counter++;
319 pre_readno = readno;
320 pre_contigno = contigno;
321 pre_pos = pos;
324 fprintf ( stderr, "%lld pair(s) on contig\n", counter );
325 fclose ( fp );
326 fclose ( fo );