modified: nfig1.py
[GalaxyCodeBases.git] / BGI / SOAPdenovo2 / standardPregraph / linearEdge.c
blob95772e9b66f9f8e3271ebf033440a79994d5e694
1 /*
2 * linearEdge.c
4 * Copyright (c) 2008-2012 BGI-Shenzhen <soap at genomics dot org dot cn>.
6 * This file is part of SOAPdenovo.
8 * SOAPdenovo is free software: you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation, either version 3 of the License, or
11 * (at your option) any later version.
13 * SOAPdenovo is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
18 * You should have received a copy of the GNU General Public License
19 * along with SOAPdenovo. If not, see <http://www.gnu.org/licenses/>.
23 #include "stdinc.h"
24 #include "newhash.h"
25 #include "kmerhash.h"
26 #include "extfunc.h"
27 #include "extvab.h"
29 #ifdef MER127
30 //Get the char in Kmer.
31 char getCharInKmer ( Kmer kmer, int pos )
33 if ( 2 * pos < 64 )
35 kmer.low2 >>= 2 * pos;
36 return kmer.low2 & 3;
38 else if ( 2 * pos < 128 )
40 kmer.high2 >>= 2 * pos - 64;
41 return kmer.high2 & 3;
43 else if ( 2 * pos < 192 )
45 kmer.low1 >>= 2 * pos - 128;
46 return kmer.low1 & 3;
48 else
50 kmer.high1 >>= 2 * pos - 192;
51 return kmer.high1 & 3;
54 #else
55 char getCharInKmer ( Kmer kmer, int pos )
57 if ( 2 * pos < 64 )
59 kmer.low >>= 2 * pos;
60 return kmer.low & 3;
62 else
64 kmer.high >>= 2 * pos - 64;
65 return kmer.high & 3;
68 #endif
70 /*************************************************
71 Function:
72 copyinter
73 Description:
74 Coyies the interger kmer to char seq.
75 Input:
76 1. targetS: the target seq
77 2. sourceS: the source kmer
78 3. pos: the pos at kmer
79 4. length: the length of sequence to copy
80 Output:
81 None.
82 Return:
83 None.
84 *************************************************/
85 void copyinter ( char * targetS, Kmer sourceS, int pos, int length )
87 char ch;
88 int i, index;
89 index = pos;
91 for ( i = 0; i < length; ++i )
93 ch = getCharInKmer ( sourceS, step - i - 1 );
94 writeChar2tightString ( ch, targetS, index++ );
98 /*************************************************
99 Function:
100 copySeq2
101 Description:
102 Copies seq from source to target.
103 Input:
104 1. targetS: the target seq
105 2. sourceS: the source seq
106 3. pos: the pos at source
107 4. length: the length of sequence to copy
108 Output:
109 None.
110 Return:
111 None.
112 *************************************************/
113 void copySeq2 ( char * targetS, char * sourceS, int pos, int length )
115 char ch;
116 int i, index;
117 index = pos;
119 for ( i = 0; i < length; ++i )
121 ch = getCharInTightString ( sourceS, i );
122 writeChar2tightString ( ch, targetS, index++ );
126 /*************************************************
127 Function:
128 checkstep
129 Description:
130 Checks the step between two kmer.
131 Input:
132 1. to_vt: the to vertex
133 2. from_vt: the from vertex
134 Output:
135 None.
136 Return:
137 step between two kmer.
138 *************************************************/
139 int checkstep ( unsigned int to_vt, unsigned int from_vt )
141 Kmer to, from;
142 Kmer filtertemp;
143 to = vt_arraynew[to_vt].kmer;
144 from = vt_arraynew[from_vt].kmer;
145 int i = 1;
146 filtertemp = createFilter ( overlaplen - i );
148 if ( KmerEqual ( KmerRightBitMove ( from, i << 1 ), KmerAnd ( to, filtertemp ) ) )
149 { return i; }
151 fprintf ( stderr, "When checking step of two edge, step is not found and step is changed to %d, 'to kmer' is ", step );
152 printKmerSeq ( stderr, to );
153 fprintf ( stderr, " , 'from kmer' is " );
154 printKmerSeq ( stderr, from );
155 fprintf ( stderr, " .\n" );
156 return step;
159 /*************************************************
160 Function:
161 linearUpdateConnection2
162 Description:
163 Updates the arc and coverage information of edges to be merged.
164 Input:
165 1. e1: edge1 index
166 2. e2: edge2 index
167 3. indicate: indicates which edge would remain
168 Output:
169 None.
170 Return:
171 None.
172 *************************************************/
173 void linearUpdateConnection2 ( unsigned int e1, unsigned int e2, int indicate )
175 unsigned int bal_ed;
176 ARC * parc;
178 //caution: length and seq
179 if ( !indicate )
181 // edge_array[e1].to_vt = edge_array[e2].to_vt;
182 bal_ed = getTwinEdge ( e1 );
183 parc = edge_array[e2].arcs;
185 while ( parc )
187 parc->bal_arc->to_ed = bal_ed;
188 parc = parc->next;
191 edge_array[e1].arcs = edge_array[e2].arcs;
192 edge_array[e2].arcs = NULL;
194 if ( edge_array[e1].length < 0 || edge_array[e2].length < 0 )
195 { fprintf ( stderr, "Error: length < 0.\n" ); }
197 if ( ( edge_array[e1].length || edge_array[e2].length ) && ( edge_array[e1].length + edge_array[e2].length + step > 0 ) )
198 edge_array[e1].cvg = ( edge_array[e1].cvg * ( edge_array[e1].length + step )
199 + edge_array[e2].cvg * ( edge_array[e2].length + step ) )
200 / ( edge_array[e1].length + edge_array[e2].length + step );
202 edge_array[e2].deleted = 1;
204 else
206 //all the arcs pointing to e1 switched to e2
207 parc = edge_array[getTwinEdge ( e1 )].arcs;
209 while ( parc )
211 parc->bal_arc->to_ed = e2;
212 parc = parc->next;
215 edge_array[e1].arcs = NULL;
217 // edge_array[e2].from_vt = edge_array[e1].from_vt;
218 if ( edge_array[e1].length < 0 || edge_array[e2].length < 0 )
219 { fprintf ( stderr, "Error: length < 0.\n" ); }
221 if ( ( edge_array[e1].length || edge_array[e2].length ) && ( edge_array[e1].length + edge_array[e2].length + step > 0 ) )
222 edge_array[e2].cvg = ( edge_array[e1].cvg * ( edge_array[e1].length + step )
223 + edge_array[e2].cvg * ( edge_array[e2].length + step ) )
224 / ( edge_array[e1].length + edge_array[e2].length + step );
226 edge_array[e1].deleted = 1;
230 /*************************************************
231 Function:
232 allpathUpdateEdge2
233 Description:
234 Update graph topology.
235 Input:
236 1. e1: edge1 index
237 2. e2: edge2 index
238 3. indicate: indicates which edge would remain.
239 4. last: whether it's the last iterate.
240 Output:
241 None.
242 Return:
243 None.
244 *************************************************/
245 void allpathUpdateEdge2 ( unsigned int e1, unsigned int e2, int indicate, boolean last )
247 int tightLen;
248 char * tightSeq = NULL;
249 int tempstep = 0;
251 //caution: length and seq
252 if ( edge_array[e1].cvg == 0 )
253 { edge_array[e1].cvg = edge_array[e2].cvg; }
255 if ( edge_array[e2].cvg == 0 )
256 { edge_array[e2].cvg = edge_array[e1].cvg; }
258 unsigned int cvgsum =
259 edge_array[e1].cvg * ( edge_array[e1].length + step )
260 + edge_array[e2].cvg * ( edge_array[e2].length + step );
261 tightLen = edge_array[e1].length + edge_array[e2].length + step;
263 if ( tightLen )
264 { tightSeq = ( char * ) ckalloc ( ( tightLen / 4 + 1 ) * sizeof ( char ) ); }
266 tightLen = 0;
268 if ( edge_array[e1].length )
270 copySeq2 ( tightSeq, edge_array[e1].seq, 0, edge_array[e1].length );
271 tightLen = edge_array[e1].length;
273 if ( edge_array[e1].seq )
275 free ( ( void * ) edge_array[e1].seq );
276 edge_array[e1].seq = NULL;
278 else
279 { fprintf ( stderr, "AllpathUpdateEdge: edge %d with length %d, but without seq.\n", e1, edge_array[e1].length ); }
283 if ( step > 0 )
285 tempstep = checkstep ( edge_array[e1].to_vt, edge_array[e2].from_vt );
286 copyinter ( tightSeq, vt_arraynew[edge_array[e2].from_vt].kmer, tightLen, tempstep );
287 tightLen += tempstep;
291 if ( edge_array[e2].length )
293 copySeq2 ( tightSeq, edge_array[e2].seq, tightLen, edge_array[e2].length );
294 tightLen += edge_array[e2].length;
296 if ( edge_array[e2].seq )
298 free ( ( void * ) edge_array[e2].seq );
299 edge_array[e2].seq = NULL;
301 else
302 { fprintf ( stderr, "AllpathUpdateEdge: edge %d with length %d, but without seq.\n", e2, edge_array[e2].length ); }
305 //edge_array[e2].extend_len = tightLen-edge_array[e2].length;
306 //the sequence of e1 is to be updated
307 if ( !indicate )
309 edge_array[e2].length = 0; //e2 is removed from the graph
310 edge_array[e1].to_vt = edge_array[e2].to_vt; //e2 is part of e1 now
311 edge_array[e1].length = tightLen;
312 edge_array[e1].seq = tightSeq;
314 if ( tightLen )
315 { edge_array[e1].cvg = cvgsum / tightLen; }
317 if ( last )
318 { edge_array[e1].cvg = edge_array[e1].cvg > 0 ? edge_array[e1].cvg : 1; }
320 else
322 edge_array[e1].length = 0; //e1 is removed from the graph
323 edge_array[e2].from_vt = edge_array[e1].from_vt; //e1 is part of e2 now
324 edge_array[e2].length = tightLen;
325 edge_array[e2].seq = tightSeq;
327 if ( tightLen )
328 { edge_array[e2].cvg = cvgsum / tightLen; }
330 if ( last )
331 { edge_array[e2].cvg = edge_array[e2].cvg > 0 ? edge_array[e2].cvg : 1; }
335 static void debugging ( unsigned int i )
337 ARC * parc;
338 parc = edge_array[i].arcs;
340 if ( !parc )
341 { fprintf ( stderr, "No downward connection for %d.\n", i ); }
343 while ( parc )
345 fprintf ( stderr, "%d -> %d\n", i, parc->to_ed );
346 parc = parc->next;
351 /*************************************************
352 Function:
353 linearConcatenate2
354 Description:
355 Concatenates two edges if they are linearly linked.
356 Input:
357 1. last: whether it's the last iterations.
358 Output:
359 None.
360 Return:
361 None.
362 *************************************************/
363 void linearConcatenate2 ( boolean last )
365 unsigned int i;
366 int conc_c = 1;
367 int counter;
368 unsigned int from_ed, to_ed, bal_ed;
369 ARC * parc, *parc2;
370 unsigned int bal_fe;
371 ARC * temp;
372 int donot1 = 0;
373 int round = 1;
375 while ( conc_c )
377 conc_c = 0;
378 counter = 0;
379 donot1 = 0;
381 for ( i = 1; i <= num_ed; i++ )
383 if ( edge_array[i].deleted || EdSameAsTwin ( i ) )
384 { continue; }
386 if ( edge_array[i].length > 0 )
387 { counter++; }
389 parc = edge_array[i].arcs;
391 if ( !parc || parc->next )
392 { continue; }
394 to_ed = parc->to_ed;
395 bal_ed = getTwinEdge ( to_ed );
396 parc2 = edge_array[bal_ed].arcs;
398 if ( bal_ed == to_ed || !parc2 || parc2->next )
399 { continue; }
401 from_ed = i;
403 if ( from_ed == to_ed || from_ed == bal_ed )
404 { continue; }
406 //linear connection found
407 if ( parc->multiplicity <= arcfilter )
409 donot1++;
410 continue;
413 conc_c++;
414 bal_fe = getTwinEdge ( from_ed );
415 linearUpdateConnection2 ( from_ed, to_ed, 0 );
416 allpathUpdateEdge2 ( from_ed, to_ed, 0, last );
417 linearUpdateConnection2 ( bal_ed, bal_fe, 1 );
418 allpathUpdateEdge2 ( bal_ed, bal_fe, 1, last );
421 fprintf ( stderr, "%d edge(s) concatenated in cycle %d.\n", conc_c, round++ );
423 if ( arcfilter )
424 { fprintf ( stderr, "%d edge(s) are not linearized because of arc weight is %d.\n", donot1, arcfilter ); }
427 fprintf ( stderr, "%d edge(s) in the graph.\n", counter );