4 * Copyright (c) 2008-2012 BGI-Shenzhen <soap at genomics dot org dot cn>.
6 * This file is part of SOAPdenovo.
8 * SOAPdenovo is free software: you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation, either version 3 of the License, or
11 * (at your option) any later version.
13 * SOAPdenovo is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
18 * You should have received a copy of the GNU General Public License
19 * along with SOAPdenovo. If not, see <http://www.gnu.org/licenses/>.
30 //Get the char in Kmer.
31 char getCharInKmer ( Kmer kmer
, int pos
)
35 kmer
.low2
>>= 2 * pos
;
38 else if ( 2 * pos
< 128 )
40 kmer
.high2
>>= 2 * pos
- 64;
41 return kmer
.high2
& 3;
43 else if ( 2 * pos
< 192 )
45 kmer
.low1
>>= 2 * pos
- 128;
50 kmer
.high1
>>= 2 * pos
- 192;
51 return kmer
.high1
& 3;
55 char getCharInKmer ( Kmer kmer
, int pos
)
64 kmer
.high
>>= 2 * pos
- 64;
70 /*************************************************
74 Coyies the interger kmer to char seq.
76 1. targetS: the target seq
77 2. sourceS: the source kmer
78 3. pos: the pos at kmer
79 4. length: the length of sequence to copy
84 *************************************************/
85 void copyinter ( char * targetS
, Kmer sourceS
, int pos
, int length
)
91 for ( i
= 0; i
< length
; ++i
)
93 ch
= getCharInKmer ( sourceS
, step
- i
- 1 );
94 writeChar2tightString ( ch
, targetS
, index
++ );
98 /*************************************************
102 Copies seq from source to target.
104 1. targetS: the target seq
105 2. sourceS: the source seq
106 3. pos: the pos at source
107 4. length: the length of sequence to copy
112 *************************************************/
113 void copySeq2 ( char * targetS
, char * sourceS
, int pos
, int length
)
119 for ( i
= 0; i
< length
; ++i
)
121 ch
= getCharInTightString ( sourceS
, i
);
122 writeChar2tightString ( ch
, targetS
, index
++ );
126 /*************************************************
130 Checks the step between two kmer.
132 1. to_vt: the to vertex
133 2. from_vt: the from vertex
137 step between two kmer.
138 *************************************************/
139 int checkstep ( unsigned int to_vt
, unsigned int from_vt
)
143 to
= vt_arraynew
[to_vt
].kmer
;
144 from
= vt_arraynew
[from_vt
].kmer
;
146 filtertemp
= createFilter ( overlaplen
- i
);
148 if ( KmerEqual ( KmerRightBitMove ( from
, i
<< 1 ), KmerAnd ( to
, filtertemp
) ) )
151 fprintf ( stderr
, "When checking step of two edge, step is not found and step is changed to %d, 'to kmer' is ", step
);
152 printKmerSeq ( stderr
, to
);
153 fprintf ( stderr
, " , 'from kmer' is " );
154 printKmerSeq ( stderr
, from
);
155 fprintf ( stderr
, " .\n" );
159 /*************************************************
161 linearUpdateConnection2
163 Updates the arc and coverage information of edges to be merged.
167 3. indicate: indicates which edge would remain
172 *************************************************/
173 void linearUpdateConnection2 ( unsigned int e1
, unsigned int e2
, int indicate
)
178 //caution: length and seq
181 // edge_array[e1].to_vt = edge_array[e2].to_vt;
182 bal_ed
= getTwinEdge ( e1
);
183 parc
= edge_array
[e2
].arcs
;
187 parc
->bal_arc
->to_ed
= bal_ed
;
191 edge_array
[e1
].arcs
= edge_array
[e2
].arcs
;
192 edge_array
[e2
].arcs
= NULL
;
194 if ( edge_array
[e1
].length
< 0 || edge_array
[e2
].length
< 0 )
195 { fprintf ( stderr
, "Error: length < 0.\n" ); }
197 if ( ( edge_array
[e1
].length
|| edge_array
[e2
].length
) && ( edge_array
[e1
].length
+ edge_array
[e2
].length
+ step
> 0 ) )
198 edge_array
[e1
].cvg
= ( edge_array
[e1
].cvg
* ( edge_array
[e1
].length
+ step
)
199 + edge_array
[e2
].cvg
* ( edge_array
[e2
].length
+ step
) )
200 / ( edge_array
[e1
].length
+ edge_array
[e2
].length
+ step
);
202 edge_array
[e2
].deleted
= 1;
206 //all the arcs pointing to e1 switched to e2
207 parc
= edge_array
[getTwinEdge ( e1
)].arcs
;
211 parc
->bal_arc
->to_ed
= e2
;
215 edge_array
[e1
].arcs
= NULL
;
217 // edge_array[e2].from_vt = edge_array[e1].from_vt;
218 if ( edge_array
[e1
].length
< 0 || edge_array
[e2
].length
< 0 )
219 { fprintf ( stderr
, "Error: length < 0.\n" ); }
221 if ( ( edge_array
[e1
].length
|| edge_array
[e2
].length
) && ( edge_array
[e1
].length
+ edge_array
[e2
].length
+ step
> 0 ) )
222 edge_array
[e2
].cvg
= ( edge_array
[e1
].cvg
* ( edge_array
[e1
].length
+ step
)
223 + edge_array
[e2
].cvg
* ( edge_array
[e2
].length
+ step
) )
224 / ( edge_array
[e1
].length
+ edge_array
[e2
].length
+ step
);
226 edge_array
[e1
].deleted
= 1;
230 /*************************************************
234 Update graph topology.
238 3. indicate: indicates which edge would remain.
239 4. last: whether it's the last iterate.
244 *************************************************/
245 void allpathUpdateEdge2 ( unsigned int e1
, unsigned int e2
, int indicate
, boolean last
)
248 char * tightSeq
= NULL
;
251 //caution: length and seq
252 if ( edge_array
[e1
].cvg
== 0 )
253 { edge_array
[e1
].cvg
= edge_array
[e2
].cvg
; }
255 if ( edge_array
[e2
].cvg
== 0 )
256 { edge_array
[e2
].cvg
= edge_array
[e1
].cvg
; }
258 unsigned int cvgsum
=
259 edge_array
[e1
].cvg
* ( edge_array
[e1
].length
+ step
)
260 + edge_array
[e2
].cvg
* ( edge_array
[e2
].length
+ step
);
261 tightLen
= edge_array
[e1
].length
+ edge_array
[e2
].length
+ step
;
264 { tightSeq
= ( char * ) ckalloc ( ( tightLen
/ 4 + 1 ) * sizeof ( char ) ); }
268 if ( edge_array
[e1
].length
)
270 copySeq2 ( tightSeq
, edge_array
[e1
].seq
, 0, edge_array
[e1
].length
);
271 tightLen
= edge_array
[e1
].length
;
273 if ( edge_array
[e1
].seq
)
275 free ( ( void * ) edge_array
[e1
].seq
);
276 edge_array
[e1
].seq
= NULL
;
279 { fprintf ( stderr
, "AllpathUpdateEdge: edge %d with length %d, but without seq.\n", e1
, edge_array
[e1
].length
); }
285 tempstep
= checkstep ( edge_array
[e1
].to_vt
, edge_array
[e2
].from_vt
);
286 copyinter ( tightSeq
, vt_arraynew
[edge_array
[e2
].from_vt
].kmer
, tightLen
, tempstep
);
287 tightLen
+= tempstep
;
291 if ( edge_array
[e2
].length
)
293 copySeq2 ( tightSeq
, edge_array
[e2
].seq
, tightLen
, edge_array
[e2
].length
);
294 tightLen
+= edge_array
[e2
].length
;
296 if ( edge_array
[e2
].seq
)
298 free ( ( void * ) edge_array
[e2
].seq
);
299 edge_array
[e2
].seq
= NULL
;
302 { fprintf ( stderr
, "AllpathUpdateEdge: edge %d with length %d, but without seq.\n", e2
, edge_array
[e2
].length
); }
305 //edge_array[e2].extend_len = tightLen-edge_array[e2].length;
306 //the sequence of e1 is to be updated
309 edge_array
[e2
].length
= 0; //e2 is removed from the graph
310 edge_array
[e1
].to_vt
= edge_array
[e2
].to_vt
; //e2 is part of e1 now
311 edge_array
[e1
].length
= tightLen
;
312 edge_array
[e1
].seq
= tightSeq
;
315 { edge_array
[e1
].cvg
= cvgsum
/ tightLen
; }
318 { edge_array
[e1
].cvg
= edge_array
[e1
].cvg
> 0 ? edge_array
[e1
].cvg
: 1; }
322 edge_array
[e1
].length
= 0; //e1 is removed from the graph
323 edge_array
[e2
].from_vt
= edge_array
[e1
].from_vt
; //e1 is part of e2 now
324 edge_array
[e2
].length
= tightLen
;
325 edge_array
[e2
].seq
= tightSeq
;
328 { edge_array
[e2
].cvg
= cvgsum
/ tightLen
; }
331 { edge_array
[e2
].cvg
= edge_array
[e2
].cvg
> 0 ? edge_array
[e2
].cvg
: 1; }
335 static void debugging ( unsigned int i
)
338 parc
= edge_array
[i
].arcs
;
341 { fprintf ( stderr
, "No downward connection for %d.\n", i
); }
345 fprintf ( stderr
, "%d -> %d\n", i
, parc
->to_ed
);
351 /*************************************************
355 Concatenates two edges if they are linearly linked.
357 1. last: whether it's the last iterations.
362 *************************************************/
363 void linearConcatenate2 ( boolean last
)
368 unsigned int from_ed
, to_ed
, bal_ed
;
381 for ( i
= 1; i
<= num_ed
; i
++ )
383 if ( edge_array
[i
].deleted
|| EdSameAsTwin ( i
) )
386 if ( edge_array
[i
].length
> 0 )
389 parc
= edge_array
[i
].arcs
;
391 if ( !parc
|| parc
->next
)
395 bal_ed
= getTwinEdge ( to_ed
);
396 parc2
= edge_array
[bal_ed
].arcs
;
398 if ( bal_ed
== to_ed
|| !parc2
|| parc2
->next
)
403 if ( from_ed
== to_ed
|| from_ed
== bal_ed
)
406 //linear connection found
407 if ( parc
->multiplicity
<= arcfilter
)
414 bal_fe
= getTwinEdge ( from_ed
);
415 linearUpdateConnection2 ( from_ed
, to_ed
, 0 );
416 allpathUpdateEdge2 ( from_ed
, to_ed
, 0, last
);
417 linearUpdateConnection2 ( bal_ed
, bal_fe
, 1 );
418 allpathUpdateEdge2 ( bal_ed
, bal_fe
, 1, last
);
421 fprintf ( stderr
, "%d edge(s) concatenated in cycle %d.\n", conc_c
, round
++ );
424 { fprintf ( stderr
, "%d edge(s) are not linearized because of arc weight is %d.\n", donot1
, arcfilter
); }
427 fprintf ( stderr
, "%d edge(s) in the graph.\n", counter
);