4 * Copyright (c) 2008-2012 BGI-Shenzhen <soap at genomics dot org dot cn>.
6 * This file is part of SOAPdenovo.
8 * SOAPdenovo is free software: you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation, either version 3 of the License, or
11 * (at your option) any later version.
13 * SOAPdenovo is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
18 * You should have received a copy of the GNU General Public License
19 * along with SOAPdenovo. If not, see <http://www.gnu.org/licenses/>.
29 static Kmer
* kmerBuffer
;
30 static ubyte8
* hashBanBuffer
;
32 static char * flagBuffer
;
33 static int buffer_size
= 100000000;//2013-5-13
34 long long foundcount
= 0;
35 long long notfoundcount
= 0;
36 long long newfoundcount
= 0;
37 long long newnotfoundcount
= 0;
38 long long edgeaddnumber
= 0;
40 unsigned int ** arcBuffer
;
41 unsigned int arcBufferCount
= 0;
43 unsigned int ** delarcBuffer
;
44 unsigned int delarcBufferCount
= 0;
46 static void forward ( unsigned int index
, int first
);
48 //Fresh preGraphBasic to record the original message and the multikmer message.
49 void freshpreGraphBasic ( boolean iterate
, int maxk
, char * graph
)
51 char name
[256], line
[256];
57 int maxreadlen
= 0, minreadlen
= 0, maxnamelen
= 0;
58 sprintf ( name
, "%s.preGraphBasic", graph
);
59 fp
= ckopen ( name
, "r" );
61 while ( fgets ( line
, sizeof ( line
), fp
) != NULL
)
65 sscanf ( line
+ 6, "%d %c %d", &num_kmer
, &ch
, &min
);
67 else if ( line
[0] == 'E' )
69 sscanf ( line
+ 5, "%d", &numed
);
71 else if ( line
[0] == 'M' )
73 sscanf ( line
, "MaxReadLen %d MinReadLen %d MaxNameLen %d", &maxreadlen
, &minreadlen
, &maxnamelen
);
75 else if ( line
[0] == 'B' )
79 sscanf ( line
, "Backup VERTEX %d %c %d", &num_kmer
, &ch
, &min
);
81 else if ( line
[7] == 'E' )
83 sscanf ( line
, "Backup EDGEs %d", &numed
);
85 else if ( line
[7] == 'M' )
87 sscanf ( line
, "Backup MaxReadLen %d MinReadLen %d MaxNameLen %d", &maxreadlen
, &minreadlen
, &maxnamelen
);
93 sprintf ( name
, "%s.preGraphBasic", graph
);
94 fp
= ckopen ( name
, "w" );
98 fprintf ( fp
, "VERTEX %d K %d\n", num_vt
, maxk
);
99 fprintf ( fp
, "\nEDGEs %d\n", num_ed
);
100 fprintf ( fp
, "\nMaxReadLen %d MinReadLen %d MaxNameLen %d\n", maxreadlen
, minreadlen
, maxnamelen
);
101 fprintf ( fp
, "\nBackup VERTEX %d K %d\n", num_kmer
, min
);
102 fprintf ( fp
, "\nBackup EDGEs %d\n", numed
);
103 fprintf ( fp
, "\nBackup MaxReadLen %d MinReadLen %d MaxNameLen %d\n", maxreadlen
, minreadlen
, maxnamelen
);
107 fprintf ( fp
, "VERTEX %d K %d\n", num_kmer
, min
);
108 fprintf ( fp
, "\nEDGEs %d\n", numed
);
109 fprintf ( fp
, "\nMaxReadLen %d MinReadLen %d MaxNameLen %d\n", maxreadlen
, minreadlen
, maxnamelen
);
116 // kmer1 = kmer1 | kmer2
117 inline Kmer
KmerOr ( Kmer kmer1
, Kmer kmer2
)
119 kmer1
.high1
|= kmer2
.high1
;
120 kmer1
.low1
|= kmer2
.low1
;
121 kmer1
.high2
|= kmer2
.high2
;
122 kmer1
.low2
|= kmer2
.low2
;
126 //Add ch at the head of prev.
127 inline Kmer
KmerPlusHead ( Kmer prev
, char ch
, int len
)
130 word
.high1
= word
.low1
= word
.high2
= word
.low2
= 0;
134 word
.low2
= ch
& 0x3;
135 word
.low2
<<= ( 2 * len
);
137 else if ( 2 * len
< 128 )
139 word
.high2
= ch
& 0x3;
140 word
.high2
<<= ( 2 * len
- 64 );
142 else if ( 2 * len
< 192 )
144 word
.low1
= ch
& 0x3;
145 word
.low1
<<= ( 2 * len
- 128 );
149 word
.high1
= ch
& 0x3;
150 word
.high1
<<= ( 2 * len
- 192 );
153 word
= KmerOr ( word
, prev
);
158 //Add ch at the tail of prev.
159 inline Kmer
KmerPlusTail ( Kmer prev
, char ch
)
161 Kmer word
= KmerLeftBitMoveBy2 ( prev
);
162 word
.low2
+= ch
& 0x3;
166 static const Kmer kmerZero
= { 0, 0, 0, 0 };
168 //kmer1 = kmer1 | kmer2
169 Kmer
KmerOr ( Kmer kmer1
, Kmer kmer2
)
171 kmer1
.high
|= kmer2
.high
;
172 kmer1
.low
|= kmer2
.low
;
176 //Add ch at the head of prev.
177 Kmer
KmerPlusHead ( Kmer prev
, char ch
, int len
)
180 word
.high
= word
.low
= 0;
185 word
.low
<<= ( 2 * len
);
189 word
.high
= ch
& 0x3;
190 word
.high
<<= ( 2 * len
- 64 );
193 word
= KmerOr ( word
, prev
);
198 //Add ch at the tail of prev.
199 inline Kmer
KmerPlusTail ( Kmer prev
, char ch
)
201 Kmer word
= KmerLeftBitMoveBy2 ( prev
);
202 word
.low
+= ch
& 0x3;
206 static const Kmer kmerZero
= { 0, 0 };
209 /*************************************************
213 Get front (k+1)mer of edge.
215 1. index : index in edge array
219 (k+1)mer at the front.
220 *************************************************/
221 Kmer
getFromKmer ( unsigned int index
)
223 Kmer temp
= kmerZero
;
224 temp
= vt_array
[edge_array
[index
].from_vt
].kmer
;
228 for ( i
= 0; i
< step
; ++i
)
230 c
= getCharInTightString ( edge_array
[index
].seq
, i
);
231 temp
= KmerPlusTail ( temp
, c
);
237 /*************************************************
241 Get last (k+1)mer of edge.
243 1. index : index in edge array
248 *************************************************/
249 Kmer
getToKmer ( unsigned int index
)
251 Kmer temp
= kmerZero
;
252 Kmer temp2
= kmerZero
;
253 int len
= edge_array
[index
].length
- overlaplen
+ step
;
255 temp
= vt_array
[edge_array
[index
].to_vt
].kmer
;
256 temp2
= vt_array
[edge_array
[index
].from_vt
].kmer
;
261 for ( i
= 0; i
< step
; ++i
)
263 c
= getCharInTightString ( edge_array
[index
].seq
, len
- i
- 1 );
264 temp
= KmerPlusHead ( temp
, c
, i
+ overlaplen
- step
);
267 else if ( len
> 0 && len
< step
)
269 for ( i
= 0; i
< len
; ++i
)
271 c
= getCharInTightString ( edge_array
[index
].seq
, len
- i
- 1 );
272 temp
= KmerPlusHead ( temp
, c
, i
+ overlaplen
- step
);
275 for ( i
= 0; i
< ( step
- len
); ++i
)
277 c
= lastCharInKmer ( KmerRightBitMove ( temp2
, i
<< 1 ) ); //.low2 & 0x3;
278 temp
= KmerPlusHead ( temp
, c
, i
+ overlaplen
- step
+ len
);
283 for ( i
= 0; i
< step
; ++i
)
285 c
= lastCharInKmer ( KmerRightBitMove ( temp2
, ( i
- len
) << 1 ) ); //.low2 & 0x3;
286 temp
= KmerPlusHead ( temp
, c
, i
+ overlaplen
- step
);
294 inline void delete1Edge ( unsigned int index
)
296 edge_array
[index
].deleted
= 1;
299 /*************************************************
303 Updates kmer edge to (k+1)mer edge.
305 1. index : edge index in array
309 index of new kmer set.
311 *************************************************/
312 int kmer2vtnew ( unsigned int index
, int from
)
318 kmer_t2
* node
= NULL
;
321 { kmer
= getFromKmer ( index
); }
323 { kmer
= getToKmer ( index
); }
325 bal_word
= reverseComplement ( kmer
, overlaplen
);
327 if ( KmerSmaller ( kmer
, bal_word
) )
329 vt_id
= bisearch ( &vt_arraynew
[0], num_vtnew
, kmer
);
334 fprintf ( stderr
, "Updating edge, small vertex " );
335 printKmerSeq ( stderr
, kmer
);
336 fprintf ( stderr
, " is not found, it's twin is " );
337 printKmerSeq ( stderr
, reverseComplement ( kmer
, overlaplen
) );
338 fprintf ( stderr
, " .\n" );
339 found
= search_kmerset2 ( KmerSetsNew
, kmer
, &node
);
342 { fprintf ( stderr
, "The kmer is in kmer set but not in vt_array.\n" ); }
344 { fprintf ( stderr
, "The kmer is not in kmer set and vt_array.\n" ); }
355 vt_id
= bisearch ( &vt_arraynew
[0], num_vtnew
, bal_word
);
365 fprintf ( stderr
, "Updating edge, big vertex " );
366 printKmerSeq ( stderr
, reverseComplement ( bal_word
, overlaplen
) );
367 fprintf ( stderr
, " is not found, it's twin is " );
368 printKmerSeq ( stderr
, bal_word
);
369 fprintf ( stderr
, " .\n" );
370 found
= search_kmerset2 ( KmerSetsNew
, bal_word
, &node
);
373 { fprintf ( stderr
, "The kmer is in kmer set but not in vt_array.\n" ); }
375 { fprintf ( stderr
, "The kmer is not in kmer set and vt_array.\n" ); }
382 /*************************************************
386 Updates kmer edge to (k+1)mer edge.
388 1. index : edge index in array
393 *************************************************/
394 void update1Edge ( unsigned int index
)
396 int templength
= edge_array
[index
].length
;
400 char * tightSeq
= NULL
;
401 temp_from_vt
= kmer2vtnew ( index
, 1 );
402 temp_to_vt
= kmer2vtnew ( index
, 0 );
404 if ( temp_from_vt
< 0 || temp_to_vt
< 0 )
406 destroyEdge2 ( index
);
407 delete1Edge ( index
);
408 fprintf ( stderr
, "Warning : Kmer is not found, from_vt %d, to_vt %d.\n", temp_from_vt
, temp_to_vt
);
412 edge_array
[index
].from_vt
= temp_from_vt
;
413 edge_array
[index
].to_vt
= temp_to_vt
;
414 edge_array
[index
].length
-= step
;
416 if ( edge_array
[index
].length
== 1 || edge_array
[index
].length
== 0 )
417 { edge_array
[index
].cvg
= 0; }
419 tightSeq
= ( char * ) ckalloc ( ( edge_array
[index
].length
/ 4 + 1 ) * sizeof ( char ) );
421 for ( i
= 0; i
< edge_array
[index
].length
; ++i
)
422 { writeChar2tightString ( getCharInTightString ( edge_array
[index
].seq
, i
+ step
), tightSeq
, i
); }
424 if ( edge_array
[index
].seq
)
426 free ( ( void * ) edge_array
[index
].seq
);
427 edge_array
[index
].seq
= NULL
;
430 edge_array
[index
].seq
= tightSeq
;
431 edge_array
[index
].rv
= NULL
;
432 ARC
* currArc
= edge_array
[index
].arcs
;
433 ARC
* tempArc
= NULL
;
438 currArc
= currArc
->next
;
439 edge_array
[index
].arcs
= deleteArc ( edge_array
[index
].arcs
, tempArc
);
442 edge_array
[index
].arcs
= NULL
;
443 edge_array
[index
].markers
= NULL
;
446 /*************************************************
450 Gets (k+1)mer hash set.
457 *************************************************/
461 ubyte8 hash_ban
, bal_hash_ban
;
464 unsigned int deletecount
= 0;
466 kmerBuffer
= ( Kmer
* ) ckalloc ( 2 * ( num_ed
+ 1 ) * sizeof ( Kmer
) );
467 hashBanBuffer
= ( ubyte8
* ) ckalloc ( 2 * ( num_ed
+ 1 ) * sizeof ( ubyte8
) );
468 edge_id
= ( unsigned int * ) ckalloc ( 2 * ( num_ed
+ 1 ) * sizeof ( unsigned int ) );
469 //00 : big to 01 : big from,10 : small to, 11 : small from
470 flagBuffer
= ( boolean
* ) ckalloc ( 2 * ( num_ed
+ 1 ) * sizeof ( boolean
) );
472 for ( i
= 1; i
<= num_ed
; ++i
)
474 if ( edge_array
[i
].deleted
)
477 if ( edge_array
[i
].length
< step
) //=
488 if ( edge_array
[i
].length
== step
)
490 word
= getFromKmer ( i
);
491 bal_word
= reverseComplement ( word
, overlaplen
);
493 if ( KmerSmaller ( word
, bal_word
) )
495 hash_ban
= hash_kmer ( word
);
496 hashBanBuffer
[kmer_cnew
] = hash_ban
;
497 kmerBuffer
[kmer_cnew
] = word
;
498 flagBuffer
[kmer_cnew
] = 4;
502 bal_hash_ban
= hash_kmer ( bal_word
);
503 hashBanBuffer
[kmer_cnew
] = bal_hash_ban
;
504 kmerBuffer
[kmer_cnew
] = bal_word
;
505 flagBuffer
[kmer_cnew
] = 5;
508 edge_id
[kmer_cnew
] = i
;
513 if ( edge_array
[i
].bal_edge
== 1 )
515 word
= getFromKmer ( i
);
516 bal_word
= reverseComplement ( word
, overlaplen
);
518 if ( KmerSmaller ( word
, bal_word
) )
520 hash_ban
= hash_kmer ( word
);
521 hashBanBuffer
[kmer_cnew
] = hash_ban
;
522 kmerBuffer
[kmer_cnew
] = word
;
523 flagBuffer
[kmer_cnew
] = 6;
527 bal_hash_ban
= hash_kmer ( bal_word
);
528 hashBanBuffer
[kmer_cnew
] = bal_hash_ban
;
529 kmerBuffer
[kmer_cnew
] = bal_word
;
530 flagBuffer
[kmer_cnew
] = 7;
533 edge_id
[kmer_cnew
] = i
;
537 word
= getToKmer ( i
);
538 bal_word
= reverseComplement ( word
, overlaplen
);
540 if ( KmerSmaller ( word
, bal_word
) )
542 hash_ban
= hash_kmer ( word
);
543 hashBanBuffer
[kmer_cnew
] = hash_ban
;
544 kmerBuffer
[kmer_cnew
] = word
;
545 flagBuffer
[kmer_cnew
] = 8;
549 bal_hash_ban
= hash_kmer ( bal_word
);
550 hashBanBuffer
[kmer_cnew
] = bal_hash_ban
;
551 kmerBuffer
[kmer_cnew
] = bal_word
;
552 flagBuffer
[kmer_cnew
] = 9;
555 edge_id
[kmer_cnew
] = i
;
560 word
= getFromKmer ( i
);
561 bal_word
= reverseComplement ( word
, overlaplen
);
563 if ( KmerSmaller ( word
, bal_word
) )
565 hash_ban
= hash_kmer ( word
);
566 hashBanBuffer
[kmer_cnew
] = hash_ban
;
567 kmerBuffer
[kmer_cnew
] = word
;
568 flagBuffer
[kmer_cnew
] = 3;
572 bal_hash_ban
= hash_kmer ( bal_word
);
573 hashBanBuffer
[kmer_cnew
] = bal_hash_ban
;
574 kmerBuffer
[kmer_cnew
] = bal_word
;
575 flagBuffer
[kmer_cnew
] = 1;
578 edge_id
[kmer_cnew
] = i
;
582 word
= getToKmer ( i
);
583 bal_word
= reverseComplement ( word
, overlaplen
);
585 if ( KmerSmaller ( word
, bal_word
) )
587 hash_ban
= hash_kmer ( word
);
588 hashBanBuffer
[kmer_cnew
] = hash_ban
;
589 kmerBuffer
[kmer_cnew
] = word
;
590 flagBuffer
[kmer_cnew
] = 2;
594 bal_hash_ban
= hash_kmer ( bal_word
);
595 hashBanBuffer
[kmer_cnew
] = bal_hash_ban
;
596 kmerBuffer
[kmer_cnew
] = bal_word
;
597 flagBuffer
[kmer_cnew
] = 0;
600 edge_id
[kmer_cnew
] = i
;
604 fprintf ( stderr
, "%lld edge(s) deleted in length of 0.\n", deletecount
);
605 KmerSetsNew
= init_kmerset2 ( 1024, 0.77f
);
607 for ( i
= 0; i
< kmer_cnew
; ++i
)
609 put_kmerset2 ( KmerSetsNew
, kmerBuffer
[i
], edge_id
[i
], flagBuffer
[i
], &node
);
612 num_vtnew
= count_kmerset2 ( KmerSetsNew
);
613 fprintf ( stderr
, "%u new kmer(s).\n", num_vtnew
);
616 free ( hashBanBuffer
);
617 hashBanBuffer
= NULL
;
624 /*************************************************
628 Gets (k+1)mer vertex from hash.
635 *************************************************/
640 vt_arraynew
= ( VERTEX
* ) ckalloc ( 4 * ( num_vtnew
+ 1 ) * sizeof ( VERTEX
) );
641 num_kmer_limit
= 4 * num_vtnew
;
644 unsigned int count
= 0;
648 while ( set
->iter_ptr
< set
->size
)
650 if ( !is_kmer_entity_null ( set
->flags
, set
->iter_ptr
) )
652 node
= set
->array
+ set
->iter_ptr
;
657 bal_word
= reverseComplement ( word
, overlaplen
);
659 if ( KmerSmaller ( word
, bal_word
) )
660 { vt_arraynew
[count
].kmer
= word
; }
662 { vt_arraynew
[count
].kmer
= bal_word
; }
672 qsort ( &vt_arraynew
[0], num_vtnew
, sizeof ( vt_arraynew
[0] ), cmp_vertex
);
674 for ( i
= 0; i
< num_vtnew
; ++i
)
676 bal_word
= reverseComplement ( vt_arraynew
[i
].kmer
, overlaplen
);
677 vt_arraynew
[i
+ num_vtnew
].kmer
= bal_word
;
681 /*************************************************
685 1. Builds (k+1)mer graph in memory based on kmer graph.
686 2. Gets (k+1)mer hash set.
687 3. Gets (k+1)mer vertex.
688 4. Updates kmer edges to (k+1)mer edges.
695 *************************************************/
696 void buildGraphHash()
699 unsigned int count
= 0;
700 //use from kmer & to kmer to build hash
701 fprintf ( stderr
, "Construct new kmer hash.\n" );
707 for ( i
= 1; i
<= num_ed
; ++i
)
709 if ( edge_array
[i
].deleted
)
712 if ( edge_array
[i
].length
< step
) //=
719 //update twin edge together
726 fprintf ( stderr
, "There are %lld kmer(s) found.\n", foundcount
);
727 fprintf ( stderr
, "There are %lld kmer(s) not found.\n", notfoundcount
);
730 fprintf ( stderr
, "%u edge(s) updated to %dmer edge.\n", count
, overlaplen
);
734 free ( ( void * ) vt_array
);
738 vt_array
= vt_arraynew
;
742 //FILE *tempF = NULL;
744 /*************************************************
748 1. Re-build arcs by reads.
749 // 2. Output selected reads if -r is set(keepReadFile is true).
751 1. libfile : the reads config file
752 2. graph : the output prefix
753 3. flag : whether the selected reads file exist
754 4. last : whether it's the last iterate
755 5. maxk : the max kmer when using multikmer
756 // 6. keepReadFile : keep tmp reads file that selected for building arc
758 1. *.read : the selected reads that use for assembly
761 *************************************************/
762 void addArc ( char * libfile
, char * graph
, int flag
, int last
, int maxk
) //, boolean keepReadFile
767 char readSeqName[256];
768 sprintf(readSeqName,"%s.read",graph);
769 tempF=fopen(readSeqName,"r");
776 if ( flag
) // || tempF)
780 Read2edge2 ( libfile
, graph
, last
, maxk
); //, keepReadFile
783 { Read2edge ( libfile
, graph
, maxk
); }
787 for ( i
= 1; i
<= num_ed
; ++i
)
789 if ( edge_array
[i
].deleted
)
798 /*************************************************
804 1. from : whether changing from kmer
805 2. index : index of edge
806 3. ch : next char to extend
807 4. backup : backup char
811 Index of new extended kmer.
812 *************************************************/
813 int kmer2edge ( int from
, unsigned int index
, char ch
, char * backup
)
816 Kmer kmer
= kmerZero
, bal_word
= kmerZero
;
818 char * tightSeq
= NULL
;
820 // if(edge_array[index].arcs && edge_array[edge_array[index].arcs->to_ed].length <= 1)
825 kmer
= vt_array
[edge_array
[index
].from_vt
].kmer
;
826 *backup
= lastCharInKmer ( kmer
); //.low2 & 3;
827 kmer
= prevKmer ( kmer
, ch
);
831 kmer
= vt_array
[edge_array
[index
].to_vt
].kmer
;
833 kmer
= nextKmer ( kmer
, ch
);
836 bal_word
= reverseComplement ( kmer
, overlaplen
);
838 if ( KmerSmaller ( kmer
, bal_word
) )
840 vt_id
= bisearch ( &vt_array
[0], num_vt
, kmer
);
845 fprintf ( stderr
, "When extending edge 'small vertex' is not found, edge %d kmer ", index
);
846 printKmerSeq ( stderr
, kmer
);
847 fprintf ( stderr
, " , it's twin " );
848 printKmerSeq ( stderr
, bal_word
);
849 fprintf ( stderr
, " .\n" );
858 vt_id
= bisearch ( &vt_array
[0], num_vt
, bal_word
);
868 fprintf ( stderr
, "When extending edge 'big vertex' is not found, edge %d kmer ", index
);
869 printKmerSeq ( stderr
, kmer
);
870 fprintf ( stderr
, " , it's twin " );
871 printKmerSeq ( stderr
, bal_word
);
872 fprintf ( stderr
, " .\n" );
882 char ch1
= int_comp ( ch
);
883 Kmer kmer1
= kmerZero
, bal_word1
= kmerZero
;
885 char * tightSeq1
= NULL
;
889 kmer1
= vt_array
[edge_array
[index
+ 1].to_vt
].kmer
;
891 kmer1
= nextKmer ( kmer1
, ch1
);
895 kmer1
= vt_array
[edge_array
[index
+ 1].from_vt
].kmer
;
896 backup1
= lastCharInKmer ( kmer1
); //.low2 & 3;
897 kmer1
= prevKmer ( kmer1
, ch1
);
900 bal_word1
= reverseComplement ( kmer1
, overlaplen
);
902 if ( KmerSmaller ( kmer1
, bal_word1
) )
904 vt_id1
= bisearch ( &vt_array
[0], num_vt
, kmer1
);
909 fprintf ( stderr
, "When extending edge 'small vertex' is not found, edge %d kmer ", index
+ 1 );
910 printKmerSeq ( stderr
, kmer1
);
911 fprintf ( stderr
, " , it's twin " );
912 printKmerSeq ( stderr
, bal_word1
);
913 fprintf ( stderr
, " .\n" );
922 vt_id1
= bisearch ( &vt_array
[0], num_vt
, bal_word1
);
932 fprintf ( stderr
, "When extending edge big vertex is not found, edge %d kmer ", index
+ 1 );
933 printKmerSeq ( stderr
, kmer1
);
934 fprintf ( stderr
, " , it's twin " );
935 printKmerSeq ( stderr
, bal_word1
);
936 fprintf ( stderr
, " .\n" );
950 edge_array
[index
].from_vt
= vt_id
;
951 tightSeq
= ( char * ) ckalloc ( ( ( edge_array
[index
].length
+ 1 ) / 4 + 1 ) * sizeof ( char ) );
952 writeChar2tightString ( *backup
, tightSeq
, 0 );
954 for ( i
= 0; i
< edge_array
[index
].length
; ++i
)
955 { writeChar2tightString ( getCharInTightString ( edge_array
[index
].seq
, i
), tightSeq
, i
+ 1 ); }
957 if ( edge_array
[index
].seq
)
959 free ( ( void * ) edge_array
[index
].seq
);
960 edge_array
[index
].seq
= NULL
;
963 edge_array
[index
].seq
= tightSeq
;
964 edge_array
[index
].length
+= 1;
966 edge_array
[index
+ 1].to_vt
= vt_id1
;
967 tightSeq1
= ( char * ) ckalloc ( ( ( edge_array
[index
+ 1].length
+ 1 ) / 4 + 1 ) * sizeof ( char ) );
969 for ( i
= 0; i
< edge_array
[index
+ 1].length
; ++i
)
970 { writeChar2tightString ( getCharInTightString ( edge_array
[index
+ 1].seq
, i
), tightSeq1
, i
); }
972 writeChar2tightString ( backup1
, tightSeq1
, i
);
974 if ( edge_array
[index
+ 1].seq
)
976 free ( ( void * ) edge_array
[index
+ 1].seq
);
977 edge_array
[index
+ 1].seq
= NULL
;
980 edge_array
[index
+ 1].seq
= tightSeq1
;
981 edge_array
[index
+ 1].length
+= 1;
986 edge_array
[index
].to_vt
= vt_id
;
987 tightSeq
= ( char * ) ckalloc ( ( ( edge_array
[index
].length
+ 1 ) / 4 + 1 ) * sizeof ( char ) );
989 for ( i
= 0; i
< edge_array
[index
].length
; ++i
)
990 { writeChar2tightString ( getCharInTightString ( edge_array
[index
].seq
, i
), tightSeq
, i
); }
992 writeChar2tightString ( *backup
, tightSeq
, i
);
994 if ( edge_array
[index
].seq
)
996 free ( ( void * ) edge_array
[index
].seq
);
997 edge_array
[index
].seq
= NULL
;
1000 edge_array
[index
].seq
= tightSeq
;
1001 edge_array
[index
].length
+= 1;
1003 edge_array
[index
+ 1].from_vt
= vt_id1
;
1004 tightSeq1
= ( char * ) ckalloc ( ( ( edge_array
[index
+ 1].length
+ 1 ) / 4 + 1 ) * sizeof ( char ) );
1005 writeChar2tightString ( backup1
, tightSeq1
, 0 );
1007 for ( i
= 0; i
< edge_array
[index
+ 1].length
; ++i
)
1008 { writeChar2tightString ( getCharInTightString ( edge_array
[index
+ 1].seq
, i
), tightSeq1
, i
+ 1 ); }
1010 if ( edge_array
[index
+ 1].seq
)
1012 free ( ( void * ) edge_array
[index
+ 1].seq
);
1013 edge_array
[index
+ 1].seq
= NULL
;
1016 edge_array
[index
+ 1].seq
= tightSeq1
;
1017 edge_array
[index
+ 1].length
+= 1;
1023 //Add edge and reverse complement.
1024 void addEdge ( unsigned int from
, unsigned int to
, char ch
, int bal_edge
, unsigned int cvg
)
1026 if ( num_ed_temp
+ 1 > num_ed_limit
)
1028 unsigned int new_num_ed
= num_ed_limit
* 1.2;
1029 edge_array
= ( EDGE
* ) ckrealloc ( edge_array
, ( new_num_ed
+ 1 ) * sizeof ( EDGE
), ( num_ed_limit
+ 1 ) * sizeof ( EDGE
) );
1030 num_ed_limit
= new_num_ed
;
1033 for ( j
= num_ed_temp
+ 1; j
<= num_ed_limit
; j
++ )
1035 edge_array
[j
].seq
= NULL
;
1038 fprintf ( stderr
, "Realloc edge array.\n" );
1041 char * tightSeq
= ( char * ) ckalloc ( sizeof ( char ) );
1042 writeChar2tightString ( ch
, tightSeq
, 0 );
1043 edge_array
[num_ed_temp
+ 1].from_vt
= from
;
1044 edge_array
[num_ed_temp
+ 1].to_vt
= to
;
1045 edge_array
[num_ed_temp
+ 1].length
= 1;
1046 edge_array
[num_ed_temp
+ 1].cvg
= cvg
;
1047 edge_array
[num_ed_temp
+ 1].bal_edge
= bal_edge
;
1048 edge_array
[num_ed_temp
+ 1].multi
= 0;
1049 edge_array
[num_ed_temp
+ 1].deleted
= 0;
1050 edge_array
[num_ed_temp
+ 1].flag
= 0;
1052 if ( edge_array
[num_ed_temp
+ 1].seq
)
1053 { free ( edge_array
[num_ed_temp
+ 1].seq
); }
1055 edge_array
[num_ed_temp
+ 1].seq
= tightSeq
;
1056 edge_array
[num_ed_temp
+ 1].rv
= NULL
;
1057 edge_array
[num_ed_temp
+ 1].arcs
= NULL
;
1058 edge_array
[num_ed_temp
+ 1].markers
= NULL
;
1062 //Check whether kmers are equal to the front kmer and last kmer of the edge.
1063 boolean
checkEqual ( unsigned int from
, unsigned int to
, char ch
, unsigned int index
, unsigned int * getIndex
)
1065 if ( edge_array
[index
].length
== 1 && ( ( edge_array
[index
].from_vt
== from
&& edge_array
[index
].to_vt
== to
) ) )
1073 //Whether edge exist in set.
1074 boolean
EdgeExist ( unsigned int from
, unsigned int to
, char ch
, kmer_t2
* node
, unsigned int * index
)
1077 EDGEID
* temp
= node
->edgeId
;
1081 *index
= temp
->edge
;
1083 if ( checkEqual ( from
, to
, ch
, temp
->edge
, index
) )
1092 //Update edgeId in node.
1093 void updateNode ( kmer_t2
* node
, kmer_t2
* node1
)
1095 struct edgeID
* edgeid
;
1096 edgeid
= ( struct edgeID
* ) malloc ( sizeof ( struct edgeID
) );
1097 edgeid
->edge
= num_ed_temp
+ 1;
1099 edgeid
->next
= NULL
;
1102 { edgeid
->next
= node
->edgeId
; }
1104 node
->edgeId
= edgeid
;
1106 edgeid
= ( struct edgeID
* ) malloc ( sizeof ( struct edgeID
) );
1107 edgeid
->edge
= num_ed_temp
+ 2;
1109 edgeid
->next
= NULL
;
1112 { edgeid
->next
= node
->edgeId
; }
1114 node
->edgeId
= edgeid
;
1116 edgeid
= ( struct edgeID
* ) malloc ( sizeof ( struct edgeID
) );
1117 edgeid
->edge
= num_ed_temp
+ 1;
1119 edgeid
->next
= NULL
;
1121 if ( node1
->edgeId
)
1122 { edgeid
->next
= node1
->edgeId
; }
1124 node1
->edgeId
= edgeid
;
1126 edgeid
= ( struct edgeID
* ) malloc ( sizeof ( struct edgeID
) );
1127 edgeid
->edge
= num_ed_temp
+ 2;
1129 edgeid
->next
= NULL
;
1131 if ( node1
->edgeId
)
1132 { edgeid
->next
= node1
->edgeId
; }
1134 node1
->edgeId
= edgeid
;
1138 /*************************************************
1142 1. Check whether it can be solved or not.
1143 2. Add edge when necessary.
1145 1. from : whether it's 'from kmer'
1146 2. from_ed : index of 'from edge'
1147 3. to_ed : index of 'to edge'
1148 4. node : node of 'last kmer' of 'from edge'
1149 5. node1 : node of 'front kmer' of 'to edge'
1155 *************************************************/
1156 void checkindegree ( int from
, unsigned int from_ed
, unsigned int to_ed
, kmer_t2
* node
, kmer_t2
* node1
, int maxk
)
1160 boolean exist
= false;
1161 char ch
= lastCharInKmer ( vt_array
[edge_array
[to_ed
].from_vt
].kmer
); //.low2 & 3;
1162 char backup
= lastCharInKmer ( KmerRightBitMove ( vt_array
[edge_array
[from_ed
].to_vt
].kmer
, ( overlaplen
- 1 ) << 1 ) ); //.low2 & 3;
1164 ARC
* originalArc
= NULL
;
1169 arcCount2 ( from_ed
, &arcRight_n
);
1171 if ( arcRight_n
> 1 )
1173 exist
= EdgeExist ( edge_array
[from_ed
].to_vt
, edge_array
[to_ed
].from_vt
, ch
, node
, &index
);
1177 updateNode ( node
, node1
);
1178 originalArc
= getArcBetween ( from_ed
, to_ed
);
1180 addEdge ( edge_array
[from_ed
].to_vt
, edge_array
[to_ed
].from_vt
, ch
, 2, originalArc
->multiplicity
* 10 );
1181 // if(overlaplen + step > maxk)
1183 arcBuffer
[0][arcBufferCount
] = from_ed
;
1184 arcBuffer
[1][arcBufferCount
] = num_ed_temp
;
1185 arcBuffer
[2][arcBufferCount
++] = to_ed
;
1187 addEdge ( edge_array
[getTwinEdge ( to_ed
)].to_vt
, edge_array
[getTwinEdge ( from_ed
)].from_vt
, int_comp ( backup
), 0, originalArc
->multiplicity
* 10 );
1191 // if(overlaplen + step > maxk)
1193 arcBuffer
[0][arcBufferCount
] = from_ed
;
1194 arcBuffer
[1][arcBufferCount
] = index
;
1195 arcBuffer
[2][arcBufferCount
++] = to_ed
;
1203 arcCount2 ( getTwinEdge ( to_ed
), &arcLeft_n
);
1205 if ( arcLeft_n
> 1 )
1207 exist
= EdgeExist ( edge_array
[from_ed
].to_vt
, edge_array
[to_ed
].from_vt
, ch
, node
, &index
);
1211 updateNode ( node
, node1
);
1212 originalArc
= getArcBetween ( from_ed
, to_ed
);
1214 addEdge ( edge_array
[from_ed
].to_vt
, edge_array
[to_ed
].from_vt
, ch
, 2, originalArc
->multiplicity
* 10 );
1215 // if(overlaplen + step > maxk)
1217 arcBuffer
[0][arcBufferCount
] = from_ed
;
1218 arcBuffer
[1][arcBufferCount
] = num_ed_temp
;
1219 arcBuffer
[2][arcBufferCount
++] = to_ed
;
1221 addEdge ( edge_array
[getTwinEdge ( to_ed
)].to_vt
, edge_array
[getTwinEdge ( from_ed
)].from_vt
, int_comp ( backup
), 0, originalArc
->multiplicity
* 10 );
1225 // if(overlaplen + step > maxk)
1227 arcBuffer
[0][arcBufferCount
] = from_ed
;
1228 arcBuffer
[1][arcBufferCount
] = index
;
1229 arcBuffer
[2][arcBufferCount
++] = to_ed
;
1236 //Add arc between two edges.
1237 static void add1Arc2 ( unsigned int from_ed
, unsigned int to_ed
, unsigned int weight
)
1239 if ( edge_array
[from_ed
].to_vt
!= edge_array
[to_ed
].from_vt
)
1241 fprintf ( stderr
, "Warning : Inconsistant joins between %d and %d.\n", from_ed
, to_ed
);
1244 unsigned int bal_fe
= getTwinEdge ( from_ed
);
1245 unsigned int bal_te
= getTwinEdge ( to_ed
);
1247 // fprintf(stderr, "from %u, bal %u\n", from_ed, bal_fe);
1248 // fprintf(stderr, "to %u, bal %u\n", to_ed, bal_te);
1250 if ( from_ed
> num_ed_temp
|| to_ed
> num_ed_temp
|| bal_fe
> num_ed_temp
|| bal_te
> num_ed_temp
)
1252 fprintf ( stderr
, "Error : Edge id is out of range.\n" );
1256 ARC
* parc
, *bal_parc
;
1257 //both arcs already exist
1258 parc
= getArcBetween ( from_ed
, to_ed
);
1262 bal_parc
= parc
->bal_arc
;
1263 parc
->multiplicity
+= weight
;
1264 bal_parc
->multiplicity
+= weight
;
1269 parc
= allocateArc ( to_ed
);
1270 parc
->multiplicity
= weight
;
1273 if ( edge_array
[from_ed
].arcs
)
1274 { edge_array
[from_ed
].arcs
->prev
= parc
; }
1276 parc
->next
= edge_array
[from_ed
].arcs
;
1277 edge_array
[from_ed
].arcs
= parc
;
1280 if ( bal_te
== from_ed
)
1282 parc
->bal_arc
= parc
;
1283 parc
->multiplicity
+= weight
;
1287 bal_parc
= allocateArc ( bal_fe
);
1288 bal_parc
->multiplicity
= weight
;
1289 bal_parc
->prev
= NULL
;
1291 if ( edge_array
[bal_te
].arcs
)
1292 { edge_array
[bal_te
].arcs
->prev
= bal_parc
; }
1294 bal_parc
->next
= edge_array
[bal_te
].arcs
;
1295 edge_array
[bal_te
].arcs
= bal_parc
;
1296 //link them to each other
1297 parc
->bal_arc
= bal_parc
;
1298 bal_parc
->bal_arc
= parc
;
1301 //Count step between two neighbour edges.
1302 int countstep ( unsigned int to_vt
, unsigned int from_vt
)
1304 if ( to_vt
== from_vt
)
1311 to
= vt_array
[to_vt
].kmer
;
1312 from
= vt_array
[from_vt
].kmer
;
1315 for ( i
= 0; i
<= nowstep2
; ++i
)
1317 filtertemp
= createFilter ( overlaplen
- i
);
1319 if ( KmerEqual ( KmerRightBitMove ( from
, i
<< 1 ), KmerAnd ( to
, filtertemp
) ) )
1326 /*************************************************
1330 Extend edges or add edges.
1332 1. maxk : max kmer of multi kmer
1337 *************************************************/
1338 void freshEdge ( int maxk
)
1340 unsigned int i
= 0, j
= 0;
1342 kmer_t2
* node
, *node1
;
1345 Kmer word
, bal_word
;
1346 int from_vt_id
= 0, to_vt_id
= 0;
1347 char from_backup
, to_backup
;
1348 char * tightSeq
= NULL
;
1350 int arcLeft_n
= 0, arcRight_n
= 0;
1351 ARC
* temparc
= NULL
;
1352 unsigned int tempto_ed
= 0;
1353 fprintf ( stderr
, "There are %d edge(s).\n", num_ed
);
1354 // if(overlaplen + step > maxk)
1356 arcBuffer
= ( unsigned int ** ) ckalloc ( sizeof ( unsigned int * ) * 3 );
1357 arcBuffer
[0] = ( unsigned int * ) ckalloc ( sizeof ( unsigned int ) * num_ed
* 3 );
1358 arcBuffer
[1] = ( unsigned int * ) ckalloc ( sizeof ( unsigned int ) * num_ed
* 3 );
1359 arcBuffer
[2] = ( unsigned int * ) ckalloc ( sizeof ( unsigned int ) * num_ed
* 3 );
1361 int count_noextend
= 0;
1362 num_ed_temp
= num_ed
;
1364 for ( i
= 1; i
<= num_ed
; ++i
)
1366 if ( edge_array
[i
].deleted
|| EdSameAsTwin ( i
) )
1369 bal_ed
= getTwinEdge ( i
);
1370 arcCount2 ( i
, &arcRight_n
);
1371 arcCount2 ( bal_ed
, &arcLeft_n
);
1373 if ( arcLeft_n
== 1 )
1375 if ( edge_array
[bal_ed
].to_vt
!= edge_array
[edge_array
[bal_ed
].arcs
->to_ed
].from_vt
)
1377 ch
= lastCharInKmer ( KmerRightBitMove ( vt_array
[edge_array
[getTwinEdge ( edge_array
[bal_ed
].arcs
->to_ed
)].to_vt
].kmer
, ( overlaplen
- 1 ) << 1 ) ); //.low2 & 3;
1378 int temp
= kmer2edge ( 1, i
, ch
, &from_backup
);
1381 { count_noextend
++; }
1384 else if ( arcLeft_n
> 1 )
1386 word
= vt_array
[edge_array
[i
].from_vt
].kmer
;
1387 bal_word
= reverseComplement ( word
, overlaplen
);
1389 if ( KmerSmaller ( word
, bal_word
) )
1390 { found
= search_kmerset2 ( KmerSetsNew
, word
, &node
); }
1392 { found
= search_kmerset2 ( KmerSetsNew
, bal_word
, &node
); }
1396 fprintf ( stderr
, "When refreshing edges, 'from vertex' is not found, to_vt %d kmer ", edge_array
[i
].from_vt
);
1397 printKmerSeq ( stderr
, vt_array
[edge_array
[i
].from_vt
].kmer
);
1398 fprintf ( stderr
, " .\n" );
1402 // if(overlaplen < maxk)
1404 temparc
= edge_array
[bal_ed
].arcs
;
1408 tempto_ed
= getTwinEdge ( temparc
->to_ed
);
1409 word
= vt_array
[edge_array
[tempto_ed
].to_vt
].kmer
;
1410 bal_word
= reverseComplement ( word
, overlaplen
);
1412 if ( KmerSmaller ( word
, bal_word
) )
1413 { found
= search_kmerset2 ( KmerSetsNew
, word
, &node1
); }
1415 { found
= search_kmerset2 ( KmerSetsNew
, bal_word
, &node1
); }
1419 fprintf ( stderr
, "When refreshing edges, 'to vertex' is not found, to_vt %d kmer ", edge_array
[tempto_ed
].to_vt
);
1420 printKmerSeq ( stderr
, vt_array
[edge_array
[tempto_ed
].to_vt
].kmer
);
1421 fprintf ( stderr
, " .\n" );
1425 if ( node1
&& node
)
1426 { checkindegree ( 1, tempto_ed
, i
, node1
, node
, maxk
); }
1428 temparc
= temparc
->next
;
1433 if ( arcRight_n
== 1 )
1435 if ( edge_array
[i
].to_vt
!= edge_array
[edge_array
[i
].arcs
->to_ed
].from_vt
)
1437 ch
= lastCharInKmer ( vt_array
[edge_array
[edge_array
[i
].arcs
->to_ed
].from_vt
].kmer
); //.low2 & 3;
1438 int temp
= kmer2edge ( 3, i
, ch
, &to_backup
);
1441 { count_noextend
++; }
1444 else if ( arcRight_n
> 1 )
1446 word
= vt_array
[edge_array
[i
].to_vt
].kmer
;
1447 bal_word
= reverseComplement ( word
, overlaplen
);
1449 if ( KmerSmaller ( word
, bal_word
) )
1450 { found
= search_kmerset2 ( KmerSetsNew
, word
, &node
); }
1452 { found
= search_kmerset2 ( KmerSetsNew
, bal_word
, &node
); }
1456 fprintf ( stderr
, "When refreshing edges, 'to vertex' is not found, to_vt %d kmer ", edge_array
[i
].to_vt
);
1457 printKmerSeq ( stderr
, vt_array
[edge_array
[i
].to_vt
].kmer
);
1458 fprintf ( stderr
, " .\n" );
1462 // if(overlaplen < maxk)
1464 temparc
= edge_array
[i
].arcs
;
1468 tempto_ed
= temparc
->to_ed
;
1469 word
= vt_array
[edge_array
[tempto_ed
].from_vt
].kmer
;
1470 bal_word
= reverseComplement ( word
, overlaplen
);
1472 if ( KmerSmaller ( word
, bal_word
) )
1473 { found
= search_kmerset2 ( KmerSetsNew
, word
, &node1
); }
1475 { found
= search_kmerset2 ( KmerSetsNew
, bal_word
, &node1
); }
1479 fprintf ( stderr
, "When refreshing edges, 'from vertex' is not found, from_vt %d kmer ", edge_array
[tempto_ed
].from_vt
);
1480 printKmerSeq ( stderr
, vt_array
[edge_array
[tempto_ed
].from_vt
].kmer
);
1481 fprintf ( stderr
, " .\n" );
1485 if ( node1
&& node
)
1486 { checkindegree ( 3, i
, tempto_ed
, node
, node1
, maxk
); }
1488 temparc
= temparc
->next
;
1493 //two edge change at the same time
1497 if ( count_noextend
)
1498 { fprintf ( stderr
, "%d edge(s) not extended.\n", count_noextend
); }
1500 // if(overlaplen + step > maxk)
1502 ARC
* tempArc
, *tempBalArc
, *originalArc
, *temp
, *bal_temp
;
1503 unsigned int from
= 0;
1504 unsigned int mid
= 0;
1505 unsigned int to
= 0;
1506 int count_arcdelete
= 0, count_arcadd
= 0;
1508 int arcnotfound
= 0;
1510 for ( i
= 0; i
< arcBufferCount
; ++i
)
1512 from
= arcBuffer
[0][i
];
1513 mid
= arcBuffer
[1][i
];
1514 to
= arcBuffer
[2][i
];
1516 if ( from
> num_ed
|| mid
> num_ed_temp
|| to
> num_ed
)
1518 fprintf ( stderr
, "Error : Edge id is out of range.\n" );
1522 originalArc
= getArcBetween ( from
, to
);
1526 arcmulti
= originalArc
->multiplicity
;
1528 edge_array
[from
].arcs
= deleteArc ( edge_array
[from
].arcs
, originalArc
);
1530 add1Arc2 ( from
, mid
, arcmulti
);
1531 add1Arc2 ( mid
, to
, arcmulti
);
1535 originalArc
= getArcBetween ( getTwinEdge ( to
), getTwinEdge ( from
) );
1539 arcmulti
= originalArc
->multiplicity
;
1541 edge_array
[getTwinEdge ( to
)].arcs
= deleteArc ( edge_array
[getTwinEdge ( to
)].arcs
, originalArc
);
1550 add1Arc2 ( from
, mid
, arcmulti
);
1551 add1Arc2 ( mid
, to
, arcmulti
);
1555 fprintf ( stderr
, "Add edges to the graph: %d arc(s) deleted, %d arc(s) added.\n", count_arcdelete
, count_arcadd
);
1558 { fprintf ( stderr
, "Warning : %d arc(s) are not found when checking.\n", arcnotfound
); }
1562 num_ed
= num_ed_temp
;
1563 // if(overlaplen + step > maxk)
1565 free ( arcBuffer
[2] );
1566 free ( arcBuffer
[1] );
1567 free ( arcBuffer
[0] );
1572 //Copy edge from source to target.
1573 void copy1Edge ( EDGE
* source
, EDGE
* target
)
1575 target
->from_vt
= source
->from_vt
;
1576 target
->to_vt
= source
->to_vt
;
1577 target
->length
= source
->length
;
1578 target
->cvg
= source
->cvg
;
1579 target
->multi
= source
->multi
;
1583 free ( ( void * ) target
->seq
);
1586 target
->seq
= source
->seq
;
1588 target
->arcs
= source
->arcs
;
1589 source
->arcs
= NULL
;
1590 target
->deleted
= source
->deleted
;
1593 //Check whether two bases are equal.
1594 int BaseEqual ( char ch1
, char ch2
)
1598 else if ( ch1
> ch2
)
1604 //Check whether two edges are equal.
1605 int EdgeEqual ( unsigned int prev
, unsigned int next
)
1608 int length
= edge_array
[prev
].length
;
1612 for ( i
= 0; i
< length
; ++i
)
1614 ch1
= int2base ( ( int ) getCharInTightString ( edge_array
[prev
].seq
, i
) );
1615 ch2
= int2base ( ( int ) getCharInTightString ( edge_array
[next
].seq
, i
) );
1617 if ( ( equal
= BaseEqual ( ch1
, ch2
) ) )
1625 /*************************************************
1629 Re-arrange the edges, swap smaller edges at front.
1636 *************************************************/
1640 ARC
* arc
, *bal_arc
, *temp_arc
;
1641 int count_swap
= 0, count_equal
= 0;
1643 for ( i
= 1; i
<= num_ed
; ++i
)
1645 if ( edge_array
[i
].deleted
|| EdSameAsTwin ( i
) )
1648 if ( EdSmallerThanTwin ( i
) )
1650 if ( KmerLarger ( vt_array
[edge_array
[i
].from_vt
].kmer
, vt_array
[edge_array
[i
+ 1].from_vt
].kmer
) )
1653 copyEdge ( i
, num_ed
+ 1 + 1 );
1654 copyEdge ( i
+ 1, num_ed
+ 1 );
1655 copyEdge ( num_ed
+ 1, i
);
1656 copyEdge ( num_ed
+ 1 + 1, i
+ 1 );
1657 edge_array
[i
].bal_edge
= 2;
1658 edge_array
[i
+ 1].bal_edge
= 0;
1659 //take care of the arcs
1660 arc
= edge_array
[i
].arcs
;
1664 arc
->bal_arc
->to_ed
= i
+ 1;
1668 arc
= edge_array
[i
+ 1].arcs
;
1672 arc
->bal_arc
->to_ed
= i
;
1676 else if ( KmerEqual ( vt_array
[edge_array
[i
].from_vt
].kmer
, vt_array
[edge_array
[i
+ 1].from_vt
].kmer
) )
1678 int temp
= EdgeEqual ( i
, i
+ 1 );
1683 edge_array
[i
].bal_edge
= 1;
1684 delete1Edge ( i
+ 1 );
1685 //take care of the arcs
1686 arc
= edge_array
[i
].arcs
;
1690 arc
->bal_arc
->to_ed
= i
;
1694 bal_arc
= edge_array
[i
+ 1].arcs
;
1695 edge_array
[i
+ 1].arcs
= NULL
;
1700 bal_arc
= bal_arc
->next
;
1702 if ( edge_array
[i
].arcs
)
1703 { edge_array
[i
].arcs
->prev
= temp_arc
; }
1705 temp_arc
->next
= edge_array
[i
].arcs
;
1706 edge_array
[i
].arcs
= temp_arc
;
1709 else if ( temp
> 0 )
1712 copyEdge ( i
, num_ed
+ 1 + 1 );
1713 copyEdge ( i
+ 1, num_ed
+ 1 );
1714 copyEdge ( num_ed
+ 1, i
);
1715 copyEdge ( num_ed
+ 1 + 1, i
+ 1 );
1716 edge_array
[i
].bal_edge
= 2;
1717 edge_array
[i
+ 1].bal_edge
= 0;
1718 //take care of the arcs
1719 arc
= edge_array
[i
].arcs
;
1723 arc
->bal_arc
->to_ed
= i
+ 1;
1727 arc
= edge_array
[i
+ 1].arcs
;
1731 arc
->bal_arc
->to_ed
= i
;
1742 fprintf ( stderr
, "Warning : Front edge %d is larger than %d.\n", i
, i
+ 1 );
1746 fprintf ( stderr
, "%d none-palindrome edge(s) swapped, %d palindrome edge(s) processed.\n", count_swap
, count_equal
);
1748 /*************************************************
1759 1 if a larger than b.
1760 -1 if a smaller than b.
1762 *************************************************/
1763 static int cmp_seq ( const void * a
, const void * b
)
1766 A
= ( EDGE_SUB
* ) a
;
1767 B
= ( EDGE_SUB
* ) b
;
1769 if ( KmerLarger ( vt_array
[A
->from_vt
].kmer
, vt_array
[B
->from_vt
].kmer
) )
1773 else if ( KmerSmaller ( vt_array
[A
->from_vt
].kmer
, vt_array
[B
->from_vt
].kmer
) )
1779 if ( A
->seq
[0] > B
->seq
[0] )
1783 else if ( A
->seq
[0] == B
->seq
[0] )
1787 for ( i
= 1; i
< A
->length
&& i
< B
->length
; i
++ )
1789 if ( getCharInTightString ( A
->seq
, i
) > getCharInTightString ( B
->seq
, i
) )
1791 else if ( getCharInTightString ( A
->seq
, i
) < getCharInTightString ( B
->seq
, i
) )
1795 if ( i
== A
->length
&& i
< B
->length
)
1797 else if ( i
< A
->length
&& i
== B
->length
)
1801 printKmerSeq ( stderr
, vt_array
[A
->from_vt
].kmer
);
1802 fprintf ( stderr
, "\n" );
1803 printKmerSeq ( stderr
, vt_array
[B
->from_vt
].kmer
);
1804 fprintf ( stderr
, "\n" );
1806 for ( i
= 0; i
< A
->length
; i
++ )
1808 fprintf ( stderr
, "%c", int2base ( ( int ) getCharInTightString ( A
->seq
, i
) ) );
1811 fprintf ( stderr
, "\n" );
1813 for ( i
= 0; i
< B
->length
; i
++ )
1815 fprintf ( stderr
, "%c", int2base ( ( int ) getCharInTightString ( B
->seq
, i
) ) );
1818 fprintf ( stderr
, "\n" );
1819 fprintf ( stderr
, "cmp_seq:\terr\n" );
1831 //Copy edge from source to target.
1832 static void copyOneEdge ( EDGE
* target
, EDGE
* source
)
1834 target
->from_vt
= source
->from_vt
;
1835 target
->to_vt
= source
->to_vt
;
1836 target
->length
= source
->length
;
1837 target
->cvg
= source
->cvg
;
1838 target
->multi
= source
->multi
;
1839 target
->flag
= source
->flag
;
1840 target
->bal_edge
= source
->bal_edge
;
1841 target
->seq
= source
->seq
;
1843 target
->arcs
= source
->arcs
;
1844 source
->arcs
= NULL
;
1845 target
->markers
= source
->markers
;
1846 source
->markers
= NULL
;
1847 target
->deleted
= source
->deleted
;
1850 /*************************************************
1854 Update the arcs of edges.
1856 1. ed_index : the index of edge
1861 *************************************************/
1862 static void updateArcToEd ( unsigned int ed_index
)
1864 ARC
* arc
= edge_array
[ed_index
].arcs
;
1868 arc
->to_ed
= index_array
[arc
->to_ed
];
1873 /*************************************************
1877 Sort edges base on seq of edges.
1884 *************************************************/
1888 unsigned int index
;
1889 EDGE_SUB
* sort_edge
;
1890 sort_edge
= ( EDGE_SUB
* ) ckalloc ( sizeof ( EDGE_SUB
) * ( num_ed
+ 1 ) );
1893 for ( index
= 1 ; index
<= num_ed
; index
++ )
1895 sort_edge
[i
].from_vt
= edge_array
[index
].from_vt
;
1896 sort_edge
[i
].seq
= edge_array
[index
].seq
;
1897 sort_edge
[i
].to_vt
= index
; // record old id
1898 sort_edge
[i
].length
= edge_array
[index
].length
;
1901 if ( !EdSameAsTwin ( index
) )
1907 qsort ( & ( sort_edge
[1] ), i
- 1, sizeof ( sort_edge
[1] ), cmp_seq
);
1908 index_array
= ( unsigned int * ) ckalloc ( sizeof ( unsigned int ) * ( num_ed
+ 1 ) ); // used to record new id
1909 unsigned int new_index
= 1, old_index
;
1911 for ( index
= 1; index
<= i
- 1; index
++ )
1913 old_index
= sort_edge
[index
].to_vt
; // old id
1914 sort_edge
[index
].seq
= NULL
;
1915 index_array
[old_index
] = new_index
++;// old id -> new id
1917 if ( !EdSameAsTwin ( old_index
) )
1919 index_array
[old_index
+ 1] = new_index
++; // old id -> new id
1923 bool * copy_array
= (bool * ) ckalloc ( sizeof ( bool ) * ( num_ed
+ 1 ) );
1924 EDGE
*old_edge
= ( EDGE
* ) ckalloc ( sizeof ( EDGE
) );
1925 EDGE
*new_edge
= ( EDGE
* ) ckalloc ( sizeof ( EDGE
) );
1926 unsigned int next_index
;
1927 for ( index
= 1; index
<= num_ed
; index
++ )
1929 if(!copy_array
[index
])
1932 new_index
= index_array
[next_index
];
1933 if(!copy_array
[next_index
])// && next_index != new_index
1935 if(copy_array
[new_index
])
1937 fprintf(stderr
, "Copy error: never reach here.");
1939 copy_array
[next_index
] = 1;
1940 if(next_index
!= new_index
)
1942 copyOneEdge (old_edge
, &(edge_array
[new_index
]));
1943 copyOneEdge ( & ( edge_array
[new_index
] ), & ( edge_array
[next_index
] ) );
1945 updateArcToEd ( new_index
);
1947 next_index
= new_index
;
1948 new_index
= index_array
[next_index
];
1949 while(!copy_array
[next_index
])
1951 if(next_index
== new_index
)
1953 fprintf(stderr
, "Index error: never reach here.");
1955 copy_array
[next_index
] = 1;
1956 copyOneEdge (new_edge
, &(edge_array
[new_index
]));
1957 copyOneEdge ( & ( edge_array
[new_index
] ), old_edge
);
1958 updateArcToEd ( new_index
);
1959 copyOneEdge (old_edge
, new_edge
);
1961 next_index
= new_index
;
1962 new_index
= index_array
[next_index
];
1971 free ( index_array
);
1973 fprintf(stderr
, "%d edge(s) sorted.\n", num_ed
);
1979 unsigned int index ;
1980 EDGE * backup_edge ;
1981 EDGE_SUB * sort_edge;
1982 sort_edge = ( EDGE_SUB * ) ckalloc ( sizeof ( EDGE_SUB ) * ( num_ed + 1 ) );
1983 backup_edge = ( EDGE * ) ckalloc ( sizeof ( EDGE ) * ( num_ed + 1 ) );
1986 for ( index = 1 ; index <= num_ed ; index ++ )
1988 sort_edge[i].from_vt = edge_array[index].from_vt;
1989 sort_edge[i].seq = edge_array[index].seq;
1990 sort_edge[i].to_vt = index; // record old id
1991 sort_edge[i].length = edge_array[index].length;
1993 copyOneEdge ( & ( backup_edge[index] ) , & ( edge_array[index] ) );
1995 if ( !EdSameAsTwin ( index ) )
1998 copyOneEdge ( & ( backup_edge[index] ) , & ( edge_array[index] ) );
2002 qsort ( & ( sort_edge[1] ), i - 1, sizeof ( sort_edge[1] ), cmp_seq );
2003 index_array = ( unsigned int * ) ckalloc ( sizeof ( unsigned int ) * ( num_ed + 1 ) ); // used to record new id
2004 unsigned int new_index = 1, old_index;
2006 for ( index = 1; index <= i - 1; index++ )
2008 old_index = sort_edge[index].to_vt; // old id
2009 sort_edge[index].seq = NULL;
2010 index_array[old_index] = new_index++;// old id -> new id
2012 if ( !EdSameAsTwin ( old_index ) )
2014 index_array[old_index + 1] = new_index++; // old id -> new id
2018 for ( index = 1; index <= num_ed; index++ )
2020 new_index = index_array[index];
2021 copyOneEdge ( & ( edge_array[new_index] ), & ( backup_edge[index] ) );
2022 updateArcToEd ( new_index );
2025 free ( index_array );
2027 free ( backup_edge );
2028 fprintf(stderr, "%d edge(s) sorted.\n", num_ed);
2031 /*************************************************
2035 Delete edge whose leght is zero.
2042 *************************************************/
2046 ARC
* arc_left
, *arc_right
;
2048 arcBuffer
= ( unsigned int ** ) ckalloc ( sizeof ( unsigned int * ) * 3 );
2049 arcBuffer
[0] = ( unsigned int * ) ckalloc ( sizeof ( unsigned int ) * num_ed
* 3 );
2050 arcBuffer
[1] = ( unsigned int * ) ckalloc ( sizeof ( unsigned int ) * num_ed
* 3 );
2051 arcBuffer
[2] = ( unsigned int * ) ckalloc ( sizeof ( unsigned int ) * num_ed
* 3 );
2053 for ( i
= 1; i
<= num_ed
; ++i
)
2055 if ( edge_array
[i
].deleted
|| EdSameAsTwin ( i
) )
2058 if ( edge_array
[i
].length
== 0 )
2060 arc_left
= edge_array
[i
+ 1].arcs
;
2064 arc_right
= edge_array
[i
].arcs
;
2068 arcBuffer
[0][arcBufferCount
] = getTwinEdge ( arc_left
->to_ed
);
2069 arcBuffer
[1][arcBufferCount
] = ( arc_left
->multiplicity
+ arc_right
->multiplicity
+ 1 ) / 2;
2070 arcBuffer
[2][arcBufferCount
++] = arc_right
->to_ed
;
2071 arc_right
= arc_right
->next
;
2074 arc_left
= arc_left
->next
;
2081 unsigned int from
= 0;
2082 unsigned int multi
= 0;
2083 unsigned int to
= 0;
2084 int count_edgedelete
= 0, count_arcadd
= 0;
2086 for ( i
= 1; i
<= num_ed
; ++i
)
2088 if ( edge_array
[i
].deleted
|| EdSameAsTwin ( i
) )
2091 if ( edge_array
[i
].length
== 0 )
2094 count_edgedelete
+= 2;
2100 for ( i
= 0; i
< arcBufferCount
; ++i
)
2102 from
= arcBuffer
[0][i
];
2103 multi
= arcBuffer
[1][i
];
2104 to
= arcBuffer
[2][i
];
2106 if ( from
== 0 || to
== 0 )
2108 fprintf ( stderr
, "Error : Edge id is zero.\n" );
2112 if ( from
> num_ed
|| to
> num_ed
)
2114 fprintf ( stderr
, "Error : Edge id is out of range.\n" );
2119 add1Arc2 ( from
, to
, multi
);
2123 fprintf ( stderr
, "%d edge(s) in length of 0, %d arc(s) added.\n", count_edgedelete
, count_arcadd
);
2124 free ( arcBuffer
[0] );
2125 free ( arcBuffer
[1] );
2126 free ( arcBuffer
[2] );
2130 /*************************************************
2135 2. Delete edges whose leght is zero.
2136 3. Re-arrange the edges, swap smaller edge at front.
2138 1. maxk : max kmer of multi kmer
2143 *************************************************/
2144 void fresh ( int maxk
)
2147 ARC
* arc_temp
, *parc
;
2149 newnotfoundcount
= 0;
2152 fprintf ( stderr
, "Refresh edge: %lld edge(s) added.\n", edgeaddnumber
);
2154 if ( newnotfoundcount
)
2156 fprintf ( stderr
, "Refresh edge: %d kmer(s) found.\n", newfoundcount
);
2157 fprintf ( stderr
, "Refresh edge: %d kmer(s) not found.\n", newnotfoundcount
);
2160 if ( overlaplen
+ step
> maxk
)
2165 //swap the smaller one forward
2170 /*************************************************
2174 Output statistics of edge array, the N50 N90 longest, etc.
2176 1. ed_array : the edge array
2177 2. ed_num : the number of edges
2179 1. statistics of edges
2182 *************************************************/
2183 void statistics ( EDGE
* ed_array
, unsigned int ed_num
)
2186 unsigned int * length_array
;
2187 int flag
, count
, len_c
;
2188 long long sum
= 0, N90
, N50
;
2190 length_array
= ( unsigned int * ) ckalloc ( ed_num
* sizeof ( unsigned int ) );
2191 //first scan for number counting
2194 for ( i
= 1; i
<= ed_num
; i
++ )
2196 if ( ( ed_array
[i
].length
+ overlaplen
- 1 ) >= len_bar
)
2198 length_array
[len_c
++] = ed_array
[i
].length
+ overlaplen
- 1;
2201 if ( ed_array
[i
].length
< 1 || ed_array
[i
].deleted
)
2208 if ( EdSmallerThanTwin ( i
) )
2216 for ( signI
= len_c
- 1; signI
>= 0; signI
-- )
2218 sum
+= length_array
[signI
];
2223 fprintf ( stderr
, "\nThere are %d contig(s) longer than %d, sum up %lld bp, with average length %lld.\n", len_c
, len_bar
, sum
, sum
/ len_c
);
2226 qsort ( length_array
, len_c
, sizeof ( length_array
[0] ), cmp_int
);
2227 fprintf ( stderr
, "The longest length is %d bp, ", length_array
[len_c
- 1] );
2232 for ( signI
= len_c
- 1; signI
>= 0; signI
-- )
2234 sum
+= length_array
[signI
];
2236 if ( !flag
&& sum
>= N50
)
2238 fprintf ( stderr
, "contig N50 is %d bp, ", length_array
[signI
] );
2244 fprintf ( stderr
, "contig N90 is %d bp.\n", length_array
[signI
] );
2249 free ( ( void * ) length_array
);
2252 /*************************************************
2258 1. list : unsorted arcs list
2263 *************************************************/
2264 ARC
* sort_arc ( ARC
* list
)
2269 // ARC * head = ( ARC * ) malloc ( sizeof ( ARC ) );
2270 ARC
* head
= ( ARC
* ) ckalloc ( sizeof ( ARC
));
2287 if ( temp
->to_ed
> temp1
->to_ed
)
2290 temp1
= temp1
->next
;
2294 if ( temp
&& temp
!= curr
)
2298 temp
->prev
->next
= temp
->next
;
2299 temp
->next
->prev
= temp
->prev
;
2303 temp
->prev
->next
= NULL
;
2307 temp
->prev
= curr
->prev
;
2308 curr
->prev
->next
= temp
;
2324 //Sort disorder arcs causing by multi thread.
2328 ARC
* arc_temp
, *parc
;
2330 for ( i
= 1; i
<= num_ed
; ++i
)
2332 if ( edge_array
[i
].deleted
)
2335 edge_array
[i
].arcs
= sort_arc ( edge_array
[i
].arcs
);
2337 fprintf(stderr
, "Arcs sorted.\n");
2340 /*************************************************
2344 Delete arcs that are chose.
2351 *************************************************/
2352 static void deleteUnlikeArc()
2354 unsigned int i
, bal
;
2358 for ( i
= 0; i
< delarcBufferCount
; ++i
)
2360 arc_temp
= getArcBetween ( delarcBuffer
[0][i
], delarcBuffer
[1][i
] );
2364 edge_array
[delarcBuffer
[0][i
]].arcs
= deleteArc ( edge_array
[delarcBuffer
[0][i
]].arcs
, arc_temp
);
2368 arc_temp
= getArcBetween ( getTwinEdge ( delarcBuffer
[1][i
] ), getTwinEdge ( delarcBuffer
[0][i
] ) );
2372 edge_array
[getTwinEdge ( delarcBuffer
[1][i
] )].arcs
= deleteArc ( edge_array
[getTwinEdge ( delarcBuffer
[1][i
] )].arcs
, arc_temp
);
2377 fprintf ( stderr
, "%d unreliable arc(s) deleted.\n", count
);
2378 free ( delarcBuffer
[0] );
2379 free ( delarcBuffer
[1] );
2380 free ( delarcBuffer
);
2381 delarcBufferCount
= 0;
2384 /*************************************************
2388 Go forward to collect the related arcs out going from index.
2390 1. index : the index of edge
2391 2. first : whether it's the first one to be parsed
2396 *************************************************/
2397 static void forward ( unsigned int index
, int first
)
2400 fArc
= edge_array
[index
].arcs
;
2401 unsigned int twin
= getTwinEdge ( index
);
2402 // if(!EdSameAsTwin(index))
2404 if ( edge_array
[index
].multi
!= 1 )
2405 { edge_array
[index
].multi
= 2; }
2407 if ( edge_array
[twin
].multi
!= 1 )
2408 { edge_array
[twin
].multi
= 2; }
2410 edge_array
[index
].flag
= 1;
2411 edge_array
[twin
].flag
= 1;
2417 delarcBuffer
[0][delarcBufferCount
] = index
;
2418 delarcBuffer
[1][delarcBufferCount
++] = temp
->to_ed
;
2420 if ( edge_array
[temp
->to_ed
].flag
)
2423 forward ( getTwinEdge ( temp
->to_ed
), 0 );
2427 /*************************************************
2431 Get arcs that could be processed incorrectly.
2438 *************************************************/
2439 static void getUnlikeArc()
2441 unsigned int i
, bal
;
2443 for ( i
= 1; i
<= num_ed
; ++i
)
2445 if ( edge_array
[i
].deleted
)
2448 if ( EdSameAsTwin ( i
) )
2450 edge_array
[i
].multi
= 1;
2454 delarcBuffer
= ( unsigned int ** ) ckalloc ( sizeof ( unsigned int * ) * 2 );
2455 delarcBuffer
[0] = ( unsigned int * ) ckalloc ( sizeof ( unsigned int ) * num_ed
* 3 );
2456 delarcBuffer
[1] = ( unsigned int * ) ckalloc ( sizeof ( unsigned int ) * num_ed
* 3 );
2457 unsigned int last
= 0, curr
= 0;
2459 for ( i
= 1; i
<= num_ed
; ++i
)
2461 if ( edge_array
[i
].deleted
)
2464 if ( edge_array
[i
].multi
== 1 )
2466 last
= delarcBufferCount
;
2469 if ( !EdSameAsTwin ( i
) )
2471 forward ( getTwinEdge ( i
), 1 );
2475 curr
= delarcBufferCount
;
2477 edge_array
[i
].flag
= 0;
2478 edge_array
[getTwinEdge ( i
)].flag
= 0;
2480 for ( j
= last
; j
< curr
; ++j
)
2482 edge_array
[delarcBuffer
[0][j
]].flag
= 0;
2483 edge_array
[getTwinEdge ( delarcBuffer
[0][j
] )].flag
= 0;
2484 edge_array
[delarcBuffer
[1][j
]].flag
= 0;
2485 edge_array
[getTwinEdge ( delarcBuffer
[1][j
] )].flag
= 0;
2491 /*************************************************
2495 1. Iterately gets larger kmer graph and solves some repeats.
2496 2. Builds (k+1)mer graph based on k mer graph.
2497 3. Re-builds arcs by reads.
2498 4. Removes errors(weak edge, low cov edge and tips).
2500 1. libfile : the reads config file
2501 2. graph : the output prefix
2502 3. maxk : the max kmer when using multikmer
2503 // 4. keepReadFile : keep temp reads file that selected for building arcs
2504 5. M : the strength of merging bubbles
2509 *************************************************/
2510 void Iterate ( char * libfile
, char * graph
, int maxk
, int M
) //boolean keepReadFile,
2512 time_t start_t
, stop_t
, time_bef
, time_aft
, inner_start
, inner_stop
;
2516 for ( i
= 1; i
<= num_ed
; ++i
)
2518 edge_array
[i
].multi
= 0;
2521 int cutlen
= 2 * overlaplen
;
2522 int mink
= overlaplen
;
2526 statistics ( edge_array
, num_ed
);
2527 fprintf ( stderr
, "\nIteration start.\n" );
2530 while ( overlaplen
<= maxk
)
2533 time ( &inner_start
);
2534 WORDFILTER
= createFilter ( overlaplen
);
2535 fprintf ( stderr
, "\n***************************\n" );
2536 fprintf ( stderr
, "Iteration %d, kmer: %d\n", round
++, overlaplen
);
2537 fprintf ( stderr
, "Edge number: %d\n", num_ed
);
2539 //build (k+1)mer graph
2540 fprintf ( stderr
, "Construct %dmer graph.\n", overlaplen
);
2543 fprintf ( stderr
, "Time spent on building hash graph: %ds.\n", ( int ) ( time_aft
- time_bef
) );
2545 //add arcs for (k+1)mer graph
2546 fprintf ( stderr
, "\nAdd arcs to graph.\n" );
2547 addArc ( libfile
, graph
, flag
, maxk
- overlaplen
, maxk
); //, keepReadFile
2548 //get arcs that could be processed incorrectly
2554 fprintf ( stderr
, "Time spent on adding arcs: %ds.\n", ( int ) ( time_aft
- time_bef
) );
2556 //sort disorder arcs causing by multi thread
2557 fprintf ( stderr
, "Sort arcs.\n" );
2560 fprintf ( stderr
, "Time spent on sorting arcs: %ds.\n", ( int ) ( time_aft
- time_bef
) );
2565 fprintf ( stderr
, "\nRemove weak edges and low coverage edges.\n" );
2566 removeWeakEdges2 ( cutlen
, 1, mink
);
2567 removeLowCovEdges2 ( cutlen
, deLowEdge
, mink
, 0 );
2569 fprintf ( stderr
, "Time spent on removing Edges: %ds\n", ( int ) ( time_aft
- time_bef
) );
2572 if ( overlaplen
+ step
> maxk
)
2575 fprintf ( stderr
, "Cut tips of the graph.\n" );
2576 cutTipsInGraph2 ( cutlen
, 0, 0 );
2578 fprintf ( stderr
, "Time spent on cutting tips: %ds.\n", ( int ) ( time_aft
- time_bef
) );
2582 fprintf ( stderr
, "Refresh edges.\n" );
2583 //refresh to extend edge and get the edge order right
2586 fprintf ( stderr
, "Time spent on refreshing edges: %ds.\n", ( int ) ( time_aft
- time_bef
) );
2588 free_kmerset2 ( KmerSetsNew
);
2591 statistics ( edge_array
, num_ed
);
2592 time ( &inner_stop
);
2593 fprintf ( stderr
, "Time spent on this round: %dm.\n\n", ( int ) ( inner_stop
- inner_start
) / 60 );
2596 for ( i
= 1; i
<= num_ed
; ++i
)
2598 edge_array
[i
].multi
= 0;
2603 fprintf ( stderr
, "Iteration finished.\n" );
2604 fprintf ( stderr
, "Time spent on iteration: %dm.\n\n", ( int ) ( stop_t
- start_t
) / 60 );