modified: makefile
[GalaxyCodeBases.git] / BGI / SOAPdenovo2 / standardPregraph / iterate.c
blob610b25384549c8568bd281b2322d696f408c9a75
1 /*
2 * iterate.c
4 * Copyright (c) 2008-2012 BGI-Shenzhen <soap at genomics dot org dot cn>.
6 * This file is part of SOAPdenovo.
8 * SOAPdenovo is free software: you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation, either version 3 of the License, or
11 * (at your option) any later version.
13 * SOAPdenovo is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
18 * You should have received a copy of the GNU General Public License
19 * along with SOAPdenovo. If not, see <http://www.gnu.org/licenses/>.
23 #include "stdinc.h"
24 #include "newhash.h"
25 #include "kmerhash.h"
26 #include "extfunc.h"
27 #include "extvab.h"
29 static Kmer * kmerBuffer;
30 static ubyte8 * hashBanBuffer;
32 static char * flagBuffer;
33 static int buffer_size = 100000000;//2013-5-13
34 long long foundcount = 0;
35 long long notfoundcount = 0;
36 long long newfoundcount = 0;
37 long long newnotfoundcount = 0;
38 long long edgeaddnumber = 0;
40 unsigned int ** arcBuffer;
41 unsigned int arcBufferCount = 0;
43 unsigned int ** delarcBuffer;
44 unsigned int delarcBufferCount = 0;
46 static void forward ( unsigned int index, int first );
48 //Fresh preGraphBasic to record the original message and the multikmer message.
49 void freshpreGraphBasic ( boolean iterate, int maxk, char * graph )
51 char name[256], line[256];
52 FILE * fp;
53 int num_kmer;
54 char ch;
55 int min = 0, max = 0;
56 int numed = 0;
57 int maxreadlen = 0, minreadlen = 0, maxnamelen = 0;
58 sprintf ( name, "%s.preGraphBasic", graph );
59 fp = ckopen ( name, "r" );
61 while ( fgets ( line, sizeof ( line ), fp ) != NULL )
63 if ( line[0] == 'V' )
65 sscanf ( line + 6, "%d %c %d", &num_kmer, &ch, &min );
67 else if ( line[0] == 'E' )
69 sscanf ( line + 5, "%d", &numed );
71 else if ( line[0] == 'M' )
73 sscanf ( line, "MaxReadLen %d MinReadLen %d MaxNameLen %d", &maxreadlen, &minreadlen, &maxnamelen );
75 else if ( line[0] == 'B' )
77 if ( line[7] == 'V' )
79 sscanf ( line, "Backup VERTEX %d %c %d", &num_kmer, &ch, &min );
81 else if ( line[7] == 'E' )
83 sscanf ( line, "Backup EDGEs %d", &numed );
85 else if ( line[7] == 'M' )
87 sscanf ( line, "Backup MaxReadLen %d MinReadLen %d MaxNameLen %d", &maxreadlen, &minreadlen, &maxnamelen );
92 fclose ( fp );
93 sprintf ( name, "%s.preGraphBasic", graph );
94 fp = ckopen ( name, "w" );
96 if ( iterate )
98 fprintf ( fp, "VERTEX %d K %d\n", num_vt, maxk );
99 fprintf ( fp, "\nEDGEs %d\n", num_ed );
100 fprintf ( fp, "\nMaxReadLen %d MinReadLen %d MaxNameLen %d\n", maxreadlen, minreadlen, maxnamelen );
101 fprintf ( fp, "\nBackup VERTEX %d K %d\n", num_kmer, min );
102 fprintf ( fp, "\nBackup EDGEs %d\n", numed );
103 fprintf ( fp, "\nBackup MaxReadLen %d MinReadLen %d MaxNameLen %d\n", maxreadlen, minreadlen, maxnamelen );
105 else
107 fprintf ( fp, "VERTEX %d K %d\n", num_kmer, min );
108 fprintf ( fp, "\nEDGEs %d\n", numed );
109 fprintf ( fp, "\nMaxReadLen %d MinReadLen %d MaxNameLen %d\n", maxreadlen, minreadlen, maxnamelen );
112 fclose ( fp );
115 #ifdef MER127
116 // kmer1 = kmer1 | kmer2
117 inline Kmer KmerOr ( Kmer kmer1, Kmer kmer2 )
119 kmer1.high1 |= kmer2.high1;
120 kmer1.low1 |= kmer2.low1;
121 kmer1.high2 |= kmer2.high2;
122 kmer1.low2 |= kmer2.low2;
123 return kmer1;
126 //Add ch at the head of prev.
127 inline Kmer KmerPlusHead ( Kmer prev, char ch, int len )
129 Kmer word;
130 word.high1 = word.low1 = word.high2 = word.low2 = 0;
132 if ( 2 * len < 64 )
134 word.low2 = ch & 0x3;
135 word.low2 <<= ( 2 * len );
137 else if ( 2 * len < 128 )
139 word.high2 = ch & 0x3;
140 word.high2 <<= ( 2 * len - 64 );
142 else if ( 2 * len < 192 )
144 word.low1 = ch & 0x3;
145 word.low1 <<= ( 2 * len - 128 );
147 else
149 word.high1 = ch & 0x3;
150 word.high1 <<= ( 2 * len - 192 );
153 word = KmerOr ( word, prev );
154 return word;
158 //Add ch at the tail of prev.
159 inline Kmer KmerPlusTail ( Kmer prev, char ch )
161 Kmer word = KmerLeftBitMoveBy2 ( prev );
162 word.low2 += ch & 0x3;
163 return word;
166 static const Kmer kmerZero = { 0, 0, 0, 0 };
167 #else
168 //kmer1 = kmer1 | kmer2
169 Kmer KmerOr ( Kmer kmer1, Kmer kmer2 )
171 kmer1.high |= kmer2.high;
172 kmer1.low |= kmer2.low;
173 return kmer1;
176 //Add ch at the head of prev.
177 Kmer KmerPlusHead ( Kmer prev, char ch, int len )
179 Kmer word;
180 word.high = word.low = 0;
182 if ( 2 * len < 64 )
184 word.low = ch & 0x3;
185 word.low <<= ( 2 * len );
187 else
189 word.high = ch & 0x3;
190 word.high <<= ( 2 * len - 64 );
193 word = KmerOr ( word, prev );
194 return word;
198 //Add ch at the tail of prev.
199 inline Kmer KmerPlusTail ( Kmer prev, char ch )
201 Kmer word = KmerLeftBitMoveBy2 ( prev );
202 word.low += ch & 0x3;
203 return word;
206 static const Kmer kmerZero = { 0, 0 };
207 #endif
209 /*************************************************
210 Function:
211 getFromKmer
212 Description:
213 Get front (k+1)mer of edge.
214 Input:
215 1. index : index in edge array
216 Output:
217 None.
218 Return:
219 (k+1)mer at the front.
220 *************************************************/
221 Kmer getFromKmer ( unsigned int index )
223 Kmer temp = kmerZero;
224 temp = vt_array[edge_array[index].from_vt].kmer;
225 int i = 0;
226 char c;
228 for ( i = 0; i < step; ++i )
230 c = getCharInTightString ( edge_array[index].seq, i );
231 temp = KmerPlusTail ( temp, c );
234 return temp;
237 /*************************************************
238 Function:
239 getToKmer
240 Description:
241 Get last (k+1)mer of edge.
242 Input:
243 1. index : index in edge array
244 Output:
245 None.
246 Return:
247 (k+1)mer at the end.
248 *************************************************/
249 Kmer getToKmer ( unsigned int index )
251 Kmer temp = kmerZero;
252 Kmer temp2 = kmerZero;
253 int len = edge_array[index].length - overlaplen + step;
254 char c;
255 temp = vt_array[edge_array[index].to_vt].kmer;
256 temp2 = vt_array[edge_array[index].from_vt].kmer;
257 int i = 0;
259 if ( len >= step )
261 for ( i = 0; i < step; ++i )
263 c = getCharInTightString ( edge_array[index].seq, len - i - 1 );
264 temp = KmerPlusHead ( temp, c, i + overlaplen - step );
267 else if ( len > 0 && len < step )
269 for ( i = 0; i < len; ++i )
271 c = getCharInTightString ( edge_array[index].seq, len - i - 1 );
272 temp = KmerPlusHead ( temp, c, i + overlaplen - step );
275 for ( i = 0; i < ( step - len ); ++i )
277 c = lastCharInKmer ( KmerRightBitMove ( temp2, i << 1 ) ); //.low2 & 0x3;
278 temp = KmerPlusHead ( temp, c, i + overlaplen - step + len );
281 else
283 for ( i = 0; i < step; ++i )
285 c = lastCharInKmer ( KmerRightBitMove ( temp2, ( i - len ) << 1 ) ); //.low2 & 0x3;
286 temp = KmerPlusHead ( temp, c, i + overlaplen - step );
290 return temp;
293 //Only mark on edge.
294 inline void delete1Edge ( unsigned int index )
296 edge_array[index].deleted = 1;
299 /*************************************************
300 Function:
301 kmer2vtnew
302 Description:
303 Updates kmer edge to (k+1)mer edge.
304 Input:
305 1. index : edge index in array
306 Output:
307 None.
308 Return:
309 index of new kmer set.
310 -1 if not found.
311 *************************************************/
312 int kmer2vtnew ( unsigned int index, int from )
314 Kmer kmer;
315 Kmer bal_word;
316 int vt_id;
317 int found = 0;
318 kmer_t2 * node = NULL;
320 if ( from )
321 { kmer = getFromKmer ( index ); }
322 else
323 { kmer = getToKmer ( index ); }
325 bal_word = reverseComplement ( kmer, overlaplen );
327 if ( KmerSmaller ( kmer, bal_word ) )
329 vt_id = bisearch ( &vt_arraynew[0], num_vtnew, kmer );
331 if ( vt_id < 0 )
333 ++notfoundcount;
334 fprintf ( stderr, "Updating edge, small vertex " );
335 printKmerSeq ( stderr, kmer );
336 fprintf ( stderr, " is not found, it's twin is " );
337 printKmerSeq ( stderr, reverseComplement ( kmer, overlaplen ) );
338 fprintf ( stderr, " .\n" );
339 found = search_kmerset2 ( KmerSetsNew, kmer, &node );
341 if ( found )
342 { fprintf ( stderr, "The kmer is in kmer set but not in vt_array.\n" ); }
343 else
344 { fprintf ( stderr, "The kmer is not in kmer set and vt_array.\n" ); }
346 else
348 ++foundcount;
351 return vt_id;
353 else
355 vt_id = bisearch ( &vt_arraynew[0], num_vtnew, bal_word );
357 if ( vt_id >= 0 )
359 vt_id += num_vtnew;
360 ++foundcount;
362 else
364 ++notfoundcount;
365 fprintf ( stderr, "Updating edge, big vertex " );
366 printKmerSeq ( stderr, reverseComplement ( bal_word, overlaplen ) );
367 fprintf ( stderr, " is not found, it's twin is " );
368 printKmerSeq ( stderr, bal_word );
369 fprintf ( stderr, " .\n" );
370 found = search_kmerset2 ( KmerSetsNew, bal_word, &node );
372 if ( found )
373 { fprintf ( stderr, "The kmer is in kmer set but not in vt_array.\n" ); }
374 else
375 { fprintf ( stderr, "The kmer is not in kmer set and vt_array.\n" ); }
378 return vt_id;
382 /*************************************************
383 Function:
384 update1Edge
385 Description:
386 Updates kmer edge to (k+1)mer edge.
387 Input:
388 1. index : edge index in array
389 Output:
390 None.
391 Return:
392 None.
393 *************************************************/
394 void update1Edge ( unsigned int index )
396 int templength = edge_array[index].length;
397 int i = 0;
398 int temp_from_vt;
399 int temp_to_vt;
400 char * tightSeq = NULL;
401 temp_from_vt = kmer2vtnew ( index, 1 );
402 temp_to_vt = kmer2vtnew ( index, 0 );
404 if ( temp_from_vt < 0 || temp_to_vt < 0 )
406 destroyEdge2 ( index );
407 delete1Edge ( index );
408 fprintf ( stderr, "Warning : Kmer is not found, from_vt %d, to_vt %d.\n", temp_from_vt, temp_to_vt );
409 return;
412 edge_array[index].from_vt = temp_from_vt;
413 edge_array[index].to_vt = temp_to_vt;
414 edge_array[index].length -= step;
416 if ( edge_array[index].length == 1 || edge_array[index].length == 0 )
417 { edge_array[index].cvg = 0; }
419 tightSeq = ( char * ) ckalloc ( ( edge_array[index].length / 4 + 1 ) * sizeof ( char ) );
421 for ( i = 0; i < edge_array[index].length; ++i )
422 { writeChar2tightString ( getCharInTightString ( edge_array[index].seq, i + step ), tightSeq, i ); }
424 if ( edge_array[index].seq )
426 free ( ( void * ) edge_array[index].seq );
427 edge_array[index].seq = NULL;
430 edge_array[index].seq = tightSeq;
431 edge_array[index].rv = NULL;
432 ARC * currArc = edge_array[index].arcs;
433 ARC * tempArc = NULL;
435 while ( currArc )
437 tempArc = currArc;
438 currArc = currArc->next;
439 edge_array[index].arcs = deleteArc ( edge_array[index].arcs, tempArc );
442 edge_array[index].arcs = NULL;
443 edge_array[index].markers = NULL;
446 /*************************************************
447 Function:
448 getNewHash
449 Description:
450 Gets (k+1)mer hash set.
451 Input:
452 None.
453 Output:
454 None.
455 Return:
456 None.
457 *************************************************/
458 void getNewHash()
460 unsigned int i;
461 ubyte8 hash_ban, bal_hash_ban;
462 Kmer word, bal_word;
463 kmer_t2 * node;
464 unsigned int deletecount = 0;
465 kmer_cnew = 0;
466 kmerBuffer = ( Kmer * ) ckalloc ( 2 * ( num_ed + 1 ) * sizeof ( Kmer ) );
467 hashBanBuffer = ( ubyte8 * ) ckalloc ( 2 * ( num_ed + 1 ) * sizeof ( ubyte8 ) );
468 edge_id = ( unsigned int * ) ckalloc ( 2 * ( num_ed + 1 ) * sizeof ( unsigned int ) );
469 //00 : big to 01 : big from,10 : small to, 11 : small from
470 flagBuffer = ( boolean * ) ckalloc ( 2 * ( num_ed + 1 ) * sizeof ( boolean ) );
472 for ( i = 1; i <= num_ed; ++i )
474 if ( edge_array[i].deleted )
475 { continue; }
477 if ( edge_array[i].length < step ) //=
479 destroyEdge2 ( i );
480 delete1Edge ( i );
481 deletecount++;
482 continue;
485 word = kmerZero;
486 bal_word = kmerZero;
488 if ( edge_array[i].length == step )
490 word = getFromKmer ( i );
491 bal_word = reverseComplement ( word, overlaplen );
493 if ( KmerSmaller ( word, bal_word ) )
495 hash_ban = hash_kmer ( word );
496 hashBanBuffer[kmer_cnew] = hash_ban;
497 kmerBuffer[kmer_cnew] = word;
498 flagBuffer[kmer_cnew] = 4;
500 else
502 bal_hash_ban = hash_kmer ( bal_word );
503 hashBanBuffer[kmer_cnew] = bal_hash_ban;
504 kmerBuffer[kmer_cnew] = bal_word;
505 flagBuffer[kmer_cnew] = 5;
508 edge_id[kmer_cnew] = i;
509 ++kmer_cnew;
510 continue;
513 if ( edge_array[i].bal_edge == 1 )
515 word = getFromKmer ( i );
516 bal_word = reverseComplement ( word, overlaplen );
518 if ( KmerSmaller ( word, bal_word ) )
520 hash_ban = hash_kmer ( word );
521 hashBanBuffer[kmer_cnew] = hash_ban;
522 kmerBuffer[kmer_cnew] = word;
523 flagBuffer[kmer_cnew] = 6;
525 else
527 bal_hash_ban = hash_kmer ( bal_word );
528 hashBanBuffer[kmer_cnew] = bal_hash_ban;
529 kmerBuffer[kmer_cnew] = bal_word;
530 flagBuffer[kmer_cnew] = 7;
533 edge_id[kmer_cnew] = i;
534 ++kmer_cnew;
535 word = kmerZero;
536 bal_word = kmerZero;
537 word = getToKmer ( i );
538 bal_word = reverseComplement ( word, overlaplen );
540 if ( KmerSmaller ( word, bal_word ) )
542 hash_ban = hash_kmer ( word );
543 hashBanBuffer[kmer_cnew] = hash_ban;
544 kmerBuffer[kmer_cnew] = word;
545 flagBuffer[kmer_cnew] = 8;
547 else
549 bal_hash_ban = hash_kmer ( bal_word );
550 hashBanBuffer[kmer_cnew] = bal_hash_ban;
551 kmerBuffer[kmer_cnew] = bal_word;
552 flagBuffer[kmer_cnew] = 9;
555 edge_id[kmer_cnew] = i;
556 ++kmer_cnew;
557 continue;
560 word = getFromKmer ( i );
561 bal_word = reverseComplement ( word, overlaplen );
563 if ( KmerSmaller ( word, bal_word ) )
565 hash_ban = hash_kmer ( word );
566 hashBanBuffer[kmer_cnew] = hash_ban;
567 kmerBuffer[kmer_cnew] = word;
568 flagBuffer[kmer_cnew] = 3;
570 else
572 bal_hash_ban = hash_kmer ( bal_word );
573 hashBanBuffer[kmer_cnew] = bal_hash_ban;
574 kmerBuffer[kmer_cnew] = bal_word;
575 flagBuffer[kmer_cnew] = 1;
578 edge_id[kmer_cnew] = i;
579 ++kmer_cnew;
580 word = kmerZero;
581 bal_word = kmerZero;
582 word = getToKmer ( i );
583 bal_word = reverseComplement ( word, overlaplen );
585 if ( KmerSmaller ( word, bal_word ) )
587 hash_ban = hash_kmer ( word );
588 hashBanBuffer[kmer_cnew] = hash_ban;
589 kmerBuffer[kmer_cnew] = word;
590 flagBuffer[kmer_cnew] = 2;
592 else
594 bal_hash_ban = hash_kmer ( bal_word );
595 hashBanBuffer[kmer_cnew] = bal_hash_ban;
596 kmerBuffer[kmer_cnew] = bal_word;
597 flagBuffer[kmer_cnew] = 0;
600 edge_id[kmer_cnew] = i;
601 ++kmer_cnew;
604 fprintf ( stderr, "%lld edge(s) deleted in length of 0.\n", deletecount );
605 KmerSetsNew = init_kmerset2 ( 1024, 0.77f );
607 for ( i = 0; i < kmer_cnew; ++i )
609 put_kmerset2 ( KmerSetsNew, kmerBuffer[i], edge_id[i], flagBuffer[i], &node );
612 num_vtnew = count_kmerset2 ( KmerSetsNew );
613 fprintf ( stderr, "%u new kmer(s).\n", num_vtnew );
614 free ( kmerBuffer );
615 kmerBuffer = NULL;
616 free ( hashBanBuffer );
617 hashBanBuffer = NULL;
618 free ( edge_id );
619 edge_id = NULL;
620 free ( flagBuffer );
621 flagBuffer = NULL;
624 /*************************************************
625 Function:
626 getNewVertex
627 Description:
628 Gets (k+1)mer vertex from hash.
629 Input:
630 None.
631 Output:
632 None.
633 Return:
634 None.
635 *************************************************/
636 void getNewVertex()
638 unsigned int i;
639 Kmer word, bal_word;
640 vt_arraynew = ( VERTEX * ) ckalloc ( 4 * ( num_vtnew + 1 ) * sizeof ( VERTEX ) );
641 num_kmer_limit = 4 * num_vtnew;
642 KmerSet2 * set;
643 kmer_t2 * node;
644 unsigned int count = 0;
645 set = KmerSetsNew;
646 set->iter_ptr = 0;
648 while ( set->iter_ptr < set->size )
650 if ( !is_kmer_entity_null ( set->flags, set->iter_ptr ) )
652 node = set->array + set->iter_ptr;
654 if ( node )
656 word = node->seq;
657 bal_word = reverseComplement ( word, overlaplen );
659 if ( KmerSmaller ( word, bal_word ) )
660 { vt_arraynew[count].kmer = word; }
661 else
662 { vt_arraynew[count].kmer = bal_word; }
664 ++count;
668 set->iter_ptr ++;
671 num_vtnew = count;
672 qsort ( &vt_arraynew[0], num_vtnew, sizeof ( vt_arraynew[0] ), cmp_vertex );
674 for ( i = 0; i < num_vtnew; ++i )
676 bal_word = reverseComplement ( vt_arraynew[i].kmer, overlaplen );
677 vt_arraynew[i + num_vtnew].kmer = bal_word;
681 /*************************************************
682 Function:
683 buildGraphHash
684 Description:
685 1. Builds (k+1)mer graph in memory based on kmer graph.
686 2. Gets (k+1)mer hash set.
687 3. Gets (k+1)mer vertex.
688 4. Updates kmer edges to (k+1)mer edges.
689 Input:
690 None.
691 Output:
692 None.
693 Return:
694 None.
695 *************************************************/
696 void buildGraphHash()
698 unsigned int i;
699 unsigned int count = 0;
700 //use from kmer & to kmer to build hash
701 fprintf ( stderr, "Construct new kmer hash.\n" );
702 getNewHash();
703 getNewVertex();
704 foundcount = 0;
705 notfoundcount = 0;
707 for ( i = 1; i <= num_ed; ++i )
709 if ( edge_array[i].deleted )
710 { continue; }
712 if ( edge_array[i].length < step ) //=
714 destroyEdge2 ( i );
715 delete1Edge ( i );
716 continue;
719 //update twin edge together
720 update1Edge ( i );
721 ++count;
724 if ( notfoundcount )
726 fprintf ( stderr, "There are %lld kmer(s) found.\n", foundcount );
727 fprintf ( stderr, "There are %lld kmer(s) not found.\n", notfoundcount );
730 fprintf ( stderr, "%u edge(s) updated to %dmer edge.\n", count, overlaplen );
732 if ( vt_array )
734 free ( ( void * ) vt_array );
735 vt_array = NULL;
738 vt_array = vt_arraynew;
739 num_vt = num_vtnew;
742 //FILE *tempF = NULL;
744 /*************************************************
745 Function:
746 addArc
747 Description:
748 1. Re-build arcs by reads.
749 // 2. Output selected reads if -r is set(keepReadFile is true).
750 Input:
751 1. libfile : the reads config file
752 2. graph : the output prefix
753 3. flag : whether the selected reads file exist
754 4. last : whether it's the last iterate
755 5. maxk : the max kmer when using multikmer
756 // 6. keepReadFile : keep tmp reads file that selected for building arc
757 Output:
758 1. *.read : the selected reads that use for assembly
759 Return:
760 None.
761 *************************************************/
762 void addArc ( char * libfile, char * graph, int flag, int last, int maxk ) //, boolean keepReadFile
765 if(!flag)
767 char readSeqName[256];
768 sprintf(readSeqName,"%s.read",graph);
769 tempF=fopen(readSeqName,"r");
771 else
773 tempF = NULL;
776 if ( flag ) // || tempF)
778 // if(tempF)
779 // fclose(tempF);
780 Read2edge2 ( libfile, graph, last, maxk ); //, keepReadFile
782 else
783 { Read2edge ( libfile, graph, maxk ); }
785 unsigned int i;
787 for ( i = 1; i <= num_ed; ++i )
789 if ( edge_array[i].deleted )
791 destroyEdge2 ( i );
795 removeDeadArcs2();
798 /*************************************************
799 Function:
800 kmer2edge
801 Description:
802 Extend edges.
803 Input:
804 1. from : whether changing from kmer
805 2. index : index of edge
806 3. ch : next char to extend
807 4. backup : backup char
808 Output:
809 None.
810 Return:
811 Index of new extended kmer.
812 *************************************************/
813 int kmer2edge ( int from, unsigned int index, char ch, char * backup )
815 //add 1 kmer to edge
816 Kmer kmer = kmerZero, bal_word = kmerZero;
817 int vt_id = 0;
818 char * tightSeq = NULL;
820 // if(edge_array[index].arcs && edge_array[edge_array[index].arcs->to_ed].length <= 1)
821 // return;
823 if ( from <= 2 )
825 kmer = vt_array[edge_array[index].from_vt].kmer;
826 *backup = lastCharInKmer ( kmer ); //.low2 & 3;
827 kmer = prevKmer ( kmer, ch );
829 else
831 kmer = vt_array[edge_array[index].to_vt].kmer;
832 *backup = ch;
833 kmer = nextKmer ( kmer, ch );
836 bal_word = reverseComplement ( kmer, overlaplen );
838 if ( KmerSmaller ( kmer, bal_word ) )
840 vt_id = bisearch ( &vt_array[0], num_vt, kmer );
842 if ( vt_id < 0 )
844 ++newnotfoundcount;
845 fprintf ( stderr, "When extending edge 'small vertex' is not found, edge %d kmer ", index );
846 printKmerSeq ( stderr, kmer );
847 fprintf ( stderr, " , it's twin " );
848 printKmerSeq ( stderr, bal_word );
849 fprintf ( stderr, " .\n" );
851 else
853 ++newfoundcount;
856 else
858 vt_id = bisearch ( &vt_array[0], num_vt, bal_word );
860 if ( vt_id >= 0 )
862 vt_id += num_vt;
863 ++newfoundcount;
865 else
867 ++newnotfoundcount;
868 fprintf ( stderr, "When extending edge 'big vertex' is not found, edge %d kmer ", index );
869 printKmerSeq ( stderr, kmer );
870 fprintf ( stderr, " , it's twin " );
871 printKmerSeq ( stderr, bal_word );
872 fprintf ( stderr, " .\n" );
876 if ( vt_id < 0 )
878 return vt_id;
881 char backup1 = 0;
882 char ch1 = int_comp ( ch );
883 Kmer kmer1 = kmerZero, bal_word1 = kmerZero;
884 int vt_id1 = 0;
885 char * tightSeq1 = NULL;
887 if ( from <= 2 )
889 kmer1 = vt_array[edge_array[index + 1].to_vt].kmer;
890 backup1 = ch1;
891 kmer1 = nextKmer ( kmer1, ch1 );
893 else
895 kmer1 = vt_array[edge_array[index + 1].from_vt].kmer;
896 backup1 = lastCharInKmer ( kmer1 ); //.low2 & 3;
897 kmer1 = prevKmer ( kmer1, ch1 );
900 bal_word1 = reverseComplement ( kmer1, overlaplen );
902 if ( KmerSmaller ( kmer1, bal_word1 ) )
904 vt_id1 = bisearch ( &vt_array[0], num_vt, kmer1 );
906 if ( vt_id1 < 0 )
908 ++newnotfoundcount;
909 fprintf ( stderr, "When extending edge 'small vertex' is not found, edge %d kmer ", index + 1 );
910 printKmerSeq ( stderr, kmer1 );
911 fprintf ( stderr, " , it's twin " );
912 printKmerSeq ( stderr, bal_word1 );
913 fprintf ( stderr, " .\n" );
915 else
917 ++newfoundcount;
920 else
922 vt_id1 = bisearch ( &vt_array[0], num_vt, bal_word1 );
924 if ( vt_id1 >= 0 )
926 vt_id1 += num_vt;
927 ++newfoundcount;
929 else
931 ++newnotfoundcount;
932 fprintf ( stderr, "When extending edge big vertex is not found, edge %d kmer ", index + 1 );
933 printKmerSeq ( stderr, kmer1 );
934 fprintf ( stderr, " , it's twin " );
935 printKmerSeq ( stderr, bal_word1 );
936 fprintf ( stderr, " .\n" );
940 if ( vt_id1 < 0 )
942 return vt_id1;
945 int i = 0;
947 if ( from <= 2 )
949 //small
950 edge_array[index].from_vt = vt_id;
951 tightSeq = ( char * ) ckalloc ( ( ( edge_array[index].length + 1 ) / 4 + 1 ) * sizeof ( char ) );
952 writeChar2tightString ( *backup, tightSeq, 0 );
954 for ( i = 0; i < edge_array[index].length; ++i )
955 { writeChar2tightString ( getCharInTightString ( edge_array[index].seq, i ), tightSeq, i + 1 ); }
957 if ( edge_array[index].seq )
959 free ( ( void * ) edge_array[index].seq );
960 edge_array[index].seq = NULL;
963 edge_array[index].seq = tightSeq;
964 edge_array[index].length += 1;
965 //big
966 edge_array[index + 1].to_vt = vt_id1;
967 tightSeq1 = ( char * ) ckalloc ( ( ( edge_array[index + 1].length + 1 ) / 4 + 1 ) * sizeof ( char ) );
969 for ( i = 0; i < edge_array[index + 1].length; ++i )
970 { writeChar2tightString ( getCharInTightString ( edge_array[index + 1].seq, i ), tightSeq1, i ); }
972 writeChar2tightString ( backup1, tightSeq1, i );
974 if ( edge_array[index + 1].seq )
976 free ( ( void * ) edge_array[index + 1].seq );
977 edge_array[index + 1].seq = NULL;
980 edge_array[index + 1].seq = tightSeq1;
981 edge_array[index + 1].length += 1;
983 else
985 //small
986 edge_array[index].to_vt = vt_id;
987 tightSeq = ( char * ) ckalloc ( ( ( edge_array[index].length + 1 ) / 4 + 1 ) * sizeof ( char ) );
989 for ( i = 0; i < edge_array[index].length; ++i )
990 { writeChar2tightString ( getCharInTightString ( edge_array[index].seq, i ), tightSeq, i ); }
992 writeChar2tightString ( *backup, tightSeq, i );
994 if ( edge_array[index].seq )
996 free ( ( void * ) edge_array[index].seq );
997 edge_array[index].seq = NULL;
1000 edge_array[index].seq = tightSeq;
1001 edge_array[index].length += 1;
1002 //big
1003 edge_array[index + 1].from_vt = vt_id1;
1004 tightSeq1 = ( char * ) ckalloc ( ( ( edge_array[index + 1].length + 1 ) / 4 + 1 ) * sizeof ( char ) );
1005 writeChar2tightString ( backup1, tightSeq1, 0 );
1007 for ( i = 0; i < edge_array[index + 1].length; ++i )
1008 { writeChar2tightString ( getCharInTightString ( edge_array[index + 1].seq, i ), tightSeq1, i + 1 ); }
1010 if ( edge_array[index + 1].seq )
1012 free ( ( void * ) edge_array[index + 1].seq );
1013 edge_array[index + 1].seq = NULL;
1016 edge_array[index + 1].seq = tightSeq1;
1017 edge_array[index + 1].length += 1;
1020 return 0;
1023 //Add edge and reverse complement.
1024 void addEdge ( unsigned int from, unsigned int to, char ch, int bal_edge, unsigned int cvg )
1026 if ( num_ed_temp + 1 > num_ed_limit )
1028 unsigned int new_num_ed = num_ed_limit * 1.2;
1029 edge_array = ( EDGE * ) ckrealloc ( edge_array, ( new_num_ed + 1 ) * sizeof ( EDGE ), ( num_ed_limit + 1 ) * sizeof ( EDGE ) );
1030 num_ed_limit = new_num_ed;
1031 int j;
1033 for ( j = num_ed_temp + 1; j <= num_ed_limit; j++ )
1035 edge_array[j].seq = NULL;
1038 fprintf ( stderr, "Realloc edge array.\n" );
1041 char * tightSeq = ( char * ) ckalloc ( sizeof ( char ) );
1042 writeChar2tightString ( ch, tightSeq, 0 );
1043 edge_array[num_ed_temp + 1].from_vt = from;
1044 edge_array[num_ed_temp + 1].to_vt = to;
1045 edge_array[num_ed_temp + 1].length = 1;
1046 edge_array[num_ed_temp + 1].cvg = cvg;
1047 edge_array[num_ed_temp + 1].bal_edge = bal_edge;
1048 edge_array[num_ed_temp + 1].multi = 0;
1049 edge_array[num_ed_temp + 1].deleted = 0;
1050 edge_array[num_ed_temp + 1].flag = 0;
1052 if ( edge_array[num_ed_temp + 1].seq )
1053 { free ( edge_array[num_ed_temp + 1].seq ); }
1055 edge_array[num_ed_temp + 1].seq = tightSeq;
1056 edge_array[num_ed_temp + 1].rv = NULL;
1057 edge_array[num_ed_temp + 1].arcs = NULL;
1058 edge_array[num_ed_temp + 1].markers = NULL;
1059 ++num_ed_temp;
1062 //Check whether kmers are equal to the front kmer and last kmer of the edge.
1063 boolean checkEqual ( unsigned int from, unsigned int to, char ch, unsigned int index, unsigned int * getIndex )
1065 if ( edge_array[index].length == 1 && ( ( edge_array[index].from_vt == from && edge_array[index].to_vt == to ) ) )
1067 return true;
1070 return false;
1073 //Whether edge exist in set.
1074 boolean EdgeExist ( unsigned int from, unsigned int to, char ch, kmer_t2 * node, unsigned int * index )
1076 int i = 0;
1077 EDGEID * temp = node->edgeId;
1079 while ( temp )
1081 *index = temp->edge;
1083 if ( checkEqual ( from, to, ch, temp->edge, index ) )
1084 { return true; }
1086 temp = temp->next;
1089 return false;
1092 //Update edgeId in node.
1093 void updateNode ( kmer_t2 * node, kmer_t2 * node1 )
1095 struct edgeID * edgeid;
1096 edgeid = ( struct edgeID * ) malloc ( sizeof ( struct edgeID ) );
1097 edgeid->edge = num_ed_temp + 1;
1098 edgeid->flag = 0;
1099 edgeid->next = NULL;
1101 if ( node->edgeId )
1102 { edgeid->next = node->edgeId; }
1104 node->edgeId = edgeid;
1105 node->count++;
1106 edgeid = ( struct edgeID * ) malloc ( sizeof ( struct edgeID ) );
1107 edgeid->edge = num_ed_temp + 2;
1108 edgeid->flag = 1;
1109 edgeid->next = NULL;
1111 if ( node->edgeId )
1112 { edgeid->next = node->edgeId; }
1114 node->edgeId = edgeid;
1115 node->count++;
1116 edgeid = ( struct edgeID * ) malloc ( sizeof ( struct edgeID ) );
1117 edgeid->edge = num_ed_temp + 1;
1118 edgeid->flag = 2;
1119 edgeid->next = NULL;
1121 if ( node1->edgeId )
1122 { edgeid->next = node1->edgeId; }
1124 node1->edgeId = edgeid;
1125 node1->count++;
1126 edgeid = ( struct edgeID * ) malloc ( sizeof ( struct edgeID ) );
1127 edgeid->edge = num_ed_temp + 2;
1128 edgeid->flag = 3;
1129 edgeid->next = NULL;
1131 if ( node1->edgeId )
1132 { edgeid->next = node1->edgeId; }
1134 node1->edgeId = edgeid;
1135 node1->count++;
1138 /*************************************************
1139 Function:
1140 checkindegree
1141 Description:
1142 1. Check whether it can be solved or not.
1143 2. Add edge when necessary.
1144 Input:
1145 1. from : whether it's 'from kmer'
1146 2. from_ed : index of 'from edge'
1147 3. to_ed : index of 'to edge'
1148 4. node : node of 'last kmer' of 'from edge'
1149 5. node1 : node of 'front kmer' of 'to edge'
1150 6. maxk : max kmer
1151 Output:
1152 None.
1153 Return:
1154 None.
1155 *************************************************/
1156 void checkindegree ( int from, unsigned int from_ed, unsigned int to_ed, kmer_t2 * node, kmer_t2 * node1, int maxk )
1158 int arcLeft_n = 0;
1159 int arcRight_n = 0;
1160 boolean exist = false;
1161 char ch = lastCharInKmer ( vt_array[edge_array[to_ed].from_vt].kmer ); //.low2 & 3;
1162 char backup = lastCharInKmer ( KmerRightBitMove ( vt_array[edge_array[from_ed].to_vt].kmer, ( overlaplen - 1 ) << 1 ) ); //.low2 & 3;
1163 unsigned int index;
1164 ARC * originalArc = NULL;
1166 if ( from <= 2 )
1168 //out->in > 1
1169 arcCount2 ( from_ed, &arcRight_n );
1171 if ( arcRight_n > 1 )
1173 exist = EdgeExist ( edge_array[from_ed].to_vt, edge_array[to_ed].from_vt, ch, node, &index );
1175 if ( !exist )
1177 updateNode ( node, node1 );
1178 originalArc = getArcBetween ( from_ed, to_ed );
1179 edgeaddnumber += 2;
1180 addEdge ( edge_array[from_ed].to_vt, edge_array[to_ed].from_vt, ch, 2, originalArc->multiplicity * 10 );
1181 // if(overlaplen + step > maxk)
1183 arcBuffer[0][arcBufferCount] = from_ed;
1184 arcBuffer[1][arcBufferCount] = num_ed_temp;
1185 arcBuffer[2][arcBufferCount++] = to_ed;
1187 addEdge ( edge_array[getTwinEdge ( to_ed )].to_vt, edge_array[getTwinEdge ( from_ed )].from_vt, int_comp ( backup ), 0, originalArc->multiplicity * 10 );
1189 else
1191 // if(overlaplen + step > maxk)
1193 arcBuffer[0][arcBufferCount] = from_ed;
1194 arcBuffer[1][arcBufferCount] = index;
1195 arcBuffer[2][arcBufferCount++] = to_ed;
1200 else
1202 //out->in > 1
1203 arcCount2 ( getTwinEdge ( to_ed ), &arcLeft_n );
1205 if ( arcLeft_n > 1 )
1207 exist = EdgeExist ( edge_array[from_ed].to_vt, edge_array[to_ed].from_vt, ch, node, &index );
1209 if ( !exist )
1211 updateNode ( node, node1 );
1212 originalArc = getArcBetween ( from_ed, to_ed );
1213 edgeaddnumber += 2;
1214 addEdge ( edge_array[from_ed].to_vt, edge_array[to_ed].from_vt, ch, 2, originalArc->multiplicity * 10 );
1215 // if(overlaplen + step > maxk)
1217 arcBuffer[0][arcBufferCount] = from_ed;
1218 arcBuffer[1][arcBufferCount] = num_ed_temp;
1219 arcBuffer[2][arcBufferCount++] = to_ed;
1221 addEdge ( edge_array[getTwinEdge ( to_ed )].to_vt, edge_array[getTwinEdge ( from_ed )].from_vt, int_comp ( backup ), 0, originalArc->multiplicity * 10 );
1223 else
1225 // if(overlaplen + step > maxk)
1227 arcBuffer[0][arcBufferCount] = from_ed;
1228 arcBuffer[1][arcBufferCount] = index;
1229 arcBuffer[2][arcBufferCount++] = to_ed;
1236 //Add arc between two edges.
1237 static void add1Arc2 ( unsigned int from_ed, unsigned int to_ed, unsigned int weight )
1239 if ( edge_array[from_ed].to_vt != edge_array[to_ed].from_vt )
1241 fprintf ( stderr, "Warning : Inconsistant joins between %d and %d.\n", from_ed, to_ed );
1244 unsigned int bal_fe = getTwinEdge ( from_ed );
1245 unsigned int bal_te = getTwinEdge ( to_ed );
1247 // fprintf(stderr, "from %u, bal %u\n", from_ed, bal_fe);
1248 // fprintf(stderr, "to %u, bal %u\n", to_ed, bal_te);
1250 if ( from_ed > num_ed_temp || to_ed > num_ed_temp || bal_fe > num_ed_temp || bal_te > num_ed_temp )
1252 fprintf ( stderr, "Error : Edge id is out of range.\n" );
1253 return;
1256 ARC * parc, *bal_parc;
1257 //both arcs already exist
1258 parc = getArcBetween ( from_ed, to_ed );
1260 if ( parc )
1262 bal_parc = parc->bal_arc;
1263 parc->multiplicity += weight;
1264 bal_parc->multiplicity += weight;
1265 return;
1268 //create new arcs
1269 parc = allocateArc ( to_ed );
1270 parc->multiplicity = weight;
1271 parc->prev = NULL;
1273 if ( edge_array[from_ed].arcs )
1274 { edge_array[from_ed].arcs->prev = parc; }
1276 parc->next = edge_array[from_ed].arcs;
1277 edge_array[from_ed].arcs = parc;
1279 // A->A'
1280 if ( bal_te == from_ed )
1282 parc->bal_arc = parc;
1283 parc->multiplicity += weight;
1284 return;
1287 bal_parc = allocateArc ( bal_fe );
1288 bal_parc->multiplicity = weight;
1289 bal_parc->prev = NULL;
1291 if ( edge_array[bal_te].arcs )
1292 { edge_array[bal_te].arcs->prev = bal_parc; }
1294 bal_parc->next = edge_array[bal_te].arcs;
1295 edge_array[bal_te].arcs = bal_parc;
1296 //link them to each other
1297 parc->bal_arc = bal_parc;
1298 bal_parc->bal_arc = parc;
1301 //Count step between two neighbour edges.
1302 int countstep ( unsigned int to_vt, unsigned int from_vt )
1304 if ( to_vt == from_vt )
1306 return 0;
1309 Kmer to, from;
1310 Kmer filtertemp;
1311 to = vt_array[to_vt].kmer;
1312 from = vt_array[from_vt].kmer;
1313 int i = 0;
1315 for ( i = 0; i <= nowstep2; ++i )
1317 filtertemp = createFilter ( overlaplen - i );
1319 if ( KmerEqual ( KmerRightBitMove ( from, i << 1 ), KmerAnd ( to, filtertemp ) ) )
1320 { return i; }
1323 return -1;
1326 /*************************************************
1327 Function:
1328 freshEdge
1329 Description:
1330 Extend edges or add edges.
1331 Input:
1332 1. maxk : max kmer of multi kmer
1333 Output:
1334 None.
1335 Return:
1336 None.
1337 *************************************************/
1338 void freshEdge ( int maxk )
1340 unsigned int i = 0, j = 0;
1341 boolean found = 0;
1342 kmer_t2 * node, *node1;
1343 int count = 0;
1344 char ch = 0;
1345 Kmer word, bal_word;
1346 int from_vt_id = 0, to_vt_id = 0;
1347 char from_backup, to_backup;
1348 char * tightSeq = NULL;
1349 int bal_ed = 0;
1350 int arcLeft_n = 0, arcRight_n = 0;
1351 ARC * temparc = NULL;
1352 unsigned int tempto_ed = 0;
1353 fprintf ( stderr, "There are %d edge(s).\n", num_ed );
1354 // if(overlaplen + step > maxk)
1356 arcBuffer = ( unsigned int ** ) ckalloc ( sizeof ( unsigned int * ) * 3 );
1357 arcBuffer[0] = ( unsigned int * ) ckalloc ( sizeof ( unsigned int ) * num_ed * 3 );
1358 arcBuffer[1] = ( unsigned int * ) ckalloc ( sizeof ( unsigned int ) * num_ed * 3 );
1359 arcBuffer[2] = ( unsigned int * ) ckalloc ( sizeof ( unsigned int ) * num_ed * 3 );
1361 int count_noextend = 0;
1362 num_ed_temp = num_ed;
1364 for ( i = 1; i <= num_ed; ++i )
1366 if ( edge_array[i].deleted || EdSameAsTwin ( i ) )
1367 { continue; }
1369 bal_ed = getTwinEdge ( i );
1370 arcCount2 ( i, &arcRight_n );
1371 arcCount2 ( bal_ed, &arcLeft_n );
1373 if ( arcLeft_n == 1 )
1375 if ( edge_array[bal_ed].to_vt != edge_array[edge_array[bal_ed].arcs->to_ed].from_vt )
1377 ch = lastCharInKmer ( KmerRightBitMove ( vt_array[edge_array[getTwinEdge ( edge_array[bal_ed].arcs->to_ed )].to_vt].kmer, ( overlaplen - 1 ) << 1 ) ); //.low2 & 3;
1378 int temp = kmer2edge ( 1, i, ch, &from_backup );
1380 if ( temp != 0 )
1381 { count_noextend++; }
1384 else if ( arcLeft_n > 1 )
1386 word = vt_array[edge_array[i].from_vt].kmer;
1387 bal_word = reverseComplement ( word, overlaplen );
1389 if ( KmerSmaller ( word, bal_word ) )
1390 { found = search_kmerset2 ( KmerSetsNew, word, &node ); }
1391 else
1392 { found = search_kmerset2 ( KmerSetsNew, bal_word, &node ); }
1394 if ( !found )
1396 fprintf ( stderr, "When refreshing edges, 'from vertex' is not found, to_vt %d kmer ", edge_array[i].from_vt );
1397 printKmerSeq ( stderr, vt_array[edge_array[i].from_vt].kmer );
1398 fprintf ( stderr, " .\n" );
1399 exit ( -1 );
1402 // if(overlaplen < maxk)
1404 temparc = edge_array[bal_ed].arcs;
1406 while ( temparc )
1408 tempto_ed = getTwinEdge ( temparc->to_ed );
1409 word = vt_array[edge_array[tempto_ed].to_vt].kmer;
1410 bal_word = reverseComplement ( word, overlaplen );
1412 if ( KmerSmaller ( word, bal_word ) )
1413 { found = search_kmerset2 ( KmerSetsNew, word, &node1 ); }
1414 else
1415 { found = search_kmerset2 ( KmerSetsNew, bal_word, &node1 ); }
1417 if ( !found )
1419 fprintf ( stderr, "When refreshing edges, 'to vertex' is not found, to_vt %d kmer ", edge_array[tempto_ed].to_vt );
1420 printKmerSeq ( stderr, vt_array[edge_array[tempto_ed].to_vt].kmer );
1421 fprintf ( stderr, " .\n" );
1422 exit ( -1 );
1425 if ( node1 && node )
1426 { checkindegree ( 1, tempto_ed, i, node1, node, maxk ); }
1428 temparc = temparc->next;
1433 if ( arcRight_n == 1 )
1435 if ( edge_array[i].to_vt != edge_array[edge_array[i].arcs->to_ed].from_vt )
1437 ch = lastCharInKmer ( vt_array[edge_array[edge_array[i].arcs->to_ed].from_vt].kmer ); //.low2 & 3;
1438 int temp = kmer2edge ( 3, i, ch, &to_backup );
1440 if ( temp != 0 )
1441 { count_noextend++; }
1444 else if ( arcRight_n > 1 )
1446 word = vt_array[edge_array[i].to_vt].kmer;
1447 bal_word = reverseComplement ( word, overlaplen );
1449 if ( KmerSmaller ( word, bal_word ) )
1450 { found = search_kmerset2 ( KmerSetsNew, word, &node ); }
1451 else
1452 { found = search_kmerset2 ( KmerSetsNew, bal_word, &node ); }
1454 if ( !found )
1456 fprintf ( stderr, "When refreshing edges, 'to vertex' is not found, to_vt %d kmer ", edge_array[i].to_vt );
1457 printKmerSeq ( stderr, vt_array[edge_array[i].to_vt].kmer );
1458 fprintf ( stderr, " .\n" );
1459 exit ( -1 );
1462 // if(overlaplen < maxk)
1464 temparc = edge_array[i].arcs;
1466 while ( temparc )
1468 tempto_ed = temparc->to_ed;
1469 word = vt_array[edge_array[tempto_ed].from_vt].kmer;
1470 bal_word = reverseComplement ( word, overlaplen );
1472 if ( KmerSmaller ( word, bal_word ) )
1473 { found = search_kmerset2 ( KmerSetsNew, word, &node1 ); }
1474 else
1475 { found = search_kmerset2 ( KmerSetsNew, bal_word, &node1 ); }
1477 if ( !found )
1479 fprintf ( stderr, "When refreshing edges, 'from vertex' is not found, from_vt %d kmer ", edge_array[tempto_ed].from_vt );
1480 printKmerSeq ( stderr, vt_array[edge_array[tempto_ed].from_vt].kmer );
1481 fprintf ( stderr, " .\n" );
1482 exit ( -1 );
1485 if ( node1 && node )
1486 { checkindegree ( 3, i, tempto_ed, node, node1, maxk ); }
1488 temparc = temparc->next;
1493 //two edge change at the same time
1494 ++i;
1497 if ( count_noextend )
1498 { fprintf ( stderr, "%d edge(s) not extended.\n", count_noextend ); }
1500 // if(overlaplen + step > maxk)
1502 ARC * tempArc, *tempBalArc, *originalArc, *temp, *bal_temp;
1503 unsigned int from = 0;
1504 unsigned int mid = 0;
1505 unsigned int to = 0;
1506 int count_arcdelete = 0, count_arcadd = 0;
1507 int arcmulti = 0;
1508 int arcnotfound = 0;
1510 for ( i = 0; i < arcBufferCount; ++i )
1512 from = arcBuffer[0][i];
1513 mid = arcBuffer[1][i];
1514 to = arcBuffer[2][i];
1516 if ( from > num_ed || mid > num_ed_temp || to > num_ed )
1518 fprintf ( stderr, "Error : Edge id is out of range.\n" );
1519 exit ( -1 );
1522 originalArc = getArcBetween ( from, to );
1524 if ( originalArc )
1526 arcmulti = originalArc->multiplicity;
1527 count_arcdelete++;
1528 edge_array[from].arcs = deleteArc ( edge_array[from].arcs, originalArc );
1529 count_arcadd += 2;
1530 add1Arc2 ( from, mid, arcmulti );
1531 add1Arc2 ( mid, to, arcmulti );
1533 else
1535 originalArc = getArcBetween ( getTwinEdge ( to ), getTwinEdge ( from ) );
1537 if ( originalArc )
1539 arcmulti = originalArc->multiplicity;
1540 count_arcdelete++;
1541 edge_array[getTwinEdge ( to )].arcs = deleteArc ( edge_array[getTwinEdge ( to )].arcs, originalArc );
1543 else
1545 ++arcnotfound;
1546 arcmulti = 2;
1549 count_arcadd += 2;
1550 add1Arc2 ( from, mid, arcmulti );
1551 add1Arc2 ( mid, to, arcmulti );
1555 fprintf ( stderr, "Add edges to the graph: %d arc(s) deleted, %d arc(s) added.\n", count_arcdelete, count_arcadd );
1557 if ( arcnotfound )
1558 { fprintf ( stderr, "Warning : %d arc(s) are not found when checking.\n", arcnotfound ); }
1560 arcBufferCount = 0;
1562 num_ed = num_ed_temp;
1563 // if(overlaplen + step > maxk)
1565 free ( arcBuffer[2] );
1566 free ( arcBuffer[1] );
1567 free ( arcBuffer[0] );
1568 free ( arcBuffer );
1572 //Copy edge from source to target.
1573 void copy1Edge ( EDGE * source, EDGE * target )
1575 target->from_vt = source->from_vt;
1576 target->to_vt = source->to_vt;
1577 target->length = source->length;
1578 target->cvg = source->cvg;
1579 target->multi = source->multi;
1581 if ( target->seq )
1583 free ( ( void * ) target->seq );
1586 target->seq = source->seq;
1587 source->seq = NULL;
1588 target->arcs = source->arcs;
1589 source->arcs = NULL;
1590 target->deleted = source->deleted;
1593 //Check whether two bases are equal.
1594 int BaseEqual ( char ch1, char ch2 )
1596 if ( ch1 == ch2 )
1597 { return 0; }
1598 else if ( ch1 > ch2 )
1599 { return 1; }
1600 else
1601 { return -1; }
1604 //Check whether two edges are equal.
1605 int EdgeEqual ( unsigned int prev, unsigned int next )
1607 int i = 0;
1608 int length = edge_array[prev].length;
1609 char ch1, ch2;
1610 int equal = 0;
1612 for ( i = 0; i < length; ++i )
1614 ch1 = int2base ( ( int ) getCharInTightString ( edge_array[prev].seq, i ) );
1615 ch2 = int2base ( ( int ) getCharInTightString ( edge_array[next].seq, i ) );
1617 if ( ( equal = BaseEqual ( ch1, ch2 ) ) )
1619 return equal;
1623 return 0;
1625 /*************************************************
1626 Function:
1627 swapedge
1628 Description:
1629 Re-arrange the edges, swap smaller edges at front.
1630 Input:
1631 None.
1632 Output:
1633 None.
1634 Return:
1635 None.
1636 *************************************************/
1637 void swapedge()
1639 unsigned int i;
1640 ARC * arc, *bal_arc, *temp_arc;
1641 int count_swap = 0, count_equal = 0;
1643 for ( i = 1; i <= num_ed; ++i )
1645 if ( edge_array[i].deleted || EdSameAsTwin ( i ) )
1646 { continue; }
1648 if ( EdSmallerThanTwin ( i ) )
1650 if ( KmerLarger ( vt_array[edge_array[i].from_vt].kmer, vt_array[edge_array[i + 1].from_vt].kmer ) )
1652 count_swap++;
1653 copyEdge ( i, num_ed + 1 + 1 );
1654 copyEdge ( i + 1, num_ed + 1 );
1655 copyEdge ( num_ed + 1, i );
1656 copyEdge ( num_ed + 1 + 1, i + 1 );
1657 edge_array[i].bal_edge = 2;
1658 edge_array[i + 1].bal_edge = 0;
1659 //take care of the arcs
1660 arc = edge_array[i].arcs;
1662 while ( arc )
1664 arc->bal_arc->to_ed = i + 1;
1665 arc = arc->next;
1668 arc = edge_array[i + 1].arcs;
1670 while ( arc )
1672 arc->bal_arc->to_ed = i;
1673 arc = arc->next;
1676 else if ( KmerEqual ( vt_array[edge_array[i].from_vt].kmer, vt_array[edge_array[i + 1].from_vt].kmer ) )
1678 int temp = EdgeEqual ( i, i + 1 );
1680 if ( temp == 0 )
1682 count_equal++;
1683 edge_array[i].bal_edge = 1;
1684 delete1Edge ( i + 1 );
1685 //take care of the arcs
1686 arc = edge_array[i].arcs;
1688 while ( arc )
1690 arc->bal_arc->to_ed = i;
1691 arc = arc->next;
1694 bal_arc = edge_array[i + 1].arcs;
1695 edge_array[i + 1].arcs = NULL;
1697 while ( bal_arc )
1699 temp_arc = bal_arc;
1700 bal_arc = bal_arc->next;
1702 if ( edge_array[i].arcs )
1703 { edge_array[i].arcs->prev = temp_arc; }
1705 temp_arc->next = edge_array[i].arcs;
1706 edge_array[i].arcs = temp_arc;
1709 else if ( temp > 0 )
1711 count_swap++;
1712 copyEdge ( i, num_ed + 1 + 1 );
1713 copyEdge ( i + 1, num_ed + 1 );
1714 copyEdge ( num_ed + 1, i );
1715 copyEdge ( num_ed + 1 + 1, i + 1 );
1716 edge_array[i].bal_edge = 2;
1717 edge_array[i + 1].bal_edge = 0;
1718 //take care of the arcs
1719 arc = edge_array[i].arcs;
1721 while ( arc )
1723 arc->bal_arc->to_ed = i + 1;
1724 arc = arc->next;
1727 arc = edge_array[i + 1].arcs;
1729 while ( arc )
1731 arc->bal_arc->to_ed = i;
1732 arc = arc->next;
1737 ++i;
1739 else
1741 delete1Edge ( i );
1742 fprintf ( stderr, "Warning : Front edge %d is larger than %d.\n", i, i + 1 );
1746 fprintf ( stderr, "%d none-palindrome edge(s) swapped, %d palindrome edge(s) processed.\n", count_swap, count_equal );
1748 /*************************************************
1749 Function:
1750 cmp_seq
1751 Description:
1752 Compare two seq.
1753 Input:
1754 1. a : seq a.
1755 2. b : seq b.
1756 Output:
1757 None.
1758 Return:
1759 1 if a larger than b.
1760 -1 if a smaller than b.
1761 0 if a equal to b.
1762 *************************************************/
1763 static int cmp_seq ( const void * a, const void * b )
1765 EDGE_SUB * A, *B;
1766 A = ( EDGE_SUB * ) a;
1767 B = ( EDGE_SUB * ) b;
1769 if ( KmerLarger ( vt_array[A->from_vt].kmer, vt_array[B->from_vt].kmer ) )
1771 return 1;
1773 else if ( KmerSmaller ( vt_array[A->from_vt].kmer , vt_array[B->from_vt].kmer ) )
1775 return -1;
1777 else
1779 if ( A->seq[0] > B->seq[0] )
1781 return 1;
1783 else if ( A->seq[0] == B->seq[0] )
1785 int i = 0;
1787 for ( i = 1; i < A->length && i < B->length; i++ )
1789 if ( getCharInTightString ( A->seq, i ) > getCharInTightString ( B->seq, i ) )
1790 { return 1; }
1791 else if ( getCharInTightString ( A->seq, i ) < getCharInTightString ( B->seq, i ) )
1792 { return -1; }
1795 if ( i == A->length && i < B->length )
1796 { return -1; }
1797 else if ( i < A->length && i == B->length )
1798 { return 1; }
1799 else
1801 printKmerSeq ( stderr , vt_array[A->from_vt].kmer );
1802 fprintf ( stderr , "\n" );
1803 printKmerSeq ( stderr , vt_array[B->from_vt].kmer );
1804 fprintf ( stderr , "\n" );
1806 for ( i = 0; i < A->length; i++ )
1808 fprintf ( stderr, "%c", int2base ( ( int ) getCharInTightString ( A->seq, i ) ) );
1811 fprintf ( stderr , "\n" );
1813 for ( i = 0; i < B->length; i++ )
1815 fprintf ( stderr, "%c", int2base ( ( int ) getCharInTightString ( B->seq, i ) ) );
1818 fprintf ( stderr , "\n" );
1819 fprintf ( stderr, "cmp_seq:\terr\n" );
1820 exit ( 0 );
1821 return 0;
1824 else
1826 return -1;
1831 //Copy edge from source to target.
1832 static void copyOneEdge ( EDGE * target , EDGE * source )
1834 target->from_vt = source->from_vt;
1835 target->to_vt = source->to_vt;
1836 target->length = source->length;
1837 target->cvg = source->cvg;
1838 target->multi = source->multi;
1839 target->flag = source->flag;
1840 target->bal_edge = source->bal_edge;
1841 target->seq = source->seq;
1842 source->seq = NULL;
1843 target->arcs = source->arcs;
1844 source->arcs = NULL ;
1845 target->markers = source->markers;
1846 source->markers = NULL;
1847 target->deleted = source->deleted;
1850 /*************************************************
1851 Function:
1852 updateArcToEd
1853 Description:
1854 Update the arcs of edges.
1855 Input:
1856 1. ed_index : the index of edge
1857 Output:
1858 None.
1859 Return:
1860 None.
1861 *************************************************/
1862 static void updateArcToEd ( unsigned int ed_index )
1864 ARC * arc = edge_array[ed_index].arcs;
1866 while ( arc )
1868 arc->to_ed = index_array[arc->to_ed];
1869 arc = arc->next;
1873 /*************************************************
1874 Function:
1875 sortedge
1876 Description:
1877 Sort edges base on seq of edges.
1878 Input:
1879 None.
1880 Output:
1881 None.
1882 Return:
1883 None.
1884 *************************************************/
1885 ///*
1886 void sortedge()
1888 unsigned int index ;
1889 EDGE_SUB * sort_edge;
1890 sort_edge = ( EDGE_SUB * ) ckalloc ( sizeof ( EDGE_SUB ) * ( num_ed + 1 ) );
1891 unsigned int i = 1;
1893 for ( index = 1 ; index <= num_ed ; index ++ )
1895 sort_edge[i].from_vt = edge_array[index].from_vt;
1896 sort_edge[i].seq = edge_array[index].seq;
1897 sort_edge[i].to_vt = index; // record old id
1898 sort_edge[i].length = edge_array[index].length;
1899 i++;
1901 if ( !EdSameAsTwin ( index ) )
1903 index++;
1907 qsort ( & ( sort_edge[1] ), i - 1, sizeof ( sort_edge[1] ), cmp_seq );
1908 index_array = ( unsigned int * ) ckalloc ( sizeof ( unsigned int ) * ( num_ed + 1 ) ); // used to record new id
1909 unsigned int new_index = 1, old_index;
1911 for ( index = 1; index <= i - 1; index++ )
1913 old_index = sort_edge[index].to_vt; // old id
1914 sort_edge[index].seq = NULL;
1915 index_array[old_index] = new_index++;// old id -> new id
1917 if ( !EdSameAsTwin ( old_index ) )
1919 index_array[old_index + 1] = new_index++; // old id -> new id
1923 bool * copy_array = (bool * ) ckalloc ( sizeof ( bool ) * ( num_ed + 1 ) );
1924 EDGE *old_edge = ( EDGE * ) ckalloc ( sizeof ( EDGE ) );
1925 EDGE *new_edge = ( EDGE * ) ckalloc ( sizeof ( EDGE ) );
1926 unsigned int next_index;
1927 for ( index = 1; index <= num_ed; index++ )
1929 if(!copy_array[index])
1931 next_index = index;
1932 new_index = index_array[next_index];
1933 if(!copy_array[next_index])// && next_index != new_index
1935 if(copy_array[new_index])
1937 fprintf(stderr, "Copy error: never reach here.");
1939 copy_array[next_index] = 1;
1940 if(next_index != new_index)
1942 copyOneEdge (old_edge, &(edge_array[new_index]));
1943 copyOneEdge ( & ( edge_array[new_index] ), & ( edge_array[next_index] ) );
1945 updateArcToEd ( new_index );
1947 next_index = new_index;
1948 new_index = index_array[next_index];
1949 while(!copy_array[next_index])
1951 if(next_index == new_index)
1953 fprintf(stderr, "Index error: never reach here.");
1955 copy_array[next_index] = 1;
1956 copyOneEdge (new_edge, &(edge_array[new_index]));
1957 copyOneEdge ( & ( edge_array[new_index] ), old_edge);
1958 updateArcToEd ( new_index );
1959 copyOneEdge (old_edge, new_edge);
1961 next_index = new_index;
1962 new_index = index_array[next_index];
1968 free (copy_array);
1969 free (old_edge);
1970 free (new_edge);
1971 free ( index_array );
1972 free ( sort_edge );
1973 fprintf(stderr, "%d edge(s) sorted.\n", num_ed);
1975 //*/
1977 void sortedge()
1979 unsigned int index ;
1980 EDGE * backup_edge ;
1981 EDGE_SUB * sort_edge;
1982 sort_edge = ( EDGE_SUB * ) ckalloc ( sizeof ( EDGE_SUB ) * ( num_ed + 1 ) );
1983 backup_edge = ( EDGE * ) ckalloc ( sizeof ( EDGE ) * ( num_ed + 1 ) );
1984 unsigned int i = 1;
1986 for ( index = 1 ; index <= num_ed ; index ++ )
1988 sort_edge[i].from_vt = edge_array[index].from_vt;
1989 sort_edge[i].seq = edge_array[index].seq;
1990 sort_edge[i].to_vt = index; // record old id
1991 sort_edge[i].length = edge_array[index].length;
1992 i++;
1993 copyOneEdge ( & ( backup_edge[index] ) , & ( edge_array[index] ) );
1995 if ( !EdSameAsTwin ( index ) )
1997 index++;
1998 copyOneEdge ( & ( backup_edge[index] ) , & ( edge_array[index] ) );
2002 qsort ( & ( sort_edge[1] ), i - 1, sizeof ( sort_edge[1] ), cmp_seq );
2003 index_array = ( unsigned int * ) ckalloc ( sizeof ( unsigned int ) * ( num_ed + 1 ) ); // used to record new id
2004 unsigned int new_index = 1, old_index;
2006 for ( index = 1; index <= i - 1; index++ )
2008 old_index = sort_edge[index].to_vt; // old id
2009 sort_edge[index].seq = NULL;
2010 index_array[old_index] = new_index++;// old id -> new id
2012 if ( !EdSameAsTwin ( old_index ) )
2014 index_array[old_index + 1] = new_index++; // old id -> new id
2018 for ( index = 1; index <= num_ed; index++ )
2020 new_index = index_array[index];
2021 copyOneEdge ( & ( edge_array[new_index] ), & ( backup_edge[index] ) );
2022 updateArcToEd ( new_index );
2025 free ( index_array );
2026 free ( sort_edge );
2027 free ( backup_edge );
2028 fprintf(stderr, "%d edge(s) sorted.\n", num_ed);
2031 /*************************************************
2032 Function:
2033 delete0Edge
2034 Description:
2035 Delete edge whose leght is zero.
2036 Input:
2037 None.
2038 Output:
2039 None.
2040 Return:
2041 None.
2042 *************************************************/
2043 void delete0Edge()
2045 unsigned int i = 0;
2046 ARC * arc_left, *arc_right;
2047 arcBufferCount = 0;
2048 arcBuffer = ( unsigned int ** ) ckalloc ( sizeof ( unsigned int * ) * 3 );
2049 arcBuffer[0] = ( unsigned int * ) ckalloc ( sizeof ( unsigned int ) * num_ed * 3 );
2050 arcBuffer[1] = ( unsigned int * ) ckalloc ( sizeof ( unsigned int ) * num_ed * 3 );
2051 arcBuffer[2] = ( unsigned int * ) ckalloc ( sizeof ( unsigned int ) * num_ed * 3 );
2053 for ( i = 1; i <= num_ed; ++i )
2055 if ( edge_array[i].deleted || EdSameAsTwin ( i ) )
2056 { continue; }
2058 if ( edge_array[i].length == 0 )
2060 arc_left = edge_array[i + 1].arcs;
2062 while ( arc_left )
2064 arc_right = edge_array[i].arcs;
2066 while ( arc_right )
2068 arcBuffer[0][arcBufferCount] = getTwinEdge ( arc_left->to_ed );
2069 arcBuffer[1][arcBufferCount] = ( arc_left->multiplicity + arc_right->multiplicity + 1 ) / 2;
2070 arcBuffer[2][arcBufferCount++] = arc_right->to_ed;
2071 arc_right = arc_right->next;
2074 arc_left = arc_left->next;
2078 ++i;
2081 unsigned int from = 0;
2082 unsigned int multi = 0;
2083 unsigned int to = 0;
2084 int count_edgedelete = 0, count_arcadd = 0;
2086 for ( i = 1; i <= num_ed; ++i )
2088 if ( edge_array[i].deleted || EdSameAsTwin ( i ) )
2089 { continue; }
2091 if ( edge_array[i].length == 0 )
2093 destroyEdge2 ( i );
2094 count_edgedelete += 2;
2098 removeDeadArcs2();
2100 for ( i = 0; i < arcBufferCount; ++i )
2102 from = arcBuffer[0][i];
2103 multi = arcBuffer[1][i];
2104 to = arcBuffer[2][i];
2106 if ( from == 0 || to == 0 )
2108 fprintf ( stderr, "Error : Edge id is zero.\n" );
2109 continue;
2112 if ( from > num_ed || to > num_ed )
2114 fprintf ( stderr, "Error : Edge id is out of range.\n" );
2115 continue;
2118 count_arcadd++;
2119 add1Arc2 ( from, to, multi );
2122 arcBufferCount = 0;
2123 fprintf ( stderr, "%d edge(s) in length of 0, %d arc(s) added.\n", count_edgedelete, count_arcadd );
2124 free ( arcBuffer[0] );
2125 free ( arcBuffer[1] );
2126 free ( arcBuffer[2] );
2127 free ( arcBuffer );
2130 /*************************************************
2131 Function:
2132 fresh
2133 Description:
2134 1. Extend edges.
2135 2. Delete edges whose leght is zero.
2136 3. Re-arrange the edges, swap smaller edge at front.
2137 Input:
2138 1. maxk : max kmer of multi kmer
2139 Output:
2140 None.
2141 Return:
2142 None.
2143 *************************************************/
2144 void fresh ( int maxk )
2146 int num = 0;
2147 ARC * arc_temp, *parc;
2148 newfoundcount = 0;
2149 newnotfoundcount = 0;
2150 edgeaddnumber = 0;
2151 freshEdge ( maxk );
2152 fprintf ( stderr, "Refresh edge: %lld edge(s) added.\n", edgeaddnumber );
2154 if ( newnotfoundcount )
2156 fprintf ( stderr, "Refresh edge: %d kmer(s) found.\n", newfoundcount );
2157 fprintf ( stderr, "Refresh edge: %d kmer(s) not found.\n", newnotfoundcount );
2160 if ( overlaplen + step > maxk )
2162 delete0Edge();
2165 //swap the smaller one forward
2166 swapedge();
2167 compactEdgeArray();
2170 /*************************************************
2171 Function:
2172 statistics
2173 Description:
2174 Output statistics of edge array, the N50 N90 longest, etc.
2175 Input:
2176 1. ed_array : the edge array
2177 2. ed_num : the number of edges
2178 Output:
2179 1. statistics of edges
2180 Return:
2181 None.
2182 *************************************************/
2183 void statistics ( EDGE * ed_array, unsigned int ed_num )
2185 unsigned int i = 0;
2186 unsigned int * length_array;
2187 int flag, count, len_c;
2188 long long sum = 0, N90, N50;
2189 int signI;
2190 length_array = ( unsigned int * ) ckalloc ( ed_num * sizeof ( unsigned int ) );
2191 //first scan for number counting
2192 count = len_c = 0;
2194 for ( i = 1; i <= ed_num; i++ )
2196 if ( ( ed_array[i].length + overlaplen - 1 ) >= len_bar )
2198 length_array[len_c++] = ed_array[i].length + overlaplen - 1;
2201 if ( ed_array[i].length < 1 || ed_array[i].deleted )
2203 continue;
2206 count++;
2208 if ( EdSmallerThanTwin ( i ) )
2210 i++;
2214 sum = 0;
2216 for ( signI = len_c - 1; signI >= 0; signI-- )
2218 sum += length_array[signI];
2221 if ( len_c > 0 )
2223 fprintf ( stderr, "\nThere are %d contig(s) longer than %d, sum up %lld bp, with average length %lld.\n", len_c, len_bar, sum, sum / len_c );
2226 qsort ( length_array, len_c, sizeof ( length_array[0] ), cmp_int );
2227 fprintf ( stderr, "The longest length is %d bp, ", length_array[len_c - 1] );
2228 N50 = sum * 0.5;
2229 N90 = sum * 0.9;
2230 sum = flag = 0;
2232 for ( signI = len_c - 1; signI >= 0; signI-- )
2234 sum += length_array[signI];
2236 if ( !flag && sum >= N50 )
2238 fprintf ( stderr, "contig N50 is %d bp, ", length_array[signI] );
2239 flag = 1;
2242 if ( sum >= N90 )
2244 fprintf ( stderr, "contig N90 is %d bp.\n", length_array[signI] );
2245 break;
2249 free ( ( void * ) length_array );
2252 /*************************************************
2253 Function:
2254 sort_arc
2255 Description:
2256 Sort arcs
2257 Input:
2258 1. list : unsorted arcs list
2259 Output:
2260 None.
2261 Return:
2262 Sorted arcs list.
2263 *************************************************/
2264 ARC * sort_arc ( ARC * list )
2266 if ( !list )
2267 { return list; }
2269 // ARC * head = ( ARC * ) malloc ( sizeof ( ARC ) );
2270 ARC * head = ( ARC * ) ckalloc ( sizeof ( ARC ));
2271 head->next = list;
2272 list->prev = head;
2273 ARC * curr = list;
2274 ARC * temp = list;
2275 ARC * temp1 = NULL;
2277 while ( curr )
2279 temp = curr;
2281 if ( temp )
2283 temp1 = temp->next;
2285 while ( temp1 )
2287 if ( temp->to_ed > temp1->to_ed )
2288 { temp = temp1; }
2290 temp1 = temp1->next;
2294 if ( temp && temp != curr )
2296 if ( temp->next )
2298 temp->prev->next = temp->next;
2299 temp->next->prev = temp->prev;
2301 else
2303 temp->prev->next = NULL;
2306 temp->next = curr;
2307 temp->prev = curr->prev;
2308 curr->prev->next = temp;
2309 curr->prev = temp;
2311 else
2313 curr = curr->next;
2317 list = head->next;
2318 list->prev = NULL;
2319 head->next = NULL;
2320 free ( head );
2321 return list;
2324 //Sort disorder arcs causing by multi thread.
2325 void freshArc()
2327 unsigned int i;
2328 ARC * arc_temp, *parc;
2330 for ( i = 1; i <= num_ed; ++i )
2332 if ( edge_array[i].deleted )
2333 { continue; }
2335 edge_array[i].arcs = sort_arc ( edge_array[i].arcs );
2337 fprintf(stderr, "Arcs sorted.\n");
2340 /*************************************************
2341 Function:
2342 getUnlikeArc
2343 Description:
2344 Delete arcs that are chose.
2345 Input:
2346 None.
2347 Output:
2348 None.
2349 Return:
2350 None.
2351 *************************************************/
2352 static void deleteUnlikeArc()
2354 unsigned int i, bal;
2355 ARC * arc_temp;
2356 int count = 0;
2358 for ( i = 0; i < delarcBufferCount; ++i )
2360 arc_temp = getArcBetween ( delarcBuffer[0][i], delarcBuffer[1][i] );
2362 if ( arc_temp )
2364 edge_array[delarcBuffer[0][i]].arcs = deleteArc ( edge_array[delarcBuffer[0][i]].arcs, arc_temp );
2365 ++count;
2368 arc_temp = getArcBetween ( getTwinEdge ( delarcBuffer[1][i] ), getTwinEdge ( delarcBuffer[0][i] ) );
2370 if ( arc_temp )
2372 edge_array[getTwinEdge ( delarcBuffer[1][i] )].arcs = deleteArc ( edge_array[getTwinEdge ( delarcBuffer[1][i] )].arcs, arc_temp );
2373 ++count;
2377 fprintf ( stderr, "%d unreliable arc(s) deleted.\n", count );
2378 free ( delarcBuffer[0] );
2379 free ( delarcBuffer[1] );
2380 free ( delarcBuffer );
2381 delarcBufferCount = 0;
2384 /*************************************************
2385 Function:
2386 forward
2387 Description:
2388 Go forward to collect the related arcs out going from index.
2389 Input:
2390 1. index : the index of edge
2391 2. first : whether it's the first one to be parsed
2392 Output:
2393 None.
2394 Return:
2395 None.
2396 *************************************************/
2397 static void forward ( unsigned int index, int first )
2399 ARC * fArc, *temp;
2400 fArc = edge_array[index].arcs;
2401 unsigned int twin = getTwinEdge ( index );
2402 // if(!EdSameAsTwin(index))
2404 if ( edge_array[index].multi != 1 )
2405 { edge_array[index].multi = 2; }
2407 if ( edge_array[twin].multi != 1 )
2408 { edge_array[twin].multi = 2; }
2410 edge_array[index].flag = 1;
2411 edge_array[twin].flag = 1;
2413 while ( fArc )
2415 temp = fArc;
2416 fArc = fArc->next;
2417 delarcBuffer[0][delarcBufferCount] = index;
2418 delarcBuffer[1][delarcBufferCount++] = temp->to_ed;
2420 if ( edge_array[temp->to_ed].flag )
2421 { continue; }
2423 forward ( getTwinEdge ( temp->to_ed ), 0 );
2427 /*************************************************
2428 Function:
2429 getUnlikeArc
2430 Description:
2431 Get arcs that could be processed incorrectly.
2432 Input:
2433 None.
2434 Output:
2435 None.
2436 Return:
2437 None.
2438 *************************************************/
2439 static void getUnlikeArc()
2441 unsigned int i, bal;
2443 for ( i = 1; i <= num_ed; ++i )
2445 if ( edge_array[i].deleted )
2446 { continue; }
2448 if ( EdSameAsTwin ( i ) )
2450 edge_array[i].multi = 1;
2454 delarcBuffer = ( unsigned int ** ) ckalloc ( sizeof ( unsigned int * ) * 2 );
2455 delarcBuffer[0] = ( unsigned int * ) ckalloc ( sizeof ( unsigned int ) * num_ed * 3 );
2456 delarcBuffer[1] = ( unsigned int * ) ckalloc ( sizeof ( unsigned int ) * num_ed * 3 );
2457 unsigned int last = 0, curr = 0;
2459 for ( i = 1; i <= num_ed; ++i )
2461 if ( edge_array[i].deleted )
2462 { continue; }
2464 if ( edge_array[i].multi == 1 )
2466 last = delarcBufferCount;
2467 forward ( i, 1 );
2469 if ( !EdSameAsTwin ( i ) )
2471 forward ( getTwinEdge ( i ), 1 );
2472 ++i;
2475 curr = delarcBufferCount;
2476 unsigned int j;
2477 edge_array[i].flag = 0;
2478 edge_array[getTwinEdge ( i )].flag = 0;
2480 for ( j = last; j < curr; ++j )
2482 edge_array[delarcBuffer[0][j]].flag = 0;
2483 edge_array[getTwinEdge ( delarcBuffer[0][j] )].flag = 0;
2484 edge_array[delarcBuffer[1][j]].flag = 0;
2485 edge_array[getTwinEdge ( delarcBuffer[1][j] )].flag = 0;
2491 /*************************************************
2492 Function:
2493 Iterate
2494 Description:
2495 1. Iterately gets larger kmer graph and solves some repeats.
2496 2. Builds (k+1)mer graph based on k mer graph.
2497 3. Re-builds arcs by reads.
2498 4. Removes errors(weak edge, low cov edge and tips).
2499 Input:
2500 1. libfile : the reads config file
2501 2. graph : the output prefix
2502 3. maxk : the max kmer when using multikmer
2503 // 4. keepReadFile : keep temp reads file that selected for building arcs
2504 5. M : the strength of merging bubbles
2505 Output:
2506 None.
2507 Return:
2508 None.
2509 *************************************************/
2510 void Iterate ( char * libfile, char * graph, int maxk, int M ) //boolean keepReadFile,
2512 time_t start_t, stop_t, time_bef, time_aft, inner_start, inner_stop;
2513 time ( &start_t );
2514 unsigned int i;
2516 for ( i = 1; i <= num_ed; ++i )
2518 edge_array[i].multi = 0;
2521 int cutlen = 2 * overlaplen;
2522 int mink = overlaplen;
2523 overlaplen += step;
2524 nowstep2 = step;
2525 int flag = 0;
2526 statistics ( edge_array, num_ed );
2527 fprintf ( stderr, "\nIteration start.\n" );
2528 int round = 1;
2530 while ( overlaplen <= maxk )
2532 unsigned int j;
2533 time ( &inner_start );
2534 WORDFILTER = createFilter ( overlaplen );
2535 fprintf ( stderr, "\n***************************\n" );
2536 fprintf ( stderr, "Iteration %d, kmer: %d\n", round++, overlaplen );
2537 fprintf ( stderr, "Edge number: %d\n", num_ed );
2538 time ( &time_bef );
2539 //build (k+1)mer graph
2540 fprintf ( stderr, "Construct %dmer graph.\n", overlaplen );
2541 buildGraphHash();
2542 time ( &time_aft );
2543 fprintf ( stderr, "Time spent on building hash graph: %ds.\n", ( int ) ( time_aft - time_bef ) );
2544 time ( &time_bef );
2545 //add arcs for (k+1)mer graph
2546 fprintf ( stderr, "\nAdd arcs to graph.\n" );
2547 addArc ( libfile, graph, flag, maxk - overlaplen, maxk ); //, keepReadFile
2548 //get arcs that could be processed incorrectly
2549 getUnlikeArc();
2550 //delete this arcs
2551 deleteUnlikeArc();
2552 flag++;
2553 time ( &time_aft );
2554 fprintf ( stderr, "Time spent on adding arcs: %ds.\n", ( int ) ( time_aft - time_bef ) );
2555 time ( &time_bef );
2556 //sort disorder arcs causing by multi thread
2557 fprintf ( stderr, "Sort arcs.\n" );
2558 freshArc();
2559 time ( &time_aft );
2560 fprintf ( stderr, "Time spent on sorting arcs: %ds.\n", ( int ) ( time_aft - time_bef ) );
2562 if ( deLowEdge )
2564 time ( &time_bef );
2565 fprintf ( stderr, "\nRemove weak edges and low coverage edges.\n" );
2566 removeWeakEdges2 ( cutlen, 1, mink );
2567 removeLowCovEdges2 ( cutlen, deLowEdge, mink, 0 );
2568 time ( &time_aft );
2569 fprintf ( stderr, "Time spent on removing Edges: %ds\n", ( int ) ( time_aft - time_bef ) );
2572 if ( overlaplen + step > maxk )
2574 time ( &time_bef );
2575 fprintf ( stderr, "Cut tips of the graph.\n" );
2576 cutTipsInGraph2 ( cutlen, 0, 0 );
2577 time ( &time_aft );
2578 fprintf ( stderr, "Time spent on cutting tips: %ds.\n", ( int ) ( time_aft - time_bef ) );
2581 time ( &time_bef );
2582 fprintf ( stderr, "Refresh edges.\n" );
2583 //refresh to extend edge and get the edge order right
2584 fresh ( maxk );
2585 time ( &time_aft );
2586 fprintf ( stderr, "Time spent on refreshing edges: %ds.\n", ( int ) ( time_aft - time_bef ) );
2587 //free kmer set
2588 free_kmerset2 ( KmerSetsNew );
2589 overlaplen += step;
2590 nowstep2 += step;
2591 statistics ( edge_array, num_ed );
2592 time ( &inner_stop );
2593 fprintf ( stderr, "Time spent on this round: %dm.\n\n", ( int ) ( inner_stop - inner_start ) / 60 );
2596 for ( i = 1; i <= num_ed; ++i )
2598 edge_array[i].multi = 0;
2601 overlaplen = maxk;
2602 time ( &stop_t );
2603 fprintf ( stderr, "Iteration finished.\n" );
2604 fprintf ( stderr, "Time spent on iteration: %dm.\n\n", ( int ) ( stop_t - start_t ) / 60 );