modified: makefile
[GalaxyCodeBases.git] / BGI / SOAPdenovo2 / standardPregraph / orderContig.c
blobb79ae61445a0fc84a53a21650d9d5d8623556e79
1 #include "stdinc.h"
2 #include "newhash.h"
3 #include "kmerhash.h"
4 #include "extfunc.h"
5 #include "extvab.h"
6 #include "dfibHeap.h"
7 #include "fibHeap.h"
8 #include "darray.h"
9 #include "zlib.h"
11 #define CNBLOCKSIZE 10000
12 #define MAXC 10000
13 #define MAXCinBetween 200
15 #define MaxNodeInSub 10000
16 #define GapLowerBound -2000
17 #define GapUpperBound 300000
19 #define MaxCntNode 1000
21 static int DEBUG = 0;
22 static int DEBUG1 = 0;
23 static int DEBUG2 = 0;
25 static int bySmall = 1;
27 static boolean static_f = 0;
28 static double OverlapPercent = 0.05;
29 static double ConflPercent = 0.05;
30 static int MinWeakCut = 3;
31 static int gapCounter;
32 static int orienCounter;
33 static int orienCounter2;
34 static int throughCounter;
36 static int breakPointAtRepeat = 0;
37 static FILE * snp_fp = NULL;
39 static DARRAY * solidArray;
40 static DARRAY * tempArray;
42 static int solidCounter;
44 static CTGinHEAP ctg4heapArray[MaxNodeInSub + 1];
45 static unsigned int nodesInSub[MaxNodeInSub];
46 static int nodeDistance[MaxNodeInSub];
47 static int nodeCounter;
50 static int dCntCounter1;
51 static int uCntCounter1;
52 static int dCntCounter2;
53 static int uCntCounter2;
54 static unsigned int dCntNodeArr1[MaxCntNode]; //downstream
55 static unsigned int uCntNodeArr1[MaxCntNode]; //upstream
56 static int dCntGapArr1[MaxCntNode];
57 static int uCntGapArr1[MaxCntNode];
58 static unsigned int dCntNodeArr2[MaxCntNode];
59 static unsigned int uCntNodeArr2[MaxCntNode];
60 static int dCntGapArr2[MaxCntNode];
61 static int uCntGapArr2[MaxCntNode];
63 static unsigned int * cntNodeArr;
64 static int * cntGapArr;
66 static unsigned int nodesInSubInOrder[MaxNodeInSub];
67 static int nodeDistanceInOrder[MaxNodeInSub];
69 static DARRAY * scaf3, *scaf5;
70 static DARRAY * gap3, *gap5;
72 static unsigned int downstreamCTG[MAXCinBetween];
73 static unsigned int upstreamCTG[MAXCinBetween];
74 static int dsCtgCounter;
75 static int usCtgCounter;
77 static CONNECT * checkConnect ( unsigned int from_c, unsigned int to_c );
78 static int maskPuzzle ( int num_connect, unsigned int contigLen );
79 static void freezing();
80 static boolean checkOverlapInBetween ( double tolerance );
81 static int setConnectDelete ( unsigned int from_c, unsigned int to_c, char flag, boolean cleanBinding );
82 static int setConnectMask ( unsigned int from_c, unsigned int to_c, char mask );
83 static int setConnectWP ( unsigned int from_c, unsigned int to_c, char flag );
85 static void general_linearization ( boolean strict );
86 static void debugging2();
87 static void smallScaf();
88 static void clearNewInsFlag();
89 static void detectBreakScaff();
90 static void detectBreakScaf();
91 static boolean checkSimple ( DARRAY * ctgArray, int count );
92 static void checkCircle();
94 /*************************************************
95 Function:
96 checkFiles4Scaff
97 Description:
98 Checks the required files for scaffolding.
99 Input:
100 1. infile: prefix of graph
101 Output:
102 None.
103 Return:
104 1 if all files were OK.
105 *************************************************/
106 boolean checkFiles4Scaff ( char * infile )
108 char name[7][256];
109 boolean filesOK = 1;
110 int i = 0;
111 sprintf ( name[0], "%s.Arc", infile );
112 sprintf ( name[1], "%s.contig", infile );
113 sprintf ( name[2], "%s.peGrads", infile );
114 sprintf ( name[3], "%s.preGraphBasic", infile );
115 sprintf ( name[4], "%s.updated.edge", infile );
116 sprintf ( name[5], "%s.readOnContig.gz", infile );
118 for ( ; i < 6; i++ )
120 filesOK = check_file ( name[i] );
122 if ( !filesOK )
124 fprintf ( stderr, "%s: no such file or empty file!\n\n", name[i] );
125 return filesOK;
129 fprintf ( stderr, "Files for scaffold construction are OK.\n\n" );
130 return filesOK;
134 /*************************************************
135 Function:
136 getBindCnt
137 Description:
138 Gets the only connection of current contig.
139 Input:
140 1. ctg: current contig
141 Output:
142 None.
143 Return:
144 The pointer to connection if only one connection was found.
145 *************************************************/
146 static CONNECT * getBindCnt ( unsigned int ctg )
148 CONNECT * ite_cnt;
149 CONNECT * bindCnt = NULL;
150 CONNECT * temp_cnt = NULL;
151 CONNECT * temp3_cnt = NULL;
152 int count = 0;
153 int count2 = 0;
154 int count3 = 0;
155 ite_cnt = contig_array[ctg].downwardConnect;
157 while ( ite_cnt )
159 if ( ite_cnt->nextInScaf )
161 count++;
162 bindCnt = ite_cnt;
165 if ( ite_cnt->prevInScaf )
167 temp_cnt = ite_cnt;
168 count2++;
171 if ( ite_cnt->singleInScaf )
173 temp3_cnt = ite_cnt;
174 count3++;
177 ite_cnt = ite_cnt->next;
180 if ( count == 1 )
181 { return bindCnt; }
183 if ( count == 0 && count2 == 1 )
184 { return temp_cnt; }
186 if ( count == 0 && count2 == 0 && count3 == 1 )
187 { return temp3_cnt; }
189 return NULL;
192 /*************************************************
193 Function:
194 createAnalogousCnt
195 Description:
196 Creats new relation between two non-connected contigs
197 according to toplogy sturcture supported by large insert size
198 paired-end reads.
199 Input:
200 1. sourceStart: contig that has connection relations to other two
201 contigs
202 2. originCnt: direct connection supported by large insert size
203 paired-end reads
204 3. gap: calculated distance between two non-connected contigs
205 4. targetStart: left contig of those two non-connected contigs
206 5. targetStop: right contig of those two non-connected contigs
207 Output:
208 None.
209 Return:
210 1 if creation successed.
211 *************************************************/
212 static boolean createAnalogousCnt ( unsigned int sourceStart,
213 CONNECT * originCnt, int gap,
214 unsigned int targetStart, unsigned int targetStop )
216 CONNECT * temp_cnt;
217 unsigned int balTargetStart = getTwinCtg ( targetStart );
218 unsigned int balTargetStop = getTwinCtg ( targetStop );
219 unsigned int balSourceStart = getTwinCtg ( sourceStart );
220 unsigned int balSourceStop = getTwinCtg ( originCnt->contigID );
221 boolean change_flag = 0;
222 int add_weight = originCnt->weight;
224 if ( gap < GapLowerBound )
226 gapCounter++;
227 originCnt->deleted = 1;
228 temp_cnt = getCntBetween ( balSourceStop, balSourceStart );
229 temp_cnt->deleted = 1;
230 return change_flag;
233 int startLen = ( int ) contig_array[targetStart].length;
234 int stopLen = ( int ) contig_array[targetStop].length;
236 if ( gap < -overlaplen )
238 if ( ( int ) contig_array[targetStart].length - overlaplen <= -gap && ( int ) contig_array[targetStart].length > 5 * overlaplen )
240 unsigned int tmp_id = targetStart;
241 targetStart = targetStop;
242 targetStop = tmp_id;
243 tmp_id = balTargetStart;
244 balTargetStart = balTargetStop;
245 balTargetStop = tmp_id;
246 gap = -gap;
247 change_flag = 1;
249 else
251 gapCounter++;
252 return change_flag;
256 originCnt->deleted = 1;
257 temp_cnt = getCntBetween ( balSourceStop, balSourceStart );
258 temp_cnt->deleted = 1;
259 temp_cnt = add1Connect ( targetStart, targetStop, gap, add_weight, 1 );
261 if ( temp_cnt )
262 { temp_cnt->inherit = 1; }
264 temp_cnt = add1Connect ( balTargetStop, balTargetStart, gap, add_weight, 1 );
266 if ( temp_cnt )
267 { temp_cnt->inherit = 1; }
269 return change_flag;
273 /*************************************************
274 Function:
275 add1LongPEcov
276 Description:
277 Increases the connections weight supported by large insert size
278 paired-end reads between contigs in one scaffold.
279 Input:
280 1. fromCtg: left contig of pair contig connected by large insert
281 size paired-end reads
282 2. toCtg: right contig of pair contig connected by large insert
283 size paired-end reads
284 3. weight: weight of connection
285 Output:
286 None.
287 Return:
288 None.
289 *************************************************/
290 static void add1LongPEcov ( unsigned int fromCtg, unsigned int toCtg, int weight )
292 //check if they are on the same scaff
293 if ( contig_array[fromCtg].from_vt != contig_array[toCtg].from_vt ||
294 contig_array[fromCtg].to_vt != contig_array[toCtg].to_vt )
296 fprintf ( stderr, "Warning from add1LongPEcov: contig %d and %d not on the same scaffold\n",
297 fromCtg, toCtg );
298 return;
301 if ( contig_array[fromCtg].indexInScaf >= contig_array[toCtg].indexInScaf )
303 fprintf ( stderr, "Warning from add1LongPEcov: wrong about order between contig %d and %d\n",
304 fromCtg, toCtg );
305 return;
308 CONNECT * bindCnt;
309 unsigned int prevCtg = fromCtg;
310 bindCnt = getBindCnt ( fromCtg );
312 while ( bindCnt )
314 if ( bindCnt->maxGap + weight <= 1000 )
315 { bindCnt->maxGap += weight; }
316 else
317 { bindCnt->maxGap = 1000; }
319 if ( fromCtg == 0 && toCtg == 0 )
320 fprintf ( stderr, "link (%d %d ) covered by link (%d %d), wt %d\n",
321 prevCtg, bindCnt->contigID, fromCtg, toCtg, weight );
323 if ( bindCnt->contigID == toCtg )
324 { break; }
326 prevCtg = bindCnt->contigID;
327 bindCnt = bindCnt->nextInScaf;
330 unsigned int bal_fc = getTwinCtg ( fromCtg );
331 unsigned int bal_tc = getTwinCtg ( toCtg );
332 bindCnt = getBindCnt ( bal_tc );
333 prevCtg = bal_tc;
335 while ( bindCnt )
337 if ( bindCnt->maxGap + weight <= 1000 )
338 { bindCnt->maxGap += weight; }
339 else
340 { bindCnt->maxGap = 1000; }
342 if ( fromCtg == 0 && toCtg == 0 )
343 fprintf ( stderr, "link (%d %d ) covered by link (%d %d), wt %d\n",
344 prevCtg, bindCnt->contigID, fromCtg, toCtg, weight );
346 if ( bindCnt->contigID == bal_fc )
347 { return; }
349 prevCtg = bindCnt->contigID;
350 bindCnt = bindCnt->nextInScaf;
354 /*************************************************
355 Function:
356 downSlide
357 Description:
358 Deals with connections supported by large insert size paired-end
359 reads. If a connecton is abnormal, or the two connected contigs
360 are in the same scaffold and the distance between them is normal,
361 set the connection's status to 'deleted'. If the two connected
362 contigs locate in different scaffolds, trys to creat a connection
363 between these two scaffolds.
364 Input:
365 None.
366 Output:
367 None.
368 Return:
369 None.
370 *************************************************/
371 static void downSlide()
373 int len = 0, gap;
374 unsigned int i;
375 CONNECT * ite_cnt, *bindCnt, *temp_cnt;
376 unsigned int bottomCtg, topCtg, bal_i;
377 unsigned int targetCtg, bal_target;
378 boolean getThrough, orienConflict;
379 int slideLen, slideLen2;
380 int slideexchange1 = 0, slideexchange2 = 0;
381 orienCounter = throughCounter = 0;
382 orienCounter2 = 0;
383 int slidebreak1 = 0, slidebreak2 = 0, slidebreak = 0, recoverCnt = 0;
385 for ( i = 1; i <= num_ctg; i++ )
387 if ( contig_array[i].mask || !contig_array[i].downwardConnect )
388 { continue; }
390 bindCnt = getBindCnt ( i );
392 if ( !bindCnt )
393 { continue; }
395 bal_i = getTwinCtg ( i );
396 len = slideLen = 0;
397 bottomCtg = i;
399 //find the last unmasked contig in this binding
400 while ( bindCnt->nextInScaf )
402 len += bindCnt->gapLen + contig_array[bindCnt->contigID].length;
404 if ( contig_array[bindCnt->contigID].mask == 0 )
406 bottomCtg = bindCnt->contigID;
407 slideLen = len;
410 bindCnt = bindCnt->nextInScaf;
413 len += bindCnt->gapLen + contig_array[bindCnt->contigID].length;
415 if ( contig_array[bindCnt->contigID].mask == 0 || bottomCtg == 0 )
417 bottomCtg = bindCnt->contigID;
418 slideLen = len;
421 //check each connetion from long pair ends
422 ite_cnt = contig_array[i].downwardConnect;
424 while ( ite_cnt )
426 if ( ite_cnt->deleted || ite_cnt->mask || ite_cnt->singleInScaf
427 || ite_cnt->nextInScaf || ite_cnt->prevInScaf || ite_cnt->inherit )
429 ite_cnt = ite_cnt->next;
430 continue;
433 targetCtg = ite_cnt->contigID;
435 if ( contig_array[i].from_vt == contig_array[targetCtg].from_vt ) // on the same scaff
437 if ( contig_array[i].indexInScaf > contig_array[targetCtg].indexInScaf )
438 { orienCounter++; }
439 else
440 { throughCounter++; }
442 setConnectDelete ( i, ite_cnt->contigID, 1, 0 );
443 ite_cnt = ite_cnt->next;
444 continue;
447 if ( ( ins_var_idx > 0 ) && ( slideLen > Insert_size * ins_var_idx ) )
449 setConnectDelete ( i, ite_cnt->contigID, 1, 0 );
450 ite_cnt = ite_cnt->next;
451 slidebreak1++;
452 continue;
455 //contig i and targetctg is not in same scaffold
456 //check if this connection conflicts with previous scaffold orientationally
457 temp_cnt = getBindCnt ( targetCtg );
458 orienConflict = 0;
460 if ( temp_cnt )
462 while ( temp_cnt->nextInScaf )
464 if ( temp_cnt->contigID == i )
466 orienConflict = 1;
467 fprintf ( stderr, "Warning from downSlide: still on the same scaff: %d and %d\n"
468 , i, targetCtg );
469 fprintf ( stderr, "on scaff %d and %d\n",
470 contig_array[i].from_vt, contig_array[targetCtg].from_vt );
471 fprintf ( stderr, "on bal_scaff %d and %d\n",
472 contig_array[bal_target].to_vt, contig_array[bal_i].to_vt );
473 break;
476 temp_cnt = temp_cnt->nextInScaf;
479 if ( temp_cnt->contigID == i )
480 { orienConflict = 1; }
483 if ( orienConflict )
485 orienCounter++;
486 orienCounter2++;
487 setConnectDelete ( i, ite_cnt->contigID, 1, 0 );
488 ite_cnt = ite_cnt->next;
489 continue;
492 //connection path to i was not found
493 //find the most top contig along previous scaffold starting with the target contig of this connection
494 bal_target = getTwinCtg ( targetCtg );
495 slideLen2 = 0;
497 if ( contig_array[targetCtg].mask == 0 )
499 topCtg = bal_target;
501 else
503 topCtg = 0;
506 temp_cnt = getBindCnt ( bal_target );
507 getThrough = len = 0;
508 int slidebreak = 0;
510 if ( temp_cnt )
512 //find the last contig in this binding
513 while ( temp_cnt->nextInScaf )
515 //check if this route reaches bal_i
516 if ( temp_cnt->contigID == bal_i )
518 fprintf ( stderr, "Warning from downSlide: (B) still on the same scaff: %d and %d (%d and %d)\n",
519 i, targetCtg, bal_target, bal_i );
520 fprintf ( stderr, "on scaff %d and %d\n",
521 contig_array[i].from_vt, contig_array[targetCtg].from_vt );
522 fprintf ( stderr, "on bal_scaff %d and %d\n",
523 contig_array[bal_target].to_vt, contig_array[bal_i].to_vt );
524 getThrough = 1;
525 break;
528 len += temp_cnt->gapLen + contig_array[temp_cnt->contigID].length;
530 if ( contig_array[temp_cnt->contigID].mask == 0 )
532 topCtg = temp_cnt->contigID;
533 slideLen2 = len;
536 if ( ( ins_var_idx > 0 ) && ( len > ins_var_idx * Insert_size ) )
538 slidebreak = 1;
539 break;
542 temp_cnt = temp_cnt->nextInScaf;
545 len += temp_cnt->gapLen + contig_array[temp_cnt->contigID].length;
547 if ( contig_array[temp_cnt->contigID].mask == 0 || topCtg == 0 )
549 topCtg = temp_cnt->contigID;
550 slideLen2 = len;
553 if ( slidebreak == 1 )
555 setConnectDelete ( i, ite_cnt->contigID, 1, 0 );
556 ite_cnt = ite_cnt->next;
557 slidebreak2++;
558 continue;
561 if ( temp_cnt->contigID == bal_i )
562 { getThrough = 1; }
563 else
564 { topCtg = getTwinCtg ( topCtg ); }
566 else
567 { topCtg = targetCtg; }
569 if ( getThrough )
571 throughCounter++;
572 setConnectDelete ( i, ite_cnt->contigID, 1, 0 );
573 ite_cnt = ite_cnt->next;
574 continue;
577 //connection path to bal_id was not found
578 CONNECT * dh_cnt;
579 gap = ite_cnt->gapLen - slideLen - slideLen2;
580 dh_cnt = getCntBetween ( topCtg, bottomCtg );
582 if ( dh_cnt && dh_cnt->weight >= MinWeakCut )
584 slideexchange1++;
585 setConnectDelete ( topCtg, bottomCtg, 0, 0 );
586 setConnectMask ( topCtg, bottomCtg, 0 );
587 ite_cnt = ite_cnt->next;
588 continue;
591 //add a connection between bottomCtg and topCtg
592 if ( bottomCtg != topCtg && ! ( i == bottomCtg && targetCtg == topCtg ) )
594 boolean creat_flag = createAnalogousCnt ( i, ite_cnt, gap, bottomCtg, topCtg );
596 if ( creat_flag )
597 { slideexchange2++; }
599 if ( contig_array[bottomCtg].mask || contig_array[topCtg].mask )
600 { fprintf ( stderr, "downSlide to masked contig, bottomCtg %u[mask %d], topCtg %u[mask %d]\n", bottomCtg, contig_array[bottomCtg].mask, topCtg, contig_array[topCtg].mask ); }
603 ite_cnt = ite_cnt->next;
607 // fprintf(stderr,"downSliding stat:\norienConflict\tfall_inside\tslidebreak1\tslidebreak2\trecoverCnt\tslideexchange1\tslideexchange2\n%d\t%d\t%d\t%d\t%d\t%d\t%d\n",orienCounter, throughCounter, slidebreak1, slidebreak2, recoverCnt, slideexchange1, slideexchange2);
608 fprintf ( stderr, "Add large insert size PE links: %d orientation-conflict links, %d contigs acrossed by normal links.\n", orienCounter, throughCounter );
611 /*************************************************
612 Function:
613 setNextInScaf
614 Description:
615 Sets the downstream connection of current connection.
616 Input:
617 1. cnt: current connection
618 2. nextCnt: next connection
619 Output:
620 None.
621 Return:
622 1 if setting successed.
623 *************************************************/
624 static boolean setNextInScaf ( CONNECT * cnt, CONNECT * nextCnt )
626 if ( !cnt )
628 fprintf ( stderr, "setNextInScaf: empty pointer\n" );
629 return 0;
632 if ( !nextCnt )
634 cnt->nextInScaf = nextCnt;
635 return 1;
638 if ( cnt->mask || cnt->deleted )
640 fprintf ( stderr, "setNextInScaf: cnt is masked or deleted\n" );
641 return 0;
644 if ( nextCnt->deleted || nextCnt->mask )
646 fprintf ( stderr, "setNextInScaf: nextCnt is masked or deleted\n" );
647 return 0;
650 cnt->nextInScaf = nextCnt;
651 return 1;
654 /*************************************************
655 Function:
656 setPrevInScaf
657 Description:
658 Sets the upstream connection status of current connection.
659 Input:
660 1. cnt: current connection
661 2. flag: new status
662 Output:
663 None.
664 Return:
665 1 if setting successed.
666 *************************************************/
667 static boolean setPrevInScaf ( CONNECT * cnt, boolean flag )
669 if ( !cnt )
671 fprintf ( stderr, "setPrevInScaf: empty pointer\n" );
672 return 0;
675 if ( !flag )
677 cnt->prevInScaf = flag;
678 return 1;
681 if ( cnt->mask || cnt->deleted )
683 fprintf ( stderr, "setPrevInScaf: cnt is masked or deleted\n" );
684 return 0;
687 cnt->prevInScaf = flag;
688 return 1;
692 /*************************************************
693 Function:
694 substitueUSinScaf
695 Description:
696 Substitutes the upstream connection of current connection with
697 new connection.
698 from_c
699 -> branch_c -> to_c
700 from_c_new
701 Input:
702 1. origin: original upstream connection of current connection
703 (from_c -> branch_c)
704 2. from_c_new: new upstream contig of current contig (branch_c)
705 Output:
706 None.
707 Return:
708 None.
709 *************************************************/
710 static void substitueUSinScaf ( CONNECT * origin, unsigned int from_c_new )
712 if ( !origin || !origin->nextInScaf )
713 { return; }
715 unsigned int branch_c, to_c;
716 unsigned int bal_branch_c, bal_to_c;
717 unsigned int bal_from_c_new = getTwinCtg ( from_c_new );
718 CONNECT * bal_origin, *bal_nextCNT, *prevCNT, *bal_prevCNT;
719 branch_c = origin->contigID;
720 to_c = origin->nextInScaf->contigID;
721 bal_branch_c = getTwinCtg ( branch_c );
722 bal_to_c = getTwinCtg ( to_c );
723 prevCNT = checkConnect ( from_c_new, branch_c );
724 bal_nextCNT = checkConnect ( bal_to_c, bal_branch_c );
726 if ( !bal_nextCNT )
728 fprintf ( stderr, "substitueUSinScaf: no connect between %d and %d\n", bal_to_c, bal_branch_c );
729 return;
732 bal_origin = bal_nextCNT->nextInScaf;
733 bal_prevCNT = checkConnect ( bal_branch_c, bal_from_c_new );
734 setPrevInScaf ( bal_nextCNT->nextInScaf, 0 );
735 setNextInScaf ( prevCNT, origin->nextInScaf );
736 setNextInScaf ( bal_nextCNT, bal_prevCNT );
737 setPrevInScaf ( bal_prevCNT, 1 );
738 setNextInScaf ( origin, NULL );
739 setPrevInScaf ( bal_origin, 0 );
743 /*************************************************
744 Function:
745 substitueDSinScaf
746 Description:
747 Substitutes the downstream connection of current connection with
748 new connection.
749 to_c
750 from_c -> branch_c ->
751 to_c_new
752 Input:
753 1. origin: original downstream connection of current connection
754 (branch_c -> to_c)
755 2. branch_c: current contig
756 3. to_c_new: new downstream contig of current contig
757 Output:
758 None.
759 Return:
760 None.
761 *************************************************/
762 static void substitueDSinScaf ( CONNECT * origin, unsigned int branch_c, unsigned int to_c_new )
764 if ( !origin || !origin->prevInScaf )
765 { return; }
767 unsigned int to_c;
768 unsigned int bal_branch_c, bal_to_c, bal_to_c_new;
769 unsigned int from_c, bal_from_c;
770 CONNECT * bal_origin, *prevCNT, *bal_prevCNT;
771 CONNECT * nextCNT, *bal_nextCNT;
772 to_c = origin->contigID;
773 bal_branch_c = getTwinCtg ( branch_c );
774 bal_to_c = getTwinCtg ( to_c );
775 bal_origin = getCntBetween ( bal_to_c, bal_branch_c );
777 if ( !bal_origin )
779 fprintf ( stderr, "substitueDSinScaf: no connect between %d and %d\n", bal_to_c, bal_branch_c );
780 return;
783 if ( bal_origin->nextInScaf )
784 { bal_from_c = bal_origin->nextInScaf->contigID; }
785 else
787 fprintf ( stderr, "next null! %d\t%d\n", bal_to_c, bal_branch_c );
788 exit ( 3 );
791 bal_from_c = bal_origin->nextInScaf->contigID;
792 from_c = getTwinCtg ( bal_from_c );
793 bal_to_c_new = getTwinCtg ( to_c_new );
794 prevCNT = checkConnect ( from_c, branch_c );
795 nextCNT = checkConnect ( branch_c, to_c_new );
796 setNextInScaf ( prevCNT, nextCNT );
797 setPrevInScaf ( nextCNT, 1 );
798 bal_nextCNT = checkConnect ( bal_to_c_new, bal_branch_c );
799 bal_prevCNT = checkConnect ( bal_branch_c, bal_from_c );
800 setNextInScaf ( bal_nextCNT, bal_prevCNT );
801 setPrevInScaf ( origin, 0 );
802 setNextInScaf ( bal_origin, NULL );
805 /*************************************************
806 Function:
807 validConnect
808 Description:
809 Calculates the valid connections number of a contig.
810 1. Check whether the contig has upstream conntcion and
811 downstream connection.
812 2. Calculates the non-deleted and non-masked connections number.
813 Input:
814 1. ctg: contig
815 2. preCNT: upstream connection
816 Output:
817 None.
818 Return:
819 0 if contig did NOT have connection to other contigs.
820 1 if contig had upstream conntcion and downstream connection.
821 Non-deleted and non-masked connections number otherwise.
822 *************************************************/
823 static int validConnect ( unsigned int ctg, CONNECT * preCNT )
825 if ( preCNT && preCNT->nextInScaf )
826 { return 1; }
828 CONNECT * cn_temp;
829 int count = 0;
831 if ( !contig_array[ctg].downwardConnect )
832 { return count; }
834 cn_temp = contig_array[ctg].downwardConnect;
836 while ( cn_temp )
838 if ( !cn_temp->deleted && !cn_temp->mask )
839 { count++; }
841 cn_temp = cn_temp->next;
844 return count;
847 /*************************************************
848 Function:
849 getNextContig
850 Description:
851 Gets the connection to next contig of current contig through constructed
852 connection in scaffold or direct connection to other contigs.
853 In the latter case, one and only one non-deleted and non-masked
854 connection is qualified.
855 Input:
856 1. ctg: current contig
857 2. preCNT: upstream connection of current contig
858 Output:
859 1. exception: indicate that whether found connection through
860 constructed connection in scaffold is deleted or masked
861 Return:
862 Pointer to qualified connection or NULL.
863 *************************************************/
864 static CONNECT * getNextContig ( unsigned int ctg, CONNECT * preCNT, boolean * exception )
866 CONNECT * cn_temp, *retCNT = NULL, *dh_cnt;
867 int count = 0, valid_in;
868 unsigned int nextCtg, bal_ctg;
869 *exception = 0;
871 if ( preCNT && preCNT->nextInScaf )
873 if ( preCNT->contigID != ctg )
874 { fprintf ( stderr, "pre cnt does not lead to %d\n", ctg ); }
876 nextCtg = preCNT->nextInScaf->contigID;
877 cn_temp = getCntBetween ( ctg, nextCtg );
878 dh_cnt = getCntBetween ( getTwinCtg ( nextCtg ), getTwinCtg ( ctg ) );
880 if ( cn_temp && ( cn_temp->mask || cn_temp->deleted ) )
882 int id1 = 0, id2 = 0;
884 if ( dh_cnt->nextInScaf )
886 id1 = dh_cnt->nextInScaf->contigID;
887 id2 = getTwinCtg ( dh_cnt->nextInScaf->contigID );
890 if ( !cn_temp->prevInScaf )
891 { fprintf ( stderr, "not even has a prevInScaf %d and %d, %d and %d with before %d with twin %d\n", ctg, nextCtg, getTwinCtg ( nextCtg ), getTwinCtg ( ctg ), id1, id2 ); }
893 cn_temp = getCntBetween ( getTwinCtg ( nextCtg ),
894 getTwinCtg ( ctg ) );
896 if ( !cn_temp->nextInScaf )
897 { fprintf ( stderr, "its twin cnt not has a nextInScaf\n" ); }
899 fflush ( stdout );
900 *exception = 1;
902 else
903 { return preCNT->nextInScaf; }
906 bal_ctg = getTwinCtg ( ctg );
907 valid_in = validConnect ( bal_ctg, NULL );
909 if ( valid_in > 1 )
910 { return NULL; }
912 if ( !contig_array[ctg].downwardConnect )
913 { return NULL; }
915 cn_temp = contig_array[ctg].downwardConnect;
917 while ( cn_temp )
919 if ( cn_temp->mask || cn_temp->deleted )
921 cn_temp = cn_temp->next;
922 continue;
925 count++;
927 if ( count == 1 )
928 { retCNT = cn_temp; }
929 else if ( count == 2 )
930 { return NULL; }
932 cn_temp = cn_temp->next;
935 return retCNT;
938 /*************************************************
939 Function:
940 checkConnect
941 Description:
942 Check whether the connection between two contigs exists and
943 the connection's status.
944 Input:
945 1. from_c: left contig
946 2. to_c: right contig
947 Output:
948 None.
949 Return:
950 Pointer to qualified connection or NULL.
951 *************************************************/
952 static CONNECT * checkConnect ( unsigned int from_c, unsigned int to_c )
954 CONNECT * cn_temp = getCntBetween ( from_c, to_c );
956 if ( !cn_temp )
957 { return NULL; }
959 if ( !cn_temp->mask && !cn_temp->deleted )
960 { return cn_temp; }
962 //else
963 //printf("masked or deleted: %d\t%d\t%d\t%d\n",from_c,to_c,cn_temp->mask,cn_temp->deleted);
964 return NULL;
967 /*************************************************
968 Function:
969 setConnectMask
970 Description:
971 Sets the "mask" status of connection between two contigs and
972 releated upstream and downstream connection's status if these
973 connections exist.
974 Input:
975 1. from_c: left contig
976 2. to_c: right contig
977 3. mask: new status
978 Output:
979 None.
980 Return:
981 1 if setting successed.
982 *************************************************/
983 static int setConnectMask ( unsigned int from_c, unsigned int to_c, char mask )
985 CONNECT * cn_temp, *cn_bal, *cn_ds, *cn_us;
986 unsigned int bal_fc = getTwinCtg ( from_c );
987 unsigned int bal_tc = getTwinCtg ( to_c );
988 unsigned int ctg3, bal_ctg3;
989 cn_temp = getCntBetween ( from_c, to_c );
990 cn_bal = getCntBetween ( bal_tc, bal_fc );
992 if ( !cn_temp || !cn_bal )
994 return 0;
997 cn_temp->mask = mask;
998 cn_bal->mask = mask;
1000 if ( !mask )
1001 { return 1; }
1003 if ( cn_temp->nextInScaf ) //undo the binding
1005 setPrevInScaf ( cn_temp->nextInScaf, 0 );
1006 ctg3 = cn_temp->nextInScaf->contigID;
1007 setNextInScaf ( cn_temp, NULL );
1008 bal_ctg3 = getTwinCtg ( ctg3 );
1009 cn_ds = getCntBetween ( bal_ctg3, bal_tc );
1010 setNextInScaf ( cn_ds, NULL );
1011 setPrevInScaf ( cn_bal, 0 );
1014 // ctg3 -> from_c -> to_c
1015 // bal_ctg3 <- bal_fc <- bal_tc
1016 if ( cn_bal->nextInScaf )
1018 setPrevInScaf ( cn_bal->nextInScaf, 0 );
1019 bal_ctg3 = cn_bal->nextInScaf->contigID;
1020 setNextInScaf ( cn_bal, NULL );
1021 ctg3 = getTwinCtg ( bal_ctg3 );
1022 cn_us = getCntBetween ( ctg3, from_c );
1023 setNextInScaf ( cn_us, NULL );
1024 setPrevInScaf ( cn_temp, 0 );
1027 return 1;
1030 /*************************************************
1031 Function:
1032 setConnectUsed
1033 Description:
1034 Sets the "used" status of connection between two contigs if the
1035 connection exists.
1036 Input:
1037 1. from_c: left contig
1038 2. to_c: right contig
1039 3. flag: new status
1040 Output:
1041 None.
1042 Return:
1043 1 if setting successed.
1044 *************************************************/
1045 static boolean setConnectUsed ( unsigned int from_c, unsigned int to_c, char flag )
1047 CONNECT * cn_temp, *cn_bal;
1048 unsigned int bal_fc = getTwinCtg ( from_c );
1049 unsigned int bal_tc = getTwinCtg ( to_c );
1050 cn_temp = getCntBetween ( from_c, to_c );
1051 cn_bal = getCntBetween ( bal_tc, bal_fc );
1053 if ( !cn_temp || !cn_bal )
1055 return 0;
1058 cn_temp->used = flag;
1059 cn_bal->used = flag;
1060 return 1;
1063 /*************************************************
1064 Function:
1065 setConnectWP
1066 Description:
1067 Sets the "weakPoint" status of connection between two contigs if the
1068 connection exists.
1069 Input:
1070 1. from_c: left contig
1071 2. to_c: right contig
1072 3. flag: new status
1073 Output:
1074 None.
1075 Return:
1076 1 if setting successed.
1077 *************************************************/
1078 static int setConnectWP ( unsigned int from_c, unsigned int to_c, char flag )
1080 CONNECT * cn_temp, *cn_bal;
1081 unsigned int bal_fc = getTwinCtg ( from_c );
1082 unsigned int bal_tc = getTwinCtg ( to_c );
1083 cn_temp = getCntBetween ( from_c, to_c );
1084 cn_bal = getCntBetween ( bal_tc, bal_fc );
1086 if ( !cn_temp || !cn_bal )
1088 return 0;
1091 cn_temp->weakPoint = flag;
1092 cn_bal->weakPoint = flag;
1093 return 1;
1096 /*************************************************
1097 Function:
1098 setConnectDelete
1099 Description:
1100 Sets the "deleted" status of connection between two contigs if the
1101 connection exists. Cleans the related upstream and downstream
1102 connections if required.
1103 Input:
1104 1. from_c: left contig
1105 2. to_c: right contig
1106 3. flag: new status
1107 4. cleanBinding: indicator of whether cleaning related connections
1108 Output:
1109 None.
1110 Return:
1111 1 if setting successed.
1112 *************************************************/
1113 static int setConnectDelete ( unsigned int from_c, unsigned int to_c, char flag, boolean cleanBinding )
1115 CONNECT * cn_temp, *cn_bal;
1116 unsigned int bal_fc = getTwinCtg ( from_c );
1117 unsigned int bal_tc = getTwinCtg ( to_c );
1118 cn_temp = getCntBetween ( from_c, to_c );
1119 cn_bal = getCntBetween ( bal_tc, bal_fc );
1121 if ( !cn_temp || !cn_bal )
1123 return 0;
1126 cn_temp->deleted = flag;
1127 cn_bal->deleted = flag;
1129 if ( !flag )
1130 { return 1; }
1132 if ( cleanBinding )
1134 cn_temp->prevInScaf = 0;
1135 cn_temp->nextInScaf = NULL;
1136 cn_bal->prevInScaf = 0;
1137 cn_bal->nextInScaf = NULL;
1140 return 1;
1143 /*************************************************
1144 Function:
1145 maskContig
1146 Description:
1147 Sets the "mask" status of target contig and its connections to
1148 other contigs except those connections in scaffold.
1149 Input:
1150 1. ctg: target contig
1151 2. flag: new status
1152 Output:
1153 None.
1154 Return:
1155 None.
1156 *************************************************/
1157 static void maskContig ( unsigned int ctg, boolean flag )
1159 unsigned int bal_ctg, ctg2, bal_ctg2;
1160 CONNECT * cn_temp;
1161 bal_ctg = getTwinCtg ( ctg );
1162 cn_temp = contig_array[ctg].downwardConnect;
1164 while ( cn_temp )
1166 if ( cn_temp->mask || cn_temp->prevInScaf || cn_temp->nextInScaf || cn_temp->singleInScaf )
1168 cn_temp = cn_temp->next;
1169 continue;
1172 ctg2 = cn_temp->contigID;
1173 setConnectMask ( ctg, ctg2, flag );
1174 cn_temp = cn_temp->next;
1177 // bal_ctg2 <- bal_ctg
1178 cn_temp = contig_array[bal_ctg].downwardConnect;
1180 while ( cn_temp )
1182 if ( cn_temp->mask || cn_temp->prevInScaf || cn_temp->nextInScaf || cn_temp->singleInScaf )
1184 cn_temp = cn_temp->next;
1185 continue;
1188 bal_ctg2 = cn_temp->contigID;
1189 setConnectMask ( bal_ctg, bal_ctg2, flag );
1190 cn_temp = cn_temp->next;
1193 contig_array[ctg].mask = flag;
1194 contig_array[bal_ctg].mask = flag;
1198 /*************************************************
1199 Function:
1200 maskPuzzle
1201 Description:
1202 For contigs longer than "contigLen", masks them if they have
1203 more than one valid connection in either upstream of downstream
1204 direction, and have more than 'num_connect' valid connection.
1205 Input:
1206 1. num_connect: allowed valid connection number cutoff
1207 2. contigLen: contig length cutoff
1208 Output:
1209 None.
1210 Return:
1211 Masked contig number.
1212 *************************************************/
1213 static int maskPuzzle ( int num_connect, unsigned int contigLen )
1215 int in_num, out_num, flag = 0, puzzleCounter = 0;
1216 unsigned int i, bal_i;
1217 fprintf ( stderr, "Start to mask puzzles.\n" );
1219 for ( i = 1; i <= num_ctg; i++ )
1221 if ( contigLen && contig_array[i].length > contigLen )
1222 { break; }
1224 if ( contig_array[i].mask )
1225 { continue; }
1227 bal_i = getTwinCtg ( i );
1228 in_num = validConnect ( bal_i, NULL );
1229 out_num = validConnect ( i, NULL );
1231 if ( ( in_num > 1 || out_num > 1 ) && ( in_num + out_num >= num_connect ) )
1233 flag++;
1234 maskContig ( i, 1 );
1237 // upstream connection in scaffold
1238 in_num = validConnect ( bal_i, NULL );
1239 // downstream connection in scaffold
1240 out_num = validConnect ( i, NULL );
1242 if ( in_num > 1 || out_num > 1 )
1244 puzzleCounter++;
1245 //debugging2(i);
1248 if ( isSmallerThanTwin ( i ) )
1249 { i++; }
1252 fprintf ( stderr, " Masked contigs %d\n Remained puzzles %d\n", flag, puzzleCounter );
1253 return flag;
1256 /*************************************************
1257 Function:
1258 deleteWeakCnt
1259 Description:
1260 Updates the status of contigs' connections according to "cut_off",
1261 and then mask those contigs which form circle structure.
1263 A -> B -> A
1265 Input:
1266 1. cut_off: weight cutoff of connection
1267 Output:
1268 None.
1269 Return:
1270 None.
1271 *************************************************/
1272 static void deleteWeakCnt ( int cut_off )
1274 unsigned int i;
1275 CONNECT * cn_temp1;
1276 int weaks = 0, counter = 0;
1278 for ( i = 1; i <= num_ctg; i++ )
1280 cn_temp1 = contig_array[i].downwardConnect;
1282 while ( cn_temp1 )
1284 if ( !cn_temp1->mask && !cn_temp1->deleted && !cn_temp1->nextInScaf
1285 && !cn_temp1->singleInScaf && !cn_temp1->prevInScaf )
1287 counter++;
1290 if ( cn_temp1->weak && cn_temp1->deleted && cn_temp1->weight >= cut_off )
1292 cn_temp1->deleted = 0;
1293 cn_temp1->weak = 0;
1295 else if ( !cn_temp1->deleted && cn_temp1->weight > 0 && cn_temp1->weight < cut_off
1296 && !cn_temp1->nextInScaf && !cn_temp1->prevInScaf )
1298 cn_temp1->deleted = 1;
1299 cn_temp1->weak = 1;
1301 if ( cn_temp1->singleInScaf )
1302 { cn_temp1->singleInScaf = 0; }
1304 if ( !cn_temp1->mask )
1305 { weaks++; }
1308 cn_temp1 = cn_temp1->next;
1312 if ( counter > 0 )
1313 { fprintf ( stderr, " Active connections %d\n Weak connections %d\n Weak ratio %.1f%%\n", counter, weaks, ( float ) weaks / counter * 100 ); }
1315 checkCircle();
1319 /*************************************************
1320 Function:
1321 linearC2C
1322 Description:
1323 For three contigs A, B and C, connections A->B and A->C exist,
1324 check if there is a connection path connecting B and C or
1325 whether a new connection can be created between B and C.
1327 ------->
1328 A -> B -?-> C
1330 Input:
1331 1. starter: leftmost contig
1332 2. cnt2c1: connection A->B
1333 3. c2: rightmost contig
1334 4. min_dis: minimum distance between middle and rightmost contig
1335 5. max_dis: maximum distance between middle and rightmost contig
1336 Output:
1337 None.
1338 Return:
1339 -1 if B and C were the same contig.
1340 1 if connection path was found or connection was created.
1341 0 otherwise.
1342 *************************************************/
1343 static int linearC2C ( unsigned int starter, CONNECT * cnt2c1, unsigned int c2, int min_dis, int max_dis )
1345 int out_num, in_num;
1346 CONNECT * prevCNT, *cnt, *cn_temp;
1347 unsigned int c1, bal_c1, ctg, bal_c2;
1348 int len = 0;
1349 unsigned int bal_start = getTwinCtg ( starter );
1350 boolean excep;
1351 c1 = cnt2c1->contigID;
1353 if ( c1 == c2 )
1355 fprintf ( stderr, "linearC2C: c1(%d) and c2(%d) are the same contig.\n", c1, c2 );
1356 return -1;
1359 bal_c1 = getTwinCtg ( c1 );
1360 dsCtgCounter = 1;
1361 usCtgCounter = 0;
1362 downstreamCTG[dsCtgCounter++] = c1;
1363 bal_c2 = getTwinCtg ( c2 );
1364 upstreamCTG[usCtgCounter++] = bal_c2;
1365 // check if c1 is linearly connected to c2 by pe connections
1366 cnt = prevCNT = cnt2c1;
1368 while ( ( cnt = getNextContig ( c1, prevCNT, &excep ) ) != NULL )
1370 c1 = cnt->contigID;
1371 len += cnt->gapLen + contig_array[c1].length;
1373 if ( c1 == c2 )
1375 usCtgCounter--;
1376 return 1; //is interleaving.
1379 if ( len > max_dis || c1 == starter || c1 == bal_start )
1380 { return 0; }
1382 downstreamCTG[dsCtgCounter++] = c1;
1384 if ( dsCtgCounter >= MAXCinBetween )
1386 fprintf ( stderr, "%d downstream contigs, start at %d, max_dis %d, current dis %d\n"
1387 , dsCtgCounter, starter, max_dis, len );
1388 return 0;
1391 prevCNT = cnt;
1394 out_num = validConnect ( c1, NULL ); //new c1 should have no outgoing link.
1396 if ( out_num )
1397 { return 0; }
1399 //find the most upstream contig to c2
1400 cnt = prevCNT = NULL;
1401 ctg = bal_c2;
1403 while ( ( cnt = getNextContig ( ctg, prevCNT, &excep ) ) != NULL )
1405 ctg = cnt->contigID;
1406 len += cnt->gapLen + contig_array[ctg].length;
1408 if ( len > max_dis || ctg == starter || ctg == bal_start )
1409 { return 0; }
1411 prevCNT = cnt;
1412 upstreamCTG[usCtgCounter++] = ctg;
1414 if ( usCtgCounter >= MAXCinBetween )
1416 fprintf ( stderr, "%d upstream contigs, start at %d, max_dis %d, current dis %d\n"
1417 , usCtgCounter, starter, max_dis, len );
1418 return 0;
1422 if ( dsCtgCounter + usCtgCounter > MAXCinBetween )
1424 fprintf ( stderr, "%d downstream and %d upstream contigs.\n", dsCtgCounter, usCtgCounter );
1425 return 0;
1428 out_num = validConnect ( ctg, NULL ); //new c2 have no incoming link.
1430 if ( out_num )
1432 return 0;
1435 c2 = getTwinCtg ( ctg );
1436 min_dis -= len;
1437 max_dis -= len;
1439 if ( c1 == c2 || c1 == ctg || max_dis < 0 )
1440 { return 0; }
1442 usCtgCounter--;
1443 cn_temp = getCntBetween ( c1, c2 ); //have connection between new c1 and new c2
1445 if ( cn_temp )
1447 setConnectMask ( c1, c2, 0 );
1448 setConnectDelete ( c1, c2, 0, 0 );
1449 return 1;
1452 int oldsize = usCtgCounter;
1454 while ( getCntBetween ( c2, c1 ) && usCtgCounter > 1 )
1456 usCtgCounter--;
1457 c2 = getTwinCtg ( upstreamCTG[usCtgCounter] );
1460 if ( usCtgCounter != oldsize )
1462 unsigned int prev_c2 = upstreamCTG[usCtgCounter + 1];
1463 unsigned int bal_prev_c2 = getTwinCtg ( prev_c2 );
1464 setConnectMask ( bal_prev_c2, c2, 1 );
1465 setConnectMask ( bal_prev_c2, c2, 0 );
1466 int i = usCtgCounter + 1;
1468 for ( ; i <= oldsize; i++ )
1470 contig_array[upstreamCTG[i]].from_vt = prev_c2;
1471 contig_array[getTwinCtg ( upstreamCTG[i] )].to_vt = bal_prev_c2;
1474 if ( ( cn_temp = getCntBetween ( c1, c2 ) ) != NULL )
1476 setConnectMask ( c1, c2, 0 );
1477 setConnectDelete ( c1, c2, 0, 0 );
1478 return 1;
1482 len = ( min_dis + max_dis ) / 2 >= 0 ? ( min_dis + max_dis ) / 2 : 0;
1483 cn_temp = allocateCN ( c2, len );
1485 if ( cntLookupTable )
1486 { putCnt2LookupTable ( c1, cn_temp ); }
1488 cn_temp->weight = 0; // special connect from the original graph
1489 cn_temp->next = contig_array[c1].downwardConnect;
1490 contig_array[c1].downwardConnect = cn_temp;
1491 bal_c1 = getTwinCtg ( c1 );
1492 bal_c2 = getTwinCtg ( c2 );
1493 cn_temp = allocateCN ( bal_c1, len );
1495 if ( cntLookupTable )
1496 { putCnt2LookupTable ( bal_c2, cn_temp ); }
1498 cn_temp->weight = 0; // special connect from the original graph
1499 cn_temp->next = contig_array[bal_c2].downwardConnect;
1500 contig_array[bal_c2].downwardConnect = cn_temp;
1501 return 1;
1505 /*************************************************
1506 Function:
1507 catUsDsContig
1508 Description:
1509 Concatenates upstream and downstream contig arrays together.
1510 Input:
1511 None.
1512 Output:
1513 None.
1514 Return:
1515 None.
1516 *************************************************/
1517 static void catUsDsContig()
1519 int i;
1521 for ( i = 0; i < dsCtgCounter; i++ )
1522 { * ( unsigned int * ) darrayPut ( solidArray, i ) = downstreamCTG[i]; }
1524 for ( i = usCtgCounter; i >= 0; i-- )
1526 * ( unsigned int * ) darrayPut ( solidArray, dsCtgCounter++ ) = getTwinCtg ( upstreamCTG[i] );
1529 solidCounter = dsCtgCounter;
1533 /*************************************************
1534 Function:
1535 consolidate
1536 Description:
1537 Constructs scaffolds by binding connection relation among contigs
1538 in "solidArray".
1539 Input:
1540 None.
1541 Output:
1542 None.
1543 Return:
1544 None.
1545 *************************************************/
1546 static void consolidate()
1548 int i, j;
1549 CONNECT * prevCNT = NULL;
1550 CONNECT * cnt;
1551 unsigned int to_ctg;
1552 unsigned int from_ctg = * ( unsigned int * ) darrayGet ( solidArray, 0 );
1554 for ( i = 1; i < solidCounter; i++ )
1556 to_ctg = * ( unsigned int * ) darrayGet ( solidArray, i );
1557 cnt = checkConnect ( from_ctg, to_ctg );
1559 if ( !cnt )
1561 fprintf ( stderr, "consolidate A: no connect from %d to %d\n",
1562 from_ctg, to_ctg );
1564 for ( j = 0; j < solidCounter; j++ )
1565 { fprintf ( stderr, "%d-->", * ( unsigned int * ) darrayGet ( solidArray, j ) ); }
1567 fprintf ( stderr, "\n" );
1568 return;
1571 cnt->singleInScaf = solidCounter == 2 ? 1 : 0;
1573 if ( prevCNT )
1575 setNextInScaf ( prevCNT, cnt );
1576 setPrevInScaf ( cnt, 1 );
1579 prevCNT = cnt;
1580 from_ctg = to_ctg;
1583 //the reverse complementary path
1584 from_ctg = getTwinCtg ( * ( unsigned int * ) darrayGet ( solidArray, solidCounter - 1 ) );
1585 prevCNT = NULL;
1587 for ( i = solidCounter - 2; i >= 0; i-- )
1589 to_ctg = getTwinCtg ( * ( unsigned int * ) darrayGet ( solidArray, i ) );
1590 cnt = checkConnect ( from_ctg, to_ctg );
1592 if ( !cnt )
1594 fprintf ( stderr, "consolidate B: no connect from %d to %d\n", from_ctg, to_ctg );
1595 return;
1598 cnt->singleInScaf = solidCounter == 2 ? 1 : 0;
1600 if ( prevCNT )
1602 setNextInScaf ( prevCNT, cnt );
1603 setPrevInScaf ( cnt, 1 );
1606 prevCNT = cnt;
1607 from_ctg = to_ctg;
1611 static void debugging1 ( unsigned int ctg1, unsigned int ctg2 )
1613 CONNECT * cn1;
1614 cn1 = getCntBetween ( ctg1, ctg2 );
1616 if ( cn1 )
1618 fprintf ( stderr, "(%d,%d) mask %d deleted %d w %d,singleInScaf %d\n",
1619 ctg1, ctg2, cn1->mask, cn1->deleted, cn1->weight, cn1->singleInScaf );
1621 if ( cn1->nextInScaf )
1622 { fprintf ( stderr, "%d->%d->%d\n", ctg1, ctg2, cn1->nextInScaf->contigID ); }
1624 if ( cn1->prevInScaf )
1625 { fprintf ( stderr, "*->%d->%d\n", ctg1, ctg2 ); }
1626 else if ( !cn1->nextInScaf )
1627 { fprintf ( stderr, "NULL->%d->%d->NULL\n", ctg1, ctg2 ); }
1629 else
1630 { fprintf ( stderr, "%d -X- %d\n", ctg1, ctg2 ); }
1633 /*************************************************
1634 Function:
1635 removeTransitive
1636 Description:
1637 For three contigs A, B and C, connections A->B and A->C exist.
1638 If there is a connection path connecting B and C or a new
1639 connection can be created between B and C, then deletes
1640 connection A->C.
1642 ------->
1643 A -> B -?-> C
1644 Input:
1645 None.
1646 Output:
1647 None.
1648 Return:
1649 None.
1650 *************************************************/
1651 static void removeTransitive()
1653 unsigned int i, bal_ctg;
1654 int flag = 1, out_num, in_num, count, min, max, linear;
1655 CONNECT * cn_temp, *cn1 = NULL, *cn2 = NULL;
1656 int multi_out = 0, single_out = 0, two_out = 0, may_transitive = 0, not_transitive = 0, cycle_num = 0, mask_ctg = 0, no_out = 0;
1657 fprintf ( stderr, "Start to remove transitive connection.\n" );
1659 while ( flag )
1661 flag = 0;
1662 two_out = 0;
1663 not_transitive = 0;
1664 may_transitive = 0;
1665 cycle_num++;
1667 for ( i = 1; i <= num_ctg; i++ )
1669 if ( contig_array[i].mask )
1671 if ( cycle_num == 1 )
1672 { mask_ctg++; }
1674 continue;
1677 out_num = validConnect ( i, NULL );
1679 if ( out_num != 2 )
1681 if ( out_num == 1 && cycle_num == 1 )
1682 { single_out++; }
1684 if ( out_num > 2 && cycle_num == 1 )
1685 { multi_out++; }
1687 if ( out_num == 0 && cycle_num == 1 )
1688 { no_out++; }
1690 continue;
1693 two_out++;
1694 cn_temp = contig_array[i].downwardConnect;
1695 count = 0;
1697 while ( cn_temp )
1699 if ( cn_temp->deleted || cn_temp->mask )
1701 cn_temp = cn_temp->next;
1702 continue;
1705 count++;
1707 if ( count == 1 )
1708 { cn1 = cn_temp; }
1709 else if ( count == 2 )
1711 cn2 = cn_temp;
1713 else
1714 { break; }
1716 cn_temp = cn_temp->next;
1719 if ( count > 2 )
1721 fprintf ( stderr, "%d valid connections from ctg %d\n", count, i );
1722 continue;
1725 if ( cn1->gapLen > cn2->gapLen )
1727 cn_temp = cn1;
1728 cn1 = cn2;
1729 cn2 = cn_temp;
1730 } //make sure cn1 is closer to contig i than cn2
1732 if ( cn1->prevInScaf && cn2->prevInScaf )
1733 { continue; }
1735 bal_ctg = getTwinCtg ( cn2->contigID );
1736 in_num = validConnect ( bal_ctg, NULL );
1738 if ( in_num > 2 )
1739 { continue; }
1741 int bal_c1 = getTwinCtg ( cn1->contigID );
1742 in_num = validConnect ( bal_c1, NULL );
1744 if ( in_num > 1 )
1745 { continue; }
1747 min = cn2->gapLen - cn1->gapLen - contig_array[cn1->contigID].length - ins_size_var / 2;
1748 max = cn2->gapLen - cn1->gapLen - contig_array[cn1->contigID].length + ins_size_var / 2;
1750 if ( max < 0 )
1751 { continue; }
1753 may_transitive++;
1754 //temprarily delete cn2
1755 setConnectDelete ( i, cn2->contigID, 1, 0 );
1756 int oldc2 = cn2->contigID;
1757 linear = linearC2C ( i, cn1, cn2->contigID, min, max );
1759 if ( linear != 1 )
1761 not_transitive++;
1762 setConnectDelete ( i, cn2->contigID, 0, 0 );
1763 continue;
1765 else
1767 downstreamCTG[0] = i;
1768 catUsDsContig();
1770 if ( !checkSimple ( solidArray, solidCounter ) )
1771 { continue; }
1773 cn1 = getCntBetween ( * ( unsigned int * ) darrayGet ( solidArray, solidCounter - 2 ),
1774 * ( unsigned int * ) darrayGet ( solidArray, solidCounter - 1 ) );
1776 if ( cn1 && cn1->nextInScaf && cn2->nextInScaf )
1778 setConnectDelete ( i, cn2->contigID, 0, 0 );
1779 continue;
1782 consolidate();
1784 if ( cn2->prevInScaf )
1785 substitueDSinScaf ( cn2, * ( unsigned int * ) darrayGet ( solidArray, 0 ),
1786 * ( unsigned int * ) darrayGet ( solidArray, 1 ) );
1788 if ( cn2->nextInScaf )
1789 { substitueUSinScaf ( cn2, * ( unsigned int * ) darrayGet ( solidArray, solidCounter - 2 ) ); }
1791 flag++;
1795 if ( cycle_num == 1 )
1797 fprintf ( stderr, "Total contigs %u\n", num_ctg );
1798 fprintf ( stderr, "Masked contigs %d\n", mask_ctg );
1799 fprintf ( stderr, "Remained contigs %u\n", num_ctg - mask_ctg );
1800 fprintf ( stderr, "None-outgoing-connection contigs %d", no_out );
1802 if ( num_ctg - mask_ctg > 0 )
1804 fprintf ( stderr, " (%1f%%)", ( float ) no_out / ( num_ctg - mask_ctg ) * 100 );
1807 fprintf ( stderr, "\nSingle-outgoing-connection contigs %d\n", single_out );
1808 fprintf ( stderr, "Multi-outgoing-connection contigs %d\n", multi_out );
1811 fprintf ( stderr, "Cycle %d\n Two-outgoing-connection contigs %d\n Potential transitive connections %d\n Transitive connections %d\n", cycle_num, two_out, may_transitive, flag );
1813 if ( two_out > 0 )
1815 fprintf ( stderr, " Transitive ratio %.1f%%\n", ( float ) flag / two_out * 100 );
1818 if ( flag == 0 )
1819 { break; }
1823 static void debugging2 ( unsigned int ctg )
1825 if ( ctg > num_ctg )
1827 return;
1830 CONNECT * cn1 = contig_array[ctg].downwardConnect;
1832 while ( cn1 )
1834 if ( cn1->nextInScaf )
1835 { fprintf ( stderr, "with nextInScaf %u,", cn1->nextInScaf->contigID ); }
1837 if ( cn1->prevInScaf )
1838 { fprintf ( stderr, "with prevInScaf," ); }
1840 fprintf ( stderr, "%u >> %u, weight %d, gapLen %d, mask %d deleted %d, inherit %d, singleInScaf %d, bySmall %d\n",
1841 ctg, cn1->contigID, cn1->weight, cn1->gapLen, cn1->mask, cn1->deleted, cn1->inherit, cn1->singleInScaf, cn1->bySmall );
1842 cn1 = cn1->next;
1845 static void debugging()
1847 // debugging1(13298356, 13245956);
1850 /*************************************************
1851 Function:
1852 simplifyCnt
1853 Description:
1854 Simplifys contig graph by two operations.
1855 1) Removes transitive connections.
1856 2) Picks up local contig graph and tries to line involved
1857 contigs.
1858 Input:
1859 None.
1860 Output:
1861 None.
1862 Return:
1863 None.
1864 *************************************************/
1865 static void simplifyCnt()
1867 removeTransitive();
1868 debugging();
1869 general_linearization ( 1 );
1870 debugging();
1873 /*************************************************
1874 Function:
1875 getIndexInArray
1876 Description:
1877 Gets contig's index in array.
1878 Input:
1879 1. node: contig
1880 Output:
1881 None.
1882 Return:
1883 Index of contig if contig existed.
1884 -1 otherwise.
1885 *************************************************/
1886 static int getIndexInArray ( unsigned int node )
1888 int index;
1890 for ( index = 0; index < nodeCounter; index++ )
1891 if ( nodesInSub[index] == node )
1892 { return index; }
1894 return -1;
1897 /*************************************************
1898 Function:
1899 putNodeIntoSubgraph
1900 Description:
1901 Puts contig into sub-graph.
1902 Input:
1903 1. heap: heap to store contigs
1904 2. distance: current contig's distance to base contig
1905 3. node: current contig
1906 4. index: contig's index in array
1907 Output:
1908 None.
1909 Return:
1910 0 if contig already existed in sub-graph.
1911 1 if operation succeeded.
1912 -1 if index was larger than allowed maximum sub-graph size.
1913 *************************************************/
1914 static boolean putNodeIntoSubgraph ( FibHeap * heap, int distance, unsigned int node, int index )
1916 int pos = getIndexInArray ( node );
1918 if ( pos > 0 )
1920 return 0;
1923 if ( index >= MaxNodeInSub )
1924 { return -1; }
1926 insertNodeIntoHeap ( heap, distance, node );
1927 nodesInSub[index] = node;
1928 nodeDistance[index] = distance;
1929 return 1;
1932 /*************************************************
1933 Function:
1934 putChainIntoSubgraph
1935 Description:
1936 Puts contigs of connection chain into sub-graph.
1937 Input:
1938 1. heap: heap to store contigs
1939 2. distance: current contig's distance to base contig
1940 3. node: current contig
1941 4. index: contig's index in array
1942 5. prevC: upstream connection of current contig
1943 Output:
1944 1. index: index of next contig to add
1945 2. prevC: upstream connection of last contig in connection chain
1946 Return:
1947 0 if operation of putting contig into sub-graph failed.
1948 *************************************************/
1949 static boolean putChainIntoSubgraph ( FibHeap * heap, int distance, unsigned int node, int * index, CONNECT * prevC )
1951 unsigned int ctg = node;
1952 CONNECT * nextCnt;
1953 boolean excep, flag;
1954 int counter = *index;
1956 while ( 1 )
1958 nextCnt = getNextContig ( ctg, prevC, &excep );
1960 if ( excep || !nextCnt )
1962 *index = counter;
1963 return 1;
1966 ctg = nextCnt->contigID;
1967 distance += nextCnt->gapLen + contig_array[ctg].length;
1968 flag = putNodeIntoSubgraph ( heap, distance, ctg, counter );
1970 if ( flag < 0 )
1971 { return 0; }
1973 if ( flag > 0 )
1974 { counter++; }
1976 prevC = nextCnt;
1980 /*************************************************
1981 Function:
1982 checkUnique
1983 Description:
1984 Checks if a contig is unique by trying to line its upstream and
1985 downstream contigs.
1986 Input:
1987 1. node: contig to check
1988 2. tolerance: allowed percentage that overlap length among
1989 contigs in sub-graph accounts for in all involved contigs' length
1990 Output:
1991 None.
1992 Return:
1993 1 if contig was unique.
1994 *************************************************/
1995 static boolean checkUnique ( unsigned int node, double tolerance )
1997 CONNECT * ite_cnt;
1998 unsigned int currNode;
1999 int distance;
2000 int popCounter = 0;
2001 boolean flag;
2002 currNode = node;
2003 FibHeap * heap = newFibHeap();
2004 putNodeIntoSubgraph ( heap, 0, currNode, 0 );
2005 nodeCounter = 1;
2006 ite_cnt = contig_array[currNode].downwardConnect;
2008 while ( ite_cnt )
2010 if ( ite_cnt->deleted || ite_cnt->mask )
2012 ite_cnt = ite_cnt->next;
2013 continue;
2016 currNode = ite_cnt->contigID;
2017 distance = ite_cnt->gapLen + contig_array[currNode].length;
2018 flag = putNodeIntoSubgraph ( heap, distance, currNode, nodeCounter );
2020 if ( flag < 0 )
2022 destroyHeap ( heap );
2023 return 0;
2026 if ( flag > 0 )
2027 { nodeCounter++; }
2029 flag = putChainIntoSubgraph ( heap, distance, currNode, &nodeCounter, ite_cnt );
2031 if ( !flag )
2033 destroyHeap ( heap );
2034 return 0;
2037 ite_cnt = ite_cnt->next;
2040 if ( nodeCounter <= 2 ) // no more than 2 valid connections
2042 destroyHeap ( heap );
2043 return 1;
2046 while ( ( currNode = removeNextNodeFromHeap ( heap ) ) != 0 )
2047 { nodesInSubInOrder[popCounter++] = currNode; }
2049 destroyHeap ( heap );
2050 flag = checkOverlapInBetween ( tolerance );
2051 return flag;
2054 /*************************************************
2055 Function:
2056 maskRepeat
2057 Description:
2058 Masks repeat contigs.
2059 Input:
2060 None.
2061 Output:
2062 None.
2063 Return:
2064 None.
2065 *************************************************/
2066 static void maskRepeat()
2068 int in_num, out_num, flagA, flagB;
2069 int counter = 0;
2070 int puzzleCounter = 0;
2071 unsigned int i, bal_i;
2073 for ( i = 1; i <= num_ctg; i++ )
2075 if ( contig_array[i].mask )
2076 { continue; }
2078 bal_i = getTwinCtg ( i );
2079 in_num = validConnect ( bal_i, NULL );
2080 out_num = validConnect ( i, NULL );
2082 if ( in_num > 1 || out_num > 1 )
2083 { puzzleCounter++; }
2084 else
2086 if ( isSmallerThanTwin ( i ) )
2087 { i++; }
2089 continue;
2092 if ( contig_array[i].cvg > 1.4 * cvgAvg )
2094 counter++;
2095 maskContig ( i, 1 );
2097 if ( isSmallerThanTwin ( i ) )
2098 { i++; }
2100 continue;
2103 if ( in_num > 1 )
2104 { flagA = checkUnique ( bal_i, OverlapPercent ); }
2105 else
2106 { flagA = 1; }
2108 if ( out_num > 1 )
2109 { flagB = checkUnique ( i, OverlapPercent ); }
2110 else
2111 { flagB = 1; }
2113 if ( !flagA || !flagB )
2115 counter++;
2116 maskContig ( i, 1 );
2119 if ( isSmallerThanTwin ( i ) )
2120 { i++; }
2123 fprintf ( stderr, "Mask repeats:\n Puzzles %d\n Masked contigs %d\n", puzzleCounter, counter );
2126 /*************************************************
2127 Function:
2128 Countlink
2129 Description:
2130 Counts valid connections of all contigs.
2131 Input:
2132 None.
2133 Output:
2134 None.
2135 Return:
2136 Valid connections of all contigs.
2137 *************************************************/
2138 static int Countlink()
2140 unsigned int i, bal_i;
2141 int conflict_count = 0;
2143 for ( i = 1; i < num_ctg; i++ )
2145 if ( contig_array[i].mask )
2146 { continue; }
2148 int out_num = validConnect ( i, NULL );
2150 if ( out_num > 1 )
2151 { conflict_count++; }
2154 return conflict_count;
2157 /*************************************************
2158 Function:
2159 ordering
2160 Description:
2161 Constructs scaffold using alignment information of paired-end
2162 reads rank by rank.
2163 Input:
2164 1. deWeak: indicator of deleting weak connection.
2165 2. downS: indicator of large insert size paired-end reads
2166 3. nonlinear: indicator of simplifying graph using loose restraint
2167 4. infile: prefix of graph
2168 Output:
2169 None.
2170 Return:
2171 None.
2172 *************************************************/
2173 static void ordering ( boolean deWeak, boolean downS, boolean nonlinear, char * infile )
2175 int conf0, conf1, conf2, conf3, conf4, conf5;
2176 debugging();
2178 if ( downS )
2180 downSlide();
2181 debugging();
2183 if ( deWeak )
2184 { deleteWeakCnt ( weakPE ); }
2186 else
2188 if ( deWeak )
2189 { deleteWeakCnt ( weakPE ); }
2192 debugging();
2193 simplifyCnt();
2194 debugging();
2195 maskRepeat();
2196 debugging();
2197 simplifyCnt();
2198 debugging();
2200 if ( nonlinear )
2202 fprintf ( stderr, "Non-strict linearization.\n" );
2203 general_linearization ( 0 );
2206 maskPuzzle ( 2, 0 );
2207 debugging();
2208 freezing();
2209 debugging();
2213 /*************************************************
2214 Function:
2215 checkOverlapInBetween
2216 Description:
2217 Checks if adjacent contigs in the array have reasonable overlap.
2218 Input:
2219 1. tolerance: max percentage that overlap length accounts for
2220 Output:
2221 None.
2222 Return:
2223 1 if the overlap situation was resonable.
2224 *************************************************/
2225 boolean checkOverlapInBetween ( double tolerance )
2227 int i, gap;
2228 int index;
2229 unsigned int node;
2230 int lenSum, lenOlp;
2231 lenSum = lenOlp = 0;
2233 for ( i = 0; i < nodeCounter; i++ )
2235 node = nodesInSubInOrder[i];
2236 lenSum += contig_array[node].length;
2237 index = getIndexInArray ( node );
2238 nodeDistanceInOrder[i] = nodeDistance[index];
2241 if ( lenSum < 1 )
2242 { return 1; }
2244 for ( i = 0; i < nodeCounter - 1; i++ )
2246 gap = nodeDistanceInOrder[i + 1] - nodeDistanceInOrder[i]
2247 - contig_array[nodesInSubInOrder[i + 1]].length;
2249 if ( -gap > 0 )
2250 { lenOlp += -gap; }
2252 if ( ( double ) lenOlp / lenSum > tolerance )
2254 return 0;
2258 return 1;
2262 /********* the following codes are for freezing current scaffolds ****************/
2263 /*************************************************
2264 Function:
2265 setUsed
2266 Description:
2267 Checks status of connection between contigs. If none of them
2268 equals to "flag", sets all status to "flag" and adjusts the related
2269 indicators accordingly. Otherwise does NOT change the status
2270 at all.
2271 Input:
2272 1. start: the first contig
2273 2. array: contig array
2274 3. max_steps: max number of connection to check
2275 4. flag: new status
2276 Output:
2277 None.
2278 Return:
2279 o if setting successed.
2280 *************************************************/
2281 static boolean setUsed ( unsigned int start, unsigned int * array, int max_steps, boolean flag )
2283 unsigned int prevCtg = start;
2284 unsigned int twinA, twinB;
2285 int j;
2286 CONNECT * cnt;
2287 boolean usedFlag = 0;
2288 // save 'used' to 'checking'
2289 prevCtg = start;
2291 for ( j = 0; j < max_steps; j++ )
2293 if ( array[j] == 0 )
2294 { break; }
2296 cnt = getCntBetween ( prevCtg, array[j] );
2298 if ( !cnt )
2300 fprintf ( stderr, "setUsed: no connect between %d and %d\n", prevCtg, array[j] );
2301 prevCtg = array[j];
2302 continue;
2305 if ( cnt->used == flag || cnt->nextInScaf || cnt->prevInScaf || cnt->singleInScaf )
2307 return 1;
2310 cnt->checking = cnt->used;
2311 twinA = getTwinCtg ( prevCtg );
2312 twinB = getTwinCtg ( array[j] );
2313 cnt = getCntBetween ( twinB, twinA );
2315 if ( cnt )
2316 { cnt->checking = cnt->used; }
2318 prevCtg = array[j];
2321 // set used to flag
2322 prevCtg = start;
2324 for ( j = 0; j < max_steps; j++ )
2326 if ( array[j] == 0 )
2327 { break; }
2329 cnt = getCntBetween ( prevCtg, array[j] );
2331 if ( !cnt )
2333 prevCtg = array[j];
2334 continue;
2337 if ( cnt->used == flag )
2339 usedFlag = 1;
2340 break;
2343 cnt->used = flag;
2344 twinA = getTwinCtg ( prevCtg );
2345 twinB = getTwinCtg ( array[j] );
2346 cnt = getCntBetween ( twinB, twinA );
2348 if ( cnt )
2349 { cnt->used = flag; }
2351 prevCtg = array[j];
2354 // set mask to 'NOT flag' or set used to original value
2355 prevCtg = start;
2357 for ( j = 0; j < max_steps; j++ )
2359 if ( array[j] == 0 )
2360 { break; }
2362 cnt = getCntBetween ( prevCtg, array[j] );
2364 if ( !cnt )
2366 prevCtg = array[j];
2367 continue;
2370 if ( !usedFlag )
2371 { cnt->mask = 1 - flag; }
2372 else
2373 { cnt->used = cnt->checking; }
2375 twinA = getTwinCtg ( prevCtg );
2376 twinB = getTwinCtg ( array[j] );
2377 cnt = getCntBetween ( twinB, twinA );
2378 cnt->used = 1 - flag;
2380 if ( !usedFlag )
2381 { cnt->mask = 1 - flag; }
2382 else
2383 { cnt->used = cnt->checking; }
2385 prevCtg = array[j];
2388 return usedFlag;
2392 /*************************************************
2393 Function:
2394 score_pass
2395 Description:
2396 Checks whether a contig is supposed to locate in a scaffold
2397 according to its connections to other contigs in this scaffold.
2398 Input:
2399 1. array: array of contigs of part of or whole scaffold
2400 2. Counter: contig number in array
2401 3. beforep: index of previous contig
2402 4. afterp: index of next contig
2403 5. id: current contig
2404 Output:
2405 None.
2406 Return:
2407 1 if this contig had enough support from other contigs.
2408 *************************************************/
2409 int score_pass ( DARRAY * array, int Counter, int beforep, int afterp, int id )
2411 int outnum = allConnect ( id, NULL );
2412 int innum = allConnect ( getTwinCtg ( id ), NULL );
2413 int start = beforep - 2 * innum > 0 ? beforep - 2 * innum : 0;
2414 int end = afterp + 2 * outnum < Counter ? afterp + 2 * outnum : Counter;
2415 int i, inc = 1, outc = 1;
2416 CONNECT * dh_cnt;
2418 for ( i = start; i < end; i++ )
2420 if ( i < beforep )
2422 dh_cnt = getCntBetween ( * ( unsigned int * ) darrayGet ( array, i ), id );
2424 if ( dh_cnt )
2425 { inc++; }
2428 if ( i > afterp )
2430 dh_cnt = getCntBetween ( id, * ( unsigned int * ) darrayGet ( array, i ) );
2432 if ( dh_cnt )
2433 { outc++; }
2437 if ( inc == innum || outc == outnum )
2438 { return 1; }
2440 if ( ( inc == 1 && innum > 2 ) || ( outc == 1 && outnum > 2 ) )
2441 { return 0; }
2443 int score = ( int ) ( ( ( double ) ( inc * outc ) / ( double ) ( innum * outnum ) ) * 100 );
2445 if ( score > 30 )
2446 { return 1; }
2448 return 0;
2451 /*************************************************
2452 Function:
2453 recoverMask
2454 Description:
2455 Searchs a contig path between two contigs accroding to
2456 connection relation. If a path is found, recovers those masked
2457 connection between contigs in this path.
2458 A ------ D
2459 > [i] <
2461 Input:
2462 None.
2463 Output:
2464 None.
2465 Return:
2466 None.
2467 *************************************************/
2468 static void recoverMask()
2470 unsigned int i, ctg, bal_ctg, start, finish;
2471 int num3, num5, j, t;
2472 CONNECT * bindCnt, *cnt;
2473 int min, max, max_steps = 5, num_route, length;
2474 int tempCounter, recoverCounter = 0;
2475 boolean multiUSE, change;
2476 int stat[] = {0, 0, 0, 0, 0, 0, 0};
2478 for ( i = 1; i <= num_ctg; i++ )
2479 { contig_array[i].flag = 0; }
2481 so_far = ( unsigned int * ) ckalloc ( max_n_routes * sizeof ( unsigned int ) );
2482 found_routes = ( unsigned int ** ) ckalloc ( max_n_routes * sizeof ( unsigned int * ) );
2484 for ( j = 0; j < max_n_routes; j++ )
2485 { found_routes[j] = ( unsigned int * ) ckalloc ( max_steps * sizeof ( unsigned int ) ); }
2487 for ( i = 1; i <= num_ctg; i++ )
2489 if ( contig_array[i].flag || contig_array[i].mask || !contig_array[i].downwardConnect )
2490 { continue; }
2492 bindCnt = getBindCnt ( i );
2494 if ( !bindCnt )
2495 { continue; }
2497 //first scan get the average coverage by longer pe
2498 num5 = num3 = 0;
2499 ctg = i;
2500 * ( unsigned int * ) darrayPut ( scaf5, num5++ ) = i;
2501 contig_array[i].flag = 1;
2502 contig_array[getTwinCtg ( i )].flag = 1;
2504 while ( bindCnt )
2506 if ( bindCnt->used )
2507 { break; }
2509 setConnectUsed ( ctg, bindCnt->contigID, 1 );
2510 ctg = bindCnt->contigID;
2511 * ( unsigned int * ) darrayPut ( scaf5, num5++ ) = ctg;
2512 bal_ctg = getTwinCtg ( ctg );
2513 contig_array[ctg].flag = 1;
2514 contig_array[bal_ctg].flag = 1;
2515 bindCnt = bindCnt->nextInScaf;
2518 ctg = getTwinCtg ( i );
2519 bindCnt = getBindCnt ( ctg );
2521 while ( bindCnt )
2523 if ( bindCnt->used )
2524 { break; }
2526 setConnectUsed ( ctg, bindCnt->contigID, 1 );
2527 ctg = bindCnt->contigID;
2528 bal_ctg = getTwinCtg ( ctg );
2529 contig_array[ctg].flag = 1;
2530 contig_array[bal_ctg].flag = 1;
2531 * ( unsigned int * ) darrayPut ( scaf3, num3++ ) = bal_ctg;
2532 bindCnt = bindCnt->nextInScaf;
2535 if ( num5 + num3 < 2 )
2536 { continue; }
2538 tempCounter = solidCounter = 0;
2540 for ( j = num3 - 1; j >= 0; j-- )
2541 * ( unsigned int * ) darrayPut ( tempArray, tempCounter++ ) =
2542 * ( unsigned int * ) darrayGet ( scaf3, j );
2544 for ( j = 0; j < num5; j++ )
2545 * ( unsigned int * ) darrayPut ( tempArray, tempCounter++ ) =
2546 * ( unsigned int * ) darrayGet ( scaf5, j );
2548 change = 0;
2550 for ( t = 0; t < tempCounter - 1; t++ )
2552 * ( unsigned int * ) darrayPut ( solidArray, solidCounter++ ) =
2553 * ( unsigned int * ) darrayGet ( tempArray, t );
2554 start = * ( unsigned int * ) darrayGet ( tempArray, t );
2555 finish = * ( unsigned int * ) darrayGet ( tempArray, t + 1 );
2556 num_route = num_trace = 0;
2557 cnt = checkConnect ( start, finish );
2559 if ( !cnt )
2561 fprintf ( stderr, "Warning from recoverMask: no connection (%d %d), start at %d\n",
2562 start, finish, i );
2563 cnt = getCntBetween ( start, finish );
2565 if ( cnt )
2566 { debugging1 ( start, finish ); }
2568 continue;
2571 length = cnt->gapLen + contig_array[finish].length;
2572 min = length - 1.5 * ins_size_var;
2573 max = length + 1.5 * ins_size_var;
2574 traceAlongMaskedCnt ( finish, start, max_steps, min, max, 0, 0, &num_route );
2576 if ( finish == start )
2578 for ( j = 0; j < tempCounter; j++ )
2579 { fprintf ( stderr, "->%d", * ( unsigned int * ) darrayGet ( tempArray, j ) ); }
2581 fprintf ( stderr, ": start at %d\n", i );
2584 if ( num_route == 1 )
2586 for ( j = 0; j < max_steps; j++ )
2587 if ( found_routes[0][j] == 0 )
2588 { break; }
2590 if ( j < 1 )
2591 { continue; }
2593 //check if connects have been used more than once
2594 multiUSE = setUsed ( start, found_routes[0], max_steps, 1 );
2596 if ( multiUSE )
2597 { continue; }
2599 for ( j = 0; j < max_steps; j++ )
2601 if ( j + 1 == max_steps || found_routes[0][j + 1] == 0 )
2602 { break; }
2604 * ( unsigned int * ) darrayPut ( solidArray, solidCounter++ ) = found_routes[0][j];
2605 contig_array[found_routes[0][j]].flag = 1;
2606 contig_array[getTwinCtg ( found_routes[0][j] )].flag = 1;
2609 recoverCounter += j;
2610 setConnectDelete ( start, finish, 1, 1 );
2611 change = 1;
2612 stat[0]++;
2613 } //end if num_route=1
2614 else if ( num_route > 1 )
2616 //multi-route.
2617 int k, l, num, longest = 0, longestid = 0;
2618 int merg = 0, quality = 0;
2620 // get the longest route.
2621 for ( k = 0; k < num_route; k++ )
2623 for ( j = 0; j < max_steps; j++ )
2625 if ( j + 1 == max_steps || found_routes[k][j + 1] == 0 )
2627 if ( j > longest )
2629 longest = j;
2630 longestid = k;
2633 break;
2638 stat[1]++;
2640 if ( longest == 1 ) //multi one.
2642 stat[2]++;
2644 for ( k = 0; k < num_route; k++ )
2646 if ( score_pass ( tempArray, tempCounter, t, t + 1, found_routes[k][0] ) )
2648 longestid = k;
2649 quality = 1;
2650 stat[3]++;
2651 break;
2655 if ( quality == 0 )
2657 continue;
2660 else
2662 stat[4]++;
2664 for ( k = 0; k < num_route; k++ )
2666 if ( k == longestid )
2667 { continue; }
2669 int merg_num = 0, total = 0;
2671 for ( j = 0; j < max_steps; j++ )
2673 if ( j + 1 == max_steps || found_routes[k][j + 1] == 0 )
2675 total = j;
2676 break;
2679 for ( l = 0; l < longest; l++ )
2681 if ( found_routes[k][j] == found_routes[longestid][l] )
2683 merg_num++;
2684 break;
2689 if ( merg_num == total )
2690 { merg++; }
2694 if ( merg == num_route - 1 || quality == 1 || merg >= longest )
2696 multiUSE = setUsed ( start, found_routes[longestid], max_steps, 1 );
2698 if ( multiUSE )
2699 { continue; }
2701 stat[5]++;
2703 for ( j = 0; j < longest; j++ )
2705 * ( unsigned int * ) darrayPut ( solidArray, solidCounter++ ) = found_routes[longestid][j];
2706 contig_array[found_routes[longestid][j]].flag = 1;
2707 contig_array[getTwinCtg ( found_routes[longestid][j] )].flag = 1;
2710 stat[6] += j;
2711 recoverCounter += j;
2712 setConnectDelete ( start, finish, 1, 1 );
2713 change = 1;
2718 * ( unsigned int * ) darrayPut ( solidArray, solidCounter++ ) =
2719 * ( unsigned int * ) darrayGet ( tempArray, tempCounter - 1 );
2721 if ( change )
2722 { consolidate(); }
2725 fprintf ( stderr, "\nRecover contigs.\n" );
2726 fprintf ( stderr, " Total recovered contigs %d\n", recoverCounter );
2727 fprintf ( stderr, " Single-route cases %d\n", stat[0] );
2728 fprintf ( stderr, " Multi-route cases %d\n", stat[1] );
2730 for ( i = 1; i <= num_ctg; i++ )
2732 cnt = contig_array[i].downwardConnect;
2734 while ( cnt )
2736 cnt->used = 0;
2737 cnt->checking = 0;
2738 cnt = cnt->next;
2742 for ( j = 0; j < max_n_routes; j++ )
2743 { free ( ( void * ) found_routes[j] ); }
2745 free ( ( void * ) found_routes );
2746 free ( ( void * ) so_far );
2750 /*************************************************
2751 Function:
2752 unBindLink
2753 Description:
2754 Destroys upstream and downstram connections of connection
2755 between two specified contigs.
2756 Input:
2757 1. CB: the first specified contig
2758 2. CC: the second specified contig
2759 Output:
2760 None.
2761 Return:
2762 None.
2763 *************************************************/
2764 static void unBindLink ( unsigned int CB, unsigned int CC )
2766 CONNECT * cnt1 = getCntBetween ( CB, CC );
2768 if ( !cnt1 )
2769 { return; }
2771 if ( cnt1->singleInScaf )
2772 { cnt1->singleInScaf = 0; }
2774 CONNECT * cnt2 = getCntBetween ( getTwinCtg ( CC ), getTwinCtg ( CB ) );
2776 if ( !cnt2 )
2777 { return; }
2779 if ( cnt2->singleInScaf )
2780 { cnt2->singleInScaf = 0; }
2782 if ( cnt1->nextInScaf )
2784 unsigned int CD = cnt1->nextInScaf->contigID;
2785 cnt1->nextInScaf->prevInScaf = 0;
2786 cnt1->nextInScaf = NULL;
2787 CONNECT * cnt3 = getCntBetween ( getTwinCtg ( CD ), getTwinCtg ( CC ) );
2789 if ( cnt3 )
2790 { cnt3->nextInScaf = NULL; }
2792 cnt2->prevInScaf = 0;
2795 if ( cnt2->nextInScaf )
2797 unsigned int bal_CA = cnt2->nextInScaf->contigID;
2798 cnt2->nextInScaf->prevInScaf = 0;
2799 cnt2->nextInScaf = NULL;
2800 CONNECT * cnt4 = getCntBetween ( getTwinCtg ( bal_CA ), CB );
2802 if ( cnt4 )
2803 { cnt4->nextInScaf = NULL; }
2805 cnt1->prevInScaf = 0;
2809 /*************************************************
2810 Function:
2811 freezing
2812 Description:
2813 Updates connection relation and scaffold ID of contigs in a scaffold.
2814 Input:
2815 None.
2816 Output:
2817 None.
2818 Return:
2819 None.
2820 *************************************************/
2821 static void freezing()
2823 int num5, num3;
2824 unsigned int ctg, bal_ctg;
2825 unsigned int i;
2826 int j, t;
2827 CONNECT * cnt, *prevCNT, *nextCnt;
2828 boolean excep;
2830 for ( i = 1; i <= num_ctg; i++ )
2832 contig_array[i].flag = 0;
2833 contig_array[i].from_vt = 0;
2834 contig_array[i].to_vt = 0;
2835 cnt = contig_array[i].downwardConnect;
2837 while ( cnt )
2839 cnt->used = 0;
2840 cnt->checking = 0;
2841 cnt->singleInScaf = 0;
2842 cnt = cnt->next;
2846 for ( i = 1; i <= num_ctg; i++ )
2848 if ( contig_array[i].flag || contig_array[i].mask )
2849 { continue; }
2851 if ( !contig_array[i].downwardConnect || !validConnect ( i, NULL ) )
2853 continue;
2856 num5 = num3 = 0;
2857 ctg = i;
2858 * ( unsigned int * ) darrayPut ( scaf5, num5++ ) = i;
2859 contig_array[i].flag = 1;
2860 contig_array[getTwinCtg ( i )].flag = 1;
2861 prevCNT = NULL;
2862 cnt = getNextContig ( ctg, prevCNT, &excep );
2864 while ( cnt )
2866 if ( contig_array[cnt->contigID].flag )
2868 unBindLink ( ctg, cnt->contigID );
2869 break;
2872 nextCnt = getNextContig ( cnt->contigID, cnt, &excep );
2873 setConnectUsed ( ctg, cnt->contigID, 1 );
2874 ctg = cnt->contigID;
2875 * ( unsigned int * ) darrayPut ( scaf5, num5++ ) = ctg;
2876 bal_ctg = getTwinCtg ( ctg );
2877 contig_array[ctg].flag = 1;
2878 contig_array[bal_ctg].flag = 1;
2879 prevCNT = cnt;
2880 cnt = nextCnt;
2883 ctg = getTwinCtg ( i );
2885 if ( num5 >= 2 )
2886 { prevCNT = checkConnect ( getTwinCtg ( * ( unsigned int * ) darrayGet ( scaf5, 1 ) ), ctg ); }
2887 else
2888 { prevCNT = NULL; }
2890 cnt = getNextContig ( ctg, prevCNT, &excep );
2892 while ( cnt )
2894 if ( contig_array[cnt->contigID].flag )
2896 unBindLink ( ctg, cnt->contigID );
2897 break;
2900 nextCnt = getNextContig ( cnt->contigID, cnt, &excep );
2901 setConnectUsed ( ctg, cnt->contigID, 1 );
2902 ctg = cnt->contigID;
2903 bal_ctg = getTwinCtg ( ctg );
2904 contig_array[ctg].flag = 1;
2905 contig_array[bal_ctg].flag = 1;
2906 * ( unsigned int * ) darrayPut ( scaf3, num3++ ) = bal_ctg;
2907 prevCNT = cnt;
2908 cnt = nextCnt;
2911 if ( num5 + num3 < 2 )
2912 { continue; }
2914 solidCounter = 0;
2916 for ( j = num3 - 1; j >= 0; j-- )
2917 * ( unsigned int * ) darrayPut ( solidArray, solidCounter++ ) =
2918 * ( unsigned int * ) darrayGet ( scaf3, j );
2920 for ( j = 0; j < num5; j++ )
2921 * ( unsigned int * ) darrayPut ( solidArray, solidCounter++ ) =
2922 * ( unsigned int * ) darrayGet ( scaf5, j );
2924 unsigned int firstCtg = 0;
2925 unsigned int lastCtg = 0;
2926 unsigned int firstTwin = 0;
2927 unsigned int lastTwin = 0;
2929 for ( t = 0; t < solidCounter; t++ )
2930 if ( !contig_array[* ( unsigned int * ) darrayGet ( solidArray, t )].mask )
2932 firstCtg = * ( unsigned int * ) darrayGet ( solidArray, t );
2933 break;
2936 for ( t = solidCounter - 1; t >= 0; t-- )
2937 if ( !contig_array[* ( unsigned int * ) darrayGet ( solidArray, t )].mask )
2939 lastCtg = * ( unsigned int * ) darrayGet ( solidArray, t );
2940 break;
2943 if ( firstCtg == 0 || lastCtg == 0 )
2945 fprintf ( stderr, "scaffold start at %d, stop at %d, freezing began with %d\n", firstCtg, lastCtg, i );
2947 for ( j = 0; j < solidCounter; j++ )
2948 fprintf ( stderr, "->%d(%d %d)", * ( unsigned int * ) darrayGet ( solidArray, j )
2949 , contig_array[* ( unsigned int * ) darrayGet ( solidArray, j )].mask
2950 , contig_array[* ( unsigned int * ) darrayGet ( solidArray, j )].flag );
2952 fprintf ( stderr, "\n" );
2954 else
2956 firstTwin = getTwinCtg ( firstCtg );
2957 lastTwin = getTwinCtg ( lastCtg );
2960 for ( t = 0; t < solidCounter; t++ )
2962 unsigned int ctg = * ( unsigned int * ) darrayGet ( solidArray, t );
2964 if ( contig_array[ctg].from_vt > 0 )
2966 contig_array[ctg].mask = 1;
2967 contig_array[getTwinCtg ( ctg )].mask = 1;
2968 fprintf ( stderr, "Repeat: contig %d (%d) appears more than once\n", ctg, getTwinCtg ( ctg ) );
2970 else
2972 contig_array[ctg].from_vt = firstCtg;
2973 contig_array[ctg].to_vt = lastCtg;
2974 contig_array[ctg].indexInScaf = t + 1;
2975 contig_array[getTwinCtg ( ctg )].from_vt = lastTwin;
2976 contig_array[getTwinCtg ( ctg )].to_vt = firstTwin;
2977 contig_array[getTwinCtg ( ctg )].indexInScaf = solidCounter - t;
2981 consolidate();
2984 fprintf ( stderr, "Freezing done.\n" );
2985 fflush ( stdout );
2987 for ( i = 1; i <= num_ctg; i++ )
2989 if ( contig_array[i].flag )
2990 { contig_array[i].flag = 0; }
2992 if ( contig_array[i].from_vt == 0 )
2994 contig_array[i].from_vt = i;
2995 contig_array[i].to_vt = i;
2998 cnt = contig_array[i].downwardConnect;
3000 while ( cnt )
3002 cnt->used = 0;
3003 cnt->checking = 0;
3004 cnt = cnt->next;
3009 /************** codes below this line are for pulling the scaffolds out ************/
3010 /*************************************************
3011 Function:
3012 output1gap
3013 Description:
3014 Outputs information of a filled gap.
3015 Input:
3016 1. fo: output file
3017 2. max_steps: maximum allowed found contigs for filling gap
3018 Output:
3019 None.
3020 Return:
3021 None.
3022 *************************************************/
3023 void output1gap ( FILE * fo, int max_steps )
3025 int i, len, seg;
3026 len = seg = 0;
3028 for ( i = 0; i < max_steps - 1; i++ )
3030 if ( found_routes[0][i + 1] == 0 )
3031 { break; }
3033 len += contig_array[found_routes[0][i]].length;
3034 seg++;
3037 fprintf ( fo, "GAP %d %d", len, seg );
3039 for ( i = 0; i < max_steps - 1; i++ )
3041 if ( found_routes[0][i + 1] == 0 )
3042 { break; }
3044 fprintf ( fo, " %d", found_routes[0][i] );
3047 fprintf ( fo, "\n" );
3050 static int weakCounter;
3052 /*************************************************
3053 Function:
3054 printCnts
3055 Description:
3056 Outputs connection information of specified contig.
3057 Input:
3058 1. fp: output file
3059 2. ctg: specified contig
3060 Output:
3061 None.
3062 Return:
3063 0 if contig's downstream connection was not weak connection.
3064 *************************************************/
3065 static boolean printCnts ( FILE * fp, unsigned int ctg )
3067 CONNECT * cnt = contig_array[ctg].downwardConnect;
3068 boolean flag = 0, ret = 0;
3069 unsigned int bal_ctg = getTwinCtg ( ctg );
3070 unsigned int linkCtg;
3072 if ( isSameAsTwin ( ctg ) )
3073 { return ret; }
3075 CONNECT * bindCnt = getBindCnt ( ctg );
3077 if ( bindCnt && bindCnt->bySmall && bindCnt->weakPoint )
3079 weakCounter++;
3080 fprintf ( fp, "\tWP" );
3081 ret = 1;
3084 while ( cnt )
3086 if ( cnt->weight && !cnt->inherit )
3088 if ( !flag )
3090 flag = 1;
3091 fprintf ( fp, "\t#DOWN " );
3094 linkCtg = cnt->contigID;
3096 if ( isLargerThanTwin ( linkCtg ) )
3097 { linkCtg = getTwinCtg ( linkCtg ); }
3099 fprintf ( fp, "%d:%d:%d ", index_array[linkCtg], cnt->weight, cnt->gapLen );
3102 cnt = cnt->next;
3105 flag = 0;
3106 cnt = contig_array[bal_ctg].downwardConnect;
3108 while ( cnt )
3110 if ( cnt->weight && !cnt->inherit )
3112 if ( !flag )
3114 flag = 1;
3115 fprintf ( fp, "\t#UP " );
3118 linkCtg = cnt->contigID;
3120 if ( isLargerThanTwin ( linkCtg ) )
3121 { linkCtg = getTwinCtg ( linkCtg ); }
3123 fprintf ( fp, "%d:%d:%d ", index_array[linkCtg], cnt->weight, cnt->gapLen );
3126 cnt = cnt->next;
3129 fprintf ( fp, "\n" );
3130 return ret;
3133 /*************************************************
3134 Function:
3135 ScafStat
3136 Description:
3137 Makes statistic of scaffolds and contigs, including total length,
3138 non-N length, number, N50, etc.
3139 Input:
3140 1. len_cut: length cutoff
3141 2. graphfile: prefix of graph
3142 Output:
3143 None.
3144 Return:
3145 None.
3146 *************************************************/
3147 void ScafStat ( int len_cut, char * graphfile )
3149 FILE * fp, *fp2, *fo;
3150 char line[1024];
3151 sprintf ( line, "%s.scafSeq", graphfile );
3152 fp = ckopen ( line, "r" );
3153 sprintf ( line, "%s.contig", graphfile );
3154 fp2 = ckopen ( line, "r" );
3155 sprintf ( line, "%s.scafStatistics", graphfile );
3156 fo = ckopen ( line, "w" );
3157 fprintf ( fo, "<-- Information for assembly Scaffold '%s.scafSeq'.(cut_off_length < 100bp) -->\n\n", graphfile );
3158 int cut_off_len = 0;
3159 char Nucleotide;
3160 char buf[4000];
3161 long Scaffold_Number = 0;
3162 long Scaffold_Number_Scaf = 0;
3163 long Scaffold_Number_Contig = 0;
3164 long Singleton_Number_Scaf = 0;
3165 long Singleton_Number = 0;
3166 long * Singleton_Seq = ( long * ) malloc ( sizeof ( long ) );
3167 long long A_num_all = 0;
3168 long * A_num = ( long * ) malloc ( sizeof ( long ) );
3169 long long C_num_all = 0;
3170 long * C_num = ( long * ) malloc ( sizeof ( long ) );
3171 long long G_num_all = 0;
3172 long * G_num = ( long * ) malloc ( sizeof ( long ) );
3173 long long T_num_all = 0;
3174 long * T_num = ( long * ) malloc ( sizeof ( long ) );
3175 long long N_num_all = 0;
3176 long * N_num = ( long * ) malloc ( sizeof ( long ) );
3177 long long Non_ACGTN_all = 0;
3178 long * Non_ACGTN = ( long * ) malloc ( sizeof ( long ) );
3179 long long Size_includeN = 0;
3180 long * Size_Seq = ( long * ) malloc ( sizeof ( long ) );
3181 int k;
3182 long long Sum = 0;
3183 int flag[10];
3184 int flag_known = 0;
3185 long n100 = 0;
3186 long n500 = 0;
3187 long n1k = 0;
3188 long n10k = 0;
3189 long n100k = 0;
3190 long n1m = 0;
3191 long N50 = 0;
3192 long N50_known = 0;
3193 long Num_N50_known = 0;
3194 cut_off_len = len_cut;
3195 A_num[Scaffold_Number] = 0;
3196 C_num[Scaffold_Number] = 0;
3197 G_num[Scaffold_Number] = 0;
3198 T_num[Scaffold_Number] = 0;
3199 N_num[Scaffold_Number] = 0;
3200 Non_ACGTN[Scaffold_Number] = 0;
3201 Size_Seq[Scaffold_Number] = 0;
3202 Singleton_Seq[Scaffold_Number] = 0;
3203 Nucleotide = fgetc ( fp );
3205 while ( Nucleotide != (char) EOF ) /* Bug Fix */
3207 if ( Nucleotide == '>' )
3209 if ( ( Scaffold_Number > 0 ) && ( Size_Seq[Scaffold_Number - 1] < cut_off_len ) )
3211 A_num_all = A_num_all - A_num[Scaffold_Number - 1];
3212 C_num_all = C_num_all - C_num[Scaffold_Number - 1];
3213 G_num_all = G_num_all - G_num[Scaffold_Number - 1];
3214 T_num_all = T_num_all - T_num[Scaffold_Number - 1];
3215 N_num_all = N_num_all - N_num[Scaffold_Number - 1];
3216 Non_ACGTN_all = Non_ACGTN_all - Non_ACGTN[Scaffold_Number - 1];
3217 Size_includeN = Size_includeN - Size_Seq[Scaffold_Number - 1];
3218 Singleton_Number = Singleton_Number - Singleton_Seq[Scaffold_Number - 1];
3219 Scaffold_Number = Scaffold_Number - 1;
3221 else
3223 Size_Seq = ( long * ) realloc ( Size_Seq, ( Scaffold_Number + 2 ) * sizeof ( long ) );
3224 A_num = ( long * ) realloc ( A_num, ( Scaffold_Number + 2 ) * sizeof ( long ) );
3225 C_num = ( long * ) realloc ( C_num, ( Scaffold_Number + 2 ) * sizeof ( long ) );
3226 G_num = ( long * ) realloc ( G_num, ( Scaffold_Number + 2 ) * sizeof ( long ) );
3227 T_num = ( long * ) realloc ( T_num, ( Scaffold_Number + 2 ) * sizeof ( long ) );
3228 N_num = ( long * ) realloc ( N_num, ( Scaffold_Number + 2 ) * sizeof ( long ) );
3229 Non_ACGTN = ( long * ) realloc ( Non_ACGTN, ( Scaffold_Number + 2 ) * sizeof ( long ) );
3230 Singleton_Seq = ( long * ) realloc ( Singleton_Seq, ( Scaffold_Number + 2 ) * sizeof ( long ) );
3233 Scaffold_Number++;
3234 A_num[Scaffold_Number - 1] = 0;
3235 C_num[Scaffold_Number - 1] = 0;
3236 G_num[Scaffold_Number - 1] = 0;
3237 T_num[Scaffold_Number - 1] = 0;
3238 N_num[Scaffold_Number - 1] = 0;
3239 Non_ACGTN[Scaffold_Number - 1] = 0;
3240 Size_Seq[Scaffold_Number - 1] = 0;
3241 Singleton_Seq[Scaffold_Number - 1] = 0;
3242 Nucleotide = fgetc ( fp );
3244 if ( Nucleotide == 'C' )
3246 Singleton_Number++;
3247 Singleton_Seq[Scaffold_Number - 1] ++;
3250 fgets ( buf, 4000, fp );
3252 else if ( ( Nucleotide == 'N' ) || ( Nucleotide == 'n' ) )
3254 N_num[Scaffold_Number - 1] ++;
3255 N_num_all++;
3256 Size_Seq[Scaffold_Number - 1] ++;
3257 Size_includeN++;
3259 else if ( ( Nucleotide == 'A' ) || ( Nucleotide == 'a' ) )
3261 A_num[Scaffold_Number - 1] ++;
3262 A_num_all++;
3263 Size_Seq[Scaffold_Number - 1] ++;
3264 Size_includeN++;
3266 else if ( ( Nucleotide == 'C' ) || ( Nucleotide == 'c' ) )
3268 C_num[Scaffold_Number - 1] ++;
3269 C_num_all++;
3270 Size_Seq[Scaffold_Number - 1] ++;
3271 Size_includeN++;
3273 else if ( ( Nucleotide == 'G' ) || ( Nucleotide == 'g' ) )
3275 G_num[Scaffold_Number - 1] ++;
3276 G_num_all++;
3277 Size_Seq[Scaffold_Number - 1] ++;
3278 Size_includeN++;
3280 else if ( ( Nucleotide == 'T' ) || ( Nucleotide == 't' ) )
3282 T_num[Scaffold_Number - 1] ++;
3283 T_num_all++;
3284 Size_Seq[Scaffold_Number - 1] ++;
3285 Size_includeN++;
3287 else
3289 if ( ( Nucleotide != '\n' ) && ( Nucleotide != '\r' ) )
3291 Non_ACGTN[Scaffold_Number - 1] ++;
3292 Non_ACGTN_all++;
3293 Size_Seq[Scaffold_Number - 1] ++;
3294 Size_includeN++;
3298 Nucleotide = fgetc ( fp );
3301 if ( Size_Seq[Scaffold_Number - 1] < cut_off_len )
3303 A_num_all = A_num_all - A_num[Scaffold_Number - 1];
3304 C_num_all = C_num_all - C_num[Scaffold_Number - 1];
3305 G_num_all = G_num_all - G_num[Scaffold_Number - 1];
3306 T_num_all = T_num_all - T_num[Scaffold_Number - 1];
3307 N_num_all = N_num_all - N_num[Scaffold_Number - 1];
3308 Non_ACGTN_all = Non_ACGTN_all - Non_ACGTN[Scaffold_Number - 1];
3309 Size_includeN = Size_includeN - Size_Seq[Scaffold_Number - 1];
3310 Singleton_Number = Singleton_Number - Singleton_Seq[Scaffold_Number - 1];
3311 Scaffold_Number = Scaffold_Number - 1;
3314 qsort ( Size_Seq, Scaffold_Number, sizeof ( Size_Seq[0] ), cmp_int );
3315 fprintf ( fo, "Size_includeN\t%lld\n", Size_includeN );
3316 fprintf ( fo, "Size_withoutN\t%lld\n", Size_includeN - N_num_all );
3317 fprintf ( fo, "Scaffold_Num\t%ld\n", Scaffold_Number );
3318 fprintf ( fo, "Mean_Size\t%lld\n", Size_includeN / Scaffold_Number );
3319 fprintf ( fo, "Median_Size\t%ld\n", Size_Seq[ ( Scaffold_Number + 1 ) / 2 - 1] );
3320 fprintf ( fo, "Longest_Seq\t%ld\n", Size_Seq[Scaffold_Number - 1] );
3321 fprintf ( fo, "Shortest_Seq\t%ld\n", Size_Seq[0] );
3322 fprintf ( fo, "Singleton_Num\t%ld\n", Singleton_Number );
3323 fprintf ( fo, "Average_length_of_break(N)_in_scaffold\t%lld\n", N_num_all / Scaffold_Number );
3324 fprintf ( fo, "\n" );
3326 if ( known_genome_size )
3328 fprintf ( fo, "Known_genome_size\t%ld\n", known_genome_size );
3329 fprintf ( fo, "Total_scaffold_length_as_percentage_of_known_genome_size\t%.2f%\n", 100 * ( 1.0 * Size_includeN / known_genome_size ) );
3331 else
3333 fprintf ( fo, "Known_genome_size\tNaN\n" );
3334 fprintf ( fo, "Total_scaffold_length_as_percentage_of_known_genome_size\tNaN\n" );
3337 fprintf ( fo, "\n" );
3339 for ( k = 0; k < Scaffold_Number; k++ )
3341 if ( Size_Seq[k] > 100 )
3343 n100++;
3346 if ( Size_Seq[k] > 500 )
3348 n500++;
3351 if ( Size_Seq[k] > 1000 )
3353 n1k++;
3356 if ( Size_Seq[k] > 10000 )
3358 n10k++;
3361 if ( Size_Seq[k] > 100000 )
3363 n100k++;
3366 if ( Size_Seq[k] > 1000000 )
3368 n1m++;
3372 fprintf ( fo, "scaffolds>100 \t%ld\t%.2f%\n", n100 , 100 * ( 1.0 * n100 / Scaffold_Number ) );
3373 fprintf ( fo, "scaffolds>500 \t%ld\t%.2f%\n", n500 , 100 * ( 1.0 * n500 / Scaffold_Number ) );
3374 fprintf ( fo, "scaffolds>1K \t%ld\t%.2f%\n", n1k , 100 * ( 1.0 * n1k / Scaffold_Number ) );
3375 fprintf ( fo, "scaffolds>10K \t%ld\t%.2f%\n", n10k , 100 * ( 1.0 * n10k / Scaffold_Number ) );
3376 fprintf ( fo, "scaffolds>100K\t%ld\t%.2f%\n", n100k, 100 * ( 1.0 * n100k / Scaffold_Number ) );
3377 fprintf ( fo, "scaffolds>1M \t%ld\t%.2f%\n", n1m , 100 * ( 1.0 * n1m / Scaffold_Number ) );
3378 fprintf ( fo, "\n" );
3379 fprintf ( fo, "Nucleotide_A\t%lld\t%.2f%\n", A_num_all, 100 * ( 1.0 * A_num_all / Size_includeN ) );
3380 fprintf ( fo, "Nucleotide_C\t%lld\t%.2f%\n", C_num_all, 100 * ( 1.0 * C_num_all / Size_includeN ) );
3381 fprintf ( fo, "Nucleotide_G\t%lld\t%.2f%\n", G_num_all, 100 * ( 1.0 * G_num_all / Size_includeN ) );
3382 fprintf ( fo, "Nucleotide_T\t%lld\t%.2f%\n", T_num_all, 100 * ( 1.0 * T_num_all / Size_includeN ) );
3383 fprintf ( fo, "GapContent_N\t%lld\t%.2f%\n", N_num_all, 100 * ( 1.0 * N_num_all / Size_includeN ) );
3384 fprintf ( fo, "Non_ACGTN\t%lld\t%.2f%\n", Non_ACGTN_all, 100 * ( 1.0 * Non_ACGTN_all / Size_includeN ) );
3385 fprintf ( fo, "GC_Content\t%.2f%\t\t(G+C)/(A+C+G+T)\n", 100 * ( 1.0 * ( G_num_all + C_num_all ) / ( A_num_all + C_num_all + G_num_all + T_num_all ) ) );
3386 fprintf ( fo, "\n" );
3388 for ( k = 0; k < 10; k++ )
3389 { flag[k] = 0; }
3391 for ( k = Scaffold_Number - 1; k >= 0; k-- )
3393 Sum = Sum + Size_Seq[k];
3395 if ( ( Sum >= Size_includeN * 0.1 ) && ( Sum < Size_includeN * 0.2 ) && ( flag[1] == 0 ) )
3397 fprintf ( fo, "N10\t%ld\t%ld\n", Size_Seq[k], Scaffold_Number - k );
3398 flag[1] = 1;
3400 else if ( ( Sum >= Size_includeN * 0.2 ) && ( Sum < Size_includeN * 0.3 ) && ( flag[2] == 0 ) )
3402 fprintf ( fo, "N20\t%ld\t%ld\n", Size_Seq[k], Scaffold_Number - k );
3403 flag[2] = 1;
3405 else if ( ( Sum >= Size_includeN * 0.3 ) && ( Sum < Size_includeN * 0.4 ) && ( flag[3] == 0 ) )
3407 fprintf ( fo, "N30\t%ld\t%ld\n", Size_Seq[k], Scaffold_Number - k );
3408 flag[3] = 1;
3410 else if ( ( Sum >= Size_includeN * 0.4 ) && ( Sum < Size_includeN * 0.5 ) && ( flag[4] == 0 ) )
3412 fprintf ( fo, "N40\t%ld\t%ld\n", Size_Seq[k], Scaffold_Number - k );
3413 flag[4] = 1;
3415 else if ( ( Sum >= Size_includeN * 0.5 ) && ( Sum < Size_includeN * 0.6 ) && ( flag[5] == 0 ) )
3417 fprintf ( fo, "N50\t%ld\t%ld\n", Size_Seq[k], Scaffold_Number - k );
3418 flag[5] = 1;
3419 N50 = Size_Seq[k];
3421 else if ( ( Sum >= Size_includeN * 0.6 ) && ( Sum < Size_includeN * 0.7 ) && ( flag[6] == 0 ) )
3423 fprintf ( fo, "N60\t%ld\t%ld\n", Size_Seq[k], Scaffold_Number - k );
3424 flag[6] = 1;
3426 else if ( ( Sum >= Size_includeN * 0.7 ) && ( Sum < Size_includeN * 0.8 ) && ( flag[7] == 0 ) )
3428 fprintf ( fo, "N70\t%ld\t%ld\n", Size_Seq[k], Scaffold_Number - k );
3429 flag[7] = 1;
3431 else if ( ( Sum >= Size_includeN * 0.8 ) && ( Sum < Size_includeN * 0.9 ) && ( flag[8] == 0 ) )
3433 fprintf ( fo, "N80\t%ld\t%ld\n", Size_Seq[k], Scaffold_Number - k );
3434 flag[8] = 1;
3436 else if ( ( Sum >= Size_includeN * 0.9 ) && ( flag[9] == 0 ) )
3438 fprintf ( fo, "N90\t%ld\t%ld\n", Size_Seq[k], Scaffold_Number - k );
3439 flag[9] = 1;
3442 if ( ( Sum >= known_genome_size * 0.5 ) && ( flag_known == 0 ) )
3444 N50_known = Size_Seq[k];
3445 Num_N50_known = Scaffold_Number - k;
3446 flag_known = 1;
3450 if ( flag[5] == 0 )
3452 Sum = 0;
3454 for ( k = Scaffold_Number - 1; k >= 0; k-- )
3456 Sum = Sum + Size_Seq[k];
3458 if ( Sum >= Size_includeN * 0.5 )
3460 fprintf ( fo, "N50\t%ld\t%ld\n", Size_Seq[k], Scaffold_Number - k );
3461 break;
3466 fprintf ( fo, "\n" );
3468 if ( known_genome_size )
3470 fprintf ( fo, "NG50\t%ld\t%ld\n", N50_known, Num_N50_known );
3471 fprintf ( fo, "N50_scaffold-NG50_scaffold_length_difference\t%ld\n", abs ( N50 - N50_known ) );
3473 else
3475 fprintf ( fo, "NG50\tNaN\tNaN\n" );
3476 fprintf ( fo, "N50_scaffold-NG50_scaffold_length_difference\tNaN\n" );
3479 fprintf ( fo, "\n" );
3480 free ( A_num );
3481 free ( C_num );
3482 free ( G_num );
3483 free ( T_num );
3484 free ( N_num );
3485 free ( Non_ACGTN );
3486 free ( Singleton_Seq );
3487 free ( Size_Seq );
3488 Scaffold_Number_Scaf = Scaffold_Number;
3489 Singleton_Number_Scaf = Singleton_Number;
3490 /*********************** Contig ******************************/
3491 fprintf ( fo, "<-- Information for assembly Contig '%s.contig'.(cut_off_length < 100bp) -->\n\n", graphfile );
3492 cut_off_len = 0;
3493 Scaffold_Number = 0;
3494 Singleton_Number = 0;
3495 Singleton_Seq = ( long * ) malloc ( sizeof ( long ) );
3496 A_num_all = 0;
3497 A_num = ( long * ) malloc ( sizeof ( long ) );
3498 C_num_all = 0;
3499 C_num = ( long * ) malloc ( sizeof ( long ) );
3500 G_num_all = 0;
3501 G_num = ( long * ) malloc ( sizeof ( long ) );
3502 T_num_all = 0;
3503 T_num = ( long * ) malloc ( sizeof ( long ) );
3504 N_num_all = 0;
3505 N_num = ( long * ) malloc ( sizeof ( long ) );
3506 Non_ACGTN_all = 0;
3507 Non_ACGTN = ( long * ) malloc ( sizeof ( long ) );
3508 Size_includeN = 0;
3509 Size_Seq = ( long * ) malloc ( sizeof ( long ) );
3510 Sum = 0;
3511 n100 = 0;
3512 n500 = 0;
3513 n1k = 0;
3514 n10k = 0;
3515 n100k = 0;
3516 n1m = 0;
3517 N50 = 0;
3518 N50_known = 0;
3519 Num_N50_known = 0;
3520 flag_known = 0;
3521 cut_off_len = len_cut;
3522 A_num[Scaffold_Number] = 0;
3523 C_num[Scaffold_Number] = 0;
3524 G_num[Scaffold_Number] = 0;
3525 T_num[Scaffold_Number] = 0;
3526 N_num[Scaffold_Number] = 0;
3527 Non_ACGTN[Scaffold_Number] = 0;
3528 Size_Seq[Scaffold_Number] = 0;
3529 Singleton_Seq[Scaffold_Number] = 0;
3530 Nucleotide = fgetc ( fp2 );
3532 while ( Nucleotide != (char) EOF ) /* Bug Fix */
3534 if ( Nucleotide == '>' )
3536 if ( ( Scaffold_Number > 0 ) && ( Size_Seq[Scaffold_Number - 1] < cut_off_len ) )
3538 A_num_all = A_num_all - A_num[Scaffold_Number - 1];
3539 C_num_all = C_num_all - C_num[Scaffold_Number - 1];
3540 G_num_all = G_num_all - G_num[Scaffold_Number - 1];
3541 T_num_all = T_num_all - T_num[Scaffold_Number - 1];
3542 N_num_all = N_num_all - N_num[Scaffold_Number - 1];
3543 Non_ACGTN_all = Non_ACGTN_all - Non_ACGTN[Scaffold_Number - 1];
3544 Size_includeN = Size_includeN - Size_Seq[Scaffold_Number - 1];
3545 Singleton_Number = Singleton_Number - Singleton_Seq[Scaffold_Number - 1];
3546 Scaffold_Number = Scaffold_Number - 1;
3548 else
3550 Size_Seq = ( long * ) realloc ( Size_Seq, ( Scaffold_Number + 2 ) * sizeof ( long ) );
3551 A_num = ( long * ) realloc ( A_num, ( Scaffold_Number + 2 ) * sizeof ( long ) );
3552 C_num = ( long * ) realloc ( C_num, ( Scaffold_Number + 2 ) * sizeof ( long ) );
3553 G_num = ( long * ) realloc ( G_num, ( Scaffold_Number + 2 ) * sizeof ( long ) );
3554 T_num = ( long * ) realloc ( T_num, ( Scaffold_Number + 2 ) * sizeof ( long ) );
3555 N_num = ( long * ) realloc ( N_num, ( Scaffold_Number + 2 ) * sizeof ( long ) );
3556 Non_ACGTN = ( long * ) realloc ( Non_ACGTN, ( Scaffold_Number + 2 ) * sizeof ( long ) );
3557 Singleton_Seq = ( long * ) realloc ( Singleton_Seq, ( Scaffold_Number + 2 ) * sizeof ( long ) );
3560 Scaffold_Number++;
3561 A_num[Scaffold_Number - 1] = 0;
3562 C_num[Scaffold_Number - 1] = 0;
3563 G_num[Scaffold_Number - 1] = 0;
3564 T_num[Scaffold_Number - 1] = 0;
3565 N_num[Scaffold_Number - 1] = 0;
3566 Non_ACGTN[Scaffold_Number - 1] = 0;
3567 Size_Seq[Scaffold_Number - 1] = 0;
3568 Singleton_Seq[Scaffold_Number - 1] = 0;
3569 Nucleotide = fgetc ( fp2 );
3571 if ( Nucleotide == 'C' )
3573 Singleton_Number++;
3574 Singleton_Seq[Scaffold_Number - 1] ++;
3577 fgets ( buf, 4000, fp2 );
3579 else if ( ( Nucleotide == 'N' ) || ( Nucleotide == 'n' ) )
3581 N_num[Scaffold_Number - 1] ++;
3582 N_num_all++;
3583 Size_Seq[Scaffold_Number - 1] ++;
3584 Size_includeN++;
3586 else if ( ( Nucleotide == 'A' ) || ( Nucleotide == 'a' ) )
3588 A_num[Scaffold_Number - 1] ++;
3589 A_num_all++;
3590 Size_Seq[Scaffold_Number - 1] ++;
3591 Size_includeN++;
3593 else if ( ( Nucleotide == 'C' ) || ( Nucleotide == 'c' ) )
3595 C_num[Scaffold_Number - 1] ++;
3596 C_num_all++;
3597 Size_Seq[Scaffold_Number - 1] ++;
3598 Size_includeN++;
3600 else if ( ( Nucleotide == 'G' ) || ( Nucleotide == 'g' ) )
3602 G_num[Scaffold_Number - 1] ++;
3603 G_num_all++;
3604 Size_Seq[Scaffold_Number - 1] ++;
3605 Size_includeN++;
3607 else if ( ( Nucleotide == 'T' ) || ( Nucleotide == 't' ) )
3609 T_num[Scaffold_Number - 1] ++;
3610 T_num_all++;
3611 Size_Seq[Scaffold_Number - 1] ++;
3612 Size_includeN++;
3614 else
3616 if ( ( Nucleotide != '\n' ) && ( Nucleotide != '\r' ) )
3618 Non_ACGTN[Scaffold_Number - 1] ++;
3619 Non_ACGTN_all++;
3620 Size_Seq[Scaffold_Number - 1] ++;
3621 Size_includeN++;
3625 Nucleotide = fgetc ( fp2 );
3628 if ( Size_Seq[Scaffold_Number - 1] < cut_off_len )
3630 A_num_all = A_num_all - A_num[Scaffold_Number - 1];
3631 C_num_all = C_num_all - C_num[Scaffold_Number - 1];
3632 G_num_all = G_num_all - G_num[Scaffold_Number - 1];
3633 T_num_all = T_num_all - T_num[Scaffold_Number - 1];
3634 N_num_all = N_num_all - N_num[Scaffold_Number - 1];
3635 Non_ACGTN_all = Non_ACGTN_all - Non_ACGTN[Scaffold_Number - 1];
3636 Size_includeN = Size_includeN - Size_Seq[Scaffold_Number - 1];
3637 Singleton_Number = Singleton_Number - Singleton_Seq[Scaffold_Number - 1];
3638 Scaffold_Number = Scaffold_Number - 1;
3641 qsort ( Size_Seq, Scaffold_Number, sizeof ( Size_Seq[0] ), cmp_int );
3642 fprintf ( fo, "Size_includeN\t%lld\n", Size_includeN );
3643 fprintf ( fo, "Size_withoutN\t%lld\n", Size_includeN - N_num_all );
3644 fprintf ( fo, "Contig_Num\t%ld\n", Scaffold_Number );
3645 fprintf ( fo, "Mean_Size\t%lld\n", Size_includeN / Scaffold_Number );
3646 fprintf ( fo, "Median_Size\t%ld\n", Size_Seq[ ( Scaffold_Number + 1 ) / 2 - 1] );
3647 fprintf ( fo, "Longest_Seq\t%ld\n", Size_Seq[Scaffold_Number - 1] );
3648 fprintf ( fo, "Shortest_Seq\t%ld\n", Size_Seq[0] );
3649 fprintf ( fo, "\n" );
3651 for ( k = 0; k < Scaffold_Number; k++ )
3653 if ( Size_Seq[k] > 100 )
3655 n100++;
3658 if ( Size_Seq[k] > 500 )
3660 n500++;
3663 if ( Size_Seq[k] > 1000 )
3665 n1k++;
3668 if ( Size_Seq[k] > 10000 )
3670 n10k++;
3673 if ( Size_Seq[k] > 100000 )
3675 n100k++;
3678 if ( Size_Seq[k] > 1000000 )
3680 n1m++;
3684 fprintf ( fo, "Contig>100 \t%ld\t%.2f%\n", n100 , 100 * ( 1.0 * n100 / Scaffold_Number ) );
3685 fprintf ( fo, "Contig>500 \t%ld\t%.2f%\n", n500 , 100 * ( 1.0 * n500 / Scaffold_Number ) );
3686 fprintf ( fo, "Contig>1K \t%ld\t%.2f%\n", n1k , 100 * ( 1.0 * n1k / Scaffold_Number ) );
3687 fprintf ( fo, "Contig>10K \t%ld\t%.2f%\n", n10k , 100 * ( 1.0 * n10k / Scaffold_Number ) );
3688 fprintf ( fo, "Contig>100K\t%ld\t%.2f%\n", n100k, 100 * ( 1.0 * n100k / Scaffold_Number ) );
3689 fprintf ( fo, "Contig>1M \t%ld\t%.2f%\n", n1m , 100 * ( 1.0 * n1m / Scaffold_Number ) );
3690 fprintf ( fo, "\n" );
3691 fprintf ( fo, "Nucleotide_A\t%lld\t%.2f%\n", A_num_all, 100 * ( 1.0 * A_num_all / Size_includeN ) );
3692 fprintf ( fo, "Nucleotide_C\t%lld\t%.2f%\n", C_num_all, 100 * ( 1.0 * C_num_all / Size_includeN ) );
3693 fprintf ( fo, "Nucleotide_G\t%lld\t%.2f%\n", G_num_all, 100 * ( 1.0 * G_num_all / Size_includeN ) );
3694 fprintf ( fo, "Nucleotide_T\t%lld\t%.2f%\n", T_num_all, 100 * ( 1.0 * T_num_all / Size_includeN ) );
3695 fprintf ( fo, "GapContent_N\t%lld\t%.2f%\n", N_num_all, 100 * ( 1.0 * N_num_all / Size_includeN ) );
3696 fprintf ( fo, "Non_ACGTN\t%lld\t%.2f%\n", Non_ACGTN_all, 100 * ( 1.0 * Non_ACGTN_all / Size_includeN ) );
3697 fprintf ( fo, "GC_Content\t%.2f%\t\t(G+C)/(A+C+G+T)\n", 100 * ( 1.0 * ( G_num_all + C_num_all ) / ( A_num_all + C_num_all + G_num_all + T_num_all ) ) );
3698 fprintf ( fo, "\n" );
3700 for ( k = 0; k < 10; k++ )
3701 { flag[k] = 0; }
3703 for ( k = Scaffold_Number - 1; k >= 0; k-- )
3705 Sum = Sum + Size_Seq[k];
3707 if ( ( Sum >= Size_includeN * 0.1 ) && ( Sum < Size_includeN * 0.2 ) && ( flag[1] == 0 ) )
3709 fprintf ( fo, "N10\t%ld\t%ld\n", Size_Seq[k], Scaffold_Number - k );
3710 flag[1] = 1;
3712 else if ( ( Sum >= Size_includeN * 0.2 ) && ( Sum < Size_includeN * 0.3 ) && ( flag[2] == 0 ) )
3714 fprintf ( fo, "N20\t%ld\t%ld\n", Size_Seq[k], Scaffold_Number - k );
3715 flag[2] = 1;
3717 else if ( ( Sum >= Size_includeN * 0.3 ) && ( Sum < Size_includeN * 0.4 ) && ( flag[3] == 0 ) )
3719 fprintf ( fo, "N30\t%ld\t%ld\n", Size_Seq[k], Scaffold_Number - k );
3720 flag[3] = 1;
3722 else if ( ( Sum >= Size_includeN * 0.4 ) && ( Sum < Size_includeN * 0.5 ) && ( flag[4] == 0 ) )
3724 fprintf ( fo, "N40\t%ld\t%ld\n", Size_Seq[k], Scaffold_Number - k );
3725 flag[4] = 1;
3727 else if ( ( Sum >= Size_includeN * 0.5 ) && ( Sum < Size_includeN * 0.6 ) && ( flag[5] == 0 ) )
3729 fprintf ( fo, "N50\t%ld\t%ld\n", Size_Seq[k], Scaffold_Number - k );
3730 flag[5] = 1;
3731 N50 = Size_Seq[k];
3733 else if ( ( Sum >= Size_includeN * 0.6 ) && ( Sum < Size_includeN * 0.7 ) && ( flag[6] == 0 ) )
3735 fprintf ( fo, "N60\t%ld\t%ld\n", Size_Seq[k], Scaffold_Number - k );
3736 flag[6] = 1;
3738 else if ( ( Sum >= Size_includeN * 0.7 ) && ( Sum < Size_includeN * 0.8 ) && ( flag[7] == 0 ) )
3740 fprintf ( fo, "N70\t%ld\t%ld\n", Size_Seq[k], Scaffold_Number - k );
3741 flag[7] = 1;
3743 else if ( ( Sum >= Size_includeN * 0.8 ) && ( Sum < Size_includeN * 0.9 ) && ( flag[8] == 0 ) )
3745 fprintf ( fo, "N80\t%ld\t%ld\n", Size_Seq[k], Scaffold_Number - k );
3746 flag[8] = 1;
3748 else if ( ( Sum >= Size_includeN * 0.9 ) && ( flag[9] == 0 ) )
3750 fprintf ( fo, "N90\t%ld\t%ld\n", Size_Seq[k], Scaffold_Number - k );
3751 flag[9] = 1;
3754 if ( ( Sum >= known_genome_size * 0.5 ) && ( flag_known == 0 ) )
3756 N50_known = Size_Seq[k];
3757 Num_N50_known = Scaffold_Number - k;
3758 flag_known = 1;
3762 if ( flag[5] == 0 )
3764 Sum = 0;
3766 for ( k = Scaffold_Number - 1; k >= 0; k-- )
3768 Sum = Sum + Size_Seq[k];
3770 if ( Sum >= Size_includeN * 0.5 )
3772 fprintf ( fo, "N50\t%ld\t%ld\n", Size_Seq[k], Scaffold_Number - k );
3773 break;
3778 fprintf ( fo, "\n" );
3780 if ( known_genome_size )
3782 fprintf ( fo, "NG50\t%ld\t%ld\n", N50_known, Num_N50_known );
3783 fprintf ( fo, "N50_contig-NG50_contig_length_difference\t%ld\n", abs ( N50 - N50_known ) );
3785 else
3787 fprintf ( fo, "NG50\tNaN\tNaN\n" );
3788 fprintf ( fo, "N50_contig-NG50_contig_length_difference\tNaN\n" );
3791 fprintf ( fo, "\n" );
3792 free ( A_num );
3793 free ( C_num );
3794 free ( G_num );
3795 free ( T_num );
3796 free ( N_num );
3797 free ( Non_ACGTN );
3798 free ( Singleton_Seq );
3799 free ( Size_Seq );
3800 Scaffold_Number_Contig = Scaffold_Number;
3801 fprintf ( fo, "Number_of_contigs_in_scaffolds\t%ld\n", Scaffold_Number_Contig - Singleton_Number_Scaf );
3802 fprintf ( fo, "Number_of_contigs_not_in_scaffolds(Singleton)\t%ld\n", Singleton_Number_Scaf );
3803 fprintf ( fo, "Average_number_of_contigs_per_scaffold\t%.1f\n", 1.0 * ( Scaffold_Number_Contig - Singleton_Number_Scaf ) / ( Scaffold_Number_Scaf - Singleton_Number_Scaf ) );
3804 fprintf ( fo, "\n" );
3805 fclose ( fp );
3806 fclose ( fp2 );
3807 fclose ( fo );
3810 /*************************************************
3811 Function:
3812 getCnt
3813 Description:
3814 Gets connection between two contigs.
3815 Input:
3816 1. from_c: left contig
3817 2. to_c: right contig
3818 Output:
3819 None.
3820 Return:
3821 Connection between two contigs.
3822 *************************************************/
3823 CONNECT * getCnt ( unsigned int from_c, unsigned int to_c )
3825 CONNECT * pcnt;
3826 pcnt = contig_array[from_c].downwardConnect;
3828 while ( pcnt )
3830 if ( pcnt->contigID == to_c )
3831 { return pcnt; }
3833 pcnt = pcnt->next;
3836 return pcnt;
3839 /*************************************************
3840 Function:
3841 allConnect
3842 Description:
3843 Counts contig's downstrem connection number if this contig does
3844 NOT have neigher upstream nor downstream connection in scaffold.
3845 Input:
3846 1. ctg: contig
3847 2. preCNT: contig's upstream connection
3848 Output:
3849 None.
3850 Return:
3851 1 if contig had both upstream and downstream connections in
3852 scaffold.
3853 Contig's downstream connection number otherwise.
3854 *************************************************/
3855 int allConnect ( unsigned int ctg, CONNECT * preCNT )
3857 if ( preCNT && preCNT->nextInScaf )
3858 { return 1; }
3860 CONNECT * cn_temp;
3861 int count = 0;
3863 if ( !contig_array[ctg].downwardConnect )
3864 { return count; }
3866 cn_temp = contig_array[ctg].downwardConnect;
3868 while ( cn_temp )
3870 count++;
3871 cn_temp = cn_temp->next;
3874 return count;
3877 /*************************************************
3878 Function:
3879 get_ctg_score2
3880 Description:
3881 Calculates a score to indicate the strengh of connections
3882 between a contig and other contigs in scaffold.
3883 Input:
3884 1. pos: contig index
3885 2. tempCounter: maximum contig number in scaffold
3886 Output:
3887 None.
3888 Return:
3889 Calculated score.
3890 *************************************************/
3891 int get_ctg_score2 ( int pos, int tempCounter )
3893 int id, innum, outnum, in = 0, out = 0, i, currId;
3894 CONNECT * dh_cnt;
3895 id = * ( unsigned int * ) darrayGet ( tempArray, pos );
3896 outnum = allConnect ( id, NULL );
3897 innum = allConnect ( getTwinCtg ( id ), NULL );
3898 int outlen = 0, inlen = 0, outcut, incut;
3900 if ( contig_array[id].downwardConnect )
3901 { outlen = contig_array[id].downwardConnect->gapLen; }
3903 if ( contig_array[getTwinCtg ( id )].downwardConnect )
3904 { inlen = contig_array[getTwinCtg ( id )].downwardConnect->gapLen; }
3906 outcut = outlen / overlaplen;
3907 incut = inlen / overlaplen;
3909 if ( outcut > outnum * 10 || outcut < outnum * 2 )
3910 { outcut = outnum * 10; }
3912 if ( incut > innum * 10 || incut < innum * 2 )
3913 { incut = innum * 10; }
3915 int start = pos - incut > 0 ? pos - incut : 0;
3916 int end = pos + outcut < tempCounter ? pos + outcut : tempCounter;
3918 for ( i = start; i < end; i++ )
3920 if ( i == pos )
3921 { continue; }
3923 currId = * ( unsigned int * ) darrayGet ( tempArray, i );
3925 if ( i < pos )
3927 dh_cnt = getCnt ( currId, id );
3929 if ( dh_cnt && dh_cnt->weight > 0 )
3930 { in++; }
3933 if ( i > pos )
3935 dh_cnt = getCnt ( id, currId );
3937 if ( dh_cnt && dh_cnt->weight > 0 )
3938 { out++; }
3942 if ( innum > pos )
3943 { innum = pos; }
3945 if ( outnum > tempCounter - pos - 1 )
3946 { outnum = tempCounter - pos - 1; }
3948 if ( innum > 0 && outnum > 0 )
3950 if ( pos < tempCounter - 1 && pos > 0 )
3952 if ( outnum < 3 && out == 0 )
3954 if ( in > 0 )
3955 { return ( int ) ( ( ( double ) ( in ) / ( double ) ( innum ) ) * 100 ); }
3958 if ( innum < 3 && in == 0 )
3960 if ( out > 0 )
3961 { return ( int ) ( ( ( double ) ( out ) / ( double ) ( outnum ) ) * 100 ); }
3964 if ( in == 0 || out == 0 )
3965 { return 0; }
3967 return ( int ) ( ( ( double ) ( in * out ) / ( double ) ( innum * outnum ) ) * 100 );
3970 if ( pos == 0 )
3971 { return ( int ) ( ( ( double ) ( out ) / ( double ) ( outnum ) ) * 100 ); }
3973 if ( pos == tempCounter - 1 )
3974 { return ( int ) ( ( ( double ) ( in ) / ( double ) ( innum ) ) * 100 ); }
3976 else if ( innum > 0 )
3978 if ( pos == tempCounter - 1 && in == 1 && innum > 5 )
3979 { return 0; }
3981 return ( int ) ( ( ( double ) ( in ) / ( double ) ( innum ) ) * 100 );
3983 else if ( outnum > 0 )
3985 if ( pos == 0 && out == 1 && outnum > 5 )
3986 { return 0; }
3988 return ( int ) ( ( ( double ) ( out ) / ( double ) ( outnum ) ) * 100 );
3991 return 0;
3994 /*************************************************
3995 Function:
3996 get_ctg_score2
3997 Description:
3998 Calculates a score to indicate the strengh of connections
3999 between a contig and other contigs in scaffold.
4000 Input:
4001 1. pos: contig index
4002 2. tempCounter: maximum contig number in scaffold
4003 Output:
4004 None.
4005 Return:
4006 Calculated score.
4007 *************************************************/
4008 int get_ctg_score ( int pos, int num3, int num5, int flag )
4010 int i = 0, in = 0, out = 0, innum = 0, outnum = 0, threeid;
4011 CONNECT * dh_cnt;
4012 int id;
4014 if ( flag == 0 )
4016 id = * ( unsigned int * ) darrayGet ( scaf3, pos );
4017 int end = pos > 100 ? pos - 100 : 0;
4018 int start = pos + 100 < num3 ? pos + 100 : num3;
4019 in = 0, out = 0;
4021 for ( i = start; i >= end; i-- )
4023 threeid = * ( unsigned int * ) darrayGet ( scaf3, i );
4025 if ( threeid == id )
4027 pos = i;
4028 continue;
4031 dh_cnt = getCnt ( id, threeid );
4033 if ( dh_cnt && dh_cnt->weight > 0 )
4035 out++;
4038 dh_cnt = getCnt ( threeid, id );
4040 if ( dh_cnt && dh_cnt->weight > 0 )
4041 { in++; }
4044 outnum = allConnect ( id, NULL );
4045 innum = allConnect ( getTwinCtg ( id ), NULL );
4046 int num5_check = 0;
4048 if ( pos - end < outnum )
4050 for ( i = 0; i < num5; i++ )
4052 num5_check++;
4053 threeid = * ( unsigned int * ) darrayGet ( scaf5, i );
4054 dh_cnt = getCnt ( id, threeid );
4056 if ( dh_cnt && dh_cnt->weight > 0 )
4057 { out++; }
4059 if ( num5_check == outnum )
4060 { break; }
4064 if ( pos - end + num5_check < outnum )
4065 { outnum = pos - end + num5_check; }
4067 if ( start - pos < innum )
4068 { innum = start - pos; }
4070 if ( innum > 0 && outnum > 0 )
4072 if ( pos != num3 && pos > 0 )
4074 if ( outnum < 5 && out == 0 )
4076 if ( innum > 0 )
4077 { return ( int ) ( ( ( double ) ( in ) / ( double ) ( innum ) ) * 100 ); }
4080 if ( innum < 5 && in == 0 )
4082 if ( outnum > 0 )
4083 { return ( int ) ( ( ( double ) ( out ) / ( double ) ( outnum ) ) * 100 ); }
4086 if ( in == 0 || out == 0 )
4087 { return 0; }
4089 return ( int ) ( ( ( double ) ( in * out ) / ( double ) ( innum * outnum ) ) * 100 );
4092 if ( pos == num3 )
4093 { return ( int ) ( ( ( double ) ( out ) / ( double ) ( outnum ) ) * 100 ); }
4095 if ( pos == 0 )
4096 { return ( int ) ( ( ( double ) ( in ) / ( double ) ( innum ) ) * 100 ); }
4098 else if ( innum > 0 )
4100 return ( int ) ( ( ( double ) ( in ) / ( double ) ( innum ) ) * 100 );
4102 else if ( outnum > 0 )
4104 if ( pos == num3 && out == 1 && outnum > 5 )
4105 { return 0; }
4107 return ( int ) ( ( ( double ) ( out ) / ( double ) ( outnum ) ) * 100 );
4110 return 0;
4112 else
4114 id = * ( unsigned int * ) darrayGet ( scaf5, pos );
4115 int start = pos > 100 ? pos - 100 : 0;
4116 int end = pos + 100 < num5 ? pos + 100 : num5;
4117 in = 0, out = 0;
4119 for ( i = start; i < end; i++ )
4121 threeid = * ( unsigned int * ) darrayGet ( scaf5, i );
4123 if ( threeid == id )
4125 pos = i;
4126 continue;
4129 dh_cnt = getCnt ( id, threeid );
4131 if ( dh_cnt && dh_cnt->weight > 0 )
4132 { out++; }
4134 dh_cnt = getCnt ( threeid, id );
4136 if ( dh_cnt && dh_cnt->weight > 0 )
4137 { in++; }
4140 outnum = allConnect ( id, NULL );
4141 innum = allConnect ( getTwinCtg ( id ), NULL );
4142 int num3_check = 0;
4144 if ( pos - start < innum )
4146 for ( i = 0; i < num3; i++ )
4148 num3_check++;
4149 threeid = * ( unsigned int * ) darrayGet ( scaf3, i );
4150 dh_cnt = getCnt ( threeid, id );
4152 if ( dh_cnt && dh_cnt->weight > 0 )
4153 { in++; }
4155 if ( num3_check == innum )
4156 { break; }
4160 if ( pos - start + num3_check < innum )
4161 { innum = pos - start + num3_check; }
4163 if ( end - pos - 1 < outnum )
4164 { outnum = end - pos - 1; }
4166 if ( innum > 0 && outnum > 0 )
4168 if ( pos != num5 - 1 && pos > 0 )
4170 if ( outnum < 5 && out == 0 && innum > 0 )
4171 { return ( int ) ( ( ( double ) ( in ) / ( double ) ( innum ) ) * 100 ); }
4173 if ( innum < 5 && in == 0 && outnum > 0 )
4174 { return ( int ) ( ( ( double ) ( out ) / ( double ) ( outnum ) ) * 100 ); }
4176 if ( in == 0 || out == 0 )
4177 { return 0; }
4179 return ( int ) ( ( ( double ) ( in * out ) / ( double ) ( innum * outnum ) ) * 100 );
4182 if ( pos == 0 )
4183 { return ( int ) ( ( ( double ) ( out ) / ( double ) ( outnum ) ) * 100 ); }
4185 if ( pos == num5 - 1 )
4186 { return ( int ) ( ( ( double ) ( in ) / ( double ) ( innum ) ) * 100 ); }
4188 else if ( innum > 0 )
4190 if ( pos == num5 - 1 && in == 1 && innum > 5 )
4191 { return 0; }
4193 return ( int ) ( ( ( double ) ( in ) / ( double ) ( innum ) ) * 100 );
4195 else if ( outnum > 0 )
4196 { return ( int ) ( ( ( double ) ( out ) / ( double ) ( outnum ) ) * 100 ); }
4198 return 0;
4201 return 0;
4204 /*************************************************
4205 Function:
4206 scaffolding
4207 Description:
4208 Masks contigs having low connection score in scaffold. Fills gaps
4209 by using ARC information. Outputs scaffold structure and filled
4210 gaps' information.
4211 Input:
4212 1. len_cut: length cutoff
4213 2. outfile: prefix of graph
4214 Output:
4215 None.
4216 Return:
4217 None.
4218 *************************************************/
4219 void scaffolding ( unsigned int len_cut, char * outfile )
4221 unsigned int prev_ctg, ctg, bal_ctg, *length_array, count = 0, num_lctg = 0, *score_array;
4222 unsigned int i, max_steps = 5;
4223 int num5, num3, j, len, flag, num_route, gap_c = 0;
4224 int tempCounter;
4225 short gap = 0;
4226 long long sum = 0, N50, N90;
4227 FILE * fp, *fo = NULL;
4228 char name[256];
4229 CONNECT * cnt, *prevCNT, *nextCnt, *dh_cnt;
4230 boolean excep, weak;
4231 weakCounter = 0;
4232 so_far = ( unsigned int * ) ckalloc ( max_n_routes * sizeof ( unsigned int ) );
4233 found_routes = ( unsigned int ** ) ckalloc ( max_n_routes * sizeof ( unsigned int * ) );
4235 for ( j = 0; j < max_n_routes; j++ )
4236 { found_routes[j] = ( unsigned int * ) ckalloc ( max_steps * sizeof ( unsigned int ) ); }
4238 length_array = ( unsigned int * ) ckalloc ( ( num_ctg + 1 ) * sizeof ( unsigned int ) );
4240 //use length_array to change info in index_array
4241 for ( i = 1; i <= num_ctg; i++ )
4242 { length_array[i] = 0; }
4244 for ( i = 1; i <= num_ctg; i++ )
4246 if ( index_array[i] > 0 )
4247 { length_array[index_array[i]] = i; }
4250 for ( i = 1; i <= num_ctg; i++ )
4251 { index_array[i] = length_array[i]; }
4253 orig2new = 0;
4254 sprintf ( name, "%s.scaf", outfile );
4255 fp = ckopen ( name, "w" );
4256 sprintf ( name, "%s.scaf_gap", outfile );
4257 fo = ckopen ( name, "w" );
4258 scaf3 = ( DARRAY * ) createDarray ( 1000, sizeof ( unsigned int ) );
4259 scaf5 = ( DARRAY * ) createDarray ( 1000, sizeof ( unsigned int ) );
4260 gap3 = ( DARRAY * ) createDarray ( 1000, sizeof ( int ) );
4261 gap5 = ( DARRAY * ) createDarray ( 1000, sizeof ( int ) );
4262 tempArray = ( DARRAY * ) createDarray ( 1000, sizeof ( unsigned int ) );
4264 for ( i = 1; i <= num_ctg; i++ )
4265 { contig_array[i].flag = 0; }
4267 for ( i = 1; i <= num_ctg; i++ )
4269 if ( contig_array[i].length + ( unsigned int ) overlaplen >= len_cut )
4270 { num_lctg++; }
4271 else
4272 { continue; }
4274 if ( contig_array[i].flag || contig_array[i].mask || !contig_array[i].downwardConnect || !validConnect ( i, NULL ) )
4275 { continue; }
4277 num5 = num3 = 0;
4278 ctg = i;
4279 * ( unsigned int * ) darrayPut ( scaf5, num5++ ) = i;
4280 contig_array[i].flag = 1;
4281 bal_ctg = getTwinCtg ( ctg );
4282 contig_array[bal_ctg].flag = 1;
4283 len = contig_array[i].length;
4284 prevCNT = NULL;
4285 cnt = getNextContig ( ctg, prevCNT, &excep );
4287 while ( cnt )
4289 nextCnt = getNextContig ( cnt->contigID, cnt, &excep );
4291 if ( excep && prevCNT )
4292 { fprintf ( stderr, "scaffolding: exception --- prev cnt from %u\n", prevCNT->contigID ); }
4294 if ( nextCnt && nextCnt->used )
4295 { break; }
4297 setConnectUsed ( ctg, cnt->contigID, 1 );
4298 * ( int * ) darrayPut ( gap5, num5 - 1 ) = cnt->gapLen;
4299 ctg = cnt->contigID;
4300 * ( unsigned int * ) darrayPut ( scaf5, num5++ ) = ctg;
4301 len += cnt->gapLen + contig_array[ctg].length;
4302 bal_ctg = getTwinCtg ( ctg );
4303 contig_array[ctg].flag = 1;
4304 contig_array[bal_ctg].flag = 1;
4305 prevCNT = cnt;
4306 cnt = nextCnt;
4309 ctg = getTwinCtg ( i );
4311 if ( num5 >= 2 )
4312 { prevCNT = checkConnect ( getTwinCtg ( * ( unsigned int * ) darrayGet ( scaf5, 1 ) ), ctg ); }
4313 else
4314 { prevCNT = NULL; }
4316 cnt = getNextContig ( ctg, prevCNT, &excep );
4318 while ( cnt )
4320 nextCnt = getNextContig ( cnt->contigID, cnt, &excep );
4322 if ( excep && prevCNT )
4323 { fprintf ( stderr, "scaffolding: exception -- prev cnt from %u\n", prevCNT->contigID ); }
4325 if ( nextCnt && nextCnt->used )
4326 { break; }
4328 setConnectUsed ( ctg, cnt->contigID, 1 );
4329 ctg = cnt->contigID;
4330 len += cnt->gapLen + contig_array[ctg].length;
4331 bal_ctg = getTwinCtg ( ctg );
4332 contig_array[ctg].flag = 1;
4333 contig_array[bal_ctg].flag = 1;
4334 * ( int * ) darrayPut ( gap3, num3 ) = cnt->gapLen;
4335 * ( unsigned int * ) darrayPut ( scaf3, num3++ ) = bal_ctg;
4336 prevCNT = cnt;
4337 cnt = nextCnt;
4340 if ( num5 + num3 == 1 )
4342 contig_array[i].flag = 0;
4343 continue;
4346 len += overlaplen;
4347 sum += len;
4348 length_array[count++] = len;
4350 if ( num5 + num3 < 1 )
4352 fprintf ( stderr, "no scaffold created for contig %d\n", i );
4353 continue;
4356 tempCounter = 0;
4358 for ( j = num3 - 1; j >= 0; j-- )
4359 { * ( unsigned int * ) darrayPut ( tempArray, tempCounter++ ) = * ( unsigned int * ) darrayGet ( scaf3, j ); }
4361 for ( j = 0; j < num5; j++ )
4362 { * ( unsigned int * ) darrayPut ( tempArray, tempCounter++ ) = * ( unsigned int * ) darrayGet ( scaf5, j ); }
4364 score_array = ( unsigned int * ) ckalloc ( tempCounter * sizeof ( unsigned int ) );
4365 int now_cnt_weight, curr_ctg_score, pre_score = -1, prev_id = 0, mask_num = 0, score_count = 0, prev_p = 0;
4367 for ( j = 0; j < tempCounter; j++ )
4369 int currId = * ( unsigned int * ) darrayGet ( tempArray, j );
4370 dh_cnt = getCntBetween ( prev_id, currId );
4372 if ( dh_cnt )
4373 { now_cnt_weight = dh_cnt ->weight; }
4374 else
4375 { now_cnt_weight = 0; }
4377 curr_ctg_score = get_ctg_score2 ( j, tempCounter );
4379 if ( prev_id == 0 )
4381 pre_score = curr_ctg_score;
4382 prev_id = currId;
4383 prev_p = j;
4384 continue;
4387 if ( score_mask )
4389 if ( now_cnt_weight == 0 && j > 0 && j < tempCounter - 1 )
4391 if ( pre_score == 0 )
4393 * ( unsigned int * ) darrayPut ( tempArray, prev_p ) = 0;
4394 contig_array[prev_id].flag = 0;
4395 contig_array[getTwinCtg ( prev_id )].flag = 0;
4396 mask_num++;
4398 else if ( curr_ctg_score == 0 )
4400 * ( unsigned int * ) darrayPut ( tempArray, j ) = 0;
4401 contig_array[currId].flag = 0;
4402 contig_array[getTwinCtg ( currId )].flag = 0;
4403 mask_num++;
4405 if ( j < tempCounter - 1 )
4406 { continue; }
4410 if ( abs ( prev_id - currId ) <= 2 && now_cnt_weight == 0
4411 && * ( unsigned int * ) darrayGet ( tempArray, prev_p ) != 0 && * ( unsigned int * ) darrayGet ( tempArray, j ) != 0 )
4413 mask_num++;
4415 if ( contig_array[prev_id].cvg < contig_array[currId].cvg )
4417 * ( unsigned int * ) darrayPut ( tempArray, prev_p ) = 0;
4418 contig_array[prev_id].flag = 0;
4419 contig_array[getTwinCtg ( prev_id )].flag = 0;
4421 else
4423 * ( unsigned int * ) darrayPut ( tempArray, j ) = 0;
4424 contig_array[currId].flag = 0;
4425 contig_array[getTwinCtg ( currId )].flag = 0;
4427 if ( j < tempCounter - 1 )
4428 { continue; }
4433 if ( * ( unsigned int * ) darrayGet ( tempArray, prev_p ) != 0 )
4434 { score_array[score_count++] = pre_score; }
4436 pre_score = curr_ctg_score;
4437 prev_id = currId;
4438 prev_p = j;
4441 if ( * ( unsigned int * ) darrayGet ( tempArray, prev_p ) != 0 )
4442 { score_array[score_count++] = pre_score; }
4444 if ( score_mask == 1 && ( ( num3 + num5 > 5 && score_count < 2 ) || score_count == 1 ) )
4446 free ( ( void * ) score_array );
4447 --count;
4448 sum -= len;
4450 for ( j = 0; j < num3; j++ )
4452 ctg = * ( unsigned int * ) darrayGet ( scaf3, j );
4453 contig_array[ctg].flag = 0;
4454 contig_array[getTwinCtg ( ctg )].flag = 0;
4457 for ( j = 0; j < num5; j++ )
4459 ctg = * ( unsigned int * ) darrayGet ( scaf5, j );
4460 contig_array[ctg].flag = 0;
4461 contig_array[getTwinCtg ( ctg )].flag = 0;
4464 continue;
4467 fprintf ( fp, ">scaffold%d %d %d %d\n", count, score_count, len, num3 + num5, mask_num );
4468 fprintf ( fo, ">scaffold%d %d %d %d\n", count, score_count, len, num3 + num5, mask_num );
4469 len = prev_ctg = 0;
4470 tempCounter = 0, score_count = 0;
4472 for ( j = num3 - 1; j >= 0; j-- )
4474 int now_cnt_weigth = 0;
4475 int nextid, start = 0;
4476 int currId = * ( unsigned int * ) darrayGet ( scaf3, j );
4477 int tmpid = * ( unsigned int * ) darrayGet ( tempArray, tempCounter++ );
4479 if ( tmpid == 0 )
4481 if ( j == num3 - 1 )
4483 len = 0;
4484 continue;
4487 len += contig_array[* ( unsigned int * ) darrayGet ( scaf3, j )].length + * ( int * ) darrayGet ( gap3, j );
4488 int tmpgap = contig_array[* ( unsigned int * ) darrayGet ( scaf3, j )].length + * ( int * ) darrayGet ( gap3, j );
4489 gap += tmpgap > 0 ? tmpgap : 0;
4490 continue;
4493 if ( j > 0 )
4495 nextid = * ( unsigned int * ) darrayGet ( tempArray, tempCounter + start );
4497 while ( nextid == 0 && tempCounter + start + 1 < num3 + num5 )
4499 start++;
4500 nextid = * ( unsigned int * ) darrayGet ( tempArray, tempCounter + start );
4503 else
4505 nextid = i;
4508 CONNECT * dh_cnt = getCntBetween ( currId, nextid );
4510 if ( dh_cnt )
4511 { now_cnt_weigth = dh_cnt->weight; }
4512 else
4513 { now_cnt_weigth = 0; }
4515 curr_ctg_score = score_array[score_count++];
4517 if ( score_mask == 1 && curr_ctg_score == 0 && ( num3 + num5 > 2 )
4518 && ( ( j == num3 - 1 && contig_array[nextid].length < 200 && num3 + num5 > 5 ) || ( now_cnt_weigth == 0 && j > 0 ) ) )
4520 if ( j == num3 - 1 )
4522 len = 0;
4523 continue;
4526 len += contig_array[* ( unsigned int * ) darrayGet ( scaf3, j )].length + * ( int * ) darrayGet ( gap3, j );
4527 int tmpgap = contig_array[* ( unsigned int * ) darrayGet ( scaf3, j )].length + * ( int * ) darrayGet ( gap3, j );
4528 gap += tmpgap > 0 ? tmpgap : 0;
4529 continue;
4532 if ( !isLargerThanTwin ( * ( unsigned int * ) darrayGet ( scaf3, j ) ) )
4534 fprintf ( fp, "%-10d %-10d + %d %d %d"
4535 , index_array[* ( unsigned int * ) darrayGet ( scaf3, j )], len,
4536 contig_array[* ( unsigned int * ) darrayGet ( scaf3, j )].length + overlaplen,
4537 now_cnt_weigth, curr_ctg_score );
4538 weak = printCnts ( fp, * ( unsigned int * ) darrayGet ( scaf3, j ) );
4540 else
4542 fprintf ( fp, "%-10d %-10d - %d %d %d"
4543 , index_array[getTwinCtg ( * ( unsigned int * ) darrayGet ( scaf3, j ) )], len
4544 , contig_array[* ( unsigned int * ) darrayGet ( scaf3, j )].length + overlaplen,
4545 now_cnt_weigth, curr_ctg_score );
4546 weak = printCnts ( fp, * ( unsigned int * ) darrayGet ( scaf3, j ) );
4549 if ( prev_ctg )
4551 num_route = num_trace = 0;
4552 traceAlongArc ( * ( unsigned int * ) darrayGet ( scaf3, j ), prev_ctg, max_steps
4553 , gap - ins_size_var, gap + ins_size_var, 0, 0, &num_route );
4555 if ( num_route == 1 )
4557 output1gap ( fo, max_steps );
4558 gap_c++;
4562 fprintf ( fo, "%-10d %-10d\n", * ( unsigned int * ) darrayGet ( scaf3, j ), len );
4563 len += contig_array[* ( unsigned int * ) darrayGet ( scaf3, j )].length + * ( int * ) darrayGet ( gap3, j );
4564 prev_ctg = * ( unsigned int * ) darrayGet ( scaf3, j );
4565 gap = * ( int * ) darrayGet ( gap3, j ) > 0 ? * ( int * ) darrayGet ( gap3, j ) : 0;
4568 for ( j = 0; j < num5; j++ )
4570 int now_cnt_weigth = 0;
4571 int currId, nextid, start = 0;
4572 currId = * ( unsigned int * ) darrayGet ( scaf5, j );
4573 int tmpid = * ( unsigned int * ) darrayGet ( tempArray, tempCounter++ );
4575 if ( tmpid == 0 )
4577 if ( j == num5 - 1 )
4578 { continue; }
4580 len += contig_array[* ( unsigned int * ) darrayGet ( scaf5, j )].length + * ( int * ) darrayGet ( gap5, j );
4581 int tmpgap = contig_array[* ( unsigned int * ) darrayGet ( scaf5, j )].length + * ( int * ) darrayGet ( gap5, j );
4582 gap += tmpgap > 0 ? tmpgap : 0;
4583 continue;
4586 if ( j < num5 - 1 )
4588 nextid = * ( unsigned int * ) darrayGet ( tempArray, tempCounter + start );
4590 while ( nextid == 0 && tempCounter + start + 1 < num3 + num5 )
4592 start ++;
4593 nextid = * ( unsigned int * ) darrayGet ( tempArray, tempCounter + start );
4596 CONNECT * dh_cnt = getCntBetween ( currId, nextid );
4598 if ( dh_cnt )
4599 { now_cnt_weigth = dh_cnt->weight; }
4600 else
4601 { now_cnt_weigth = 0; }
4604 curr_ctg_score = score_array [score_count++];
4606 if ( score_mask == 1 && curr_ctg_score == 0 && ( num3 + num5 > 2 )
4607 && ( ( j == num5 - 1 && contig_array[* ( unsigned int * ) darrayGet ( scaf5, j - 1 )].length < 200 && num3 + num5 > 5 )
4608 || ( j != num5 - 1 && now_cnt_weigth == 0 ) ) )
4610 if ( j == num5 - 1 )
4612 continue;
4615 len += contig_array[* ( unsigned int * ) darrayGet ( scaf5, j )].length + * ( int * ) darrayGet ( gap5, j );
4616 int tmpgap = contig_array[* ( unsigned int * ) darrayGet ( scaf5, j )].length + * ( int * ) darrayGet ( gap5, j );
4617 gap += tmpgap > 0 ? tmpgap : 0;
4618 continue;
4621 if ( !isLargerThanTwin ( * ( unsigned int * ) darrayGet ( scaf5, j ) ) )
4623 fprintf ( fp, "%-10d %-10d + %d %d %d"
4624 , index_array[* ( unsigned int * ) darrayGet ( scaf5, j )], len
4625 , contig_array[* ( unsigned int * ) darrayGet ( scaf5, j )].length + overlaplen,
4626 now_cnt_weigth, curr_ctg_score );
4627 weak = printCnts ( fp, * ( unsigned int * ) darrayGet ( scaf5, j ) );
4629 else
4631 fprintf ( fp, "%-10d %-10d - %d %d %d"
4632 , index_array[getTwinCtg ( * ( unsigned int * ) darrayGet ( scaf5, j ) )], len
4633 , contig_array[* ( unsigned int * ) darrayGet ( scaf5, j )].length + overlaplen,
4634 now_cnt_weigth, curr_ctg_score );
4635 weak = printCnts ( fp, * ( unsigned int * ) darrayGet ( scaf5, j ) );
4638 if ( prev_ctg )
4640 num_route = num_trace = 0;
4641 traceAlongArc ( * ( unsigned int * ) darrayGet ( scaf5, j ), prev_ctg, max_steps
4642 , gap - ins_size_var, gap + ins_size_var, 0, 0, &num_route );
4644 if ( num_route == 1 )
4646 output1gap ( fo, max_steps );
4647 gap_c++;
4651 fprintf ( fo, "%-10d %-10d\n", * ( unsigned int * ) darrayGet ( scaf5, j ), len );
4653 if ( j < num5 - 1 )
4655 len += contig_array[* ( unsigned int * ) darrayGet ( scaf5, j )].length +
4656 * ( int * ) darrayGet ( gap5, j );
4657 prev_ctg = * ( unsigned int * ) darrayGet ( scaf5, j );
4658 gap = * ( int * ) darrayGet ( gap5, j ) > 0 ? * ( int * ) darrayGet ( gap5, j ) : 0;
4662 free ( ( void * ) score_array );
4665 freeDarray ( scaf3 );
4666 freeDarray ( scaf5 );
4667 freeDarray ( gap3 );
4668 freeDarray ( gap5 );
4669 freeDarray ( tempArray );
4670 fclose ( fp );
4671 fclose ( fo );
4672 fprintf ( stderr, "\nThe final rank\n" );
4674 if ( count == 0 )
4676 fprintf ( stderr, "\n\nNo scaffold was constructed.\n\n" );
4677 free ( ( void * ) length_array );
4679 for ( j = 0; j < max_n_routes; j++ )
4680 { free ( ( void * ) found_routes[j] ); }
4682 free ( ( void * ) found_routes );
4683 free ( ( void * ) so_far );
4684 return;
4687 fprintf ( stderr, "\n*******************************\n" );
4688 fprintf ( stderr, " Scaffold number %d\n", count );
4689 fprintf ( stderr, " In-scaffold contig number %u\n", num_lctg / 2 );
4690 fprintf ( stderr, " Total scaffold length %lld\n", sum );
4691 fprintf ( stderr, " Average scaffold length %lld\n", sum / count );
4692 fprintf ( stderr, " Filled gap number %d\n", gap_c );
4694 //output singleton
4695 for ( i = 1; i <= num_ctg; i++ )
4697 if ( contig_array[i].length + ( unsigned int ) overlaplen < len_cut || contig_array[i].flag )
4698 { continue; }
4700 length_array[count++] = contig_array[i].length;
4701 sum += contig_array[i].length;
4703 if ( isSmallerThanTwin ( i ) )
4704 { i++; }
4707 long long total_len = sum;
4708 qsort ( length_array, count, sizeof ( length_array[0] ), cmp_int );
4709 N50 = sum * 0.5;
4710 N90 = sum * 0.9;
4711 int N50length = 0;
4712 int N90length = 0;
4713 sum = flag = 0;
4715 for ( j = count - 1; j >= 0; j-- )
4717 sum += length_array[j];
4719 if ( !flag && sum >= N50 && N50length == 0 )
4721 N50length = length_array[j];
4722 flag++;
4725 if ( sum >= N90 && N90length == 0 )
4727 N90length = length_array[j];
4728 break;
4732 fprintf ( stderr, " Longest scaffold %lld\n", length_array[count - 1] );
4733 fprintf ( stderr, " Scaffold and singleton number %d\n", count );
4734 fprintf ( stderr, " Scaffold and singleton length %lld\n", total_len );
4735 fprintf ( stderr, " Average length %d\n", total_len / count );
4736 fprintf ( stderr, " N50 %d\n", N50length );
4737 fprintf ( stderr, " N90 %d\n", N90length );
4738 fprintf ( stderr, " Weak points %d\n", weakCounter );
4739 fprintf ( stderr, "\n*******************************\n" );
4740 fflush ( stdout );
4741 free ( ( void * ) length_array );
4743 for ( j = 0; j < max_n_routes; j++ )
4744 { free ( ( void * ) found_routes[j] ); }
4746 free ( ( void * ) found_routes );
4747 free ( ( void * ) so_far );
4750 /*************************************************
4751 Function:
4752 scaffold_count
4753 Description:
4754 Makes statistic of scaffold till current rank.
4755 Input:
4756 1. rank: current rank number
4757 2. len_cut: length cutoff
4758 Output:
4759 None.
4760 Return:
4761 None.
4762 *************************************************/
4763 void scaffold_count ( int rank, unsigned int len_cut )
4765 static DARRAY * scaf3, *scaf5;
4766 static DARRAY * gap3, *gap5;
4767 unsigned int prev_ctg, ctg, bal_ctg, *length_array, count = 0, num_lctg = 0;
4768 unsigned int i, max_steps = 5;
4769 int num5, num3, j, len, flag, num_route, gap_c = 0;
4770 short gap = 0;
4771 long long sum = 0, N50, N90;
4772 CONNECT * cnt, *prevCNT, *nextCnt;
4773 boolean excep;
4774 so_far = ( unsigned int * ) ckalloc ( max_n_routes * sizeof ( unsigned int ) );
4775 found_routes = ( unsigned int ** ) ckalloc ( max_n_routes * sizeof ( unsigned int * ) );
4777 for ( j = 0; j < max_n_routes; j++ )
4778 { found_routes[j] = ( unsigned int * ) ckalloc ( max_steps * sizeof ( unsigned int ) ); }
4780 length_array = ( unsigned int * ) ckalloc ( ( num_ctg + 1 ) * sizeof ( unsigned int ) );
4782 //use length_array to change info in index_array
4783 for ( i = 1; i <= num_ctg; i++ )
4784 { length_array[i] = 0; }
4786 for ( i = 1; i <= num_ctg; i++ )
4788 if ( index_array[i] > 0 )
4789 { length_array[index_array[i]] = i; }
4792 for ( i = 1; i <= num_ctg; i++ )
4793 { index_array[i] = length_array[i]; } //contig i with original index: index_array[i]
4795 orig2new = 0;
4796 scaf3 = ( DARRAY * ) createDarray ( 1000, sizeof ( unsigned int ) );
4797 scaf5 = ( DARRAY * ) createDarray ( 1000, sizeof ( unsigned int ) );
4798 gap3 = ( DARRAY * ) createDarray ( 1000, sizeof ( int ) );
4799 gap5 = ( DARRAY * ) createDarray ( 1000, sizeof ( int ) );
4801 for ( i = 1; i <= num_ctg; i++ )
4802 { contig_array[i].flag = 0; }
4804 for ( i = 1; i <= num_ctg; i++ )
4806 if ( contig_array[i].length + ( unsigned int ) overlaplen >= len_cut )
4807 { num_lctg++; }
4808 else
4809 { continue; }
4811 if ( contig_array[i].flag || contig_array[i].mask || !contig_array[i].downwardConnect || !validConnect ( i, NULL ) )
4812 { continue; }
4814 num5 = num3 = 0;
4815 ctg = i;
4816 * ( unsigned int * ) darrayPut ( scaf5, num5++ ) = i;
4817 contig_array[i].flag = 1;
4818 bal_ctg = getTwinCtg ( ctg );
4819 contig_array[bal_ctg].flag = 1;
4820 len = contig_array[i].length;
4821 prevCNT = NULL;
4822 cnt = getNextContig ( ctg, prevCNT, &excep );
4824 while ( cnt )
4826 nextCnt = getNextContig ( cnt->contigID, cnt, &excep );
4828 if ( excep && prevCNT )
4829 { fprintf ( stderr, "scaffolding: exception --- prev cnt from %u\n", prevCNT->contigID ); }
4831 if ( nextCnt && nextCnt->used )
4832 { break; }
4834 setConnectUsed ( ctg, cnt->contigID, 1 );
4835 * ( int * ) darrayPut ( gap5, num5 - 1 ) = cnt->gapLen;
4836 ctg = cnt->contigID;
4837 * ( unsigned int * ) darrayPut ( scaf5, num5++ ) = ctg;
4838 len += cnt->gapLen + contig_array[ctg].length;
4839 bal_ctg = getTwinCtg ( ctg );
4840 contig_array[ctg].flag = 1;
4841 contig_array[bal_ctg].flag = 1;
4842 prevCNT = cnt;
4843 cnt = nextCnt;
4846 ctg = getTwinCtg ( i );
4848 if ( num5 >= 2 )
4849 { prevCNT = checkConnect ( getTwinCtg ( * ( unsigned int * ) darrayGet ( scaf5, 1 ) ), ctg ); }
4850 else
4851 { prevCNT = NULL; }
4853 cnt = getNextContig ( ctg, prevCNT, &excep );
4855 while ( cnt )
4857 nextCnt = getNextContig ( cnt->contigID, cnt, &excep );
4859 if ( excep && prevCNT )
4860 { fprintf ( stderr, "scaffolding: exception -- prev cnt from %u\n", prevCNT->contigID ); }
4862 if ( nextCnt && nextCnt->used )
4863 { break; }
4865 setConnectUsed ( ctg, cnt->contigID, 1 );
4866 ctg = cnt->contigID;
4867 len += cnt->gapLen + contig_array[ctg].length;
4868 bal_ctg = getTwinCtg ( ctg );
4869 contig_array[ctg].flag = 1;
4870 contig_array[bal_ctg].flag = 1;
4871 * ( int * ) darrayPut ( gap3, num3 ) = cnt->gapLen;
4872 * ( unsigned int * ) darrayPut ( scaf3, num3++ ) = bal_ctg;
4873 prevCNT = cnt;
4874 cnt = nextCnt;
4877 len += overlaplen;
4878 sum += len;
4879 length_array[count++] = len;
4881 if ( num5 + num3 < 1 )
4883 fprintf ( stderr, "no scaffold created for contig %d\n", i );
4884 continue;
4887 len = prev_ctg = 0;
4889 for ( j = num3 - 1; j >= 0; j-- )
4891 if ( prev_ctg )
4893 num_route = num_trace = 0;
4894 traceAlongArc ( * ( unsigned int * ) darrayGet ( scaf3, j ), prev_ctg, max_steps
4895 , gap - ins_size_var, gap + ins_size_var, 0, 0, &num_route );
4897 if ( num_route == 1 )
4899 gap_c++;
4903 len += contig_array[* ( unsigned int * ) darrayGet ( scaf3, j )].length + * ( int * ) darrayGet ( gap3, j );
4904 prev_ctg = * ( unsigned int * ) darrayGet ( scaf3, j );
4905 gap = * ( int * ) darrayGet ( gap3, j ) > 0 ? * ( int * ) darrayGet ( gap3, j ) : 0;
4908 for ( j = 0; j < num5; j++ )
4910 if ( prev_ctg )
4912 num_route = num_trace = 0;
4913 traceAlongArc ( * ( unsigned int * ) darrayGet ( scaf5, j ), prev_ctg, max_steps
4914 , gap - ins_size_var, gap + ins_size_var, 0, 0, &num_route );
4916 if ( num_route == 1 )
4918 gap_c++;
4922 if ( j < num5 - 1 )
4924 len += contig_array[* ( unsigned int * ) darrayGet ( scaf5, j )].length +
4925 * ( int * ) darrayGet ( gap5, j );
4926 prev_ctg = * ( unsigned int * ) darrayGet ( scaf5, j );
4927 gap = * ( int * ) darrayGet ( gap5, j ) > 0 ? * ( int * ) darrayGet ( gap5, j ) : 0;
4932 freeDarray ( scaf3 );
4933 freeDarray ( scaf5 );
4934 freeDarray ( gap3 );
4935 freeDarray ( gap5 );
4937 if ( count == 0 )
4939 fprintf ( stderr, "\n\nNo scaffold was constructed.\n\n" );
4940 free ( ( void * ) length_array );
4942 for ( j = 0; j < max_n_routes; j++ )
4943 { free ( ( void * ) found_routes[j] ); }
4945 free ( ( void * ) found_routes );
4946 free ( ( void * ) so_far );
4947 return;
4950 fprintf ( stderr, "\nRank %d\n", rank );
4951 fprintf ( stderr, " Scaffold number %d\n", count );
4952 fprintf ( stderr, " In-scaffold contig number %u\n", num_lctg / 2 );
4953 fprintf ( stderr, " Total scaffold length %lld\n", sum );
4954 fprintf ( stderr, " Average scaffold length %lld\n", sum / count );
4955 fprintf ( stderr, " Filled gap number %d\n", gap_c );
4957 //output singleton
4958 for ( i = 1; i <= num_ctg; i++ )
4960 if ( contig_array[i].length + ( unsigned int ) overlaplen < len_cut || contig_array[i].flag )
4961 { continue; }
4963 length_array[count++] = contig_array[i].length;
4964 sum += contig_array[i].length;
4966 if ( isSmallerThanTwin ( i ) )
4967 { i++; }
4970 long int total_len = sum;
4971 // calculate N50/N90
4972 qsort ( length_array, count, sizeof ( length_array[0] ), cmp_int );
4973 N50 = sum * 0.5;
4974 N90 = sum * 0.9;
4975 int N50length = 0;
4976 int N90length = 0;
4977 sum = flag = 0;
4979 for ( j = count - 1; j >= 0; j-- )
4981 sum += length_array[j];
4983 if ( !flag && sum >= N50 && N50length == 0 )
4985 N50length = length_array[j];
4986 flag++;
4989 if ( sum >= N90 && N90length == 0 )
4991 N90length = length_array[j];
4992 break;
4996 fprintf ( stderr, " Longest scaffold %lld\n", length_array[count - 1] );
4997 fprintf ( stderr, " Scaffold and singleton number %d\n", count );
4998 fprintf ( stderr, " Scaffold and singleton length %lld\n", total_len );
4999 fprintf ( stderr, " Average length %d\n", total_len / count );
5000 fprintf ( stderr, " N50 %d\n", N50length );
5001 fprintf ( stderr, " N90 %d\n", N90length );
5002 free ( ( void * ) length_array );
5004 for ( j = 0; j < max_n_routes; j++ )
5005 { free ( ( void * ) found_routes[j] ); }
5007 free ( ( void * ) found_routes );
5008 free ( ( void * ) so_far );
5012 /*************************************************
5013 Function:
5014 outputLinks
5015 Description:
5016 Outputs connection information of current insert size LIB.
5017 Input:
5018 1. fp: output file
5019 2. insertS: current insert size
5020 Output:
5021 None.
5022 Return:
5023 None.
5024 *************************************************/
5025 static void outputLinks ( FILE * fp, int insertS )
5027 unsigned int i, bal_ctg, bal_toCtg;
5028 CONNECT * cnts, *temp_cnt;
5030 for ( i = 1; i <= num_ctg; i++ )
5032 cnts = contig_array[i].downwardConnect;
5033 bal_ctg = getTwinCtg ( i );
5035 while ( cnts )
5037 if ( cnts->weight < 1 )
5039 cnts = cnts->next;
5040 continue;
5043 fprintf ( fp, "%-10d %-10d\t%d\t%d\t%d\n"
5044 , i, cnts->contigID, cnts->gapLen, cnts->weight, insertS );
5045 cnts->weight = 0;
5046 bal_toCtg = getTwinCtg ( cnts->contigID );
5047 temp_cnt = getCntBetween ( bal_toCtg, bal_ctg );
5049 if ( temp_cnt )
5050 { temp_cnt->weight = 0; }
5052 cnts = cnts->next;
5057 /*************************************************
5058 Function:
5059 PE2Links
5060 Description:
5061 Updates connections between contigs based on alignment
5062 information of paired-end reads.
5063 Input:
5064 1. infile: alignment information file of paired-end reads
5065 Output:
5066 None.
5067 Return:
5068 None.
5069 *************************************************/
5070 void PE2Links ( char * infile )
5072 char name[256], *line;
5073 FILE * fp1;
5074 FILE * linkF;
5075 gzFile * fp2;
5076 int i;
5077 int flag = 0;
5078 unsigned int j;
5079 sprintf ( name, "%s.links", infile );
5080 boolean filesOK = check_file ( name );
5082 if ( filesOK )
5084 fprintf ( stderr, "File %s exists, skip creating the links...\n", name );
5085 return;
5088 linkF = ckopen ( name, "w" );
5090 if ( !pes )
5091 { loadPEgrads ( infile ); }
5093 fprintf ( stderr, "*****************************************************\nStart to load paired-end reads information.\n\n" );
5095 if ( COMPATIBLE_MODE == 1 )
5097 sprintf ( name, "%s.readOnContig", infile );
5098 fp1 = ckopen ( name, "r" );
5100 else
5102 sprintf ( name, "%s.readOnContig.gz", infile );
5103 fp2 = gzopen ( name, "r" );
5106 lineLen = 1024;
5107 line = ( char * ) ckalloc ( lineLen * sizeof ( char ) );
5109 if ( COMPATIBLE_MODE == 1 )
5111 fgets ( line, lineLen, fp1 );
5113 else
5115 gzgets ( fp2, line, lineLen );
5118 line[0] = '\0';
5120 for ( i = 0; i < gradsCounter; i++ )
5122 createCntMemManager();
5123 createCntLookupTable();
5124 newCntCounter = 0;
5126 if ( COMPATIBLE_MODE == 1 )
5128 flag += connectByPE_grad ( fp1, i, line );
5130 else
5132 flag += connectByPE_grad_gz ( fp2, i, line );
5135 fprintf ( stderr, "%lld new connections.\n\n", newCntCounter / 2 );
5137 if ( !flag )
5139 destroyConnectMem();
5140 deleteCntLookupTable();
5142 for ( j = 1; j <= num_ctg; j++ )
5143 { contig_array[j].downwardConnect = NULL; }
5145 fprintf ( stderr, "\n" );
5146 continue;
5149 flag = 0;
5150 outputLinks ( linkF, pes[i].insertS );
5151 destroyConnectMem();
5152 deleteCntLookupTable();
5154 for ( j = 1; j <= num_ctg; j++ )
5155 { contig_array[j].downwardConnect = NULL; }
5158 free ( ( void * ) line );
5160 if ( COMPATIBLE_MODE == 1 )
5162 fclose ( fp1 );
5164 else
5166 gzclose ( fp2 );
5169 fclose ( linkF );
5170 fprintf ( stderr, "All paired-end reads information loaded.\n" );
5173 /*************************************************
5174 Function:
5175 inputLinks
5176 Description:
5177 Loads links information of certain insert size.
5178 Input:
5179 1. fp: alignment inofrmation file
5180 2. insertS: insert size boundary
5181 3. line: the first alignment record of current LIB
5182 Output:
5183 1. line: the first alignment record of next LIB
5184 Return:
5185 loaded record number.
5186 *************************************************/
5187 static int inputLinks ( FILE * fp, int insertS, char * line )
5189 unsigned int ctg, bal_ctg, toCtg, bal_toCtg;
5190 int gap, wt, ins;
5191 unsigned int counter = 0, onScafCounter = 0;
5192 unsigned int maskCounter = 0;
5193 CONNECT * cnt, *bal_cnt;
5195 if ( strlen ( line ) )
5197 sscanf ( line, "%d %d %d %d %d", &ctg, &toCtg, &gap, &wt, &ins );
5199 if ( ins != insertS )
5200 { return counter; }
5202 if ( 1 )
5204 bal_ctg = getTwinCtg ( ctg );
5205 bal_toCtg = getTwinCtg ( toCtg );
5206 cnt = add1Connect ( ctg, toCtg, gap, wt, 0 );
5207 bal_cnt = add1Connect ( bal_toCtg, bal_ctg, gap, wt, 0 );
5209 if ( cnt && insertS > 1000 )
5211 cnt->newIns = bal_cnt->newIns = 1;
5214 counter++;
5216 if ( contig_array[ctg].mask || contig_array[toCtg].mask )
5217 { maskCounter++; }
5219 if ( insertS > 1000 &&
5220 contig_array[ctg].from_vt == contig_array[toCtg].from_vt && // on the same scaff
5221 contig_array[ctg].indexInScaf < contig_array[toCtg].indexInScaf )
5223 add1LongPEcov ( ctg, toCtg, wt );
5224 onScafCounter++;
5229 while ( fgets ( line, lineLen, fp ) != NULL )
5231 sscanf ( line, "%d %d %d %d %d", &ctg, &toCtg, &gap, &wt, &ins );
5233 if ( ins > insertS )
5234 { break; }
5236 if ( insertS > 1000 &&
5237 contig_array[ctg].from_vt == contig_array[toCtg].from_vt && // on the same scaff
5238 contig_array[ctg].indexInScaf < contig_array[toCtg].indexInScaf )
5240 add1LongPEcov ( ctg, toCtg, wt );
5241 onScafCounter++;
5244 bal_ctg = getTwinCtg ( ctg );
5245 bal_toCtg = getTwinCtg ( toCtg );
5246 cnt = add1Connect ( ctg, toCtg, gap, wt, 0 );
5247 bal_cnt = add1Connect ( bal_toCtg, bal_ctg, gap, wt, 0 );
5249 if ( cnt && insertS > 1000 )
5251 cnt->newIns = bal_cnt->newIns = 1;
5254 counter++;
5256 if ( contig_array[ctg].mask || contig_array[toCtg].mask )
5257 { maskCounter++; }
5260 fprintf ( stderr, "***************************\nFor insert size: %d\n", insertS );
5261 fprintf ( stderr, " Total PE links %d\n", counter );
5262 fprintf ( stderr, " PE links to masked contigs %d\n", maskCounter );
5263 fprintf ( stderr, " On same scaffold PE links %d\n", onScafCounter );
5264 return counter;
5267 /*************************************************
5268 Function:
5269 Links2Scaf
5270 Description:
5271 Constructs scaffolds based on alignment information.
5272 Input:
5273 1. infile: prefix of graph
5274 Output:
5275 None.
5276 Return:
5277 None.
5278 *************************************************/
5279 void Links2Scaf ( char * infile )
5281 char name[256], *line;
5282 FILE * fp;
5283 int i, j = 1, lib_n = 0, cutoff_sum = 0;
5284 int flag = 0, flag2;
5285 boolean downS, nonLinear = 0, smallPE = 0, isPrevSmall = 0, markSmall;
5287 if ( cvg4SNP > 0.001 )
5289 sprintf ( name, "%s.bubbleInScaff", infile );
5290 snp_fp = ckopen ( name, "w" );
5293 cvg4SNP = ( double ) ( cvg4SNP * cvgAvg );
5295 if ( !pes )
5296 { loadPEgrads ( infile ); }
5298 sprintf ( name, "%s.links", infile );
5299 fp = ckopen ( name, "r" );
5300 createCntMemManager();
5301 createCntLookupTable();
5302 lineLen = 1024;
5303 line = ( char * ) ckalloc ( lineLen * sizeof ( char ) );
5304 fgets ( line, lineLen, fp );
5305 line[0] = '\0';
5306 solidArray = ( DARRAY * ) createDarray ( 1000, sizeof ( unsigned int ) );
5307 tempArray = ( DARRAY * ) createDarray ( 1000, sizeof ( unsigned int ) );
5308 scaf3 = ( DARRAY * ) createDarray ( 1000, sizeof ( unsigned int ) );
5309 scaf5 = ( DARRAY * ) createDarray ( 1000, sizeof ( unsigned int ) );
5310 gap3 = ( DARRAY * ) createDarray ( 1000, sizeof ( int ) );
5311 gap5 = ( DARRAY * ) createDarray ( 1000, sizeof ( int ) );
5312 weakPE = 3;
5313 fprintf ( stderr, "\n" );
5315 for ( i = 0; i < gradsCounter; i++ )
5317 if ( MinWeakCut == 0 && i == 0 )
5318 { MinWeakCut = pes[i].pair_num_cut; }
5320 if ( pes[i].insertS < 1000 )
5322 isPrevSmall = 1;
5324 if ( MinWeakCut > pes[i].pair_num_cut )
5325 { MinWeakCut = pes[i].pair_num_cut; }
5327 else if ( pes[i].insertS > 1000 && isPrevSmall )
5329 smallScaf();
5330 isPrevSmall = 0;
5333 Insert_size = pes[i].insertS;
5334 discardCntCounter = 0;
5335 flag2 = inputLinks ( fp, pes[i].insertS, line );
5337 if ( flag2 )
5339 lib_n++;
5340 cutoff_sum += pes[i].pair_num_cut;
5343 flag += flag2;
5345 if ( !flag )
5347 fprintf ( stderr, "\n" );
5348 continue;
5351 if ( i == gradsCounter - 1 || pes[i + 1].rank != pes[i].rank )
5353 flag = nonLinear = downS = markSmall = 0;
5355 if ( pes[i].insertS > 1000 && pes[i].rank > 1 )
5356 { downS = 1; }
5358 if ( pes[i].insertS <= 1000 )
5359 { smallPE = 1; }
5361 if ( pes[i].insertS >= 1000 )
5363 ins_size_var = 50;
5364 OverlapPercent = 0.05;
5366 else if ( pes[i].insertS >= 300 )
5368 ins_size_var = 30;
5369 OverlapPercent = 0.05;
5371 else
5373 ins_size_var = 20;
5374 OverlapPercent = 0.05;
5377 if ( pes[i].insertS > 1000 )
5378 { weakPE = 5; }
5380 bySmall = Insert_size > 1000 ? 0 : 1;
5382 if ( lib_n > 0 )
5384 weakPE = weakPE < cutoff_sum / lib_n ? cutoff_sum / lib_n : weakPE;
5385 lib_n = cutoff_sum = 0;
5388 if ( MinWeakCut > weakPE )
5389 { MinWeakCut = weakPE; }
5391 fprintf ( stderr, "Cutoff of PE links to make a reliable connection: %d\n", weakPE );
5393 if ( i == gradsCounter - 1 )
5394 { nonLinear = 1; }
5396 if ( Insert_size > 1000 )
5398 detectBreakScaff();
5401 ordering ( 1, downS, nonLinear, infile );
5403 if ( i == gradsCounter - 1 )
5405 recoverMask();
5407 else
5409 scaffold_count ( j, 100 );
5410 j++;
5411 fprintf ( stderr, "\n" );
5414 if ( Insert_size > 1000 && i != gradsCounter - 1 )
5416 clearNewInsFlag();
5421 freeDarray ( tempArray );
5422 freeDarray ( solidArray );
5423 freeDarray ( scaf3 );
5424 freeDarray ( scaf5 );
5425 freeDarray ( gap3 );
5426 freeDarray ( gap5 );
5427 free ( ( void * ) line );
5428 fclose ( fp );
5430 if ( cvg4SNP > 0.001 )
5432 fclose ( snp_fp );
5435 fprintf ( stderr, "\nAll links loaded.\n" );
5438 /*************************************************
5439 Function:
5440 putNodeInArray
5441 Description:
5442 Puts contig in "ctg4heapArray".
5443 Input:
5444 1. node: contig
5445 2. maxNodes: maximum allowed contig number in array
5446 3. dis: contig's distance to base contig
5447 Output:
5448 None.
5449 Return:
5450 1 if contig was already in array or putting operation succeeds.
5451 0 if array size was larger than cutoff or contig's reverse complement
5452 format was already in array.
5453 *************************************************/
5454 static boolean putNodeInArray ( unsigned int node, int maxNodes, int dis )
5456 if ( contig_array[node].inSubGraph )
5457 { return 1; }
5459 int index = nodeCounter;
5461 if ( index > maxNodes )
5462 { return 0; }
5464 if ( contig_array[getTwinCtg ( node )].inSubGraph )
5465 { return 0; }
5467 ctg4heapArray[index].ctgID = node;
5468 ctg4heapArray[index].dis = dis;
5469 contig_array[node].inSubGraph = 1;
5470 ctg4heapArray[index].ds_shut4dheap = 0;
5471 ctg4heapArray[index].us_shut4dheap = 0;
5472 ctg4heapArray[index].ds_shut4uheap = 0;
5473 ctg4heapArray[index].us_shut4uheap = 0;
5474 return 1;
5477 /*************************************************
5478 Function:
5479 setInGraph
5480 Description:
5481 Sets contig's status of "inSubGraph".
5482 Input:
5483 1. flag: new status.
5484 Output:
5485 None.
5486 Return:
5487 None.
5488 *************************************************/
5489 static void setInGraph ( boolean flag )
5491 int i;
5492 int node;
5493 nodeCounter = nodeCounter > MaxNodeInSub ? MaxNodeInSub : nodeCounter;
5495 for ( i = 1; i <= nodeCounter; i++ )
5497 node = ctg4heapArray[i].ctgID;
5499 if ( node > 0 )
5500 { contig_array[node].inSubGraph = flag; }
5505 /*************************************************
5506 Function:
5507 getIndexInArr
5508 Description:
5509 Gets contig's index in "ctg4heapArray".
5510 Input:
5511 1. node: contig.
5512 Output:
5513 None.
5514 Return:
5515 Contig's index if contig was in array.
5516 0 otherwise.
5517 *************************************************/
5518 static int getIndexInArr ( const unsigned int node )
5520 int i = 1;
5522 for ( ; i <= nodeCounter; ++i )
5524 if ( node == ctg4heapArray[i].ctgID )
5525 { return i; }
5528 return 0;
5532 /*************************************************
5533 Function:
5534 dispatch1node
5535 Description:
5536 Dispatchs a contig to heap according to its distance to base
5537 contig, then updates related information of heaps.
5538 Input:
5539 1. dis: distance to base contig
5540 2. tempNode: contig to be dispatched
5541 3. maxNodes: maximum allowed contig number in sub-graph
5542 4. dheap: heap for downstream contigs of base contig
5543 5. uheap: heap for upstream contigs of base contig
5544 6. DmaxDis: maximum distance of downstream contig to base contig
5545 7. UmaxDis: maximum distance of upstream contig to base
5546 contig
5547 Output:
5548 1. dheap: updated heap for downstream contigs
5549 2. uheap: updated heap for upstream contigs
5550 3. DmaxDis: updated maximum distance of downstream contig to base contig
5551 4. UmaxDis: updated maximum distance of upstream contig to base contig
5552 Return:
5553 1 if contig was dispatched to "dheap".
5554 -1 if contig was dispatched to "uheap".
5555 0 otherwise.
5556 *************************************************/
5557 static boolean dispatch1node ( int dis, unsigned int tempNode, int maxNodes,
5558 FibHeap * dheap, FibHeap * uheap, int * DmaxDis, int * UmaxDis )
5560 boolean ret;
5562 if ( dis >= 0 ) // put it to Dheap
5564 nodeCounter++;
5565 ret = putNodeInArray ( tempNode, maxNodes, dis );
5567 if ( !ret )
5568 { return 0; }
5570 insertNodeIntoHeap ( dheap, dis, nodeCounter );
5572 if ( dis > *DmaxDis )
5573 { *DmaxDis = dis; }
5575 return 1;
5577 else // put it to Uheap
5579 nodeCounter++;
5580 ret = putNodeInArray ( tempNode, maxNodes, dis );
5582 if ( !ret )
5583 { return 0; }
5585 insertNodeIntoHeap ( uheap, -dis, nodeCounter );
5586 int temp_len = contig_array[tempNode].length;
5588 if ( -dis + temp_len > *UmaxDis )
5589 { *UmaxDis = -dis + contig_array[tempNode].length; }
5591 return -1;
5594 return 0;
5597 /*************************************************
5598 Function:
5599 canDheapWait
5600 Description:
5601 Checks whether current contig is the furthest contig to base
5602 contig in 'dheap'.
5603 Input:
5604 1. currNode: current contig
5605 2. dis: current contig's distance to base contig
5606 3. DmaxDis: maximum distance of downstream contig to base contig
5607 Output:
5608 None.
5609 Return:
5610 0 if current contig was not the furthest contig.
5611 *************************************************/
5612 static boolean canDheapWait ( unsigned int currNode, int dis, int DmaxDis )
5614 if ( dis < DmaxDis )
5615 { return 0; }
5616 else
5617 { return 1; }
5620 /*************************************************
5621 Function:
5622 workOnDheap
5623 Description:
5624 Travers 'dheap' and adds related contigs into 'dheap' or 'uheap'.
5625 Input:
5626 1. dheap: heap for downstream contigs of base contig
5627 2. uheap: heap for upstream contigs of base contig
5628 3. Dwait: indicator of whether all contigs in 'dheap' have been
5629 traversed, 1 for yes
5630 4. Uwait: indicator of whether all contigs in 'uheap' have been
5631 traversed, 1 for yes
5632 5. DmaxDis: maximum distance of downstream contig to base
5633 contig
5634 6. UmaxDis: maximum distance of upstream contig to base
5635 contig
5636 7. maxNodes: maximum allowed contig number in sub-graph
5637 Output:
5638 1. dheap: updated heap for downstream contigs
5639 2. uheap: updated heap for upstream contigs
5640 3. Dwait: indicator of whether all contigs in 'dheap' have been
5641 traversed, 1 for yes
5642 4. Uwait: indicator of whether all contigs in 'uheap' have been
5643 traversed, 1 for yes
5644 5. DmaxDis: updated maximum distance of downstream contig to
5645 base contig
5646 6. UmaxDis: updated maximum distance of upstream contig to
5647 base contig
5648 Return:
5649 0 if operation of putting contig into sub-graph failed.
5650 *************************************************/
5651 static boolean workOnDheap ( FibHeap * dheap, FibHeap * uheap, boolean * Dwait, boolean * Uwait,
5652 int * DmaxDis, int * UmaxDis, int maxNodes )
5654 if ( *Dwait )
5655 { return 1; }
5657 unsigned int currNode, twin, tempNode;
5658 CTGinHEAP * ctgInHeap;
5659 int indexInArray;
5660 CONNECT * us_cnt, *ds_cnt;
5661 int dis0, dis;
5662 boolean ret, isEmpty;
5664 while ( ( indexInArray = removeNextNodeFromHeap ( dheap ) ) != 0 )
5666 ctgInHeap = &ctg4heapArray[indexInArray];
5667 currNode = ctgInHeap->ctgID;
5668 dis0 = ctgInHeap->dis;
5669 isEmpty = IsHeapEmpty ( dheap );
5670 twin = getTwinCtg ( currNode );
5671 us_cnt = ctgInHeap->us_shut4dheap ? NULL : contig_array[twin].downwardConnect;
5673 while ( us_cnt )
5675 if ( us_cnt->deleted || us_cnt->mask ||
5676 contig_array[getTwinCtg ( us_cnt->contigID )].inSubGraph )
5678 us_cnt = us_cnt->next;
5679 continue;
5682 tempNode = getTwinCtg ( us_cnt->contigID );
5684 if ( contig_array[tempNode].inSubGraph )
5686 us_cnt = us_cnt->next;
5687 continue;
5690 dis = dis0 - us_cnt->gapLen - ( int ) contig_array[twin].length;
5691 ret = dispatch1node ( dis, tempNode, maxNodes, dheap, uheap, DmaxDis, UmaxDis );
5693 if ( ret == 0 )
5694 { return 0; }
5695 else if ( ret < 0 )
5696 { *Uwait = 0; }
5698 us_cnt = us_cnt->next;
5701 if ( nodeCounter > 1 && isEmpty )
5703 *Dwait = canDheapWait ( currNode, dis0, *DmaxDis );
5705 if ( *Dwait )
5707 isEmpty = IsHeapEmpty ( dheap );
5708 insertNodeIntoHeap ( dheap, dis0, indexInArray );
5709 ctg4heapArray[indexInArray].us_shut4dheap = 1;
5711 if ( isEmpty )
5712 { return 1; }
5713 else
5714 { continue; }
5718 ds_cnt = ctgInHeap->ds_shut4dheap ? NULL : contig_array[currNode].downwardConnect;
5720 while ( ds_cnt )
5722 if ( ds_cnt->deleted || ds_cnt->mask || contig_array[ds_cnt->contigID].inSubGraph )
5724 ds_cnt = ds_cnt->next;
5725 continue;
5728 tempNode = ds_cnt->contigID;
5729 dis = dis0 + ds_cnt->gapLen + ( int ) contig_array[tempNode].length;
5730 ret = dispatch1node ( dis, tempNode, maxNodes, dheap, uheap, DmaxDis, UmaxDis );
5732 if ( ret == 0 )
5733 { return 0; }
5734 else if ( ret < 0 )
5735 { *Uwait = 0; }
5737 ds_cnt = ds_cnt->next;
5738 } // for each downstream connections
5739 } // for each node comes off the heap
5741 *Dwait = 1;
5742 return 1;
5745 /*************************************************
5746 Function:
5747 canUheapWait
5748 Description:
5749 Checks whether current contig is the furthest contig to base
5750 contig in "uheap".
5751 Input:
5752 1. currNode: current contig
5753 2. dis: current contig's distance to base contig
5754 3. DmaxDis: maximum distance of downstream contig to base contig
5755 Output:
5756 None.
5757 Return:
5758 0 if current contig was not the furthest contig.
5759 *************************************************/
5760 static boolean canUheapWait ( unsigned int currNode, int dis, int UmaxDis )
5762 int temp_len = contig_array[currNode].length;
5764 if ( -dis + temp_len < UmaxDis )
5765 { return 0; }
5766 else
5767 { return 1; }
5770 /*************************************************
5771 Function:
5772 workOnUheap
5773 Description:
5774 Travers 'uheap' and adds related contigs into "dheap" or "uheap".
5775 Input:
5776 1. dheap: heap for downstream contigs of base contig
5777 2. uheap: heap for upstream contigs of base contig
5778 3. Dwait: indicator of whether all contigs in 'dheap' have been
5779 traversed, 1 for yes
5780 4. Uwait: indicator of whether all contigs in 'uheap' have been
5781 traversed, 1 for yes
5782 5. DmaxDis: maximum distance of downstream contig to base contig
5783 6. UmaxDis: maximum distance of upstream contig to base contig
5784 7. maxNodes: maximum allowed contig number in sub-graph
5785 Output:
5786 1. dheap: updated heap for downstream contigs
5787 2. uheap: updated heap for upstream contigs
5788 3. Dwait: indicator of whether all contigs in "dheap" have been
5789 traversed, 1 for yes
5790 4. Uwait: indicator of whether all contigs in "uheap" have been
5791 traversed, 1 for yes
5792 5. DmaxDis: updated maximum distance of downstream contig to
5793 base contig
5794 6. UmaxDis: updated maximum distance of upstream contig to
5795 base contig
5796 Return:
5797 0 if operation of putting contig into sub-graph failed.
5798 *************************************************/
5799 static boolean workOnUheap ( FibHeap * dheap, FibHeap * uheap, boolean * Dwait, boolean * Uwait,
5800 int * DmaxDis, int * UmaxDis, int maxNodes )
5802 if ( *Uwait )
5803 { return 1; }
5805 unsigned int currNode, twin, tempNode;
5806 CTGinHEAP * ctgInHeap;
5807 int indexInArray;
5808 CONNECT * us_cnt, *ds_cnt;
5809 int dis0, dis;
5810 boolean ret, isEmpty;
5812 while ( ( indexInArray = removeNextNodeFromHeap ( uheap ) ) != 0 )
5814 ctgInHeap = &ctg4heapArray[indexInArray];
5815 currNode = ctgInHeap->ctgID;
5816 dis0 = ctgInHeap->dis;
5817 isEmpty = IsHeapEmpty ( uheap );
5818 ds_cnt = ctgInHeap->ds_shut4uheap ? NULL : contig_array[currNode].downwardConnect;
5820 while ( ds_cnt )
5822 if ( ds_cnt->deleted || ds_cnt->mask || contig_array[ds_cnt->contigID].inSubGraph )
5824 ds_cnt = ds_cnt->next;
5825 continue;
5828 tempNode = ds_cnt->contigID;
5829 dis = dis0 + ds_cnt->gapLen + contig_array[tempNode].length;
5830 ret = dispatch1node ( dis, tempNode, maxNodes, dheap, uheap, DmaxDis, UmaxDis );
5832 if ( ret == 0 )
5833 { return 0; }
5834 else if ( ret > 0 )
5835 { *Dwait = 0; }
5837 ds_cnt = ds_cnt->next;
5838 } // for each downstream connections
5840 if ( nodeCounter > 1 && isEmpty )
5842 *Uwait = canUheapWait ( currNode, dis0, *UmaxDis );
5844 if ( *Uwait )
5846 isEmpty = IsHeapEmpty ( uheap );
5847 insertNodeIntoHeap ( uheap, dis0, indexInArray );
5848 ctg4heapArray[indexInArray].ds_shut4uheap = 1;
5850 if ( isEmpty )
5851 { return 1; }
5852 else
5853 { continue; }
5857 twin = getTwinCtg ( currNode );
5858 us_cnt = ctgInHeap->us_shut4uheap ? NULL : contig_array[twin].downwardConnect;
5860 while ( us_cnt )
5862 if ( us_cnt->deleted || us_cnt->mask ||
5863 contig_array[getTwinCtg ( us_cnt->contigID )].inSubGraph )
5865 us_cnt = us_cnt->next;
5866 continue;
5869 tempNode = getTwinCtg ( us_cnt->contigID );
5871 if ( contig_array[tempNode].inSubGraph )
5873 us_cnt = us_cnt->next;
5874 continue;
5877 dis = dis0 - us_cnt->gapLen - contig_array[twin].length;
5878 ret = dispatch1node ( dis, tempNode, maxNodes, dheap, uheap, DmaxDis, UmaxDis );
5880 if ( ret == 0 )
5881 { return 0; }
5882 else if ( ret > 0 )
5883 { *Dwait = 0; }
5885 us_cnt = us_cnt->next;
5887 } // for each node comes off the heap
5889 *Uwait = 1;
5890 return 1;
5893 /*************************************************
5894 Function:
5895 pickUpGeneralSubgraph
5896 Description:
5897 Starting from a base contig, gathers related contigs to form a
5898 sub-graph.
5899 Input:
5900 1. node1: base contig
5901 2. maxNodes: maximum allowed contig number in sub-graph
5902 Output:
5903 None.
5904 Return:
5905 0 if sub-graph was not found.
5906 *************************************************/
5907 static boolean pickUpGeneralSubgraph ( unsigned int node1, int maxNodes )
5909 FibHeap * Uheap = newFibHeap(); // heap for upstream contigs to node1
5910 FibHeap * Dheap = newFibHeap();
5911 int UmaxDis; // max distance upstream to node1
5912 int DmaxDis;
5913 boolean Uwait; // wait signal for Uheap
5914 boolean Dwait;
5915 int dis;
5916 boolean ret;
5917 //initiate: node1 is put to array once, and to both Dheap and Uheap
5918 dis = 0;
5919 nodeCounter = 1;
5920 putNodeInArray ( node1, maxNodes, dis );
5921 insertNodeIntoHeap ( Dheap, dis, nodeCounter );
5922 ctg4heapArray[nodeCounter].us_shut4dheap = 1;
5923 Dwait = 0;
5924 DmaxDis = 0;
5925 insertNodeIntoHeap ( Uheap, dis, nodeCounter );
5926 ctg4heapArray[nodeCounter].ds_shut4uheap = 1;
5927 Uwait = 1;
5928 UmaxDis = contig_array[node1].length;
5930 while ( 1 )
5932 ret = workOnDheap ( Dheap, Uheap, &Dwait, &Uwait, &DmaxDis, &UmaxDis, maxNodes );
5934 if ( !ret )
5936 setInGraph ( 0 );
5937 destroyHeap ( Dheap );
5938 destroyHeap ( Uheap );
5939 return 0;
5942 ret = workOnUheap ( Dheap, Uheap, &Dwait, &Uwait, &DmaxDis, &UmaxDis, maxNodes );
5944 if ( !ret )
5946 setInGraph ( 0 );
5947 destroyHeap ( Dheap );
5948 destroyHeap ( Uheap );
5949 return 0;
5952 if ( Uwait && Dwait )
5954 destroyHeap ( Dheap );
5955 destroyHeap ( Uheap );
5956 return 1;
5961 /*************************************************
5962 Function:
5963 cmp_ctg
5964 Description:
5965 Compares two contigs according to their distances to base contig.
5966 Input:
5967 1. a: pointer to the first contig in heap
5968 2. b: pointer to the second contig in heap
5969 Output:
5970 None.
5971 Return:
5972 1 if the first contig was further than the second contig.
5973 -1 if the second contig was further than the first contig.
5974 0 if the distances were equal.
5975 *************************************************/
5976 static int cmp_ctg ( const void * a, const void * b )
5978 CTGinHEAP * A, *B;
5979 A = ( CTGinHEAP * ) a;
5980 B = ( CTGinHEAP * ) b;
5982 if ( A->dis > B->dis )
5983 { return 1; }
5984 else if ( A->dis == B->dis )
5985 { return 0; }
5986 else
5987 { return -1; }
5990 /*************************************************
5991 Function:
5992 checkEligible
5993 Description:
5994 Checks if the sub-graph accords with the follwing criterions:
5995 1. The reversed complement of the first contig has neither
5996 downstream connection to contigs in sub-graph, nor more
5997 than one upstream connections in scaffold.
5998 2. The last contig has neither downstream connection to reversed
5999 complement of any contig in sub-graph, nor more than one
6000 upstream connections in scaffold.
6001 3. Except the first and the last contigs, none of other contigs in
6002 sub-graph has connection to contig which is out of sub-graph.
6003 Input:
6004 None.
6005 Output:
6006 None.
6007 Return:
6008 1 if the sub-graph accorded with all above criterions.
6009 *************************************************/
6010 static boolean checkEligible()
6012 unsigned int firstNode = ctg4heapArray[1].ctgID;
6013 unsigned int twin;
6014 int i;
6015 boolean flag = 0;
6016 //check if the first node has incoming link from twin of any node in subgraph
6017 // or it has multi outgoing links bound to incoming links
6018 twin = getTwinCtg ( firstNode );
6019 CONNECT * ite_cnt = contig_array[twin].downwardConnect;
6021 while ( ite_cnt )
6023 if ( ite_cnt->deleted || ite_cnt->mask )
6025 ite_cnt = ite_cnt->next;
6026 continue;
6029 if ( contig_array[ite_cnt->contigID].inSubGraph )
6031 return 0;
6034 if ( ite_cnt->prevInScaf )
6036 if ( flag )
6037 { return 0; }
6039 flag = 1;
6042 ite_cnt = ite_cnt->next;
6045 //check if the last node has outgoing link to twin of any node in subgraph
6046 // or it has multi outgoing links bound to incoming links
6047 unsigned int lastNode = ctg4heapArray[nodeCounter].ctgID;
6048 ite_cnt = contig_array[lastNode].downwardConnect;
6049 flag = 0;
6051 while ( ite_cnt )
6053 if ( ite_cnt->deleted || ite_cnt->mask )
6055 ite_cnt = ite_cnt->next;
6056 continue;
6059 twin = getTwinCtg ( ite_cnt->contigID );
6061 if ( contig_array[twin].inSubGraph )
6063 return 0;
6066 if ( ite_cnt->prevInScaf )
6068 if ( flag )
6069 { return 0; }
6071 flag = 1;
6074 ite_cnt = ite_cnt->next;
6077 //check if any node has outgoing link to node outside the subgraph
6078 for ( i = 1; i < nodeCounter; i++ )
6080 ite_cnt = contig_array[ctg4heapArray[i].ctgID].downwardConnect;
6082 while ( ite_cnt )
6084 if ( ite_cnt->deleted || ite_cnt->mask )
6086 ite_cnt = ite_cnt->next;
6087 continue;
6090 if ( !contig_array[ite_cnt->contigID].inSubGraph )
6092 return 0;
6095 ite_cnt = ite_cnt->next;
6099 //check if any node has incoming link from node outside the subgraph
6100 for ( i = 2; i <= nodeCounter; i++ )
6102 twin = getTwinCtg ( ctg4heapArray[i].ctgID );
6103 ite_cnt = contig_array[twin].downwardConnect;
6105 while ( ite_cnt )
6107 if ( ite_cnt->deleted || ite_cnt->mask )
6109 ite_cnt = ite_cnt->next;
6110 continue;
6113 if ( !contig_array[getTwinCtg ( ite_cnt->contigID )].inSubGraph )
6115 return 0;
6118 ite_cnt = ite_cnt->next;
6122 return 1;
6125 /*************************************************
6126 Function:
6127 arrayvalue
6128 Description:
6129 Copies one contig's values to another in array.
6130 Input:
6131 1. init_array: pointer to target contig in array
6132 2. value_array: pointer to source contig in array
6133 Output:
6134 None.
6135 Return:
6136 None.
6137 *************************************************/
6138 static void arrayvalue ( CTGinHEAP * init_array, CTGinHEAP * value_array )
6140 init_array->ctgID = value_array->ctgID;
6141 init_array->dis = value_array->dis;
6142 init_array->ds_shut4dheap = value_array->ds_shut4dheap;
6143 init_array->ds_shut4uheap = value_array->ds_shut4uheap;
6144 init_array->us_shut4dheap = value_array->us_shut4dheap;
6145 init_array->us_shut4uheap = value_array->us_shut4uheap;
6148 /*************************************************
6149 Function:
6150 arrayexchange
6151 Description:
6152 Exchanges two contigs' values.
6153 Input:
6154 1. from_id: the first contig in array
6155 2. to_id: the second contig in array
6156 Output:
6157 None.
6158 Return:
6159 None.
6160 *************************************************/
6161 static void arrayexchange ( unsigned int from_id, unsigned int to_id )
6163 CTGinHEAP tmp_h;
6164 arrayvalue ( &tmp_h, & ( ctg4heapArray[from_id] ) );
6165 arrayvalue ( & ( ctg4heapArray[from_id] ), & ( ctg4heapArray[to_id] ) );
6166 arrayvalue ( & ( ctg4heapArray[to_id] ), &tmp_h );
6169 static void deletearray ( unsigned int id )
6171 int i;
6173 for ( i = 1; i < nodeCounter; i++ )
6175 if ( i >= id )
6177 arrayvalue ( & ( ctg4heapArray[i] ), & ( ctg4heapArray[i + 1] ) );
6181 nodeCounter--;
6184 int getnextInScafCtg ( int id, int mask, int flag )
6186 CONNECT * tmp_cn = contig_array[id].downwardConnect;
6187 int currId = 0;
6189 while ( tmp_cn )
6191 if ( tmp_cn->prevInScaf )
6193 currId = tmp_cn->contigID;
6195 if ( mask != 0 && tmp_cn->contigID != mask )
6196 { break; }
6199 tmp_cn = tmp_cn->next;
6202 if ( mask == currId && mask != 0 )
6203 { currId = 0; }
6205 if ( flag == 0 && currId != 0 )
6206 { currId = getTwinCtg ( currId ); }
6208 return currId;
6211 void delete_PrevNext ( int i, int flag )
6213 int id = ctg4heapArray[i].ctgID;
6214 int pid = getTwinCtg ( id );
6215 CONNECT * ite_cnt = contig_array[id].downwardConnect;
6217 while ( ite_cnt )
6219 if ( ite_cnt->mask || ite_cnt->deleted || !contig_array[ite_cnt->contigID].inSubGraph )
6221 ite_cnt = ite_cnt->next;
6222 continue;
6225 if ( flag == 1 )
6226 { setNextInScaf ( ite_cnt, NULL ); }
6228 if ( flag == 0 )
6229 { setPrevInScaf ( ite_cnt, 0 ); }
6231 ite_cnt = ite_cnt->next;
6234 ite_cnt = contig_array[pid].downwardConnect;
6236 while ( ite_cnt )
6238 if ( ite_cnt->mask || ite_cnt->deleted || !contig_array[ite_cnt->contigID].inSubGraph )
6240 ite_cnt = ite_cnt->next;
6241 continue;
6244 if ( flag == 0 )
6245 { setNextInScaf ( ite_cnt, NULL ); }
6247 if ( flag == 1 )
6248 { setPrevInScaf ( ite_cnt, 0 ); }
6250 ite_cnt = ite_cnt->next;
6254 /*************************************************
6255 Function:
6256 getCntNodes
6257 Description:
6258 Gets contig's downstream connection provided by short insert
6259 size paired-end reads.
6260 Input:
6261 1. node: contig
6262 Output:
6263 1. nodeArray: array to store downstream contigs
6264 2. gapArray: array to store distance from current contig to
6265 downstream contigs
6266 Return:
6267 Found downstream contig number.
6268 *************************************************/
6269 static int getCntNodes ( unsigned int node, unsigned int * nodeArray, unsigned int * gapArray )
6271 int count = 0;
6272 CONNECT * cnt = contig_array[node].downwardConnect;
6274 while ( cnt )
6276 if ( 0 == bySmall && cnt->weight < 3 && !cnt->smallIns && !cnt->bySmall )
6278 cnt = cnt->next;
6279 continue;
6282 nodeArray[count] = cnt->contigID;
6283 gapArray[count++] = cnt->gapLen;
6285 if ( count == MaxCntNode )
6287 break;
6290 cnt = cnt->next;
6293 return count;
6296 /*************************************************
6297 Function:
6298 calGapLen
6299 Description:
6300 Calculates two contigs' distance through the common contig
6301 to which both contigs connected.
6302 Input:
6303 1. cntCounter: contig numbet in the "cntNodeArr"
6304 2. cntNodeArr: the array of third-party contigs
6305 3. cntGapArr: the array of distances to third-party contigs
6306 4. node1: the first contig
6307 5. node2: the second contig
6308 6. tmpNode: the contig whose upstream/downstream contigs
6309 array was not selected as "cntNodeArr"
6310 Output:
6311 1. cntCounter: common contig number of these two contigs
6312 Return:
6313 Accumulated distance between these two contigs.
6314 *************************************************/
6315 static int calGapLen ( int * cntCounter, unsigned int * cntNodeArr, int * cntGapArr,
6316 unsigned int node1, unsigned int node2, unsigned int tmpNode )
6318 int i = 0, gapLen = 0, count = 0;
6319 unsigned int target_node;
6320 CONNECT * cnt;
6321 int len = contig_array[node2].length;
6323 for ( ; i < *cntCounter; ++i )
6325 cnt = getCntBetween ( tmpNode, cntNodeArr[i] );
6327 if ( cnt && ( cnt->weight >= 3 || bySmall || cnt->smallIns || cnt->bySmall ) )
6329 if ( tmpNode == node1 )
6331 gapLen += cnt->gapLen - cntGapArr[i] - len;
6333 else
6335 gapLen += cntGapArr[i] - cnt->gapLen - len;
6338 ++count;
6342 *cntCounter = count;
6343 return gapLen;
6346 /*************************************************
6347 Function:
6348 arrangeNodes_general
6349 Description:
6350 Arranges contigs in sub-graph and updates related relation.
6351 Input:
6352 None.
6353 Output:
6354 None.
6355 Return:
6356 None.
6357 *************************************************/
6358 static void arrangeNodes_general()
6360 int i, j, gap, adjustedGap;
6361 CONNECT * ite_cnt, *temp_cnt, *bal_cnt, *prev_cnt, *next_cnt, *dh_cnt, *three_cnt;
6362 unsigned int node1, node2, tmpNode;
6363 unsigned int bal_nd1, bal_nd2;
6364 unsigned int pre_node, bal_pre_node, next_node, bal_next_node; // pre/next node that connected to first/last node if there is
6365 unsigned int first_node, bal_first_node, last_node, bal_last_node;
6366 unsigned int affected_node1, bal_affected_node1, affected_node2, bal_affected_node2;
6367 unsigned int exchangeNode1 = 0, exchangeNode2 = 0;
6368 int cntCounter, comCount;
6369 int tmp_dis;
6371 //delete original connections
6372 for ( i = 1; i <= nodeCounter; i++ )
6374 node1 = ctg4heapArray[i].ctgID;
6375 ite_cnt = contig_array[node1].downwardConnect;
6377 while ( ite_cnt )
6379 if ( ite_cnt->mask || ite_cnt->deleted || !contig_array[ite_cnt->contigID].inSubGraph )
6381 ite_cnt = ite_cnt->next;
6382 continue;
6385 ite_cnt->deleted = 1;
6386 setNextInScaf ( ite_cnt, NULL );
6387 setPrevInScaf ( ite_cnt, 0 );
6388 ite_cnt = ite_cnt->next;
6391 bal_nd1 = getTwinCtg ( node1 );
6392 ite_cnt = contig_array[bal_nd1].downwardConnect;
6394 while ( ite_cnt )
6396 if ( ite_cnt->mask || ite_cnt->deleted || !contig_array[getTwinCtg ( ite_cnt->contigID )].inSubGraph )
6398 ite_cnt = ite_cnt->next;
6399 continue;
6402 ite_cnt->deleted = 1;
6403 setNextInScaf ( ite_cnt, NULL );
6404 setPrevInScaf ( ite_cnt, 0 );
6405 ite_cnt = ite_cnt->next;
6409 CONNECT * first_cnt = NULL, *last_cnt = NULL, *tmp_cnt;
6410 CONNECT * affected_cnt = NULL, *bal_affected_cnt = NULL; //connections connected to pre_cnt and next_cnt
6411 pre_node = bal_pre_node = next_node = bal_next_node = 0;
6412 first_node = ctg4heapArray[1].ctgID;
6413 bal_first_node = getTwinCtg ( first_node );
6414 ite_cnt = contig_array[bal_first_node].downwardConnect;
6416 while ( ite_cnt )
6418 if ( ite_cnt->deleted || ite_cnt->mask )
6420 ite_cnt = ite_cnt->next;
6421 continue;
6424 if ( ite_cnt->prevInScaf )
6426 if ( !first_cnt )
6428 first_cnt = ite_cnt;
6431 bal_pre_node = ite_cnt->contigID;
6432 pre_node = getTwinCtg ( bal_pre_node );
6433 tmp_cnt = getCntBetween ( pre_node, first_node );
6436 ite_cnt = ite_cnt->next;
6439 last_node = ctg4heapArray[nodeCounter].ctgID;
6440 bal_last_node = getTwinCtg ( last_node );
6441 ite_cnt = contig_array[last_node].downwardConnect;
6443 while ( ite_cnt )
6445 if ( ite_cnt->deleted || ite_cnt->mask )
6447 ite_cnt = ite_cnt->next;
6448 continue;
6451 if ( ite_cnt->prevInScaf )
6453 if ( !last_cnt )
6455 last_cnt = ite_cnt;
6458 next_node = ite_cnt->contigID;
6459 bal_next_node = getTwinCtg ( next_node );
6460 tmp_cnt = getCntBetween ( bal_next_node, bal_last_node );
6463 ite_cnt = ite_cnt->next;
6466 prev_cnt = next_cnt = NULL;
6468 for ( i = 1; i < nodeCounter; ++i )
6470 node1 = ctg4heapArray[i].ctgID;
6471 node2 = ctg4heapArray[i + 1].ctgID;
6472 bal_nd1 = getTwinCtg ( node1 );
6473 bal_nd2 = getTwinCtg ( node2 );
6474 gap = ctg4heapArray[i + 1].dis - ctg4heapArray[i].dis
6475 - contig_array[node2].length;
6476 temp_cnt = getCntBetween ( node1, node2 );
6477 dh_cnt = getCntBetween ( node2, node1 );
6479 if ( i >= 2 )
6480 { three_cnt = getCntBetween ( ctg4heapArray[i - 1].ctgID, node2 ); }
6482 if ( dh_cnt )
6484 tmp_dis = ( int ) contig_array[node1].length + ( int ) contig_array[node2].length + gap + dh_cnt->gapLen;
6486 else
6488 tmp_dis = -1;
6491 if ( temp_cnt && ( bySmall || temp_cnt->bySmall || temp_cnt->smallIns || !dh_cnt || !dh_cnt->bySmall || !dh_cnt->smallIns ) )
6493 temp_cnt->deleted = 0;
6494 temp_cnt->mask = 0;
6495 bal_cnt = getCntBetween ( bal_nd2, bal_nd1 );
6496 bal_cnt->deleted = 0;
6497 bal_cnt->mask = 0;
6499 else if ( dh_cnt && ( ( dh_cnt->bySmall || dh_cnt->smallIns || bySmall )
6500 || ( ( -gap > ( int ) contig_array[node1].length || -gap > ( int ) contig_array[node2].length )
6501 && tmp_dis > 0 && tmp_dis < 500 && dh_cnt->weight > 3 ) ) )
6503 dh_cnt->deleted = 0;
6504 dh_cnt->mask = 0;
6505 dh_cnt = getCntBetween ( bal_nd1, bal_nd2 );
6506 dh_cnt->deleted = 0;
6507 dh_cnt->mask = 0;
6508 arrayexchange ( i, i + 1 );
6510 if ( i == 1 )
6512 i = 0;
6513 continue;
6516 if ( i == 2 )
6518 prev_cnt->deleted = 1;
6519 prev_cnt = NULL;
6520 next_cnt->deleted = 1;
6521 next_cnt = NULL;
6522 i = 0;
6523 continue;
6526 bal_affected_node2 = next_cnt->contigID;
6527 affected_node2 = getTwinCtg ( bal_affected_node2 );
6528 bal_affected_cnt = next_cnt->nextInScaf;
6529 bal_affected_node1 = bal_affected_cnt->contigID;
6530 affected_node1 = getTwinCtg ( bal_affected_node1 );
6531 affected_cnt = getCntBetween ( affected_node1, affected_node2 );
6533 if ( !affected_cnt )
6535 fprintf ( stderr, "affected cnt between %u(%u) and %u(%u) doesn't exists!\n", affected_node1, bal_affected_node1, affected_node2, bal_affected_node2 );
6536 exit ( 1 );
6539 setNextInScaf ( affected_cnt, NULL );
6540 setPrevInScaf ( bal_affected_cnt, 0 );
6541 setNextInScaf ( next_cnt, NULL );
6542 setPrevInScaf ( prev_cnt, 0 );
6543 prev_cnt->deleted = 1;
6544 prev_cnt = NULL;
6545 next_cnt->deleted = 1;
6546 next_cnt = NULL;
6547 i -= 3;
6548 continue;
6550 else
6552 if ( ( bySmall > 0 && gap < 0 )
6553 || ( -gap > ( int ) contig_array[node1].length || -gap > ( int ) contig_array[node2].length )
6554 && ( i != nodeCounter - 1 ) )
6556 adjustedGap = comCount = 0;
6557 uCntCounter1 = getCntNodes ( getTwinCtg ( node1 ), uCntNodeArr1, uCntGapArr1 );
6558 dCntCounter1 = getCntNodes ( node1, dCntNodeArr1, dCntGapArr1 );
6559 uCntCounter2 = getCntNodes ( getTwinCtg ( node2 ), uCntNodeArr2, uCntGapArr2 );
6560 dCntCounter2 = getCntNodes ( node2, dCntNodeArr2, dCntGapArr2 );
6562 if ( uCntCounter1 < uCntCounter2 )
6564 tmpNode = getTwinCtg ( node2 );
6565 cntCounter = uCntCounter1;
6566 cntNodeArr = &uCntNodeArr1[0];
6567 cntGapArr = &uCntGapArr1[0];
6569 else
6571 tmpNode = getTwinCtg ( node1 );
6572 cntCounter = uCntCounter2;
6573 cntNodeArr = &uCntNodeArr2[0];
6574 cntGapArr = &uCntGapArr2[0];
6577 adjustedGap += calGapLen ( &cntCounter, cntNodeArr, cntGapArr, getTwinCtg ( node2 ), getTwinCtg ( node1 ), tmpNode );
6578 comCount += cntCounter;
6580 if ( dCntCounter1 < dCntCounter2 )
6582 tmpNode = node2;
6583 cntCounter = dCntCounter1;
6584 cntNodeArr = &dCntNodeArr1[0];
6585 cntGapArr = &dCntGapArr1[0];
6587 else
6589 tmpNode = node1;
6590 cntCounter = dCntCounter2;
6591 cntNodeArr = &dCntNodeArr2[0];
6592 cntGapArr = &dCntGapArr2[0];
6595 adjustedGap += calGapLen ( &cntCounter, cntNodeArr, cntGapArr, node1, node2, tmpNode );
6596 comCount += cntCounter;
6598 if ( comCount > 0 )
6600 gap = adjustedGap / comCount;
6604 if ( ( -gap > ( int ) contig_array[node1].length || -gap > ( int ) contig_array[node2].length )
6605 && ( i != nodeCounter - 1 ) && ( ( exchangeNode1 == 0 && exchangeNode2 == 0 )
6606 || ( exchangeNode1 != ctg4heapArray[i + 1].ctgID && exchangeNode2 != ctg4heapArray[i].ctgID ) ) )
6608 exchangeNode1 = ctg4heapArray[i].ctgID;
6609 exchangeNode2 = ctg4heapArray[i + 1].ctgID;
6610 arrayexchange ( i, i + 1 );
6612 if ( i == 1 )
6614 i--;
6615 continue;
6618 if ( i == 2 )
6620 prev_cnt->deleted = 1;
6621 prev_cnt = NULL;
6622 next_cnt->deleted = 1;
6623 next_cnt = NULL;
6624 i = 0;
6625 continue;
6628 bal_affected_node2 = next_cnt->contigID;
6629 affected_node2 = getTwinCtg ( bal_affected_node2 );
6630 bal_affected_cnt = next_cnt->nextInScaf;
6631 bal_affected_node1 = bal_affected_cnt->contigID;
6632 affected_node1 = getTwinCtg ( bal_affected_node1 );
6633 affected_cnt = getCntBetween ( affected_node1, affected_node2 );
6635 if ( !affected_cnt )
6637 fprintf ( stderr, "affected cnt between %u(%u) and %u(%u) doesn't exists!\n", affected_node1, bal_affected_node1, affected_node2, bal_affected_node2 );
6638 exit ( 1 );
6641 setNextInScaf ( affected_cnt, NULL );
6642 setPrevInScaf ( bal_affected_cnt, 0 );
6643 setNextInScaf ( next_cnt, NULL );
6644 setPrevInScaf ( prev_cnt, 0 );
6645 prev_cnt->deleted = 1;
6646 prev_cnt = NULL;
6647 next_cnt->deleted = 1;
6648 next_cnt = NULL;
6649 i -= 3;
6650 continue;
6653 temp_cnt = allocateCN ( node2, gap );
6655 if ( cntLookupTable )
6656 { putCnt2LookupTable ( node1, temp_cnt ); }
6658 temp_cnt->weight = 0;
6659 temp_cnt->next = contig_array[node1].downwardConnect;
6660 contig_array[node1].downwardConnect = temp_cnt;
6661 bal_cnt = allocateCN ( bal_nd1, gap );
6663 if ( cntLookupTable )
6664 { putCnt2LookupTable ( bal_nd2, bal_cnt ); }
6666 bal_cnt->weight = 0;
6667 bal_cnt->next = contig_array[bal_nd2].downwardConnect;
6668 contig_array[bal_nd2].downwardConnect = bal_cnt;
6671 if ( prev_cnt )
6673 setNextInScaf ( prev_cnt, temp_cnt );
6674 setPrevInScaf ( temp_cnt, 1 );
6677 if ( next_cnt )
6679 setNextInScaf ( bal_cnt, next_cnt );
6680 setPrevInScaf ( next_cnt, 1 );
6683 prev_cnt = temp_cnt;
6684 next_cnt = bal_cnt;
6687 if ( first_cnt )
6689 if ( ctg4heapArray[1].ctgID == first_node )
6691 bal_nd1 = first_cnt->contigID;
6692 node1 = getTwinCtg ( bal_nd1 );
6693 node2 = first_node;
6694 temp_cnt = checkConnect ( node1, node2 );
6695 bal_cnt = first_cnt;
6696 next_cnt = checkConnect ( ctg4heapArray[1].ctgID, ctg4heapArray[2].ctgID );
6697 prev_cnt = checkConnect ( getTwinCtg ( ctg4heapArray[2].ctgID ), getTwinCtg ( ctg4heapArray[1].ctgID ) );
6699 if ( temp_cnt )
6701 setNextInScaf ( temp_cnt, next_cnt );
6702 setPrevInScaf ( temp_cnt->nextInScaf, 0 );
6703 setPrevInScaf ( next_cnt, 1 );
6704 setNextInScaf ( prev_cnt, bal_cnt );
6707 else
6709 bal_pre_node = first_cnt->contigID;
6710 pre_node = getTwinCtg ( bal_pre_node );
6711 j = 1;
6712 node1 = ctg4heapArray[j].ctgID;
6713 node2 = ctg4heapArray[j + 1].ctgID;
6714 ite_cnt = getCntBetween ( pre_node, node1 );
6715 bal_cnt = getCntBetween ( getTwinCtg ( node1 ), bal_pre_node );
6717 while ( !ite_cnt && node2 != first_node )
6719 tmp_cnt = getCntBetween ( node1, node2 );
6720 bal_cnt = getCntBetween ( getTwinCtg ( node2 ), getTwinCtg ( node1 ) );
6721 setNextInScaf ( tmp_cnt, NULL );
6722 setPrevInScaf ( tmp_cnt, 0 );
6723 tmp_cnt->deleted = 1;
6724 setNextInScaf ( bal_cnt, NULL );
6725 setPrevInScaf ( bal_cnt, 0 );
6726 bal_cnt->deleted = 1;
6727 ++j;
6728 node1 = ctg4heapArray[j].ctgID;
6729 node2 = ctg4heapArray[j + 1].ctgID;
6730 ite_cnt = getCntBetween ( pre_node, node1 );
6731 bal_cnt = getCntBetween ( getTwinCtg ( node1 ), bal_pre_node );
6734 if ( !ite_cnt )
6736 tmp_cnt = getCntBetween ( node1, first_node );
6737 gap = first_cnt->gapLen - tmp_cnt->gapLen - contig_array[node1].length;
6738 ite_cnt = allocateCN ( node1, gap );
6739 ite_cnt->weight = 0;
6741 if ( cntLookupTable )
6743 putCnt2LookupTable ( pre_node, ite_cnt );
6746 ite_cnt->next = contig_array[pre_node].downwardConnect;
6747 contig_array[pre_node].downwardConnect = ite_cnt;
6748 bal_cnt = allocateCN ( bal_pre_node, gap );
6749 bal_cnt->weight = 0;
6751 if ( cntLookupTable )
6753 putCnt2LookupTable ( getTwinCtg ( node1 ), bal_cnt );
6756 bal_cnt->next = contig_array[getTwinCtg ( node1 )].downwardConnect;
6757 contig_array[getTwinCtg ( node1 )].downwardConnect = bal_cnt;
6760 ite_cnt->deleted = 0;
6761 ite_cnt->mask = 0;
6762 bal_cnt->deleted = 0;
6763 bal_cnt->mask = 0;
6764 tmp_cnt = getCntBetween ( node1, node2 );
6765 setNextInScaf ( ite_cnt, tmp_cnt );
6766 setPrevInScaf ( tmp_cnt, 1 );
6767 tmp_cnt = getCntBetween ( getTwinCtg ( node2 ), getTwinCtg ( node1 ) );
6768 setNextInScaf ( tmp_cnt, bal_cnt );
6769 setPrevInScaf ( bal_cnt, 1 );
6771 if ( first_cnt->nextInScaf )
6773 setNextInScaf ( bal_cnt, first_cnt->nextInScaf );
6774 bal_affected_node1 = first_cnt->nextInScaf->contigID;
6775 affected_node1 = getTwinCtg ( bal_affected_node1 );
6776 affected_cnt = getCntBetween ( affected_node1, pre_node );
6777 setNextInScaf ( affected_cnt, ite_cnt );
6778 setPrevInScaf ( ite_cnt, 1 );
6781 setNextInScaf ( first_cnt, NULL );
6782 setPrevInScaf ( first_cnt, 0 );
6783 first_cnt->deleted = 1;
6784 first_cnt->mask = 1;
6785 tmp_cnt = getCntBetween ( pre_node, first_node );
6786 setNextInScaf ( tmp_cnt, NULL );
6787 setPrevInScaf ( tmp_cnt, 0 );
6788 tmp_cnt->deleted = 1;
6789 tmp_cnt->mask = 1;
6793 if ( last_cnt )
6795 node1 = ctg4heapArray[nodeCounter].ctgID;
6797 if ( node1 == last_node )
6799 node2 = last_cnt->contigID;
6800 bal_nd1 = getTwinCtg ( node1 );
6801 bal_nd2 = getTwinCtg ( node2 );
6802 temp_cnt = last_cnt;
6803 bal_cnt = checkConnect ( bal_nd2, bal_nd1 );
6804 next_cnt = checkConnect ( getTwinCtg ( ctg4heapArray[nodeCounter].ctgID ),
6805 getTwinCtg ( ctg4heapArray[nodeCounter - 1].ctgID ) );
6806 prev_cnt = checkConnect ( ctg4heapArray[nodeCounter - 1].ctgID, ctg4heapArray[nodeCounter].ctgID );
6807 setNextInScaf ( prev_cnt, temp_cnt );
6808 setNextInScaf ( bal_cnt, next_cnt );
6809 setPrevInScaf ( next_cnt, 1 );
6811 else
6813 next_node = last_cnt->contigID;
6814 bal_next_node = getTwinCtg ( next_node );
6815 j = nodeCounter;
6816 node1 = ctg4heapArray[j - 1].ctgID;
6817 node2 = ctg4heapArray[j].ctgID;
6818 ite_cnt = getCntBetween ( node2, next_node );
6819 bal_cnt = getCntBetween ( bal_next_node, getTwinCtg ( node2 ) );
6821 while ( !ite_cnt && node1 != last_node )
6823 tmp_cnt = getCntBetween ( node1, node2 );
6824 bal_cnt = getCntBetween ( getTwinCtg ( node2 ), getTwinCtg ( node1 ) );
6825 setNextInScaf ( tmp_cnt, NULL );
6826 setPrevInScaf ( tmp_cnt, 0 );
6827 tmp_cnt->deleted = 1;
6828 setNextInScaf ( bal_cnt, NULL );
6829 setPrevInScaf ( bal_cnt, 0 );
6830 bal_cnt->deleted = 1;
6831 --j;
6832 node1 = ctg4heapArray[j - 1].ctgID;
6833 node2 = ctg4heapArray[j].ctgID;
6834 ite_cnt = getCntBetween ( node2, next_node );
6835 bal_cnt = getCntBetween ( bal_next_node, getTwinCtg ( node2 ) );
6838 if ( !ite_cnt )
6840 tmp_cnt = getCntBetween ( node1, node2 );
6841 gap = last_cnt->gapLen - tmp_cnt->gapLen - contig_array[node2].length;
6842 ite_cnt = allocateCN ( next_node, gap );
6843 ite_cnt->weight = 0;
6845 if ( cntLookupTable )
6847 putCnt2LookupTable ( node2, ite_cnt );
6850 ite_cnt->next = contig_array[node2].downwardConnect;
6851 contig_array[node2].downwardConnect = ite_cnt;
6852 bal_cnt = allocateCN ( getTwinCtg ( node2 ), gap );
6853 bal_cnt->weight = 0;
6855 if ( cntLookupTable )
6857 putCnt2LookupTable ( bal_next_node, bal_cnt );
6860 bal_cnt->next = contig_array[bal_next_node].downwardConnect;
6861 contig_array[bal_next_node].downwardConnect = bal_cnt;
6864 ite_cnt->deleted = 0;
6865 ite_cnt->mask = 0;
6866 bal_cnt->deleted = 0;
6867 bal_cnt->mask = 0;
6868 tmp_cnt = getCntBetween ( node1, node2 );
6869 setNextInScaf ( tmp_cnt, ite_cnt );
6870 setPrevInScaf ( ite_cnt, 1 );
6871 tmp_cnt = getCntBetween ( getTwinCtg ( node2 ), getTwinCtg ( node1 ) );
6872 setNextInScaf ( bal_cnt, tmp_cnt );
6873 setPrevInScaf ( tmp_cnt, 1 );
6875 if ( last_cnt->nextInScaf )
6877 setNextInScaf ( ite_cnt, last_cnt->nextInScaf );
6878 affected_node1 = last_cnt->nextInScaf->contigID;
6879 bal_affected_node1 = getTwinCtg ( affected_node1 );
6880 bal_affected_cnt = getCntBetween ( bal_affected_node1, bal_next_node );
6881 setNextInScaf ( bal_affected_cnt, bal_cnt );
6882 setPrevInScaf ( bal_cnt, 1 );
6885 setNextInScaf ( last_cnt, NULL );
6886 setPrevInScaf ( last_cnt, 0 );
6887 last_cnt->deleted = 1;
6888 tmp_cnt = getCntBetween ( bal_next_node, bal_last_node );
6889 setNextInScaf ( tmp_cnt, NULL );
6890 setPrevInScaf ( tmp_cnt, 0 );
6891 tmp_cnt->deleted = 1;
6896 /*************************************************
6897 Function:
6898 checkOverlapInBetween_general
6899 Description:
6900 Checks if adjacent contigs in the array have reasonable overlap.
6901 Input:
6902 1. tolerance: max percentage that overlap length accounts for
6903 Output:
6904 None.
6905 Return:
6906 1 if the overlap situation was resonable.
6907 *************************************************/
6908 boolean checkOverlapInBetween_general ( double tolerance )
6910 int i, gap;
6911 unsigned int node1, node2;
6912 int lenSum, lenOlp;
6913 CONNECT * cnt;
6914 lenSum = lenOlp = 0;
6916 for ( i = 1; i <= nodeCounter; i++ )
6918 node1 = ctg4heapArray[i].ctgID;
6919 lenSum += contig_array[node1].length;
6922 if ( lenSum < 1 )
6923 { return 0; }
6925 for ( i = 1; i < nodeCounter; i++ )
6927 node2 = ctg4heapArray[i + 1].ctgID;
6928 gap = ctg4heapArray[i + 1].dis - ctg4heapArray[i].dis
6929 - contig_array[node2].length;
6931 if ( -gap > 0 )
6933 node1 = ctg4heapArray[i].ctgID;
6934 cnt = getCntBetween ( node1, node2 );
6936 if ( cnt && cnt->gapLen > gap )
6938 continue;
6940 else if ( ( cnt = getCntBetween ( node2, node1 ) ) != NULL
6941 && cnt->gapLen > gap )
6943 continue;
6945 else if ( -gap < overlaplen )
6947 continue;
6950 lenOlp += -gap;
6953 if ( ( double ) lenOlp / lenSum > tolerance )
6954 { return 0; }
6957 return 1;
6960 int canexchange = 0, exchange_num = 0, failexchange = 0;
6962 /*************************************************
6963 Function:
6964 checkConflictCnt_general
6965 Description:
6966 Checks if the conflict connections between adjacent contigs
6967 in the array are acceptable.
6968 Input:
6969 1. tolerance: max percentage that conflict connections accounts for
6970 Output:
6971 None.
6972 Return:
6973 0 if acceptable.
6974 *************************************************/
6975 static boolean checkConflictCnt_general ( double tolerance )
6977 int i, j, gap;
6978 int supportCounter = 0;
6979 int objectCounter = 0;
6980 CONNECT * cnt;
6982 for ( i = 1; i < nodeCounter; i++ )
6984 for ( j = i + 1; j <= nodeCounter; j++ )
6986 cnt = checkConnect ( ctg4heapArray[i].ctgID, ctg4heapArray[j].ctgID );
6988 if ( cnt )
6989 { supportCounter += cnt->weight; }
6991 cnt = checkConnect ( ctg4heapArray[j].ctgID, ctg4heapArray[i].ctgID );
6993 if ( cnt )
6995 gap = ctg4heapArray[j].dis - ctg4heapArray[i].dis - contig_array[ctg4heapArray[j].ctgID].length;
6997 if ( gap > -overlaplen && gap >= cnt->gapLen && cnt->inherit == 0 )
6999 objectCounter += cnt->weight;
7005 if ( supportCounter < 1 )
7006 { return 1; }
7008 if ( ( double ) objectCounter / supportCounter < tolerance )
7009 { return 0; }
7011 return 1;
7014 /*************************************************
7015 Function:
7016 getIndexInSortedSubgraph
7017 Description:
7018 Gets contig's index in sorted array.
7019 Input:
7020 1. node: contig
7021 2. count: array size
7022 Output:
7023 None.
7024 Return:
7025 Contig's index if found.
7026 -1 otherwise.
7027 *************************************************/
7028 static int getIndexInSortedSubgraph ( unsigned int node, int count )
7030 int index;
7032 for ( index = 0; index < count; ++index )
7034 if ( nodesInSubInOrder[index] == node )
7035 { return index; }
7038 return -1;
7041 /*************************************************
7042 Function:
7043 getIndexInSortedSubgraph
7044 Description:
7045 Gets contig's arc if contig has only one arc with weight > 1.
7046 Input:
7047 1. node: contig
7048 Output:
7049 None.
7050 Return:
7051 Pointer to arc if existed.
7052 NULL otherwise.
7053 *************************************************/
7054 static preARC * getValidArc ( unsigned int node )
7056 int num = 0;
7057 preARC * arc = contig_array[node].arcs;
7059 while ( arc )
7061 if ( arc->multiplicity > 1 )
7063 ++num;
7065 if ( num > 1 )
7067 return NULL;
7071 arc = arc->next;
7074 return arc;
7077 static boolean clearUpSubgraph()
7079 unsigned int i, ctg1, bal_ctg1, ctg2, bal_ctg2;
7080 int j, arc_num, num5, num3, index = 0, count = 0;
7081 preARC * arc;
7082 CONNECT * cnt;
7084 //put all contigs in "nodesInSub" array
7085 for ( i = 1; i <= nodeCounter; ++i )
7087 nodesInSub[i - 1] = ctg4heapArray[i].ctgID;
7090 for ( i = 0; i < nodeCounter; ++i )
7092 ctg1 = nodesInSub[i];
7093 index = getIndexInSortedSubgraph ( ctg1, count );
7095 if ( index >= 0 && index < count - 1 ) //this contig is already in array
7096 { continue; }
7098 bal_ctg1 = getTwinCtg ( ctg1 );
7099 num5 = 0;
7100 num3 = 0;
7101 arc_num = 0;
7102 * ( unsigned int * ) darrayPut ( scaf5, num5++ ) = ctg1;
7103 arc = getValidArc ( ctg1 );
7105 while ( arc )
7107 ctg2 = arc->to_ed;
7108 bal_ctg2 = getTwinCtg ( ctg2 );
7110 if ( ( arc = getValidArc ( bal_ctg2 ) ) == NULL )
7112 break;
7114 else if ( arc->to_ed != bal_ctg1 )
7116 break;
7119 ctg1 = ctg2;
7120 * ( unsigned int * ) darrayPut ( scaf5, num5++ ) = ctg1;
7121 arc = getValidArc ( ctg1 );
7124 ctg1 = nodesInSub[i];
7125 arc = getValidArc ( bal_ctg1 );
7127 while ( arc )
7129 bal_ctg2 = arc->to_ed;
7130 ctg2 = getTwinCtg ( bal_ctg2 );
7132 if ( ( arc = getValidArc ( ctg2 ) ) == NULL )
7134 break;
7136 else if ( arc->to_ed != ctg1 )
7138 break;
7141 ctg1 = ctg2;
7142 * ( unsigned int * ) darrayPut ( scaf3, num3++ ) = ctg1;
7143 arc = getValidArc ( bal_ctg2 );
7146 for ( j = num3 - 1; j >= 0; --j )
7148 nodesInSubInOrder[index++] = * ( unsigned int * ) darrayGet ( scaf3, j );
7151 for ( j = 0; j < num5; ++j )
7153 nodesInSubInOrder[index++] = * ( unsigned int * ) darrayGet ( scaf5, j );
7158 /*************************************************
7159 Function:
7160 transferCnt2RemainNode
7161 Description:
7162 In bubble structure, transfers connections of contig which is to
7163 be masked, to remained contig.
7164 Input:
7165 1. maskNode: contig to be masked
7166 2. remainNode: remained contig
7167 Output:
7168 None.
7169 Return:
7170 None.
7171 *************************************************/
7172 static void transferCnt2RemainNode ( unsigned int maskNode, unsigned int remainNode )
7174 CONNECT * cnt = contig_array[maskNode].downwardConnect;
7175 CONNECT * bal_cnt, *nextCnt, *bal_nextCnt, *tmpCnt, *bal_tmpCnt;
7176 unsigned int nextNode1, bal_nextNode1, nextNode2, bal_nextNode2;
7177 unsigned int bal_maskNode = getTwinCtg ( maskNode ), bal_remainNode = getTwinCtg ( remainNode );
7178 int gap, weight, inherit;
7180 while ( cnt )
7182 if ( cnt->mask )
7184 cnt = cnt->next;
7185 continue;
7188 nextNode1 = cnt->contigID;
7189 bal_nextNode1 = getTwinCtg ( nextNode1 );
7190 bal_cnt = getCntBetween ( bal_nextNode1, bal_maskNode );
7191 gap = cnt->gapLen;
7192 weight = cnt->weight;
7193 tmpCnt = getCntBetween ( remainNode, nextNode1 );
7195 if ( tmpCnt )
7197 inherit = 0;
7199 else
7201 inherit = 1;
7204 if ( cnt->nextInScaf )
7206 nextNode2 = cnt->nextInScaf->contigID;
7207 bal_nextNode2 = getTwinCtg ( nextNode2 );
7208 nextCnt = getCntBetween ( nextNode1, nextNode2 );
7209 bal_nextCnt = getCntBetween ( bal_nextNode2, bal_nextNode1 );
7211 if ( nextNode1 != remainNode && nextNode2 != remainNode )
7213 tmpCnt = add1Connect ( remainNode, nextNode1, gap, weight, inherit );
7214 bal_tmpCnt = add1Connect ( bal_nextNode1, bal_remainNode, gap, weight, inherit );
7215 tmpCnt->nextInScaf = nextCnt;
7216 tmpCnt->mask = 0;
7217 tmpCnt->deleted = 0;
7218 bal_nextCnt->nextInScaf = bal_tmpCnt;
7219 bal_tmpCnt->prevInScaf = 1;
7220 bal_tmpCnt->mask = 0;
7221 bal_tmpCnt->deleted = 0;
7223 else
7225 nextCnt->prevInScaf = 0;
7226 bal_nextCnt->nextInScaf = NULL;
7230 cnt->nextInScaf = NULL;
7231 cnt->prevInScaf = 0;
7232 cnt->mask = 1;
7233 cnt->deleted = 1;
7234 bal_cnt->nextInScaf = NULL;
7235 bal_cnt->prevInScaf = 0;
7236 bal_cnt->mask = 1;
7237 bal_cnt->deleted = 1;
7238 cnt = cnt->next;
7242 /*************************************************
7243 Function:
7244 maskNodeCnt
7245 Description:
7246 Masks contig's downstream connections in scaffold.
7247 Input:
7248 1. node: contig whose downstream connections are going to be masked
7249 Output:
7250 None.
7251 Return:
7252 None.
7253 *************************************************/
7254 static void maskNodeCnt ( unsigned int node )
7256 CONNECT * cnt = contig_array[node].downwardConnect;
7257 CONNECT * bal_cnt, *bal_nextCnt;
7258 unsigned int bal_nextNode1, bal_nextNode2;
7260 while ( cnt )
7262 bal_nextNode1 = getTwinCtg ( cnt->contigID );
7263 bal_cnt = getCntBetween ( bal_nextNode1, getTwinCtg ( node ) );
7265 if ( cnt->nextInScaf )
7267 cnt->nextInScaf->prevInScaf = 0;
7268 bal_nextNode2 = getTwinCtg ( cnt->nextInScaf->contigID );
7269 bal_nextCnt = getCntBetween ( bal_nextNode2, bal_nextNode1 );
7271 if ( !bal_nextCnt )
7273 exit ( 1 );
7276 bal_nextCnt->nextInScaf = NULL;
7279 cnt->nextInScaf = NULL;
7280 cnt->prevInScaf = 0;
7281 cnt->mask = 1;
7282 cnt->deleted = 1;
7283 bal_cnt->nextInScaf = NULL;
7284 bal_cnt->prevInScaf = 0;
7285 bal_cnt->mask = 1;
7286 bal_cnt->deleted = 1;
7287 cnt = cnt->next;
7291 /*************************************************
7292 Function:
7293 getEndKmers
7294 Description:
7295 Gets contig's first and last kmers' sequences.
7296 Input:
7297 1. seq: contig sequence
7298 2. len: contig length
7299 3. rev: indicator of reversed complement
7300 Output:
7301 1. firstKmer: first kmer's sequence
7302 2. lastKmer: last kmer's sequence
7303 Return:
7304 None.
7305 *************************************************/
7306 static void getEndKmers ( char * seq, int len, int rev, char * firstKmer, char * lastKmer )
7308 int j;
7310 if ( 0 == rev )
7312 for ( j = 0; j < overlaplen; ++j )
7314 firstKmer[j] = int2base ( ( int ) getCharInTightString ( seq, j ) );
7315 lastKmer[j] = int2base ( ( int ) getCharInTightString ( seq, len - j - 1 ) );
7318 else
7320 for ( j = 0; j < overlaplen; ++j )
7322 firstKmer[j] = int2compbase ( ( int ) getCharInTightString ( seq, len - j - 1 ) );
7323 lastKmer[j] = int2compbase ( ( int ) getCharInTightString ( seq, j ) );
7328 /*************************************************
7329 Function:
7330 output_ctg
7331 Description:
7332 Outputs contig sequence.
7333 Input:
7334 1. ctg: contig
7335 2. fo: output file
7336 Output:
7337 None.
7338 Return:
7339 None.
7340 *************************************************/
7341 static void output_ctg ( unsigned int ctg, FILE * fo )
7343 if ( contig_array[ctg].length < 1 )
7344 { return; }
7346 int len;
7347 unsigned int bal_ctg = getTwinCtg ( ctg );
7348 len = contig_array[ctg].length + overlaplen;
7349 int col = 0;
7351 if ( contig_array[ctg].seq )
7353 fprintf ( fo, ">C%d %4.1f\n", ctg, ( double ) contig_array[ctg].cvg );
7354 outputTightStr ( fo, contig_array[ctg].seq, 0, len, len, 0, &col );
7356 else if ( contig_array[bal_ctg].seq )
7358 fprintf ( fo, ">C%d %4.1f\n", bal_ctg, ( double ) contig_array[ctg].cvg );
7359 outputTightStr ( fo, contig_array[bal_ctg].seq, 0, len, len, 0, &col );
7362 fprintf ( fo, "\n" );
7366 /*************************************************
7367 Function:
7368 removeBubbleCtg
7369 Description:
7370 Searchs bubble structure in sub-graph. Removes the contig
7371 with lower coverage and transfers its connection relation to
7372 the remained contig if found structure accords with following
7373 criterions:
7374 1. Both contigs' coverage smaller than a cutoff.
7375 2. The first and last kmers of two contigs are the same.
7376 Input:
7377 None.
7378 Output:
7379 None.
7380 Return:
7381 Handled bubble structure number.
7382 *************************************************/
7383 static int removeBubbleCtg()
7385 int i, j, count, gap, SnpCounter = 0, conflict = 0;
7386 unsigned int node1, node2, bal_node1, bal_node2;
7387 int len1, len2, addLast = 0;
7388 char * tightStr1, *tightStr2;
7389 char firstKmer1[overlaplen + 1], lastKmer1[overlaplen + 1], firstKmer2[overlaplen + 1], lastKmer2[overlaplen + 1];
7390 CONNECT * cnt, *bal_cnt;
7391 count = 0;
7393 for ( i = 1; i < nodeCounter; ++i )
7395 node1 = ctg4heapArray[i].ctgID;
7396 node2 = ctg4heapArray[i + 1].ctgID;
7397 bal_node1 = getTwinCtg ( node1 );
7398 bal_node2 = getTwinCtg ( node2 );
7399 cnt = getCntBetween ( node1, node2 );
7400 bal_cnt = getCntBetween ( node2, node1 );
7401 gap = ctg4heapArray[i + 1].dis - ctg4heapArray[i].dis - ( int ) contig_array[node2].length;
7403 if ( gap >= 0 || contig_array[node1].cvg >= cvg4SNP || contig_array[node2].cvg >= cvg4SNP || cnt || bal_cnt )
7405 nodesInSubInOrder[count] = node1;
7406 nodeDistanceInOrder[count++] = ctg4heapArray[i].dis;
7407 continue;
7410 len1 = contig_array[node1].length + overlaplen;
7411 len2 = contig_array[node2].length + overlaplen;
7413 if ( contig_array[node1].seq )
7415 getEndKmers ( contig_array[node1].seq, len1, 0, firstKmer1, lastKmer1 );
7417 else
7419 getEndKmers ( contig_array[bal_node1].seq, len1, 1, firstKmer1, lastKmer1 );
7422 if ( contig_array[node2].seq )
7424 getEndKmers ( contig_array[node2].seq, len2, 0, firstKmer2, lastKmer2 );
7426 else
7428 getEndKmers ( contig_array[bal_node2].seq, len2, 1, firstKmer2, lastKmer2 );
7431 for ( j = 0; j < overlaplen; ++j )
7433 if ( firstKmer1[j] != firstKmer2[j] || lastKmer1[j] != lastKmer2[j] )
7435 nodesInSubInOrder[count] = node1;
7436 nodeDistanceInOrder[count++] = ctg4heapArray[i].dis;
7437 conflict = 1;
7438 break;
7442 if ( 1 == conflict )
7444 conflict = 0;
7445 continue;
7448 ++SnpCounter;
7450 if ( contig_array[node1].bubbleInScaff == 0 || contig_array[node2].bubbleInScaff == 0 )
7452 contig_array[node1].bubbleInScaff = 1;
7453 contig_array[bal_node1].bubbleInScaff = 1;
7454 contig_array[node2].bubbleInScaff = 1;
7455 contig_array[bal_node2].bubbleInScaff = 1;
7456 output_ctg ( node1, snp_fp );
7457 output_ctg ( node2, snp_fp );
7460 if ( contig_array[node1].cvg > contig_array[node2].cvg || ( len1 > len2 && contig_array[node1].cvg == contig_array[node2].cvg ) )
7462 if ( i == nodeCounter - 1 )
7464 nodesInSubInOrder[count] = node1;
7465 nodeDistanceInOrder[count++] = ctg4heapArray[i].dis;
7466 addLast = 1;
7469 transferCnt2RemainNode ( node2, node1 );
7470 transferCnt2RemainNode ( bal_node2, bal_node1 );
7471 contig_array[node2].mask = 1;
7472 contig_array[bal_node2].mask = 1;
7473 ctg4heapArray[i + 1].ctgID = node1;
7474 ctg4heapArray[i + 1].dis = ctg4heapArray[i].dis;
7476 else
7478 if ( i == nodeCounter - 1 )
7480 nodesInSubInOrder[count] = node2;
7481 nodeDistanceInOrder[count++] = ctg4heapArray[i + 1].dis;
7482 addLast = 1;
7485 transferCnt2RemainNode ( node1, node2 );
7486 transferCnt2RemainNode ( bal_node1, bal_node2 );
7487 contig_array[node1].mask = 1;
7488 contig_array[getTwinCtg ( node1 )].mask = 1;
7492 if ( 0 == addLast )
7494 nodesInSubInOrder[count] = ctg4heapArray[nodeCounter].ctgID;
7495 nodeDistanceInOrder[count++] = ctg4heapArray[nodeCounter].dis;
7498 for ( i = 0; i < count; ++i )
7500 ctg4heapArray[i + 1].ctgID = nodesInSubInOrder[i];
7501 ctg4heapArray[i + 1].dis = nodeDistanceInOrder[i];
7504 nodeCounter = count;
7505 return SnpCounter;
7508 /*************************************************
7509 Function:
7510 general_linearization
7511 Description:
7512 Picks up sub-graph and trys to line involved contigs.
7513 Input:
7514 1. strict: indicator of whether using strict cutoff to deal with sub-graph
7515 Output:
7516 None.
7517 Return:
7518 None.
7519 *************************************************/
7520 static void general_linearization ( boolean strict )
7522 unsigned int i;
7523 int subCounter = 0;
7524 int out_num;
7525 boolean flag;
7526 int conflCounter = 0, overlapCounter = 0, eligibleCounter = 0;
7527 int SNPCtgCounter = 0;
7528 double overlapTolerance, conflTolerance;
7529 canexchange = 0, exchange_num = 0, failexchange = 0;
7530 fprintf ( stderr, "Start to linearize sub-graph.\n" );
7532 for ( i = num_ctg; i > 0; i-- )
7534 if ( contig_array[i].mask )
7535 { continue; }
7537 out_num = validConnect ( i, NULL );
7539 if ( out_num < 2 )
7540 { continue; }
7542 flag = pickUpGeneralSubgraph ( i, MaxNodeInSub );
7544 if ( !flag )
7546 continue;
7549 subCounter++;
7550 qsort ( &ctg4heapArray[1], nodeCounter, sizeof ( CTGinHEAP ), cmp_ctg );
7552 if ( Insert_size < 1000 && cvg4SNP > 0.001 )
7554 SNPCtgCounter += removeBubbleCtg();
7557 flag = checkEligible();
7559 if ( !flag )
7561 eligibleCounter++;
7562 setInGraph ( 0 );
7563 continue;
7566 if ( strict )
7568 overlapTolerance = OverlapPercent;
7569 conflTolerance = ConflPercent;
7571 else
7573 overlapTolerance = 2 * OverlapPercent;
7574 conflTolerance = 2 * ConflPercent;
7577 flag = checkOverlapInBetween_general ( overlapTolerance );
7579 if ( !flag )
7581 overlapCounter++;
7582 setInGraph ( 0 );
7583 continue;
7586 flag = checkConflictCnt_general ( conflTolerance );
7588 if ( flag )
7590 conflCounter++;
7591 setInGraph ( 0 );
7592 continue;
7595 arrangeNodes_general();
7596 setInGraph ( 0 );
7599 fprintf ( stderr, " Picked sub-graphs %d\n Connection-conflict %d\n Significant overlapping %d\n Eligible %d\n Bubble structures %d\n", subCounter, conflCounter, overlapCounter, eligibleCounter, SNPCtgCounter );
7602 /**** the fowllowing codes for detecting and break down scaffold at weak point **********/
7604 /*************************************************
7605 Function:
7606 smallScaf
7607 Description:
7608 Counts constructed scaffold number by using short insert size
7609 paired-end reads and sets connections' flags: "bySmall" and
7610 "smallIns".
7611 Input:
7612 None.
7613 Output:
7614 None.
7615 Return:
7616 None.
7617 *************************************************/
7618 static void smallScaf()
7620 unsigned int i, ctg, bal_ctg, prevCtg;
7621 int counter = 0;
7622 CONNECT * bindCnt, *cnt;
7624 for ( i = 1; i <= num_ctg; i++ )
7625 { contig_array[i].flag = 0; }
7627 for ( i = 1; i <= num_ctg; i++ )
7629 if ( contig_array[i].flag || contig_array[i].mask || !contig_array[i].downwardConnect )
7630 { continue; }
7632 bindCnt = getBindCnt ( i );
7634 if ( !bindCnt )
7635 { continue; }
7637 counter++;
7638 contig_array[i].flag = 1;
7639 contig_array[getTwinCtg ( i )].flag = 1;
7640 prevCtg = getTwinCtg ( i );
7642 while ( bindCnt )
7644 ctg = bindCnt->contigID;
7645 bal_ctg = getTwinCtg ( ctg );
7646 bindCnt->bySmall = 1;
7647 cnt = getCntBetween ( bal_ctg, prevCtg );
7649 if ( cnt )
7650 { cnt->bySmall = 1; }
7652 contig_array[ctg].flag = 1;
7653 contig_array[bal_ctg].flag = 1;
7654 prevCtg = bal_ctg;
7655 bindCnt = bindCnt->nextInScaf;
7658 ctg = getTwinCtg ( i );
7659 bindCnt = getBindCnt ( ctg );
7660 prevCtg = i;
7662 while ( bindCnt )
7664 ctg = bindCnt->contigID;
7665 bal_ctg = getTwinCtg ( ctg );
7666 bindCnt->bySmall = 1;
7667 cnt = getCntBetween ( bal_ctg, prevCtg );
7669 if ( cnt )
7670 { cnt->bySmall = 1; }
7672 contig_array[ctg].flag = 1;
7673 contig_array[bal_ctg].flag = 1;
7674 prevCtg = bal_ctg;
7675 bindCnt = bindCnt->nextInScaf;
7679 fprintf ( stderr, "Report from smallScaf: %d scaffolds by smallPE.\n", counter );
7681 for ( i = 1; i <= num_ctg; i++ )
7683 if ( !contig_array[i].downwardConnect )
7684 { continue; }
7686 cnt = contig_array[i].downwardConnect;
7688 while ( cnt )
7690 cnt->smallIns = 1;
7691 cnt = cnt->next;
7696 /*************************************************
7697 Function:
7698 clearNewInsFlag
7699 Description:
7700 Clears conections' flag "newIns".
7701 Input:
7702 None.
7703 Output:
7704 None.
7705 Return:
7706 None.
7707 *************************************************/
7708 static void clearNewInsFlag()
7710 int i = 1;
7711 CONNECT * cnt;
7713 for ( ; i <= num_ctg; i++ )
7715 cnt = contig_array[i].downwardConnect;
7717 while ( cnt )
7719 cnt->newIns = 0;
7721 if ( Insert_size > 15000 )
7723 cnt = cnt->next;
7724 continue;
7727 cnt->maxGap = 0;
7728 cnt = cnt->next;
7733 /*************************************************
7734 Function:
7735 putItem2Sarray
7736 Description:
7737 Checks whether a scaffold ID is in array. If not, puts it in to array.
7738 Input:
7739 1. scaf: scaffold ID to be checked and put in to array
7740 2. wt: weight of connection to scaffold ID
7741 3. SCAF: array of scaffold IDs
7742 4. WT: array of connections' weight
7743 5. counter: size of scaffold ID array
7744 Output:
7745 1. SCAF: updated array of scaffold IDs
7746 2. WT: updated array of connections' weight
7747 Return:
7748 1 if operation of putting succeeded.
7749 *************************************************/
7750 static boolean putItem2Sarray ( unsigned int scaf, int wt, DARRAY * SCAF, DARRAY * WT, int counter )
7752 int i;
7753 unsigned int * scafP, *wtP;
7755 for ( i = 0; i < counter; i++ )
7757 scafP = ( unsigned int * ) darrayGet ( SCAF, i );
7759 if ( ( *scafP ) == scaf )
7761 wtP = ( unsigned int * ) darrayGet ( WT, i );
7762 *wtP = ( *wtP + wt );
7763 return 0;
7767 scafP = ( unsigned int * ) darrayPut ( SCAF, counter );
7768 wtP = ( unsigned int * ) darrayPut ( WT, counter );
7769 *scafP = scaf;
7770 *wtP = wt;
7771 return 1;
7774 /*************************************************
7775 Function:
7776 getDSLink2Scaf
7777 Description:
7778 Gets downstream connections to other scaffolds and stores
7779 related information including other scaffolds' ID and connections'
7780 weight.
7781 Input:
7782 1. scafStack: scaffold stack consisting of contigs
7783 2. SCAF: array to store other scaffolds' IDs
7784 3. WT: array to store connections' weight
7785 4. total_len: length sum of contigs used to search connections
7786 to other scaffolds
7787 Output:
7788 1. SCAF: updated array of scaffold IDs
7789 2. WT: updated array of connections' weight
7790 Return:
7791 Number of other scaffolds being connected.
7792 *************************************************/
7793 static int getDSLink2Scaf ( STACK * scafStack, DARRAY * SCAF, DARRAY * WT, int total_len )
7795 CONNECT * ite_cnt;
7796 CONNECT * bind_cnt;
7797 unsigned int ctg, targetCtg, bal_targetCtg, *pt;
7798 int counter = 0;
7799 int len = 0, gap;
7800 boolean inc;
7801 stackRecover ( scafStack );
7803 while ( ( pt = ( unsigned int * ) stackPop ( scafStack ) ) != NULL )
7805 ctg = *pt;
7806 bind_cnt = getBindCnt ( ctg );
7807 gap = bind_cnt ? bind_cnt->gapLen : 0;
7808 len += contig_array[ctg].length + gap;
7810 if ( ( contig_array[ctg].mask && contig_array[ctg].length < 500 ) || !contig_array[ctg].downwardConnect
7811 || total_len - len > Insert_size )
7813 continue;
7816 ite_cnt = contig_array[ctg].downwardConnect;
7818 while ( ite_cnt )
7820 if ( ite_cnt->newIns != 1 )
7822 ite_cnt = ite_cnt->next;
7823 continue;
7826 targetCtg = ite_cnt->contigID;
7827 bal_targetCtg = getTwinCtg ( targetCtg );
7829 if ( ( ite_cnt->mask && contig_array[targetCtg].length < 500 ) || ite_cnt->singleInScaf
7830 || ite_cnt->nextInScaf || ite_cnt->prevInScaf || ite_cnt->inherit )
7832 ite_cnt = ite_cnt->next;
7833 continue;
7836 if ( contig_array[ctg].from_vt == contig_array[targetCtg].from_vt // on the same scaff
7837 || ( targetCtg == contig_array[targetCtg].from_vt && bal_targetCtg == contig_array[bal_targetCtg].from_vt ) ) //targetCtg isn't in any scaffold
7839 ite_cnt = ite_cnt->next;
7840 continue;
7843 inc = putItem2Sarray ( contig_array[targetCtg].from_vt, ite_cnt->weight, SCAF, WT, counter );
7845 if ( inc )
7846 { counter++; }
7848 ite_cnt = ite_cnt->next;
7852 return counter;
7855 /*************************************************
7856 Function:
7857 getScaffold
7858 Description:
7859 Starting from a contig, gets downstream contig chain in
7860 a scaffold.
7861 Input:
7862 1. start: start contig
7863 Output:
7864 1. scafStack: stack to store contig chain
7865 Return:
7866 Length of contig chain.
7867 *************************************************/
7868 static int getScaffold ( unsigned int start, STACK * scafStack )
7870 int len = contig_array[start].length;
7871 unsigned int * pt, ctg;
7872 emptyStack ( scafStack );
7873 pt = ( unsigned int * ) stackPush ( scafStack );
7874 *pt = start;
7875 CONNECT * bindCnt = getBindCnt ( start );
7877 while ( bindCnt )
7879 ctg = bindCnt->contigID;
7880 pt = ( unsigned int * ) stackPush ( scafStack );
7881 *pt = ctg;
7882 len += bindCnt->gapLen + contig_array[ctg].length;
7883 bindCnt = bindCnt->nextInScaf;
7886 stackBackup ( scafStack );
7887 return len;
7890 /*************************************************
7891 Function:
7892 isLinkReliable
7893 Description:
7894 Checks whether there is a reliable connection.
7895 Input:
7896 1. WT: weight of connections
7897 2. count: connection number
7898 Output:
7899 None.
7900 Return:
7901 1 if a reliable connection was found.
7902 *************************************************/
7903 static boolean isLinkReliable ( DARRAY * WT, int count )
7905 int i;
7907 for ( i = 0; i < count; i++ )
7908 if ( * ( int * ) darrayGet ( WT, i ) >= weakPE )
7909 { return 1; }
7911 return 0;
7914 /*************************************************
7915 Function:
7916 getWtFromSarray
7917 Description:
7918 Gets weight of connection to specified scaffold.
7919 Input:
7920 1. SCAF: array of scaffold ID
7921 2. WT: array of onnection weight
7922 3. count: array size
7923 4. scaf: specified scaffold
7924 Output:
7925 None.
7926 Return:
7927 Weight of connection was found.
7928 0 otherwise.
7929 *************************************************/
7930 static int getWtFromSarray ( DARRAY * SCAF, DARRAY * WT, int count, unsigned int scaf )
7932 int i;
7934 for ( i = 0; i < count; i++ )
7935 if ( * ( unsigned int * ) darrayGet ( SCAF, i ) == scaf )
7936 { return * ( int * ) darrayGet ( WT, i ); }
7938 return 0;
7941 /*************************************************
7942 Function:
7943 switch2twin
7944 Description:
7945 Transfers contigs to their reversed complements.
7946 Input:
7947 1. scafStack: stack of contigs
7948 Output:
7949 1. scafStack: updated stack of contigs
7950 Return:
7951 None.
7952 *************************************************/
7953 static void switch2twin ( STACK * scafStack )
7955 unsigned int * pt;
7956 stackRecover ( scafStack );
7958 while ( ( pt = ( unsigned int * ) stackPop ( scafStack ) ) != NULL )
7959 { *pt = getTwinCtg ( *pt ); }
7962 /*************************************************
7963 Function:
7964 recoverLinks
7965 Description:
7966 Recovers contigs' connections to other scaffold if they
7967 accords with some criterions.
7968 Input:
7969 1. scafStack: stack of contigs
7970 Output:
7971 None.
7972 Return:
7973 None.
7974 *************************************************/
7975 static void recoverLinks ( STACK * scafStack )
7977 CONNECT * ite_cnt;
7978 unsigned int ctg, targetCtg, *pt;
7979 int counter = 0;
7980 boolean inc;
7981 unsigned int bal_ctg;
7982 stackRecover ( scafStack );
7984 while ( ( pt = ( unsigned int * ) stackPop ( scafStack ) ) != NULL )
7986 ctg = *pt;
7988 if ( contig_array[ctg].mask || !contig_array[ctg].downwardConnect )
7990 continue;
7993 ite_cnt = contig_array[ctg].downwardConnect;
7995 while ( ite_cnt )
7997 if ( ite_cnt->mask || ite_cnt->singleInScaf || ite_cnt->nextInScaf || ite_cnt->prevInScaf || ite_cnt->inherit || ite_cnt->weight < weakPE )
7999 ite_cnt = ite_cnt->next;
8000 continue;
8003 targetCtg = ite_cnt->contigID;
8005 if ( contig_array[ctg].from_vt == contig_array[targetCtg].from_vt ) // on the same scaff
8007 ite_cnt = ite_cnt->next;
8008 continue;
8011 setConnectDelete ( ctg, targetCtg, 0, 0 );
8012 ite_cnt = ite_cnt->next;
8015 bal_ctg = getTwinCtg ( ctg );
8017 if ( contig_array[bal_ctg].mask || !contig_array[bal_ctg].downwardConnect )
8019 continue;
8022 ite_cnt = contig_array[bal_ctg].downwardConnect;
8024 while ( ite_cnt )
8026 if ( ite_cnt->mask || ite_cnt->singleInScaf || ite_cnt->nextInScaf || ite_cnt->prevInScaf || ite_cnt->inherit || ite_cnt->weight < weakPE )
8028 ite_cnt = ite_cnt->next;
8029 continue;
8032 targetCtg = ite_cnt->contigID;
8034 if ( contig_array[bal_ctg].from_vt == contig_array[targetCtg].from_vt ) // on the same scaff
8036 ite_cnt = ite_cnt->next;
8037 continue;
8040 setConnectDelete ( bal_ctg, targetCtg, 0, 0 );
8041 ite_cnt = ite_cnt->next;
8046 ------>
8047 scaf1 --- --- -- ---
8048 scaf2 -- --- --- --
8049 ---->
8051 /*************************************************
8052 Function:
8053 checkScafConsist
8054 Description:
8055 Checks if both sets of contigs in two stacks have reliable
8056 connection to different scaffolds.
8057 Input:
8058 1. scafStack1: stack of contig set one
8059 2. len1: total length of contigs in set one
8060 3. scafStack2: stack of contig set two
8061 4. len2: total length of contigs in set two
8062 Output:
8063 None.
8064 Return:
8065 0 if both sets of contigs did not have reliable connection
8066 to different scaffolds.
8067 *************************************************/
8068 static boolean checkScafConsist ( STACK * scafStack1, int len1, STACK * scafStack2, int len2 )
8070 DARRAY * downwardTo1 = ( DARRAY * ) createDarray ( 1000, sizeof ( unsigned int ) ); // scaf links to those scaffolds
8071 DARRAY * downwardTo2 = ( DARRAY * ) createDarray ( 1000, sizeof ( unsigned int ) );
8072 DARRAY * downwardWt1 = ( DARRAY * ) createDarray ( 1000, sizeof ( unsigned int ) ); // scaf links to scaffolds with those wt
8073 DARRAY * downwardWt2 = ( DARRAY * ) createDarray ( 1000, sizeof ( unsigned int ) );
8074 int linkCount1 = getDSLink2Scaf ( scafStack1, downwardTo1, downwardWt1, len1 );
8075 int linkCount2 = getDSLink2Scaf ( scafStack2, downwardTo2, downwardWt2, len2 );
8077 if ( !linkCount1 )
8079 freeDarray ( downwardTo1 );
8080 freeDarray ( downwardTo2 );
8081 freeDarray ( downwardWt1 );
8082 freeDarray ( downwardWt2 );
8083 return 1;
8086 boolean flag1 = isLinkReliable ( downwardWt1, linkCount1 );
8088 if ( !flag1 )
8090 freeDarray ( downwardTo1 );
8091 freeDarray ( downwardTo2 );
8092 freeDarray ( downwardWt1 );
8093 freeDarray ( downwardWt2 );
8094 return 1;
8097 unsigned int scaf;
8098 int i, wt1, wt2, ret = 1;
8100 for ( i = 0; i < linkCount1; i++ )
8102 wt1 = * ( int * ) darrayGet ( downwardWt1, i );
8104 if ( wt1 < weakPE )
8105 { continue; }
8107 scaf = * ( unsigned int * ) darrayGet ( downwardTo1, i );
8108 wt2 = getWtFromSarray ( downwardTo2, downwardWt2, linkCount2, scaf );
8110 if ( wt2 < 1 )
8112 ret = 0;
8113 break;
8117 if ( ret == 0 )
8119 if ( linkCount1 && flag1 )
8121 recoverLinks ( scafStack1 );
8124 if ( linkCount2 )
8126 recoverLinks ( scafStack2 );
8130 freeDarray ( downwardTo1 );
8131 freeDarray ( downwardTo2 );
8132 freeDarray ( downwardWt1 );
8133 freeDarray ( downwardWt2 );
8134 return ret;
8137 /*************************************************
8138 Function:
8139 setBreakPoints
8140 Description:
8141 Sets break points of scaffold at weak connections.
8142 Input:
8143 1. ctgArray: array of contig in scaffold
8144 2. count: total contig number
8145 3. weakest: contig whose downstream connection was
8146 estimated break point
8147 Output:
8148 1. start: the first contig with weak downstream connection
8149 2. finish: the last contig with downstream connection
8150 Return:
8151 None.
8152 *************************************************/
8153 static void setBreakPoints ( DARRAY * ctgArray, int count, int weakest,
8154 int * start, int * finish )
8156 int index = weakest - 1;
8157 unsigned int thisCtg;
8158 unsigned int nextCtg = * ( unsigned int * ) darrayGet ( ctgArray, weakest );
8159 CONNECT * cnt;
8160 *start = weakest;
8162 while ( index >= 0 )
8164 thisCtg = * ( unsigned int * ) darrayGet ( ctgArray, index );
8165 cnt = getCntBetween ( thisCtg, nextCtg );
8167 if ( cnt->maxGap > 2 )
8168 { break; }
8169 else
8170 { *start = index; }
8172 nextCtg = thisCtg;
8173 index--;
8176 unsigned int prevCtg = * ( unsigned int * ) darrayGet ( ctgArray, weakest + 1 );
8177 *finish = weakest + 1;
8178 index = weakest + 2;
8180 while ( index < count )
8182 thisCtg = * ( unsigned int * ) darrayGet ( ctgArray, index );
8183 cnt = getCntBetween ( prevCtg, thisCtg );
8185 if ( cnt->maxGap > 2 )
8186 { break; }
8187 else
8188 { *finish = index; }
8190 prevCtg = thisCtg;
8191 index++;
8195 /*************************************************
8196 Function:
8197 changeScafEnd
8198 Description:
8199 Changes "to_vt" of contigs in a scaffold to a specified contig.
8200 Input:
8201 1. scafStack: stack of contigs in scaffold
8202 2. end: specified contig
8203 Output:
8204 None.
8205 Return:
8206 None.
8207 *************************************************/
8208 static void changeScafEnd ( STACK * scafStack, unsigned int end )
8210 unsigned int ctg, *pt;
8211 unsigned int start = getTwinCtg ( end );
8212 stackRecover ( scafStack );
8214 while ( ( pt = ( unsigned int * ) stackPop ( scafStack ) ) != NULL )
8216 ctg = *pt;
8217 contig_array[ctg].to_vt = end;
8218 contig_array[getTwinCtg ( ctg )].from_vt = start;
8222 /*************************************************
8223 Function:
8224 changeScafBegin
8225 Description:
8226 Changes "from_vt" of contigs in a scaffold to a specified
8227 contig.
8228 Input:
8229 1. scafStack: stack of contigs in scaffold
8230 2. start: specified contig
8231 Output:
8232 None.
8233 Return:
8234 None.
8235 *************************************************/
8236 static void changeScafBegin ( STACK * scafStack, unsigned int start )
8238 unsigned int ctg, *pt;
8239 unsigned int end = getTwinCtg ( start );
8240 stackRecover ( scafStack );
8242 while ( ( pt = ( unsigned int * ) stackPop ( scafStack ) ) != NULL )
8244 ctg = *pt;
8245 contig_array[ctg].from_vt = start;
8246 contig_array[getTwinCtg ( ctg )].to_vt = end;
8250 /*************************************************
8251 Function:
8252 detectBreakScaff
8253 Description:
8254 Detects and breaks the weak connection between contigs
8255 in scaffold according to the support of large insert size
8256 paired-end reads.
8257 Input:
8258 None.
8259 Output:
8260 None.
8261 Return:
8262 None.
8263 *************************************************/
8264 static void detectBreakScaf()
8266 unsigned int i, avgPE, scafLen, len, ctg, bal_ctg, prevCtg, thisCtg;
8267 long long peCounter, linkCounter;
8268 int num3, num5, weakPoint, tempCounter, j, t, counter = 0;
8269 CONNECT * bindCnt, *cnt, *weakCnt;
8270 STACK * scafStack1 = ( STACK * ) createStack ( 1000, sizeof ( unsigned int ) );
8271 STACK * scafStack2 = ( STACK * ) createStack ( 1000, sizeof ( unsigned int ) );
8273 for ( i = 1; i <= num_ctg; i++ )
8274 { contig_array[i].flag = 0; }
8276 for ( i = 1; i <= num_ctg; i++ )
8278 if ( contig_array[i].flag || contig_array[i].mask || !contig_array[i].downwardConnect )
8279 { continue; }
8281 bindCnt = getBindCnt ( i );
8283 if ( !bindCnt )
8284 { continue; }
8286 //first scan to get the average coverage by longer pe
8287 num5 = num3 = peCounter = linkCounter = 0;
8288 scafLen = contig_array[i].length;
8289 ctg = i;
8290 * ( unsigned int * ) darrayPut ( scaf5, num5++ ) = i;
8291 contig_array[i].flag = 1;
8292 contig_array[getTwinCtg ( i )].flag = 1;
8294 while ( bindCnt )
8296 if ( !bindCnt->bySmall )
8297 { break; }
8299 linkCounter++;
8300 peCounter += bindCnt->maxGap;
8301 ctg = bindCnt->contigID;
8302 scafLen += contig_array[ctg].length;
8303 * ( unsigned int * ) darrayPut ( scaf5, num5++ ) = ctg;
8304 bal_ctg = getTwinCtg ( ctg );
8305 contig_array[ctg].flag = 1;
8306 contig_array[bal_ctg].flag = 1;
8307 bindCnt = bindCnt->nextInScaf;
8310 ctg = getTwinCtg ( i );
8311 bindCnt = getBindCnt ( ctg );
8313 while ( bindCnt )
8315 if ( !bindCnt->bySmall )
8316 { break; }
8318 linkCounter++;
8319 peCounter += bindCnt->maxGap;
8320 ctg = bindCnt->contigID;
8321 scafLen += contig_array[ctg].length;
8322 bal_ctg = getTwinCtg ( ctg );
8323 contig_array[ctg].flag = 1;
8324 contig_array[bal_ctg].flag = 1;
8325 * ( unsigned int * ) darrayPut ( scaf3, num3++ ) = bal_ctg;
8326 bindCnt = bindCnt->nextInScaf;
8329 if ( linkCounter < 1 || scafLen < 5000 )
8330 { continue; }
8332 avgPE = peCounter / linkCounter;
8334 if ( avgPE < 10 )
8335 { continue; }
8337 tempCounter = 0;
8339 for ( j = num3 - 1; j >= 0; j-- )
8340 * ( unsigned int * ) darrayPut ( tempArray, tempCounter++ ) =
8341 * ( unsigned int * ) darrayGet ( scaf3, j );
8343 for ( j = 0; j < num5; j++ )
8344 * ( unsigned int * ) darrayPut ( tempArray, tempCounter++ ) =
8345 * ( unsigned int * ) darrayGet ( scaf5, j );
8347 prevCtg = * ( unsigned int * ) darrayGet ( tempArray, 0 );
8348 weakCnt = NULL;
8349 weakPoint = 0;
8350 len = contig_array[prevCtg].length;
8352 for ( t = 1; t < tempCounter; t++ )
8354 thisCtg = * ( unsigned int * ) darrayGet ( tempArray, t );
8356 if ( len < 2000 )
8358 len += contig_array[thisCtg].length;
8359 prevCtg = thisCtg;
8360 continue;
8362 else if ( len > scafLen - 2000 )
8363 { break; }
8365 len += contig_array[thisCtg].length;
8367 if ( contig_array[prevCtg].from_vt != contig_array[thisCtg].from_vt ||
8368 contig_array[prevCtg].indexInScaf > contig_array[thisCtg].indexInScaf )
8370 prevCtg = thisCtg;
8371 continue;
8374 cnt = getCntBetween ( prevCtg, thisCtg );
8376 if ( !weakCnt || weakCnt->maxGap > cnt->maxGap )
8378 weakCnt = cnt;
8379 weakPoint = t;
8382 prevCtg = thisCtg;
8385 if ( !weakCnt || ( weakCnt->maxGap > 2 && weakCnt->maxGap > avgPE / 5 ) )
8386 { continue; }
8388 prevCtg = * ( unsigned int * ) darrayGet ( tempArray, weakPoint - 1 );
8389 thisCtg = * ( unsigned int * ) darrayGet ( tempArray, weakPoint );
8391 if ( contig_array[prevCtg].from_vt != contig_array[thisCtg].from_vt ||
8392 contig_array[prevCtg].indexInScaf > contig_array[thisCtg].indexInScaf )
8394 fprintf ( stderr, "contig %d and %d not on the same scaff\n", prevCtg, thisCtg );
8395 continue;
8398 setConnectWP ( prevCtg, thisCtg, 1 );
8399 int index1, index2;
8400 setBreakPoints ( tempArray, tempCounter, weakPoint - 1, &index1, &index2 );
8401 unsigned int start = * ( unsigned int * ) darrayGet ( tempArray, index1 );
8402 unsigned int finish = * ( unsigned int * ) darrayGet ( tempArray, index2 );
8403 int len1 = getScaffold ( getTwinCtg ( start ), scafStack1 );
8404 int len2 = getScaffold ( finish, scafStack2 );
8406 if ( len1 < 2000 || len2 < 2000 )
8407 { continue; }
8409 switch2twin ( scafStack1 );
8410 int flag1 = checkScafConsist ( scafStack1, len1, scafStack2, len2 );
8411 switch2twin ( scafStack1 );
8412 switch2twin ( scafStack2 );
8413 int flag2 = checkScafConsist ( scafStack2, len2, scafStack1, len1 );
8415 if ( !flag1 || !flag2 )
8417 changeScafBegin ( scafStack1, getTwinCtg ( start ) );
8418 changeScafEnd ( scafStack2, getTwinCtg ( finish ) );
8419 //unbind links
8420 unsigned int nextCtg = * ( unsigned int * ) darrayGet ( tempArray, index1 + 1 );
8421 thisCtg = * ( unsigned int * ) darrayGet ( tempArray, index1 );
8422 cnt = getCntBetween ( getTwinCtg ( nextCtg ), getTwinCtg ( thisCtg ) );
8424 if ( cnt->nextInScaf )
8426 prevCtg = getTwinCtg ( cnt->nextInScaf->contigID );
8427 cnt->nextInScaf->prevInScaf = 0;
8428 cnt = getCntBetween ( prevCtg, thisCtg );
8429 cnt->nextInScaf = NULL;
8432 prevCtg = * ( unsigned int * ) darrayGet ( tempArray, index2 - 1 );
8433 thisCtg = * ( unsigned int * ) darrayGet ( tempArray, index2 );
8434 cnt = getCntBetween ( prevCtg, thisCtg );
8436 if ( cnt->nextInScaf )
8438 nextCtg = cnt->nextInScaf->contigID;
8439 cnt->nextInScaf->prevInScaf = 0;
8440 cnt = getCntBetween ( getTwinCtg ( nextCtg ), getTwinCtg ( thisCtg ) );
8441 cnt->nextInScaf = NULL;
8444 prevCtg = * ( unsigned int * ) darrayGet ( tempArray, index1 );
8446 for ( t = index1 + 1; t <= index2; t++ )
8448 thisCtg = * ( unsigned int * ) darrayGet ( tempArray, t );
8449 cnt = getCntBetween ( prevCtg, thisCtg );
8450 cnt->mask = 1;
8451 cnt->nextInScaf = NULL;
8452 cnt->prevInScaf = 0;
8453 cnt = getCntBetween ( getTwinCtg ( thisCtg ), getTwinCtg ( prevCtg ) );
8454 cnt->mask = 1;
8455 cnt->nextInScaf = NULL;
8456 cnt->prevInScaf = 0;
8457 prevCtg = thisCtg;
8460 counter++;
8464 freeStack ( scafStack1 );
8465 freeStack ( scafStack2 );
8466 fprintf ( stderr, "Report from checkScaf: %d scaffold segments broken.\n", counter );
8470 /*************************************************
8471 Function:
8472 detectBreakScaff
8473 Description:
8474 Detects and breaks the weak connection between contigs
8475 in scaffold according to the support of large insert size
8476 paired-end reads.
8477 Input:
8478 None.
8479 Output:
8480 None.
8481 Return:
8482 None.
8483 *************************************************/
8484 static void detectBreakScaff()
8486 unsigned int i, avgPE, scafLen, len, ctg, bal_ctg, prevCtg, thisCtg;
8487 long long peCounter, linkCounter;
8488 int num3, num5, weakPoint, tempCounter, j, t, counter = 0;
8489 int newInsNum;
8490 CONNECT * bindCnt, *cnt, *weakCnt;
8491 CONNECT * bal_cnt;
8492 STACK * scafStack1 = ( STACK * ) createStack ( 1000, sizeof ( unsigned int ) );
8493 STACK * scafStack2 = ( STACK * ) createStack ( 1000, sizeof ( unsigned int ) );
8495 for ( i = 1; i <= num_ctg; i++ )
8496 { contig_array[i].flag = 0; }
8498 for ( i = 1; i <= num_ctg; i++ )
8500 if ( contig_array[i].flag || contig_array[i].mask || !contig_array[i].downwardConnect )
8501 { continue; }
8503 bindCnt = getBindCnt ( i );
8505 if ( !bindCnt )
8506 { continue; }
8508 //first scan to get the average coverage by longer pe
8509 num5 = num3 = peCounter = linkCounter = 0;
8510 scafLen = contig_array[i].length;
8511 ctg = i;
8512 * ( unsigned int * ) darrayPut ( scaf5, num5++ ) = i;
8513 contig_array[i].flag = 1;
8514 contig_array[getTwinCtg ( i )].flag = 1;
8516 while ( bindCnt )
8518 linkCounter++;
8519 peCounter += bindCnt->maxGap;
8520 ctg = bindCnt->contigID;
8521 scafLen += bindCnt->gapLen + contig_array[ctg].length;
8522 * ( unsigned int * ) darrayPut ( scaf5, num5++ ) = ctg;
8523 bal_ctg = getTwinCtg ( ctg );
8524 contig_array[ctg].flag = 1;
8525 contig_array[bal_ctg].flag = 1;
8526 bindCnt = bindCnt->nextInScaf;
8529 ctg = getTwinCtg ( i );
8530 bindCnt = getBindCnt ( ctg );
8532 while ( bindCnt )
8534 linkCounter++;
8535 peCounter += bindCnt->maxGap;
8536 ctg = bindCnt->contigID;
8537 scafLen += bindCnt->gapLen + contig_array[ctg].length;
8538 bal_ctg = getTwinCtg ( ctg );
8539 contig_array[ctg].flag = 1;
8540 contig_array[bal_ctg].flag = 1;
8541 * ( unsigned int * ) darrayPut ( scaf3, num3++ ) = bal_ctg;
8542 bindCnt = bindCnt->nextInScaf;
8545 if ( scafLen < Insert_size )
8546 { continue; }
8548 avgPE = peCounter / linkCounter;
8550 if ( avgPE < 10 )
8551 { continue; }
8553 tempCounter = 0;
8555 for ( j = num3 - 1; j >= 0; j-- )
8556 * ( unsigned int * ) darrayPut ( tempArray, tempCounter++ ) =
8557 * ( unsigned int * ) darrayGet ( scaf3, j );
8559 for ( j = 0; j < num5; j++ )
8560 * ( unsigned int * ) darrayPut ( tempArray, tempCounter++ ) =
8561 * ( unsigned int * ) darrayGet ( scaf5, j );
8563 prevCtg = * ( unsigned int * ) darrayGet ( tempArray, 0 );
8564 weakCnt = NULL;
8565 weakPoint = 0;
8566 len = contig_array[prevCtg].length;
8568 for ( t = 1; t < tempCounter; t++ )
8570 newInsNum = 0;
8571 thisCtg = * ( unsigned int * ) darrayGet ( tempArray, t );
8572 cnt = contig_array[thisCtg].downwardConnect;
8574 while ( cnt )
8576 if ( cnt->newIns == 1 )
8578 ctg = cnt->contigID;
8579 bal_ctg = getTwinCtg ( ctg );
8580 newInsNum++;
8583 cnt = cnt->next;
8586 bal_cnt = contig_array[getTwinCtg ( thisCtg )].downwardConnect;
8587 cnt = getCntBetween ( prevCtg, thisCtg );
8589 if ( len < Insert_size )
8591 len += cnt->gapLen + contig_array[thisCtg].length;
8592 prevCtg = thisCtg;
8593 continue;
8595 else if ( len > scafLen - Insert_size )
8596 { break; }
8598 len += cnt->gapLen + contig_array[thisCtg].length;
8600 if ( contig_array[prevCtg].from_vt != contig_array[thisCtg].from_vt ||
8601 contig_array[prevCtg].indexInScaf > contig_array[thisCtg].indexInScaf )
8603 prevCtg = thisCtg;
8604 continue;
8607 if ( !weakCnt || weakCnt->maxGap > cnt->maxGap )
8609 weakCnt = cnt;
8610 weakPoint = t;
8613 prevCtg = thisCtg;
8616 if ( !weakCnt || ( weakCnt->maxGap > 2 && weakCnt->maxGap > avgPE / 5 ) )
8617 { continue; }
8619 prevCtg = * ( unsigned int * ) darrayGet ( tempArray, weakPoint - 1 );
8620 thisCtg = * ( unsigned int * ) darrayGet ( tempArray, weakPoint );
8622 if ( contig_array[prevCtg].from_vt != contig_array[thisCtg].from_vt ||
8623 contig_array[prevCtg].indexInScaf > contig_array[thisCtg].indexInScaf )
8625 printf ( "contig %d and %d not on the same scaff\n", prevCtg, thisCtg );
8626 continue;
8629 setConnectWP ( prevCtg, thisCtg, 1 );
8630 // set start and end to break down the scaffold
8631 int index1, index2;
8632 setBreakPoints ( tempArray, tempCounter, weakPoint - 1, &index1, &index2 );
8633 unsigned int start = * ( unsigned int * ) darrayGet ( tempArray, index1 );
8634 unsigned int finish = * ( unsigned int * ) darrayGet ( tempArray, index2 );
8635 int len1 = getScaffold ( getTwinCtg ( start ), scafStack1 );
8636 int len2 = getScaffold ( finish, scafStack2 );
8638 if ( len1 < Insert_size || len2 < Insert_size )
8639 { continue; }
8641 switch2twin ( scafStack1 );
8642 int flag1 = checkScafConsist ( scafStack1, len1, scafStack2, len2 );
8643 switch2twin ( scafStack1 );
8644 switch2twin ( scafStack2 );
8645 int flag2 = checkScafConsist ( scafStack2, len2, scafStack1, len1 );
8647 if ( !flag1 || !flag2 )
8649 changeScafBegin ( scafStack1, getTwinCtg ( start ) );
8650 changeScafEnd ( scafStack2, getTwinCtg ( finish ) );
8651 //unbind links
8652 unsigned int nextCtg = * ( unsigned int * ) darrayGet ( tempArray, index1 + 1 );
8653 thisCtg = * ( unsigned int * ) darrayGet ( tempArray, index1 );
8654 cnt = getCntBetween ( getTwinCtg ( nextCtg ), getTwinCtg ( thisCtg ) );
8656 if ( cnt->nextInScaf )
8658 prevCtg = getTwinCtg ( cnt->nextInScaf->contigID );
8659 cnt->nextInScaf->prevInScaf = 0;
8660 cnt = getCntBetween ( prevCtg, thisCtg );
8661 cnt->nextInScaf = NULL;
8664 prevCtg = * ( unsigned int * ) darrayGet ( tempArray, index2 - 1 );
8665 thisCtg = * ( unsigned int * ) darrayGet ( tempArray, index2 );
8666 cnt = getCntBetween ( prevCtg, thisCtg );
8668 if ( cnt->nextInScaf )
8670 nextCtg = cnt->nextInScaf->contigID;
8671 cnt->nextInScaf->prevInScaf = 0;
8672 cnt = getCntBetween ( getTwinCtg ( nextCtg ), getTwinCtg ( thisCtg ) );
8673 cnt->nextInScaf = NULL;
8676 prevCtg = * ( unsigned int * ) darrayGet ( tempArray, index1 );
8678 for ( t = index1 + 1; t <= index2; t++ )
8680 thisCtg = * ( unsigned int * ) darrayGet ( tempArray, t );
8681 cnt = getCntBetween ( prevCtg, thisCtg );
8682 cnt->mask = 1;
8683 cnt->nextInScaf = NULL;
8684 cnt->prevInScaf = 0;
8685 cnt = getCntBetween ( getTwinCtg ( thisCtg ), getTwinCtg ( prevCtg ) );
8686 cnt->mask = 1;
8687 cnt->nextInScaf = NULL;
8688 cnt->prevInScaf = 0;
8689 prevCtg = thisCtg;
8692 counter++;
8696 freeStack ( scafStack1 );
8697 freeStack ( scafStack2 );
8698 fprintf ( stderr, "Report from checkScaf: %d scaffold segments broken.\n", counter );
8701 /*************************************************
8702 Function:
8703 checkSimple
8704 Description:
8705 Checks whether there is a contig appearing more than once
8706 in array.
8707 Input:
8708 1. ctgArray: array of contigs
8709 2. count: contig number
8710 Output:
8711 None.
8712 Return:
8713 1 if no contig appeared more than once.
8714 *************************************************/
8715 static boolean checkSimple ( DARRAY * ctgArray, int count )
8717 int i;
8718 unsigned int ctg;
8720 for ( i = 0; i < count; i++ )
8722 ctg = * ( unsigned int * ) darrayGet ( ctgArray, i );
8723 contig_array[ctg].flag = 0;
8724 contig_array[getTwinCtg ( ctg )].flag = 0;
8727 for ( i = 0; i < count; i++ )
8729 ctg = * ( unsigned int * ) darrayGet ( ctgArray, i );
8731 if ( contig_array[ctg].flag )
8732 { return 0; }
8734 contig_array[ctg].flag = 1;
8735 contig_array[getTwinCtg ( ctg )].flag = 1;
8738 return 1;
8741 /*************************************************
8742 Function:
8743 checkCircle
8744 Description:
8745 Detects and masks contigs having circle connections.
8746 A --> B, B --> A.
8747 Input:
8748 None.
8749 Output:
8750 None.
8751 Return:
8752 None.
8753 *************************************************/
8754 static void checkCircle()
8756 unsigned int i, ctg;
8757 CONNECT * cn_temp1;
8758 int counter = 0;
8760 for ( i = 1; i <= num_ctg; i++ )
8762 cn_temp1 = contig_array[i].downwardConnect;
8764 while ( cn_temp1 )
8766 if ( cn_temp1->weak || cn_temp1->deleted )
8768 cn_temp1 = cn_temp1->next;
8769 continue;
8772 ctg = cn_temp1->contigID;
8774 if ( checkConnect ( ctg, i ) )
8776 counter++;
8777 maskContig ( i, 1 );
8778 maskContig ( ctg, 1 );
8781 cn_temp1 = cn_temp1->next;
8785 fprintf ( stderr, "%d circles removed.\n", counter );