11 #define CNBLOCKSIZE 10000
13 #define MAXCinBetween 200
15 #define MaxNodeInSub 10000
16 #define GapLowerBound -2000
17 #define GapUpperBound 300000
19 #define MaxCntNode 1000
22 static int DEBUG1
= 0;
23 static int DEBUG2
= 0;
25 static int bySmall
= 1;
27 static boolean static_f
= 0;
28 static double OverlapPercent
= 0.05;
29 static double ConflPercent
= 0.05;
30 static int MinWeakCut
= 3;
31 static int gapCounter
;
32 static int orienCounter
;
33 static int orienCounter2
;
34 static int throughCounter
;
36 static int breakPointAtRepeat
= 0;
37 static FILE * snp_fp
= NULL
;
39 static DARRAY
* solidArray
;
40 static DARRAY
* tempArray
;
42 static int solidCounter
;
44 static CTGinHEAP ctg4heapArray
[MaxNodeInSub
+ 1];
45 static unsigned int nodesInSub
[MaxNodeInSub
];
46 static int nodeDistance
[MaxNodeInSub
];
47 static int nodeCounter
;
50 static int dCntCounter1
;
51 static int uCntCounter1
;
52 static int dCntCounter2
;
53 static int uCntCounter2
;
54 static unsigned int dCntNodeArr1
[MaxCntNode
]; //downstream
55 static unsigned int uCntNodeArr1
[MaxCntNode
]; //upstream
56 static int dCntGapArr1
[MaxCntNode
];
57 static int uCntGapArr1
[MaxCntNode
];
58 static unsigned int dCntNodeArr2
[MaxCntNode
];
59 static unsigned int uCntNodeArr2
[MaxCntNode
];
60 static int dCntGapArr2
[MaxCntNode
];
61 static int uCntGapArr2
[MaxCntNode
];
63 static unsigned int * cntNodeArr
;
64 static int * cntGapArr
;
66 static unsigned int nodesInSubInOrder
[MaxNodeInSub
];
67 static int nodeDistanceInOrder
[MaxNodeInSub
];
69 static DARRAY
* scaf3
, *scaf5
;
70 static DARRAY
* gap3
, *gap5
;
72 static unsigned int downstreamCTG
[MAXCinBetween
];
73 static unsigned int upstreamCTG
[MAXCinBetween
];
74 static int dsCtgCounter
;
75 static int usCtgCounter
;
77 static CONNECT
* checkConnect ( unsigned int from_c
, unsigned int to_c
);
78 static int maskPuzzle ( int num_connect
, unsigned int contigLen
);
79 static void freezing();
80 static boolean
checkOverlapInBetween ( double tolerance
);
81 static int setConnectDelete ( unsigned int from_c
, unsigned int to_c
, char flag
, boolean cleanBinding
);
82 static int setConnectMask ( unsigned int from_c
, unsigned int to_c
, char mask
);
83 static int setConnectWP ( unsigned int from_c
, unsigned int to_c
, char flag
);
85 static void general_linearization ( boolean strict
);
86 static void debugging2();
87 static void smallScaf();
88 static void clearNewInsFlag();
89 static void detectBreakScaff();
90 static void detectBreakScaf();
91 static boolean
checkSimple ( DARRAY
* ctgArray
, int count
);
92 static void checkCircle();
94 /*************************************************
98 Checks the required files for scaffolding.
100 1. infile: prefix of graph
104 1 if all files were OK.
105 *************************************************/
106 boolean
checkFiles4Scaff ( char * infile
)
111 sprintf ( name
[0], "%s.Arc", infile
);
112 sprintf ( name
[1], "%s.contig", infile
);
113 sprintf ( name
[2], "%s.peGrads", infile
);
114 sprintf ( name
[3], "%s.preGraphBasic", infile
);
115 sprintf ( name
[4], "%s.updated.edge", infile
);
116 sprintf ( name
[5], "%s.readOnContig.gz", infile
);
120 filesOK
= check_file ( name
[i
] );
124 fprintf ( stderr
, "%s: no such file or empty file!\n\n", name
[i
] );
129 fprintf ( stderr
, "Files for scaffold construction are OK.\n\n" );
134 /*************************************************
138 Gets the only connection of current contig.
140 1. ctg: current contig
144 The pointer to connection if only one connection was found.
145 *************************************************/
146 static CONNECT
* getBindCnt ( unsigned int ctg
)
149 CONNECT
* bindCnt
= NULL
;
150 CONNECT
* temp_cnt
= NULL
;
151 CONNECT
* temp3_cnt
= NULL
;
155 ite_cnt
= contig_array
[ctg
].downwardConnect
;
159 if ( ite_cnt
->nextInScaf
)
165 if ( ite_cnt
->prevInScaf
)
171 if ( ite_cnt
->singleInScaf
)
177 ite_cnt
= ite_cnt
->next
;
183 if ( count
== 0 && count2
== 1 )
186 if ( count
== 0 && count2
== 0 && count3
== 1 )
187 { return temp3_cnt
; }
192 /*************************************************
196 Creats new relation between two non-connected contigs
197 according to toplogy sturcture supported by large insert size
200 1. sourceStart: contig that has connection relations to other two
202 2. originCnt: direct connection supported by large insert size
204 3. gap: calculated distance between two non-connected contigs
205 4. targetStart: left contig of those two non-connected contigs
206 5. targetStop: right contig of those two non-connected contigs
210 1 if creation successed.
211 *************************************************/
212 static boolean
createAnalogousCnt ( unsigned int sourceStart
,
213 CONNECT
* originCnt
, int gap
,
214 unsigned int targetStart
, unsigned int targetStop
)
217 unsigned int balTargetStart
= getTwinCtg ( targetStart
);
218 unsigned int balTargetStop
= getTwinCtg ( targetStop
);
219 unsigned int balSourceStart
= getTwinCtg ( sourceStart
);
220 unsigned int balSourceStop
= getTwinCtg ( originCnt
->contigID
);
221 boolean change_flag
= 0;
222 int add_weight
= originCnt
->weight
;
224 if ( gap
< GapLowerBound
)
227 originCnt
->deleted
= 1;
228 temp_cnt
= getCntBetween ( balSourceStop
, balSourceStart
);
229 temp_cnt
->deleted
= 1;
233 int startLen
= ( int ) contig_array
[targetStart
].length
;
234 int stopLen
= ( int ) contig_array
[targetStop
].length
;
236 if ( gap
< -overlaplen
)
238 if ( ( int ) contig_array
[targetStart
].length
- overlaplen
<= -gap
&& ( int ) contig_array
[targetStart
].length
> 5 * overlaplen
)
240 unsigned int tmp_id
= targetStart
;
241 targetStart
= targetStop
;
243 tmp_id
= balTargetStart
;
244 balTargetStart
= balTargetStop
;
245 balTargetStop
= tmp_id
;
256 originCnt
->deleted
= 1;
257 temp_cnt
= getCntBetween ( balSourceStop
, balSourceStart
);
258 temp_cnt
->deleted
= 1;
259 temp_cnt
= add1Connect ( targetStart
, targetStop
, gap
, add_weight
, 1 );
262 { temp_cnt
->inherit
= 1; }
264 temp_cnt
= add1Connect ( balTargetStop
, balTargetStart
, gap
, add_weight
, 1 );
267 { temp_cnt
->inherit
= 1; }
273 /*************************************************
277 Increases the connections weight supported by large insert size
278 paired-end reads between contigs in one scaffold.
280 1. fromCtg: left contig of pair contig connected by large insert
281 size paired-end reads
282 2. toCtg: right contig of pair contig connected by large insert
283 size paired-end reads
284 3. weight: weight of connection
289 *************************************************/
290 static void add1LongPEcov ( unsigned int fromCtg
, unsigned int toCtg
, int weight
)
292 //check if they are on the same scaff
293 if ( contig_array
[fromCtg
].from_vt
!= contig_array
[toCtg
].from_vt
||
294 contig_array
[fromCtg
].to_vt
!= contig_array
[toCtg
].to_vt
)
296 fprintf ( stderr
, "Warning from add1LongPEcov: contig %d and %d not on the same scaffold\n",
301 if ( contig_array
[fromCtg
].indexInScaf
>= contig_array
[toCtg
].indexInScaf
)
303 fprintf ( stderr
, "Warning from add1LongPEcov: wrong about order between contig %d and %d\n",
309 unsigned int prevCtg
= fromCtg
;
310 bindCnt
= getBindCnt ( fromCtg
);
314 if ( bindCnt
->maxGap
+ weight
<= 1000 )
315 { bindCnt
->maxGap
+= weight
; }
317 { bindCnt
->maxGap
= 1000; }
319 if ( fromCtg
== 0 && toCtg
== 0 )
320 fprintf ( stderr
, "link (%d %d ) covered by link (%d %d), wt %d\n",
321 prevCtg
, bindCnt
->contigID
, fromCtg
, toCtg
, weight
);
323 if ( bindCnt
->contigID
== toCtg
)
326 prevCtg
= bindCnt
->contigID
;
327 bindCnt
= bindCnt
->nextInScaf
;
330 unsigned int bal_fc
= getTwinCtg ( fromCtg
);
331 unsigned int bal_tc
= getTwinCtg ( toCtg
);
332 bindCnt
= getBindCnt ( bal_tc
);
337 if ( bindCnt
->maxGap
+ weight
<= 1000 )
338 { bindCnt
->maxGap
+= weight
; }
340 { bindCnt
->maxGap
= 1000; }
342 if ( fromCtg
== 0 && toCtg
== 0 )
343 fprintf ( stderr
, "link (%d %d ) covered by link (%d %d), wt %d\n",
344 prevCtg
, bindCnt
->contigID
, fromCtg
, toCtg
, weight
);
346 if ( bindCnt
->contigID
== bal_fc
)
349 prevCtg
= bindCnt
->contigID
;
350 bindCnt
= bindCnt
->nextInScaf
;
354 /*************************************************
358 Deals with connections supported by large insert size paired-end
359 reads. If a connecton is abnormal, or the two connected contigs
360 are in the same scaffold and the distance between them is normal,
361 set the connection's status to 'deleted'. If the two connected
362 contigs locate in different scaffolds, trys to creat a connection
363 between these two scaffolds.
370 *************************************************/
371 static void downSlide()
375 CONNECT
* ite_cnt
, *bindCnt
, *temp_cnt
;
376 unsigned int bottomCtg
, topCtg
, bal_i
;
377 unsigned int targetCtg
, bal_target
;
378 boolean getThrough
, orienConflict
;
379 int slideLen
, slideLen2
;
380 int slideexchange1
= 0, slideexchange2
= 0;
381 orienCounter
= throughCounter
= 0;
383 int slidebreak1
= 0, slidebreak2
= 0, slidebreak
= 0, recoverCnt
= 0;
385 for ( i
= 1; i
<= num_ctg
; i
++ )
387 if ( contig_array
[i
].mask
|| !contig_array
[i
].downwardConnect
)
390 bindCnt
= getBindCnt ( i
);
395 bal_i
= getTwinCtg ( i
);
399 //find the last unmasked contig in this binding
400 while ( bindCnt
->nextInScaf
)
402 len
+= bindCnt
->gapLen
+ contig_array
[bindCnt
->contigID
].length
;
404 if ( contig_array
[bindCnt
->contigID
].mask
== 0 )
406 bottomCtg
= bindCnt
->contigID
;
410 bindCnt
= bindCnt
->nextInScaf
;
413 len
+= bindCnt
->gapLen
+ contig_array
[bindCnt
->contigID
].length
;
415 if ( contig_array
[bindCnt
->contigID
].mask
== 0 || bottomCtg
== 0 )
417 bottomCtg
= bindCnt
->contigID
;
421 //check each connetion from long pair ends
422 ite_cnt
= contig_array
[i
].downwardConnect
;
426 if ( ite_cnt
->deleted
|| ite_cnt
->mask
|| ite_cnt
->singleInScaf
427 || ite_cnt
->nextInScaf
|| ite_cnt
->prevInScaf
|| ite_cnt
->inherit
)
429 ite_cnt
= ite_cnt
->next
;
433 targetCtg
= ite_cnt
->contigID
;
435 if ( contig_array
[i
].from_vt
== contig_array
[targetCtg
].from_vt
) // on the same scaff
437 if ( contig_array
[i
].indexInScaf
> contig_array
[targetCtg
].indexInScaf
)
440 { throughCounter
++; }
442 setConnectDelete ( i
, ite_cnt
->contigID
, 1, 0 );
443 ite_cnt
= ite_cnt
->next
;
447 if ( ( ins_var_idx
> 0 ) && ( slideLen
> Insert_size
* ins_var_idx
) )
449 setConnectDelete ( i
, ite_cnt
->contigID
, 1, 0 );
450 ite_cnt
= ite_cnt
->next
;
455 //contig i and targetctg is not in same scaffold
456 //check if this connection conflicts with previous scaffold orientationally
457 temp_cnt
= getBindCnt ( targetCtg
);
462 while ( temp_cnt
->nextInScaf
)
464 if ( temp_cnt
->contigID
== i
)
467 fprintf ( stderr
, "Warning from downSlide: still on the same scaff: %d and %d\n"
469 fprintf ( stderr
, "on scaff %d and %d\n",
470 contig_array
[i
].from_vt
, contig_array
[targetCtg
].from_vt
);
471 fprintf ( stderr
, "on bal_scaff %d and %d\n",
472 contig_array
[bal_target
].to_vt
, contig_array
[bal_i
].to_vt
);
476 temp_cnt
= temp_cnt
->nextInScaf
;
479 if ( temp_cnt
->contigID
== i
)
480 { orienConflict
= 1; }
487 setConnectDelete ( i
, ite_cnt
->contigID
, 1, 0 );
488 ite_cnt
= ite_cnt
->next
;
492 //connection path to i was not found
493 //find the most top contig along previous scaffold starting with the target contig of this connection
494 bal_target
= getTwinCtg ( targetCtg
);
497 if ( contig_array
[targetCtg
].mask
== 0 )
506 temp_cnt
= getBindCnt ( bal_target
);
507 getThrough
= len
= 0;
512 //find the last contig in this binding
513 while ( temp_cnt
->nextInScaf
)
515 //check if this route reaches bal_i
516 if ( temp_cnt
->contigID
== bal_i
)
518 fprintf ( stderr
, "Warning from downSlide: (B) still on the same scaff: %d and %d (%d and %d)\n",
519 i
, targetCtg
, bal_target
, bal_i
);
520 fprintf ( stderr
, "on scaff %d and %d\n",
521 contig_array
[i
].from_vt
, contig_array
[targetCtg
].from_vt
);
522 fprintf ( stderr
, "on bal_scaff %d and %d\n",
523 contig_array
[bal_target
].to_vt
, contig_array
[bal_i
].to_vt
);
528 len
+= temp_cnt
->gapLen
+ contig_array
[temp_cnt
->contigID
].length
;
530 if ( contig_array
[temp_cnt
->contigID
].mask
== 0 )
532 topCtg
= temp_cnt
->contigID
;
536 if ( ( ins_var_idx
> 0 ) && ( len
> ins_var_idx
* Insert_size
) )
542 temp_cnt
= temp_cnt
->nextInScaf
;
545 len
+= temp_cnt
->gapLen
+ contig_array
[temp_cnt
->contigID
].length
;
547 if ( contig_array
[temp_cnt
->contigID
].mask
== 0 || topCtg
== 0 )
549 topCtg
= temp_cnt
->contigID
;
553 if ( slidebreak
== 1 )
555 setConnectDelete ( i
, ite_cnt
->contigID
, 1, 0 );
556 ite_cnt
= ite_cnt
->next
;
561 if ( temp_cnt
->contigID
== bal_i
)
564 { topCtg
= getTwinCtg ( topCtg
); }
567 { topCtg
= targetCtg
; }
572 setConnectDelete ( i
, ite_cnt
->contigID
, 1, 0 );
573 ite_cnt
= ite_cnt
->next
;
577 //connection path to bal_id was not found
579 gap
= ite_cnt
->gapLen
- slideLen
- slideLen2
;
580 dh_cnt
= getCntBetween ( topCtg
, bottomCtg
);
582 if ( dh_cnt
&& dh_cnt
->weight
>= MinWeakCut
)
585 setConnectDelete ( topCtg
, bottomCtg
, 0, 0 );
586 setConnectMask ( topCtg
, bottomCtg
, 0 );
587 ite_cnt
= ite_cnt
->next
;
591 //add a connection between bottomCtg and topCtg
592 if ( bottomCtg
!= topCtg
&& ! ( i
== bottomCtg
&& targetCtg
== topCtg
) )
594 boolean creat_flag
= createAnalogousCnt ( i
, ite_cnt
, gap
, bottomCtg
, topCtg
);
597 { slideexchange2
++; }
599 if ( contig_array
[bottomCtg
].mask
|| contig_array
[topCtg
].mask
)
600 { fprintf ( stderr
, "downSlide to masked contig, bottomCtg %u[mask %d], topCtg %u[mask %d]\n", bottomCtg
, contig_array
[bottomCtg
].mask
, topCtg
, contig_array
[topCtg
].mask
); }
603 ite_cnt
= ite_cnt
->next
;
607 // fprintf(stderr,"downSliding stat:\norienConflict\tfall_inside\tslidebreak1\tslidebreak2\trecoverCnt\tslideexchange1\tslideexchange2\n%d\t%d\t%d\t%d\t%d\t%d\t%d\n",orienCounter, throughCounter, slidebreak1, slidebreak2, recoverCnt, slideexchange1, slideexchange2);
608 fprintf ( stderr
, "Add large insert size PE links: %d orientation-conflict links, %d contigs acrossed by normal links.\n", orienCounter
, throughCounter
);
611 /*************************************************
615 Sets the downstream connection of current connection.
617 1. cnt: current connection
618 2. nextCnt: next connection
622 1 if setting successed.
623 *************************************************/
624 static boolean
setNextInScaf ( CONNECT
* cnt
, CONNECT
* nextCnt
)
628 fprintf ( stderr
, "setNextInScaf: empty pointer\n" );
634 cnt
->nextInScaf
= nextCnt
;
638 if ( cnt
->mask
|| cnt
->deleted
)
640 fprintf ( stderr
, "setNextInScaf: cnt is masked or deleted\n" );
644 if ( nextCnt
->deleted
|| nextCnt
->mask
)
646 fprintf ( stderr
, "setNextInScaf: nextCnt is masked or deleted\n" );
650 cnt
->nextInScaf
= nextCnt
;
654 /*************************************************
658 Sets the upstream connection status of current connection.
660 1. cnt: current connection
665 1 if setting successed.
666 *************************************************/
667 static boolean
setPrevInScaf ( CONNECT
* cnt
, boolean flag
)
671 fprintf ( stderr
, "setPrevInScaf: empty pointer\n" );
677 cnt
->prevInScaf
= flag
;
681 if ( cnt
->mask
|| cnt
->deleted
)
683 fprintf ( stderr
, "setPrevInScaf: cnt is masked or deleted\n" );
687 cnt
->prevInScaf
= flag
;
692 /*************************************************
696 Substitutes the upstream connection of current connection with
702 1. origin: original upstream connection of current connection
704 2. from_c_new: new upstream contig of current contig (branch_c)
709 *************************************************/
710 static void substitueUSinScaf ( CONNECT
* origin
, unsigned int from_c_new
)
712 if ( !origin
|| !origin
->nextInScaf
)
715 unsigned int branch_c
, to_c
;
716 unsigned int bal_branch_c
, bal_to_c
;
717 unsigned int bal_from_c_new
= getTwinCtg ( from_c_new
);
718 CONNECT
* bal_origin
, *bal_nextCNT
, *prevCNT
, *bal_prevCNT
;
719 branch_c
= origin
->contigID
;
720 to_c
= origin
->nextInScaf
->contigID
;
721 bal_branch_c
= getTwinCtg ( branch_c
);
722 bal_to_c
= getTwinCtg ( to_c
);
723 prevCNT
= checkConnect ( from_c_new
, branch_c
);
724 bal_nextCNT
= checkConnect ( bal_to_c
, bal_branch_c
);
728 fprintf ( stderr
, "substitueUSinScaf: no connect between %d and %d\n", bal_to_c
, bal_branch_c
);
732 bal_origin
= bal_nextCNT
->nextInScaf
;
733 bal_prevCNT
= checkConnect ( bal_branch_c
, bal_from_c_new
);
734 setPrevInScaf ( bal_nextCNT
->nextInScaf
, 0 );
735 setNextInScaf ( prevCNT
, origin
->nextInScaf
);
736 setNextInScaf ( bal_nextCNT
, bal_prevCNT
);
737 setPrevInScaf ( bal_prevCNT
, 1 );
738 setNextInScaf ( origin
, NULL
);
739 setPrevInScaf ( bal_origin
, 0 );
743 /*************************************************
747 Substitutes the downstream connection of current connection with
750 from_c -> branch_c ->
753 1. origin: original downstream connection of current connection
755 2. branch_c: current contig
756 3. to_c_new: new downstream contig of current contig
761 *************************************************/
762 static void substitueDSinScaf ( CONNECT
* origin
, unsigned int branch_c
, unsigned int to_c_new
)
764 if ( !origin
|| !origin
->prevInScaf
)
768 unsigned int bal_branch_c
, bal_to_c
, bal_to_c_new
;
769 unsigned int from_c
, bal_from_c
;
770 CONNECT
* bal_origin
, *prevCNT
, *bal_prevCNT
;
771 CONNECT
* nextCNT
, *bal_nextCNT
;
772 to_c
= origin
->contigID
;
773 bal_branch_c
= getTwinCtg ( branch_c
);
774 bal_to_c
= getTwinCtg ( to_c
);
775 bal_origin
= getCntBetween ( bal_to_c
, bal_branch_c
);
779 fprintf ( stderr
, "substitueDSinScaf: no connect between %d and %d\n", bal_to_c
, bal_branch_c
);
783 if ( bal_origin
->nextInScaf
)
784 { bal_from_c
= bal_origin
->nextInScaf
->contigID
; }
787 fprintf ( stderr
, "next null! %d\t%d\n", bal_to_c
, bal_branch_c
);
791 bal_from_c
= bal_origin
->nextInScaf
->contigID
;
792 from_c
= getTwinCtg ( bal_from_c
);
793 bal_to_c_new
= getTwinCtg ( to_c_new
);
794 prevCNT
= checkConnect ( from_c
, branch_c
);
795 nextCNT
= checkConnect ( branch_c
, to_c_new
);
796 setNextInScaf ( prevCNT
, nextCNT
);
797 setPrevInScaf ( nextCNT
, 1 );
798 bal_nextCNT
= checkConnect ( bal_to_c_new
, bal_branch_c
);
799 bal_prevCNT
= checkConnect ( bal_branch_c
, bal_from_c
);
800 setNextInScaf ( bal_nextCNT
, bal_prevCNT
);
801 setPrevInScaf ( origin
, 0 );
802 setNextInScaf ( bal_origin
, NULL
);
805 /*************************************************
809 Calculates the valid connections number of a contig.
810 1. Check whether the contig has upstream conntcion and
811 downstream connection.
812 2. Calculates the non-deleted and non-masked connections number.
815 2. preCNT: upstream connection
819 0 if contig did NOT have connection to other contigs.
820 1 if contig had upstream conntcion and downstream connection.
821 Non-deleted and non-masked connections number otherwise.
822 *************************************************/
823 static int validConnect ( unsigned int ctg
, CONNECT
* preCNT
)
825 if ( preCNT
&& preCNT
->nextInScaf
)
831 if ( !contig_array
[ctg
].downwardConnect
)
834 cn_temp
= contig_array
[ctg
].downwardConnect
;
838 if ( !cn_temp
->deleted
&& !cn_temp
->mask
)
841 cn_temp
= cn_temp
->next
;
847 /*************************************************
851 Gets the connection to next contig of current contig through constructed
852 connection in scaffold or direct connection to other contigs.
853 In the latter case, one and only one non-deleted and non-masked
854 connection is qualified.
856 1. ctg: current contig
857 2. preCNT: upstream connection of current contig
859 1. exception: indicate that whether found connection through
860 constructed connection in scaffold is deleted or masked
862 Pointer to qualified connection or NULL.
863 *************************************************/
864 static CONNECT
* getNextContig ( unsigned int ctg
, CONNECT
* preCNT
, boolean
* exception
)
866 CONNECT
* cn_temp
, *retCNT
= NULL
, *dh_cnt
;
867 int count
= 0, valid_in
;
868 unsigned int nextCtg
, bal_ctg
;
871 if ( preCNT
&& preCNT
->nextInScaf
)
873 if ( preCNT
->contigID
!= ctg
)
874 { fprintf ( stderr
, "pre cnt does not lead to %d\n", ctg
); }
876 nextCtg
= preCNT
->nextInScaf
->contigID
;
877 cn_temp
= getCntBetween ( ctg
, nextCtg
);
878 dh_cnt
= getCntBetween ( getTwinCtg ( nextCtg
), getTwinCtg ( ctg
) );
880 if ( cn_temp
&& ( cn_temp
->mask
|| cn_temp
->deleted
) )
882 int id1
= 0, id2
= 0;
884 if ( dh_cnt
->nextInScaf
)
886 id1
= dh_cnt
->nextInScaf
->contigID
;
887 id2
= getTwinCtg ( dh_cnt
->nextInScaf
->contigID
);
890 if ( !cn_temp
->prevInScaf
)
891 { fprintf ( stderr
, "not even has a prevInScaf %d and %d, %d and %d with before %d with twin %d\n", ctg
, nextCtg
, getTwinCtg ( nextCtg
), getTwinCtg ( ctg
), id1
, id2
); }
893 cn_temp
= getCntBetween ( getTwinCtg ( nextCtg
),
894 getTwinCtg ( ctg
) );
896 if ( !cn_temp
->nextInScaf
)
897 { fprintf ( stderr
, "its twin cnt not has a nextInScaf\n" ); }
903 { return preCNT
->nextInScaf
; }
906 bal_ctg
= getTwinCtg ( ctg
);
907 valid_in
= validConnect ( bal_ctg
, NULL
);
912 if ( !contig_array
[ctg
].downwardConnect
)
915 cn_temp
= contig_array
[ctg
].downwardConnect
;
919 if ( cn_temp
->mask
|| cn_temp
->deleted
)
921 cn_temp
= cn_temp
->next
;
928 { retCNT
= cn_temp
; }
929 else if ( count
== 2 )
932 cn_temp
= cn_temp
->next
;
938 /*************************************************
942 Check whether the connection between two contigs exists and
943 the connection's status.
945 1. from_c: left contig
946 2. to_c: right contig
950 Pointer to qualified connection or NULL.
951 *************************************************/
952 static CONNECT
* checkConnect ( unsigned int from_c
, unsigned int to_c
)
954 CONNECT
* cn_temp
= getCntBetween ( from_c
, to_c
);
959 if ( !cn_temp
->mask
&& !cn_temp
->deleted
)
963 //printf("masked or deleted: %d\t%d\t%d\t%d\n",from_c,to_c,cn_temp->mask,cn_temp->deleted);
967 /*************************************************
971 Sets the "mask" status of connection between two contigs and
972 releated upstream and downstream connection's status if these
975 1. from_c: left contig
976 2. to_c: right contig
981 1 if setting successed.
982 *************************************************/
983 static int setConnectMask ( unsigned int from_c
, unsigned int to_c
, char mask
)
985 CONNECT
* cn_temp
, *cn_bal
, *cn_ds
, *cn_us
;
986 unsigned int bal_fc
= getTwinCtg ( from_c
);
987 unsigned int bal_tc
= getTwinCtg ( to_c
);
988 unsigned int ctg3
, bal_ctg3
;
989 cn_temp
= getCntBetween ( from_c
, to_c
);
990 cn_bal
= getCntBetween ( bal_tc
, bal_fc
);
992 if ( !cn_temp
|| !cn_bal
)
997 cn_temp
->mask
= mask
;
1003 if ( cn_temp
->nextInScaf
) //undo the binding
1005 setPrevInScaf ( cn_temp
->nextInScaf
, 0 );
1006 ctg3
= cn_temp
->nextInScaf
->contigID
;
1007 setNextInScaf ( cn_temp
, NULL
);
1008 bal_ctg3
= getTwinCtg ( ctg3
);
1009 cn_ds
= getCntBetween ( bal_ctg3
, bal_tc
);
1010 setNextInScaf ( cn_ds
, NULL
);
1011 setPrevInScaf ( cn_bal
, 0 );
1014 // ctg3 -> from_c -> to_c
1015 // bal_ctg3 <- bal_fc <- bal_tc
1016 if ( cn_bal
->nextInScaf
)
1018 setPrevInScaf ( cn_bal
->nextInScaf
, 0 );
1019 bal_ctg3
= cn_bal
->nextInScaf
->contigID
;
1020 setNextInScaf ( cn_bal
, NULL
);
1021 ctg3
= getTwinCtg ( bal_ctg3
);
1022 cn_us
= getCntBetween ( ctg3
, from_c
);
1023 setNextInScaf ( cn_us
, NULL
);
1024 setPrevInScaf ( cn_temp
, 0 );
1030 /*************************************************
1034 Sets the "used" status of connection between two contigs if the
1037 1. from_c: left contig
1038 2. to_c: right contig
1043 1 if setting successed.
1044 *************************************************/
1045 static boolean
setConnectUsed ( unsigned int from_c
, unsigned int to_c
, char flag
)
1047 CONNECT
* cn_temp
, *cn_bal
;
1048 unsigned int bal_fc
= getTwinCtg ( from_c
);
1049 unsigned int bal_tc
= getTwinCtg ( to_c
);
1050 cn_temp
= getCntBetween ( from_c
, to_c
);
1051 cn_bal
= getCntBetween ( bal_tc
, bal_fc
);
1053 if ( !cn_temp
|| !cn_bal
)
1058 cn_temp
->used
= flag
;
1059 cn_bal
->used
= flag
;
1063 /*************************************************
1067 Sets the "weakPoint" status of connection between two contigs if the
1070 1. from_c: left contig
1071 2. to_c: right contig
1076 1 if setting successed.
1077 *************************************************/
1078 static int setConnectWP ( unsigned int from_c
, unsigned int to_c
, char flag
)
1080 CONNECT
* cn_temp
, *cn_bal
;
1081 unsigned int bal_fc
= getTwinCtg ( from_c
);
1082 unsigned int bal_tc
= getTwinCtg ( to_c
);
1083 cn_temp
= getCntBetween ( from_c
, to_c
);
1084 cn_bal
= getCntBetween ( bal_tc
, bal_fc
);
1086 if ( !cn_temp
|| !cn_bal
)
1091 cn_temp
->weakPoint
= flag
;
1092 cn_bal
->weakPoint
= flag
;
1096 /*************************************************
1100 Sets the "deleted" status of connection between two contigs if the
1101 connection exists. Cleans the related upstream and downstream
1102 connections if required.
1104 1. from_c: left contig
1105 2. to_c: right contig
1107 4. cleanBinding: indicator of whether cleaning related connections
1111 1 if setting successed.
1112 *************************************************/
1113 static int setConnectDelete ( unsigned int from_c
, unsigned int to_c
, char flag
, boolean cleanBinding
)
1115 CONNECT
* cn_temp
, *cn_bal
;
1116 unsigned int bal_fc
= getTwinCtg ( from_c
);
1117 unsigned int bal_tc
= getTwinCtg ( to_c
);
1118 cn_temp
= getCntBetween ( from_c
, to_c
);
1119 cn_bal
= getCntBetween ( bal_tc
, bal_fc
);
1121 if ( !cn_temp
|| !cn_bal
)
1126 cn_temp
->deleted
= flag
;
1127 cn_bal
->deleted
= flag
;
1134 cn_temp
->prevInScaf
= 0;
1135 cn_temp
->nextInScaf
= NULL
;
1136 cn_bal
->prevInScaf
= 0;
1137 cn_bal
->nextInScaf
= NULL
;
1143 /*************************************************
1147 Sets the "mask" status of target contig and its connections to
1148 other contigs except those connections in scaffold.
1150 1. ctg: target contig
1156 *************************************************/
1157 static void maskContig ( unsigned int ctg
, boolean flag
)
1159 unsigned int bal_ctg
, ctg2
, bal_ctg2
;
1161 bal_ctg
= getTwinCtg ( ctg
);
1162 cn_temp
= contig_array
[ctg
].downwardConnect
;
1166 if ( cn_temp
->mask
|| cn_temp
->prevInScaf
|| cn_temp
->nextInScaf
|| cn_temp
->singleInScaf
)
1168 cn_temp
= cn_temp
->next
;
1172 ctg2
= cn_temp
->contigID
;
1173 setConnectMask ( ctg
, ctg2
, flag
);
1174 cn_temp
= cn_temp
->next
;
1177 // bal_ctg2 <- bal_ctg
1178 cn_temp
= contig_array
[bal_ctg
].downwardConnect
;
1182 if ( cn_temp
->mask
|| cn_temp
->prevInScaf
|| cn_temp
->nextInScaf
|| cn_temp
->singleInScaf
)
1184 cn_temp
= cn_temp
->next
;
1188 bal_ctg2
= cn_temp
->contigID
;
1189 setConnectMask ( bal_ctg
, bal_ctg2
, flag
);
1190 cn_temp
= cn_temp
->next
;
1193 contig_array
[ctg
].mask
= flag
;
1194 contig_array
[bal_ctg
].mask
= flag
;
1198 /*************************************************
1202 For contigs longer than "contigLen", masks them if they have
1203 more than one valid connection in either upstream of downstream
1204 direction, and have more than 'num_connect' valid connection.
1206 1. num_connect: allowed valid connection number cutoff
1207 2. contigLen: contig length cutoff
1211 Masked contig number.
1212 *************************************************/
1213 static int maskPuzzle ( int num_connect
, unsigned int contigLen
)
1215 int in_num
, out_num
, flag
= 0, puzzleCounter
= 0;
1216 unsigned int i
, bal_i
;
1217 fprintf ( stderr
, "Start to mask puzzles.\n" );
1219 for ( i
= 1; i
<= num_ctg
; i
++ )
1221 if ( contigLen
&& contig_array
[i
].length
> contigLen
)
1224 if ( contig_array
[i
].mask
)
1227 bal_i
= getTwinCtg ( i
);
1228 in_num
= validConnect ( bal_i
, NULL
);
1229 out_num
= validConnect ( i
, NULL
);
1231 if ( ( in_num
> 1 || out_num
> 1 ) && ( in_num
+ out_num
>= num_connect
) )
1234 maskContig ( i
, 1 );
1237 // upstream connection in scaffold
1238 in_num
= validConnect ( bal_i
, NULL
);
1239 // downstream connection in scaffold
1240 out_num
= validConnect ( i
, NULL
);
1242 if ( in_num
> 1 || out_num
> 1 )
1248 if ( isSmallerThanTwin ( i
) )
1252 fprintf ( stderr
, " Masked contigs %d\n Remained puzzles %d\n", flag
, puzzleCounter
);
1256 /*************************************************
1260 Updates the status of contigs' connections according to "cut_off",
1261 and then mask those contigs which form circle structure.
1266 1. cut_off: weight cutoff of connection
1271 *************************************************/
1272 static void deleteWeakCnt ( int cut_off
)
1276 int weaks
= 0, counter
= 0;
1278 for ( i
= 1; i
<= num_ctg
; i
++ )
1280 cn_temp1
= contig_array
[i
].downwardConnect
;
1284 if ( !cn_temp1
->mask
&& !cn_temp1
->deleted
&& !cn_temp1
->nextInScaf
1285 && !cn_temp1
->singleInScaf
&& !cn_temp1
->prevInScaf
)
1290 if ( cn_temp1
->weak
&& cn_temp1
->deleted
&& cn_temp1
->weight
>= cut_off
)
1292 cn_temp1
->deleted
= 0;
1295 else if ( !cn_temp1
->deleted
&& cn_temp1
->weight
> 0 && cn_temp1
->weight
< cut_off
1296 && !cn_temp1
->nextInScaf
&& !cn_temp1
->prevInScaf
)
1298 cn_temp1
->deleted
= 1;
1301 if ( cn_temp1
->singleInScaf
)
1302 { cn_temp1
->singleInScaf
= 0; }
1304 if ( !cn_temp1
->mask
)
1308 cn_temp1
= cn_temp1
->next
;
1313 { fprintf ( stderr
, " Active connections %d\n Weak connections %d\n Weak ratio %.1f%%\n", counter
, weaks
, ( float ) weaks
/ counter
* 100 ); }
1319 /*************************************************
1323 For three contigs A, B and C, connections A->B and A->C exist,
1324 check if there is a connection path connecting B and C or
1325 whether a new connection can be created between B and C.
1331 1. starter: leftmost contig
1332 2. cnt2c1: connection A->B
1333 3. c2: rightmost contig
1334 4. min_dis: minimum distance between middle and rightmost contig
1335 5. max_dis: maximum distance between middle and rightmost contig
1339 -1 if B and C were the same contig.
1340 1 if connection path was found or connection was created.
1342 *************************************************/
1343 static int linearC2C ( unsigned int starter
, CONNECT
* cnt2c1
, unsigned int c2
, int min_dis
, int max_dis
)
1345 int out_num
, in_num
;
1346 CONNECT
* prevCNT
, *cnt
, *cn_temp
;
1347 unsigned int c1
, bal_c1
, ctg
, bal_c2
;
1349 unsigned int bal_start
= getTwinCtg ( starter
);
1351 c1
= cnt2c1
->contigID
;
1355 fprintf ( stderr
, "linearC2C: c1(%d) and c2(%d) are the same contig.\n", c1
, c2
);
1359 bal_c1
= getTwinCtg ( c1
);
1362 downstreamCTG
[dsCtgCounter
++] = c1
;
1363 bal_c2
= getTwinCtg ( c2
);
1364 upstreamCTG
[usCtgCounter
++] = bal_c2
;
1365 // check if c1 is linearly connected to c2 by pe connections
1366 cnt
= prevCNT
= cnt2c1
;
1368 while ( ( cnt
= getNextContig ( c1
, prevCNT
, &excep
) ) != NULL
)
1371 len
+= cnt
->gapLen
+ contig_array
[c1
].length
;
1376 return 1; //is interleaving.
1379 if ( len
> max_dis
|| c1
== starter
|| c1
== bal_start
)
1382 downstreamCTG
[dsCtgCounter
++] = c1
;
1384 if ( dsCtgCounter
>= MAXCinBetween
)
1386 fprintf ( stderr
, "%d downstream contigs, start at %d, max_dis %d, current dis %d\n"
1387 , dsCtgCounter
, starter
, max_dis
, len
);
1394 out_num
= validConnect ( c1
, NULL
); //new c1 should have no outgoing link.
1399 //find the most upstream contig to c2
1400 cnt
= prevCNT
= NULL
;
1403 while ( ( cnt
= getNextContig ( ctg
, prevCNT
, &excep
) ) != NULL
)
1405 ctg
= cnt
->contigID
;
1406 len
+= cnt
->gapLen
+ contig_array
[ctg
].length
;
1408 if ( len
> max_dis
|| ctg
== starter
|| ctg
== bal_start
)
1412 upstreamCTG
[usCtgCounter
++] = ctg
;
1414 if ( usCtgCounter
>= MAXCinBetween
)
1416 fprintf ( stderr
, "%d upstream contigs, start at %d, max_dis %d, current dis %d\n"
1417 , usCtgCounter
, starter
, max_dis
, len
);
1422 if ( dsCtgCounter
+ usCtgCounter
> MAXCinBetween
)
1424 fprintf ( stderr
, "%d downstream and %d upstream contigs.\n", dsCtgCounter
, usCtgCounter
);
1428 out_num
= validConnect ( ctg
, NULL
); //new c2 have no incoming link.
1435 c2
= getTwinCtg ( ctg
);
1439 if ( c1
== c2
|| c1
== ctg
|| max_dis
< 0 )
1443 cn_temp
= getCntBetween ( c1
, c2
); //have connection between new c1 and new c2
1447 setConnectMask ( c1
, c2
, 0 );
1448 setConnectDelete ( c1
, c2
, 0, 0 );
1452 int oldsize
= usCtgCounter
;
1454 while ( getCntBetween ( c2
, c1
) && usCtgCounter
> 1 )
1457 c2
= getTwinCtg ( upstreamCTG
[usCtgCounter
] );
1460 if ( usCtgCounter
!= oldsize
)
1462 unsigned int prev_c2
= upstreamCTG
[usCtgCounter
+ 1];
1463 unsigned int bal_prev_c2
= getTwinCtg ( prev_c2
);
1464 setConnectMask ( bal_prev_c2
, c2
, 1 );
1465 setConnectMask ( bal_prev_c2
, c2
, 0 );
1466 int i
= usCtgCounter
+ 1;
1468 for ( ; i
<= oldsize
; i
++ )
1470 contig_array
[upstreamCTG
[i
]].from_vt
= prev_c2
;
1471 contig_array
[getTwinCtg ( upstreamCTG
[i
] )].to_vt
= bal_prev_c2
;
1474 if ( ( cn_temp
= getCntBetween ( c1
, c2
) ) != NULL
)
1476 setConnectMask ( c1
, c2
, 0 );
1477 setConnectDelete ( c1
, c2
, 0, 0 );
1482 len
= ( min_dis
+ max_dis
) / 2 >= 0 ? ( min_dis
+ max_dis
) / 2 : 0;
1483 cn_temp
= allocateCN ( c2
, len
);
1485 if ( cntLookupTable
)
1486 { putCnt2LookupTable ( c1
, cn_temp
); }
1488 cn_temp
->weight
= 0; // special connect from the original graph
1489 cn_temp
->next
= contig_array
[c1
].downwardConnect
;
1490 contig_array
[c1
].downwardConnect
= cn_temp
;
1491 bal_c1
= getTwinCtg ( c1
);
1492 bal_c2
= getTwinCtg ( c2
);
1493 cn_temp
= allocateCN ( bal_c1
, len
);
1495 if ( cntLookupTable
)
1496 { putCnt2LookupTable ( bal_c2
, cn_temp
); }
1498 cn_temp
->weight
= 0; // special connect from the original graph
1499 cn_temp
->next
= contig_array
[bal_c2
].downwardConnect
;
1500 contig_array
[bal_c2
].downwardConnect
= cn_temp
;
1505 /*************************************************
1509 Concatenates upstream and downstream contig arrays together.
1516 *************************************************/
1517 static void catUsDsContig()
1521 for ( i
= 0; i
< dsCtgCounter
; i
++ )
1522 { * ( unsigned int * ) darrayPut ( solidArray
, i
) = downstreamCTG
[i
]; }
1524 for ( i
= usCtgCounter
; i
>= 0; i
-- )
1526 * ( unsigned int * ) darrayPut ( solidArray
, dsCtgCounter
++ ) = getTwinCtg ( upstreamCTG
[i
] );
1529 solidCounter
= dsCtgCounter
;
1533 /*************************************************
1537 Constructs scaffolds by binding connection relation among contigs
1545 *************************************************/
1546 static void consolidate()
1549 CONNECT
* prevCNT
= NULL
;
1551 unsigned int to_ctg
;
1552 unsigned int from_ctg
= * ( unsigned int * ) darrayGet ( solidArray
, 0 );
1554 for ( i
= 1; i
< solidCounter
; i
++ )
1556 to_ctg
= * ( unsigned int * ) darrayGet ( solidArray
, i
);
1557 cnt
= checkConnect ( from_ctg
, to_ctg
);
1561 fprintf ( stderr
, "consolidate A: no connect from %d to %d\n",
1564 for ( j
= 0; j
< solidCounter
; j
++ )
1565 { fprintf ( stderr
, "%d-->", * ( unsigned int * ) darrayGet ( solidArray
, j
) ); }
1567 fprintf ( stderr
, "\n" );
1571 cnt
->singleInScaf
= solidCounter
== 2 ? 1 : 0;
1575 setNextInScaf ( prevCNT
, cnt
);
1576 setPrevInScaf ( cnt
, 1 );
1583 //the reverse complementary path
1584 from_ctg
= getTwinCtg ( * ( unsigned int * ) darrayGet ( solidArray
, solidCounter
- 1 ) );
1587 for ( i
= solidCounter
- 2; i
>= 0; i
-- )
1589 to_ctg
= getTwinCtg ( * ( unsigned int * ) darrayGet ( solidArray
, i
) );
1590 cnt
= checkConnect ( from_ctg
, to_ctg
);
1594 fprintf ( stderr
, "consolidate B: no connect from %d to %d\n", from_ctg
, to_ctg
);
1598 cnt
->singleInScaf
= solidCounter
== 2 ? 1 : 0;
1602 setNextInScaf ( prevCNT
, cnt
);
1603 setPrevInScaf ( cnt
, 1 );
1611 static void debugging1 ( unsigned int ctg1
, unsigned int ctg2
)
1614 cn1
= getCntBetween ( ctg1
, ctg2
);
1618 fprintf ( stderr
, "(%d,%d) mask %d deleted %d w %d,singleInScaf %d\n",
1619 ctg1
, ctg2
, cn1
->mask
, cn1
->deleted
, cn1
->weight
, cn1
->singleInScaf
);
1621 if ( cn1
->nextInScaf
)
1622 { fprintf ( stderr
, "%d->%d->%d\n", ctg1
, ctg2
, cn1
->nextInScaf
->contigID
); }
1624 if ( cn1
->prevInScaf
)
1625 { fprintf ( stderr
, "*->%d->%d\n", ctg1
, ctg2
); }
1626 else if ( !cn1
->nextInScaf
)
1627 { fprintf ( stderr
, "NULL->%d->%d->NULL\n", ctg1
, ctg2
); }
1630 { fprintf ( stderr
, "%d -X- %d\n", ctg1
, ctg2
); }
1633 /*************************************************
1637 For three contigs A, B and C, connections A->B and A->C exist.
1638 If there is a connection path connecting B and C or a new
1639 connection can be created between B and C, then deletes
1650 *************************************************/
1651 static void removeTransitive()
1653 unsigned int i
, bal_ctg
;
1654 int flag
= 1, out_num
, in_num
, count
, min
, max
, linear
;
1655 CONNECT
* cn_temp
, *cn1
= NULL
, *cn2
= NULL
;
1656 int multi_out
= 0, single_out
= 0, two_out
= 0, may_transitive
= 0, not_transitive
= 0, cycle_num
= 0, mask_ctg
= 0, no_out
= 0;
1657 fprintf ( stderr
, "Start to remove transitive connection.\n" );
1667 for ( i
= 1; i
<= num_ctg
; i
++ )
1669 if ( contig_array
[i
].mask
)
1671 if ( cycle_num
== 1 )
1677 out_num
= validConnect ( i
, NULL
);
1681 if ( out_num
== 1 && cycle_num
== 1 )
1684 if ( out_num
> 2 && cycle_num
== 1 )
1687 if ( out_num
== 0 && cycle_num
== 1 )
1694 cn_temp
= contig_array
[i
].downwardConnect
;
1699 if ( cn_temp
->deleted
|| cn_temp
->mask
)
1701 cn_temp
= cn_temp
->next
;
1709 else if ( count
== 2 )
1716 cn_temp
= cn_temp
->next
;
1721 fprintf ( stderr
, "%d valid connections from ctg %d\n", count
, i
);
1725 if ( cn1
->gapLen
> cn2
->gapLen
)
1730 } //make sure cn1 is closer to contig i than cn2
1732 if ( cn1
->prevInScaf
&& cn2
->prevInScaf
)
1735 bal_ctg
= getTwinCtg ( cn2
->contigID
);
1736 in_num
= validConnect ( bal_ctg
, NULL
);
1741 int bal_c1
= getTwinCtg ( cn1
->contigID
);
1742 in_num
= validConnect ( bal_c1
, NULL
);
1747 min
= cn2
->gapLen
- cn1
->gapLen
- contig_array
[cn1
->contigID
].length
- ins_size_var
/ 2;
1748 max
= cn2
->gapLen
- cn1
->gapLen
- contig_array
[cn1
->contigID
].length
+ ins_size_var
/ 2;
1754 //temprarily delete cn2
1755 setConnectDelete ( i
, cn2
->contigID
, 1, 0 );
1756 int oldc2
= cn2
->contigID
;
1757 linear
= linearC2C ( i
, cn1
, cn2
->contigID
, min
, max
);
1762 setConnectDelete ( i
, cn2
->contigID
, 0, 0 );
1767 downstreamCTG
[0] = i
;
1770 if ( !checkSimple ( solidArray
, solidCounter
) )
1773 cn1
= getCntBetween ( * ( unsigned int * ) darrayGet ( solidArray
, solidCounter
- 2 ),
1774 * ( unsigned int * ) darrayGet ( solidArray
, solidCounter
- 1 ) );
1776 if ( cn1
&& cn1
->nextInScaf
&& cn2
->nextInScaf
)
1778 setConnectDelete ( i
, cn2
->contigID
, 0, 0 );
1784 if ( cn2
->prevInScaf
)
1785 substitueDSinScaf ( cn2
, * ( unsigned int * ) darrayGet ( solidArray
, 0 ),
1786 * ( unsigned int * ) darrayGet ( solidArray
, 1 ) );
1788 if ( cn2
->nextInScaf
)
1789 { substitueUSinScaf ( cn2
, * ( unsigned int * ) darrayGet ( solidArray
, solidCounter
- 2 ) ); }
1795 if ( cycle_num
== 1 )
1797 fprintf ( stderr
, "Total contigs %u\n", num_ctg
);
1798 fprintf ( stderr
, "Masked contigs %d\n", mask_ctg
);
1799 fprintf ( stderr
, "Remained contigs %u\n", num_ctg
- mask_ctg
);
1800 fprintf ( stderr
, "None-outgoing-connection contigs %d", no_out
);
1802 if ( num_ctg
- mask_ctg
> 0 )
1804 fprintf ( stderr
, " (%1f%%)", ( float ) no_out
/ ( num_ctg
- mask_ctg
) * 100 );
1807 fprintf ( stderr
, "\nSingle-outgoing-connection contigs %d\n", single_out
);
1808 fprintf ( stderr
, "Multi-outgoing-connection contigs %d\n", multi_out
);
1811 fprintf ( stderr
, "Cycle %d\n Two-outgoing-connection contigs %d\n Potential transitive connections %d\n Transitive connections %d\n", cycle_num
, two_out
, may_transitive
, flag
);
1815 fprintf ( stderr
, " Transitive ratio %.1f%%\n", ( float ) flag
/ two_out
* 100 );
1823 static void debugging2 ( unsigned int ctg
)
1825 if ( ctg
> num_ctg
)
1830 CONNECT
* cn1
= contig_array
[ctg
].downwardConnect
;
1834 if ( cn1
->nextInScaf
)
1835 { fprintf ( stderr
, "with nextInScaf %u,", cn1
->nextInScaf
->contigID
); }
1837 if ( cn1
->prevInScaf
)
1838 { fprintf ( stderr
, "with prevInScaf," ); }
1840 fprintf ( stderr
, "%u >> %u, weight %d, gapLen %d, mask %d deleted %d, inherit %d, singleInScaf %d, bySmall %d\n",
1841 ctg
, cn1
->contigID
, cn1
->weight
, cn1
->gapLen
, cn1
->mask
, cn1
->deleted
, cn1
->inherit
, cn1
->singleInScaf
, cn1
->bySmall
);
1845 static void debugging()
1847 // debugging1(13298356, 13245956);
1850 /*************************************************
1854 Simplifys contig graph by two operations.
1855 1) Removes transitive connections.
1856 2) Picks up local contig graph and tries to line involved
1864 *************************************************/
1865 static void simplifyCnt()
1869 general_linearization ( 1 );
1873 /*************************************************
1877 Gets contig's index in array.
1883 Index of contig if contig existed.
1885 *************************************************/
1886 static int getIndexInArray ( unsigned int node
)
1890 for ( index
= 0; index
< nodeCounter
; index
++ )
1891 if ( nodesInSub
[index
] == node
)
1897 /*************************************************
1901 Puts contig into sub-graph.
1903 1. heap: heap to store contigs
1904 2. distance: current contig's distance to base contig
1905 3. node: current contig
1906 4. index: contig's index in array
1910 0 if contig already existed in sub-graph.
1911 1 if operation succeeded.
1912 -1 if index was larger than allowed maximum sub-graph size.
1913 *************************************************/
1914 static boolean
putNodeIntoSubgraph ( FibHeap
* heap
, int distance
, unsigned int node
, int index
)
1916 int pos
= getIndexInArray ( node
);
1923 if ( index
>= MaxNodeInSub
)
1926 insertNodeIntoHeap ( heap
, distance
, node
);
1927 nodesInSub
[index
] = node
;
1928 nodeDistance
[index
] = distance
;
1932 /*************************************************
1934 putChainIntoSubgraph
1936 Puts contigs of connection chain into sub-graph.
1938 1. heap: heap to store contigs
1939 2. distance: current contig's distance to base contig
1940 3. node: current contig
1941 4. index: contig's index in array
1942 5. prevC: upstream connection of current contig
1944 1. index: index of next contig to add
1945 2. prevC: upstream connection of last contig in connection chain
1947 0 if operation of putting contig into sub-graph failed.
1948 *************************************************/
1949 static boolean
putChainIntoSubgraph ( FibHeap
* heap
, int distance
, unsigned int node
, int * index
, CONNECT
* prevC
)
1951 unsigned int ctg
= node
;
1953 boolean excep
, flag
;
1954 int counter
= *index
;
1958 nextCnt
= getNextContig ( ctg
, prevC
, &excep
);
1960 if ( excep
|| !nextCnt
)
1966 ctg
= nextCnt
->contigID
;
1967 distance
+= nextCnt
->gapLen
+ contig_array
[ctg
].length
;
1968 flag
= putNodeIntoSubgraph ( heap
, distance
, ctg
, counter
);
1980 /*************************************************
1984 Checks if a contig is unique by trying to line its upstream and
1987 1. node: contig to check
1988 2. tolerance: allowed percentage that overlap length among
1989 contigs in sub-graph accounts for in all involved contigs' length
1993 1 if contig was unique.
1994 *************************************************/
1995 static boolean
checkUnique ( unsigned int node
, double tolerance
)
1998 unsigned int currNode
;
2003 FibHeap
* heap
= newFibHeap();
2004 putNodeIntoSubgraph ( heap
, 0, currNode
, 0 );
2006 ite_cnt
= contig_array
[currNode
].downwardConnect
;
2010 if ( ite_cnt
->deleted
|| ite_cnt
->mask
)
2012 ite_cnt
= ite_cnt
->next
;
2016 currNode
= ite_cnt
->contigID
;
2017 distance
= ite_cnt
->gapLen
+ contig_array
[currNode
].length
;
2018 flag
= putNodeIntoSubgraph ( heap
, distance
, currNode
, nodeCounter
);
2022 destroyHeap ( heap
);
2029 flag
= putChainIntoSubgraph ( heap
, distance
, currNode
, &nodeCounter
, ite_cnt
);
2033 destroyHeap ( heap
);
2037 ite_cnt
= ite_cnt
->next
;
2040 if ( nodeCounter
<= 2 ) // no more than 2 valid connections
2042 destroyHeap ( heap
);
2046 while ( ( currNode
= removeNextNodeFromHeap ( heap
) ) != 0 )
2047 { nodesInSubInOrder
[popCounter
++] = currNode
; }
2049 destroyHeap ( heap
);
2050 flag
= checkOverlapInBetween ( tolerance
);
2054 /*************************************************
2058 Masks repeat contigs.
2065 *************************************************/
2066 static void maskRepeat()
2068 int in_num
, out_num
, flagA
, flagB
;
2070 int puzzleCounter
= 0;
2071 unsigned int i
, bal_i
;
2073 for ( i
= 1; i
<= num_ctg
; i
++ )
2075 if ( contig_array
[i
].mask
)
2078 bal_i
= getTwinCtg ( i
);
2079 in_num
= validConnect ( bal_i
, NULL
);
2080 out_num
= validConnect ( i
, NULL
);
2082 if ( in_num
> 1 || out_num
> 1 )
2083 { puzzleCounter
++; }
2086 if ( isSmallerThanTwin ( i
) )
2092 if ( contig_array
[i
].cvg
> 1.4 * cvgAvg
)
2095 maskContig ( i
, 1 );
2097 if ( isSmallerThanTwin ( i
) )
2104 { flagA
= checkUnique ( bal_i
, OverlapPercent
); }
2109 { flagB
= checkUnique ( i
, OverlapPercent
); }
2113 if ( !flagA
|| !flagB
)
2116 maskContig ( i
, 1 );
2119 if ( isSmallerThanTwin ( i
) )
2123 fprintf ( stderr
, "Mask repeats:\n Puzzles %d\n Masked contigs %d\n", puzzleCounter
, counter
);
2126 /*************************************************
2130 Counts valid connections of all contigs.
2136 Valid connections of all contigs.
2137 *************************************************/
2138 static int Countlink()
2140 unsigned int i
, bal_i
;
2141 int conflict_count
= 0;
2143 for ( i
= 1; i
< num_ctg
; i
++ )
2145 if ( contig_array
[i
].mask
)
2148 int out_num
= validConnect ( i
, NULL
);
2151 { conflict_count
++; }
2154 return conflict_count
;
2157 /*************************************************
2161 Constructs scaffold using alignment information of paired-end
2164 1. deWeak: indicator of deleting weak connection.
2165 2. downS: indicator of large insert size paired-end reads
2166 3. nonlinear: indicator of simplifying graph using loose restraint
2167 4. infile: prefix of graph
2172 *************************************************/
2173 static void ordering ( boolean deWeak
, boolean downS
, boolean nonlinear
, char * infile
)
2175 int conf0
, conf1
, conf2
, conf3
, conf4
, conf5
;
2184 { deleteWeakCnt ( weakPE
); }
2189 { deleteWeakCnt ( weakPE
); }
2202 fprintf ( stderr
, "Non-strict linearization.\n" );
2203 general_linearization ( 0 );
2206 maskPuzzle ( 2, 0 );
2213 /*************************************************
2215 checkOverlapInBetween
2217 Checks if adjacent contigs in the array have reasonable overlap.
2219 1. tolerance: max percentage that overlap length accounts for
2223 1 if the overlap situation was resonable.
2224 *************************************************/
2225 boolean
checkOverlapInBetween ( double tolerance
)
2231 lenSum
= lenOlp
= 0;
2233 for ( i
= 0; i
< nodeCounter
; i
++ )
2235 node
= nodesInSubInOrder
[i
];
2236 lenSum
+= contig_array
[node
].length
;
2237 index
= getIndexInArray ( node
);
2238 nodeDistanceInOrder
[i
] = nodeDistance
[index
];
2244 for ( i
= 0; i
< nodeCounter
- 1; i
++ )
2246 gap
= nodeDistanceInOrder
[i
+ 1] - nodeDistanceInOrder
[i
]
2247 - contig_array
[nodesInSubInOrder
[i
+ 1]].length
;
2252 if ( ( double ) lenOlp
/ lenSum
> tolerance
)
2262 /********* the following codes are for freezing current scaffolds ****************/
2263 /*************************************************
2267 Checks status of connection between contigs. If none of them
2268 equals to "flag", sets all status to "flag" and adjusts the related
2269 indicators accordingly. Otherwise does NOT change the status
2272 1. start: the first contig
2273 2. array: contig array
2274 3. max_steps: max number of connection to check
2279 o if setting successed.
2280 *************************************************/
2281 static boolean
setUsed ( unsigned int start
, unsigned int * array
, int max_steps
, boolean flag
)
2283 unsigned int prevCtg
= start
;
2284 unsigned int twinA
, twinB
;
2287 boolean usedFlag
= 0;
2288 // save 'used' to 'checking'
2291 for ( j
= 0; j
< max_steps
; j
++ )
2293 if ( array
[j
] == 0 )
2296 cnt
= getCntBetween ( prevCtg
, array
[j
] );
2300 fprintf ( stderr
, "setUsed: no connect between %d and %d\n", prevCtg
, array
[j
] );
2305 if ( cnt
->used
== flag
|| cnt
->nextInScaf
|| cnt
->prevInScaf
|| cnt
->singleInScaf
)
2310 cnt
->checking
= cnt
->used
;
2311 twinA
= getTwinCtg ( prevCtg
);
2312 twinB
= getTwinCtg ( array
[j
] );
2313 cnt
= getCntBetween ( twinB
, twinA
);
2316 { cnt
->checking
= cnt
->used
; }
2324 for ( j
= 0; j
< max_steps
; j
++ )
2326 if ( array
[j
] == 0 )
2329 cnt
= getCntBetween ( prevCtg
, array
[j
] );
2337 if ( cnt
->used
== flag
)
2344 twinA
= getTwinCtg ( prevCtg
);
2345 twinB
= getTwinCtg ( array
[j
] );
2346 cnt
= getCntBetween ( twinB
, twinA
);
2349 { cnt
->used
= flag
; }
2354 // set mask to 'NOT flag' or set used to original value
2357 for ( j
= 0; j
< max_steps
; j
++ )
2359 if ( array
[j
] == 0 )
2362 cnt
= getCntBetween ( prevCtg
, array
[j
] );
2371 { cnt
->mask
= 1 - flag
; }
2373 { cnt
->used
= cnt
->checking
; }
2375 twinA
= getTwinCtg ( prevCtg
);
2376 twinB
= getTwinCtg ( array
[j
] );
2377 cnt
= getCntBetween ( twinB
, twinA
);
2378 cnt
->used
= 1 - flag
;
2381 { cnt
->mask
= 1 - flag
; }
2383 { cnt
->used
= cnt
->checking
; }
2392 /*************************************************
2396 Checks whether a contig is supposed to locate in a scaffold
2397 according to its connections to other contigs in this scaffold.
2399 1. array: array of contigs of part of or whole scaffold
2400 2. Counter: contig number in array
2401 3. beforep: index of previous contig
2402 4. afterp: index of next contig
2403 5. id: current contig
2407 1 if this contig had enough support from other contigs.
2408 *************************************************/
2409 int score_pass ( DARRAY
* array
, int Counter
, int beforep
, int afterp
, int id
)
2411 int outnum
= allConnect ( id
, NULL
);
2412 int innum
= allConnect ( getTwinCtg ( id
), NULL
);
2413 int start
= beforep
- 2 * innum
> 0 ? beforep
- 2 * innum
: 0;
2414 int end
= afterp
+ 2 * outnum
< Counter
? afterp
+ 2 * outnum
: Counter
;
2415 int i
, inc
= 1, outc
= 1;
2418 for ( i
= start
; i
< end
; i
++ )
2422 dh_cnt
= getCntBetween ( * ( unsigned int * ) darrayGet ( array
, i
), id
);
2430 dh_cnt
= getCntBetween ( id
, * ( unsigned int * ) darrayGet ( array
, i
) );
2437 if ( inc
== innum
|| outc
== outnum
)
2440 if ( ( inc
== 1 && innum
> 2 ) || ( outc
== 1 && outnum
> 2 ) )
2443 int score
= ( int ) ( ( ( double ) ( inc
* outc
) / ( double ) ( innum
* outnum
) ) * 100 );
2451 /*************************************************
2455 Searchs a contig path between two contigs accroding to
2456 connection relation. If a path is found, recovers those masked
2457 connection between contigs in this path.
2467 *************************************************/
2468 static void recoverMask()
2470 unsigned int i
, ctg
, bal_ctg
, start
, finish
;
2471 int num3
, num5
, j
, t
;
2472 CONNECT
* bindCnt
, *cnt
;
2473 int min
, max
, max_steps
= 5, num_route
, length
;
2474 int tempCounter
, recoverCounter
= 0;
2475 boolean multiUSE
, change
;
2476 int stat
[] = {0, 0, 0, 0, 0, 0, 0};
2478 for ( i
= 1; i
<= num_ctg
; i
++ )
2479 { contig_array
[i
].flag
= 0; }
2481 so_far
= ( unsigned int * ) ckalloc ( max_n_routes
* sizeof ( unsigned int ) );
2482 found_routes
= ( unsigned int ** ) ckalloc ( max_n_routes
* sizeof ( unsigned int * ) );
2484 for ( j
= 0; j
< max_n_routes
; j
++ )
2485 { found_routes
[j
] = ( unsigned int * ) ckalloc ( max_steps
* sizeof ( unsigned int ) ); }
2487 for ( i
= 1; i
<= num_ctg
; i
++ )
2489 if ( contig_array
[i
].flag
|| contig_array
[i
].mask
|| !contig_array
[i
].downwardConnect
)
2492 bindCnt
= getBindCnt ( i
);
2497 //first scan get the average coverage by longer pe
2500 * ( unsigned int * ) darrayPut ( scaf5
, num5
++ ) = i
;
2501 contig_array
[i
].flag
= 1;
2502 contig_array
[getTwinCtg ( i
)].flag
= 1;
2506 if ( bindCnt
->used
)
2509 setConnectUsed ( ctg
, bindCnt
->contigID
, 1 );
2510 ctg
= bindCnt
->contigID
;
2511 * ( unsigned int * ) darrayPut ( scaf5
, num5
++ ) = ctg
;
2512 bal_ctg
= getTwinCtg ( ctg
);
2513 contig_array
[ctg
].flag
= 1;
2514 contig_array
[bal_ctg
].flag
= 1;
2515 bindCnt
= bindCnt
->nextInScaf
;
2518 ctg
= getTwinCtg ( i
);
2519 bindCnt
= getBindCnt ( ctg
);
2523 if ( bindCnt
->used
)
2526 setConnectUsed ( ctg
, bindCnt
->contigID
, 1 );
2527 ctg
= bindCnt
->contigID
;
2528 bal_ctg
= getTwinCtg ( ctg
);
2529 contig_array
[ctg
].flag
= 1;
2530 contig_array
[bal_ctg
].flag
= 1;
2531 * ( unsigned int * ) darrayPut ( scaf3
, num3
++ ) = bal_ctg
;
2532 bindCnt
= bindCnt
->nextInScaf
;
2535 if ( num5
+ num3
< 2 )
2538 tempCounter
= solidCounter
= 0;
2540 for ( j
= num3
- 1; j
>= 0; j
-- )
2541 * ( unsigned int * ) darrayPut ( tempArray
, tempCounter
++ ) =
2542 * ( unsigned int * ) darrayGet ( scaf3
, j
);
2544 for ( j
= 0; j
< num5
; j
++ )
2545 * ( unsigned int * ) darrayPut ( tempArray
, tempCounter
++ ) =
2546 * ( unsigned int * ) darrayGet ( scaf5
, j
);
2550 for ( t
= 0; t
< tempCounter
- 1; t
++ )
2552 * ( unsigned int * ) darrayPut ( solidArray
, solidCounter
++ ) =
2553 * ( unsigned int * ) darrayGet ( tempArray
, t
);
2554 start
= * ( unsigned int * ) darrayGet ( tempArray
, t
);
2555 finish
= * ( unsigned int * ) darrayGet ( tempArray
, t
+ 1 );
2556 num_route
= num_trace
= 0;
2557 cnt
= checkConnect ( start
, finish
);
2561 fprintf ( stderr
, "Warning from recoverMask: no connection (%d %d), start at %d\n",
2563 cnt
= getCntBetween ( start
, finish
);
2566 { debugging1 ( start
, finish
); }
2571 length
= cnt
->gapLen
+ contig_array
[finish
].length
;
2572 min
= length
- 1.5 * ins_size_var
;
2573 max
= length
+ 1.5 * ins_size_var
;
2574 traceAlongMaskedCnt ( finish
, start
, max_steps
, min
, max
, 0, 0, &num_route
);
2576 if ( finish
== start
)
2578 for ( j
= 0; j
< tempCounter
; j
++ )
2579 { fprintf ( stderr
, "->%d", * ( unsigned int * ) darrayGet ( tempArray
, j
) ); }
2581 fprintf ( stderr
, ": start at %d\n", i
);
2584 if ( num_route
== 1 )
2586 for ( j
= 0; j
< max_steps
; j
++ )
2587 if ( found_routes
[0][j
] == 0 )
2593 //check if connects have been used more than once
2594 multiUSE
= setUsed ( start
, found_routes
[0], max_steps
, 1 );
2599 for ( j
= 0; j
< max_steps
; j
++ )
2601 if ( j
+ 1 == max_steps
|| found_routes
[0][j
+ 1] == 0 )
2604 * ( unsigned int * ) darrayPut ( solidArray
, solidCounter
++ ) = found_routes
[0][j
];
2605 contig_array
[found_routes
[0][j
]].flag
= 1;
2606 contig_array
[getTwinCtg ( found_routes
[0][j
] )].flag
= 1;
2609 recoverCounter
+= j
;
2610 setConnectDelete ( start
, finish
, 1, 1 );
2613 } //end if num_route=1
2614 else if ( num_route
> 1 )
2617 int k
, l
, num
, longest
= 0, longestid
= 0;
2618 int merg
= 0, quality
= 0;
2620 // get the longest route.
2621 for ( k
= 0; k
< num_route
; k
++ )
2623 for ( j
= 0; j
< max_steps
; j
++ )
2625 if ( j
+ 1 == max_steps
|| found_routes
[k
][j
+ 1] == 0 )
2640 if ( longest
== 1 ) //multi one.
2644 for ( k
= 0; k
< num_route
; k
++ )
2646 if ( score_pass ( tempArray
, tempCounter
, t
, t
+ 1, found_routes
[k
][0] ) )
2664 for ( k
= 0; k
< num_route
; k
++ )
2666 if ( k
== longestid
)
2669 int merg_num
= 0, total
= 0;
2671 for ( j
= 0; j
< max_steps
; j
++ )
2673 if ( j
+ 1 == max_steps
|| found_routes
[k
][j
+ 1] == 0 )
2679 for ( l
= 0; l
< longest
; l
++ )
2681 if ( found_routes
[k
][j
] == found_routes
[longestid
][l
] )
2689 if ( merg_num
== total
)
2694 if ( merg
== num_route
- 1 || quality
== 1 || merg
>= longest
)
2696 multiUSE
= setUsed ( start
, found_routes
[longestid
], max_steps
, 1 );
2703 for ( j
= 0; j
< longest
; j
++ )
2705 * ( unsigned int * ) darrayPut ( solidArray
, solidCounter
++ ) = found_routes
[longestid
][j
];
2706 contig_array
[found_routes
[longestid
][j
]].flag
= 1;
2707 contig_array
[getTwinCtg ( found_routes
[longestid
][j
] )].flag
= 1;
2711 recoverCounter
+= j
;
2712 setConnectDelete ( start
, finish
, 1, 1 );
2718 * ( unsigned int * ) darrayPut ( solidArray
, solidCounter
++ ) =
2719 * ( unsigned int * ) darrayGet ( tempArray
, tempCounter
- 1 );
2725 fprintf ( stderr
, "\nRecover contigs.\n" );
2726 fprintf ( stderr
, " Total recovered contigs %d\n", recoverCounter
);
2727 fprintf ( stderr
, " Single-route cases %d\n", stat
[0] );
2728 fprintf ( stderr
, " Multi-route cases %d\n", stat
[1] );
2730 for ( i
= 1; i
<= num_ctg
; i
++ )
2732 cnt
= contig_array
[i
].downwardConnect
;
2742 for ( j
= 0; j
< max_n_routes
; j
++ )
2743 { free ( ( void * ) found_routes
[j
] ); }
2745 free ( ( void * ) found_routes
);
2746 free ( ( void * ) so_far
);
2750 /*************************************************
2754 Destroys upstream and downstram connections of connection
2755 between two specified contigs.
2757 1. CB: the first specified contig
2758 2. CC: the second specified contig
2763 *************************************************/
2764 static void unBindLink ( unsigned int CB
, unsigned int CC
)
2766 CONNECT
* cnt1
= getCntBetween ( CB
, CC
);
2771 if ( cnt1
->singleInScaf
)
2772 { cnt1
->singleInScaf
= 0; }
2774 CONNECT
* cnt2
= getCntBetween ( getTwinCtg ( CC
), getTwinCtg ( CB
) );
2779 if ( cnt2
->singleInScaf
)
2780 { cnt2
->singleInScaf
= 0; }
2782 if ( cnt1
->nextInScaf
)
2784 unsigned int CD
= cnt1
->nextInScaf
->contigID
;
2785 cnt1
->nextInScaf
->prevInScaf
= 0;
2786 cnt1
->nextInScaf
= NULL
;
2787 CONNECT
* cnt3
= getCntBetween ( getTwinCtg ( CD
), getTwinCtg ( CC
) );
2790 { cnt3
->nextInScaf
= NULL
; }
2792 cnt2
->prevInScaf
= 0;
2795 if ( cnt2
->nextInScaf
)
2797 unsigned int bal_CA
= cnt2
->nextInScaf
->contigID
;
2798 cnt2
->nextInScaf
->prevInScaf
= 0;
2799 cnt2
->nextInScaf
= NULL
;
2800 CONNECT
* cnt4
= getCntBetween ( getTwinCtg ( bal_CA
), CB
);
2803 { cnt4
->nextInScaf
= NULL
; }
2805 cnt1
->prevInScaf
= 0;
2809 /*************************************************
2813 Updates connection relation and scaffold ID of contigs in a scaffold.
2820 *************************************************/
2821 static void freezing()
2824 unsigned int ctg
, bal_ctg
;
2827 CONNECT
* cnt
, *prevCNT
, *nextCnt
;
2830 for ( i
= 1; i
<= num_ctg
; i
++ )
2832 contig_array
[i
].flag
= 0;
2833 contig_array
[i
].from_vt
= 0;
2834 contig_array
[i
].to_vt
= 0;
2835 cnt
= contig_array
[i
].downwardConnect
;
2841 cnt
->singleInScaf
= 0;
2846 for ( i
= 1; i
<= num_ctg
; i
++ )
2848 if ( contig_array
[i
].flag
|| contig_array
[i
].mask
)
2851 if ( !contig_array
[i
].downwardConnect
|| !validConnect ( i
, NULL
) )
2858 * ( unsigned int * ) darrayPut ( scaf5
, num5
++ ) = i
;
2859 contig_array
[i
].flag
= 1;
2860 contig_array
[getTwinCtg ( i
)].flag
= 1;
2862 cnt
= getNextContig ( ctg
, prevCNT
, &excep
);
2866 if ( contig_array
[cnt
->contigID
].flag
)
2868 unBindLink ( ctg
, cnt
->contigID
);
2872 nextCnt
= getNextContig ( cnt
->contigID
, cnt
, &excep
);
2873 setConnectUsed ( ctg
, cnt
->contigID
, 1 );
2874 ctg
= cnt
->contigID
;
2875 * ( unsigned int * ) darrayPut ( scaf5
, num5
++ ) = ctg
;
2876 bal_ctg
= getTwinCtg ( ctg
);
2877 contig_array
[ctg
].flag
= 1;
2878 contig_array
[bal_ctg
].flag
= 1;
2883 ctg
= getTwinCtg ( i
);
2886 { prevCNT
= checkConnect ( getTwinCtg ( * ( unsigned int * ) darrayGet ( scaf5
, 1 ) ), ctg
); }
2890 cnt
= getNextContig ( ctg
, prevCNT
, &excep
);
2894 if ( contig_array
[cnt
->contigID
].flag
)
2896 unBindLink ( ctg
, cnt
->contigID
);
2900 nextCnt
= getNextContig ( cnt
->contigID
, cnt
, &excep
);
2901 setConnectUsed ( ctg
, cnt
->contigID
, 1 );
2902 ctg
= cnt
->contigID
;
2903 bal_ctg
= getTwinCtg ( ctg
);
2904 contig_array
[ctg
].flag
= 1;
2905 contig_array
[bal_ctg
].flag
= 1;
2906 * ( unsigned int * ) darrayPut ( scaf3
, num3
++ ) = bal_ctg
;
2911 if ( num5
+ num3
< 2 )
2916 for ( j
= num3
- 1; j
>= 0; j
-- )
2917 * ( unsigned int * ) darrayPut ( solidArray
, solidCounter
++ ) =
2918 * ( unsigned int * ) darrayGet ( scaf3
, j
);
2920 for ( j
= 0; j
< num5
; j
++ )
2921 * ( unsigned int * ) darrayPut ( solidArray
, solidCounter
++ ) =
2922 * ( unsigned int * ) darrayGet ( scaf5
, j
);
2924 unsigned int firstCtg
= 0;
2925 unsigned int lastCtg
= 0;
2926 unsigned int firstTwin
= 0;
2927 unsigned int lastTwin
= 0;
2929 for ( t
= 0; t
< solidCounter
; t
++ )
2930 if ( !contig_array
[* ( unsigned int * ) darrayGet ( solidArray
, t
)].mask
)
2932 firstCtg
= * ( unsigned int * ) darrayGet ( solidArray
, t
);
2936 for ( t
= solidCounter
- 1; t
>= 0; t
-- )
2937 if ( !contig_array
[* ( unsigned int * ) darrayGet ( solidArray
, t
)].mask
)
2939 lastCtg
= * ( unsigned int * ) darrayGet ( solidArray
, t
);
2943 if ( firstCtg
== 0 || lastCtg
== 0 )
2945 fprintf ( stderr
, "scaffold start at %d, stop at %d, freezing began with %d\n", firstCtg
, lastCtg
, i
);
2947 for ( j
= 0; j
< solidCounter
; j
++ )
2948 fprintf ( stderr
, "->%d(%d %d)", * ( unsigned int * ) darrayGet ( solidArray
, j
)
2949 , contig_array
[* ( unsigned int * ) darrayGet ( solidArray
, j
)].mask
2950 , contig_array
[* ( unsigned int * ) darrayGet ( solidArray
, j
)].flag
);
2952 fprintf ( stderr
, "\n" );
2956 firstTwin
= getTwinCtg ( firstCtg
);
2957 lastTwin
= getTwinCtg ( lastCtg
);
2960 for ( t
= 0; t
< solidCounter
; t
++ )
2962 unsigned int ctg
= * ( unsigned int * ) darrayGet ( solidArray
, t
);
2964 if ( contig_array
[ctg
].from_vt
> 0 )
2966 contig_array
[ctg
].mask
= 1;
2967 contig_array
[getTwinCtg ( ctg
)].mask
= 1;
2968 fprintf ( stderr
, "Repeat: contig %d (%d) appears more than once\n", ctg
, getTwinCtg ( ctg
) );
2972 contig_array
[ctg
].from_vt
= firstCtg
;
2973 contig_array
[ctg
].to_vt
= lastCtg
;
2974 contig_array
[ctg
].indexInScaf
= t
+ 1;
2975 contig_array
[getTwinCtg ( ctg
)].from_vt
= lastTwin
;
2976 contig_array
[getTwinCtg ( ctg
)].to_vt
= firstTwin
;
2977 contig_array
[getTwinCtg ( ctg
)].indexInScaf
= solidCounter
- t
;
2984 fprintf ( stderr
, "Freezing done.\n" );
2987 for ( i
= 1; i
<= num_ctg
; i
++ )
2989 if ( contig_array
[i
].flag
)
2990 { contig_array
[i
].flag
= 0; }
2992 if ( contig_array
[i
].from_vt
== 0 )
2994 contig_array
[i
].from_vt
= i
;
2995 contig_array
[i
].to_vt
= i
;
2998 cnt
= contig_array
[i
].downwardConnect
;
3009 /************** codes below this line are for pulling the scaffolds out ************/
3010 /*************************************************
3014 Outputs information of a filled gap.
3017 2. max_steps: maximum allowed found contigs for filling gap
3022 *************************************************/
3023 void output1gap ( FILE * fo
, int max_steps
)
3028 for ( i
= 0; i
< max_steps
- 1; i
++ )
3030 if ( found_routes
[0][i
+ 1] == 0 )
3033 len
+= contig_array
[found_routes
[0][i
]].length
;
3037 fprintf ( fo
, "GAP %d %d", len
, seg
);
3039 for ( i
= 0; i
< max_steps
- 1; i
++ )
3041 if ( found_routes
[0][i
+ 1] == 0 )
3044 fprintf ( fo
, " %d", found_routes
[0][i
] );
3047 fprintf ( fo
, "\n" );
3050 static int weakCounter
;
3052 /*************************************************
3056 Outputs connection information of specified contig.
3059 2. ctg: specified contig
3063 0 if contig's downstream connection was not weak connection.
3064 *************************************************/
3065 static boolean
printCnts ( FILE * fp
, unsigned int ctg
)
3067 CONNECT
* cnt
= contig_array
[ctg
].downwardConnect
;
3068 boolean flag
= 0, ret
= 0;
3069 unsigned int bal_ctg
= getTwinCtg ( ctg
);
3070 unsigned int linkCtg
;
3072 if ( isSameAsTwin ( ctg
) )
3075 CONNECT
* bindCnt
= getBindCnt ( ctg
);
3077 if ( bindCnt
&& bindCnt
->bySmall
&& bindCnt
->weakPoint
)
3080 fprintf ( fp
, "\tWP" );
3086 if ( cnt
->weight
&& !cnt
->inherit
)
3091 fprintf ( fp
, "\t#DOWN " );
3094 linkCtg
= cnt
->contigID
;
3096 if ( isLargerThanTwin ( linkCtg
) )
3097 { linkCtg
= getTwinCtg ( linkCtg
); }
3099 fprintf ( fp
, "%d:%d:%d ", index_array
[linkCtg
], cnt
->weight
, cnt
->gapLen
);
3106 cnt
= contig_array
[bal_ctg
].downwardConnect
;
3110 if ( cnt
->weight
&& !cnt
->inherit
)
3115 fprintf ( fp
, "\t#UP " );
3118 linkCtg
= cnt
->contigID
;
3120 if ( isLargerThanTwin ( linkCtg
) )
3121 { linkCtg
= getTwinCtg ( linkCtg
); }
3123 fprintf ( fp
, "%d:%d:%d ", index_array
[linkCtg
], cnt
->weight
, cnt
->gapLen
);
3129 fprintf ( fp
, "\n" );
3133 /*************************************************
3137 Makes statistic of scaffolds and contigs, including total length,
3138 non-N length, number, N50, etc.
3140 1. len_cut: length cutoff
3141 2. graphfile: prefix of graph
3146 *************************************************/
3147 void ScafStat ( int len_cut
, char * graphfile
)
3149 FILE * fp
, *fp2
, *fo
;
3151 sprintf ( line
, "%s.scafSeq", graphfile
);
3152 fp
= ckopen ( line
, "r" );
3153 sprintf ( line
, "%s.contig", graphfile
);
3154 fp2
= ckopen ( line
, "r" );
3155 sprintf ( line
, "%s.scafStatistics", graphfile
);
3156 fo
= ckopen ( line
, "w" );
3157 fprintf ( fo
, "<-- Information for assembly Scaffold '%s.scafSeq'.(cut_off_length < 100bp) -->\n\n", graphfile
);
3158 int cut_off_len
= 0;
3161 long Scaffold_Number
= 0;
3162 long Scaffold_Number_Scaf
= 0;
3163 long Scaffold_Number_Contig
= 0;
3164 long Singleton_Number_Scaf
= 0;
3165 long Singleton_Number
= 0;
3166 long * Singleton_Seq
= ( long * ) malloc ( sizeof ( long ) );
3167 long long A_num_all
= 0;
3168 long * A_num
= ( long * ) malloc ( sizeof ( long ) );
3169 long long C_num_all
= 0;
3170 long * C_num
= ( long * ) malloc ( sizeof ( long ) );
3171 long long G_num_all
= 0;
3172 long * G_num
= ( long * ) malloc ( sizeof ( long ) );
3173 long long T_num_all
= 0;
3174 long * T_num
= ( long * ) malloc ( sizeof ( long ) );
3175 long long N_num_all
= 0;
3176 long * N_num
= ( long * ) malloc ( sizeof ( long ) );
3177 long long Non_ACGTN_all
= 0;
3178 long * Non_ACGTN
= ( long * ) malloc ( sizeof ( long ) );
3179 long long Size_includeN
= 0;
3180 long * Size_Seq
= ( long * ) malloc ( sizeof ( long ) );
3193 long Num_N50_known
= 0;
3194 cut_off_len
= len_cut
;
3195 A_num
[Scaffold_Number
] = 0;
3196 C_num
[Scaffold_Number
] = 0;
3197 G_num
[Scaffold_Number
] = 0;
3198 T_num
[Scaffold_Number
] = 0;
3199 N_num
[Scaffold_Number
] = 0;
3200 Non_ACGTN
[Scaffold_Number
] = 0;
3201 Size_Seq
[Scaffold_Number
] = 0;
3202 Singleton_Seq
[Scaffold_Number
] = 0;
3203 Nucleotide
= fgetc ( fp
);
3205 while ( Nucleotide
!= (char) EOF
) /* Bug Fix */
3207 if ( Nucleotide
== '>' )
3209 if ( ( Scaffold_Number
> 0 ) && ( Size_Seq
[Scaffold_Number
- 1] < cut_off_len
) )
3211 A_num_all
= A_num_all
- A_num
[Scaffold_Number
- 1];
3212 C_num_all
= C_num_all
- C_num
[Scaffold_Number
- 1];
3213 G_num_all
= G_num_all
- G_num
[Scaffold_Number
- 1];
3214 T_num_all
= T_num_all
- T_num
[Scaffold_Number
- 1];
3215 N_num_all
= N_num_all
- N_num
[Scaffold_Number
- 1];
3216 Non_ACGTN_all
= Non_ACGTN_all
- Non_ACGTN
[Scaffold_Number
- 1];
3217 Size_includeN
= Size_includeN
- Size_Seq
[Scaffold_Number
- 1];
3218 Singleton_Number
= Singleton_Number
- Singleton_Seq
[Scaffold_Number
- 1];
3219 Scaffold_Number
= Scaffold_Number
- 1;
3223 Size_Seq
= ( long * ) realloc ( Size_Seq
, ( Scaffold_Number
+ 2 ) * sizeof ( long ) );
3224 A_num
= ( long * ) realloc ( A_num
, ( Scaffold_Number
+ 2 ) * sizeof ( long ) );
3225 C_num
= ( long * ) realloc ( C_num
, ( Scaffold_Number
+ 2 ) * sizeof ( long ) );
3226 G_num
= ( long * ) realloc ( G_num
, ( Scaffold_Number
+ 2 ) * sizeof ( long ) );
3227 T_num
= ( long * ) realloc ( T_num
, ( Scaffold_Number
+ 2 ) * sizeof ( long ) );
3228 N_num
= ( long * ) realloc ( N_num
, ( Scaffold_Number
+ 2 ) * sizeof ( long ) );
3229 Non_ACGTN
= ( long * ) realloc ( Non_ACGTN
, ( Scaffold_Number
+ 2 ) * sizeof ( long ) );
3230 Singleton_Seq
= ( long * ) realloc ( Singleton_Seq
, ( Scaffold_Number
+ 2 ) * sizeof ( long ) );
3234 A_num
[Scaffold_Number
- 1] = 0;
3235 C_num
[Scaffold_Number
- 1] = 0;
3236 G_num
[Scaffold_Number
- 1] = 0;
3237 T_num
[Scaffold_Number
- 1] = 0;
3238 N_num
[Scaffold_Number
- 1] = 0;
3239 Non_ACGTN
[Scaffold_Number
- 1] = 0;
3240 Size_Seq
[Scaffold_Number
- 1] = 0;
3241 Singleton_Seq
[Scaffold_Number
- 1] = 0;
3242 Nucleotide
= fgetc ( fp
);
3244 if ( Nucleotide
== 'C' )
3247 Singleton_Seq
[Scaffold_Number
- 1] ++;
3250 fgets ( buf
, 4000, fp
);
3252 else if ( ( Nucleotide
== 'N' ) || ( Nucleotide
== 'n' ) )
3254 N_num
[Scaffold_Number
- 1] ++;
3256 Size_Seq
[Scaffold_Number
- 1] ++;
3259 else if ( ( Nucleotide
== 'A' ) || ( Nucleotide
== 'a' ) )
3261 A_num
[Scaffold_Number
- 1] ++;
3263 Size_Seq
[Scaffold_Number
- 1] ++;
3266 else if ( ( Nucleotide
== 'C' ) || ( Nucleotide
== 'c' ) )
3268 C_num
[Scaffold_Number
- 1] ++;
3270 Size_Seq
[Scaffold_Number
- 1] ++;
3273 else if ( ( Nucleotide
== 'G' ) || ( Nucleotide
== 'g' ) )
3275 G_num
[Scaffold_Number
- 1] ++;
3277 Size_Seq
[Scaffold_Number
- 1] ++;
3280 else if ( ( Nucleotide
== 'T' ) || ( Nucleotide
== 't' ) )
3282 T_num
[Scaffold_Number
- 1] ++;
3284 Size_Seq
[Scaffold_Number
- 1] ++;
3289 if ( ( Nucleotide
!= '\n' ) && ( Nucleotide
!= '\r' ) )
3291 Non_ACGTN
[Scaffold_Number
- 1] ++;
3293 Size_Seq
[Scaffold_Number
- 1] ++;
3298 Nucleotide
= fgetc ( fp
);
3301 if ( Size_Seq
[Scaffold_Number
- 1] < cut_off_len
)
3303 A_num_all
= A_num_all
- A_num
[Scaffold_Number
- 1];
3304 C_num_all
= C_num_all
- C_num
[Scaffold_Number
- 1];
3305 G_num_all
= G_num_all
- G_num
[Scaffold_Number
- 1];
3306 T_num_all
= T_num_all
- T_num
[Scaffold_Number
- 1];
3307 N_num_all
= N_num_all
- N_num
[Scaffold_Number
- 1];
3308 Non_ACGTN_all
= Non_ACGTN_all
- Non_ACGTN
[Scaffold_Number
- 1];
3309 Size_includeN
= Size_includeN
- Size_Seq
[Scaffold_Number
- 1];
3310 Singleton_Number
= Singleton_Number
- Singleton_Seq
[Scaffold_Number
- 1];
3311 Scaffold_Number
= Scaffold_Number
- 1;
3314 qsort ( Size_Seq
, Scaffold_Number
, sizeof ( Size_Seq
[0] ), cmp_int
);
3315 fprintf ( fo
, "Size_includeN\t%lld\n", Size_includeN
);
3316 fprintf ( fo
, "Size_withoutN\t%lld\n", Size_includeN
- N_num_all
);
3317 fprintf ( fo
, "Scaffold_Num\t%ld\n", Scaffold_Number
);
3318 fprintf ( fo
, "Mean_Size\t%lld\n", Size_includeN
/ Scaffold_Number
);
3319 fprintf ( fo
, "Median_Size\t%ld\n", Size_Seq
[ ( Scaffold_Number
+ 1 ) / 2 - 1] );
3320 fprintf ( fo
, "Longest_Seq\t%ld\n", Size_Seq
[Scaffold_Number
- 1] );
3321 fprintf ( fo
, "Shortest_Seq\t%ld\n", Size_Seq
[0] );
3322 fprintf ( fo
, "Singleton_Num\t%ld\n", Singleton_Number
);
3323 fprintf ( fo
, "Average_length_of_break(N)_in_scaffold\t%lld\n", N_num_all
/ Scaffold_Number
);
3324 fprintf ( fo
, "\n" );
3326 if ( known_genome_size
)
3328 fprintf ( fo
, "Known_genome_size\t%ld\n", known_genome_size
);
3329 fprintf ( fo
, "Total_scaffold_length_as_percentage_of_known_genome_size\t%.2f%\n", 100 * ( 1.0 * Size_includeN
/ known_genome_size
) );
3333 fprintf ( fo
, "Known_genome_size\tNaN\n" );
3334 fprintf ( fo
, "Total_scaffold_length_as_percentage_of_known_genome_size\tNaN\n" );
3337 fprintf ( fo
, "\n" );
3339 for ( k
= 0; k
< Scaffold_Number
; k
++ )
3341 if ( Size_Seq
[k
] > 100 )
3346 if ( Size_Seq
[k
] > 500 )
3351 if ( Size_Seq
[k
] > 1000 )
3356 if ( Size_Seq
[k
] > 10000 )
3361 if ( Size_Seq
[k
] > 100000 )
3366 if ( Size_Seq
[k
] > 1000000 )
3372 fprintf ( fo
, "scaffolds>100 \t%ld\t%.2f%\n", n100
, 100 * ( 1.0 * n100
/ Scaffold_Number
) );
3373 fprintf ( fo
, "scaffolds>500 \t%ld\t%.2f%\n", n500
, 100 * ( 1.0 * n500
/ Scaffold_Number
) );
3374 fprintf ( fo
, "scaffolds>1K \t%ld\t%.2f%\n", n1k
, 100 * ( 1.0 * n1k
/ Scaffold_Number
) );
3375 fprintf ( fo
, "scaffolds>10K \t%ld\t%.2f%\n", n10k
, 100 * ( 1.0 * n10k
/ Scaffold_Number
) );
3376 fprintf ( fo
, "scaffolds>100K\t%ld\t%.2f%\n", n100k
, 100 * ( 1.0 * n100k
/ Scaffold_Number
) );
3377 fprintf ( fo
, "scaffolds>1M \t%ld\t%.2f%\n", n1m
, 100 * ( 1.0 * n1m
/ Scaffold_Number
) );
3378 fprintf ( fo
, "\n" );
3379 fprintf ( fo
, "Nucleotide_A\t%lld\t%.2f%\n", A_num_all
, 100 * ( 1.0 * A_num_all
/ Size_includeN
) );
3380 fprintf ( fo
, "Nucleotide_C\t%lld\t%.2f%\n", C_num_all
, 100 * ( 1.0 * C_num_all
/ Size_includeN
) );
3381 fprintf ( fo
, "Nucleotide_G\t%lld\t%.2f%\n", G_num_all
, 100 * ( 1.0 * G_num_all
/ Size_includeN
) );
3382 fprintf ( fo
, "Nucleotide_T\t%lld\t%.2f%\n", T_num_all
, 100 * ( 1.0 * T_num_all
/ Size_includeN
) );
3383 fprintf ( fo
, "GapContent_N\t%lld\t%.2f%\n", N_num_all
, 100 * ( 1.0 * N_num_all
/ Size_includeN
) );
3384 fprintf ( fo
, "Non_ACGTN\t%lld\t%.2f%\n", Non_ACGTN_all
, 100 * ( 1.0 * Non_ACGTN_all
/ Size_includeN
) );
3385 fprintf ( fo
, "GC_Content\t%.2f%\t\t(G+C)/(A+C+G+T)\n", 100 * ( 1.0 * ( G_num_all
+ C_num_all
) / ( A_num_all
+ C_num_all
+ G_num_all
+ T_num_all
) ) );
3386 fprintf ( fo
, "\n" );
3388 for ( k
= 0; k
< 10; k
++ )
3391 for ( k
= Scaffold_Number
- 1; k
>= 0; k
-- )
3393 Sum
= Sum
+ Size_Seq
[k
];
3395 if ( ( Sum
>= Size_includeN
* 0.1 ) && ( Sum
< Size_includeN
* 0.2 ) && ( flag
[1] == 0 ) )
3397 fprintf ( fo
, "N10\t%ld\t%ld\n", Size_Seq
[k
], Scaffold_Number
- k
);
3400 else if ( ( Sum
>= Size_includeN
* 0.2 ) && ( Sum
< Size_includeN
* 0.3 ) && ( flag
[2] == 0 ) )
3402 fprintf ( fo
, "N20\t%ld\t%ld\n", Size_Seq
[k
], Scaffold_Number
- k
);
3405 else if ( ( Sum
>= Size_includeN
* 0.3 ) && ( Sum
< Size_includeN
* 0.4 ) && ( flag
[3] == 0 ) )
3407 fprintf ( fo
, "N30\t%ld\t%ld\n", Size_Seq
[k
], Scaffold_Number
- k
);
3410 else if ( ( Sum
>= Size_includeN
* 0.4 ) && ( Sum
< Size_includeN
* 0.5 ) && ( flag
[4] == 0 ) )
3412 fprintf ( fo
, "N40\t%ld\t%ld\n", Size_Seq
[k
], Scaffold_Number
- k
);
3415 else if ( ( Sum
>= Size_includeN
* 0.5 ) && ( Sum
< Size_includeN
* 0.6 ) && ( flag
[5] == 0 ) )
3417 fprintf ( fo
, "N50\t%ld\t%ld\n", Size_Seq
[k
], Scaffold_Number
- k
);
3421 else if ( ( Sum
>= Size_includeN
* 0.6 ) && ( Sum
< Size_includeN
* 0.7 ) && ( flag
[6] == 0 ) )
3423 fprintf ( fo
, "N60\t%ld\t%ld\n", Size_Seq
[k
], Scaffold_Number
- k
);
3426 else if ( ( Sum
>= Size_includeN
* 0.7 ) && ( Sum
< Size_includeN
* 0.8 ) && ( flag
[7] == 0 ) )
3428 fprintf ( fo
, "N70\t%ld\t%ld\n", Size_Seq
[k
], Scaffold_Number
- k
);
3431 else if ( ( Sum
>= Size_includeN
* 0.8 ) && ( Sum
< Size_includeN
* 0.9 ) && ( flag
[8] == 0 ) )
3433 fprintf ( fo
, "N80\t%ld\t%ld\n", Size_Seq
[k
], Scaffold_Number
- k
);
3436 else if ( ( Sum
>= Size_includeN
* 0.9 ) && ( flag
[9] == 0 ) )
3438 fprintf ( fo
, "N90\t%ld\t%ld\n", Size_Seq
[k
], Scaffold_Number
- k
);
3442 if ( ( Sum
>= known_genome_size
* 0.5 ) && ( flag_known
== 0 ) )
3444 N50_known
= Size_Seq
[k
];
3445 Num_N50_known
= Scaffold_Number
- k
;
3454 for ( k
= Scaffold_Number
- 1; k
>= 0; k
-- )
3456 Sum
= Sum
+ Size_Seq
[k
];
3458 if ( Sum
>= Size_includeN
* 0.5 )
3460 fprintf ( fo
, "N50\t%ld\t%ld\n", Size_Seq
[k
], Scaffold_Number
- k
);
3466 fprintf ( fo
, "\n" );
3468 if ( known_genome_size
)
3470 fprintf ( fo
, "NG50\t%ld\t%ld\n", N50_known
, Num_N50_known
);
3471 fprintf ( fo
, "N50_scaffold-NG50_scaffold_length_difference\t%ld\n", abs ( N50
- N50_known
) );
3475 fprintf ( fo
, "NG50\tNaN\tNaN\n" );
3476 fprintf ( fo
, "N50_scaffold-NG50_scaffold_length_difference\tNaN\n" );
3479 fprintf ( fo
, "\n" );
3486 free ( Singleton_Seq
);
3488 Scaffold_Number_Scaf
= Scaffold_Number
;
3489 Singleton_Number_Scaf
= Singleton_Number
;
3490 /*********************** Contig ******************************/
3491 fprintf ( fo
, "<-- Information for assembly Contig '%s.contig'.(cut_off_length < 100bp) -->\n\n", graphfile
);
3493 Scaffold_Number
= 0;
3494 Singleton_Number
= 0;
3495 Singleton_Seq
= ( long * ) malloc ( sizeof ( long ) );
3497 A_num
= ( long * ) malloc ( sizeof ( long ) );
3499 C_num
= ( long * ) malloc ( sizeof ( long ) );
3501 G_num
= ( long * ) malloc ( sizeof ( long ) );
3503 T_num
= ( long * ) malloc ( sizeof ( long ) );
3505 N_num
= ( long * ) malloc ( sizeof ( long ) );
3507 Non_ACGTN
= ( long * ) malloc ( sizeof ( long ) );
3509 Size_Seq
= ( long * ) malloc ( sizeof ( long ) );
3521 cut_off_len
= len_cut
;
3522 A_num
[Scaffold_Number
] = 0;
3523 C_num
[Scaffold_Number
] = 0;
3524 G_num
[Scaffold_Number
] = 0;
3525 T_num
[Scaffold_Number
] = 0;
3526 N_num
[Scaffold_Number
] = 0;
3527 Non_ACGTN
[Scaffold_Number
] = 0;
3528 Size_Seq
[Scaffold_Number
] = 0;
3529 Singleton_Seq
[Scaffold_Number
] = 0;
3530 Nucleotide
= fgetc ( fp2
);
3532 while ( Nucleotide
!= (char) EOF
) /* Bug Fix */
3534 if ( Nucleotide
== '>' )
3536 if ( ( Scaffold_Number
> 0 ) && ( Size_Seq
[Scaffold_Number
- 1] < cut_off_len
) )
3538 A_num_all
= A_num_all
- A_num
[Scaffold_Number
- 1];
3539 C_num_all
= C_num_all
- C_num
[Scaffold_Number
- 1];
3540 G_num_all
= G_num_all
- G_num
[Scaffold_Number
- 1];
3541 T_num_all
= T_num_all
- T_num
[Scaffold_Number
- 1];
3542 N_num_all
= N_num_all
- N_num
[Scaffold_Number
- 1];
3543 Non_ACGTN_all
= Non_ACGTN_all
- Non_ACGTN
[Scaffold_Number
- 1];
3544 Size_includeN
= Size_includeN
- Size_Seq
[Scaffold_Number
- 1];
3545 Singleton_Number
= Singleton_Number
- Singleton_Seq
[Scaffold_Number
- 1];
3546 Scaffold_Number
= Scaffold_Number
- 1;
3550 Size_Seq
= ( long * ) realloc ( Size_Seq
, ( Scaffold_Number
+ 2 ) * sizeof ( long ) );
3551 A_num
= ( long * ) realloc ( A_num
, ( Scaffold_Number
+ 2 ) * sizeof ( long ) );
3552 C_num
= ( long * ) realloc ( C_num
, ( Scaffold_Number
+ 2 ) * sizeof ( long ) );
3553 G_num
= ( long * ) realloc ( G_num
, ( Scaffold_Number
+ 2 ) * sizeof ( long ) );
3554 T_num
= ( long * ) realloc ( T_num
, ( Scaffold_Number
+ 2 ) * sizeof ( long ) );
3555 N_num
= ( long * ) realloc ( N_num
, ( Scaffold_Number
+ 2 ) * sizeof ( long ) );
3556 Non_ACGTN
= ( long * ) realloc ( Non_ACGTN
, ( Scaffold_Number
+ 2 ) * sizeof ( long ) );
3557 Singleton_Seq
= ( long * ) realloc ( Singleton_Seq
, ( Scaffold_Number
+ 2 ) * sizeof ( long ) );
3561 A_num
[Scaffold_Number
- 1] = 0;
3562 C_num
[Scaffold_Number
- 1] = 0;
3563 G_num
[Scaffold_Number
- 1] = 0;
3564 T_num
[Scaffold_Number
- 1] = 0;
3565 N_num
[Scaffold_Number
- 1] = 0;
3566 Non_ACGTN
[Scaffold_Number
- 1] = 0;
3567 Size_Seq
[Scaffold_Number
- 1] = 0;
3568 Singleton_Seq
[Scaffold_Number
- 1] = 0;
3569 Nucleotide
= fgetc ( fp2
);
3571 if ( Nucleotide
== 'C' )
3574 Singleton_Seq
[Scaffold_Number
- 1] ++;
3577 fgets ( buf
, 4000, fp2
);
3579 else if ( ( Nucleotide
== 'N' ) || ( Nucleotide
== 'n' ) )
3581 N_num
[Scaffold_Number
- 1] ++;
3583 Size_Seq
[Scaffold_Number
- 1] ++;
3586 else if ( ( Nucleotide
== 'A' ) || ( Nucleotide
== 'a' ) )
3588 A_num
[Scaffold_Number
- 1] ++;
3590 Size_Seq
[Scaffold_Number
- 1] ++;
3593 else if ( ( Nucleotide
== 'C' ) || ( Nucleotide
== 'c' ) )
3595 C_num
[Scaffold_Number
- 1] ++;
3597 Size_Seq
[Scaffold_Number
- 1] ++;
3600 else if ( ( Nucleotide
== 'G' ) || ( Nucleotide
== 'g' ) )
3602 G_num
[Scaffold_Number
- 1] ++;
3604 Size_Seq
[Scaffold_Number
- 1] ++;
3607 else if ( ( Nucleotide
== 'T' ) || ( Nucleotide
== 't' ) )
3609 T_num
[Scaffold_Number
- 1] ++;
3611 Size_Seq
[Scaffold_Number
- 1] ++;
3616 if ( ( Nucleotide
!= '\n' ) && ( Nucleotide
!= '\r' ) )
3618 Non_ACGTN
[Scaffold_Number
- 1] ++;
3620 Size_Seq
[Scaffold_Number
- 1] ++;
3625 Nucleotide
= fgetc ( fp2
);
3628 if ( Size_Seq
[Scaffold_Number
- 1] < cut_off_len
)
3630 A_num_all
= A_num_all
- A_num
[Scaffold_Number
- 1];
3631 C_num_all
= C_num_all
- C_num
[Scaffold_Number
- 1];
3632 G_num_all
= G_num_all
- G_num
[Scaffold_Number
- 1];
3633 T_num_all
= T_num_all
- T_num
[Scaffold_Number
- 1];
3634 N_num_all
= N_num_all
- N_num
[Scaffold_Number
- 1];
3635 Non_ACGTN_all
= Non_ACGTN_all
- Non_ACGTN
[Scaffold_Number
- 1];
3636 Size_includeN
= Size_includeN
- Size_Seq
[Scaffold_Number
- 1];
3637 Singleton_Number
= Singleton_Number
- Singleton_Seq
[Scaffold_Number
- 1];
3638 Scaffold_Number
= Scaffold_Number
- 1;
3641 qsort ( Size_Seq
, Scaffold_Number
, sizeof ( Size_Seq
[0] ), cmp_int
);
3642 fprintf ( fo
, "Size_includeN\t%lld\n", Size_includeN
);
3643 fprintf ( fo
, "Size_withoutN\t%lld\n", Size_includeN
- N_num_all
);
3644 fprintf ( fo
, "Contig_Num\t%ld\n", Scaffold_Number
);
3645 fprintf ( fo
, "Mean_Size\t%lld\n", Size_includeN
/ Scaffold_Number
);
3646 fprintf ( fo
, "Median_Size\t%ld\n", Size_Seq
[ ( Scaffold_Number
+ 1 ) / 2 - 1] );
3647 fprintf ( fo
, "Longest_Seq\t%ld\n", Size_Seq
[Scaffold_Number
- 1] );
3648 fprintf ( fo
, "Shortest_Seq\t%ld\n", Size_Seq
[0] );
3649 fprintf ( fo
, "\n" );
3651 for ( k
= 0; k
< Scaffold_Number
; k
++ )
3653 if ( Size_Seq
[k
] > 100 )
3658 if ( Size_Seq
[k
] > 500 )
3663 if ( Size_Seq
[k
] > 1000 )
3668 if ( Size_Seq
[k
] > 10000 )
3673 if ( Size_Seq
[k
] > 100000 )
3678 if ( Size_Seq
[k
] > 1000000 )
3684 fprintf ( fo
, "Contig>100 \t%ld\t%.2f%\n", n100
, 100 * ( 1.0 * n100
/ Scaffold_Number
) );
3685 fprintf ( fo
, "Contig>500 \t%ld\t%.2f%\n", n500
, 100 * ( 1.0 * n500
/ Scaffold_Number
) );
3686 fprintf ( fo
, "Contig>1K \t%ld\t%.2f%\n", n1k
, 100 * ( 1.0 * n1k
/ Scaffold_Number
) );
3687 fprintf ( fo
, "Contig>10K \t%ld\t%.2f%\n", n10k
, 100 * ( 1.0 * n10k
/ Scaffold_Number
) );
3688 fprintf ( fo
, "Contig>100K\t%ld\t%.2f%\n", n100k
, 100 * ( 1.0 * n100k
/ Scaffold_Number
) );
3689 fprintf ( fo
, "Contig>1M \t%ld\t%.2f%\n", n1m
, 100 * ( 1.0 * n1m
/ Scaffold_Number
) );
3690 fprintf ( fo
, "\n" );
3691 fprintf ( fo
, "Nucleotide_A\t%lld\t%.2f%\n", A_num_all
, 100 * ( 1.0 * A_num_all
/ Size_includeN
) );
3692 fprintf ( fo
, "Nucleotide_C\t%lld\t%.2f%\n", C_num_all
, 100 * ( 1.0 * C_num_all
/ Size_includeN
) );
3693 fprintf ( fo
, "Nucleotide_G\t%lld\t%.2f%\n", G_num_all
, 100 * ( 1.0 * G_num_all
/ Size_includeN
) );
3694 fprintf ( fo
, "Nucleotide_T\t%lld\t%.2f%\n", T_num_all
, 100 * ( 1.0 * T_num_all
/ Size_includeN
) );
3695 fprintf ( fo
, "GapContent_N\t%lld\t%.2f%\n", N_num_all
, 100 * ( 1.0 * N_num_all
/ Size_includeN
) );
3696 fprintf ( fo
, "Non_ACGTN\t%lld\t%.2f%\n", Non_ACGTN_all
, 100 * ( 1.0 * Non_ACGTN_all
/ Size_includeN
) );
3697 fprintf ( fo
, "GC_Content\t%.2f%\t\t(G+C)/(A+C+G+T)\n", 100 * ( 1.0 * ( G_num_all
+ C_num_all
) / ( A_num_all
+ C_num_all
+ G_num_all
+ T_num_all
) ) );
3698 fprintf ( fo
, "\n" );
3700 for ( k
= 0; k
< 10; k
++ )
3703 for ( k
= Scaffold_Number
- 1; k
>= 0; k
-- )
3705 Sum
= Sum
+ Size_Seq
[k
];
3707 if ( ( Sum
>= Size_includeN
* 0.1 ) && ( Sum
< Size_includeN
* 0.2 ) && ( flag
[1] == 0 ) )
3709 fprintf ( fo
, "N10\t%ld\t%ld\n", Size_Seq
[k
], Scaffold_Number
- k
);
3712 else if ( ( Sum
>= Size_includeN
* 0.2 ) && ( Sum
< Size_includeN
* 0.3 ) && ( flag
[2] == 0 ) )
3714 fprintf ( fo
, "N20\t%ld\t%ld\n", Size_Seq
[k
], Scaffold_Number
- k
);
3717 else if ( ( Sum
>= Size_includeN
* 0.3 ) && ( Sum
< Size_includeN
* 0.4 ) && ( flag
[3] == 0 ) )
3719 fprintf ( fo
, "N30\t%ld\t%ld\n", Size_Seq
[k
], Scaffold_Number
- k
);
3722 else if ( ( Sum
>= Size_includeN
* 0.4 ) && ( Sum
< Size_includeN
* 0.5 ) && ( flag
[4] == 0 ) )
3724 fprintf ( fo
, "N40\t%ld\t%ld\n", Size_Seq
[k
], Scaffold_Number
- k
);
3727 else if ( ( Sum
>= Size_includeN
* 0.5 ) && ( Sum
< Size_includeN
* 0.6 ) && ( flag
[5] == 0 ) )
3729 fprintf ( fo
, "N50\t%ld\t%ld\n", Size_Seq
[k
], Scaffold_Number
- k
);
3733 else if ( ( Sum
>= Size_includeN
* 0.6 ) && ( Sum
< Size_includeN
* 0.7 ) && ( flag
[6] == 0 ) )
3735 fprintf ( fo
, "N60\t%ld\t%ld\n", Size_Seq
[k
], Scaffold_Number
- k
);
3738 else if ( ( Sum
>= Size_includeN
* 0.7 ) && ( Sum
< Size_includeN
* 0.8 ) && ( flag
[7] == 0 ) )
3740 fprintf ( fo
, "N70\t%ld\t%ld\n", Size_Seq
[k
], Scaffold_Number
- k
);
3743 else if ( ( Sum
>= Size_includeN
* 0.8 ) && ( Sum
< Size_includeN
* 0.9 ) && ( flag
[8] == 0 ) )
3745 fprintf ( fo
, "N80\t%ld\t%ld\n", Size_Seq
[k
], Scaffold_Number
- k
);
3748 else if ( ( Sum
>= Size_includeN
* 0.9 ) && ( flag
[9] == 0 ) )
3750 fprintf ( fo
, "N90\t%ld\t%ld\n", Size_Seq
[k
], Scaffold_Number
- k
);
3754 if ( ( Sum
>= known_genome_size
* 0.5 ) && ( flag_known
== 0 ) )
3756 N50_known
= Size_Seq
[k
];
3757 Num_N50_known
= Scaffold_Number
- k
;
3766 for ( k
= Scaffold_Number
- 1; k
>= 0; k
-- )
3768 Sum
= Sum
+ Size_Seq
[k
];
3770 if ( Sum
>= Size_includeN
* 0.5 )
3772 fprintf ( fo
, "N50\t%ld\t%ld\n", Size_Seq
[k
], Scaffold_Number
- k
);
3778 fprintf ( fo
, "\n" );
3780 if ( known_genome_size
)
3782 fprintf ( fo
, "NG50\t%ld\t%ld\n", N50_known
, Num_N50_known
);
3783 fprintf ( fo
, "N50_contig-NG50_contig_length_difference\t%ld\n", abs ( N50
- N50_known
) );
3787 fprintf ( fo
, "NG50\tNaN\tNaN\n" );
3788 fprintf ( fo
, "N50_contig-NG50_contig_length_difference\tNaN\n" );
3791 fprintf ( fo
, "\n" );
3798 free ( Singleton_Seq
);
3800 Scaffold_Number_Contig
= Scaffold_Number
;
3801 fprintf ( fo
, "Number_of_contigs_in_scaffolds\t%ld\n", Scaffold_Number_Contig
- Singleton_Number_Scaf
);
3802 fprintf ( fo
, "Number_of_contigs_not_in_scaffolds(Singleton)\t%ld\n", Singleton_Number_Scaf
);
3803 fprintf ( fo
, "Average_number_of_contigs_per_scaffold\t%.1f\n", 1.0 * ( Scaffold_Number_Contig
- Singleton_Number_Scaf
) / ( Scaffold_Number_Scaf
- Singleton_Number_Scaf
) );
3804 fprintf ( fo
, "\n" );
3810 /*************************************************
3814 Gets connection between two contigs.
3816 1. from_c: left contig
3817 2. to_c: right contig
3821 Connection between two contigs.
3822 *************************************************/
3823 CONNECT
* getCnt ( unsigned int from_c
, unsigned int to_c
)
3826 pcnt
= contig_array
[from_c
].downwardConnect
;
3830 if ( pcnt
->contigID
== to_c
)
3839 /*************************************************
3843 Counts contig's downstrem connection number if this contig does
3844 NOT have neigher upstream nor downstream connection in scaffold.
3847 2. preCNT: contig's upstream connection
3851 1 if contig had both upstream and downstream connections in
3853 Contig's downstream connection number otherwise.
3854 *************************************************/
3855 int allConnect ( unsigned int ctg
, CONNECT
* preCNT
)
3857 if ( preCNT
&& preCNT
->nextInScaf
)
3863 if ( !contig_array
[ctg
].downwardConnect
)
3866 cn_temp
= contig_array
[ctg
].downwardConnect
;
3871 cn_temp
= cn_temp
->next
;
3877 /*************************************************
3881 Calculates a score to indicate the strengh of connections
3882 between a contig and other contigs in scaffold.
3884 1. pos: contig index
3885 2. tempCounter: maximum contig number in scaffold
3890 *************************************************/
3891 int get_ctg_score2 ( int pos
, int tempCounter
)
3893 int id
, innum
, outnum
, in
= 0, out
= 0, i
, currId
;
3895 id
= * ( unsigned int * ) darrayGet ( tempArray
, pos
);
3896 outnum
= allConnect ( id
, NULL
);
3897 innum
= allConnect ( getTwinCtg ( id
), NULL
);
3898 int outlen
= 0, inlen
= 0, outcut
, incut
;
3900 if ( contig_array
[id
].downwardConnect
)
3901 { outlen
= contig_array
[id
].downwardConnect
->gapLen
; }
3903 if ( contig_array
[getTwinCtg ( id
)].downwardConnect
)
3904 { inlen
= contig_array
[getTwinCtg ( id
)].downwardConnect
->gapLen
; }
3906 outcut
= outlen
/ overlaplen
;
3907 incut
= inlen
/ overlaplen
;
3909 if ( outcut
> outnum
* 10 || outcut
< outnum
* 2 )
3910 { outcut
= outnum
* 10; }
3912 if ( incut
> innum
* 10 || incut
< innum
* 2 )
3913 { incut
= innum
* 10; }
3915 int start
= pos
- incut
> 0 ? pos
- incut
: 0;
3916 int end
= pos
+ outcut
< tempCounter
? pos
+ outcut
: tempCounter
;
3918 for ( i
= start
; i
< end
; i
++ )
3923 currId
= * ( unsigned int * ) darrayGet ( tempArray
, i
);
3927 dh_cnt
= getCnt ( currId
, id
);
3929 if ( dh_cnt
&& dh_cnt
->weight
> 0 )
3935 dh_cnt
= getCnt ( id
, currId
);
3937 if ( dh_cnt
&& dh_cnt
->weight
> 0 )
3945 if ( outnum
> tempCounter
- pos
- 1 )
3946 { outnum
= tempCounter
- pos
- 1; }
3948 if ( innum
> 0 && outnum
> 0 )
3950 if ( pos
< tempCounter
- 1 && pos
> 0 )
3952 if ( outnum
< 3 && out
== 0 )
3955 { return ( int ) ( ( ( double ) ( in
) / ( double ) ( innum
) ) * 100 ); }
3958 if ( innum
< 3 && in
== 0 )
3961 { return ( int ) ( ( ( double ) ( out
) / ( double ) ( outnum
) ) * 100 ); }
3964 if ( in
== 0 || out
== 0 )
3967 return ( int ) ( ( ( double ) ( in
* out
) / ( double ) ( innum
* outnum
) ) * 100 );
3971 { return ( int ) ( ( ( double ) ( out
) / ( double ) ( outnum
) ) * 100 ); }
3973 if ( pos
== tempCounter
- 1 )
3974 { return ( int ) ( ( ( double ) ( in
) / ( double ) ( innum
) ) * 100 ); }
3976 else if ( innum
> 0 )
3978 if ( pos
== tempCounter
- 1 && in
== 1 && innum
> 5 )
3981 return ( int ) ( ( ( double ) ( in
) / ( double ) ( innum
) ) * 100 );
3983 else if ( outnum
> 0 )
3985 if ( pos
== 0 && out
== 1 && outnum
> 5 )
3988 return ( int ) ( ( ( double ) ( out
) / ( double ) ( outnum
) ) * 100 );
3994 /*************************************************
3998 Calculates a score to indicate the strengh of connections
3999 between a contig and other contigs in scaffold.
4001 1. pos: contig index
4002 2. tempCounter: maximum contig number in scaffold
4007 *************************************************/
4008 int get_ctg_score ( int pos
, int num3
, int num5
, int flag
)
4010 int i
= 0, in
= 0, out
= 0, innum
= 0, outnum
= 0, threeid
;
4016 id
= * ( unsigned int * ) darrayGet ( scaf3
, pos
);
4017 int end
= pos
> 100 ? pos
- 100 : 0;
4018 int start
= pos
+ 100 < num3
? pos
+ 100 : num3
;
4021 for ( i
= start
; i
>= end
; i
-- )
4023 threeid
= * ( unsigned int * ) darrayGet ( scaf3
, i
);
4025 if ( threeid
== id
)
4031 dh_cnt
= getCnt ( id
, threeid
);
4033 if ( dh_cnt
&& dh_cnt
->weight
> 0 )
4038 dh_cnt
= getCnt ( threeid
, id
);
4040 if ( dh_cnt
&& dh_cnt
->weight
> 0 )
4044 outnum
= allConnect ( id
, NULL
);
4045 innum
= allConnect ( getTwinCtg ( id
), NULL
);
4048 if ( pos
- end
< outnum
)
4050 for ( i
= 0; i
< num5
; i
++ )
4053 threeid
= * ( unsigned int * ) darrayGet ( scaf5
, i
);
4054 dh_cnt
= getCnt ( id
, threeid
);
4056 if ( dh_cnt
&& dh_cnt
->weight
> 0 )
4059 if ( num5_check
== outnum
)
4064 if ( pos
- end
+ num5_check
< outnum
)
4065 { outnum
= pos
- end
+ num5_check
; }
4067 if ( start
- pos
< innum
)
4068 { innum
= start
- pos
; }
4070 if ( innum
> 0 && outnum
> 0 )
4072 if ( pos
!= num3
&& pos
> 0 )
4074 if ( outnum
< 5 && out
== 0 )
4077 { return ( int ) ( ( ( double ) ( in
) / ( double ) ( innum
) ) * 100 ); }
4080 if ( innum
< 5 && in
== 0 )
4083 { return ( int ) ( ( ( double ) ( out
) / ( double ) ( outnum
) ) * 100 ); }
4086 if ( in
== 0 || out
== 0 )
4089 return ( int ) ( ( ( double ) ( in
* out
) / ( double ) ( innum
* outnum
) ) * 100 );
4093 { return ( int ) ( ( ( double ) ( out
) / ( double ) ( outnum
) ) * 100 ); }
4096 { return ( int ) ( ( ( double ) ( in
) / ( double ) ( innum
) ) * 100 ); }
4098 else if ( innum
> 0 )
4100 return ( int ) ( ( ( double ) ( in
) / ( double ) ( innum
) ) * 100 );
4102 else if ( outnum
> 0 )
4104 if ( pos
== num3
&& out
== 1 && outnum
> 5 )
4107 return ( int ) ( ( ( double ) ( out
) / ( double ) ( outnum
) ) * 100 );
4114 id
= * ( unsigned int * ) darrayGet ( scaf5
, pos
);
4115 int start
= pos
> 100 ? pos
- 100 : 0;
4116 int end
= pos
+ 100 < num5
? pos
+ 100 : num5
;
4119 for ( i
= start
; i
< end
; i
++ )
4121 threeid
= * ( unsigned int * ) darrayGet ( scaf5
, i
);
4123 if ( threeid
== id
)
4129 dh_cnt
= getCnt ( id
, threeid
);
4131 if ( dh_cnt
&& dh_cnt
->weight
> 0 )
4134 dh_cnt
= getCnt ( threeid
, id
);
4136 if ( dh_cnt
&& dh_cnt
->weight
> 0 )
4140 outnum
= allConnect ( id
, NULL
);
4141 innum
= allConnect ( getTwinCtg ( id
), NULL
);
4144 if ( pos
- start
< innum
)
4146 for ( i
= 0; i
< num3
; i
++ )
4149 threeid
= * ( unsigned int * ) darrayGet ( scaf3
, i
);
4150 dh_cnt
= getCnt ( threeid
, id
);
4152 if ( dh_cnt
&& dh_cnt
->weight
> 0 )
4155 if ( num3_check
== innum
)
4160 if ( pos
- start
+ num3_check
< innum
)
4161 { innum
= pos
- start
+ num3_check
; }
4163 if ( end
- pos
- 1 < outnum
)
4164 { outnum
= end
- pos
- 1; }
4166 if ( innum
> 0 && outnum
> 0 )
4168 if ( pos
!= num5
- 1 && pos
> 0 )
4170 if ( outnum
< 5 && out
== 0 && innum
> 0 )
4171 { return ( int ) ( ( ( double ) ( in
) / ( double ) ( innum
) ) * 100 ); }
4173 if ( innum
< 5 && in
== 0 && outnum
> 0 )
4174 { return ( int ) ( ( ( double ) ( out
) / ( double ) ( outnum
) ) * 100 ); }
4176 if ( in
== 0 || out
== 0 )
4179 return ( int ) ( ( ( double ) ( in
* out
) / ( double ) ( innum
* outnum
) ) * 100 );
4183 { return ( int ) ( ( ( double ) ( out
) / ( double ) ( outnum
) ) * 100 ); }
4185 if ( pos
== num5
- 1 )
4186 { return ( int ) ( ( ( double ) ( in
) / ( double ) ( innum
) ) * 100 ); }
4188 else if ( innum
> 0 )
4190 if ( pos
== num5
- 1 && in
== 1 && innum
> 5 )
4193 return ( int ) ( ( ( double ) ( in
) / ( double ) ( innum
) ) * 100 );
4195 else if ( outnum
> 0 )
4196 { return ( int ) ( ( ( double ) ( out
) / ( double ) ( outnum
) ) * 100 ); }
4204 /*************************************************
4208 Masks contigs having low connection score in scaffold. Fills gaps
4209 by using ARC information. Outputs scaffold structure and filled
4212 1. len_cut: length cutoff
4213 2. outfile: prefix of graph
4218 *************************************************/
4219 void scaffolding ( unsigned int len_cut
, char * outfile
)
4221 unsigned int prev_ctg
, ctg
, bal_ctg
, *length_array
, count
= 0, num_lctg
= 0, *score_array
;
4222 unsigned int i
, max_steps
= 5;
4223 int num5
, num3
, j
, len
, flag
, num_route
, gap_c
= 0;
4226 long long sum
= 0, N50
, N90
;
4227 FILE * fp
, *fo
= NULL
;
4229 CONNECT
* cnt
, *prevCNT
, *nextCnt
, *dh_cnt
;
4230 boolean excep
, weak
;
4232 so_far
= ( unsigned int * ) ckalloc ( max_n_routes
* sizeof ( unsigned int ) );
4233 found_routes
= ( unsigned int ** ) ckalloc ( max_n_routes
* sizeof ( unsigned int * ) );
4235 for ( j
= 0; j
< max_n_routes
; j
++ )
4236 { found_routes
[j
] = ( unsigned int * ) ckalloc ( max_steps
* sizeof ( unsigned int ) ); }
4238 length_array
= ( unsigned int * ) ckalloc ( ( num_ctg
+ 1 ) * sizeof ( unsigned int ) );
4240 //use length_array to change info in index_array
4241 for ( i
= 1; i
<= num_ctg
; i
++ )
4242 { length_array
[i
] = 0; }
4244 for ( i
= 1; i
<= num_ctg
; i
++ )
4246 if ( index_array
[i
] > 0 )
4247 { length_array
[index_array
[i
]] = i
; }
4250 for ( i
= 1; i
<= num_ctg
; i
++ )
4251 { index_array
[i
] = length_array
[i
]; }
4254 sprintf ( name
, "%s.scaf", outfile
);
4255 fp
= ckopen ( name
, "w" );
4256 sprintf ( name
, "%s.scaf_gap", outfile
);
4257 fo
= ckopen ( name
, "w" );
4258 scaf3
= ( DARRAY
* ) createDarray ( 1000, sizeof ( unsigned int ) );
4259 scaf5
= ( DARRAY
* ) createDarray ( 1000, sizeof ( unsigned int ) );
4260 gap3
= ( DARRAY
* ) createDarray ( 1000, sizeof ( int ) );
4261 gap5
= ( DARRAY
* ) createDarray ( 1000, sizeof ( int ) );
4262 tempArray
= ( DARRAY
* ) createDarray ( 1000, sizeof ( unsigned int ) );
4264 for ( i
= 1; i
<= num_ctg
; i
++ )
4265 { contig_array
[i
].flag
= 0; }
4267 for ( i
= 1; i
<= num_ctg
; i
++ )
4269 if ( contig_array
[i
].length
+ ( unsigned int ) overlaplen
>= len_cut
)
4274 if ( contig_array
[i
].flag
|| contig_array
[i
].mask
|| !contig_array
[i
].downwardConnect
|| !validConnect ( i
, NULL
) )
4279 * ( unsigned int * ) darrayPut ( scaf5
, num5
++ ) = i
;
4280 contig_array
[i
].flag
= 1;
4281 bal_ctg
= getTwinCtg ( ctg
);
4282 contig_array
[bal_ctg
].flag
= 1;
4283 len
= contig_array
[i
].length
;
4285 cnt
= getNextContig ( ctg
, prevCNT
, &excep
);
4289 nextCnt
= getNextContig ( cnt
->contigID
, cnt
, &excep
);
4291 if ( excep
&& prevCNT
)
4292 { fprintf ( stderr
, "scaffolding: exception --- prev cnt from %u\n", prevCNT
->contigID
); }
4294 if ( nextCnt
&& nextCnt
->used
)
4297 setConnectUsed ( ctg
, cnt
->contigID
, 1 );
4298 * ( int * ) darrayPut ( gap5
, num5
- 1 ) = cnt
->gapLen
;
4299 ctg
= cnt
->contigID
;
4300 * ( unsigned int * ) darrayPut ( scaf5
, num5
++ ) = ctg
;
4301 len
+= cnt
->gapLen
+ contig_array
[ctg
].length
;
4302 bal_ctg
= getTwinCtg ( ctg
);
4303 contig_array
[ctg
].flag
= 1;
4304 contig_array
[bal_ctg
].flag
= 1;
4309 ctg
= getTwinCtg ( i
);
4312 { prevCNT
= checkConnect ( getTwinCtg ( * ( unsigned int * ) darrayGet ( scaf5
, 1 ) ), ctg
); }
4316 cnt
= getNextContig ( ctg
, prevCNT
, &excep
);
4320 nextCnt
= getNextContig ( cnt
->contigID
, cnt
, &excep
);
4322 if ( excep
&& prevCNT
)
4323 { fprintf ( stderr
, "scaffolding: exception -- prev cnt from %u\n", prevCNT
->contigID
); }
4325 if ( nextCnt
&& nextCnt
->used
)
4328 setConnectUsed ( ctg
, cnt
->contigID
, 1 );
4329 ctg
= cnt
->contigID
;
4330 len
+= cnt
->gapLen
+ contig_array
[ctg
].length
;
4331 bal_ctg
= getTwinCtg ( ctg
);
4332 contig_array
[ctg
].flag
= 1;
4333 contig_array
[bal_ctg
].flag
= 1;
4334 * ( int * ) darrayPut ( gap3
, num3
) = cnt
->gapLen
;
4335 * ( unsigned int * ) darrayPut ( scaf3
, num3
++ ) = bal_ctg
;
4340 if ( num5
+ num3
== 1 )
4342 contig_array
[i
].flag
= 0;
4348 length_array
[count
++] = len
;
4350 if ( num5
+ num3
< 1 )
4352 fprintf ( stderr
, "no scaffold created for contig %d\n", i
);
4358 for ( j
= num3
- 1; j
>= 0; j
-- )
4359 { * ( unsigned int * ) darrayPut ( tempArray
, tempCounter
++ ) = * ( unsigned int * ) darrayGet ( scaf3
, j
); }
4361 for ( j
= 0; j
< num5
; j
++ )
4362 { * ( unsigned int * ) darrayPut ( tempArray
, tempCounter
++ ) = * ( unsigned int * ) darrayGet ( scaf5
, j
); }
4364 score_array
= ( unsigned int * ) ckalloc ( tempCounter
* sizeof ( unsigned int ) );
4365 int now_cnt_weight
, curr_ctg_score
, pre_score
= -1, prev_id
= 0, mask_num
= 0, score_count
= 0, prev_p
= 0;
4367 for ( j
= 0; j
< tempCounter
; j
++ )
4369 int currId
= * ( unsigned int * ) darrayGet ( tempArray
, j
);
4370 dh_cnt
= getCntBetween ( prev_id
, currId
);
4373 { now_cnt_weight
= dh_cnt
->weight
; }
4375 { now_cnt_weight
= 0; }
4377 curr_ctg_score
= get_ctg_score2 ( j
, tempCounter
);
4381 pre_score
= curr_ctg_score
;
4389 if ( now_cnt_weight
== 0 && j
> 0 && j
< tempCounter
- 1 )
4391 if ( pre_score
== 0 )
4393 * ( unsigned int * ) darrayPut ( tempArray
, prev_p
) = 0;
4394 contig_array
[prev_id
].flag
= 0;
4395 contig_array
[getTwinCtg ( prev_id
)].flag
= 0;
4398 else if ( curr_ctg_score
== 0 )
4400 * ( unsigned int * ) darrayPut ( tempArray
, j
) = 0;
4401 contig_array
[currId
].flag
= 0;
4402 contig_array
[getTwinCtg ( currId
)].flag
= 0;
4405 if ( j
< tempCounter
- 1 )
4410 if ( abs ( prev_id
- currId
) <= 2 && now_cnt_weight
== 0
4411 && * ( unsigned int * ) darrayGet ( tempArray
, prev_p
) != 0 && * ( unsigned int * ) darrayGet ( tempArray
, j
) != 0 )
4415 if ( contig_array
[prev_id
].cvg
< contig_array
[currId
].cvg
)
4417 * ( unsigned int * ) darrayPut ( tempArray
, prev_p
) = 0;
4418 contig_array
[prev_id
].flag
= 0;
4419 contig_array
[getTwinCtg ( prev_id
)].flag
= 0;
4423 * ( unsigned int * ) darrayPut ( tempArray
, j
) = 0;
4424 contig_array
[currId
].flag
= 0;
4425 contig_array
[getTwinCtg ( currId
)].flag
= 0;
4427 if ( j
< tempCounter
- 1 )
4433 if ( * ( unsigned int * ) darrayGet ( tempArray
, prev_p
) != 0 )
4434 { score_array
[score_count
++] = pre_score
; }
4436 pre_score
= curr_ctg_score
;
4441 if ( * ( unsigned int * ) darrayGet ( tempArray
, prev_p
) != 0 )
4442 { score_array
[score_count
++] = pre_score
; }
4444 if ( score_mask
== 1 && ( ( num3
+ num5
> 5 && score_count
< 2 ) || score_count
== 1 ) )
4446 free ( ( void * ) score_array
);
4450 for ( j
= 0; j
< num3
; j
++ )
4452 ctg
= * ( unsigned int * ) darrayGet ( scaf3
, j
);
4453 contig_array
[ctg
].flag
= 0;
4454 contig_array
[getTwinCtg ( ctg
)].flag
= 0;
4457 for ( j
= 0; j
< num5
; j
++ )
4459 ctg
= * ( unsigned int * ) darrayGet ( scaf5
, j
);
4460 contig_array
[ctg
].flag
= 0;
4461 contig_array
[getTwinCtg ( ctg
)].flag
= 0;
4467 fprintf ( fp
, ">scaffold%d %d %d %d\n", count
, score_count
, len
, num3
+ num5
, mask_num
);
4468 fprintf ( fo
, ">scaffold%d %d %d %d\n", count
, score_count
, len
, num3
+ num5
, mask_num
);
4470 tempCounter
= 0, score_count
= 0;
4472 for ( j
= num3
- 1; j
>= 0; j
-- )
4474 int now_cnt_weigth
= 0;
4475 int nextid
, start
= 0;
4476 int currId
= * ( unsigned int * ) darrayGet ( scaf3
, j
);
4477 int tmpid
= * ( unsigned int * ) darrayGet ( tempArray
, tempCounter
++ );
4481 if ( j
== num3
- 1 )
4487 len
+= contig_array
[* ( unsigned int * ) darrayGet ( scaf3
, j
)].length
+ * ( int * ) darrayGet ( gap3
, j
);
4488 int tmpgap
= contig_array
[* ( unsigned int * ) darrayGet ( scaf3
, j
)].length
+ * ( int * ) darrayGet ( gap3
, j
);
4489 gap
+= tmpgap
> 0 ? tmpgap
: 0;
4495 nextid
= * ( unsigned int * ) darrayGet ( tempArray
, tempCounter
+ start
);
4497 while ( nextid
== 0 && tempCounter
+ start
+ 1 < num3
+ num5
)
4500 nextid
= * ( unsigned int * ) darrayGet ( tempArray
, tempCounter
+ start
);
4508 CONNECT
* dh_cnt
= getCntBetween ( currId
, nextid
);
4511 { now_cnt_weigth
= dh_cnt
->weight
; }
4513 { now_cnt_weigth
= 0; }
4515 curr_ctg_score
= score_array
[score_count
++];
4517 if ( score_mask
== 1 && curr_ctg_score
== 0 && ( num3
+ num5
> 2 )
4518 && ( ( j
== num3
- 1 && contig_array
[nextid
].length
< 200 && num3
+ num5
> 5 ) || ( now_cnt_weigth
== 0 && j
> 0 ) ) )
4520 if ( j
== num3
- 1 )
4526 len
+= contig_array
[* ( unsigned int * ) darrayGet ( scaf3
, j
)].length
+ * ( int * ) darrayGet ( gap3
, j
);
4527 int tmpgap
= contig_array
[* ( unsigned int * ) darrayGet ( scaf3
, j
)].length
+ * ( int * ) darrayGet ( gap3
, j
);
4528 gap
+= tmpgap
> 0 ? tmpgap
: 0;
4532 if ( !isLargerThanTwin ( * ( unsigned int * ) darrayGet ( scaf3
, j
) ) )
4534 fprintf ( fp
, "%-10d %-10d + %d %d %d"
4535 , index_array
[* ( unsigned int * ) darrayGet ( scaf3
, j
)], len
,
4536 contig_array
[* ( unsigned int * ) darrayGet ( scaf3
, j
)].length
+ overlaplen
,
4537 now_cnt_weigth
, curr_ctg_score
);
4538 weak
= printCnts ( fp
, * ( unsigned int * ) darrayGet ( scaf3
, j
) );
4542 fprintf ( fp
, "%-10d %-10d - %d %d %d"
4543 , index_array
[getTwinCtg ( * ( unsigned int * ) darrayGet ( scaf3
, j
) )], len
4544 , contig_array
[* ( unsigned int * ) darrayGet ( scaf3
, j
)].length
+ overlaplen
,
4545 now_cnt_weigth
, curr_ctg_score
);
4546 weak
= printCnts ( fp
, * ( unsigned int * ) darrayGet ( scaf3
, j
) );
4551 num_route
= num_trace
= 0;
4552 traceAlongArc ( * ( unsigned int * ) darrayGet ( scaf3
, j
), prev_ctg
, max_steps
4553 , gap
- ins_size_var
, gap
+ ins_size_var
, 0, 0, &num_route
);
4555 if ( num_route
== 1 )
4557 output1gap ( fo
, max_steps
);
4562 fprintf ( fo
, "%-10d %-10d\n", * ( unsigned int * ) darrayGet ( scaf3
, j
), len
);
4563 len
+= contig_array
[* ( unsigned int * ) darrayGet ( scaf3
, j
)].length
+ * ( int * ) darrayGet ( gap3
, j
);
4564 prev_ctg
= * ( unsigned int * ) darrayGet ( scaf3
, j
);
4565 gap
= * ( int * ) darrayGet ( gap3
, j
) > 0 ? * ( int * ) darrayGet ( gap3
, j
) : 0;
4568 for ( j
= 0; j
< num5
; j
++ )
4570 int now_cnt_weigth
= 0;
4571 int currId
, nextid
, start
= 0;
4572 currId
= * ( unsigned int * ) darrayGet ( scaf5
, j
);
4573 int tmpid
= * ( unsigned int * ) darrayGet ( tempArray
, tempCounter
++ );
4577 if ( j
== num5
- 1 )
4580 len
+= contig_array
[* ( unsigned int * ) darrayGet ( scaf5
, j
)].length
+ * ( int * ) darrayGet ( gap5
, j
);
4581 int tmpgap
= contig_array
[* ( unsigned int * ) darrayGet ( scaf5
, j
)].length
+ * ( int * ) darrayGet ( gap5
, j
);
4582 gap
+= tmpgap
> 0 ? tmpgap
: 0;
4588 nextid
= * ( unsigned int * ) darrayGet ( tempArray
, tempCounter
+ start
);
4590 while ( nextid
== 0 && tempCounter
+ start
+ 1 < num3
+ num5
)
4593 nextid
= * ( unsigned int * ) darrayGet ( tempArray
, tempCounter
+ start
);
4596 CONNECT
* dh_cnt
= getCntBetween ( currId
, nextid
);
4599 { now_cnt_weigth
= dh_cnt
->weight
; }
4601 { now_cnt_weigth
= 0; }
4604 curr_ctg_score
= score_array
[score_count
++];
4606 if ( score_mask
== 1 && curr_ctg_score
== 0 && ( num3
+ num5
> 2 )
4607 && ( ( j
== num5
- 1 && contig_array
[* ( unsigned int * ) darrayGet ( scaf5
, j
- 1 )].length
< 200 && num3
+ num5
> 5 )
4608 || ( j
!= num5
- 1 && now_cnt_weigth
== 0 ) ) )
4610 if ( j
== num5
- 1 )
4615 len
+= contig_array
[* ( unsigned int * ) darrayGet ( scaf5
, j
)].length
+ * ( int * ) darrayGet ( gap5
, j
);
4616 int tmpgap
= contig_array
[* ( unsigned int * ) darrayGet ( scaf5
, j
)].length
+ * ( int * ) darrayGet ( gap5
, j
);
4617 gap
+= tmpgap
> 0 ? tmpgap
: 0;
4621 if ( !isLargerThanTwin ( * ( unsigned int * ) darrayGet ( scaf5
, j
) ) )
4623 fprintf ( fp
, "%-10d %-10d + %d %d %d"
4624 , index_array
[* ( unsigned int * ) darrayGet ( scaf5
, j
)], len
4625 , contig_array
[* ( unsigned int * ) darrayGet ( scaf5
, j
)].length
+ overlaplen
,
4626 now_cnt_weigth
, curr_ctg_score
);
4627 weak
= printCnts ( fp
, * ( unsigned int * ) darrayGet ( scaf5
, j
) );
4631 fprintf ( fp
, "%-10d %-10d - %d %d %d"
4632 , index_array
[getTwinCtg ( * ( unsigned int * ) darrayGet ( scaf5
, j
) )], len
4633 , contig_array
[* ( unsigned int * ) darrayGet ( scaf5
, j
)].length
+ overlaplen
,
4634 now_cnt_weigth
, curr_ctg_score
);
4635 weak
= printCnts ( fp
, * ( unsigned int * ) darrayGet ( scaf5
, j
) );
4640 num_route
= num_trace
= 0;
4641 traceAlongArc ( * ( unsigned int * ) darrayGet ( scaf5
, j
), prev_ctg
, max_steps
4642 , gap
- ins_size_var
, gap
+ ins_size_var
, 0, 0, &num_route
);
4644 if ( num_route
== 1 )
4646 output1gap ( fo
, max_steps
);
4651 fprintf ( fo
, "%-10d %-10d\n", * ( unsigned int * ) darrayGet ( scaf5
, j
), len
);
4655 len
+= contig_array
[* ( unsigned int * ) darrayGet ( scaf5
, j
)].length
+
4656 * ( int * ) darrayGet ( gap5
, j
);
4657 prev_ctg
= * ( unsigned int * ) darrayGet ( scaf5
, j
);
4658 gap
= * ( int * ) darrayGet ( gap5
, j
) > 0 ? * ( int * ) darrayGet ( gap5
, j
) : 0;
4662 free ( ( void * ) score_array
);
4665 freeDarray ( scaf3
);
4666 freeDarray ( scaf5
);
4667 freeDarray ( gap3
);
4668 freeDarray ( gap5
);
4669 freeDarray ( tempArray
);
4672 fprintf ( stderr
, "\nThe final rank\n" );
4676 fprintf ( stderr
, "\n\nNo scaffold was constructed.\n\n" );
4677 free ( ( void * ) length_array
);
4679 for ( j
= 0; j
< max_n_routes
; j
++ )
4680 { free ( ( void * ) found_routes
[j
] ); }
4682 free ( ( void * ) found_routes
);
4683 free ( ( void * ) so_far
);
4687 fprintf ( stderr
, "\n*******************************\n" );
4688 fprintf ( stderr
, " Scaffold number %d\n", count
);
4689 fprintf ( stderr
, " In-scaffold contig number %u\n", num_lctg
/ 2 );
4690 fprintf ( stderr
, " Total scaffold length %lld\n", sum
);
4691 fprintf ( stderr
, " Average scaffold length %lld\n", sum
/ count
);
4692 fprintf ( stderr
, " Filled gap number %d\n", gap_c
);
4695 for ( i
= 1; i
<= num_ctg
; i
++ )
4697 if ( contig_array
[i
].length
+ ( unsigned int ) overlaplen
< len_cut
|| contig_array
[i
].flag
)
4700 length_array
[count
++] = contig_array
[i
].length
;
4701 sum
+= contig_array
[i
].length
;
4703 if ( isSmallerThanTwin ( i
) )
4707 long long total_len
= sum
;
4708 qsort ( length_array
, count
, sizeof ( length_array
[0] ), cmp_int
);
4715 for ( j
= count
- 1; j
>= 0; j
-- )
4717 sum
+= length_array
[j
];
4719 if ( !flag
&& sum
>= N50
&& N50length
== 0 )
4721 N50length
= length_array
[j
];
4725 if ( sum
>= N90
&& N90length
== 0 )
4727 N90length
= length_array
[j
];
4732 fprintf ( stderr
, " Longest scaffold %lld\n", length_array
[count
- 1] );
4733 fprintf ( stderr
, " Scaffold and singleton number %d\n", count
);
4734 fprintf ( stderr
, " Scaffold and singleton length %lld\n", total_len
);
4735 fprintf ( stderr
, " Average length %d\n", total_len
/ count
);
4736 fprintf ( stderr
, " N50 %d\n", N50length
);
4737 fprintf ( stderr
, " N90 %d\n", N90length
);
4738 fprintf ( stderr
, " Weak points %d\n", weakCounter
);
4739 fprintf ( stderr
, "\n*******************************\n" );
4741 free ( ( void * ) length_array
);
4743 for ( j
= 0; j
< max_n_routes
; j
++ )
4744 { free ( ( void * ) found_routes
[j
] ); }
4746 free ( ( void * ) found_routes
);
4747 free ( ( void * ) so_far
);
4750 /*************************************************
4754 Makes statistic of scaffold till current rank.
4756 1. rank: current rank number
4757 2. len_cut: length cutoff
4762 *************************************************/
4763 void scaffold_count ( int rank
, unsigned int len_cut
)
4765 static DARRAY
* scaf3
, *scaf5
;
4766 static DARRAY
* gap3
, *gap5
;
4767 unsigned int prev_ctg
, ctg
, bal_ctg
, *length_array
, count
= 0, num_lctg
= 0;
4768 unsigned int i
, max_steps
= 5;
4769 int num5
, num3
, j
, len
, flag
, num_route
, gap_c
= 0;
4771 long long sum
= 0, N50
, N90
;
4772 CONNECT
* cnt
, *prevCNT
, *nextCnt
;
4774 so_far
= ( unsigned int * ) ckalloc ( max_n_routes
* sizeof ( unsigned int ) );
4775 found_routes
= ( unsigned int ** ) ckalloc ( max_n_routes
* sizeof ( unsigned int * ) );
4777 for ( j
= 0; j
< max_n_routes
; j
++ )
4778 { found_routes
[j
] = ( unsigned int * ) ckalloc ( max_steps
* sizeof ( unsigned int ) ); }
4780 length_array
= ( unsigned int * ) ckalloc ( ( num_ctg
+ 1 ) * sizeof ( unsigned int ) );
4782 //use length_array to change info in index_array
4783 for ( i
= 1; i
<= num_ctg
; i
++ )
4784 { length_array
[i
] = 0; }
4786 for ( i
= 1; i
<= num_ctg
; i
++ )
4788 if ( index_array
[i
] > 0 )
4789 { length_array
[index_array
[i
]] = i
; }
4792 for ( i
= 1; i
<= num_ctg
; i
++ )
4793 { index_array
[i
] = length_array
[i
]; } //contig i with original index: index_array[i]
4796 scaf3
= ( DARRAY
* ) createDarray ( 1000, sizeof ( unsigned int ) );
4797 scaf5
= ( DARRAY
* ) createDarray ( 1000, sizeof ( unsigned int ) );
4798 gap3
= ( DARRAY
* ) createDarray ( 1000, sizeof ( int ) );
4799 gap5
= ( DARRAY
* ) createDarray ( 1000, sizeof ( int ) );
4801 for ( i
= 1; i
<= num_ctg
; i
++ )
4802 { contig_array
[i
].flag
= 0; }
4804 for ( i
= 1; i
<= num_ctg
; i
++ )
4806 if ( contig_array
[i
].length
+ ( unsigned int ) overlaplen
>= len_cut
)
4811 if ( contig_array
[i
].flag
|| contig_array
[i
].mask
|| !contig_array
[i
].downwardConnect
|| !validConnect ( i
, NULL
) )
4816 * ( unsigned int * ) darrayPut ( scaf5
, num5
++ ) = i
;
4817 contig_array
[i
].flag
= 1;
4818 bal_ctg
= getTwinCtg ( ctg
);
4819 contig_array
[bal_ctg
].flag
= 1;
4820 len
= contig_array
[i
].length
;
4822 cnt
= getNextContig ( ctg
, prevCNT
, &excep
);
4826 nextCnt
= getNextContig ( cnt
->contigID
, cnt
, &excep
);
4828 if ( excep
&& prevCNT
)
4829 { fprintf ( stderr
, "scaffolding: exception --- prev cnt from %u\n", prevCNT
->contigID
); }
4831 if ( nextCnt
&& nextCnt
->used
)
4834 setConnectUsed ( ctg
, cnt
->contigID
, 1 );
4835 * ( int * ) darrayPut ( gap5
, num5
- 1 ) = cnt
->gapLen
;
4836 ctg
= cnt
->contigID
;
4837 * ( unsigned int * ) darrayPut ( scaf5
, num5
++ ) = ctg
;
4838 len
+= cnt
->gapLen
+ contig_array
[ctg
].length
;
4839 bal_ctg
= getTwinCtg ( ctg
);
4840 contig_array
[ctg
].flag
= 1;
4841 contig_array
[bal_ctg
].flag
= 1;
4846 ctg
= getTwinCtg ( i
);
4849 { prevCNT
= checkConnect ( getTwinCtg ( * ( unsigned int * ) darrayGet ( scaf5
, 1 ) ), ctg
); }
4853 cnt
= getNextContig ( ctg
, prevCNT
, &excep
);
4857 nextCnt
= getNextContig ( cnt
->contigID
, cnt
, &excep
);
4859 if ( excep
&& prevCNT
)
4860 { fprintf ( stderr
, "scaffolding: exception -- prev cnt from %u\n", prevCNT
->contigID
); }
4862 if ( nextCnt
&& nextCnt
->used
)
4865 setConnectUsed ( ctg
, cnt
->contigID
, 1 );
4866 ctg
= cnt
->contigID
;
4867 len
+= cnt
->gapLen
+ contig_array
[ctg
].length
;
4868 bal_ctg
= getTwinCtg ( ctg
);
4869 contig_array
[ctg
].flag
= 1;
4870 contig_array
[bal_ctg
].flag
= 1;
4871 * ( int * ) darrayPut ( gap3
, num3
) = cnt
->gapLen
;
4872 * ( unsigned int * ) darrayPut ( scaf3
, num3
++ ) = bal_ctg
;
4879 length_array
[count
++] = len
;
4881 if ( num5
+ num3
< 1 )
4883 fprintf ( stderr
, "no scaffold created for contig %d\n", i
);
4889 for ( j
= num3
- 1; j
>= 0; j
-- )
4893 num_route
= num_trace
= 0;
4894 traceAlongArc ( * ( unsigned int * ) darrayGet ( scaf3
, j
), prev_ctg
, max_steps
4895 , gap
- ins_size_var
, gap
+ ins_size_var
, 0, 0, &num_route
);
4897 if ( num_route
== 1 )
4903 len
+= contig_array
[* ( unsigned int * ) darrayGet ( scaf3
, j
)].length
+ * ( int * ) darrayGet ( gap3
, j
);
4904 prev_ctg
= * ( unsigned int * ) darrayGet ( scaf3
, j
);
4905 gap
= * ( int * ) darrayGet ( gap3
, j
) > 0 ? * ( int * ) darrayGet ( gap3
, j
) : 0;
4908 for ( j
= 0; j
< num5
; j
++ )
4912 num_route
= num_trace
= 0;
4913 traceAlongArc ( * ( unsigned int * ) darrayGet ( scaf5
, j
), prev_ctg
, max_steps
4914 , gap
- ins_size_var
, gap
+ ins_size_var
, 0, 0, &num_route
);
4916 if ( num_route
== 1 )
4924 len
+= contig_array
[* ( unsigned int * ) darrayGet ( scaf5
, j
)].length
+
4925 * ( int * ) darrayGet ( gap5
, j
);
4926 prev_ctg
= * ( unsigned int * ) darrayGet ( scaf5
, j
);
4927 gap
= * ( int * ) darrayGet ( gap5
, j
) > 0 ? * ( int * ) darrayGet ( gap5
, j
) : 0;
4932 freeDarray ( scaf3
);
4933 freeDarray ( scaf5
);
4934 freeDarray ( gap3
);
4935 freeDarray ( gap5
);
4939 fprintf ( stderr
, "\n\nNo scaffold was constructed.\n\n" );
4940 free ( ( void * ) length_array
);
4942 for ( j
= 0; j
< max_n_routes
; j
++ )
4943 { free ( ( void * ) found_routes
[j
] ); }
4945 free ( ( void * ) found_routes
);
4946 free ( ( void * ) so_far
);
4950 fprintf ( stderr
, "\nRank %d\n", rank
);
4951 fprintf ( stderr
, " Scaffold number %d\n", count
);
4952 fprintf ( stderr
, " In-scaffold contig number %u\n", num_lctg
/ 2 );
4953 fprintf ( stderr
, " Total scaffold length %lld\n", sum
);
4954 fprintf ( stderr
, " Average scaffold length %lld\n", sum
/ count
);
4955 fprintf ( stderr
, " Filled gap number %d\n", gap_c
);
4958 for ( i
= 1; i
<= num_ctg
; i
++ )
4960 if ( contig_array
[i
].length
+ ( unsigned int ) overlaplen
< len_cut
|| contig_array
[i
].flag
)
4963 length_array
[count
++] = contig_array
[i
].length
;
4964 sum
+= contig_array
[i
].length
;
4966 if ( isSmallerThanTwin ( i
) )
4970 long int total_len
= sum
;
4971 // calculate N50/N90
4972 qsort ( length_array
, count
, sizeof ( length_array
[0] ), cmp_int
);
4979 for ( j
= count
- 1; j
>= 0; j
-- )
4981 sum
+= length_array
[j
];
4983 if ( !flag
&& sum
>= N50
&& N50length
== 0 )
4985 N50length
= length_array
[j
];
4989 if ( sum
>= N90
&& N90length
== 0 )
4991 N90length
= length_array
[j
];
4996 fprintf ( stderr
, " Longest scaffold %lld\n", length_array
[count
- 1] );
4997 fprintf ( stderr
, " Scaffold and singleton number %d\n", count
);
4998 fprintf ( stderr
, " Scaffold and singleton length %lld\n", total_len
);
4999 fprintf ( stderr
, " Average length %d\n", total_len
/ count
);
5000 fprintf ( stderr
, " N50 %d\n", N50length
);
5001 fprintf ( stderr
, " N90 %d\n", N90length
);
5002 free ( ( void * ) length_array
);
5004 for ( j
= 0; j
< max_n_routes
; j
++ )
5005 { free ( ( void * ) found_routes
[j
] ); }
5007 free ( ( void * ) found_routes
);
5008 free ( ( void * ) so_far
);
5012 /*************************************************
5016 Outputs connection information of current insert size LIB.
5019 2. insertS: current insert size
5024 *************************************************/
5025 static void outputLinks ( FILE * fp
, int insertS
)
5027 unsigned int i
, bal_ctg
, bal_toCtg
;
5028 CONNECT
* cnts
, *temp_cnt
;
5030 for ( i
= 1; i
<= num_ctg
; i
++ )
5032 cnts
= contig_array
[i
].downwardConnect
;
5033 bal_ctg
= getTwinCtg ( i
);
5037 if ( cnts
->weight
< 1 )
5043 fprintf ( fp
, "%-10d %-10d\t%d\t%d\t%d\n"
5044 , i
, cnts
->contigID
, cnts
->gapLen
, cnts
->weight
, insertS
);
5046 bal_toCtg
= getTwinCtg ( cnts
->contigID
);
5047 temp_cnt
= getCntBetween ( bal_toCtg
, bal_ctg
);
5050 { temp_cnt
->weight
= 0; }
5057 /*************************************************
5061 Updates connections between contigs based on alignment
5062 information of paired-end reads.
5064 1. infile: alignment information file of paired-end reads
5069 *************************************************/
5070 void PE2Links ( char * infile
)
5072 char name
[256], *line
;
5079 sprintf ( name
, "%s.links", infile
);
5080 boolean filesOK
= check_file ( name
);
5084 fprintf ( stderr
, "File %s exists, skip creating the links...\n", name
);
5088 linkF
= ckopen ( name
, "w" );
5091 { loadPEgrads ( infile
); }
5093 fprintf ( stderr
, "*****************************************************\nStart to load paired-end reads information.\n\n" );
5095 if ( COMPATIBLE_MODE
== 1 )
5097 sprintf ( name
, "%s.readOnContig", infile
);
5098 fp1
= ckopen ( name
, "r" );
5102 sprintf ( name
, "%s.readOnContig.gz", infile
);
5103 fp2
= gzopen ( name
, "r" );
5107 line
= ( char * ) ckalloc ( lineLen
* sizeof ( char ) );
5109 if ( COMPATIBLE_MODE
== 1 )
5111 fgets ( line
, lineLen
, fp1
);
5115 gzgets ( fp2
, line
, lineLen
);
5120 for ( i
= 0; i
< gradsCounter
; i
++ )
5122 createCntMemManager();
5123 createCntLookupTable();
5126 if ( COMPATIBLE_MODE
== 1 )
5128 flag
+= connectByPE_grad ( fp1
, i
, line
);
5132 flag
+= connectByPE_grad_gz ( fp2
, i
, line
);
5135 fprintf ( stderr
, "%lld new connections.\n\n", newCntCounter
/ 2 );
5139 destroyConnectMem();
5140 deleteCntLookupTable();
5142 for ( j
= 1; j
<= num_ctg
; j
++ )
5143 { contig_array
[j
].downwardConnect
= NULL
; }
5145 fprintf ( stderr
, "\n" );
5150 outputLinks ( linkF
, pes
[i
].insertS
);
5151 destroyConnectMem();
5152 deleteCntLookupTable();
5154 for ( j
= 1; j
<= num_ctg
; j
++ )
5155 { contig_array
[j
].downwardConnect
= NULL
; }
5158 free ( ( void * ) line
);
5160 if ( COMPATIBLE_MODE
== 1 )
5170 fprintf ( stderr
, "All paired-end reads information loaded.\n" );
5173 /*************************************************
5177 Loads links information of certain insert size.
5179 1. fp: alignment inofrmation file
5180 2. insertS: insert size boundary
5181 3. line: the first alignment record of current LIB
5183 1. line: the first alignment record of next LIB
5185 loaded record number.
5186 *************************************************/
5187 static int inputLinks ( FILE * fp
, int insertS
, char * line
)
5189 unsigned int ctg
, bal_ctg
, toCtg
, bal_toCtg
;
5191 unsigned int counter
= 0, onScafCounter
= 0;
5192 unsigned int maskCounter
= 0;
5193 CONNECT
* cnt
, *bal_cnt
;
5195 if ( strlen ( line
) )
5197 sscanf ( line
, "%d %d %d %d %d", &ctg
, &toCtg
, &gap
, &wt
, &ins
);
5199 if ( ins
!= insertS
)
5204 bal_ctg
= getTwinCtg ( ctg
);
5205 bal_toCtg
= getTwinCtg ( toCtg
);
5206 cnt
= add1Connect ( ctg
, toCtg
, gap
, wt
, 0 );
5207 bal_cnt
= add1Connect ( bal_toCtg
, bal_ctg
, gap
, wt
, 0 );
5209 if ( cnt
&& insertS
> 1000 )
5211 cnt
->newIns
= bal_cnt
->newIns
= 1;
5216 if ( contig_array
[ctg
].mask
|| contig_array
[toCtg
].mask
)
5219 if ( insertS
> 1000 &&
5220 contig_array
[ctg
].from_vt
== contig_array
[toCtg
].from_vt
&& // on the same scaff
5221 contig_array
[ctg
].indexInScaf
< contig_array
[toCtg
].indexInScaf
)
5223 add1LongPEcov ( ctg
, toCtg
, wt
);
5229 while ( fgets ( line
, lineLen
, fp
) != NULL
)
5231 sscanf ( line
, "%d %d %d %d %d", &ctg
, &toCtg
, &gap
, &wt
, &ins
);
5233 if ( ins
> insertS
)
5236 if ( insertS
> 1000 &&
5237 contig_array
[ctg
].from_vt
== contig_array
[toCtg
].from_vt
&& // on the same scaff
5238 contig_array
[ctg
].indexInScaf
< contig_array
[toCtg
].indexInScaf
)
5240 add1LongPEcov ( ctg
, toCtg
, wt
);
5244 bal_ctg
= getTwinCtg ( ctg
);
5245 bal_toCtg
= getTwinCtg ( toCtg
);
5246 cnt
= add1Connect ( ctg
, toCtg
, gap
, wt
, 0 );
5247 bal_cnt
= add1Connect ( bal_toCtg
, bal_ctg
, gap
, wt
, 0 );
5249 if ( cnt
&& insertS
> 1000 )
5251 cnt
->newIns
= bal_cnt
->newIns
= 1;
5256 if ( contig_array
[ctg
].mask
|| contig_array
[toCtg
].mask
)
5260 fprintf ( stderr
, "***************************\nFor insert size: %d\n", insertS
);
5261 fprintf ( stderr
, " Total PE links %d\n", counter
);
5262 fprintf ( stderr
, " PE links to masked contigs %d\n", maskCounter
);
5263 fprintf ( stderr
, " On same scaffold PE links %d\n", onScafCounter
);
5267 /*************************************************
5271 Constructs scaffolds based on alignment information.
5273 1. infile: prefix of graph
5278 *************************************************/
5279 void Links2Scaf ( char * infile
)
5281 char name
[256], *line
;
5283 int i
, j
= 1, lib_n
= 0, cutoff_sum
= 0;
5284 int flag
= 0, flag2
;
5285 boolean downS
, nonLinear
= 0, smallPE
= 0, isPrevSmall
= 0, markSmall
;
5287 if ( cvg4SNP
> 0.001 )
5289 sprintf ( name
, "%s.bubbleInScaff", infile
);
5290 snp_fp
= ckopen ( name
, "w" );
5293 cvg4SNP
= ( double ) ( cvg4SNP
* cvgAvg
);
5296 { loadPEgrads ( infile
); }
5298 sprintf ( name
, "%s.links", infile
);
5299 fp
= ckopen ( name
, "r" );
5300 createCntMemManager();
5301 createCntLookupTable();
5303 line
= ( char * ) ckalloc ( lineLen
* sizeof ( char ) );
5304 fgets ( line
, lineLen
, fp
);
5306 solidArray
= ( DARRAY
* ) createDarray ( 1000, sizeof ( unsigned int ) );
5307 tempArray
= ( DARRAY
* ) createDarray ( 1000, sizeof ( unsigned int ) );
5308 scaf3
= ( DARRAY
* ) createDarray ( 1000, sizeof ( unsigned int ) );
5309 scaf5
= ( DARRAY
* ) createDarray ( 1000, sizeof ( unsigned int ) );
5310 gap3
= ( DARRAY
* ) createDarray ( 1000, sizeof ( int ) );
5311 gap5
= ( DARRAY
* ) createDarray ( 1000, sizeof ( int ) );
5313 fprintf ( stderr
, "\n" );
5315 for ( i
= 0; i
< gradsCounter
; i
++ )
5317 if ( MinWeakCut
== 0 && i
== 0 )
5318 { MinWeakCut
= pes
[i
].pair_num_cut
; }
5320 if ( pes
[i
].insertS
< 1000 )
5324 if ( MinWeakCut
> pes
[i
].pair_num_cut
)
5325 { MinWeakCut
= pes
[i
].pair_num_cut
; }
5327 else if ( pes
[i
].insertS
> 1000 && isPrevSmall
)
5333 Insert_size
= pes
[i
].insertS
;
5334 discardCntCounter
= 0;
5335 flag2
= inputLinks ( fp
, pes
[i
].insertS
, line
);
5340 cutoff_sum
+= pes
[i
].pair_num_cut
;
5347 fprintf ( stderr
, "\n" );
5351 if ( i
== gradsCounter
- 1 || pes
[i
+ 1].rank
!= pes
[i
].rank
)
5353 flag
= nonLinear
= downS
= markSmall
= 0;
5355 if ( pes
[i
].insertS
> 1000 && pes
[i
].rank
> 1 )
5358 if ( pes
[i
].insertS
<= 1000 )
5361 if ( pes
[i
].insertS
>= 1000 )
5364 OverlapPercent
= 0.05;
5366 else if ( pes
[i
].insertS
>= 300 )
5369 OverlapPercent
= 0.05;
5374 OverlapPercent
= 0.05;
5377 if ( pes
[i
].insertS
> 1000 )
5380 bySmall
= Insert_size
> 1000 ? 0 : 1;
5384 weakPE
= weakPE
< cutoff_sum
/ lib_n
? cutoff_sum
/ lib_n
: weakPE
;
5385 lib_n
= cutoff_sum
= 0;
5388 if ( MinWeakCut
> weakPE
)
5389 { MinWeakCut
= weakPE
; }
5391 fprintf ( stderr
, "Cutoff of PE links to make a reliable connection: %d\n", weakPE
);
5393 if ( i
== gradsCounter
- 1 )
5396 if ( Insert_size
> 1000 )
5401 ordering ( 1, downS
, nonLinear
, infile
);
5403 if ( i
== gradsCounter
- 1 )
5409 scaffold_count ( j
, 100 );
5411 fprintf ( stderr
, "\n" );
5414 if ( Insert_size
> 1000 && i
!= gradsCounter
- 1 )
5421 freeDarray ( tempArray
);
5422 freeDarray ( solidArray
);
5423 freeDarray ( scaf3
);
5424 freeDarray ( scaf5
);
5425 freeDarray ( gap3
);
5426 freeDarray ( gap5
);
5427 free ( ( void * ) line
);
5430 if ( cvg4SNP
> 0.001 )
5435 fprintf ( stderr
, "\nAll links loaded.\n" );
5438 /*************************************************
5442 Puts contig in "ctg4heapArray".
5445 2. maxNodes: maximum allowed contig number in array
5446 3. dis: contig's distance to base contig
5450 1 if contig was already in array or putting operation succeeds.
5451 0 if array size was larger than cutoff or contig's reverse complement
5452 format was already in array.
5453 *************************************************/
5454 static boolean
putNodeInArray ( unsigned int node
, int maxNodes
, int dis
)
5456 if ( contig_array
[node
].inSubGraph
)
5459 int index
= nodeCounter
;
5461 if ( index
> maxNodes
)
5464 if ( contig_array
[getTwinCtg ( node
)].inSubGraph
)
5467 ctg4heapArray
[index
].ctgID
= node
;
5468 ctg4heapArray
[index
].dis
= dis
;
5469 contig_array
[node
].inSubGraph
= 1;
5470 ctg4heapArray
[index
].ds_shut4dheap
= 0;
5471 ctg4heapArray
[index
].us_shut4dheap
= 0;
5472 ctg4heapArray
[index
].ds_shut4uheap
= 0;
5473 ctg4heapArray
[index
].us_shut4uheap
= 0;
5477 /*************************************************
5481 Sets contig's status of "inSubGraph".
5483 1. flag: new status.
5488 *************************************************/
5489 static void setInGraph ( boolean flag
)
5493 nodeCounter
= nodeCounter
> MaxNodeInSub
? MaxNodeInSub
: nodeCounter
;
5495 for ( i
= 1; i
<= nodeCounter
; i
++ )
5497 node
= ctg4heapArray
[i
].ctgID
;
5500 { contig_array
[node
].inSubGraph
= flag
; }
5505 /*************************************************
5509 Gets contig's index in "ctg4heapArray".
5515 Contig's index if contig was in array.
5517 *************************************************/
5518 static int getIndexInArr ( const unsigned int node
)
5522 for ( ; i
<= nodeCounter
; ++i
)
5524 if ( node
== ctg4heapArray
[i
].ctgID
)
5532 /*************************************************
5536 Dispatchs a contig to heap according to its distance to base
5537 contig, then updates related information of heaps.
5539 1. dis: distance to base contig
5540 2. tempNode: contig to be dispatched
5541 3. maxNodes: maximum allowed contig number in sub-graph
5542 4. dheap: heap for downstream contigs of base contig
5543 5. uheap: heap for upstream contigs of base contig
5544 6. DmaxDis: maximum distance of downstream contig to base contig
5545 7. UmaxDis: maximum distance of upstream contig to base
5548 1. dheap: updated heap for downstream contigs
5549 2. uheap: updated heap for upstream contigs
5550 3. DmaxDis: updated maximum distance of downstream contig to base contig
5551 4. UmaxDis: updated maximum distance of upstream contig to base contig
5553 1 if contig was dispatched to "dheap".
5554 -1 if contig was dispatched to "uheap".
5556 *************************************************/
5557 static boolean
dispatch1node ( int dis
, unsigned int tempNode
, int maxNodes
,
5558 FibHeap
* dheap
, FibHeap
* uheap
, int * DmaxDis
, int * UmaxDis
)
5562 if ( dis
>= 0 ) // put it to Dheap
5565 ret
= putNodeInArray ( tempNode
, maxNodes
, dis
);
5570 insertNodeIntoHeap ( dheap
, dis
, nodeCounter
);
5572 if ( dis
> *DmaxDis
)
5577 else // put it to Uheap
5580 ret
= putNodeInArray ( tempNode
, maxNodes
, dis
);
5585 insertNodeIntoHeap ( uheap
, -dis
, nodeCounter
);
5586 int temp_len
= contig_array
[tempNode
].length
;
5588 if ( -dis
+ temp_len
> *UmaxDis
)
5589 { *UmaxDis
= -dis
+ contig_array
[tempNode
].length
; }
5597 /*************************************************
5601 Checks whether current contig is the furthest contig to base
5604 1. currNode: current contig
5605 2. dis: current contig's distance to base contig
5606 3. DmaxDis: maximum distance of downstream contig to base contig
5610 0 if current contig was not the furthest contig.
5611 *************************************************/
5612 static boolean
canDheapWait ( unsigned int currNode
, int dis
, int DmaxDis
)
5614 if ( dis
< DmaxDis
)
5620 /*************************************************
5624 Travers 'dheap' and adds related contigs into 'dheap' or 'uheap'.
5626 1. dheap: heap for downstream contigs of base contig
5627 2. uheap: heap for upstream contigs of base contig
5628 3. Dwait: indicator of whether all contigs in 'dheap' have been
5629 traversed, 1 for yes
5630 4. Uwait: indicator of whether all contigs in 'uheap' have been
5631 traversed, 1 for yes
5632 5. DmaxDis: maximum distance of downstream contig to base
5634 6. UmaxDis: maximum distance of upstream contig to base
5636 7. maxNodes: maximum allowed contig number in sub-graph
5638 1. dheap: updated heap for downstream contigs
5639 2. uheap: updated heap for upstream contigs
5640 3. Dwait: indicator of whether all contigs in 'dheap' have been
5641 traversed, 1 for yes
5642 4. Uwait: indicator of whether all contigs in 'uheap' have been
5643 traversed, 1 for yes
5644 5. DmaxDis: updated maximum distance of downstream contig to
5646 6. UmaxDis: updated maximum distance of upstream contig to
5649 0 if operation of putting contig into sub-graph failed.
5650 *************************************************/
5651 static boolean
workOnDheap ( FibHeap
* dheap
, FibHeap
* uheap
, boolean
* Dwait
, boolean
* Uwait
,
5652 int * DmaxDis
, int * UmaxDis
, int maxNodes
)
5657 unsigned int currNode
, twin
, tempNode
;
5658 CTGinHEAP
* ctgInHeap
;
5660 CONNECT
* us_cnt
, *ds_cnt
;
5662 boolean ret
, isEmpty
;
5664 while ( ( indexInArray
= removeNextNodeFromHeap ( dheap
) ) != 0 )
5666 ctgInHeap
= &ctg4heapArray
[indexInArray
];
5667 currNode
= ctgInHeap
->ctgID
;
5668 dis0
= ctgInHeap
->dis
;
5669 isEmpty
= IsHeapEmpty ( dheap
);
5670 twin
= getTwinCtg ( currNode
);
5671 us_cnt
= ctgInHeap
->us_shut4dheap
? NULL
: contig_array
[twin
].downwardConnect
;
5675 if ( us_cnt
->deleted
|| us_cnt
->mask
||
5676 contig_array
[getTwinCtg ( us_cnt
->contigID
)].inSubGraph
)
5678 us_cnt
= us_cnt
->next
;
5682 tempNode
= getTwinCtg ( us_cnt
->contigID
);
5684 if ( contig_array
[tempNode
].inSubGraph
)
5686 us_cnt
= us_cnt
->next
;
5690 dis
= dis0
- us_cnt
->gapLen
- ( int ) contig_array
[twin
].length
;
5691 ret
= dispatch1node ( dis
, tempNode
, maxNodes
, dheap
, uheap
, DmaxDis
, UmaxDis
);
5698 us_cnt
= us_cnt
->next
;
5701 if ( nodeCounter
> 1 && isEmpty
)
5703 *Dwait
= canDheapWait ( currNode
, dis0
, *DmaxDis
);
5707 isEmpty
= IsHeapEmpty ( dheap
);
5708 insertNodeIntoHeap ( dheap
, dis0
, indexInArray
);
5709 ctg4heapArray
[indexInArray
].us_shut4dheap
= 1;
5718 ds_cnt
= ctgInHeap
->ds_shut4dheap
? NULL
: contig_array
[currNode
].downwardConnect
;
5722 if ( ds_cnt
->deleted
|| ds_cnt
->mask
|| contig_array
[ds_cnt
->contigID
].inSubGraph
)
5724 ds_cnt
= ds_cnt
->next
;
5728 tempNode
= ds_cnt
->contigID
;
5729 dis
= dis0
+ ds_cnt
->gapLen
+ ( int ) contig_array
[tempNode
].length
;
5730 ret
= dispatch1node ( dis
, tempNode
, maxNodes
, dheap
, uheap
, DmaxDis
, UmaxDis
);
5737 ds_cnt
= ds_cnt
->next
;
5738 } // for each downstream connections
5739 } // for each node comes off the heap
5745 /*************************************************
5749 Checks whether current contig is the furthest contig to base
5752 1. currNode: current contig
5753 2. dis: current contig's distance to base contig
5754 3. DmaxDis: maximum distance of downstream contig to base contig
5758 0 if current contig was not the furthest contig.
5759 *************************************************/
5760 static boolean
canUheapWait ( unsigned int currNode
, int dis
, int UmaxDis
)
5762 int temp_len
= contig_array
[currNode
].length
;
5764 if ( -dis
+ temp_len
< UmaxDis
)
5770 /*************************************************
5774 Travers 'uheap' and adds related contigs into "dheap" or "uheap".
5776 1. dheap: heap for downstream contigs of base contig
5777 2. uheap: heap for upstream contigs of base contig
5778 3. Dwait: indicator of whether all contigs in 'dheap' have been
5779 traversed, 1 for yes
5780 4. Uwait: indicator of whether all contigs in 'uheap' have been
5781 traversed, 1 for yes
5782 5. DmaxDis: maximum distance of downstream contig to base contig
5783 6. UmaxDis: maximum distance of upstream contig to base contig
5784 7. maxNodes: maximum allowed contig number in sub-graph
5786 1. dheap: updated heap for downstream contigs
5787 2. uheap: updated heap for upstream contigs
5788 3. Dwait: indicator of whether all contigs in "dheap" have been
5789 traversed, 1 for yes
5790 4. Uwait: indicator of whether all contigs in "uheap" have been
5791 traversed, 1 for yes
5792 5. DmaxDis: updated maximum distance of downstream contig to
5794 6. UmaxDis: updated maximum distance of upstream contig to
5797 0 if operation of putting contig into sub-graph failed.
5798 *************************************************/
5799 static boolean
workOnUheap ( FibHeap
* dheap
, FibHeap
* uheap
, boolean
* Dwait
, boolean
* Uwait
,
5800 int * DmaxDis
, int * UmaxDis
, int maxNodes
)
5805 unsigned int currNode
, twin
, tempNode
;
5806 CTGinHEAP
* ctgInHeap
;
5808 CONNECT
* us_cnt
, *ds_cnt
;
5810 boolean ret
, isEmpty
;
5812 while ( ( indexInArray
= removeNextNodeFromHeap ( uheap
) ) != 0 )
5814 ctgInHeap
= &ctg4heapArray
[indexInArray
];
5815 currNode
= ctgInHeap
->ctgID
;
5816 dis0
= ctgInHeap
->dis
;
5817 isEmpty
= IsHeapEmpty ( uheap
);
5818 ds_cnt
= ctgInHeap
->ds_shut4uheap
? NULL
: contig_array
[currNode
].downwardConnect
;
5822 if ( ds_cnt
->deleted
|| ds_cnt
->mask
|| contig_array
[ds_cnt
->contigID
].inSubGraph
)
5824 ds_cnt
= ds_cnt
->next
;
5828 tempNode
= ds_cnt
->contigID
;
5829 dis
= dis0
+ ds_cnt
->gapLen
+ contig_array
[tempNode
].length
;
5830 ret
= dispatch1node ( dis
, tempNode
, maxNodes
, dheap
, uheap
, DmaxDis
, UmaxDis
);
5837 ds_cnt
= ds_cnt
->next
;
5838 } // for each downstream connections
5840 if ( nodeCounter
> 1 && isEmpty
)
5842 *Uwait
= canUheapWait ( currNode
, dis0
, *UmaxDis
);
5846 isEmpty
= IsHeapEmpty ( uheap
);
5847 insertNodeIntoHeap ( uheap
, dis0
, indexInArray
);
5848 ctg4heapArray
[indexInArray
].ds_shut4uheap
= 1;
5857 twin
= getTwinCtg ( currNode
);
5858 us_cnt
= ctgInHeap
->us_shut4uheap
? NULL
: contig_array
[twin
].downwardConnect
;
5862 if ( us_cnt
->deleted
|| us_cnt
->mask
||
5863 contig_array
[getTwinCtg ( us_cnt
->contigID
)].inSubGraph
)
5865 us_cnt
= us_cnt
->next
;
5869 tempNode
= getTwinCtg ( us_cnt
->contigID
);
5871 if ( contig_array
[tempNode
].inSubGraph
)
5873 us_cnt
= us_cnt
->next
;
5877 dis
= dis0
- us_cnt
->gapLen
- contig_array
[twin
].length
;
5878 ret
= dispatch1node ( dis
, tempNode
, maxNodes
, dheap
, uheap
, DmaxDis
, UmaxDis
);
5885 us_cnt
= us_cnt
->next
;
5887 } // for each node comes off the heap
5893 /*************************************************
5895 pickUpGeneralSubgraph
5897 Starting from a base contig, gathers related contigs to form a
5900 1. node1: base contig
5901 2. maxNodes: maximum allowed contig number in sub-graph
5905 0 if sub-graph was not found.
5906 *************************************************/
5907 static boolean
pickUpGeneralSubgraph ( unsigned int node1
, int maxNodes
)
5909 FibHeap
* Uheap
= newFibHeap(); // heap for upstream contigs to node1
5910 FibHeap
* Dheap
= newFibHeap();
5911 int UmaxDis
; // max distance upstream to node1
5913 boolean Uwait
; // wait signal for Uheap
5917 //initiate: node1 is put to array once, and to both Dheap and Uheap
5920 putNodeInArray ( node1
, maxNodes
, dis
);
5921 insertNodeIntoHeap ( Dheap
, dis
, nodeCounter
);
5922 ctg4heapArray
[nodeCounter
].us_shut4dheap
= 1;
5925 insertNodeIntoHeap ( Uheap
, dis
, nodeCounter
);
5926 ctg4heapArray
[nodeCounter
].ds_shut4uheap
= 1;
5928 UmaxDis
= contig_array
[node1
].length
;
5932 ret
= workOnDheap ( Dheap
, Uheap
, &Dwait
, &Uwait
, &DmaxDis
, &UmaxDis
, maxNodes
);
5937 destroyHeap ( Dheap
);
5938 destroyHeap ( Uheap
);
5942 ret
= workOnUheap ( Dheap
, Uheap
, &Dwait
, &Uwait
, &DmaxDis
, &UmaxDis
, maxNodes
);
5947 destroyHeap ( Dheap
);
5948 destroyHeap ( Uheap
);
5952 if ( Uwait
&& Dwait
)
5954 destroyHeap ( Dheap
);
5955 destroyHeap ( Uheap
);
5961 /*************************************************
5965 Compares two contigs according to their distances to base contig.
5967 1. a: pointer to the first contig in heap
5968 2. b: pointer to the second contig in heap
5972 1 if the first contig was further than the second contig.
5973 -1 if the second contig was further than the first contig.
5974 0 if the distances were equal.
5975 *************************************************/
5976 static int cmp_ctg ( const void * a
, const void * b
)
5979 A
= ( CTGinHEAP
* ) a
;
5980 B
= ( CTGinHEAP
* ) b
;
5982 if ( A
->dis
> B
->dis
)
5984 else if ( A
->dis
== B
->dis
)
5990 /*************************************************
5994 Checks if the sub-graph accords with the follwing criterions:
5995 1. The reversed complement of the first contig has neither
5996 downstream connection to contigs in sub-graph, nor more
5997 than one upstream connections in scaffold.
5998 2. The last contig has neither downstream connection to reversed
5999 complement of any contig in sub-graph, nor more than one
6000 upstream connections in scaffold.
6001 3. Except the first and the last contigs, none of other contigs in
6002 sub-graph has connection to contig which is out of sub-graph.
6008 1 if the sub-graph accorded with all above criterions.
6009 *************************************************/
6010 static boolean
checkEligible()
6012 unsigned int firstNode
= ctg4heapArray
[1].ctgID
;
6016 //check if the first node has incoming link from twin of any node in subgraph
6017 // or it has multi outgoing links bound to incoming links
6018 twin
= getTwinCtg ( firstNode
);
6019 CONNECT
* ite_cnt
= contig_array
[twin
].downwardConnect
;
6023 if ( ite_cnt
->deleted
|| ite_cnt
->mask
)
6025 ite_cnt
= ite_cnt
->next
;
6029 if ( contig_array
[ite_cnt
->contigID
].inSubGraph
)
6034 if ( ite_cnt
->prevInScaf
)
6042 ite_cnt
= ite_cnt
->next
;
6045 //check if the last node has outgoing link to twin of any node in subgraph
6046 // or it has multi outgoing links bound to incoming links
6047 unsigned int lastNode
= ctg4heapArray
[nodeCounter
].ctgID
;
6048 ite_cnt
= contig_array
[lastNode
].downwardConnect
;
6053 if ( ite_cnt
->deleted
|| ite_cnt
->mask
)
6055 ite_cnt
= ite_cnt
->next
;
6059 twin
= getTwinCtg ( ite_cnt
->contigID
);
6061 if ( contig_array
[twin
].inSubGraph
)
6066 if ( ite_cnt
->prevInScaf
)
6074 ite_cnt
= ite_cnt
->next
;
6077 //check if any node has outgoing link to node outside the subgraph
6078 for ( i
= 1; i
< nodeCounter
; i
++ )
6080 ite_cnt
= contig_array
[ctg4heapArray
[i
].ctgID
].downwardConnect
;
6084 if ( ite_cnt
->deleted
|| ite_cnt
->mask
)
6086 ite_cnt
= ite_cnt
->next
;
6090 if ( !contig_array
[ite_cnt
->contigID
].inSubGraph
)
6095 ite_cnt
= ite_cnt
->next
;
6099 //check if any node has incoming link from node outside the subgraph
6100 for ( i
= 2; i
<= nodeCounter
; i
++ )
6102 twin
= getTwinCtg ( ctg4heapArray
[i
].ctgID
);
6103 ite_cnt
= contig_array
[twin
].downwardConnect
;
6107 if ( ite_cnt
->deleted
|| ite_cnt
->mask
)
6109 ite_cnt
= ite_cnt
->next
;
6113 if ( !contig_array
[getTwinCtg ( ite_cnt
->contigID
)].inSubGraph
)
6118 ite_cnt
= ite_cnt
->next
;
6125 /*************************************************
6129 Copies one contig's values to another in array.
6131 1. init_array: pointer to target contig in array
6132 2. value_array: pointer to source contig in array
6137 *************************************************/
6138 static void arrayvalue ( CTGinHEAP
* init_array
, CTGinHEAP
* value_array
)
6140 init_array
->ctgID
= value_array
->ctgID
;
6141 init_array
->dis
= value_array
->dis
;
6142 init_array
->ds_shut4dheap
= value_array
->ds_shut4dheap
;
6143 init_array
->ds_shut4uheap
= value_array
->ds_shut4uheap
;
6144 init_array
->us_shut4dheap
= value_array
->us_shut4dheap
;
6145 init_array
->us_shut4uheap
= value_array
->us_shut4uheap
;
6148 /*************************************************
6152 Exchanges two contigs' values.
6154 1. from_id: the first contig in array
6155 2. to_id: the second contig in array
6160 *************************************************/
6161 static void arrayexchange ( unsigned int from_id
, unsigned int to_id
)
6164 arrayvalue ( &tmp_h
, & ( ctg4heapArray
[from_id
] ) );
6165 arrayvalue ( & ( ctg4heapArray
[from_id
] ), & ( ctg4heapArray
[to_id
] ) );
6166 arrayvalue ( & ( ctg4heapArray
[to_id
] ), &tmp_h
);
6169 static void deletearray ( unsigned int id
)
6173 for ( i
= 1; i
< nodeCounter
; i
++ )
6177 arrayvalue ( & ( ctg4heapArray
[i
] ), & ( ctg4heapArray
[i
+ 1] ) );
6184 int getnextInScafCtg ( int id
, int mask
, int flag
)
6186 CONNECT
* tmp_cn
= contig_array
[id
].downwardConnect
;
6191 if ( tmp_cn
->prevInScaf
)
6193 currId
= tmp_cn
->contigID
;
6195 if ( mask
!= 0 && tmp_cn
->contigID
!= mask
)
6199 tmp_cn
= tmp_cn
->next
;
6202 if ( mask
== currId
&& mask
!= 0 )
6205 if ( flag
== 0 && currId
!= 0 )
6206 { currId
= getTwinCtg ( currId
); }
6211 void delete_PrevNext ( int i
, int flag
)
6213 int id
= ctg4heapArray
[i
].ctgID
;
6214 int pid
= getTwinCtg ( id
);
6215 CONNECT
* ite_cnt
= contig_array
[id
].downwardConnect
;
6219 if ( ite_cnt
->mask
|| ite_cnt
->deleted
|| !contig_array
[ite_cnt
->contigID
].inSubGraph
)
6221 ite_cnt
= ite_cnt
->next
;
6226 { setNextInScaf ( ite_cnt
, NULL
); }
6229 { setPrevInScaf ( ite_cnt
, 0 ); }
6231 ite_cnt
= ite_cnt
->next
;
6234 ite_cnt
= contig_array
[pid
].downwardConnect
;
6238 if ( ite_cnt
->mask
|| ite_cnt
->deleted
|| !contig_array
[ite_cnt
->contigID
].inSubGraph
)
6240 ite_cnt
= ite_cnt
->next
;
6245 { setNextInScaf ( ite_cnt
, NULL
); }
6248 { setPrevInScaf ( ite_cnt
, 0 ); }
6250 ite_cnt
= ite_cnt
->next
;
6254 /*************************************************
6258 Gets contig's downstream connection provided by short insert
6259 size paired-end reads.
6263 1. nodeArray: array to store downstream contigs
6264 2. gapArray: array to store distance from current contig to
6267 Found downstream contig number.
6268 *************************************************/
6269 static int getCntNodes ( unsigned int node
, unsigned int * nodeArray
, unsigned int * gapArray
)
6272 CONNECT
* cnt
= contig_array
[node
].downwardConnect
;
6276 if ( 0 == bySmall
&& cnt
->weight
< 3 && !cnt
->smallIns
&& !cnt
->bySmall
)
6282 nodeArray
[count
] = cnt
->contigID
;
6283 gapArray
[count
++] = cnt
->gapLen
;
6285 if ( count
== MaxCntNode
)
6296 /*************************************************
6300 Calculates two contigs' distance through the common contig
6301 to which both contigs connected.
6303 1. cntCounter: contig numbet in the "cntNodeArr"
6304 2. cntNodeArr: the array of third-party contigs
6305 3. cntGapArr: the array of distances to third-party contigs
6306 4. node1: the first contig
6307 5. node2: the second contig
6308 6. tmpNode: the contig whose upstream/downstream contigs
6309 array was not selected as "cntNodeArr"
6311 1. cntCounter: common contig number of these two contigs
6313 Accumulated distance between these two contigs.
6314 *************************************************/
6315 static int calGapLen ( int * cntCounter
, unsigned int * cntNodeArr
, int * cntGapArr
,
6316 unsigned int node1
, unsigned int node2
, unsigned int tmpNode
)
6318 int i
= 0, gapLen
= 0, count
= 0;
6319 unsigned int target_node
;
6321 int len
= contig_array
[node2
].length
;
6323 for ( ; i
< *cntCounter
; ++i
)
6325 cnt
= getCntBetween ( tmpNode
, cntNodeArr
[i
] );
6327 if ( cnt
&& ( cnt
->weight
>= 3 || bySmall
|| cnt
->smallIns
|| cnt
->bySmall
) )
6329 if ( tmpNode
== node1
)
6331 gapLen
+= cnt
->gapLen
- cntGapArr
[i
] - len
;
6335 gapLen
+= cntGapArr
[i
] - cnt
->gapLen
- len
;
6342 *cntCounter
= count
;
6346 /*************************************************
6348 arrangeNodes_general
6350 Arranges contigs in sub-graph and updates related relation.
6357 *************************************************/
6358 static void arrangeNodes_general()
6360 int i
, j
, gap
, adjustedGap
;
6361 CONNECT
* ite_cnt
, *temp_cnt
, *bal_cnt
, *prev_cnt
, *next_cnt
, *dh_cnt
, *three_cnt
;
6362 unsigned int node1
, node2
, tmpNode
;
6363 unsigned int bal_nd1
, bal_nd2
;
6364 unsigned int pre_node
, bal_pre_node
, next_node
, bal_next_node
; // pre/next node that connected to first/last node if there is
6365 unsigned int first_node
, bal_first_node
, last_node
, bal_last_node
;
6366 unsigned int affected_node1
, bal_affected_node1
, affected_node2
, bal_affected_node2
;
6367 unsigned int exchangeNode1
= 0, exchangeNode2
= 0;
6368 int cntCounter
, comCount
;
6371 //delete original connections
6372 for ( i
= 1; i
<= nodeCounter
; i
++ )
6374 node1
= ctg4heapArray
[i
].ctgID
;
6375 ite_cnt
= contig_array
[node1
].downwardConnect
;
6379 if ( ite_cnt
->mask
|| ite_cnt
->deleted
|| !contig_array
[ite_cnt
->contigID
].inSubGraph
)
6381 ite_cnt
= ite_cnt
->next
;
6385 ite_cnt
->deleted
= 1;
6386 setNextInScaf ( ite_cnt
, NULL
);
6387 setPrevInScaf ( ite_cnt
, 0 );
6388 ite_cnt
= ite_cnt
->next
;
6391 bal_nd1
= getTwinCtg ( node1
);
6392 ite_cnt
= contig_array
[bal_nd1
].downwardConnect
;
6396 if ( ite_cnt
->mask
|| ite_cnt
->deleted
|| !contig_array
[getTwinCtg ( ite_cnt
->contigID
)].inSubGraph
)
6398 ite_cnt
= ite_cnt
->next
;
6402 ite_cnt
->deleted
= 1;
6403 setNextInScaf ( ite_cnt
, NULL
);
6404 setPrevInScaf ( ite_cnt
, 0 );
6405 ite_cnt
= ite_cnt
->next
;
6409 CONNECT
* first_cnt
= NULL
, *last_cnt
= NULL
, *tmp_cnt
;
6410 CONNECT
* affected_cnt
= NULL
, *bal_affected_cnt
= NULL
; //connections connected to pre_cnt and next_cnt
6411 pre_node
= bal_pre_node
= next_node
= bal_next_node
= 0;
6412 first_node
= ctg4heapArray
[1].ctgID
;
6413 bal_first_node
= getTwinCtg ( first_node
);
6414 ite_cnt
= contig_array
[bal_first_node
].downwardConnect
;
6418 if ( ite_cnt
->deleted
|| ite_cnt
->mask
)
6420 ite_cnt
= ite_cnt
->next
;
6424 if ( ite_cnt
->prevInScaf
)
6428 first_cnt
= ite_cnt
;
6431 bal_pre_node
= ite_cnt
->contigID
;
6432 pre_node
= getTwinCtg ( bal_pre_node
);
6433 tmp_cnt
= getCntBetween ( pre_node
, first_node
);
6436 ite_cnt
= ite_cnt
->next
;
6439 last_node
= ctg4heapArray
[nodeCounter
].ctgID
;
6440 bal_last_node
= getTwinCtg ( last_node
);
6441 ite_cnt
= contig_array
[last_node
].downwardConnect
;
6445 if ( ite_cnt
->deleted
|| ite_cnt
->mask
)
6447 ite_cnt
= ite_cnt
->next
;
6451 if ( ite_cnt
->prevInScaf
)
6458 next_node
= ite_cnt
->contigID
;
6459 bal_next_node
= getTwinCtg ( next_node
);
6460 tmp_cnt
= getCntBetween ( bal_next_node
, bal_last_node
);
6463 ite_cnt
= ite_cnt
->next
;
6466 prev_cnt
= next_cnt
= NULL
;
6468 for ( i
= 1; i
< nodeCounter
; ++i
)
6470 node1
= ctg4heapArray
[i
].ctgID
;
6471 node2
= ctg4heapArray
[i
+ 1].ctgID
;
6472 bal_nd1
= getTwinCtg ( node1
);
6473 bal_nd2
= getTwinCtg ( node2
);
6474 gap
= ctg4heapArray
[i
+ 1].dis
- ctg4heapArray
[i
].dis
6475 - contig_array
[node2
].length
;
6476 temp_cnt
= getCntBetween ( node1
, node2
);
6477 dh_cnt
= getCntBetween ( node2
, node1
);
6480 { three_cnt
= getCntBetween ( ctg4heapArray
[i
- 1].ctgID
, node2
); }
6484 tmp_dis
= ( int ) contig_array
[node1
].length
+ ( int ) contig_array
[node2
].length
+ gap
+ dh_cnt
->gapLen
;
6491 if ( temp_cnt
&& ( bySmall
|| temp_cnt
->bySmall
|| temp_cnt
->smallIns
|| !dh_cnt
|| !dh_cnt
->bySmall
|| !dh_cnt
->smallIns
) )
6493 temp_cnt
->deleted
= 0;
6495 bal_cnt
= getCntBetween ( bal_nd2
, bal_nd1
);
6496 bal_cnt
->deleted
= 0;
6499 else if ( dh_cnt
&& ( ( dh_cnt
->bySmall
|| dh_cnt
->smallIns
|| bySmall
)
6500 || ( ( -gap
> ( int ) contig_array
[node1
].length
|| -gap
> ( int ) contig_array
[node2
].length
)
6501 && tmp_dis
> 0 && tmp_dis
< 500 && dh_cnt
->weight
> 3 ) ) )
6503 dh_cnt
->deleted
= 0;
6505 dh_cnt
= getCntBetween ( bal_nd1
, bal_nd2
);
6506 dh_cnt
->deleted
= 0;
6508 arrayexchange ( i
, i
+ 1 );
6518 prev_cnt
->deleted
= 1;
6520 next_cnt
->deleted
= 1;
6526 bal_affected_node2
= next_cnt
->contigID
;
6527 affected_node2
= getTwinCtg ( bal_affected_node2
);
6528 bal_affected_cnt
= next_cnt
->nextInScaf
;
6529 bal_affected_node1
= bal_affected_cnt
->contigID
;
6530 affected_node1
= getTwinCtg ( bal_affected_node1
);
6531 affected_cnt
= getCntBetween ( affected_node1
, affected_node2
);
6533 if ( !affected_cnt
)
6535 fprintf ( stderr
, "affected cnt between %u(%u) and %u(%u) doesn't exists!\n", affected_node1
, bal_affected_node1
, affected_node2
, bal_affected_node2
);
6539 setNextInScaf ( affected_cnt
, NULL
);
6540 setPrevInScaf ( bal_affected_cnt
, 0 );
6541 setNextInScaf ( next_cnt
, NULL
);
6542 setPrevInScaf ( prev_cnt
, 0 );
6543 prev_cnt
->deleted
= 1;
6545 next_cnt
->deleted
= 1;
6552 if ( ( bySmall
> 0 && gap
< 0 )
6553 || ( -gap
> ( int ) contig_array
[node1
].length
|| -gap
> ( int ) contig_array
[node2
].length
)
6554 && ( i
!= nodeCounter
- 1 ) )
6556 adjustedGap
= comCount
= 0;
6557 uCntCounter1
= getCntNodes ( getTwinCtg ( node1
), uCntNodeArr1
, uCntGapArr1
);
6558 dCntCounter1
= getCntNodes ( node1
, dCntNodeArr1
, dCntGapArr1
);
6559 uCntCounter2
= getCntNodes ( getTwinCtg ( node2
), uCntNodeArr2
, uCntGapArr2
);
6560 dCntCounter2
= getCntNodes ( node2
, dCntNodeArr2
, dCntGapArr2
);
6562 if ( uCntCounter1
< uCntCounter2
)
6564 tmpNode
= getTwinCtg ( node2
);
6565 cntCounter
= uCntCounter1
;
6566 cntNodeArr
= &uCntNodeArr1
[0];
6567 cntGapArr
= &uCntGapArr1
[0];
6571 tmpNode
= getTwinCtg ( node1
);
6572 cntCounter
= uCntCounter2
;
6573 cntNodeArr
= &uCntNodeArr2
[0];
6574 cntGapArr
= &uCntGapArr2
[0];
6577 adjustedGap
+= calGapLen ( &cntCounter
, cntNodeArr
, cntGapArr
, getTwinCtg ( node2
), getTwinCtg ( node1
), tmpNode
);
6578 comCount
+= cntCounter
;
6580 if ( dCntCounter1
< dCntCounter2
)
6583 cntCounter
= dCntCounter1
;
6584 cntNodeArr
= &dCntNodeArr1
[0];
6585 cntGapArr
= &dCntGapArr1
[0];
6590 cntCounter
= dCntCounter2
;
6591 cntNodeArr
= &dCntNodeArr2
[0];
6592 cntGapArr
= &dCntGapArr2
[0];
6595 adjustedGap
+= calGapLen ( &cntCounter
, cntNodeArr
, cntGapArr
, node1
, node2
, tmpNode
);
6596 comCount
+= cntCounter
;
6600 gap
= adjustedGap
/ comCount
;
6604 if ( ( -gap
> ( int ) contig_array
[node1
].length
|| -gap
> ( int ) contig_array
[node2
].length
)
6605 && ( i
!= nodeCounter
- 1 ) && ( ( exchangeNode1
== 0 && exchangeNode2
== 0 )
6606 || ( exchangeNode1
!= ctg4heapArray
[i
+ 1].ctgID
&& exchangeNode2
!= ctg4heapArray
[i
].ctgID
) ) )
6608 exchangeNode1
= ctg4heapArray
[i
].ctgID
;
6609 exchangeNode2
= ctg4heapArray
[i
+ 1].ctgID
;
6610 arrayexchange ( i
, i
+ 1 );
6620 prev_cnt
->deleted
= 1;
6622 next_cnt
->deleted
= 1;
6628 bal_affected_node2
= next_cnt
->contigID
;
6629 affected_node2
= getTwinCtg ( bal_affected_node2
);
6630 bal_affected_cnt
= next_cnt
->nextInScaf
;
6631 bal_affected_node1
= bal_affected_cnt
->contigID
;
6632 affected_node1
= getTwinCtg ( bal_affected_node1
);
6633 affected_cnt
= getCntBetween ( affected_node1
, affected_node2
);
6635 if ( !affected_cnt
)
6637 fprintf ( stderr
, "affected cnt between %u(%u) and %u(%u) doesn't exists!\n", affected_node1
, bal_affected_node1
, affected_node2
, bal_affected_node2
);
6641 setNextInScaf ( affected_cnt
, NULL
);
6642 setPrevInScaf ( bal_affected_cnt
, 0 );
6643 setNextInScaf ( next_cnt
, NULL
);
6644 setPrevInScaf ( prev_cnt
, 0 );
6645 prev_cnt
->deleted
= 1;
6647 next_cnt
->deleted
= 1;
6653 temp_cnt
= allocateCN ( node2
, gap
);
6655 if ( cntLookupTable
)
6656 { putCnt2LookupTable ( node1
, temp_cnt
); }
6658 temp_cnt
->weight
= 0;
6659 temp_cnt
->next
= contig_array
[node1
].downwardConnect
;
6660 contig_array
[node1
].downwardConnect
= temp_cnt
;
6661 bal_cnt
= allocateCN ( bal_nd1
, gap
);
6663 if ( cntLookupTable
)
6664 { putCnt2LookupTable ( bal_nd2
, bal_cnt
); }
6666 bal_cnt
->weight
= 0;
6667 bal_cnt
->next
= contig_array
[bal_nd2
].downwardConnect
;
6668 contig_array
[bal_nd2
].downwardConnect
= bal_cnt
;
6673 setNextInScaf ( prev_cnt
, temp_cnt
);
6674 setPrevInScaf ( temp_cnt
, 1 );
6679 setNextInScaf ( bal_cnt
, next_cnt
);
6680 setPrevInScaf ( next_cnt
, 1 );
6683 prev_cnt
= temp_cnt
;
6689 if ( ctg4heapArray
[1].ctgID
== first_node
)
6691 bal_nd1
= first_cnt
->contigID
;
6692 node1
= getTwinCtg ( bal_nd1
);
6694 temp_cnt
= checkConnect ( node1
, node2
);
6695 bal_cnt
= first_cnt
;
6696 next_cnt
= checkConnect ( ctg4heapArray
[1].ctgID
, ctg4heapArray
[2].ctgID
);
6697 prev_cnt
= checkConnect ( getTwinCtg ( ctg4heapArray
[2].ctgID
), getTwinCtg ( ctg4heapArray
[1].ctgID
) );
6701 setNextInScaf ( temp_cnt
, next_cnt
);
6702 setPrevInScaf ( temp_cnt
->nextInScaf
, 0 );
6703 setPrevInScaf ( next_cnt
, 1 );
6704 setNextInScaf ( prev_cnt
, bal_cnt
);
6709 bal_pre_node
= first_cnt
->contigID
;
6710 pre_node
= getTwinCtg ( bal_pre_node
);
6712 node1
= ctg4heapArray
[j
].ctgID
;
6713 node2
= ctg4heapArray
[j
+ 1].ctgID
;
6714 ite_cnt
= getCntBetween ( pre_node
, node1
);
6715 bal_cnt
= getCntBetween ( getTwinCtg ( node1
), bal_pre_node
);
6717 while ( !ite_cnt
&& node2
!= first_node
)
6719 tmp_cnt
= getCntBetween ( node1
, node2
);
6720 bal_cnt
= getCntBetween ( getTwinCtg ( node2
), getTwinCtg ( node1
) );
6721 setNextInScaf ( tmp_cnt
, NULL
);
6722 setPrevInScaf ( tmp_cnt
, 0 );
6723 tmp_cnt
->deleted
= 1;
6724 setNextInScaf ( bal_cnt
, NULL
);
6725 setPrevInScaf ( bal_cnt
, 0 );
6726 bal_cnt
->deleted
= 1;
6728 node1
= ctg4heapArray
[j
].ctgID
;
6729 node2
= ctg4heapArray
[j
+ 1].ctgID
;
6730 ite_cnt
= getCntBetween ( pre_node
, node1
);
6731 bal_cnt
= getCntBetween ( getTwinCtg ( node1
), bal_pre_node
);
6736 tmp_cnt
= getCntBetween ( node1
, first_node
);
6737 gap
= first_cnt
->gapLen
- tmp_cnt
->gapLen
- contig_array
[node1
].length
;
6738 ite_cnt
= allocateCN ( node1
, gap
);
6739 ite_cnt
->weight
= 0;
6741 if ( cntLookupTable
)
6743 putCnt2LookupTable ( pre_node
, ite_cnt
);
6746 ite_cnt
->next
= contig_array
[pre_node
].downwardConnect
;
6747 contig_array
[pre_node
].downwardConnect
= ite_cnt
;
6748 bal_cnt
= allocateCN ( bal_pre_node
, gap
);
6749 bal_cnt
->weight
= 0;
6751 if ( cntLookupTable
)
6753 putCnt2LookupTable ( getTwinCtg ( node1
), bal_cnt
);
6756 bal_cnt
->next
= contig_array
[getTwinCtg ( node1
)].downwardConnect
;
6757 contig_array
[getTwinCtg ( node1
)].downwardConnect
= bal_cnt
;
6760 ite_cnt
->deleted
= 0;
6762 bal_cnt
->deleted
= 0;
6764 tmp_cnt
= getCntBetween ( node1
, node2
);
6765 setNextInScaf ( ite_cnt
, tmp_cnt
);
6766 setPrevInScaf ( tmp_cnt
, 1 );
6767 tmp_cnt
= getCntBetween ( getTwinCtg ( node2
), getTwinCtg ( node1
) );
6768 setNextInScaf ( tmp_cnt
, bal_cnt
);
6769 setPrevInScaf ( bal_cnt
, 1 );
6771 if ( first_cnt
->nextInScaf
)
6773 setNextInScaf ( bal_cnt
, first_cnt
->nextInScaf
);
6774 bal_affected_node1
= first_cnt
->nextInScaf
->contigID
;
6775 affected_node1
= getTwinCtg ( bal_affected_node1
);
6776 affected_cnt
= getCntBetween ( affected_node1
, pre_node
);
6777 setNextInScaf ( affected_cnt
, ite_cnt
);
6778 setPrevInScaf ( ite_cnt
, 1 );
6781 setNextInScaf ( first_cnt
, NULL
);
6782 setPrevInScaf ( first_cnt
, 0 );
6783 first_cnt
->deleted
= 1;
6784 first_cnt
->mask
= 1;
6785 tmp_cnt
= getCntBetween ( pre_node
, first_node
);
6786 setNextInScaf ( tmp_cnt
, NULL
);
6787 setPrevInScaf ( tmp_cnt
, 0 );
6788 tmp_cnt
->deleted
= 1;
6795 node1
= ctg4heapArray
[nodeCounter
].ctgID
;
6797 if ( node1
== last_node
)
6799 node2
= last_cnt
->contigID
;
6800 bal_nd1
= getTwinCtg ( node1
);
6801 bal_nd2
= getTwinCtg ( node2
);
6802 temp_cnt
= last_cnt
;
6803 bal_cnt
= checkConnect ( bal_nd2
, bal_nd1
);
6804 next_cnt
= checkConnect ( getTwinCtg ( ctg4heapArray
[nodeCounter
].ctgID
),
6805 getTwinCtg ( ctg4heapArray
[nodeCounter
- 1].ctgID
) );
6806 prev_cnt
= checkConnect ( ctg4heapArray
[nodeCounter
- 1].ctgID
, ctg4heapArray
[nodeCounter
].ctgID
);
6807 setNextInScaf ( prev_cnt
, temp_cnt
);
6808 setNextInScaf ( bal_cnt
, next_cnt
);
6809 setPrevInScaf ( next_cnt
, 1 );
6813 next_node
= last_cnt
->contigID
;
6814 bal_next_node
= getTwinCtg ( next_node
);
6816 node1
= ctg4heapArray
[j
- 1].ctgID
;
6817 node2
= ctg4heapArray
[j
].ctgID
;
6818 ite_cnt
= getCntBetween ( node2
, next_node
);
6819 bal_cnt
= getCntBetween ( bal_next_node
, getTwinCtg ( node2
) );
6821 while ( !ite_cnt
&& node1
!= last_node
)
6823 tmp_cnt
= getCntBetween ( node1
, node2
);
6824 bal_cnt
= getCntBetween ( getTwinCtg ( node2
), getTwinCtg ( node1
) );
6825 setNextInScaf ( tmp_cnt
, NULL
);
6826 setPrevInScaf ( tmp_cnt
, 0 );
6827 tmp_cnt
->deleted
= 1;
6828 setNextInScaf ( bal_cnt
, NULL
);
6829 setPrevInScaf ( bal_cnt
, 0 );
6830 bal_cnt
->deleted
= 1;
6832 node1
= ctg4heapArray
[j
- 1].ctgID
;
6833 node2
= ctg4heapArray
[j
].ctgID
;
6834 ite_cnt
= getCntBetween ( node2
, next_node
);
6835 bal_cnt
= getCntBetween ( bal_next_node
, getTwinCtg ( node2
) );
6840 tmp_cnt
= getCntBetween ( node1
, node2
);
6841 gap
= last_cnt
->gapLen
- tmp_cnt
->gapLen
- contig_array
[node2
].length
;
6842 ite_cnt
= allocateCN ( next_node
, gap
);
6843 ite_cnt
->weight
= 0;
6845 if ( cntLookupTable
)
6847 putCnt2LookupTable ( node2
, ite_cnt
);
6850 ite_cnt
->next
= contig_array
[node2
].downwardConnect
;
6851 contig_array
[node2
].downwardConnect
= ite_cnt
;
6852 bal_cnt
= allocateCN ( getTwinCtg ( node2
), gap
);
6853 bal_cnt
->weight
= 0;
6855 if ( cntLookupTable
)
6857 putCnt2LookupTable ( bal_next_node
, bal_cnt
);
6860 bal_cnt
->next
= contig_array
[bal_next_node
].downwardConnect
;
6861 contig_array
[bal_next_node
].downwardConnect
= bal_cnt
;
6864 ite_cnt
->deleted
= 0;
6866 bal_cnt
->deleted
= 0;
6868 tmp_cnt
= getCntBetween ( node1
, node2
);
6869 setNextInScaf ( tmp_cnt
, ite_cnt
);
6870 setPrevInScaf ( ite_cnt
, 1 );
6871 tmp_cnt
= getCntBetween ( getTwinCtg ( node2
), getTwinCtg ( node1
) );
6872 setNextInScaf ( bal_cnt
, tmp_cnt
);
6873 setPrevInScaf ( tmp_cnt
, 1 );
6875 if ( last_cnt
->nextInScaf
)
6877 setNextInScaf ( ite_cnt
, last_cnt
->nextInScaf
);
6878 affected_node1
= last_cnt
->nextInScaf
->contigID
;
6879 bal_affected_node1
= getTwinCtg ( affected_node1
);
6880 bal_affected_cnt
= getCntBetween ( bal_affected_node1
, bal_next_node
);
6881 setNextInScaf ( bal_affected_cnt
, bal_cnt
);
6882 setPrevInScaf ( bal_cnt
, 1 );
6885 setNextInScaf ( last_cnt
, NULL
);
6886 setPrevInScaf ( last_cnt
, 0 );
6887 last_cnt
->deleted
= 1;
6888 tmp_cnt
= getCntBetween ( bal_next_node
, bal_last_node
);
6889 setNextInScaf ( tmp_cnt
, NULL
);
6890 setPrevInScaf ( tmp_cnt
, 0 );
6891 tmp_cnt
->deleted
= 1;
6896 /*************************************************
6898 checkOverlapInBetween_general
6900 Checks if adjacent contigs in the array have reasonable overlap.
6902 1. tolerance: max percentage that overlap length accounts for
6906 1 if the overlap situation was resonable.
6907 *************************************************/
6908 boolean
checkOverlapInBetween_general ( double tolerance
)
6911 unsigned int node1
, node2
;
6914 lenSum
= lenOlp
= 0;
6916 for ( i
= 1; i
<= nodeCounter
; i
++ )
6918 node1
= ctg4heapArray
[i
].ctgID
;
6919 lenSum
+= contig_array
[node1
].length
;
6925 for ( i
= 1; i
< nodeCounter
; i
++ )
6927 node2
= ctg4heapArray
[i
+ 1].ctgID
;
6928 gap
= ctg4heapArray
[i
+ 1].dis
- ctg4heapArray
[i
].dis
6929 - contig_array
[node2
].length
;
6933 node1
= ctg4heapArray
[i
].ctgID
;
6934 cnt
= getCntBetween ( node1
, node2
);
6936 if ( cnt
&& cnt
->gapLen
> gap
)
6940 else if ( ( cnt
= getCntBetween ( node2
, node1
) ) != NULL
6941 && cnt
->gapLen
> gap
)
6945 else if ( -gap
< overlaplen
)
6953 if ( ( double ) lenOlp
/ lenSum
> tolerance
)
6960 int canexchange
= 0, exchange_num
= 0, failexchange
= 0;
6962 /*************************************************
6964 checkConflictCnt_general
6966 Checks if the conflict connections between adjacent contigs
6967 in the array are acceptable.
6969 1. tolerance: max percentage that conflict connections accounts for
6974 *************************************************/
6975 static boolean
checkConflictCnt_general ( double tolerance
)
6978 int supportCounter
= 0;
6979 int objectCounter
= 0;
6982 for ( i
= 1; i
< nodeCounter
; i
++ )
6984 for ( j
= i
+ 1; j
<= nodeCounter
; j
++ )
6986 cnt
= checkConnect ( ctg4heapArray
[i
].ctgID
, ctg4heapArray
[j
].ctgID
);
6989 { supportCounter
+= cnt
->weight
; }
6991 cnt
= checkConnect ( ctg4heapArray
[j
].ctgID
, ctg4heapArray
[i
].ctgID
);
6995 gap
= ctg4heapArray
[j
].dis
- ctg4heapArray
[i
].dis
- contig_array
[ctg4heapArray
[j
].ctgID
].length
;
6997 if ( gap
> -overlaplen
&& gap
>= cnt
->gapLen
&& cnt
->inherit
== 0 )
6999 objectCounter
+= cnt
->weight
;
7005 if ( supportCounter
< 1 )
7008 if ( ( double ) objectCounter
/ supportCounter
< tolerance
)
7014 /*************************************************
7016 getIndexInSortedSubgraph
7018 Gets contig's index in sorted array.
7021 2. count: array size
7025 Contig's index if found.
7027 *************************************************/
7028 static int getIndexInSortedSubgraph ( unsigned int node
, int count
)
7032 for ( index
= 0; index
< count
; ++index
)
7034 if ( nodesInSubInOrder
[index
] == node
)
7041 /*************************************************
7043 getIndexInSortedSubgraph
7045 Gets contig's arc if contig has only one arc with weight > 1.
7051 Pointer to arc if existed.
7053 *************************************************/
7054 static preARC
* getValidArc ( unsigned int node
)
7057 preARC
* arc
= contig_array
[node
].arcs
;
7061 if ( arc
->multiplicity
> 1 )
7077 static boolean
clearUpSubgraph()
7079 unsigned int i
, ctg1
, bal_ctg1
, ctg2
, bal_ctg2
;
7080 int j
, arc_num
, num5
, num3
, index
= 0, count
= 0;
7084 //put all contigs in "nodesInSub" array
7085 for ( i
= 1; i
<= nodeCounter
; ++i
)
7087 nodesInSub
[i
- 1] = ctg4heapArray
[i
].ctgID
;
7090 for ( i
= 0; i
< nodeCounter
; ++i
)
7092 ctg1
= nodesInSub
[i
];
7093 index
= getIndexInSortedSubgraph ( ctg1
, count
);
7095 if ( index
>= 0 && index
< count
- 1 ) //this contig is already in array
7098 bal_ctg1
= getTwinCtg ( ctg1
);
7102 * ( unsigned int * ) darrayPut ( scaf5
, num5
++ ) = ctg1
;
7103 arc
= getValidArc ( ctg1
);
7108 bal_ctg2
= getTwinCtg ( ctg2
);
7110 if ( ( arc
= getValidArc ( bal_ctg2
) ) == NULL
)
7114 else if ( arc
->to_ed
!= bal_ctg1
)
7120 * ( unsigned int * ) darrayPut ( scaf5
, num5
++ ) = ctg1
;
7121 arc
= getValidArc ( ctg1
);
7124 ctg1
= nodesInSub
[i
];
7125 arc
= getValidArc ( bal_ctg1
);
7129 bal_ctg2
= arc
->to_ed
;
7130 ctg2
= getTwinCtg ( bal_ctg2
);
7132 if ( ( arc
= getValidArc ( ctg2
) ) == NULL
)
7136 else if ( arc
->to_ed
!= ctg1
)
7142 * ( unsigned int * ) darrayPut ( scaf3
, num3
++ ) = ctg1
;
7143 arc
= getValidArc ( bal_ctg2
);
7146 for ( j
= num3
- 1; j
>= 0; --j
)
7148 nodesInSubInOrder
[index
++] = * ( unsigned int * ) darrayGet ( scaf3
, j
);
7151 for ( j
= 0; j
< num5
; ++j
)
7153 nodesInSubInOrder
[index
++] = * ( unsigned int * ) darrayGet ( scaf5
, j
);
7158 /*************************************************
7160 transferCnt2RemainNode
7162 In bubble structure, transfers connections of contig which is to
7163 be masked, to remained contig.
7165 1. maskNode: contig to be masked
7166 2. remainNode: remained contig
7171 *************************************************/
7172 static void transferCnt2RemainNode ( unsigned int maskNode
, unsigned int remainNode
)
7174 CONNECT
* cnt
= contig_array
[maskNode
].downwardConnect
;
7175 CONNECT
* bal_cnt
, *nextCnt
, *bal_nextCnt
, *tmpCnt
, *bal_tmpCnt
;
7176 unsigned int nextNode1
, bal_nextNode1
, nextNode2
, bal_nextNode2
;
7177 unsigned int bal_maskNode
= getTwinCtg ( maskNode
), bal_remainNode
= getTwinCtg ( remainNode
);
7178 int gap
, weight
, inherit
;
7188 nextNode1
= cnt
->contigID
;
7189 bal_nextNode1
= getTwinCtg ( nextNode1
);
7190 bal_cnt
= getCntBetween ( bal_nextNode1
, bal_maskNode
);
7192 weight
= cnt
->weight
;
7193 tmpCnt
= getCntBetween ( remainNode
, nextNode1
);
7204 if ( cnt
->nextInScaf
)
7206 nextNode2
= cnt
->nextInScaf
->contigID
;
7207 bal_nextNode2
= getTwinCtg ( nextNode2
);
7208 nextCnt
= getCntBetween ( nextNode1
, nextNode2
);
7209 bal_nextCnt
= getCntBetween ( bal_nextNode2
, bal_nextNode1
);
7211 if ( nextNode1
!= remainNode
&& nextNode2
!= remainNode
)
7213 tmpCnt
= add1Connect ( remainNode
, nextNode1
, gap
, weight
, inherit
);
7214 bal_tmpCnt
= add1Connect ( bal_nextNode1
, bal_remainNode
, gap
, weight
, inherit
);
7215 tmpCnt
->nextInScaf
= nextCnt
;
7217 tmpCnt
->deleted
= 0;
7218 bal_nextCnt
->nextInScaf
= bal_tmpCnt
;
7219 bal_tmpCnt
->prevInScaf
= 1;
7220 bal_tmpCnt
->mask
= 0;
7221 bal_tmpCnt
->deleted
= 0;
7225 nextCnt
->prevInScaf
= 0;
7226 bal_nextCnt
->nextInScaf
= NULL
;
7230 cnt
->nextInScaf
= NULL
;
7231 cnt
->prevInScaf
= 0;
7234 bal_cnt
->nextInScaf
= NULL
;
7235 bal_cnt
->prevInScaf
= 0;
7237 bal_cnt
->deleted
= 1;
7242 /*************************************************
7246 Masks contig's downstream connections in scaffold.
7248 1. node: contig whose downstream connections are going to be masked
7253 *************************************************/
7254 static void maskNodeCnt ( unsigned int node
)
7256 CONNECT
* cnt
= contig_array
[node
].downwardConnect
;
7257 CONNECT
* bal_cnt
, *bal_nextCnt
;
7258 unsigned int bal_nextNode1
, bal_nextNode2
;
7262 bal_nextNode1
= getTwinCtg ( cnt
->contigID
);
7263 bal_cnt
= getCntBetween ( bal_nextNode1
, getTwinCtg ( node
) );
7265 if ( cnt
->nextInScaf
)
7267 cnt
->nextInScaf
->prevInScaf
= 0;
7268 bal_nextNode2
= getTwinCtg ( cnt
->nextInScaf
->contigID
);
7269 bal_nextCnt
= getCntBetween ( bal_nextNode2
, bal_nextNode1
);
7276 bal_nextCnt
->nextInScaf
= NULL
;
7279 cnt
->nextInScaf
= NULL
;
7280 cnt
->prevInScaf
= 0;
7283 bal_cnt
->nextInScaf
= NULL
;
7284 bal_cnt
->prevInScaf
= 0;
7286 bal_cnt
->deleted
= 1;
7291 /*************************************************
7295 Gets contig's first and last kmers' sequences.
7297 1. seq: contig sequence
7298 2. len: contig length
7299 3. rev: indicator of reversed complement
7301 1. firstKmer: first kmer's sequence
7302 2. lastKmer: last kmer's sequence
7305 *************************************************/
7306 static void getEndKmers ( char * seq
, int len
, int rev
, char * firstKmer
, char * lastKmer
)
7312 for ( j
= 0; j
< overlaplen
; ++j
)
7314 firstKmer
[j
] = int2base ( ( int ) getCharInTightString ( seq
, j
) );
7315 lastKmer
[j
] = int2base ( ( int ) getCharInTightString ( seq
, len
- j
- 1 ) );
7320 for ( j
= 0; j
< overlaplen
; ++j
)
7322 firstKmer
[j
] = int2compbase ( ( int ) getCharInTightString ( seq
, len
- j
- 1 ) );
7323 lastKmer
[j
] = int2compbase ( ( int ) getCharInTightString ( seq
, j
) );
7328 /*************************************************
7332 Outputs contig sequence.
7340 *************************************************/
7341 static void output_ctg ( unsigned int ctg
, FILE * fo
)
7343 if ( contig_array
[ctg
].length
< 1 )
7347 unsigned int bal_ctg
= getTwinCtg ( ctg
);
7348 len
= contig_array
[ctg
].length
+ overlaplen
;
7351 if ( contig_array
[ctg
].seq
)
7353 fprintf ( fo
, ">C%d %4.1f\n", ctg
, ( double ) contig_array
[ctg
].cvg
);
7354 outputTightStr ( fo
, contig_array
[ctg
].seq
, 0, len
, len
, 0, &col
);
7356 else if ( contig_array
[bal_ctg
].seq
)
7358 fprintf ( fo
, ">C%d %4.1f\n", bal_ctg
, ( double ) contig_array
[ctg
].cvg
);
7359 outputTightStr ( fo
, contig_array
[bal_ctg
].seq
, 0, len
, len
, 0, &col
);
7362 fprintf ( fo
, "\n" );
7366 /*************************************************
7370 Searchs bubble structure in sub-graph. Removes the contig
7371 with lower coverage and transfers its connection relation to
7372 the remained contig if found structure accords with following
7374 1. Both contigs' coverage smaller than a cutoff.
7375 2. The first and last kmers of two contigs are the same.
7381 Handled bubble structure number.
7382 *************************************************/
7383 static int removeBubbleCtg()
7385 int i
, j
, count
, gap
, SnpCounter
= 0, conflict
= 0;
7386 unsigned int node1
, node2
, bal_node1
, bal_node2
;
7387 int len1
, len2
, addLast
= 0;
7388 char * tightStr1
, *tightStr2
;
7389 char firstKmer1
[overlaplen
+ 1], lastKmer1
[overlaplen
+ 1], firstKmer2
[overlaplen
+ 1], lastKmer2
[overlaplen
+ 1];
7390 CONNECT
* cnt
, *bal_cnt
;
7393 for ( i
= 1; i
< nodeCounter
; ++i
)
7395 node1
= ctg4heapArray
[i
].ctgID
;
7396 node2
= ctg4heapArray
[i
+ 1].ctgID
;
7397 bal_node1
= getTwinCtg ( node1
);
7398 bal_node2
= getTwinCtg ( node2
);
7399 cnt
= getCntBetween ( node1
, node2
);
7400 bal_cnt
= getCntBetween ( node2
, node1
);
7401 gap
= ctg4heapArray
[i
+ 1].dis
- ctg4heapArray
[i
].dis
- ( int ) contig_array
[node2
].length
;
7403 if ( gap
>= 0 || contig_array
[node1
].cvg
>= cvg4SNP
|| contig_array
[node2
].cvg
>= cvg4SNP
|| cnt
|| bal_cnt
)
7405 nodesInSubInOrder
[count
] = node1
;
7406 nodeDistanceInOrder
[count
++] = ctg4heapArray
[i
].dis
;
7410 len1
= contig_array
[node1
].length
+ overlaplen
;
7411 len2
= contig_array
[node2
].length
+ overlaplen
;
7413 if ( contig_array
[node1
].seq
)
7415 getEndKmers ( contig_array
[node1
].seq
, len1
, 0, firstKmer1
, lastKmer1
);
7419 getEndKmers ( contig_array
[bal_node1
].seq
, len1
, 1, firstKmer1
, lastKmer1
);
7422 if ( contig_array
[node2
].seq
)
7424 getEndKmers ( contig_array
[node2
].seq
, len2
, 0, firstKmer2
, lastKmer2
);
7428 getEndKmers ( contig_array
[bal_node2
].seq
, len2
, 1, firstKmer2
, lastKmer2
);
7431 for ( j
= 0; j
< overlaplen
; ++j
)
7433 if ( firstKmer1
[j
] != firstKmer2
[j
] || lastKmer1
[j
] != lastKmer2
[j
] )
7435 nodesInSubInOrder
[count
] = node1
;
7436 nodeDistanceInOrder
[count
++] = ctg4heapArray
[i
].dis
;
7442 if ( 1 == conflict
)
7450 if ( contig_array
[node1
].bubbleInScaff
== 0 || contig_array
[node2
].bubbleInScaff
== 0 )
7452 contig_array
[node1
].bubbleInScaff
= 1;
7453 contig_array
[bal_node1
].bubbleInScaff
= 1;
7454 contig_array
[node2
].bubbleInScaff
= 1;
7455 contig_array
[bal_node2
].bubbleInScaff
= 1;
7456 output_ctg ( node1
, snp_fp
);
7457 output_ctg ( node2
, snp_fp
);
7460 if ( contig_array
[node1
].cvg
> contig_array
[node2
].cvg
|| ( len1
> len2
&& contig_array
[node1
].cvg
== contig_array
[node2
].cvg
) )
7462 if ( i
== nodeCounter
- 1 )
7464 nodesInSubInOrder
[count
] = node1
;
7465 nodeDistanceInOrder
[count
++] = ctg4heapArray
[i
].dis
;
7469 transferCnt2RemainNode ( node2
, node1
);
7470 transferCnt2RemainNode ( bal_node2
, bal_node1
);
7471 contig_array
[node2
].mask
= 1;
7472 contig_array
[bal_node2
].mask
= 1;
7473 ctg4heapArray
[i
+ 1].ctgID
= node1
;
7474 ctg4heapArray
[i
+ 1].dis
= ctg4heapArray
[i
].dis
;
7478 if ( i
== nodeCounter
- 1 )
7480 nodesInSubInOrder
[count
] = node2
;
7481 nodeDistanceInOrder
[count
++] = ctg4heapArray
[i
+ 1].dis
;
7485 transferCnt2RemainNode ( node1
, node2
);
7486 transferCnt2RemainNode ( bal_node1
, bal_node2
);
7487 contig_array
[node1
].mask
= 1;
7488 contig_array
[getTwinCtg ( node1
)].mask
= 1;
7494 nodesInSubInOrder
[count
] = ctg4heapArray
[nodeCounter
].ctgID
;
7495 nodeDistanceInOrder
[count
++] = ctg4heapArray
[nodeCounter
].dis
;
7498 for ( i
= 0; i
< count
; ++i
)
7500 ctg4heapArray
[i
+ 1].ctgID
= nodesInSubInOrder
[i
];
7501 ctg4heapArray
[i
+ 1].dis
= nodeDistanceInOrder
[i
];
7504 nodeCounter
= count
;
7508 /*************************************************
7510 general_linearization
7512 Picks up sub-graph and trys to line involved contigs.
7514 1. strict: indicator of whether using strict cutoff to deal with sub-graph
7519 *************************************************/
7520 static void general_linearization ( boolean strict
)
7526 int conflCounter
= 0, overlapCounter
= 0, eligibleCounter
= 0;
7527 int SNPCtgCounter
= 0;
7528 double overlapTolerance
, conflTolerance
;
7529 canexchange
= 0, exchange_num
= 0, failexchange
= 0;
7530 fprintf ( stderr
, "Start to linearize sub-graph.\n" );
7532 for ( i
= num_ctg
; i
> 0; i
-- )
7534 if ( contig_array
[i
].mask
)
7537 out_num
= validConnect ( i
, NULL
);
7542 flag
= pickUpGeneralSubgraph ( i
, MaxNodeInSub
);
7550 qsort ( &ctg4heapArray
[1], nodeCounter
, sizeof ( CTGinHEAP
), cmp_ctg
);
7552 if ( Insert_size
< 1000 && cvg4SNP
> 0.001 )
7554 SNPCtgCounter
+= removeBubbleCtg();
7557 flag
= checkEligible();
7568 overlapTolerance
= OverlapPercent
;
7569 conflTolerance
= ConflPercent
;
7573 overlapTolerance
= 2 * OverlapPercent
;
7574 conflTolerance
= 2 * ConflPercent
;
7577 flag
= checkOverlapInBetween_general ( overlapTolerance
);
7586 flag
= checkConflictCnt_general ( conflTolerance
);
7595 arrangeNodes_general();
7599 fprintf ( stderr
, " Picked sub-graphs %d\n Connection-conflict %d\n Significant overlapping %d\n Eligible %d\n Bubble structures %d\n", subCounter
, conflCounter
, overlapCounter
, eligibleCounter
, SNPCtgCounter
);
7602 /**** the fowllowing codes for detecting and break down scaffold at weak point **********/
7604 /*************************************************
7608 Counts constructed scaffold number by using short insert size
7609 paired-end reads and sets connections' flags: "bySmall" and
7617 *************************************************/
7618 static void smallScaf()
7620 unsigned int i
, ctg
, bal_ctg
, prevCtg
;
7622 CONNECT
* bindCnt
, *cnt
;
7624 for ( i
= 1; i
<= num_ctg
; i
++ )
7625 { contig_array
[i
].flag
= 0; }
7627 for ( i
= 1; i
<= num_ctg
; i
++ )
7629 if ( contig_array
[i
].flag
|| contig_array
[i
].mask
|| !contig_array
[i
].downwardConnect
)
7632 bindCnt
= getBindCnt ( i
);
7638 contig_array
[i
].flag
= 1;
7639 contig_array
[getTwinCtg ( i
)].flag
= 1;
7640 prevCtg
= getTwinCtg ( i
);
7644 ctg
= bindCnt
->contigID
;
7645 bal_ctg
= getTwinCtg ( ctg
);
7646 bindCnt
->bySmall
= 1;
7647 cnt
= getCntBetween ( bal_ctg
, prevCtg
);
7650 { cnt
->bySmall
= 1; }
7652 contig_array
[ctg
].flag
= 1;
7653 contig_array
[bal_ctg
].flag
= 1;
7655 bindCnt
= bindCnt
->nextInScaf
;
7658 ctg
= getTwinCtg ( i
);
7659 bindCnt
= getBindCnt ( ctg
);
7664 ctg
= bindCnt
->contigID
;
7665 bal_ctg
= getTwinCtg ( ctg
);
7666 bindCnt
->bySmall
= 1;
7667 cnt
= getCntBetween ( bal_ctg
, prevCtg
);
7670 { cnt
->bySmall
= 1; }
7672 contig_array
[ctg
].flag
= 1;
7673 contig_array
[bal_ctg
].flag
= 1;
7675 bindCnt
= bindCnt
->nextInScaf
;
7679 fprintf ( stderr
, "Report from smallScaf: %d scaffolds by smallPE.\n", counter
);
7681 for ( i
= 1; i
<= num_ctg
; i
++ )
7683 if ( !contig_array
[i
].downwardConnect
)
7686 cnt
= contig_array
[i
].downwardConnect
;
7696 /*************************************************
7700 Clears conections' flag "newIns".
7707 *************************************************/
7708 static void clearNewInsFlag()
7713 for ( ; i
<= num_ctg
; i
++ )
7715 cnt
= contig_array
[i
].downwardConnect
;
7721 if ( Insert_size
> 15000 )
7733 /*************************************************
7737 Checks whether a scaffold ID is in array. If not, puts it in to array.
7739 1. scaf: scaffold ID to be checked and put in to array
7740 2. wt: weight of connection to scaffold ID
7741 3. SCAF: array of scaffold IDs
7742 4. WT: array of connections' weight
7743 5. counter: size of scaffold ID array
7745 1. SCAF: updated array of scaffold IDs
7746 2. WT: updated array of connections' weight
7748 1 if operation of putting succeeded.
7749 *************************************************/
7750 static boolean
putItem2Sarray ( unsigned int scaf
, int wt
, DARRAY
* SCAF
, DARRAY
* WT
, int counter
)
7753 unsigned int * scafP
, *wtP
;
7755 for ( i
= 0; i
< counter
; i
++ )
7757 scafP
= ( unsigned int * ) darrayGet ( SCAF
, i
);
7759 if ( ( *scafP
) == scaf
)
7761 wtP
= ( unsigned int * ) darrayGet ( WT
, i
);
7762 *wtP
= ( *wtP
+ wt
);
7767 scafP
= ( unsigned int * ) darrayPut ( SCAF
, counter
);
7768 wtP
= ( unsigned int * ) darrayPut ( WT
, counter
);
7774 /*************************************************
7778 Gets downstream connections to other scaffolds and stores
7779 related information including other scaffolds' ID and connections'
7782 1. scafStack: scaffold stack consisting of contigs
7783 2. SCAF: array to store other scaffolds' IDs
7784 3. WT: array to store connections' weight
7785 4. total_len: length sum of contigs used to search connections
7788 1. SCAF: updated array of scaffold IDs
7789 2. WT: updated array of connections' weight
7791 Number of other scaffolds being connected.
7792 *************************************************/
7793 static int getDSLink2Scaf ( STACK
* scafStack
, DARRAY
* SCAF
, DARRAY
* WT
, int total_len
)
7797 unsigned int ctg
, targetCtg
, bal_targetCtg
, *pt
;
7801 stackRecover ( scafStack
);
7803 while ( ( pt
= ( unsigned int * ) stackPop ( scafStack
) ) != NULL
)
7806 bind_cnt
= getBindCnt ( ctg
);
7807 gap
= bind_cnt
? bind_cnt
->gapLen
: 0;
7808 len
+= contig_array
[ctg
].length
+ gap
;
7810 if ( ( contig_array
[ctg
].mask
&& contig_array
[ctg
].length
< 500 ) || !contig_array
[ctg
].downwardConnect
7811 || total_len
- len
> Insert_size
)
7816 ite_cnt
= contig_array
[ctg
].downwardConnect
;
7820 if ( ite_cnt
->newIns
!= 1 )
7822 ite_cnt
= ite_cnt
->next
;
7826 targetCtg
= ite_cnt
->contigID
;
7827 bal_targetCtg
= getTwinCtg ( targetCtg
);
7829 if ( ( ite_cnt
->mask
&& contig_array
[targetCtg
].length
< 500 ) || ite_cnt
->singleInScaf
7830 || ite_cnt
->nextInScaf
|| ite_cnt
->prevInScaf
|| ite_cnt
->inherit
)
7832 ite_cnt
= ite_cnt
->next
;
7836 if ( contig_array
[ctg
].from_vt
== contig_array
[targetCtg
].from_vt
// on the same scaff
7837 || ( targetCtg
== contig_array
[targetCtg
].from_vt
&& bal_targetCtg
== contig_array
[bal_targetCtg
].from_vt
) ) //targetCtg isn't in any scaffold
7839 ite_cnt
= ite_cnt
->next
;
7843 inc
= putItem2Sarray ( contig_array
[targetCtg
].from_vt
, ite_cnt
->weight
, SCAF
, WT
, counter
);
7848 ite_cnt
= ite_cnt
->next
;
7855 /*************************************************
7859 Starting from a contig, gets downstream contig chain in
7862 1. start: start contig
7864 1. scafStack: stack to store contig chain
7866 Length of contig chain.
7867 *************************************************/
7868 static int getScaffold ( unsigned int start
, STACK
* scafStack
)
7870 int len
= contig_array
[start
].length
;
7871 unsigned int * pt
, ctg
;
7872 emptyStack ( scafStack
);
7873 pt
= ( unsigned int * ) stackPush ( scafStack
);
7875 CONNECT
* bindCnt
= getBindCnt ( start
);
7879 ctg
= bindCnt
->contigID
;
7880 pt
= ( unsigned int * ) stackPush ( scafStack
);
7882 len
+= bindCnt
->gapLen
+ contig_array
[ctg
].length
;
7883 bindCnt
= bindCnt
->nextInScaf
;
7886 stackBackup ( scafStack
);
7890 /*************************************************
7894 Checks whether there is a reliable connection.
7896 1. WT: weight of connections
7897 2. count: connection number
7901 1 if a reliable connection was found.
7902 *************************************************/
7903 static boolean
isLinkReliable ( DARRAY
* WT
, int count
)
7907 for ( i
= 0; i
< count
; i
++ )
7908 if ( * ( int * ) darrayGet ( WT
, i
) >= weakPE
)
7914 /*************************************************
7918 Gets weight of connection to specified scaffold.
7920 1. SCAF: array of scaffold ID
7921 2. WT: array of onnection weight
7922 3. count: array size
7923 4. scaf: specified scaffold
7927 Weight of connection was found.
7929 *************************************************/
7930 static int getWtFromSarray ( DARRAY
* SCAF
, DARRAY
* WT
, int count
, unsigned int scaf
)
7934 for ( i
= 0; i
< count
; i
++ )
7935 if ( * ( unsigned int * ) darrayGet ( SCAF
, i
) == scaf
)
7936 { return * ( int * ) darrayGet ( WT
, i
); }
7941 /*************************************************
7945 Transfers contigs to their reversed complements.
7947 1. scafStack: stack of contigs
7949 1. scafStack: updated stack of contigs
7952 *************************************************/
7953 static void switch2twin ( STACK
* scafStack
)
7956 stackRecover ( scafStack
);
7958 while ( ( pt
= ( unsigned int * ) stackPop ( scafStack
) ) != NULL
)
7959 { *pt
= getTwinCtg ( *pt
); }
7962 /*************************************************
7966 Recovers contigs' connections to other scaffold if they
7967 accords with some criterions.
7969 1. scafStack: stack of contigs
7974 *************************************************/
7975 static void recoverLinks ( STACK
* scafStack
)
7978 unsigned int ctg
, targetCtg
, *pt
;
7981 unsigned int bal_ctg
;
7982 stackRecover ( scafStack
);
7984 while ( ( pt
= ( unsigned int * ) stackPop ( scafStack
) ) != NULL
)
7988 if ( contig_array
[ctg
].mask
|| !contig_array
[ctg
].downwardConnect
)
7993 ite_cnt
= contig_array
[ctg
].downwardConnect
;
7997 if ( ite_cnt
->mask
|| ite_cnt
->singleInScaf
|| ite_cnt
->nextInScaf
|| ite_cnt
->prevInScaf
|| ite_cnt
->inherit
|| ite_cnt
->weight
< weakPE
)
7999 ite_cnt
= ite_cnt
->next
;
8003 targetCtg
= ite_cnt
->contigID
;
8005 if ( contig_array
[ctg
].from_vt
== contig_array
[targetCtg
].from_vt
) // on the same scaff
8007 ite_cnt
= ite_cnt
->next
;
8011 setConnectDelete ( ctg
, targetCtg
, 0, 0 );
8012 ite_cnt
= ite_cnt
->next
;
8015 bal_ctg
= getTwinCtg ( ctg
);
8017 if ( contig_array
[bal_ctg
].mask
|| !contig_array
[bal_ctg
].downwardConnect
)
8022 ite_cnt
= contig_array
[bal_ctg
].downwardConnect
;
8026 if ( ite_cnt
->mask
|| ite_cnt
->singleInScaf
|| ite_cnt
->nextInScaf
|| ite_cnt
->prevInScaf
|| ite_cnt
->inherit
|| ite_cnt
->weight
< weakPE
)
8028 ite_cnt
= ite_cnt
->next
;
8032 targetCtg
= ite_cnt
->contigID
;
8034 if ( contig_array
[bal_ctg
].from_vt
== contig_array
[targetCtg
].from_vt
) // on the same scaff
8036 ite_cnt
= ite_cnt
->next
;
8040 setConnectDelete ( bal_ctg
, targetCtg
, 0, 0 );
8041 ite_cnt
= ite_cnt
->next
;
8047 scaf1 --- --- -- ---
8051 /*************************************************
8055 Checks if both sets of contigs in two stacks have reliable
8056 connection to different scaffolds.
8058 1. scafStack1: stack of contig set one
8059 2. len1: total length of contigs in set one
8060 3. scafStack2: stack of contig set two
8061 4. len2: total length of contigs in set two
8065 0 if both sets of contigs did not have reliable connection
8066 to different scaffolds.
8067 *************************************************/
8068 static boolean
checkScafConsist ( STACK
* scafStack1
, int len1
, STACK
* scafStack2
, int len2
)
8070 DARRAY
* downwardTo1
= ( DARRAY
* ) createDarray ( 1000, sizeof ( unsigned int ) ); // scaf links to those scaffolds
8071 DARRAY
* downwardTo2
= ( DARRAY
* ) createDarray ( 1000, sizeof ( unsigned int ) );
8072 DARRAY
* downwardWt1
= ( DARRAY
* ) createDarray ( 1000, sizeof ( unsigned int ) ); // scaf links to scaffolds with those wt
8073 DARRAY
* downwardWt2
= ( DARRAY
* ) createDarray ( 1000, sizeof ( unsigned int ) );
8074 int linkCount1
= getDSLink2Scaf ( scafStack1
, downwardTo1
, downwardWt1
, len1
);
8075 int linkCount2
= getDSLink2Scaf ( scafStack2
, downwardTo2
, downwardWt2
, len2
);
8079 freeDarray ( downwardTo1
);
8080 freeDarray ( downwardTo2
);
8081 freeDarray ( downwardWt1
);
8082 freeDarray ( downwardWt2
);
8086 boolean flag1
= isLinkReliable ( downwardWt1
, linkCount1
);
8090 freeDarray ( downwardTo1
);
8091 freeDarray ( downwardTo2
);
8092 freeDarray ( downwardWt1
);
8093 freeDarray ( downwardWt2
);
8098 int i
, wt1
, wt2
, ret
= 1;
8100 for ( i
= 0; i
< linkCount1
; i
++ )
8102 wt1
= * ( int * ) darrayGet ( downwardWt1
, i
);
8107 scaf
= * ( unsigned int * ) darrayGet ( downwardTo1
, i
);
8108 wt2
= getWtFromSarray ( downwardTo2
, downwardWt2
, linkCount2
, scaf
);
8119 if ( linkCount1
&& flag1
)
8121 recoverLinks ( scafStack1
);
8126 recoverLinks ( scafStack2
);
8130 freeDarray ( downwardTo1
);
8131 freeDarray ( downwardTo2
);
8132 freeDarray ( downwardWt1
);
8133 freeDarray ( downwardWt2
);
8137 /*************************************************
8141 Sets break points of scaffold at weak connections.
8143 1. ctgArray: array of contig in scaffold
8144 2. count: total contig number
8145 3. weakest: contig whose downstream connection was
8146 estimated break point
8148 1. start: the first contig with weak downstream connection
8149 2. finish: the last contig with downstream connection
8152 *************************************************/
8153 static void setBreakPoints ( DARRAY
* ctgArray
, int count
, int weakest
,
8154 int * start
, int * finish
)
8156 int index
= weakest
- 1;
8157 unsigned int thisCtg
;
8158 unsigned int nextCtg
= * ( unsigned int * ) darrayGet ( ctgArray
, weakest
);
8162 while ( index
>= 0 )
8164 thisCtg
= * ( unsigned int * ) darrayGet ( ctgArray
, index
);
8165 cnt
= getCntBetween ( thisCtg
, nextCtg
);
8167 if ( cnt
->maxGap
> 2 )
8176 unsigned int prevCtg
= * ( unsigned int * ) darrayGet ( ctgArray
, weakest
+ 1 );
8177 *finish
= weakest
+ 1;
8178 index
= weakest
+ 2;
8180 while ( index
< count
)
8182 thisCtg
= * ( unsigned int * ) darrayGet ( ctgArray
, index
);
8183 cnt
= getCntBetween ( prevCtg
, thisCtg
);
8185 if ( cnt
->maxGap
> 2 )
8188 { *finish
= index
; }
8195 /*************************************************
8199 Changes "to_vt" of contigs in a scaffold to a specified contig.
8201 1. scafStack: stack of contigs in scaffold
8202 2. end: specified contig
8207 *************************************************/
8208 static void changeScafEnd ( STACK
* scafStack
, unsigned int end
)
8210 unsigned int ctg
, *pt
;
8211 unsigned int start
= getTwinCtg ( end
);
8212 stackRecover ( scafStack
);
8214 while ( ( pt
= ( unsigned int * ) stackPop ( scafStack
) ) != NULL
)
8217 contig_array
[ctg
].to_vt
= end
;
8218 contig_array
[getTwinCtg ( ctg
)].from_vt
= start
;
8222 /*************************************************
8226 Changes "from_vt" of contigs in a scaffold to a specified
8229 1. scafStack: stack of contigs in scaffold
8230 2. start: specified contig
8235 *************************************************/
8236 static void changeScafBegin ( STACK
* scafStack
, unsigned int start
)
8238 unsigned int ctg
, *pt
;
8239 unsigned int end
= getTwinCtg ( start
);
8240 stackRecover ( scafStack
);
8242 while ( ( pt
= ( unsigned int * ) stackPop ( scafStack
) ) != NULL
)
8245 contig_array
[ctg
].from_vt
= start
;
8246 contig_array
[getTwinCtg ( ctg
)].to_vt
= end
;
8250 /*************************************************
8254 Detects and breaks the weak connection between contigs
8255 in scaffold according to the support of large insert size
8263 *************************************************/
8264 static void detectBreakScaf()
8266 unsigned int i
, avgPE
, scafLen
, len
, ctg
, bal_ctg
, prevCtg
, thisCtg
;
8267 long long peCounter
, linkCounter
;
8268 int num3
, num5
, weakPoint
, tempCounter
, j
, t
, counter
= 0;
8269 CONNECT
* bindCnt
, *cnt
, *weakCnt
;
8270 STACK
* scafStack1
= ( STACK
* ) createStack ( 1000, sizeof ( unsigned int ) );
8271 STACK
* scafStack2
= ( STACK
* ) createStack ( 1000, sizeof ( unsigned int ) );
8273 for ( i
= 1; i
<= num_ctg
; i
++ )
8274 { contig_array
[i
].flag
= 0; }
8276 for ( i
= 1; i
<= num_ctg
; i
++ )
8278 if ( contig_array
[i
].flag
|| contig_array
[i
].mask
|| !contig_array
[i
].downwardConnect
)
8281 bindCnt
= getBindCnt ( i
);
8286 //first scan to get the average coverage by longer pe
8287 num5
= num3
= peCounter
= linkCounter
= 0;
8288 scafLen
= contig_array
[i
].length
;
8290 * ( unsigned int * ) darrayPut ( scaf5
, num5
++ ) = i
;
8291 contig_array
[i
].flag
= 1;
8292 contig_array
[getTwinCtg ( i
)].flag
= 1;
8296 if ( !bindCnt
->bySmall
)
8300 peCounter
+= bindCnt
->maxGap
;
8301 ctg
= bindCnt
->contigID
;
8302 scafLen
+= contig_array
[ctg
].length
;
8303 * ( unsigned int * ) darrayPut ( scaf5
, num5
++ ) = ctg
;
8304 bal_ctg
= getTwinCtg ( ctg
);
8305 contig_array
[ctg
].flag
= 1;
8306 contig_array
[bal_ctg
].flag
= 1;
8307 bindCnt
= bindCnt
->nextInScaf
;
8310 ctg
= getTwinCtg ( i
);
8311 bindCnt
= getBindCnt ( ctg
);
8315 if ( !bindCnt
->bySmall
)
8319 peCounter
+= bindCnt
->maxGap
;
8320 ctg
= bindCnt
->contigID
;
8321 scafLen
+= contig_array
[ctg
].length
;
8322 bal_ctg
= getTwinCtg ( ctg
);
8323 contig_array
[ctg
].flag
= 1;
8324 contig_array
[bal_ctg
].flag
= 1;
8325 * ( unsigned int * ) darrayPut ( scaf3
, num3
++ ) = bal_ctg
;
8326 bindCnt
= bindCnt
->nextInScaf
;
8329 if ( linkCounter
< 1 || scafLen
< 5000 )
8332 avgPE
= peCounter
/ linkCounter
;
8339 for ( j
= num3
- 1; j
>= 0; j
-- )
8340 * ( unsigned int * ) darrayPut ( tempArray
, tempCounter
++ ) =
8341 * ( unsigned int * ) darrayGet ( scaf3
, j
);
8343 for ( j
= 0; j
< num5
; j
++ )
8344 * ( unsigned int * ) darrayPut ( tempArray
, tempCounter
++ ) =
8345 * ( unsigned int * ) darrayGet ( scaf5
, j
);
8347 prevCtg
= * ( unsigned int * ) darrayGet ( tempArray
, 0 );
8350 len
= contig_array
[prevCtg
].length
;
8352 for ( t
= 1; t
< tempCounter
; t
++ )
8354 thisCtg
= * ( unsigned int * ) darrayGet ( tempArray
, t
);
8358 len
+= contig_array
[thisCtg
].length
;
8362 else if ( len
> scafLen
- 2000 )
8365 len
+= contig_array
[thisCtg
].length
;
8367 if ( contig_array
[prevCtg
].from_vt
!= contig_array
[thisCtg
].from_vt
||
8368 contig_array
[prevCtg
].indexInScaf
> contig_array
[thisCtg
].indexInScaf
)
8374 cnt
= getCntBetween ( prevCtg
, thisCtg
);
8376 if ( !weakCnt
|| weakCnt
->maxGap
> cnt
->maxGap
)
8385 if ( !weakCnt
|| ( weakCnt
->maxGap
> 2 && weakCnt
->maxGap
> avgPE
/ 5 ) )
8388 prevCtg
= * ( unsigned int * ) darrayGet ( tempArray
, weakPoint
- 1 );
8389 thisCtg
= * ( unsigned int * ) darrayGet ( tempArray
, weakPoint
);
8391 if ( contig_array
[prevCtg
].from_vt
!= contig_array
[thisCtg
].from_vt
||
8392 contig_array
[prevCtg
].indexInScaf
> contig_array
[thisCtg
].indexInScaf
)
8394 fprintf ( stderr
, "contig %d and %d not on the same scaff\n", prevCtg
, thisCtg
);
8398 setConnectWP ( prevCtg
, thisCtg
, 1 );
8400 setBreakPoints ( tempArray
, tempCounter
, weakPoint
- 1, &index1
, &index2
);
8401 unsigned int start
= * ( unsigned int * ) darrayGet ( tempArray
, index1
);
8402 unsigned int finish
= * ( unsigned int * ) darrayGet ( tempArray
, index2
);
8403 int len1
= getScaffold ( getTwinCtg ( start
), scafStack1
);
8404 int len2
= getScaffold ( finish
, scafStack2
);
8406 if ( len1
< 2000 || len2
< 2000 )
8409 switch2twin ( scafStack1
);
8410 int flag1
= checkScafConsist ( scafStack1
, len1
, scafStack2
, len2
);
8411 switch2twin ( scafStack1
);
8412 switch2twin ( scafStack2
);
8413 int flag2
= checkScafConsist ( scafStack2
, len2
, scafStack1
, len1
);
8415 if ( !flag1
|| !flag2
)
8417 changeScafBegin ( scafStack1
, getTwinCtg ( start
) );
8418 changeScafEnd ( scafStack2
, getTwinCtg ( finish
) );
8420 unsigned int nextCtg
= * ( unsigned int * ) darrayGet ( tempArray
, index1
+ 1 );
8421 thisCtg
= * ( unsigned int * ) darrayGet ( tempArray
, index1
);
8422 cnt
= getCntBetween ( getTwinCtg ( nextCtg
), getTwinCtg ( thisCtg
) );
8424 if ( cnt
->nextInScaf
)
8426 prevCtg
= getTwinCtg ( cnt
->nextInScaf
->contigID
);
8427 cnt
->nextInScaf
->prevInScaf
= 0;
8428 cnt
= getCntBetween ( prevCtg
, thisCtg
);
8429 cnt
->nextInScaf
= NULL
;
8432 prevCtg
= * ( unsigned int * ) darrayGet ( tempArray
, index2
- 1 );
8433 thisCtg
= * ( unsigned int * ) darrayGet ( tempArray
, index2
);
8434 cnt
= getCntBetween ( prevCtg
, thisCtg
);
8436 if ( cnt
->nextInScaf
)
8438 nextCtg
= cnt
->nextInScaf
->contigID
;
8439 cnt
->nextInScaf
->prevInScaf
= 0;
8440 cnt
= getCntBetween ( getTwinCtg ( nextCtg
), getTwinCtg ( thisCtg
) );
8441 cnt
->nextInScaf
= NULL
;
8444 prevCtg
= * ( unsigned int * ) darrayGet ( tempArray
, index1
);
8446 for ( t
= index1
+ 1; t
<= index2
; t
++ )
8448 thisCtg
= * ( unsigned int * ) darrayGet ( tempArray
, t
);
8449 cnt
= getCntBetween ( prevCtg
, thisCtg
);
8451 cnt
->nextInScaf
= NULL
;
8452 cnt
->prevInScaf
= 0;
8453 cnt
= getCntBetween ( getTwinCtg ( thisCtg
), getTwinCtg ( prevCtg
) );
8455 cnt
->nextInScaf
= NULL
;
8456 cnt
->prevInScaf
= 0;
8464 freeStack ( scafStack1
);
8465 freeStack ( scafStack2
);
8466 fprintf ( stderr
, "Report from checkScaf: %d scaffold segments broken.\n", counter
);
8470 /*************************************************
8474 Detects and breaks the weak connection between contigs
8475 in scaffold according to the support of large insert size
8483 *************************************************/
8484 static void detectBreakScaff()
8486 unsigned int i
, avgPE
, scafLen
, len
, ctg
, bal_ctg
, prevCtg
, thisCtg
;
8487 long long peCounter
, linkCounter
;
8488 int num3
, num5
, weakPoint
, tempCounter
, j
, t
, counter
= 0;
8490 CONNECT
* bindCnt
, *cnt
, *weakCnt
;
8492 STACK
* scafStack1
= ( STACK
* ) createStack ( 1000, sizeof ( unsigned int ) );
8493 STACK
* scafStack2
= ( STACK
* ) createStack ( 1000, sizeof ( unsigned int ) );
8495 for ( i
= 1; i
<= num_ctg
; i
++ )
8496 { contig_array
[i
].flag
= 0; }
8498 for ( i
= 1; i
<= num_ctg
; i
++ )
8500 if ( contig_array
[i
].flag
|| contig_array
[i
].mask
|| !contig_array
[i
].downwardConnect
)
8503 bindCnt
= getBindCnt ( i
);
8508 //first scan to get the average coverage by longer pe
8509 num5
= num3
= peCounter
= linkCounter
= 0;
8510 scafLen
= contig_array
[i
].length
;
8512 * ( unsigned int * ) darrayPut ( scaf5
, num5
++ ) = i
;
8513 contig_array
[i
].flag
= 1;
8514 contig_array
[getTwinCtg ( i
)].flag
= 1;
8519 peCounter
+= bindCnt
->maxGap
;
8520 ctg
= bindCnt
->contigID
;
8521 scafLen
+= bindCnt
->gapLen
+ contig_array
[ctg
].length
;
8522 * ( unsigned int * ) darrayPut ( scaf5
, num5
++ ) = ctg
;
8523 bal_ctg
= getTwinCtg ( ctg
);
8524 contig_array
[ctg
].flag
= 1;
8525 contig_array
[bal_ctg
].flag
= 1;
8526 bindCnt
= bindCnt
->nextInScaf
;
8529 ctg
= getTwinCtg ( i
);
8530 bindCnt
= getBindCnt ( ctg
);
8535 peCounter
+= bindCnt
->maxGap
;
8536 ctg
= bindCnt
->contigID
;
8537 scafLen
+= bindCnt
->gapLen
+ contig_array
[ctg
].length
;
8538 bal_ctg
= getTwinCtg ( ctg
);
8539 contig_array
[ctg
].flag
= 1;
8540 contig_array
[bal_ctg
].flag
= 1;
8541 * ( unsigned int * ) darrayPut ( scaf3
, num3
++ ) = bal_ctg
;
8542 bindCnt
= bindCnt
->nextInScaf
;
8545 if ( scafLen
< Insert_size
)
8548 avgPE
= peCounter
/ linkCounter
;
8555 for ( j
= num3
- 1; j
>= 0; j
-- )
8556 * ( unsigned int * ) darrayPut ( tempArray
, tempCounter
++ ) =
8557 * ( unsigned int * ) darrayGet ( scaf3
, j
);
8559 for ( j
= 0; j
< num5
; j
++ )
8560 * ( unsigned int * ) darrayPut ( tempArray
, tempCounter
++ ) =
8561 * ( unsigned int * ) darrayGet ( scaf5
, j
);
8563 prevCtg
= * ( unsigned int * ) darrayGet ( tempArray
, 0 );
8566 len
= contig_array
[prevCtg
].length
;
8568 for ( t
= 1; t
< tempCounter
; t
++ )
8571 thisCtg
= * ( unsigned int * ) darrayGet ( tempArray
, t
);
8572 cnt
= contig_array
[thisCtg
].downwardConnect
;
8576 if ( cnt
->newIns
== 1 )
8578 ctg
= cnt
->contigID
;
8579 bal_ctg
= getTwinCtg ( ctg
);
8586 bal_cnt
= contig_array
[getTwinCtg ( thisCtg
)].downwardConnect
;
8587 cnt
= getCntBetween ( prevCtg
, thisCtg
);
8589 if ( len
< Insert_size
)
8591 len
+= cnt
->gapLen
+ contig_array
[thisCtg
].length
;
8595 else if ( len
> scafLen
- Insert_size
)
8598 len
+= cnt
->gapLen
+ contig_array
[thisCtg
].length
;
8600 if ( contig_array
[prevCtg
].from_vt
!= contig_array
[thisCtg
].from_vt
||
8601 contig_array
[prevCtg
].indexInScaf
> contig_array
[thisCtg
].indexInScaf
)
8607 if ( !weakCnt
|| weakCnt
->maxGap
> cnt
->maxGap
)
8616 if ( !weakCnt
|| ( weakCnt
->maxGap
> 2 && weakCnt
->maxGap
> avgPE
/ 5 ) )
8619 prevCtg
= * ( unsigned int * ) darrayGet ( tempArray
, weakPoint
- 1 );
8620 thisCtg
= * ( unsigned int * ) darrayGet ( tempArray
, weakPoint
);
8622 if ( contig_array
[prevCtg
].from_vt
!= contig_array
[thisCtg
].from_vt
||
8623 contig_array
[prevCtg
].indexInScaf
> contig_array
[thisCtg
].indexInScaf
)
8625 printf ( "contig %d and %d not on the same scaff\n", prevCtg
, thisCtg
);
8629 setConnectWP ( prevCtg
, thisCtg
, 1 );
8630 // set start and end to break down the scaffold
8632 setBreakPoints ( tempArray
, tempCounter
, weakPoint
- 1, &index1
, &index2
);
8633 unsigned int start
= * ( unsigned int * ) darrayGet ( tempArray
, index1
);
8634 unsigned int finish
= * ( unsigned int * ) darrayGet ( tempArray
, index2
);
8635 int len1
= getScaffold ( getTwinCtg ( start
), scafStack1
);
8636 int len2
= getScaffold ( finish
, scafStack2
);
8638 if ( len1
< Insert_size
|| len2
< Insert_size
)
8641 switch2twin ( scafStack1
);
8642 int flag1
= checkScafConsist ( scafStack1
, len1
, scafStack2
, len2
);
8643 switch2twin ( scafStack1
);
8644 switch2twin ( scafStack2
);
8645 int flag2
= checkScafConsist ( scafStack2
, len2
, scafStack1
, len1
);
8647 if ( !flag1
|| !flag2
)
8649 changeScafBegin ( scafStack1
, getTwinCtg ( start
) );
8650 changeScafEnd ( scafStack2
, getTwinCtg ( finish
) );
8652 unsigned int nextCtg
= * ( unsigned int * ) darrayGet ( tempArray
, index1
+ 1 );
8653 thisCtg
= * ( unsigned int * ) darrayGet ( tempArray
, index1
);
8654 cnt
= getCntBetween ( getTwinCtg ( nextCtg
), getTwinCtg ( thisCtg
) );
8656 if ( cnt
->nextInScaf
)
8658 prevCtg
= getTwinCtg ( cnt
->nextInScaf
->contigID
);
8659 cnt
->nextInScaf
->prevInScaf
= 0;
8660 cnt
= getCntBetween ( prevCtg
, thisCtg
);
8661 cnt
->nextInScaf
= NULL
;
8664 prevCtg
= * ( unsigned int * ) darrayGet ( tempArray
, index2
- 1 );
8665 thisCtg
= * ( unsigned int * ) darrayGet ( tempArray
, index2
);
8666 cnt
= getCntBetween ( prevCtg
, thisCtg
);
8668 if ( cnt
->nextInScaf
)
8670 nextCtg
= cnt
->nextInScaf
->contigID
;
8671 cnt
->nextInScaf
->prevInScaf
= 0;
8672 cnt
= getCntBetween ( getTwinCtg ( nextCtg
), getTwinCtg ( thisCtg
) );
8673 cnt
->nextInScaf
= NULL
;
8676 prevCtg
= * ( unsigned int * ) darrayGet ( tempArray
, index1
);
8678 for ( t
= index1
+ 1; t
<= index2
; t
++ )
8680 thisCtg
= * ( unsigned int * ) darrayGet ( tempArray
, t
);
8681 cnt
= getCntBetween ( prevCtg
, thisCtg
);
8683 cnt
->nextInScaf
= NULL
;
8684 cnt
->prevInScaf
= 0;
8685 cnt
= getCntBetween ( getTwinCtg ( thisCtg
), getTwinCtg ( prevCtg
) );
8687 cnt
->nextInScaf
= NULL
;
8688 cnt
->prevInScaf
= 0;
8696 freeStack ( scafStack1
);
8697 freeStack ( scafStack2
);
8698 fprintf ( stderr
, "Report from checkScaf: %d scaffold segments broken.\n", counter
);
8701 /*************************************************
8705 Checks whether there is a contig appearing more than once
8708 1. ctgArray: array of contigs
8709 2. count: contig number
8713 1 if no contig appeared more than once.
8714 *************************************************/
8715 static boolean
checkSimple ( DARRAY
* ctgArray
, int count
)
8720 for ( i
= 0; i
< count
; i
++ )
8722 ctg
= * ( unsigned int * ) darrayGet ( ctgArray
, i
);
8723 contig_array
[ctg
].flag
= 0;
8724 contig_array
[getTwinCtg ( ctg
)].flag
= 0;
8727 for ( i
= 0; i
< count
; i
++ )
8729 ctg
= * ( unsigned int * ) darrayGet ( ctgArray
, i
);
8731 if ( contig_array
[ctg
].flag
)
8734 contig_array
[ctg
].flag
= 1;
8735 contig_array
[getTwinCtg ( ctg
)].flag
= 1;
8741 /*************************************************
8745 Detects and masks contigs having circle connections.
8753 *************************************************/
8754 static void checkCircle()
8756 unsigned int i
, ctg
;
8760 for ( i
= 1; i
<= num_ctg
; i
++ )
8762 cn_temp1
= contig_array
[i
].downwardConnect
;
8766 if ( cn_temp1
->weak
|| cn_temp1
->deleted
)
8768 cn_temp1
= cn_temp1
->next
;
8772 ctg
= cn_temp1
->contigID
;
8774 if ( checkConnect ( ctg
, i
) )
8777 maskContig ( i
, 1 );
8778 maskContig ( ctg
, 1 );
8781 cn_temp1
= cn_temp1
->next
;
8785 fprintf ( stderr
, "%d circles removed.\n", counter
);