4 * Copyright (c) 2008-2012 BGI-Shenzhen <soap at genomics dot org dot cn>.
6 * This file is part of SOAPdenovo.
8 * SOAPdenovo is free software: you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation, either version 3 of the License, or
11 * (at your option) any later version.
13 * SOAPdenovo is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
18 * You should have received a copy of the GNU General Public License
19 * along with SOAPdenovo. If not, see <http://www.gnu.org/licenses/>.
29 #define CTGendLen 35 // shouldn't larger than max_read_len
31 #define MaxRouteNum 10
33 static void kmerSet_mark ( KmerSet
* set
);
34 static void trace4Repeat ( Kmer currW
, int steps
, int min
, int max
, int * num_route
, KmerSet
* kset
,
35 Kmer kmerDest
, int overlap
, Kmer WORDF
, int * traceCounter
, int maxRoute
,
36 kmer_t
** soFarNode
, short * multiOccu1
, short * multiOccu2
, int * routeLens
,
37 char ** foundRoutes
, char * soFarSeq
, long long * soFarLinks
, double * avgLinks
);
40 static Kmer
prevKmerLocal ( Kmer next
, char ch
, int overlap
)
42 Kmer word
= KmerRightBitMoveBy2 ( next
);
44 if ( 2 * ( overlap
- 1 ) < 64 )
46 word
.low2
|= ( ( ( ubyte8
) ch
) << 2 * ( overlap
- 1 ) );
49 if ( 2 * ( overlap
- 1 ) >= 64 && 2 * ( overlap
- 1 ) < 128 )
51 word
.high2
|= ( ( ubyte8
) ch
) << ( 2 * ( overlap
- 1 ) - 64 );
54 if ( 2 * ( overlap
- 1 ) >= 128 && 2 * ( overlap
- 1 ) < 192 )
56 word
.low1
|= ( ( ubyte8
) ch
) << ( 2 * ( overlap
- 1 ) - 128 );
59 if ( 2 * ( overlap
- 1 ) >= 192 && 2 * ( overlap
- 1 ) < 256 )
61 word
.high1
|= ( ( ubyte8
) ch
) << ( 2 * ( overlap
- 1 ) - 192 );
67 static Kmer
nextKmerLocal ( Kmer prev
, char ch
, Kmer WordFilter
)
69 Kmer word
= KmerLeftBitMoveBy2 ( prev
);
70 word
= KmerAnd ( word
, WordFilter
);
75 static Kmer
prevKmerLocal ( Kmer next
, char ch
, int overlap
)
77 Kmer word
= KmerRightBitMoveBy2 ( next
);
79 if ( 2 * ( overlap
- 1 ) < 64 )
81 word
.low
|= ( ( ( ubyte8
) ch
) << 2 * ( overlap
- 1 ) );
85 word
.high
|= ( ( ubyte8
) ch
) << ( 2 * ( overlap
- 1 ) - 64 );
91 static Kmer
nextKmerLocal ( Kmer prev
, char ch
, Kmer WordFilter
)
93 Kmer word
= KmerLeftBitMoveBy2 ( prev
);
94 word
= KmerAnd ( word
, WordFilter
);
99 static void singleKmer ( int t
, KmerSet
* kset
, int flag
, Kmer
* kmerBuffer
, char * prevcBuffer
, char * nextcBuffer
)
102 put_kmerset ( kset
, kmerBuffer
[t
], prevcBuffer
[t
], nextcBuffer
[t
], &pos
);
104 if ( pos
->inEdge
== flag
)
108 else if ( pos
->inEdge
== 0 )
112 else if ( pos
->inEdge
== 1 && flag
== 2 )
116 else if ( pos
->inEdge
== 2 && flag
== 1 )
122 static void putKmer2DBgraph ( KmerSet
* kset
, int flag
, int kmer_c
, Kmer
* kmerBuffer
, char * prevcBuffer
, char * nextcBuffer
)
126 for ( t
= 0; t
< kmer_c
; t
++ )
128 singleKmer ( t
, kset
, flag
, kmerBuffer
, prevcBuffer
, nextcBuffer
);
132 static void getSeqFromRead ( READNEARBY read
, char * src_seq
)
134 int len_seq
= read
.len
;
136 char * tightSeq
= ( char * ) darrayGet ( readSeqInGap
, read
.seqStarter
);
138 for ( j
= 0; j
< len_seq
; j
++ )
140 src_seq
[j
] = getCharInTightString ( tightSeq
, j
);
145 static void chopKmer4Ctg ( Kmer
* kmerCtg
, int lenCtg
, int overlap
, char * src_seq
, Kmer WORDF
)
149 word
.high1
= word
.low1
= word
.high2
= word
.low2
= 0;
151 for ( index
= 0; index
< overlap
; index
++ )
153 word
= KmerLeftBitMoveBy2 ( word
);
154 word
.low2
|= src_seq
[index
];
158 kmerCtg
[index
++] = word
;
160 for ( j
= 1; j
<= lenCtg
- overlap
; j
++ )
162 word
= nextKmerLocal ( word
, src_seq
[j
- 1 + overlap
], WORDF
);
163 kmerCtg
[index
++] = word
;
167 static void chopKmer4Ctg ( Kmer
* kmerCtg
, int lenCtg
, int overlap
, char * src_seq
, Kmer WORDF
)
171 word
.high
= word
.low
= 0;
173 for ( index
= 0; index
< overlap
; index
++ )
175 word
= KmerLeftBitMoveBy2 ( word
);
176 word
.low
|= src_seq
[index
];
180 kmerCtg
[index
++] = word
;
182 for ( j
= 1; j
<= lenCtg
- overlap
; j
++ )
184 word
= nextKmerLocal ( word
, src_seq
[j
- 1 + overlap
], WORDF
);
185 kmerCtg
[index
++] = word
;
190 static void chopKmer4read ( int len_seq
, int overlap
, char * src_seq
, char * bal_seq
, Kmer
* kmerBuffer
, char * prevcBuffer
, char * nextcBuffer
, int * kmer_c
, Kmer WORDF
)
197 if ( len_seq
< overlap
+ 1 )
204 word
.high1
= word
.low1
= word
.high2
= word
.low2
= 0;
206 for ( index
= 0; index
< overlap
; index
++ )
208 word
= KmerLeftBitMoveBy2 ( word
);
209 word
.low2
|= src_seq
[index
];
213 word
.high
= word
.low
= 0;
215 for ( index
= 0; index
< overlap
; index
++ )
217 word
= KmerLeftBitMoveBy2 ( word
);
218 word
.low
|= src_seq
[index
];
222 reverseComplementSeq ( src_seq
, len_seq
, bal_seq
);
223 // complementary node
224 bal_word
= reverseComplement ( word
, overlap
);
225 bal_j
= len_seq
- 0 - overlap
; // 0;
228 if ( KmerSmaller ( word
, bal_word
) )
230 kmerBuffer
[index
] = word
;
231 prevcBuffer
[index
] = InvalidCh
;
232 nextcBuffer
[index
++] = src_seq
[0 + overlap
];
236 kmerBuffer
[index
] = bal_word
;
237 prevcBuffer
[index
] = bal_seq
[bal_j
- 1];
238 nextcBuffer
[index
++] = InvalidCh
;
241 for ( j
= 1; j
<= len_seq
- overlap
; j
++ )
243 word
= nextKmerLocal ( word
, src_seq
[j
- 1 + overlap
], WORDF
);
244 bal_j
= len_seq
- j
- overlap
; // j;
245 bal_word
= prevKmerLocal ( bal_word
, bal_seq
[bal_j
], overlap
);
247 if ( KmerSmaller ( word
, bal_word
) )
249 kmerBuffer
[index
] = word
;
250 prevcBuffer
[index
] = src_seq
[j
- 1];
252 if ( j
< len_seq
- overlap
)
254 nextcBuffer
[index
++] = src_seq
[j
+ overlap
];
258 nextcBuffer
[index
++] = InvalidCh
;
261 //printf("%dth: %p with %p\n",kmer_c-1,word,hashBanBuffer[kmer_c-1]);
265 // complementary node
266 kmerBuffer
[index
] = bal_word
;
270 prevcBuffer
[index
] = bal_seq
[bal_j
- 1];
274 prevcBuffer
[index
] = InvalidCh
;
277 nextcBuffer
[index
++] = bal_seq
[bal_j
+ overlap
];
278 //printf("%dth: %p with %p\n",kmer_c-1,bal_word,hashBanBuffer[kmer_c-1]);
285 static void headTightStr ( char * tightStr
, int length
, int start
, int headLen
, int revS
, char * src_seq
)
291 for ( i
= start
; i
< start
+ headLen
; i
++ )
293 src_seq
[index
++] = getCharInTightString ( tightStr
, i
);
298 for ( i
= length
- 1 - start
; i
>= length
- headLen
- start
; i
-- )
300 src_seq
[index
++] = int_comp ( getCharInTightString ( tightStr
, i
) );
305 static int getSeqFromCtg ( CTGinSCAF
* ctg
, boolean fromHead
, unsigned int len
, int originOverlap
, char * src_seq
)
307 unsigned int ctgId
= ctg
->ctgID
;
308 unsigned int bal_ctg
= getTwinCtg ( ctgId
);
310 if ( contig_array
[ctgId
].length
< 1 )
315 unsigned int length
= contig_array
[ctgId
].length
+ originOverlap
;
316 len
= len
< length
? len
: length
;
320 if ( contig_array
[ctgId
].seq
)
322 headTightStr ( contig_array
[ctgId
].seq
, length
, 0, len
, 0, src_seq
);
326 headTightStr ( contig_array
[bal_ctg
].seq
, length
, 0, len
, 1, src_seq
);
331 if ( contig_array
[ctgId
].seq
)
333 headTightStr ( contig_array
[ctgId
].seq
, length
, length
- len
, len
, 0, src_seq
);
337 headTightStr ( contig_array
[bal_ctg
].seq
, length
, length
- len
, len
, 1, src_seq
);
344 static KmerSet
* readsInGap2DBgraph ( READNEARBY
* rdArray
, int num
, CTGinSCAF
* ctg1
, CTGinSCAF
* ctg2
, int originOverlap
, Kmer
* kmerCtg1
, Kmer
* kmerCtg2
, int overlap
, Kmer WordFilter
)
348 char * nextcBuffer
, *prevcBuffer
;
350 int buffer_size
= maxReadLen
> CTGendLen
? maxReadLen
: CTGendLen
;
351 KmerSet
* kmerS
= NULL
;
356 src_seq
= ( char * ) ckalloc ( buffer_size
* sizeof ( char ) );
357 bal_seq
= ( char * ) ckalloc ( buffer_size
* sizeof ( char ) );
358 lenCtg1
= getSeqFromCtg ( ctg1
, 0, CTGendLen
, originOverlap
, src_seq
);
359 lenCtg2
= getSeqFromCtg ( ctg2
, 1, CTGendLen
, originOverlap
, src_seq
);
361 if ( lenCtg1
<= overlap
|| lenCtg2
<= overlap
)
363 free ( ( void * ) src_seq
);
364 free ( ( void * ) bal_seq
);
368 kmerBuffer
= ( Kmer
* ) ckalloc ( buffer_size
* sizeof ( Kmer
) );
369 prevcBuffer
= ( char * ) ckalloc ( buffer_size
* sizeof ( char ) );
370 nextcBuffer
= ( char * ) ckalloc ( buffer_size
* sizeof ( char ) );
371 kmerS
= init_kmerset ( 1024, 0.77f
);
373 for ( i
= 0; i
< num
; i
++ )
375 getSeqFromRead ( rdArray
[i
], src_seq
);
376 chopKmer4read ( rdArray
[i
].len
, overlap
, src_seq
, bal_seq
, kmerBuffer
, prevcBuffer
, nextcBuffer
, &kmer_c
, WordFilter
);
377 putKmer2DBgraph ( kmerS
, 0, kmer_c
, kmerBuffer
, prevcBuffer
, nextcBuffer
);
380 lenCtg1
= getSeqFromCtg ( ctg1
, 0, CTGendLen
, originOverlap
, src_seq
);
381 chopKmer4Ctg ( kmerCtg1
, lenCtg1
, overlap
, src_seq
, WordFilter
);
382 chopKmer4read ( lenCtg1
, overlap
, src_seq
, bal_seq
, kmerBuffer
, prevcBuffer
, nextcBuffer
, &kmer_c
, WordFilter
);
383 putKmer2DBgraph ( kmerS
, 1, kmer_c
, kmerBuffer
, prevcBuffer
, nextcBuffer
);
384 lenCtg2
= getSeqFromCtg ( ctg2
, 1, CTGendLen
, originOverlap
, src_seq
);
385 chopKmer4Ctg ( kmerCtg2
, lenCtg2
, overlap
, src_seq
, WordFilter
);
386 chopKmer4read ( lenCtg2
, overlap
, src_seq
, bal_seq
, kmerBuffer
, prevcBuffer
, nextcBuffer
, &kmer_c
, WordFilter
);
387 putKmer2DBgraph ( kmerS
, 2, kmer_c
, kmerBuffer
, prevcBuffer
, nextcBuffer
);
389 if(ctg1->ctgID==3733&&ctg2->ctgID==3067){
390 for(i=0;i<lenCtg2;i++)
391 printf("%c",int2base((int)src_seq[i]));
395 //printf("sequence length chop from contigs on both sides: %d %d\n",lenCtg1,lenCtg2);
396 //kmerSet_deLoop(kmerS,WordFilter);
397 kmerSet_mark ( kmerS
);
398 free ( ( void * ) src_seq
);
399 free ( ( void * ) bal_seq
);
400 free ( ( void * ) kmerBuffer
);
401 free ( ( void * ) nextcBuffer
);
402 free ( ( void * ) prevcBuffer
);
407 static void printKmerSeqLocal ( FILE * fp
, Kmer kmer
, int overlap
)
409 int i
, bit1
, bit2
, bit3
, bit4
;
421 if ( overlap
>= 32 && overlap
< 64 )
429 if ( overlap
>= 64 && overlap
< 96 )
437 if ( overlap
>= 96 && overlap
< 128 )
445 for ( i
= bit1
- 1; i
>= 0; i
-- )
447 ch
= kmer
.high1
& 0x3;
452 for ( i
= bit2
- 1; i
>= 0; i
-- )
454 ch
= kmer
.low1
& 0x3;
456 kmerSeq
[i
+ bit1
] = ch
;
459 for ( i
= bit3
- 1; i
>= 0; i
-- )
461 ch
= kmer
.high2
& 0x3;
463 kmerSeq
[i
+ bit1
+ bit2
] = ch
;
466 for ( i
= bit4
- 1; i
>= 0; i
-- )
468 ch
= kmer
.low2
& 0x3;
470 kmerSeq
[i
+ bit1
+ bit2
+ bit3
] = ch
;
473 for ( i
= 0; i
< overlap
; i
++ )
475 fprintf ( fp
, "%c", int2base ( ( int ) kmerSeq
[i
] ) );
479 static void printKmerSeqLocal ( FILE * fp
, Kmer kmer
, int overlap
)
484 bit2
= overlap
> 32 ? 32 : overlap
;
485 bit1
= overlap
> 32 ? overlap
- 32 : 0;
487 for ( i
= bit1
- 1; i
>= 0; i
-- )
489 ch
= kmer
.high
& 0x3;
494 for ( i
= bit2
- 1; i
>= 0; i
-- )
498 kmerSeq
[i
+ bit1
] = ch
;
501 for ( i
= 0; i
< overlap
; i
++ )
503 fprintf ( fp
, "%c", int2base ( ( int ) kmerSeq
[i
] ) );
508 static void kmerSet_mark ( KmerSet
* set
)
510 int i
, in_num
, out_num
, cvgSingle
;
512 long long counter
= 0, linear
= 0;
516 while ( set
->iter_ptr
< set
->size
)
518 if ( !is_kmer_entity_null ( set
->flags
, set
->iter_ptr
) )
520 in_num
= out_num
= 0;
521 rs
= set
->array
+ set
->iter_ptr
;
524 for ( i
= 0; i
< 4; i
++ )
526 cvgSingle
= get_kmer_left_cov ( *rs
, i
);
533 cvgSingle
= get_kmer_right_cov ( *rs
, i
);
546 if ( in_num
== 1 && out_num
== 1 )
556 //printf("Allocated %ld node, %ld single nodes, %ld linear\n",(long)count_kmerset(set),counter,linear);
559 static kmer_t
* searchNode ( Kmer word
, KmerSet
* kset
, int overlap
)
561 Kmer bal_word
= reverseComplement ( word
, overlap
);
565 if ( KmerSmaller ( word
, bal_word
) )
567 found
= search_kmerset ( kset
, word
, &node
);
571 found
= search_kmerset ( kset
, bal_word
, &node
);
584 static int searchKmerOnCtg ( Kmer currW
, Kmer
* kmerDest
, int num
)
588 for ( i
= 0; i
< num
; i
++ )
590 if ( KmerEqual ( currW
, kmerDest
[i
] ) )
599 // pick on from n items randomly
600 static int nPick1 ( int * array
, int n
)
603 m
= n
- 1; //(int)(drand48()*n);
604 int value
= array
[m
];
606 for ( i
= m
; i
< n
- 1; i
++ )
608 array
[i
] = array
[i
+ 1];
614 static void traceAlongDBgraph ( Kmer currW
, int steps
, int min
, int max
, int * num_route
, KmerSet
* kset
,
615 Kmer
* kmerDest
, int num
, int overlap
, Kmer WORDF
, char ** foundRoutes
, int * routeEndOnCtg2
,
616 int * routeLens
, char * soFarSeq
, int * traceCounter
, int maxRoute
, kmer_t
** soFarNode
,
617 boolean
* multiOccu
, long long * soFarLinks
, double * avgLinks
)
619 ( *traceCounter
) ++;
621 if ( *traceCounter
> UPlimit
)
624 if(overlap==19&&kmerDest[0]==pubKmer)
630 if ( steps
> max
|| *num_route
>= maxRoute
)
633 if(overlap==19&&kmerDest[0]==pubKmer)
634 printf("max steps/maxRoute\n");
639 Kmer word
= reverseComplement ( currW
, overlap
);
640 boolean isSmaller
= KmerSmaller ( currW
, word
);
651 boolean found
= search_kmerset ( kset
, word
, &node
);
656 fprintf ( stderr
, "%s Trace: can't find kmer ", __FUNCTION__
);
657 PrintKmer ( stderr
, word
);
658 fprintf ( stderr
, " (input " );
659 PrintKmer ( stderr
, currW
);
660 fprintf ( stderr
, ") at step %d.\n", steps
);
663 fprintf (stderr, "%s Trace: can't find kmer %llx %llx %llx %llx (input %llx %llx %llx %llx) at step %d.\n",
664 __FUNCTION__, word.high1, word.low1, word.high2, word.low2, currW.high1, currW.low1,
665 currW.high2, currW.low2, steps );
667 printf ( "Trace: can't find kmer %llx %llx (input %llx %llx) at step %d\n",
668 word.high, word.low, currW.high, currW.low, steps );
675 // if (!found) return;
678 if ( node
->twin
> 1 )
685 soFarNode
[steps
] = node
;
690 soFarSeq
[steps
- 1] = lastCharInKmer ( currW
);
694 int linkCounter
= *soFarLinks
;
696 if ( steps
>= min
&& node
->inEdge
> 1 && ( end
= searchKmerOnCtg ( currW
, kmerDest
, num
) ) >= 0 )
702 avgLinks
[index
] = ( double ) linkCounter
/ steps
;
709 //find node that appears more than once in the path
710 multiOccu
[index
] = 0;
712 for ( i
= 0; i
< steps
+ 1; i
++ )
714 soFarNode
[i
]->deleted
= 0;
717 for ( i
= 0; i
< steps
+ 1; i
++ )
719 if ( soFarNode
[i
]->deleted
)
721 multiOccu
[index
] = 1;
725 soFarNode
[i
]->deleted
= 1;
728 routeEndOnCtg2
[index
] = end
;
729 routeLens
[index
] = steps
;
730 char * array
= foundRoutes
[index
];
732 for ( i
= 0; i
< steps
; i
++ )
734 array
[i
] = soFarSeq
[i
];
740 } //indicate the end of the sequence
742 *num_route
= ++index
;
750 int array
[] = { 0, 1, 2, 3 };
752 for ( i
= 4; i
> 0; i
-- )
754 ch
= nPick1 ( array
, i
);
755 links
= get_kmer_right_cov ( *node
, ch
);
762 *soFarLinks
= linkCounter
+ links
;
763 word
= nextKmerLocal ( currW
, ch
, WORDF
);
764 traceAlongDBgraph ( word
, steps
, min
, max
, num_route
, kset
, kmerDest
, num
, overlap
, WORDF
,
765 foundRoutes
, routeEndOnCtg2
, routeLens
, soFarSeq
, traceCounter
, maxRoute
,
766 soFarNode
, multiOccu
, soFarLinks
, avgLinks
);
771 int array
[] = { 0, 1, 2, 3 };
773 for ( i
= 4; i
> 0; i
-- )
775 ch
= nPick1 ( array
, i
);
776 links
= get_kmer_left_cov ( *node
, ch
);
783 *soFarLinks
= linkCounter
+ links
;
784 word
= nextKmerLocal ( currW
, int_comp ( ch
), WORDF
);
785 traceAlongDBgraph ( word
, steps
, min
, max
, num_route
, kset
, kmerDest
, num
, overlap
, WORDF
,
786 foundRoutes
, routeEndOnCtg2
, routeLens
, soFarSeq
, traceCounter
, maxRoute
,
787 soFarNode
, multiOccu
, soFarLinks
, avgLinks
);
792 static int searchFgap ( KmerSet
* kset
, CTGinSCAF
* ctg1
, CTGinSCAF
* ctg2
, Kmer
* kmerCtg1
, Kmer
* kmerCtg2
,
793 unsigned int origOverlap
, int overlap
, DARRAY
* gapSeqArray
, int len1
, int len2
,
794 Kmer WordFilter
, int * offset1
, int * offset2
, char * seqGap
, int * cut1
, int * cut2
)
798 kmer_t
* node
, **soFarNode
;
800 int gapLen
= ctg2
->start
- ctg1
->end
- origOverlap
+ overlap
;
801 int min
= gapLen
- GLDiff
> 0 ? gapLen
- GLDiff
: 0; //0531
802 int max
= gapLen
+ GLDiff
< 10 ? 10 : gapLen
+ GLDiff
;
806 int * routeEndOnCtg2
;
809 long long soFarLinks
;
811 //mask linear internal linear kmer on contig1 end
812 routeEndOnCtg2
= ( int * ) ckalloc ( MaxRouteNum
* sizeof ( int ) );
813 routeLens
= ( int * ) ckalloc ( MaxRouteNum
* sizeof ( int ) );
814 multiOccu
= ( boolean
* ) ckalloc ( MaxRouteNum
* sizeof ( boolean
) );
815 short * MULTI1
= ( short * ) ckalloc ( MaxRouteNum
* sizeof ( short ) );
816 short * MULTI2
= ( short * ) ckalloc ( MaxRouteNum
* sizeof ( short ) );
817 soFarSeq
= ( char * ) ckalloc ( max
* sizeof ( char ) );
818 soFarNode
= ( kmer_t
** ) ckalloc ( ( max
+ 1 ) * sizeof ( kmer_t
* ) );
819 foundRoutes
= ( char ** ) ckalloc ( MaxRouteNum
* sizeof ( char * ) );;
820 avgLinks
= ( double * ) ckalloc ( MaxRouteNum
* sizeof ( double ) );;
822 for ( i
= 0; i
< MaxRouteNum
; i
++ )
824 foundRoutes
[i
] = ( char * ) ckalloc ( max
* sizeof ( char ) );
827 for ( i
= len1
- 1; i
>= 0; i
-- )
829 num_route
= traceCounter
= soFarLinks
= 0;
831 traceAlongDBgraph ( kmerCtg1
[i
], steps
, min
, max
, &num_route
, kset
, kmerCtg2
, len2
, overlap
, WordFilter
,
832 foundRoutes
, routeEndOnCtg2
, routeLens
, soFarSeq
, &traceCounter
, MaxRouteNum
, soFarNode
,
833 multiOccu
, &soFarLinks
, avgLinks
);
837 int m
, minEnd
= routeEndOnCtg2
[0];
839 for ( m
= 0; m
< num_route
; m
++ )
841 if ( routeLens
[m
] < 0 )
846 if ( routeEndOnCtg2
[m
] < minEnd
)
848 minEnd
= routeEndOnCtg2
[m
];
852 /* else if(minFreq>1){
853 for(m=0;m<num_route;m++){
854 if(routeEndOnCtg2[m]!=minEnd)
857 if(foundRoutes[m][j]>3)
859 printf("%c",int2base((int)foundRoutes[m][j]));
861 printf(": %4.2f\n",avgLinks[m]);
864 num_route
= traceCounter
= soFarLinks
= 0;
866 trace4Repeat ( kmerCtg1
[i
], steps
, min
, max
, &num_route
, kset
, kmerCtg2
[minEnd
], overlap
, WordFilter
, &traceCounter
,
867 MaxRouteNum
, soFarNode
, MULTI1
, MULTI2
, routeLens
, foundRoutes
, soFarSeq
, &soFarLinks
, avgLinks
);
869 int maxLen
= routeLens
[0];
870 double maxLink
= avgLinks
[0];
872 boolean repeat
= 0, sameLen
= 1;
873 int leftMost
= max
, rightMost
= max
;
878 fprintf ( stderr
, "After trace4Repeat: non route was found.\n" );
884 if ( num_route
< 1 ) { continue; }
890 // if multi paths are found, we check on the repeatative occurrences and links/length
891 for ( m
= 0; m
< num_route
; m
++ )
893 if ( routeLens
[m
] < 0 )
898 if ( MULTI1
[m
] >= 0 && MULTI2
[m
] >= 0 )
901 leftMost
= leftMost
> MULTI1
[m
] ? MULTI1
[m
] : leftMost
;
902 rightMost
= rightMost
> MULTI2
[m
] ? MULTI2
[m
] : rightMost
;
905 if ( routeLens
[m
] != maxLen
)
910 if ( routeLens
[m
] < maxLen
)
912 maxLen
= routeLens
[m
];
915 if ( avgLinks
[m
] > maxLink
)
917 maxLink
= avgLinks
[m
];
925 *offset1
= *offset2
= *cut1
= *cut2
= 0;
929 for ( j
= 0; j
< leftMost
; j
++ )
931 if ( routeLens
[0] < j
+ overlap
+ 1 )
937 ch
= foundRoutes
[0][j
];
940 for ( m
= 1; m
< num_route
; m
++ )
942 if ( routeLens
[m
] < 0 )
947 if ( ch
!= foundRoutes
[m
][j
] )
953 if ( m
== num_route
)
955 seqGap
[index
++] = ch
;
966 for ( j
= 0; j
< rightMost
; j
++ )
968 if ( routeLens
[0] - overlap
- 1 < j
)
974 ch
= foundRoutes
[0][routeLens
[0] - overlap
- 1 - j
];
977 for ( m
= 1; m
< num_route
; m
++ )
979 if ( routeLens
[m
] < 0 )
984 if ( ch
!= foundRoutes
[m
][routeLens
[m
] - overlap
- 1 - j
] )
990 if ( m
== num_route
)
1002 for ( j
= 0; j
< *offset2
; j
++ )
1004 seqGap
[*offset1
+ *offset2
- 1 - j
] = foundRoutes
[0][routeLens
[0] - overlap
- 1 - j
];
1007 if ( *offset1
> 0 || *offset2
> 0 )
1009 *cut1
= len1
- i
- 1;
1012 //fprintf(stderr,"\n");
1013 for ( m
= 0; m
< num_route
; m
++ )
1015 for ( j
= 0; j
< max
; j
++ )
1017 if ( foundRoutes
[m
][j
] > 3 )
1022 //fprintf(stderr,"%c",int2base((int)foundRoutes[m][j]));
1025 //fprintf(stderr,": %4.2f\n",avgLinks[m]);
1029 fprintf(stderr,">Gap (%d + %d) (%d + %d)\n",*offset1,*offset2,*cut1,*cut2);
1030 for(index=0;index<*offset1+*offset2;index++)
1031 fprintf(stderr,"%c",int2base(seqGap[index]));
1032 fprintf(stderr,"\n"); */
1039 if ( overlap
+ ( len1
- i
- 1 ) + minEnd
- routeLens
[best
] > ( int ) origOverlap
)
1044 ctg1
->gapSeqOffset
= gapSeqArray
->item_c
;
1045 ctg1
->gapSeqLen
= routeLens
[best
];
1047 if ( !darrayPut ( gapSeqArray
, ctg1
->gapSeqOffset
+ maxLen
/ 4 ) )
1052 pt
= ( char * ) darrayPut ( gapSeqArray
, ctg1
->gapSeqOffset
);
1055 printKmerSeqLocal(stderr,kmerCtg1[i],overlap);
1056 fprintf(stderr,"-");
1058 for ( j
= 0; j
< max
; j
++ )
1060 if ( foundRoutes
[best
][j
] > 3 )
1065 writeChar2tightString ( foundRoutes
[best
][j
], pt
, j
);
1066 //fprintf(stderr,"%c",int2base((int)foundRoutes[best][j]));
1069 //fprintf(stderr,": GAPSEQ %d + %d, avglink %4.2f\n",len1-i-1,minEnd,avgLinks[best]);
1070 ctg1
->cutTail
= len1
- i
- 1;
1071 ctg2
->cutHead
= overlap
+ minEnd
;
1072 ctg2
->scaftig_start
= 0;
1075 /* }if(num_route>1){
1079 else //mark node which leads to dead end
1081 node
= searchNode ( kmerCtg1
[i
], kset
, overlap
);
1090 for ( i
= 0; i
< MaxRouteNum
; i
++ )
1092 free ( ( void * ) foundRoutes
[i
] );
1095 free ( ( void * ) soFarSeq
);
1096 free ( ( void * ) soFarNode
);
1097 free ( ( void * ) multiOccu
);
1098 free ( ( void * ) MULTI1
);
1099 free ( ( void * ) MULTI2
);
1100 free ( ( void * ) foundRoutes
);
1101 free ( ( void * ) routeEndOnCtg2
);
1102 free ( ( void * ) routeLens
);
1106 static void trace4Repeat ( Kmer currW
, int steps
, int min
, int max
, int * num_route
, KmerSet
* kset
, Kmer kmerDest
, int overlap
, Kmer WORDF
,
1107 int * traceCounter
, int maxRoute
, kmer_t
** soFarNode
, short * multiOccu1
, short * multiOccu2
, int * routeLens
,
1108 char ** foundRoutes
, char * soFarSeq
, long long * soFarLinks
, double * avgLinks
)
1110 ( *traceCounter
) ++;
1112 if ( *traceCounter
> UPlimit
)
1117 if ( steps
> max
|| *num_route
>= maxRoute
)
1122 Kmer word
= reverseComplement ( currW
, overlap
);
1123 boolean isSmaller
= KmerSmaller ( currW
, word
);
1125 unsigned char links
;
1134 boolean found
= search_kmerset ( kset
, word
, &node
);
1139 fprintf ( stderr
, "%s Trace: can't find kmer ", __FUNCTION__
);
1140 PrintKmer ( stderr
, word
);
1141 fprintf ( stderr
, " (input " );
1142 PrintKmer ( stderr
, currW
);
1143 fprintf ( stderr
, ") at step %d.\n", steps
);
1146 printf ( "%s Trace: can't find kmer %llx %llx %llx %llx (input %llx %llx %llx %llx) at step %d\n",
1147 __FUNCTION__, word.high1, word.low1, word.high2, word.low2, currW.high1, currW.low1,
1148 currW.high2, currW.low2, steps );
1150 printf ( "Trace: can't find kmer %llx %llx (input %llx %llx) at step %d\n",
1151 word.high, word.low, currW.high, currW.low, steps );
1158 // if (!found) return;
1163 soFarNode
[steps
] = node
;
1166 if ( soFarSeq
&& steps
> 0 )
1168 soFarSeq
[steps
- 1] = lastCharInKmer ( currW
);
1175 linkCounter
= *soFarLinks
;
1178 if ( steps
>= min
&& KmerEqual ( currW
, kmerDest
) )
1182 if ( avgLinks
&& steps
> 0 )
1184 avgLinks
[index
] = ( double ) linkCounter
/ steps
;
1186 else if ( avgLinks
)
1188 avgLinks
[index
] = 0;
1191 //find node that appears more than once in the path
1192 if ( multiOccu1
&& multiOccu2
)
1194 for ( i
= 0; i
< steps
+ 1; i
++ )
1196 soFarNode
[i
]->deleted
= 0;
1202 for ( i
= 0; i
< steps
+ 1; i
++ )
1204 if ( soFarNode
[i
]->deleted
)
1206 rightMost
= rightMost
< i
- 1 ? i
- 1 : rightMost
;
1210 soFarNode
[i
]->deleted
= 1;
1215 multiOccu1
[index
] = multiOccu2
[index
] = -1;
1219 multiOccu2
[index
] = steps
- 2 - rightMost
< 0 ? 0 : steps
- 2 - rightMost
; //[0 steps-2]
1221 for ( i
= 0; i
< steps
+ 1; i
++ )
1223 soFarNode
[i
]->deleted
= 0;
1226 int leftMost
= steps
- 2;
1228 for ( i
= steps
; i
>= 0; i
-- )
1230 if ( soFarNode
[i
]->deleted
)
1232 leftMost
= leftMost
> i
- 1 ? i
- 1 : leftMost
;
1235 soFarNode
[i
]->deleted
= 1;
1238 multiOccu1
[index
] = leftMost
< 0 ? 0 : leftMost
; //[0 steps-2]
1244 routeLens
[index
] = steps
;
1249 char * array
= foundRoutes
[index
];
1251 for ( i
= 0; i
< steps
; i
++ )
1253 array
[i
] = soFarSeq
[i
];
1259 } //indicate the end of the sequence
1262 *num_route
= ++index
;
1269 int array
[] = { 0, 1, 2, 3 };
1271 for ( i
= 4; i
> 0; i
-- )
1273 ch
= nPick1 ( array
, i
);
1274 links
= get_kmer_right_cov ( *node
, ch
);
1283 *soFarLinks
= linkCounter
+ links
;
1286 word
= nextKmerLocal ( currW
, ch
, WORDF
);
1287 trace4Repeat ( word
, steps
, min
, max
, num_route
, kset
, kmerDest
, overlap
, WORDF
, traceCounter
, maxRoute
, soFarNode
,
1288 multiOccu1
, multiOccu2
, routeLens
, foundRoutes
, soFarSeq
, soFarLinks
, avgLinks
);
1293 int array
[] = { 0, 1, 2, 3 };
1295 for ( i
= 4; i
> 0; i
-- )
1297 ch
= nPick1 ( array
, i
);
1298 links
= get_kmer_left_cov ( *node
, ch
);
1307 *soFarLinks
= linkCounter
+ links
;
1310 word
= nextKmerLocal ( currW
, int_comp ( ch
), WORDF
);
1311 trace4Repeat ( word
, steps
, min
, max
, num_route
, kset
, kmerDest
, overlap
, WORDF
, traceCounter
, maxRoute
, soFarNode
,
1312 multiOccu1
, multiOccu2
, routeLens
, foundRoutes
, soFarSeq
, soFarLinks
, avgLinks
);
1317 //found repeat node on contig ends
1318 static void maskRepeatNode ( KmerSet
* kset
, Kmer
* kmerCtg1
, Kmer
* kmerCtg2
, int overlap
, int len1
, int len2
, int max
, Kmer WordFilter
)
1321 int num_route
, steps
;
1322 int min
= 1, maxRoute
= 1;
1324 Kmer word
, bal_word
;
1329 for ( i
= 0; i
< len1
; i
++ )
1332 bal_word
= reverseComplement ( word
, overlap
);
1334 if ( KmerLarger ( word
, bal_word
) )
1339 found
= search_kmerset ( kset
, word
, &node
);
1341 if ( !found
|| node
->linear
)
1343 //printf("Found no node for kmer %llx\n",word);
1347 num_route
= traceCounter
= 0;
1349 trace4Repeat ( word
, steps
, min
, max
, &num_route
, kset
, word
, overlap
, WordFilter
, &traceCounter
, maxRoute
, NULL
, NULL
, NULL
, NULL
, NULL
, NULL
, NULL
, NULL
);
1351 if ( num_route
< 1 )
1360 for ( i
= 0; i
< len2
; i
++ )
1363 bal_word
= reverseComplement ( word
, overlap
);
1365 if ( KmerLarger ( word
, bal_word
) )
1370 found
= search_kmerset ( kset
, word
, &node
);
1372 if ( !found
|| node
->linear
)
1374 //printf("Found no node for kmer %llx\n",word);
1378 num_route
= traceCounter
= 0;
1380 trace4Repeat ( word
, steps
, min
, max
, &num_route
, kset
, word
, overlap
, WordFilter
, &traceCounter
, maxRoute
, NULL
, NULL
, NULL
, NULL
, NULL
, NULL
, NULL
, NULL
);
1382 if ( num_route
< 1 )
1391 //printf("MR: %d(%d)\n",counter,len1+len2);
1395 static boolean chopReadFillGap(int len_seq,int overlap,char *src_seq, char *bal_seq,
1396 KmerSet *kset,Kmer WORDF,int *start,int *end,boolean *bal,
1397 Kmer *KmerCtg1,int len1,Kmer *KmerCtg2,int len2,int *index1,int *index2)
1399 int index,j=0,bal_j;
1401 int flag=0,bal_flag=0;
1402 int ctg1start,bal_ctg1start,ctg2end,bal_ctg2end;
1403 int seqStart,bal_start,seqEnd,bal_end;
1407 if(len_seq<overlap+1){
1411 for (index = 0;index<overlap;index++){
1413 word += src_seq[index];
1415 reverseComplementSeq(src_seq, len_seq,bal_seq);
1417 // complementary node
1418 bal_word = reverseComplement(word,overlap);
1419 bal_j = len_seq-0-overlap; // 0;
1420 flag = bal_flag = 0;
1422 found = search_kmerset(kset,word,&node);
1424 found = search_kmerset(kset,bal_word,&node);
1426 if(found&&!node->linear&&!node->checked){
1427 if(!flag&&node->inEdge==1){
1428 ctg1start = searchKmerOnCtg(word,KmerCtg1,len1);
1431 seqStart = j + overlap-1;
1434 if(!bal_flag&&node->inEdge==2){
1435 bal_ctg2end = searchKmerOnCtg(bal_word,KmerCtg2,len2);
1438 bal_end = bal_j+overlap-1;
1443 for(j = 1; j <= len_seq - overlap; j ++) {
1444 word = nextKmerLocal(word,src_seq[j-1+overlap],WORDF);
1445 bal_j = len_seq-j-overlap; // j;
1446 bal_word = prevKmerLocal(bal_word,bal_seq[bal_j],overlap);
1449 found = search_kmerset(kset,word,&node);
1451 found = search_kmerset(kset,bal_word,&node);
1453 if(found&&!node->linear&&!node->checked){
1454 if(!flag&&node->inEdge==1){
1455 ctg1start = searchKmerOnCtg(word,KmerCtg1,len1);
1458 seqStart = j + overlap-1;
1460 }else if(flag==1&&node->inEdge==1){
1461 index = searchKmerOnCtg(word,KmerCtg1,len1);
1462 if(index>ctg1start){ // choose hit closer to gap
1464 seqStart = j + overlap-1;
1466 }else if(flag==1&&node->inEdge==2){
1467 ctg2end = searchKmerOnCtg(word,KmerCtg2,len2);
1470 seqEnd = j+overlap-1;
1475 if(!bal_flag&&node->inEdge==2){
1476 bal_ctg2end = searchKmerOnCtg(bal_word,KmerCtg2,len2);
1479 bal_end = bal_j+overlap-1;
1481 }else if(bal_flag==2&&node->inEdge==2){
1482 index = searchKmerOnCtg(bal_word,KmerCtg2,len2);
1483 if(index<bal_ctg2end){ // choose hit closer to gap
1484 index = bal_ctg2end;
1485 bal_end = bal_j+overlap-1;
1487 }else if(bal_flag==2&&node->inEdge==1){
1488 bal_ctg1start = searchKmerOnCtg(bal_word,KmerCtg1,len1);
1489 if(bal_ctg1start>0){
1491 bal_start = bal_j+overlap-1;
1501 *index1 = ctg1start;
1504 }else if(bal_flag==3){
1508 *index1 = bal_ctg1start;
1509 *index2 = bal_ctg2end;
1515 static boolean readsCrossGap(READNEARBY *rdArray, int num, int originOverlap,DARRAY *gapSeqArray,
1516 Kmer *kmerCtg1,Kmer *kmerCtg2,int overlap,int len1,int len2,
1517 CTGinSCAF *ctg1,CTGinSCAF *ctg2,KmerSet *kmerS,Kmer WordFilter,int min,int max)
1519 int i,j,start,end,startOnCtg1,endOnCtg2;
1523 boolean bal,ret=0,FILL;
1525 src_seq = (char *)ckalloc(maxReadLen*sizeof(char));
1526 bal_seq = (char *)ckalloc(maxReadLen*sizeof(char));
1529 getSeqFromRead(rdArray[i],src_seq);
1530 FILL = chopReadFillGap(rdArray[i].len,overlap,src_seq,bal_seq,
1531 kmerS,WordFilter,&start,&end,&bal,
1532 kmerCtg1,len1,kmerCtg2,len2,&startOnCtg1,&endOnCtg2);
1534 if(!FILL||(end-start)<min||(end-start)>max)
1536 fprintf(stderr,"Read across\n");
1537 //printf("Filled: K %d, ctg1 %d ctg2 %d,start %d end %d\n",overlap,startOnCtg1,endOnCtg2,start,end);
1538 if(overlap+(len1-startOnCtg1-1)+endOnCtg2-(end-start)>(int)originOverlap)
1539 continue; // contig1 and contig2 could not overlap more than origOverlap bases
1541 ctg1->gapSeqOffset = gapSeqArray->item_c;
1542 ctg1->gapSeqLen = end-start;
1543 if(!darrayPut(gapSeqArray,ctg1->gapSeqOffset+(end-start)/4))
1545 pt = (char *)darrayPut(gapSeqArray,ctg1->gapSeqOffset);
1546 for(j=start+1;j<=end;j++){
1548 writeChar2tightString(bal_seq[j],pt,j-start-1);
1550 writeChar2tightString(src_seq[j],pt,j-start-1);
1553 ctg1->cutTail = len1-startOnCtg1-1;
1554 ctg2->cutHead = overlap + endOnCtg2;
1555 ctg2->scaftig_start = 0;
1561 free((void*)src_seq);
1562 free((void*)bal_seq);
1566 static void kmerSet_markTandem ( KmerSet
* set
, Kmer WordFilter
, int overlap
);
1567 static boolean
readsCrossGap ( READNEARBY
* rdArray
, int num
, int originOverlap
, DARRAY
* gapSeqArray
,
1568 Kmer
* kmerCtg1
, Kmer
* kmerCtg2
, int overlap
, CTGinSCAF
* ctg1
, CTGinSCAF
* ctg2
,
1569 KmerSet
* kmerS
, Kmer WordFilter
, int min
, int max
, int offset1
, int offset2
,
1570 char * seqGap
, char * seqCtg1
, char * seqCtg2
, int cut1
, int cut2
);
1572 int localGraph ( READNEARBY
* rdArray
, int num
, CTGinSCAF
* ctg1
, CTGinSCAF
* ctg2
,
1573 int origOverlap
, Kmer
* kmerCtg1
, Kmer
* kmerCtg2
, int overlap
,
1574 DARRAY
* gapSeqArray
, char * seqCtg1
, char * seqCtg2
, char * seqGap
)
1576 /**************** put kmer in DBgraph ****************/
1578 Kmer WordFilter
= createFilter ( overlap
);
1580 if(ctg1->ctgID==56410&&ctg2->ctgID==61741)
1581 printf("Extract %d reads for gap [%d %d]\n",num,ctg1->ctgID,ctg2->ctgID);
1583 kmerSet
= readsInGap2DBgraph ( rdArray
, num
, ctg1
, ctg2
, origOverlap
, kmerCtg1
, kmerCtg2
, overlap
, WordFilter
);
1587 //printf("no kmer found\n");
1593 //srand48 ( ( int ) tt );
1597 int array[] = {0,1,2,3};
1599 fprintf(stderr,"%d ", nPick1(array,j));
1601 fprintf(stderr,"\n");
1603 /***************** search path to connect contig ends ********/
1604 int gapLen
= ctg2
->start
- ctg1
->end
- origOverlap
+ overlap
;
1605 int min
= gapLen
- GLDiff
> 0 ? gapLen
- GLDiff
: 0;
1606 int max
= gapLen
+ GLDiff
< 10 ? 10 : gapLen
+ GLDiff
;
1607 //count kmer number for contig1 and contig2 ends
1609 len1
= CTGendLen
< contig_array
[ctg1
->ctgID
].length
+ origOverlap
? CTGendLen
: contig_array
[ctg1
->ctgID
].length
+ origOverlap
;
1610 len2
= CTGendLen
< contig_array
[ctg2
->ctgID
].length
+ origOverlap
? CTGendLen
: contig_array
[ctg2
->ctgID
].length
+ origOverlap
;
1611 len1
-= overlap
- 1;
1612 len2
-= overlap
- 1;
1614 int offset1
= 0, offset2
= 0, cut1
= 0, cut2
= 0;
1615 int pathNum
= searchFgap ( kmerSet
, ctg1
, ctg2
, kmerCtg1
, kmerCtg2
,
1616 origOverlap
, overlap
, gapSeqArray
,
1617 len1
, len2
, WordFilter
, &offset1
, &offset2
, seqGap
, &cut1
, &cut2
);
1619 //printf("SF: %d K %d\n",pathNum,overlap);
1622 free_kmerset ( kmerSet
);
1625 else if ( pathNum
== 1 )
1627 free_kmerset ( kmerSet
);
1632 printf("ret %d\n",pathNum);
1633 free_kmerset(kmerSet);
1636 /******************* cross the gap by single reads *********/
1637 //kmerSet_markTandem(kmerSet,WordFilter,overlap);
1638 maskRepeatNode ( kmerSet
, kmerCtg1
, kmerCtg2
, overlap
, len1
, len2
, max
, WordFilter
);
1639 boolean found
= readsCrossGap ( rdArray
, num
, origOverlap
, gapSeqArray
,
1640 kmerCtg1
, kmerCtg2
, overlap
, ctg1
, ctg2
, kmerSet
, WordFilter
, min
, max
,
1641 offset1
, offset2
, seqGap
, seqCtg1
, seqCtg2
, cut1
, cut2
);
1645 //fprintf(stderr,"read across\n");
1646 free_kmerset ( kmerSet
);
1651 free_kmerset ( kmerSet
);
1656 static void kmerSet_markTandem ( KmerSet
* set
, Kmer WordFilter
, int overlap
)
1659 long long counter
= 0;
1660 int num_route
, steps
;
1661 int min
= 1, max
= overlap
, maxRoute
= 1;
1665 while ( set
->iter_ptr
< set
->size
)
1667 if ( !is_kmer_entity_null ( set
->flags
, set
->iter_ptr
) )
1669 rs
= set
->array
+ set
->iter_ptr
;
1671 if ( rs
->inEdge
> 0 )
1677 num_route
= traceCounter
= 0;
1679 trace4Repeat ( rs
->seq
, steps
, min
, max
, &num_route
, set
, rs
->seq
, overlap
, WordFilter
, &traceCounter
, maxRoute
, NULL
, NULL
, NULL
, NULL
, NULL
, NULL
, NULL
, NULL
);
1681 if ( num_route
< 1 )
1688 printKmerSeqLocal(stderr,rs->seq,overlap);
1689 fprintf(stderr, "\n");
1699 /******************* the following is for read-crossing gaps *************************/
1701 #define MAXREADLENGTH 100
1703 static const int INDEL
= 0;
1704 static const int SIM
[4][4] =
1711 static char fastSequence
[MAXREADLENGTH
];
1712 static char slowSequence
[MAXREADLENGTH
];
1714 static int Fmatrix
[MAXREADLENGTH
+ 1][MAXREADLENGTH
+ 1];
1715 static int slowToFastMapping
[MAXREADLENGTH
+ 1];
1716 static int fastToSlowMapping
[MAXREADLENGTH
+ 1];
1718 static int max ( int A
, int B
, int C
)
1721 return ( A
>= C
? A
: C
);
1724 static int compareSequences ( char * sequence1
, char * sequence2
, int length1
, int length2
)
1726 if ( length1
< 1 || length2
< 1 || length1
> MAXREADLENGTH
|| length2
> MAXREADLENGTH
)
1732 int Choice1
, Choice2
, Choice3
;
1735 for ( i
= 0; i
<= length1
; i
++ )
1740 for ( j
= 0; j
<= length2
; j
++ )
1745 for ( i
= 1; i
<= length1
; i
++ )
1747 for ( j
= 1; j
<= length2
; j
++ )
1749 Choice1
= Fmatrix
[i
- 1][j
- 1] + SIM
[ ( int ) sequence1
[i
- 1]][ ( int ) sequence2
[j
- 1]];
1750 Choice2
= Fmatrix
[i
- 1][j
] + INDEL
;
1751 Choice3
= Fmatrix
[i
][j
- 1] + INDEL
;
1752 Fmatrix
[i
][j
] = max ( Choice1
, Choice2
, Choice3
);
1756 maxScore
= Fmatrix
[length1
][length2
];
1760 static void mapSlowOntoFast ( int slowSeqLength
, int fastSeqLength
)
1762 int slowIndex
= slowSeqLength
;
1763 int fastIndex
= fastSeqLength
;
1766 if ( slowIndex
== 0 )
1768 slowToFastMapping
[0] = fastIndex
;
1770 while ( fastIndex
>= 0 )
1772 fastToSlowMapping
[fastIndex
--] = 0;
1778 if ( fastIndex
== 0 )
1780 while ( slowIndex
>= 0 )
1782 slowToFastMapping
[slowIndex
--] = 0;
1785 fastToSlowMapping
[0] = slowIndex
;
1789 while ( slowIndex
> 0 && fastIndex
> 0 )
1791 fastn
= ( int ) fastSequence
[fastIndex
- 1]; //getCharInTightString(fastSequence,fastIndex-1);
1792 slown
= ( int ) slowSequence
[slowIndex
- 1]; //getCharInTightString(slowSequence,slowIndex-1);
1794 if ( Fmatrix
[fastIndex
][slowIndex
] == Fmatrix
[fastIndex
- 1][slowIndex
- 1] + SIM
[fastn
][slown
] )
1796 fastToSlowMapping
[--fastIndex
] = --slowIndex
;
1797 slowToFastMapping
[slowIndex
] = fastIndex
;
1799 else if ( Fmatrix
[fastIndex
][slowIndex
] == Fmatrix
[fastIndex
- 1][slowIndex
] + INDEL
)
1801 fastToSlowMapping
[--fastIndex
] = slowIndex
- 1;
1803 else if ( Fmatrix
[fastIndex
][slowIndex
] == Fmatrix
[fastIndex
][slowIndex
- 1] + INDEL
)
1805 slowToFastMapping
[--slowIndex
] = fastIndex
- 1;
1809 fprintf ( stderr
, "CompareSequence: Error trace.\n" );
1814 while ( slowIndex
> 0 )
1816 slowToFastMapping
[--slowIndex
] = -1;
1819 while ( fastIndex
> 0 )
1821 fastToSlowMapping
[--fastIndex
] = -1;
1824 slowToFastMapping
[slowSeqLength
] = fastSeqLength
;
1825 fastToSlowMapping
[fastSeqLength
] = slowSeqLength
;
1828 static boolean
chopReadFillGap ( int len_seq
, int overlap
, char * src_seq
, char * bal_seq
,
1829 KmerSet
* kset
, Kmer WORDF
, int * start
, int * end
, boolean
* bal
,
1830 Kmer
* KmerCtg1
, int len1
, Kmer
* KmerCtg2
, int len2
, int * index1
, int * index2
)
1832 int index
, j
= 0, bal_j
;
1833 Kmer word
, bal_word
;
1834 int flag
= 0, bal_flag
= 0;
1835 int ctg1start
, bal_ctg1start
, ctg2end
, bal_ctg2end
;
1836 int seqStart
, bal_start
, seqEnd
, bal_end
;
1840 if ( len_seq
< overlap
+ 1 )
1846 word
.high1
= word
.low1
= word
.high2
= word
.low2
= 0;
1848 for ( index
= 0; index
< overlap
; index
++ )
1850 word
= KmerLeftBitMoveBy2 ( word
);
1851 word
.low2
|= src_seq
[index
];
1855 word
.high
= word
.low
= 0;
1857 for ( index
= 0; index
< overlap
; index
++ )
1859 word
= KmerLeftBitMoveBy2 ( word
);
1860 word
.low
|= src_seq
[index
];
1864 reverseComplementSeq ( src_seq
, len_seq
, bal_seq
);
1865 // complementary node
1866 bal_word
= reverseComplement ( word
, overlap
);
1867 bal_j
= len_seq
- 0 - overlap
; // 0;
1868 flag
= bal_flag
= 0;
1870 if ( KmerSmaller ( word
, bal_word
) )
1872 found
= search_kmerset ( kset
, word
, &node
);
1876 found
= search_kmerset ( kset
, bal_word
, &node
);
1879 //if ( !found ) { printf ( "chopReadFillGap 1292 not found!\n" ); }
1881 if ( found
&& !node
->linear
&& !node
->checked
)
1883 if ( !flag
&& node
->inEdge
== 1 )
1885 ctg1start
= searchKmerOnCtg ( word
, KmerCtg1
, len1
);
1887 if ( ctg1start
>= 0 )
1890 seqStart
= j
+ overlap
- 1;
1894 if ( !bal_flag
&& node
->inEdge
== 2 )
1896 bal_ctg2end
= searchKmerOnCtg ( bal_word
, KmerCtg2
, len2
);
1898 if ( bal_ctg2end
>= 0 )
1901 bal_end
= bal_j
+ overlap
- 1;
1906 for ( j
= 1; j
<= len_seq
- overlap
; j
++ )
1908 word
= nextKmerLocal ( word
, src_seq
[j
- 1 + overlap
], WORDF
);
1909 bal_j
= len_seq
- j
- overlap
; // j;
1910 bal_word
= prevKmerLocal ( bal_word
, bal_seq
[bal_j
], overlap
);
1912 if ( KmerSmaller ( word
, bal_word
) )
1914 found
= search_kmerset ( kset
, word
, &node
);
1918 found
= search_kmerset ( kset
, bal_word
, &node
);
1921 //if ( !found ) { printf ( "chopReadFillGap 1321 not found!\n" ); }
1923 if ( found
&& !node
->linear
&& !node
->checked
)
1925 if ( !flag
&& node
->inEdge
== 1 )
1927 ctg1start
= searchKmerOnCtg ( word
, KmerCtg1
, len1
);
1929 if ( ctg1start
>= 0 )
1932 seqStart
= j
+ overlap
- 1;
1935 else if ( flag
== 1 && node
->inEdge
== 1 )
1937 index
= searchKmerOnCtg ( word
, KmerCtg1
, len1
);
1939 if ( index
>= 0 && index
> ctg1start
) // choose hit closer to gap
1942 seqStart
= j
+ overlap
- 1;
1945 else if ( flag
== 1 && node
->inEdge
== 2 )
1947 ctg2end
= searchKmerOnCtg ( word
, KmerCtg2
, len2
);
1952 seqEnd
= j
+ overlap
- 1;
1957 if ( !bal_flag
&& node
->inEdge
== 2 )
1959 bal_ctg2end
= searchKmerOnCtg ( bal_word
, KmerCtg2
, len2
);
1961 if ( bal_ctg2end
>= 0 )
1964 bal_end
= bal_j
+ overlap
- 1;
1967 else if ( bal_flag
== 2 && node
->inEdge
== 2 )
1969 index
= searchKmerOnCtg ( bal_word
, KmerCtg2
, len2
);
1971 if ( index
>= 0 && index
< bal_ctg2end
) // choose hit closer to gap
1973 bal_ctg2end
= index
;
1974 bal_end
= bal_j
+ overlap
- 1;
1977 else if ( bal_flag
== 2 && node
->inEdge
== 1 )
1979 bal_ctg1start
= searchKmerOnCtg ( bal_word
, KmerCtg1
, len1
);
1981 if ( bal_ctg1start
>= 0 )
1984 bal_start
= bal_j
+ overlap
- 1;
1996 *index1
= ctg1start
;
2000 else if ( bal_flag
== 3 )
2005 *index1
= bal_ctg1start
;
2006 *index2
= bal_ctg2end
;
2013 static int cutSeqFromTightStr ( char * tightStr
, int length
, int start
, int end
, int revS
, char * src_seq
)
2016 end
= end
< length
? end
: length
- 1;
2017 start
= start
>= 0 ? start
: 0;
2021 for ( i
= start
; i
<= end
; i
++ )
2023 src_seq
[index
++] = getCharInTightString ( tightStr
, i
);
2028 for ( i
= length
- 1 - start
; i
>= length
- end
- 1; i
-- )
2030 src_seq
[index
++] = int_comp ( getCharInTightString ( tightStr
, i
) );
2034 return end
- start
+ 1;
2037 static int cutSeqFromCtg ( unsigned int ctgID
, int start
, int end
, char * sequence
, int originOverlap
)
2039 unsigned int bal_ctg
= getTwinCtg ( ctgID
);
2041 if ( contig_array
[ctgID
].length
< 1 )
2046 int length
= contig_array
[ctgID
].length
+ originOverlap
;
2048 if ( contig_array
[ctgID
].seq
)
2050 return cutSeqFromTightStr ( contig_array
[ctgID
].seq
, length
, start
, end
, 0, sequence
);
2054 return cutSeqFromTightStr ( contig_array
[bal_ctg
].seq
, length
, start
, end
, 1, sequence
);
2058 static int cutSeqFromRead ( char * src_seq
, int length
, int start
, int end
, char * sequence
)
2060 if ( end
>= length
)
2062 fprintf ( stderr
, "The index is bigger than the length: end %d length %d.\n", end
, length
);
2065 end
= end
< length
? end
: length
- 1;
2066 start
= start
>= 0 ? start
: 0;
2069 for ( i
= start
; i
<= end
; i
++ )
2071 sequence
[i
- start
] = src_seq
[i
];
2074 return end
- start
+ 1;
2077 void printSeq ( FILE * fo
, char * seq
, int len
)
2081 for ( i
= 0; i
< len
; i
++ )
2083 fprintf ( fo
, "%c", int2base ( ( int ) seq
[i
] ) );
2086 fprintf ( fo
, "\n" );
2089 static boolean
readsCrossGap ( READNEARBY
* rdArray
, int num
, int originOverlap
, DARRAY
* gapSeqArray
,
2090 Kmer
* kmerCtg1
, Kmer
* kmerCtg2
, int overlap
, CTGinSCAF
* ctg1
, CTGinSCAF
* ctg2
,
2091 KmerSet
* kmerS
, Kmer WordFilter
, int min
, int max
, int offset1
, int offset2
,
2092 char * seqGap
, char * seqCtg1
, char * seqCtg2
, int cut1
, int cut2
)
2094 int i
, j
, start
, end
, startOnCtg1
, endOnCtg2
;
2098 boolean bal
, ret
= 0, FILL
;
2099 double maxScore
= 0.0;
2101 int lenCtg1
, lenCtg2
;
2102 //build sequences on left and right of the uncertain region
2103 int buffer_size
= maxReadLen
> 100 ? maxReadLen
: 100;
2104 int length
= contig_array
[ctg1
->ctgID
].length
+ originOverlap
;
2106 if ( buffer_size
> offset1
)
2108 lenCtg1
= cutSeqFromCtg ( ctg1
->ctgID
, length
- cut1
- ( buffer_size
- offset1
), length
- 1 - cut1
, seqCtg1
, originOverlap
);
2110 for ( i
= 0; i
< offset1
; i
++ )
2112 seqCtg1
[lenCtg1
+ i
] = seqGap
[i
];
2119 for ( i
= offset1
- buffer_size
; i
< offset1
; i
++ )
2121 seqCtg1
[i
+ buffer_size
- offset1
] = seqGap
[i
];
2124 lenCtg1
= buffer_size
;
2127 length
= contig_array
[ctg2
->ctgID
].length
+ originOverlap
;
2129 if ( buffer_size
> offset2
)
2131 lenCtg2
= cutSeqFromCtg ( ctg2
->ctgID
, cut2
, buffer_size
- offset2
- 1 + cut2
, & ( seqCtg2
[offset2
] ), originOverlap
);
2133 for ( i
= 0; i
< offset2
; i
++ )
2135 seqCtg2
[i
] = seqGap
[i
+ offset1
];
2142 for ( i
= 0; i
< buffer_size
; i
++ )
2144 seqCtg2
[i
] = seqGap
[i
+ offset1
];
2147 lenCtg2
= buffer_size
;
2151 if(offset1>0||offset2>0){
2152 for(i=0;i<lenCtg1;i++)
2153 fprintf(stderr,"%c",int2base(seqCtg1[i]));
2154 fprintf(stderr,": CTG1\n");
2155 for(i=0;i<lenCtg2;i++)
2156 fprintf(stderr,"%c",int2base(seqCtg2[i]));
2157 fprintf(stderr,": CTG2\n");
2160 //chop kmer from both ends of the uncertain region
2162 len1
= CTGendLen
< lenCtg1
? CTGendLen
: lenCtg1
;
2163 len2
= CTGendLen
< lenCtg2
? CTGendLen
: lenCtg2
;
2164 chopKmer4Ctg ( kmerCtg1
, len1
, overlap
, & ( seqCtg1
[lenCtg1
- len1
] ), WordFilter
);
2165 chopKmer4Ctg ( kmerCtg2
, len2
, overlap
, seqCtg2
, WordFilter
);
2166 len1
-= overlap
- 1;
2167 len2
-= overlap
- 1;
2168 src_seq
= ( char * ) ckalloc ( maxReadLen
* sizeof ( char ) );
2169 bal_seq
= ( char * ) ckalloc ( maxReadLen
* sizeof ( char ) );
2170 int * START
= ( int * ) ckalloc ( num
* sizeof ( int ) );
2171 int * END
= ( int * ) ckalloc ( num
* sizeof ( int ) );
2172 int * INDEX1
= ( int * ) ckalloc ( num
* sizeof ( int ) );
2173 int * INDEX2
= ( int * ) ckalloc ( num
* sizeof ( int ) );
2174 double * SCORE
= ( double * ) ckalloc ( num
* sizeof ( double ) );
2175 boolean
* BAL
= ( boolean
* ) ckalloc ( num
* sizeof ( boolean
) );
2176 memset ( SCORE
, 0, num
* sizeof ( double ) );
2178 for ( i
= 0; i
< num
; i
++ )
2180 getSeqFromRead ( rdArray
[i
], src_seq
);
2181 FILL
= chopReadFillGap ( rdArray
[i
].len
, overlap
, src_seq
, bal_seq
, kmerS
, WordFilter
, &start
, &end
, &bal
, kmerCtg1
, len1
, kmerCtg2
, len2
, &startOnCtg1
, &endOnCtg2
);
2183 if ( !FILL
|| ( end
- start
) < min
|| ( end
- start
) > max
)
2188 if ( overlap
+ ( len1
- startOnCtg1
- 1 ) + endOnCtg2
- ( end
- start
) > ( int ) originOverlap
)
2191 } // contig1 and contig2 could not overlap more than origOverlap bases
2195 INDEX1
[i
] = startOnCtg1
;
2196 INDEX2
[i
] = endOnCtg2
;
2198 int matchLen
= 2 * overlap
< ( end
- start
+ overlap
) ? 2 * overlap
: ( end
- start
+ overlap
);
2200 int alignLen
= matchLen
;
2201 //compare the left of hit kmer on ctg1
2202 //int ctgLeft = (contig_array[ctg1->ctgID].length+originOverlap)-(len1+overlap-1)+startOnCtg1;
2203 int ctgLeft
= ( lenCtg1
) - ( len1
+ overlap
- 1 ) + startOnCtg1
;
2204 int readLeft
= start
- overlap
+ 1;
2205 int cmpLen
= ctgLeft
< readLeft
? ctgLeft
: readLeft
;
2206 cmpLen
= cmpLen
<= MAXREADLENGTH
? cmpLen
: MAXREADLENGTH
;
2207 //cutSeqFromCtg(ctg1->ctgID,ctgLeft-cmpLen,ctgLeft-1,fastSequence,originOverlap);
2208 cutSeqFromRead ( seqCtg1
, lenCtg1
, ctgLeft
- cmpLen
, ctgLeft
- 1, fastSequence
);
2212 cutSeqFromRead ( src_seq
, rdArray
[i
].len
, readLeft
- cmpLen
, readLeft
- 1, slowSequence
);
2216 cutSeqFromRead ( bal_seq
, rdArray
[i
].len
, readLeft
- cmpLen
, readLeft
- 1, slowSequence
);
2219 match
= compareSequences ( fastSequence
, slowSequence
, cmpLen
, cmpLen
);
2222 //compare the right of hit kmer on ctg1
2223 int ctgRight
= len1
- startOnCtg1
- 1;
2224 cmpLen
= ctgRight
< ( rdArray
[i
].len
- start
- 1 ) ? ctgRight
: ( rdArray
[i
].len
- start
- 1 );
2225 cmpLen
= cmpLen
<= MAXREADLENGTH
? cmpLen
: MAXREADLENGTH
;
2226 //cutSeqFromCtg(ctg1->ctgID,ctgLeft+overlap,ctgLeft+overlap+cmpLen-1,fastSequence,originOverlap);
2227 cutSeqFromRead ( seqCtg1
, lenCtg1
, ctgLeft
+ overlap
, ctgLeft
+ overlap
+ cmpLen
- 1, fastSequence
);
2231 cutSeqFromRead ( src_seq
, rdArray
[i
].len
, start
+ 1, start
+ cmpLen
, slowSequence
);
2235 cutSeqFromRead ( bal_seq
, rdArray
[i
].len
, start
+ 1, start
+ cmpLen
, slowSequence
);
2238 match
= compareSequences ( fastSequence
, slowSequence
, cmpLen
, cmpLen
);
2239 //fprintf(stderr,"%d -- %d\n",match,cmpLen);
2242 //compare the left of hit kmer on ctg2
2243 ctgLeft
= endOnCtg2
;
2244 readLeft
= end
- overlap
+ 1;
2245 cmpLen
= ctgLeft
< readLeft
? ctgLeft
: readLeft
;
2246 cmpLen
= ctgLeft
<= MAXREADLENGTH
? ctgLeft
: MAXREADLENGTH
;
2247 //cutSeqFromCtg(ctg2->ctgID,endOnCtg2-cmpLen,endOnCtg2-1,fastSequence,originOverlap);
2248 cutSeqFromRead ( seqCtg2
, lenCtg2
, endOnCtg2
- cmpLen
, endOnCtg2
- 1, fastSequence
);
2252 cutSeqFromRead ( src_seq
, rdArray
[i
].len
, readLeft
- cmpLen
, readLeft
- 1, slowSequence
);
2256 cutSeqFromRead ( bal_seq
, rdArray
[i
].len
, readLeft
- cmpLen
, readLeft
- 1, slowSequence
);
2259 match
= compareSequences ( fastSequence
, slowSequence
, cmpLen
, cmpLen
);
2262 //compare the right of hit kmer on ctg2
2263 //ctgRight = contig_array[ctg2->ctgID].length+originOverlap-endOnCtg2-overlap;
2264 ctgRight
= lenCtg2
- endOnCtg2
- overlap
;
2265 cmpLen
= ctgRight
< ( rdArray
[i
].len
- end
- 1 ) ? ctgRight
: ( rdArray
[i
].len
- end
- 1 );
2266 cmpLen
= cmpLen
<= MAXREADLENGTH
? cmpLen
: MAXREADLENGTH
;
2267 //cutSeqFromCtg(ctg2->ctgID,endOnCtg2+overlap,endOnCtg2+overlap+cmpLen-1,fastSequence,originOverlap);
2268 cutSeqFromRead ( seqCtg2
, lenCtg2
, endOnCtg2
+ overlap
, endOnCtg2
+ overlap
+ cmpLen
- 1, fastSequence
);
2272 cutSeqFromRead ( src_seq
, rdArray
[i
].len
, end
+ 1, end
+ cmpLen
, slowSequence
);
2276 cutSeqFromRead ( bal_seq
, rdArray
[i
].len
, end
+ 1, end
+ cmpLen
, slowSequence
);
2279 match
= compareSequences ( fastSequence
, slowSequence
, cmpLen
, cmpLen
);
2283 if(cmpLen>0&&match!=cmpLen+overlap){
2284 printSeq(stderr,fastSequence,cmpLen+overlap);
2285 printSeq(stderr,slowSequence,cmpLen+overlap);
2286 printKmerSeqLocal(stderr,kmerCtg2[endOnCtg2],overlap);
2287 fprintf(stderr,": %d(%d)\n",bal,endOnCtg2);
2288 }else if(cmpLen>0&&match==cmpLen+overlap)
2289 fprintf(stderr,"Perfect\n");
2291 double score
= ( double ) matchLen
/ alignLen
;
2293 if ( maxScore
< score
)
2296 //fprintf(stderr,"%4.2f (%d/%d)\n",maxScore,matchLen,alignLen);
2305 fprintf(stderr,"SCORE: %4.2f\n",maxScore);
2307 if ( maxScore
> 0.9 )
2310 for(i=0;i<lenCtg1;i++)
2311 fprintf(stderr,"%c",int2base(seqCtg1[i]));
2312 fprintf(stderr,": CTG1\n");
2313 for(i=0;i<lenCtg2;i++)
2314 fprintf(stderr,"%c",int2base(seqCtg2[i]));
2315 fprintf(stderr,": CTG2\n");
2316 fprintf(stderr,"%d+%d -- %d+%d, SCORE: %4.2f\n ",offset1,offset2,cut1,cut2,maxScore);
2318 getSeqFromRead ( rdArray
[maxIndex
], src_seq
);
2319 reverseComplementSeq ( src_seq
, rdArray
[maxIndex
].len
, bal_seq
);
2320 int leftRemain
= offset1
- ( len1
- INDEX1
[maxIndex
] - 1 ) > 0 ? offset1
- ( len1
- INDEX1
[maxIndex
] - 1 ) : 0;
2321 int rightRemain
= offset2
- ( overlap
+ INDEX2
[maxIndex
] ) > 0 ? offset2
- ( overlap
+ INDEX2
[maxIndex
] ) : 0;
2322 ctg1
->gapSeqOffset
= gapSeqArray
->item_c
;
2323 ctg1
->gapSeqLen
= END
[maxIndex
] - START
[maxIndex
] + leftRemain
+ rightRemain
;
2325 if ( darrayPut ( gapSeqArray
, ctg1
->gapSeqOffset
+ ( END
[maxIndex
] - START
[maxIndex
] + leftRemain
+ rightRemain
) / 4 ) )
2327 pt
= ( char * ) darrayPut ( gapSeqArray
, ctg1
->gapSeqOffset
);
2329 for ( j
= 0; j
< leftRemain
; j
++ ) //get the left side of the gap region from search
2331 writeChar2tightString ( seqGap
[j
], pt
, j
);
2332 //fprintf(stderr,"%c",int2base(seqGap[j]));
2335 for ( j
= START
[maxIndex
] + 1; j
<= END
[maxIndex
]; j
++ )
2337 if ( BAL
[maxIndex
] )
2339 writeChar2tightString ( bal_seq
[j
], pt
, j
- START
[maxIndex
] - 1 + leftRemain
);
2340 //fprintf(stderr,"%c",int2base(bal_seq[j]));
2344 writeChar2tightString ( src_seq
[j
], pt
, j
- START
[maxIndex
] - 1 + leftRemain
);
2345 //fprintf(stderr,"%c",int2base(src_seq[j]));
2349 for ( j
= offset2
- rightRemain
; j
< offset2
; j
++ ) //get the right side of the gap region from search
2351 writeChar2tightString ( seqGap
[j
+ leftRemain
], pt
, j
+ END
[maxIndex
] - START
[maxIndex
] + leftRemain
);
2352 //fprintf(stderr,"%c",int2base(seqGap[j+leftRemain]));
2356 fprintf(stderr,": GAPSEQ (%d+%d)(%d+%d)(%d+%d)(%d+%d) B %d\n",offset1,offset2,cut1,cut2,
2357 len1-INDEX1[maxIndex]-1,INDEX2[maxIndex],START[maxIndex],END[maxIndex],BAL[maxIndex]);
2359 ctg1
->cutTail
= len1
- INDEX1
[maxIndex
] - 1 - offset1
+ cut1
> cut1
? len1
- INDEX1
[maxIndex
] - 1 - offset1
+ cut1
: cut1
;
2360 ctg2
->cutHead
= overlap
+ INDEX2
[maxIndex
] - offset2
+ cut2
> cut2
? overlap
+ INDEX2
[maxIndex
] - offset2
+ cut2
: cut2
;
2361 ctg2
->scaftig_start
= 0;
2366 free ( ( void * ) START
);
2367 free ( ( void * ) END
);
2368 free ( ( void * ) INDEX1
);
2369 free ( ( void * ) INDEX2
);
2370 free ( ( void * ) SCORE
);
2371 free ( ( void * ) BAL
);
2372 free ( ( void * ) src_seq
);
2373 free ( ( void * ) bal_seq
);