modified: makefile
[GalaxyCodeBases.git] / BGI / SOAPdenovo2 / standardPregraph / localAsm.c
blobe61d9e4e9cdcf16788b17ae7fb0de41f433b673f
1 /*
2 * localAsm.c
4 * Copyright (c) 2008-2012 BGI-Shenzhen <soap at genomics dot org dot cn>.
6 * This file is part of SOAPdenovo.
8 * SOAPdenovo is free software: you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation, either version 3 of the License, or
11 * (at your option) any later version.
13 * SOAPdenovo is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
18 * You should have received a copy of the GNU General Public License
19 * along with SOAPdenovo. If not, see <http://www.gnu.org/licenses/>.
23 #include "stdinc.h"
24 #include "newhash.h"
25 #include "kmerhash.h"
26 #include "extfunc.h"
27 #include "extvab.h"
29 #define CTGendLen 35 // shouldn't larger than max_read_len
30 #define UPlimit 5000
31 #define MaxRouteNum 10
33 static void kmerSet_mark ( KmerSet * set );
34 static void trace4Repeat ( Kmer currW, int steps, int min, int max, int * num_route, KmerSet * kset,
35 Kmer kmerDest, int overlap, Kmer WORDF, int * traceCounter, int maxRoute,
36 kmer_t ** soFarNode, short * multiOccu1, short * multiOccu2, int * routeLens,
37 char ** foundRoutes, char * soFarSeq, long long * soFarLinks, double * avgLinks );
39 #ifdef MER127
40 static Kmer prevKmerLocal ( Kmer next, char ch, int overlap )
42 Kmer word = KmerRightBitMoveBy2 ( next );
44 if ( 2 * ( overlap - 1 ) < 64 )
46 word.low2 |= ( ( ( ubyte8 ) ch ) << 2 * ( overlap - 1 ) );
49 if ( 2 * ( overlap - 1 ) >= 64 && 2 * ( overlap - 1 ) < 128 )
51 word.high2 |= ( ( ubyte8 ) ch ) << ( 2 * ( overlap - 1 ) - 64 );
54 if ( 2 * ( overlap - 1 ) >= 128 && 2 * ( overlap - 1 ) < 192 )
56 word.low1 |= ( ( ubyte8 ) ch ) << ( 2 * ( overlap - 1 ) - 128 );
59 if ( 2 * ( overlap - 1 ) >= 192 && 2 * ( overlap - 1 ) < 256 )
61 word.high1 |= ( ( ubyte8 ) ch ) << ( 2 * ( overlap - 1 ) - 192 );
64 return word;
67 static Kmer nextKmerLocal ( Kmer prev, char ch, Kmer WordFilter )
69 Kmer word = KmerLeftBitMoveBy2 ( prev );
70 word = KmerAnd ( word, WordFilter );
71 word.low2 |= ch;
72 return word;
74 #else
75 static Kmer prevKmerLocal ( Kmer next, char ch, int overlap )
77 Kmer word = KmerRightBitMoveBy2 ( next );
79 if ( 2 * ( overlap - 1 ) < 64 )
81 word.low |= ( ( ( ubyte8 ) ch ) << 2 * ( overlap - 1 ) );
83 else
85 word.high |= ( ( ubyte8 ) ch ) << ( 2 * ( overlap - 1 ) - 64 );
88 return word;
91 static Kmer nextKmerLocal ( Kmer prev, char ch, Kmer WordFilter )
93 Kmer word = KmerLeftBitMoveBy2 ( prev );
94 word = KmerAnd ( word, WordFilter );
95 word.low |= ch;
96 return word;
98 #endif
99 static void singleKmer ( int t, KmerSet * kset, int flag, Kmer * kmerBuffer, char * prevcBuffer, char * nextcBuffer )
101 kmer_t * pos;
102 put_kmerset ( kset, kmerBuffer[t], prevcBuffer[t], nextcBuffer[t], &pos );
104 if ( pos->inEdge == flag )
106 return;
108 else if ( pos->inEdge == 0 )
110 pos->inEdge = flag;
112 else if ( pos->inEdge == 1 && flag == 2 )
114 pos->inEdge = 3;
116 else if ( pos->inEdge == 2 && flag == 1 )
118 pos->inEdge = 3;
122 static void putKmer2DBgraph ( KmerSet * kset, int flag, int kmer_c, Kmer * kmerBuffer, char * prevcBuffer, char * nextcBuffer )
124 int t;
126 for ( t = 0; t < kmer_c; t++ )
128 singleKmer ( t, kset, flag, kmerBuffer, prevcBuffer, nextcBuffer );
132 static void getSeqFromRead ( READNEARBY read, char * src_seq )
134 int len_seq = read.len;
135 int j;
136 char * tightSeq = ( char * ) darrayGet ( readSeqInGap, read.seqStarter );
138 for ( j = 0; j < len_seq; j++ )
140 src_seq[j] = getCharInTightString ( tightSeq, j );
144 #ifdef MER127
145 static void chopKmer4Ctg ( Kmer * kmerCtg, int lenCtg, int overlap, char * src_seq, Kmer WORDF )
147 int index, j;
148 Kmer word;
149 word.high1 = word.low1 = word.high2 = word.low2 = 0;
151 for ( index = 0; index < overlap; index++ )
153 word = KmerLeftBitMoveBy2 ( word );
154 word.low2 |= src_seq[index];
157 index = 0;
158 kmerCtg[index++] = word;
160 for ( j = 1; j <= lenCtg - overlap; j++ )
162 word = nextKmerLocal ( word, src_seq[j - 1 + overlap], WORDF );
163 kmerCtg[index++] = word;
166 #else
167 static void chopKmer4Ctg ( Kmer * kmerCtg, int lenCtg, int overlap, char * src_seq, Kmer WORDF )
169 int index, j;
170 Kmer word;
171 word.high = word.low = 0;
173 for ( index = 0; index < overlap; index++ )
175 word = KmerLeftBitMoveBy2 ( word );
176 word.low |= src_seq[index];
179 index = 0;
180 kmerCtg[index++] = word;
182 for ( j = 1; j <= lenCtg - overlap; j++ )
184 word = nextKmerLocal ( word, src_seq[j - 1 + overlap], WORDF );
185 kmerCtg[index++] = word;
188 #endif
190 static void chopKmer4read ( int len_seq, int overlap, char * src_seq, char * bal_seq, Kmer * kmerBuffer, char * prevcBuffer, char * nextcBuffer, int * kmer_c, Kmer WORDF )
192 int j, bal_j;
193 Kmer word, bal_word;
194 int index;
195 char InvalidCh = 4;
197 if ( len_seq < overlap + 1 )
199 *kmer_c = 0;
200 return;
203 #ifdef MER127
204 word.high1 = word.low1 = word.high2 = word.low2 = 0;
206 for ( index = 0; index < overlap; index++ )
208 word = KmerLeftBitMoveBy2 ( word );
209 word.low2 |= src_seq[index];
212 #else
213 word.high = word.low = 0;
215 for ( index = 0; index < overlap; index++ )
217 word = KmerLeftBitMoveBy2 ( word );
218 word.low |= src_seq[index];
221 #endif
222 reverseComplementSeq ( src_seq, len_seq, bal_seq );
223 // complementary node
224 bal_word = reverseComplement ( word, overlap );
225 bal_j = len_seq - 0 - overlap; // 0;
226 index = 0;
228 if ( KmerSmaller ( word, bal_word ) )
230 kmerBuffer[index] = word;
231 prevcBuffer[index] = InvalidCh;
232 nextcBuffer[index++] = src_seq[0 + overlap];
234 else
236 kmerBuffer[index] = bal_word;
237 prevcBuffer[index] = bal_seq[bal_j - 1];
238 nextcBuffer[index++] = InvalidCh;
241 for ( j = 1; j <= len_seq - overlap; j++ )
243 word = nextKmerLocal ( word, src_seq[j - 1 + overlap], WORDF );
244 bal_j = len_seq - j - overlap; // j;
245 bal_word = prevKmerLocal ( bal_word, bal_seq[bal_j], overlap );
247 if ( KmerSmaller ( word, bal_word ) )
249 kmerBuffer[index] = word;
250 prevcBuffer[index] = src_seq[j - 1];
252 if ( j < len_seq - overlap )
254 nextcBuffer[index++] = src_seq[j + overlap];
256 else
258 nextcBuffer[index++] = InvalidCh;
261 //printf("%dth: %p with %p\n",kmer_c-1,word,hashBanBuffer[kmer_c-1]);
263 else
265 // complementary node
266 kmerBuffer[index] = bal_word;
268 if ( bal_j > 0 )
270 prevcBuffer[index] = bal_seq[bal_j - 1];
272 else
274 prevcBuffer[index] = InvalidCh;
277 nextcBuffer[index++] = bal_seq[bal_j + overlap];
278 //printf("%dth: %p with %p\n",kmer_c-1,bal_word,hashBanBuffer[kmer_c-1]);
282 *kmer_c = index;
285 static void headTightStr ( char * tightStr, int length, int start, int headLen, int revS, char * src_seq )
287 int i, index = 0;
289 if ( !revS )
291 for ( i = start; i < start + headLen; i++ )
293 src_seq[index++] = getCharInTightString ( tightStr, i );
296 else
298 for ( i = length - 1 - start; i >= length - headLen - start; i-- )
300 src_seq[index++] = int_comp ( getCharInTightString ( tightStr, i ) );
305 static int getSeqFromCtg ( CTGinSCAF * ctg, boolean fromHead, unsigned int len, int originOverlap, char * src_seq )
307 unsigned int ctgId = ctg->ctgID;
308 unsigned int bal_ctg = getTwinCtg ( ctgId );
310 if ( contig_array[ctgId].length < 1 )
312 return 0;
315 unsigned int length = contig_array[ctgId].length + originOverlap;
316 len = len < length ? len : length;
318 if ( fromHead )
320 if ( contig_array[ctgId].seq )
322 headTightStr ( contig_array[ctgId].seq, length, 0, len, 0, src_seq );
324 else
326 headTightStr ( contig_array[bal_ctg].seq, length, 0, len, 1, src_seq );
329 else
331 if ( contig_array[ctgId].seq )
333 headTightStr ( contig_array[ctgId].seq, length, length - len, len, 0, src_seq );
335 else
337 headTightStr ( contig_array[bal_ctg].seq, length, length - len, len, 1, src_seq );
341 return len;
344 static KmerSet * readsInGap2DBgraph ( READNEARBY * rdArray, int num, CTGinSCAF * ctg1, CTGinSCAF * ctg2, int originOverlap, Kmer * kmerCtg1, Kmer * kmerCtg2, int overlap, Kmer WordFilter )
346 int kmer_c;
347 Kmer * kmerBuffer;
348 char * nextcBuffer, *prevcBuffer;
349 int i;
350 int buffer_size = maxReadLen > CTGendLen ? maxReadLen : CTGendLen;
351 KmerSet * kmerS = NULL;
352 int lenCtg1;
353 int lenCtg2;
354 char * bal_seq;
355 char * src_seq;
356 src_seq = ( char * ) ckalloc ( buffer_size * sizeof ( char ) );
357 bal_seq = ( char * ) ckalloc ( buffer_size * sizeof ( char ) );
358 lenCtg1 = getSeqFromCtg ( ctg1, 0, CTGendLen, originOverlap, src_seq );
359 lenCtg2 = getSeqFromCtg ( ctg2, 1, CTGendLen, originOverlap, src_seq );
361 if ( lenCtg1 <= overlap || lenCtg2 <= overlap )
363 free ( ( void * ) src_seq );
364 free ( ( void * ) bal_seq );
365 return kmerS;
368 kmerBuffer = ( Kmer * ) ckalloc ( buffer_size * sizeof ( Kmer ) );
369 prevcBuffer = ( char * ) ckalloc ( buffer_size * sizeof ( char ) );
370 nextcBuffer = ( char * ) ckalloc ( buffer_size * sizeof ( char ) );
371 kmerS = init_kmerset ( 1024, 0.77f );
373 for ( i = 0; i < num; i++ )
375 getSeqFromRead ( rdArray[i], src_seq );
376 chopKmer4read ( rdArray[i].len, overlap, src_seq, bal_seq, kmerBuffer, prevcBuffer, nextcBuffer, &kmer_c, WordFilter );
377 putKmer2DBgraph ( kmerS, 0, kmer_c, kmerBuffer, prevcBuffer, nextcBuffer );
380 lenCtg1 = getSeqFromCtg ( ctg1, 0, CTGendLen, originOverlap, src_seq );
381 chopKmer4Ctg ( kmerCtg1, lenCtg1, overlap, src_seq, WordFilter );
382 chopKmer4read ( lenCtg1, overlap, src_seq, bal_seq, kmerBuffer, prevcBuffer, nextcBuffer, &kmer_c, WordFilter );
383 putKmer2DBgraph ( kmerS, 1, kmer_c, kmerBuffer, prevcBuffer, nextcBuffer );
384 lenCtg2 = getSeqFromCtg ( ctg2, 1, CTGendLen, originOverlap, src_seq );
385 chopKmer4Ctg ( kmerCtg2, lenCtg2, overlap, src_seq, WordFilter );
386 chopKmer4read ( lenCtg2, overlap, src_seq, bal_seq, kmerBuffer, prevcBuffer, nextcBuffer, &kmer_c, WordFilter );
387 putKmer2DBgraph ( kmerS, 2, kmer_c, kmerBuffer, prevcBuffer, nextcBuffer );
389 if(ctg1->ctgID==3733&&ctg2->ctgID==3067){
390 for(i=0;i<lenCtg2;i++)
391 printf("%c",int2base((int)src_seq[i]));
392 printf("\n");
395 //printf("sequence length chop from contigs on both sides: %d %d\n",lenCtg1,lenCtg2);
396 //kmerSet_deLoop(kmerS,WordFilter);
397 kmerSet_mark ( kmerS );
398 free ( ( void * ) src_seq );
399 free ( ( void * ) bal_seq );
400 free ( ( void * ) kmerBuffer );
401 free ( ( void * ) nextcBuffer );
402 free ( ( void * ) prevcBuffer );
403 return kmerS;
406 #ifdef MER127
407 static void printKmerSeqLocal ( FILE * fp, Kmer kmer, int overlap )
409 int i, bit1, bit2, bit3, bit4;
410 char ch;
411 char kmerSeq[128];
413 if ( overlap < 32 )
415 bit4 = overlap;
416 bit3 = 0;
417 bit2 = 0;
418 bit1 = 0;
421 if ( overlap >= 32 && overlap < 64 )
423 bit4 = 32;
424 bit3 = overlap - 32;
425 bit2 = 0;
426 bit1 = 0;
429 if ( overlap >= 64 && overlap < 96 )
431 bit4 = 32;
432 bit3 = 32;
433 bit2 = overlap - 64;
434 bit1 = 0;
437 if ( overlap >= 96 && overlap < 128 )
439 bit4 = 32;
440 bit3 = 32;
441 bit2 = 32;
442 bit1 = overlap - 96;
445 for ( i = bit1 - 1; i >= 0; i-- )
447 ch = kmer.high1 & 0x3;
448 kmer.high1 >>= 2;
449 kmerSeq[i] = ch;
452 for ( i = bit2 - 1; i >= 0; i-- )
454 ch = kmer.low1 & 0x3;
455 kmer.low1 >>= 2;
456 kmerSeq[i + bit1] = ch;
459 for ( i = bit3 - 1; i >= 0; i-- )
461 ch = kmer.high2 & 0x3;
462 kmer.high2 >>= 2;
463 kmerSeq[i + bit1 + bit2] = ch;
466 for ( i = bit4 - 1; i >= 0; i-- )
468 ch = kmer.low2 & 0x3;
469 kmer.low2 >>= 2;
470 kmerSeq[i + bit1 + bit2 + bit3] = ch;
473 for ( i = 0; i < overlap; i++ )
475 fprintf ( fp, "%c", int2base ( ( int ) kmerSeq[i] ) );
478 #else
479 static void printKmerSeqLocal ( FILE * fp, Kmer kmer, int overlap )
481 int i, bit1, bit2;
482 char ch;
483 char kmerSeq[64];
484 bit2 = overlap > 32 ? 32 : overlap;
485 bit1 = overlap > 32 ? overlap - 32 : 0;
487 for ( i = bit1 - 1; i >= 0; i-- )
489 ch = kmer.high & 0x3;
490 kmer.high >>= 2;
491 kmerSeq[i] = ch;
494 for ( i = bit2 - 1; i >= 0; i-- )
496 ch = kmer.low & 0x3;
497 kmer.low >>= 2;
498 kmerSeq[i + bit1] = ch;
501 for ( i = 0; i < overlap; i++ )
503 fprintf ( fp, "%c", int2base ( ( int ) kmerSeq[i] ) );
506 #endif
508 static void kmerSet_mark ( KmerSet * set )
510 int i, in_num, out_num, cvgSingle;
511 kmer_t * rs;
512 long long counter = 0, linear = 0;
513 Kmer word;
514 set->iter_ptr = 0;
516 while ( set->iter_ptr < set->size )
518 if ( !is_kmer_entity_null ( set->flags, set->iter_ptr ) )
520 in_num = out_num = 0;
521 rs = set->array + set->iter_ptr;
522 word = rs->seq;
524 for ( i = 0; i < 4; i++ )
526 cvgSingle = get_kmer_left_cov ( *rs, i );
528 if ( cvgSingle > 0 )
530 in_num++;
533 cvgSingle = get_kmer_right_cov ( *rs, i );
535 if ( cvgSingle > 0 )
537 out_num++;
541 if ( rs->single )
543 counter++;
546 if ( in_num == 1 && out_num == 1 )
548 rs->linear = 1;
549 linear++;
553 set->iter_ptr++;
556 //printf("Allocated %ld node, %ld single nodes, %ld linear\n",(long)count_kmerset(set),counter,linear);
559 static kmer_t * searchNode ( Kmer word, KmerSet * kset, int overlap )
561 Kmer bal_word = reverseComplement ( word, overlap );
562 kmer_t * node;
563 boolean found;
565 if ( KmerSmaller ( word, bal_word ) )
567 found = search_kmerset ( kset, word, &node );
569 else
571 found = search_kmerset ( kset, bal_word, &node );
574 if ( found )
576 return node;
578 else
580 return NULL;
584 static int searchKmerOnCtg ( Kmer currW, Kmer * kmerDest, int num )
586 int i;
588 for ( i = 0; i < num; i++ )
590 if ( KmerEqual ( currW, kmerDest[i] ) )
592 return i;
596 return -1;
599 // pick on from n items randomly
600 static int nPick1 ( int * array, int n )
602 int m, i;
603 m = n - 1; //(int)(drand48()*n);
604 int value = array[m];
606 for ( i = m; i < n - 1; i++ )
608 array[i] = array[i + 1];
611 return value;
614 static void traceAlongDBgraph ( Kmer currW, int steps, int min, int max, int * num_route, KmerSet * kset,
615 Kmer * kmerDest, int num, int overlap, Kmer WORDF, char ** foundRoutes, int * routeEndOnCtg2,
616 int * routeLens, char * soFarSeq, int * traceCounter, int maxRoute, kmer_t ** soFarNode,
617 boolean * multiOccu, long long * soFarLinks, double * avgLinks )
619 ( *traceCounter ) ++;
621 if ( *traceCounter > UPlimit )
624 if(overlap==19&&kmerDest[0]==pubKmer)
625 printf("UPlimit\n");
627 return;
630 if ( steps > max || *num_route >= maxRoute )
633 if(overlap==19&&kmerDest[0]==pubKmer)
634 printf("max steps/maxRoute\n");
636 return;
639 Kmer word = reverseComplement ( currW, overlap );
640 boolean isSmaller = KmerSmaller ( currW, word );
641 int i;
642 char ch;
643 unsigned char links;
645 if ( isSmaller )
647 word = currW;
650 kmer_t * node;
651 boolean found = search_kmerset ( kset, word, &node );
653 // #ifdef DEBUG
654 if ( !found )
656 fprintf ( stderr, "%s Trace: can't find kmer ", __FUNCTION__ );
657 PrintKmer ( stderr, word );
658 fprintf ( stderr, " (input " );
659 PrintKmer ( stderr, currW );
660 fprintf ( stderr, ") at step %d.\n", steps );
662 #ifdef MER127
663 fprintf (stderr, "%s Trace: can't find kmer %llx %llx %llx %llx (input %llx %llx %llx %llx) at step %d.\n",
664 __FUNCTION__, word.high1, word.low1, word.high2, word.low2, currW.high1, currW.low1,
665 currW.high2, currW.low2, steps );
666 #else
667 printf ( "Trace: can't find kmer %llx %llx (input %llx %llx) at step %d\n",
668 word.high, word.low, currW.high, currW.low, steps );
669 #endif
671 return;
674 // #else
675 // if (!found) return;
676 // #endif
678 if ( node->twin > 1 )
680 return;
683 if ( soFarNode )
685 soFarNode[steps] = node;
688 if ( steps > 0 )
690 soFarSeq[steps - 1] = lastCharInKmer ( currW );
693 int index, end;
694 int linkCounter = *soFarLinks;
696 if ( steps >= min && node->inEdge > 1 && ( end = searchKmerOnCtg ( currW, kmerDest, num ) ) >= 0 )
698 index = *num_route;
700 if ( steps > 0 )
702 avgLinks[index] = ( double ) linkCounter / steps;
704 else
706 avgLinks[index] = 0;
709 //find node that appears more than once in the path
710 multiOccu[index] = 0;
712 for ( i = 0; i < steps + 1; i++ )
714 soFarNode[i]->deleted = 0;
717 for ( i = 0; i < steps + 1; i++ )
719 if ( soFarNode[i]->deleted )
721 multiOccu[index] = 1;
722 break;
725 soFarNode[i]->deleted = 1;
728 routeEndOnCtg2[index] = end;
729 routeLens[index] = steps;
730 char * array = foundRoutes[index];
732 for ( i = 0; i < steps; i++ )
734 array[i] = soFarSeq[i];
737 if ( i < max )
739 array[i] = 4;
740 } //indicate the end of the sequence
742 *num_route = ++index;
743 return;
746 steps++;
748 if ( isSmaller )
750 int array[] = { 0, 1, 2, 3 };
752 for ( i = 4; i > 0; i-- )
754 ch = nPick1 ( array, i );
755 links = get_kmer_right_cov ( *node, ch );
757 if ( !links )
759 continue;
762 *soFarLinks = linkCounter + links;
763 word = nextKmerLocal ( currW, ch, WORDF );
764 traceAlongDBgraph ( word, steps, min, max, num_route, kset, kmerDest, num, overlap, WORDF,
765 foundRoutes, routeEndOnCtg2, routeLens, soFarSeq, traceCounter, maxRoute,
766 soFarNode, multiOccu, soFarLinks, avgLinks );
769 else
771 int array[] = { 0, 1, 2, 3 };
773 for ( i = 4; i > 0; i-- )
775 ch = nPick1 ( array, i );
776 links = get_kmer_left_cov ( *node, ch );
778 if ( !links )
780 continue;
783 *soFarLinks = linkCounter + links;
784 word = nextKmerLocal ( currW, int_comp ( ch ), WORDF );
785 traceAlongDBgraph ( word, steps, min, max, num_route, kset, kmerDest, num, overlap, WORDF,
786 foundRoutes, routeEndOnCtg2, routeLens, soFarSeq, traceCounter, maxRoute,
787 soFarNode, multiOccu, soFarLinks, avgLinks );
792 static int searchFgap ( KmerSet * kset, CTGinSCAF * ctg1, CTGinSCAF * ctg2, Kmer * kmerCtg1, Kmer * kmerCtg2,
793 unsigned int origOverlap, int overlap, DARRAY * gapSeqArray, int len1, int len2,
794 Kmer WordFilter, int * offset1, int * offset2, char * seqGap, int * cut1, int * cut2 )
796 int i;
797 int ret = 0;
798 kmer_t * node, **soFarNode;
799 int num_route;
800 int gapLen = ctg2->start - ctg1->end - origOverlap + overlap;
801 int min = gapLen - GLDiff > 0 ? gapLen - GLDiff : 0; //0531
802 int max = gapLen + GLDiff < 10 ? 10 : gapLen + GLDiff;
803 char ** foundRoutes;
804 char * soFarSeq;
805 int traceCounter;
806 int * routeEndOnCtg2;
807 int * routeLens;
808 boolean * multiOccu;
809 long long soFarLinks;
810 double * avgLinks;
811 //mask linear internal linear kmer on contig1 end
812 routeEndOnCtg2 = ( int * ) ckalloc ( MaxRouteNum * sizeof ( int ) );
813 routeLens = ( int * ) ckalloc ( MaxRouteNum * sizeof ( int ) );
814 multiOccu = ( boolean * ) ckalloc ( MaxRouteNum * sizeof ( boolean ) );
815 short * MULTI1 = ( short * ) ckalloc ( MaxRouteNum * sizeof ( short ) );
816 short * MULTI2 = ( short * ) ckalloc ( MaxRouteNum * sizeof ( short ) );
817 soFarSeq = ( char * ) ckalloc ( max * sizeof ( char ) );
818 soFarNode = ( kmer_t ** ) ckalloc ( ( max + 1 ) * sizeof ( kmer_t * ) );
819 foundRoutes = ( char ** ) ckalloc ( MaxRouteNum * sizeof ( char * ) );;
820 avgLinks = ( double * ) ckalloc ( MaxRouteNum * sizeof ( double ) );;
822 for ( i = 0; i < MaxRouteNum; i++ )
824 foundRoutes[i] = ( char * ) ckalloc ( max * sizeof ( char ) );
827 for ( i = len1 - 1; i >= 0; i-- )
829 num_route = traceCounter = soFarLinks = 0;
830 int steps = 0;
831 traceAlongDBgraph ( kmerCtg1[i], steps, min, max, &num_route, kset, kmerCtg2, len2, overlap, WordFilter,
832 foundRoutes, routeEndOnCtg2, routeLens, soFarSeq, &traceCounter, MaxRouteNum, soFarNode,
833 multiOccu, &soFarLinks, avgLinks );
835 if ( num_route > 0 )
837 int m, minEnd = routeEndOnCtg2[0];
839 for ( m = 0; m < num_route; m++ )
841 if ( routeLens[m] < 0 )
843 continue;
846 if ( routeEndOnCtg2[m] < minEnd )
848 minEnd = routeEndOnCtg2[m];
852 /* else if(minFreq>1){
853 for(m=0;m<num_route;m++){
854 if(routeEndOnCtg2[m]!=minEnd)
855 continue;
856 for(j=0;j<max;j++){
857 if(foundRoutes[m][j]>3)
858 break;
859 printf("%c",int2base((int)foundRoutes[m][j]));
861 printf(": %4.2f\n",avgLinks[m]);
863 } */
864 num_route = traceCounter = soFarLinks = 0;
865 steps = 0;
866 trace4Repeat ( kmerCtg1[i], steps, min, max, &num_route, kset, kmerCtg2[minEnd], overlap, WordFilter, &traceCounter,
867 MaxRouteNum, soFarNode, MULTI1, MULTI2, routeLens, foundRoutes, soFarSeq, &soFarLinks, avgLinks );
868 int j, best = 0;
869 int maxLen = routeLens[0];
870 double maxLink = avgLinks[0];
871 char * pt;
872 boolean repeat = 0, sameLen = 1;
873 int leftMost = max, rightMost = max;
874 #ifdef DEBUG
876 if ( num_route < 1 )
878 fprintf ( stderr, "After trace4Repeat: non route was found.\n" );
879 continue;
882 #else
884 if ( num_route < 1 ) { continue; }
886 #endif
888 if ( num_route > 1 )
890 // if multi paths are found, we check on the repeatative occurrences and links/length
891 for ( m = 0; m < num_route; m++ )
893 if ( routeLens[m] < 0 )
895 continue;
898 if ( MULTI1[m] >= 0 && MULTI2[m] >= 0 )
900 repeat = 1;
901 leftMost = leftMost > MULTI1[m] ? MULTI1[m] : leftMost;
902 rightMost = rightMost > MULTI2[m] ? MULTI2[m] : rightMost;
905 if ( routeLens[m] != maxLen )
907 sameLen = 0;
910 if ( routeLens[m] < maxLen )
912 maxLen = routeLens[m];
915 if ( avgLinks[m] > maxLink )
917 maxLink = avgLinks[m];
918 best = m;
923 if ( repeat )
925 *offset1 = *offset2 = *cut1 = *cut2 = 0;
926 int index = 0;
927 char ch;
929 for ( j = 0; j < leftMost; j++ )
931 if ( routeLens[0] < j + overlap + 1 )
933 break;
935 else
937 ch = foundRoutes[0][j];
940 for ( m = 1; m < num_route; m++ )
942 if ( routeLens[m] < 0 )
944 continue;
947 if ( ch != foundRoutes[m][j] )
949 break;
953 if ( m == num_route )
955 seqGap[index++] = ch;
957 else
959 break;
963 *offset1 = index;
964 index = 0;
966 for ( j = 0; j < rightMost; j++ )
968 if ( routeLens[0] - overlap - 1 < j )
970 break;
972 else
974 ch = foundRoutes[0][routeLens[0] - overlap - 1 - j];
977 for ( m = 1; m < num_route; m++ )
979 if ( routeLens[m] < 0 )
981 continue;
984 if ( ch != foundRoutes[m][routeLens[m] - overlap - 1 - j] )
986 break;
990 if ( m == num_route )
992 index++;
994 else
996 break;
1000 *offset2 = index;
1002 for ( j = 0; j < *offset2; j++ )
1004 seqGap[*offset1 + *offset2 - 1 - j] = foundRoutes[0][routeLens[0] - overlap - 1 - j];
1007 if ( *offset1 > 0 || *offset2 > 0 )
1009 *cut1 = len1 - i - 1;
1010 *cut2 = minEnd;
1012 //fprintf(stderr,"\n");
1013 for ( m = 0; m < num_route; m++ )
1015 for ( j = 0; j < max; j++ )
1017 if ( foundRoutes[m][j] > 3 )
1019 break;
1022 //fprintf(stderr,"%c",int2base((int)foundRoutes[m][j]));
1025 //fprintf(stderr,": %4.2f\n",avgLinks[m]);
1029 fprintf(stderr,">Gap (%d + %d) (%d + %d)\n",*offset1,*offset2,*cut1,*cut2);
1030 for(index=0;index<*offset1+*offset2;index++)
1031 fprintf(stderr,"%c",int2base(seqGap[index]));
1032 fprintf(stderr,"\n"); */
1035 ret = 3;
1036 break;
1039 if ( overlap + ( len1 - i - 1 ) + minEnd - routeLens[best] > ( int ) origOverlap )
1041 continue;
1044 ctg1->gapSeqOffset = gapSeqArray->item_c;
1045 ctg1->gapSeqLen = routeLens[best];
1047 if ( !darrayPut ( gapSeqArray, ctg1->gapSeqOffset + maxLen / 4 ) )
1049 continue;
1052 pt = ( char * ) darrayPut ( gapSeqArray, ctg1->gapSeqOffset );
1055 printKmerSeqLocal(stderr,kmerCtg1[i],overlap);
1056 fprintf(stderr,"-");
1058 for ( j = 0; j < max; j++ )
1060 if ( foundRoutes[best][j] > 3 )
1062 break;
1065 writeChar2tightString ( foundRoutes[best][j], pt, j );
1066 //fprintf(stderr,"%c",int2base((int)foundRoutes[best][j]));
1069 //fprintf(stderr,": GAPSEQ %d + %d, avglink %4.2f\n",len1-i-1,minEnd,avgLinks[best]);
1070 ctg1->cutTail = len1 - i - 1;
1071 ctg2->cutHead = overlap + minEnd;
1072 ctg2->scaftig_start = 0;
1073 ret = 1;
1074 break;
1075 /* }if(num_route>1){
1076 ret = 2;
1077 break; */
1079 else //mark node which leads to dead end
1081 node = searchNode ( kmerCtg1[i], kset, overlap );
1083 if ( node )
1085 node->twin = 2;
1090 for ( i = 0; i < MaxRouteNum; i++ )
1092 free ( ( void * ) foundRoutes[i] );
1095 free ( ( void * ) soFarSeq );
1096 free ( ( void * ) soFarNode );
1097 free ( ( void * ) multiOccu );
1098 free ( ( void * ) MULTI1 );
1099 free ( ( void * ) MULTI2 );
1100 free ( ( void * ) foundRoutes );
1101 free ( ( void * ) routeEndOnCtg2 );
1102 free ( ( void * ) routeLens );
1103 return ret;
1106 static void trace4Repeat ( Kmer currW, int steps, int min, int max, int * num_route, KmerSet * kset, Kmer kmerDest, int overlap, Kmer WORDF,
1107 int * traceCounter, int maxRoute, kmer_t ** soFarNode, short * multiOccu1, short * multiOccu2, int * routeLens,
1108 char ** foundRoutes, char * soFarSeq, long long * soFarLinks, double * avgLinks )
1110 ( *traceCounter ) ++;
1112 if ( *traceCounter > UPlimit )
1114 return;
1117 if ( steps > max || *num_route >= maxRoute )
1119 return;
1122 Kmer word = reverseComplement ( currW, overlap );
1123 boolean isSmaller = KmerSmaller ( currW, word );
1124 char ch;
1125 unsigned char links;
1126 int index, i;
1128 if ( isSmaller )
1130 word = currW;
1133 kmer_t * node;
1134 boolean found = search_kmerset ( kset, word, &node );
1136 // #ifdef DEBUG
1137 if ( !found )
1139 fprintf ( stderr, "%s Trace: can't find kmer ", __FUNCTION__ );
1140 PrintKmer ( stderr, word );
1141 fprintf ( stderr, " (input " );
1142 PrintKmer ( stderr, currW );
1143 fprintf ( stderr, ") at step %d.\n", steps );
1145 #ifdef MER127
1146 printf ( "%s Trace: can't find kmer %llx %llx %llx %llx (input %llx %llx %llx %llx) at step %d\n",
1147 __FUNCTION__, word.high1, word.low1, word.high2, word.low2, currW.high1, currW.low1,
1148 currW.high2, currW.low2, steps );
1149 #else
1150 printf ( "Trace: can't find kmer %llx %llx (input %llx %llx) at step %d\n",
1151 word.high, word.low, currW.high, currW.low, steps );
1152 #endif
1154 return;
1157 // #else
1158 // if (!found) return;
1159 // #endif
1161 if ( soFarNode )
1163 soFarNode[steps] = node;
1166 if ( soFarSeq && steps > 0 )
1168 soFarSeq[steps - 1] = lastCharInKmer ( currW );
1171 int linkCounter;
1173 if ( soFarLinks )
1175 linkCounter = *soFarLinks;
1178 if ( steps >= min && KmerEqual ( currW, kmerDest ) )
1180 index = *num_route;
1182 if ( avgLinks && steps > 0 )
1184 avgLinks[index] = ( double ) linkCounter / steps;
1186 else if ( avgLinks )
1188 avgLinks[index] = 0;
1191 //find node that appears more than once in the path
1192 if ( multiOccu1 && multiOccu2 )
1194 for ( i = 0; i < steps + 1; i++ )
1196 soFarNode[i]->deleted = 0;
1199 int rightMost = 0;
1200 boolean MULTI = 0;
1202 for ( i = 0; i < steps + 1; i++ )
1204 if ( soFarNode[i]->deleted )
1206 rightMost = rightMost < i - 1 ? i - 1 : rightMost;
1207 MULTI = 1;
1210 soFarNode[i]->deleted = 1;
1213 if ( !MULTI )
1215 multiOccu1[index] = multiOccu2[index] = -1;
1217 else
1219 multiOccu2[index] = steps - 2 - rightMost < 0 ? 0 : steps - 2 - rightMost; //[0 steps-2]
1221 for ( i = 0; i < steps + 1; i++ )
1223 soFarNode[i]->deleted = 0;
1226 int leftMost = steps - 2;
1228 for ( i = steps; i >= 0; i-- )
1230 if ( soFarNode[i]->deleted )
1232 leftMost = leftMost > i - 1 ? i - 1 : leftMost;
1235 soFarNode[i]->deleted = 1;
1238 multiOccu1[index] = leftMost < 0 ? 0 : leftMost; //[0 steps-2]
1242 if ( routeLens )
1244 routeLens[index] = steps;
1247 if ( soFarSeq )
1249 char * array = foundRoutes[index];
1251 for ( i = 0; i < steps; i++ )
1253 array[i] = soFarSeq[i];
1256 if ( i < max )
1258 array[i] = 4;
1259 } //indicate the end of the sequence
1262 *num_route = ++index;
1265 steps++;
1267 if ( isSmaller )
1269 int array[] = { 0, 1, 2, 3 };
1271 for ( i = 4; i > 0; i-- )
1273 ch = nPick1 ( array, i );
1274 links = get_kmer_right_cov ( *node, ch );
1276 if ( !links )
1278 continue;
1281 if ( soFarLinks )
1283 *soFarLinks = linkCounter + links;
1286 word = nextKmerLocal ( currW, ch, WORDF );
1287 trace4Repeat ( word, steps, min, max, num_route, kset, kmerDest, overlap, WORDF, traceCounter, maxRoute, soFarNode,
1288 multiOccu1, multiOccu2, routeLens, foundRoutes, soFarSeq, soFarLinks, avgLinks );
1291 else
1293 int array[] = { 0, 1, 2, 3 };
1295 for ( i = 4; i > 0; i-- )
1297 ch = nPick1 ( array, i );
1298 links = get_kmer_left_cov ( *node, ch );
1300 if ( !links )
1302 continue;
1305 if ( soFarLinks )
1307 *soFarLinks = linkCounter + links;
1310 word = nextKmerLocal ( currW, int_comp ( ch ), WORDF );
1311 trace4Repeat ( word, steps, min, max, num_route, kset, kmerDest, overlap, WORDF, traceCounter, maxRoute, soFarNode,
1312 multiOccu1, multiOccu2, routeLens, foundRoutes, soFarSeq, soFarLinks, avgLinks );
1317 //found repeat node on contig ends
1318 static void maskRepeatNode ( KmerSet * kset, Kmer * kmerCtg1, Kmer * kmerCtg2, int overlap, int len1, int len2, int max, Kmer WordFilter )
1320 int i;
1321 int num_route, steps;
1322 int min = 1, maxRoute = 1;
1323 int traceCounter;
1324 Kmer word, bal_word;
1325 kmer_t * node;
1326 boolean found;
1327 int counter = 0;
1329 for ( i = 0; i < len1; i++ )
1331 word = kmerCtg1[i];
1332 bal_word = reverseComplement ( word, overlap );
1334 if ( KmerLarger ( word, bal_word ) )
1336 word = bal_word;
1339 found = search_kmerset ( kset, word, &node );
1341 if ( !found || node->linear )
1343 //printf("Found no node for kmer %llx\n",word);
1344 continue;
1347 num_route = traceCounter = 0;
1348 steps = 0;
1349 trace4Repeat ( word, steps, min, max, &num_route, kset, word, overlap, WordFilter, &traceCounter, maxRoute, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL );
1351 if ( num_route < 1 )
1353 continue;
1356 counter++;
1357 node->checked = 1;
1360 for ( i = 0; i < len2; i++ )
1362 word = kmerCtg2[i];
1363 bal_word = reverseComplement ( word, overlap );
1365 if ( KmerLarger ( word, bal_word ) )
1367 word = bal_word;
1370 found = search_kmerset ( kset, word, &node );
1372 if ( !found || node->linear )
1374 //printf("Found no node for kmer %llx\n",word);
1375 continue;
1378 num_route = traceCounter = 0;
1379 steps = 0;
1380 trace4Repeat ( word, steps, min, max, &num_route, kset, word, overlap, WordFilter, &traceCounter, maxRoute, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL );
1382 if ( num_route < 1 )
1384 continue;
1387 counter++;
1388 node->checked = 1;
1391 //printf("MR: %d(%d)\n",counter,len1+len2);
1395 static boolean chopReadFillGap(int len_seq,int overlap,char *src_seq, char *bal_seq,
1396 KmerSet *kset,Kmer WORDF,int *start,int *end,boolean *bal,
1397 Kmer *KmerCtg1,int len1,Kmer *KmerCtg2,int len2,int *index1,int *index2)
1399 int index,j=0,bal_j;
1400 Kmer word,bal_word;
1401 int flag=0,bal_flag=0;
1402 int ctg1start,bal_ctg1start,ctg2end,bal_ctg2end;
1403 int seqStart,bal_start,seqEnd,bal_end;
1404 kmer_t *node;
1405 boolean found;
1407 if(len_seq<overlap+1){
1408 return 0;
1410 word = 0;
1411 for (index = 0;index<overlap;index++){
1412 word <<= 2;
1413 word += src_seq[index];
1415 reverseComplementSeq(src_seq, len_seq,bal_seq);
1417 // complementary node
1418 bal_word = reverseComplement(word,overlap);
1419 bal_j = len_seq-0-overlap; // 0;
1420 flag = bal_flag = 0;
1421 if(word<bal_word){
1422 found = search_kmerset(kset,word,&node);
1423 }else{
1424 found = search_kmerset(kset,bal_word,&node);
1426 if(found&&!node->linear&&!node->checked){
1427 if(!flag&&node->inEdge==1){
1428 ctg1start = searchKmerOnCtg(word,KmerCtg1,len1);
1429 if(ctg1start>0){
1430 flag = 1;
1431 seqStart = j + overlap-1;
1434 if(!bal_flag&&node->inEdge==2){
1435 bal_ctg2end = searchKmerOnCtg(bal_word,KmerCtg2,len2);
1436 if(bal_ctg2end>0){
1437 bal_flag = 2;
1438 bal_end = bal_j+overlap-1;
1443 for(j = 1; j <= len_seq - overlap; j ++) {
1444 word = nextKmerLocal(word,src_seq[j-1+overlap],WORDF);
1445 bal_j = len_seq-j-overlap; // j;
1446 bal_word = prevKmerLocal(bal_word,bal_seq[bal_j],overlap);
1448 if(word<bal_word){
1449 found = search_kmerset(kset,word,&node);
1450 }else{
1451 found = search_kmerset(kset,bal_word,&node);
1453 if(found&&!node->linear&&!node->checked){
1454 if(!flag&&node->inEdge==1){
1455 ctg1start = searchKmerOnCtg(word,KmerCtg1,len1);
1456 if(ctg1start>0){
1457 flag = 1;
1458 seqStart = j + overlap-1;
1460 }else if(flag==1&&node->inEdge==1){
1461 index = searchKmerOnCtg(word,KmerCtg1,len1);
1462 if(index>ctg1start){ // choose hit closer to gap
1463 ctg1start = index;
1464 seqStart = j + overlap-1;
1466 }else if(flag==1&&node->inEdge==2){
1467 ctg2end = searchKmerOnCtg(word,KmerCtg2,len2);
1468 if(ctg2end>0){
1469 flag = 3;
1470 seqEnd = j+overlap-1;
1471 break;
1475 if(!bal_flag&&node->inEdge==2){
1476 bal_ctg2end = searchKmerOnCtg(bal_word,KmerCtg2,len2);
1477 if(bal_ctg2end>0){
1478 bal_flag = 2;
1479 bal_end = bal_j+overlap-1;
1481 }else if(bal_flag==2&&node->inEdge==2){
1482 index = searchKmerOnCtg(bal_word,KmerCtg2,len2);
1483 if(index<bal_ctg2end){ // choose hit closer to gap
1484 index = bal_ctg2end;
1485 bal_end = bal_j+overlap-1;
1487 }else if(bal_flag==2&&node->inEdge==1){
1488 bal_ctg1start = searchKmerOnCtg(bal_word,KmerCtg1,len1);
1489 if(bal_ctg1start>0){
1490 bal_flag = 3;
1491 bal_start = bal_j+overlap-1;
1492 break;
1497 if(flag==3){
1498 *start = seqStart;
1499 *end = seqEnd;
1500 *bal = 0;
1501 *index1 = ctg1start;
1502 *index2 = ctg2end;
1503 return 1;
1504 }else if(bal_flag==3){
1505 *start = bal_start;
1506 *end = bal_end;
1507 *bal = 1;
1508 *index1 = bal_ctg1start;
1509 *index2 = bal_ctg2end;
1510 return 1;
1512 return 0;
1515 static boolean readsCrossGap(READNEARBY *rdArray, int num, int originOverlap,DARRAY *gapSeqArray,
1516 Kmer *kmerCtg1,Kmer *kmerCtg2,int overlap,int len1,int len2,
1517 CTGinSCAF *ctg1,CTGinSCAF *ctg2,KmerSet *kmerS,Kmer WordFilter,int min,int max)
1519 int i,j,start,end,startOnCtg1,endOnCtg2;
1520 char *bal_seq;
1521 char *src_seq;
1522 char *pt;
1523 boolean bal,ret=0,FILL;
1525 src_seq = (char *)ckalloc(maxReadLen*sizeof(char));
1526 bal_seq = (char *)ckalloc(maxReadLen*sizeof(char));
1528 for(i=0;i<num;i++){
1529 getSeqFromRead(rdArray[i],src_seq);
1530 FILL = chopReadFillGap(rdArray[i].len,overlap,src_seq,bal_seq,
1531 kmerS,WordFilter,&start,&end,&bal,
1532 kmerCtg1,len1,kmerCtg2,len2,&startOnCtg1,&endOnCtg2);
1534 if(!FILL||(end-start)<min||(end-start)>max)
1535 continue;
1536 fprintf(stderr,"Read across\n");
1537 //printf("Filled: K %d, ctg1 %d ctg2 %d,start %d end %d\n",overlap,startOnCtg1,endOnCtg2,start,end);
1538 if(overlap+(len1-startOnCtg1-1)+endOnCtg2-(end-start)>(int)originOverlap)
1539 continue; // contig1 and contig2 could not overlap more than origOverlap bases
1541 ctg1->gapSeqOffset = gapSeqArray->item_c;
1542 ctg1->gapSeqLen = end-start;
1543 if(!darrayPut(gapSeqArray,ctg1->gapSeqOffset+(end-start)/4))
1544 continue;
1545 pt = (char *)darrayPut(gapSeqArray,ctg1->gapSeqOffset);
1546 for(j=start+1;j<=end;j++){
1547 if(bal)
1548 writeChar2tightString(bal_seq[j],pt,j-start-1);
1549 else
1550 writeChar2tightString(src_seq[j],pt,j-start-1);
1553 ctg1->cutTail = len1-startOnCtg1-1;
1554 ctg2->cutHead = overlap + endOnCtg2;
1555 ctg2->scaftig_start = 0;
1557 ret = 1;
1558 break;
1561 free((void*)src_seq);
1562 free((void*)bal_seq);
1563 return ret;
1566 static void kmerSet_markTandem ( KmerSet * set, Kmer WordFilter, int overlap );
1567 static boolean readsCrossGap ( READNEARBY * rdArray, int num, int originOverlap, DARRAY * gapSeqArray,
1568 Kmer * kmerCtg1, Kmer * kmerCtg2, int overlap, CTGinSCAF * ctg1, CTGinSCAF * ctg2,
1569 KmerSet * kmerS, Kmer WordFilter, int min, int max, int offset1, int offset2,
1570 char * seqGap, char * seqCtg1, char * seqCtg2, int cut1, int cut2 );
1572 int localGraph ( READNEARBY * rdArray, int num, CTGinSCAF * ctg1, CTGinSCAF * ctg2,
1573 int origOverlap, Kmer * kmerCtg1, Kmer * kmerCtg2, int overlap,
1574 DARRAY * gapSeqArray, char * seqCtg1, char * seqCtg2, char * seqGap )
1576 /**************** put kmer in DBgraph ****************/
1577 KmerSet * kmerSet;
1578 Kmer WordFilter = createFilter ( overlap );
1580 if(ctg1->ctgID==56410&&ctg2->ctgID==61741)
1581 printf("Extract %d reads for gap [%d %d]\n",num,ctg1->ctgID,ctg2->ctgID);
1583 kmerSet = readsInGap2DBgraph ( rdArray, num, ctg1, ctg2, origOverlap, kmerCtg1, kmerCtg2, overlap, WordFilter );
1585 if ( !kmerSet )
1587 //printf("no kmer found\n");
1588 return 0;
1591 time_t tt;
1592 time ( &tt );
1593 //srand48 ( ( int ) tt );
1595 int i,j;
1596 for(i=0;i<2;i++){
1597 int array[] = {0,1,2,3};
1598 for(j=4;j>0;j--)
1599 fprintf(stderr,"%d ", nPick1(array,j));
1601 fprintf(stderr,"\n");
1603 /***************** search path to connect contig ends ********/
1604 int gapLen = ctg2->start - ctg1->end - origOverlap + overlap;
1605 int min = gapLen - GLDiff > 0 ? gapLen - GLDiff : 0;
1606 int max = gapLen + GLDiff < 10 ? 10 : gapLen + GLDiff;
1607 //count kmer number for contig1 and contig2 ends
1608 int len1, len2;
1609 len1 = CTGendLen < contig_array[ctg1->ctgID].length + origOverlap ? CTGendLen : contig_array[ctg1->ctgID].length + origOverlap;
1610 len2 = CTGendLen < contig_array[ctg2->ctgID].length + origOverlap ? CTGendLen : contig_array[ctg2->ctgID].length + origOverlap;
1611 len1 -= overlap - 1;
1612 len2 -= overlap - 1;
1613 //int pathNum = 2;
1614 int offset1 = 0, offset2 = 0, cut1 = 0, cut2 = 0;
1615 int pathNum = searchFgap ( kmerSet, ctg1, ctg2, kmerCtg1, kmerCtg2,
1616 origOverlap, overlap, gapSeqArray,
1617 len1, len2, WordFilter, &offset1, &offset2, seqGap, &cut1, &cut2 );
1619 //printf("SF: %d K %d\n",pathNum,overlap);
1620 if ( pathNum == 0 )
1622 free_kmerset ( kmerSet );
1623 return 0;
1625 else if ( pathNum == 1 )
1627 free_kmerset ( kmerSet );
1628 return 1;
1629 } /*
1631 else{
1632 printf("ret %d\n",pathNum);
1633 free_kmerset(kmerSet);
1634 return 0;
1635 } */
1636 /******************* cross the gap by single reads *********/
1637 //kmerSet_markTandem(kmerSet,WordFilter,overlap);
1638 maskRepeatNode ( kmerSet, kmerCtg1, kmerCtg2, overlap, len1, len2, max, WordFilter );
1639 boolean found = readsCrossGap ( rdArray, num, origOverlap, gapSeqArray,
1640 kmerCtg1, kmerCtg2, overlap, ctg1, ctg2, kmerSet, WordFilter, min, max,
1641 offset1, offset2, seqGap, seqCtg1, seqCtg2, cut1, cut2 );
1643 if ( found )
1645 //fprintf(stderr,"read across\n");
1646 free_kmerset ( kmerSet );
1647 return found;
1649 else
1651 free_kmerset ( kmerSet );
1652 return 0;
1656 static void kmerSet_markTandem ( KmerSet * set, Kmer WordFilter, int overlap )
1658 kmer_t * rs;
1659 long long counter = 0;
1660 int num_route, steps;
1661 int min = 1, max = overlap, maxRoute = 1;
1662 int traceCounter;
1663 set->iter_ptr = 0;
1665 while ( set->iter_ptr < set->size )
1667 if ( !is_kmer_entity_null ( set->flags, set->iter_ptr ) )
1669 rs = set->array + set->iter_ptr;
1671 if ( rs->inEdge > 0 )
1673 set->iter_ptr++;
1674 continue;
1677 num_route = traceCounter = 0;
1678 steps = 0;
1679 trace4Repeat ( rs->seq, steps, min, max, &num_route, set, rs->seq, overlap, WordFilter, &traceCounter, maxRoute, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL );
1681 if ( num_route < 1 )
1683 set->iter_ptr++;
1684 continue;
1688 printKmerSeqLocal(stderr,rs->seq,overlap);
1689 fprintf(stderr, "\n");
1691 rs->checked = 1;
1692 counter++;
1695 set->iter_ptr++;
1699 /******************* the following is for read-crossing gaps *************************/
1701 #define MAXREADLENGTH 100
1703 static const int INDEL = 0;
1704 static const int SIM[4][4] =
1706 {1, 0, 0, 0},
1707 {0, 1, 0, 0},
1708 {0, 0, 1, 0},
1709 {0, 0, 0, 1}
1711 static char fastSequence[MAXREADLENGTH];
1712 static char slowSequence[MAXREADLENGTH];
1714 static int Fmatrix[MAXREADLENGTH + 1][MAXREADLENGTH + 1];
1715 static int slowToFastMapping[MAXREADLENGTH + 1];
1716 static int fastToSlowMapping[MAXREADLENGTH + 1];
1718 static int max ( int A, int B, int C )
1720 A = A >= B ? A : B;
1721 return ( A >= C ? A : C );
1724 static int compareSequences ( char * sequence1, char * sequence2, int length1, int length2 )
1726 if ( length1 < 1 || length2 < 1 || length1 > MAXREADLENGTH || length2 > MAXREADLENGTH )
1728 return 0;
1731 int i, j;
1732 int Choice1, Choice2, Choice3;
1733 int maxScore;
1735 for ( i = 0; i <= length1; i++ )
1737 Fmatrix[i][0] = 0;
1740 for ( j = 0; j <= length2; j++ )
1742 Fmatrix[0][j] = 0;
1745 for ( i = 1; i <= length1; i++ )
1747 for ( j = 1; j <= length2; j++ )
1749 Choice1 = Fmatrix[i - 1][j - 1] + SIM[ ( int ) sequence1[i - 1]][ ( int ) sequence2[j - 1]];
1750 Choice2 = Fmatrix[i - 1][j] + INDEL;
1751 Choice3 = Fmatrix[i][j - 1] + INDEL;
1752 Fmatrix[i][j] = max ( Choice1, Choice2, Choice3 );
1756 maxScore = Fmatrix[length1][length2];
1757 return maxScore;
1760 static void mapSlowOntoFast ( int slowSeqLength, int fastSeqLength )
1762 int slowIndex = slowSeqLength;
1763 int fastIndex = fastSeqLength;
1764 int fastn, slown;
1766 if ( slowIndex == 0 )
1768 slowToFastMapping[0] = fastIndex;
1770 while ( fastIndex >= 0 )
1772 fastToSlowMapping[fastIndex--] = 0;
1775 return;
1778 if ( fastIndex == 0 )
1780 while ( slowIndex >= 0 )
1782 slowToFastMapping[slowIndex--] = 0;
1785 fastToSlowMapping[0] = slowIndex;
1786 return;
1789 while ( slowIndex > 0 && fastIndex > 0 )
1791 fastn = ( int ) fastSequence[fastIndex - 1]; //getCharInTightString(fastSequence,fastIndex-1);
1792 slown = ( int ) slowSequence[slowIndex - 1]; //getCharInTightString(slowSequence,slowIndex-1);
1794 if ( Fmatrix[fastIndex][slowIndex] == Fmatrix[fastIndex - 1][slowIndex - 1] + SIM[fastn][slown] )
1796 fastToSlowMapping[--fastIndex] = --slowIndex;
1797 slowToFastMapping[slowIndex] = fastIndex;
1799 else if ( Fmatrix[fastIndex][slowIndex] == Fmatrix[fastIndex - 1][slowIndex] + INDEL )
1801 fastToSlowMapping[--fastIndex] = slowIndex - 1;
1803 else if ( Fmatrix[fastIndex][slowIndex] == Fmatrix[fastIndex][slowIndex - 1] + INDEL )
1805 slowToFastMapping[--slowIndex] = fastIndex - 1;
1807 else
1809 fprintf ( stderr, "CompareSequence: Error trace.\n" );
1810 abort ();
1814 while ( slowIndex > 0 )
1816 slowToFastMapping[--slowIndex] = -1;
1819 while ( fastIndex > 0 )
1821 fastToSlowMapping[--fastIndex] = -1;
1824 slowToFastMapping[slowSeqLength] = fastSeqLength;
1825 fastToSlowMapping[fastSeqLength] = slowSeqLength;
1828 static boolean chopReadFillGap ( int len_seq, int overlap, char * src_seq, char * bal_seq,
1829 KmerSet * kset, Kmer WORDF, int * start, int * end, boolean * bal,
1830 Kmer * KmerCtg1, int len1, Kmer * KmerCtg2, int len2, int * index1, int * index2 )
1832 int index, j = 0, bal_j;
1833 Kmer word, bal_word;
1834 int flag = 0, bal_flag = 0;
1835 int ctg1start, bal_ctg1start, ctg2end, bal_ctg2end;
1836 int seqStart, bal_start, seqEnd, bal_end;
1837 kmer_t * node;
1838 boolean found;
1840 if ( len_seq < overlap + 1 )
1842 return 0;
1845 #ifdef MER127
1846 word.high1 = word.low1 = word.high2 = word.low2 = 0;
1848 for ( index = 0; index < overlap; index++ )
1850 word = KmerLeftBitMoveBy2 ( word );
1851 word.low2 |= src_seq[index];
1854 #else
1855 word.high = word.low = 0;
1857 for ( index = 0; index < overlap; index++ )
1859 word = KmerLeftBitMoveBy2 ( word );
1860 word.low |= src_seq[index];
1863 #endif
1864 reverseComplementSeq ( src_seq, len_seq, bal_seq );
1865 // complementary node
1866 bal_word = reverseComplement ( word, overlap );
1867 bal_j = len_seq - 0 - overlap; // 0;
1868 flag = bal_flag = 0;
1870 if ( KmerSmaller ( word, bal_word ) )
1872 found = search_kmerset ( kset, word, &node );
1874 else
1876 found = search_kmerset ( kset, bal_word, &node );
1879 //if ( !found ) { printf ( "chopReadFillGap 1292 not found!\n" ); }
1881 if ( found && !node->linear && !node->checked )
1883 if ( !flag && node->inEdge == 1 )
1885 ctg1start = searchKmerOnCtg ( word, KmerCtg1, len1 );
1887 if ( ctg1start >= 0 )
1889 flag = 1;
1890 seqStart = j + overlap - 1;
1894 if ( !bal_flag && node->inEdge == 2 )
1896 bal_ctg2end = searchKmerOnCtg ( bal_word, KmerCtg2, len2 );
1898 if ( bal_ctg2end >= 0 )
1900 bal_flag = 2;
1901 bal_end = bal_j + overlap - 1;
1906 for ( j = 1; j <= len_seq - overlap; j++ )
1908 word = nextKmerLocal ( word, src_seq[j - 1 + overlap], WORDF );
1909 bal_j = len_seq - j - overlap; // j;
1910 bal_word = prevKmerLocal ( bal_word, bal_seq[bal_j], overlap );
1912 if ( KmerSmaller ( word, bal_word ) )
1914 found = search_kmerset ( kset, word, &node );
1916 else
1918 found = search_kmerset ( kset, bal_word, &node );
1921 //if ( !found ) { printf ( "chopReadFillGap 1321 not found!\n" ); }
1923 if ( found && !node->linear && !node->checked )
1925 if ( !flag && node->inEdge == 1 )
1927 ctg1start = searchKmerOnCtg ( word, KmerCtg1, len1 );
1929 if ( ctg1start >= 0 )
1931 flag = 1;
1932 seqStart = j + overlap - 1;
1935 else if ( flag == 1 && node->inEdge == 1 )
1937 index = searchKmerOnCtg ( word, KmerCtg1, len1 );
1939 if ( index >= 0 && index > ctg1start ) // choose hit closer to gap
1941 ctg1start = index;
1942 seqStart = j + overlap - 1;
1945 else if ( flag == 1 && node->inEdge == 2 )
1947 ctg2end = searchKmerOnCtg ( word, KmerCtg2, len2 );
1949 if ( ctg2end >= 0 )
1951 flag = 3;
1952 seqEnd = j + overlap - 1;
1953 break;
1957 if ( !bal_flag && node->inEdge == 2 )
1959 bal_ctg2end = searchKmerOnCtg ( bal_word, KmerCtg2, len2 );
1961 if ( bal_ctg2end >= 0 )
1963 bal_flag = 2;
1964 bal_end = bal_j + overlap - 1;
1967 else if ( bal_flag == 2 && node->inEdge == 2 )
1969 index = searchKmerOnCtg ( bal_word, KmerCtg2, len2 );
1971 if ( index >= 0 && index < bal_ctg2end ) // choose hit closer to gap
1973 bal_ctg2end = index;
1974 bal_end = bal_j + overlap - 1;
1977 else if ( bal_flag == 2 && node->inEdge == 1 )
1979 bal_ctg1start = searchKmerOnCtg ( bal_word, KmerCtg1, len1 );
1981 if ( bal_ctg1start >= 0 )
1983 bal_flag = 3;
1984 bal_start = bal_j + overlap - 1;
1985 break;
1991 if ( flag == 3 )
1993 *start = seqStart;
1994 *end = seqEnd;
1995 *bal = 0;
1996 *index1 = ctg1start;
1997 *index2 = ctg2end;
1998 return 1;
2000 else if ( bal_flag == 3 )
2002 *start = bal_start;
2003 *end = bal_end;
2004 *bal = 1;
2005 *index1 = bal_ctg1start;
2006 *index2 = bal_ctg2end;
2007 return 1;
2010 return 0;
2013 static int cutSeqFromTightStr ( char * tightStr, int length, int start, int end, int revS, char * src_seq )
2015 int i, index = 0;
2016 end = end < length ? end : length - 1;
2017 start = start >= 0 ? start : 0;
2019 if ( !revS )
2021 for ( i = start; i <= end; i++ )
2023 src_seq[index++] = getCharInTightString ( tightStr, i );
2026 else
2028 for ( i = length - 1 - start; i >= length - end - 1; i-- )
2030 src_seq[index++] = int_comp ( getCharInTightString ( tightStr, i ) );
2034 return end - start + 1;
2037 static int cutSeqFromCtg ( unsigned int ctgID, int start, int end, char * sequence, int originOverlap )
2039 unsigned int bal_ctg = getTwinCtg ( ctgID );
2041 if ( contig_array[ctgID].length < 1 )
2043 return 0;
2046 int length = contig_array[ctgID].length + originOverlap;
2048 if ( contig_array[ctgID].seq )
2050 return cutSeqFromTightStr ( contig_array[ctgID].seq, length, start, end, 0, sequence );
2052 else
2054 return cutSeqFromTightStr ( contig_array[bal_ctg].seq, length, start, end, 1, sequence );
2058 static int cutSeqFromRead ( char * src_seq, int length, int start, int end, char * sequence )
2060 if ( end >= length )
2062 fprintf ( stderr, "The index is bigger than the length: end %d length %d.\n", end, length );
2065 end = end < length ? end : length - 1;
2066 start = start >= 0 ? start : 0;
2067 int i;
2069 for ( i = start; i <= end; i++ )
2071 sequence[i - start] = src_seq[i];
2074 return end - start + 1;
2077 void printSeq ( FILE * fo, char * seq, int len )
2079 int i;
2081 for ( i = 0; i < len; i++ )
2083 fprintf ( fo, "%c", int2base ( ( int ) seq[i] ) );
2086 fprintf ( fo, "\n" );
2089 static boolean readsCrossGap ( READNEARBY * rdArray, int num, int originOverlap, DARRAY * gapSeqArray,
2090 Kmer * kmerCtg1, Kmer * kmerCtg2, int overlap, CTGinSCAF * ctg1, CTGinSCAF * ctg2,
2091 KmerSet * kmerS, Kmer WordFilter, int min, int max, int offset1, int offset2,
2092 char * seqGap, char * seqCtg1, char * seqCtg2, int cut1, int cut2 )
2094 int i, j, start, end, startOnCtg1, endOnCtg2;
2095 char * bal_seq;
2096 char * src_seq;
2097 char * pt;
2098 boolean bal, ret = 0, FILL;
2099 double maxScore = 0.0;
2100 int maxIndex;
2101 int lenCtg1, lenCtg2;
2102 //build sequences on left and right of the uncertain region
2103 int buffer_size = maxReadLen > 100 ? maxReadLen : 100;
2104 int length = contig_array[ctg1->ctgID].length + originOverlap;
2106 if ( buffer_size > offset1 )
2108 lenCtg1 = cutSeqFromCtg ( ctg1->ctgID, length - cut1 - ( buffer_size - offset1 ), length - 1 - cut1, seqCtg1, originOverlap );
2110 for ( i = 0; i < offset1; i++ )
2112 seqCtg1[lenCtg1 + i] = seqGap[i];
2115 lenCtg1 += offset1;
2117 else
2119 for ( i = offset1 - buffer_size; i < offset1; i++ )
2121 seqCtg1[i + buffer_size - offset1] = seqGap[i];
2124 lenCtg1 = buffer_size;
2127 length = contig_array[ctg2->ctgID].length + originOverlap;
2129 if ( buffer_size > offset2 )
2131 lenCtg2 = cutSeqFromCtg ( ctg2->ctgID, cut2, buffer_size - offset2 - 1 + cut2, & ( seqCtg2[offset2] ), originOverlap );
2133 for ( i = 0; i < offset2; i++ )
2135 seqCtg2[i] = seqGap[i + offset1];
2138 lenCtg2 += offset2;
2140 else
2142 for ( i = 0; i < buffer_size; i++ )
2144 seqCtg2[i] = seqGap[i + offset1];
2147 lenCtg2 = buffer_size;
2151 if(offset1>0||offset2>0){
2152 for(i=0;i<lenCtg1;i++)
2153 fprintf(stderr,"%c",int2base(seqCtg1[i]));
2154 fprintf(stderr,": CTG1\n");
2155 for(i=0;i<lenCtg2;i++)
2156 fprintf(stderr,"%c",int2base(seqCtg2[i]));
2157 fprintf(stderr,": CTG2\n");
2160 //chop kmer from both ends of the uncertain region
2161 int len1, len2;
2162 len1 = CTGendLen < lenCtg1 ? CTGendLen : lenCtg1;
2163 len2 = CTGendLen < lenCtg2 ? CTGendLen : lenCtg2;
2164 chopKmer4Ctg ( kmerCtg1, len1, overlap, & ( seqCtg1[lenCtg1 - len1] ), WordFilter );
2165 chopKmer4Ctg ( kmerCtg2, len2, overlap, seqCtg2, WordFilter );
2166 len1 -= overlap - 1;
2167 len2 -= overlap - 1;
2168 src_seq = ( char * ) ckalloc ( maxReadLen * sizeof ( char ) );
2169 bal_seq = ( char * ) ckalloc ( maxReadLen * sizeof ( char ) );
2170 int * START = ( int * ) ckalloc ( num * sizeof ( int ) );
2171 int * END = ( int * ) ckalloc ( num * sizeof ( int ) );
2172 int * INDEX1 = ( int * ) ckalloc ( num * sizeof ( int ) );
2173 int * INDEX2 = ( int * ) ckalloc ( num * sizeof ( int ) );
2174 double * SCORE = ( double * ) ckalloc ( num * sizeof ( double ) );
2175 boolean * BAL = ( boolean * ) ckalloc ( num * sizeof ( boolean ) );
2176 memset ( SCORE, 0, num * sizeof ( double ) );
2178 for ( i = 0; i < num; i++ )
2180 getSeqFromRead ( rdArray[i], src_seq );
2181 FILL = chopReadFillGap ( rdArray[i].len, overlap, src_seq, bal_seq, kmerS, WordFilter, &start, &end, &bal, kmerCtg1, len1, kmerCtg2, len2, &startOnCtg1, &endOnCtg2 );
2183 if ( !FILL || ( end - start ) < min || ( end - start ) > max )
2185 continue;
2188 if ( overlap + ( len1 - startOnCtg1 - 1 ) + endOnCtg2 - ( end - start ) > ( int ) originOverlap )
2190 continue;
2191 } // contig1 and contig2 could not overlap more than origOverlap bases
2193 START[i] = start;
2194 END[i] = end;
2195 INDEX1[i] = startOnCtg1;
2196 INDEX2[i] = endOnCtg2;
2197 BAL[i] = bal;
2198 int matchLen = 2 * overlap < ( end - start + overlap ) ? 2 * overlap : ( end - start + overlap );
2199 int match;
2200 int alignLen = matchLen;
2201 //compare the left of hit kmer on ctg1
2202 //int ctgLeft = (contig_array[ctg1->ctgID].length+originOverlap)-(len1+overlap-1)+startOnCtg1;
2203 int ctgLeft = ( lenCtg1 ) - ( len1 + overlap - 1 ) + startOnCtg1;
2204 int readLeft = start - overlap + 1;
2205 int cmpLen = ctgLeft < readLeft ? ctgLeft : readLeft;
2206 cmpLen = cmpLen <= MAXREADLENGTH ? cmpLen : MAXREADLENGTH;
2207 //cutSeqFromCtg(ctg1->ctgID,ctgLeft-cmpLen,ctgLeft-1,fastSequence,originOverlap);
2208 cutSeqFromRead ( seqCtg1, lenCtg1, ctgLeft - cmpLen, ctgLeft - 1, fastSequence );
2210 if ( !bal )
2212 cutSeqFromRead ( src_seq, rdArray[i].len, readLeft - cmpLen, readLeft - 1, slowSequence );
2214 else
2216 cutSeqFromRead ( bal_seq, rdArray[i].len, readLeft - cmpLen, readLeft - 1, slowSequence );
2219 match = compareSequences ( fastSequence, slowSequence, cmpLen, cmpLen );
2220 alignLen += cmpLen;
2221 matchLen += match;
2222 //compare the right of hit kmer on ctg1
2223 int ctgRight = len1 - startOnCtg1 - 1;
2224 cmpLen = ctgRight < ( rdArray[i].len - start - 1 ) ? ctgRight : ( rdArray[i].len - start - 1 );
2225 cmpLen = cmpLen <= MAXREADLENGTH ? cmpLen : MAXREADLENGTH;
2226 //cutSeqFromCtg(ctg1->ctgID,ctgLeft+overlap,ctgLeft+overlap+cmpLen-1,fastSequence,originOverlap);
2227 cutSeqFromRead ( seqCtg1, lenCtg1, ctgLeft + overlap, ctgLeft + overlap + cmpLen - 1, fastSequence );
2229 if ( !bal )
2231 cutSeqFromRead ( src_seq, rdArray[i].len, start + 1, start + cmpLen, slowSequence );
2233 else
2235 cutSeqFromRead ( bal_seq, rdArray[i].len, start + 1, start + cmpLen, slowSequence );
2238 match = compareSequences ( fastSequence, slowSequence, cmpLen, cmpLen );
2239 //fprintf(stderr,"%d -- %d\n",match,cmpLen);
2240 alignLen += cmpLen;
2241 matchLen += match;
2242 //compare the left of hit kmer on ctg2
2243 ctgLeft = endOnCtg2;
2244 readLeft = end - overlap + 1;
2245 cmpLen = ctgLeft < readLeft ? ctgLeft : readLeft;
2246 cmpLen = ctgLeft <= MAXREADLENGTH ? ctgLeft : MAXREADLENGTH;
2247 //cutSeqFromCtg(ctg2->ctgID,endOnCtg2-cmpLen,endOnCtg2-1,fastSequence,originOverlap);
2248 cutSeqFromRead ( seqCtg2, lenCtg2, endOnCtg2 - cmpLen, endOnCtg2 - 1, fastSequence );
2250 if ( !bal )
2252 cutSeqFromRead ( src_seq, rdArray[i].len, readLeft - cmpLen, readLeft - 1, slowSequence );
2254 else
2256 cutSeqFromRead ( bal_seq, rdArray[i].len, readLeft - cmpLen, readLeft - 1, slowSequence );
2259 match = compareSequences ( fastSequence, slowSequence, cmpLen, cmpLen );
2260 alignLen += cmpLen;
2261 matchLen += match;
2262 //compare the right of hit kmer on ctg2
2263 //ctgRight = contig_array[ctg2->ctgID].length+originOverlap-endOnCtg2-overlap;
2264 ctgRight = lenCtg2 - endOnCtg2 - overlap;
2265 cmpLen = ctgRight < ( rdArray[i].len - end - 1 ) ? ctgRight : ( rdArray[i].len - end - 1 );
2266 cmpLen = cmpLen <= MAXREADLENGTH ? cmpLen : MAXREADLENGTH;
2267 //cutSeqFromCtg(ctg2->ctgID,endOnCtg2+overlap,endOnCtg2+overlap+cmpLen-1,fastSequence,originOverlap);
2268 cutSeqFromRead ( seqCtg2, lenCtg2, endOnCtg2 + overlap, endOnCtg2 + overlap + cmpLen - 1, fastSequence );
2270 if ( !bal )
2272 cutSeqFromRead ( src_seq, rdArray[i].len, end + 1, end + cmpLen, slowSequence );
2274 else
2276 cutSeqFromRead ( bal_seq, rdArray[i].len, end + 1, end + cmpLen, slowSequence );
2279 match = compareSequences ( fastSequence, slowSequence, cmpLen, cmpLen );
2280 alignLen += cmpLen;
2281 matchLen += match;
2283 if(cmpLen>0&&match!=cmpLen+overlap){
2284 printSeq(stderr,fastSequence,cmpLen+overlap);
2285 printSeq(stderr,slowSequence,cmpLen+overlap);
2286 printKmerSeqLocal(stderr,kmerCtg2[endOnCtg2],overlap);
2287 fprintf(stderr,": %d(%d)\n",bal,endOnCtg2);
2288 }else if(cmpLen>0&&match==cmpLen+overlap)
2289 fprintf(stderr,"Perfect\n");
2291 double score = ( double ) matchLen / alignLen;
2293 if ( maxScore < score )
2295 maxScore = score;
2296 //fprintf(stderr,"%4.2f (%d/%d)\n",maxScore,matchLen,alignLen);
2297 maxIndex = i;
2300 SCORE[i] = score;
2304 if(maxScore>0.0)
2305 fprintf(stderr,"SCORE: %4.2f\n",maxScore);
2307 if ( maxScore > 0.9 )
2310 for(i=0;i<lenCtg1;i++)
2311 fprintf(stderr,"%c",int2base(seqCtg1[i]));
2312 fprintf(stderr,": CTG1\n");
2313 for(i=0;i<lenCtg2;i++)
2314 fprintf(stderr,"%c",int2base(seqCtg2[i]));
2315 fprintf(stderr,": CTG2\n");
2316 fprintf(stderr,"%d+%d -- %d+%d, SCORE: %4.2f\n ",offset1,offset2,cut1,cut2,maxScore);
2318 getSeqFromRead ( rdArray[maxIndex], src_seq );
2319 reverseComplementSeq ( src_seq, rdArray[maxIndex].len, bal_seq );
2320 int leftRemain = offset1 - ( len1 - INDEX1[maxIndex] - 1 ) > 0 ? offset1 - ( len1 - INDEX1[maxIndex] - 1 ) : 0;
2321 int rightRemain = offset2 - ( overlap + INDEX2[maxIndex] ) > 0 ? offset2 - ( overlap + INDEX2[maxIndex] ) : 0;
2322 ctg1->gapSeqOffset = gapSeqArray->item_c;
2323 ctg1->gapSeqLen = END[maxIndex] - START[maxIndex] + leftRemain + rightRemain;
2325 if ( darrayPut ( gapSeqArray, ctg1->gapSeqOffset + ( END[maxIndex] - START[maxIndex] + leftRemain + rightRemain ) / 4 ) )
2327 pt = ( char * ) darrayPut ( gapSeqArray, ctg1->gapSeqOffset );
2329 for ( j = 0; j < leftRemain; j++ ) //get the left side of the gap region from search
2331 writeChar2tightString ( seqGap[j], pt, j );
2332 //fprintf(stderr,"%c",int2base(seqGap[j]));
2335 for ( j = START[maxIndex] + 1; j <= END[maxIndex]; j++ )
2337 if ( BAL[maxIndex] )
2339 writeChar2tightString ( bal_seq[j], pt, j - START[maxIndex] - 1 + leftRemain );
2340 //fprintf(stderr,"%c",int2base(bal_seq[j]));
2342 else
2344 writeChar2tightString ( src_seq[j], pt, j - START[maxIndex] - 1 + leftRemain );
2345 //fprintf(stderr,"%c",int2base(src_seq[j]));
2349 for ( j = offset2 - rightRemain; j < offset2; j++ ) //get the right side of the gap region from search
2351 writeChar2tightString ( seqGap[j + leftRemain], pt, j + END[maxIndex] - START[maxIndex] + leftRemain );
2352 //fprintf(stderr,"%c",int2base(seqGap[j+leftRemain]));
2356 fprintf(stderr,": GAPSEQ (%d+%d)(%d+%d)(%d+%d)(%d+%d) B %d\n",offset1,offset2,cut1,cut2,
2357 len1-INDEX1[maxIndex]-1,INDEX2[maxIndex],START[maxIndex],END[maxIndex],BAL[maxIndex]);
2359 ctg1->cutTail = len1 - INDEX1[maxIndex] - 1 - offset1 + cut1 > cut1 ? len1 - INDEX1[maxIndex] - 1 - offset1 + cut1 : cut1;
2360 ctg2->cutHead = overlap + INDEX2[maxIndex] - offset2 + cut2 > cut2 ? overlap + INDEX2[maxIndex] - offset2 + cut2 : cut2;
2361 ctg2->scaftig_start = 0;
2362 ret = 1;
2366 free ( ( void * ) START );
2367 free ( ( void * ) END );
2368 free ( ( void * ) INDEX1 );
2369 free ( ( void * ) INDEX2 );
2370 free ( ( void * ) SCORE );
2371 free ( ( void * ) BAL );
2372 free ( ( void * ) src_seq );
2373 free ( ( void * ) bal_seq );
2374 return ret;