modified: makefile
[GalaxyCodeBases.git] / BGI / SOAPdenovo2 / standardPregraph / prlReadFillGap.c
blobb1de77a8a494ac1a3a8b395cdd8ee149c3c2b6fc
1 /*
2 * prlReadFillGap.c
4 * Copyright (c) 2008-2012 BGI-Shenzhen <soap at genomics dot org dot cn>.
6 * This file is part of SOAPdenovo.
8 * SOAPdenovo is free software: you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation, either version 3 of the License, or
11 * (at your option) any later version.
13 * SOAPdenovo is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
18 * You should have received a copy of the GNU General Public License
19 * along with SOAPdenovo. If not, see <http://www.gnu.org/licenses/>.
23 #include "stdinc.h"
24 #include "newhash.h"
25 #include "kmerhash.h"
26 #include "extfunc.h"
27 #include "extvab.h"
28 #include "zlib.h"
30 #define RDBLOCKSIZE 50
31 #define CTGappend 50
33 static Kmer MAXKMER;
34 static int Ncounter;
35 static int allGaps;
37 // for multi threads
38 static int * counters;
39 static pthread_mutex_t mutex;
40 static int scafBufSize = 100;
41 static boolean * flagBuf;
42 static unsigned char * thrdNoBuf;
43 static STACK ** ctgStackBuffer;
44 static int scafCounter;
45 static int scafInBuf;
46 static char * scaffBuffer;
48 static void MarkCtgOccu ( unsigned int ctg );
51 static void printRead(int len,char *seq)
53 int j;
54 fprintf(stderr,">read\n");
55 for(j=0;j<len;j++)
56 fprintf(stderr,"%c",int2base((int)getCharInTightString(seq,j)));
57 fprintf(stderr,"\n");
60 static void attach1read2contig ( unsigned int ctgID, int len, int pos, long long starter )
62 unsigned int ctg = index_array[ctgID]; //new index in contig array
64 if ( isLargerThanTwin ( ctg ) )
66 ctg = getTwinCtg ( ctg ); // put all reads in one contig of a twin
67 pos = contig_array[ctg].length + overlaplen - pos - len;
70 if ( !contig_array[ctg].closeReads )
72 contig_array[ctg].closeReads = ( STACK * ) createStack ( RDBLOCKSIZE, sizeof ( READNEARBY ) );
75 READNEARBY * rd = ( READNEARBY * ) stackPush ( contig_array[ctg].closeReads );
76 rd->len = len;
77 rd->dis = pos;
78 rd->seqStarter = starter;
81 static void convertIndex ()
83 int * length_array = ( int * ) ckalloc ( ( num_ctg + 1 ) * sizeof ( int ) );
84 unsigned int i;
86 for ( i = 1; i <= num_ctg; i++ )
88 length_array[i] = 0;
91 for ( i = 1; i <= num_ctg; i++ )
93 if ( index_array[i] > 0 )
95 length_array[index_array[i]] = i;
99 for ( i = 1; i <= num_ctg; i++ )
101 index_array[i] = length_array[i];
102 } //contig i with new index: index_array[i]
104 free ( ( void * ) length_array );
107 static long long getRead1by1_gz ( gzFile * fp, DARRAY * readSeqInGap )
109 long long readCounter = 0;
111 if ( !fp )
113 return readCounter;
116 int len, ctgID, pos;
117 long long starter;
118 char * pt;
119 char * freadBuf = ( char * ) ckalloc ( ( maxReadLen / 4 + 1 ) * sizeof ( char ) );
121 while ( gzread ( fp, &len, sizeof ( int ) ) == 4 )
123 if ( gzread ( fp, &ctgID, sizeof ( int ) ) != 4 )
125 break;
128 if ( gzread ( fp, &pos, sizeof ( int ) ) != 4 )
130 break;
133 if ( gzread ( fp, freadBuf, sizeof ( char ) * ( unsigned ) ( len / 4 + 1 ) ) != ( unsigned ) ( len / 4 + 1 ) )
135 break;
138 //put seq to dynamic array
139 starter = readSeqInGap->item_c;
141 if ( !darrayPut ( readSeqInGap, starter + len / 4 ) ) // make sure there's room for this seq
143 break;
146 pt = ( char * ) darrayPut ( readSeqInGap, starter );
147 bcopy ( freadBuf, pt, len / 4 + 1 );
148 attach1read2contig ( ctgID, len, pos, starter );
149 readCounter++;
152 free ( ( void * ) freadBuf );
153 return readCounter;
156 static long long getRead1by1 ( FILE * fp, DARRAY * readSeqInGap )
158 long long readCounter = 0;
160 if ( !fp )
162 return readCounter;
165 int len, ctgID, pos;
166 long long starter;
167 char * pt;
168 char * freadBuf = ( char * ) ckalloc ( ( maxReadLen / 4 + 1 ) * sizeof ( char ) );
170 while ( fread ( &len, sizeof ( int ), 1, fp ) == 1 )
172 if ( fread ( &ctgID, sizeof ( int ), 1, fp ) != 1 )
174 break;
177 if ( fread ( &pos, sizeof ( int ), 1, fp ) != 1 )
179 break;
182 if ( fread ( freadBuf, sizeof ( char ), len / 4 + 1, fp ) != ( unsigned ) ( len / 4 + 1 ) )
184 break;
187 //put seq to dynamic array
188 starter = readSeqInGap->item_c;
190 if ( !darrayPut ( readSeqInGap, starter + len / 4 ) ) // make sure there's room for this seq
192 break;
195 pt = ( char * ) darrayPut ( readSeqInGap, starter );
196 bcopy ( freadBuf, pt, len / 4 + 1 );
197 attach1read2contig ( ctgID, len, pos, starter );
198 readCounter++;
201 free ( ( void * ) freadBuf );
202 return readCounter;
205 // Darray *readSeqInGap
206 static boolean loadReads4gap ( char * graphfile )
208 FILE * fp1, *fp2;
209 gzFile * fp;
210 char name[1024];
211 long long readCounter;
213 if ( COMPATIBLE_MODE == 1 )
215 sprintf ( name, "%s.readInGap", graphfile );
216 fp1 = fopen ( name, "rb" );
218 else
220 sprintf ( name, "%s.readInGap.gz", graphfile );
221 fp = gzopen ( name, "rb" );
224 sprintf ( name, "%s.longReadInGap", graphfile );
225 fp2 = fopen ( name, "rb" );
227 if ( COMPATIBLE_MODE == 1 && !fp1 && !fp2 )
229 fprintf ( stderr, "Can't open %s.readInGap and %s.longReadInGap!\n", graphfile, graphfile );
230 return 0;
232 else if ( COMPATIBLE_MODE == 0 && !fp && !fp2 )
234 fprintf ( stderr, "Can't open %s.readInGap.gz and %s.longReadInGap!\n", graphfile, graphfile );
235 return 0;
238 if ( !orig2new )
240 convertIndex ();
241 orig2new = 1;
244 readSeqInGap = ( DARRAY * ) createDarray ( 1000000, sizeof ( char ) );
246 if ( COMPATIBLE_MODE == 1 && fp1 )
248 readCounter = getRead1by1 ( fp1, readSeqInGap );
249 fprintf ( stderr, "Loaded %lld reads from %s.readInGap.\n", readCounter, graphfile );
250 fclose ( fp1 );
252 else if ( COMPATIBLE_MODE == 0 && fp )
254 readCounter = getRead1by1_gz ( fp, readSeqInGap );
255 fprintf ( stderr, "Loaded %lld reads from %s.readInGap.\n", readCounter, graphfile );
256 gzclose ( fp );
259 if ( fp2 )
261 readCounter = getRead1by1 ( fp2, readSeqInGap );
262 fprintf ( stderr, "Loaded %lld reads from %s.LongReadInGap.\n", readCounter, graphfile );
263 fclose ( fp2 );
266 return 1;
269 static void debugging1 ()
271 unsigned int i;
273 if ( orig2new )
275 unsigned int * length_array = ( unsigned int * ) ckalloc ( ( num_ctg + 1 ) * sizeof ( unsigned int ) );
277 //use length_array to change info in index_array
278 for ( i = 1; i <= num_ctg; i++ )
280 length_array[i] = 0;
283 for ( i = 1; i <= num_ctg; i++ )
285 if ( index_array[i] > 0 )
287 length_array[index_array[i]] = i;
291 for ( i = 1; i <= num_ctg; i++ )
293 index_array[i] = length_array[i];
294 } //contig i with original index: index_array[i]
296 orig2new = 0;
299 READNEARBY * rd;
300 int j;
301 char * pt;
303 for ( i = 1; i <= num_ctg; i++ )
305 if ( !contig_array[i].closeReads )
307 continue;
310 if ( index_array[i] != 735 )
312 continue;
315 fprintf ( stderr, "Contig %d, len %d: \n", index_array[i], contig_array[i].length );
316 stackBackup ( contig_array[i].closeReads );
318 while ( ( rd = ( READNEARBY * ) stackPop ( contig_array[i].closeReads ) ) != NULL )
320 fprintf ( stderr, "%d\t%d\t%lld\t", rd->dis, rd->len, rd->seqStarter );
321 pt = ( char * ) darrayGet ( readSeqInGap, rd->seqStarter );
323 for ( j = 0; j < rd->len; j++ )
325 fprintf ( stderr, "%c", int2base ( ( int ) getCharInTightString ( pt, j ) ) );
328 fprintf ( stderr, "\n" );
331 stackRecover ( contig_array[i].closeReads );
335 static void initiateCtgInScaf ( CTGinSCAF * actg )
337 actg->cutTail = 0;
338 actg->cutHead = overlaplen;
339 actg->gapSeqLen = 0;
342 static int procGap ( char * line, STACK * ctgsStack )
344 char * tp;
345 int length, i, seg;
346 unsigned int ctg;
347 CTGinSCAF * ctgPt;
348 tp = strtok ( line, " " );
349 tp = strtok ( NULL, " " ); //length
350 length = atoi ( tp );
351 tp = strtok ( NULL, " " ); //seg
352 seg = atoi ( tp );
354 if ( !seg )
356 return length;
359 for ( i = 0; i < seg; i++ )
361 tp = strtok ( NULL, " " );
362 ctg = atoi ( tp );
363 MarkCtgOccu ( ctg );
364 ctgPt = ( CTGinSCAF * ) stackPush ( ctgsStack );
365 initiateCtgInScaf ( ctgPt );
366 ctgPt->ctgID = ctg;
367 ctgPt->start = 0;
368 ctgPt->end = 0;
369 ctgPt->scaftig_start = 0;
370 ctgPt->mask = 1;
373 return length;
376 static void debugging2 ( int index, STACK * ctgsStack )
378 CTGinSCAF * actg;
379 stackBackup ( ctgsStack );
380 fprintf ( stderr, ">scaffold%d\t%d 0.0\n", index, ctgsStack->item_c );
382 while ( ( actg = stackPop ( ctgsStack ) ) != NULL )
384 fprintf ( stderr, "%d\t%d\t%d\t%d\n", actg->ctgID, actg->start, actg->end, actg->scaftig_start );
387 stackRecover ( ctgsStack );
390 static int cmp_reads ( const void * a, const void * b )
392 READNEARBY * A, *B;
393 A = ( READNEARBY * ) a;
394 B = ( READNEARBY * ) b;
396 if ( A->dis > B->dis )
398 return 1;
400 else if ( A->dis == B->dis )
402 return 0;
404 else
406 return -1;
410 static void cutRdArray ( READNEARBY * rdArray, int gapStart, int gapEnd, int * count, int arrayLen, READNEARBY * cutArray )
412 int i;
413 int num = 0;
415 for ( i = 0; i < arrayLen; i++ )
417 if ( rdArray[i].dis > gapEnd )
419 break;
422 if ( ( rdArray[i].dis + rdArray[i].len ) >= gapStart )
424 cutArray[num].dis = rdArray[i].dis;
425 cutArray[num].len = rdArray[i].len;
426 cutArray[num++].seqStarter = rdArray[i].seqStarter;
430 *count = num;
433 static void outputTightStr2Visual ( FILE * foc2, int ctg, int * num, int start, int length, int outputlen, int revS )
435 int i, end, column = 0, n = *num;
436 char * tightStr = contig_array[ctg].seq;
438 if ( n == 1 ) { fprintf ( foc2, ">%d\n", index_array[ctg] ); }
439 else { fprintf ( foc2, ">%d-%d\n", index_array[ctg], n - 1 ); }
441 if ( !revS )
443 end = start + outputlen <= length ? start + outputlen : length;
445 for ( i = start; i < end; i++ )
447 fprintf ( foc2, "%c", int2base ( ( int ) getCharInTightString ( tightStr, i ) ) );
449 if ( ( ++column ) % 100 == 0 )
451 //column = 0;
452 fprintf ( foc2, "\n" );
456 if ( column % 100 != 0 )
458 fprintf ( foc2, "\n" );
461 else
463 end = length - start - outputlen - 1 >= 0 ? length - start - outputlen : 0;
465 for ( i = end; i <= length - 1 - start; i++ )
467 fprintf ( foc2, "%c", int2base ( ( int ) getCharInTightString ( tightStr, i ) ) );
469 if ( ( ++column ) % 100 == 0 )
471 fprintf ( foc2, "\n" );
472 //column = 0;
476 if ( column % 100 != 0 )
478 fprintf ( foc2, "\n" );
483 void outputTightStr ( FILE * fp, char * tightStr, int start, int length, int outputlen, int revS, int * col )
485 int i;
486 int end;
487 int column = *col;
489 if ( !revS )
491 end = start + outputlen <= length ? start + outputlen : length;
493 for ( i = start; i < end; i++ )
495 fprintf ( fp, "%c", int2base ( ( int ) getCharInTightString ( tightStr, i ) ) );
497 if ( ( ++column ) % 100 == 0 )
499 //column = 0;
500 fprintf ( fp, "\n" );
504 else
506 end = length - start - outputlen - 1 >= 0 ? length - start - outputlen : 0;
508 for ( i = length - 1 - start; i >= end; i-- )
510 fprintf ( fp, "%c", int2compbase ( ( int ) getCharInTightString ( tightStr, i ) ) );
512 if ( ( ++column ) % 100 == 0 )
514 fprintf ( fp, "\n" );
515 //column = 0;
520 *col = column;
523 static void outputTightStr2 ( char * tightStr, char * scaff, int start, int length, int outputlen, int revS, int * col )
525 int i;
526 int end;
527 int column = *col;
528 char a;
530 if ( !revS )
532 end = start + outputlen <= length ? start + outputlen : length;
534 for ( i = start; i < end; i++ )
536 a = int2base ( ( int ) getCharInTightString ( tightStr, i ) );
537 // fprintf (fp, "%c", int2base ((int) getCharInTightString (tightStr, i)));
538 scaff[column++] = a;
541 else
543 end = length - start - outputlen - 1 >= 0 ? length - start - outputlen : 0;
545 for ( i = length - 1 - start; i >= end; i-- )
547 a = int2compbase ( ( int ) getCharInTightString ( tightStr, i ) );
548 // fprintf (fp, "%c", int2compbase ((int) getCharInTightString (tightStr, i)));
549 scaff[column++] = a;
553 *col = column;
556 static void outputTightStrLowerCase2Visual ( FILE * foc2, int gapNum, char * tightStr, int start, int length, int outputlen )
558 int i, end, column = 0;
559 end = start + outputlen <= length ? start + outputlen : length;
560 fprintf ( foc2, ">%d-0\n", gapNum );
562 for ( i = start; i < end; i++ )
564 fprintf ( foc2, "%c", "actg"[ ( int ) getCharInTightString ( tightStr, i )] );
566 if ( ( ++column ) % 100 == 0 )
568 //column = 0;
569 fprintf ( foc2, "\n" );
573 if ( column % 100 != 0 )
575 fprintf ( foc2, "\n" );
579 static void outputTightStrLowerCase ( FILE * fp, char * tightStr, int start, int length, int outputlen, int revS, int * col )
581 int i;
582 int end;
583 int column = *col;
585 if ( !revS )
587 end = start + outputlen <= length ? start + outputlen : length;
589 for ( i = start; i < end; i++ )
591 fprintf ( fp, "%c", "actg"[ ( int ) getCharInTightString ( tightStr, i )] );
593 if ( ( ++column ) % 100 == 0 )
595 //column = 0;
596 fprintf ( fp, "\n" );
600 else
602 end = length - start - outputlen - 1 >= 0 ? length - start - outputlen : 0;
604 for ( i = length - 1 - start; i >= end; i-- )
606 fprintf ( fp, "%c", "tgac"[ ( int ) getCharInTightString ( tightStr, i )] );
608 if ( ( ++column ) % 100 == 0 )
610 fprintf ( fp, "\n" );
611 //column = 0;
616 *col = column;
619 static void outputTightStrLowerCase2 ( char * tightStr, char * scaff, int start, int length, int outputlen, int revS, int * col )
621 int i;
622 int end;
623 int column = *col;
624 char a;
626 if ( !revS )
628 end = start + outputlen <= length ? start + outputlen : length;
630 for ( i = start; i < end; i++ )
632 a = "actg"[ ( int ) getCharInTightString ( tightStr, i )];
633 // fprintf (fp, "%c", "actg"[(int) getCharInTightString (tightStr, i)]);
634 scaff[column++] = a;
637 else
639 end = length - start - outputlen - 1 >= 0 ? length - start - outputlen : 0;
641 for ( i = length - 1 - start; i >= end; i-- )
643 a = "tgac"[ ( int ) getCharInTightString ( tightStr, i )];
644 // fprintf (fp, "%c", "tgac"[(int) getCharInTightString (tightStr, i)]);
645 scaff[column++] = a;
649 *col = column;
652 static void outputNs ( FILE * fp, int gapN, int * col )
654 int i, column = *col;
656 for ( i = 0; i < gapN; i++ )
658 fprintf ( fp, "N" );
660 if ( ( ++column ) % 100 == 0 )
662 //column = 0;
663 fprintf ( fp, "\n" );
667 *col = column;
670 static void outputNs2 ( char * scaff, int gapN, int * col )
672 int i, column = *col;
674 for ( i = 0; i < gapN; i++ )
676 scaff[column] = 'N';
677 column++;
680 *col = column;
683 static void outputGapInfo ( unsigned int ctg1, unsigned int ctg2 )
685 unsigned int bal_ctg1 = getTwinCtg ( ctg1 );
686 unsigned int bal_ctg2 = getTwinCtg ( ctg2 );
688 if ( isLargerThanTwin ( ctg1 ) )
690 fprintf ( stderr, "%d\t", index_array[bal_ctg1] );
692 else
694 fprintf ( stderr, "%d\t", index_array[ctg1] );
697 if ( isLargerThanTwin ( ctg2 ) )
699 fprintf ( stderr, "%d\n", index_array[bal_ctg2] );
701 else
703 fprintf ( stderr, "%d\n", index_array[ctg2] );
707 static void output1gap ( FILE * fo, int scafIndex, CTGinSCAF * prevCtg, CTGinSCAF * actg, DARRAY * gapSeqArray )
709 unsigned int ctg1, bal_ctg1, length1;
710 int start1, outputlen1;
711 unsigned int ctg2, bal_ctg2, length2;
712 int start2, outputlen2;
713 char * pt;
714 int column = 0;
715 ctg1 = prevCtg->ctgID;
716 bal_ctg1 = getTwinCtg ( ctg1 );
717 start1 = prevCtg->cutHead;
718 length1 = contig_array[ctg1].length + overlaplen;
720 if ( length1 - prevCtg->cutTail - start1 > CTGappend )
722 outputlen1 = CTGappend;
723 start1 = length1 - prevCtg->cutTail - outputlen1;
725 else
727 outputlen1 = length1 - prevCtg->cutTail - start1;
730 ctg2 = actg->ctgID;
731 bal_ctg2 = getTwinCtg ( ctg2 );
732 start2 = actg->cutHead;
733 length2 = contig_array[ctg2].length + overlaplen;
735 if ( length2 - actg->cutTail - start2 > CTGappend )
737 outputlen2 = CTGappend;
739 else
741 outputlen2 = length2 - actg->cutTail - start2;
744 if ( isLargerThanTwin ( ctg1 ) )
746 fprintf ( fo, ">S%d_C%d_L%d_G%d", scafIndex, index_array[bal_ctg1], outputlen1, prevCtg->gapSeqLen );
748 else
750 fprintf ( fo, ">S%d_C%d_L%d_G%d", scafIndex, index_array[ctg1], outputlen1, prevCtg->gapSeqLen );
753 if ( isLargerThanTwin ( ctg2 ) )
755 fprintf ( fo, "_C%d_L%d\t", index_array[bal_ctg2], outputlen2 );
757 else
759 fprintf ( fo, "_C%d_L%d\t", index_array[ctg2], outputlen2 );
762 fprintf ( fo, "%d\n", start2 );
764 if ( contig_array[ctg1].seq )
766 outputTightStr ( fo, contig_array[ctg1].seq, start1, length1, outputlen1, 0, &column );
768 else if ( contig_array[bal_ctg1].seq )
770 outputTightStr ( fo, contig_array[bal_ctg1].seq, start1, length1, outputlen1, 1, &column );
773 pt = ( char * ) darrayPut ( gapSeqArray, prevCtg->gapSeqOffset );
774 outputTightStrLowerCase ( fo, pt, 0, prevCtg->gapSeqLen, prevCtg->gapSeqLen, 0, &column );
776 if ( contig_array[ctg2].seq )
778 outputTightStr ( fo, contig_array[ctg2].seq, start2, length2, outputlen2, 0, &column );
780 else if ( contig_array[bal_ctg2].seq )
782 outputTightStr ( fo, contig_array[bal_ctg2].seq, start2, length2, outputlen2, 1, &column );
785 if ( column % 100 != 0 )
787 fprintf ( fo, "\n" );
791 static void outputGapSeq ( FILE * fo, int index, STACK * ctgsStack, DARRAY * gapSeqArray )
793 CTGinSCAF * actg, *prevCtg = NULL;
794 stackRecover ( ctgsStack );
795 // fprintf (fo, ">scaffold%d\n", index);
797 while ( ( actg = stackPop ( ctgsStack ) ) != NULL )
799 /* if (prevCtg)
801 if (actg->scaftig_start)
803 fprintf (fo, "0\t%d\t%d\n", prevCtg->mask, actg->mask);
805 else
807 fprintf (fo, "1\t%d\t%d\n", prevCtg->mask, actg->mask);
811 if ( prevCtg && prevCtg->gapSeqLen > 0 )
812 { output1gap ( fo, index, prevCtg, actg, gapSeqArray ); }
814 prevCtg = actg;
818 static void outputScafSeq ( FILE * fo, FILE * foc, FILE * foc2, FILE * fo3, int index, STACK * ctgsStack, DARRAY * gapSeqArray )
820 CTGinSCAF * actg, *prevCtg = NULL;
821 unsigned int ctg, bal_ctg, ctg_out, length;
822 int start, outputlen, gapN;
823 char * pt;
824 int column = 0;
825 long long cvgSum = 0;
826 int lenSum = 0;
827 int i, t, lu_len = 0, lu_end = 0;
828 unsigned int ctg_start_pos = 0;
829 char strand;
830 unsigned int * pos_start = ( unsigned int * ) ckalloc ( 1000000 * sizeof ( unsigned int ) );
831 unsigned int * pos_end = ( unsigned int * ) ckalloc ( 1000000 * sizeof ( unsigned int ) );
832 // char index_contig[num_ctg][20];
833 char ** index_contig = ( char ** ) ckalloc ( 1000000 * sizeof ( char * ) );
835 for ( i = 0; i < 1000000; i++ )
836 { index_contig[i] = ( char * ) ckalloc ( 20 * sizeof ( char ) ); }
838 char * orien_array;
839 orien_array = ( char * ) ckalloc ( 1000000 * sizeof ( char ) );
840 // scaffBuffer = (char *) ckalloc (300000000 * sizeof (char));
841 stackRecover ( ctgsStack );
843 while ( ( actg = stackPop ( ctgsStack ) ) != NULL )
845 if ( ! ( contig_array[actg->ctgID].cvg > 0 ) )
847 continue;
850 lenSum += contig_array[actg->ctgID].length;
851 cvgSum += contig_array[actg->ctgID].length * contig_array[actg->ctgID].cvg;
854 if ( lenSum > 0 )
856 fprintf ( fo, ">scaffold%d %4.1f\n", index, ( double ) cvgSum / lenSum );
858 else
860 fprintf ( fo, ">scaffold%d 0.0\n", index );
863 fprintf ( foc, ">scaffold%d\n", index );
864 stackRecover ( ctgsStack );
866 while ( ( actg = stackPop ( ctgsStack ) ) != NULL )
868 ctg = actg->ctgID;
869 bal_ctg = getTwinCtg ( ctg );
870 length = contig_array[ctg].length + overlaplen;
872 if ( prevCtg && actg->scaftig_start )
874 gapN = actg->start - prevCtg->start - contig_array[prevCtg->ctgID].length;
875 gapN = gapN > 0 ? gapN : 1;
876 outputNs ( fo, gapN, &column );
877 ctg_start_pos += gapN;
878 //outputGapInfo(prevCtg->ctgID,ctg);
879 Ncounter++;
882 if ( !prevCtg )
884 start = 0;
886 else
888 start = actg->cutHead;
891 outputlen = length - start - actg->cutTail;
893 if ( contig_array[ctg].seq )
895 outputTightStr ( fo, contig_array[ctg].seq, start, length, outputlen, 0, &column );
896 lu_end = start + outputlen > length ? length : start + outputlen;
897 lu_len = lu_end - start;
898 strand = '+';
899 fprintf ( foc, "%d\t", index_array[ctg] );
901 else if ( contig_array[bal_ctg].seq )
903 outputTightStr ( fo, contig_array[bal_ctg].seq, start, length, outputlen, 1, &column );
904 lu_end = length - start - outputlen - 1 >= 0 ? length - start - outputlen : 0;
905 lu_len = length - start - lu_end;
906 strand = '-';
907 fprintf ( foc, "%d\t", index_array[bal_ctg] );
910 fprintf ( foc, "%u\t%c\t%d\n", ctg_start_pos, strand, lu_len );
911 ctg_start_pos += lu_len;
913 if ( actg->gapSeqLen < 1 )
915 prevCtg = actg;
916 continue;
919 pt = ( char * ) darrayPut ( gapSeqArray, actg->gapSeqOffset );
920 outputTightStrLowerCase ( fo, pt, 0, actg->gapSeqLen, actg->gapSeqLen, 0, &column );
921 ctg_start_pos = ctg_start_pos + actg->gapSeqLen;
922 prevCtg = actg;
925 if ( column % 100 != 0 )
927 fprintf ( fo, "\n" );
930 if ( visual )
932 scaffBuffer = ( char * ) ckalloc ( ( ctg_start_pos + 5 ) * sizeof ( char ) );
933 prevCtg = NULL;
934 column = 0;
935 ctg_start_pos = 0;
936 lenSum = 0;
937 stackRecover ( ctgsStack );
939 while ( ( actg = stackPop ( ctgsStack ) ) != NULL )
941 ctg = actg->ctgID;
942 bal_ctg = getTwinCtg ( ctg );
943 length = contig_array[ctg].length + overlaplen;
945 if ( prevCtg && actg->scaftig_start )
947 gapN = actg->start - prevCtg->start - contig_array[prevCtg->ctgID].length;
948 gapN = gapN > 0 ? gapN : 1;
949 outputNs2 ( scaffBuffer, gapN, &column );
950 ctg_start_pos += gapN;
951 // Ncounter++;
954 if ( !prevCtg )
956 start = 0;
958 else
960 start = actg->cutHead;
963 outputlen = length - start - actg->cutTail;
965 if ( contig_array[ctg].seq )
967 t = ++contig_index_array[index_array[ctg]];
968 outputTightStr2 ( contig_array[ctg].seq, scaffBuffer, start, length, outputlen, 0, &column );
969 lu_end = start + outputlen > length ? length : start + outputlen;
970 lu_len = lu_end - start;
971 strand = '+';
972 // fprintf (foc, "%d\t", index_array[ctg]);
973 lenSum++;
975 if ( ctg_start_pos - start >= 0 )
977 pos_start[lenSum] = ctg_start_pos - start;
978 pos_end[lenSum] = ctg_start_pos + length - start;
979 orien_array[lenSum] = '+';
981 if ( t == 1 ) { sprintf ( index_contig[lenSum], "%u", index_array[ctg] ); }
982 else { sprintf ( index_contig[lenSum], "%u-%d", index_array[ctg], t - 1 ); }
984 fprintf ( fo3, "{AFG\nacc:%s\nclr:0,%d\n}\n", ( index_contig[lenSum] ), length );
985 outputTightStr2Visual ( foc2, ctg, & ( contig_index_array[index_array[ctg]] ), 0, length, length, 0 );
987 else
989 pos_start[lenSum] = 0;
990 pos_end[lenSum] = ctg_start_pos + length - start;
991 orien_array[lenSum] = '+';
993 if ( t == 1 ) { sprintf ( index_contig[lenSum], "%u", index_array[ctg] ); }
994 else { sprintf ( index_contig[lenSum], "%u-%d", index_array[ctg], t - 1 ); }
996 fprintf ( fo3, "{AFG\nacc:%s\nclr:0,%d\n}\n", ( index_contig[lenSum] ), length );
997 outputTightStr2Visual ( foc2, ctg, & ( contig_index_array[index_array[ctg]] ), start - ctg_start_pos, length, length - start + ctg_start_pos, 0 );
1000 else if ( contig_array[bal_ctg].seq )
1002 t = ++contig_index_array[index_array[bal_ctg]];
1003 outputTightStr2 ( contig_array[bal_ctg].seq, scaffBuffer, start, length, outputlen, 1, &column );
1004 lu_end = length - start - outputlen - 1 >= 0 ? length - start - outputlen : 0;
1005 lu_len = length - start - lu_end;
1006 strand = '-';
1007 // fprintf (foc, "%d\t", index_array[bal_ctg]);
1008 lenSum++;
1010 if ( ctg_start_pos - start >= 0 )
1012 pos_start[lenSum] = ctg_start_pos - start;
1013 pos_end[lenSum] = ctg_start_pos + length - start;
1014 orien_array[lenSum] = '-';
1016 if ( t == 1 ) { sprintf ( index_contig[lenSum], "%u", index_array[bal_ctg] ); }
1017 else { sprintf ( index_contig[lenSum], "%u-%d", index_array[bal_ctg], t - 1 ); }
1019 fprintf ( fo3, "{AFG\nacc:%s\nclr:0,%d\n}\n", ( index_contig[lenSum] ), length );
1020 outputTightStr2Visual ( foc2, bal_ctg, & ( contig_index_array[index_array[bal_ctg]] ), 0, length, length, 1 );
1022 else
1024 pos_start[lenSum] = ctg_start_pos;
1025 pos_end[lenSum] = ctg_start_pos + lu_len;
1026 orien_array[lenSum] = '-';
1028 if ( t == 1 ) { sprintf ( index_contig[lenSum], "%u", index_array[bal_ctg] ); }
1029 else { sprintf ( index_contig[lenSum], "%u-%d", index_array[bal_ctg], t - 1 ); }
1031 fprintf ( fo3, "{AFG\nacc:%s\nclr:0,%d\n}\n", ( index_contig[lenSum] ), length );
1032 outputTightStr2Visual ( foc2, bal_ctg, & ( contig_index_array[index_array[bal_ctg]] ), start, length, outputlen, 1 );
1036 // fprintf (foc, "%u\t%c\t%d\n", ctg_start_pos, strand, lu_len);
1037 ctg_start_pos += lu_len;
1039 if ( actg->gapSeqLen < 1 )
1041 prevCtg = actg;
1042 continue;
1045 pt = ( char * ) darrayPut ( gapSeqArray, actg->gapSeqOffset );
1046 outputTightStrLowerCase2 ( pt, scaffBuffer, 0, actg->gapSeqLen, actg->gapSeqLen, 0, &column );
1047 outputTightStrLowerCase2Visual ( foc2, gapNum, pt, 0, actg->gapSeqLen, actg->gapSeqLen );
1048 fprintf ( fo3, "{AFG\nacc:%d-0\nclr:0,%d\n}\n", gapNum, actg->gapSeqLen );
1049 lenSum++;
1050 pos_start[lenSum] = ctg_start_pos;
1051 pos_end[lenSum] = ctg_start_pos + actg->gapSeqLen;
1052 orien_array[lenSum] = '+';
1053 sprintf ( index_contig[lenSum], "%d-0", gapNum );
1054 gapNum++;
1055 ctg_start_pos = ctg_start_pos + actg->gapSeqLen;
1056 prevCtg = actg;
1059 scaffNum++;
1060 fprintf ( fo3, "{CCO\nacc:%d\npla:P\nlen:%u\ncns:\n", scaffNum, ctg_start_pos );
1062 for ( i = 0; i < ctg_start_pos; i++ )
1064 if ( i != 0 && i % 100 == 0 && i < ctg_start_pos - 1 ) { fprintf ( fo3, "\n" ); }
1066 fprintf ( fo3, "%c", scaffBuffer[i] );
1069 fprintf ( fo3, "\n.\nqlt:\n" );
1071 for ( i = 0; i < ctg_start_pos; i++ )
1073 if ( i != 0 && i % 100 == 0 && i < ctg_start_pos - 1 ) { fprintf ( fo3, "\n" ); }
1075 fprintf ( fo3, "D" );
1078 fprintf ( fo3, "\n.\nnpc:%d\n", lenSum );
1080 for ( i = 1; i <= lenSum; i++ )
1082 if ( orien_array[i] == '+' ) { fprintf ( fo3, "{MPS\ntyp:R\nmid:%s\nsrc:\n.\npos:%u,%u\ndln:0\ndel:\n}\n", ( index_contig[i] ), pos_start[i], pos_end[i] ); }
1084 if ( orien_array[i] == '-' ) { fprintf ( fo3, "{MPS\ntyp:R\nmid:%s\nsrc:\n.\npos:%u,%u\ndln:0\ndel:\n}\n", ( index_contig[i] ), pos_end[i], pos_start[i] ); }
1087 fprintf ( fo3, "}\n" );
1088 free ( ( void * ) scaffBuffer );
1091 free ( ( void * ) pos_start );
1092 free ( ( void * ) pos_end );
1093 free ( ( void * ) orien_array );
1095 for ( i = 0; i < 1000000; i++ )
1097 free ( ( void * ) index_contig[i] );
1100 free ( ( void * ) index_contig );
1103 static void fill1scaf ( int index, STACK * ctgsStack, int thrdID );
1104 static void check1scaf ( int t, int thrdID )
1106 if ( flagBuf[t] )
1108 return;
1111 boolean late = 0;
1112 pthread_mutex_lock ( &mutex );
1114 if ( !flagBuf[t] )
1116 flagBuf[t] = 1;
1117 thrdNoBuf[t] = thrdID;
1119 else
1121 late = 1;
1124 pthread_mutex_unlock ( &mutex );
1126 if ( late )
1128 return;
1131 counters[thrdID]++;
1132 fill1scaf ( scafCounter + t + 1, ctgStackBuffer[t], thrdID );
1135 static void fill1scaf ( int index, STACK * ctgsStack, int thrdID )
1137 CTGinSCAF * actg, *prevCtg = NULL;
1138 READNEARBY * rdArray, *rdArray4gap, *rd;
1139 int numRd = 0, count, maxGLen = 0;
1140 unsigned int ctg, bal_ctg;
1141 STACK * rdStack;
1143 while ( ( actg = stackPop ( ctgsStack ) ) != NULL )
1145 if ( prevCtg )
1147 maxGLen = maxGLen < ( actg->start - prevCtg->end ) ? ( actg->start - prevCtg->end ) : maxGLen;
1150 ctg = actg->ctgID;
1151 bal_ctg = getTwinCtg ( ctg );
1153 if ( actg->mask )
1155 prevCtg = actg;
1156 continue;
1159 if ( contig_array[ctg].closeReads )
1161 numRd += contig_array[ctg].closeReads->item_c;
1163 else if ( contig_array[bal_ctg].closeReads )
1165 numRd += contig_array[bal_ctg].closeReads->item_c;
1168 prevCtg = actg;
1171 if ( numRd < 1 )
1173 return;
1176 rdArray = ( READNEARBY * ) ckalloc ( numRd * sizeof ( READNEARBY ) );
1177 rdArray4gap = ( READNEARBY * ) ckalloc ( numRd * sizeof ( READNEARBY ) );
1178 //fprintf(stderr,"scaffold%d reads4gap %d\n",index,numRd);
1179 // collect reads appended to contigs in this scaffold
1180 int numRd2 = 0;
1181 stackRecover ( ctgsStack );
1183 while ( ( actg = stackPop ( ctgsStack ) ) != NULL )
1185 ctg = actg->ctgID;
1186 bal_ctg = getTwinCtg ( ctg );
1188 if ( actg->mask )
1190 continue;
1193 if ( contig_array[ctg].closeReads )
1195 rdStack = contig_array[ctg].closeReads;
1197 else if ( contig_array[bal_ctg].closeReads )
1199 rdStack = contig_array[bal_ctg].closeReads;
1201 else
1203 continue;
1206 stackBackup ( rdStack );
1208 while ( ( rd = ( READNEARBY * ) stackPop ( rdStack ) ) != NULL )
1210 rdArray[numRd2].len = rd->len;
1211 rdArray[numRd2].seqStarter = rd->seqStarter;
1213 if ( isSmallerThanTwin ( ctg ) )
1215 rdArray[numRd2++].dis = actg->start - overlaplen + rd->dis;
1217 else
1218 { rdArray[numRd2++].dis = actg->start - overlaplen + contig_array[ctg].length - rd->len - rd->dis; }
1221 stackRecover ( rdStack );
1224 if ( numRd2 != numRd )
1226 fprintf ( stderr, "##reads numbers doesn't match, %d vs %d when scaffold %d.\n", numRd, numRd2, index );
1229 qsort ( rdArray, numRd, sizeof ( READNEARBY ), cmp_reads );
1230 //fill gap one by one
1231 int gapStart, gapEnd;
1232 int numIn = 0;
1233 boolean flag;
1234 int buffer_size = maxReadLen > 100 ? maxReadLen : 100;
1235 int maxGSLen = maxGLen + GLDiff < 10 ? 10 : maxGLen + GLDiff;
1236 //fprintf(stderr,"maxGlen %d, maxGSlen %d\n",maxGLen,maxGSLen);
1237 char * seqGap = ( char * ) ckalloc ( maxGSLen * sizeof ( char ) ); // temp array for gap sequence
1238 Kmer * kmerCtg1 = ( Kmer * ) ckalloc ( buffer_size * sizeof ( Kmer ) );
1239 Kmer * kmerCtg2 = ( Kmer * ) ckalloc ( buffer_size * sizeof ( Kmer ) );
1240 char * seqCtg1 = ( char * ) ckalloc ( buffer_size * sizeof ( char ) );
1241 char * seqCtg2 = ( char * ) ckalloc ( buffer_size * sizeof ( char ) );
1242 prevCtg = NULL;
1243 stackRecover ( ctgsStack );
1245 while ( ( actg = stackPop ( ctgsStack ) ) != NULL )
1247 if ( !prevCtg || !actg->scaftig_start )
1249 prevCtg = actg;
1250 continue;
1253 gapStart = prevCtg->end - 100;
1254 gapEnd = actg->start - overlaplen + 100;
1255 cutRdArray ( rdArray, gapStart, gapEnd, &count, numRd, rdArray4gap );
1256 numIn += count;
1258 if(!count){
1259 prevCtg = actg;
1260 continue;
1263 int overlap;
1265 for ( overlap = overlaplen; overlap > 14; overlap -= 2 )
1267 flag = localGraph ( rdArray4gap, count, prevCtg, actg, overlaplen, kmerCtg1, kmerCtg2, overlap, darrayBuf[thrdID], seqCtg1, seqCtg2, seqGap );
1269 //free_kmerset(kmerSet);
1271 if ( flag == 1 )
1274 fprintf(stderr,"Between ctg %d and %d, Found with %d\n",prevCtg->ctgID
1275 ,actg->ctgID,overlap);
1277 break;
1282 if(count==0)
1283 printf("Gap closed without reads\n");
1284 if(!flag)
1285 fprintf(stderr,"Between ctg %d and %d, NO routes found\n",prevCtg->ctgID,actg->ctgID);
1287 prevCtg = actg;
1290 //fprintf(stderr,"____scaffold%d reads in gap %d\n",index,numIn);
1291 free ( ( void * ) seqGap );
1292 free ( ( void * ) kmerCtg1 );
1293 free ( ( void * ) kmerCtg2 );
1294 free ( ( void * ) seqCtg1 );
1295 free ( ( void * ) seqCtg2 );
1296 free ( ( void * ) rdArray );
1297 free ( ( void * ) rdArray4gap );
1300 static void reverseStack ( STACK * dStack, STACK * sStack )
1302 CTGinSCAF * actg, *ctgPt;
1303 emptyStack ( dStack );
1305 while ( ( actg = ( CTGinSCAF * ) stackPop ( sStack ) ) != NULL )
1307 ctgPt = ( CTGinSCAF * ) stackPush ( dStack );
1308 ctgPt->ctgID = actg->ctgID;
1309 ctgPt->start = actg->start;
1310 ctgPt->end = actg->end;
1311 ctgPt->scaftig_start = actg->scaftig_start;
1312 ctgPt->mask = actg->mask;
1313 ctgPt->cutHead = actg->cutHead;
1314 ctgPt->cutTail = actg->cutTail;
1315 ctgPt->gapSeqLen = actg->gapSeqLen;
1316 ctgPt->gapSeqOffset = actg->gapSeqOffset;
1319 stackBackup ( dStack );
1322 #ifdef MER127
1323 static Kmer tightStr2Kmer ( char * tightStr, int start, int length, int revS )
1325 int i;
1326 Kmer word;
1327 word.high1 = word.low1 = word.high2 = word.low2 = 0;
1329 if ( !revS )
1331 if ( start + overlaplen > length )
1333 fprintf ( stderr, "The tightStr2Kmer A: no enough bases for kmer.\n" );
1334 return word;
1337 for ( i = start; i < start + overlaplen; i++ )
1339 word = KmerLeftBitMoveBy2 ( word );
1340 word.low2 |= getCharInTightString ( tightStr, i );
1343 else
1345 if ( length - start - overlaplen < 0 )
1347 fprintf ( stderr, "The tightStr2Kmer B: no enough bases for kmer.\n" );
1348 return word;
1351 for ( i = length - 1 - start; i > length - 1 - start - overlaplen; i-- )
1353 word = KmerLeftBitMoveBy2 ( word );
1354 word.low2 |= int_comp ( getCharInTightString ( tightStr, i ) );
1358 return word;
1361 static Kmer maxKmer ()
1363 Kmer word;
1364 word.high1 = word.low1 = word.high2 = word.low2 = 0;
1365 int i;
1367 for ( i = 0; i < overlaplen; i++ )
1369 word = KmerLeftBitMoveBy2 ( word );
1370 word.low2 |= 0x3;
1373 return word;
1375 #else
1376 static Kmer tightStr2Kmer ( char * tightStr, int start, int length, int revS )
1378 int i;
1379 Kmer word;
1380 word.high = word.low = 0;
1382 if ( !revS )
1384 if ( start + overlaplen > length )
1386 fprintf ( stderr, "The tightStr2Kmer A: no enough bases for kmer.\n" );
1387 return word;
1390 for ( i = start; i < start + overlaplen; i++ )
1392 word = KmerLeftBitMoveBy2 ( word );
1393 word.low |= getCharInTightString ( tightStr, i );
1396 else
1398 if ( length - start - overlaplen < 0 )
1400 fprintf ( stderr, "The tightStr2Kmer B: no enough bases for kmer.\n" );
1401 return word;
1404 for ( i = length - 1 - start; i > length - 1 - start - overlaplen; i-- )
1406 word = KmerLeftBitMoveBy2 ( word );
1407 word.low |= int_comp ( getCharInTightString ( tightStr, i ) );
1411 return word;
1414 static Kmer maxKmer ()
1416 Kmer word;
1417 word.high = word.low = 0;
1418 int i;
1420 for ( i = 0; i < overlaplen; i++ )
1422 word = KmerLeftBitMoveBy2 ( word );
1423 word.low |= 0x3;
1426 return word;
1428 #endif
1429 static int contigCatch ( unsigned int prev_ctg, unsigned int ctg )
1431 if ( contig_array[prev_ctg].length == 0 || contig_array[ctg].length == 0 )
1433 return 0;
1436 Kmer kmerAtEnd, kmerAtStart;
1437 Kmer MaxKmer;
1438 unsigned int bal_ctg1 = getTwinCtg ( prev_ctg );
1439 unsigned int bal_ctg2 = getTwinCtg ( ctg );
1440 int i, start;
1441 int len1 = contig_array[prev_ctg].length + overlaplen;
1442 int len2 = contig_array[ctg].length + overlaplen;
1443 start = contig_array[prev_ctg].length;
1445 if ( contig_array[prev_ctg].seq )
1447 kmerAtEnd = tightStr2Kmer ( contig_array[prev_ctg].seq, start, len1, 0 );
1449 else
1451 kmerAtEnd = tightStr2Kmer ( contig_array[bal_ctg1].seq, start, len1, 1 );
1454 start = 0;
1456 if ( contig_array[ctg].seq )
1458 kmerAtStart = tightStr2Kmer ( contig_array[ctg].seq, start, len2, 0 );
1460 else
1462 kmerAtStart = tightStr2Kmer ( contig_array[bal_ctg2].seq, start, len2, 1 );
1465 MaxKmer = MAXKMER;
1467 for ( i = 0; i < 10; i++ )
1469 if ( KmerEqual ( kmerAtStart, kmerAtEnd ) )
1471 break;
1474 MaxKmer = KmerRightBitMoveBy2 ( MaxKmer );
1475 kmerAtEnd = KmerAnd ( kmerAtEnd, MaxKmer );
1476 kmerAtStart = KmerRightBitMoveBy2 ( kmerAtStart );
1479 if ( i < 10 )
1481 return overlaplen - i;
1483 else
1485 return 0;
1489 static void initStackBuf ( STACK ** ctgStackBuffer, int scafBufSize )
1491 int i;
1493 for ( i = 0; i < scafBufSize; i++ )
1495 flagBuf[i] = 1;
1496 ctgStackBuffer[i] = ( STACK * ) createStack ( 100, sizeof ( CTGinSCAF ) );
1499 static void freeStackBuf ( STACK ** ctgStackBuffer, int scafBufSize )
1501 int i;
1503 for ( i = 0; i < scafBufSize; i++ )
1505 freeStack ( ctgStackBuffer[i] );
1509 static void threadRoutine ( void * para )
1511 PARAMETER * prm;
1512 int i;
1513 prm = ( PARAMETER * ) para;
1515 //printf("%dth thread with threadID %d, hash_table %p\n",id,prm.threadID,prm.hash_table);
1516 while ( 1 )
1518 if ( * ( prm->selfSignal ) == 1 )
1520 emptyDarray ( darrayBuf[prm->threadID] );
1522 for ( i = 0; i < scafInBuf; i++ )
1524 check1scaf ( i, prm->threadID );
1527 * ( prm->selfSignal ) = 0;
1529 else if ( * ( prm->selfSignal ) == 2 )
1531 * ( prm->selfSignal ) = 0;
1532 break;
1535 usleep ( 1 );
1539 static void creatThrds ( pthread_t * threads, PARAMETER * paras )
1541 unsigned char i;
1542 int temp;
1544 for ( i = 0; i < thrd_num; i++ )
1546 if ( ( temp = pthread_create ( &threads[i], NULL, ( void * ) threadRoutine, & ( paras[i] ) ) ) != 0 )
1548 fprintf ( stderr, "Create threads failed.\n" );
1549 exit ( 1 );
1553 fprintf ( stderr, "%d thread(s) initialized.\n", thrd_num );
1556 static void sendWorkSignal ( unsigned char SIG, unsigned char * thrdSignals )
1558 int t;
1560 for ( t = 0; t < thrd_num; t++ )
1562 thrdSignals[t + 1] = SIG;
1565 while ( 1 )
1567 usleep ( 10 );
1569 for ( t = 0; t < thrd_num; t++ )
1570 if ( thrdSignals[t + 1] )
1572 break;
1575 if ( t == thrd_num )
1577 break;
1582 static void thread_wait ( pthread_t * threads )
1584 int i;
1586 for ( i = 0; i < thrd_num; i++ )
1587 if ( threads[i] != 0 )
1589 pthread_join ( threads[i], NULL );
1593 static void outputSeqs ( FILE * fo, FILE * foc, FILE * foc2, FILE * fo2, FILE * fo3, int scafInBuf )
1595 int i, thrdID;
1597 for ( i = 0; i < scafInBuf; i++ )
1599 thrdID = thrdNoBuf[i];
1600 outputScafSeq ( fo, foc, foc2, fo3, scafCounter + i + 1, ctgStackBuffer[i], darrayBuf[thrdID] );
1601 outputGapSeq ( fo2, scafCounter + i + 1, ctgStackBuffer[i], darrayBuf[thrdID] );
1605 static void MaskContig ( unsigned int ctg )
1607 contig_array[ctg].mask = 1;
1608 contig_array[getTwinCtg ( ctg )].mask = 1;
1611 static void MarkCtgOccu ( unsigned int ctg )
1613 contig_array[ctg].flag = 1;
1614 contig_array[getTwinCtg ( ctg )].flag = 1;
1617 static void output_ctg ( unsigned int ctg, FILE * fo )
1619 if ( contig_array[ctg].length < 1 )
1621 return;
1624 int len;
1625 unsigned int bal_ctg = getTwinCtg ( ctg );
1626 len = contig_array[ctg].length + overlaplen;
1627 int col = 0;
1629 if ( contig_array[ctg].seq )
1631 fprintf ( fo, ">C%d %4.1f\n", ctg, ( double ) contig_array[ctg].cvg );
1632 outputTightStr ( fo, contig_array[ctg].seq, 0, len, len, 0, &col );
1634 else if ( contig_array[bal_ctg].seq )
1636 fprintf ( fo, ">C%d %4.1f\n", bal_ctg, ( double ) contig_array[ctg].cvg );
1637 outputTightStr ( fo, contig_array[bal_ctg].seq, 0, len, len, 0, &col );
1640 contig_array[ctg].flag = 1;
1641 contig_array[bal_ctg].flag = 1;
1643 if ( len % 100 != 0 )
1645 fprintf ( fo, "\n" );
1649 void prlReadsCloseGap ( char * graphfile )
1651 //thrd_num=1;
1652 if ( fillGap )
1654 boolean flag;
1655 fprintf ( stderr, "\nStart to load reads for gap filling. %d length discrepancy is allowed.\n", GLDiff );
1656 fprintf ( stderr, "...\n" );
1657 flag = loadReads4gap ( graphfile );
1659 if ( !flag )
1661 return;
1665 if ( orig2new )
1667 convertIndex ();
1668 orig2new = 0;
1671 FILE * fp, *fo, *fo2, *fo3 = NULL, *foc, *foc2 = NULL;
1672 char line[1024];
1673 CTGinSCAF * actg;
1674 STACK * ctgStack, *aStack;
1675 int index = 0, offset = 0, counter, overallLen;
1676 int i, starter, prev_start, gapLen, catchable, scafnum;
1677 unsigned int ctg, prev_ctg = 0;
1678 boolean IsPrevGap;
1679 pthread_t threads[thrd_num];
1680 unsigned char thrdSignal[thrd_num + 1];
1681 PARAMETER paras[thrd_num];
1683 for ( ctg = 1; ctg <= num_ctg; ctg++ )
1685 contig_array[ctg].flag = 0;
1688 MAXKMER = maxKmer ();
1689 ctgStack = ( STACK * ) createStack ( 1000, sizeof ( CTGinSCAF ) );
1690 sprintf ( line, "%s.scaf_gap", graphfile );
1691 fp = ckopen ( line, "r" );
1692 sprintf ( line, "%s.scafSeq", graphfile );
1693 fo = ckopen ( line, "w" );
1694 sprintf ( line, "%s.contigPosInscaff", graphfile );
1695 foc = ckopen ( line, "w" );
1697 if ( visual )
1699 sprintf ( line, "%s.contig4asm", graphfile );
1700 foc2 = ckopen ( line, "w" );
1701 sprintf ( line, "%s.asm", graphfile );
1702 fo3 = ckopen ( line, "w" );
1705 sprintf ( line, "%s.gapSeq", graphfile );
1706 fo2 = ckopen ( line, "w" );
1707 pthread_mutex_init ( &mutex, NULL );
1708 flagBuf = ( boolean * ) ckalloc ( scafBufSize * sizeof ( boolean ) );;
1709 thrdNoBuf = ( unsigned char * ) ckalloc ( scafBufSize * sizeof ( unsigned char ) );;
1710 memset ( thrdNoBuf, 0, scafBufSize * sizeof ( char ) );
1711 ctgStackBuffer = ( STACK ** ) ckalloc ( scafBufSize * sizeof ( STACK * ) );
1712 initStackBuf ( ctgStackBuffer, scafBufSize );
1713 darrayBuf = ( DARRAY ** ) ckalloc ( thrd_num * sizeof ( DARRAY * ) );
1714 counters = ( int * ) ckalloc ( thrd_num * sizeof ( int ) );
1715 contig_index_array = ( int * ) ckalloc ( ( num_ctg + 1 ) * sizeof ( int ) );
1717 for ( i = 0; i <= num_ctg; i++ )
1719 contig_index_array[i] = 0;
1722 for ( i = 0; i < thrd_num; i++ )
1724 counters[i] = 0;
1725 darrayBuf[i] = ( DARRAY * ) createDarray ( 100000, sizeof ( char ) );
1726 thrdSignal[i + 1] = 0;
1727 paras[i].threadID = i;
1728 paras[i].mainSignal = &thrdSignal[0];
1729 paras[i].selfSignal = &thrdSignal[i + 1];
1732 if ( fillGap )
1734 creatThrds ( threads, paras );
1737 Ncounter = scafCounter = scafInBuf = allGaps = 0;
1739 while ( fgets ( line, sizeof ( line ), fp ) != NULL )
1741 if ( line[0] == '>' )
1743 if ( index )
1745 aStack = ctgStackBuffer[scafInBuf];
1746 flagBuf[scafInBuf++] = 0;
1747 reverseStack ( aStack, ctgStack );
1749 if ( scafInBuf == scafBufSize )
1751 if ( fillGap )
1753 sendWorkSignal ( 1, thrdSignal );
1756 outputSeqs ( fo, foc, foc2, fo2, fo3, scafInBuf );
1757 scafCounter += scafInBuf;
1758 scafInBuf = 0;
1761 if ( index % 1000 == 0 )
1763 fprintf ( stderr, "%d scaffolds processed.\n", index );
1767 //read next scaff
1768 emptyStack ( ctgStack );
1769 IsPrevGap = offset = prev_ctg = 0;
1770 sscanf ( line + 9, "%d %d %d", &index, &counter, &overallLen );
1771 continue;
1774 if ( line[0] == 'G' ) // gap appears
1776 if ( fillGap )
1778 gapLen = procGap ( line, ctgStack );
1779 IsPrevGap = 1;
1782 continue;
1785 if ( line[0] >= '0' && line[0] <= '9' ) // a contig line
1787 sscanf ( line, "%d %d", &ctg, &starter );
1788 actg = ( CTGinSCAF * ) stackPush ( ctgStack );
1789 actg->ctgID = ctg;
1791 if ( contig_array[ctg].flag )
1793 MaskContig ( ctg );
1795 else
1797 MarkCtgOccu ( ctg );
1800 initiateCtgInScaf ( actg );
1802 if ( !prev_ctg )
1804 actg->cutHead = 0;
1806 else if ( !IsPrevGap )
1808 allGaps++;
1811 if ( !IsPrevGap )
1813 if ( prev_ctg && ( starter - prev_start - ( int ) contig_array[prev_ctg].length ) < ( ( int ) overlaplen * 4 ) )
1816 if(fillGap)
1817 catchable = contigCatch(prev_ctg,ctg);
1818 else
1820 catchable = 0;
1822 if ( catchable ) // prev_ctg and ctg overlap **bp
1824 allGaps--;
1826 if(isLargerThanTwin(prev_ctg))
1827 fprintf(stderr,"%d ####### by_overlap\n",getTwinCtg(prev_ctg));
1828 else
1829 fprintf(stderr,"%d ####### by_overlap\n",prev_ctg);
1831 actg->scaftig_start = 0;
1832 actg->cutHead = catchable;
1833 offset += - ( starter - prev_start - contig_array[prev_ctg].length ) + ( overlaplen - catchable );
1835 else
1837 actg->scaftig_start = 1;
1840 else
1842 actg->scaftig_start = 1;
1845 else
1847 offset += - ( starter - prev_start - contig_array[prev_ctg].length ) + gapLen;
1848 actg->scaftig_start = 0;
1851 actg->start = starter + offset;
1852 actg->end = actg->start + contig_array[ctg].length - 1;
1853 actg->mask = contig_array[ctg].mask;
1854 IsPrevGap = 0;
1855 prev_ctg = ctg;
1856 prev_start = starter;
1860 if ( index )
1862 aStack = ctgStackBuffer[scafInBuf];
1863 flagBuf[scafInBuf++] = 0;
1864 reverseStack ( aStack, ctgStack );
1866 if ( fillGap )
1868 sendWorkSignal ( 1, thrdSignal );
1871 outputSeqs ( fo, foc, foc2, fo2, fo3, scafInBuf );
1874 if ( visual )
1876 scafnum = scaffNum;
1878 for ( i = 1; i <= scafnum; i++ )
1880 fprintf ( fo3, "{SCF\nacc:%d\nnoc:0\n{CTP\nct1:%d\nct2:%d\nmea:0\nstd:0\nori:N\n}\n}\n", ++scaffNum, i, i );
1884 if ( fillGap )
1886 sendWorkSignal ( 2, thrdSignal );
1887 thread_wait ( threads );
1890 for ( ctg = 1; ctg <= num_ctg; ctg++ )
1892 if ( ( contig_array[ctg].length + overlaplen ) < 100 || contig_array[ctg].flag )
1894 continue;
1897 output_ctg ( ctg, fo );
1900 fprintf ( stderr, "\nDone with %d scaffolds, %d gaps finished, %d gaps overall.\n", index, allGaps - Ncounter, allGaps );
1901 index = 0;
1903 for ( i = 0; i < thrd_num; i++ )
1905 freeDarray ( darrayBuf[i] );
1906 index += counters[i];
1909 if ( fillGap )
1911 fprintf ( stderr, "Threads processed %d scaffolds.\n", index );
1914 free ( ( void * ) darrayBuf );
1916 if ( readSeqInGap )
1918 freeDarray ( readSeqInGap );
1921 fclose ( fp );
1922 fclose ( fo );
1923 fclose ( foc );
1924 fclose ( fo2 );
1926 if ( visual )
1928 fclose ( foc2 );
1929 fclose ( fo3 );
1932 freeStack ( ctgStack );
1933 freeStackBuf ( ctgStackBuffer, scafBufSize );
1934 free ( ( void * ) flagBuf );
1935 free ( ( void * ) thrdNoBuf );
1936 free ( ( void * ) ctgStackBuffer );