4 * Copyright (c) 2008-2012 BGI-Shenzhen <soap at genomics dot org dot cn>.
6 * This file is part of SOAPdenovo.
8 * SOAPdenovo is free software: you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation, either version 3 of the License, or
11 * (at your option) any later version.
13 * SOAPdenovo is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
18 * You should have received a copy of the GNU General Public License
19 * along with SOAPdenovo. If not, see <http://www.gnu.org/licenses/>.
30 #define RDBLOCKSIZE 50
38 static int * counters
;
39 static pthread_mutex_t mutex
;
40 static int scafBufSize
= 100;
41 static boolean
* flagBuf
;
42 static unsigned char * thrdNoBuf
;
43 static STACK
** ctgStackBuffer
;
44 static int scafCounter
;
46 static char * scaffBuffer
;
48 static void MarkCtgOccu ( unsigned int ctg
);
51 static void printRead(int len,char *seq)
54 fprintf(stderr,">read\n");
56 fprintf(stderr,"%c",int2base((int)getCharInTightString(seq,j)));
60 static void attach1read2contig ( unsigned int ctgID
, int len
, int pos
, long long starter
)
62 unsigned int ctg
= index_array
[ctgID
]; //new index in contig array
64 if ( isLargerThanTwin ( ctg
) )
66 ctg
= getTwinCtg ( ctg
); // put all reads in one contig of a twin
67 pos
= contig_array
[ctg
].length
+ overlaplen
- pos
- len
;
70 if ( !contig_array
[ctg
].closeReads
)
72 contig_array
[ctg
].closeReads
= ( STACK
* ) createStack ( RDBLOCKSIZE
, sizeof ( READNEARBY
) );
75 READNEARBY
* rd
= ( READNEARBY
* ) stackPush ( contig_array
[ctg
].closeReads
);
78 rd
->seqStarter
= starter
;
81 static void convertIndex ()
83 int * length_array
= ( int * ) ckalloc ( ( num_ctg
+ 1 ) * sizeof ( int ) );
86 for ( i
= 1; i
<= num_ctg
; i
++ )
91 for ( i
= 1; i
<= num_ctg
; i
++ )
93 if ( index_array
[i
] > 0 )
95 length_array
[index_array
[i
]] = i
;
99 for ( i
= 1; i
<= num_ctg
; i
++ )
101 index_array
[i
] = length_array
[i
];
102 } //contig i with new index: index_array[i]
104 free ( ( void * ) length_array
);
107 static long long getRead1by1_gz ( gzFile
* fp
, DARRAY
* readSeqInGap
)
109 long long readCounter
= 0;
119 char * freadBuf
= ( char * ) ckalloc ( ( maxReadLen
/ 4 + 1 ) * sizeof ( char ) );
121 while ( gzread ( fp
, &len
, sizeof ( int ) ) == 4 )
123 if ( gzread ( fp
, &ctgID
, sizeof ( int ) ) != 4 )
128 if ( gzread ( fp
, &pos
, sizeof ( int ) ) != 4 )
133 if ( gzread ( fp
, freadBuf
, sizeof ( char ) * ( unsigned ) ( len
/ 4 + 1 ) ) != ( unsigned ) ( len
/ 4 + 1 ) )
138 //put seq to dynamic array
139 starter
= readSeqInGap
->item_c
;
141 if ( !darrayPut ( readSeqInGap
, starter
+ len
/ 4 ) ) // make sure there's room for this seq
146 pt
= ( char * ) darrayPut ( readSeqInGap
, starter
);
147 bcopy ( freadBuf
, pt
, len
/ 4 + 1 );
148 attach1read2contig ( ctgID
, len
, pos
, starter
);
152 free ( ( void * ) freadBuf
);
156 static long long getRead1by1 ( FILE * fp
, DARRAY
* readSeqInGap
)
158 long long readCounter
= 0;
168 char * freadBuf
= ( char * ) ckalloc ( ( maxReadLen
/ 4 + 1 ) * sizeof ( char ) );
170 while ( fread ( &len
, sizeof ( int ), 1, fp
) == 1 )
172 if ( fread ( &ctgID
, sizeof ( int ), 1, fp
) != 1 )
177 if ( fread ( &pos
, sizeof ( int ), 1, fp
) != 1 )
182 if ( fread ( freadBuf
, sizeof ( char ), len
/ 4 + 1, fp
) != ( unsigned ) ( len
/ 4 + 1 ) )
187 //put seq to dynamic array
188 starter
= readSeqInGap
->item_c
;
190 if ( !darrayPut ( readSeqInGap
, starter
+ len
/ 4 ) ) // make sure there's room for this seq
195 pt
= ( char * ) darrayPut ( readSeqInGap
, starter
);
196 bcopy ( freadBuf
, pt
, len
/ 4 + 1 );
197 attach1read2contig ( ctgID
, len
, pos
, starter
);
201 free ( ( void * ) freadBuf
);
205 // Darray *readSeqInGap
206 static boolean
loadReads4gap ( char * graphfile
)
211 long long readCounter
;
213 if ( COMPATIBLE_MODE
== 1 )
215 sprintf ( name
, "%s.readInGap", graphfile
);
216 fp1
= fopen ( name
, "rb" );
220 sprintf ( name
, "%s.readInGap.gz", graphfile
);
221 fp
= gzopen ( name
, "rb" );
224 sprintf ( name
, "%s.longReadInGap", graphfile
);
225 fp2
= fopen ( name
, "rb" );
227 if ( COMPATIBLE_MODE
== 1 && !fp1
&& !fp2
)
229 fprintf ( stderr
, "Can't open %s.readInGap and %s.longReadInGap!\n", graphfile
, graphfile
);
232 else if ( COMPATIBLE_MODE
== 0 && !fp
&& !fp2
)
234 fprintf ( stderr
, "Can't open %s.readInGap.gz and %s.longReadInGap!\n", graphfile
, graphfile
);
244 readSeqInGap
= ( DARRAY
* ) createDarray ( 1000000, sizeof ( char ) );
246 if ( COMPATIBLE_MODE
== 1 && fp1
)
248 readCounter
= getRead1by1 ( fp1
, readSeqInGap
);
249 fprintf ( stderr
, "Loaded %lld reads from %s.readInGap.\n", readCounter
, graphfile
);
252 else if ( COMPATIBLE_MODE
== 0 && fp
)
254 readCounter
= getRead1by1_gz ( fp
, readSeqInGap
);
255 fprintf ( stderr
, "Loaded %lld reads from %s.readInGap.\n", readCounter
, graphfile
);
261 readCounter
= getRead1by1 ( fp2
, readSeqInGap
);
262 fprintf ( stderr
, "Loaded %lld reads from %s.LongReadInGap.\n", readCounter
, graphfile
);
269 static void debugging1 ()
275 unsigned int * length_array
= ( unsigned int * ) ckalloc ( ( num_ctg
+ 1 ) * sizeof ( unsigned int ) );
277 //use length_array to change info in index_array
278 for ( i
= 1; i
<= num_ctg
; i
++ )
283 for ( i
= 1; i
<= num_ctg
; i
++ )
285 if ( index_array
[i
] > 0 )
287 length_array
[index_array
[i
]] = i
;
291 for ( i
= 1; i
<= num_ctg
; i
++ )
293 index_array
[i
] = length_array
[i
];
294 } //contig i with original index: index_array[i]
303 for ( i
= 1; i
<= num_ctg
; i
++ )
305 if ( !contig_array
[i
].closeReads
)
310 if ( index_array
[i
] != 735 )
315 fprintf ( stderr
, "Contig %d, len %d: \n", index_array
[i
], contig_array
[i
].length
);
316 stackBackup ( contig_array
[i
].closeReads
);
318 while ( ( rd
= ( READNEARBY
* ) stackPop ( contig_array
[i
].closeReads
) ) != NULL
)
320 fprintf ( stderr
, "%d\t%d\t%lld\t", rd
->dis
, rd
->len
, rd
->seqStarter
);
321 pt
= ( char * ) darrayGet ( readSeqInGap
, rd
->seqStarter
);
323 for ( j
= 0; j
< rd
->len
; j
++ )
325 fprintf ( stderr
, "%c", int2base ( ( int ) getCharInTightString ( pt
, j
) ) );
328 fprintf ( stderr
, "\n" );
331 stackRecover ( contig_array
[i
].closeReads
);
335 static void initiateCtgInScaf ( CTGinSCAF
* actg
)
338 actg
->cutHead
= overlaplen
;
342 static int procGap ( char * line
, STACK
* ctgsStack
)
348 tp
= strtok ( line
, " " );
349 tp
= strtok ( NULL
, " " ); //length
350 length
= atoi ( tp
);
351 tp
= strtok ( NULL
, " " ); //seg
359 for ( i
= 0; i
< seg
; i
++ )
361 tp
= strtok ( NULL
, " " );
364 ctgPt
= ( CTGinSCAF
* ) stackPush ( ctgsStack
);
365 initiateCtgInScaf ( ctgPt
);
369 ctgPt
->scaftig_start
= 0;
376 static void debugging2 ( int index
, STACK
* ctgsStack
)
379 stackBackup ( ctgsStack
);
380 fprintf ( stderr
, ">scaffold%d\t%d 0.0\n", index
, ctgsStack
->item_c
);
382 while ( ( actg
= stackPop ( ctgsStack
) ) != NULL
)
384 fprintf ( stderr
, "%d\t%d\t%d\t%d\n", actg
->ctgID
, actg
->start
, actg
->end
, actg
->scaftig_start
);
387 stackRecover ( ctgsStack
);
390 static int cmp_reads ( const void * a
, const void * b
)
393 A
= ( READNEARBY
* ) a
;
394 B
= ( READNEARBY
* ) b
;
396 if ( A
->dis
> B
->dis
)
400 else if ( A
->dis
== B
->dis
)
410 static void cutRdArray ( READNEARBY
* rdArray
, int gapStart
, int gapEnd
, int * count
, int arrayLen
, READNEARBY
* cutArray
)
415 for ( i
= 0; i
< arrayLen
; i
++ )
417 if ( rdArray
[i
].dis
> gapEnd
)
422 if ( ( rdArray
[i
].dis
+ rdArray
[i
].len
) >= gapStart
)
424 cutArray
[num
].dis
= rdArray
[i
].dis
;
425 cutArray
[num
].len
= rdArray
[i
].len
;
426 cutArray
[num
++].seqStarter
= rdArray
[i
].seqStarter
;
433 static void outputTightStr2Visual ( FILE * foc2
, int ctg
, int * num
, int start
, int length
, int outputlen
, int revS
)
435 int i
, end
, column
= 0, n
= *num
;
436 char * tightStr
= contig_array
[ctg
].seq
;
438 if ( n
== 1 ) { fprintf ( foc2
, ">%d\n", index_array
[ctg
] ); }
439 else { fprintf ( foc2
, ">%d-%d\n", index_array
[ctg
], n
- 1 ); }
443 end
= start
+ outputlen
<= length
? start
+ outputlen
: length
;
445 for ( i
= start
; i
< end
; i
++ )
447 fprintf ( foc2
, "%c", int2base ( ( int ) getCharInTightString ( tightStr
, i
) ) );
449 if ( ( ++column
) % 100 == 0 )
452 fprintf ( foc2
, "\n" );
456 if ( column
% 100 != 0 )
458 fprintf ( foc2
, "\n" );
463 end
= length
- start
- outputlen
- 1 >= 0 ? length
- start
- outputlen
: 0;
465 for ( i
= end
; i
<= length
- 1 - start
; i
++ )
467 fprintf ( foc2
, "%c", int2base ( ( int ) getCharInTightString ( tightStr
, i
) ) );
469 if ( ( ++column
) % 100 == 0 )
471 fprintf ( foc2
, "\n" );
476 if ( column
% 100 != 0 )
478 fprintf ( foc2
, "\n" );
483 void outputTightStr ( FILE * fp
, char * tightStr
, int start
, int length
, int outputlen
, int revS
, int * col
)
491 end
= start
+ outputlen
<= length
? start
+ outputlen
: length
;
493 for ( i
= start
; i
< end
; i
++ )
495 fprintf ( fp
, "%c", int2base ( ( int ) getCharInTightString ( tightStr
, i
) ) );
497 if ( ( ++column
) % 100 == 0 )
500 fprintf ( fp
, "\n" );
506 end
= length
- start
- outputlen
- 1 >= 0 ? length
- start
- outputlen
: 0;
508 for ( i
= length
- 1 - start
; i
>= end
; i
-- )
510 fprintf ( fp
, "%c", int2compbase ( ( int ) getCharInTightString ( tightStr
, i
) ) );
512 if ( ( ++column
) % 100 == 0 )
514 fprintf ( fp
, "\n" );
523 static void outputTightStr2 ( char * tightStr
, char * scaff
, int start
, int length
, int outputlen
, int revS
, int * col
)
532 end
= start
+ outputlen
<= length
? start
+ outputlen
: length
;
534 for ( i
= start
; i
< end
; i
++ )
536 a
= int2base ( ( int ) getCharInTightString ( tightStr
, i
) );
537 // fprintf (fp, "%c", int2base ((int) getCharInTightString (tightStr, i)));
543 end
= length
- start
- outputlen
- 1 >= 0 ? length
- start
- outputlen
: 0;
545 for ( i
= length
- 1 - start
; i
>= end
; i
-- )
547 a
= int2compbase ( ( int ) getCharInTightString ( tightStr
, i
) );
548 // fprintf (fp, "%c", int2compbase ((int) getCharInTightString (tightStr, i)));
556 static void outputTightStrLowerCase2Visual ( FILE * foc2
, int gapNum
, char * tightStr
, int start
, int length
, int outputlen
)
558 int i
, end
, column
= 0;
559 end
= start
+ outputlen
<= length
? start
+ outputlen
: length
;
560 fprintf ( foc2
, ">%d-0\n", gapNum
);
562 for ( i
= start
; i
< end
; i
++ )
564 fprintf ( foc2
, "%c", "actg"[ ( int ) getCharInTightString ( tightStr
, i
)] );
566 if ( ( ++column
) % 100 == 0 )
569 fprintf ( foc2
, "\n" );
573 if ( column
% 100 != 0 )
575 fprintf ( foc2
, "\n" );
579 static void outputTightStrLowerCase ( FILE * fp
, char * tightStr
, int start
, int length
, int outputlen
, int revS
, int * col
)
587 end
= start
+ outputlen
<= length
? start
+ outputlen
: length
;
589 for ( i
= start
; i
< end
; i
++ )
591 fprintf ( fp
, "%c", "actg"[ ( int ) getCharInTightString ( tightStr
, i
)] );
593 if ( ( ++column
) % 100 == 0 )
596 fprintf ( fp
, "\n" );
602 end
= length
- start
- outputlen
- 1 >= 0 ? length
- start
- outputlen
: 0;
604 for ( i
= length
- 1 - start
; i
>= end
; i
-- )
606 fprintf ( fp
, "%c", "tgac"[ ( int ) getCharInTightString ( tightStr
, i
)] );
608 if ( ( ++column
) % 100 == 0 )
610 fprintf ( fp
, "\n" );
619 static void outputTightStrLowerCase2 ( char * tightStr
, char * scaff
, int start
, int length
, int outputlen
, int revS
, int * col
)
628 end
= start
+ outputlen
<= length
? start
+ outputlen
: length
;
630 for ( i
= start
; i
< end
; i
++ )
632 a
= "actg"[ ( int ) getCharInTightString ( tightStr
, i
)];
633 // fprintf (fp, "%c", "actg"[(int) getCharInTightString (tightStr, i)]);
639 end
= length
- start
- outputlen
- 1 >= 0 ? length
- start
- outputlen
: 0;
641 for ( i
= length
- 1 - start
; i
>= end
; i
-- )
643 a
= "tgac"[ ( int ) getCharInTightString ( tightStr
, i
)];
644 // fprintf (fp, "%c", "tgac"[(int) getCharInTightString (tightStr, i)]);
652 static void outputNs ( FILE * fp
, int gapN
, int * col
)
654 int i
, column
= *col
;
656 for ( i
= 0; i
< gapN
; i
++ )
660 if ( ( ++column
) % 100 == 0 )
663 fprintf ( fp
, "\n" );
670 static void outputNs2 ( char * scaff
, int gapN
, int * col
)
672 int i
, column
= *col
;
674 for ( i
= 0; i
< gapN
; i
++ )
683 static void outputGapInfo ( unsigned int ctg1
, unsigned int ctg2
)
685 unsigned int bal_ctg1
= getTwinCtg ( ctg1
);
686 unsigned int bal_ctg2
= getTwinCtg ( ctg2
);
688 if ( isLargerThanTwin ( ctg1
) )
690 fprintf ( stderr
, "%d\t", index_array
[bal_ctg1
] );
694 fprintf ( stderr
, "%d\t", index_array
[ctg1
] );
697 if ( isLargerThanTwin ( ctg2
) )
699 fprintf ( stderr
, "%d\n", index_array
[bal_ctg2
] );
703 fprintf ( stderr
, "%d\n", index_array
[ctg2
] );
707 static void output1gap ( FILE * fo
, int scafIndex
, CTGinSCAF
* prevCtg
, CTGinSCAF
* actg
, DARRAY
* gapSeqArray
)
709 unsigned int ctg1
, bal_ctg1
, length1
;
710 int start1
, outputlen1
;
711 unsigned int ctg2
, bal_ctg2
, length2
;
712 int start2
, outputlen2
;
715 ctg1
= prevCtg
->ctgID
;
716 bal_ctg1
= getTwinCtg ( ctg1
);
717 start1
= prevCtg
->cutHead
;
718 length1
= contig_array
[ctg1
].length
+ overlaplen
;
720 if ( length1
- prevCtg
->cutTail
- start1
> CTGappend
)
722 outputlen1
= CTGappend
;
723 start1
= length1
- prevCtg
->cutTail
- outputlen1
;
727 outputlen1
= length1
- prevCtg
->cutTail
- start1
;
731 bal_ctg2
= getTwinCtg ( ctg2
);
732 start2
= actg
->cutHead
;
733 length2
= contig_array
[ctg2
].length
+ overlaplen
;
735 if ( length2
- actg
->cutTail
- start2
> CTGappend
)
737 outputlen2
= CTGappend
;
741 outputlen2
= length2
- actg
->cutTail
- start2
;
744 if ( isLargerThanTwin ( ctg1
) )
746 fprintf ( fo
, ">S%d_C%d_L%d_G%d", scafIndex
, index_array
[bal_ctg1
], outputlen1
, prevCtg
->gapSeqLen
);
750 fprintf ( fo
, ">S%d_C%d_L%d_G%d", scafIndex
, index_array
[ctg1
], outputlen1
, prevCtg
->gapSeqLen
);
753 if ( isLargerThanTwin ( ctg2
) )
755 fprintf ( fo
, "_C%d_L%d\t", index_array
[bal_ctg2
], outputlen2
);
759 fprintf ( fo
, "_C%d_L%d\t", index_array
[ctg2
], outputlen2
);
762 fprintf ( fo
, "%d\n", start2
);
764 if ( contig_array
[ctg1
].seq
)
766 outputTightStr ( fo
, contig_array
[ctg1
].seq
, start1
, length1
, outputlen1
, 0, &column
);
768 else if ( contig_array
[bal_ctg1
].seq
)
770 outputTightStr ( fo
, contig_array
[bal_ctg1
].seq
, start1
, length1
, outputlen1
, 1, &column
);
773 pt
= ( char * ) darrayPut ( gapSeqArray
, prevCtg
->gapSeqOffset
);
774 outputTightStrLowerCase ( fo
, pt
, 0, prevCtg
->gapSeqLen
, prevCtg
->gapSeqLen
, 0, &column
);
776 if ( contig_array
[ctg2
].seq
)
778 outputTightStr ( fo
, contig_array
[ctg2
].seq
, start2
, length2
, outputlen2
, 0, &column
);
780 else if ( contig_array
[bal_ctg2
].seq
)
782 outputTightStr ( fo
, contig_array
[bal_ctg2
].seq
, start2
, length2
, outputlen2
, 1, &column
);
785 if ( column
% 100 != 0 )
787 fprintf ( fo
, "\n" );
791 static void outputGapSeq ( FILE * fo
, int index
, STACK
* ctgsStack
, DARRAY
* gapSeqArray
)
793 CTGinSCAF
* actg
, *prevCtg
= NULL
;
794 stackRecover ( ctgsStack
);
795 // fprintf (fo, ">scaffold%d\n", index);
797 while ( ( actg
= stackPop ( ctgsStack
) ) != NULL
)
801 if (actg->scaftig_start)
803 fprintf (fo, "0\t%d\t%d\n", prevCtg->mask, actg->mask);
807 fprintf (fo, "1\t%d\t%d\n", prevCtg->mask, actg->mask);
811 if ( prevCtg
&& prevCtg
->gapSeqLen
> 0 )
812 { output1gap ( fo
, index
, prevCtg
, actg
, gapSeqArray
); }
818 static void outputScafSeq ( FILE * fo
, FILE * foc
, FILE * foc2
, FILE * fo3
, int index
, STACK
* ctgsStack
, DARRAY
* gapSeqArray
)
820 CTGinSCAF
* actg
, *prevCtg
= NULL
;
821 unsigned int ctg
, bal_ctg
, ctg_out
, length
;
822 int start
, outputlen
, gapN
;
825 long long cvgSum
= 0;
827 int i
, t
, lu_len
= 0, lu_end
= 0;
828 unsigned int ctg_start_pos
= 0;
830 unsigned int * pos_start
= ( unsigned int * ) ckalloc ( 1000000 * sizeof ( unsigned int ) );
831 unsigned int * pos_end
= ( unsigned int * ) ckalloc ( 1000000 * sizeof ( unsigned int ) );
832 // char index_contig[num_ctg][20];
833 char ** index_contig
= ( char ** ) ckalloc ( 1000000 * sizeof ( char * ) );
835 for ( i
= 0; i
< 1000000; i
++ )
836 { index_contig
[i
] = ( char * ) ckalloc ( 20 * sizeof ( char ) ); }
839 orien_array
= ( char * ) ckalloc ( 1000000 * sizeof ( char ) );
840 // scaffBuffer = (char *) ckalloc (300000000 * sizeof (char));
841 stackRecover ( ctgsStack
);
843 while ( ( actg
= stackPop ( ctgsStack
) ) != NULL
)
845 if ( ! ( contig_array
[actg
->ctgID
].cvg
> 0 ) )
850 lenSum
+= contig_array
[actg
->ctgID
].length
;
851 cvgSum
+= contig_array
[actg
->ctgID
].length
* contig_array
[actg
->ctgID
].cvg
;
856 fprintf ( fo
, ">scaffold%d %4.1f\n", index
, ( double ) cvgSum
/ lenSum
);
860 fprintf ( fo
, ">scaffold%d 0.0\n", index
);
863 fprintf ( foc
, ">scaffold%d\n", index
);
864 stackRecover ( ctgsStack
);
866 while ( ( actg
= stackPop ( ctgsStack
) ) != NULL
)
869 bal_ctg
= getTwinCtg ( ctg
);
870 length
= contig_array
[ctg
].length
+ overlaplen
;
872 if ( prevCtg
&& actg
->scaftig_start
)
874 gapN
= actg
->start
- prevCtg
->start
- contig_array
[prevCtg
->ctgID
].length
;
875 gapN
= gapN
> 0 ? gapN
: 1;
876 outputNs ( fo
, gapN
, &column
);
877 ctg_start_pos
+= gapN
;
878 //outputGapInfo(prevCtg->ctgID,ctg);
888 start
= actg
->cutHead
;
891 outputlen
= length
- start
- actg
->cutTail
;
893 if ( contig_array
[ctg
].seq
)
895 outputTightStr ( fo
, contig_array
[ctg
].seq
, start
, length
, outputlen
, 0, &column
);
896 lu_end
= start
+ outputlen
> length
? length
: start
+ outputlen
;
897 lu_len
= lu_end
- start
;
899 fprintf ( foc
, "%d\t", index_array
[ctg
] );
901 else if ( contig_array
[bal_ctg
].seq
)
903 outputTightStr ( fo
, contig_array
[bal_ctg
].seq
, start
, length
, outputlen
, 1, &column
);
904 lu_end
= length
- start
- outputlen
- 1 >= 0 ? length
- start
- outputlen
: 0;
905 lu_len
= length
- start
- lu_end
;
907 fprintf ( foc
, "%d\t", index_array
[bal_ctg
] );
910 fprintf ( foc
, "%u\t%c\t%d\n", ctg_start_pos
, strand
, lu_len
);
911 ctg_start_pos
+= lu_len
;
913 if ( actg
->gapSeqLen
< 1 )
919 pt
= ( char * ) darrayPut ( gapSeqArray
, actg
->gapSeqOffset
);
920 outputTightStrLowerCase ( fo
, pt
, 0, actg
->gapSeqLen
, actg
->gapSeqLen
, 0, &column
);
921 ctg_start_pos
= ctg_start_pos
+ actg
->gapSeqLen
;
925 if ( column
% 100 != 0 )
927 fprintf ( fo
, "\n" );
932 scaffBuffer
= ( char * ) ckalloc ( ( ctg_start_pos
+ 5 ) * sizeof ( char ) );
937 stackRecover ( ctgsStack
);
939 while ( ( actg
= stackPop ( ctgsStack
) ) != NULL
)
942 bal_ctg
= getTwinCtg ( ctg
);
943 length
= contig_array
[ctg
].length
+ overlaplen
;
945 if ( prevCtg
&& actg
->scaftig_start
)
947 gapN
= actg
->start
- prevCtg
->start
- contig_array
[prevCtg
->ctgID
].length
;
948 gapN
= gapN
> 0 ? gapN
: 1;
949 outputNs2 ( scaffBuffer
, gapN
, &column
);
950 ctg_start_pos
+= gapN
;
960 start
= actg
->cutHead
;
963 outputlen
= length
- start
- actg
->cutTail
;
965 if ( contig_array
[ctg
].seq
)
967 t
= ++contig_index_array
[index_array
[ctg
]];
968 outputTightStr2 ( contig_array
[ctg
].seq
, scaffBuffer
, start
, length
, outputlen
, 0, &column
);
969 lu_end
= start
+ outputlen
> length
? length
: start
+ outputlen
;
970 lu_len
= lu_end
- start
;
972 // fprintf (foc, "%d\t", index_array[ctg]);
975 if ( ctg_start_pos
- start
>= 0 )
977 pos_start
[lenSum
] = ctg_start_pos
- start
;
978 pos_end
[lenSum
] = ctg_start_pos
+ length
- start
;
979 orien_array
[lenSum
] = '+';
981 if ( t
== 1 ) { sprintf ( index_contig
[lenSum
], "%u", index_array
[ctg
] ); }
982 else { sprintf ( index_contig
[lenSum
], "%u-%d", index_array
[ctg
], t
- 1 ); }
984 fprintf ( fo3
, "{AFG\nacc:%s\nclr:0,%d\n}\n", ( index_contig
[lenSum
] ), length
);
985 outputTightStr2Visual ( foc2
, ctg
, & ( contig_index_array
[index_array
[ctg
]] ), 0, length
, length
, 0 );
989 pos_start
[lenSum
] = 0;
990 pos_end
[lenSum
] = ctg_start_pos
+ length
- start
;
991 orien_array
[lenSum
] = '+';
993 if ( t
== 1 ) { sprintf ( index_contig
[lenSum
], "%u", index_array
[ctg
] ); }
994 else { sprintf ( index_contig
[lenSum
], "%u-%d", index_array
[ctg
], t
- 1 ); }
996 fprintf ( fo3
, "{AFG\nacc:%s\nclr:0,%d\n}\n", ( index_contig
[lenSum
] ), length
);
997 outputTightStr2Visual ( foc2
, ctg
, & ( contig_index_array
[index_array
[ctg
]] ), start
- ctg_start_pos
, length
, length
- start
+ ctg_start_pos
, 0 );
1000 else if ( contig_array
[bal_ctg
].seq
)
1002 t
= ++contig_index_array
[index_array
[bal_ctg
]];
1003 outputTightStr2 ( contig_array
[bal_ctg
].seq
, scaffBuffer
, start
, length
, outputlen
, 1, &column
);
1004 lu_end
= length
- start
- outputlen
- 1 >= 0 ? length
- start
- outputlen
: 0;
1005 lu_len
= length
- start
- lu_end
;
1007 // fprintf (foc, "%d\t", index_array[bal_ctg]);
1010 if ( ctg_start_pos
- start
>= 0 )
1012 pos_start
[lenSum
] = ctg_start_pos
- start
;
1013 pos_end
[lenSum
] = ctg_start_pos
+ length
- start
;
1014 orien_array
[lenSum
] = '-';
1016 if ( t
== 1 ) { sprintf ( index_contig
[lenSum
], "%u", index_array
[bal_ctg
] ); }
1017 else { sprintf ( index_contig
[lenSum
], "%u-%d", index_array
[bal_ctg
], t
- 1 ); }
1019 fprintf ( fo3
, "{AFG\nacc:%s\nclr:0,%d\n}\n", ( index_contig
[lenSum
] ), length
);
1020 outputTightStr2Visual ( foc2
, bal_ctg
, & ( contig_index_array
[index_array
[bal_ctg
]] ), 0, length
, length
, 1 );
1024 pos_start
[lenSum
] = ctg_start_pos
;
1025 pos_end
[lenSum
] = ctg_start_pos
+ lu_len
;
1026 orien_array
[lenSum
] = '-';
1028 if ( t
== 1 ) { sprintf ( index_contig
[lenSum
], "%u", index_array
[bal_ctg
] ); }
1029 else { sprintf ( index_contig
[lenSum
], "%u-%d", index_array
[bal_ctg
], t
- 1 ); }
1031 fprintf ( fo3
, "{AFG\nacc:%s\nclr:0,%d\n}\n", ( index_contig
[lenSum
] ), length
);
1032 outputTightStr2Visual ( foc2
, bal_ctg
, & ( contig_index_array
[index_array
[bal_ctg
]] ), start
, length
, outputlen
, 1 );
1036 // fprintf (foc, "%u\t%c\t%d\n", ctg_start_pos, strand, lu_len);
1037 ctg_start_pos
+= lu_len
;
1039 if ( actg
->gapSeqLen
< 1 )
1045 pt
= ( char * ) darrayPut ( gapSeqArray
, actg
->gapSeqOffset
);
1046 outputTightStrLowerCase2 ( pt
, scaffBuffer
, 0, actg
->gapSeqLen
, actg
->gapSeqLen
, 0, &column
);
1047 outputTightStrLowerCase2Visual ( foc2
, gapNum
, pt
, 0, actg
->gapSeqLen
, actg
->gapSeqLen
);
1048 fprintf ( fo3
, "{AFG\nacc:%d-0\nclr:0,%d\n}\n", gapNum
, actg
->gapSeqLen
);
1050 pos_start
[lenSum
] = ctg_start_pos
;
1051 pos_end
[lenSum
] = ctg_start_pos
+ actg
->gapSeqLen
;
1052 orien_array
[lenSum
] = '+';
1053 sprintf ( index_contig
[lenSum
], "%d-0", gapNum
);
1055 ctg_start_pos
= ctg_start_pos
+ actg
->gapSeqLen
;
1060 fprintf ( fo3
, "{CCO\nacc:%d\npla:P\nlen:%u\ncns:\n", scaffNum
, ctg_start_pos
);
1062 for ( i
= 0; i
< ctg_start_pos
; i
++ )
1064 if ( i
!= 0 && i
% 100 == 0 && i
< ctg_start_pos
- 1 ) { fprintf ( fo3
, "\n" ); }
1066 fprintf ( fo3
, "%c", scaffBuffer
[i
] );
1069 fprintf ( fo3
, "\n.\nqlt:\n" );
1071 for ( i
= 0; i
< ctg_start_pos
; i
++ )
1073 if ( i
!= 0 && i
% 100 == 0 && i
< ctg_start_pos
- 1 ) { fprintf ( fo3
, "\n" ); }
1075 fprintf ( fo3
, "D" );
1078 fprintf ( fo3
, "\n.\nnpc:%d\n", lenSum
);
1080 for ( i
= 1; i
<= lenSum
; i
++ )
1082 if ( orien_array
[i
] == '+' ) { fprintf ( fo3
, "{MPS\ntyp:R\nmid:%s\nsrc:\n.\npos:%u,%u\ndln:0\ndel:\n}\n", ( index_contig
[i
] ), pos_start
[i
], pos_end
[i
] ); }
1084 if ( orien_array
[i
] == '-' ) { fprintf ( fo3
, "{MPS\ntyp:R\nmid:%s\nsrc:\n.\npos:%u,%u\ndln:0\ndel:\n}\n", ( index_contig
[i
] ), pos_end
[i
], pos_start
[i
] ); }
1087 fprintf ( fo3
, "}\n" );
1088 free ( ( void * ) scaffBuffer
);
1091 free ( ( void * ) pos_start
);
1092 free ( ( void * ) pos_end
);
1093 free ( ( void * ) orien_array
);
1095 for ( i
= 0; i
< 1000000; i
++ )
1097 free ( ( void * ) index_contig
[i
] );
1100 free ( ( void * ) index_contig
);
1103 static void fill1scaf ( int index
, STACK
* ctgsStack
, int thrdID
);
1104 static void check1scaf ( int t
, int thrdID
)
1112 pthread_mutex_lock ( &mutex
);
1117 thrdNoBuf
[t
] = thrdID
;
1124 pthread_mutex_unlock ( &mutex
);
1132 fill1scaf ( scafCounter
+ t
+ 1, ctgStackBuffer
[t
], thrdID
);
1135 static void fill1scaf ( int index
, STACK
* ctgsStack
, int thrdID
)
1137 CTGinSCAF
* actg
, *prevCtg
= NULL
;
1138 READNEARBY
* rdArray
, *rdArray4gap
, *rd
;
1139 int numRd
= 0, count
, maxGLen
= 0;
1140 unsigned int ctg
, bal_ctg
;
1143 while ( ( actg
= stackPop ( ctgsStack
) ) != NULL
)
1147 maxGLen
= maxGLen
< ( actg
->start
- prevCtg
->end
) ? ( actg
->start
- prevCtg
->end
) : maxGLen
;
1151 bal_ctg
= getTwinCtg ( ctg
);
1159 if ( contig_array
[ctg
].closeReads
)
1161 numRd
+= contig_array
[ctg
].closeReads
->item_c
;
1163 else if ( contig_array
[bal_ctg
].closeReads
)
1165 numRd
+= contig_array
[bal_ctg
].closeReads
->item_c
;
1176 rdArray
= ( READNEARBY
* ) ckalloc ( numRd
* sizeof ( READNEARBY
) );
1177 rdArray4gap
= ( READNEARBY
* ) ckalloc ( numRd
* sizeof ( READNEARBY
) );
1178 //fprintf(stderr,"scaffold%d reads4gap %d\n",index,numRd);
1179 // collect reads appended to contigs in this scaffold
1181 stackRecover ( ctgsStack
);
1183 while ( ( actg
= stackPop ( ctgsStack
) ) != NULL
)
1186 bal_ctg
= getTwinCtg ( ctg
);
1193 if ( contig_array
[ctg
].closeReads
)
1195 rdStack
= contig_array
[ctg
].closeReads
;
1197 else if ( contig_array
[bal_ctg
].closeReads
)
1199 rdStack
= contig_array
[bal_ctg
].closeReads
;
1206 stackBackup ( rdStack
);
1208 while ( ( rd
= ( READNEARBY
* ) stackPop ( rdStack
) ) != NULL
)
1210 rdArray
[numRd2
].len
= rd
->len
;
1211 rdArray
[numRd2
].seqStarter
= rd
->seqStarter
;
1213 if ( isSmallerThanTwin ( ctg
) )
1215 rdArray
[numRd2
++].dis
= actg
->start
- overlaplen
+ rd
->dis
;
1218 { rdArray
[numRd2
++].dis
= actg
->start
- overlaplen
+ contig_array
[ctg
].length
- rd
->len
- rd
->dis
; }
1221 stackRecover ( rdStack
);
1224 if ( numRd2
!= numRd
)
1226 fprintf ( stderr
, "##reads numbers doesn't match, %d vs %d when scaffold %d.\n", numRd
, numRd2
, index
);
1229 qsort ( rdArray
, numRd
, sizeof ( READNEARBY
), cmp_reads
);
1230 //fill gap one by one
1231 int gapStart
, gapEnd
;
1234 int buffer_size
= maxReadLen
> 100 ? maxReadLen
: 100;
1235 int maxGSLen
= maxGLen
+ GLDiff
< 10 ? 10 : maxGLen
+ GLDiff
;
1236 //fprintf(stderr,"maxGlen %d, maxGSlen %d\n",maxGLen,maxGSLen);
1237 char * seqGap
= ( char * ) ckalloc ( maxGSLen
* sizeof ( char ) ); // temp array for gap sequence
1238 Kmer
* kmerCtg1
= ( Kmer
* ) ckalloc ( buffer_size
* sizeof ( Kmer
) );
1239 Kmer
* kmerCtg2
= ( Kmer
* ) ckalloc ( buffer_size
* sizeof ( Kmer
) );
1240 char * seqCtg1
= ( char * ) ckalloc ( buffer_size
* sizeof ( char ) );
1241 char * seqCtg2
= ( char * ) ckalloc ( buffer_size
* sizeof ( char ) );
1243 stackRecover ( ctgsStack
);
1245 while ( ( actg
= stackPop ( ctgsStack
) ) != NULL
)
1247 if ( !prevCtg
|| !actg
->scaftig_start
)
1253 gapStart
= prevCtg
->end
- 100;
1254 gapEnd
= actg
->start
- overlaplen
+ 100;
1255 cutRdArray ( rdArray
, gapStart
, gapEnd
, &count
, numRd
, rdArray4gap
);
1265 for ( overlap
= overlaplen
; overlap
> 14; overlap
-= 2 )
1267 flag
= localGraph ( rdArray4gap
, count
, prevCtg
, actg
, overlaplen
, kmerCtg1
, kmerCtg2
, overlap
, darrayBuf
[thrdID
], seqCtg1
, seqCtg2
, seqGap
);
1269 //free_kmerset(kmerSet);
1274 fprintf(stderr,"Between ctg %d and %d, Found with %d\n",prevCtg->ctgID
1275 ,actg->ctgID,overlap);
1283 printf("Gap closed without reads\n");
1285 fprintf(stderr,"Between ctg %d and %d, NO routes found\n",prevCtg->ctgID,actg->ctgID);
1290 //fprintf(stderr,"____scaffold%d reads in gap %d\n",index,numIn);
1291 free ( ( void * ) seqGap
);
1292 free ( ( void * ) kmerCtg1
);
1293 free ( ( void * ) kmerCtg2
);
1294 free ( ( void * ) seqCtg1
);
1295 free ( ( void * ) seqCtg2
);
1296 free ( ( void * ) rdArray
);
1297 free ( ( void * ) rdArray4gap
);
1300 static void reverseStack ( STACK
* dStack
, STACK
* sStack
)
1302 CTGinSCAF
* actg
, *ctgPt
;
1303 emptyStack ( dStack
);
1305 while ( ( actg
= ( CTGinSCAF
* ) stackPop ( sStack
) ) != NULL
)
1307 ctgPt
= ( CTGinSCAF
* ) stackPush ( dStack
);
1308 ctgPt
->ctgID
= actg
->ctgID
;
1309 ctgPt
->start
= actg
->start
;
1310 ctgPt
->end
= actg
->end
;
1311 ctgPt
->scaftig_start
= actg
->scaftig_start
;
1312 ctgPt
->mask
= actg
->mask
;
1313 ctgPt
->cutHead
= actg
->cutHead
;
1314 ctgPt
->cutTail
= actg
->cutTail
;
1315 ctgPt
->gapSeqLen
= actg
->gapSeqLen
;
1316 ctgPt
->gapSeqOffset
= actg
->gapSeqOffset
;
1319 stackBackup ( dStack
);
1323 static Kmer
tightStr2Kmer ( char * tightStr
, int start
, int length
, int revS
)
1327 word
.high1
= word
.low1
= word
.high2
= word
.low2
= 0;
1331 if ( start
+ overlaplen
> length
)
1333 fprintf ( stderr
, "The tightStr2Kmer A: no enough bases for kmer.\n" );
1337 for ( i
= start
; i
< start
+ overlaplen
; i
++ )
1339 word
= KmerLeftBitMoveBy2 ( word
);
1340 word
.low2
|= getCharInTightString ( tightStr
, i
);
1345 if ( length
- start
- overlaplen
< 0 )
1347 fprintf ( stderr
, "The tightStr2Kmer B: no enough bases for kmer.\n" );
1351 for ( i
= length
- 1 - start
; i
> length
- 1 - start
- overlaplen
; i
-- )
1353 word
= KmerLeftBitMoveBy2 ( word
);
1354 word
.low2
|= int_comp ( getCharInTightString ( tightStr
, i
) );
1361 static Kmer
maxKmer ()
1364 word
.high1
= word
.low1
= word
.high2
= word
.low2
= 0;
1367 for ( i
= 0; i
< overlaplen
; i
++ )
1369 word
= KmerLeftBitMoveBy2 ( word
);
1376 static Kmer
tightStr2Kmer ( char * tightStr
, int start
, int length
, int revS
)
1380 word
.high
= word
.low
= 0;
1384 if ( start
+ overlaplen
> length
)
1386 fprintf ( stderr
, "The tightStr2Kmer A: no enough bases for kmer.\n" );
1390 for ( i
= start
; i
< start
+ overlaplen
; i
++ )
1392 word
= KmerLeftBitMoveBy2 ( word
);
1393 word
.low
|= getCharInTightString ( tightStr
, i
);
1398 if ( length
- start
- overlaplen
< 0 )
1400 fprintf ( stderr
, "The tightStr2Kmer B: no enough bases for kmer.\n" );
1404 for ( i
= length
- 1 - start
; i
> length
- 1 - start
- overlaplen
; i
-- )
1406 word
= KmerLeftBitMoveBy2 ( word
);
1407 word
.low
|= int_comp ( getCharInTightString ( tightStr
, i
) );
1414 static Kmer
maxKmer ()
1417 word
.high
= word
.low
= 0;
1420 for ( i
= 0; i
< overlaplen
; i
++ )
1422 word
= KmerLeftBitMoveBy2 ( word
);
1429 static int contigCatch ( unsigned int prev_ctg
, unsigned int ctg
)
1431 if ( contig_array
[prev_ctg
].length
== 0 || contig_array
[ctg
].length
== 0 )
1436 Kmer kmerAtEnd
, kmerAtStart
;
1438 unsigned int bal_ctg1
= getTwinCtg ( prev_ctg
);
1439 unsigned int bal_ctg2
= getTwinCtg ( ctg
);
1441 int len1
= contig_array
[prev_ctg
].length
+ overlaplen
;
1442 int len2
= contig_array
[ctg
].length
+ overlaplen
;
1443 start
= contig_array
[prev_ctg
].length
;
1445 if ( contig_array
[prev_ctg
].seq
)
1447 kmerAtEnd
= tightStr2Kmer ( contig_array
[prev_ctg
].seq
, start
, len1
, 0 );
1451 kmerAtEnd
= tightStr2Kmer ( contig_array
[bal_ctg1
].seq
, start
, len1
, 1 );
1456 if ( contig_array
[ctg
].seq
)
1458 kmerAtStart
= tightStr2Kmer ( contig_array
[ctg
].seq
, start
, len2
, 0 );
1462 kmerAtStart
= tightStr2Kmer ( contig_array
[bal_ctg2
].seq
, start
, len2
, 1 );
1467 for ( i
= 0; i
< 10; i
++ )
1469 if ( KmerEqual ( kmerAtStart
, kmerAtEnd
) )
1474 MaxKmer
= KmerRightBitMoveBy2 ( MaxKmer
);
1475 kmerAtEnd
= KmerAnd ( kmerAtEnd
, MaxKmer
);
1476 kmerAtStart
= KmerRightBitMoveBy2 ( kmerAtStart
);
1481 return overlaplen
- i
;
1489 static void initStackBuf ( STACK
** ctgStackBuffer
, int scafBufSize
)
1493 for ( i
= 0; i
< scafBufSize
; i
++ )
1496 ctgStackBuffer
[i
] = ( STACK
* ) createStack ( 100, sizeof ( CTGinSCAF
) );
1499 static void freeStackBuf ( STACK
** ctgStackBuffer
, int scafBufSize
)
1503 for ( i
= 0; i
< scafBufSize
; i
++ )
1505 freeStack ( ctgStackBuffer
[i
] );
1509 static void threadRoutine ( void * para
)
1513 prm
= ( PARAMETER
* ) para
;
1515 //printf("%dth thread with threadID %d, hash_table %p\n",id,prm.threadID,prm.hash_table);
1518 if ( * ( prm
->selfSignal
) == 1 )
1520 emptyDarray ( darrayBuf
[prm
->threadID
] );
1522 for ( i
= 0; i
< scafInBuf
; i
++ )
1524 check1scaf ( i
, prm
->threadID
);
1527 * ( prm
->selfSignal
) = 0;
1529 else if ( * ( prm
->selfSignal
) == 2 )
1531 * ( prm
->selfSignal
) = 0;
1539 static void creatThrds ( pthread_t
* threads
, PARAMETER
* paras
)
1544 for ( i
= 0; i
< thrd_num
; i
++ )
1546 if ( ( temp
= pthread_create ( &threads
[i
], NULL
, ( void * ) threadRoutine
, & ( paras
[i
] ) ) ) != 0 )
1548 fprintf ( stderr
, "Create threads failed.\n" );
1553 fprintf ( stderr
, "%d thread(s) initialized.\n", thrd_num
);
1556 static void sendWorkSignal ( unsigned char SIG
, unsigned char * thrdSignals
)
1560 for ( t
= 0; t
< thrd_num
; t
++ )
1562 thrdSignals
[t
+ 1] = SIG
;
1569 for ( t
= 0; t
< thrd_num
; t
++ )
1570 if ( thrdSignals
[t
+ 1] )
1575 if ( t
== thrd_num
)
1582 static void thread_wait ( pthread_t
* threads
)
1586 for ( i
= 0; i
< thrd_num
; i
++ )
1587 if ( threads
[i
] != 0 )
1589 pthread_join ( threads
[i
], NULL
);
1593 static void outputSeqs ( FILE * fo
, FILE * foc
, FILE * foc2
, FILE * fo2
, FILE * fo3
, int scafInBuf
)
1597 for ( i
= 0; i
< scafInBuf
; i
++ )
1599 thrdID
= thrdNoBuf
[i
];
1600 outputScafSeq ( fo
, foc
, foc2
, fo3
, scafCounter
+ i
+ 1, ctgStackBuffer
[i
], darrayBuf
[thrdID
] );
1601 outputGapSeq ( fo2
, scafCounter
+ i
+ 1, ctgStackBuffer
[i
], darrayBuf
[thrdID
] );
1605 static void MaskContig ( unsigned int ctg
)
1607 contig_array
[ctg
].mask
= 1;
1608 contig_array
[getTwinCtg ( ctg
)].mask
= 1;
1611 static void MarkCtgOccu ( unsigned int ctg
)
1613 contig_array
[ctg
].flag
= 1;
1614 contig_array
[getTwinCtg ( ctg
)].flag
= 1;
1617 static void output_ctg ( unsigned int ctg
, FILE * fo
)
1619 if ( contig_array
[ctg
].length
< 1 )
1625 unsigned int bal_ctg
= getTwinCtg ( ctg
);
1626 len
= contig_array
[ctg
].length
+ overlaplen
;
1629 if ( contig_array
[ctg
].seq
)
1631 fprintf ( fo
, ">C%d %4.1f\n", ctg
, ( double ) contig_array
[ctg
].cvg
);
1632 outputTightStr ( fo
, contig_array
[ctg
].seq
, 0, len
, len
, 0, &col
);
1634 else if ( contig_array
[bal_ctg
].seq
)
1636 fprintf ( fo
, ">C%d %4.1f\n", bal_ctg
, ( double ) contig_array
[ctg
].cvg
);
1637 outputTightStr ( fo
, contig_array
[bal_ctg
].seq
, 0, len
, len
, 0, &col
);
1640 contig_array
[ctg
].flag
= 1;
1641 contig_array
[bal_ctg
].flag
= 1;
1643 if ( len
% 100 != 0 )
1645 fprintf ( fo
, "\n" );
1649 void prlReadsCloseGap ( char * graphfile
)
1655 fprintf ( stderr
, "\nStart to load reads for gap filling. %d length discrepancy is allowed.\n", GLDiff
);
1656 fprintf ( stderr
, "...\n" );
1657 flag
= loadReads4gap ( graphfile
);
1671 FILE * fp
, *fo
, *fo2
, *fo3
= NULL
, *foc
, *foc2
= NULL
;
1674 STACK
* ctgStack
, *aStack
;
1675 int index
= 0, offset
= 0, counter
, overallLen
;
1676 int i
, starter
, prev_start
, gapLen
, catchable
, scafnum
;
1677 unsigned int ctg
, prev_ctg
= 0;
1679 pthread_t threads
[thrd_num
];
1680 unsigned char thrdSignal
[thrd_num
+ 1];
1681 PARAMETER paras
[thrd_num
];
1683 for ( ctg
= 1; ctg
<= num_ctg
; ctg
++ )
1685 contig_array
[ctg
].flag
= 0;
1688 MAXKMER
= maxKmer ();
1689 ctgStack
= ( STACK
* ) createStack ( 1000, sizeof ( CTGinSCAF
) );
1690 sprintf ( line
, "%s.scaf_gap", graphfile
);
1691 fp
= ckopen ( line
, "r" );
1692 sprintf ( line
, "%s.scafSeq", graphfile
);
1693 fo
= ckopen ( line
, "w" );
1694 sprintf ( line
, "%s.contigPosInscaff", graphfile
);
1695 foc
= ckopen ( line
, "w" );
1699 sprintf ( line
, "%s.contig4asm", graphfile
);
1700 foc2
= ckopen ( line
, "w" );
1701 sprintf ( line
, "%s.asm", graphfile
);
1702 fo3
= ckopen ( line
, "w" );
1705 sprintf ( line
, "%s.gapSeq", graphfile
);
1706 fo2
= ckopen ( line
, "w" );
1707 pthread_mutex_init ( &mutex
, NULL
);
1708 flagBuf
= ( boolean
* ) ckalloc ( scafBufSize
* sizeof ( boolean
) );;
1709 thrdNoBuf
= ( unsigned char * ) ckalloc ( scafBufSize
* sizeof ( unsigned char ) );;
1710 memset ( thrdNoBuf
, 0, scafBufSize
* sizeof ( char ) );
1711 ctgStackBuffer
= ( STACK
** ) ckalloc ( scafBufSize
* sizeof ( STACK
* ) );
1712 initStackBuf ( ctgStackBuffer
, scafBufSize
);
1713 darrayBuf
= ( DARRAY
** ) ckalloc ( thrd_num
* sizeof ( DARRAY
* ) );
1714 counters
= ( int * ) ckalloc ( thrd_num
* sizeof ( int ) );
1715 contig_index_array
= ( int * ) ckalloc ( ( num_ctg
+ 1 ) * sizeof ( int ) );
1717 for ( i
= 0; i
<= num_ctg
; i
++ )
1719 contig_index_array
[i
] = 0;
1722 for ( i
= 0; i
< thrd_num
; i
++ )
1725 darrayBuf
[i
] = ( DARRAY
* ) createDarray ( 100000, sizeof ( char ) );
1726 thrdSignal
[i
+ 1] = 0;
1727 paras
[i
].threadID
= i
;
1728 paras
[i
].mainSignal
= &thrdSignal
[0];
1729 paras
[i
].selfSignal
= &thrdSignal
[i
+ 1];
1734 creatThrds ( threads
, paras
);
1737 Ncounter
= scafCounter
= scafInBuf
= allGaps
= 0;
1739 while ( fgets ( line
, sizeof ( line
), fp
) != NULL
)
1741 if ( line
[0] == '>' )
1745 aStack
= ctgStackBuffer
[scafInBuf
];
1746 flagBuf
[scafInBuf
++] = 0;
1747 reverseStack ( aStack
, ctgStack
);
1749 if ( scafInBuf
== scafBufSize
)
1753 sendWorkSignal ( 1, thrdSignal
);
1756 outputSeqs ( fo
, foc
, foc2
, fo2
, fo3
, scafInBuf
);
1757 scafCounter
+= scafInBuf
;
1761 if ( index
% 1000 == 0 )
1763 fprintf ( stderr
, "%d scaffolds processed.\n", index
);
1768 emptyStack ( ctgStack
);
1769 IsPrevGap
= offset
= prev_ctg
= 0;
1770 sscanf ( line
+ 9, "%d %d %d", &index
, &counter
, &overallLen
);
1774 if ( line
[0] == 'G' ) // gap appears
1778 gapLen
= procGap ( line
, ctgStack
);
1785 if ( line
[0] >= '0' && line
[0] <= '9' ) // a contig line
1787 sscanf ( line
, "%d %d", &ctg
, &starter
);
1788 actg
= ( CTGinSCAF
* ) stackPush ( ctgStack
);
1791 if ( contig_array
[ctg
].flag
)
1797 MarkCtgOccu ( ctg
);
1800 initiateCtgInScaf ( actg
);
1806 else if ( !IsPrevGap
)
1813 if ( prev_ctg
&& ( starter
- prev_start
- ( int ) contig_array
[prev_ctg
].length
) < ( ( int ) overlaplen
* 4 ) )
1817 catchable = contigCatch(prev_ctg,ctg);
1822 if ( catchable
) // prev_ctg and ctg overlap **bp
1826 if(isLargerThanTwin(prev_ctg))
1827 fprintf(stderr,"%d ####### by_overlap\n",getTwinCtg(prev_ctg));
1829 fprintf(stderr,"%d ####### by_overlap\n",prev_ctg);
1831 actg
->scaftig_start
= 0;
1832 actg
->cutHead
= catchable
;
1833 offset
+= - ( starter
- prev_start
- contig_array
[prev_ctg
].length
) + ( overlaplen
- catchable
);
1837 actg
->scaftig_start
= 1;
1842 actg
->scaftig_start
= 1;
1847 offset
+= - ( starter
- prev_start
- contig_array
[prev_ctg
].length
) + gapLen
;
1848 actg
->scaftig_start
= 0;
1851 actg
->start
= starter
+ offset
;
1852 actg
->end
= actg
->start
+ contig_array
[ctg
].length
- 1;
1853 actg
->mask
= contig_array
[ctg
].mask
;
1856 prev_start
= starter
;
1862 aStack
= ctgStackBuffer
[scafInBuf
];
1863 flagBuf
[scafInBuf
++] = 0;
1864 reverseStack ( aStack
, ctgStack
);
1868 sendWorkSignal ( 1, thrdSignal
);
1871 outputSeqs ( fo
, foc
, foc2
, fo2
, fo3
, scafInBuf
);
1878 for ( i
= 1; i
<= scafnum
; i
++ )
1880 fprintf ( fo3
, "{SCF\nacc:%d\nnoc:0\n{CTP\nct1:%d\nct2:%d\nmea:0\nstd:0\nori:N\n}\n}\n", ++scaffNum
, i
, i
);
1886 sendWorkSignal ( 2, thrdSignal
);
1887 thread_wait ( threads
);
1890 for ( ctg
= 1; ctg
<= num_ctg
; ctg
++ )
1892 if ( ( contig_array
[ctg
].length
+ overlaplen
) < 100 || contig_array
[ctg
].flag
)
1897 output_ctg ( ctg
, fo
);
1900 fprintf ( stderr
, "\nDone with %d scaffolds, %d gaps finished, %d gaps overall.\n", index
, allGaps
- Ncounter
, allGaps
);
1903 for ( i
= 0; i
< thrd_num
; i
++ )
1905 freeDarray ( darrayBuf
[i
] );
1906 index
+= counters
[i
];
1911 fprintf ( stderr
, "Threads processed %d scaffolds.\n", index
);
1914 free ( ( void * ) darrayBuf
);
1918 freeDarray ( readSeqInGap
);
1932 freeStack ( ctgStack
);
1933 freeStackBuf ( ctgStackBuffer
, scafBufSize
);
1934 free ( ( void * ) flagBuf
);
1935 free ( ( void * ) thrdNoBuf
);
1936 free ( ( void * ) ctgStackBuffer
);