4 * Copyright (c) 2008-2012 BGI-Shenzhen <soap at genomics dot org dot cn>.
6 * This file is part of SOAPdenovo.
8 * SOAPdenovo is free software: you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation, either version 3 of the License, or
11 * (at your option) any later version.
13 * SOAPdenovo is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
18 * You should have received a copy of the GNU General Public License
19 * along with SOAPdenovo. If not, see <http://www.gnu.org/licenses/>.
30 static long long * kmerCounter
;
32 //buffer related varibles for chop kmer
33 static unsigned int read_c
;
35 static char * seqBuffer
;
36 static int * lenBuffer
;
37 static unsigned int * indexArray
;
38 static unsigned int * seqBreakers
; //record sum length to indicate start pos on seqBuffer
39 static int * ctgIdArray
;
41 //static Kmer *firstKmers;
43 //buffer related varibles for splay tree
44 static unsigned int buffer_size
= 100000000;
45 static unsigned int seq_buffer_size
;
46 static unsigned int max_read_c
;
47 static volatile unsigned int kmer_c
;
48 static Kmer
* kmerBuffer
;
49 static ubyte8
* hashBanBuffer
;
50 static boolean
* smallerBuffer
;
52 static void singleKmer ( int t
, KmerSet
* kset
, unsigned int seq_index
, unsigned int pos
);
53 static void chopKmer4read ( int t
, int threadID
);
55 static void threadRoutine ( void * para
)
60 prm
= ( PARAMETER
* ) para
;
65 if ( * ( prm
->selfSignal
) == 1 )
67 unsigned int seq_index
= 0;
70 for ( i
= 0; i
< kmer_c
; i
++ )
72 if ( seq_index
< read_c
&& indexArray
[seq_index
+ 1] == i
)
74 seq_index
++; // which sequence this kmer belongs to
78 if ( ( unsigned char ) ( hashBanBuffer
[i
] % thrd_num
) != id
)
84 kmerCounter
[id
+ 1]++;
85 singleKmer ( i
, KmerSets
[id
], seq_index
, pos
++ );
88 * ( prm
->selfSignal
) = 0;
90 else if ( * ( prm
->selfSignal
) == 2 )
92 for ( i
= 0; i
< read_c
; i
++ )
94 if ( i
% thrd_num
!= id
)
99 chopKmer4read ( i
, id
+ 1 );
102 * ( prm
->selfSignal
) = 0;
104 else if ( * ( prm
->selfSignal
) == 3 )
106 * ( prm
->selfSignal
) = 0;
114 /*************************************************
118 Stores a kmer into kmerset and updates related status.
120 1. t: kmerBuffer index
122 3. seq_index: contig id array index
123 4. pos: the postion of the kmer on contig
128 *************************************************/
129 static void singleKmer ( int t
, KmerSet
* kset
, unsigned int seq_index
, unsigned int pos
)
133 flag
= put_kmerset ( kset
, kmerBuffer
[t
], 4, 4, &node
);
137 if ( smallerBuffer
[t
] )
146 node
->l_links
= ctgIdArray
[seq_index
];
156 static void creatThrds ( pthread_t
* threads
, PARAMETER
* paras
)
161 for ( i
= 0; i
< thrd_num
; i
++ )
163 if ( ( temp
= pthread_create ( &threads
[i
], NULL
, ( void * ) threadRoutine
, & ( paras
[i
] ) ) ) != 0 )
165 fprintf ( stderr
, "Create threads failed.\n" );
170 fprintf ( stderr
, "%d thread(s) initialized.\n", thrd_num
);
173 static void thread_wait ( pthread_t
* threads
)
177 for ( i
= 0; i
< thrd_num
; i
++ )
178 if ( threads
[i
] != 0 )
180 pthread_join ( threads
[i
], NULL
);
184 static void chopKmer4read ( int t
, int threadID
)
186 char * src_seq
= seqBuffer
+ seqBreakers
[t
];
187 char * bal_seq
= rcSeq
[threadID
];
188 int len_seq
= lenBuffer
[t
];
190 ubyte8 hash_ban
, bal_hash_ban
;
194 word
.high1
= word
.low1
= word
.high2
= word
.low2
= 0;
196 for ( index
= 0; index
< overlaplen
; index
++ )
198 word
= KmerLeftBitMoveBy2 ( word
);
199 word
.low2
|= src_seq
[index
];
203 word
.high
= word
.low
= 0;
205 for ( index
= 0; index
< overlaplen
; index
++ )
207 word
= KmerLeftBitMoveBy2 ( word
);
208 word
.low
|= src_seq
[index
];
212 reverseComplementSeq ( src_seq
, len_seq
, bal_seq
);
213 // complementary node
214 bal_word
= reverseComplement ( word
, overlaplen
);
215 bal_j
= len_seq
- 0 - overlaplen
;
216 index
= indexArray
[t
];
218 if ( KmerSmaller ( word
, bal_word
) )
220 hash_ban
= hash_kmer ( word
);
221 kmerBuffer
[index
] = word
;
222 hashBanBuffer
[index
] = hash_ban
;
223 smallerBuffer
[index
++] = 1;
227 bal_hash_ban
= hash_kmer ( bal_word
);
228 kmerBuffer
[index
] = bal_word
;
229 hashBanBuffer
[index
] = bal_hash_ban
;
230 smallerBuffer
[index
++] = 0;
233 for ( j
= 1; j
<= len_seq
- overlaplen
; j
++ )
235 word
= nextKmer ( word
, src_seq
[j
- 1 + overlaplen
] );
236 bal_j
= len_seq
- j
- overlaplen
;
237 bal_word
= prevKmer ( bal_word
, bal_seq
[bal_j
] );
239 if ( KmerSmaller ( word
, bal_word
) )
241 hash_ban
= hash_kmer ( word
);
242 kmerBuffer
[index
] = word
;
243 hashBanBuffer
[index
] = hash_ban
;
244 smallerBuffer
[index
++] = 1;
248 // complementary node
249 bal_hash_ban
= hash_kmer ( bal_word
);
250 kmerBuffer
[index
] = bal_word
;
251 hashBanBuffer
[index
] = bal_hash_ban
;
252 smallerBuffer
[index
++] = 0;
257 static void sendWorkSignal ( unsigned char SIG
, unsigned char * thrdSignals
)
261 for ( t
= 0; t
< thrd_num
; t
++ )
263 thrdSignals
[t
+ 1] = SIG
;
270 for ( t
= 0; t
< thrd_num
; t
++ )
271 if ( thrdSignals
[t
+ 1] )
284 /*************************************************
295 *************************************************/
296 static int getID ( char * name
)
298 if ( name
[0] >= '0' && name
[0] <= '9' )
300 return atoi ( & ( name
[0] ) );
309 /*************************************************
313 Chops contigs not shorter than 'len_cut' into kmers:
314 1. if the kmer occured twice or more , mark it deleted;
315 2. 'node->l_links' stores the contig id and 'node->r_links' stores the position the kmer on contig.
317 1. grapfile: the prefix of contig file
318 2. len_cut: the minimum length of contig
323 *************************************************/
324 boolean
prlContig2nodes ( char * grapfile
, int len_cut
)
326 long long i
, num_seq
;
327 char name
[256], *next_name
;
329 pthread_t threads
[thrd_num
];
330 time_t start_t
, stop_t
;
331 unsigned char thrdSignal
[thrd_num
+ 1];
332 PARAMETER paras
[thrd_num
];
333 int maxCtgLen
, minCtgLen
, nameLen
;
334 unsigned int lenSum
, contigId
;
336 WORDFILTER
= createFilter ( overlaplen
);
338 sprintf ( name
, "%s.contig", grapfile
);
339 fp
= ckopen ( name
, "r" );
340 maxCtgLen
= nameLen
= 10;
342 num_seq
= readseqpar ( &maxCtgLen
, &minCtgLen
, &nameLen
, fp
);
343 fprintf ( stderr
, "\n%lld contig(s), maximum sequence length %d, minimum sequence length %d, maximum name length %d.\n", num_seq
, maxCtgLen
, minCtgLen
, nameLen
);
344 maxReadLen
= maxCtgLen
;
347 fprintf ( stderr
, "Time spent on parsing contigs file: %ds.\n", ( int ) ( stop_t
- start_t
) );
348 next_name
= ( char * ) ckalloc ( ( maxNameLen
+ 1 ) * sizeof ( char ) );
349 // extract all the EDONs
350 seq_buffer_size
= buffer_size
* 2;
351 max_read_c
= seq_buffer_size
/ 20;
352 kmerBuffer
= ( Kmer
* ) ckalloc ( buffer_size
* sizeof ( Kmer
) );
353 hashBanBuffer
= ( ubyte8
* ) ckalloc ( buffer_size
* sizeof ( ubyte8
) );
354 smallerBuffer
= ( boolean
* ) ckalloc ( buffer_size
* sizeof ( boolean
) );
355 seqBuffer
= ( char * ) ckalloc ( seq_buffer_size
* sizeof ( char ) );
356 lenBuffer
= ( int * ) ckalloc ( max_read_c
* sizeof ( int ) );
357 indexArray
= ( unsigned int * ) ckalloc ( ( max_read_c
+ 1 ) * sizeof ( unsigned int ) );
358 seqBreakers
= ( unsigned int * ) ckalloc ( ( max_read_c
+ 1 ) * sizeof ( unsigned int ) );
359 ctgIdArray
= ( int * ) ckalloc ( max_read_c
* sizeof ( int ) );
360 fp
= ckopen ( name
, "r" );
361 rcSeq
= ( char ** ) ckalloc ( ( thrd_num
+ 1 ) * sizeof ( char * ) );
365 kmerCounter
= ( long long * ) ckalloc ( ( thrd_num
+ 1 ) * sizeof ( long long ) );
366 KmerSets
= ( KmerSet
** ) ckalloc ( thrd_num
* sizeof ( KmerSet
* ) );
368 for ( i
= 0; i
< thrd_num
; i
++ )
370 KmerSets
[i
] = init_kmerset ( 1024, 0.77f
);
371 thrdSignal
[i
+ 1] = 0;
372 paras
[i
].threadID
= i
;
373 paras
[i
].mainSignal
= &thrdSignal
[0];
374 paras
[i
].selfSignal
= &thrdSignal
[i
+ 1];
375 kmerCounter
[i
+ 1] = 0;
376 rcSeq
[i
+ 1] = ( char * ) ckalloc ( maxCtgLen
* sizeof ( char ) );
379 creatThrds ( threads
, paras
);
382 kmer_c
= thrdSignal
[0] = kmerCounter
[0] = 0;
384 read_c
= lenSum
= i
= seqBreakers
[0] = indexArray
[0] = 0;
385 readseq1by1 ( seqBuffer
+ seqBreakers
[read_c
], next_name
, & ( lenBuffer
[read_c
] ), fp
, -1 );
387 while ( !feof ( fp
) )
389 contigId
= getID ( next_name
);
390 readseq1by1 ( seqBuffer
+ seqBreakers
[read_c
], next_name
, & ( lenBuffer
[read_c
] ), fp
, 1 );
392 if ( ( ++i
) % 10000000 == 0 )
394 fprintf ( stderr
, "--- %lldth contig(s).\n", i
);
397 if ( lenBuffer
[read_c
] < overlaplen
+ 1 || lenBuffer
[read_c
] < len_cut
)
399 contigId
= getID ( next_name
);
403 ctgIdArray
[read_c
] = contigId
> 0 ? contigId
: i
;
404 lenSum
+= lenBuffer
[read_c
];
405 kmer_c
+= lenBuffer
[read_c
] - overlaplen
+ 1;
407 seqBreakers
[read_c
] = lenSum
;
408 indexArray
[read_c
] = kmer_c
;
410 if ( read_c
== max_read_c
|| ( lenSum
+ maxCtgLen
) > seq_buffer_size
|| ( kmer_c
+ maxCtgLen
- overlaplen
+ 1 ) > buffer_size
)
412 kmerCounter
[0] += kmer_c
;
413 sendWorkSignal ( 2, thrdSignal
); //chopKmer4read
414 sendWorkSignal ( 1, thrdSignal
); //singleKmer
415 kmer_c
= read_c
= lenSum
= 0;
421 kmerCounter
[0] += kmer_c
;
422 sendWorkSignal ( 2, thrdSignal
); //chopKmer4read
423 sendWorkSignal ( 1, thrdSignal
); //singleKmer
426 sendWorkSignal ( 3, thrdSignal
); //stop threads
427 thread_wait ( threads
);
429 fprintf ( stderr
, "Time spent on hashing contigs: %ds.\n", ( int ) ( stop_t
- start_t
) );
433 unsigned long long alloCounter
= 0;
434 unsigned long long allKmerCounter
= 0;
436 for ( i
= 0; i
< thrd_num
; i
++ )
438 alloCounter
+= count_kmerset ( ( KmerSets
[i
] ) );
439 allKmerCounter
+= kmerCounter
[i
+ 1];
440 free ( ( void * ) rcSeq
[i
+ 1] );
443 fprintf ( stderr
, "%lli node(s) allocated, %lli kmer(s) in contigs, %lli kmer(s) processed.\n", alloCounter
, kmerCounter
[0], allKmerCounter
);
446 free ( ( void * ) rcSeq
);
447 free ( ( void * ) kmerCounter
);
448 free ( ( void * ) seqBuffer
);
449 free ( ( void * ) lenBuffer
);
450 free ( ( void * ) indexArray
);
451 free ( ( void * ) seqBreakers
);
452 free ( ( void * ) ctgIdArray
);
453 free ( ( void * ) kmerBuffer
);
454 free ( ( void * ) hashBanBuffer
);
455 free ( ( void * ) smallerBuffer
);
456 free ( ( void * ) next_name
);