modified: makefile
[GalaxyCodeBases.git] / BGI / SOAPdenovo2 / standardPregraph / prlHashCtg.c
blobf14c7c77b2651fa9a89d56616a8360f02fa418bf
1 /*
2 * prlHashCtg.c
4 * Copyright (c) 2008-2012 BGI-Shenzhen <soap at genomics dot org dot cn>.
6 * This file is part of SOAPdenovo.
8 * SOAPdenovo is free software: you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation, either version 3 of the License, or
11 * (at your option) any later version.
13 * SOAPdenovo is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
18 * You should have received a copy of the GNU General Public License
19 * along with SOAPdenovo. If not, see <http://www.gnu.org/licenses/>.
23 #include "stdinc.h"
24 #include "newhash.h"
25 #include "kmerhash.h"
26 #include "extfunc.h"
27 #include "extvab.h"
29 //debugging variables
30 static long long * kmerCounter;
32 //buffer related varibles for chop kmer
33 static unsigned int read_c;
34 static char ** rcSeq;
35 static char * seqBuffer;
36 static int * lenBuffer;
37 static unsigned int * indexArray;
38 static unsigned int * seqBreakers; //record sum length to indicate start pos on seqBuffer
39 static int * ctgIdArray;
41 //static Kmer *firstKmers;
43 //buffer related varibles for splay tree
44 static unsigned int buffer_size = 100000000;
45 static unsigned int seq_buffer_size;
46 static unsigned int max_read_c;
47 static volatile unsigned int kmer_c;
48 static Kmer * kmerBuffer;
49 static ubyte8 * hashBanBuffer;
50 static boolean * smallerBuffer;
52 static void singleKmer ( int t, KmerSet * kset, unsigned int seq_index, unsigned int pos );
53 static void chopKmer4read ( int t, int threadID );
55 static void threadRoutine ( void * para )
57 PARAMETER * prm;
58 unsigned int i;
59 unsigned char id;
60 prm = ( PARAMETER * ) para;
61 id = prm->threadID;
63 while ( 1 )
65 if ( * ( prm->selfSignal ) == 1 )
67 unsigned int seq_index = 0;
68 unsigned int pos = 0;
70 for ( i = 0; i < kmer_c; i++ )
72 if ( seq_index < read_c && indexArray[seq_index + 1] == i )
74 seq_index++; // which sequence this kmer belongs to
75 pos = 0;
78 if ( ( unsigned char ) ( hashBanBuffer[i] % thrd_num ) != id )
80 pos++;
81 continue;
84 kmerCounter[id + 1]++;
85 singleKmer ( i, KmerSets[id], seq_index, pos++ );
88 * ( prm->selfSignal ) = 0;
90 else if ( * ( prm->selfSignal ) == 2 )
92 for ( i = 0; i < read_c; i++ )
94 if ( i % thrd_num != id )
96 continue;
99 chopKmer4read ( i, id + 1 );
102 * ( prm->selfSignal ) = 0;
104 else if ( * ( prm->selfSignal ) == 3 )
106 * ( prm->selfSignal ) = 0;
107 break;
110 usleep ( 1 );
114 /*************************************************
115 Function:
116 singleKmer
117 Description:
118 Stores a kmer into kmerset and updates related status.
119 Input:
120 1. t: kmerBuffer index
121 2. kset: KmerSet
122 3. seq_index: contig id array index
123 4. pos: the postion of the kmer on contig
124 Output:
125 None.
126 Return:
127 None.
128 *************************************************/
129 static void singleKmer ( int t, KmerSet * kset, unsigned int seq_index, unsigned int pos )
131 boolean flag;
132 kmer_t * node;
133 flag = put_kmerset ( kset, kmerBuffer[t], 4, 4, &node );
135 if ( !flag )
137 if ( smallerBuffer[t] )
139 node->twin = 0;
141 else
143 node->twin = 1;
146 node->l_links = ctgIdArray[seq_index];
148 node->r_links = pos;
150 else
152 node->deleted = 1;
156 static void creatThrds ( pthread_t * threads, PARAMETER * paras )
158 unsigned char i;
159 int temp;
161 for ( i = 0; i < thrd_num; i++ )
163 if ( ( temp = pthread_create ( &threads[i], NULL, ( void * ) threadRoutine, & ( paras[i] ) ) ) != 0 )
165 fprintf ( stderr, "Create threads failed.\n" );
166 exit ( 1 );
170 fprintf ( stderr, "%d thread(s) initialized.\n", thrd_num );
173 static void thread_wait ( pthread_t * threads )
175 int i;
177 for ( i = 0; i < thrd_num; i++ )
178 if ( threads[i] != 0 )
180 pthread_join ( threads[i], NULL );
184 static void chopKmer4read ( int t, int threadID )
186 char * src_seq = seqBuffer + seqBreakers[t];
187 char * bal_seq = rcSeq[threadID];
188 int len_seq = lenBuffer[t];
189 int j, bal_j;
190 ubyte8 hash_ban, bal_hash_ban;
191 Kmer word, bal_word;
192 int index;
193 #ifdef MER127
194 word.high1 = word.low1 = word.high2 = word.low2 = 0;
196 for ( index = 0; index < overlaplen; index++ )
198 word = KmerLeftBitMoveBy2 ( word );
199 word.low2 |= src_seq[index];
202 #else
203 word.high = word.low = 0;
205 for ( index = 0; index < overlaplen; index++ )
207 word = KmerLeftBitMoveBy2 ( word );
208 word.low |= src_seq[index];
211 #endif
212 reverseComplementSeq ( src_seq, len_seq, bal_seq );
213 // complementary node
214 bal_word = reverseComplement ( word, overlaplen );
215 bal_j = len_seq - 0 - overlaplen;
216 index = indexArray[t];
218 if ( KmerSmaller ( word, bal_word ) )
220 hash_ban = hash_kmer ( word );
221 kmerBuffer[index] = word;
222 hashBanBuffer[index] = hash_ban;
223 smallerBuffer[index++] = 1;
225 else
227 bal_hash_ban = hash_kmer ( bal_word );
228 kmerBuffer[index] = bal_word;
229 hashBanBuffer[index] = bal_hash_ban;
230 smallerBuffer[index++] = 0;
233 for ( j = 1; j <= len_seq - overlaplen; j++ )
235 word = nextKmer ( word, src_seq[j - 1 + overlaplen] );
236 bal_j = len_seq - j - overlaplen;
237 bal_word = prevKmer ( bal_word, bal_seq[bal_j] );
239 if ( KmerSmaller ( word, bal_word ) )
241 hash_ban = hash_kmer ( word );
242 kmerBuffer[index] = word;
243 hashBanBuffer[index] = hash_ban;
244 smallerBuffer[index++] = 1;
246 else
248 // complementary node
249 bal_hash_ban = hash_kmer ( bal_word );
250 kmerBuffer[index] = bal_word;
251 hashBanBuffer[index] = bal_hash_ban;
252 smallerBuffer[index++] = 0;
257 static void sendWorkSignal ( unsigned char SIG, unsigned char * thrdSignals )
259 int t;
261 for ( t = 0; t < thrd_num; t++ )
263 thrdSignals[t + 1] = SIG;
266 while ( 1 )
268 usleep ( 10 );
270 for ( t = 0; t < thrd_num; t++ )
271 if ( thrdSignals[t + 1] )
273 break;
276 if ( t == thrd_num )
278 break;
284 /*************************************************
285 Function:
286 getID
287 Description:
288 Gets contig id.
289 Input:
290 1.name contig name
291 Output:
292 None.
293 Return:
294 None.
295 *************************************************/
296 static int getID ( char * name )
298 if ( name[0] >= '0' && name[0] <= '9' )
300 return atoi ( & ( name[0] ) );
302 else
304 return 0;
309 /*************************************************
310 Function:
311 prlContig2nodes
312 Description:
313 Chops contigs not shorter than 'len_cut' into kmers:
314 1. if the kmer occured twice or more , mark it deleted;
315 2. 'node->l_links' stores the contig id and 'node->r_links' stores the position the kmer on contig.
316 Input:
317 1. grapfile: the prefix of contig file
318 2. len_cut: the minimum length of contig
319 Output:
320 None.
321 Return:
322 True always.
323 *************************************************/
324 boolean prlContig2nodes ( char * grapfile, int len_cut )
326 long long i, num_seq;
327 char name[256], *next_name;
328 FILE * fp;
329 pthread_t threads[thrd_num];
330 time_t start_t, stop_t;
331 unsigned char thrdSignal[thrd_num + 1];
332 PARAMETER paras[thrd_num];
333 int maxCtgLen, minCtgLen, nameLen;
334 unsigned int lenSum, contigId;
335 //init
336 WORDFILTER = createFilter ( overlaplen );
337 time ( &start_t );
338 sprintf ( name, "%s.contig", grapfile );
339 fp = ckopen ( name, "r" );
340 maxCtgLen = nameLen = 10;
341 minCtgLen = 1000;
342 num_seq = readseqpar ( &maxCtgLen, &minCtgLen, &nameLen, fp );
343 fprintf ( stderr, "\n%lld contig(s), maximum sequence length %d, minimum sequence length %d, maximum name length %d.\n", num_seq, maxCtgLen, minCtgLen, nameLen );
344 maxReadLen = maxCtgLen;
345 fclose ( fp );
346 time ( &stop_t );
347 fprintf ( stderr, "Time spent on parsing contigs file: %ds.\n", ( int ) ( stop_t - start_t ) );
348 next_name = ( char * ) ckalloc ( ( maxNameLen + 1 ) * sizeof ( char ) );
349 // extract all the EDONs
350 seq_buffer_size = buffer_size * 2;
351 max_read_c = seq_buffer_size / 20;
352 kmerBuffer = ( Kmer * ) ckalloc ( buffer_size * sizeof ( Kmer ) );
353 hashBanBuffer = ( ubyte8 * ) ckalloc ( buffer_size * sizeof ( ubyte8 ) );
354 smallerBuffer = ( boolean * ) ckalloc ( buffer_size * sizeof ( boolean ) );
355 seqBuffer = ( char * ) ckalloc ( seq_buffer_size * sizeof ( char ) );
356 lenBuffer = ( int * ) ckalloc ( max_read_c * sizeof ( int ) );
357 indexArray = ( unsigned int * ) ckalloc ( ( max_read_c + 1 ) * sizeof ( unsigned int ) );
358 seqBreakers = ( unsigned int * ) ckalloc ( ( max_read_c + 1 ) * sizeof ( unsigned int ) );
359 ctgIdArray = ( int * ) ckalloc ( max_read_c * sizeof ( int ) );
360 fp = ckopen ( name, "r" );
361 rcSeq = ( char ** ) ckalloc ( ( thrd_num + 1 ) * sizeof ( char * ) );
363 if ( 1 )
365 kmerCounter = ( long long * ) ckalloc ( ( thrd_num + 1 ) * sizeof ( long long ) );
366 KmerSets = ( KmerSet ** ) ckalloc ( thrd_num * sizeof ( KmerSet * ) );
368 for ( i = 0; i < thrd_num; i++ )
370 KmerSets[i] = init_kmerset ( 1024, 0.77f );
371 thrdSignal[i + 1] = 0;
372 paras[i].threadID = i;
373 paras[i].mainSignal = &thrdSignal[0];
374 paras[i].selfSignal = &thrdSignal[i + 1];
375 kmerCounter[i + 1] = 0;
376 rcSeq[i + 1] = ( char * ) ckalloc ( maxCtgLen * sizeof ( char ) );
379 creatThrds ( threads, paras );
382 kmer_c = thrdSignal[0] = kmerCounter[0] = 0;
383 time ( &start_t );
384 read_c = lenSum = i = seqBreakers[0] = indexArray[0] = 0;
385 readseq1by1 ( seqBuffer + seqBreakers[read_c], next_name, & ( lenBuffer[read_c] ), fp, -1 );
387 while ( !feof ( fp ) )
389 contigId = getID ( next_name );
390 readseq1by1 ( seqBuffer + seqBreakers[read_c], next_name, & ( lenBuffer[read_c] ), fp, 1 );
392 if ( ( ++i ) % 10000000 == 0 )
394 fprintf ( stderr, "--- %lldth contig(s).\n", i );
397 if ( lenBuffer[read_c] < overlaplen + 1 || lenBuffer[read_c] < len_cut )
399 contigId = getID ( next_name );
400 continue;
403 ctgIdArray[read_c] = contigId > 0 ? contigId : i;
404 lenSum += lenBuffer[read_c];
405 kmer_c += lenBuffer[read_c] - overlaplen + 1;
406 read_c++;
407 seqBreakers[read_c] = lenSum;
408 indexArray[read_c] = kmer_c;
410 if ( read_c == max_read_c || ( lenSum + maxCtgLen ) > seq_buffer_size || ( kmer_c + maxCtgLen - overlaplen + 1 ) > buffer_size )
412 kmerCounter[0] += kmer_c;
413 sendWorkSignal ( 2, thrdSignal ); //chopKmer4read
414 sendWorkSignal ( 1, thrdSignal ); //singleKmer
415 kmer_c = read_c = lenSum = 0;
419 if ( read_c )
421 kmerCounter[0] += kmer_c;
422 sendWorkSignal ( 2, thrdSignal ); //chopKmer4read
423 sendWorkSignal ( 1, thrdSignal ); //singleKmer
426 sendWorkSignal ( 3, thrdSignal ); //stop threads
427 thread_wait ( threads );
428 time ( &stop_t );
429 fprintf ( stderr, "Time spent on hashing contigs: %ds.\n", ( int ) ( stop_t - start_t ) );
431 if ( 1 )
433 unsigned long long alloCounter = 0;
434 unsigned long long allKmerCounter = 0;
436 for ( i = 0; i < thrd_num; i++ )
438 alloCounter += count_kmerset ( ( KmerSets[i] ) );
439 allKmerCounter += kmerCounter[i + 1];
440 free ( ( void * ) rcSeq[i + 1] );
443 fprintf ( stderr, "%lli node(s) allocated, %lli kmer(s) in contigs, %lli kmer(s) processed.\n", alloCounter, kmerCounter[0], allKmerCounter );
446 free ( ( void * ) rcSeq );
447 free ( ( void * ) kmerCounter );
448 free ( ( void * ) seqBuffer );
449 free ( ( void * ) lenBuffer );
450 free ( ( void * ) indexArray );
451 free ( ( void * ) seqBreakers );
452 free ( ( void * ) ctgIdArray );
453 free ( ( void * ) kmerBuffer );
454 free ( ( void * ) hashBanBuffer );
455 free ( ( void * ) smallerBuffer );
456 free ( ( void * ) next_name );
457 fclose ( fp );
458 return 1;