modified: makefile
[GalaxyCodeBases.git] / BGI / SOAPdenovo2 / standardPregraph / cutTipPreGraph.c
blobb1fb2c87a672170660dd708836d323e43b7b4efc
1 /*
2 * cutTipPreGraph.c
4 * Copyright (c) 2008-2012 BGI-Shenzhen <soap at genomics dot org dot cn>.
6 * This file is part of SOAPdenovo.
8 * SOAPdenovo is free software: you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation, either version 3 of the License, or
11 * (at your option) any later version.
13 * SOAPdenovo is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
18 * You should have received a copy of the GNU General Public License
19 * along with SOAPdenovo. If not, see <http://www.gnu.org/licenses/>.
23 #include "stdinc.h"
24 #include "newhash.h"
25 #include "kmerhash.h"
26 #include "extfunc.h"
27 #include "extvab.h"
29 static int tip_c; //tips counter
30 static long long * linearCounter; //counter for linear kmer node number
33 static void Mark1in1outNode ();
34 static void thread_mark ( KmerSet * set, unsigned char thrdID );
37 static void printKmer(Kmer kmer)
39 printKmerSeq(stdout,kmer);
40 printf("\n");
43 static int clipTipFromNode ( kmer_t * node1, int cut_len, boolean THIN )
45 unsigned char ret = 0, in_num, out_num, link;
46 int sum, count;
47 kmer_t * out_node;
48 Kmer tempKmer, pre_word, word, bal_word;
49 ubyte8 hash_ban;
50 char ch1, ch;
51 boolean smaller, found;
52 int setPicker;
53 unsigned int max_links, singleCvg;
55 if (node1->linear || node1->deleted)
57 return ret;
60 if (THIN && !node1->single)
62 return ret;
65 in_num = count_branch2prev ( node1 );
66 out_num = count_branch2next ( node1 );
68 if ( in_num == 0 && out_num == 1 )
70 pre_word = node1->seq;
72 for ( ch1 = 0; ch1 < 4; ch1++ )
74 link = get_kmer_right_cov ( *node1, ch1 );
76 if ( link )
78 break;
82 word = nextKmer ( pre_word, ch1 );
84 else if ( in_num == 1 && out_num == 0 )
86 pre_word = reverseComplement ( node1->seq, overlaplen );
88 for ( ch1 = 0; ch1 < 4; ch1++ )
90 link = get_kmer_left_cov ( *node1, ch1 );
92 if ( link )
94 break;
98 word = nextKmer ( pre_word, int_comp ( ch1 ) );
100 else
102 return ret;
105 count = 1;
106 bal_word = reverseComplement ( word, overlaplen );
108 if ( KmerLarger ( word, bal_word ) )
110 tempKmer = bal_word;
111 bal_word = word;
112 word = tempKmer;
113 smaller = 0;
115 else
117 smaller = 1;
120 hash_ban = hash_kmer ( word );
121 setPicker = hash_ban % thrd_num;
122 found = search_kmerset ( KmerSets[setPicker], word, &out_node );
124 if ( !found )
126 fprintf ( stderr, "Kmer " );
127 PrintKmer ( stderr, word );
128 fprintf ( stderr, " is not found, node1 " );
129 PrintKmer ( stderr, node1->seq );
130 fprintf ( stderr, " .\n" );
132 #ifdef MER127
133 fprintf (stderr,"kmer %llx%llx%llx%llx not found, node1 %llx%llx%llx%llx\n", word.high1, word.low1, word.high2, word.low2, node1->seq.high1, node1->seq.low1, node1->seq.high2,
134 node1->seq.low2);
135 #else
136 fprintf (stderr,"kmer %llx%llx not found, node1 %llx%llx\n", word.high, word.low, node1->seq.high, node1->seq.low);
137 #endif
139 exit ( 1 );
142 while ( out_node->linear )
144 count++;
146 if ( THIN && !out_node->single )
148 break;
151 if ( count > cut_len )
153 return ret;
156 if ( smaller )
158 pre_word = word;
160 for ( ch = 0; ch < 4; ch++ )
162 link = get_kmer_right_cov ( *out_node, ch );
164 if ( link )
166 break;
170 word = nextKmer ( pre_word, ch );
171 bal_word = reverseComplement ( word, overlaplen );
173 if ( KmerLarger ( word, bal_word ) )
175 tempKmer = bal_word;
176 bal_word = word;
177 word = tempKmer;
178 smaller = 0;
180 else
182 smaller = 1;
185 hash_ban = hash_kmer ( word );
186 setPicker = hash_ban % thrd_num;
187 found = search_kmerset ( KmerSets[setPicker], word, &out_node );
189 if ( !found )
191 fprintf ( stderr, "Kmer " );
192 PrintKmer ( stderr, word );
193 fprintf ( stderr, " is not found, node1 " );
194 PrintKmer ( stderr, node1->seq );
195 fprintf ( stderr, " .\n" );
196 fprintf ( stderr, "Pre_word " );
197 PrintKmer ( stderr, pre_word );
198 fprintf ( stderr, " with %d(smaller).\n", ch );
200 #ifdef MER127
201 fprintf (stderr,"kmer %llx%llx%llx%llx not found, node1 %llx%llx%llx%llx\n", word.high1, word.low1, word.high2, word.low2, node1->seq.high1, node1->seq.low1, node1->seq.high2,
202 node1->seq.low2);
203 fprintf (stderr,"pre_word %llx%llx%llx%llx with %d(smaller)\n", pre_word.high1, pre_word.low1, pre_word.high2, pre_word.low2, ch);
204 #else
205 fprintf (stderr,"kmer %llx%llx not found, node1 %llx%llx\n", word.high, word.low, node1->seq.high, node1->seq.low);
206 fprintf (stderr,"pre_word %llx%llx with %d(smaller)\n", pre_word.high, pre_word.low, ch);
207 #endif
209 exit ( 1 );
212 else
214 pre_word = bal_word;
216 for ( ch = 0; ch < 4; ch++ )
218 link = get_kmer_left_cov ( *out_node, ch );
220 if ( link )
222 break;
226 word = nextKmer ( pre_word, int_comp ( ch ) );
227 bal_word = reverseComplement ( word, overlaplen );
229 if ( KmerLarger ( word, bal_word ) )
231 tempKmer = bal_word;
232 bal_word = word;
233 word = tempKmer;
234 smaller = 0;
236 else
238 smaller = 1;
241 hash_ban = hash_kmer ( word );
242 setPicker = hash_ban % thrd_num;
243 found = search_kmerset ( KmerSets[setPicker], word, &out_node );
245 if ( !found )
247 fprintf ( stderr, "Kmer " );
248 PrintKmer ( stderr, word );
249 fprintf ( stderr, " is not found, node1 " );
250 PrintKmer ( stderr, node1->seq );
251 fprintf ( stderr, " .\n" );
252 fprintf ( stderr, "Pre_word " );
253 PrintKmer ( stderr, reverseComplement ( pre_word, overlaplen ) );
254 fprintf ( stderr, " with %d(larger).\n", int_comp ( ch ) );
256 #ifdef MER127
257 fprintf (stderr,"kmer %llx%llx%llx%llx not found, node1 %llx%llx%llx%llx\n", word.high1, word.low1, word.high2, word.low2, node1->seq.high1, node1->seq.low1, node1->seq.high2,
258 node1->seq.low2);
259 fprintf (stderr,"pre_word %llx%llx%llx%llx with %d(larger)\n", reverseComplement (pre_word, overlaplen).high1, reverseComplement (pre_word, overlaplen).low1,
260 reverseComplement (pre_word, overlaplen).high2, reverseComplement (pre_word, overlaplen).low2, int_comp (ch));
261 #else
262 fprintf (stderr,"kmer %llx%llx not found, node1 %llx%llx\n", word.high, word.low, node1->seq.high, node1->seq.low);
263 fprintf (stderr,"pre_word %llx%llx with %d(larger)\n", reverseComplement (pre_word, overlaplen).high, reverseComplement (pre_word, overlaplen).low, int_comp (ch));
264 #endif
266 exit ( 1 );
271 if ( ( sum = count_branch2next ( out_node ) + count_branch2prev ( out_node ) ) == 1 )
273 tip_c++;
274 node1->deleted = 1;
275 out_node->deleted = 1;
276 return 1;
278 else
280 ch = firstCharInKmer ( pre_word );
282 if ( THIN )
284 tip_c++;
285 node1->deleted = 1;
286 dislink2prevUncertain ( out_node, ch, smaller );
287 out_node->linear = 0;
288 return 1;
291 // make sure this tip doesn't provide most links to out_node
292 max_links = 0;
294 for ( ch1 = 0; ch1 < 4; ch1++ )
296 if ( smaller )
298 singleCvg = get_kmer_left_cov ( *out_node, ch1 );
300 if ( singleCvg > max_links )
302 max_links = singleCvg;
305 else
307 singleCvg = get_kmer_right_cov ( *out_node, ch1 );
309 if ( singleCvg > max_links )
311 max_links = singleCvg;
316 if ( smaller && get_kmer_left_cov ( *out_node, ch ) < max_links )
318 tip_c++;
319 node1->deleted = 1;
320 dislink2prevUncertain ( out_node, ch, smaller );
322 if ( count_branch2prev ( out_node ) == 1 && count_branch2next ( out_node ) == 1 )
324 out_node->linear = 1;
327 return 1;
330 if ( !smaller && get_kmer_right_cov ( *out_node, int_comp ( ch ) ) < max_links )
332 tip_c++;
333 node1->deleted = 1;
334 dislink2prevUncertain ( out_node, ch, smaller );
336 if ( count_branch2prev ( out_node ) == 1 && count_branch2next ( out_node ) == 1 )
338 out_node->linear = 1;
341 return 1;
345 return 0;
349 /*************************************************
350 Function:
351 removeSingleTips
352 Description:
353 Removes the tips starting from a kmer whose coverage is 1,
354 and marks the linear kmers again (max tips length is 2 * k-mer).
355 Input:
356 None.
357 Output:
358 None.
359 Return:
360 None.
361 *************************************************/
363 void removeSingleTips ()
365 int i, flag = 0, cut_len_tip;
366 kmer_t * rs;
367 KmerSet * set;
368 //count_ends(hash_table);
369 cut_len_tip = 2 * overlaplen; // >= maxReadLen4all-overlaplen+1 ? 2*overlaplen : maxReadLen4all-overlaplen+1;
370 //if(cut_len_tip > 100) cut_len_tip = 100;
371 fprintf ( stderr, "Start to remove frequency-one-kmer tips shorter than %d.\n", cut_len_tip );
372 tip_c = 0;
374 for ( i = 0; i < thrd_num; i++ )
376 set = KmerSets[i];
377 set->iter_ptr = 0;
379 while ( set->iter_ptr < set->size )
381 if ( !is_kmer_entity_null ( set->flags, set->iter_ptr ) )
383 rs = set->array + set->iter_ptr;
385 if ( !rs->linear && !rs->deleted && rs->single )
387 flag += clipTipFromNode ( rs, cut_len_tip, 1 );
390 // flag += clipTipFromNode (rs, cut_len_tip, 1);
393 set->iter_ptr++;
397 fprintf ( stderr, "Total %d tip(s) removed.\n", tip_c );
398 Mark1in1outNode ();
402 /*************************************************
403 Function:
404 removeSingleTips
405 Description:
406 Removes tips and marks the linear kmers again(max tips length is 2 * k-mer).
407 Input:
408 None.
409 Output:
410 None.
411 Return:
412 None.
413 *************************************************/
414 void removeMinorTips ()
416 int i, flag = 0, cut_len_tip;
417 kmer_t * rs;
418 KmerSet * set;
419 //count_ends(hash_table);
420 //cut_len_tip = 2*overlaplen >= maxReadLen4all-overlaplen+1 ? 2*overlaplen : maxReadLen4all-overlaplen+1;
421 cut_len_tip = 2 * overlaplen;
422 //if(cut_len_tip > 100) cut_len_tip = 100;
423 fprintf ( stderr, "Start to remove tips with minority links.\n" );
424 tip_c = 0;
425 flag = 1;
426 int round = 1;
428 while ( flag )
430 flag = 0;
432 for ( i = 0; i < thrd_num; i++ )
434 set = KmerSets[i];
435 set->iter_ptr = 0;
437 while ( set->iter_ptr < set->size )
439 if ( !is_kmer_entity_null ( set->flags, set->iter_ptr ) )
441 rs = set->array + set->iter_ptr;
443 if ( !rs->linear && !rs->deleted )
445 flag += clipTipFromNode ( rs, cut_len_tip, 0 );
448 // flag += clipTipFromNode (rs, cut_len_tip, 0);
451 set->iter_ptr++;
454 // fprintf (stderr,"Remove minor tips in kmer set %d is done.\n", i);
457 fprintf ( stderr, "%d tip(s) removed in cycle %d.\n", flag, round++ );
461 for (i = 0; i < thrd_num; i++)
463 set = KmerSets[i];
464 flag = 1;
466 while (flag)
468 flag = 0;
469 set->iter_ptr = 0;
471 while (set->iter_ptr < set->size)
473 if (!is_kmer_entity_null (set->flags, set->iter_ptr))
475 rs = set->array + set->iter_ptr;
476 flag += clipTipFromNode (rs, cut_len_tip, 0);
479 set->iter_ptr++;
483 fprintf (stderr,"Remove minor tips in kmer set %d is done.\n", i);
486 fprintf ( stderr, "Total %d tip(s) removed.\n", tip_c );
487 Mark1in1outNode ();
490 static void threadRoutine ( void * para )
492 PARAMETER * prm;
493 unsigned char id;
494 prm = ( PARAMETER * ) para;
495 id = prm->threadID;
497 //printf("%dth thread with threadID %d, hash_table %p\n",id,prm.threadID,prm.hash_table);
498 while ( 1 )
500 if ( * ( prm->selfSignal ) == 2 )
502 * ( prm->selfSignal ) = 0;
503 break;
505 else if ( * ( prm->selfSignal ) == 1 )
507 thread_mark ( KmerSets[id], id );
508 * ( prm->selfSignal ) = 0;
511 usleep ( 1 );
515 static void creatThrds ( pthread_t * threads, PARAMETER * paras )
517 unsigned char i;
518 int temp;
520 for ( i = 0; i < thrd_num; i++ )
522 if ( ( temp = pthread_create ( &threads[i], NULL, ( void * ) threadRoutine, & ( paras[i] ) ) ) != 0 )
524 fprintf ( stderr, "Create threads failed.\n" );
525 exit ( 1 );
529 fprintf ( stderr, "%d thread(s) initialized.\n", thrd_num );
532 static void thread_mark ( KmerSet * set, unsigned char thrdID )
534 int in_num, out_num;
535 kmer_t * rs;
536 set->iter_ptr = 0;
538 while ( set->iter_ptr < set->size )
540 if ( !is_kmer_entity_null ( set->flags, set->iter_ptr ) )
542 rs = set->array + set->iter_ptr;
544 if ( rs->deleted || rs->linear )
546 set->iter_ptr++;
547 continue;;
550 in_num = count_branch2prev ( rs );
551 out_num = count_branch2next ( rs );
553 if ( in_num == 1 && out_num == 1 )
555 rs->linear = 1;
556 linearCounter[thrdID]++;
560 set->iter_ptr++;
563 //printf("%lld more linear\n",linearCounter[thrdID]);
566 static void thread_wait ( pthread_t * threads )
568 int i;
570 for ( i = 0; i < thrd_num; i++ )
571 if ( threads[i] != 0 )
573 pthread_join ( threads[i], NULL );
577 static void sendWorkSignal ( unsigned char SIG, unsigned char * thrdSignals )
579 int t;
581 for ( t = 0; t < thrd_num; t++ )
583 thrdSignals[t + 1] = SIG;
586 while ( 1 )
588 usleep ( 10 );
590 for ( t = 0; t < thrd_num; t++ )
591 if ( thrdSignals[t + 1] )
593 break;
596 if ( t == thrd_num )
598 break;
603 static void Mark1in1outNode ()
605 int i;
606 long long counter = 0;
607 pthread_t threads[thrd_num];
608 unsigned char thrdSignal[thrd_num + 1];
609 PARAMETER paras[thrd_num];
611 for ( i = 0; i < thrd_num; i++ )
613 thrdSignal[i + 1] = 0;
614 paras[i].threadID = i;
615 paras[i].mainSignal = &thrdSignal[0];
616 paras[i].selfSignal = &thrdSignal[i + 1];
619 creatThrds ( threads, paras );
620 thrdSignal[0] = 0;
621 linearCounter = ( long long * ) ckalloc ( thrd_num * sizeof ( long long ) );
623 for ( i = 0; i < thrd_num; i++ )
625 linearCounter[i] = 0;
628 sendWorkSignal ( 1, thrdSignal ); //mark linear nodes
629 sendWorkSignal ( 2, thrdSignal ); //stop threads
630 thread_wait ( threads );
632 for ( i = 0; i < thrd_num; i++ )
634 counter += linearCounter[i];
637 free ( ( void * ) linearCounter );
638 fprintf ( stderr, "%lld linear node(s) marked.\n", counter );