4 * Copyright (c) 2008-2012 BGI-Shenzhen <soap at genomics dot org dot cn>.
6 * This file is part of SOAPdenovo.
8 * SOAPdenovo is free software: you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation, either version 3 of the License, or
11 * (at your option) any later version.
13 * SOAPdenovo is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
18 * You should have received a copy of the GNU General Public License
19 * along with SOAPdenovo. If not, see <http://www.gnu.org/licenses/>.
29 static int tip_c
; //tips counter
30 static long long * linearCounter
; //counter for linear kmer node number
33 static void Mark1in1outNode ();
34 static void thread_mark ( KmerSet
* set
, unsigned char thrdID
);
37 static void printKmer(Kmer kmer)
39 printKmerSeq(stdout,kmer);
43 static int clipTipFromNode ( kmer_t
* node1
, int cut_len
, boolean THIN
)
45 unsigned char ret
= 0, in_num
, out_num
, link
;
48 Kmer tempKmer
, pre_word
, word
, bal_word
;
51 boolean smaller
, found
;
53 unsigned int max_links
, singleCvg
;
55 if (node1->linear || node1->deleted)
60 if (THIN && !node1->single)
65 in_num
= count_branch2prev ( node1
);
66 out_num
= count_branch2next ( node1
);
68 if ( in_num
== 0 && out_num
== 1 )
70 pre_word
= node1
->seq
;
72 for ( ch1
= 0; ch1
< 4; ch1
++ )
74 link
= get_kmer_right_cov ( *node1
, ch1
);
82 word
= nextKmer ( pre_word
, ch1
);
84 else if ( in_num
== 1 && out_num
== 0 )
86 pre_word
= reverseComplement ( node1
->seq
, overlaplen
);
88 for ( ch1
= 0; ch1
< 4; ch1
++ )
90 link
= get_kmer_left_cov ( *node1
, ch1
);
98 word
= nextKmer ( pre_word
, int_comp ( ch1
) );
106 bal_word
= reverseComplement ( word
, overlaplen
);
108 if ( KmerLarger ( word
, bal_word
) )
120 hash_ban
= hash_kmer ( word
);
121 setPicker
= hash_ban
% thrd_num
;
122 found
= search_kmerset ( KmerSets
[setPicker
], word
, &out_node
);
126 fprintf ( stderr
, "Kmer " );
127 PrintKmer ( stderr
, word
);
128 fprintf ( stderr
, " is not found, node1 " );
129 PrintKmer ( stderr
, node1
->seq
);
130 fprintf ( stderr
, " .\n" );
133 fprintf (stderr,"kmer %llx%llx%llx%llx not found, node1 %llx%llx%llx%llx\n", word.high1, word.low1, word.high2, word.low2, node1->seq.high1, node1->seq.low1, node1->seq.high2,
136 fprintf (stderr,"kmer %llx%llx not found, node1 %llx%llx\n", word.high, word.low, node1->seq.high, node1->seq.low);
142 while ( out_node
->linear
)
146 if ( THIN
&& !out_node
->single
)
151 if ( count
> cut_len
)
160 for ( ch
= 0; ch
< 4; ch
++ )
162 link
= get_kmer_right_cov ( *out_node
, ch
);
170 word
= nextKmer ( pre_word
, ch
);
171 bal_word
= reverseComplement ( word
, overlaplen
);
173 if ( KmerLarger ( word
, bal_word
) )
185 hash_ban
= hash_kmer ( word
);
186 setPicker
= hash_ban
% thrd_num
;
187 found
= search_kmerset ( KmerSets
[setPicker
], word
, &out_node
);
191 fprintf ( stderr
, "Kmer " );
192 PrintKmer ( stderr
, word
);
193 fprintf ( stderr
, " is not found, node1 " );
194 PrintKmer ( stderr
, node1
->seq
);
195 fprintf ( stderr
, " .\n" );
196 fprintf ( stderr
, "Pre_word " );
197 PrintKmer ( stderr
, pre_word
);
198 fprintf ( stderr
, " with %d(smaller).\n", ch
);
201 fprintf (stderr,"kmer %llx%llx%llx%llx not found, node1 %llx%llx%llx%llx\n", word.high1, word.low1, word.high2, word.low2, node1->seq.high1, node1->seq.low1, node1->seq.high2,
203 fprintf (stderr,"pre_word %llx%llx%llx%llx with %d(smaller)\n", pre_word.high1, pre_word.low1, pre_word.high2, pre_word.low2, ch);
205 fprintf (stderr,"kmer %llx%llx not found, node1 %llx%llx\n", word.high, word.low, node1->seq.high, node1->seq.low);
206 fprintf (stderr,"pre_word %llx%llx with %d(smaller)\n", pre_word.high, pre_word.low, ch);
216 for ( ch
= 0; ch
< 4; ch
++ )
218 link
= get_kmer_left_cov ( *out_node
, ch
);
226 word
= nextKmer ( pre_word
, int_comp ( ch
) );
227 bal_word
= reverseComplement ( word
, overlaplen
);
229 if ( KmerLarger ( word
, bal_word
) )
241 hash_ban
= hash_kmer ( word
);
242 setPicker
= hash_ban
% thrd_num
;
243 found
= search_kmerset ( KmerSets
[setPicker
], word
, &out_node
);
247 fprintf ( stderr
, "Kmer " );
248 PrintKmer ( stderr
, word
);
249 fprintf ( stderr
, " is not found, node1 " );
250 PrintKmer ( stderr
, node1
->seq
);
251 fprintf ( stderr
, " .\n" );
252 fprintf ( stderr
, "Pre_word " );
253 PrintKmer ( stderr
, reverseComplement ( pre_word
, overlaplen
) );
254 fprintf ( stderr
, " with %d(larger).\n", int_comp ( ch
) );
257 fprintf (stderr,"kmer %llx%llx%llx%llx not found, node1 %llx%llx%llx%llx\n", word.high1, word.low1, word.high2, word.low2, node1->seq.high1, node1->seq.low1, node1->seq.high2,
259 fprintf (stderr,"pre_word %llx%llx%llx%llx with %d(larger)\n", reverseComplement (pre_word, overlaplen).high1, reverseComplement (pre_word, overlaplen).low1,
260 reverseComplement (pre_word, overlaplen).high2, reverseComplement (pre_word, overlaplen).low2, int_comp (ch));
262 fprintf (stderr,"kmer %llx%llx not found, node1 %llx%llx\n", word.high, word.low, node1->seq.high, node1->seq.low);
263 fprintf (stderr,"pre_word %llx%llx with %d(larger)\n", reverseComplement (pre_word, overlaplen).high, reverseComplement (pre_word, overlaplen).low, int_comp (ch));
271 if ( ( sum
= count_branch2next ( out_node
) + count_branch2prev ( out_node
) ) == 1 )
275 out_node
->deleted
= 1;
280 ch
= firstCharInKmer ( pre_word
);
286 dislink2prevUncertain ( out_node
, ch
, smaller
);
287 out_node
->linear
= 0;
291 // make sure this tip doesn't provide most links to out_node
294 for ( ch1
= 0; ch1
< 4; ch1
++ )
298 singleCvg
= get_kmer_left_cov ( *out_node
, ch1
);
300 if ( singleCvg
> max_links
)
302 max_links
= singleCvg
;
307 singleCvg
= get_kmer_right_cov ( *out_node
, ch1
);
309 if ( singleCvg
> max_links
)
311 max_links
= singleCvg
;
316 if ( smaller
&& get_kmer_left_cov ( *out_node
, ch
) < max_links
)
320 dislink2prevUncertain ( out_node
, ch
, smaller
);
322 if ( count_branch2prev ( out_node
) == 1 && count_branch2next ( out_node
) == 1 )
324 out_node
->linear
= 1;
330 if ( !smaller
&& get_kmer_right_cov ( *out_node
, int_comp ( ch
) ) < max_links
)
334 dislink2prevUncertain ( out_node
, ch
, smaller
);
336 if ( count_branch2prev ( out_node
) == 1 && count_branch2next ( out_node
) == 1 )
338 out_node
->linear
= 1;
349 /*************************************************
353 Removes the tips starting from a kmer whose coverage is 1,
354 and marks the linear kmers again (max tips length is 2 * k-mer).
361 *************************************************/
363 void removeSingleTips ()
365 int i
, flag
= 0, cut_len_tip
;
368 //count_ends(hash_table);
369 cut_len_tip
= 2 * overlaplen
; // >= maxReadLen4all-overlaplen+1 ? 2*overlaplen : maxReadLen4all-overlaplen+1;
370 //if(cut_len_tip > 100) cut_len_tip = 100;
371 fprintf ( stderr
, "Start to remove frequency-one-kmer tips shorter than %d.\n", cut_len_tip
);
374 for ( i
= 0; i
< thrd_num
; i
++ )
379 while ( set
->iter_ptr
< set
->size
)
381 if ( !is_kmer_entity_null ( set
->flags
, set
->iter_ptr
) )
383 rs
= set
->array
+ set
->iter_ptr
;
385 if ( !rs
->linear
&& !rs
->deleted
&& rs
->single
)
387 flag
+= clipTipFromNode ( rs
, cut_len_tip
, 1 );
390 // flag += clipTipFromNode (rs, cut_len_tip, 1);
397 fprintf ( stderr
, "Total %d tip(s) removed.\n", tip_c
);
402 /*************************************************
406 Removes tips and marks the linear kmers again(max tips length is 2 * k-mer).
413 *************************************************/
414 void removeMinorTips ()
416 int i
, flag
= 0, cut_len_tip
;
419 //count_ends(hash_table);
420 //cut_len_tip = 2*overlaplen >= maxReadLen4all-overlaplen+1 ? 2*overlaplen : maxReadLen4all-overlaplen+1;
421 cut_len_tip
= 2 * overlaplen
;
422 //if(cut_len_tip > 100) cut_len_tip = 100;
423 fprintf ( stderr
, "Start to remove tips with minority links.\n" );
432 for ( i
= 0; i
< thrd_num
; i
++ )
437 while ( set
->iter_ptr
< set
->size
)
439 if ( !is_kmer_entity_null ( set
->flags
, set
->iter_ptr
) )
441 rs
= set
->array
+ set
->iter_ptr
;
443 if ( !rs
->linear
&& !rs
->deleted
)
445 flag
+= clipTipFromNode ( rs
, cut_len_tip
, 0 );
448 // flag += clipTipFromNode (rs, cut_len_tip, 0);
454 // fprintf (stderr,"Remove minor tips in kmer set %d is done.\n", i);
457 fprintf ( stderr
, "%d tip(s) removed in cycle %d.\n", flag
, round
++ );
461 for (i = 0; i < thrd_num; i++)
471 while (set->iter_ptr < set->size)
473 if (!is_kmer_entity_null (set->flags, set->iter_ptr))
475 rs = set->array + set->iter_ptr;
476 flag += clipTipFromNode (rs, cut_len_tip, 0);
483 fprintf (stderr,"Remove minor tips in kmer set %d is done.\n", i);
486 fprintf ( stderr
, "Total %d tip(s) removed.\n", tip_c
);
490 static void threadRoutine ( void * para
)
494 prm
= ( PARAMETER
* ) para
;
497 //printf("%dth thread with threadID %d, hash_table %p\n",id,prm.threadID,prm.hash_table);
500 if ( * ( prm
->selfSignal
) == 2 )
502 * ( prm
->selfSignal
) = 0;
505 else if ( * ( prm
->selfSignal
) == 1 )
507 thread_mark ( KmerSets
[id
], id
);
508 * ( prm
->selfSignal
) = 0;
515 static void creatThrds ( pthread_t
* threads
, PARAMETER
* paras
)
520 for ( i
= 0; i
< thrd_num
; i
++ )
522 if ( ( temp
= pthread_create ( &threads
[i
], NULL
, ( void * ) threadRoutine
, & ( paras
[i
] ) ) ) != 0 )
524 fprintf ( stderr
, "Create threads failed.\n" );
529 fprintf ( stderr
, "%d thread(s) initialized.\n", thrd_num
);
532 static void thread_mark ( KmerSet
* set
, unsigned char thrdID
)
538 while ( set
->iter_ptr
< set
->size
)
540 if ( !is_kmer_entity_null ( set
->flags
, set
->iter_ptr
) )
542 rs
= set
->array
+ set
->iter_ptr
;
544 if ( rs
->deleted
|| rs
->linear
)
550 in_num
= count_branch2prev ( rs
);
551 out_num
= count_branch2next ( rs
);
553 if ( in_num
== 1 && out_num
== 1 )
556 linearCounter
[thrdID
]++;
563 //printf("%lld more linear\n",linearCounter[thrdID]);
566 static void thread_wait ( pthread_t
* threads
)
570 for ( i
= 0; i
< thrd_num
; i
++ )
571 if ( threads
[i
] != 0 )
573 pthread_join ( threads
[i
], NULL
);
577 static void sendWorkSignal ( unsigned char SIG
, unsigned char * thrdSignals
)
581 for ( t
= 0; t
< thrd_num
; t
++ )
583 thrdSignals
[t
+ 1] = SIG
;
590 for ( t
= 0; t
< thrd_num
; t
++ )
591 if ( thrdSignals
[t
+ 1] )
603 static void Mark1in1outNode ()
606 long long counter
= 0;
607 pthread_t threads
[thrd_num
];
608 unsigned char thrdSignal
[thrd_num
+ 1];
609 PARAMETER paras
[thrd_num
];
611 for ( i
= 0; i
< thrd_num
; i
++ )
613 thrdSignal
[i
+ 1] = 0;
614 paras
[i
].threadID
= i
;
615 paras
[i
].mainSignal
= &thrdSignal
[0];
616 paras
[i
].selfSignal
= &thrdSignal
[i
+ 1];
619 creatThrds ( threads
, paras
);
621 linearCounter
= ( long long * ) ckalloc ( thrd_num
* sizeof ( long long ) );
623 for ( i
= 0; i
< thrd_num
; i
++ )
625 linearCounter
[i
] = 0;
628 sendWorkSignal ( 1, thrdSignal
); //mark linear nodes
629 sendWorkSignal ( 2, thrdSignal
); //stop threads
630 thread_wait ( threads
);
632 for ( i
= 0; i
< thrd_num
; i
++ )
634 counter
+= linearCounter
[i
];
637 free ( ( void * ) linearCounter
);
638 fprintf ( stderr
, "%lld linear node(s) marked.\n", counter
);