modified: src1/worker.c
[GalaxyCodeBases.git] / BGI / SOAPdenovo2 / sparsePregraph / build_edge.cpp
blob2b9ceabbadb4b61e44a96f5b9c92f9a719c2c207
1 /*
2 * build_edge.cpp
4 * Copyright (c) 2008-2012 BGI-Shenzhen <soap at genomics dot org dot cn>.
6 * This file is part of SOAPdenovo.
8 * SOAPdenovo is free software: you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation, either version 3 of the License, or
11 * (at your option) any later version.
13 * SOAPdenovo is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
18 * You should have received a copy of the GNU General Public License
19 * along with SOAPdenovo. If not, see <http://www.gnu.org/licenses/>.
23 #include "core.h"
24 #include "stdinc.h"
25 #include "build_edge.h"
26 #include "seq_util.h"
27 #include "zlib.h"
28 #include "sparse_kmer.h"
30 /*************************************************
31 Function:
32 RemovingWeakNodesAndEdges2
33 Description:
34 1. Removes the nodes whose frequencies are not greater than threadhold NodeCovTh.
35 2. Removes the kmer-edges (the connections between nodes) whose frequencies
36 are not greater than threadhold EdgeCovTh.
38 Some statistics function here temporarily added.
39 Input:
40 1. ht: the graph hash table
41 2. K_size: kmer size
42 3. NodeCovTh: the threadhold for removing low coverage nodes
43 4. EdgeCovTh: the threadhold for removing low coverage kmer-edges
44 5. bucket_cnt: the current bucket number
45 6. edge_cnt: the current kmer-edge number
46 Output:
47 1. bucket_cnt: the bucket number after removing weak nodes and kmer-edges
48 2. edge_cnt: the kmer-edge number after removing weak nodes and kmer-edges
49 Return:
50 None.
51 *************************************************/
52 void RemovingWeakNodesAndEdges2 ( hashtable2 * ht, int K_size, int NodeCovTh, int EdgeCovTh, size_t * bucket_cnt, size_t * edge_cnt )
54 stat_edge_num ( ht );
55 stat_edge_cvg_len ( ht );
56 int Removed_Nodes_cnt = 0, Removed_Edges_cnt = 0;
57 bucket2 * bktptr = NULL, *bktptr_tmp = NULL;
58 bucket2 ** bktp2p = NULL;
59 edge_node * edge_ptr = NULL, *next_edge = NULL, *edge_tmp = NULL;
60 int smaller;
61 fprintf ( stderr, "Start to remove weak nodes and kmer-edges.\n" );
64 for(size_t i=0;i<ht->ht_sz;++i)
66 bktptr=ht->store_pos[i];
67 while(bktptr!=NULL)
69 if(bktptr->kmer_info.cov1==0)printf("zero\n");
71 bktptr=bktptr->nxt_bucket;
74 }*/
76 //removing weak nodes
77 for ( size_t i = 0; i < ht->ht_sz; ++i )
79 bktptr = ht->store_pos[i];
81 while ( bktptr != NULL )
83 if ( bktptr->kmer_info.cov1 <= NodeCovTh )
85 bktptr->kmer_info.deleted = 1;
86 Removed_Nodes_cnt++;
87 edge_ptr = bktptr->kmer_info.right;
89 while ( edge_ptr )
91 edge_ptr->used = 1;
92 edge_ptr = edge_ptr->nxt_edge;
93 Removed_Edges_cnt++;
96 edge_ptr = bktptr->kmer_info.left;
98 while ( edge_ptr )
100 edge_ptr->used = 1;
101 edge_ptr = edge_ptr->nxt_edge;
102 Removed_Edges_cnt++;
106 bktptr = bktptr->nxt_bucket;
110 //removing dead edges
111 for ( size_t i = 0; i < ht->ht_sz; ++i )
113 bktptr = ht->store_pos[i];
115 while ( bktptr != NULL )
117 edge_ptr = bktptr->kmer_info.right;
119 while ( edge_ptr )
121 if ( edge_ptr->edge_cov <= EdgeCovTh )
123 edge_ptr->used = 1; //becasuse the cvg of edges is symmetrial, so it's ok
124 Removed_Edges_cnt++;
126 else
128 bktptr_tmp = lastKmer ( ht, K_size, bktptr, edge_ptr, 0, smaller );
130 if ( !bktptr_tmp )
132 fprintf ( stderr, "ERROR: to node not found error!\n" );
133 exit ( -1 );
136 if ( bktptr_tmp ->kmer_info.deleted )
138 edge_ptr->used = 1;
139 Removed_Edges_cnt++;
143 edge_ptr = edge_ptr->nxt_edge;
146 edge_ptr = bktptr->kmer_info.left;
148 while ( edge_ptr )
150 if ( edge_ptr->edge_cov <= EdgeCovTh )
152 edge_ptr->used = 1; //becasuse the cvg of edges is symmetrial, so it's ok
153 Removed_Edges_cnt++;
155 else
157 bktptr_tmp = lastKmer ( ht, K_size, bktptr, edge_ptr, 1, smaller );
159 if ( !bktptr_tmp )
161 fprintf ( stderr, "ERROR: to node not found error! \n" );
162 exit ( -1 );
165 if ( bktptr_tmp ->kmer_info.deleted )
167 edge_ptr->used = 1;
168 Removed_Edges_cnt++;
172 edge_ptr = edge_ptr->nxt_edge;
175 bktptr = bktptr->nxt_bucket;
179 for ( size_t i = 0; i < ht->ht_sz; ++i )
181 bktptr = ht->store_pos[i];
182 bktp2p = & ( ht->store_pos[i] );
184 while ( bktptr != NULL )
186 edge_ptr = bktptr->kmer_info.right;
188 while ( edge_ptr )
190 next_edge = edge_ptr->nxt_edge;
192 if ( edge_ptr->used )
194 removeEdge ( bktptr, edge_ptr, 0 );
195 //Removed_Edges_cnt2++;
198 edge_ptr = next_edge;
201 edge_ptr = bktptr->kmer_info.left;
203 while ( edge_ptr )
205 next_edge = edge_ptr->nxt_edge;
207 if ( edge_ptr->used )
209 removeEdge ( bktptr, edge_ptr, 1 );
210 //Removed_Edges_cnt2++;
213 edge_ptr = next_edge;
216 bktptr_tmp = bktptr->nxt_bucket;
218 if ( bktptr->kmer_info.deleted )
220 free ( bktptr );
221 ( *bktp2p ) = bktptr_tmp;
222 //Removed_Nodes_cnt2++;
224 else
226 bktp2p = & ( bktptr->nxt_bucket );
229 bktptr = bktptr_tmp;
233 fprintf ( stderr, "%llu nodes removed.\n", Removed_Nodes_cnt );
234 fprintf ( stderr, "%llu edges removed.\n", Removed_Edges_cnt );
235 fprintf ( stderr, "\n" );
236 ( *bucket_cnt ) -= Removed_Nodes_cnt;
237 ( *edge_cnt ) -= Removed_Edges_cnt;
242 /*************************************************
243 Function:
244 removeMinorTips
245 Description:
246 Removes minor tips whose length are not longer than a threadhold usually set to 2*Kmer.
247 Input:
248 1. ht: the graph hash table
249 2. K_size: kmer size
250 3. cut_len_tip: the threadhold for tips' length
251 Output:
252 1. tip_c: useless
253 Return:
254 None.
255 *************************************************/
256 void removeMinorTips ( struct hashtable2 * ht, int K_size, int cut_len_tip, int & tip_c )
258 mask1in1out ( ht );
259 bucket2 * bktptr = NULL;
260 size_t flag = 1;
261 size_t total = 0;
262 int j = 0;
264 while ( flag )
266 flag = 0;
268 for ( size_t i = 0; i < ht->ht_sz; ++i )
270 bktptr = ht->store_pos[i];
272 while ( bktptr != NULL )
274 flag += clipTipFromNode ( ht, K_size, bktptr, cut_len_tip );
275 bktptr = bktptr->nxt_bucket;
279 j++;
281 if ( flag )
283 fprintf ( stderr, "%llu tips removed in cycle %d.\n\n", flag, j );
284 total += flag;
286 else
288 fprintf ( stderr, "Total %llu tips removed.\n", total );
291 if ( flag ) { mask1in1out ( ht ); }
296 /*************************************************
297 Function:
298 mask1in1out
299 Description:
300 Marks "1 in 1 out" node linear.
301 Input:
302 1. ht: the graph hashtable
303 Output:
304 None.
305 Return:
306 None.
307 *************************************************/
308 static void mask1in1out ( hashtable2 * ht )
310 size_t total = 0, linear = 0;
311 static int call_times;
312 call_times++;
314 for ( size_t i = 0; i < ht->ht_sz; ++i )
316 struct bucket2 * bkt_ptr = ht->store_pos[i];
318 while ( bkt_ptr )
320 total++;//for stat
322 if ( ( bkt_ptr->kmer_info.left != NULL && bkt_ptr->kmer_info.left->nxt_edge == NULL )
323 && ( bkt_ptr->kmer_info.right != NULL && bkt_ptr->kmer_info.right->nxt_edge == NULL ) )
325 bkt_ptr->kmer_info.linear = 1;
326 linear++;//for stat
328 else
330 bkt_ptr->kmer_info.linear = 0;
333 bkt_ptr = bkt_ptr->nxt_bucket;
337 //fprintf(stderr,"Masking linear nodes, times: %d\n",call_times);
338 fprintf ( stderr, "Total nodes number: %llu\n", total );
339 fprintf ( stderr, "Linear nodes number: %llu\n", linear );
343 /*************************************************
344 Function:
345 clipTipFromNode
346 Description:
347 Removes tips from an end node.
348 Input:
349 1. ht: the graph hashtable
350 2. K_size: the kmer size
351 3. node: the tips starting node
352 4. cut_len_tip: the threadhold of tips' length
353 Output:
354 None.
355 Return:
356 1 if clips a tip successfully.
357 *************************************************/
358 static int clipTipFromNode ( hashtable2 * ht, int K_size, bucket2 * node, int cut_len_tip ) //only for remove minor tips
360 //linear return 0
361 if ( node->kmer_info.linear || node->kmer_info.deleted )
363 return 0;
366 // for not linear
367 int in_num, out_num;
368 int sum_edge_len = 0;
369 int smaller;
370 bool is_left;
371 bool pre_is_left;
372 edge_node * edge0;
373 in_num = count_left_edge_num ( node );
374 out_num = count_right_edge_num ( node );
376 if ( in_num == 0 && out_num == 1 ) { is_left = 0; }
377 else if ( in_num == 1 && out_num == 0 ) { is_left = 1; }
378 else { return 0; }
380 if ( is_left ) { edge0 = node->kmer_info.left; }
381 else { edge0 = node->kmer_info.right; }
383 bucket2 * next, *pre_node;
384 pre_node = node;
385 next = lastKmer ( ht, K_size, node, edge0, is_left, smaller );
387 while ( next->kmer_info.linear )
389 if ( sum_edge_len > cut_len_tip ) { return 0; }
391 is_left = ! ( is_left ^ smaller );
393 if ( is_left ) { edge0 = next->kmer_info.left; }
394 else { edge0 = next->kmer_info.right; }
396 sum_edge_len += edge0->len + 1;
397 pre_node = next;
398 next = lastKmer ( ht, K_size, next, edge0, is_left, smaller );
400 if ( !next )
402 fprintf ( stderr, "ERROR: linear edge not found error !\n" );
403 exit ( -1 );
407 pre_is_left = is_left;
408 is_left = ( is_left ^ smaller ); //back check orientation...
409 in_num = count_left_edge_num ( next );
410 out_num = count_right_edge_num ( next );
412 if ( is_left ) //check the last node left branch or not
414 if ( in_num == 1 )
416 return 0;
418 else if ( in_num > 1 )
420 edge_node * edge1 = NULL, * temp_edge = NULL;
421 bucket2 * temp_bucket = NULL;
422 int max_cvg = 0, single_cvg = 0, temp_smaller;
423 temp_edge = next->kmer_info.left;
425 while ( temp_edge )
427 single_cvg = temp_edge->edge_cov;
429 if ( single_cvg > max_cvg ) { max_cvg = single_cvg; }
431 if ( !edge1 )
433 temp_bucket = lastKmer ( ht, K_size, next, temp_edge, 1, temp_smaller );
435 if ( !temp_bucket )
437 fprintf ( stderr, "ERROR: edge to NULL found error ! a\n" );
438 exit ( 1 );
441 if ( pre_node == temp_bucket ) { edge1 = temp_edge; }
444 temp_edge = temp_edge->nxt_edge;
447 if ( !edge1 )
449 fprintf ( stderr, "ERROR: edge to node not found error ! b\n" );
450 exit ( 1 );
453 if ( edge1->edge_cov < max_cvg )
455 removeEdge ( next, edge1, 1 );
456 removeEdge ( pre_node, edge0, pre_is_left );
457 node->kmer_info.deleted = 1;
458 pre_node->kmer_info.deleted = 1;
459 return 1;
461 else
463 return 0;
466 else
468 fprintf ( stderr, "ERROR: left tips oritation error or edge not found error ! a\n" );
469 exit ( -1 );
472 else
474 if ( out_num == 1 )
476 return 0;
478 else if ( out_num > 1 )
480 edge_node * edge1 = NULL, * temp_edge = NULL;
481 bucket2 * temp_bucket = NULL;
482 int max_cvg = 0, single_cvg = 0, temp_smaller;
483 //ok change it to a edge_remove thred_hold locally later
484 //or only if it is the least cvg ->remove it
485 temp_edge = next->kmer_info.right;
487 while ( temp_edge )
489 single_cvg = temp_edge->edge_cov;
491 if ( single_cvg > max_cvg ) { max_cvg = single_cvg; }
493 if ( !edge1 )
495 temp_bucket = lastKmer ( ht, K_size, next, temp_edge, 0, temp_smaller );
497 if ( !temp_bucket )
499 fprintf ( stderr, "ERROR: edge to NULL found, error ! b\n" );
500 exit ( -1 );
503 if ( pre_node == temp_bucket ) { edge1 = temp_edge; }
506 temp_edge = temp_edge->nxt_edge;
509 if ( !edge1 )
511 fprintf ( stderr, "ERROR: edge to node not found error ! e\n" );
512 exit ( 1 );
515 if ( edge1->edge_cov < max_cvg )
517 removeEdge ( next, edge1, 0 );
518 removeEdge ( pre_node, edge0, pre_is_left );
519 node->kmer_info.deleted = 1;
520 pre_node->kmer_info.deleted = 1;
521 return 1;
523 else
525 return 0;
528 else
530 fprintf ( stderr, "ERROR: right tips oritation error or edge not found error! b\n" );
531 exit ( -1 );
538 static int count_left_edge_num ( bucket2 * bkt ) //63 127 same
540 int ret = 0;
542 if ( bkt )
544 edge_node * left_edge = bkt->kmer_info.left;
546 while ( left_edge )
548 ret++;
549 left_edge = left_edge->nxt_edge;
553 return ret;
556 static int count_right_edge_num ( bucket2 * bkt ) //63 127 same
558 int ret = 0;
560 if ( bkt )
562 edge_node * right_edge = bkt->kmer_info.right;
564 while ( right_edge )
566 ret++;
567 right_edge = right_edge->nxt_edge;
571 return ret;
575 /*************************************************
576 Function:
577 lastKmer
578 Description:
579 Searches the node that a node's kmer-edge end with.
580 Input:
581 1. ht: the graph hashtable
582 2. K_size: kmer size
583 3. node: the node whose kmer-edge will be searched
584 4. edge: the kmer-edge
585 5. is_left: whether the kmer-edge on the node's left side
586 Output:
587 1. smaller: whether the searched result, a kmer is smaller than its reversed complement
588 Return:
589 A pointer to the found node.
590 Null if not found.
591 *************************************************/
592 static bucket2 * lastKmer ( hashtable2 * ht, int K_size, bucket2 * node, edge_node * edge, int is_left, int & smaller ) //NEW
594 if ( !node || !edge ) { return NULL; }
596 kmer_t2 t_kmer, f_kmer;
597 t_kmer = node->kmer_t2;
598 kmer_t2 edge_seq;
599 memset ( edge_seq.kmer, 0, sizeof ( edge_seq ) );
600 ( edge_seq.kmer ) [sizeof ( edge_seq ) / sizeof ( uint64_t ) - 1] = edge->edge;
601 int edge_len = edge->len + 1;
603 if ( edge_len > K_size )
605 fprintf ( stderr, "ERROR: g value should be no great than kmer size!\n" );
606 exit ( -1 );
609 kmer_t2 KMER_FILTER;
610 initKmerFilter ( K_size, &KMER_FILTER );
612 if ( is_left ) //left edge
614 kmerMoveRight ( &t_kmer, edge_len );
615 kmerMoveLeft ( &edge_seq, K_size - edge_len );
616 kmerOr ( &t_kmer, &edge_seq );
617 kmerAnd ( &t_kmer, &KMER_FILTER );
619 else
621 kmerMoveLeft ( &t_kmer, edge_len );
622 kmerOr ( &t_kmer, &edge_seq );
623 kmerAnd ( &t_kmer, &KMER_FILTER );
626 f_kmer = t_kmer;
627 reverseCompKmer ( &f_kmer, K_size );
629 if ( kmerCompare ( &t_kmer, &f_kmer ) > 0 )
631 t_kmer = f_kmer;
632 smaller = 0;
634 else { smaller = 1; }
636 return search_kmer ( ht, &t_kmer );
639 static bucket2 * search_kmer ( hashtable2 * ht, kmer_t2 * t_kmer )
641 uint64_t hv = MurmurHash64A ( t_kmer, sizeof ( kmer_t2 ), 0 );
642 size_t hash_idx = ( size_t ) ( hv % ht->ht_sz );
643 bucket2 * starter = ht->store_pos[hash_idx];
645 while ( starter )
647 if ( kmerCompare ( & ( starter->kmer_t2 ), t_kmer ) == 0 )
649 return starter;
652 starter = starter->nxt_bucket;
655 return NULL;
661 static void removeEdge ( bucket2 * node, edge_node * edge, int is_left ) // remove only one side ... //63 127 same ...
663 edge_node * pre_edge = NULL, *cur_edge = NULL, *nxt_edge = NULL;
665 if ( !node || !edge )
667 return ;
670 if ( is_left )
672 cur_edge = node->kmer_info.left;
674 if ( cur_edge == NULL )
676 return ;
679 if ( cur_edge == edge )
681 nxt_edge = cur_edge->nxt_edge;
682 free ( cur_edge );
683 cur_edge = NULL;
684 node->kmer_info.left = nxt_edge;
685 return ;
688 else
690 cur_edge = node->kmer_info.right;
692 if ( cur_edge == NULL )
694 return ;
697 if ( cur_edge == edge )
699 nxt_edge = cur_edge->nxt_edge;
700 free ( cur_edge );
701 cur_edge = NULL;
702 node->kmer_info.right = nxt_edge;
703 return ;
707 pre_edge = cur_edge;
708 cur_edge = cur_edge->nxt_edge;
710 while ( cur_edge )
712 if ( cur_edge == edge ) { break; }
714 pre_edge = cur_edge;
715 cur_edge = cur_edge->nxt_edge;
718 if ( cur_edge )
720 nxt_edge = cur_edge->nxt_edge;
721 free ( cur_edge );
722 cur_edge = NULL;
723 pre_edge->nxt_edge = nxt_edge;
729 static void stat_edge_num ( hashtable2 * ht ) //63 127 same
731 int l_num = 0, r_num = 0;
732 size_t total_edge_num = 0, total_node_num = 0;
733 bucket2 * bkt = NULL;
734 map<int, size_t> edge_num_map;
736 for ( size_t i = 0; i < ht->ht_sz; i++ )
738 bkt = ht->store_pos[i];
740 while ( bkt )
742 total_node_num++;
743 l_num = count_left_edge_num ( bkt );
744 r_num = count_right_edge_num ( bkt );
745 total_edge_num += ( l_num + r_num );
746 edge_num_map[l_num]++;
747 edge_num_map[r_num]++;
748 bkt = bkt->nxt_bucket;
752 ofstream o_edge_num ( "edge_num_stat.txt" );
753 o_edge_num << "Total nodes number:" << total_node_num << endl;
754 o_edge_num << "Total kmer-edges number:" << total_edge_num << endl;
755 o_edge_num << "Average kmer-edges number per node:" << ( double ) total_edge_num / total_node_num << endl;
756 o_edge_num << "The frequence of kmer-edges number on a node's one side as below :" << endl;
757 map<int, size_t>::iterator it;
759 for ( it = edge_num_map.begin(); it != edge_num_map.end(); ++it )
761 o_edge_num << it->first << "\t" << it->second << endl;
764 o_edge_num.close();
769 static void stat_edge_cvg_len ( hashtable2 * ht )
771 map<int, size_t> edge_cvg_map;
772 map<int, size_t> edge_len_map;
773 bucket2 * bkt = NULL;
774 edge_node * temp_edge = NULL;
776 for ( size_t i = 0; i < ht->ht_sz; i++ )
778 bkt = ht->store_pos[i];
780 while ( bkt )
782 //left
783 temp_edge = bkt->kmer_info.left;
785 while ( temp_edge )
787 edge_cvg_map[temp_edge->edge_cov]++;
788 edge_len_map[temp_edge->len]++;
789 temp_edge = temp_edge->nxt_edge;
792 //right
793 temp_edge = bkt->kmer_info.right;
795 while ( temp_edge )
797 edge_cvg_map[temp_edge->edge_cov]++;
798 edge_len_map[temp_edge->len]++;
799 temp_edge = temp_edge->nxt_edge;
802 bkt = bkt->nxt_bucket;
806 ofstream o_edge_cvg ( "edge_cvg_stat.txt" );
807 ofstream o_edge_len ( "edge_len_stat.txt" );
808 map<int, size_t>::iterator it;
810 for ( it = edge_cvg_map.begin(); it != edge_cvg_map.end(); ++it )
812 o_edge_cvg << it->first << "\t" << it->second << endl;
815 for ( it = edge_len_map.begin(); it != edge_len_map.end(); ++it )
817 o_edge_len << it->first << "\t" << it->second << endl;
820 o_edge_cvg.close();
821 o_edge_len.close();
826 /*************************************************
827 Function:
828 kmer2edges
829 Description:
830 This is the main function for building edges by compacting the linear nodes.
831 Input:
832 1. ht: the graph hashtable
833 2. K_size: kmer size
834 3. out_file: the name of output file containing edges sequence
835 Output:
836 None.
837 Return:
838 None.
839 *************************************************/
840 void kmer2edges ( hashtable2 * ht, int K_size, char * outfile )
842 FILE * fp;
843 char temp[256];
844 sprintf ( temp, "%s", outfile );
845 fp = fopen ( temp, "w" );
847 if ( fp == NULL )
849 fprintf ( stderr, "ERROR: Can't create file %s. \n", temp );
850 exit ( -1 );
853 make_edge ( ht, K_size, fp );
854 fclose ( fp );
857 static void make_edge ( hashtable2 * ht, int K_size, FILE * fp ) //63 127 same
859 bucket2 * bktptr;
861 for ( size_t i = 0; i < ht->ht_sz; ++i )
863 bktptr = ht->store_pos[i];
865 while ( bktptr != NULL )
867 startEdgeFromNode ( ht, K_size, bktptr, fp );
868 bktptr = bktptr->nxt_bucket;
874 /*************************************************
875 Function:
876 startEdgeFromNode
877 Description:
878 Constructs edges from a branched node or end node.
879 for every branch (left , right)
880 1. Puts the linear node into a stack
881 2. Checks the edge to be built form the stack are plalindrome or not
882 3. Builds an edge by merge the linear nodes
883 Input:
884 1. ht: the graph hashtable
885 2. K_size: kmer size
886 3. fp: the file pointer for writing out edge sequences
887 Output:
888 None.
889 Return:
890 Zero.
891 *************************************************/
892 static int startEdgeFromNode ( hashtable2 * ht, int K_size, bucket2 * node, FILE * fp )
894 static size_t call_times;
895 call_times++;
897 if ( node->kmer_info.linear || node->kmer_info.deleted )
899 return 0;//linear node ...
902 int left, right;
903 left = count_left_edge_num ( node );
904 right = count_right_edge_num ( node );
906 if ( left == 0 && right == 0 )
908 return 0; //it's a dead node
911 list<stacked_node2 *> stack;
912 edge_node * t_edge = NULL, *t_next = NULL;
913 stacked_node2 * t_stacked_node = NULL;
914 vector<preEDGE2> loops_edges;
915 int node_c;
916 //for right edge
917 t_edge = node->kmer_info.right;
919 while ( t_edge )
921 if ( t_edge->used == 1 )
923 t_edge = t_edge->nxt_edge;
924 continue;
927 t_stacked_node = ( stacked_node2 * ) malloc ( sizeof ( stacked_node2 ) );
928 t_stacked_node->node = node;
929 t_stacked_node->is_left = 0;
930 t_stacked_node->edge = t_edge;
931 t_stacked_node->next = NULL;
932 stack.push_back ( t_stacked_node );
933 t_edge->used = 1;
934 stringBeads ( ht, K_size, stack, t_stacked_node, t_edge, &node_c );
935 process_1stack ( ht, K_size, stack, fp, loops_edges );
936 t_next = t_edge->nxt_edge;//because this procedure will remove the edge t_edge
937 dislink ( ht, K_size, stack.front() );
939 if ( stack.size() > 2 )
941 stack.pop_back();//change the stack
943 if ( stack.back() && stack.size() > 1 ) //last but second node
945 dislink ( ht, K_size, stack.back() );
949 stacked_node2 * head, *tmp_node;
950 head = stack.front();
952 while ( head )
954 tmp_node = head;
955 free ( tmp_node );
956 head = head->next;
959 stack.clear();
960 t_edge = t_next;
963 //for left edge
964 t_edge = node->kmer_info.left;
966 while ( t_edge )
968 if ( t_edge->used == 1 )
970 t_edge = t_edge->nxt_edge;
971 continue;
974 t_stacked_node = ( stacked_node2 * ) malloc ( sizeof ( stacked_node2 ) );
975 t_stacked_node->node = node;
976 t_stacked_node->is_left = 1;
977 t_stacked_node->edge = t_edge;
978 t_stacked_node->next = NULL;
979 stack.push_back ( t_stacked_node );
980 t_edge->used = 1;
981 stringBeads ( ht, K_size, stack, t_stacked_node, t_edge, &node_c ); //
982 process_1stack ( ht, K_size, stack, fp, loops_edges );
983 t_next = t_edge->nxt_edge;//because this procedure will remove the edge t_edge
984 dislink ( ht, K_size, stack.front() );
986 if ( stack.size() > 2 )
988 stack.pop_back();//change the stack
990 if ( stack.back() && stack.size() > 1 ) //last but second node
992 dislink ( ht, K_size, stack.back() );
996 //debug<<"before free stack"<<endl;
997 stacked_node2 * head, *tmp_node;
998 head = stack.front();
1000 while ( head )
1002 tmp_node = head;
1003 free ( tmp_node );
1004 head = head->next;
1007 stack.clear();
1008 t_edge = t_next;
1011 if ( loops_edges.size() > 0 )
1013 //fprintf(stderr,"loops_edges size %llu\n",loops_edges.size());
1014 int i, j, size;
1015 bool need_output;
1016 size = loops_edges.size();
1017 need_output = 1;
1019 //bool debug = 0;
1020 for ( i = 0; i < size; i++ )
1022 string seq = * ( loops_edges[i].full_edge );
1023 string rc_seq = revCompSeq ( seq );
1025 if(seq.compare("AATTGGACGTGAGAGCAAATTGTATTGAGCATACAATTTGCTCTCACGTCCAATT") == 0) {
1026 fprintf(stderr,"in loops_edges %d %s\n",i,seq.c_str());
1027 debug = 1;
1030 if(seq.compare("AATTGGACGTGAGAGCAAATTGTATGCTCAATACAATTTGCTCTCACGTCCAATT") == 0) {
1031 fprintf(stderr,"in loops_edges %d %s\n",i,seq.c_str());
1032 debug = 1;
1035 if(debug ){
1036 fprintf(stderr, "%d %s\n",i,seq.c_str());
1037 fprintf(stderr, "%d %s\n",i,rc_seq.c_str());
1040 for ( j = i + 1; j < size; j++ )
1042 string cur_seq = * ( loops_edges[j].full_edge );
1044 if ( seq.compare ( cur_seq ) == 0 )
1046 fprintf ( stderr, "ERROR: two equal loop edge sequence from same node, this should not happen!\n" );
1047 fprintf ( stderr, "%s\n", seq.c_str() );
1048 exit ( -1 );
1051 if ( rc_seq.compare ( cur_seq ) == 0 )
1053 fprintf ( stderr, "INFO: two loop edge sequence are reversed complemental!\n" );
1054 fprintf ( stderr, "%s\n", seq.c_str() );
1055 fprintf ( stderr, "%s\n", rc_seq.c_str() );
1056 need_output = 0;
1057 loops_edges[j].cvg += loops_edges[i].cvg;
1058 break;
1062 if ( need_output )
1064 output_1edge ( &loops_edges[i], K_size, fp );
1065 //fprintf(stderr,"need output %d %s\n",i,seq.c_str());
1068 delete ( loops_edges[i].full_edge );
1069 need_output = 1;
1073 return 0;
1078 /*************************************************
1079 Function:
1080 stringBeads
1081 Description:
1082 Puts the linear nodes into a stack.
1083 Input:
1084 1. ht: the graph hashtalbe
1085 2. K_size: kmer size
1086 3. stack: a stack
1087 4. from_node: the starting node of the stack
1088 5. from_edge: the kmer-edge of the first node
1089 6. node_c: useless
1090 Output:
1091 None.
1092 Return:
1093 None.
1094 *************************************************/
1095 static void stringBeads ( hashtable2 * ht, int K_size, list<stacked_node2 *> &stack, stacked_node2 * from_node, edge_node * from_edge, int * node_c )
1097 static size_t call_times;
1098 call_times++;
1099 bucket2 * t_bucket = from_node->node;
1100 edge_node * t_edge = from_edge;
1101 stacked_node2 * t_stacked_node = from_node;
1102 int is_left = from_node->is_left;
1103 int t_smaller;
1104 t_edge->used = 1;
1105 t_bucket = lastKmer ( ht, K_size, t_bucket, t_edge, is_left, t_smaller );
1107 if ( !t_bucket )
1109 fprintf ( stderr, "ERROR: to node not found in stringBeads()\n" );
1110 exit ( -1 );
1113 while ( t_bucket && t_bucket->kmer_info.linear )
1115 t_stacked_node = ( stacked_node2 * ) malloc ( sizeof ( stacked_node2 ) );
1116 t_stacked_node->node = t_bucket;
1117 is_left = ! ( is_left ^ t_smaller );
1118 t_stacked_node->is_left = is_left;
1120 if ( is_left ) { t_stacked_node->edge = t_bucket->kmer_info.left; }
1121 else { t_stacked_node->edge = t_bucket->kmer_info.right; }
1123 t_stacked_node->next = NULL;
1124 ( ( stacked_node2 * ) stack.back() )->next = t_stacked_node;
1125 stack.push_back ( t_stacked_node );
1126 t_stacked_node->edge->used = 1;
1127 t_bucket = lastKmer ( ht, K_size, t_bucket, t_stacked_node->edge, is_left, t_smaller );
1130 if ( t_bucket ) //should be always true for end node ..
1132 t_stacked_node = ( stacked_node2 * ) malloc ( sizeof ( stacked_node2 ) );
1133 t_stacked_node->node = t_bucket;
1134 is_left = ! ( is_left ^ t_smaller );
1135 t_stacked_node->is_left = is_left;
1136 t_stacked_node->edge = NULL;
1137 t_stacked_node->next = NULL;
1138 ( ( stacked_node2 * ) stack.back() )->next = t_stacked_node;
1139 stack.push_back ( t_stacked_node );
1142 //for debug
1143 static void pirntStack ( list<stacked_node2 *> &stack )
1145 static int times = 0;
1146 fprintf ( stderr, "call times %d \n ", times++ );
1147 stacked_node2 * ptr = stack.front();
1149 while ( ptr )
1151 printKmer ( & ( ptr->node->kmer_t2 ), stderr );
1153 if ( ptr->edge )
1154 { fprintf ( stderr, "%llx , %d ,", ptr->edge->edge, ptr->is_left ); }
1156 fprintf ( stderr, "->" );
1157 ptr = ptr->next;
1160 fprintf ( stderr, "\n" );
1162 /*************************************************
1163 Function:
1164 process_1stack
1165 Description:
1166 Processes the nodes in one stack
1167 1. Compacts the nodes to an edge
1168 2. Checks palindrome
1169 3. Calculates coverage
1170 Input:
1171 1. ht: the graph hashtable
1172 2. K_size: kmer size
1173 3. stack: the stack
1174 4. fp: the file pointer for writing
1175 Output:
1176 None.
1177 Return:
1178 None.
1179 *************************************************/
1180 static void process_1stack ( hashtable2 * ht, int K_size, list<stacked_node2 *> &stack, FILE * fp, vector<preEDGE2> &loops_edges )
1182 static size_t edge_c;// edge id
1183 static preEDGE2 long_edge_buf;
1184 preEDGE2 loops;
1185 int TipLenTh = 3 * K_size; //orig 100
1186 int TipCovTh = 5;
1188 if ( stack.size() < 2 )
1190 fprintf ( stderr, "only %llu nodes in the stack \n", stack.size() );
1191 exit ( -1 );
1193 else
1195 //palindrome check
1196 string full_edge = stack2string ( ht, K_size, stack ); //when output skip the first kmer first
1197 stacked_node2 * test = stack.front();
1198 bool palindrome = check_palindrome ( full_edge );
1199 int bal_edge = !palindrome;
1200 stacked_node2 * from_node = stack.front();
1201 stacked_node2 * to_node = stack.back();
1202 long_edge_buf.from_node = from_node;
1203 long_edge_buf.to_node = to_node;
1204 long_edge_buf.full_edge = &full_edge;
1205 long_edge_buf.bal_edge = bal_edge;
1206 uint64_t symbol = 0; //cvg stat
1207 edge_c++;
1209 if ( stack.size() == 2 )
1211 long_edge_buf.cvg = from_node->edge->edge_cov;
1213 else
1215 stacked_node2 * nd_tmp = from_node;
1217 while ( nd_tmp && nd_tmp->edge )
1219 symbol += nd_tmp->edge->edge_cov * ( nd_tmp->edge->len + 1 );
1220 nd_tmp = nd_tmp->next;
1223 int cvg = symbol / ( full_edge.size() - K_size );
1224 long_edge_buf.cvg = cvg;
1227 int from_left, from_right, to_left, to_right;
1228 from_left = count_left_edge_num ( from_node->node );
1229 from_right = count_right_edge_num ( from_node->node );
1230 to_left = count_left_edge_num ( to_node->node );
1231 to_right = count_right_edge_num ( to_node->node );
1233 //tips control
1235 if ( ( ( from_left + from_right == 1 ) && ( to_left + to_right == 1 ) && ( full_edge.size() < TipLenTh ) )
1236 || ( ( ( from_left + from_right == 1 ) || ( to_left + to_right == 1 ) )
1237 && ( full_edge.size() < TipLenTh ) && long_edge_buf.cvg < TipCovTh ) ) //tips args
1239 //if(full_edge.size()<TipLenTh && long_edge_buf.cvg<TipCovTh){//it's a tip or low cvg link
1240 static size_t tip_num;
1241 tip_num++;
1243 else
1245 //debug begin
1247 string bug_seq = *(long_edge_buf.full_edge);
1248 if(bug_seq.compare("AATTGGACGTGAGAGCAAATTGTATTGAGCATACAATTTGCTCTCACGTCCAATT") == 0) {
1249 fprintf(stderr,"%s\n",bug_seq.c_str());
1250 fprintf(stderr,"from %llx to %llx \n",long_edge_buf.from_node->node,long_edge_buf.to_node->node);
1254 if(bug_seq.compare("AATTGGACGTGAGAGCAAATTGTATGCTCAATACAATTTGCTCTCACGTCCAATT") == 0) {
1255 fprintf(stderr,"%s\n",bug_seq.c_str());
1256 fprintf(stderr,"from %llx to %llx \n",long_edge_buf.from_node->node,long_edge_buf.to_node->node);
1260 //debug end
1261 if ( long_edge_buf.from_node->node == long_edge_buf.to_node->node )
1263 loops = long_edge_buf;
1264 loops.full_edge = new string ( * ( long_edge_buf.full_edge ) );
1265 loops_edges.push_back ( loops );
1267 else
1269 //output edge
1270 output_1edge ( &long_edge_buf, K_size, fp );
1274 edge_c += bal_edge;
1280 // WARNING: the kmer atcg is different from soapdenovo's represent
1281 static void output_1edge ( preEDGE2 * long_edge, int K_size, FILE * fp )
1283 fprintf ( fp, ">length %d,", long_edge->full_edge->size() - K_size );
1284 const char * seq = long_edge->full_edge->c_str();
1285 //uint64_t from_kmer[2],to_kmer[2];
1286 kmer_t2 from_kmer, to_kmer;
1287 get_kmer_from_seq ( seq, long_edge->full_edge->size() , K_size, 0, &from_kmer );
1288 get_kmer_from_seq ( seq, long_edge->full_edge->size() , K_size, long_edge->full_edge->size() - K_size, &to_kmer );
1289 uint64_t * from, *to;
1290 from = from_kmer.kmer;
1291 to = to_kmer.kmer;
1292 #ifdef _63MER_
1293 fprintf ( fp, "%llx %llx,", from[0], from[1] );
1294 fprintf ( fp, "%llx %llx,", to[0], to[1] );
1295 #endif
1296 #ifdef _127MER_
1297 fprintf ( fp, "%llx %llx %llx %llx,", from[0], from[1], from[2], from[3] );
1298 fprintf ( fp, "%llx %llx %llx %llx,", to[0], to[1], to[2], to[3] );
1299 #endif
1300 fprintf ( fp, "cvg %d,%d\n", long_edge->cvg, long_edge->bal_edge );
1301 fprintf ( fp, "%s", seq );
1302 fprintf ( fp, "\n" );
1306 /*************************************************
1307 Function:
1308 dislink
1309 Description:
1310 Marks the kmer-edges between "form_node" and "form_node->next" used.
1311 Input:
1312 1. ht: the graph hashtable
1313 2. K_size: kmer size
1314 3. from_node: the first node of a stack
1315 Output:
1316 None.
1317 Return:
1318 None.
1319 *************************************************/
1320 static void dislink ( hashtable2 * ht, int K_size, stacked_node2 * from_node )
1322 from_node->edge->used = 1;
1323 int from_edge_len = from_node->edge->len;
1324 int smaller;
1326 if ( from_node->next )
1328 stacked_node2 * from_next = from_node->next;
1330 if ( from_next->node == from_node->node && from_next->is_left != from_node->is_left )
1332 return ;
1335 if ( from_next->is_left ) //remove right edge
1337 if ( from_next->node->kmer_info.linear )
1339 bucket2 * node_tmp = lastKmer ( ht, K_size, from_next->node, from_next->node->kmer_info.right, 0, smaller );
1341 if ( node_tmp == from_node->node )
1343 from_next->node->kmer_info.right->used = 1;
1345 else
1347 fprintf ( stderr, "ERROR: to node not found in dislink()\n" );
1350 else
1352 edge_node * edge_tmp = from_next->node->kmer_info.right;
1354 while ( edge_tmp )
1356 bucket2 * node_tmp = lastKmer ( ht, K_size, from_next->node, edge_tmp, 0, smaller );
1358 if ( node_tmp == from_node->node && edge_tmp->len == from_edge_len ) //there may be two or more edges between two nodes
1360 edge_tmp->used = 1;
1361 break;
1364 edge_tmp = edge_tmp->nxt_edge;
1367 if ( !edge_tmp )
1369 fprintf ( stderr, "ERROR: to node not found in dislink()\n" );
1373 else // remove left edge
1375 if ( from_next->node->kmer_info.linear )
1377 bucket2 * node_tmp = lastKmer ( ht, K_size, from_next->node, from_next->node->kmer_info.left, 1, smaller );
1379 if ( node_tmp == from_node->node )
1381 from_next->node->kmer_info.left ->used = 1;
1383 else
1385 fprintf ( stderr, "ERROR: to node not found in dislink()\n" );
1388 else
1390 edge_node * edge_tmp = from_next->node->kmer_info.left;
1392 while ( edge_tmp )
1394 bucket2 * node_tmp = lastKmer ( ht, K_size, from_next->node, edge_tmp, 1, smaller );
1396 if ( node_tmp == from_node->node && edge_tmp->len == from_edge_len )
1398 edge_tmp->used = 1;
1399 break;
1402 edge_tmp = edge_tmp->nxt_edge;
1405 if ( !edge_tmp )
1407 fprintf ( stderr, "ERROR: to node not found in dislink()\n" );
1415 /*************************************************
1416 Function:
1417 stack2string
1418 Description:
1419 Compacts the nodes in the stack to a sequence.
1420 Input:
1421 1. ht: the graph hashtable
1422 2. K_size: kmer size
1423 3. stack: the stack for processing
1424 Output:
1425 None.
1426 Return:
1427 The compacted string, namely the edge sequence.
1428 *************************************************/ //63 127 differ, fixed
1429 static string stack2string ( hashtable2 * ht, int K_size, list<stacked_node2 *> & stack )
1431 static size_t call_times;
1432 call_times++;
1433 string full_edge;
1434 stacked_node2 * t_stack_node = stack.front();
1435 char tmp[1024];
1436 uint64_t bits[2];
1437 kmer_t2 tmp_kmer = ( t_stack_node->node->kmer_t2 );
1439 if ( t_stack_node->is_left )
1441 reverseCompKmer ( &tmp_kmer, K_size );
1443 else
1447 bitsarr2str ( tmp_kmer.kmer, K_size, tmp, sizeof ( kmer_t2 ) / sizeof ( uint64_t ) );
1448 full_edge.append ( tmp ); //put first node
1450 while ( t_stack_node )
1452 if ( t_stack_node->edge )
1454 if ( t_stack_node->is_left )
1456 bits[0] = get_rev_comp_seq ( t_stack_node->edge->edge, t_stack_node->edge->len + 1 );
1457 bitsarr2str ( bits, t_stack_node->edge->len + 1, tmp, 1 );
1458 full_edge.append ( tmp );
1460 else
1462 bits[0] = t_stack_node->edge->edge;
1463 bitsarr2str ( bits, t_stack_node->edge->len + 1, tmp, 1 );
1464 full_edge.append ( tmp );
1468 t_stack_node = t_stack_node->next;
1471 return full_edge;
1474 static bool check_palindrome ( string & str ) //63 127 same
1476 size_t size = str.size();
1477 size_t mid = ( size / 2 ) + 1;
1479 for ( size_t i = 0; i < mid; ++i )
1481 switch ( str[i] )
1483 case 'A':
1485 if ( str[size - i - 1] != 'T' ) { return 0; }
1487 break;
1488 case 'C':
1490 if ( str[size - i - 1] != 'G' ) { return 0; }
1492 break;
1493 case 'T':
1495 if ( str[size - i - 1] != 'A' ) { return 0; }
1497 break;
1498 case 'G':
1500 if ( str[size - i - 1] != 'C' ) { return 0; }
1502 break;
1506 return 1;
1509 static string revCompSeq ( const string & str )
1511 string rc_seq;
1512 size_t size = str.size();
1514 for ( int i = size - 1; i >= 0; i-- )
1516 switch ( str[i] )
1518 case 'A':
1519 rc_seq.push_back ( 'T' );
1520 break;
1521 case 'C':
1522 rc_seq.push_back ( 'G' );
1523 break;
1524 case 'T':
1525 rc_seq.push_back ( 'A' );
1526 break;
1527 case 'G':
1528 rc_seq.push_back ( 'C' );
1529 break;
1533 return rc_seq;