4 * Copyright (c) 2008-2012 BGI-Shenzhen <soap at genomics dot org dot cn>.
6 * This file is part of SOAPdenovo.
8 * SOAPdenovo is free software: you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation, either version 3 of the License, or
11 * (at your option) any later version.
13 * SOAPdenovo is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
18 * You should have received a copy of the GNU General Public License
19 * along with SOAPdenovo. If not, see <http://www.gnu.org/licenses/>.
25 #include "build_edge.h"
28 #include "sparse_kmer.h"
30 /*************************************************
32 RemovingWeakNodesAndEdges2
34 1. Removes the nodes whose frequencies are not greater than threadhold NodeCovTh.
35 2. Removes the kmer-edges (the connections between nodes) whose frequencies
36 are not greater than threadhold EdgeCovTh.
38 Some statistics function here temporarily added.
40 1. ht: the graph hash table
42 3. NodeCovTh: the threadhold for removing low coverage nodes
43 4. EdgeCovTh: the threadhold for removing low coverage kmer-edges
44 5. bucket_cnt: the current bucket number
45 6. edge_cnt: the current kmer-edge number
47 1. bucket_cnt: the bucket number after removing weak nodes and kmer-edges
48 2. edge_cnt: the kmer-edge number after removing weak nodes and kmer-edges
51 *************************************************/
52 void RemovingWeakNodesAndEdges2 ( hashtable2
* ht
, int K_size
, int NodeCovTh
, int EdgeCovTh
, size_t * bucket_cnt
, size_t * edge_cnt
)
55 stat_edge_cvg_len ( ht
);
56 int Removed_Nodes_cnt
= 0, Removed_Edges_cnt
= 0;
57 bucket2
* bktptr
= NULL
, *bktptr_tmp
= NULL
;
58 bucket2
** bktp2p
= NULL
;
59 edge_node
* edge_ptr
= NULL
, *next_edge
= NULL
, *edge_tmp
= NULL
;
61 fprintf ( stderr
, "Start to remove weak nodes and kmer-edges.\n" );
64 for(size_t i=0;i<ht->ht_sz;++i)
66 bktptr=ht->store_pos[i];
69 if(bktptr->kmer_info.cov1==0)printf("zero\n");
71 bktptr=bktptr->nxt_bucket;
77 for ( size_t i
= 0; i
< ht
->ht_sz
; ++i
)
79 bktptr
= ht
->store_pos
[i
];
81 while ( bktptr
!= NULL
)
83 if ( bktptr
->kmer_info
.cov1
<= NodeCovTh
)
85 bktptr
->kmer_info
.deleted
= 1;
87 edge_ptr
= bktptr
->kmer_info
.right
;
92 edge_ptr
= edge_ptr
->nxt_edge
;
96 edge_ptr
= bktptr
->kmer_info
.left
;
101 edge_ptr
= edge_ptr
->nxt_edge
;
106 bktptr
= bktptr
->nxt_bucket
;
110 //removing dead edges
111 for ( size_t i
= 0; i
< ht
->ht_sz
; ++i
)
113 bktptr
= ht
->store_pos
[i
];
115 while ( bktptr
!= NULL
)
117 edge_ptr
= bktptr
->kmer_info
.right
;
121 if ( edge_ptr
->edge_cov
<= EdgeCovTh
)
123 edge_ptr
->used
= 1; //becasuse the cvg of edges is symmetrial, so it's ok
128 bktptr_tmp
= lastKmer ( ht
, K_size
, bktptr
, edge_ptr
, 0, smaller
);
132 fprintf ( stderr
, "ERROR: to node not found error!\n" );
136 if ( bktptr_tmp
->kmer_info
.deleted
)
143 edge_ptr
= edge_ptr
->nxt_edge
;
146 edge_ptr
= bktptr
->kmer_info
.left
;
150 if ( edge_ptr
->edge_cov
<= EdgeCovTh
)
152 edge_ptr
->used
= 1; //becasuse the cvg of edges is symmetrial, so it's ok
157 bktptr_tmp
= lastKmer ( ht
, K_size
, bktptr
, edge_ptr
, 1, smaller
);
161 fprintf ( stderr
, "ERROR: to node not found error! \n" );
165 if ( bktptr_tmp
->kmer_info
.deleted
)
172 edge_ptr
= edge_ptr
->nxt_edge
;
175 bktptr
= bktptr
->nxt_bucket
;
179 for ( size_t i
= 0; i
< ht
->ht_sz
; ++i
)
181 bktptr
= ht
->store_pos
[i
];
182 bktp2p
= & ( ht
->store_pos
[i
] );
184 while ( bktptr
!= NULL
)
186 edge_ptr
= bktptr
->kmer_info
.right
;
190 next_edge
= edge_ptr
->nxt_edge
;
192 if ( edge_ptr
->used
)
194 removeEdge ( bktptr
, edge_ptr
, 0 );
195 //Removed_Edges_cnt2++;
198 edge_ptr
= next_edge
;
201 edge_ptr
= bktptr
->kmer_info
.left
;
205 next_edge
= edge_ptr
->nxt_edge
;
207 if ( edge_ptr
->used
)
209 removeEdge ( bktptr
, edge_ptr
, 1 );
210 //Removed_Edges_cnt2++;
213 edge_ptr
= next_edge
;
216 bktptr_tmp
= bktptr
->nxt_bucket
;
218 if ( bktptr
->kmer_info
.deleted
)
221 ( *bktp2p
) = bktptr_tmp
;
222 //Removed_Nodes_cnt2++;
226 bktp2p
= & ( bktptr
->nxt_bucket
);
233 fprintf ( stderr
, "%llu nodes removed.\n", Removed_Nodes_cnt
);
234 fprintf ( stderr
, "%llu edges removed.\n", Removed_Edges_cnt
);
235 fprintf ( stderr
, "\n" );
236 ( *bucket_cnt
) -= Removed_Nodes_cnt
;
237 ( *edge_cnt
) -= Removed_Edges_cnt
;
242 /*************************************************
246 Removes minor tips whose length are not longer than a threadhold usually set to 2*Kmer.
248 1. ht: the graph hash table
250 3. cut_len_tip: the threadhold for tips' length
255 *************************************************/
256 void removeMinorTips ( struct hashtable2
* ht
, int K_size
, int cut_len_tip
, int & tip_c
)
259 bucket2
* bktptr
= NULL
;
268 for ( size_t i
= 0; i
< ht
->ht_sz
; ++i
)
270 bktptr
= ht
->store_pos
[i
];
272 while ( bktptr
!= NULL
)
274 flag
+= clipTipFromNode ( ht
, K_size
, bktptr
, cut_len_tip
);
275 bktptr
= bktptr
->nxt_bucket
;
283 fprintf ( stderr
, "%llu tips removed in cycle %d.\n\n", flag
, j
);
288 fprintf ( stderr
, "Total %llu tips removed.\n", total
);
291 if ( flag
) { mask1in1out ( ht
); }
296 /*************************************************
300 Marks "1 in 1 out" node linear.
302 1. ht: the graph hashtable
307 *************************************************/
308 static void mask1in1out ( hashtable2
* ht
)
310 size_t total
= 0, linear
= 0;
311 static int call_times
;
314 for ( size_t i
= 0; i
< ht
->ht_sz
; ++i
)
316 struct bucket2
* bkt_ptr
= ht
->store_pos
[i
];
322 if ( ( bkt_ptr
->kmer_info
.left
!= NULL
&& bkt_ptr
->kmer_info
.left
->nxt_edge
== NULL
)
323 && ( bkt_ptr
->kmer_info
.right
!= NULL
&& bkt_ptr
->kmer_info
.right
->nxt_edge
== NULL
) )
325 bkt_ptr
->kmer_info
.linear
= 1;
330 bkt_ptr
->kmer_info
.linear
= 0;
333 bkt_ptr
= bkt_ptr
->nxt_bucket
;
337 //fprintf(stderr,"Masking linear nodes, times: %d\n",call_times);
338 fprintf ( stderr
, "Total nodes number: %llu\n", total
);
339 fprintf ( stderr
, "Linear nodes number: %llu\n", linear
);
343 /*************************************************
347 Removes tips from an end node.
349 1. ht: the graph hashtable
350 2. K_size: the kmer size
351 3. node: the tips starting node
352 4. cut_len_tip: the threadhold of tips' length
356 1 if clips a tip successfully.
357 *************************************************/
358 static int clipTipFromNode ( hashtable2
* ht
, int K_size
, bucket2
* node
, int cut_len_tip
) //only for remove minor tips
361 if ( node
->kmer_info
.linear
|| node
->kmer_info
.deleted
)
368 int sum_edge_len
= 0;
373 in_num
= count_left_edge_num ( node
);
374 out_num
= count_right_edge_num ( node
);
376 if ( in_num
== 0 && out_num
== 1 ) { is_left
= 0; }
377 else if ( in_num
== 1 && out_num
== 0 ) { is_left
= 1; }
380 if ( is_left
) { edge0
= node
->kmer_info
.left
; }
381 else { edge0
= node
->kmer_info
.right
; }
383 bucket2
* next
, *pre_node
;
385 next
= lastKmer ( ht
, K_size
, node
, edge0
, is_left
, smaller
);
387 while ( next
->kmer_info
.linear
)
389 if ( sum_edge_len
> cut_len_tip
) { return 0; }
391 is_left
= ! ( is_left
^ smaller
);
393 if ( is_left
) { edge0
= next
->kmer_info
.left
; }
394 else { edge0
= next
->kmer_info
.right
; }
396 sum_edge_len
+= edge0
->len
+ 1;
398 next
= lastKmer ( ht
, K_size
, next
, edge0
, is_left
, smaller
);
402 fprintf ( stderr
, "ERROR: linear edge not found error !\n" );
407 pre_is_left
= is_left
;
408 is_left
= ( is_left
^ smaller
); //back check orientation...
409 in_num
= count_left_edge_num ( next
);
410 out_num
= count_right_edge_num ( next
);
412 if ( is_left
) //check the last node left branch or not
418 else if ( in_num
> 1 )
420 edge_node
* edge1
= NULL
, * temp_edge
= NULL
;
421 bucket2
* temp_bucket
= NULL
;
422 int max_cvg
= 0, single_cvg
= 0, temp_smaller
;
423 temp_edge
= next
->kmer_info
.left
;
427 single_cvg
= temp_edge
->edge_cov
;
429 if ( single_cvg
> max_cvg
) { max_cvg
= single_cvg
; }
433 temp_bucket
= lastKmer ( ht
, K_size
, next
, temp_edge
, 1, temp_smaller
);
437 fprintf ( stderr
, "ERROR: edge to NULL found error ! a\n" );
441 if ( pre_node
== temp_bucket
) { edge1
= temp_edge
; }
444 temp_edge
= temp_edge
->nxt_edge
;
449 fprintf ( stderr
, "ERROR: edge to node not found error ! b\n" );
453 if ( edge1
->edge_cov
< max_cvg
)
455 removeEdge ( next
, edge1
, 1 );
456 removeEdge ( pre_node
, edge0
, pre_is_left
);
457 node
->kmer_info
.deleted
= 1;
458 pre_node
->kmer_info
.deleted
= 1;
468 fprintf ( stderr
, "ERROR: left tips oritation error or edge not found error ! a\n" );
478 else if ( out_num
> 1 )
480 edge_node
* edge1
= NULL
, * temp_edge
= NULL
;
481 bucket2
* temp_bucket
= NULL
;
482 int max_cvg
= 0, single_cvg
= 0, temp_smaller
;
483 //ok change it to a edge_remove thred_hold locally later
484 //or only if it is the least cvg ->remove it
485 temp_edge
= next
->kmer_info
.right
;
489 single_cvg
= temp_edge
->edge_cov
;
491 if ( single_cvg
> max_cvg
) { max_cvg
= single_cvg
; }
495 temp_bucket
= lastKmer ( ht
, K_size
, next
, temp_edge
, 0, temp_smaller
);
499 fprintf ( stderr
, "ERROR: edge to NULL found, error ! b\n" );
503 if ( pre_node
== temp_bucket
) { edge1
= temp_edge
; }
506 temp_edge
= temp_edge
->nxt_edge
;
511 fprintf ( stderr
, "ERROR: edge to node not found error ! e\n" );
515 if ( edge1
->edge_cov
< max_cvg
)
517 removeEdge ( next
, edge1
, 0 );
518 removeEdge ( pre_node
, edge0
, pre_is_left
);
519 node
->kmer_info
.deleted
= 1;
520 pre_node
->kmer_info
.deleted
= 1;
530 fprintf ( stderr
, "ERROR: right tips oritation error or edge not found error! b\n" );
538 static int count_left_edge_num ( bucket2
* bkt
) //63 127 same
544 edge_node
* left_edge
= bkt
->kmer_info
.left
;
549 left_edge
= left_edge
->nxt_edge
;
556 static int count_right_edge_num ( bucket2
* bkt
) //63 127 same
562 edge_node
* right_edge
= bkt
->kmer_info
.right
;
567 right_edge
= right_edge
->nxt_edge
;
575 /*************************************************
579 Searches the node that a node's kmer-edge end with.
581 1. ht: the graph hashtable
583 3. node: the node whose kmer-edge will be searched
584 4. edge: the kmer-edge
585 5. is_left: whether the kmer-edge on the node's left side
587 1. smaller: whether the searched result, a kmer is smaller than its reversed complement
589 A pointer to the found node.
591 *************************************************/
592 static bucket2
* lastKmer ( hashtable2
* ht
, int K_size
, bucket2
* node
, edge_node
* edge
, int is_left
, int & smaller
) //NEW
594 if ( !node
|| !edge
) { return NULL
; }
596 kmer_t2 t_kmer
, f_kmer
;
597 t_kmer
= node
->kmer_t2
;
599 memset ( edge_seq
.kmer
, 0, sizeof ( edge_seq
) );
600 ( edge_seq
.kmer
) [sizeof ( edge_seq
) / sizeof ( uint64_t ) - 1] = edge
->edge
;
601 int edge_len
= edge
->len
+ 1;
603 if ( edge_len
> K_size
)
605 fprintf ( stderr
, "ERROR: g value should be no great than kmer size!\n" );
610 initKmerFilter ( K_size
, &KMER_FILTER
);
612 if ( is_left
) //left edge
614 kmerMoveRight ( &t_kmer
, edge_len
);
615 kmerMoveLeft ( &edge_seq
, K_size
- edge_len
);
616 kmerOr ( &t_kmer
, &edge_seq
);
617 kmerAnd ( &t_kmer
, &KMER_FILTER
);
621 kmerMoveLeft ( &t_kmer
, edge_len
);
622 kmerOr ( &t_kmer
, &edge_seq
);
623 kmerAnd ( &t_kmer
, &KMER_FILTER
);
627 reverseCompKmer ( &f_kmer
, K_size
);
629 if ( kmerCompare ( &t_kmer
, &f_kmer
) > 0 )
634 else { smaller
= 1; }
636 return search_kmer ( ht
, &t_kmer
);
639 static bucket2
* search_kmer ( hashtable2
* ht
, kmer_t2
* t_kmer
)
641 uint64_t hv
= MurmurHash64A ( t_kmer
, sizeof ( kmer_t2
), 0 );
642 size_t hash_idx
= ( size_t ) ( hv
% ht
->ht_sz
);
643 bucket2
* starter
= ht
->store_pos
[hash_idx
];
647 if ( kmerCompare ( & ( starter
->kmer_t2
), t_kmer
) == 0 )
652 starter
= starter
->nxt_bucket
;
661 static void removeEdge ( bucket2
* node
, edge_node
* edge
, int is_left
) // remove only one side ... //63 127 same ...
663 edge_node
* pre_edge
= NULL
, *cur_edge
= NULL
, *nxt_edge
= NULL
;
665 if ( !node
|| !edge
)
672 cur_edge
= node
->kmer_info
.left
;
674 if ( cur_edge
== NULL
)
679 if ( cur_edge
== edge
)
681 nxt_edge
= cur_edge
->nxt_edge
;
684 node
->kmer_info
.left
= nxt_edge
;
690 cur_edge
= node
->kmer_info
.right
;
692 if ( cur_edge
== NULL
)
697 if ( cur_edge
== edge
)
699 nxt_edge
= cur_edge
->nxt_edge
;
702 node
->kmer_info
.right
= nxt_edge
;
708 cur_edge
= cur_edge
->nxt_edge
;
712 if ( cur_edge
== edge
) { break; }
715 cur_edge
= cur_edge
->nxt_edge
;
720 nxt_edge
= cur_edge
->nxt_edge
;
723 pre_edge
->nxt_edge
= nxt_edge
;
729 static void stat_edge_num ( hashtable2
* ht
) //63 127 same
731 int l_num
= 0, r_num
= 0;
732 size_t total_edge_num
= 0, total_node_num
= 0;
733 bucket2
* bkt
= NULL
;
734 map
<int, size_t> edge_num_map
;
736 for ( size_t i
= 0; i
< ht
->ht_sz
; i
++ )
738 bkt
= ht
->store_pos
[i
];
743 l_num
= count_left_edge_num ( bkt
);
744 r_num
= count_right_edge_num ( bkt
);
745 total_edge_num
+= ( l_num
+ r_num
);
746 edge_num_map
[l_num
]++;
747 edge_num_map
[r_num
]++;
748 bkt
= bkt
->nxt_bucket
;
752 ofstream
o_edge_num ( "edge_num_stat.txt" );
753 o_edge_num
<< "Total nodes number:" << total_node_num
<< endl
;
754 o_edge_num
<< "Total kmer-edges number:" << total_edge_num
<< endl
;
755 o_edge_num
<< "Average kmer-edges number per node:" << ( double ) total_edge_num
/ total_node_num
<< endl
;
756 o_edge_num
<< "The frequence of kmer-edges number on a node's one side as below :" << endl
;
757 map
<int, size_t>::iterator it
;
759 for ( it
= edge_num_map
.begin(); it
!= edge_num_map
.end(); ++it
)
761 o_edge_num
<< it
->first
<< "\t" << it
->second
<< endl
;
769 static void stat_edge_cvg_len ( hashtable2
* ht
)
771 map
<int, size_t> edge_cvg_map
;
772 map
<int, size_t> edge_len_map
;
773 bucket2
* bkt
= NULL
;
774 edge_node
* temp_edge
= NULL
;
776 for ( size_t i
= 0; i
< ht
->ht_sz
; i
++ )
778 bkt
= ht
->store_pos
[i
];
783 temp_edge
= bkt
->kmer_info
.left
;
787 edge_cvg_map
[temp_edge
->edge_cov
]++;
788 edge_len_map
[temp_edge
->len
]++;
789 temp_edge
= temp_edge
->nxt_edge
;
793 temp_edge
= bkt
->kmer_info
.right
;
797 edge_cvg_map
[temp_edge
->edge_cov
]++;
798 edge_len_map
[temp_edge
->len
]++;
799 temp_edge
= temp_edge
->nxt_edge
;
802 bkt
= bkt
->nxt_bucket
;
806 ofstream
o_edge_cvg ( "edge_cvg_stat.txt" );
807 ofstream
o_edge_len ( "edge_len_stat.txt" );
808 map
<int, size_t>::iterator it
;
810 for ( it
= edge_cvg_map
.begin(); it
!= edge_cvg_map
.end(); ++it
)
812 o_edge_cvg
<< it
->first
<< "\t" << it
->second
<< endl
;
815 for ( it
= edge_len_map
.begin(); it
!= edge_len_map
.end(); ++it
)
817 o_edge_len
<< it
->first
<< "\t" << it
->second
<< endl
;
826 /*************************************************
830 This is the main function for building edges by compacting the linear nodes.
832 1. ht: the graph hashtable
834 3. out_file: the name of output file containing edges sequence
839 *************************************************/
840 void kmer2edges ( hashtable2
* ht
, int K_size
, char * outfile
)
844 sprintf ( temp
, "%s", outfile
);
845 fp
= fopen ( temp
, "w" );
849 fprintf ( stderr
, "ERROR: Can't create file %s. \n", temp
);
853 make_edge ( ht
, K_size
, fp
);
857 static void make_edge ( hashtable2
* ht
, int K_size
, FILE * fp
) //63 127 same
861 for ( size_t i
= 0; i
< ht
->ht_sz
; ++i
)
863 bktptr
= ht
->store_pos
[i
];
865 while ( bktptr
!= NULL
)
867 startEdgeFromNode ( ht
, K_size
, bktptr
, fp
);
868 bktptr
= bktptr
->nxt_bucket
;
874 /*************************************************
878 Constructs edges from a branched node or end node.
879 for every branch (left , right)
880 1. Puts the linear node into a stack
881 2. Checks the edge to be built form the stack are plalindrome or not
882 3. Builds an edge by merge the linear nodes
884 1. ht: the graph hashtable
886 3. fp: the file pointer for writing out edge sequences
891 *************************************************/
892 static int startEdgeFromNode ( hashtable2
* ht
, int K_size
, bucket2
* node
, FILE * fp
)
894 static size_t call_times
;
897 if ( node
->kmer_info
.linear
|| node
->kmer_info
.deleted
)
899 return 0;//linear node ...
903 left
= count_left_edge_num ( node
);
904 right
= count_right_edge_num ( node
);
906 if ( left
== 0 && right
== 0 )
908 return 0; //it's a dead node
911 list
<stacked_node2
*> stack
;
912 edge_node
* t_edge
= NULL
, *t_next
= NULL
;
913 stacked_node2
* t_stacked_node
= NULL
;
914 vector
<preEDGE2
> loops_edges
;
917 t_edge
= node
->kmer_info
.right
;
921 if ( t_edge
->used
== 1 )
923 t_edge
= t_edge
->nxt_edge
;
927 t_stacked_node
= ( stacked_node2
* ) malloc ( sizeof ( stacked_node2
) );
928 t_stacked_node
->node
= node
;
929 t_stacked_node
->is_left
= 0;
930 t_stacked_node
->edge
= t_edge
;
931 t_stacked_node
->next
= NULL
;
932 stack
.push_back ( t_stacked_node
);
934 stringBeads ( ht
, K_size
, stack
, t_stacked_node
, t_edge
, &node_c
);
935 process_1stack ( ht
, K_size
, stack
, fp
, loops_edges
);
936 t_next
= t_edge
->nxt_edge
;//because this procedure will remove the edge t_edge
937 dislink ( ht
, K_size
, stack
.front() );
939 if ( stack
.size() > 2 )
941 stack
.pop_back();//change the stack
943 if ( stack
.back() && stack
.size() > 1 ) //last but second node
945 dislink ( ht
, K_size
, stack
.back() );
949 stacked_node2
* head
, *tmp_node
;
950 head
= stack
.front();
964 t_edge
= node
->kmer_info
.left
;
968 if ( t_edge
->used
== 1 )
970 t_edge
= t_edge
->nxt_edge
;
974 t_stacked_node
= ( stacked_node2
* ) malloc ( sizeof ( stacked_node2
) );
975 t_stacked_node
->node
= node
;
976 t_stacked_node
->is_left
= 1;
977 t_stacked_node
->edge
= t_edge
;
978 t_stacked_node
->next
= NULL
;
979 stack
.push_back ( t_stacked_node
);
981 stringBeads ( ht
, K_size
, stack
, t_stacked_node
, t_edge
, &node_c
); //
982 process_1stack ( ht
, K_size
, stack
, fp
, loops_edges
);
983 t_next
= t_edge
->nxt_edge
;//because this procedure will remove the edge t_edge
984 dislink ( ht
, K_size
, stack
.front() );
986 if ( stack
.size() > 2 )
988 stack
.pop_back();//change the stack
990 if ( stack
.back() && stack
.size() > 1 ) //last but second node
992 dislink ( ht
, K_size
, stack
.back() );
996 //debug<<"before free stack"<<endl;
997 stacked_node2
* head
, *tmp_node
;
998 head
= stack
.front();
1011 if ( loops_edges
.size() > 0 )
1013 //fprintf(stderr,"loops_edges size %llu\n",loops_edges.size());
1016 size
= loops_edges
.size();
1020 for ( i
= 0; i
< size
; i
++ )
1022 string seq
= * ( loops_edges
[i
].full_edge
);
1023 string rc_seq
= revCompSeq ( seq
);
1025 if(seq.compare("AATTGGACGTGAGAGCAAATTGTATTGAGCATACAATTTGCTCTCACGTCCAATT") == 0) {
1026 fprintf(stderr,"in loops_edges %d %s\n",i,seq.c_str());
1030 if(seq.compare("AATTGGACGTGAGAGCAAATTGTATGCTCAATACAATTTGCTCTCACGTCCAATT") == 0) {
1031 fprintf(stderr,"in loops_edges %d %s\n",i,seq.c_str());
1036 fprintf(stderr, "%d %s\n",i,seq.c_str());
1037 fprintf(stderr, "%d %s\n",i,rc_seq.c_str());
1040 for ( j
= i
+ 1; j
< size
; j
++ )
1042 string cur_seq
= * ( loops_edges
[j
].full_edge
);
1044 if ( seq
.compare ( cur_seq
) == 0 )
1046 fprintf ( stderr
, "ERROR: two equal loop edge sequence from same node, this should not happen!\n" );
1047 fprintf ( stderr
, "%s\n", seq
.c_str() );
1051 if ( rc_seq
.compare ( cur_seq
) == 0 )
1053 fprintf ( stderr
, "INFO: two loop edge sequence are reversed complemental!\n" );
1054 fprintf ( stderr
, "%s\n", seq
.c_str() );
1055 fprintf ( stderr
, "%s\n", rc_seq
.c_str() );
1057 loops_edges
[j
].cvg
+= loops_edges
[i
].cvg
;
1064 output_1edge ( &loops_edges
[i
], K_size
, fp
);
1065 //fprintf(stderr,"need output %d %s\n",i,seq.c_str());
1068 delete ( loops_edges
[i
].full_edge
);
1078 /*************************************************
1082 Puts the linear nodes into a stack.
1084 1. ht: the graph hashtalbe
1085 2. K_size: kmer size
1087 4. from_node: the starting node of the stack
1088 5. from_edge: the kmer-edge of the first node
1094 *************************************************/
1095 static void stringBeads ( hashtable2
* ht
, int K_size
, list
<stacked_node2
*> &stack
, stacked_node2
* from_node
, edge_node
* from_edge
, int * node_c
)
1097 static size_t call_times
;
1099 bucket2
* t_bucket
= from_node
->node
;
1100 edge_node
* t_edge
= from_edge
;
1101 stacked_node2
* t_stacked_node
= from_node
;
1102 int is_left
= from_node
->is_left
;
1105 t_bucket
= lastKmer ( ht
, K_size
, t_bucket
, t_edge
, is_left
, t_smaller
);
1109 fprintf ( stderr
, "ERROR: to node not found in stringBeads()\n" );
1113 while ( t_bucket
&& t_bucket
->kmer_info
.linear
)
1115 t_stacked_node
= ( stacked_node2
* ) malloc ( sizeof ( stacked_node2
) );
1116 t_stacked_node
->node
= t_bucket
;
1117 is_left
= ! ( is_left
^ t_smaller
);
1118 t_stacked_node
->is_left
= is_left
;
1120 if ( is_left
) { t_stacked_node
->edge
= t_bucket
->kmer_info
.left
; }
1121 else { t_stacked_node
->edge
= t_bucket
->kmer_info
.right
; }
1123 t_stacked_node
->next
= NULL
;
1124 ( ( stacked_node2
* ) stack
.back() )->next
= t_stacked_node
;
1125 stack
.push_back ( t_stacked_node
);
1126 t_stacked_node
->edge
->used
= 1;
1127 t_bucket
= lastKmer ( ht
, K_size
, t_bucket
, t_stacked_node
->edge
, is_left
, t_smaller
);
1130 if ( t_bucket
) //should be always true for end node ..
1132 t_stacked_node
= ( stacked_node2
* ) malloc ( sizeof ( stacked_node2
) );
1133 t_stacked_node
->node
= t_bucket
;
1134 is_left
= ! ( is_left
^ t_smaller
);
1135 t_stacked_node
->is_left
= is_left
;
1136 t_stacked_node
->edge
= NULL
;
1137 t_stacked_node
->next
= NULL
;
1138 ( ( stacked_node2
* ) stack
.back() )->next
= t_stacked_node
;
1139 stack
.push_back ( t_stacked_node
);
1143 static void pirntStack ( list
<stacked_node2
*> &stack
)
1145 static int times
= 0;
1146 fprintf ( stderr
, "call times %d \n ", times
++ );
1147 stacked_node2
* ptr
= stack
.front();
1151 printKmer ( & ( ptr
->node
->kmer_t2
), stderr
);
1154 { fprintf ( stderr
, "%llx , %d ,", ptr
->edge
->edge
, ptr
->is_left
); }
1156 fprintf ( stderr
, "->" );
1160 fprintf ( stderr
, "\n" );
1162 /*************************************************
1166 Processes the nodes in one stack
1167 1. Compacts the nodes to an edge
1168 2. Checks palindrome
1169 3. Calculates coverage
1171 1. ht: the graph hashtable
1172 2. K_size: kmer size
1174 4. fp: the file pointer for writing
1179 *************************************************/
1180 static void process_1stack ( hashtable2
* ht
, int K_size
, list
<stacked_node2
*> &stack
, FILE * fp
, vector
<preEDGE2
> &loops_edges
)
1182 static size_t edge_c
;// edge id
1183 static preEDGE2 long_edge_buf
;
1185 int TipLenTh
= 3 * K_size
; //orig 100
1188 if ( stack
.size() < 2 )
1190 fprintf ( stderr
, "only %llu nodes in the stack \n", stack
.size() );
1196 string full_edge
= stack2string ( ht
, K_size
, stack
); //when output skip the first kmer first
1197 stacked_node2
* test
= stack
.front();
1198 bool palindrome
= check_palindrome ( full_edge
);
1199 int bal_edge
= !palindrome
;
1200 stacked_node2
* from_node
= stack
.front();
1201 stacked_node2
* to_node
= stack
.back();
1202 long_edge_buf
.from_node
= from_node
;
1203 long_edge_buf
.to_node
= to_node
;
1204 long_edge_buf
.full_edge
= &full_edge
;
1205 long_edge_buf
.bal_edge
= bal_edge
;
1206 uint64_t symbol
= 0; //cvg stat
1209 if ( stack
.size() == 2 )
1211 long_edge_buf
.cvg
= from_node
->edge
->edge_cov
;
1215 stacked_node2
* nd_tmp
= from_node
;
1217 while ( nd_tmp
&& nd_tmp
->edge
)
1219 symbol
+= nd_tmp
->edge
->edge_cov
* ( nd_tmp
->edge
->len
+ 1 );
1220 nd_tmp
= nd_tmp
->next
;
1223 int cvg
= symbol
/ ( full_edge
.size() - K_size
);
1224 long_edge_buf
.cvg
= cvg
;
1227 int from_left
, from_right
, to_left
, to_right
;
1228 from_left
= count_left_edge_num ( from_node
->node
);
1229 from_right
= count_right_edge_num ( from_node
->node
);
1230 to_left
= count_left_edge_num ( to_node
->node
);
1231 to_right
= count_right_edge_num ( to_node
->node
);
1235 if ( ( ( from_left
+ from_right
== 1 ) && ( to_left
+ to_right
== 1 ) && ( full_edge
.size() < TipLenTh
) )
1236 || ( ( ( from_left
+ from_right
== 1 ) || ( to_left
+ to_right
== 1 ) )
1237 && ( full_edge
.size() < TipLenTh
) && long_edge_buf
.cvg
< TipCovTh
) ) //tips args
1239 //if(full_edge.size()<TipLenTh && long_edge_buf.cvg<TipCovTh){//it's a tip or low cvg link
1240 static size_t tip_num
;
1247 string bug_seq = *(long_edge_buf.full_edge);
1248 if(bug_seq.compare("AATTGGACGTGAGAGCAAATTGTATTGAGCATACAATTTGCTCTCACGTCCAATT") == 0) {
1249 fprintf(stderr,"%s\n",bug_seq.c_str());
1250 fprintf(stderr,"from %llx to %llx \n",long_edge_buf.from_node->node,long_edge_buf.to_node->node);
1254 if(bug_seq.compare("AATTGGACGTGAGAGCAAATTGTATGCTCAATACAATTTGCTCTCACGTCCAATT") == 0) {
1255 fprintf(stderr,"%s\n",bug_seq.c_str());
1256 fprintf(stderr,"from %llx to %llx \n",long_edge_buf.from_node->node,long_edge_buf.to_node->node);
1261 if ( long_edge_buf
.from_node
->node
== long_edge_buf
.to_node
->node
)
1263 loops
= long_edge_buf
;
1264 loops
.full_edge
= new string ( * ( long_edge_buf
.full_edge
) );
1265 loops_edges
.push_back ( loops
);
1270 output_1edge ( &long_edge_buf
, K_size
, fp
);
1280 // WARNING: the kmer atcg is different from soapdenovo's represent
1281 static void output_1edge ( preEDGE2
* long_edge
, int K_size
, FILE * fp
)
1283 fprintf ( fp
, ">length %d,", long_edge
->full_edge
->size() - K_size
);
1284 const char * seq
= long_edge
->full_edge
->c_str();
1285 //uint64_t from_kmer[2],to_kmer[2];
1286 kmer_t2 from_kmer
, to_kmer
;
1287 get_kmer_from_seq ( seq
, long_edge
->full_edge
->size() , K_size
, 0, &from_kmer
);
1288 get_kmer_from_seq ( seq
, long_edge
->full_edge
->size() , K_size
, long_edge
->full_edge
->size() - K_size
, &to_kmer
);
1289 uint64_t * from
, *to
;
1290 from
= from_kmer
.kmer
;
1293 fprintf ( fp
, "%llx %llx,", from
[0], from
[1] );
1294 fprintf ( fp
, "%llx %llx,", to
[0], to
[1] );
1297 fprintf ( fp
, "%llx %llx %llx %llx,", from
[0], from
[1], from
[2], from
[3] );
1298 fprintf ( fp
, "%llx %llx %llx %llx,", to
[0], to
[1], to
[2], to
[3] );
1300 fprintf ( fp
, "cvg %d,%d\n", long_edge
->cvg
, long_edge
->bal_edge
);
1301 fprintf ( fp
, "%s", seq
);
1302 fprintf ( fp
, "\n" );
1306 /*************************************************
1310 Marks the kmer-edges between "form_node" and "form_node->next" used.
1312 1. ht: the graph hashtable
1313 2. K_size: kmer size
1314 3. from_node: the first node of a stack
1319 *************************************************/
1320 static void dislink ( hashtable2
* ht
, int K_size
, stacked_node2
* from_node
)
1322 from_node
->edge
->used
= 1;
1323 int from_edge_len
= from_node
->edge
->len
;
1326 if ( from_node
->next
)
1328 stacked_node2
* from_next
= from_node
->next
;
1330 if ( from_next
->node
== from_node
->node
&& from_next
->is_left
!= from_node
->is_left
)
1335 if ( from_next
->is_left
) //remove right edge
1337 if ( from_next
->node
->kmer_info
.linear
)
1339 bucket2
* node_tmp
= lastKmer ( ht
, K_size
, from_next
->node
, from_next
->node
->kmer_info
.right
, 0, smaller
);
1341 if ( node_tmp
== from_node
->node
)
1343 from_next
->node
->kmer_info
.right
->used
= 1;
1347 fprintf ( stderr
, "ERROR: to node not found in dislink()\n" );
1352 edge_node
* edge_tmp
= from_next
->node
->kmer_info
.right
;
1356 bucket2
* node_tmp
= lastKmer ( ht
, K_size
, from_next
->node
, edge_tmp
, 0, smaller
);
1358 if ( node_tmp
== from_node
->node
&& edge_tmp
->len
== from_edge_len
) //there may be two or more edges between two nodes
1364 edge_tmp
= edge_tmp
->nxt_edge
;
1369 fprintf ( stderr
, "ERROR: to node not found in dislink()\n" );
1373 else // remove left edge
1375 if ( from_next
->node
->kmer_info
.linear
)
1377 bucket2
* node_tmp
= lastKmer ( ht
, K_size
, from_next
->node
, from_next
->node
->kmer_info
.left
, 1, smaller
);
1379 if ( node_tmp
== from_node
->node
)
1381 from_next
->node
->kmer_info
.left
->used
= 1;
1385 fprintf ( stderr
, "ERROR: to node not found in dislink()\n" );
1390 edge_node
* edge_tmp
= from_next
->node
->kmer_info
.left
;
1394 bucket2
* node_tmp
= lastKmer ( ht
, K_size
, from_next
->node
, edge_tmp
, 1, smaller
);
1396 if ( node_tmp
== from_node
->node
&& edge_tmp
->len
== from_edge_len
)
1402 edge_tmp
= edge_tmp
->nxt_edge
;
1407 fprintf ( stderr
, "ERROR: to node not found in dislink()\n" );
1415 /*************************************************
1419 Compacts the nodes in the stack to a sequence.
1421 1. ht: the graph hashtable
1422 2. K_size: kmer size
1423 3. stack: the stack for processing
1427 The compacted string, namely the edge sequence.
1428 *************************************************/ //63 127 differ, fixed
1429 static string
stack2string ( hashtable2
* ht
, int K_size
, list
<stacked_node2
*> & stack
)
1431 static size_t call_times
;
1434 stacked_node2
* t_stack_node
= stack
.front();
1437 kmer_t2 tmp_kmer
= ( t_stack_node
->node
->kmer_t2
);
1439 if ( t_stack_node
->is_left
)
1441 reverseCompKmer ( &tmp_kmer
, K_size
);
1447 bitsarr2str ( tmp_kmer
.kmer
, K_size
, tmp
, sizeof ( kmer_t2
) / sizeof ( uint64_t ) );
1448 full_edge
.append ( tmp
); //put first node
1450 while ( t_stack_node
)
1452 if ( t_stack_node
->edge
)
1454 if ( t_stack_node
->is_left
)
1456 bits
[0] = get_rev_comp_seq ( t_stack_node
->edge
->edge
, t_stack_node
->edge
->len
+ 1 );
1457 bitsarr2str ( bits
, t_stack_node
->edge
->len
+ 1, tmp
, 1 );
1458 full_edge
.append ( tmp
);
1462 bits
[0] = t_stack_node
->edge
->edge
;
1463 bitsarr2str ( bits
, t_stack_node
->edge
->len
+ 1, tmp
, 1 );
1464 full_edge
.append ( tmp
);
1468 t_stack_node
= t_stack_node
->next
;
1474 static bool check_palindrome ( string
& str
) //63 127 same
1476 size_t size
= str
.size();
1477 size_t mid
= ( size
/ 2 ) + 1;
1479 for ( size_t i
= 0; i
< mid
; ++i
)
1485 if ( str
[size
- i
- 1] != 'T' ) { return 0; }
1490 if ( str
[size
- i
- 1] != 'G' ) { return 0; }
1495 if ( str
[size
- i
- 1] != 'A' ) { return 0; }
1500 if ( str
[size
- i
- 1] != 'C' ) { return 0; }
1509 static string
revCompSeq ( const string
& str
)
1512 size_t size
= str
.size();
1514 for ( int i
= size
- 1; i
>= 0; i
-- )
1519 rc_seq
.push_back ( 'T' );
1522 rc_seq
.push_back ( 'G' );
1525 rc_seq
.push_back ( 'A' );
1528 rc_seq
.push_back ( 'C' );