modified: nfig1.py
[GalaxyCodeBases.git] / BGI / SOAPdenovo2 / sparsePregraph / build_preArc.cpp
blobad4f2ca4530614ccbae76af1c91cc7a7c4db89f4
1 /*
2 * build_preArc.cpp
4 * Copyright (c) 2008-2012 BGI-Shenzhen <soap at genomics dot org dot cn>.
6 * This file is part of SOAPdenovo.
8 * SOAPdenovo is free software: you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation, either version 3 of the License, or
11 * (at your option) any later version.
13 * SOAPdenovo is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
18 * You should have received a copy of the GNU General Public License
19 * along with SOAPdenovo. If not, see <http://www.gnu.org/licenses/>.
24 #include "stdinc.h"
25 #include "build_preArc.h"
26 #include "seq_util.h"
27 #include "global.h"
28 #include "multi_threads.h"
30 #include <unistd.h>
32 #include <math.h>
33 #include "bam.h"
34 #include "faidx.h"
35 #include "knetfile.h"
36 #include "sam_view.h"
37 #include "xcurses.h"
38 #include "zlib.h"
39 #include "bgzf.h"
40 #include "glf.h"
41 #include "kstring.h"
42 #include "razf.h"
43 #include "sam_header.h"
44 #include "zconf.h"
46 #include "sam.h"
48 #include "sparse_kmer.h"
50 //programming guide:
52 //step1:read edge & build vertex hash
53 //step1_include:
54 //a.create id for every edge & it's rev_comp id
55 //b.build vertex for every edge (done...)
57 //step2:read reads & chop kmers
59 //step3:search vertex with the kmers & mark the found position & get the Vertex
60 //step3_extend:if found two or more vetex in one read , record the path to solve tiny repeat
62 //step4:compare the left kmer right kmer to vertex's edge_kmer
64 //step5: if left & right both match build a preArc or increase the multiplicity by 1
68 void init_vertex_hash ( vertex_hash2 * v_ht, size_t sz )
70 v_ht->ht_sz = sz;
71 v_ht->store_pos = ( vertex2 ** ) calloc ( sz, sizeof ( vertex2 * ) );
75 /*************************************************
76 Function:
77 build_vertexes
78 Description:
79 1. Reads sequence from *.sparse.edge.
80 2. Builds vertexes by cutting the edge sequence's end kmers.
81 Input:
82 1. v_ht: vertex hashtable
83 2. K_size: kmer size
84 3. edge_file: edge file
85 Output:
86 None.
87 Return:
88 None.
89 *************************************************/
90 void build_vertexes ( vertex_hash2 * v_ht, int K_size, char * edge_file )
92 FILE * fp;
93 kmer_t2 from_kmer, to_kmer;
94 size_t line_len, edge_len_left;
95 int edge_len;
96 int cvg;
97 bool bal_ed;//»ØÎÄΪ0
98 const int BUFF_LEN = 1024;
99 char line[BUFF_LEN];
100 char str[32];
101 char to_buff[BUFF_LEN];//buffer 2k edge seq BUFF_LEN>4*K_size
102 int processed = 0; //0±íʾδ´¦Àí 1±íʾ ´¦ÀíÍê±Ï 2 ±íʾÒѾ­´¦ÀíÁËfrom_vertex ,to_vertex »¹Ã»Óд¦Àí
103 size_t edge_id = 0;
104 fp = fopen ( edge_file, "r" );
106 if ( !fp )
108 fprintf ( stderr, "ERROR: Cannot open edge_file %s. Now exit to system...\n", edge_file );
109 exit ( -1 );
112 vertex2 * v_tmp;
113 edge_starter2 * e_tmp;
114 int is_found;
115 bool is_left;
117 while ( fgets ( line, BUFF_LEN, fp ) != NULL )
119 //debug<<"processed "<<processed<<endl;
120 if ( line[0] == '>' ) //get one edge length, from vertex, to vertex,cvg,bal
122 if ( processed == 1 && bal_ed )
124 edge_id++;//Èç¹û²»ÊÇ»ØÎÄ
127 #ifdef _63MER_
128 sscanf ( line + 7, "%d,%llx %llx ,%llx %llx ,%s %d,%d", &edge_len,
129 & ( from_kmer.kmer ) [0], & ( from_kmer.kmer ) [1], & ( to_kmer.kmer ) [0], & ( to_kmer.kmer ) [1], str, &cvg, &bal_ed ); // from_kmer to_kmer is of no use here
130 #endif
131 #ifdef _127MER_
132 sscanf ( line + 7, "%d,%llx %llx %llx %llx ,%llx %llx %llx %llx ,%s %d,%d", &edge_len,
133 & ( from_kmer.kmer ) [0], & ( from_kmer.kmer ) [1], & ( from_kmer.kmer ) [2], & ( from_kmer.kmer ) [3],
134 & ( to_kmer.kmer ) [0], & ( to_kmer.kmer ) [1], & ( to_kmer.kmer ) [2], & ( to_kmer.kmer ) [3], str, &cvg, &bal_ed ); // from_kmer to_kmer is of no use here
135 #endif
136 edge_len_left = K_size + edge_len;
137 processed = 0;
138 edge_id++;// current edge positive strand id
139 //debug<<line<<"edge_id "<<edge_id<<endl;
141 else
143 if ( processed == 0 )
145 line_len = strlen ( line );
147 if ( line[line_len - 1] == '\n' )
149 line[line_len - 1] = '\0';
150 line_len --;
153 if ( edge_len_left - line_len == 0 ) //edge completely loaded
155 //do all process
156 process_edge ( v_ht, K_size, line, line_len, 1, edge_id, bal_ed );
157 processed = 1;
158 edge_len_left = 0;
159 continue;
161 else //edge partly loaded at the first time.
163 if ( line_len < 2 * K_size ) //line_len < 2*K_size &&edge_len_left - line_len > 0
165 fprintf ( stderr, "ERROR:it won't happen in 63mer/127mer\n" );
166 exit ( 1 );
168 else
170 process_edge ( v_ht, K_size, line, line_len, 2, edge_id, bal_ed );
171 processed = 2;
172 edge_len_left -= line_len;
174 if ( edge_len_left >= 2 * K_size )
176 //no need to buf the to kmer seq
178 else if ( edge_len_left < 2 * K_size )
180 //to_buff[100];/ copy the last 2K char of line to to_buff already no '\n'
181 strcpy ( to_buff, line + ( line_len - 2 * K_size ) );
183 else
185 fprintf ( stderr, "ERROR: in cal the edge_len_left!!\n" );
186 exit ( 1 );
191 else if ( processed == 2 )
193 //if(line[0]=='\n') continue;
194 line_len = strlen ( line );
196 if ( line[line_len - 1] == '\n' )
198 line[line_len - 1] = '\0';
199 line_len --;
202 edge_len_left -= line_len;
204 if ( edge_len_left == 0 ) //load the complete edge sequence
206 //process the to kmer
207 if ( line_len >= 2 * K_size )
209 process_edge ( v_ht, K_size, line, line_len, 3, edge_id, bal_ed );
211 else
213 //need to use the to_buff
214 int buf_len = strlen ( to_buff );
215 strcpy ( to_buff + buf_len, line );
216 buf_len = strlen ( to_buff );
217 process_edge ( v_ht, K_size, to_buff, buf_len, 3, edge_id, bal_ed );
220 processed = 1;
221 continue;
223 else
225 if ( edge_len_left >= 2 * K_size )
227 //no need to buf the to kmer seq
229 else if ( edge_len_left < 2 * K_size )
231 //to_buff[100];/ copy the last 2K char of line to to_buff
232 strcpy ( to_buff, line + ( line_len - 2 * K_size ) );
234 else
236 fprintf ( stderr, "ERROR: in cal the edge_len_left!!\n" );
237 exit ( 1 );
241 else
243 if ( line[0] == '\n' ) { continue; } //µ±len = 1023ʱ
245 fprintf ( stderr, "ERROR: in cal the status_processed !! %d \n", processed );
246 exit ( 1 );
251 fclose ( fp );
255 /*************************************************
256 Function:
257 process_edge
258 Description:
259 It builds vetexes from one or part of one edge sequence.
260 Input:
261 1. v_ht: hashtable
262 2. K_size: kmer size
263 3. seq: edge sequence
264 4. len: edge length
265 5. type: 1: process head and tail; 2: process head ; 3:process tail
266 6. edge_id: edge id
267 7. bal_edge: 0:palindrome 1:else
268 Output:
269 None.
270 Return:
271 None.
272 *************************************************/
273 static void process_edge ( vertex_hash2 * v_ht, int K_size, char * seq, int len, int type, size_t edge_id, bool bal_edge )
275 kmer_t2 vertex_kmer;
276 kmer_t2 edge_kmer;
277 vertex2 * v_tmp;
278 edge_starter2 * e_tmp;
279 int is_found;
280 bool is_left;
281 int edge_kmer_len;
283 switch ( type )
285 case 1: //process all ..
286 //process the head
287 get_kmer_from_seq ( seq, len, K_size, 0, &vertex_kmer );
289 if ( len <= K_size + gap ) //get the last kmer
291 get_kmer_from_seq ( seq, len, K_size, len - K_size, &edge_kmer );
292 edge_kmer_len = len - K_size;
294 else
296 //get_kmer_from_seq(seq, len, K_size, K_size,&edge_kmer);
297 get_kmer_from_seq ( seq, len, K_size, gap, &edge_kmer );
298 edge_kmer_len = gap;
301 is_left = 0;//right
302 v_tmp = put_vertex ( v_ht, vertex_kmer, is_found );
303 put_edge ( v_tmp, edge_kmer, is_left, edge_kmer_len, edge_id );
304 reverseCompKmer ( &vertex_kmer, K_size );
305 reverseCompKmer ( &edge_kmer, K_size );
306 is_left = 1;//left
307 v_tmp = put_vertex ( v_ht, vertex_kmer, is_found );
308 put_edge ( v_tmp, edge_kmer, is_left, edge_kmer_len, edge_id + bal_edge );
309 //process the tail
310 get_kmer_from_seq ( seq, len, K_size, len - K_size, &vertex_kmer );
312 if ( len <= K_size + gap ) //get the first kmer
314 get_kmer_from_seq ( seq, len, K_size, 0, &edge_kmer );
315 edge_kmer_len = len - K_size;
317 else
319 get_kmer_from_seq ( seq, len, K_size, len - K_size - gap, &edge_kmer );
320 edge_kmer_len = gap;
323 is_left = 1;
324 v_tmp = put_vertex ( v_ht, vertex_kmer, is_found );
325 put_edge ( v_tmp, edge_kmer, is_left, edge_kmer_len, edge_id );
326 reverseCompKmer ( &vertex_kmer, K_size );
327 reverseCompKmer ( &edge_kmer, K_size );
328 is_left = 0;//right
329 v_tmp = put_vertex ( v_ht, vertex_kmer, is_found );
330 put_edge ( v_tmp, edge_kmer, is_left, edge_kmer_len, edge_id + bal_edge );
331 break;
332 case 2:
333 //process only the head
334 get_kmer_from_seq ( seq, len, K_size, 0, &vertex_kmer );
336 if ( len <= K_size + gap )
338 get_kmer_from_seq ( seq, len, K_size, len - K_size, &edge_kmer );
339 edge_kmer_len = len - K_size;
341 else
343 get_kmer_from_seq ( seq, len, K_size, gap, &edge_kmer );
344 edge_kmer_len = gap;
347 is_left = 0;//right
348 v_tmp = put_vertex ( v_ht, vertex_kmer, is_found );
349 put_edge ( v_tmp, edge_kmer, is_left, edge_kmer_len, edge_id );
350 reverseCompKmer ( &vertex_kmer, K_size );
351 reverseCompKmer ( &edge_kmer, K_size );
352 is_left = 1;//left
353 v_tmp = put_vertex ( v_ht, vertex_kmer, is_found );
354 put_edge ( v_tmp, edge_kmer, is_left, edge_kmer_len, edge_id + bal_edge );
355 break;
356 case 3:
357 //process only the tail
358 get_kmer_from_seq ( seq, len, K_size, len - K_size, &vertex_kmer );
360 if ( len <= K_size + gap )
362 get_kmer_from_seq ( seq, len, K_size, 0, &edge_kmer );
363 edge_kmer_len = len - K_size;
365 else
367 get_kmer_from_seq ( seq, len, K_size, len - K_size - gap, &edge_kmer );
368 edge_kmer_len = gap;
371 is_left = 1;
372 v_tmp = put_vertex ( v_ht, vertex_kmer, is_found );
373 put_edge ( v_tmp, edge_kmer, is_left, edge_kmer_len, edge_id );
374 reverseCompKmer ( &vertex_kmer, K_size );
375 reverseCompKmer ( &edge_kmer, K_size );
376 is_left = 0;//right
377 v_tmp = put_vertex ( v_ht, vertex_kmer, is_found );
378 put_edge ( v_tmp, edge_kmer, is_left, edge_kmer_len, edge_id + bal_edge );
379 break;
380 default:
381 fprintf ( stderr, "ERROR: wrong process type in process_edge()\n" );
382 exit ( 1 );
387 static vertex2 * put_vertex ( vertex_hash2 * v_ht, kmer_t2 vertex_kmer, int & is_found ) //63 127 differ fixed
389 uint64_t hv = MurmurHash64A ( vertex_kmer.kmer, sizeof ( kmer_t2 ), 0 ); //hash value
390 uint64_t idx = ( size_t ) ( hv % v_ht->ht_sz );
391 vertex2 * ver = ( v_ht->store_pos ) [idx];
393 if ( !ver )
395 ( v_ht->store_pos ) [idx] = ( vertex2 * ) malloc ( sizeof ( vertex2 ) );
396 ver = ( v_ht->store_pos ) [idx];
397 ver->kmer_t2 = vertex_kmer;
398 ver->left = NULL;
399 ver->right = NULL;
400 ver->next = NULL;
401 is_found = 0;
402 return ver;
405 while ( ver )
407 if ( kmerCompare ( & ( ver->kmer_t2 ), &vertex_kmer ) == 0 )
409 is_found = 1;
410 return ver;
413 if ( ver->next == NULL ) { break; }
415 ver = ver->next;
418 is_found = 0;
419 ver->next = ( vertex2 * ) malloc ( sizeof ( vertex2 ) );
420 ver->next->kmer_t2 = vertex_kmer;
421 ver->next->left = NULL;
422 ver->next->right = NULL;
423 ver->next->next = NULL;
424 return ver->next;
427 static void put_edge ( vertex2 * ver, kmer_t2 edge_kmer, bool is_left, int len, size_t edge_id ) //fixed
429 edge_starter2 * tmp = NULL;
431 if ( is_left )
433 if ( !ver->left )
435 ver->left = ( edge_starter2 * ) malloc ( sizeof ( edge_starter2 ) );
436 ver->left->edge_kmer = edge_kmer;
437 ver->left->edge_id = edge_id;
438 ver->left->len = len;//record the length of edge (1~k)
439 ver->left->next = NULL;
440 return;
443 tmp = ver->left;
445 else
447 if ( !ver->right )
449 ver->right = ( edge_starter2 * ) malloc ( sizeof ( edge_starter2 ) );
450 ver->right->edge_kmer = edge_kmer;
451 ver->right->edge_id = edge_id;
452 ver->right->len = len;//record the length of edge (1~k)
453 ver->right->next = NULL;
454 return;
457 tmp = ver->right;
460 while ( tmp->next ) //because there are no two edges equal attached with one node ...
462 tmp = tmp->next;
465 tmp->next = ( edge_starter2 * ) malloc ( sizeof ( edge_starter2 ) );
466 tmp->next->edge_kmer = edge_kmer;
467 tmp->next->edge_id = edge_id;
468 tmp->next->len = len;//record the length of edge (1~k)
469 tmp->next->next = NULL;
475 static vertex2 * search_vertex ( vertex_hash2 * v_ht, kmer_t2 * vertex_kmer ) //fixed ...
477 uint64_t hv = MurmurHash64A ( vertex_kmer->kmer, sizeof ( kmer_t2 ), 0 ); //hash value
478 uint64_t idx = ( size_t ) ( hv % v_ht->ht_sz );
479 vertex2 * ver = ( v_ht->store_pos ) [idx];
481 while ( ver )
483 if ( kmerCompare ( & ( ver->kmer_t2 ), vertex_kmer ) == 0 )
485 return ver;
488 ver = ver->next;
491 return NULL;
496 void init_preArc_array ( preArc_array * arc_array, size_t sz ) //63 127 same
498 arc_array->array_sz = sz;
499 arc_array->store_pos = ( preArc ** ) calloc ( sz, sizeof ( preArc * ) );
503 static void chop_kmers ( const char * read, int len, int K_size, kmer_t2 * kmer_array, int kmer_array_len, int & kmer_num )
505 if ( len <= K_size )
507 kmer_num = 0;
508 return ;
511 kmer_num = len - K_size + 1;
513 if ( kmer_num > kmer_array_len )
515 fprintf ( stderr, "ERROR: the kmer_array_len is not enough! %d\n", kmer_num );
516 exit ( 1 );
519 kmer_t2 kmer;
521 for ( int i = 0; i < kmer_num; ++i ) //optimize later
523 get_kmer_from_seq ( read, len, K_size, i, &kmer );
524 kmer_array[i] = kmer;
528 /*************************************************
529 Function:
530 put_preArc_threaded
531 Description:
532 Stores one preArc into preArc_array
533 Input:
534 1. arc_arr: preArc array
535 2. locks: the locks array for modifying preArc array
536 3. left_id: left edge id
537 4. right_id: right edge id
538 5. added_multi: added weigth
539 Output:
540 None.
541 Return:
542 None.
543 *************************************************/
544 static inline void put_preArc_threaded ( preArc_array * arc_arr, pthread_spinlock_t * locks, size_t left_id, size_t right_id, int added_multi )
546 pthread_spin_lock ( &locks[left_id] );
547 put_preArc ( arc_arr, left_id, right_id, added_multi );
548 pthread_spin_unlock ( &locks[left_id] );
552 static inline void put_preArc ( preArc_array * arc_arr, size_t left_id, size_t right_id, int added_multi )
554 preArc * arc = ( arc_arr->store_pos ) [left_id];
556 if ( !arc )
558 ( arc_arr->store_pos ) [left_id] = ( preArc * ) malloc ( sizeof ( preArc ) );
559 arc = ( arc_arr->store_pos ) [left_id];
560 arc->to_ed = right_id;
561 arc->multiplicity = added_multi;
562 arc->next = NULL;
563 return;
566 while ( arc )
568 if ( arc->to_ed == right_id )
570 arc->multiplicity += added_multi;
571 return;
574 if ( arc->next == NULL ) { break; }
576 arc = arc->next;
579 arc->next = ( preArc * ) malloc ( sizeof ( preArc ) );
580 arc->next->to_ed = right_id;
581 arc->next->multiplicity = added_multi;
582 arc->next->next = NULL;
586 static inline void put_preArc ( preArc_array * arc_arr, size_t left_id, size_t right_id )
588 preArc * arc = ( arc_arr->store_pos ) [left_id];
590 if ( !arc )
592 ( arc_arr->store_pos ) [left_id] = ( preArc * ) malloc ( sizeof ( preArc ) );
593 arc = ( arc_arr->store_pos ) [left_id];
594 arc->to_ed = right_id;
595 arc->multiplicity = 1;
596 arc->next = NULL;
597 return;
600 while ( arc )
602 if ( arc->to_ed == right_id )
604 arc->multiplicity++;
605 return;
608 if ( arc->next == NULL ) { break; }
610 arc = arc->next;
613 arc->next = ( preArc * ) malloc ( sizeof ( preArc ) );
614 arc->next->to_ed = right_id;
615 arc->next->multiplicity = 1;
616 arc->next->next = NULL;
620 void output_preArcs ( preArc_array * arc_arr, char * outfile )
622 FILE * fp;
623 fp = fopen ( outfile, "w" );
625 if ( !fp )
627 fprintf ( stderr, "ERROR: can't create file %s in output_preArc\n", outfile );
628 exit ( 1 );
631 preArc * parc;
633 for ( size_t i = 0; i < arc_arr->array_sz; ++i )
635 parc = ( arc_arr->store_pos ) [i];
637 if ( parc )
639 fprintf ( fp, "%u", i );
640 int j = 0;
642 while ( parc )
644 j++;
645 fprintf ( fp, " %u %u", parc->to_ed, parc->multiplicity );
646 parc = parc->next;
648 if ( parc && j % 4 == 0 )
650 fprintf ( fp, "\n" );
651 fprintf ( fp, "%u", i );
655 fprintf ( fp, "\n" );
659 fclose ( fp );
663 static void free_vertex ( vertex2 * tmp )
665 edge_starter2 * edge_s, *edge_s2;
666 edge_s = tmp->left;
668 while ( edge_s )
670 edge_s2 = edge_s;
671 edge_s = edge_s->next;
672 free ( edge_s2 );
675 edge_s = tmp->right;
677 while ( edge_s )
679 edge_s2 = edge_s;
680 edge_s = edge_s->next;
681 free ( edge_s2 );
684 free ( tmp );
687 void free_vertex_hash ( vertex_hash2 * v_ht )
689 vertex2 * tmp, *tmp2;
691 for ( size_t i = 0; i < v_ht->ht_sz; ++i )
693 tmp = ( v_ht->store_pos ) [i];
695 while ( tmp )
697 tmp2 = tmp;
698 tmp = tmp->next;
699 free_vertex ( tmp2 );
703 free ( v_ht->store_pos );
706 /*************************************************
707 Function:
708 process_1read_preArc
709 Description:
710 This is the core function for building preArcs.
711 1. Chops one read into kmers.
712 2. Searches the kmers in vertex hash.
713 3. Aligns the vertex's kmer-edge sequences to the read sequence on both sides.
714 4. Constructs preArcs according the mapping result on both sides of a vertex.
716 @since r53:
717 5. add -R support, solves tiny repeat.
718 Input:
719 1. arc_arr: preArc array
720 2. locks: locks array
721 3. v_ht: vertex hash
722 4. K_size: kmer size
723 5. cut_off_len: cut off length
724 6. read: read
725 Output:
726 None.
727 Return:
728 None.
729 *************************************************/
730 void process_1read_preArc ( preArc_array * arc_arr, pthread_spinlock_t * locks, int thread_id, vertex_hash2 * v_ht, int K_size, int cut_off_len, const char * read )
732 const int BUFF_LEN = 1024;
733 kmer_t2 kmers[BUFF_LEN];
734 int kmer_array_len = cut_off_len - K_size + 1;
735 int kmer_num ;
736 vertex2 * v_tmp;
737 edge_starter2 * e_tmp;
738 size_t left_id;
739 size_t right_id;
740 int left_found = 0, right_found = 0;
741 int edge_len;
742 //update
743 //int map_len;
744 //int shortest_maplen = 0;
745 //add for -R solving tiny repeats
746 unsigned int path[128];
747 unsigned int counter = 0;
748 //int read_len,i=0;
749 int read_len = strlen ( read );
751 while(read[i]!='\0'){
752 i++;
754 read_len = i;
755 //read_len = strlen(read);
756 if(read[read_len-1]=='\n'){
757 read[read_len-1]='\0';
758 read_len--;
761 if ( read_len > cut_off_len ) { read_len = cut_off_len; }
763 kmer_array_len = read_len - K_size + 1;
764 chop_kmers ( read, read_len, K_size, kmers, kmer_array_len, kmer_num );
766 for ( int i = 1; i < kmer_num - 1; ++i ) //search every kmer exclude the begin and end kmer
768 v_tmp = search_vertex ( v_ht, &kmers[i] );
770 if ( v_tmp ) //found
772 //search left edge kmer got left id
773 e_tmp = v_tmp->left;
775 while ( e_tmp )
777 edge_len = e_tmp->len;
779 if ( edge_len <= i )
781 if ( kmerCompare ( & ( kmers[i - edge_len] ), & ( e_tmp->edge_kmer ) ) == 0 )
783 left_id = e_tmp->edge_id;
785 if ( left_found )
787 fprintf ( stderr, "ERROR: left edge id found already !new found id %llu \n", left_id );
788 fprintf ( stderr, "i:%d ,edge_len:%d\n", i, edge_len );
789 printKmerSeq ( & ( kmers[i - edge_len] ), K_size, stderr );
790 printKmerSeq ( & ( e_tmp->edge_kmer ), K_size, stderr );
791 exit ( 1 );
794 left_found = 1;
796 break;
799 else
801 kmer_t2 read_edge = kmers[0];
803 if ( K_size > i )
805 kmerMoveRight ( &read_edge, K_size - i );
808 kmer_t2 KMER_FILTER;
809 initKmerFilter ( i, &KMER_FILTER );
810 kmer_t2 edge_kmer = e_tmp->edge_kmer;
812 if ( K_size > edge_len )
814 kmerMoveRight ( &edge_kmer, K_size - edge_len );
817 kmerAnd ( &read_edge, &KMER_FILTER );
818 kmerAnd ( &edge_kmer, &KMER_FILTER );
820 if ( kmerCompare ( &read_edge, &edge_kmer ) == 0 )
822 left_found++;
823 left_id = e_tmp->edge_id;
825 if ( left_found == 2 )
827 //debug_build<<"can't distinct which left edge\n";
828 break;
833 e_tmp = e_tmp->next;
836 //update maplen_control
838 if(edge_len >= shortest_maplen){
839 if(map_len < shortest_maplen) left_found = 0;
840 }else{
841 if(map_len != edge_len) left_found = 0;
844 if ( left_found != 1 ) {left_found = 0; right_found = 0; continue;} //not found or multi found
846 //todo : aln if left_found = 0 ... find the best
847 //search right edge kmer got right id
848 e_tmp = v_tmp->right;
850 while ( e_tmp )
852 edge_len = e_tmp->len;
854 if ( edge_len <= kmer_num - 1 - i )
856 if ( kmerCompare ( & ( kmers[i + edge_len] ), & ( e_tmp->edge_kmer ) ) == 0 )
858 right_id = e_tmp->edge_id;
860 if ( right_found )
862 fprintf ( stderr, "ERROR: right edge id found already, new found id %llu !\n", right_id );
863 fprintf ( stderr, "i:%d ,edge_len:%d\n", i, edge_len );
864 printKmerSeq ( & ( kmers[i + edge_len] ), K_size, stderr );
865 printKmerSeq ( & ( e_tmp->edge_kmer ), K_size, stderr );
866 exit ( 1 );
869 right_found = 1;
871 break;
874 else
876 int read_edge_len = ( kmer_num - 1 - i );
877 kmer_t2 KMER_FILTER;
878 initKmerFilter ( read_edge_len, &KMER_FILTER );
879 kmer_t2 read_edge = kmers[kmer_num - 1];
880 kmerAnd ( &read_edge, &KMER_FILTER );
881 kmer_t2 edge_kmer = e_tmp->edge_kmer;
883 if ( edge_len > read_edge_len )
885 kmerMoveRight ( &edge_kmer, ( edge_len - read_edge_len ) );
888 kmerAnd ( &edge_kmer, &KMER_FILTER );
890 if ( kmerCompare ( &read_edge, &edge_kmer ) == 0 )
892 right_found++;
893 right_id = e_tmp->edge_id;
895 if ( right_found == 2 )
897 //debug_build<<"can't distinct which right edge\n";
898 break;
903 e_tmp = e_tmp->next;
906 //update map_len control
908 if(edge_len >= shortest_maplen){
909 if(map_len < shortest_maplen) right_found = 0;
910 }else{
911 if(map_len != edge_len) right_found = 0;
914 if ( right_found != 1 ) {left_found = 0; right_found = 0; continue;}
916 //todo : aln if right_found = 0 ... find the best
917 //if(left_found == 1 && right_found ==1)
918 //store this preArc
919 //preArc_array *arc_arr
920 put_preArc_threaded ( arc_arr, locks, left_id, right_id, 1 );
922 //constructing the path ...
923 if ( solve )
925 if ( counter == 0 )
927 counter = 2;
928 path[1] = left_id;
929 path[2] = right_id;
931 else if ( counter <= 100 )
933 if ( path[counter] == left_id )
935 path[++counter] = right_id;
937 else
939 path[++counter] = left_id;
940 path[++counter] = right_id;
945 //end ...
946 left_found = 0;
947 right_found = 0;
951 //add to path buffer , if full filled ,output it
952 if ( solve )
954 if ( counter >= 3 && counter <= 100 )
956 path[0] = counter;
957 int tmp = is_full ( path_buffer[thread_id] );
959 if ( tmp == 1 )
961 //output it
962 output_edge_path_buffer_locked ( path_buffer[thread_id], path_fp, &file_lock );
964 else if ( tmp == -1 )
966 //error status
967 fprintf ( stderr, "ERROR: path buffer overflow!! system exit .\n" );
968 exit ( -1 );
971 put_path_2_buffer ( path_buffer[thread_id], path );
977 void free_preArc_array ( preArc_array * arc_array )
979 preArc * tmp, *tmp2;
981 for ( size_t i = 0; i < arc_array->array_sz; ++i )
983 tmp = ( arc_array->store_pos ) [i];
985 while ( tmp )
987 tmp2 = tmp;
988 tmp = tmp->next;
989 free ( tmp2 );
993 free ( arc_array->store_pos );
998 /*************************************************
999 Function:
1000 build_preArc_threaded
1001 Description:
1002 This is the main entry for building preArcs.
1003 Input:
1004 1. arc_arr: preArc array
1005 2. v_ht: vertex hash
1006 3. K_size: kmer size
1007 4. cut_off_len: cut off length
1008 5. in_filenames_vt: input reads file names
1009 6. thread_num: thread number
1010 Output:
1011 None.
1012 Return:
1013 None.
1014 *************************************************/
1015 void build_preArc_threaded ( preArc_array * arc_arr, vertex_hash2 * v_ht, int K_size, int cut_off_len, vector<string> *in_filenames_vt, int thread_num )
1017 //create main io thread
1018 int read_buf_sz = 102400 * thrd_num_s;
1019 read_buf0 = new string[read_buf_sz];
1020 read_buf1 = new string[read_buf_sz];
1021 io_stat0 = 1; //must be one, if io_stat0 =0 ,the io thread will work immediately
1022 io_stat1 = 1;
1023 io_ready = 0;
1024 io_para_main io_para_mains;
1025 io_para_mains.read_buf_sz = read_buf_sz;
1026 io_para_mains.in_filenames_vt = in_filenames_vt;
1027 pthread_t io_thread;
1028 int temp;
1030 //fprintf(stderr,"Creating main io thread ...\n");
1031 if ( ( temp = pthread_create ( &io_thread, NULL, run_io_thread_main, &io_para_mains ) ) != 0 )
1033 fprintf ( stderr, "ERROR: failed creating main io thread.\n" );
1034 exit ( -1 );
1037 fprintf ( stderr, "1 io thread initialized.\n" );
1038 //create work threads ..
1039 //fprintf(stderr,"Creating work threads ...\n");
1040 pthread_t threads[thrd_num_s];
1041 unsigned char thrdSignal[thrd_num_s + 1];
1042 PARAMETER paras[thrd_num_s];
1043 locks = ( pthread_spinlock_t * ) calloc ( arc_arr->array_sz, sizeof ( pthread_spinlock_t ) );
1045 //init as unlock stat ..
1046 for ( size_t i = 0; i < arc_arr->array_sz; ++i )
1048 locks[i] = 1;
1051 for ( int k = 0; k < thrd_num_s; k++ )
1053 thrdSignal[k + 1] = 0;
1054 paras[k].threadID = k;
1055 paras[k].mainSignal = &thrdSignal[0];
1056 paras[k].selfSignal = &thrdSignal[k + 1];
1057 paras[k].ht = NULL;
1058 paras[k].preArcs = arc_arr;
1059 paras[k].v_ht = v_ht;
1060 paras[k].cut_off_len = cut_off_len;
1061 paras[k].K_size = K_size;
1062 paras[k].gap = gap;
1065 creatThrds ( threads, paras );
1066 thrdSignal[0] = 0;
1068 //run it
1069 while ( 1 )
1071 sendIOWorkSignal();
1073 while ( io_ready == 0 ) {usleep ( 1 );}
1075 if ( io_ready )
1077 sendWorkSignal ( 12, thrdSignal );
1080 if ( io_ready == 2 )
1082 //fprintf(stderr,"All reads have been processed!\n");
1083 break;
1087 sendWorkSignal ( 3, thrdSignal );
1088 thread_wait ( threads );
1089 delete [] read_buf0;
1090 delete [] read_buf1;
1091 free ( ( void * ) locks );
1092 free_vertex_hash ( v_ht );
1096 /*************************************************
1097 Function:
1098 create_edge_path_buffer
1099 Description:
1100 Creates an edge_path_buffer struct dynamicly.
1101 Input:
1102 None.
1103 Output:
1104 None.
1105 Return:
1106 an edge_path_buffer pointer to heap
1107 *************************************************/
1108 edge_path_buffer * create_edge_path_buffer
1109 ( unsigned int * mark_on_edge,
1110 pthread_spinlock_t * locks,
1111 unsigned long long buff_size,
1112 unsigned int max_path_length )
1114 if ( ! ( mark_on_edge && locks ) )
1116 fprintf ( stderr, "ERROR: The initial mark_on_edge array or locks are not valid! Exit System ...\n" );
1117 exit ( -1 );
1120 edge_path_buffer * new_buffer = ( edge_path_buffer * ) calloc ( 1, sizeof ( edge_path_buffer ) );
1121 new_buffer->mark_on_edge = mark_on_edge;
1122 new_buffer->locks = locks;
1123 new_buffer->buff_size = buff_size;
1124 new_buffer->max_path_length = max_path_length;
1125 new_buffer->filled_num = 0;
1126 new_buffer->path_buffer = NULL;
1127 unsigned int ** tmp;
1128 tmp = ( unsigned int ** ) calloc ( buff_size, sizeof ( unsigned int * ) );
1130 for ( size_t i = 0; i < buff_size; i++ )
1132 tmp[i] = ( unsigned int * ) calloc ( max_path_length, sizeof ( unsigned int ) );
1135 new_buffer->path_buffer = tmp;
1136 return new_buffer;
1139 /*************************************************
1140 Function:
1141 create_edge_path_buffer
1142 Description:
1143 free the path_buffer in an edge_path_buffer struct
1144 Input:
1145 None
1146 Output:
1147 None
1148 Return:
1149 None
1150 *************************************************/
1151 void destory_edge_path_buffer ( struct edge_path_buffer * buffer )
1153 unsigned int ** tmp = buffer->path_buffer;
1155 for ( size_t i = 0; i < buffer->buff_size; i++ )
1157 free ( ( void * ) ( tmp[i] ) );
1160 free ( ( void * ) tmp );
1161 buffer->filled_num = 0;
1164 void clear_edge_path_buffer ( struct edge_path_buffer * buffer )
1166 unsigned int ** tmp = buffer->path_buffer;
1168 for ( size_t i = 0; i < buffer->buff_size; i++ )
1170 memset ( tmp[i], 0, buffer->max_path_length * sizeof ( unsigned int ) );
1173 buffer->filled_num = 0;
1177 void output_edge_path_buffer ( struct edge_path_buffer * buffer, FILE * path_file )
1179 if ( debug )
1181 static size_t times = 0, total = 0;
1182 total += buffer->filled_num;
1183 fprintf ( stderr, "call output_edge_path_buffer %lu %lu\n", times++, total );
1186 if ( !path_file )
1188 fprintf ( stderr, "ERROR: The path_file is not avilable!\n" );
1189 exit ( -1 );
1192 unsigned int counter;
1193 unsigned int ** tmp = buffer->path_buffer;
1195 for ( size_t i = 0; i < buffer->filled_num; i++ )
1197 counter = tmp[i][0];
1198 fwrite ( &counter, sizeof ( char ), 1, path_file );
1199 fwrite ( tmp[i] + 1, sizeof ( unsigned int ), ( int ) counter, path_file );
1202 buffer->filled_num = 0;
1206 /*************************************************
1207 Function:
1208 output_edge_path_buffer_locked
1209 Description:
1210 Output the buffer to a file for multi-threading.
1211 Input:
1212 None.
1213 Output:
1214 None.
1215 Return:
1216 None.
1217 *************************************************/
1218 void output_edge_path_buffer_locked ( struct edge_path_buffer * buffer, FILE * path_file, pthread_mutex_t * file_mutex )
1220 static size_t times = 0, total = 0;;
1222 if ( !path_file )
1224 fprintf ( stderr, "ERROR: The path_file is not avilable!\n" );
1225 exit ( -1 );
1228 unsigned int counter;
1229 unsigned int ** tmp = buffer->path_buffer;
1230 pthread_mutex_lock ( file_mutex );
1232 if ( debug )
1234 total += buffer->filled_num;
1235 fprintf ( stderr, "call output_edge_path_buffer_locked %lu %lu\n", times++, total );
1238 for ( size_t i = 0; i < buffer->filled_num; i++ )
1240 counter = tmp[i][0];
1241 fwrite ( &counter, sizeof ( char ), 1, path_file );
1242 fwrite ( tmp[i] + 1, sizeof ( unsigned int ), ( int ) counter, path_file );
1245 pthread_mutex_unlock ( file_mutex );
1246 buffer->filled_num = 0;
1251 /*************************************************
1252 Function:
1253 put_path_2_buffer
1254 Description:
1255 Stores the path to buffer and update mark_on_edge array at the same time.
1256 Input:
1257 None.
1258 Output:
1259 None.
1260 Return:
1261 None.
1262 *************************************************/
1263 int put_path_2_buffer ( struct edge_path_buffer * buffer, unsigned int * path )
1265 if ( debug )
1267 static size_t times = 0;
1268 static pthread_spinlock_t lock = 1;
1269 pthread_spin_lock ( &lock );
1270 fprintf ( stderr, "call put_path_2_buffer %lu\n", times++ );
1271 pthread_spin_unlock ( &lock );
1274 unsigned long long pos = buffer->filled_num;
1276 if ( pos >= buffer->buff_size )
1278 return -1;
1281 memcpy ( ( buffer->path_buffer ) [pos], path, buffer->max_path_length * sizeof ( unsigned int ) );
1283 for ( unsigned int i = 1; i < path[0]; i++ )
1285 pthread_spin_lock ( ( buffer->locks ) + path[i] );
1286 ( ( buffer->mark_on_edge ) [path[i]] ) ++;
1287 pthread_spin_unlock ( ( buffer->locks ) + path[i] );
1290 buffer->filled_num++;
1291 return 1;
1296 int is_full ( struct edge_path_buffer * buffer )
1298 if ( buffer->filled_num == buffer->buff_size )
1300 return 1;
1302 else if ( buffer->filled_num < buffer->buff_size )
1304 return 0;
1306 else
1308 return -1;
1313 void clear_status ( struct edge_path_buffer * buffer )
1315 buffer->filled_num = 0;