4 * Copyright (c) 2008-2012 BGI-Shenzhen <soap at genomics dot org dot cn>.
6 * This file is part of SOAPdenovo.
8 * SOAPdenovo is free software: you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation, either version 3 of the License, or
11 * (at your option) any later version.
13 * SOAPdenovo is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
18 * You should have received a copy of the GNU General Public License
19 * along with SOAPdenovo. If not, see <http://www.gnu.org/licenses/>.
25 #include "build_preArc.h"
28 #include "multi_threads.h"
43 #include "sam_header.h"
48 #include "sparse_kmer.h"
52 //step1:read edge & build vertex hash
54 //a.create id for every edge & it's rev_comp id
55 //b.build vertex for every edge (done...)
57 //step2:read reads & chop kmers
59 //step3:search vertex with the kmers & mark the found position & get the Vertex
60 //step3_extend:if found two or more vetex in one read , record the path to solve tiny repeat
62 //step4:compare the left kmer right kmer to vertex's edge_kmer
64 //step5: if left & right both match build a preArc or increase the multiplicity by 1
68 void init_vertex_hash ( vertex_hash2
* v_ht
, size_t sz
)
71 v_ht
->store_pos
= ( vertex2
** ) calloc ( sz
, sizeof ( vertex2
* ) );
75 /*************************************************
79 1. Reads sequence from *.sparse.edge.
80 2. Builds vertexes by cutting the edge sequence's end kmers.
82 1. v_ht: vertex hashtable
84 3. edge_file: edge file
89 *************************************************/
90 void build_vertexes ( vertex_hash2
* v_ht
, int K_size
, char * edge_file
)
93 kmer_t2 from_kmer
, to_kmer
;
94 size_t line_len
, edge_len_left
;
98 const int BUFF_LEN
= 1024;
101 char to_buff
[BUFF_LEN
];//buffer 2k edge seq BUFF_LEN>4*K_size
102 int processed
= 0; //0±íʾδ´¦Àí 1±íʾ ´¦ÀíÍê±Ï 2 ±íʾÒѾ´¦ÀíÁËfrom_vertex ,to_vertex »¹Ã»Óд¦Àí
104 fp
= fopen ( edge_file
, "r" );
108 fprintf ( stderr
, "ERROR: Cannot open edge_file %s. Now exit to system...\n", edge_file
);
113 edge_starter2
* e_tmp
;
117 while ( fgets ( line
, BUFF_LEN
, fp
) != NULL
)
119 //debug<<"processed "<<processed<<endl;
120 if ( line
[0] == '>' ) //get one edge length, from vertex, to vertex,cvg,bal
122 if ( processed
== 1 && bal_ed
)
124 edge_id
++;//Èç¹û²»ÊÇ»ØÎÄ
128 sscanf ( line
+ 7, "%d,%llx %llx ,%llx %llx ,%s %d,%d", &edge_len
,
129 & ( from_kmer
.kmer
) [0], & ( from_kmer
.kmer
) [1], & ( to_kmer
.kmer
) [0], & ( to_kmer
.kmer
) [1], str
, &cvg
, &bal_ed
); // from_kmer to_kmer is of no use here
132 sscanf ( line
+ 7, "%d,%llx %llx %llx %llx ,%llx %llx %llx %llx ,%s %d,%d", &edge_len
,
133 & ( from_kmer
.kmer
) [0], & ( from_kmer
.kmer
) [1], & ( from_kmer
.kmer
) [2], & ( from_kmer
.kmer
) [3],
134 & ( to_kmer
.kmer
) [0], & ( to_kmer
.kmer
) [1], & ( to_kmer
.kmer
) [2], & ( to_kmer
.kmer
) [3], str
, &cvg
, &bal_ed
); // from_kmer to_kmer is of no use here
136 edge_len_left
= K_size
+ edge_len
;
138 edge_id
++;// current edge positive strand id
139 //debug<<line<<"edge_id "<<edge_id<<endl;
143 if ( processed
== 0 )
145 line_len
= strlen ( line
);
147 if ( line
[line_len
- 1] == '\n' )
149 line
[line_len
- 1] = '\0';
153 if ( edge_len_left
- line_len
== 0 ) //edge completely loaded
156 process_edge ( v_ht
, K_size
, line
, line_len
, 1, edge_id
, bal_ed
);
161 else //edge partly loaded at the first time.
163 if ( line_len
< 2 * K_size
) //line_len < 2*K_size &&edge_len_left - line_len > 0
165 fprintf ( stderr
, "ERROR:it won't happen in 63mer/127mer\n" );
170 process_edge ( v_ht
, K_size
, line
, line_len
, 2, edge_id
, bal_ed
);
172 edge_len_left
-= line_len
;
174 if ( edge_len_left
>= 2 * K_size
)
176 //no need to buf the to kmer seq
178 else if ( edge_len_left
< 2 * K_size
)
180 //to_buff[100];/ copy the last 2K char of line to to_buff already no '\n'
181 strcpy ( to_buff
, line
+ ( line_len
- 2 * K_size
) );
185 fprintf ( stderr
, "ERROR: in cal the edge_len_left!!\n" );
191 else if ( processed
== 2 )
193 //if(line[0]=='\n') continue;
194 line_len
= strlen ( line
);
196 if ( line
[line_len
- 1] == '\n' )
198 line
[line_len
- 1] = '\0';
202 edge_len_left
-= line_len
;
204 if ( edge_len_left
== 0 ) //load the complete edge sequence
206 //process the to kmer
207 if ( line_len
>= 2 * K_size
)
209 process_edge ( v_ht
, K_size
, line
, line_len
, 3, edge_id
, bal_ed
);
213 //need to use the to_buff
214 int buf_len
= strlen ( to_buff
);
215 strcpy ( to_buff
+ buf_len
, line
);
216 buf_len
= strlen ( to_buff
);
217 process_edge ( v_ht
, K_size
, to_buff
, buf_len
, 3, edge_id
, bal_ed
);
225 if ( edge_len_left
>= 2 * K_size
)
227 //no need to buf the to kmer seq
229 else if ( edge_len_left
< 2 * K_size
)
231 //to_buff[100];/ copy the last 2K char of line to to_buff
232 strcpy ( to_buff
, line
+ ( line_len
- 2 * K_size
) );
236 fprintf ( stderr
, "ERROR: in cal the edge_len_left!!\n" );
243 if ( line
[0] == '\n' ) { continue; } //µ±len = 1023ʱ
245 fprintf ( stderr
, "ERROR: in cal the status_processed !! %d \n", processed
);
255 /*************************************************
259 It builds vetexes from one or part of one edge sequence.
263 3. seq: edge sequence
265 5. type: 1: process head and tail; 2: process head ; 3:process tail
267 7. bal_edge: 0:palindrome 1:else
272 *************************************************/
273 static void process_edge ( vertex_hash2
* v_ht
, int K_size
, char * seq
, int len
, int type
, size_t edge_id
, bool bal_edge
)
278 edge_starter2
* e_tmp
;
285 case 1: //process all ..
287 get_kmer_from_seq ( seq
, len
, K_size
, 0, &vertex_kmer
);
289 if ( len
<= K_size
+ gap
) //get the last kmer
291 get_kmer_from_seq ( seq
, len
, K_size
, len
- K_size
, &edge_kmer
);
292 edge_kmer_len
= len
- K_size
;
296 //get_kmer_from_seq(seq, len, K_size, K_size,&edge_kmer);
297 get_kmer_from_seq ( seq
, len
, K_size
, gap
, &edge_kmer
);
302 v_tmp
= put_vertex ( v_ht
, vertex_kmer
, is_found
);
303 put_edge ( v_tmp
, edge_kmer
, is_left
, edge_kmer_len
, edge_id
);
304 reverseCompKmer ( &vertex_kmer
, K_size
);
305 reverseCompKmer ( &edge_kmer
, K_size
);
307 v_tmp
= put_vertex ( v_ht
, vertex_kmer
, is_found
);
308 put_edge ( v_tmp
, edge_kmer
, is_left
, edge_kmer_len
, edge_id
+ bal_edge
);
310 get_kmer_from_seq ( seq
, len
, K_size
, len
- K_size
, &vertex_kmer
);
312 if ( len
<= K_size
+ gap
) //get the first kmer
314 get_kmer_from_seq ( seq
, len
, K_size
, 0, &edge_kmer
);
315 edge_kmer_len
= len
- K_size
;
319 get_kmer_from_seq ( seq
, len
, K_size
, len
- K_size
- gap
, &edge_kmer
);
324 v_tmp
= put_vertex ( v_ht
, vertex_kmer
, is_found
);
325 put_edge ( v_tmp
, edge_kmer
, is_left
, edge_kmer_len
, edge_id
);
326 reverseCompKmer ( &vertex_kmer
, K_size
);
327 reverseCompKmer ( &edge_kmer
, K_size
);
329 v_tmp
= put_vertex ( v_ht
, vertex_kmer
, is_found
);
330 put_edge ( v_tmp
, edge_kmer
, is_left
, edge_kmer_len
, edge_id
+ bal_edge
);
333 //process only the head
334 get_kmer_from_seq ( seq
, len
, K_size
, 0, &vertex_kmer
);
336 if ( len
<= K_size
+ gap
)
338 get_kmer_from_seq ( seq
, len
, K_size
, len
- K_size
, &edge_kmer
);
339 edge_kmer_len
= len
- K_size
;
343 get_kmer_from_seq ( seq
, len
, K_size
, gap
, &edge_kmer
);
348 v_tmp
= put_vertex ( v_ht
, vertex_kmer
, is_found
);
349 put_edge ( v_tmp
, edge_kmer
, is_left
, edge_kmer_len
, edge_id
);
350 reverseCompKmer ( &vertex_kmer
, K_size
);
351 reverseCompKmer ( &edge_kmer
, K_size
);
353 v_tmp
= put_vertex ( v_ht
, vertex_kmer
, is_found
);
354 put_edge ( v_tmp
, edge_kmer
, is_left
, edge_kmer_len
, edge_id
+ bal_edge
);
357 //process only the tail
358 get_kmer_from_seq ( seq
, len
, K_size
, len
- K_size
, &vertex_kmer
);
360 if ( len
<= K_size
+ gap
)
362 get_kmer_from_seq ( seq
, len
, K_size
, 0, &edge_kmer
);
363 edge_kmer_len
= len
- K_size
;
367 get_kmer_from_seq ( seq
, len
, K_size
, len
- K_size
- gap
, &edge_kmer
);
372 v_tmp
= put_vertex ( v_ht
, vertex_kmer
, is_found
);
373 put_edge ( v_tmp
, edge_kmer
, is_left
, edge_kmer_len
, edge_id
);
374 reverseCompKmer ( &vertex_kmer
, K_size
);
375 reverseCompKmer ( &edge_kmer
, K_size
);
377 v_tmp
= put_vertex ( v_ht
, vertex_kmer
, is_found
);
378 put_edge ( v_tmp
, edge_kmer
, is_left
, edge_kmer_len
, edge_id
+ bal_edge
);
381 fprintf ( stderr
, "ERROR: wrong process type in process_edge()\n" );
387 static vertex2
* put_vertex ( vertex_hash2
* v_ht
, kmer_t2 vertex_kmer
, int & is_found
) //63 127 differ fixed
389 uint64_t hv
= MurmurHash64A ( vertex_kmer
.kmer
, sizeof ( kmer_t2
), 0 ); //hash value
390 uint64_t idx
= ( size_t ) ( hv
% v_ht
->ht_sz
);
391 vertex2
* ver
= ( v_ht
->store_pos
) [idx
];
395 ( v_ht
->store_pos
) [idx
] = ( vertex2
* ) malloc ( sizeof ( vertex2
) );
396 ver
= ( v_ht
->store_pos
) [idx
];
397 ver
->kmer_t2
= vertex_kmer
;
407 if ( kmerCompare ( & ( ver
->kmer_t2
), &vertex_kmer
) == 0 )
413 if ( ver
->next
== NULL
) { break; }
419 ver
->next
= ( vertex2
* ) malloc ( sizeof ( vertex2
) );
420 ver
->next
->kmer_t2
= vertex_kmer
;
421 ver
->next
->left
= NULL
;
422 ver
->next
->right
= NULL
;
423 ver
->next
->next
= NULL
;
427 static void put_edge ( vertex2
* ver
, kmer_t2 edge_kmer
, bool is_left
, int len
, size_t edge_id
) //fixed
429 edge_starter2
* tmp
= NULL
;
435 ver
->left
= ( edge_starter2
* ) malloc ( sizeof ( edge_starter2
) );
436 ver
->left
->edge_kmer
= edge_kmer
;
437 ver
->left
->edge_id
= edge_id
;
438 ver
->left
->len
= len
;//record the length of edge (1~k)
439 ver
->left
->next
= NULL
;
449 ver
->right
= ( edge_starter2
* ) malloc ( sizeof ( edge_starter2
) );
450 ver
->right
->edge_kmer
= edge_kmer
;
451 ver
->right
->edge_id
= edge_id
;
452 ver
->right
->len
= len
;//record the length of edge (1~k)
453 ver
->right
->next
= NULL
;
460 while ( tmp
->next
) //because there are no two edges equal attached with one node ...
465 tmp
->next
= ( edge_starter2
* ) malloc ( sizeof ( edge_starter2
) );
466 tmp
->next
->edge_kmer
= edge_kmer
;
467 tmp
->next
->edge_id
= edge_id
;
468 tmp
->next
->len
= len
;//record the length of edge (1~k)
469 tmp
->next
->next
= NULL
;
475 static vertex2
* search_vertex ( vertex_hash2
* v_ht
, kmer_t2
* vertex_kmer
) //fixed ...
477 uint64_t hv
= MurmurHash64A ( vertex_kmer
->kmer
, sizeof ( kmer_t2
), 0 ); //hash value
478 uint64_t idx
= ( size_t ) ( hv
% v_ht
->ht_sz
);
479 vertex2
* ver
= ( v_ht
->store_pos
) [idx
];
483 if ( kmerCompare ( & ( ver
->kmer_t2
), vertex_kmer
) == 0 )
496 void init_preArc_array ( preArc_array
* arc_array
, size_t sz
) //63 127 same
498 arc_array
->array_sz
= sz
;
499 arc_array
->store_pos
= ( preArc
** ) calloc ( sz
, sizeof ( preArc
* ) );
503 static void chop_kmers ( const char * read
, int len
, int K_size
, kmer_t2
* kmer_array
, int kmer_array_len
, int & kmer_num
)
511 kmer_num
= len
- K_size
+ 1;
513 if ( kmer_num
> kmer_array_len
)
515 fprintf ( stderr
, "ERROR: the kmer_array_len is not enough! %d\n", kmer_num
);
521 for ( int i
= 0; i
< kmer_num
; ++i
) //optimize later
523 get_kmer_from_seq ( read
, len
, K_size
, i
, &kmer
);
524 kmer_array
[i
] = kmer
;
528 /*************************************************
532 Stores one preArc into preArc_array
534 1. arc_arr: preArc array
535 2. locks: the locks array for modifying preArc array
536 3. left_id: left edge id
537 4. right_id: right edge id
538 5. added_multi: added weigth
543 *************************************************/
544 static inline void put_preArc_threaded ( preArc_array
* arc_arr
, pthread_spinlock_t
* locks
, size_t left_id
, size_t right_id
, int added_multi
)
546 pthread_spin_lock ( &locks
[left_id
] );
547 put_preArc ( arc_arr
, left_id
, right_id
, added_multi
);
548 pthread_spin_unlock ( &locks
[left_id
] );
552 static inline void put_preArc ( preArc_array
* arc_arr
, size_t left_id
, size_t right_id
, int added_multi
)
554 preArc
* arc
= ( arc_arr
->store_pos
) [left_id
];
558 ( arc_arr
->store_pos
) [left_id
] = ( preArc
* ) malloc ( sizeof ( preArc
) );
559 arc
= ( arc_arr
->store_pos
) [left_id
];
560 arc
->to_ed
= right_id
;
561 arc
->multiplicity
= added_multi
;
568 if ( arc
->to_ed
== right_id
)
570 arc
->multiplicity
+= added_multi
;
574 if ( arc
->next
== NULL
) { break; }
579 arc
->next
= ( preArc
* ) malloc ( sizeof ( preArc
) );
580 arc
->next
->to_ed
= right_id
;
581 arc
->next
->multiplicity
= added_multi
;
582 arc
->next
->next
= NULL
;
586 static inline void put_preArc ( preArc_array
* arc_arr
, size_t left_id
, size_t right_id
)
588 preArc
* arc
= ( arc_arr
->store_pos
) [left_id
];
592 ( arc_arr
->store_pos
) [left_id
] = ( preArc
* ) malloc ( sizeof ( preArc
) );
593 arc
= ( arc_arr
->store_pos
) [left_id
];
594 arc
->to_ed
= right_id
;
595 arc
->multiplicity
= 1;
602 if ( arc
->to_ed
== right_id
)
608 if ( arc
->next
== NULL
) { break; }
613 arc
->next
= ( preArc
* ) malloc ( sizeof ( preArc
) );
614 arc
->next
->to_ed
= right_id
;
615 arc
->next
->multiplicity
= 1;
616 arc
->next
->next
= NULL
;
620 void output_preArcs ( preArc_array
* arc_arr
, char * outfile
)
623 fp
= fopen ( outfile
, "w" );
627 fprintf ( stderr
, "ERROR: can't create file %s in output_preArc\n", outfile
);
633 for ( size_t i
= 0; i
< arc_arr
->array_sz
; ++i
)
635 parc
= ( arc_arr
->store_pos
) [i
];
639 fprintf ( fp
, "%u", i
);
645 fprintf ( fp
, " %u %u", parc
->to_ed
, parc
->multiplicity
);
648 if ( parc
&& j
% 4 == 0 )
650 fprintf ( fp
, "\n" );
651 fprintf ( fp
, "%u", i
);
655 fprintf ( fp
, "\n" );
663 static void free_vertex ( vertex2
* tmp
)
665 edge_starter2
* edge_s
, *edge_s2
;
671 edge_s
= edge_s
->next
;
680 edge_s
= edge_s
->next
;
687 void free_vertex_hash ( vertex_hash2
* v_ht
)
689 vertex2
* tmp
, *tmp2
;
691 for ( size_t i
= 0; i
< v_ht
->ht_sz
; ++i
)
693 tmp
= ( v_ht
->store_pos
) [i
];
699 free_vertex ( tmp2
);
703 free ( v_ht
->store_pos
);
706 /*************************************************
710 This is the core function for building preArcs.
711 1. Chops one read into kmers.
712 2. Searches the kmers in vertex hash.
713 3. Aligns the vertex's kmer-edge sequences to the read sequence on both sides.
714 4. Constructs preArcs according the mapping result on both sides of a vertex.
717 5. add -R support, solves tiny repeat.
719 1. arc_arr: preArc array
720 2. locks: locks array
723 5. cut_off_len: cut off length
729 *************************************************/
730 void process_1read_preArc ( preArc_array
* arc_arr
, pthread_spinlock_t
* locks
, int thread_id
, vertex_hash2
* v_ht
, int K_size
, int cut_off_len
, const char * read
)
732 const int BUFF_LEN
= 1024;
733 kmer_t2 kmers
[BUFF_LEN
];
734 int kmer_array_len
= cut_off_len
- K_size
+ 1;
737 edge_starter2
* e_tmp
;
740 int left_found
= 0, right_found
= 0;
744 //int shortest_maplen = 0;
745 //add for -R solving tiny repeats
746 unsigned int path
[128];
747 unsigned int counter
= 0;
749 int read_len
= strlen ( read
);
751 while(read[i]!='\0'){
755 //read_len = strlen(read);
756 if(read[read_len-1]=='\n'){
757 read[read_len-1]='\0';
761 if ( read_len
> cut_off_len
) { read_len
= cut_off_len
; }
763 kmer_array_len
= read_len
- K_size
+ 1;
764 chop_kmers ( read
, read_len
, K_size
, kmers
, kmer_array_len
, kmer_num
);
766 for ( int i
= 1; i
< kmer_num
- 1; ++i
) //search every kmer exclude the begin and end kmer
768 v_tmp
= search_vertex ( v_ht
, &kmers
[i
] );
772 //search left edge kmer got left id
777 edge_len
= e_tmp
->len
;
781 if ( kmerCompare ( & ( kmers
[i
- edge_len
] ), & ( e_tmp
->edge_kmer
) ) == 0 )
783 left_id
= e_tmp
->edge_id
;
787 fprintf ( stderr
, "ERROR: left edge id found already !new found id %llu \n", left_id
);
788 fprintf ( stderr
, "i:%d ,edge_len:%d\n", i
, edge_len
);
789 printKmerSeq ( & ( kmers
[i
- edge_len
] ), K_size
, stderr
);
790 printKmerSeq ( & ( e_tmp
->edge_kmer
), K_size
, stderr
);
801 kmer_t2 read_edge
= kmers
[0];
805 kmerMoveRight ( &read_edge
, K_size
- i
);
809 initKmerFilter ( i
, &KMER_FILTER
);
810 kmer_t2 edge_kmer
= e_tmp
->edge_kmer
;
812 if ( K_size
> edge_len
)
814 kmerMoveRight ( &edge_kmer
, K_size
- edge_len
);
817 kmerAnd ( &read_edge
, &KMER_FILTER
);
818 kmerAnd ( &edge_kmer
, &KMER_FILTER
);
820 if ( kmerCompare ( &read_edge
, &edge_kmer
) == 0 )
823 left_id
= e_tmp
->edge_id
;
825 if ( left_found
== 2 )
827 //debug_build<<"can't distinct which left edge\n";
836 //update maplen_control
838 if(edge_len >= shortest_maplen){
839 if(map_len < shortest_maplen) left_found = 0;
841 if(map_len != edge_len) left_found = 0;
844 if ( left_found
!= 1 ) {left_found
= 0; right_found
= 0; continue;} //not found or multi found
846 //todo : aln if left_found = 0 ... find the best
847 //search right edge kmer got right id
848 e_tmp
= v_tmp
->right
;
852 edge_len
= e_tmp
->len
;
854 if ( edge_len
<= kmer_num
- 1 - i
)
856 if ( kmerCompare ( & ( kmers
[i
+ edge_len
] ), & ( e_tmp
->edge_kmer
) ) == 0 )
858 right_id
= e_tmp
->edge_id
;
862 fprintf ( stderr
, "ERROR: right edge id found already, new found id %llu !\n", right_id
);
863 fprintf ( stderr
, "i:%d ,edge_len:%d\n", i
, edge_len
);
864 printKmerSeq ( & ( kmers
[i
+ edge_len
] ), K_size
, stderr
);
865 printKmerSeq ( & ( e_tmp
->edge_kmer
), K_size
, stderr
);
876 int read_edge_len
= ( kmer_num
- 1 - i
);
878 initKmerFilter ( read_edge_len
, &KMER_FILTER
);
879 kmer_t2 read_edge
= kmers
[kmer_num
- 1];
880 kmerAnd ( &read_edge
, &KMER_FILTER
);
881 kmer_t2 edge_kmer
= e_tmp
->edge_kmer
;
883 if ( edge_len
> read_edge_len
)
885 kmerMoveRight ( &edge_kmer
, ( edge_len
- read_edge_len
) );
888 kmerAnd ( &edge_kmer
, &KMER_FILTER
);
890 if ( kmerCompare ( &read_edge
, &edge_kmer
) == 0 )
893 right_id
= e_tmp
->edge_id
;
895 if ( right_found
== 2 )
897 //debug_build<<"can't distinct which right edge\n";
906 //update map_len control
908 if(edge_len >= shortest_maplen){
909 if(map_len < shortest_maplen) right_found = 0;
911 if(map_len != edge_len) right_found = 0;
914 if ( right_found
!= 1 ) {left_found
= 0; right_found
= 0; continue;}
916 //todo : aln if right_found = 0 ... find the best
917 //if(left_found == 1 && right_found ==1)
919 //preArc_array *arc_arr
920 put_preArc_threaded ( arc_arr
, locks
, left_id
, right_id
, 1 );
922 //constructing the path ...
931 else if ( counter
<= 100 )
933 if ( path
[counter
] == left_id
)
935 path
[++counter
] = right_id
;
939 path
[++counter
] = left_id
;
940 path
[++counter
] = right_id
;
951 //add to path buffer , if full filled ,output it
954 if ( counter
>= 3 && counter
<= 100 )
957 int tmp
= is_full ( path_buffer
[thread_id
] );
962 output_edge_path_buffer_locked ( path_buffer
[thread_id
], path_fp
, &file_lock
);
964 else if ( tmp
== -1 )
967 fprintf ( stderr
, "ERROR: path buffer overflow!! system exit .\n" );
971 put_path_2_buffer ( path_buffer
[thread_id
], path
);
977 void free_preArc_array ( preArc_array
* arc_array
)
981 for ( size_t i
= 0; i
< arc_array
->array_sz
; ++i
)
983 tmp
= ( arc_array
->store_pos
) [i
];
993 free ( arc_array
->store_pos
);
998 /*************************************************
1000 build_preArc_threaded
1002 This is the main entry for building preArcs.
1004 1. arc_arr: preArc array
1005 2. v_ht: vertex hash
1006 3. K_size: kmer size
1007 4. cut_off_len: cut off length
1008 5. in_filenames_vt: input reads file names
1009 6. thread_num: thread number
1014 *************************************************/
1015 void build_preArc_threaded ( preArc_array
* arc_arr
, vertex_hash2
* v_ht
, int K_size
, int cut_off_len
, vector
<string
> *in_filenames_vt
, int thread_num
)
1017 //create main io thread
1018 int read_buf_sz
= 102400 * thrd_num_s
;
1019 read_buf0
= new string
[read_buf_sz
];
1020 read_buf1
= new string
[read_buf_sz
];
1021 io_stat0
= 1; //must be one, if io_stat0 =0 ,the io thread will work immediately
1024 io_para_main io_para_mains
;
1025 io_para_mains
.read_buf_sz
= read_buf_sz
;
1026 io_para_mains
.in_filenames_vt
= in_filenames_vt
;
1027 pthread_t io_thread
;
1030 //fprintf(stderr,"Creating main io thread ...\n");
1031 if ( ( temp
= pthread_create ( &io_thread
, NULL
, run_io_thread_main
, &io_para_mains
) ) != 0 )
1033 fprintf ( stderr
, "ERROR: failed creating main io thread.\n" );
1037 fprintf ( stderr
, "1 io thread initialized.\n" );
1038 //create work threads ..
1039 //fprintf(stderr,"Creating work threads ...\n");
1040 pthread_t threads
[thrd_num_s
];
1041 unsigned char thrdSignal
[thrd_num_s
+ 1];
1042 PARAMETER paras
[thrd_num_s
];
1043 locks
= ( pthread_spinlock_t
* ) calloc ( arc_arr
->array_sz
, sizeof ( pthread_spinlock_t
) );
1045 //init as unlock stat ..
1046 for ( size_t i
= 0; i
< arc_arr
->array_sz
; ++i
)
1051 for ( int k
= 0; k
< thrd_num_s
; k
++ )
1053 thrdSignal
[k
+ 1] = 0;
1054 paras
[k
].threadID
= k
;
1055 paras
[k
].mainSignal
= &thrdSignal
[0];
1056 paras
[k
].selfSignal
= &thrdSignal
[k
+ 1];
1058 paras
[k
].preArcs
= arc_arr
;
1059 paras
[k
].v_ht
= v_ht
;
1060 paras
[k
].cut_off_len
= cut_off_len
;
1061 paras
[k
].K_size
= K_size
;
1065 creatThrds ( threads
, paras
);
1073 while ( io_ready
== 0 ) {usleep ( 1 );}
1077 sendWorkSignal ( 12, thrdSignal
);
1080 if ( io_ready
== 2 )
1082 //fprintf(stderr,"All reads have been processed!\n");
1087 sendWorkSignal ( 3, thrdSignal
);
1088 thread_wait ( threads
);
1089 delete [] read_buf0
;
1090 delete [] read_buf1
;
1091 free ( ( void * ) locks
);
1092 free_vertex_hash ( v_ht
);
1096 /*************************************************
1098 create_edge_path_buffer
1100 Creates an edge_path_buffer struct dynamicly.
1106 an edge_path_buffer pointer to heap
1107 *************************************************/
1108 edge_path_buffer
* create_edge_path_buffer
1109 ( unsigned int * mark_on_edge
,
1110 pthread_spinlock_t
* locks
,
1111 unsigned long long buff_size
,
1112 unsigned int max_path_length
)
1114 if ( ! ( mark_on_edge
&& locks
) )
1116 fprintf ( stderr
, "ERROR: The initial mark_on_edge array or locks are not valid! Exit System ...\n" );
1120 edge_path_buffer
* new_buffer
= ( edge_path_buffer
* ) calloc ( 1, sizeof ( edge_path_buffer
) );
1121 new_buffer
->mark_on_edge
= mark_on_edge
;
1122 new_buffer
->locks
= locks
;
1123 new_buffer
->buff_size
= buff_size
;
1124 new_buffer
->max_path_length
= max_path_length
;
1125 new_buffer
->filled_num
= 0;
1126 new_buffer
->path_buffer
= NULL
;
1127 unsigned int ** tmp
;
1128 tmp
= ( unsigned int ** ) calloc ( buff_size
, sizeof ( unsigned int * ) );
1130 for ( size_t i
= 0; i
< buff_size
; i
++ )
1132 tmp
[i
] = ( unsigned int * ) calloc ( max_path_length
, sizeof ( unsigned int ) );
1135 new_buffer
->path_buffer
= tmp
;
1139 /*************************************************
1141 create_edge_path_buffer
1143 free the path_buffer in an edge_path_buffer struct
1150 *************************************************/
1151 void destory_edge_path_buffer ( struct edge_path_buffer
* buffer
)
1153 unsigned int ** tmp
= buffer
->path_buffer
;
1155 for ( size_t i
= 0; i
< buffer
->buff_size
; i
++ )
1157 free ( ( void * ) ( tmp
[i
] ) );
1160 free ( ( void * ) tmp
);
1161 buffer
->filled_num
= 0;
1164 void clear_edge_path_buffer ( struct edge_path_buffer
* buffer
)
1166 unsigned int ** tmp
= buffer
->path_buffer
;
1168 for ( size_t i
= 0; i
< buffer
->buff_size
; i
++ )
1170 memset ( tmp
[i
], 0, buffer
->max_path_length
* sizeof ( unsigned int ) );
1173 buffer
->filled_num
= 0;
1177 void output_edge_path_buffer ( struct edge_path_buffer
* buffer
, FILE * path_file
)
1181 static size_t times
= 0, total
= 0;
1182 total
+= buffer
->filled_num
;
1183 fprintf ( stderr
, "call output_edge_path_buffer %lu %lu\n", times
++, total
);
1188 fprintf ( stderr
, "ERROR: The path_file is not avilable!\n" );
1192 unsigned int counter
;
1193 unsigned int ** tmp
= buffer
->path_buffer
;
1195 for ( size_t i
= 0; i
< buffer
->filled_num
; i
++ )
1197 counter
= tmp
[i
][0];
1198 fwrite ( &counter
, sizeof ( char ), 1, path_file
);
1199 fwrite ( tmp
[i
] + 1, sizeof ( unsigned int ), ( int ) counter
, path_file
);
1202 buffer
->filled_num
= 0;
1206 /*************************************************
1208 output_edge_path_buffer_locked
1210 Output the buffer to a file for multi-threading.
1217 *************************************************/
1218 void output_edge_path_buffer_locked ( struct edge_path_buffer
* buffer
, FILE * path_file
, pthread_mutex_t
* file_mutex
)
1220 static size_t times
= 0, total
= 0;;
1224 fprintf ( stderr
, "ERROR: The path_file is not avilable!\n" );
1228 unsigned int counter
;
1229 unsigned int ** tmp
= buffer
->path_buffer
;
1230 pthread_mutex_lock ( file_mutex
);
1234 total
+= buffer
->filled_num
;
1235 fprintf ( stderr
, "call output_edge_path_buffer_locked %lu %lu\n", times
++, total
);
1238 for ( size_t i
= 0; i
< buffer
->filled_num
; i
++ )
1240 counter
= tmp
[i
][0];
1241 fwrite ( &counter
, sizeof ( char ), 1, path_file
);
1242 fwrite ( tmp
[i
] + 1, sizeof ( unsigned int ), ( int ) counter
, path_file
);
1245 pthread_mutex_unlock ( file_mutex
);
1246 buffer
->filled_num
= 0;
1251 /*************************************************
1255 Stores the path to buffer and update mark_on_edge array at the same time.
1262 *************************************************/
1263 int put_path_2_buffer ( struct edge_path_buffer
* buffer
, unsigned int * path
)
1267 static size_t times
= 0;
1268 static pthread_spinlock_t lock
= 1;
1269 pthread_spin_lock ( &lock
);
1270 fprintf ( stderr
, "call put_path_2_buffer %lu\n", times
++ );
1271 pthread_spin_unlock ( &lock
);
1274 unsigned long long pos
= buffer
->filled_num
;
1276 if ( pos
>= buffer
->buff_size
)
1281 memcpy ( ( buffer
->path_buffer
) [pos
], path
, buffer
->max_path_length
* sizeof ( unsigned int ) );
1283 for ( unsigned int i
= 1; i
< path
[0]; i
++ )
1285 pthread_spin_lock ( ( buffer
->locks
) + path
[i
] );
1286 ( ( buffer
->mark_on_edge
) [path
[i
]] ) ++;
1287 pthread_spin_unlock ( ( buffer
->locks
) + path
[i
] );
1290 buffer
->filled_num
++;
1296 int is_full ( struct edge_path_buffer
* buffer
)
1298 if ( buffer
->filled_num
== buffer
->buff_size
)
1302 else if ( buffer
->filled_num
< buffer
->buff_size
)
1313 void clear_status ( struct edge_path_buffer
* buffer
)
1315 buffer
->filled_num
= 0;