4 * Copyright (c) 2008-2012 BGI-Shenzhen <soap at genomics dot org dot cn>.
6 * This file is part of SOAPdenovo.
8 * SOAPdenovo is free software: you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation, either version 3 of the License, or
11 * (at your option) any later version.
13 * SOAPdenovo is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
18 * You should have received a copy of the GNU General Public License
19 * along with SOAPdenovo. If not, see <http://www.gnu.org/licenses/>.
29 #include "multi_threads.h"
30 #include "build_graph.h"
31 #include "build_edge.h"
32 #include "build_preArc.h"
35 #include "convert_soapdenovo.h"
38 static int LOAD_GRAPH
= 0, BUILD_DBG
= 1, BUILD_EDGES
= 1, BUILD_PREARCS
= 1;
39 //static int run_mode=0;
41 extern "C" int call_pregraph_sparse ( int argc
, char ** argv
);
42 static void initenv ( int argc
, char ** argv
);
43 static void parse_args ( vector
<string
> &in_filenames_vt
);
44 static void display_pregraph_usage();
47 int main ( int argc, char ** argv )
49 fprintf ( stderr, "\nVersion 1.0.3: released on July 13th, 2012\nCompile %s\t%s\n\n", __DATE__, __TIME__ );
50 call_pregraph_sparse ( argc, argv );
55 /*************************************************
59 The entry function for calling pregraph_sparse, its processes include
60 1. Parses the args form command.
61 2. Builds the sparse-kmer graph.
62 3. Removes low coverage sparse-kmers and kmer-edges (the connections between edges).
64 5. Builds edges by compacting linear kmer nodes.
65 6. Builds preArc (the connections between edges) by maping reads to edges.
67 @see display_pregraph_usage()
78 *************************************************/
79 extern "C" int call_pregraph_sparse ( int argc
, char ** argv
)
81 time_t all_beg_time
, all_end_time
;
82 time ( &all_beg_time
);
83 initenv ( argc
, argv
);
84 vector
<string
> in_filenames_vt
;
85 parse_args ( in_filenames_vt
);
86 struct hashtable2 ht2
;
87 size_t hashTableSZ
= 1000000;
88 time_t beg_time
, read_time
;
89 size_t bucket_count
= 0, edge_cnt
= 0;
91 if ( ( !LOAD_GRAPH
) && BUILD_DBG
)
95 for ( round
= 1; round
<= 2; round
++ )
97 //fprintf(stderr,"Building the sparse de Brujin graph, round: %d\n",round);
98 fprintf ( stderr
, "Start to build the sparse de Brujin graph, round: %d\n", round
);
104 uint64_t TotalSamplings
= 0;
106 //initialize the hashtable size
107 if ( GenomeSize
== 0 )
109 fprintf ( stderr
, "Error! Genome size not given.\n" );
113 hashTableSZ
= ( size_t ) GenomeSize
/ max ( gap
- 1, 5 );
114 int read_buf_sz
= 102400 * thrd_num_s
;
115 Init_HT2 ( &ht2
, hashTableSZ
);
116 //create main io thread
117 read_buf0
= new string
[read_buf_sz
];
118 read_buf1
= new string
[read_buf_sz
];
119 io_stat0
= 1; //must be one, if io_stat0 equals 0, the io threads will work immediately
122 io_para_main io_para_mains
;
123 io_para_mains
.read_buf_sz
= read_buf_sz
;
124 io_para_mains
.in_filenames_vt
= &in_filenames_vt
;
128 // fprintf(stderr,"Creating main io thread ...\n");
129 if ( ( temp
= pthread_create ( &io_thread
, NULL
, run_io_thread_main
, &io_para_mains
) ) != 0 )
131 fprintf ( stderr
, "ERROR: failed creating main io thread.\n" );
135 fprintf ( stderr
, "1 io thread initialized.\n" );
136 //create work threads for round 1
137 pthread_t threads
[thrd_num_s
];
138 unsigned char thrdSignal
[thrd_num_s
+ 1];
139 PARAMETER paras
[thrd_num_s
];
140 locks
= ( pthread_spinlock_t
* ) calloc ( ht2
.ht_sz
, sizeof ( pthread_spinlock_t
) );
142 //initialize the locks unlock
143 for ( size_t i
= 0; i
< ht2
.ht_sz
; ++i
)
149 //fprintf(stderr,"Creating work threads ...\n");
150 bucket_count_total
= ( size_t * ) calloc ( thrd_num_s
, sizeof ( size_t ) );
151 edge_cnt_total
= ( size_t * ) calloc ( thrd_num_s
, sizeof ( size_t ) );
153 for ( int k
= 0; k
< thrd_num_s
; k
++ )
155 thrdSignal
[k
+ 1] = 0;
156 paras
[k
].threadID
= k
;
157 paras
[k
].mainSignal
= &thrdSignal
[0];
158 paras
[k
].selfSignal
= &thrdSignal
[k
+ 1];
160 paras
[k
].K_size
= K_size
;
164 creatThrds ( threads
, paras
);
167 size_t processed_reads
= 0;
173 while ( io_ready
== 0 ) {usleep ( 1 );}
177 processed_reads
+= read_num
;
178 sendWorkSignal ( 10, thrdSignal
);
180 for ( int k1
= 0; k1
< thrd_num_s
; ++k1
)
182 bucket_count
+= bucket_count_total
[k1
];
183 bucket_count_total
[k1
] = 0;
189 //fprintf(stderr,"All reads have been processed !\n");
194 sendWorkSignal ( 3, thrdSignal
);
195 thread_wait ( threads
);
196 SwitchBuckets ( &ht2
, K_size
);
198 for ( size_t i
= 0; i
< ht2
.ht_sz
; ++i
) //this procedure can be removed
200 struct bucket2
* bktptr
= ht2
.store_pos
[i
];
202 while ( bktptr
!= NULL
)
204 bktptr
->kmer_info
.cov1
= 0;
205 bktptr
= bktptr
->nxt_bucket
;
209 free ( ( void * ) bucket_count_total
);
210 free ( ( void * ) locks
);
211 free ( ( void * ) edge_cnt_total
);
215 //fprintf(stderr,"Round 1 consumes time: %.fs.\n",difftime(read_time,beg_time));
216 //fprintf(stderr,"Number of processed reads: %llu \n",processed_reads);
217 fprintf ( stderr
, "Time spent on building graph round 1: %.fs, %llu reads processed, %llu nodes allocated\n",
218 difftime ( read_time
, beg_time
), processed_reads
, bucket_count
);
219 fprintf ( stderr
, "\n" );
226 //create main io thread
227 int read_buf_sz
= 102400 * thrd_num_s
;
228 read_buf0
= new string
[read_buf_sz
];
229 read_buf1
= new string
[read_buf_sz
];
230 io_stat0
= 1; //must be one, if io_stat0 =0 ,the io thread will work immediately
233 io_para_main io_para_mains
;
234 io_para_mains
.read_buf_sz
= read_buf_sz
;
235 io_para_mains
.in_filenames_vt
= &in_filenames_vt
;
239 //fprintf(stderr,"Creating main io thread ...\n");
240 if ( ( temp
= pthread_create ( &io_thread
, NULL
, run_io_thread_main
, &io_para_mains
) ) != 0 )
242 fprintf ( stderr
, "ERROR: failed creating main io thread.\n" );
246 fprintf ( stderr
, "1 io thread initialized.\n" );
247 //create work threads for round 2
248 pthread_t threads
[thrd_num_s
];
249 unsigned char thrdSignal
[thrd_num_s
+ 1];
250 PARAMETER paras
[thrd_num_s
];
251 locks
= ( pthread_spinlock_t
* ) calloc ( ht2
.ht_sz
, sizeof ( pthread_spinlock_t
) );
253 //initialize locks unlock
254 for ( size_t i
= 0; i
< ht2
.ht_sz
; ++i
)
260 //fprintf(stderr,"Creating work threads ...\n");
261 bucket_count_total
= ( size_t * ) calloc ( thrd_num_s
, sizeof ( size_t ) );
262 edge_cnt_total
= ( size_t * ) calloc ( thrd_num_s
, sizeof ( size_t ) );
264 for ( int k
= 0; k
< thrd_num_s
; k
++ )
266 thrdSignal
[k
+ 1] = 0;
267 paras
[k
].threadID
= k
;
268 paras
[k
].mainSignal
= &thrdSignal
[0];
269 paras
[k
].selfSignal
= &thrdSignal
[k
+ 1];
271 paras
[k
].K_size
= K_size
;
275 creatThrds ( threads
, paras
);
280 size_t processed_reads
= 0;
286 while ( io_ready
== 0 ) {usleep ( 1 );}
291 processed_reads
+= read_num
;
292 sendWorkSignal ( 11, thrdSignal
);
294 for ( int k1
= 0; k1
< thrd_num_s
; ++k1
)
296 edge_cnt
+= edge_cnt_total
[k1
];
297 edge_cnt_total
[k1
] = 0;
303 //fprintf(stderr,"All reads have been processed !\n");
308 sendWorkSignal ( 3, thrdSignal
);
309 thread_wait ( threads
);
310 free ( ( void * ) bucket_count_total
);
311 free ( ( void * ) locks
);
312 free ( ( void * ) edge_cnt_total
);
315 SavingSparseKmerGraph2 ( &ht2
, graphfile
);
317 //fprintf(stderr,"Round 2 consumed time: %.fs.\n",difftime(read_time,beg_time));
318 //fprintf(stderr,"Number of processed reads: %llu \n",processed_reads);
319 fprintf ( stderr
, "Time spent on building graph round 2: %.fs, %llu reads processed\n", difftime ( read_time
, beg_time
), processed_reads
);
320 fprintf ( stderr
, "%llu nodes allocated, %llu kmer-edges allocated.\n", bucket_count
, edge_cnt
);
321 fprintf ( stderr
, "\n" );
328 fprintf ( stderr
, "Loading the graph ...\n" );
329 LoadingSparseKmerGraph2 ( &ht2
, graphfile
);
334 RemovingWeakNodesAndEdges2 ( &ht2
, K_size
, NodeCovTh
, EdgeCovTh
, &bucket_count
, &edge_cnt
);
335 int cut_len_tip
= 2 * K_size
;
337 time_t start
, finish
, interval
;
338 fprintf ( stderr
, "Start to remove tips with minority links.\n" );
339 start
= time ( NULL
);
340 removeMinorTips ( &ht2
, K_size
, cut_len_tip
, tip_c
);
341 finish
= time ( NULL
);
342 interval
= ( finish
- start
) ;
343 //fprintf(stderr,"Removing minor tips consumes %llu s.\n\n",interval);
344 fprintf ( stderr
, "Time spent on removing tips: %llus.\n\n", interval
);
345 fprintf ( stderr
, "Start to construct edges.\n" );
346 start
= time ( NULL
);
348 sprintf ( outfile
, "%s.sparse.edge", graphfile
);
349 kmer2edges ( &ht2
, K_size
, outfile
);
350 free_hashtable ( &ht2
);
352 sprintf ( temp
, "%s.sparse.edge", graphfile
);
353 convert ( temp
, K_size
, graphfile
);
354 finish
= time ( NULL
);
355 interval
= ( finish
- start
) ;
356 //fprintf(stderr,"Building edges consumes %llu s.\n\n",interval);
357 fprintf ( stderr
, "Time spent on constructing edges: %llus.\n\n", interval
);
360 if ( BUILD_PREARCS
) //build preArc
365 sprintf ( basicInfo
, "%s.preGraphBasic", graphfile
);
369 fin
= fopen ( basicInfo
, "r" );
373 fprintf ( stderr
, "ERROR: can't open file %s\n", basicInfo
);
379 while ( fgets ( line
, 1024, fin
) != NULL
)
381 if ( line
[0] == 'V' ) //VERTEX
383 sscanf ( line
+ 6, "%lu %s %d", &v_sz
, str
, &K_size
);
387 if ( line
[0] == 'E' ) //EDGEs
389 sscanf ( line
, "%s %lu", str
, &e_sz
);
397 fprintf ( stderr
, "ERROR: preGraphBasic file is in invaild format!\n" );
402 preArc_array arc_arr
;
404 sprintf ( edge_file
, "%s.sparse.edge", graphfile
);
405 time_t start
, finish
, interval
;
407 //fprintf(stderr,"Building vertexes ...\n");
408 fprintf ( stderr
, "Start to build vertex indexes.\n" );
409 start
= time ( NULL
);
410 init_vertex_hash ( &v_ht
, v_sz
);
411 build_vertexes ( &v_ht
, K_size
, edge_file
);
412 finish
= time ( NULL
);
413 interval
= ( finish
- start
) ;
414 //fprintf(stderr,"Building vertexes consumes %llu s.\n\n",interval);
415 fprintf ( stderr
, "Time spent on building vertex indexes: %llus.\n\n", interval
);
417 //fprintf(stderr,"Building preArcs ...\n");
418 fprintf ( stderr
, "Start to build preArcs.\n" );
419 start
= time ( NULL
);
420 int cut_off_len
= 256;//tmp
421 init_preArc_array ( &arc_arr
, e_sz
+ 1 );
425 /* initialize the threads' common data mark_on_edge s_locks*/
426 mark_on_edge
= ( unsigned int * ) calloc ( e_sz
+ 1, sizeof ( unsigned int ) );
427 s_locks
= ( pthread_spinlock_t
* ) calloc ( e_sz
+ 1, sizeof ( pthread_spinlock_t
) );
429 for ( size_t i
= 0; i
< e_sz
+ 1; i
++ )
434 /*buffers for seperate threads*/
435 path_buffer
= ( edge_path_buffer
** ) calloc ( thrd_num_s
, sizeof ( edge_path_buffer
) );
437 for ( int i
= 0; i
< thrd_num_s
; i
++ )
439 path_buffer
[i
] = create_edge_path_buffer ( mark_on_edge
, s_locks
, buff_size
, max_path_length
);
442 /*initialize the output file */
443 char mark_file
[128], path_file
[128];
444 sprintf ( mark_file
, "%s.markOnEdge", graphfile
);
445 sprintf ( path_file
, "%s.path", graphfile
);
446 mark_fp
= fopen ( mark_file
, "w" );
447 path_fp
= fopen ( path_file
, "w" );
448 pthread_mutex_init ( &file_lock
, NULL
);
451 build_preArc_threaded ( &arc_arr
, &v_ht
, K_size
, cut_off_len
, &in_filenames_vt
, thrd_num_s
);
455 //output mark_on_edge
456 size_t markCounter
= 0;
458 for ( size_t i
= 1; i
<= e_sz
; i
++ )
460 markCounter
+= mark_on_edge
[i
];
461 fprintf ( mark_fp
, "%d\n", mark_on_edge
[i
] );
464 fprintf ( stderr
, "Total number of marks in file markOnEdge: %lu\n", markCounter
);
468 for ( int i
= 0; i
< thrd_num_s
; i
++ )
470 output_edge_path_buffer ( path_buffer
[i
], path_fp
);
475 //destory buffers ...
476 for ( int i
= 0; i
< thrd_num_s
; i
++ )
478 destory_edge_path_buffer ( path_buffer
[i
] );
481 free ( ( void * ) mark_on_edge
);
482 free ( ( void * ) s_locks
);
483 pthread_mutex_destroy ( &file_lock
);
486 finish
= time ( NULL
);
487 interval
= ( finish
- start
);
488 //fprintf(stderr,"Building preArcs consumes %llu s.\n\n",interval);
489 fprintf ( stderr
, "Time spent on building preArcs: %llus.\n\n", interval
);
490 char prearc_file
[128];
491 sprintf ( prearc_file
, "%s.preArc", graphfile
);
492 output_preArcs ( &arc_arr
, prearc_file
);
495 time ( &all_end_time
);
496 fprintf ( stderr
, "Overall time spent on constructing lightgraph: %.fm.\n", difftime ( all_end_time
, all_beg_time
) / 60 );
500 static void initenv ( int argc
, char ** argv
)
503 int inpseq
, outseq
, genome_sz
;
504 extern char * optarg
;
507 inpseq
= outseq
= genome_sz
= 0;
509 while ( ( copt
= getopt ( argc
, argv
, "s:o:K:g:z:d:e:p:m:r:R" ) ) != EOF
)
515 sscanf ( optarg
, "%s", shortrdsfile
);
519 sscanf ( optarg
, "%s", graphfile
);
522 sscanf ( optarg
, "%s", temp
);
523 K_size
= atoi ( temp
);
526 sscanf ( optarg
, "%s", temp
);
531 sscanf ( optarg
, "%Lu", &GenomeSize
);
534 sscanf ( optarg
, "%s", temp
);
535 NodeCovTh
= atoi ( temp
) >= 0 ? atoi ( temp
) : 0;
538 sscanf ( optarg
, "%s", temp
);
539 EdgeCovTh
= atoi ( temp
) >= 0 ? atoi ( temp
) : 0;
545 sscanf ( optarg
, "%s", temp
);
546 thrd_num_s
= atoi ( temp
);
551 sscanf ( optarg
, "%s", temp
);
552 run_mode
= atoi ( temp
);
556 if ( inpseq
== 0 || outseq
== 0 )
558 display_pregraph_usage();
564 if ( inpseq
== 0 || outseq
== 0 || genome_sz
== 0 )
566 display_pregraph_usage();
571 static void parse_args ( vector
<string
> &in_filenames_vt
)
573 if ( K_size
% 2 == 0 )
582 fprintf ( stderr
, "ERROR: Parameter K is set too large, max value is 63.\n" );
591 fprintf ( stderr
, "ERROR: Parameter K is set too large, max value is 127.\n" );
599 fprintf ( stderr
, "ERROR: Parameter g is set too large, max value is 25.\n" );
604 fprintf ( stderr
, "********************\n" );
605 fprintf ( stderr
, "SparsePregraph\n" );
606 fprintf ( stderr
, "********************\n" );
607 fprintf ( stderr
, "\n" );
608 fprintf ( stderr
, "Parameters: " );
609 fprintf ( stderr
, "pregraph_sparse -s %s", shortrdsfile
);
610 fprintf ( stderr
, " -K %d", K_size
);
611 fprintf ( stderr
, " -g %d", gap
);
612 fprintf ( stderr
, " -z %lu", GenomeSize
);
613 fprintf ( stderr
, " -d %d", NodeCovTh
);
614 fprintf ( stderr
, " -e %d", EdgeCovTh
);
618 fprintf ( stderr
, " -R " );
621 fprintf ( stderr
, " -r %d", run_mode
);
622 fprintf ( stderr
, " -p %d", thrd_num_s
);
623 fprintf ( stderr
, " -o %s\n\n", graphfile
);
625 if ( run_mode
== 0 ) //build all
632 else if ( run_mode
== 1 ) //build edges ,build preArcs
638 else if ( run_mode
== 2 ) //build graph only
645 else if ( run_mode
== 3 ) //build edges only
651 else if ( run_mode
== 4 ) //build preArc only
660 fprintf ( stderr
, "ERROR: invalid runMode!\n" );
664 read_lib ( in_filenames_vt
, shortrdsfile
);
666 fprintf(stderr,"The reads for building sparse de Brujin graph:\n");
667 for (int i=0;i<in_filenames_vt.size();++i){
668 fprintf(stderr,"%s\n",in_filenames_vt[i].c_str());
670 fprintf(stderr,"\n");
674 static void display_pregraph_usage()
676 fprintf ( stderr
, "Usage: sparse_pregraph -s configFile -K kmer -z genomeSize -o outputGraph [-g maxKmerEdgeLength -d kmerFreqCutoff -e kmerEdgeFreqCutoff -R -r runMode -p n_cpu]\n" );
677 fprintf ( stderr
, " -s <string> configFile: the config file of solexa reads\n" );
679 fprintf ( stderr
, " -K <int> kmer(min 13, max 63): kmer size, [23]\n" );
682 fprintf ( stderr
, " -K <int> kmer(min 13, max 127): kmer size, [23]\n" );
684 fprintf ( stderr
, " -g <int> maxKmerEdgeLength(min 1, max 25): number of skipped intermediate kmers, [15]\n" );
685 fprintf ( stderr
, " -z <int> genomeSize(required): estimated genome size\n" );
686 fprintf ( stderr
, " -d <int> kmerFreqCutoff: delete kmers with frequency no larger than,[1]\n" );
687 fprintf ( stderr
, " -e <int> kmerEdgeFreqCutoff: delete kmers' related edge with frequency no larger than [1]\n" );
688 fprintf ( stderr
, " -R (optional) output extra information for resolving repeats in contig step, [NO]\n" );
689 fprintf ( stderr
, " -r <int> runMode: 0 build graph & build edge and preArc, 1 load graph by prefix & build edge and preArc, 2 build graph only, 3 build edges only, 4 build preArcs only [0] \n" );
690 fprintf ( stderr
, " -p <int> n_cpu: number of cpu for use,[8]\n" );
691 fprintf ( stderr
, " -o <int> outputGraph: prefix of output graph file name\n" );