modified: nfig1.py
[GalaxyCodeBases.git] / BGI / SOAPdenovo2 / sparsePregraph / pregraph_sparse.cpp
blobcd422f358241c3e39611031718675b4e91a0a1a7
1 /*
2 * pregraph_sparse.cpp
4 * Copyright (c) 2008-2012 BGI-Shenzhen <soap at genomics dot org dot cn>.
6 * This file is part of SOAPdenovo.
8 * SOAPdenovo is free software: you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation, either version 3 of the License, or
11 * (at your option) any later version.
13 * SOAPdenovo is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
18 * You should have received a copy of the GNU General Public License
19 * along with SOAPdenovo. If not, see <http://www.gnu.org/licenses/>.
23 #include "global.h"
24 #include "stdinc.h"
25 #include "core.h"
27 #include <unistd.h>
29 #include "multi_threads.h"
30 #include "build_graph.h"
31 #include "build_edge.h"
32 #include "build_preArc.h"
33 #include "io_func.h"
34 #include "seq_util.h"
35 #include "convert_soapdenovo.h"
38 static int LOAD_GRAPH = 0, BUILD_DBG = 1, BUILD_EDGES = 1, BUILD_PREARCS = 1;
39 //static int run_mode=0;
41 extern "C" int call_pregraph_sparse ( int argc, char ** argv );
42 static void initenv ( int argc, char ** argv );
43 static void parse_args ( vector<string> &in_filenames_vt );
44 static void display_pregraph_usage();
47 int main ( int argc, char ** argv )
49 fprintf ( stderr, "\nVersion 1.0.3: released on July 13th, 2012\nCompile %s\t%s\n\n", __DATE__, __TIME__ );
50 call_pregraph_sparse ( argc, argv );
51 }*/
55 /*************************************************
56 Function:
57 call_pregraph_sparse
58 Description:
59 The entry function for calling pregraph_sparse, its processes include
60 1. Parses the args form command.
61 2. Builds the sparse-kmer graph.
62 3. Removes low coverage sparse-kmers and kmer-edges (the connections between edges).
63 4. Removes tips.
64 5. Builds edges by compacting linear kmer nodes.
65 6. Builds preArc (the connections between edges) by maping reads to edges.
66 Input:
67 @see display_pregraph_usage()
68 Output:
69 files:
70 *.ht_idx
71 *.ht_content
72 *.kmerFreq
73 *.sparse.edge
74 *.edge.gz
75 *.preArc
76 Return:
77 None.
78 *************************************************/
79 extern "C" int call_pregraph_sparse ( int argc, char ** argv )
81 time_t all_beg_time, all_end_time;
82 time ( &all_beg_time );
83 initenv ( argc, argv );
84 vector<string> in_filenames_vt;
85 parse_args ( in_filenames_vt );
86 struct hashtable2 ht2;
87 size_t hashTableSZ = 1000000;
88 time_t beg_time, read_time;
89 size_t bucket_count = 0, edge_cnt = 0;
91 if ( ( !LOAD_GRAPH ) && BUILD_DBG )
93 int round = 1;
95 for ( round = 1; round <= 2; round++ )
97 //fprintf(stderr,"Building the sparse de Brujin graph, round: %d\n",round);
98 fprintf ( stderr, "Start to build the sparse de Brujin graph, round: %d\n", round );
100 if ( round == 1 )
102 time ( &beg_time );
103 edge_cnt = 0;
104 uint64_t TotalSamplings = 0;
106 //initialize the hashtable size
107 if ( GenomeSize == 0 )
109 fprintf ( stderr, "Error! Genome size not given.\n" );
110 return -1;
113 hashTableSZ = ( size_t ) GenomeSize / max ( gap - 1, 5 );
114 int read_buf_sz = 102400 * thrd_num_s;
115 Init_HT2 ( &ht2, hashTableSZ );
116 //create main io thread
117 read_buf0 = new string[read_buf_sz];
118 read_buf1 = new string[read_buf_sz];
119 io_stat0 = 1; //must be one, if io_stat0 equals 0, the io threads will work immediately
120 io_stat1 = 1;
121 io_ready = 0;
122 io_para_main io_para_mains;
123 io_para_mains.read_buf_sz = read_buf_sz;
124 io_para_mains.in_filenames_vt = &in_filenames_vt;
125 pthread_t io_thread;
126 int temp;
128 // fprintf(stderr,"Creating main io thread ...\n");
129 if ( ( temp = pthread_create ( &io_thread, NULL, run_io_thread_main, &io_para_mains ) ) != 0 )
131 fprintf ( stderr, "ERROR: failed creating main io thread.\n" );
132 exit ( -1 );
135 fprintf ( stderr, "1 io thread initialized.\n" );
136 //create work threads for round 1
137 pthread_t threads[thrd_num_s];
138 unsigned char thrdSignal[thrd_num_s + 1];
139 PARAMETER paras[thrd_num_s];
140 locks = ( pthread_spinlock_t * ) calloc ( ht2.ht_sz, sizeof ( pthread_spinlock_t ) );
142 //initialize the locks unlock
143 for ( size_t i = 0; i < ht2.ht_sz; ++i )
145 locks[i] = 1;
148 //create threads
149 //fprintf(stderr,"Creating work threads ...\n");
150 bucket_count_total = ( size_t * ) calloc ( thrd_num_s, sizeof ( size_t ) );
151 edge_cnt_total = ( size_t * ) calloc ( thrd_num_s, sizeof ( size_t ) );
153 for ( int k = 0; k < thrd_num_s; k++ )
155 thrdSignal[k + 1] = 0;
156 paras[k].threadID = k;
157 paras[k].mainSignal = &thrdSignal[0];
158 paras[k].selfSignal = &thrdSignal[k + 1];
159 paras[k].ht = &ht2;
160 paras[k].K_size = K_size;
161 paras[k].gap = gap;
164 creatThrds ( threads, paras );
165 thrdSignal[0] = 0;
166 //begin to work
167 size_t processed_reads = 0;
169 while ( 1 )
171 sendIOWorkSignal();
173 while ( io_ready == 0 ) {usleep ( 1 );}
175 if ( io_ready )
177 processed_reads += read_num;
178 sendWorkSignal ( 10, thrdSignal );
180 for ( int k1 = 0; k1 < thrd_num_s; ++k1 )
182 bucket_count += bucket_count_total[k1];
183 bucket_count_total[k1] = 0;
187 if ( io_ready == 2 )
189 //fprintf(stderr,"All reads have been processed !\n");
190 break;
194 sendWorkSignal ( 3, thrdSignal );
195 thread_wait ( threads );
196 SwitchBuckets ( &ht2, K_size );
198 for ( size_t i = 0; i < ht2.ht_sz; ++i ) //this procedure can be removed
200 struct bucket2 * bktptr = ht2.store_pos[i];
202 while ( bktptr != NULL )
204 bktptr->kmer_info.cov1 = 0;
205 bktptr = bktptr->nxt_bucket;
209 free ( ( void * ) bucket_count_total );
210 free ( ( void * ) locks );
211 free ( ( void * ) edge_cnt_total );
212 delete [] read_buf0;
213 delete [] read_buf1;
214 time ( &read_time );
215 //fprintf(stderr,"Round 1 consumes time: %.fs.\n",difftime(read_time,beg_time));
216 //fprintf(stderr,"Number of processed reads: %llu \n",processed_reads);
217 fprintf ( stderr, "Time spent on building graph round 1: %.fs, %llu reads processed, %llu nodes allocated\n",
218 difftime ( read_time, beg_time ), processed_reads, bucket_count );
219 fprintf ( stderr, "\n" );
222 if ( round == 2 )
224 time ( &beg_time );
225 edge_cnt = 0;
226 //create main io thread
227 int read_buf_sz = 102400 * thrd_num_s;
228 read_buf0 = new string[read_buf_sz];
229 read_buf1 = new string[read_buf_sz];
230 io_stat0 = 1; //must be one, if io_stat0 =0 ,the io thread will work immediately
231 io_stat1 = 1;
232 io_ready = 0;
233 io_para_main io_para_mains;
234 io_para_mains.read_buf_sz = read_buf_sz;
235 io_para_mains.in_filenames_vt = &in_filenames_vt;
236 pthread_t io_thread;
237 int temp;
239 //fprintf(stderr,"Creating main io thread ...\n");
240 if ( ( temp = pthread_create ( &io_thread, NULL, run_io_thread_main, &io_para_mains ) ) != 0 )
242 fprintf ( stderr, "ERROR: failed creating main io thread.\n" );
243 exit ( -1 );
246 fprintf ( stderr, "1 io thread initialized.\n" );
247 //create work threads for round 2
248 pthread_t threads[thrd_num_s];
249 unsigned char thrdSignal[thrd_num_s + 1];
250 PARAMETER paras[thrd_num_s];
251 locks = ( pthread_spinlock_t * ) calloc ( ht2.ht_sz, sizeof ( pthread_spinlock_t ) );
253 //initialize locks unlock
254 for ( size_t i = 0; i < ht2.ht_sz; ++i )
256 locks[i] = 1;
259 //create threads
260 //fprintf(stderr,"Creating work threads ...\n");
261 bucket_count_total = ( size_t * ) calloc ( thrd_num_s, sizeof ( size_t ) );
262 edge_cnt_total = ( size_t * ) calloc ( thrd_num_s, sizeof ( size_t ) );
264 for ( int k = 0; k < thrd_num_s; k++ )
266 thrdSignal[k + 1] = 0;
267 paras[k].threadID = k;
268 paras[k].mainSignal = &thrdSignal[0];
269 paras[k].selfSignal = &thrdSignal[k + 1];
270 paras[k].ht = &ht2;
271 paras[k].K_size = K_size;
272 paras[k].gap = gap;
275 creatThrds ( threads, paras );
276 thrdSignal[0] = 0;
277 //begin to work
278 int foundcount = 0;
279 int flipcount = 0;
280 size_t processed_reads = 0;
282 while ( 1 )
284 sendIOWorkSignal();
286 while ( io_ready == 0 ) {usleep ( 1 );}
288 if ( io_ready )
290 //read_c = read_num;
291 processed_reads += read_num;
292 sendWorkSignal ( 11, thrdSignal );
294 for ( int k1 = 0; k1 < thrd_num_s; ++k1 )
296 edge_cnt += edge_cnt_total[k1];
297 edge_cnt_total[k1] = 0;
301 if ( io_ready == 2 )
303 //fprintf(stderr,"All reads have been processed !\n");
304 break;
308 sendWorkSignal ( 3, thrdSignal );
309 thread_wait ( threads );
310 free ( ( void * ) bucket_count_total );
311 free ( ( void * ) locks );
312 free ( ( void * ) edge_cnt_total );
313 delete [] read_buf0;
314 delete [] read_buf1;
315 SavingSparseKmerGraph2 ( &ht2, graphfile );
316 time ( &read_time );
317 //fprintf(stderr,"Round 2 consumed time: %.fs.\n",difftime(read_time,beg_time));
318 //fprintf(stderr,"Number of processed reads: %llu \n",processed_reads);
319 fprintf ( stderr, "Time spent on building graph round 2: %.fs, %llu reads processed\n", difftime ( read_time, beg_time ), processed_reads );
320 fprintf ( stderr, "%llu nodes allocated, %llu kmer-edges allocated.\n", bucket_count, edge_cnt );
321 fprintf ( stderr, "\n" );
326 if ( LOAD_GRAPH )
328 fprintf ( stderr, "Loading the graph ...\n" );
329 LoadingSparseKmerGraph2 ( &ht2, graphfile );
332 if ( BUILD_EDGES )
334 RemovingWeakNodesAndEdges2 ( &ht2, K_size, NodeCovTh, EdgeCovTh, &bucket_count, &edge_cnt );
335 int cut_len_tip = 2 * K_size;
336 int tip_c = 0;
337 time_t start, finish, interval;
338 fprintf ( stderr, "Start to remove tips with minority links.\n" );
339 start = time ( NULL );
340 removeMinorTips ( &ht2, K_size, cut_len_tip, tip_c );
341 finish = time ( NULL );
342 interval = ( finish - start ) ;
343 //fprintf(stderr,"Removing minor tips consumes %llu s.\n\n",interval);
344 fprintf ( stderr, "Time spent on removing tips: %llus.\n\n", interval );
345 fprintf ( stderr, "Start to construct edges.\n" );
346 start = time ( NULL );
347 char outfile[256];
348 sprintf ( outfile, "%s.sparse.edge", graphfile );
349 kmer2edges ( &ht2, K_size, outfile );
350 free_hashtable ( &ht2 );
351 char temp[256];
352 sprintf ( temp, "%s.sparse.edge", graphfile );
353 convert ( temp, K_size, graphfile );
354 finish = time ( NULL );
355 interval = ( finish - start ) ;
356 //fprintf(stderr,"Building edges consumes %llu s.\n\n",interval);
357 fprintf ( stderr, "Time spent on constructing edges: %llus.\n\n", interval );
360 if ( BUILD_PREARCS ) //build preArc
362 size_t v_sz, e_sz;
363 int K_size;
364 char basicInfo[128];
365 sprintf ( basicInfo, "%s.preGraphBasic", graphfile );
366 FILE * fin;
367 char line[1024];
368 char str[32];
369 fin = fopen ( basicInfo, "r" );
371 if ( !fin )
373 fprintf ( stderr, "ERROR: can't open file %s\n", basicInfo );
374 exit ( 1 );
377 bool a = 0, b = 0;
379 while ( fgets ( line, 1024, fin ) != NULL )
381 if ( line[0] == 'V' ) //VERTEX
383 sscanf ( line + 6, "%lu %s %d", &v_sz, str, &K_size );
384 a = 1;
387 if ( line[0] == 'E' ) //EDGEs
389 sscanf ( line, "%s %lu", str, &e_sz );
390 b = 1;
391 break;
395 if ( !a || !b )
397 fprintf ( stderr, "ERROR: preGraphBasic file is in invaild format!\n" );
398 exit ( 1 );
401 vertex_hash2 v_ht;
402 preArc_array arc_arr;
403 char edge_file[128];
404 sprintf ( edge_file, "%s.sparse.edge", graphfile );
405 time_t start, finish, interval;
406 //step1:
407 //fprintf(stderr,"Building vertexes ...\n");
408 fprintf ( stderr, "Start to build vertex indexes.\n" );
409 start = time ( NULL );
410 init_vertex_hash ( &v_ht, v_sz );
411 build_vertexes ( &v_ht, K_size, edge_file );
412 finish = time ( NULL );
413 interval = ( finish - start ) ;
414 //fprintf(stderr,"Building vertexes consumes %llu s.\n\n",interval);
415 fprintf ( stderr, "Time spent on building vertex indexes: %llus.\n\n", interval );
416 //step2:
417 //fprintf(stderr,"Building preArcs ...\n");
418 fprintf ( stderr, "Start to build preArcs.\n" );
419 start = time ( NULL );
420 int cut_off_len = 256;//tmp
421 init_preArc_array ( &arc_arr, e_sz + 1 );
423 if ( solve )
425 /* initialize the threads' common data mark_on_edge s_locks*/
426 mark_on_edge = ( unsigned int * ) calloc ( e_sz + 1, sizeof ( unsigned int ) );
427 s_locks = ( pthread_spinlock_t * ) calloc ( e_sz + 1, sizeof ( pthread_spinlock_t ) );
429 for ( size_t i = 0; i < e_sz + 1; i++ )
431 s_locks[i] = 1;
434 /*buffers for seperate threads*/
435 path_buffer = ( edge_path_buffer ** ) calloc ( thrd_num_s, sizeof ( edge_path_buffer ) );
437 for ( int i = 0; i < thrd_num_s; i++ )
439 path_buffer[i] = create_edge_path_buffer ( mark_on_edge, s_locks, buff_size, max_path_length );
442 /*initialize the output file */
443 char mark_file[128], path_file[128];
444 sprintf ( mark_file, "%s.markOnEdge", graphfile );
445 sprintf ( path_file, "%s.path", graphfile );
446 mark_fp = fopen ( mark_file, "w" );
447 path_fp = fopen ( path_file, "w" );
448 pthread_mutex_init ( &file_lock, NULL );
451 build_preArc_threaded ( &arc_arr, &v_ht, K_size, cut_off_len, &in_filenames_vt, thrd_num_s );
453 if ( solve )
455 //output mark_on_edge
456 size_t markCounter = 0;
458 for ( size_t i = 1; i <= e_sz; i++ )
460 markCounter += mark_on_edge[i];
461 fprintf ( mark_fp, "%d\n", mark_on_edge[i] );
464 fprintf ( stderr, "Total number of marks in file markOnEdge: %lu\n", markCounter );
465 fclose ( mark_fp );
467 //output path_buffer
468 for ( int i = 0; i < thrd_num_s; i++ )
470 output_edge_path_buffer ( path_buffer[i], path_fp );
473 fclose ( path_fp );
475 //destory buffers ...
476 for ( int i = 0; i < thrd_num_s; i++ )
478 destory_edge_path_buffer ( path_buffer[i] );
481 free ( ( void * ) mark_on_edge );
482 free ( ( void * ) s_locks );
483 pthread_mutex_destroy ( &file_lock );
486 finish = time ( NULL );
487 interval = ( finish - start );
488 //fprintf(stderr,"Building preArcs consumes %llu s.\n\n",interval);
489 fprintf ( stderr, "Time spent on building preArcs: %llus.\n\n", interval );
490 char prearc_file[128];
491 sprintf ( prearc_file, "%s.preArc", graphfile );
492 output_preArcs ( &arc_arr, prearc_file );
495 time ( &all_end_time );
496 fprintf ( stderr, "Overall time spent on constructing lightgraph: %.fm.\n", difftime ( all_end_time, all_beg_time ) / 60 );
497 return 0;
500 static void initenv ( int argc, char ** argv )
502 int copt;
503 int inpseq, outseq, genome_sz;
504 extern char * optarg;
505 char temp[100];
506 optind = 1;
507 inpseq = outseq = genome_sz = 0;
509 while ( ( copt = getopt ( argc, argv, "s:o:K:g:z:d:e:p:m:r:R" ) ) != EOF )
511 switch ( copt )
513 case 's':
514 inpseq = 1;
515 sscanf ( optarg, "%s", shortrdsfile );
516 continue;
517 case 'o':
518 outseq = 1;
519 sscanf ( optarg, "%s", graphfile );
520 continue;
521 case 'K':
522 sscanf ( optarg, "%s", temp );
523 K_size = atoi ( temp );
524 continue;
525 case 'g':
526 sscanf ( optarg, "%s", temp );
527 gap = atoi ( temp );
528 continue;
529 case 'z':
530 genome_sz = 1;
531 sscanf ( optarg, "%Lu", &GenomeSize );
532 continue;
533 case 'd':
534 sscanf ( optarg, "%s", temp );
535 NodeCovTh = atoi ( temp ) >= 0 ? atoi ( temp ) : 0;
536 continue;
537 case 'e':
538 sscanf ( optarg, "%s", temp );
539 EdgeCovTh = atoi ( temp ) >= 0 ? atoi ( temp ) : 0;
540 continue;
541 case 'R':
542 solve = 1;
543 continue;
544 case 'p':
545 sscanf ( optarg, "%s", temp );
546 thrd_num_s = atoi ( temp );
547 continue;
548 case 'm':
549 continue;
550 case 'r':
551 sscanf ( optarg, "%s", temp );
552 run_mode = atoi ( temp );
553 continue;
554 default:
556 if ( inpseq == 0 || outseq == 0 )
558 display_pregraph_usage();
559 exit ( -1 );
564 if ( inpseq == 0 || outseq == 0 || genome_sz == 0 )
566 display_pregraph_usage();
567 exit ( -1 );
571 static void parse_args ( vector<string> &in_filenames_vt )
573 if ( K_size % 2 == 0 )
575 K_size--;
578 #ifdef _63MER_
580 if ( K_size > 63 )
582 fprintf ( stderr, "ERROR: Parameter K is set too large, max value is 63.\n" );
583 exit ( -1 );
586 #endif
587 #ifdef _127MER_
589 if ( K_size > 127 )
591 fprintf ( stderr, "ERROR: Parameter K is set too large, max value is 127.\n" );
592 exit ( -1 );
595 #endif
597 if ( gap > 25 )
599 fprintf ( stderr, "ERROR: Parameter g is set too large, max value is 25.\n" );
600 exit ( -1 );
603 //print the args
604 fprintf ( stderr, "********************\n" );
605 fprintf ( stderr, "SparsePregraph\n" );
606 fprintf ( stderr, "********************\n" );
607 fprintf ( stderr, "\n" );
608 fprintf ( stderr, "Parameters: " );
609 fprintf ( stderr, "pregraph_sparse -s %s", shortrdsfile );
610 fprintf ( stderr, " -K %d", K_size );
611 fprintf ( stderr, " -g %d", gap );
612 fprintf ( stderr, " -z %lu", GenomeSize );
613 fprintf ( stderr, " -d %d", NodeCovTh );
614 fprintf ( stderr, " -e %d", EdgeCovTh );
616 if ( solve )
618 fprintf ( stderr, " -R " );
621 fprintf ( stderr, " -r %d", run_mode );
622 fprintf ( stderr, " -p %d", thrd_num_s );
623 fprintf ( stderr, " -o %s\n\n", graphfile );
625 if ( run_mode == 0 ) //build all
627 LOAD_GRAPH = 0;
628 BUILD_DBG = 1;
629 BUILD_EDGES = 1;
630 BUILD_PREARCS = 1;
632 else if ( run_mode == 1 ) //build edges ,build preArcs
634 LOAD_GRAPH = 1;
635 BUILD_EDGES = 1;
636 BUILD_PREARCS = 1;
638 else if ( run_mode == 2 ) //build graph only
640 LOAD_GRAPH = 0;
641 BUILD_DBG = 1;
642 BUILD_EDGES = 0;
643 BUILD_PREARCS = 0;
645 else if ( run_mode == 3 ) //build edges only
647 LOAD_GRAPH = 1;
648 BUILD_EDGES = 1;
649 BUILD_PREARCS = 0;
651 else if ( run_mode == 4 ) //build preArc only
653 LOAD_GRAPH = 0;
654 BUILD_DBG = 0;
655 BUILD_EDGES = 0;
656 BUILD_PREARCS = 1;
658 else
660 fprintf ( stderr, "ERROR: invalid runMode!\n" );
661 exit ( -1 );
664 read_lib ( in_filenames_vt, shortrdsfile );
666 fprintf(stderr,"The reads for building sparse de Brujin graph:\n");
667 for (int i=0;i<in_filenames_vt.size();++i){
668 fprintf(stderr,"%s\n",in_filenames_vt[i].c_str());
670 fprintf(stderr,"\n");
674 static void display_pregraph_usage()
676 fprintf ( stderr, "Usage: sparse_pregraph -s configFile -K kmer -z genomeSize -o outputGraph [-g maxKmerEdgeLength -d kmerFreqCutoff -e kmerEdgeFreqCutoff -R -r runMode -p n_cpu]\n" );
677 fprintf ( stderr, " -s <string> configFile: the config file of solexa reads\n" );
678 #ifdef _63MER_
679 fprintf ( stderr, " -K <int> kmer(min 13, max 63): kmer size, [23]\n" );
680 #endif
681 #ifdef _127MER_
682 fprintf ( stderr, " -K <int> kmer(min 13, max 127): kmer size, [23]\n" );
683 #endif
684 fprintf ( stderr, " -g <int> maxKmerEdgeLength(min 1, max 25): number of skipped intermediate kmers, [15]\n" );
685 fprintf ( stderr, " -z <int> genomeSize(required): estimated genome size\n" );
686 fprintf ( stderr, " -d <int> kmerFreqCutoff: delete kmers with frequency no larger than,[1]\n" );
687 fprintf ( stderr, " -e <int> kmerEdgeFreqCutoff: delete kmers' related edge with frequency no larger than [1]\n" );
688 fprintf ( stderr, " -R (optional) output extra information for resolving repeats in contig step, [NO]\n" );
689 fprintf ( stderr, " -r <int> runMode: 0 build graph & build edge and preArc, 1 load graph by prefix & build edge and preArc, 2 build graph only, 3 build edges only, 4 build preArcs only [0] \n" );
690 fprintf ( stderr, " -p <int> n_cpu: number of cpu for use,[8]\n" );
691 fprintf ( stderr, " -o <int> outputGraph: prefix of output graph file name\n" );