1 /* synced_bcf_reader.c -- stream through multiple VCF files.
3 Copyright (C) 2012-2014 Genome Research Ltd.
5 Author: Petr Danecek <pd3@sanger.ac.uk>
7 Permission is hereby granted, free of charge, to any person obtaining a copy
8 of this software and associated documentation files (the "Software"), to deal
9 in the Software without restriction, including without limitation the rights
10 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11 copies of the Software, and to permit persons to whom the Software is
12 furnished to do so, subject to the following conditions:
14 The above copyright notice and this permission notice shall be included in
15 all copies or substantial portions of the Software.
17 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
22 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
23 DEALINGS IN THE SOFTWARE. */
34 #include "htslib/synced_bcf_reader.h"
35 #include "htslib/kseq.h"
36 #include "htslib/khash_str2int.h"
37 #include "htslib/bgzf.h"
38 #include "htslib/thread_pool.h"
39 #include "bcf_sr_sort.h"
41 #define MAX_CSI_COOR 0x7fffffff // maximum indexable coordinate of .csi
49 typedef struct _region_t
52 int nregs
, mregs
, creg
;
56 #define BCF_SR_AUX(x) ((aux_t*)((x)->aux))
63 static void _regions_add(bcf_sr_regions_t
*reg
, const char *chr
, int start
, int end
);
64 static bcf_sr_regions_t
*_regions_init_string(const char *str
);
65 static int _regions_match_alleles(bcf_sr_regions_t
*reg
, int als_idx
, bcf1_t
*rec
);
67 char *bcf_sr_strerror(int errnum
)
72 return strerror(errno
); break;
74 return "not compressed with bgzip"; break;
76 return "could not load index"; break;
78 return "unknown file type"; break;
80 return "API usage error"; break;
82 return "could not parse header"; break;
84 return "no BGZF EOF marker; file may be truncated"; break;
86 return "Out of memory"; break;
88 return "VCF parse error"; break;
90 return "BCF read error"; break;
95 int bcf_sr_set_opt(bcf_srs_t
*readers
, bcf_sr_opt_t opt
, ...)
100 case BCF_SR_REQUIRE_IDX
:
101 readers
->require_index
= 1;
104 case BCF_SR_PAIR_LOGIC
:
106 BCF_SR_AUX(readers
)->sort
.pair
= va_arg(args
, int);
115 static int *init_filters(bcf_hdr_t
*hdr
, const char *filters
, int *nfilters
)
117 kstring_t str
= {0,0,0};
118 const char *tmp
= filters
, *prev
= filters
;
119 int nout
= 0, *out
= NULL
;
122 if ( *tmp
==',' || !*tmp
)
124 out
= (int*) realloc(out
, (nout
+1)*sizeof(int));
125 if ( tmp
-prev
==1 && *prev
=='.' )
133 kputsn(prev
, tmp
-prev
, &str
);
134 out
[nout
] = bcf_hdr_id2int(hdr
, BCF_DT_ID
, str
.s
);
135 if ( out
[nout
]>=0 ) nout
++;
142 if ( str
.m
) free(str
.s
);
147 int bcf_sr_set_regions(bcf_srs_t
*readers
, const char *regions
, int is_file
)
149 assert( !readers
->regions
);
150 if ( readers
->nreaders
)
152 hts_log_error("Must call bcf_sr_set_regions() before bcf_sr_add_reader()");
155 readers
->regions
= bcf_sr_regions_init(regions
,is_file
,0,1,-2);
156 if ( !readers
->regions
) return -1;
157 readers
->explicit_regs
= 1;
158 readers
->require_index
= 1;
161 int bcf_sr_set_targets(bcf_srs_t
*readers
, const char *targets
, int is_file
, int alleles
)
163 assert( !readers
->targets
);
164 if ( targets
[0]=='^' )
166 readers
->targets_exclude
= 1;
169 readers
->targets
= bcf_sr_regions_init(targets
,is_file
,0,1,-2);
170 if ( !readers
->targets
) return -1;
171 readers
->targets_als
= alleles
;
175 int bcf_sr_set_threads(bcf_srs_t
*files
, int n_threads
)
177 if (!(files
->n_threads
= n_threads
))
180 files
->p
= calloc(1, sizeof(*files
->p
));
182 files
->errnum
= no_memory
;
185 if (!(files
->p
->pool
= hts_tpool_init(n_threads
)))
191 void bcf_sr_destroy_threads(bcf_srs_t
*files
) {
196 hts_tpool_destroy(files
->p
->pool
);
200 int bcf_sr_add_reader(bcf_srs_t
*files
, const char *fname
)
202 htsFile
* file_ptr
= hts_open(fname
, "r");
204 files
->errnum
= open_failed
;
208 files
->has_line
= (int*) realloc(files
->has_line
, sizeof(int)*(files
->nreaders
+1));
209 files
->has_line
[files
->nreaders
] = 0;
210 files
->readers
= (bcf_sr_t
*) realloc(files
->readers
, sizeof(bcf_sr_t
)*(files
->nreaders
+1));
211 bcf_sr_t
*reader
= &files
->readers
[files
->nreaders
++];
212 memset(reader
,0,sizeof(bcf_sr_t
));
214 reader
->file
= file_ptr
;
218 if ( reader
->file
->format
.compression
==bgzf
)
220 BGZF
*bgzf
= hts_get_bgzfp(reader
->file
);
221 if ( bgzf
&& bgzf_check_EOF(bgzf
) == 0 ) {
222 files
->errnum
= no_eof
;
223 hts_log_warning("No BGZF EOF marker; file '%s' may be truncated", fname
);
226 bgzf_thread_pool(bgzf
, files
->p
->pool
, files
->p
->qsize
);
229 if ( files
->require_index
)
231 if ( reader
->file
->format
.format
==vcf
)
233 if ( reader
->file
->format
.compression
!=bgzf
)
235 files
->errnum
= not_bgzf
;
239 reader
->tbx_idx
= tbx_index_load(fname
);
240 if ( !reader
->tbx_idx
)
242 files
->errnum
= idx_load_failed
;
246 reader
->header
= bcf_hdr_read(reader
->file
);
248 else if ( reader
->file
->format
.format
==bcf
)
250 if ( reader
->file
->format
.compression
!=bgzf
)
252 files
->errnum
= not_bgzf
;
256 reader
->header
= bcf_hdr_read(reader
->file
);
258 reader
->bcf_idx
= bcf_index_load(fname
);
259 if ( !reader
->bcf_idx
)
261 files
->errnum
= idx_load_failed
;
267 files
->errnum
= file_type_error
;
273 if ( reader
->file
->format
.format
==bcf
|| reader
->file
->format
.format
==vcf
)
275 reader
->header
= bcf_hdr_read(reader
->file
);
279 files
->errnum
= file_type_error
;
282 files
->streaming
= 1;
284 if ( files
->streaming
&& files
->nreaders
>1 )
286 files
->errnum
= api_usage_error
;
287 hts_log_error("Must set require_index when the number of readers is greater than one");
290 if ( files
->streaming
&& files
->regions
)
292 files
->errnum
= api_usage_error
;
293 hts_log_error("Cannot tabix-jump in streaming mode");
296 if ( !reader
->header
)
298 files
->errnum
= header_error
;
302 reader
->fname
= strdup(fname
);
303 if ( files
->apply_filters
)
304 reader
->filter_ids
= init_filters(reader
->header
, files
->apply_filters
, &reader
->nfilter_ids
);
306 // Update list of chromosomes
307 if ( !files
->explicit_regs
&& !files
->streaming
)
310 const char **names
= reader
->tbx_idx
? tbx_seqnames(reader
->tbx_idx
, &n
) : bcf_hdr_seqnames(reader
->header
, &n
);
313 if ( !files
->regions
)
314 files
->regions
= _regions_init_string(names
[i
]);
316 _regions_add(files
->regions
, names
[i
], -1, -1);
324 bcf_srs_t
*bcf_sr_init(void)
326 bcf_srs_t
*files
= (bcf_srs_t
*) calloc(1,sizeof(bcf_srs_t
));
327 files
->aux
= (aux_t
*) calloc(1,sizeof(aux_t
));
328 bcf_sr_sort_init(&BCF_SR_AUX(files
)->sort
);
332 static void bcf_sr_destroy1(bcf_sr_t
*reader
)
335 if ( reader
->tbx_idx
) tbx_destroy(reader
->tbx_idx
);
336 if ( reader
->bcf_idx
) hts_idx_destroy(reader
->bcf_idx
);
337 bcf_hdr_destroy(reader
->header
);
338 hts_close(reader
->file
);
339 if ( reader
->itr
) tbx_itr_destroy(reader
->itr
);
341 for (j
=0; j
<reader
->mbuffer
; j
++)
342 bcf_destroy1(reader
->buffer
[j
]);
343 free(reader
->buffer
);
344 free(reader
->samples
);
345 free(reader
->filter_ids
);
348 void bcf_sr_destroy(bcf_srs_t
*files
)
351 for (i
=0; i
<files
->nreaders
; i
++)
352 bcf_sr_destroy1(&files
->readers
[i
]);
353 free(files
->has_line
);
354 free(files
->readers
);
355 for (i
=0; i
<files
->n_smpl
; i
++) free(files
->samples
[i
]);
356 free(files
->samples
);
357 if (files
->targets
) bcf_sr_regions_destroy(files
->targets
);
358 if (files
->regions
) bcf_sr_regions_destroy(files
->regions
);
359 if (files
->tmps
.m
) free(files
->tmps
.s
);
360 if (files
->n_threads
) bcf_sr_destroy_threads(files
);
361 bcf_sr_sort_destroy(&BCF_SR_AUX(files
)->sort
);
366 void bcf_sr_remove_reader(bcf_srs_t
*files
, int i
)
368 assert( !files
->samples
); // not ready for this yet
369 bcf_sr_sort_remove_reader(files
, &BCF_SR_AUX(files
)->sort
, i
);
370 bcf_sr_destroy1(&files
->readers
[i
]);
371 if ( i
+1 < files
->nreaders
)
373 memmove(&files
->readers
[i
], &files
->readers
[i
+1], (files
->nreaders
-i
-1)*sizeof(bcf_sr_t
));
374 memmove(&files
->has_line
[i
], &files
->has_line
[i
+1], (files
->nreaders
-i
-1)*sizeof(int));
380 void debug_buffer(FILE *fp
, bcf_sr_t
*reader
)
383 for (j
=0; j
<=reader
->nbuffer
; j
++)
385 bcf1_t
*line
= reader
->buffer
[j
];
386 fprintf(fp
,"\t%p\t%s%s\t%s:%d\t%s ", (void*)line
,reader
->fname
,j
==0?"*":" ",reader
->header
->id
[BCF_DT_CTG
][line
->rid
].key
,line
->pos
+1,line
->n_allele
?line
->d
.allele
[0]:"");
388 for (k
=1; k
<line
->n_allele
; k
++) fprintf(fp
," %s", line
->d
.allele
[k
]);
393 void debug_buffers(FILE *fp
, bcf_srs_t
*files
)
396 for (i
=0; i
<files
->nreaders
; i
++)
398 fprintf(fp
, "has_line: %d\t%s\n", bcf_sr_has_line(files
,i
),files
->readers
[i
].fname
);
399 debug_buffer(fp
, &files
->readers
[i
]);
404 static inline int has_filter(bcf_sr_t
*reader
, bcf1_t
*line
)
407 if ( !line
->d
.n_flt
)
409 for (j
=0; j
<reader
->nfilter_ids
; j
++)
410 if ( reader
->filter_ids
[j
]<0 ) return 1;
413 for (i
=0; i
<line
->d
.n_flt
; i
++)
415 for (j
=0; j
<reader
->nfilter_ids
; j
++)
416 if ( line
->d
.flt
[i
]==reader
->filter_ids
[j
] ) return 1;
421 static int _reader_seek(bcf_sr_t
*reader
, const char *seq
, int start
, int end
)
423 if ( end
>=MAX_CSI_COOR
)
425 hts_log_error("The coordinate is out of csi index limit: %d", end
+1);
430 hts_itr_destroy(reader
->itr
);
434 if ( reader
->tbx_idx
)
436 int tid
= tbx_name2id(reader
->tbx_idx
, seq
);
437 if ( tid
==-1 ) return -1; // the sequence not present in this file
438 reader
->itr
= tbx_itr_queryi(reader
->tbx_idx
,tid
,start
,end
+1);
442 int tid
= bcf_hdr_name2id(reader
->header
, seq
);
443 if ( tid
==-1 ) return -1; // the sequence not present in this file
444 reader
->itr
= bcf_itr_queryi(reader
->bcf_idx
,tid
,start
,end
+1);
447 hts_log_error("Could not seek: %s:%d-%d", seq
, start
+ 1, end
+ 1);
454 * _readers_next_region() - jumps to next region if necessary
455 * Returns 0 on success or -1 when there are no more regions left
457 static int _readers_next_region(bcf_srs_t
*files
)
459 // Need to open new chromosome? Check number of lines in all readers' buffers
461 for (i
=0; i
<files
->nreaders
; i
++)
462 if ( !files
->readers
[i
].itr
&& !files
->readers
[i
].nbuffer
) eos
++;
464 if ( eos
!=files
->nreaders
)
466 // Some of the readers still has buffered lines
470 // No lines in the buffer, need to open new region or quit
471 if ( bcf_sr_regions_next(files
->regions
)<0 ) return -1;
473 for (i
=0; i
<files
->nreaders
; i
++)
474 _reader_seek(&files
->readers
[i
],files
->regions
->seq_names
[files
->regions
->iseq
],files
->regions
->start
,files
->regions
->end
);
480 * _reader_fill_buffer() - buffers all records with the same coordinate
482 static void _reader_fill_buffer(bcf_srs_t
*files
, bcf_sr_t
*reader
)
484 // Return if the buffer is full: the coordinate of the last buffered record differs
485 if ( reader
->nbuffer
&& reader
->buffer
[reader
->nbuffer
]->pos
!= reader
->buffer
[1]->pos
) return;
487 // No iterator (sequence not present in this file) and not streaming
488 if ( !reader
->itr
&& !files
->streaming
) return;
490 // Fill the buffer with records starting at the same position
494 if ( reader
->nbuffer
+1 >= reader
->mbuffer
)
496 // Increase buffer size
497 reader
->mbuffer
+= 8;
498 reader
->buffer
= (bcf1_t
**) realloc(reader
->buffer
, sizeof(bcf1_t
*)*reader
->mbuffer
);
499 for (i
=8; i
>0; i
--) // initialize
501 reader
->buffer
[reader
->mbuffer
-i
] = bcf_init1();
502 reader
->buffer
[reader
->mbuffer
-i
]->max_unpack
= files
->max_unpack
;
503 reader
->buffer
[reader
->mbuffer
-i
]->pos
= -1; // for rare cases when VCF starts from 1
506 if ( files
->streaming
)
508 if ( reader
->file
->format
.format
==vcf
)
510 if ( (ret
=hts_getline(reader
->file
, KS_SEP_LINE
, &files
->tmps
)) < 0 ) break; // no more lines
511 ret
= vcf_parse1(&files
->tmps
, reader
->header
, reader
->buffer
[reader
->nbuffer
+1]);
512 if ( ret
<0 ) { files
->errnum
= vcf_parse_error
; break; }
514 else if ( reader
->file
->format
.format
==bcf
)
516 ret
= bcf_read1(reader
->file
, reader
->header
, reader
->buffer
[reader
->nbuffer
+1]);
517 if ( ret
< -1 ) files
->errnum
= bcf_read_error
;
518 if ( ret
< 0 ) break; // no more lines or an error
522 hts_log_error("Fixme: not ready for this");
526 else if ( reader
->tbx_idx
)
528 if ( (ret
=tbx_itr_next(reader
->file
, reader
->tbx_idx
, reader
->itr
, &files
->tmps
)) < 0 ) break; // no more lines
529 ret
= vcf_parse1(&files
->tmps
, reader
->header
, reader
->buffer
[reader
->nbuffer
+1]);
530 if ( ret
<0 ) { files
->errnum
= vcf_parse_error
; break; }
534 ret
= bcf_itr_next(reader
->file
, reader
->itr
, reader
->buffer
[reader
->nbuffer
+1]);
535 if ( ret
< -1 ) files
->errnum
= bcf_read_error
;
536 if ( ret
< 0 ) break; // no more lines or an error
537 bcf_subset_format(reader
->header
,reader
->buffer
[reader
->nbuffer
+1]);
541 if ( !reader
->nfilter_ids
)
542 bcf_unpack(reader
->buffer
[reader
->nbuffer
+1], BCF_UN_STR
);
545 bcf_unpack(reader
->buffer
[reader
->nbuffer
+1], BCF_UN_STR
|BCF_UN_FLT
);
546 if ( !has_filter(reader
, reader
->buffer
[reader
->nbuffer
+1]) ) continue;
550 if ( reader
->buffer
[reader
->nbuffer
]->pos
!= reader
->buffer
[1]->pos
) break; // the buffer is full
554 // done for this region
555 tbx_itr_destroy(reader
->itr
);
561 * _readers_shift_buffer() - removes the first line and all subsequent lines with the same position
563 static void _reader_shift_buffer(bcf_sr_t
*reader
)
566 for (i
=2; i
<=reader
->nbuffer
; i
++)
567 if ( reader
->buffer
[i
]->pos
!=reader
->buffer
[1]->pos
) break;
568 if ( i
<=reader
->nbuffer
)
570 // A record with a different position follows, swap it. Because of the reader's logic,
571 // only one such line can be present.
572 bcf1_t
*tmp
= reader
->buffer
[1]; reader
->buffer
[1] = reader
->buffer
[i
]; reader
->buffer
[i
] = tmp
;
576 reader
->nbuffer
= 0; // no other line
579 int _reader_next_line(bcf_srs_t
*files
)
581 int i
, min_pos
= INT_MAX
;
582 const char *chr
= NULL
;
584 // Loop until next suitable line is found or all readers have finished
587 // Get all readers ready for the next region.
588 if ( files
->regions
&& _readers_next_region(files
)<0 ) break;
591 for (i
=0; i
<files
->nreaders
; i
++)
593 _reader_fill_buffer(files
, &files
->readers
[i
]);
595 // Update the minimum coordinate
596 if ( !files
->readers
[i
].nbuffer
) continue;
597 if ( min_pos
> files
->readers
[i
].buffer
[1]->pos
)
599 min_pos
= files
->readers
[i
].buffer
[1]->pos
;
600 chr
= bcf_seqname(files
->readers
[i
].header
, files
->readers
[i
].buffer
[1]);
603 if ( min_pos
==INT_MAX
)
605 if ( !files
->regions
) break;
609 // Skip this position if not present in targets
610 if ( files
->targets
)
612 int ret
= bcf_sr_regions_overlap(files
->targets
, chr
, min_pos
, min_pos
);
613 if ( (!files
->targets_exclude
&& ret
<0) || (files
->targets_exclude
&& !ret
) )
615 // Remove all lines with this position from the buffer
616 for (i
=0; i
<files
->nreaders
; i
++)
617 if ( files
->readers
[i
].nbuffer
&& files
->readers
[i
].buffer
[1]->pos
==min_pos
)
618 _reader_shift_buffer(&files
->readers
[i
]);
625 break; // done: chr and min_pos are set
627 if ( !chr
) return 0;
629 return bcf_sr_sort_next(files
, &BCF_SR_AUX(files
)->sort
, chr
, min_pos
);
632 int bcf_sr_next_line(bcf_srs_t
*files
)
634 if ( !files
->targets_als
)
635 return _reader_next_line(files
);
639 int i
, ret
= _reader_next_line(files
);
640 if ( !ret
) return ret
;
642 for (i
=0; i
<files
->nreaders
; i
++)
643 if ( files
->has_line
[i
] ) break;
645 if ( _regions_match_alleles(files
->targets
, files
->targets_als
-1, files
->readers
[i
].buffer
[0]) ) return ret
;
647 // Check if there are more duplicate lines in the buffers. If not, return this line as if it
648 // matched the targets, even if there is a type mismatch
649 for (i
=0; i
<files
->nreaders
; i
++)
651 if ( !files
->has_line
[i
] ) continue;
652 if ( files
->readers
[i
].nbuffer
==0 || files
->readers
[i
].buffer
[1]->pos
!=files
->readers
[i
].buffer
[0]->pos
) continue;
655 if ( i
==files
->nreaders
) return ret
; // no more lines left, output even if target alleles are not of the same type
659 static void bcf_sr_seek_start(bcf_srs_t
*readers
)
661 bcf_sr_regions_t
*reg
= readers
->regions
;
663 for (i
=0; i
<reg
->nseqs
; i
++)
664 reg
->regs
[i
].creg
= -1;
669 int bcf_sr_seek(bcf_srs_t
*readers
, const char *seq
, int pos
)
671 if ( !readers
->regions
) return 0;
675 bcf_sr_seek_start(readers
);
678 bcf_sr_regions_overlap(readers
->regions
, seq
, pos
, pos
);
680 for (i
=0; i
<readers
->nreaders
; i
++)
682 nret
+= _reader_seek(&readers
->readers
[i
],seq
,pos
,MAX_CSI_COOR
-1);
687 int bcf_sr_set_samples(bcf_srs_t
*files
, const char *fname
, int is_file
)
689 int i
, j
, nsmpl
, free_smpl
= 0;
692 void *exclude
= (fname
[0]=='^') ? khash_str2int_init() : NULL
;
693 if ( exclude
|| strcmp("-",fname
) ) // "-" stands for all samples
695 smpl
= hts_readlist(fname
, is_file
, &nsmpl
);
698 hts_log_error("Could not read the file: \"%s\"", fname
);
703 for (i
=0; i
<nsmpl
; i
++)
704 khash_str2int_inc(exclude
, smpl
[i
]);
710 smpl
= files
->readers
[0].header
->samples
; // intersection of all samples
711 nsmpl
= bcf_hdr_nsamples(files
->readers
[0].header
);
714 files
->samples
= NULL
;
716 for (i
=0; i
<nsmpl
; i
++)
718 if ( exclude
&& khash_str2int_has_key(exclude
,smpl
[i
]) ) continue;
721 for (j
=0; j
<files
->nreaders
; j
++)
723 if ( bcf_hdr_id2int(files
->readers
[j
].header
, BCF_DT_SAMPLE
, smpl
[i
])<0 ) break;
726 if ( n_isec
!=files
->nreaders
)
728 hts_log_warning("The sample \"%s\" was not found in %s, skipping",
729 smpl
[i
], files
->readers
[n_isec
].fname
);
733 files
->samples
= (char**) realloc(files
->samples
, (files
->n_smpl
+1)*sizeof(const char*));
734 files
->samples
[files
->n_smpl
++] = strdup(smpl
[i
]);
737 if ( exclude
) khash_str2int_destroy(exclude
);
740 for (i
=0; i
<nsmpl
; i
++) free(smpl
[i
]);
744 if ( !files
->n_smpl
)
746 if ( files
->nreaders
>1 )
747 hts_log_warning("No samples in common");
750 for (i
=0; i
<files
->nreaders
; i
++)
752 bcf_sr_t
*reader
= &files
->readers
[i
];
753 reader
->samples
= (int*) malloc(sizeof(int)*files
->n_smpl
);
754 reader
->n_smpl
= files
->n_smpl
;
755 for (j
=0; j
<files
->n_smpl
; j
++)
756 reader
->samples
[j
] = bcf_hdr_id2int(reader
->header
, BCF_DT_SAMPLE
, files
->samples
[j
]);
761 // Add a new region into a list sorted by start,end. On input the coordinates
762 // are 1-based, stored 0-based, inclusive.
763 static void _regions_add(bcf_sr_regions_t
*reg
, const char *chr
, int start
, int end
)
765 if ( start
==-1 && end
==-1 )
767 start
= 0; end
= MAX_CSI_COOR
-1;
771 start
--; end
--; // store 0-based coordinates
774 if ( !reg
->seq_hash
)
775 reg
->seq_hash
= khash_str2int_init();
778 if ( khash_str2int_get(reg
->seq_hash
, chr
, &iseq
)<0 )
780 // the chromosome block does not exist
782 reg
->seq_names
= (char**) realloc(reg
->seq_names
,sizeof(char*)*reg
->nseqs
);
783 reg
->regs
= (region_t
*) realloc(reg
->regs
,sizeof(region_t
)*reg
->nseqs
);
784 memset(®
->regs
[reg
->nseqs
-1],0,sizeof(region_t
));
785 reg
->seq_names
[iseq
] = strdup(chr
);
786 reg
->regs
[iseq
].creg
= -1;
787 khash_str2int_set(reg
->seq_hash
,reg
->seq_names
[iseq
],iseq
);
790 region_t
*creg
= ®
->regs
[iseq
];
792 // the regions may not be sorted on input: binary search
793 int i
, min
= 0, max
= creg
->nregs
- 1;
797 if ( start
< creg
->regs
[i
].start
) max
= i
- 1;
798 else if ( start
> creg
->regs
[i
].start
) min
= i
+ 1;
801 if ( min
>max
|| creg
->regs
[i
].start
!=start
|| creg
->regs
[i
].end
!=end
)
803 // no such region, insert a new one just after max
804 hts_expand(region1_t
,creg
->nregs
+1,creg
->mregs
,creg
->regs
);
805 if ( ++max
< creg
->nregs
)
806 memmove(&creg
->regs
[max
+1],&creg
->regs
[max
],(creg
->nregs
- max
)*sizeof(region1_t
));
807 creg
->regs
[max
].start
= start
;
808 creg
->regs
[max
].end
= end
;
813 // File name or a list of genomic locations. If file name, NULL is returned.
814 static bcf_sr_regions_t
*_regions_init_string(const char *str
)
816 bcf_sr_regions_t
*reg
= (bcf_sr_regions_t
*) calloc(1, sizeof(bcf_sr_regions_t
));
817 reg
->start
= reg
->end
= -1;
818 reg
->prev_start
= reg
->prev_seq
= -1;
820 kstring_t tmp
= {0,0,0};
821 const char *sp
= str
, *ep
= str
;
825 while ( *ep
&& *ep
!=',' && *ep
!=':' ) ep
++;
827 kputsn(sp
,ep
-sp
,&tmp
);
831 from
= hts_parse_decimal(sp
,(char**)&ep
,0);
834 hts_log_error("Could not parse the region(s): %s", str
);
835 free(reg
); free(tmp
.s
); return NULL
;
837 if ( !*ep
|| *ep
==',' )
839 _regions_add(reg
, tmp
.s
, from
, from
);
845 hts_log_error("Could not parse the region(s): %s", str
);
846 free(reg
); free(tmp
.s
); return NULL
;
850 to
= hts_parse_decimal(sp
,(char**)&ep
,0);
851 if ( *ep
&& *ep
!=',' )
853 hts_log_error("Could not parse the region(s): %s", str
);
854 free(reg
); free(tmp
.s
); return NULL
;
856 if ( sp
==ep
) to
= MAX_CSI_COOR
-1;
857 _regions_add(reg
, tmp
.s
, from
, to
);
863 if ( tmp
.l
) _regions_add(reg
, tmp
.s
, -1, -1);
872 // ichr,ifrom,ito are 0-based;
873 // returns -1 on error, 0 if the line is a comment line, 1 on success
874 static int _regions_parse_line(char *line
, int ichr
,int ifrom
,int ito
, char **chr
,char **chr_end
,int *from
,int *to
)
878 if ( line
[0]=='#' ) return 0;
880 int k
,l
; // index of the start and end column of the tab-delimited file
887 char *se
= line
, *ss
= NULL
; // start and end
889 for (i
=0; i
<=k
&& *se
; i
++)
891 ss
= i
==0 ? se
++ : ++se
;
892 while (*se
&& *se
!='\t') se
++;
894 if ( i
<=k
) return -1;
897 *from
= *to
= hts_parse_decimal(ss
, &tmp
, 0);
898 if ( tmp
==ss
) return -1;
903 *from
= hts_parse_decimal(ss
, &tmp
, 0);
905 *to
= hts_parse_decimal(ss
, &tmp
, 0);
906 if ( ss
==tmp
) return -1;
908 for (i
=k
; i
<l
&& *se
; i
++)
911 while (*se
&& *se
!='\t') se
++;
913 if ( i
<l
) return -1;
915 *to
= hts_parse_decimal(ss
, &tmp
, 0);
917 *from
= hts_parse_decimal(ss
, &tmp
, 0);
918 if ( ss
==tmp
) return -1;
922 for (i
=0; i
<=ichr
&& *se
; i
++)
924 if ( i
>0 ) ss
= ++se
;
925 while (*se
&& *se
!='\t') se
++;
927 if ( i
<=ichr
) return -1;
933 bcf_sr_regions_t
*bcf_sr_regions_init(const char *regions
, int is_file
, int ichr
, int ifrom
, int ito
)
935 bcf_sr_regions_t
*reg
;
936 if ( !is_file
) return _regions_init_string(regions
);
938 reg
= (bcf_sr_regions_t
*) calloc(1, sizeof(bcf_sr_regions_t
));
939 reg
->start
= reg
->end
= -1;
940 reg
->prev_start
= reg
->prev_seq
= -1;
942 reg
->file
= hts_open(regions
, "rb");
945 hts_log_error("Could not open file: %s", regions
);
950 reg
->tbx
= tbx_index_load(regions
);
953 int len
= strlen(regions
);
954 int is_bed
= strcasecmp(".bed",regions
+len
-4) ? 0 : 1;
955 if ( !is_bed
&& !strcasecmp(".bed.gz",regions
+len
-7) ) is_bed
= 1;
957 if ( reg
->file
->format
.format
==vcf
) ito
= 1;
959 // read the whole file, tabix index is not present
960 while ( hts_getline(reg
->file
, KS_SEP_LINE
, ®
->line
) > 0 )
964 ret
= _regions_parse_line(reg
->line
.s
, ichr
,ifrom
,abs(ito
), &chr
,&chr_end
,&from
,&to
);
968 ret
= _regions_parse_line(reg
->line
.s
, ichr
,ifrom
,ifrom
, &chr
,&chr_end
,&from
,&to
);
971 hts_log_error("Could not parse the file %s, using the columns %d,%d[,%d]",
972 regions
,ichr
+1,ifrom
+1,ito
+1);
973 hts_close(reg
->file
); reg
->file
= NULL
; free(reg
);
977 if ( !ret
) continue;
978 if ( is_bed
) from
++;
980 _regions_add(reg
, chr
, from
, to
);
983 hts_close(reg
->file
); reg
->file
= NULL
;
984 if ( !reg
->nseqs
) { free(reg
); return NULL
; }
988 reg
->seq_names
= (char**) tbx_seqnames(reg
->tbx
, ®
->nseqs
);
989 if ( !reg
->seq_hash
)
990 reg
->seq_hash
= khash_str2int_init();
992 for (i
=0; i
<reg
->nseqs
; i
++)
994 khash_str2int_set(reg
->seq_hash
,reg
->seq_names
[i
],i
);
996 reg
->fname
= strdup(regions
);
1001 void bcf_sr_regions_destroy(bcf_sr_regions_t
*reg
)
1005 if ( reg
->itr
) tbx_itr_destroy(reg
->itr
);
1006 if ( reg
->tbx
) tbx_destroy(reg
->tbx
);
1007 if ( reg
->file
) hts_close(reg
->file
);
1008 if ( reg
->als
) free(reg
->als
);
1009 if ( reg
->als_str
.s
) free(reg
->als_str
.s
);
1013 // free only in-memory names, tbx names are const
1014 for (i
=0; i
<reg
->nseqs
; i
++)
1016 free(reg
->seq_names
[i
]);
1017 free(reg
->regs
[i
].regs
);
1021 free(reg
->seq_names
);
1022 khash_str2int_destroy(reg
->seq_hash
);
1026 int bcf_sr_regions_seek(bcf_sr_regions_t
*reg
, const char *seq
)
1028 reg
->iseq
= reg
->start
= reg
->end
= -1;
1029 if ( khash_str2int_get(reg
->seq_hash
, seq
, ®
->iseq
) < 0 ) return -1; // sequence seq not in regions
1031 // using in-memory regions
1034 reg
->regs
[reg
->iseq
].creg
= -1;
1038 // reading regions from tabix
1039 if ( reg
->itr
) tbx_itr_destroy(reg
->itr
);
1040 reg
->itr
= tbx_itr_querys(reg
->tbx
, seq
);
1041 if ( reg
->itr
) return 0;
1046 int bcf_sr_regions_next(bcf_sr_regions_t
*reg
)
1048 if ( reg
->iseq
<0 ) return -1;
1049 reg
->start
= reg
->end
= -1;
1052 // using in-memory regions
1055 while ( reg
->iseq
< reg
->nseqs
)
1057 reg
->regs
[reg
->iseq
].creg
++;
1058 if ( reg
->regs
[reg
->iseq
].creg
< reg
->regs
[reg
->iseq
].nregs
) break;
1061 if ( reg
->iseq
>= reg
->nseqs
) { reg
->iseq
= -1; return -1; } // no more regions left
1062 region1_t
*creg
= ®
->regs
[reg
->iseq
].regs
[reg
->regs
[reg
->iseq
].creg
];
1063 reg
->start
= creg
->start
;
1064 reg
->end
= creg
->end
;
1068 // reading from tabix
1069 char *chr
, *chr_end
;
1070 int ichr
= 0, ifrom
= 1, ito
= 2, is_bed
= 0, from
, to
;
1073 ichr
= reg
->tbx
->conf
.sc
-1;
1074 ifrom
= reg
->tbx
->conf
.bc
-1;
1075 ito
= reg
->tbx
->conf
.ec
-1;
1076 if ( ito
<0 ) ito
= ifrom
;
1077 is_bed
= reg
->tbx
->conf
.preset
==TBX_UCSC
? 1 : 0;
1085 // tabix index present, reading a chromosome block
1086 ret
= tbx_itr_next(reg
->file
, reg
->tbx
, reg
->itr
, ®
->line
);
1087 if ( ret
<0 ) { reg
->iseq
= -1; return -1; }
1093 // Waited for seek which never came. Reopen in text mode and stream
1094 // through the regions, otherwise hts_getline would fail
1095 hts_close(reg
->file
);
1096 reg
->file
= hts_open(reg
->fname
, "r");
1099 hts_log_error("Could not open file: %s", reg
->fname
);
1101 bcf_sr_regions_destroy(reg
);
1107 // tabix index absent, reading the whole file
1108 ret
= hts_getline(reg
->file
, KS_SEP_LINE
, ®
->line
);
1109 if ( ret
<0 ) { reg
->iseq
= -1; return -1; }
1111 ret
= _regions_parse_line(reg
->line
.s
, ichr
,ifrom
,ito
, &chr
,&chr_end
,&from
,&to
);
1114 hts_log_error("Could not parse the file %s, using the columns %d,%d,%d",
1115 reg
->fname
,ichr
+1,ifrom
+1,ito
+1);
1119 if ( is_bed
) from
++;
1122 if ( khash_str2int_get(reg
->seq_hash
, chr
, ®
->iseq
)<0 )
1124 hts_log_error("Broken tabix index? The sequence \"%s\" not in dictionary [%s]",
1130 reg
->start
= from
- 1;
1135 static int _regions_match_alleles(bcf_sr_regions_t
*reg
, int als_idx
, bcf1_t
*rec
)
1139 // payload is not supported for in-memory regions, switch to regidx instead in future
1140 hts_log_error("Compressed and indexed targets file is required");
1144 int i
= 0, max_len
= 0;
1147 char *ss
= reg
->line
.s
;
1148 while ( i
<als_idx
&& *ss
)
1150 if ( *ss
=='\t' ) i
++;
1155 while ( *se
&& *se
!='\t' )
1157 if ( *se
==',' ) reg
->nals
++;
1160 ks_resize(®
->als_str
, se
-ss
+1+reg
->nals
);
1162 hts_expand(char*,reg
->nals
,reg
->mals
,reg
->als
);
1168 if ( *se
=='\t' ) break;
1169 if ( *se
!=',' ) continue;
1170 reg
->als
[reg
->nals
] = ®
->als_str
.s
[reg
->als_str
.l
];
1171 kputsn(ss
,se
-ss
,®
->als_str
);
1172 if ( ®
->als_str
.s
[reg
->als_str
.l
] - reg
->als
[reg
->nals
] > max_len
) max_len
= ®
->als_str
.s
[reg
->als_str
.l
] - reg
->als
[reg
->nals
];
1177 reg
->als
[reg
->nals
] = ®
->als_str
.s
[reg
->als_str
.l
];
1178 kputsn(ss
,se
-ss
,®
->als_str
);
1179 if ( ®
->als_str
.s
[reg
->als_str
.l
] - reg
->als
[reg
->nals
] > max_len
) max_len
= ®
->als_str
.s
[reg
->als_str
.l
] - reg
->als
[reg
->nals
];
1181 reg
->als_type
= max_len
> 1 ? VCF_INDEL
: VCF_SNP
; // this is a simplified check, see vcf.c:bcf_set_variant_types
1183 int type
= bcf_get_variant_types(rec
);
1184 if ( reg
->als_type
& VCF_INDEL
)
1185 return type
& VCF_INDEL
? 1 : 0;
1186 return !(type
& VCF_INDEL
) ? 1 : 0;
1189 int bcf_sr_regions_overlap(bcf_sr_regions_t
*reg
, const char *seq
, int start
, int end
)
1192 if ( khash_str2int_get(reg
->seq_hash
, seq
, &iseq
)<0 ) return -1; // no such sequence
1194 if ( reg
->prev_seq
==-1 || iseq
!=reg
->prev_seq
|| reg
->prev_start
> start
) // new chromosome or after a seek
1196 // flush regions left on previous chromosome
1197 if ( reg
->missed_reg_handler
&& reg
->prev_seq
!=-1 && reg
->iseq
!=-1 )
1198 bcf_sr_regions_flush(reg
);
1200 bcf_sr_regions_seek(reg
, seq
);
1201 reg
->start
= reg
->end
= -1;
1203 if ( reg
->prev_seq
==iseq
&& reg
->iseq
!=iseq
) return -2; // no more regions on this chromosome
1204 reg
->prev_seq
= reg
->iseq
;
1205 reg
->prev_start
= start
;
1207 while ( iseq
==reg
->iseq
&& reg
->end
< start
)
1209 if ( bcf_sr_regions_next(reg
) < 0 ) return -2; // no more regions left
1210 if ( reg
->iseq
!= iseq
) return -1; // does not overlap any regions
1211 if ( reg
->missed_reg_handler
&& reg
->end
< start
) reg
->missed_reg_handler(reg
, reg
->missed_reg_data
);
1213 if ( reg
->start
<= end
) return 0; // region overlap
1214 return -1; // no overlap
1217 void bcf_sr_regions_flush(bcf_sr_regions_t
*reg
)
1219 if ( !reg
->missed_reg_handler
|| reg
->prev_seq
==-1 ) return;
1220 while ( !bcf_sr_regions_next(reg
) ) reg
->missed_reg_handler(reg
, reg
->missed_reg_data
);