modified: diffout.py
[GalaxyCodeBases.git] / c_cpp / lib / htslib / synced_bcf_reader.c
blob47715da4ae60968aa33da793723a81067249b6eb
1 /* synced_bcf_reader.c -- stream through multiple VCF files.
3 Copyright (C) 2012-2014 Genome Research Ltd.
5 Author: Petr Danecek <pd3@sanger.ac.uk>
7 Permission is hereby granted, free of charge, to any person obtaining a copy
8 of this software and associated documentation files (the "Software"), to deal
9 in the Software without restriction, including without limitation the rights
10 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11 copies of the Software, and to permit persons to whom the Software is
12 furnished to do so, subject to the following conditions:
14 The above copyright notice and this permission notice shall be included in
15 all copies or substantial portions of the Software.
17 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
22 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
23 DEALINGS IN THE SOFTWARE. */
25 #include <config.h>
27 #include <stdio.h>
28 #include <unistd.h>
29 #include <string.h>
30 #include <strings.h>
31 #include <limits.h>
32 #include <errno.h>
33 #include <sys/stat.h>
34 #include "htslib/synced_bcf_reader.h"
35 #include "htslib/kseq.h"
36 #include "htslib/khash_str2int.h"
37 #include "htslib/bgzf.h"
38 #include "htslib/thread_pool.h"
39 #include "bcf_sr_sort.h"
41 #define MAX_CSI_COOR 0x7fffffff // maximum indexable coordinate of .csi
43 typedef struct
45 uint32_t start, end;
47 region1_t;
49 typedef struct _region_t
51 region1_t *regs;
52 int nregs, mregs, creg;
54 region_t;
56 #define BCF_SR_AUX(x) ((aux_t*)((x)->aux))
57 typedef struct
59 sr_sort_t sort;
61 aux_t;
63 static void _regions_add(bcf_sr_regions_t *reg, const char *chr, int start, int end);
64 static bcf_sr_regions_t *_regions_init_string(const char *str);
65 static int _regions_match_alleles(bcf_sr_regions_t *reg, int als_idx, bcf1_t *rec);
67 char *bcf_sr_strerror(int errnum)
69 switch (errnum)
71 case open_failed:
72 return strerror(errno); break;
73 case not_bgzf:
74 return "not compressed with bgzip"; break;
75 case idx_load_failed:
76 return "could not load index"; break;
77 case file_type_error:
78 return "unknown file type"; break;
79 case api_usage_error:
80 return "API usage error"; break;
81 case header_error:
82 return "could not parse header"; break;
83 case no_eof:
84 return "no BGZF EOF marker; file may be truncated"; break;
85 case no_memory:
86 return "Out of memory"; break;
87 case vcf_parse_error:
88 return "VCF parse error"; break;
89 case bcf_read_error:
90 return "BCF read error"; break;
91 default: return "";
95 int bcf_sr_set_opt(bcf_srs_t *readers, bcf_sr_opt_t opt, ...)
97 va_list args;
98 switch (opt)
100 case BCF_SR_REQUIRE_IDX:
101 readers->require_index = 1;
102 return 0;
104 case BCF_SR_PAIR_LOGIC:
105 va_start(args, opt);
106 BCF_SR_AUX(readers)->sort.pair = va_arg(args, int);
107 return 0;
109 default:
110 break;
112 return 1;
115 static int *init_filters(bcf_hdr_t *hdr, const char *filters, int *nfilters)
117 kstring_t str = {0,0,0};
118 const char *tmp = filters, *prev = filters;
119 int nout = 0, *out = NULL;
120 while ( 1 )
122 if ( *tmp==',' || !*tmp )
124 out = (int*) realloc(out, (nout+1)*sizeof(int));
125 if ( tmp-prev==1 && *prev=='.' )
127 out[nout] = -1;
128 nout++;
130 else
132 str.l = 0;
133 kputsn(prev, tmp-prev, &str);
134 out[nout] = bcf_hdr_id2int(hdr, BCF_DT_ID, str.s);
135 if ( out[nout]>=0 ) nout++;
137 if ( !*tmp ) break;
138 prev = tmp+1;
140 tmp++;
142 if ( str.m ) free(str.s);
143 *nfilters = nout;
144 return out;
147 int bcf_sr_set_regions(bcf_srs_t *readers, const char *regions, int is_file)
149 assert( !readers->regions );
150 if ( readers->nreaders )
152 hts_log_error("Must call bcf_sr_set_regions() before bcf_sr_add_reader()");
153 return -1;
155 readers->regions = bcf_sr_regions_init(regions,is_file,0,1,-2);
156 if ( !readers->regions ) return -1;
157 readers->explicit_regs = 1;
158 readers->require_index = 1;
159 return 0;
161 int bcf_sr_set_targets(bcf_srs_t *readers, const char *targets, int is_file, int alleles)
163 assert( !readers->targets );
164 if ( targets[0]=='^' )
166 readers->targets_exclude = 1;
167 targets++;
169 readers->targets = bcf_sr_regions_init(targets,is_file,0,1,-2);
170 if ( !readers->targets ) return -1;
171 readers->targets_als = alleles;
172 return 0;
175 int bcf_sr_set_threads(bcf_srs_t *files, int n_threads)
177 if (!(files->n_threads = n_threads))
178 return 0;
180 files->p = calloc(1, sizeof(*files->p));
181 if (!files->p) {
182 files->errnum = no_memory;
183 return -1;
185 if (!(files->p->pool = hts_tpool_init(n_threads)))
186 return -1;
188 return 0;
191 void bcf_sr_destroy_threads(bcf_srs_t *files) {
192 if (!files->p)
193 return;
195 if (files->p->pool)
196 hts_tpool_destroy(files->p->pool);
197 free(files->p);
200 int bcf_sr_add_reader(bcf_srs_t *files, const char *fname)
202 htsFile* file_ptr = hts_open(fname, "r");
203 if ( ! file_ptr ) {
204 files->errnum = open_failed;
205 return 0;
208 files->has_line = (int*) realloc(files->has_line, sizeof(int)*(files->nreaders+1));
209 files->has_line[files->nreaders] = 0;
210 files->readers = (bcf_sr_t*) realloc(files->readers, sizeof(bcf_sr_t)*(files->nreaders+1));
211 bcf_sr_t *reader = &files->readers[files->nreaders++];
212 memset(reader,0,sizeof(bcf_sr_t));
214 reader->file = file_ptr;
216 files->errnum = 0;
218 if ( reader->file->format.compression==bgzf )
220 BGZF *bgzf = hts_get_bgzfp(reader->file);
221 if ( bgzf && bgzf_check_EOF(bgzf) == 0 ) {
222 files->errnum = no_eof;
223 hts_log_warning("No BGZF EOF marker; file '%s' may be truncated", fname);
225 if (files->p)
226 bgzf_thread_pool(bgzf, files->p->pool, files->p->qsize);
229 if ( files->require_index )
231 if ( reader->file->format.format==vcf )
233 if ( reader->file->format.compression!=bgzf )
235 files->errnum = not_bgzf;
236 return 0;
239 reader->tbx_idx = tbx_index_load(fname);
240 if ( !reader->tbx_idx )
242 files->errnum = idx_load_failed;
243 return 0;
246 reader->header = bcf_hdr_read(reader->file);
248 else if ( reader->file->format.format==bcf )
250 if ( reader->file->format.compression!=bgzf )
252 files->errnum = not_bgzf;
253 return 0;
256 reader->header = bcf_hdr_read(reader->file);
258 reader->bcf_idx = bcf_index_load(fname);
259 if ( !reader->bcf_idx )
261 files->errnum = idx_load_failed;
262 return 0;
265 else
267 files->errnum = file_type_error;
268 return 0;
271 else
273 if ( reader->file->format.format==bcf || reader->file->format.format==vcf )
275 reader->header = bcf_hdr_read(reader->file);
277 else
279 files->errnum = file_type_error;
280 return 0;
282 files->streaming = 1;
284 if ( files->streaming && files->nreaders>1 )
286 files->errnum = api_usage_error;
287 hts_log_error("Must set require_index when the number of readers is greater than one");
288 return 0;
290 if ( files->streaming && files->regions )
292 files->errnum = api_usage_error;
293 hts_log_error("Cannot tabix-jump in streaming mode");
294 return 0;
296 if ( !reader->header )
298 files->errnum = header_error;
299 return 0;
302 reader->fname = strdup(fname);
303 if ( files->apply_filters )
304 reader->filter_ids = init_filters(reader->header, files->apply_filters, &reader->nfilter_ids);
306 // Update list of chromosomes
307 if ( !files->explicit_regs && !files->streaming )
309 int n,i;
310 const char **names = reader->tbx_idx ? tbx_seqnames(reader->tbx_idx, &n) : bcf_hdr_seqnames(reader->header, &n);
311 for (i=0; i<n; i++)
313 if ( !files->regions )
314 files->regions = _regions_init_string(names[i]);
315 else
316 _regions_add(files->regions, names[i], -1, -1);
318 free(names);
321 return 1;
324 bcf_srs_t *bcf_sr_init(void)
326 bcf_srs_t *files = (bcf_srs_t*) calloc(1,sizeof(bcf_srs_t));
327 files->aux = (aux_t*) calloc(1,sizeof(aux_t));
328 bcf_sr_sort_init(&BCF_SR_AUX(files)->sort);
329 return files;
332 static void bcf_sr_destroy1(bcf_sr_t *reader)
334 free(reader->fname);
335 if ( reader->tbx_idx ) tbx_destroy(reader->tbx_idx);
336 if ( reader->bcf_idx ) hts_idx_destroy(reader->bcf_idx);
337 bcf_hdr_destroy(reader->header);
338 hts_close(reader->file);
339 if ( reader->itr ) tbx_itr_destroy(reader->itr);
340 int j;
341 for (j=0; j<reader->mbuffer; j++)
342 bcf_destroy1(reader->buffer[j]);
343 free(reader->buffer);
344 free(reader->samples);
345 free(reader->filter_ids);
348 void bcf_sr_destroy(bcf_srs_t *files)
350 int i;
351 for (i=0; i<files->nreaders; i++)
352 bcf_sr_destroy1(&files->readers[i]);
353 free(files->has_line);
354 free(files->readers);
355 for (i=0; i<files->n_smpl; i++) free(files->samples[i]);
356 free(files->samples);
357 if (files->targets) bcf_sr_regions_destroy(files->targets);
358 if (files->regions) bcf_sr_regions_destroy(files->regions);
359 if (files->tmps.m) free(files->tmps.s);
360 if (files->n_threads) bcf_sr_destroy_threads(files);
361 bcf_sr_sort_destroy(&BCF_SR_AUX(files)->sort);
362 free(files->aux);
363 free(files);
366 void bcf_sr_remove_reader(bcf_srs_t *files, int i)
368 assert( !files->samples ); // not ready for this yet
369 bcf_sr_sort_remove_reader(files, &BCF_SR_AUX(files)->sort, i);
370 bcf_sr_destroy1(&files->readers[i]);
371 if ( i+1 < files->nreaders )
373 memmove(&files->readers[i], &files->readers[i+1], (files->nreaders-i-1)*sizeof(bcf_sr_t));
374 memmove(&files->has_line[i], &files->has_line[i+1], (files->nreaders-i-1)*sizeof(int));
376 files->nreaders--;
380 void debug_buffer(FILE *fp, bcf_sr_t *reader)
382 int j;
383 for (j=0; j<=reader->nbuffer; j++)
385 bcf1_t *line = reader->buffer[j];
386 fprintf(fp,"\t%p\t%s%s\t%s:%d\t%s ", (void*)line,reader->fname,j==0?"*":" ",reader->header->id[BCF_DT_CTG][line->rid].key,line->pos+1,line->n_allele?line->d.allele[0]:"");
387 int k;
388 for (k=1; k<line->n_allele; k++) fprintf(fp," %s", line->d.allele[k]);
389 fprintf(fp,"\n");
393 void debug_buffers(FILE *fp, bcf_srs_t *files)
395 int i;
396 for (i=0; i<files->nreaders; i++)
398 fprintf(fp, "has_line: %d\t%s\n", bcf_sr_has_line(files,i),files->readers[i].fname);
399 debug_buffer(fp, &files->readers[i]);
401 fprintf(fp,"\n");
404 static inline int has_filter(bcf_sr_t *reader, bcf1_t *line)
406 int i, j;
407 if ( !line->d.n_flt )
409 for (j=0; j<reader->nfilter_ids; j++)
410 if ( reader->filter_ids[j]<0 ) return 1;
411 return 0;
413 for (i=0; i<line->d.n_flt; i++)
415 for (j=0; j<reader->nfilter_ids; j++)
416 if ( line->d.flt[i]==reader->filter_ids[j] ) return 1;
418 return 0;
421 static int _reader_seek(bcf_sr_t *reader, const char *seq, int start, int end)
423 if ( end>=MAX_CSI_COOR )
425 hts_log_error("The coordinate is out of csi index limit: %d", end+1);
426 exit(1);
428 if ( reader->itr )
430 hts_itr_destroy(reader->itr);
431 reader->itr = NULL;
433 reader->nbuffer = 0;
434 if ( reader->tbx_idx )
436 int tid = tbx_name2id(reader->tbx_idx, seq);
437 if ( tid==-1 ) return -1; // the sequence not present in this file
438 reader->itr = tbx_itr_queryi(reader->tbx_idx,tid,start,end+1);
440 else
442 int tid = bcf_hdr_name2id(reader->header, seq);
443 if ( tid==-1 ) return -1; // the sequence not present in this file
444 reader->itr = bcf_itr_queryi(reader->bcf_idx,tid,start,end+1);
446 if (!reader->itr) {
447 hts_log_error("Could not seek: %s:%d-%d", seq, start + 1, end + 1);
448 assert(0);
450 return 0;
454 * _readers_next_region() - jumps to next region if necessary
455 * Returns 0 on success or -1 when there are no more regions left
457 static int _readers_next_region(bcf_srs_t *files)
459 // Need to open new chromosome? Check number of lines in all readers' buffers
460 int i, eos = 0;
461 for (i=0; i<files->nreaders; i++)
462 if ( !files->readers[i].itr && !files->readers[i].nbuffer ) eos++;
464 if ( eos!=files->nreaders )
466 // Some of the readers still has buffered lines
467 return 0;
470 // No lines in the buffer, need to open new region or quit
471 if ( bcf_sr_regions_next(files->regions)<0 ) return -1;
473 for (i=0; i<files->nreaders; i++)
474 _reader_seek(&files->readers[i],files->regions->seq_names[files->regions->iseq],files->regions->start,files->regions->end);
476 return 0;
480 * _reader_fill_buffer() - buffers all records with the same coordinate
482 static void _reader_fill_buffer(bcf_srs_t *files, bcf_sr_t *reader)
484 // Return if the buffer is full: the coordinate of the last buffered record differs
485 if ( reader->nbuffer && reader->buffer[reader->nbuffer]->pos != reader->buffer[1]->pos ) return;
487 // No iterator (sequence not present in this file) and not streaming
488 if ( !reader->itr && !files->streaming ) return;
490 // Fill the buffer with records starting at the same position
491 int i, ret = 0;
492 while (1)
494 if ( reader->nbuffer+1 >= reader->mbuffer )
496 // Increase buffer size
497 reader->mbuffer += 8;
498 reader->buffer = (bcf1_t**) realloc(reader->buffer, sizeof(bcf1_t*)*reader->mbuffer);
499 for (i=8; i>0; i--) // initialize
501 reader->buffer[reader->mbuffer-i] = bcf_init1();
502 reader->buffer[reader->mbuffer-i]->max_unpack = files->max_unpack;
503 reader->buffer[reader->mbuffer-i]->pos = -1; // for rare cases when VCF starts from 1
506 if ( files->streaming )
508 if ( reader->file->format.format==vcf )
510 if ( (ret=hts_getline(reader->file, KS_SEP_LINE, &files->tmps)) < 0 ) break; // no more lines
511 ret = vcf_parse1(&files->tmps, reader->header, reader->buffer[reader->nbuffer+1]);
512 if ( ret<0 ) { files->errnum = vcf_parse_error; break; }
514 else if ( reader->file->format.format==bcf )
516 ret = bcf_read1(reader->file, reader->header, reader->buffer[reader->nbuffer+1]);
517 if ( ret < -1 ) files->errnum = bcf_read_error;
518 if ( ret < 0 ) break; // no more lines or an error
520 else
522 hts_log_error("Fixme: not ready for this");
523 exit(1);
526 else if ( reader->tbx_idx )
528 if ( (ret=tbx_itr_next(reader->file, reader->tbx_idx, reader->itr, &files->tmps)) < 0 ) break; // no more lines
529 ret = vcf_parse1(&files->tmps, reader->header, reader->buffer[reader->nbuffer+1]);
530 if ( ret<0 ) { files->errnum = vcf_parse_error; break; }
532 else
534 ret = bcf_itr_next(reader->file, reader->itr, reader->buffer[reader->nbuffer+1]);
535 if ( ret < -1 ) files->errnum = bcf_read_error;
536 if ( ret < 0 ) break; // no more lines or an error
537 bcf_subset_format(reader->header,reader->buffer[reader->nbuffer+1]);
540 // apply filter
541 if ( !reader->nfilter_ids )
542 bcf_unpack(reader->buffer[reader->nbuffer+1], BCF_UN_STR);
543 else
545 bcf_unpack(reader->buffer[reader->nbuffer+1], BCF_UN_STR|BCF_UN_FLT);
546 if ( !has_filter(reader, reader->buffer[reader->nbuffer+1]) ) continue;
548 reader->nbuffer++;
550 if ( reader->buffer[reader->nbuffer]->pos != reader->buffer[1]->pos ) break; // the buffer is full
552 if ( ret<0 )
554 // done for this region
555 tbx_itr_destroy(reader->itr);
556 reader->itr = NULL;
561 * _readers_shift_buffer() - removes the first line and all subsequent lines with the same position
563 static void _reader_shift_buffer(bcf_sr_t *reader)
565 int i;
566 for (i=2; i<=reader->nbuffer; i++)
567 if ( reader->buffer[i]->pos!=reader->buffer[1]->pos ) break;
568 if ( i<=reader->nbuffer )
570 // A record with a different position follows, swap it. Because of the reader's logic,
571 // only one such line can be present.
572 bcf1_t *tmp = reader->buffer[1]; reader->buffer[1] = reader->buffer[i]; reader->buffer[i] = tmp;
573 reader->nbuffer = 1;
575 else
576 reader->nbuffer = 0; // no other line
579 int _reader_next_line(bcf_srs_t *files)
581 int i, min_pos = INT_MAX;
582 const char *chr = NULL;
584 // Loop until next suitable line is found or all readers have finished
585 while ( 1 )
587 // Get all readers ready for the next region.
588 if ( files->regions && _readers_next_region(files)<0 ) break;
590 // Fill buffers
591 for (i=0; i<files->nreaders; i++)
593 _reader_fill_buffer(files, &files->readers[i]);
595 // Update the minimum coordinate
596 if ( !files->readers[i].nbuffer ) continue;
597 if ( min_pos > files->readers[i].buffer[1]->pos )
599 min_pos = files->readers[i].buffer[1]->pos;
600 chr = bcf_seqname(files->readers[i].header, files->readers[i].buffer[1]);
603 if ( min_pos==INT_MAX )
605 if ( !files->regions ) break;
606 continue;
609 // Skip this position if not present in targets
610 if ( files->targets )
612 int ret = bcf_sr_regions_overlap(files->targets, chr, min_pos, min_pos);
613 if ( (!files->targets_exclude && ret<0) || (files->targets_exclude && !ret) )
615 // Remove all lines with this position from the buffer
616 for (i=0; i<files->nreaders; i++)
617 if ( files->readers[i].nbuffer && files->readers[i].buffer[1]->pos==min_pos )
618 _reader_shift_buffer(&files->readers[i]);
619 min_pos = INT_MAX;
620 chr = NULL;
621 continue;
625 break; // done: chr and min_pos are set
627 if ( !chr ) return 0;
629 return bcf_sr_sort_next(files, &BCF_SR_AUX(files)->sort, chr, min_pos);
632 int bcf_sr_next_line(bcf_srs_t *files)
634 if ( !files->targets_als )
635 return _reader_next_line(files);
637 while (1)
639 int i, ret = _reader_next_line(files);
640 if ( !ret ) return ret;
642 for (i=0; i<files->nreaders; i++)
643 if ( files->has_line[i] ) break;
645 if ( _regions_match_alleles(files->targets, files->targets_als-1, files->readers[i].buffer[0]) ) return ret;
647 // Check if there are more duplicate lines in the buffers. If not, return this line as if it
648 // matched the targets, even if there is a type mismatch
649 for (i=0; i<files->nreaders; i++)
651 if ( !files->has_line[i] ) continue;
652 if ( files->readers[i].nbuffer==0 || files->readers[i].buffer[1]->pos!=files->readers[i].buffer[0]->pos ) continue;
653 break;
655 if ( i==files->nreaders ) return ret; // no more lines left, output even if target alleles are not of the same type
659 static void bcf_sr_seek_start(bcf_srs_t *readers)
661 bcf_sr_regions_t *reg = readers->regions;
662 int i;
663 for (i=0; i<reg->nseqs; i++)
664 reg->regs[i].creg = -1;
665 reg->iseq = 0;
669 int bcf_sr_seek(bcf_srs_t *readers, const char *seq, int pos)
671 if ( !readers->regions ) return 0;
672 if ( !seq && !pos )
674 // seek to start
675 bcf_sr_seek_start(readers);
676 return 0;
678 bcf_sr_regions_overlap(readers->regions, seq, pos, pos);
679 int i, nret = 0;
680 for (i=0; i<readers->nreaders; i++)
682 nret += _reader_seek(&readers->readers[i],seq,pos,MAX_CSI_COOR-1);
684 return nret;
687 int bcf_sr_set_samples(bcf_srs_t *files, const char *fname, int is_file)
689 int i, j, nsmpl, free_smpl = 0;
690 char **smpl = NULL;
692 void *exclude = (fname[0]=='^') ? khash_str2int_init() : NULL;
693 if ( exclude || strcmp("-",fname) ) // "-" stands for all samples
695 smpl = hts_readlist(fname, is_file, &nsmpl);
696 if ( !smpl )
698 hts_log_error("Could not read the file: \"%s\"", fname);
699 return 0;
701 if ( exclude )
703 for (i=0; i<nsmpl; i++)
704 khash_str2int_inc(exclude, smpl[i]);
706 free_smpl = 1;
708 if ( !smpl )
710 smpl = files->readers[0].header->samples; // intersection of all samples
711 nsmpl = bcf_hdr_nsamples(files->readers[0].header);
714 files->samples = NULL;
715 files->n_smpl = 0;
716 for (i=0; i<nsmpl; i++)
718 if ( exclude && khash_str2int_has_key(exclude,smpl[i]) ) continue;
720 int n_isec = 0;
721 for (j=0; j<files->nreaders; j++)
723 if ( bcf_hdr_id2int(files->readers[j].header, BCF_DT_SAMPLE, smpl[i])<0 ) break;
724 n_isec++;
726 if ( n_isec!=files->nreaders )
728 hts_log_warning("The sample \"%s\" was not found in %s, skipping",
729 smpl[i], files->readers[n_isec].fname);
730 continue;
733 files->samples = (char**) realloc(files->samples, (files->n_smpl+1)*sizeof(const char*));
734 files->samples[files->n_smpl++] = strdup(smpl[i]);
737 if ( exclude ) khash_str2int_destroy(exclude);
738 if ( free_smpl )
740 for (i=0; i<nsmpl; i++) free(smpl[i]);
741 free(smpl);
744 if ( !files->n_smpl )
746 if ( files->nreaders>1 )
747 hts_log_warning("No samples in common");
748 return 0;
750 for (i=0; i<files->nreaders; i++)
752 bcf_sr_t *reader = &files->readers[i];
753 reader->samples = (int*) malloc(sizeof(int)*files->n_smpl);
754 reader->n_smpl = files->n_smpl;
755 for (j=0; j<files->n_smpl; j++)
756 reader->samples[j] = bcf_hdr_id2int(reader->header, BCF_DT_SAMPLE, files->samples[j]);
758 return 1;
761 // Add a new region into a list sorted by start,end. On input the coordinates
762 // are 1-based, stored 0-based, inclusive.
763 static void _regions_add(bcf_sr_regions_t *reg, const char *chr, int start, int end)
765 if ( start==-1 && end==-1 )
767 start = 0; end = MAX_CSI_COOR-1;
769 else
771 start--; end--; // store 0-based coordinates
774 if ( !reg->seq_hash )
775 reg->seq_hash = khash_str2int_init();
777 int iseq;
778 if ( khash_str2int_get(reg->seq_hash, chr, &iseq)<0 )
780 // the chromosome block does not exist
781 iseq = reg->nseqs++;
782 reg->seq_names = (char**) realloc(reg->seq_names,sizeof(char*)*reg->nseqs);
783 reg->regs = (region_t*) realloc(reg->regs,sizeof(region_t)*reg->nseqs);
784 memset(&reg->regs[reg->nseqs-1],0,sizeof(region_t));
785 reg->seq_names[iseq] = strdup(chr);
786 reg->regs[iseq].creg = -1;
787 khash_str2int_set(reg->seq_hash,reg->seq_names[iseq],iseq);
790 region_t *creg = &reg->regs[iseq];
792 // the regions may not be sorted on input: binary search
793 int i, min = 0, max = creg->nregs - 1;
794 while ( min<=max )
796 i = (max+min)/2;
797 if ( start < creg->regs[i].start ) max = i - 1;
798 else if ( start > creg->regs[i].start ) min = i + 1;
799 else break;
801 if ( min>max || creg->regs[i].start!=start || creg->regs[i].end!=end )
803 // no such region, insert a new one just after max
804 hts_expand(region1_t,creg->nregs+1,creg->mregs,creg->regs);
805 if ( ++max < creg->nregs )
806 memmove(&creg->regs[max+1],&creg->regs[max],(creg->nregs - max)*sizeof(region1_t));
807 creg->regs[max].start = start;
808 creg->regs[max].end = end;
809 creg->nregs++;
813 // File name or a list of genomic locations. If file name, NULL is returned.
814 static bcf_sr_regions_t *_regions_init_string(const char *str)
816 bcf_sr_regions_t *reg = (bcf_sr_regions_t *) calloc(1, sizeof(bcf_sr_regions_t));
817 reg->start = reg->end = -1;
818 reg->prev_start = reg->prev_seq = -1;
820 kstring_t tmp = {0,0,0};
821 const char *sp = str, *ep = str;
822 int from, to;
823 while ( 1 )
825 while ( *ep && *ep!=',' && *ep!=':' ) ep++;
826 tmp.l = 0;
827 kputsn(sp,ep-sp,&tmp);
828 if ( *ep==':' )
830 sp = ep+1;
831 from = hts_parse_decimal(sp,(char**)&ep,0);
832 if ( sp==ep )
834 hts_log_error("Could not parse the region(s): %s", str);
835 free(reg); free(tmp.s); return NULL;
837 if ( !*ep || *ep==',' )
839 _regions_add(reg, tmp.s, from, from);
840 sp = ep;
841 continue;
843 if ( *ep!='-' )
845 hts_log_error("Could not parse the region(s): %s", str);
846 free(reg); free(tmp.s); return NULL;
848 ep++;
849 sp = ep;
850 to = hts_parse_decimal(sp,(char**)&ep,0);
851 if ( *ep && *ep!=',' )
853 hts_log_error("Could not parse the region(s): %s", str);
854 free(reg); free(tmp.s); return NULL;
856 if ( sp==ep ) to = MAX_CSI_COOR-1;
857 _regions_add(reg, tmp.s, from, to);
858 if ( !*ep ) break;
859 sp = ep;
861 else
863 if ( tmp.l ) _regions_add(reg, tmp.s, -1, -1);
864 if ( !*ep ) break;
865 sp = ++ep;
868 free(tmp.s);
869 return reg;
872 // ichr,ifrom,ito are 0-based;
873 // returns -1 on error, 0 if the line is a comment line, 1 on success
874 static int _regions_parse_line(char *line, int ichr,int ifrom,int ito, char **chr,char **chr_end,int *from,int *to)
876 *chr_end = NULL;
878 if ( line[0]=='#' ) return 0;
880 int k,l; // index of the start and end column of the tab-delimited file
881 if ( ifrom <= ito )
882 k = ifrom, l = ito;
883 else
884 l = ifrom, k = ito;
886 int i;
887 char *se = line, *ss = NULL; // start and end
888 char *tmp;
889 for (i=0; i<=k && *se; i++)
891 ss = i==0 ? se++ : ++se;
892 while (*se && *se!='\t') se++;
894 if ( i<=k ) return -1;
895 if ( k==l )
897 *from = *to = hts_parse_decimal(ss, &tmp, 0);
898 if ( tmp==ss ) return -1;
900 else
902 if ( k==ifrom )
903 *from = hts_parse_decimal(ss, &tmp, 0);
904 else
905 *to = hts_parse_decimal(ss, &tmp, 0);
906 if ( ss==tmp ) return -1;
908 for (i=k; i<l && *se; i++)
910 ss = ++se;
911 while (*se && *se!='\t') se++;
913 if ( i<l ) return -1;
914 if ( k==ifrom )
915 *to = hts_parse_decimal(ss, &tmp, 0);
916 else
917 *from = hts_parse_decimal(ss, &tmp, 0);
918 if ( ss==tmp ) return -1;
921 ss = se = line;
922 for (i=0; i<=ichr && *se; i++)
924 if ( i>0 ) ss = ++se;
925 while (*se && *se!='\t') se++;
927 if ( i<=ichr ) return -1;
928 *chr_end = se;
929 *chr = ss;
930 return 1;
933 bcf_sr_regions_t *bcf_sr_regions_init(const char *regions, int is_file, int ichr, int ifrom, int ito)
935 bcf_sr_regions_t *reg;
936 if ( !is_file ) return _regions_init_string(regions);
938 reg = (bcf_sr_regions_t *) calloc(1, sizeof(bcf_sr_regions_t));
939 reg->start = reg->end = -1;
940 reg->prev_start = reg->prev_seq = -1;
942 reg->file = hts_open(regions, "rb");
943 if ( !reg->file )
945 hts_log_error("Could not open file: %s", regions);
946 free(reg);
947 return NULL;
950 reg->tbx = tbx_index_load(regions);
951 if ( !reg->tbx )
953 int len = strlen(regions);
954 int is_bed = strcasecmp(".bed",regions+len-4) ? 0 : 1;
955 if ( !is_bed && !strcasecmp(".bed.gz",regions+len-7) ) is_bed = 1;
957 if ( reg->file->format.format==vcf ) ito = 1;
959 // read the whole file, tabix index is not present
960 while ( hts_getline(reg->file, KS_SEP_LINE, &reg->line) > 0 )
962 char *chr, *chr_end;
963 int from, to, ret;
964 ret = _regions_parse_line(reg->line.s, ichr,ifrom,abs(ito), &chr,&chr_end,&from,&to);
965 if ( ret < 0 )
967 if ( ito<0 )
968 ret = _regions_parse_line(reg->line.s, ichr,ifrom,ifrom, &chr,&chr_end,&from,&to);
969 if ( ret<0 )
971 hts_log_error("Could not parse the file %s, using the columns %d,%d[,%d]",
972 regions,ichr+1,ifrom+1,ito+1);
973 hts_close(reg->file); reg->file = NULL; free(reg);
974 return NULL;
977 if ( !ret ) continue;
978 if ( is_bed ) from++;
979 *chr_end = 0;
980 _regions_add(reg, chr, from, to);
981 *chr_end = '\t';
983 hts_close(reg->file); reg->file = NULL;
984 if ( !reg->nseqs ) { free(reg); return NULL; }
985 return reg;
988 reg->seq_names = (char**) tbx_seqnames(reg->tbx, &reg->nseqs);
989 if ( !reg->seq_hash )
990 reg->seq_hash = khash_str2int_init();
991 int i;
992 for (i=0; i<reg->nseqs; i++)
994 khash_str2int_set(reg->seq_hash,reg->seq_names[i],i);
996 reg->fname = strdup(regions);
997 reg->is_bin = 1;
998 return reg;
1001 void bcf_sr_regions_destroy(bcf_sr_regions_t *reg)
1003 int i;
1004 free(reg->fname);
1005 if ( reg->itr ) tbx_itr_destroy(reg->itr);
1006 if ( reg->tbx ) tbx_destroy(reg->tbx);
1007 if ( reg->file ) hts_close(reg->file);
1008 if ( reg->als ) free(reg->als);
1009 if ( reg->als_str.s ) free(reg->als_str.s);
1010 free(reg->line.s);
1011 if ( reg->regs )
1013 // free only in-memory names, tbx names are const
1014 for (i=0; i<reg->nseqs; i++)
1016 free(reg->seq_names[i]);
1017 free(reg->regs[i].regs);
1020 free(reg->regs);
1021 free(reg->seq_names);
1022 khash_str2int_destroy(reg->seq_hash);
1023 free(reg);
1026 int bcf_sr_regions_seek(bcf_sr_regions_t *reg, const char *seq)
1028 reg->iseq = reg->start = reg->end = -1;
1029 if ( khash_str2int_get(reg->seq_hash, seq, &reg->iseq) < 0 ) return -1; // sequence seq not in regions
1031 // using in-memory regions
1032 if ( reg->regs )
1034 reg->regs[reg->iseq].creg = -1;
1035 return 0;
1038 // reading regions from tabix
1039 if ( reg->itr ) tbx_itr_destroy(reg->itr);
1040 reg->itr = tbx_itr_querys(reg->tbx, seq);
1041 if ( reg->itr ) return 0;
1043 return -1;
1046 int bcf_sr_regions_next(bcf_sr_regions_t *reg)
1048 if ( reg->iseq<0 ) return -1;
1049 reg->start = reg->end = -1;
1050 reg->nals = 0;
1052 // using in-memory regions
1053 if ( reg->regs )
1055 while ( reg->iseq < reg->nseqs )
1057 reg->regs[reg->iseq].creg++;
1058 if ( reg->regs[reg->iseq].creg < reg->regs[reg->iseq].nregs ) break;
1059 reg->iseq++;
1061 if ( reg->iseq >= reg->nseqs ) { reg->iseq = -1; return -1; } // no more regions left
1062 region1_t *creg = &reg->regs[reg->iseq].regs[reg->regs[reg->iseq].creg];
1063 reg->start = creg->start;
1064 reg->end = creg->end;
1065 return 0;
1068 // reading from tabix
1069 char *chr, *chr_end;
1070 int ichr = 0, ifrom = 1, ito = 2, is_bed = 0, from, to;
1071 if ( reg->tbx )
1073 ichr = reg->tbx->conf.sc-1;
1074 ifrom = reg->tbx->conf.bc-1;
1075 ito = reg->tbx->conf.ec-1;
1076 if ( ito<0 ) ito = ifrom;
1077 is_bed = reg->tbx->conf.preset==TBX_UCSC ? 1 : 0;
1080 int ret = 0;
1081 while ( !ret )
1083 if ( reg->itr )
1085 // tabix index present, reading a chromosome block
1086 ret = tbx_itr_next(reg->file, reg->tbx, reg->itr, &reg->line);
1087 if ( ret<0 ) { reg->iseq = -1; return -1; }
1089 else
1091 if ( reg->is_bin )
1093 // Waited for seek which never came. Reopen in text mode and stream
1094 // through the regions, otherwise hts_getline would fail
1095 hts_close(reg->file);
1096 reg->file = hts_open(reg->fname, "r");
1097 if ( !reg->file )
1099 hts_log_error("Could not open file: %s", reg->fname);
1100 reg->file = NULL;
1101 bcf_sr_regions_destroy(reg);
1102 return -1;
1104 reg->is_bin = 0;
1107 // tabix index absent, reading the whole file
1108 ret = hts_getline(reg->file, KS_SEP_LINE, &reg->line);
1109 if ( ret<0 ) { reg->iseq = -1; return -1; }
1111 ret = _regions_parse_line(reg->line.s, ichr,ifrom,ito, &chr,&chr_end,&from,&to);
1112 if ( ret<0 )
1114 hts_log_error("Could not parse the file %s, using the columns %d,%d,%d",
1115 reg->fname,ichr+1,ifrom+1,ito+1);
1116 return -1;
1119 if ( is_bed ) from++;
1121 *chr_end = 0;
1122 if ( khash_str2int_get(reg->seq_hash, chr, &reg->iseq)<0 )
1124 hts_log_error("Broken tabix index? The sequence \"%s\" not in dictionary [%s]",
1125 chr, reg->line.s);
1126 exit(1);
1128 *chr_end = '\t';
1130 reg->start = from - 1;
1131 reg->end = to - 1;
1132 return 0;
1135 static int _regions_match_alleles(bcf_sr_regions_t *reg, int als_idx, bcf1_t *rec)
1137 if ( reg->regs )
1139 // payload is not supported for in-memory regions, switch to regidx instead in future
1140 hts_log_error("Compressed and indexed targets file is required");
1141 exit(1);
1144 int i = 0, max_len = 0;
1145 if ( !reg->nals )
1147 char *ss = reg->line.s;
1148 while ( i<als_idx && *ss )
1150 if ( *ss=='\t' ) i++;
1151 ss++;
1153 char *se = ss;
1154 reg->nals = 1;
1155 while ( *se && *se!='\t' )
1157 if ( *se==',' ) reg->nals++;
1158 se++;
1160 ks_resize(&reg->als_str, se-ss+1+reg->nals);
1161 reg->als_str.l = 0;
1162 hts_expand(char*,reg->nals,reg->mals,reg->als);
1163 reg->nals = 0;
1165 se = ss;
1166 while ( *(++se) )
1168 if ( *se=='\t' ) break;
1169 if ( *se!=',' ) continue;
1170 reg->als[reg->nals] = &reg->als_str.s[reg->als_str.l];
1171 kputsn(ss,se-ss,&reg->als_str);
1172 if ( &reg->als_str.s[reg->als_str.l] - reg->als[reg->nals] > max_len ) max_len = &reg->als_str.s[reg->als_str.l] - reg->als[reg->nals];
1173 reg->als_str.l++;
1174 reg->nals++;
1175 ss = ++se;
1177 reg->als[reg->nals] = &reg->als_str.s[reg->als_str.l];
1178 kputsn(ss,se-ss,&reg->als_str);
1179 if ( &reg->als_str.s[reg->als_str.l] - reg->als[reg->nals] > max_len ) max_len = &reg->als_str.s[reg->als_str.l] - reg->als[reg->nals];
1180 reg->nals++;
1181 reg->als_type = max_len > 1 ? VCF_INDEL : VCF_SNP; // this is a simplified check, see vcf.c:bcf_set_variant_types
1183 int type = bcf_get_variant_types(rec);
1184 if ( reg->als_type & VCF_INDEL )
1185 return type & VCF_INDEL ? 1 : 0;
1186 return !(type & VCF_INDEL) ? 1 : 0;
1189 int bcf_sr_regions_overlap(bcf_sr_regions_t *reg, const char *seq, int start, int end)
1191 int iseq;
1192 if ( khash_str2int_get(reg->seq_hash, seq, &iseq)<0 ) return -1; // no such sequence
1194 if ( reg->prev_seq==-1 || iseq!=reg->prev_seq || reg->prev_start > start ) // new chromosome or after a seek
1196 // flush regions left on previous chromosome
1197 if ( reg->missed_reg_handler && reg->prev_seq!=-1 && reg->iseq!=-1 )
1198 bcf_sr_regions_flush(reg);
1200 bcf_sr_regions_seek(reg, seq);
1201 reg->start = reg->end = -1;
1203 if ( reg->prev_seq==iseq && reg->iseq!=iseq ) return -2; // no more regions on this chromosome
1204 reg->prev_seq = reg->iseq;
1205 reg->prev_start = start;
1207 while ( iseq==reg->iseq && reg->end < start )
1209 if ( bcf_sr_regions_next(reg) < 0 ) return -2; // no more regions left
1210 if ( reg->iseq != iseq ) return -1; // does not overlap any regions
1211 if ( reg->missed_reg_handler && reg->end < start ) reg->missed_reg_handler(reg, reg->missed_reg_data);
1213 if ( reg->start <= end ) return 0; // region overlap
1214 return -1; // no overlap
1217 void bcf_sr_regions_flush(bcf_sr_regions_t *reg)
1219 if ( !reg->missed_reg_handler || reg->prev_seq==-1 ) return;
1220 while ( !bcf_sr_regions_next(reg) ) reg->missed_reg_handler(reg, reg->missed_reg_data);
1221 return;