modified: src1/input.c
[GalaxyCodeBases.git] / c_cpp / lib / htslib / hts.c
blob1c6ca8abe6af85be5644c88516dad41ed019f3c2
1 /* hts.c -- format-neutral I/O, indexing, and iterator API functions.
3 Copyright (C) 2008, 2009, 2012-2017 Genome Research Ltd.
4 Copyright (C) 2012, 2013 Broad Institute.
6 Author: Heng Li <lh3@sanger.ac.uk>
8 Permission is hereby granted, free of charge, to any person obtaining a copy
9 of this software and associated documentation files (the "Software"), to deal
10 in the Software without restriction, including without limitation the rights
11 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12 copies of the Software, and to permit persons to whom the Software is
13 furnished to do so, subject to the following conditions:
15 The above copyright notice and this permission notice shall be included in
16 all copies or substantial portions of the Software.
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
21 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24 DEALINGS IN THE SOFTWARE. */
26 #include <config.h>
28 #include <zlib.h>
29 #include <stdio.h>
30 #include <string.h>
31 #include <strings.h>
32 #include <stdlib.h>
33 #include <limits.h>
34 #include <fcntl.h>
35 #include <errno.h>
36 #include <sys/stat.h>
37 #include <assert.h>
39 #include "htslib/hts.h"
40 #include "htslib/bgzf.h"
41 #include "cram/cram.h"
42 #include "htslib/hfile.h"
43 #include "htslib/hts_endian.h"
44 #include "version.h"
45 #include "hts_internal.h"
46 #include "hfile_internal.h"
48 #include "htslib/khash.h"
49 #include "htslib/kseq.h"
50 #include "htslib/ksort.h"
52 KHASH_INIT2(s2i,, kh_cstr_t, int64_t, 1, kh_str_hash_func, kh_str_hash_equal)
54 int hts_verbose = HTS_LOG_WARNING;
56 const char *hts_version()
58 return HTS_VERSION;
61 const unsigned char seq_nt16_table[256] = {
62 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
63 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
64 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
65 1, 2, 4, 8, 15,15,15,15, 15,15,15,15, 15, 0 /*=*/,15,15,
66 15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15,
67 15,15, 5, 6, 8,15, 7, 9, 15,10,15,15, 15,15,15,15,
68 15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15,
69 15,15, 5, 6, 8,15, 7, 9, 15,10,15,15, 15,15,15,15,
71 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
72 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
73 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
74 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
75 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
76 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
77 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
78 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15
81 const char seq_nt16_str[] = "=ACMGRSVTWYHKDBN";
83 const int seq_nt16_int[] = { 4, 0, 1, 4, 2, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4 };
85 /**********************
86 *** Basic file I/O ***
87 **********************/
89 static enum htsFormatCategory format_category(enum htsExactFormat fmt)
91 switch (fmt) {
92 case bam:
93 case sam:
94 case cram:
95 return sequence_data;
97 case vcf:
98 case bcf:
99 return variant_data;
101 case bai:
102 case crai:
103 case csi:
104 case gzi:
105 case tbi:
106 return index_file;
108 case bed:
109 return region_list;
111 case json:
112 return unknown_category;
114 case unknown_format:
115 case binary_format:
116 case text_format:
117 case format_maximum:
118 break;
121 return unknown_category;
124 // Decompress up to ten or so bytes by peeking at the file, which must be
125 // positioned at the start of a GZIP block.
126 static size_t decompress_peek(hFILE *fp, unsigned char *dest, size_t destsize)
128 // Typically at most a couple of hundred bytes of input are required
129 // to get a few bytes of output from inflate(), so hopefully this buffer
130 // size suffices in general.
131 unsigned char buffer[512];
132 z_stream zs;
133 ssize_t npeek = hpeek(fp, buffer, sizeof buffer);
135 if (npeek < 0) return 0;
137 zs.zalloc = NULL;
138 zs.zfree = NULL;
139 zs.next_in = buffer;
140 zs.avail_in = npeek;
141 zs.next_out = dest;
142 zs.avail_out = destsize;
143 if (inflateInit2(&zs, 31) != Z_OK) return 0;
145 while (zs.total_out < destsize)
146 if (inflate(&zs, Z_SYNC_FLUSH) != Z_OK) break;
148 destsize = zs.total_out;
149 inflateEnd(&zs);
151 return destsize;
154 // Parse "x.y" text, taking care because the string is not NUL-terminated
155 // and filling in major/minor only when the digits are followed by a delimiter,
156 // so we don't misread "1.10" as "1.1" due to reaching the end of the buffer.
157 static void
158 parse_version(htsFormat *fmt, const unsigned char *u, const unsigned char *ulim)
160 const char *s = (const char *) u;
161 const char *slim = (const char *) ulim;
162 short v;
164 fmt->version.major = fmt->version.minor = -1;
166 for (v = 0; s < slim && isdigit_c(*s); s++)
167 v = 10 * v + *s - '0';
169 if (s < slim) {
170 fmt->version.major = v;
171 if (*s == '.') {
172 s++;
173 for (v = 0; s < slim && isdigit_c(*s); s++)
174 v = 10 * v + *s - '0';
175 if (s < slim)
176 fmt->version.minor = v;
178 else
179 fmt->version.minor = 0;
183 static int
184 cmp_nonblank(const char *key, const unsigned char *u, const unsigned char *ulim)
186 const unsigned char *ukey = (const unsigned char *) key;
188 while (*ukey)
189 if (u >= ulim) return +1;
190 else if (isspace_c(*u)) u++;
191 else if (*u != *ukey) return (*ukey < *u)? -1 : +1;
192 else u++, ukey++;
194 return 0;
197 int hts_detect_format(hFILE *hfile, htsFormat *fmt)
199 unsigned char s[21];
200 ssize_t len = hpeek(hfile, s, 18);
201 if (len < 0) return -1;
203 if (len >= 2 && s[0] == 0x1f && s[1] == 0x8b) {
204 // The stream is either gzip-compressed or BGZF-compressed.
205 // Determine which, and decompress the first few bytes.
206 fmt->compression = (len >= 18 && (s[3] & 4) &&
207 memcmp(&s[12], "BC\2\0", 4) == 0)? bgzf : gzip;
208 len = decompress_peek(hfile, s, sizeof s);
210 else {
211 fmt->compression = no_compression;
212 len = hpeek(hfile, s, sizeof s);
214 if (len < 0) return -1;
216 fmt->compression_level = -1;
217 fmt->specific = NULL;
219 if (len >= 6 && memcmp(s,"CRAM",4) == 0 && s[4]>=1 && s[4]<=3 && s[5]<=1) {
220 fmt->category = sequence_data;
221 fmt->format = cram;
222 fmt->version.major = s[4], fmt->version.minor = s[5];
223 fmt->compression = custom;
224 return 0;
226 else if (len >= 4 && s[3] <= '\4') {
227 if (memcmp(s, "BAM\1", 4) == 0) {
228 fmt->category = sequence_data;
229 fmt->format = bam;
230 // TODO Decompress enough to pick version from @HD-VN header
231 fmt->version.major = 1, fmt->version.minor = -1;
232 return 0;
234 else if (memcmp(s, "BAI\1", 4) == 0) {
235 fmt->category = index_file;
236 fmt->format = bai;
237 fmt->version.major = -1, fmt->version.minor = -1;
238 return 0;
240 else if (memcmp(s, "BCF\4", 4) == 0) {
241 fmt->category = variant_data;
242 fmt->format = bcf;
243 fmt->version.major = 1, fmt->version.minor = -1;
244 return 0;
246 else if (memcmp(s, "BCF\2", 4) == 0) {
247 fmt->category = variant_data;
248 fmt->format = bcf;
249 fmt->version.major = s[3];
250 fmt->version.minor = (len >= 5 && s[4] <= 2)? s[4] : 0;
251 return 0;
253 else if (memcmp(s, "CSI\1", 4) == 0) {
254 fmt->category = index_file;
255 fmt->format = csi;
256 fmt->version.major = 1, fmt->version.minor = -1;
257 return 0;
259 else if (memcmp(s, "TBI\1", 4) == 0) {
260 fmt->category = index_file;
261 fmt->format = tbi;
262 fmt->version.major = -1, fmt->version.minor = -1;
263 return 0;
266 else if (len >= 16 && memcmp(s, "##fileformat=VCF", 16) == 0) {
267 fmt->category = variant_data;
268 fmt->format = vcf;
269 if (len >= 21 && s[16] == 'v')
270 parse_version(fmt, &s[17], &s[len]);
271 else
272 fmt->version.major = fmt->version.minor = -1;
273 return 0;
275 else if (len >= 4 && s[0] == '@' &&
276 (memcmp(s, "@HD\t", 4) == 0 || memcmp(s, "@SQ\t", 4) == 0 ||
277 memcmp(s, "@RG\t", 4) == 0 || memcmp(s, "@PG\t", 4) == 0)) {
278 fmt->category = sequence_data;
279 fmt->format = sam;
280 // @HD-VN is not guaranteed to be the first tag, but then @HD is
281 // not guaranteed to be present at all...
282 if (len >= 9 && memcmp(s, "@HD\tVN:", 7) == 0)
283 parse_version(fmt, &s[7], &s[len]);
284 else
285 fmt->version.major = 1, fmt->version.minor = -1;
286 return 0;
288 else if (cmp_nonblank("{\"", s, &s[len]) == 0) {
289 fmt->category = unknown_category;
290 fmt->format = json;
291 fmt->version.major = fmt->version.minor = -1;
292 return 0;
294 else {
295 // Various possibilities for tab-delimited text:
296 // .crai (gzipped tab-delimited six columns: seqid 5*number)
297 // .bed ([3..12] tab-delimited columns)
298 // .bedpe (>= 10 tab-delimited columns)
299 // .sam (tab-delimited >= 11 columns: seqid number seqid...)
300 // FIXME For now, assume it's SAM
301 fmt->category = sequence_data;
302 fmt->format = sam;
303 fmt->version.major = 1, fmt->version.minor = -1;
304 return 0;
307 fmt->category = unknown_category;
308 fmt->format = unknown_format;
309 fmt->version.major = fmt->version.minor = -1;
310 fmt->compression = no_compression;
311 return 0;
314 char *hts_format_description(const htsFormat *format)
316 kstring_t str = { 0, 0, NULL };
318 switch (format->format) {
319 case sam: kputs("SAM", &str); break;
320 case bam: kputs("BAM", &str); break;
321 case cram: kputs("CRAM", &str); break;
322 case vcf: kputs("VCF", &str); break;
323 case bcf:
324 if (format->version.major == 1) kputs("Legacy BCF", &str);
325 else kputs("BCF", &str);
326 break;
327 case bai: kputs("BAI", &str); break;
328 case crai: kputs("CRAI", &str); break;
329 case csi: kputs("CSI", &str); break;
330 case tbi: kputs("Tabix", &str); break;
331 case json: kputs("JSON", &str); break;
332 default: kputs("unknown", &str); break;
335 if (format->version.major >= 0) {
336 kputs(" version ", &str);
337 kputw(format->version.major, &str);
338 if (format->version.minor >= 0) {
339 kputc('.', &str);
340 kputw(format->version.minor, &str);
344 switch (format->compression) {
345 case custom: kputs(" compressed", &str); break;
346 case gzip: kputs(" gzip-compressed", &str); break;
347 case bgzf:
348 switch (format->format) {
349 case bam:
350 case bcf:
351 case csi:
352 case tbi:
353 // These are by definition BGZF, so just use the generic term
354 kputs(" compressed", &str);
355 break;
356 default:
357 kputs(" BGZF-compressed", &str);
358 break;
360 break;
361 default: break;
364 switch (format->category) {
365 case sequence_data: kputs(" sequence", &str); break;
366 case variant_data: kputs(" variant calling", &str); break;
367 case index_file: kputs(" index", &str); break;
368 case region_list: kputs(" genomic region", &str); break;
369 default: break;
372 if (format->compression == no_compression)
373 switch (format->format) {
374 case sam:
375 case crai:
376 case vcf:
377 case bed:
378 case json:
379 kputs(" text", &str);
380 break;
382 default:
383 kputs(" data", &str);
384 break;
386 else
387 kputs(" data", &str);
389 return ks_release(&str);
392 htsFile *hts_open_format(const char *fn, const char *mode, const htsFormat *fmt)
394 char smode[102], *cp, *cp2, *mode_c;
395 htsFile *fp = NULL;
396 hFILE *hfile;
397 char fmt_code = '\0';
399 strncpy(smode, mode, 100);
400 smode[100]=0;
401 if ((cp = strchr(smode, ',')))
402 *cp = '\0';
404 // Migrate format code (b or c) to the end of the smode buffer.
405 for (cp2 = cp = smode; *cp; cp++) {
406 if (*cp == 'b')
407 fmt_code = 'b';
408 else if (*cp == 'c')
409 fmt_code = 'c';
410 else
411 *cp2++ = *cp;
413 mode_c = cp2;
414 *cp2++ = fmt_code;
415 *cp2++ = 0;
416 *cp2++ = 0;
418 // Set or reset the format code if opts->format is used
419 if (fmt && fmt->format != unknown_format)
420 *mode_c = "\0g\0\0b\0c\0\0b\0g\0\0"[fmt->format];
422 hfile = hopen(fn, smode);
423 if (hfile == NULL) goto error;
425 fp = hts_hopen(hfile, fn, smode);
426 if (fp == NULL) goto error;
428 if (fmt && fmt->specific)
429 if (hts_opt_apply(fp, fmt->specific) != 0)
430 goto error;
432 return fp;
434 error:
435 hts_log_error("Failed to open file %s", fn);
437 if (hfile)
438 hclose_abruptly(hfile);
440 return NULL;
443 htsFile *hts_open(const char *fn, const char *mode) {
444 return hts_open_format(fn, mode, NULL);
448 * Splits str into a prefix, delimiter ('\0' or delim), and suffix, writing
449 * the prefix in lowercase into buf and returning a pointer to the suffix.
450 * On return, buf is always NUL-terminated; thus assumes that the "keyword"
451 * prefix should be one of several known values of maximum length buflen-2.
452 * (If delim is not found, returns a pointer to the '\0'.)
454 static const char *
455 scan_keyword(const char *str, char delim, char *buf, size_t buflen)
457 size_t i = 0;
458 while (*str && *str != delim) {
459 if (i < buflen-1) buf[i++] = tolower_c(*str);
460 str++;
463 buf[i] = '\0';
464 return *str? str+1 : str;
468 * Parses arg and appends it to the option list.
470 * Returns 0 on success;
471 * -1 on failure.
473 int hts_opt_add(hts_opt **opts, const char *c_arg) {
474 hts_opt *o, *t;
475 char *val;
478 * IMPORTANT!!!
479 * If you add another string option here, don't forget to also add
480 * it to the case statement in hts_opt_apply.
483 if (!c_arg)
484 return -1;
486 if (!(o = malloc(sizeof(*o))))
487 return -1;
489 if (!(o->arg = strdup(c_arg))) {
490 free(o);
491 return -1;
494 if (!(val = strchr(o->arg, '=')))
495 val = "1"; // assume boolean
496 else
497 *val++ = '\0';
499 if (strcmp(o->arg, "decode_md") == 0 ||
500 strcmp(o->arg, "DECODE_MD") == 0)
501 o->opt = CRAM_OPT_DECODE_MD, o->val.i = atoi(val);
503 else if (strcmp(o->arg, "verbosity") == 0 ||
504 strcmp(o->arg, "VERBOSITY") == 0)
505 o->opt = CRAM_OPT_VERBOSITY, o->val.i = atoi(val);
507 else if (strcmp(o->arg, "seqs_per_slice") == 0 ||
508 strcmp(o->arg, "SEQS_PER_SLICE") == 0)
509 o->opt = CRAM_OPT_SEQS_PER_SLICE, o->val.i = atoi(val);
511 else if (strcmp(o->arg, "bases_per_slice") == 0 ||
512 strcmp(o->arg, "BASES_PER_SLICE") == 0)
513 o->opt = CRAM_OPT_BASES_PER_SLICE, o->val.i = atoi(val);
515 else if (strcmp(o->arg, "slices_per_container") == 0 ||
516 strcmp(o->arg, "SLICES_PER_CONTAINER") == 0)
517 o->opt = CRAM_OPT_SLICES_PER_CONTAINER, o->val.i = atoi(val);
519 else if (strcmp(o->arg, "embed_ref") == 0 ||
520 strcmp(o->arg, "EMBED_REF") == 0)
521 o->opt = CRAM_OPT_EMBED_REF, o->val.i = atoi(val);
523 else if (strcmp(o->arg, "no_ref") == 0 ||
524 strcmp(o->arg, "NO_REF") == 0)
525 o->opt = CRAM_OPT_NO_REF, o->val.i = atoi(val);
527 else if (strcmp(o->arg, "ignore_md5") == 0 ||
528 strcmp(o->arg, "IGNORE_MD5") == 0)
529 o->opt = CRAM_OPT_IGNORE_MD5, o->val.i = atoi(val);
531 else if (strcmp(o->arg, "use_bzip2") == 0 ||
532 strcmp(o->arg, "USE_BZIP2") == 0)
533 o->opt = CRAM_OPT_USE_BZIP2, o->val.i = atoi(val);
535 else if (strcmp(o->arg, "use_rans") == 0 ||
536 strcmp(o->arg, "USE_RANS") == 0)
537 o->opt = CRAM_OPT_USE_RANS, o->val.i = atoi(val);
539 else if (strcmp(o->arg, "use_lzma") == 0 ||
540 strcmp(o->arg, "USE_LZMA") == 0)
541 o->opt = CRAM_OPT_USE_LZMA, o->val.i = atoi(val);
543 else if (strcmp(o->arg, "reference") == 0 ||
544 strcmp(o->arg, "REFERENCE") == 0)
545 o->opt = CRAM_OPT_REFERENCE, o->val.s = val;
547 else if (strcmp(o->arg, "version") == 0 ||
548 strcmp(o->arg, "VERSION") == 0)
549 o->opt = CRAM_OPT_VERSION, o->val.s =val;
551 else if (strcmp(o->arg, "multi_seq_per_slice") == 0 ||
552 strcmp(o->arg, "MULTI_SEQ_PER_SLICE") == 0)
553 o->opt = CRAM_OPT_MULTI_SEQ_PER_SLICE, o->val.i = atoi(val);
555 else if (strcmp(o->arg, "nthreads") == 0 ||
556 strcmp(o->arg, "NTHREADS") == 0)
557 o->opt = HTS_OPT_NTHREADS, o->val.i = atoi(val);
559 else if (strcmp(o->arg, "cache_size") == 0 ||
560 strcmp(o->arg, "CACHE_SIZE") == 0) {
561 char *endp;
562 o->opt = HTS_OPT_CACHE_SIZE;
563 o->val.i = strtol(val, &endp, 0);
564 // NB: Doesn't support floats, eg 1.5g
565 // TODO: extend hts_parse_decimal? See also samtools sort.
566 switch (*endp) {
567 case 'g': case 'G': o->val.i *= 1024;
568 case 'm': case 'M': o->val.i *= 1024;
569 case 'k': case 'K': o->val.i *= 1024; break;
570 case '\0': break;
571 default:
572 hts_log_error("Unrecognised cache size suffix '%c'", *endp);
573 free(o->arg);
574 free(o);
575 return -1;
579 else if (strcmp(o->arg, "required_fields") == 0 ||
580 strcmp(o->arg, "REQUIRED_FIELDS") == 0)
581 o->opt = CRAM_OPT_REQUIRED_FIELDS, o->val.i = strtol(val, NULL, 0);
583 else if (strcmp(o->arg, "lossy_names") == 0 ||
584 strcmp(o->arg, "LOSSY_NAMES") == 0)
585 o->opt = CRAM_OPT_LOSSY_NAMES, o->val.i = strtol(val, NULL, 0);
587 else if (strcmp(o->arg, "name_prefix") == 0 ||
588 strcmp(o->arg, "NAME_PREFIX") == 0)
589 o->opt = CRAM_OPT_PREFIX, o->val.s = val;
591 else if (strcmp(o->arg, "block_size") == 0 ||
592 strcmp(o->arg, "BLOCK_SIZE") == 0)
593 o->opt = HTS_OPT_BLOCK_SIZE, o->val.i = strtol(val, NULL, 0);
595 else {
596 hts_log_error("Unknown option '%s'", o->arg);
597 free(o->arg);
598 free(o);
599 return -1;
602 o->next = NULL;
604 // Append; assumes small list.
605 if (*opts) {
606 t = *opts;
607 while (t->next)
608 t = t->next;
609 t->next = o;
610 } else {
611 *opts = o;
614 return 0;
618 * Applies an hts_opt option list to a given htsFile.
620 * Returns 0 on success
621 * -1 on failure
623 int hts_opt_apply(htsFile *fp, hts_opt *opts) {
624 hts_opt *last = NULL;
626 for (; opts; opts = (last=opts)->next) {
627 switch (opts->opt) {
628 case CRAM_OPT_REFERENCE:
629 case CRAM_OPT_VERSION:
630 case CRAM_OPT_PREFIX:
631 if (hts_set_opt(fp, opts->opt, opts->val.s) != 0)
632 return -1;
633 break;
634 default:
635 if (hts_set_opt(fp, opts->opt, opts->val.i) != 0)
636 return -1;
637 break;
641 return 0;
645 * Frees an hts_opt list.
647 void hts_opt_free(hts_opt *opts) {
648 hts_opt *last = NULL;
649 while (opts) {
650 opts = (last=opts)->next;
651 free(last->arg);
652 free(last);
658 * Tokenise options as (key(=value)?,)*(key(=value)?)?
659 * NB: No provision for ',' appearing in the value!
660 * Add backslashing rules?
662 * This could be used as part of a general command line option parser or
663 * as a string concatenated onto the file open mode.
665 * Returns 0 on success
666 * -1 on failure.
668 int hts_parse_opt_list(htsFormat *fmt, const char *str) {
669 while (str && *str) {
670 const char *str_start;
671 int len;
672 char arg[8001];
674 while (*str && *str == ',')
675 str++;
677 for (str_start = str; *str && *str != ','; str++);
678 len = str - str_start;
680 // Produce a nul terminated copy of the option
681 strncpy(arg, str_start, len < 8000 ? len : 8000);
682 arg[len < 8000 ? len : 8000] = '\0';
684 if (hts_opt_add((hts_opt **)&fmt->specific, arg) != 0)
685 return -1;
687 if (*str)
688 str++;
691 return 0;
695 * Accepts a string file format (sam, bam, cram, vcf, bam) optionally
696 * followed by a comma separated list of key=value options and splits
697 * these up into the fields of htsFormat struct.
699 * format is assumed to be already initialised, either to blank
700 * "unknown" values or via previous hts_opt_add calls.
702 * Returns 0 on success
703 * -1 on failure.
705 int hts_parse_format(htsFormat *format, const char *str) {
706 char fmt[8];
707 const char *cp = scan_keyword(str, ',', fmt, sizeof fmt);
709 format->version.minor = 0; // unknown
710 format->version.major = 0; // unknown
712 if (strcmp(fmt, "sam") == 0) {
713 format->category = sequence_data;
714 format->format = sam;
715 format->compression = no_compression;;
716 format->compression_level = 0;
717 } else if (strcmp(fmt, "bam") == 0) {
718 format->category = sequence_data;
719 format->format = bam;
720 format->compression = bgzf;
721 format->compression_level = -1;
722 } else if (strcmp(fmt, "cram") == 0) {
723 format->category = sequence_data;
724 format->format = cram;
725 format->compression = custom;
726 format->compression_level = -1;
727 } else if (strcmp(fmt, "vcf") == 0) {
728 format->category = variant_data;
729 format->format = vcf;
730 format->compression = no_compression;;
731 format->compression_level = 0;
732 } else if (strcmp(fmt, "bcf") == 0) {
733 format->category = variant_data;
734 format->format = bcf;
735 format->compression = bgzf;
736 format->compression_level = -1;
737 } else {
738 return -1;
741 return hts_parse_opt_list(format, cp);
746 * Tokenise options as (key(=value)?,)*(key(=value)?)?
747 * NB: No provision for ',' appearing in the value!
748 * Add backslashing rules?
750 * This could be used as part of a general command line option parser or
751 * as a string concatenated onto the file open mode.
753 * Returns 0 on success
754 * -1 on failure.
756 static int hts_process_opts(htsFile *fp, const char *opts) {
757 htsFormat fmt;
759 fmt.specific = NULL;
760 if (hts_parse_opt_list(&fmt, opts) != 0)
761 return -1;
763 if (hts_opt_apply(fp, fmt.specific) != 0) {
764 hts_opt_free(fmt.specific);
765 return -1;
768 hts_opt_free(fmt.specific);
770 return 0;
774 htsFile *hts_hopen(hFILE *hfile, const char *fn, const char *mode)
776 hFILE *hfile_orig = hfile;
777 htsFile *fp = (htsFile*)calloc(1, sizeof(htsFile));
778 char simple_mode[101], *cp, *opts;
779 simple_mode[100] = '\0';
781 if (fp == NULL) goto error;
783 fp->fn = strdup(fn);
784 fp->is_be = ed_is_big();
786 // Split mode into simple_mode,opts strings
787 if ((cp = strchr(mode, ','))) {
788 strncpy(simple_mode, mode, cp-mode <= 100 ? cp-mode : 100);
789 simple_mode[cp-mode] = '\0';
790 opts = cp+1;
791 } else {
792 strncpy(simple_mode, mode, 100);
793 opts = NULL;
796 if (strchr(simple_mode, 'r')) {
797 if (hts_detect_format(hfile, &fp->format) < 0) goto error;
799 if (fp->format.format == json) {
800 hFILE *hfile2 = hopen_json_redirect(hfile, simple_mode);
801 if (hfile2 == NULL) goto error;
803 // Build fp against the result of the redirection
804 hfile = hfile2;
805 if (hts_detect_format(hfile, &fp->format) < 0) goto error;
808 else if (strchr(simple_mode, 'w') || strchr(simple_mode, 'a')) {
809 htsFormat *fmt = &fp->format;
810 fp->is_write = 1;
812 if (strchr(simple_mode, 'b')) fmt->format = binary_format;
813 else if (strchr(simple_mode, 'c')) fmt->format = cram;
814 else fmt->format = text_format;
816 if (strchr(simple_mode, 'z')) fmt->compression = bgzf;
817 else if (strchr(simple_mode, 'g')) fmt->compression = gzip;
818 else if (strchr(simple_mode, 'u')) fmt->compression = no_compression;
819 else {
820 // No compression mode specified, set to the default for the format
821 switch (fmt->format) {
822 case binary_format: fmt->compression = bgzf; break;
823 case cram: fmt->compression = custom; break;
824 case text_format: fmt->compression = no_compression; break;
825 default: abort();
829 // Fill in category (if determinable; e.g. 'b' could be BAM or BCF)
830 fmt->category = format_category(fmt->format);
832 fmt->version.major = fmt->version.minor = -1;
833 fmt->compression_level = -1;
834 fmt->specific = NULL;
836 else { errno = EINVAL; goto error; }
838 switch (fp->format.format) {
839 case binary_format:
840 case bam:
841 case bcf:
842 fp->fp.bgzf = bgzf_hopen(hfile, simple_mode);
843 if (fp->fp.bgzf == NULL) goto error;
844 fp->is_bin = fp->is_bgzf = 1;
845 break;
847 case cram:
848 fp->fp.cram = cram_dopen(hfile, fn, simple_mode);
849 if (fp->fp.cram == NULL) goto error;
850 if (!fp->is_write)
851 cram_set_option(fp->fp.cram, CRAM_OPT_DECODE_MD, 1);
852 fp->is_cram = 1;
853 break;
855 case text_format:
856 case sam:
857 case vcf:
858 if (fp->format.compression != no_compression) {
859 fp->fp.bgzf = bgzf_hopen(hfile, simple_mode);
860 if (fp->fp.bgzf == NULL) goto error;
861 fp->is_bgzf = 1;
863 else
864 fp->fp.hfile = hfile;
865 break;
867 default:
868 errno = ENOEXEC;
869 goto error;
872 if (opts)
873 hts_process_opts(fp, opts);
875 // If redirecting, close the original hFILE now (pedantically we would
876 // instead close it in hts_close(), but this a simplifying optimisation)
877 if (hfile != hfile_orig) hclose_abruptly(hfile_orig);
879 return fp;
881 error:
882 hts_log_error("Failed to open file %s", fn);
884 // If redirecting, close the failed redirection hFILE that we have opened
885 if (hfile != hfile_orig) hclose_abruptly(hfile);
887 if (fp) {
888 free(fp->fn);
889 free(fp->fn_aux);
890 free(fp);
892 return NULL;
895 int hts_close(htsFile *fp)
897 int ret, save;
899 switch (fp->format.format) {
900 case binary_format:
901 case bam:
902 case bcf:
903 ret = bgzf_close(fp->fp.bgzf);
904 break;
906 case cram:
907 if (!fp->is_write) {
908 switch (cram_eof(fp->fp.cram)) {
909 case 2:
910 hts_log_warning("EOF marker is absent. The input is probably truncated");
911 break;
912 case 0: /* not at EOF, but may not have wanted all seqs */
913 default: /* case 1, expected EOF */
914 break;
917 ret = cram_close(fp->fp.cram);
918 break;
920 case text_format:
921 case sam:
922 case vcf:
923 if (fp->format.compression != no_compression)
924 ret = bgzf_close(fp->fp.bgzf);
925 else
926 ret = hclose(fp->fp.hfile);
927 break;
929 default:
930 ret = -1;
931 break;
934 save = errno;
935 free(fp->fn);
936 free(fp->fn_aux);
937 free(fp->line.s);
938 free(fp);
939 errno = save;
940 return ret;
943 const htsFormat *hts_get_format(htsFile *fp)
945 return fp? &fp->format : NULL;
948 const char *hts_format_file_extension(const htsFormat *format) {
949 if (!format)
950 return "?";
952 switch (format->format) {
953 case sam: return "sam";
954 case bam: return "bam";
955 case bai: return "bai";
956 case cram: return "cram";
957 case crai: return "crai";
958 case vcf: return "vcf";
959 case bcf: return "bcf";
960 case csi: return "csi";
961 case gzi: return "gzi";
962 case tbi: return "tbi";
963 case bed: return "bed";
964 default: return "?";
968 static hFILE *hts_hfile(htsFile *fp) {
969 switch (fp->format.format) {
970 case binary_format: // fall through; still valid if bcf?
971 case bam: return bgzf_hfile(fp->fp.bgzf);
972 case cram: return cram_hfile(fp->fp.cram);
973 case text_format: return fp->fp.hfile;
974 case sam: return fp->fp.hfile;
975 default: return NULL;
979 int hts_set_opt(htsFile *fp, enum hts_fmt_option opt, ...) {
980 int r;
981 va_list args;
983 switch (opt) {
984 case HTS_OPT_NTHREADS: {
985 va_start(args, opt);
986 int nthreads = va_arg(args, int);
987 va_end(args);
988 return hts_set_threads(fp, nthreads);
991 case HTS_OPT_BLOCK_SIZE: {
992 hFILE *hf = hts_hfile(fp);
994 if (hf) {
995 va_start(args, opt);
996 if (hfile_set_blksize(hf, va_arg(args, int)) != 0 && hts_verbose >= 2)
997 fprintf(stderr, "[W::%s] Failed to change block size\n", __func__);
998 va_end(args);
999 } else if (hts_verbose >= 2)
1000 // To do - implement for vcf/bcf.
1001 fprintf(stderr, "[W::%s] cannot change block size for this format\n", __func__);
1004 return 0;
1007 case HTS_OPT_THREAD_POOL: {
1008 va_start(args, opt);
1009 htsThreadPool *p = va_arg(args, htsThreadPool *);
1010 va_end(args);
1011 return hts_set_thread_pool(fp, p);
1014 case HTS_OPT_CACHE_SIZE: {
1015 va_start(args, opt);
1016 int cache_size = va_arg(args, int);
1017 va_end(args);
1018 hts_set_cache_size(fp, cache_size);
1019 return 0;
1022 default:
1023 break;
1026 if (fp->format.format != cram)
1027 return 0;
1029 va_start(args, opt);
1030 r = cram_set_voption(fp->fp.cram, opt, args);
1031 va_end(args);
1033 return r;
1036 BGZF *hts_get_bgzfp(htsFile *fp);
1038 int hts_set_threads(htsFile *fp, int n)
1040 if (fp->format.compression == bgzf) {
1041 return bgzf_mt(hts_get_bgzfp(fp), n, 256/*unused*/);
1042 } else if (fp->format.format == cram) {
1043 return hts_set_opt(fp, CRAM_OPT_NTHREADS, n);
1045 else return 0;
1048 int hts_set_thread_pool(htsFile *fp, htsThreadPool *p) {
1049 if (fp->format.compression == bgzf) {
1050 return bgzf_thread_pool(hts_get_bgzfp(fp), p->pool, p->qsize);
1051 } else if (fp->format.format == cram) {
1052 return hts_set_opt(fp, CRAM_OPT_THREAD_POOL, p);
1054 else return 0;
1057 void hts_set_cache_size(htsFile *fp, int n)
1059 if (fp->format.compression == bgzf)
1060 bgzf_set_cache_size(hts_get_bgzfp(fp), n);
1063 int hts_set_fai_filename(htsFile *fp, const char *fn_aux)
1065 free(fp->fn_aux);
1066 if (fn_aux) {
1067 fp->fn_aux = strdup(fn_aux);
1068 if (fp->fn_aux == NULL) return -1;
1070 else fp->fn_aux = NULL;
1072 if (fp->format.format == cram)
1073 if (cram_set_option(fp->fp.cram, CRAM_OPT_REFERENCE, fp->fn_aux))
1074 return -1;
1076 return 0;
1079 // For VCF/BCF backward sweeper. Not exposing these functions because their
1080 // future is uncertain. Things will probably have to change with hFILE...
1081 BGZF *hts_get_bgzfp(htsFile *fp)
1083 if (fp->is_bgzf)
1084 return fp->fp.bgzf;
1085 else
1086 return NULL;
1088 int hts_useek(htsFile *fp, long uoffset, int where)
1090 if (fp->is_bgzf)
1091 return bgzf_useek(fp->fp.bgzf, uoffset, where);
1092 else
1093 return (hseek(fp->fp.hfile, uoffset, SEEK_SET) >= 0)? 0 : -1;
1095 long hts_utell(htsFile *fp)
1097 if (fp->is_bgzf)
1098 return bgzf_utell(fp->fp.bgzf);
1099 else
1100 return htell(fp->fp.hfile);
1103 int hts_getline(htsFile *fp, int delimiter, kstring_t *str)
1105 int ret;
1106 if (! (delimiter == KS_SEP_LINE || delimiter == '\n')) {
1107 hts_log_error("Unexpected delimiter %d", delimiter);
1108 abort();
1111 switch (fp->format.compression) {
1112 case no_compression:
1113 str->l = 0;
1114 ret = kgetline(str, (kgets_func *) hgets, fp->fp.hfile);
1115 if (ret >= 0) ret = str->l;
1116 else if (herrno(fp->fp.hfile)) ret = -2, errno = herrno(fp->fp.hfile);
1117 else ret = -1;
1118 break;
1120 case gzip:
1121 case bgzf:
1122 ret = bgzf_getline(fp->fp.bgzf, '\n', str);
1123 break;
1125 default:
1126 abort();
1129 ++fp->lineno;
1130 return ret;
1133 char **hts_readlist(const char *string, int is_file, int *_n)
1135 int m = 0, n = 0;
1136 char **s = 0;
1137 if ( is_file )
1139 BGZF *fp = bgzf_open(string, "r");
1140 if ( !fp ) return NULL;
1142 kstring_t str;
1143 str.s = 0; str.l = str.m = 0;
1144 while (bgzf_getline(fp, '\n', &str) >= 0)
1146 if (str.l == 0) continue;
1147 n++;
1148 hts_expand(char*,n,m,s);
1149 s[n-1] = strdup(str.s);
1151 bgzf_close(fp);
1152 free(str.s);
1154 else
1156 const char *q = string, *p = string;
1157 while ( 1 )
1159 if (*p == ',' || *p == 0)
1161 n++;
1162 hts_expand(char*,n,m,s);
1163 s[n-1] = (char*)calloc(p - q + 1, 1);
1164 strncpy(s[n-1], q, p - q);
1165 q = p + 1;
1167 if ( !*p ) break;
1168 p++;
1171 s = (char**)realloc(s, n * sizeof(char*));
1172 *_n = n;
1173 return s;
1176 char **hts_readlines(const char *fn, int *_n)
1178 int m = 0, n = 0;
1179 char **s = 0;
1180 BGZF *fp = bgzf_open(fn, "r");
1181 if ( fp ) { // read from file
1182 kstring_t str;
1183 str.s = 0; str.l = str.m = 0;
1184 while (bgzf_getline(fp, '\n', &str) >= 0) {
1185 if (str.l == 0) continue;
1186 if (m == n) {
1187 m = m? m<<1 : 16;
1188 s = (char**)realloc(s, m * sizeof(char*));
1190 s[n++] = strdup(str.s);
1192 bgzf_close(fp);
1193 s = (char**)realloc(s, n * sizeof(char*));
1194 free(str.s);
1195 } else if (*fn == ':') { // read from string
1196 const char *q, *p;
1197 for (q = p = fn + 1;; ++p)
1198 if (*p == ',' || *p == 0) {
1199 if (m == n) {
1200 m = m? m<<1 : 16;
1201 s = (char**)realloc(s, m * sizeof(char*));
1203 s[n] = (char*)calloc(p - q + 1, 1);
1204 strncpy(s[n++], q, p - q);
1205 q = p + 1;
1206 if (*p == 0) break;
1208 } else return 0;
1209 s = (char**)realloc(s, n * sizeof(char*));
1210 *_n = n;
1211 return s;
1214 // DEPRECATED: To be removed in a future HTSlib release
1215 int hts_file_type(const char *fname)
1217 int len = strlen(fname);
1218 if ( !strcasecmp(".vcf.gz",fname+len-7) ) return FT_VCF_GZ;
1219 if ( !strcasecmp(".vcf",fname+len-4) ) return FT_VCF;
1220 if ( !strcasecmp(".bcf",fname+len-4) ) return FT_BCF_GZ;
1221 if ( !strcmp("-",fname) ) return FT_STDIN;
1223 hFILE *f = hopen(fname, "r");
1224 if (f == NULL) return 0;
1226 htsFormat fmt;
1227 if (hts_detect_format(f, &fmt) < 0) { hclose_abruptly(f); return 0; }
1228 if (hclose(f) < 0) return 0;
1230 switch (fmt.format) {
1231 case vcf: return (fmt.compression == no_compression)? FT_VCF : FT_VCF_GZ;
1232 case bcf: return (fmt.compression == no_compression)? FT_BCF : FT_BCF_GZ;
1233 default: return 0;
1237 int hts_check_EOF(htsFile *fp)
1239 if (fp->format.compression == bgzf)
1240 return bgzf_check_EOF(hts_get_bgzfp(fp));
1241 else if (fp->format.format == cram)
1242 return cram_check_EOF(fp->fp.cram);
1243 else
1244 return 3;
1248 /****************
1249 *** Indexing ***
1250 ****************/
1252 #define HTS_MIN_MARKER_DIST 0x10000
1254 // Finds the special meta bin
1255 // ((1<<(3 * n_lvls + 3)) - 1) / 7 + 1
1256 #define META_BIN(idx) ((idx)->n_bins + 1)
1258 #define pair64_lt(a,b) ((a).u < (b).u)
1260 KSORT_INIT(_off, hts_pair64_t, pair64_lt)
1262 typedef struct {
1263 int32_t m, n;
1264 uint64_t loff;
1265 hts_pair64_t *list;
1266 } bins_t;
1268 KHASH_MAP_INIT_INT(bin, bins_t)
1269 typedef khash_t(bin) bidx_t;
1271 typedef struct {
1272 int32_t n, m;
1273 uint64_t *offset;
1274 } lidx_t;
1276 struct __hts_idx_t {
1277 int fmt, min_shift, n_lvls, n_bins;
1278 uint32_t l_meta;
1279 int32_t n, m;
1280 uint64_t n_no_coor;
1281 bidx_t **bidx;
1282 lidx_t *lidx;
1283 uint8_t *meta; // MUST have a terminating NUL on the end
1284 struct {
1285 uint32_t last_bin, save_bin;
1286 int last_coor, last_tid, save_tid, finished;
1287 uint64_t last_off, save_off;
1288 uint64_t off_beg, off_end;
1289 uint64_t n_mapped, n_unmapped;
1290 } z; // keep internal states
1293 static char * idx_format_name(int fmt) {
1294 switch (fmt) {
1295 case HTS_FMT_CSI: return "csi";
1296 case HTS_FMT_BAI: return "bai";
1297 case HTS_FMT_TBI: return "tbi";
1298 case HTS_FMT_CRAI: return "crai";
1299 default: return "unknown";
1303 static inline int insert_to_b(bidx_t *b, int bin, uint64_t beg, uint64_t end)
1305 khint_t k;
1306 bins_t *l;
1307 int absent;
1308 k = kh_put(bin, b, bin, &absent);
1309 if (absent < 0) return -1; // Out of memory
1310 l = &kh_value(b, k);
1311 if (absent) {
1312 l->m = 1; l->n = 0;
1313 l->list = (hts_pair64_t*)calloc(l->m, sizeof(hts_pair64_t));
1314 if (!l->list) {
1315 kh_del(bin, b, k);
1316 return -1;
1318 } else if (l->n == l->m) {
1319 uint32_t new_m = l->m ? l->m << 1 : 1;
1320 hts_pair64_t *new_list = realloc(l->list, new_m * sizeof(hts_pair64_t));
1321 if (!new_list) return -1;
1322 l->list = new_list;
1323 l->m = new_m;
1325 l->list[l->n].u = beg;
1326 l->list[l->n++].v = end;
1327 return 0;
1330 static inline int insert_to_l(lidx_t *l, int64_t _beg, int64_t _end, uint64_t offset, int min_shift)
1332 int i, beg, end;
1333 beg = _beg >> min_shift;
1334 end = (_end - 1) >> min_shift;
1335 if (l->m < end + 1) {
1336 size_t new_m = l->m * 2 > end + 1 ? l->m * 2 : end + 1;
1337 uint64_t *new_offset;
1339 new_offset = (uint64_t*)realloc(l->offset, new_m * sizeof(uint64_t));
1340 if (!new_offset) return -1;
1342 // fill unused memory with (uint64_t)-1
1343 memset(new_offset + l->m, 0xff, sizeof(uint64_t) * (new_m - l->m));
1344 l->m = new_m;
1345 l->offset = new_offset;
1347 for (i = beg; i <= end; ++i) {
1348 if (l->offset[i] == (uint64_t)-1) l->offset[i] = offset;
1350 if (l->n < end + 1) l->n = end + 1;
1351 return 0;
1354 hts_idx_t *hts_idx_init(int n, int fmt, uint64_t offset0, int min_shift, int n_lvls)
1356 hts_idx_t *idx;
1357 idx = (hts_idx_t*)calloc(1, sizeof(hts_idx_t));
1358 if (idx == NULL) return NULL;
1359 idx->fmt = fmt;
1360 idx->min_shift = min_shift;
1361 idx->n_lvls = n_lvls;
1362 idx->n_bins = ((1<<(3 * n_lvls + 3)) - 1) / 7;
1363 idx->z.save_bin = idx->z.save_tid = idx->z.last_tid = idx->z.last_bin = 0xffffffffu;
1364 idx->z.save_off = idx->z.last_off = idx->z.off_beg = idx->z.off_end = offset0;
1365 idx->z.last_coor = 0xffffffffu;
1366 if (n) {
1367 idx->n = idx->m = n;
1368 idx->bidx = (bidx_t**)calloc(n, sizeof(bidx_t*));
1369 if (idx->bidx == NULL) { free(idx); return NULL; }
1370 idx->lidx = (lidx_t*) calloc(n, sizeof(lidx_t));
1371 if (idx->lidx == NULL) { free(idx->bidx); free(idx); return NULL; }
1373 return idx;
1376 static void update_loff(hts_idx_t *idx, int i, int free_lidx)
1378 bidx_t *bidx = idx->bidx[i];
1379 lidx_t *lidx = &idx->lidx[i];
1380 khint_t k;
1381 int l;
1382 uint64_t offset0 = 0;
1383 if (bidx) {
1384 k = kh_get(bin, bidx, META_BIN(idx));
1385 if (k != kh_end(bidx))
1386 offset0 = kh_val(bidx, k).list[0].u;
1387 for (l = 0; l < lidx->n && lidx->offset[l] == (uint64_t)-1; ++l)
1388 lidx->offset[l] = offset0;
1389 } else l = 1;
1390 for (; l < lidx->n; ++l) // fill missing values
1391 if (lidx->offset[l] == (uint64_t)-1)
1392 lidx->offset[l] = lidx->offset[l-1];
1393 if (bidx == 0) return;
1394 for (k = kh_begin(bidx); k != kh_end(bidx); ++k) // set loff
1395 if (kh_exist(bidx, k))
1397 if ( kh_key(bidx, k) < idx->n_bins )
1399 int bot_bin = hts_bin_bot(kh_key(bidx, k), idx->n_lvls);
1400 // disable linear index if bot_bin out of bounds
1401 kh_val(bidx, k).loff = bot_bin < lidx->n ? lidx->offset[bot_bin] : 0;
1403 else
1404 kh_val(bidx, k).loff = 0;
1406 if (free_lidx) {
1407 free(lidx->offset);
1408 lidx->m = lidx->n = 0;
1409 lidx->offset = 0;
1413 static void compress_binning(hts_idx_t *idx, int i)
1415 bidx_t *bidx = idx->bidx[i];
1416 khint_t k;
1417 int l, m;
1418 if (bidx == 0) return;
1419 // merge a bin to its parent if the bin is too small
1420 for (l = idx->n_lvls; l > 0; --l) {
1421 unsigned start = hts_bin_first(l);
1422 for (k = kh_begin(bidx); k != kh_end(bidx); ++k) {
1423 bins_t *p, *q;
1424 if (!kh_exist(bidx, k) || kh_key(bidx, k) >= idx->n_bins || kh_key(bidx, k) < start) continue;
1425 p = &kh_value(bidx, k);
1426 if (l < idx->n_lvls && p->n > 1) ks_introsort(_off, p->n, p->list);
1427 if ((p->list[p->n - 1].v>>16) - (p->list[0].u>>16) < HTS_MIN_MARKER_DIST) {
1428 khint_t kp;
1429 kp = kh_get(bin, bidx, hts_bin_parent(kh_key(bidx, k)));
1430 if (kp == kh_end(bidx)) continue;
1431 q = &kh_val(bidx, kp);
1432 if (q->n + p->n > q->m) {
1433 q->m = q->n + p->n;
1434 kroundup32(q->m);
1435 q->list = (hts_pair64_t*)realloc(q->list, q->m * sizeof(hts_pair64_t));
1437 memcpy(q->list + q->n, p->list, p->n * sizeof(hts_pair64_t));
1438 q->n += p->n;
1439 free(p->list);
1440 kh_del(bin, bidx, k);
1444 k = kh_get(bin, bidx, 0);
1445 if (k != kh_end(bidx)) ks_introsort(_off, kh_val(bidx, k).n, kh_val(bidx, k).list);
1446 // merge adjacent chunks that start from the same BGZF block
1447 for (k = kh_begin(bidx); k != kh_end(bidx); ++k) {
1448 bins_t *p;
1449 if (!kh_exist(bidx, k) || kh_key(bidx, k) >= idx->n_bins) continue;
1450 p = &kh_value(bidx, k);
1451 for (l = 1, m = 0; l < p->n; ++l) {
1452 if (p->list[m].v>>16 >= p->list[l].u>>16) {
1453 if (p->list[m].v < p->list[l].v) p->list[m].v = p->list[l].v;
1454 } else p->list[++m] = p->list[l];
1456 p->n = m + 1;
1460 void hts_idx_finish(hts_idx_t *idx, uint64_t final_offset)
1462 int i;
1463 if (idx == NULL || idx->z.finished) return; // do not run this function on an empty index or multiple times
1464 if (idx->z.save_tid >= 0) {
1465 insert_to_b(idx->bidx[idx->z.save_tid], idx->z.save_bin, idx->z.save_off, final_offset);
1466 insert_to_b(idx->bidx[idx->z.save_tid], META_BIN(idx), idx->z.off_beg, final_offset);
1467 insert_to_b(idx->bidx[idx->z.save_tid], META_BIN(idx), idx->z.n_mapped, idx->z.n_unmapped);
1469 for (i = 0; i < idx->n; ++i) {
1470 update_loff(idx, i, (idx->fmt == HTS_FMT_CSI));
1471 compress_binning(idx, i);
1473 idx->z.finished = 1;
1476 int hts_idx_push(hts_idx_t *idx, int tid, int beg, int end, uint64_t offset, int is_mapped)
1478 int bin;
1479 int64_t maxpos = (int64_t) 1 << (idx->min_shift + idx->n_lvls * 3);
1480 if (tid<0) beg = -1, end = 0;
1481 if (tid >= 0 && (beg > maxpos || end > maxpos)) {
1482 goto pos_too_big;
1484 if (tid >= idx->m) { // enlarge the index
1485 uint32_t new_m = idx->m * 2 > tid + 1 ? idx->m * 2 : tid + 1;
1486 bidx_t **new_bidx;
1487 lidx_t *new_lidx;
1489 new_bidx = (bidx_t**)realloc(idx->bidx, new_m * sizeof(bidx_t*));
1490 if (!new_bidx) return -1;
1491 idx->bidx = new_bidx;
1493 new_lidx = (lidx_t*) realloc(idx->lidx, new_m * sizeof(lidx_t));
1494 if (!new_lidx) return -1;
1495 idx->lidx = new_lidx;
1497 memset(&idx->bidx[idx->m], 0, (new_m - idx->m) * sizeof(bidx_t*));
1498 memset(&idx->lidx[idx->m], 0, (new_m - idx->m) * sizeof(lidx_t));
1499 idx->m = new_m;
1501 if (idx->n < tid + 1) idx->n = tid + 1;
1502 if (idx->z.finished) return 0;
1503 if (idx->z.last_tid != tid || (idx->z.last_tid >= 0 && tid < 0)) { // change of chromosome
1504 if ( tid>=0 && idx->n_no_coor )
1506 hts_log_error("NO_COOR reads not in a single block at the end %d %d", tid, idx->z.last_tid);
1507 return -1;
1509 if (tid>=0 && idx->bidx[tid] != 0)
1511 hts_log_error("Chromosome blocks not continuous");
1512 return -1;
1514 idx->z.last_tid = tid;
1515 idx->z.last_bin = 0xffffffffu;
1516 } else if (tid >= 0 && idx->z.last_coor > beg) { // test if positions are out of order
1517 hts_log_error("Unsorted positions on sequence #%d: %d followed by %d", tid+1, idx->z.last_coor+1, beg+1);
1518 return -1;
1520 if ( tid>=0 )
1522 if (idx->bidx[tid] == 0) idx->bidx[tid] = kh_init(bin);
1523 if (is_mapped) {
1524 // shoehorn [-1,0) (VCF POS=0) into the leftmost bottom-level bin
1525 if (beg < 0) beg = 0;
1526 if (end <= 0) end = 1;
1527 // idx->z.last_off points to the start of the current record
1528 if (insert_to_l(&idx->lidx[tid], beg, end,
1529 idx->z.last_off, idx->min_shift) < 0) return -1;
1532 else idx->n_no_coor++;
1533 bin = hts_reg2bin(beg, end, idx->min_shift, idx->n_lvls);
1534 if ((int)idx->z.last_bin != bin) { // then possibly write the binning index
1535 if (idx->z.save_bin != 0xffffffffu) { // save_bin==0xffffffffu only happens to the first record
1536 if (insert_to_b(idx->bidx[idx->z.save_tid], idx->z.save_bin,
1537 idx->z.save_off, idx->z.last_off) < 0) return -1;
1539 if (idx->z.last_bin == 0xffffffffu && idx->z.save_bin != 0xffffffffu) { // change of chr; keep meta information
1540 idx->z.off_end = idx->z.last_off;
1541 if (insert_to_b(idx->bidx[idx->z.save_tid], META_BIN(idx),
1542 idx->z.off_beg, idx->z.off_end) < 0) return -1;
1543 if (insert_to_b(idx->bidx[idx->z.save_tid], META_BIN(idx),
1544 idx->z.n_mapped, idx->z.n_unmapped) < 0) return -1;
1545 idx->z.n_mapped = idx->z.n_unmapped = 0;
1546 idx->z.off_beg = idx->z.off_end;
1548 idx->z.save_off = idx->z.last_off;
1549 idx->z.save_bin = idx->z.last_bin = bin;
1550 idx->z.save_tid = tid;
1552 if (is_mapped) ++idx->z.n_mapped;
1553 else ++idx->z.n_unmapped;
1554 idx->z.last_off = offset;
1555 idx->z.last_coor = beg;
1556 return 0;
1558 pos_too_big: {
1559 int64_t max = end > beg ? end : beg, s = 1 << 14;
1560 int n_lvls = 0;
1561 while (max > s) {
1562 n_lvls++;
1563 s <<= 3;
1566 if (idx->fmt == HTS_FMT_CSI) {
1567 hts_log_error("Region %d..%d cannot be stored in a csi index "
1568 "with min_shift = %d, n_lvls = %d. Try using "
1569 "min_shift = 14, n_lvls >= %d",
1570 beg, end,
1571 idx->min_shift, idx->n_lvls,
1572 n_lvls);
1573 } else {
1574 hts_log_error("Region %d..%d cannot be stored in a %s index. "
1575 "Try using a csi index with min_shift = 14, "
1576 "n_lvls >= %d",
1577 beg, end, idx_format_name(idx->fmt),
1578 n_lvls);
1580 errno = ERANGE;
1581 return -1;
1585 void hts_idx_destroy(hts_idx_t *idx)
1587 khint_t k;
1588 int i;
1589 if (idx == 0) return;
1591 // For HTS_FMT_CRAI, idx actually points to a different type -- see sam.c
1592 if (idx->fmt == HTS_FMT_CRAI) {
1593 hts_cram_idx_t *cidx = (hts_cram_idx_t *) idx;
1594 cram_index_free(cidx->cram);
1595 free(cidx);
1596 return;
1599 for (i = 0; i < idx->m; ++i) {
1600 bidx_t *bidx = idx->bidx[i];
1601 free(idx->lidx[i].offset);
1602 if (bidx == 0) continue;
1603 for (k = kh_begin(bidx); k != kh_end(bidx); ++k)
1604 if (kh_exist(bidx, k))
1605 free(kh_value(bidx, k).list);
1606 kh_destroy(bin, bidx);
1608 free(idx->bidx); free(idx->lidx); free(idx->meta);
1609 free(idx);
1612 // The optimizer eliminates these ed_is_big() calls; still it would be good to
1613 // TODO Determine endianness at configure- or compile-time
1615 static inline ssize_t HTS_RESULT_USED idx_write_int32(BGZF *fp, int32_t x)
1617 if (ed_is_big()) x = ed_swap_4(x);
1618 return bgzf_write(fp, &x, sizeof x);
1621 static inline ssize_t HTS_RESULT_USED idx_write_uint32(BGZF *fp, uint32_t x)
1623 if (ed_is_big()) x = ed_swap_4(x);
1624 return bgzf_write(fp, &x, sizeof x);
1627 static inline ssize_t HTS_RESULT_USED idx_write_uint64(BGZF *fp, uint64_t x)
1629 if (ed_is_big()) x = ed_swap_8(x);
1630 return bgzf_write(fp, &x, sizeof x);
1633 static inline void swap_bins(bins_t *p)
1635 int i;
1636 for (i = 0; i < p->n; ++i) {
1637 ed_swap_8p(&p->list[i].u);
1638 ed_swap_8p(&p->list[i].v);
1642 static int hts_idx_save_core(const hts_idx_t *idx, BGZF *fp, int fmt)
1644 int32_t i, j;
1646 #define check(ret) if ((ret) < 0) return -1
1648 check(idx_write_int32(fp, idx->n));
1649 if (fmt == HTS_FMT_TBI && idx->l_meta)
1650 check(bgzf_write(fp, idx->meta, idx->l_meta));
1652 for (i = 0; i < idx->n; ++i) {
1653 khint_t k;
1654 bidx_t *bidx = idx->bidx[i];
1655 lidx_t *lidx = &idx->lidx[i];
1656 // write binning index
1657 check(idx_write_int32(fp, bidx? kh_size(bidx) : 0));
1658 if (bidx)
1659 for (k = kh_begin(bidx); k != kh_end(bidx); ++k)
1660 if (kh_exist(bidx, k)) {
1661 bins_t *p = &kh_value(bidx, k);
1662 check(idx_write_uint32(fp, kh_key(bidx, k)));
1663 if (fmt == HTS_FMT_CSI) check(idx_write_uint64(fp, p->loff));
1664 //int j;for(j=0;j<p->n;++j)fprintf(stderr,"%d,%llx,%d,%llx:%llx\n",kh_key(bidx,k),kh_val(bidx, k).loff,j,p->list[j].u,p->list[j].v);
1665 check(idx_write_int32(fp, p->n));
1666 for (j = 0; j < p->n; ++j) {
1667 check(idx_write_uint64(fp, p->list[j].u));
1668 check(idx_write_uint64(fp, p->list[j].v));
1672 // write linear index
1673 if (fmt != HTS_FMT_CSI) {
1674 check(idx_write_int32(fp, lidx->n));
1675 for (j = 0; j < lidx->n; ++j)
1676 check(idx_write_uint64(fp, lidx->offset[j]));
1680 check(idx_write_uint64(fp, idx->n_no_coor));
1681 return 0;
1682 #undef check
1685 int hts_idx_save(const hts_idx_t *idx, const char *fn, int fmt)
1687 int ret, save;
1688 char *fnidx = (char*)calloc(1, strlen(fn) + 5);
1689 if (fnidx == NULL) return -1;
1691 strcpy(fnidx, fn);
1692 switch (fmt) {
1693 case HTS_FMT_BAI: strcat(fnidx, ".bai"); break;
1694 case HTS_FMT_CSI: strcat(fnidx, ".csi"); break;
1695 case HTS_FMT_TBI: strcat(fnidx, ".tbi"); break;
1696 default: abort();
1699 ret = hts_idx_save_as(idx, fn, fnidx, fmt);
1700 save = errno;
1701 free(fnidx);
1702 errno = save;
1703 return ret;
1706 int hts_idx_save_as(const hts_idx_t *idx, const char *fn, const char *fnidx, int fmt)
1708 BGZF *fp;
1710 #define check(ret) if ((ret) < 0) goto fail
1712 if (fnidx == NULL) return hts_idx_save(idx, fn, fmt);
1714 fp = bgzf_open(fnidx, (fmt == HTS_FMT_BAI)? "wu" : "w");
1715 if (fp == NULL) return -1;
1717 if (fmt == HTS_FMT_CSI) {
1718 check(bgzf_write(fp, "CSI\1", 4));
1719 check(idx_write_int32(fp, idx->min_shift));
1720 check(idx_write_int32(fp, idx->n_lvls));
1721 check(idx_write_uint32(fp, idx->l_meta));
1722 if (idx->l_meta) check(bgzf_write(fp, idx->meta, idx->l_meta));
1723 } else if (fmt == HTS_FMT_TBI) {
1724 check(bgzf_write(fp, "TBI\1", 4));
1725 } else if (fmt == HTS_FMT_BAI) {
1726 check(bgzf_write(fp, "BAI\1", 4));
1727 } else abort();
1729 check(hts_idx_save_core(idx, fp, fmt));
1731 return bgzf_close(fp);
1732 #undef check
1734 fail:
1735 bgzf_close(fp);
1736 return -1;
1739 static int hts_idx_load_core(hts_idx_t *idx, BGZF *fp, int fmt)
1741 int32_t i, n, is_be;
1742 is_be = ed_is_big();
1743 if (idx == NULL) return -4;
1744 for (i = 0; i < idx->n; ++i) {
1745 bidx_t *h;
1746 lidx_t *l = &idx->lidx[i];
1747 uint32_t key;
1748 int j, absent;
1749 bins_t *p;
1750 h = idx->bidx[i] = kh_init(bin);
1751 if (bgzf_read(fp, &n, 4) != 4) return -1;
1752 if (is_be) ed_swap_4p(&n);
1753 for (j = 0; j < n; ++j) {
1754 khint_t k;
1755 if (bgzf_read(fp, &key, 4) != 4) return -1;
1756 if (is_be) ed_swap_4p(&key);
1757 k = kh_put(bin, h, key, &absent);
1758 if (absent <= 0) return -3; // Duplicate bin number
1759 p = &kh_val(h, k);
1760 if (fmt == HTS_FMT_CSI) {
1761 if (bgzf_read(fp, &p->loff, 8) != 8) return -1;
1762 if (is_be) ed_swap_8p(&p->loff);
1763 } else p->loff = 0;
1764 if (bgzf_read(fp, &p->n, 4) != 4) return -1;
1765 if (is_be) ed_swap_4p(&p->n);
1766 p->m = p->n;
1767 p->list = (hts_pair64_t*)malloc(p->m * sizeof(hts_pair64_t));
1768 if (p->list == NULL) return -2;
1769 if (bgzf_read(fp, p->list, p->n<<4) != p->n<<4) return -1;
1770 if (is_be) swap_bins(p);
1772 if (fmt != HTS_FMT_CSI) { // load linear index
1773 int j;
1774 if (bgzf_read(fp, &l->n, 4) != 4) return -1;
1775 if (is_be) ed_swap_4p(&l->n);
1776 l->m = l->n;
1777 l->offset = (uint64_t*)malloc(l->n * sizeof(uint64_t));
1778 if (l->offset == NULL) return -2;
1779 if (bgzf_read(fp, l->offset, l->n << 3) != l->n << 3) return -1;
1780 if (is_be) for (j = 0; j < l->n; ++j) ed_swap_8p(&l->offset[j]);
1781 for (j = 1; j < l->n; ++j) // fill missing values; may happen given older samtools and tabix
1782 if (l->offset[j] == 0) l->offset[j] = l->offset[j-1];
1783 update_loff(idx, i, 1);
1786 if (bgzf_read(fp, &idx->n_no_coor, 8) != 8) idx->n_no_coor = 0;
1787 if (is_be) ed_swap_8p(&idx->n_no_coor);
1788 return 0;
1791 static hts_idx_t *hts_idx_load_local(const char *fn)
1793 uint8_t magic[4];
1794 int i, is_be;
1795 hts_idx_t *idx = NULL;
1796 uint8_t *meta = NULL;
1797 BGZF *fp = bgzf_open(fn, "r");
1798 if (fp == NULL) return NULL;
1799 is_be = ed_is_big();
1800 if (bgzf_read(fp, magic, 4) != 4) goto fail;
1802 if (memcmp(magic, "CSI\1", 4) == 0) {
1803 uint32_t x[3], n;
1804 if (bgzf_read(fp, x, 12) != 12) goto fail;
1805 if (is_be) for (i = 0; i < 3; ++i) ed_swap_4p(&x[i]);
1806 if (x[2]) {
1807 if (SIZE_MAX - x[2] < 1) goto fail; // Prevent possible overflow
1808 if ((meta = (uint8_t*)malloc((size_t) x[2] + 1)) == NULL) goto fail;
1809 if (bgzf_read(fp, meta, x[2]) != x[2]) goto fail;
1810 // Prevent possible strlen past the end in tbx_index_load2
1811 meta[x[2]] = '\0';
1813 if (bgzf_read(fp, &n, 4) != 4) goto fail;
1814 if (is_be) ed_swap_4p(&n);
1815 if ((idx = hts_idx_init(n, HTS_FMT_CSI, 0, x[0], x[1])) == NULL) goto fail;
1816 idx->l_meta = x[2];
1817 idx->meta = meta;
1818 meta = NULL;
1819 if (hts_idx_load_core(idx, fp, HTS_FMT_CSI) < 0) goto fail;
1821 else if (memcmp(magic, "TBI\1", 4) == 0) {
1822 uint8_t x[8 * 4];
1823 uint32_t n;
1824 // Read file header
1825 if (bgzf_read(fp, x, sizeof(x)) != sizeof(x)) goto fail;
1826 n = le_to_u32(&x[0]); // location of n_ref
1827 if ((idx = hts_idx_init(n, HTS_FMT_TBI, 0, 14, 5)) == NULL) goto fail;
1828 n = le_to_u32(&x[7*4]); // location of l_nm
1829 if (n > UINT32_MAX - 29) goto fail; // Prevent possible overflow
1830 idx->l_meta = 28 + n;
1831 if ((idx->meta = (uint8_t*)malloc(idx->l_meta + 1)) == NULL) goto fail;
1832 // copy format, col_seq, col_beg, col_end, meta, skip, l_nm
1833 // N.B. left in little-endian byte order.
1834 memcpy(idx->meta, &x[1*4], 28);
1835 // Read in sequence names.
1836 if (bgzf_read(fp, idx->meta + 28, n) != n) goto fail;
1837 // Prevent possible strlen past the end in tbx_index_load2
1838 idx->meta[idx->l_meta] = '\0';
1839 if (hts_idx_load_core(idx, fp, HTS_FMT_TBI) < 0) goto fail;
1841 else if (memcmp(magic, "BAI\1", 4) == 0) {
1842 uint32_t n;
1843 if (bgzf_read(fp, &n, 4) != 4) goto fail;
1844 if (is_be) ed_swap_4p(&n);
1845 idx = hts_idx_init(n, HTS_FMT_BAI, 0, 14, 5);
1846 if (hts_idx_load_core(idx, fp, HTS_FMT_BAI) < 0) goto fail;
1848 else { errno = EINVAL; goto fail; }
1850 bgzf_close(fp);
1851 return idx;
1853 fail:
1854 bgzf_close(fp);
1855 hts_idx_destroy(idx);
1856 free(meta);
1857 return NULL;
1860 int hts_idx_set_meta(hts_idx_t *idx, uint32_t l_meta, uint8_t *meta,
1861 int is_copy)
1863 uint8_t *new_meta = meta;
1864 if (is_copy) {
1865 size_t l = l_meta;
1866 if (l > SIZE_MAX - 1) {
1867 errno = ENOMEM;
1868 return -1;
1870 new_meta = malloc(l + 1);
1871 if (!new_meta) return -1;
1872 memcpy(new_meta, meta, l);
1873 // Prevent possible strlen past the end in tbx_index_load2
1874 meta[l + 1] = '\0';
1876 if (idx->meta) free(idx->meta);
1877 idx->l_meta = l_meta;
1878 idx->meta = new_meta;
1879 return 0;
1882 uint8_t *hts_idx_get_meta(hts_idx_t *idx, uint32_t *l_meta)
1884 *l_meta = idx->l_meta;
1885 return idx->meta;
1888 const char **hts_idx_seqnames(const hts_idx_t *idx, int *n, hts_id2name_f getid, void *hdr)
1890 if ( !idx->n )
1892 *n = 0;
1893 return NULL;
1896 int tid = 0, i;
1897 const char **names = (const char**) calloc(idx->n,sizeof(const char*));
1898 for (i=0; i<idx->n; i++)
1900 bidx_t *bidx = idx->bidx[i];
1901 if ( !bidx ) continue;
1902 names[tid++] = getid(hdr,i);
1904 *n = tid;
1905 return names;
1908 int hts_idx_get_stat(const hts_idx_t* idx, int tid, uint64_t* mapped, uint64_t* unmapped)
1910 if ( idx->fmt == HTS_FMT_CRAI ) {
1911 *mapped = 0; *unmapped = 0;
1912 return -1;
1915 bidx_t *h = idx->bidx[tid];
1916 khint_t k = kh_get(bin, h, META_BIN(idx));
1917 if (k != kh_end(h)) {
1918 *mapped = kh_val(h, k).list[1].u;
1919 *unmapped = kh_val(h, k).list[1].v;
1920 return 0;
1921 } else {
1922 *mapped = 0; *unmapped = 0;
1923 return -1;
1927 uint64_t hts_idx_get_n_no_coor(const hts_idx_t* idx)
1929 return idx->n_no_coor;
1932 /****************
1933 *** Iterator ***
1934 ****************/
1936 static inline int reg2bins(int64_t beg, int64_t end, hts_itr_t *itr, int min_shift, int n_lvls)
1938 int l, t, s = min_shift + (n_lvls<<1) + n_lvls;
1939 if (beg >= end) return 0;
1940 if (end >= 1LL<<s) end = 1LL<<s;
1941 for (--end, l = 0, t = 0; l <= n_lvls; s -= 3, t += 1<<((l<<1)+l), ++l) {
1942 int b, e, n, i;
1943 b = t + (beg>>s); e = t + (end>>s); n = e - b + 1;
1944 if (itr->bins.n + n > itr->bins.m) {
1945 itr->bins.m = itr->bins.n + n;
1946 kroundup32(itr->bins.m);
1947 itr->bins.a = (int*)realloc(itr->bins.a, sizeof(int) * itr->bins.m);
1949 for (i = b; i <= e; ++i) itr->bins.a[itr->bins.n++] = i;
1951 return itr->bins.n;
1954 hts_itr_t *hts_itr_query(const hts_idx_t *idx, int tid, int beg, int end, hts_readrec_func *readrec)
1956 int i, n_off, l, bin;
1957 hts_pair64_t *off;
1958 khint_t k;
1959 bidx_t *bidx;
1960 uint64_t min_off, max_off;
1961 hts_itr_t *iter = 0;
1962 if (tid < 0) {
1963 int finished0 = 0;
1964 uint64_t off0 = (uint64_t)-1;
1965 khint_t k;
1966 switch (tid) {
1967 case HTS_IDX_START:
1968 // Find the smallest offset, note that sequence ids may not be ordered sequentially
1969 for (i=0; i<idx->n; i++)
1971 bidx = idx->bidx[i];
1972 k = kh_get(bin, bidx, META_BIN(idx));
1973 if (k == kh_end(bidx)) continue;
1974 if ( off0 > kh_val(bidx, k).list[0].u ) off0 = kh_val(bidx, k).list[0].u;
1976 if ( off0==(uint64_t)-1 && idx->n_no_coor ) off0 = 0; // only no-coor reads in this bam
1977 break;
1979 case HTS_IDX_NOCOOR:
1980 /* No-coor reads sort after all of the mapped reads. The position
1981 is not stored in the index itself, so need to find the end
1982 offset for the last mapped read. A loop is needed here in
1983 case references at the end of the file have no mapped reads,
1984 or sequence ids are not ordered sequentially.
1985 See issue samtools#568 and commits b2aab8, 60c22d and cc207d. */
1986 for (i = 0; i < idx->n; i++) {
1987 bidx = idx->bidx[i];
1988 k = kh_get(bin, bidx, META_BIN(idx));
1989 if (k != kh_end(bidx)) {
1990 if (off0==(uint64_t)-1 || off0 < kh_val(bidx, k).list[0].v) {
1991 off0 = kh_val(bidx, k).list[0].v;
1995 if ( off0==(uint64_t)-1 && idx->n_no_coor ) off0 = 0; // only no-coor reads in this bam
1996 break;
1998 case HTS_IDX_REST:
1999 off0 = 0;
2000 break;
2002 case HTS_IDX_NONE:
2003 finished0 = 1;
2004 off0 = 0;
2005 break;
2007 default:
2008 return 0;
2010 if (off0 != (uint64_t)-1) {
2011 iter = (hts_itr_t*)calloc(1, sizeof(hts_itr_t));
2012 iter->read_rest = 1;
2013 iter->finished = finished0;
2014 iter->curr_off = off0;
2015 iter->readrec = readrec;
2016 return iter;
2017 } else return 0;
2020 if (beg < 0) beg = 0;
2021 if (end < beg) return 0;
2022 if (tid >= idx->n || (bidx = idx->bidx[tid]) == NULL) return 0;
2024 iter = (hts_itr_t*)calloc(1, sizeof(hts_itr_t));
2025 iter->tid = tid, iter->beg = beg, iter->end = end; iter->i = -1;
2026 iter->readrec = readrec;
2028 if ( !kh_size(bidx) ) { iter->finished = 1; return iter; }
2030 // compute min_off
2031 bin = hts_bin_first(idx->n_lvls) + (beg>>idx->min_shift);
2032 do {
2033 int first;
2034 k = kh_get(bin, bidx, bin);
2035 if (k != kh_end(bidx)) break;
2036 first = (hts_bin_parent(bin)<<3) + 1;
2037 if (bin > first) --bin;
2038 else bin = hts_bin_parent(bin);
2039 } while (bin);
2040 if (bin == 0) k = kh_get(bin, bidx, bin);
2041 min_off = k != kh_end(bidx)? kh_val(bidx, k).loff : 0;
2043 // compute max_off: a virtual offset from a bin to the right of end
2044 bin = hts_bin_first(idx->n_lvls) + ((end-1) >> idx->min_shift) + 1;
2045 if (bin >= idx->n_bins) bin = 0;
2046 while (1) {
2047 // search for an extant bin by moving right, but moving up to the
2048 // parent whenever we get to a first child (which also covers falling
2049 // off the RHS, which wraps around and immediately goes up to bin 0)
2050 while (bin % 8 == 1) bin = hts_bin_parent(bin);
2051 if (bin == 0) { max_off = (uint64_t)-1; break; }
2052 k = kh_get(bin, bidx, bin);
2053 if (k != kh_end(bidx) && kh_val(bidx, k).n > 0) { max_off = kh_val(bidx, k).list[0].u; break; }
2054 bin++;
2057 // retrieve bins
2058 reg2bins(beg, end, iter, idx->min_shift, idx->n_lvls);
2059 for (i = n_off = 0; i < iter->bins.n; ++i)
2060 if ((k = kh_get(bin, bidx, iter->bins.a[i])) != kh_end(bidx))
2061 n_off += kh_value(bidx, k).n;
2062 if (n_off == 0) {
2063 // No overlapping bins means the iterator has already finished.
2064 iter->finished = 1;
2065 return iter;
2067 off = (hts_pair64_t*)calloc(n_off, sizeof(hts_pair64_t));
2068 for (i = n_off = 0; i < iter->bins.n; ++i) {
2069 if ((k = kh_get(bin, bidx, iter->bins.a[i])) != kh_end(bidx)) {
2070 int j;
2071 bins_t *p = &kh_value(bidx, k);
2072 for (j = 0; j < p->n; ++j)
2073 if (p->list[j].v > min_off && p->list[j].u < max_off)
2074 off[n_off++] = p->list[j];
2077 if (n_off == 0) {
2078 free(off);
2079 iter->finished = 1;
2080 return iter;
2082 ks_introsort(_off, n_off, off);
2083 // resolve completely contained adjacent blocks
2084 for (i = 1, l = 0; i < n_off; ++i)
2085 if (off[l].v < off[i].v) off[++l] = off[i];
2086 n_off = l + 1;
2087 // resolve overlaps between adjacent blocks; this may happen due to the merge in indexing
2088 for (i = 1; i < n_off; ++i)
2089 if (off[i-1].v >= off[i].u) off[i-1].v = off[i].u;
2090 // merge adjacent blocks
2091 for (i = 1, l = 0; i < n_off; ++i) {
2092 if (off[l].v>>16 == off[i].u>>16) off[l].v = off[i].v;
2093 else off[++l] = off[i];
2095 n_off = l + 1;
2096 iter->n_off = n_off; iter->off = off;
2097 return iter;
2100 void hts_itr_destroy(hts_itr_t *iter)
2102 if (iter) { free(iter->off); free(iter->bins.a); free(iter); }
2105 static inline long long push_digit(long long i, char c)
2107 // ensure subtraction occurs first, avoiding overflow for >= MAX-48 or so
2108 int digit = c - '0';
2109 return 10 * i + digit;
2112 long long hts_parse_decimal(const char *str, char **strend, int flags)
2114 long long n = 0;
2115 int decimals = 0, e = 0, lost = 0;
2116 char sign = '+', esign = '+';
2117 const char *s;
2119 while (isspace_c(*str)) str++;
2120 s = str;
2122 if (*s == '+' || *s == '-') sign = *s++;
2123 while (*s)
2124 if (isdigit_c(*s)) n = push_digit(n, *s++);
2125 else if (*s == ',' && (flags & HTS_PARSE_THOUSANDS_SEP)) s++;
2126 else break;
2128 if (*s == '.') {
2129 s++;
2130 while (isdigit_c(*s)) decimals++, n = push_digit(n, *s++);
2133 if (*s == 'E' || *s == 'e') {
2134 s++;
2135 if (*s == '+' || *s == '-') esign = *s++;
2136 while (isdigit_c(*s)) e = push_digit(e, *s++);
2137 if (esign == '-') e = -e;
2140 e -= decimals;
2141 while (e > 0) n *= 10, e--;
2142 while (e < 0) lost += n % 10, n /= 10, e++;
2144 if (lost > 0) {
2145 hts_log_warning("Discarding fractional part of %.*s", (int)(s - str), str);
2148 if (strend) {
2149 *strend = (char *)s;
2150 } else if (*s) {
2151 hts_log_warning("Ignoring unknown characters after %.*s[%s]", (int)(s - str), str, s);
2154 return (sign == '+')? n : -n;
2157 const char *hts_parse_reg(const char *s, int *beg, int *end)
2159 char *hyphen;
2160 const char *colon = strrchr(s, ':');
2161 if (colon == NULL) {
2162 *beg = 0; *end = INT_MAX;
2163 return s + strlen(s);
2166 *beg = hts_parse_decimal(colon+1, &hyphen, HTS_PARSE_THOUSANDS_SEP) - 1;
2167 if (*beg < 0) *beg = 0;
2169 if (*hyphen == '\0') *end = INT_MAX;
2170 else if (*hyphen == '-') *end = hts_parse_decimal(hyphen+1, NULL, HTS_PARSE_THOUSANDS_SEP);
2171 else return NULL;
2173 if (*beg >= *end) return NULL;
2174 return colon;
2177 hts_itr_t *hts_itr_querys(const hts_idx_t *idx, const char *reg, hts_name2id_f getid, void *hdr, hts_itr_query_func *itr_query, hts_readrec_func *readrec)
2179 int tid, beg, end;
2180 const char *q;
2182 if (strcmp(reg, ".") == 0)
2183 return itr_query(idx, HTS_IDX_START, 0, 0, readrec);
2184 else if (strcmp(reg, "*") == 0)
2185 return itr_query(idx, HTS_IDX_NOCOOR, 0, 0, readrec);
2187 q = hts_parse_reg(reg, &beg, &end);
2188 if (q) {
2189 char tmp_a[1024], *tmp = tmp_a;
2190 if (q - reg + 1 > 1024)
2191 if (!(tmp = malloc(q - reg + 1)))
2192 return NULL;
2193 strncpy(tmp, reg, q - reg);
2194 tmp[q - reg] = 0;
2195 tid = getid(hdr, tmp);
2196 if (tmp != tmp_a)
2197 free(tmp);
2199 else {
2200 // not parsable as a region, but possibly a sequence named "foo:a"
2201 tid = getid(hdr, reg);
2202 beg = 0; end = INT_MAX;
2205 if (tid < 0) return NULL;
2206 return itr_query(idx, tid, beg, end, readrec);
2209 int hts_itr_next(BGZF *fp, hts_itr_t *iter, void *r, void *data)
2211 int ret, tid, beg, end;
2212 if (iter == NULL || iter->finished) return -1;
2213 if (iter->read_rest) {
2214 if (iter->curr_off) { // seek to the start
2215 if (bgzf_seek(fp, iter->curr_off, SEEK_SET) < 0) return -1;
2216 iter->curr_off = 0; // only seek once
2218 ret = iter->readrec(fp, data, r, &tid, &beg, &end);
2219 if (ret < 0) iter->finished = 1;
2220 iter->curr_tid = tid;
2221 iter->curr_beg = beg;
2222 iter->curr_end = end;
2223 return ret;
2225 // A NULL iter->off should always be accompanied by iter->finished.
2226 assert(iter->off != NULL);
2227 for (;;) {
2228 if (iter->curr_off == 0 || iter->curr_off >= iter->off[iter->i].v) { // then jump to the next chunk
2229 if (iter->i == iter->n_off - 1) { ret = -1; break; } // no more chunks
2230 if (iter->i < 0 || iter->off[iter->i].v != iter->off[iter->i+1].u) { // not adjacent chunks; then seek
2231 if (bgzf_seek(fp, iter->off[iter->i+1].u, SEEK_SET) < 0) return -1;
2232 iter->curr_off = bgzf_tell(fp);
2234 ++iter->i;
2236 if ((ret = iter->readrec(fp, data, r, &tid, &beg, &end)) >= 0) {
2237 iter->curr_off = bgzf_tell(fp);
2238 if (tid != iter->tid || beg >= iter->end) { // no need to proceed
2239 ret = -1; break;
2240 } else if (end > iter->beg && iter->end > beg) {
2241 iter->curr_tid = tid;
2242 iter->curr_beg = beg;
2243 iter->curr_end = end;
2244 return ret;
2246 } else break; // end of file or error
2248 iter->finished = 1;
2249 return ret;
2252 /**********************
2253 *** Retrieve index ***
2254 **********************/
2256 static char *test_and_fetch(const char *fn)
2258 if (hisremote(fn)) {
2259 const int buf_size = 1 * 1024 * 1024;
2260 hFILE *fp_remote;
2261 FILE *fp;
2262 uint8_t *buf;
2263 int l;
2264 const char *p;
2265 for (p = fn + strlen(fn) - 1; p >= fn; --p)
2266 if (*p == '/') break;
2267 ++p; // p now points to the local file name
2268 // Attempt to open local file first
2269 if ((fp = fopen((char*)p, "rb")) != 0)
2271 fclose(fp);
2272 return (char*)p;
2274 // Attempt to open remote file. Stay quiet on failure, it is OK to fail when trying first .csi then .tbi index.
2275 if ((fp_remote = hopen(fn, "r")) == 0) return 0;
2276 if ((fp = fopen(p, "w")) == 0) {
2277 hts_log_error("Failed to create file %s in the working directory", p);
2278 hclose_abruptly(fp_remote);
2279 return 0;
2281 hts_log_info("Downloading file %s to local directory", fn);
2282 buf = (uint8_t*)calloc(buf_size, 1);
2283 while ((l = hread(fp_remote, buf, buf_size)) > 0) fwrite(buf, 1, l, fp);
2284 free(buf);
2285 fclose(fp);
2286 if (hclose(fp_remote) != 0) {
2287 hts_log_error("Failed to close remote file %s", fn);
2289 return (char*)p;
2290 } else {
2291 hFILE *fp;
2292 if ((fp = hopen(fn, "r")) == 0) return 0;
2293 hclose_abruptly(fp);
2294 return (char*)fn;
2298 char *hts_idx_getfn(const char *fn, const char *ext)
2300 int i, l_fn, l_ext;
2301 char *fnidx, *ret;
2302 l_fn = strlen(fn); l_ext = strlen(ext);
2303 fnidx = (char*)calloc(l_fn + l_ext + 1, 1);
2304 strcpy(fnidx, fn); strcpy(fnidx + l_fn, ext);
2305 if ((ret = test_and_fetch(fnidx)) == 0) {
2306 for (i = l_fn - 1; i > 0; --i)
2307 if (fnidx[i] == '.') break;
2308 strcpy(fnidx + i, ext);
2309 ret = test_and_fetch(fnidx);
2311 if (ret == 0) {
2312 free(fnidx);
2313 return 0;
2315 l_fn = strlen(ret);
2316 memmove(fnidx, ret, l_fn + 1);
2317 return fnidx;
2320 hts_idx_t *hts_idx_load(const char *fn, int fmt)
2322 char *fnidx;
2323 hts_idx_t *idx;
2324 fnidx = hts_idx_getfn(fn, ".csi");
2325 if (! fnidx) fnidx = hts_idx_getfn(fn, fmt == HTS_FMT_BAI? ".bai" : ".tbi");
2326 if (fnidx == 0) return 0;
2328 idx = hts_idx_load2(fn, fnidx);
2329 free(fnidx);
2330 return idx;
2333 hts_idx_t *hts_idx_load2(const char *fn, const char *fnidx)
2335 // Check that the index file is up to date, the main file might have changed
2336 struct stat stat_idx,stat_main;
2337 if ( !stat(fn, &stat_main) && !stat(fnidx, &stat_idx) )
2339 if ( stat_idx.st_mtime < stat_main.st_mtime )
2340 hts_log_warning("The index file is older than the data file: %s", fnidx);
2343 return hts_idx_load_local(fnidx);
2348 /**********************
2349 *** Memory ***
2350 **********************/
2352 /* For use with hts_expand macros *only* */
2353 size_t hts_realloc_or_die(size_t n, size_t m, size_t m_sz, size_t size,
2354 int clear, void **ptr, const char *func) {
2355 /* If new_m and size are both below this limit, multiplying them
2356 together can't overflow */
2357 const size_t safe = (size_t) 1 << (sizeof(size_t) * 4);
2358 void *new_ptr;
2359 size_t bytes, new_m;
2361 new_m = n;
2362 kroundup_size_t(new_m);
2364 bytes = size * new_m;
2366 /* Check for overflow. Both ensure that new_m will fit in m (we make the
2367 pessimistic assumption that m is signed), and that bytes has not
2368 wrapped around. */
2369 if (new_m > (((size_t) 1 << (m_sz * 8 - 1)) - 1)
2370 || ((size > safe || new_m > safe)
2371 && bytes / new_m != size)) {
2372 errno = ENOMEM;
2373 goto die;
2376 new_ptr = realloc(*ptr, bytes);
2377 if (new_ptr == NULL) goto die;
2379 if (clear) {
2380 if (new_m > m) {
2381 memset((char *) new_ptr + m * size, 0, (new_m - m) * size);
2385 *ptr = new_ptr;
2387 return new_m;
2389 die:
2390 hts_log_error("%s", strerror(errno));
2391 exit(1);
2394 void hts_set_log_level(enum htsLogLevel level)
2396 hts_verbose = level;
2399 enum htsLogLevel hts_get_log_level()
2401 return hts_verbose;
2404 static char get_severity_tag(enum htsLogLevel severity)
2406 switch (severity) {
2407 case HTS_LOG_ERROR:
2408 return 'E';
2409 case HTS_LOG_WARNING:
2410 return 'W';
2411 case HTS_LOG_INFO:
2412 return 'I';
2413 case HTS_LOG_DEBUG:
2414 return 'D';
2415 case HTS_LOG_TRACE:
2416 return 'T';
2417 default:
2418 break;
2421 return '*';
2424 void hts_log(enum htsLogLevel severity, const char *context, const char *format, ...)
2426 int save_errno = errno;
2427 if (severity <= hts_verbose) {
2428 va_list argptr;
2430 fprintf(stderr, "[%c::%s] ", get_severity_tag(severity), context);
2432 va_start(argptr, format);
2433 vfprintf(stderr, format, argptr);
2434 va_end(argptr);
2436 fprintf(stderr, "\n");
2438 errno = save_errno;