1 /* hts.c -- format-neutral I/O, indexing, and iterator API functions.
3 Copyright (C) 2008, 2009, 2012-2017 Genome Research Ltd.
4 Copyright (C) 2012, 2013 Broad Institute.
6 Author: Heng Li <lh3@sanger.ac.uk>
8 Permission is hereby granted, free of charge, to any person obtaining a copy
9 of this software and associated documentation files (the "Software"), to deal
10 in the Software without restriction, including without limitation the rights
11 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12 copies of the Software, and to permit persons to whom the Software is
13 furnished to do so, subject to the following conditions:
15 The above copyright notice and this permission notice shall be included in
16 all copies or substantial portions of the Software.
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
21 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24 DEALINGS IN THE SOFTWARE. */
39 #include "htslib/hts.h"
40 #include "htslib/bgzf.h"
41 #include "cram/cram.h"
42 #include "htslib/hfile.h"
43 #include "htslib/hts_endian.h"
45 #include "hts_internal.h"
46 #include "hfile_internal.h"
48 #include "htslib/khash.h"
49 #include "htslib/kseq.h"
50 #include "htslib/ksort.h"
52 KHASH_INIT2(s2i
,, kh_cstr_t
, int64_t, 1, kh_str_hash_func
, kh_str_hash_equal
)
54 int hts_verbose
= HTS_LOG_WARNING
;
56 const char *hts_version()
61 const unsigned char seq_nt16_table
[256] = {
62 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
63 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
64 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
65 1, 2, 4, 8, 15,15,15,15, 15,15,15,15, 15, 0 /*=*/,15,15,
66 15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15,
67 15,15, 5, 6, 8,15, 7, 9, 15,10,15,15, 15,15,15,15,
68 15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15,
69 15,15, 5, 6, 8,15, 7, 9, 15,10,15,15, 15,15,15,15,
71 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
72 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
73 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
74 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
75 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
76 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
77 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
78 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15
81 const char seq_nt16_str
[] = "=ACMGRSVTWYHKDBN";
83 const int seq_nt16_int
[] = { 4, 0, 1, 4, 2, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4 };
85 /**********************
86 *** Basic file I/O ***
87 **********************/
89 static enum htsFormatCategory
format_category(enum htsExactFormat fmt
)
112 return unknown_category
;
121 return unknown_category
;
124 // Decompress up to ten or so bytes by peeking at the file, which must be
125 // positioned at the start of a GZIP block.
126 static size_t decompress_peek(hFILE
*fp
, unsigned char *dest
, size_t destsize
)
128 // Typically at most a couple of hundred bytes of input are required
129 // to get a few bytes of output from inflate(), so hopefully this buffer
130 // size suffices in general.
131 unsigned char buffer
[512];
133 ssize_t npeek
= hpeek(fp
, buffer
, sizeof buffer
);
135 if (npeek
< 0) return 0;
142 zs
.avail_out
= destsize
;
143 if (inflateInit2(&zs
, 31) != Z_OK
) return 0;
145 while (zs
.total_out
< destsize
)
146 if (inflate(&zs
, Z_SYNC_FLUSH
) != Z_OK
) break;
148 destsize
= zs
.total_out
;
154 // Parse "x.y" text, taking care because the string is not NUL-terminated
155 // and filling in major/minor only when the digits are followed by a delimiter,
156 // so we don't misread "1.10" as "1.1" due to reaching the end of the buffer.
158 parse_version(htsFormat
*fmt
, const unsigned char *u
, const unsigned char *ulim
)
160 const char *s
= (const char *) u
;
161 const char *slim
= (const char *) ulim
;
164 fmt
->version
.major
= fmt
->version
.minor
= -1;
166 for (v
= 0; s
< slim
&& isdigit_c(*s
); s
++)
167 v
= 10 * v
+ *s
- '0';
170 fmt
->version
.major
= v
;
173 for (v
= 0; s
< slim
&& isdigit_c(*s
); s
++)
174 v
= 10 * v
+ *s
- '0';
176 fmt
->version
.minor
= v
;
179 fmt
->version
.minor
= 0;
184 cmp_nonblank(const char *key
, const unsigned char *u
, const unsigned char *ulim
)
186 const unsigned char *ukey
= (const unsigned char *) key
;
189 if (u
>= ulim
) return +1;
190 else if (isspace_c(*u
)) u
++;
191 else if (*u
!= *ukey
) return (*ukey
< *u
)? -1 : +1;
197 int hts_detect_format(hFILE
*hfile
, htsFormat
*fmt
)
200 ssize_t len
= hpeek(hfile
, s
, 18);
201 if (len
< 0) return -1;
203 if (len
>= 2 && s
[0] == 0x1f && s
[1] == 0x8b) {
204 // The stream is either gzip-compressed or BGZF-compressed.
205 // Determine which, and decompress the first few bytes.
206 fmt
->compression
= (len
>= 18 && (s
[3] & 4) &&
207 memcmp(&s
[12], "BC\2\0", 4) == 0)? bgzf
: gzip
;
208 len
= decompress_peek(hfile
, s
, sizeof s
);
211 fmt
->compression
= no_compression
;
212 len
= hpeek(hfile
, s
, sizeof s
);
214 if (len
< 0) return -1;
216 fmt
->compression_level
= -1;
217 fmt
->specific
= NULL
;
219 if (len
>= 6 && memcmp(s
,"CRAM",4) == 0 && s
[4]>=1 && s
[4]<=3 && s
[5]<=1) {
220 fmt
->category
= sequence_data
;
222 fmt
->version
.major
= s
[4], fmt
->version
.minor
= s
[5];
223 fmt
->compression
= custom
;
226 else if (len
>= 4 && s
[3] <= '\4') {
227 if (memcmp(s
, "BAM\1", 4) == 0) {
228 fmt
->category
= sequence_data
;
230 // TODO Decompress enough to pick version from @HD-VN header
231 fmt
->version
.major
= 1, fmt
->version
.minor
= -1;
234 else if (memcmp(s
, "BAI\1", 4) == 0) {
235 fmt
->category
= index_file
;
237 fmt
->version
.major
= -1, fmt
->version
.minor
= -1;
240 else if (memcmp(s
, "BCF\4", 4) == 0) {
241 fmt
->category
= variant_data
;
243 fmt
->version
.major
= 1, fmt
->version
.minor
= -1;
246 else if (memcmp(s
, "BCF\2", 4) == 0) {
247 fmt
->category
= variant_data
;
249 fmt
->version
.major
= s
[3];
250 fmt
->version
.minor
= (len
>= 5 && s
[4] <= 2)? s
[4] : 0;
253 else if (memcmp(s
, "CSI\1", 4) == 0) {
254 fmt
->category
= index_file
;
256 fmt
->version
.major
= 1, fmt
->version
.minor
= -1;
259 else if (memcmp(s
, "TBI\1", 4) == 0) {
260 fmt
->category
= index_file
;
262 fmt
->version
.major
= -1, fmt
->version
.minor
= -1;
266 else if (len
>= 16 && memcmp(s
, "##fileformat=VCF", 16) == 0) {
267 fmt
->category
= variant_data
;
269 if (len
>= 21 && s
[16] == 'v')
270 parse_version(fmt
, &s
[17], &s
[len
]);
272 fmt
->version
.major
= fmt
->version
.minor
= -1;
275 else if (len
>= 4 && s
[0] == '@' &&
276 (memcmp(s
, "@HD\t", 4) == 0 || memcmp(s
, "@SQ\t", 4) == 0 ||
277 memcmp(s
, "@RG\t", 4) == 0 || memcmp(s
, "@PG\t", 4) == 0)) {
278 fmt
->category
= sequence_data
;
280 // @HD-VN is not guaranteed to be the first tag, but then @HD is
281 // not guaranteed to be present at all...
282 if (len
>= 9 && memcmp(s
, "@HD\tVN:", 7) == 0)
283 parse_version(fmt
, &s
[7], &s
[len
]);
285 fmt
->version
.major
= 1, fmt
->version
.minor
= -1;
288 else if (cmp_nonblank("{\"", s
, &s
[len
]) == 0) {
289 fmt
->category
= unknown_category
;
291 fmt
->version
.major
= fmt
->version
.minor
= -1;
295 // Various possibilities for tab-delimited text:
296 // .crai (gzipped tab-delimited six columns: seqid 5*number)
297 // .bed ([3..12] tab-delimited columns)
298 // .bedpe (>= 10 tab-delimited columns)
299 // .sam (tab-delimited >= 11 columns: seqid number seqid...)
300 // FIXME For now, assume it's SAM
301 fmt
->category
= sequence_data
;
303 fmt
->version
.major
= 1, fmt
->version
.minor
= -1;
307 fmt
->category
= unknown_category
;
308 fmt
->format
= unknown_format
;
309 fmt
->version
.major
= fmt
->version
.minor
= -1;
310 fmt
->compression
= no_compression
;
314 char *hts_format_description(const htsFormat
*format
)
316 kstring_t str
= { 0, 0, NULL
};
318 switch (format
->format
) {
319 case sam
: kputs("SAM", &str
); break;
320 case bam
: kputs("BAM", &str
); break;
321 case cram
: kputs("CRAM", &str
); break;
322 case vcf
: kputs("VCF", &str
); break;
324 if (format
->version
.major
== 1) kputs("Legacy BCF", &str
);
325 else kputs("BCF", &str
);
327 case bai
: kputs("BAI", &str
); break;
328 case crai
: kputs("CRAI", &str
); break;
329 case csi
: kputs("CSI", &str
); break;
330 case tbi
: kputs("Tabix", &str
); break;
331 case json
: kputs("JSON", &str
); break;
332 default: kputs("unknown", &str
); break;
335 if (format
->version
.major
>= 0) {
336 kputs(" version ", &str
);
337 kputw(format
->version
.major
, &str
);
338 if (format
->version
.minor
>= 0) {
340 kputw(format
->version
.minor
, &str
);
344 switch (format
->compression
) {
345 case custom
: kputs(" compressed", &str
); break;
346 case gzip
: kputs(" gzip-compressed", &str
); break;
348 switch (format
->format
) {
353 // These are by definition BGZF, so just use the generic term
354 kputs(" compressed", &str
);
357 kputs(" BGZF-compressed", &str
);
364 switch (format
->category
) {
365 case sequence_data
: kputs(" sequence", &str
); break;
366 case variant_data
: kputs(" variant calling", &str
); break;
367 case index_file
: kputs(" index", &str
); break;
368 case region_list
: kputs(" genomic region", &str
); break;
372 if (format
->compression
== no_compression
)
373 switch (format
->format
) {
379 kputs(" text", &str
);
383 kputs(" data", &str
);
387 kputs(" data", &str
);
389 return ks_release(&str
);
392 htsFile
*hts_open_format(const char *fn
, const char *mode
, const htsFormat
*fmt
)
394 char smode
[102], *cp
, *cp2
, *mode_c
;
397 char fmt_code
= '\0';
399 strncpy(smode
, mode
, 100);
401 if ((cp
= strchr(smode
, ',')))
404 // Migrate format code (b or c) to the end of the smode buffer.
405 for (cp2
= cp
= smode
; *cp
; cp
++) {
418 // Set or reset the format code if opts->format is used
419 if (fmt
&& fmt
->format
!= unknown_format
)
420 *mode_c
= "\0g\0\0b\0c\0\0b\0g\0\0"[fmt
->format
];
422 hfile
= hopen(fn
, smode
);
423 if (hfile
== NULL
) goto error
;
425 fp
= hts_hopen(hfile
, fn
, smode
);
426 if (fp
== NULL
) goto error
;
428 if (fmt
&& fmt
->specific
)
429 if (hts_opt_apply(fp
, fmt
->specific
) != 0)
435 hts_log_error("Failed to open file %s", fn
);
438 hclose_abruptly(hfile
);
443 htsFile
*hts_open(const char *fn
, const char *mode
) {
444 return hts_open_format(fn
, mode
, NULL
);
448 * Splits str into a prefix, delimiter ('\0' or delim), and suffix, writing
449 * the prefix in lowercase into buf and returning a pointer to the suffix.
450 * On return, buf is always NUL-terminated; thus assumes that the "keyword"
451 * prefix should be one of several known values of maximum length buflen-2.
452 * (If delim is not found, returns a pointer to the '\0'.)
455 scan_keyword(const char *str
, char delim
, char *buf
, size_t buflen
)
458 while (*str
&& *str
!= delim
) {
459 if (i
< buflen
-1) buf
[i
++] = tolower_c(*str
);
464 return *str
? str
+1 : str
;
468 * Parses arg and appends it to the option list.
470 * Returns 0 on success;
473 int hts_opt_add(hts_opt
**opts
, const char *c_arg
) {
479 * If you add another string option here, don't forget to also add
480 * it to the case statement in hts_opt_apply.
486 if (!(o
= malloc(sizeof(*o
))))
489 if (!(o
->arg
= strdup(c_arg
))) {
494 if (!(val
= strchr(o
->arg
, '=')))
495 val
= "1"; // assume boolean
499 if (strcmp(o
->arg
, "decode_md") == 0 ||
500 strcmp(o
->arg
, "DECODE_MD") == 0)
501 o
->opt
= CRAM_OPT_DECODE_MD
, o
->val
.i
= atoi(val
);
503 else if (strcmp(o
->arg
, "verbosity") == 0 ||
504 strcmp(o
->arg
, "VERBOSITY") == 0)
505 o
->opt
= CRAM_OPT_VERBOSITY
, o
->val
.i
= atoi(val
);
507 else if (strcmp(o
->arg
, "seqs_per_slice") == 0 ||
508 strcmp(o
->arg
, "SEQS_PER_SLICE") == 0)
509 o
->opt
= CRAM_OPT_SEQS_PER_SLICE
, o
->val
.i
= atoi(val
);
511 else if (strcmp(o
->arg
, "bases_per_slice") == 0 ||
512 strcmp(o
->arg
, "BASES_PER_SLICE") == 0)
513 o
->opt
= CRAM_OPT_BASES_PER_SLICE
, o
->val
.i
= atoi(val
);
515 else if (strcmp(o
->arg
, "slices_per_container") == 0 ||
516 strcmp(o
->arg
, "SLICES_PER_CONTAINER") == 0)
517 o
->opt
= CRAM_OPT_SLICES_PER_CONTAINER
, o
->val
.i
= atoi(val
);
519 else if (strcmp(o
->arg
, "embed_ref") == 0 ||
520 strcmp(o
->arg
, "EMBED_REF") == 0)
521 o
->opt
= CRAM_OPT_EMBED_REF
, o
->val
.i
= atoi(val
);
523 else if (strcmp(o
->arg
, "no_ref") == 0 ||
524 strcmp(o
->arg
, "NO_REF") == 0)
525 o
->opt
= CRAM_OPT_NO_REF
, o
->val
.i
= atoi(val
);
527 else if (strcmp(o
->arg
, "ignore_md5") == 0 ||
528 strcmp(o
->arg
, "IGNORE_MD5") == 0)
529 o
->opt
= CRAM_OPT_IGNORE_MD5
, o
->val
.i
= atoi(val
);
531 else if (strcmp(o
->arg
, "use_bzip2") == 0 ||
532 strcmp(o
->arg
, "USE_BZIP2") == 0)
533 o
->opt
= CRAM_OPT_USE_BZIP2
, o
->val
.i
= atoi(val
);
535 else if (strcmp(o
->arg
, "use_rans") == 0 ||
536 strcmp(o
->arg
, "USE_RANS") == 0)
537 o
->opt
= CRAM_OPT_USE_RANS
, o
->val
.i
= atoi(val
);
539 else if (strcmp(o
->arg
, "use_lzma") == 0 ||
540 strcmp(o
->arg
, "USE_LZMA") == 0)
541 o
->opt
= CRAM_OPT_USE_LZMA
, o
->val
.i
= atoi(val
);
543 else if (strcmp(o
->arg
, "reference") == 0 ||
544 strcmp(o
->arg
, "REFERENCE") == 0)
545 o
->opt
= CRAM_OPT_REFERENCE
, o
->val
.s
= val
;
547 else if (strcmp(o
->arg
, "version") == 0 ||
548 strcmp(o
->arg
, "VERSION") == 0)
549 o
->opt
= CRAM_OPT_VERSION
, o
->val
.s
=val
;
551 else if (strcmp(o
->arg
, "multi_seq_per_slice") == 0 ||
552 strcmp(o
->arg
, "MULTI_SEQ_PER_SLICE") == 0)
553 o
->opt
= CRAM_OPT_MULTI_SEQ_PER_SLICE
, o
->val
.i
= atoi(val
);
555 else if (strcmp(o
->arg
, "nthreads") == 0 ||
556 strcmp(o
->arg
, "NTHREADS") == 0)
557 o
->opt
= HTS_OPT_NTHREADS
, o
->val
.i
= atoi(val
);
559 else if (strcmp(o
->arg
, "cache_size") == 0 ||
560 strcmp(o
->arg
, "CACHE_SIZE") == 0) {
562 o
->opt
= HTS_OPT_CACHE_SIZE
;
563 o
->val
.i
= strtol(val
, &endp
, 0);
564 // NB: Doesn't support floats, eg 1.5g
565 // TODO: extend hts_parse_decimal? See also samtools sort.
567 case 'g': case 'G': o
->val
.i
*= 1024;
568 case 'm': case 'M': o
->val
.i
*= 1024;
569 case 'k': case 'K': o
->val
.i
*= 1024; break;
572 hts_log_error("Unrecognised cache size suffix '%c'", *endp
);
579 else if (strcmp(o
->arg
, "required_fields") == 0 ||
580 strcmp(o
->arg
, "REQUIRED_FIELDS") == 0)
581 o
->opt
= CRAM_OPT_REQUIRED_FIELDS
, o
->val
.i
= strtol(val
, NULL
, 0);
583 else if (strcmp(o
->arg
, "lossy_names") == 0 ||
584 strcmp(o
->arg
, "LOSSY_NAMES") == 0)
585 o
->opt
= CRAM_OPT_LOSSY_NAMES
, o
->val
.i
= strtol(val
, NULL
, 0);
587 else if (strcmp(o
->arg
, "name_prefix") == 0 ||
588 strcmp(o
->arg
, "NAME_PREFIX") == 0)
589 o
->opt
= CRAM_OPT_PREFIX
, o
->val
.s
= val
;
591 else if (strcmp(o
->arg
, "block_size") == 0 ||
592 strcmp(o
->arg
, "BLOCK_SIZE") == 0)
593 o
->opt
= HTS_OPT_BLOCK_SIZE
, o
->val
.i
= strtol(val
, NULL
, 0);
596 hts_log_error("Unknown option '%s'", o
->arg
);
604 // Append; assumes small list.
618 * Applies an hts_opt option list to a given htsFile.
620 * Returns 0 on success
623 int hts_opt_apply(htsFile
*fp
, hts_opt
*opts
) {
624 hts_opt
*last
= NULL
;
626 for (; opts
; opts
= (last
=opts
)->next
) {
628 case CRAM_OPT_REFERENCE
:
629 case CRAM_OPT_VERSION
:
630 case CRAM_OPT_PREFIX
:
631 if (hts_set_opt(fp
, opts
->opt
, opts
->val
.s
) != 0)
635 if (hts_set_opt(fp
, opts
->opt
, opts
->val
.i
) != 0)
645 * Frees an hts_opt list.
647 void hts_opt_free(hts_opt
*opts
) {
648 hts_opt
*last
= NULL
;
650 opts
= (last
=opts
)->next
;
658 * Tokenise options as (key(=value)?,)*(key(=value)?)?
659 * NB: No provision for ',' appearing in the value!
660 * Add backslashing rules?
662 * This could be used as part of a general command line option parser or
663 * as a string concatenated onto the file open mode.
665 * Returns 0 on success
668 int hts_parse_opt_list(htsFormat
*fmt
, const char *str
) {
669 while (str
&& *str
) {
670 const char *str_start
;
674 while (*str
&& *str
== ',')
677 for (str_start
= str
; *str
&& *str
!= ','; str
++);
678 len
= str
- str_start
;
680 // Produce a nul terminated copy of the option
681 strncpy(arg
, str_start
, len
< 8000 ? len
: 8000);
682 arg
[len
< 8000 ? len
: 8000] = '\0';
684 if (hts_opt_add((hts_opt
**)&fmt
->specific
, arg
) != 0)
695 * Accepts a string file format (sam, bam, cram, vcf, bam) optionally
696 * followed by a comma separated list of key=value options and splits
697 * these up into the fields of htsFormat struct.
699 * format is assumed to be already initialised, either to blank
700 * "unknown" values or via previous hts_opt_add calls.
702 * Returns 0 on success
705 int hts_parse_format(htsFormat
*format
, const char *str
) {
707 const char *cp
= scan_keyword(str
, ',', fmt
, sizeof fmt
);
709 format
->version
.minor
= 0; // unknown
710 format
->version
.major
= 0; // unknown
712 if (strcmp(fmt
, "sam") == 0) {
713 format
->category
= sequence_data
;
714 format
->format
= sam
;
715 format
->compression
= no_compression
;;
716 format
->compression_level
= 0;
717 } else if (strcmp(fmt
, "bam") == 0) {
718 format
->category
= sequence_data
;
719 format
->format
= bam
;
720 format
->compression
= bgzf
;
721 format
->compression_level
= -1;
722 } else if (strcmp(fmt
, "cram") == 0) {
723 format
->category
= sequence_data
;
724 format
->format
= cram
;
725 format
->compression
= custom
;
726 format
->compression_level
= -1;
727 } else if (strcmp(fmt
, "vcf") == 0) {
728 format
->category
= variant_data
;
729 format
->format
= vcf
;
730 format
->compression
= no_compression
;;
731 format
->compression_level
= 0;
732 } else if (strcmp(fmt
, "bcf") == 0) {
733 format
->category
= variant_data
;
734 format
->format
= bcf
;
735 format
->compression
= bgzf
;
736 format
->compression_level
= -1;
741 return hts_parse_opt_list(format
, cp
);
746 * Tokenise options as (key(=value)?,)*(key(=value)?)?
747 * NB: No provision for ',' appearing in the value!
748 * Add backslashing rules?
750 * This could be used as part of a general command line option parser or
751 * as a string concatenated onto the file open mode.
753 * Returns 0 on success
756 static int hts_process_opts(htsFile
*fp
, const char *opts
) {
760 if (hts_parse_opt_list(&fmt
, opts
) != 0)
763 if (hts_opt_apply(fp
, fmt
.specific
) != 0) {
764 hts_opt_free(fmt
.specific
);
768 hts_opt_free(fmt
.specific
);
774 htsFile
*hts_hopen(hFILE
*hfile
, const char *fn
, const char *mode
)
776 hFILE
*hfile_orig
= hfile
;
777 htsFile
*fp
= (htsFile
*)calloc(1, sizeof(htsFile
));
778 char simple_mode
[101], *cp
, *opts
;
779 simple_mode
[100] = '\0';
781 if (fp
== NULL
) goto error
;
784 fp
->is_be
= ed_is_big();
786 // Split mode into simple_mode,opts strings
787 if ((cp
= strchr(mode
, ','))) {
788 strncpy(simple_mode
, mode
, cp
-mode
<= 100 ? cp
-mode
: 100);
789 simple_mode
[cp
-mode
] = '\0';
792 strncpy(simple_mode
, mode
, 100);
796 if (strchr(simple_mode
, 'r')) {
797 if (hts_detect_format(hfile
, &fp
->format
) < 0) goto error
;
799 if (fp
->format
.format
== json
) {
800 hFILE
*hfile2
= hopen_json_redirect(hfile
, simple_mode
);
801 if (hfile2
== NULL
) goto error
;
803 // Build fp against the result of the redirection
805 if (hts_detect_format(hfile
, &fp
->format
) < 0) goto error
;
808 else if (strchr(simple_mode
, 'w') || strchr(simple_mode
, 'a')) {
809 htsFormat
*fmt
= &fp
->format
;
812 if (strchr(simple_mode
, 'b')) fmt
->format
= binary_format
;
813 else if (strchr(simple_mode
, 'c')) fmt
->format
= cram
;
814 else fmt
->format
= text_format
;
816 if (strchr(simple_mode
, 'z')) fmt
->compression
= bgzf
;
817 else if (strchr(simple_mode
, 'g')) fmt
->compression
= gzip
;
818 else if (strchr(simple_mode
, 'u')) fmt
->compression
= no_compression
;
820 // No compression mode specified, set to the default for the format
821 switch (fmt
->format
) {
822 case binary_format
: fmt
->compression
= bgzf
; break;
823 case cram
: fmt
->compression
= custom
; break;
824 case text_format
: fmt
->compression
= no_compression
; break;
829 // Fill in category (if determinable; e.g. 'b' could be BAM or BCF)
830 fmt
->category
= format_category(fmt
->format
);
832 fmt
->version
.major
= fmt
->version
.minor
= -1;
833 fmt
->compression_level
= -1;
834 fmt
->specific
= NULL
;
836 else { errno
= EINVAL
; goto error
; }
838 switch (fp
->format
.format
) {
842 fp
->fp
.bgzf
= bgzf_hopen(hfile
, simple_mode
);
843 if (fp
->fp
.bgzf
== NULL
) goto error
;
844 fp
->is_bin
= fp
->is_bgzf
= 1;
848 fp
->fp
.cram
= cram_dopen(hfile
, fn
, simple_mode
);
849 if (fp
->fp
.cram
== NULL
) goto error
;
851 cram_set_option(fp
->fp
.cram
, CRAM_OPT_DECODE_MD
, 1);
858 if (fp
->format
.compression
!= no_compression
) {
859 fp
->fp
.bgzf
= bgzf_hopen(hfile
, simple_mode
);
860 if (fp
->fp
.bgzf
== NULL
) goto error
;
864 fp
->fp
.hfile
= hfile
;
873 hts_process_opts(fp
, opts
);
875 // If redirecting, close the original hFILE now (pedantically we would
876 // instead close it in hts_close(), but this a simplifying optimisation)
877 if (hfile
!= hfile_orig
) hclose_abruptly(hfile_orig
);
882 hts_log_error("Failed to open file %s", fn
);
884 // If redirecting, close the failed redirection hFILE that we have opened
885 if (hfile
!= hfile_orig
) hclose_abruptly(hfile
);
895 int hts_close(htsFile
*fp
)
899 switch (fp
->format
.format
) {
903 ret
= bgzf_close(fp
->fp
.bgzf
);
908 switch (cram_eof(fp
->fp
.cram
)) {
910 hts_log_warning("EOF marker is absent. The input is probably truncated");
912 case 0: /* not at EOF, but may not have wanted all seqs */
913 default: /* case 1, expected EOF */
917 ret
= cram_close(fp
->fp
.cram
);
923 if (fp
->format
.compression
!= no_compression
)
924 ret
= bgzf_close(fp
->fp
.bgzf
);
926 ret
= hclose(fp
->fp
.hfile
);
943 const htsFormat
*hts_get_format(htsFile
*fp
)
945 return fp
? &fp
->format
: NULL
;
948 const char *hts_format_file_extension(const htsFormat
*format
) {
952 switch (format
->format
) {
953 case sam
: return "sam";
954 case bam
: return "bam";
955 case bai
: return "bai";
956 case cram
: return "cram";
957 case crai
: return "crai";
958 case vcf
: return "vcf";
959 case bcf
: return "bcf";
960 case csi
: return "csi";
961 case gzi
: return "gzi";
962 case tbi
: return "tbi";
963 case bed
: return "bed";
968 static hFILE
*hts_hfile(htsFile
*fp
) {
969 switch (fp
->format
.format
) {
970 case binary_format
: // fall through; still valid if bcf?
971 case bam
: return bgzf_hfile(fp
->fp
.bgzf
);
972 case cram
: return cram_hfile(fp
->fp
.cram
);
973 case text_format
: return fp
->fp
.hfile
;
974 case sam
: return fp
->fp
.hfile
;
975 default: return NULL
;
979 int hts_set_opt(htsFile
*fp
, enum hts_fmt_option opt
, ...) {
984 case HTS_OPT_NTHREADS
: {
986 int nthreads
= va_arg(args
, int);
988 return hts_set_threads(fp
, nthreads
);
991 case HTS_OPT_BLOCK_SIZE
: {
992 hFILE
*hf
= hts_hfile(fp
);
996 if (hfile_set_blksize(hf
, va_arg(args
, int)) != 0 && hts_verbose
>= 2)
997 fprintf(stderr
, "[W::%s] Failed to change block size\n", __func__
);
999 } else if (hts_verbose
>= 2)
1000 // To do - implement for vcf/bcf.
1001 fprintf(stderr
, "[W::%s] cannot change block size for this format\n", __func__
);
1007 case HTS_OPT_THREAD_POOL
: {
1008 va_start(args
, opt
);
1009 htsThreadPool
*p
= va_arg(args
, htsThreadPool
*);
1011 return hts_set_thread_pool(fp
, p
);
1014 case HTS_OPT_CACHE_SIZE
: {
1015 va_start(args
, opt
);
1016 int cache_size
= va_arg(args
, int);
1018 hts_set_cache_size(fp
, cache_size
);
1026 if (fp
->format
.format
!= cram
)
1029 va_start(args
, opt
);
1030 r
= cram_set_voption(fp
->fp
.cram
, opt
, args
);
1036 BGZF
*hts_get_bgzfp(htsFile
*fp
);
1038 int hts_set_threads(htsFile
*fp
, int n
)
1040 if (fp
->format
.compression
== bgzf
) {
1041 return bgzf_mt(hts_get_bgzfp(fp
), n
, 256/*unused*/);
1042 } else if (fp
->format
.format
== cram
) {
1043 return hts_set_opt(fp
, CRAM_OPT_NTHREADS
, n
);
1048 int hts_set_thread_pool(htsFile
*fp
, htsThreadPool
*p
) {
1049 if (fp
->format
.compression
== bgzf
) {
1050 return bgzf_thread_pool(hts_get_bgzfp(fp
), p
->pool
, p
->qsize
);
1051 } else if (fp
->format
.format
== cram
) {
1052 return hts_set_opt(fp
, CRAM_OPT_THREAD_POOL
, p
);
1057 void hts_set_cache_size(htsFile
*fp
, int n
)
1059 if (fp
->format
.compression
== bgzf
)
1060 bgzf_set_cache_size(hts_get_bgzfp(fp
), n
);
1063 int hts_set_fai_filename(htsFile
*fp
, const char *fn_aux
)
1067 fp
->fn_aux
= strdup(fn_aux
);
1068 if (fp
->fn_aux
== NULL
) return -1;
1070 else fp
->fn_aux
= NULL
;
1072 if (fp
->format
.format
== cram
)
1073 if (cram_set_option(fp
->fp
.cram
, CRAM_OPT_REFERENCE
, fp
->fn_aux
))
1079 // For VCF/BCF backward sweeper. Not exposing these functions because their
1080 // future is uncertain. Things will probably have to change with hFILE...
1081 BGZF
*hts_get_bgzfp(htsFile
*fp
)
1088 int hts_useek(htsFile
*fp
, long uoffset
, int where
)
1091 return bgzf_useek(fp
->fp
.bgzf
, uoffset
, where
);
1093 return (hseek(fp
->fp
.hfile
, uoffset
, SEEK_SET
) >= 0)? 0 : -1;
1095 long hts_utell(htsFile
*fp
)
1098 return bgzf_utell(fp
->fp
.bgzf
);
1100 return htell(fp
->fp
.hfile
);
1103 int hts_getline(htsFile
*fp
, int delimiter
, kstring_t
*str
)
1106 if (! (delimiter
== KS_SEP_LINE
|| delimiter
== '\n')) {
1107 hts_log_error("Unexpected delimiter %d", delimiter
);
1111 switch (fp
->format
.compression
) {
1112 case no_compression
:
1114 ret
= kgetline(str
, (kgets_func
*) hgets
, fp
->fp
.hfile
);
1115 if (ret
>= 0) ret
= str
->l
;
1116 else if (herrno(fp
->fp
.hfile
)) ret
= -2, errno
= herrno(fp
->fp
.hfile
);
1122 ret
= bgzf_getline(fp
->fp
.bgzf
, '\n', str
);
1133 char **hts_readlist(const char *string
, int is_file
, int *_n
)
1139 BGZF
*fp
= bgzf_open(string
, "r");
1140 if ( !fp
) return NULL
;
1143 str
.s
= 0; str
.l
= str
.m
= 0;
1144 while (bgzf_getline(fp
, '\n', &str
) >= 0)
1146 if (str
.l
== 0) continue;
1148 hts_expand(char*,n
,m
,s
);
1149 s
[n
-1] = strdup(str
.s
);
1156 const char *q
= string
, *p
= string
;
1159 if (*p
== ',' || *p
== 0)
1162 hts_expand(char*,n
,m
,s
);
1163 s
[n
-1] = (char*)calloc(p
- q
+ 1, 1);
1164 strncpy(s
[n
-1], q
, p
- q
);
1171 s
= (char**)realloc(s
, n
* sizeof(char*));
1176 char **hts_readlines(const char *fn
, int *_n
)
1180 BGZF
*fp
= bgzf_open(fn
, "r");
1181 if ( fp
) { // read from file
1183 str
.s
= 0; str
.l
= str
.m
= 0;
1184 while (bgzf_getline(fp
, '\n', &str
) >= 0) {
1185 if (str
.l
== 0) continue;
1188 s
= (char**)realloc(s
, m
* sizeof(char*));
1190 s
[n
++] = strdup(str
.s
);
1193 s
= (char**)realloc(s
, n
* sizeof(char*));
1195 } else if (*fn
== ':') { // read from string
1197 for (q
= p
= fn
+ 1;; ++p
)
1198 if (*p
== ',' || *p
== 0) {
1201 s
= (char**)realloc(s
, m
* sizeof(char*));
1203 s
[n
] = (char*)calloc(p
- q
+ 1, 1);
1204 strncpy(s
[n
++], q
, p
- q
);
1209 s
= (char**)realloc(s
, n
* sizeof(char*));
1214 // DEPRECATED: To be removed in a future HTSlib release
1215 int hts_file_type(const char *fname
)
1217 int len
= strlen(fname
);
1218 if ( !strcasecmp(".vcf.gz",fname
+len
-7) ) return FT_VCF_GZ
;
1219 if ( !strcasecmp(".vcf",fname
+len
-4) ) return FT_VCF
;
1220 if ( !strcasecmp(".bcf",fname
+len
-4) ) return FT_BCF_GZ
;
1221 if ( !strcmp("-",fname
) ) return FT_STDIN
;
1223 hFILE
*f
= hopen(fname
, "r");
1224 if (f
== NULL
) return 0;
1227 if (hts_detect_format(f
, &fmt
) < 0) { hclose_abruptly(f
); return 0; }
1228 if (hclose(f
) < 0) return 0;
1230 switch (fmt
.format
) {
1231 case vcf
: return (fmt
.compression
== no_compression
)? FT_VCF
: FT_VCF_GZ
;
1232 case bcf
: return (fmt
.compression
== no_compression
)? FT_BCF
: FT_BCF_GZ
;
1237 int hts_check_EOF(htsFile
*fp
)
1239 if (fp
->format
.compression
== bgzf
)
1240 return bgzf_check_EOF(hts_get_bgzfp(fp
));
1241 else if (fp
->format
.format
== cram
)
1242 return cram_check_EOF(fp
->fp
.cram
);
1252 #define HTS_MIN_MARKER_DIST 0x10000
1254 // Finds the special meta bin
1255 // ((1<<(3 * n_lvls + 3)) - 1) / 7 + 1
1256 #define META_BIN(idx) ((idx)->n_bins + 1)
1258 #define pair64_lt(a,b) ((a).u < (b).u)
1260 KSORT_INIT(_off
, hts_pair64_t
, pair64_lt
)
1268 KHASH_MAP_INIT_INT(bin
, bins_t
)
1269 typedef khash_t(bin
) bidx_t
;
1276 struct __hts_idx_t
{
1277 int fmt
, min_shift
, n_lvls
, n_bins
;
1283 uint8_t *meta
; // MUST have a terminating NUL on the end
1285 uint32_t last_bin
, save_bin
;
1286 int last_coor
, last_tid
, save_tid
, finished
;
1287 uint64_t last_off
, save_off
;
1288 uint64_t off_beg
, off_end
;
1289 uint64_t n_mapped
, n_unmapped
;
1290 } z
; // keep internal states
1293 static char * idx_format_name(int fmt
) {
1295 case HTS_FMT_CSI
: return "csi";
1296 case HTS_FMT_BAI
: return "bai";
1297 case HTS_FMT_TBI
: return "tbi";
1298 case HTS_FMT_CRAI
: return "crai";
1299 default: return "unknown";
1303 static inline int insert_to_b(bidx_t
*b
, int bin
, uint64_t beg
, uint64_t end
)
1308 k
= kh_put(bin
, b
, bin
, &absent
);
1309 if (absent
< 0) return -1; // Out of memory
1310 l
= &kh_value(b
, k
);
1313 l
->list
= (hts_pair64_t
*)calloc(l
->m
, sizeof(hts_pair64_t
));
1318 } else if (l
->n
== l
->m
) {
1319 uint32_t new_m
= l
->m
? l
->m
<< 1 : 1;
1320 hts_pair64_t
*new_list
= realloc(l
->list
, new_m
* sizeof(hts_pair64_t
));
1321 if (!new_list
) return -1;
1325 l
->list
[l
->n
].u
= beg
;
1326 l
->list
[l
->n
++].v
= end
;
1330 static inline int insert_to_l(lidx_t
*l
, int64_t _beg
, int64_t _end
, uint64_t offset
, int min_shift
)
1333 beg
= _beg
>> min_shift
;
1334 end
= (_end
- 1) >> min_shift
;
1335 if (l
->m
< end
+ 1) {
1336 size_t new_m
= l
->m
* 2 > end
+ 1 ? l
->m
* 2 : end
+ 1;
1337 uint64_t *new_offset
;
1339 new_offset
= (uint64_t*)realloc(l
->offset
, new_m
* sizeof(uint64_t));
1340 if (!new_offset
) return -1;
1342 // fill unused memory with (uint64_t)-1
1343 memset(new_offset
+ l
->m
, 0xff, sizeof(uint64_t) * (new_m
- l
->m
));
1345 l
->offset
= new_offset
;
1347 for (i
= beg
; i
<= end
; ++i
) {
1348 if (l
->offset
[i
] == (uint64_t)-1) l
->offset
[i
] = offset
;
1350 if (l
->n
< end
+ 1) l
->n
= end
+ 1;
1354 hts_idx_t
*hts_idx_init(int n
, int fmt
, uint64_t offset0
, int min_shift
, int n_lvls
)
1357 idx
= (hts_idx_t
*)calloc(1, sizeof(hts_idx_t
));
1358 if (idx
== NULL
) return NULL
;
1360 idx
->min_shift
= min_shift
;
1361 idx
->n_lvls
= n_lvls
;
1362 idx
->n_bins
= ((1<<(3 * n_lvls
+ 3)) - 1) / 7;
1363 idx
->z
.save_bin
= idx
->z
.save_tid
= idx
->z
.last_tid
= idx
->z
.last_bin
= 0xffffffffu
;
1364 idx
->z
.save_off
= idx
->z
.last_off
= idx
->z
.off_beg
= idx
->z
.off_end
= offset0
;
1365 idx
->z
.last_coor
= 0xffffffffu
;
1367 idx
->n
= idx
->m
= n
;
1368 idx
->bidx
= (bidx_t
**)calloc(n
, sizeof(bidx_t
*));
1369 if (idx
->bidx
== NULL
) { free(idx
); return NULL
; }
1370 idx
->lidx
= (lidx_t
*) calloc(n
, sizeof(lidx_t
));
1371 if (idx
->lidx
== NULL
) { free(idx
->bidx
); free(idx
); return NULL
; }
1376 static void update_loff(hts_idx_t
*idx
, int i
, int free_lidx
)
1378 bidx_t
*bidx
= idx
->bidx
[i
];
1379 lidx_t
*lidx
= &idx
->lidx
[i
];
1382 uint64_t offset0
= 0;
1384 k
= kh_get(bin
, bidx
, META_BIN(idx
));
1385 if (k
!= kh_end(bidx
))
1386 offset0
= kh_val(bidx
, k
).list
[0].u
;
1387 for (l
= 0; l
< lidx
->n
&& lidx
->offset
[l
] == (uint64_t)-1; ++l
)
1388 lidx
->offset
[l
] = offset0
;
1390 for (; l
< lidx
->n
; ++l
) // fill missing values
1391 if (lidx
->offset
[l
] == (uint64_t)-1)
1392 lidx
->offset
[l
] = lidx
->offset
[l
-1];
1393 if (bidx
== 0) return;
1394 for (k
= kh_begin(bidx
); k
!= kh_end(bidx
); ++k
) // set loff
1395 if (kh_exist(bidx
, k
))
1397 if ( kh_key(bidx
, k
) < idx
->n_bins
)
1399 int bot_bin
= hts_bin_bot(kh_key(bidx
, k
), idx
->n_lvls
);
1400 // disable linear index if bot_bin out of bounds
1401 kh_val(bidx
, k
).loff
= bot_bin
< lidx
->n
? lidx
->offset
[bot_bin
] : 0;
1404 kh_val(bidx
, k
).loff
= 0;
1408 lidx
->m
= lidx
->n
= 0;
1413 static void compress_binning(hts_idx_t
*idx
, int i
)
1415 bidx_t
*bidx
= idx
->bidx
[i
];
1418 if (bidx
== 0) return;
1419 // merge a bin to its parent if the bin is too small
1420 for (l
= idx
->n_lvls
; l
> 0; --l
) {
1421 unsigned start
= hts_bin_first(l
);
1422 for (k
= kh_begin(bidx
); k
!= kh_end(bidx
); ++k
) {
1424 if (!kh_exist(bidx
, k
) || kh_key(bidx
, k
) >= idx
->n_bins
|| kh_key(bidx
, k
) < start
) continue;
1425 p
= &kh_value(bidx
, k
);
1426 if (l
< idx
->n_lvls
&& p
->n
> 1) ks_introsort(_off
, p
->n
, p
->list
);
1427 if ((p
->list
[p
->n
- 1].v
>>16) - (p
->list
[0].u
>>16) < HTS_MIN_MARKER_DIST
) {
1429 kp
= kh_get(bin
, bidx
, hts_bin_parent(kh_key(bidx
, k
)));
1430 if (kp
== kh_end(bidx
)) continue;
1431 q
= &kh_val(bidx
, kp
);
1432 if (q
->n
+ p
->n
> q
->m
) {
1435 q
->list
= (hts_pair64_t
*)realloc(q
->list
, q
->m
* sizeof(hts_pair64_t
));
1437 memcpy(q
->list
+ q
->n
, p
->list
, p
->n
* sizeof(hts_pair64_t
));
1440 kh_del(bin
, bidx
, k
);
1444 k
= kh_get(bin
, bidx
, 0);
1445 if (k
!= kh_end(bidx
)) ks_introsort(_off
, kh_val(bidx
, k
).n
, kh_val(bidx
, k
).list
);
1446 // merge adjacent chunks that start from the same BGZF block
1447 for (k
= kh_begin(bidx
); k
!= kh_end(bidx
); ++k
) {
1449 if (!kh_exist(bidx
, k
) || kh_key(bidx
, k
) >= idx
->n_bins
) continue;
1450 p
= &kh_value(bidx
, k
);
1451 for (l
= 1, m
= 0; l
< p
->n
; ++l
) {
1452 if (p
->list
[m
].v
>>16 >= p
->list
[l
].u
>>16) {
1453 if (p
->list
[m
].v
< p
->list
[l
].v
) p
->list
[m
].v
= p
->list
[l
].v
;
1454 } else p
->list
[++m
] = p
->list
[l
];
1460 void hts_idx_finish(hts_idx_t
*idx
, uint64_t final_offset
)
1463 if (idx
== NULL
|| idx
->z
.finished
) return; // do not run this function on an empty index or multiple times
1464 if (idx
->z
.save_tid
>= 0) {
1465 insert_to_b(idx
->bidx
[idx
->z
.save_tid
], idx
->z
.save_bin
, idx
->z
.save_off
, final_offset
);
1466 insert_to_b(idx
->bidx
[idx
->z
.save_tid
], META_BIN(idx
), idx
->z
.off_beg
, final_offset
);
1467 insert_to_b(idx
->bidx
[idx
->z
.save_tid
], META_BIN(idx
), idx
->z
.n_mapped
, idx
->z
.n_unmapped
);
1469 for (i
= 0; i
< idx
->n
; ++i
) {
1470 update_loff(idx
, i
, (idx
->fmt
== HTS_FMT_CSI
));
1471 compress_binning(idx
, i
);
1473 idx
->z
.finished
= 1;
1476 int hts_idx_push(hts_idx_t
*idx
, int tid
, int beg
, int end
, uint64_t offset
, int is_mapped
)
1479 int64_t maxpos
= (int64_t) 1 << (idx
->min_shift
+ idx
->n_lvls
* 3);
1480 if (tid
<0) beg
= -1, end
= 0;
1481 if (tid
>= 0 && (beg
> maxpos
|| end
> maxpos
)) {
1484 if (tid
>= idx
->m
) { // enlarge the index
1485 uint32_t new_m
= idx
->m
* 2 > tid
+ 1 ? idx
->m
* 2 : tid
+ 1;
1489 new_bidx
= (bidx_t
**)realloc(idx
->bidx
, new_m
* sizeof(bidx_t
*));
1490 if (!new_bidx
) return -1;
1491 idx
->bidx
= new_bidx
;
1493 new_lidx
= (lidx_t
*) realloc(idx
->lidx
, new_m
* sizeof(lidx_t
));
1494 if (!new_lidx
) return -1;
1495 idx
->lidx
= new_lidx
;
1497 memset(&idx
->bidx
[idx
->m
], 0, (new_m
- idx
->m
) * sizeof(bidx_t
*));
1498 memset(&idx
->lidx
[idx
->m
], 0, (new_m
- idx
->m
) * sizeof(lidx_t
));
1501 if (idx
->n
< tid
+ 1) idx
->n
= tid
+ 1;
1502 if (idx
->z
.finished
) return 0;
1503 if (idx
->z
.last_tid
!= tid
|| (idx
->z
.last_tid
>= 0 && tid
< 0)) { // change of chromosome
1504 if ( tid
>=0 && idx
->n_no_coor
)
1506 hts_log_error("NO_COOR reads not in a single block at the end %d %d", tid
, idx
->z
.last_tid
);
1509 if (tid
>=0 && idx
->bidx
[tid
] != 0)
1511 hts_log_error("Chromosome blocks not continuous");
1514 idx
->z
.last_tid
= tid
;
1515 idx
->z
.last_bin
= 0xffffffffu
;
1516 } else if (tid
>= 0 && idx
->z
.last_coor
> beg
) { // test if positions are out of order
1517 hts_log_error("Unsorted positions on sequence #%d: %d followed by %d", tid
+1, idx
->z
.last_coor
+1, beg
+1);
1522 if (idx
->bidx
[tid
] == 0) idx
->bidx
[tid
] = kh_init(bin
);
1524 // shoehorn [-1,0) (VCF POS=0) into the leftmost bottom-level bin
1525 if (beg
< 0) beg
= 0;
1526 if (end
<= 0) end
= 1;
1527 // idx->z.last_off points to the start of the current record
1528 if (insert_to_l(&idx
->lidx
[tid
], beg
, end
,
1529 idx
->z
.last_off
, idx
->min_shift
) < 0) return -1;
1532 else idx
->n_no_coor
++;
1533 bin
= hts_reg2bin(beg
, end
, idx
->min_shift
, idx
->n_lvls
);
1534 if ((int)idx
->z
.last_bin
!= bin
) { // then possibly write the binning index
1535 if (idx
->z
.save_bin
!= 0xffffffffu
) { // save_bin==0xffffffffu only happens to the first record
1536 if (insert_to_b(idx
->bidx
[idx
->z
.save_tid
], idx
->z
.save_bin
,
1537 idx
->z
.save_off
, idx
->z
.last_off
) < 0) return -1;
1539 if (idx
->z
.last_bin
== 0xffffffffu
&& idx
->z
.save_bin
!= 0xffffffffu
) { // change of chr; keep meta information
1540 idx
->z
.off_end
= idx
->z
.last_off
;
1541 if (insert_to_b(idx
->bidx
[idx
->z
.save_tid
], META_BIN(idx
),
1542 idx
->z
.off_beg
, idx
->z
.off_end
) < 0) return -1;
1543 if (insert_to_b(idx
->bidx
[idx
->z
.save_tid
], META_BIN(idx
),
1544 idx
->z
.n_mapped
, idx
->z
.n_unmapped
) < 0) return -1;
1545 idx
->z
.n_mapped
= idx
->z
.n_unmapped
= 0;
1546 idx
->z
.off_beg
= idx
->z
.off_end
;
1548 idx
->z
.save_off
= idx
->z
.last_off
;
1549 idx
->z
.save_bin
= idx
->z
.last_bin
= bin
;
1550 idx
->z
.save_tid
= tid
;
1552 if (is_mapped
) ++idx
->z
.n_mapped
;
1553 else ++idx
->z
.n_unmapped
;
1554 idx
->z
.last_off
= offset
;
1555 idx
->z
.last_coor
= beg
;
1559 int64_t max
= end
> beg
? end
: beg
, s
= 1 << 14;
1566 if (idx
->fmt
== HTS_FMT_CSI
) {
1567 hts_log_error("Region %d..%d cannot be stored in a csi index "
1568 "with min_shift = %d, n_lvls = %d. Try using "
1569 "min_shift = 14, n_lvls >= %d",
1571 idx
->min_shift
, idx
->n_lvls
,
1574 hts_log_error("Region %d..%d cannot be stored in a %s index. "
1575 "Try using a csi index with min_shift = 14, "
1577 beg
, end
, idx_format_name(idx
->fmt
),
1585 void hts_idx_destroy(hts_idx_t
*idx
)
1589 if (idx
== 0) return;
1591 // For HTS_FMT_CRAI, idx actually points to a different type -- see sam.c
1592 if (idx
->fmt
== HTS_FMT_CRAI
) {
1593 hts_cram_idx_t
*cidx
= (hts_cram_idx_t
*) idx
;
1594 cram_index_free(cidx
->cram
);
1599 for (i
= 0; i
< idx
->m
; ++i
) {
1600 bidx_t
*bidx
= idx
->bidx
[i
];
1601 free(idx
->lidx
[i
].offset
);
1602 if (bidx
== 0) continue;
1603 for (k
= kh_begin(bidx
); k
!= kh_end(bidx
); ++k
)
1604 if (kh_exist(bidx
, k
))
1605 free(kh_value(bidx
, k
).list
);
1606 kh_destroy(bin
, bidx
);
1608 free(idx
->bidx
); free(idx
->lidx
); free(idx
->meta
);
1612 // The optimizer eliminates these ed_is_big() calls; still it would be good to
1613 // TODO Determine endianness at configure- or compile-time
1615 static inline ssize_t HTS_RESULT_USED
idx_write_int32(BGZF
*fp
, int32_t x
)
1617 if (ed_is_big()) x
= ed_swap_4(x
);
1618 return bgzf_write(fp
, &x
, sizeof x
);
1621 static inline ssize_t HTS_RESULT_USED
idx_write_uint32(BGZF
*fp
, uint32_t x
)
1623 if (ed_is_big()) x
= ed_swap_4(x
);
1624 return bgzf_write(fp
, &x
, sizeof x
);
1627 static inline ssize_t HTS_RESULT_USED
idx_write_uint64(BGZF
*fp
, uint64_t x
)
1629 if (ed_is_big()) x
= ed_swap_8(x
);
1630 return bgzf_write(fp
, &x
, sizeof x
);
1633 static inline void swap_bins(bins_t
*p
)
1636 for (i
= 0; i
< p
->n
; ++i
) {
1637 ed_swap_8p(&p
->list
[i
].u
);
1638 ed_swap_8p(&p
->list
[i
].v
);
1642 static int hts_idx_save_core(const hts_idx_t
*idx
, BGZF
*fp
, int fmt
)
1646 #define check(ret) if ((ret) < 0) return -1
1648 check(idx_write_int32(fp
, idx
->n
));
1649 if (fmt
== HTS_FMT_TBI
&& idx
->l_meta
)
1650 check(bgzf_write(fp
, idx
->meta
, idx
->l_meta
));
1652 for (i
= 0; i
< idx
->n
; ++i
) {
1654 bidx_t
*bidx
= idx
->bidx
[i
];
1655 lidx_t
*lidx
= &idx
->lidx
[i
];
1656 // write binning index
1657 check(idx_write_int32(fp
, bidx
? kh_size(bidx
) : 0));
1659 for (k
= kh_begin(bidx
); k
!= kh_end(bidx
); ++k
)
1660 if (kh_exist(bidx
, k
)) {
1661 bins_t
*p
= &kh_value(bidx
, k
);
1662 check(idx_write_uint32(fp
, kh_key(bidx
, k
)));
1663 if (fmt
== HTS_FMT_CSI
) check(idx_write_uint64(fp
, p
->loff
));
1664 //int j;for(j=0;j<p->n;++j)fprintf(stderr,"%d,%llx,%d,%llx:%llx\n",kh_key(bidx,k),kh_val(bidx, k).loff,j,p->list[j].u,p->list[j].v);
1665 check(idx_write_int32(fp
, p
->n
));
1666 for (j
= 0; j
< p
->n
; ++j
) {
1667 check(idx_write_uint64(fp
, p
->list
[j
].u
));
1668 check(idx_write_uint64(fp
, p
->list
[j
].v
));
1672 // write linear index
1673 if (fmt
!= HTS_FMT_CSI
) {
1674 check(idx_write_int32(fp
, lidx
->n
));
1675 for (j
= 0; j
< lidx
->n
; ++j
)
1676 check(idx_write_uint64(fp
, lidx
->offset
[j
]));
1680 check(idx_write_uint64(fp
, idx
->n_no_coor
));
1685 int hts_idx_save(const hts_idx_t
*idx
, const char *fn
, int fmt
)
1688 char *fnidx
= (char*)calloc(1, strlen(fn
) + 5);
1689 if (fnidx
== NULL
) return -1;
1693 case HTS_FMT_BAI
: strcat(fnidx
, ".bai"); break;
1694 case HTS_FMT_CSI
: strcat(fnidx
, ".csi"); break;
1695 case HTS_FMT_TBI
: strcat(fnidx
, ".tbi"); break;
1699 ret
= hts_idx_save_as(idx
, fn
, fnidx
, fmt
);
1706 int hts_idx_save_as(const hts_idx_t
*idx
, const char *fn
, const char *fnidx
, int fmt
)
1710 #define check(ret) if ((ret) < 0) goto fail
1712 if (fnidx
== NULL
) return hts_idx_save(idx
, fn
, fmt
);
1714 fp
= bgzf_open(fnidx
, (fmt
== HTS_FMT_BAI
)? "wu" : "w");
1715 if (fp
== NULL
) return -1;
1717 if (fmt
== HTS_FMT_CSI
) {
1718 check(bgzf_write(fp
, "CSI\1", 4));
1719 check(idx_write_int32(fp
, idx
->min_shift
));
1720 check(idx_write_int32(fp
, idx
->n_lvls
));
1721 check(idx_write_uint32(fp
, idx
->l_meta
));
1722 if (idx
->l_meta
) check(bgzf_write(fp
, idx
->meta
, idx
->l_meta
));
1723 } else if (fmt
== HTS_FMT_TBI
) {
1724 check(bgzf_write(fp
, "TBI\1", 4));
1725 } else if (fmt
== HTS_FMT_BAI
) {
1726 check(bgzf_write(fp
, "BAI\1", 4));
1729 check(hts_idx_save_core(idx
, fp
, fmt
));
1731 return bgzf_close(fp
);
1739 static int hts_idx_load_core(hts_idx_t
*idx
, BGZF
*fp
, int fmt
)
1741 int32_t i
, n
, is_be
;
1742 is_be
= ed_is_big();
1743 if (idx
== NULL
) return -4;
1744 for (i
= 0; i
< idx
->n
; ++i
) {
1746 lidx_t
*l
= &idx
->lidx
[i
];
1750 h
= idx
->bidx
[i
] = kh_init(bin
);
1751 if (bgzf_read(fp
, &n
, 4) != 4) return -1;
1752 if (is_be
) ed_swap_4p(&n
);
1753 for (j
= 0; j
< n
; ++j
) {
1755 if (bgzf_read(fp
, &key
, 4) != 4) return -1;
1756 if (is_be
) ed_swap_4p(&key
);
1757 k
= kh_put(bin
, h
, key
, &absent
);
1758 if (absent
<= 0) return -3; // Duplicate bin number
1760 if (fmt
== HTS_FMT_CSI
) {
1761 if (bgzf_read(fp
, &p
->loff
, 8) != 8) return -1;
1762 if (is_be
) ed_swap_8p(&p
->loff
);
1764 if (bgzf_read(fp
, &p
->n
, 4) != 4) return -1;
1765 if (is_be
) ed_swap_4p(&p
->n
);
1767 p
->list
= (hts_pair64_t
*)malloc(p
->m
* sizeof(hts_pair64_t
));
1768 if (p
->list
== NULL
) return -2;
1769 if (bgzf_read(fp
, p
->list
, p
->n
<<4) != p
->n
<<4) return -1;
1770 if (is_be
) swap_bins(p
);
1772 if (fmt
!= HTS_FMT_CSI
) { // load linear index
1774 if (bgzf_read(fp
, &l
->n
, 4) != 4) return -1;
1775 if (is_be
) ed_swap_4p(&l
->n
);
1777 l
->offset
= (uint64_t*)malloc(l
->n
* sizeof(uint64_t));
1778 if (l
->offset
== NULL
) return -2;
1779 if (bgzf_read(fp
, l
->offset
, l
->n
<< 3) != l
->n
<< 3) return -1;
1780 if (is_be
) for (j
= 0; j
< l
->n
; ++j
) ed_swap_8p(&l
->offset
[j
]);
1781 for (j
= 1; j
< l
->n
; ++j
) // fill missing values; may happen given older samtools and tabix
1782 if (l
->offset
[j
] == 0) l
->offset
[j
] = l
->offset
[j
-1];
1783 update_loff(idx
, i
, 1);
1786 if (bgzf_read(fp
, &idx
->n_no_coor
, 8) != 8) idx
->n_no_coor
= 0;
1787 if (is_be
) ed_swap_8p(&idx
->n_no_coor
);
1791 static hts_idx_t
*hts_idx_load_local(const char *fn
)
1795 hts_idx_t
*idx
= NULL
;
1796 uint8_t *meta
= NULL
;
1797 BGZF
*fp
= bgzf_open(fn
, "r");
1798 if (fp
== NULL
) return NULL
;
1799 is_be
= ed_is_big();
1800 if (bgzf_read(fp
, magic
, 4) != 4) goto fail
;
1802 if (memcmp(magic
, "CSI\1", 4) == 0) {
1804 if (bgzf_read(fp
, x
, 12) != 12) goto fail
;
1805 if (is_be
) for (i
= 0; i
< 3; ++i
) ed_swap_4p(&x
[i
]);
1807 if (SIZE_MAX
- x
[2] < 1) goto fail
; // Prevent possible overflow
1808 if ((meta
= (uint8_t*)malloc((size_t) x
[2] + 1)) == NULL
) goto fail
;
1809 if (bgzf_read(fp
, meta
, x
[2]) != x
[2]) goto fail
;
1810 // Prevent possible strlen past the end in tbx_index_load2
1813 if (bgzf_read(fp
, &n
, 4) != 4) goto fail
;
1814 if (is_be
) ed_swap_4p(&n
);
1815 if ((idx
= hts_idx_init(n
, HTS_FMT_CSI
, 0, x
[0], x
[1])) == NULL
) goto fail
;
1819 if (hts_idx_load_core(idx
, fp
, HTS_FMT_CSI
) < 0) goto fail
;
1821 else if (memcmp(magic
, "TBI\1", 4) == 0) {
1825 if (bgzf_read(fp
, x
, sizeof(x
)) != sizeof(x
)) goto fail
;
1826 n
= le_to_u32(&x
[0]); // location of n_ref
1827 if ((idx
= hts_idx_init(n
, HTS_FMT_TBI
, 0, 14, 5)) == NULL
) goto fail
;
1828 n
= le_to_u32(&x
[7*4]); // location of l_nm
1829 if (n
> UINT32_MAX
- 29) goto fail
; // Prevent possible overflow
1830 idx
->l_meta
= 28 + n
;
1831 if ((idx
->meta
= (uint8_t*)malloc(idx
->l_meta
+ 1)) == NULL
) goto fail
;
1832 // copy format, col_seq, col_beg, col_end, meta, skip, l_nm
1833 // N.B. left in little-endian byte order.
1834 memcpy(idx
->meta
, &x
[1*4], 28);
1835 // Read in sequence names.
1836 if (bgzf_read(fp
, idx
->meta
+ 28, n
) != n
) goto fail
;
1837 // Prevent possible strlen past the end in tbx_index_load2
1838 idx
->meta
[idx
->l_meta
] = '\0';
1839 if (hts_idx_load_core(idx
, fp
, HTS_FMT_TBI
) < 0) goto fail
;
1841 else if (memcmp(magic
, "BAI\1", 4) == 0) {
1843 if (bgzf_read(fp
, &n
, 4) != 4) goto fail
;
1844 if (is_be
) ed_swap_4p(&n
);
1845 idx
= hts_idx_init(n
, HTS_FMT_BAI
, 0, 14, 5);
1846 if (hts_idx_load_core(idx
, fp
, HTS_FMT_BAI
) < 0) goto fail
;
1848 else { errno
= EINVAL
; goto fail
; }
1855 hts_idx_destroy(idx
);
1860 int hts_idx_set_meta(hts_idx_t
*idx
, uint32_t l_meta
, uint8_t *meta
,
1863 uint8_t *new_meta
= meta
;
1866 if (l
> SIZE_MAX
- 1) {
1870 new_meta
= malloc(l
+ 1);
1871 if (!new_meta
) return -1;
1872 memcpy(new_meta
, meta
, l
);
1873 // Prevent possible strlen past the end in tbx_index_load2
1876 if (idx
->meta
) free(idx
->meta
);
1877 idx
->l_meta
= l_meta
;
1878 idx
->meta
= new_meta
;
1882 uint8_t *hts_idx_get_meta(hts_idx_t
*idx
, uint32_t *l_meta
)
1884 *l_meta
= idx
->l_meta
;
1888 const char **hts_idx_seqnames(const hts_idx_t
*idx
, int *n
, hts_id2name_f getid
, void *hdr
)
1897 const char **names
= (const char**) calloc(idx
->n
,sizeof(const char*));
1898 for (i
=0; i
<idx
->n
; i
++)
1900 bidx_t
*bidx
= idx
->bidx
[i
];
1901 if ( !bidx
) continue;
1902 names
[tid
++] = getid(hdr
,i
);
1908 int hts_idx_get_stat(const hts_idx_t
* idx
, int tid
, uint64_t* mapped
, uint64_t* unmapped
)
1910 if ( idx
->fmt
== HTS_FMT_CRAI
) {
1911 *mapped
= 0; *unmapped
= 0;
1915 bidx_t
*h
= idx
->bidx
[tid
];
1916 khint_t k
= kh_get(bin
, h
, META_BIN(idx
));
1917 if (k
!= kh_end(h
)) {
1918 *mapped
= kh_val(h
, k
).list
[1].u
;
1919 *unmapped
= kh_val(h
, k
).list
[1].v
;
1922 *mapped
= 0; *unmapped
= 0;
1927 uint64_t hts_idx_get_n_no_coor(const hts_idx_t
* idx
)
1929 return idx
->n_no_coor
;
1936 static inline int reg2bins(int64_t beg
, int64_t end
, hts_itr_t
*itr
, int min_shift
, int n_lvls
)
1938 int l
, t
, s
= min_shift
+ (n_lvls
<<1) + n_lvls
;
1939 if (beg
>= end
) return 0;
1940 if (end
>= 1LL<<s
) end
= 1LL<<s
;
1941 for (--end
, l
= 0, t
= 0; l
<= n_lvls
; s
-= 3, t
+= 1<<((l
<<1)+l
), ++l
) {
1943 b
= t
+ (beg
>>s
); e
= t
+ (end
>>s
); n
= e
- b
+ 1;
1944 if (itr
->bins
.n
+ n
> itr
->bins
.m
) {
1945 itr
->bins
.m
= itr
->bins
.n
+ n
;
1946 kroundup32(itr
->bins
.m
);
1947 itr
->bins
.a
= (int*)realloc(itr
->bins
.a
, sizeof(int) * itr
->bins
.m
);
1949 for (i
= b
; i
<= e
; ++i
) itr
->bins
.a
[itr
->bins
.n
++] = i
;
1954 hts_itr_t
*hts_itr_query(const hts_idx_t
*idx
, int tid
, int beg
, int end
, hts_readrec_func
*readrec
)
1956 int i
, n_off
, l
, bin
;
1960 uint64_t min_off
, max_off
;
1961 hts_itr_t
*iter
= 0;
1964 uint64_t off0
= (uint64_t)-1;
1968 // Find the smallest offset, note that sequence ids may not be ordered sequentially
1969 for (i
=0; i
<idx
->n
; i
++)
1971 bidx
= idx
->bidx
[i
];
1972 k
= kh_get(bin
, bidx
, META_BIN(idx
));
1973 if (k
== kh_end(bidx
)) continue;
1974 if ( off0
> kh_val(bidx
, k
).list
[0].u
) off0
= kh_val(bidx
, k
).list
[0].u
;
1976 if ( off0
==(uint64_t)-1 && idx
->n_no_coor
) off0
= 0; // only no-coor reads in this bam
1979 case HTS_IDX_NOCOOR
:
1980 /* No-coor reads sort after all of the mapped reads. The position
1981 is not stored in the index itself, so need to find the end
1982 offset for the last mapped read. A loop is needed here in
1983 case references at the end of the file have no mapped reads,
1984 or sequence ids are not ordered sequentially.
1985 See issue samtools#568 and commits b2aab8, 60c22d and cc207d. */
1986 for (i
= 0; i
< idx
->n
; i
++) {
1987 bidx
= idx
->bidx
[i
];
1988 k
= kh_get(bin
, bidx
, META_BIN(idx
));
1989 if (k
!= kh_end(bidx
)) {
1990 if (off0
==(uint64_t)-1 || off0
< kh_val(bidx
, k
).list
[0].v
) {
1991 off0
= kh_val(bidx
, k
).list
[0].v
;
1995 if ( off0
==(uint64_t)-1 && idx
->n_no_coor
) off0
= 0; // only no-coor reads in this bam
2010 if (off0
!= (uint64_t)-1) {
2011 iter
= (hts_itr_t
*)calloc(1, sizeof(hts_itr_t
));
2012 iter
->read_rest
= 1;
2013 iter
->finished
= finished0
;
2014 iter
->curr_off
= off0
;
2015 iter
->readrec
= readrec
;
2020 if (beg
< 0) beg
= 0;
2021 if (end
< beg
) return 0;
2022 if (tid
>= idx
->n
|| (bidx
= idx
->bidx
[tid
]) == NULL
) return 0;
2024 iter
= (hts_itr_t
*)calloc(1, sizeof(hts_itr_t
));
2025 iter
->tid
= tid
, iter
->beg
= beg
, iter
->end
= end
; iter
->i
= -1;
2026 iter
->readrec
= readrec
;
2028 if ( !kh_size(bidx
) ) { iter
->finished
= 1; return iter
; }
2031 bin
= hts_bin_first(idx
->n_lvls
) + (beg
>>idx
->min_shift
);
2034 k
= kh_get(bin
, bidx
, bin
);
2035 if (k
!= kh_end(bidx
)) break;
2036 first
= (hts_bin_parent(bin
)<<3) + 1;
2037 if (bin
> first
) --bin
;
2038 else bin
= hts_bin_parent(bin
);
2040 if (bin
== 0) k
= kh_get(bin
, bidx
, bin
);
2041 min_off
= k
!= kh_end(bidx
)? kh_val(bidx
, k
).loff
: 0;
2043 // compute max_off: a virtual offset from a bin to the right of end
2044 bin
= hts_bin_first(idx
->n_lvls
) + ((end
-1) >> idx
->min_shift
) + 1;
2045 if (bin
>= idx
->n_bins
) bin
= 0;
2047 // search for an extant bin by moving right, but moving up to the
2048 // parent whenever we get to a first child (which also covers falling
2049 // off the RHS, which wraps around and immediately goes up to bin 0)
2050 while (bin
% 8 == 1) bin
= hts_bin_parent(bin
);
2051 if (bin
== 0) { max_off
= (uint64_t)-1; break; }
2052 k
= kh_get(bin
, bidx
, bin
);
2053 if (k
!= kh_end(bidx
) && kh_val(bidx
, k
).n
> 0) { max_off
= kh_val(bidx
, k
).list
[0].u
; break; }
2058 reg2bins(beg
, end
, iter
, idx
->min_shift
, idx
->n_lvls
);
2059 for (i
= n_off
= 0; i
< iter
->bins
.n
; ++i
)
2060 if ((k
= kh_get(bin
, bidx
, iter
->bins
.a
[i
])) != kh_end(bidx
))
2061 n_off
+= kh_value(bidx
, k
).n
;
2063 // No overlapping bins means the iterator has already finished.
2067 off
= (hts_pair64_t
*)calloc(n_off
, sizeof(hts_pair64_t
));
2068 for (i
= n_off
= 0; i
< iter
->bins
.n
; ++i
) {
2069 if ((k
= kh_get(bin
, bidx
, iter
->bins
.a
[i
])) != kh_end(bidx
)) {
2071 bins_t
*p
= &kh_value(bidx
, k
);
2072 for (j
= 0; j
< p
->n
; ++j
)
2073 if (p
->list
[j
].v
> min_off
&& p
->list
[j
].u
< max_off
)
2074 off
[n_off
++] = p
->list
[j
];
2082 ks_introsort(_off
, n_off
, off
);
2083 // resolve completely contained adjacent blocks
2084 for (i
= 1, l
= 0; i
< n_off
; ++i
)
2085 if (off
[l
].v
< off
[i
].v
) off
[++l
] = off
[i
];
2087 // resolve overlaps between adjacent blocks; this may happen due to the merge in indexing
2088 for (i
= 1; i
< n_off
; ++i
)
2089 if (off
[i
-1].v
>= off
[i
].u
) off
[i
-1].v
= off
[i
].u
;
2090 // merge adjacent blocks
2091 for (i
= 1, l
= 0; i
< n_off
; ++i
) {
2092 if (off
[l
].v
>>16 == off
[i
].u
>>16) off
[l
].v
= off
[i
].v
;
2093 else off
[++l
] = off
[i
];
2096 iter
->n_off
= n_off
; iter
->off
= off
;
2100 void hts_itr_destroy(hts_itr_t
*iter
)
2102 if (iter
) { free(iter
->off
); free(iter
->bins
.a
); free(iter
); }
2105 static inline long long push_digit(long long i
, char c
)
2107 // ensure subtraction occurs first, avoiding overflow for >= MAX-48 or so
2108 int digit
= c
- '0';
2109 return 10 * i
+ digit
;
2112 long long hts_parse_decimal(const char *str
, char **strend
, int flags
)
2115 int decimals
= 0, e
= 0, lost
= 0;
2116 char sign
= '+', esign
= '+';
2119 while (isspace_c(*str
)) str
++;
2122 if (*s
== '+' || *s
== '-') sign
= *s
++;
2124 if (isdigit_c(*s
)) n
= push_digit(n
, *s
++);
2125 else if (*s
== ',' && (flags
& HTS_PARSE_THOUSANDS_SEP
)) s
++;
2130 while (isdigit_c(*s
)) decimals
++, n
= push_digit(n
, *s
++);
2133 if (*s
== 'E' || *s
== 'e') {
2135 if (*s
== '+' || *s
== '-') esign
= *s
++;
2136 while (isdigit_c(*s
)) e
= push_digit(e
, *s
++);
2137 if (esign
== '-') e
= -e
;
2141 while (e
> 0) n
*= 10, e
--;
2142 while (e
< 0) lost
+= n
% 10, n
/= 10, e
++;
2145 hts_log_warning("Discarding fractional part of %.*s", (int)(s
- str
), str
);
2149 *strend
= (char *)s
;
2151 hts_log_warning("Ignoring unknown characters after %.*s[%s]", (int)(s
- str
), str
, s
);
2154 return (sign
== '+')? n
: -n
;
2157 const char *hts_parse_reg(const char *s
, int *beg
, int *end
)
2160 const char *colon
= strrchr(s
, ':');
2161 if (colon
== NULL
) {
2162 *beg
= 0; *end
= INT_MAX
;
2163 return s
+ strlen(s
);
2166 *beg
= hts_parse_decimal(colon
+1, &hyphen
, HTS_PARSE_THOUSANDS_SEP
) - 1;
2167 if (*beg
< 0) *beg
= 0;
2169 if (*hyphen
== '\0') *end
= INT_MAX
;
2170 else if (*hyphen
== '-') *end
= hts_parse_decimal(hyphen
+1, NULL
, HTS_PARSE_THOUSANDS_SEP
);
2173 if (*beg
>= *end
) return NULL
;
2177 hts_itr_t
*hts_itr_querys(const hts_idx_t
*idx
, const char *reg
, hts_name2id_f getid
, void *hdr
, hts_itr_query_func
*itr_query
, hts_readrec_func
*readrec
)
2182 if (strcmp(reg
, ".") == 0)
2183 return itr_query(idx
, HTS_IDX_START
, 0, 0, readrec
);
2184 else if (strcmp(reg
, "*") == 0)
2185 return itr_query(idx
, HTS_IDX_NOCOOR
, 0, 0, readrec
);
2187 q
= hts_parse_reg(reg
, &beg
, &end
);
2189 char tmp_a
[1024], *tmp
= tmp_a
;
2190 if (q
- reg
+ 1 > 1024)
2191 if (!(tmp
= malloc(q
- reg
+ 1)))
2193 strncpy(tmp
, reg
, q
- reg
);
2195 tid
= getid(hdr
, tmp
);
2200 // not parsable as a region, but possibly a sequence named "foo:a"
2201 tid
= getid(hdr
, reg
);
2202 beg
= 0; end
= INT_MAX
;
2205 if (tid
< 0) return NULL
;
2206 return itr_query(idx
, tid
, beg
, end
, readrec
);
2209 int hts_itr_next(BGZF
*fp
, hts_itr_t
*iter
, void *r
, void *data
)
2211 int ret
, tid
, beg
, end
;
2212 if (iter
== NULL
|| iter
->finished
) return -1;
2213 if (iter
->read_rest
) {
2214 if (iter
->curr_off
) { // seek to the start
2215 if (bgzf_seek(fp
, iter
->curr_off
, SEEK_SET
) < 0) return -1;
2216 iter
->curr_off
= 0; // only seek once
2218 ret
= iter
->readrec(fp
, data
, r
, &tid
, &beg
, &end
);
2219 if (ret
< 0) iter
->finished
= 1;
2220 iter
->curr_tid
= tid
;
2221 iter
->curr_beg
= beg
;
2222 iter
->curr_end
= end
;
2225 // A NULL iter->off should always be accompanied by iter->finished.
2226 assert(iter
->off
!= NULL
);
2228 if (iter
->curr_off
== 0 || iter
->curr_off
>= iter
->off
[iter
->i
].v
) { // then jump to the next chunk
2229 if (iter
->i
== iter
->n_off
- 1) { ret
= -1; break; } // no more chunks
2230 if (iter
->i
< 0 || iter
->off
[iter
->i
].v
!= iter
->off
[iter
->i
+1].u
) { // not adjacent chunks; then seek
2231 if (bgzf_seek(fp
, iter
->off
[iter
->i
+1].u
, SEEK_SET
) < 0) return -1;
2232 iter
->curr_off
= bgzf_tell(fp
);
2236 if ((ret
= iter
->readrec(fp
, data
, r
, &tid
, &beg
, &end
)) >= 0) {
2237 iter
->curr_off
= bgzf_tell(fp
);
2238 if (tid
!= iter
->tid
|| beg
>= iter
->end
) { // no need to proceed
2240 } else if (end
> iter
->beg
&& iter
->end
> beg
) {
2241 iter
->curr_tid
= tid
;
2242 iter
->curr_beg
= beg
;
2243 iter
->curr_end
= end
;
2246 } else break; // end of file or error
2252 /**********************
2253 *** Retrieve index ***
2254 **********************/
2256 static char *test_and_fetch(const char *fn
)
2258 if (hisremote(fn
)) {
2259 const int buf_size
= 1 * 1024 * 1024;
2265 for (p
= fn
+ strlen(fn
) - 1; p
>= fn
; --p
)
2266 if (*p
== '/') break;
2267 ++p
; // p now points to the local file name
2268 // Attempt to open local file first
2269 if ((fp
= fopen((char*)p
, "rb")) != 0)
2274 // Attempt to open remote file. Stay quiet on failure, it is OK to fail when trying first .csi then .tbi index.
2275 if ((fp_remote
= hopen(fn
, "r")) == 0) return 0;
2276 if ((fp
= fopen(p
, "w")) == 0) {
2277 hts_log_error("Failed to create file %s in the working directory", p
);
2278 hclose_abruptly(fp_remote
);
2281 hts_log_info("Downloading file %s to local directory", fn
);
2282 buf
= (uint8_t*)calloc(buf_size
, 1);
2283 while ((l
= hread(fp_remote
, buf
, buf_size
)) > 0) fwrite(buf
, 1, l
, fp
);
2286 if (hclose(fp_remote
) != 0) {
2287 hts_log_error("Failed to close remote file %s", fn
);
2292 if ((fp
= hopen(fn
, "r")) == 0) return 0;
2293 hclose_abruptly(fp
);
2298 char *hts_idx_getfn(const char *fn
, const char *ext
)
2302 l_fn
= strlen(fn
); l_ext
= strlen(ext
);
2303 fnidx
= (char*)calloc(l_fn
+ l_ext
+ 1, 1);
2304 strcpy(fnidx
, fn
); strcpy(fnidx
+ l_fn
, ext
);
2305 if ((ret
= test_and_fetch(fnidx
)) == 0) {
2306 for (i
= l_fn
- 1; i
> 0; --i
)
2307 if (fnidx
[i
] == '.') break;
2308 strcpy(fnidx
+ i
, ext
);
2309 ret
= test_and_fetch(fnidx
);
2316 memmove(fnidx
, ret
, l_fn
+ 1);
2320 hts_idx_t
*hts_idx_load(const char *fn
, int fmt
)
2324 fnidx
= hts_idx_getfn(fn
, ".csi");
2325 if (! fnidx
) fnidx
= hts_idx_getfn(fn
, fmt
== HTS_FMT_BAI
? ".bai" : ".tbi");
2326 if (fnidx
== 0) return 0;
2328 idx
= hts_idx_load2(fn
, fnidx
);
2333 hts_idx_t
*hts_idx_load2(const char *fn
, const char *fnidx
)
2335 // Check that the index file is up to date, the main file might have changed
2336 struct stat stat_idx
,stat_main
;
2337 if ( !stat(fn
, &stat_main
) && !stat(fnidx
, &stat_idx
) )
2339 if ( stat_idx
.st_mtime
< stat_main
.st_mtime
)
2340 hts_log_warning("The index file is older than the data file: %s", fnidx
);
2343 return hts_idx_load_local(fnidx
);
2348 /**********************
2350 **********************/
2352 /* For use with hts_expand macros *only* */
2353 size_t hts_realloc_or_die(size_t n
, size_t m
, size_t m_sz
, size_t size
,
2354 int clear
, void **ptr
, const char *func
) {
2355 /* If new_m and size are both below this limit, multiplying them
2356 together can't overflow */
2357 const size_t safe
= (size_t) 1 << (sizeof(size_t) * 4);
2359 size_t bytes
, new_m
;
2362 kroundup_size_t(new_m
);
2364 bytes
= size
* new_m
;
2366 /* Check for overflow. Both ensure that new_m will fit in m (we make the
2367 pessimistic assumption that m is signed), and that bytes has not
2369 if (new_m
> (((size_t) 1 << (m_sz
* 8 - 1)) - 1)
2370 || ((size
> safe
|| new_m
> safe
)
2371 && bytes
/ new_m
!= size
)) {
2376 new_ptr
= realloc(*ptr
, bytes
);
2377 if (new_ptr
== NULL
) goto die
;
2381 memset((char *) new_ptr
+ m
* size
, 0, (new_m
- m
) * size
);
2390 hts_log_error("%s", strerror(errno
));
2394 void hts_set_log_level(enum htsLogLevel level
)
2396 hts_verbose
= level
;
2399 enum htsLogLevel
hts_get_log_level()
2404 static char get_severity_tag(enum htsLogLevel severity
)
2409 case HTS_LOG_WARNING
:
2424 void hts_log(enum htsLogLevel severity
, const char *context
, const char *format
, ...)
2426 int save_errno
= errno
;
2427 if (severity
<= hts_verbose
) {
2430 fprintf(stderr
, "[%c::%s] ", get_severity_tag(severity
), context
);
2432 va_start(argptr
, format
);
2433 vfprintf(stderr
, format
, argptr
);
2436 fprintf(stderr
, "\n");