1 /* tbx.c -- tabix API functions.
3 Copyright (C) 2009, 2010, 2012-2015, 2017 Genome Research Ltd.
4 Copyright (C) 2010-2012 Broad Institute.
6 Author: Heng Li <lh3@sanger.ac.uk>
8 Permission is hereby granted, free of charge, to any person obtaining a copy
9 of this software and associated documentation files (the "Software"), to deal
10 in the Software without restriction, including without limitation the rights
11 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12 copies of the Software, and to permit persons to whom the Software is
13 furnished to do so, subject to the following conditions:
15 The above copyright notice and this permission notice shall be included in
16 all copies or substantial portions of the Software.
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
21 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24 DEALINGS IN THE SOFTWARE. */
33 #include "htslib/tbx.h"
34 #include "htslib/bgzf.h"
35 #include "htslib/hts_endian.h"
36 #include "hts_internal.h"
38 #include "htslib/khash.h"
39 KHASH_DECLARE(s2i
, kh_cstr_t
, int64_t)
41 const tbx_conf_t tbx_conf_gff
= { 0, 1, 4, 5, '#', 0 };
42 const tbx_conf_t tbx_conf_bed
= { TBX_UCSC
, 1, 2, 3, '#', 0 };
43 const tbx_conf_t tbx_conf_psltbl
= { TBX_UCSC
, 15, 17, 18, '#', 0 };
44 const tbx_conf_t tbx_conf_sam
= { TBX_SAM
, 3, 4, 0, '@', 0 };
45 const tbx_conf_t tbx_conf_vcf
= { TBX_VCF
, 1, 2, 0, '#', 0 };
53 static inline int get_tid(tbx_t
*tbx
, const char *ss
, int is_add
)
57 if (tbx
->dict
== 0) tbx
->dict
= kh_init(s2i
);
58 d
= (khash_t(s2i
)*)tbx
->dict
;
61 k
= kh_put(s2i
, d
, ss
, &absent
);
63 return -1; // Out of memory
65 char *ss_dup
= strdup(ss
);
67 kh_key(d
, k
) = ss_dup
;
68 kh_val(d
, k
) = kh_size(d
) - 1;
71 return -1; // Out of memory
74 } else k
= kh_get(s2i
, d
, ss
);
75 return k
== kh_end(d
)? -1 : kh_val(d
, k
);
78 int tbx_name2id(tbx_t
*tbx
, const char *ss
)
80 return get_tid(tbx
, ss
, 0);
83 int tbx_parse1(const tbx_conf_t
*conf
, int len
, char *line
, tbx_intv_t
*intv
)
85 int i
, b
= 0, id
= 1, ncols
= 0;
87 intv
->ss
= intv
->se
= 0; intv
->beg
= intv
->end
= -1;
88 for (i
= 0; i
<= len
; ++i
) {
89 if (line
[i
] == '\t' || line
[i
] == 0) {
92 intv
->ss
= line
+ b
; intv
->se
= line
+ i
;
93 } else if (id
== conf
->bc
) {
94 // here ->beg is 0-based.
95 intv
->beg
= intv
->end
= strtol(line
+ b
, &s
, 0);
96 if ( s
==line
+b
) return -1; // expected int
97 if (!(conf
->preset
&TBX_UCSC
)) --intv
->beg
;
99 if (intv
->beg
< 0) intv
->beg
= 0;
100 if (intv
->end
< 1) intv
->end
= 1;
102 if ((conf
->preset
&0xffff) == TBX_GENERIC
) {
105 intv
->end
= strtol(line
+ b
, &s
, 0);
106 if ( s
==line
+b
) return -1; // expected int
108 } else if ((conf
->preset
&0xffff) == TBX_SAM
) {
109 if (id
== 6) { // CIGAR
112 for (s
= line
+ b
; s
< line
+ i
;) {
113 long x
= strtol(s
, &t
, 10);
114 char op
= toupper_c(*t
);
115 if (op
== 'M' || op
== 'D' || op
== 'N') l
+= x
;
119 intv
->end
= intv
->beg
+ l
;
121 } else if ((conf
->preset
&0xffff) == TBX_VCF
) {
123 if (b
< i
) intv
->end
= intv
->beg
+ (i
- b
);
124 } else if (id
== 8) { // look for "END="
127 s
= strstr(line
+ b
, "END=");
128 if (s
== line
+ b
) s
+= 4;
130 s
= strstr(line
+ b
, ";END=");
133 if (s
) intv
->end
= strtol(s
, &s
, 0);
142 if (intv
->ss
== 0 || intv
->se
== 0 || intv
->beg
< 0 || intv
->end
< 0) return -1;
146 static inline int get_intv(tbx_t
*tbx
, kstring_t
*str
, tbx_intv_t
*intv
, int is_add
)
148 if (tbx_parse1(&tbx
->conf
, str
->l
, str
->s
, intv
) == 0) {
150 *intv
->se
= '\0'; intv
->tid
= get_tid(tbx
, intv
->ss
, is_add
); *intv
->se
= c
;
151 return (intv
->tid
>= 0 && intv
->beg
>= 0 && intv
->end
>= 0)? 0 : -1;
154 switch (tbx
->conf
.preset
&0xffff)
156 case TBX_SAM
: type
= "TBX_SAM"; break;
157 case TBX_VCF
: type
= "TBX_VCF"; break;
158 case TBX_UCSC
: type
= "TBX_UCSC"; break;
159 default: type
= "TBX_GENERIC"; break;
161 hts_log_error("Failed to parse %s, was wrong -p [type] used?\nThe offending line was: \"%s\"",
167 int tbx_readrec(BGZF
*fp
, void *tbxv
, void *sv
, int *tid
, int *beg
, int *end
)
169 tbx_t
*tbx
= (tbx_t
*) tbxv
;
170 kstring_t
*s
= (kstring_t
*) sv
;
172 if ((ret
= bgzf_getline(fp
, '\n', s
)) >= 0) {
174 get_intv(tbx
, s
, &intv
, 0);
175 *tid
= intv
.tid
; *beg
= intv
.beg
; *end
= intv
.end
;
180 void tbx_set_meta(tbx_t
*tbx
)
187 khash_t(s2i
) *d
= (khash_t(s2i
)*)tbx
->dict
;
189 memcpy(x
, &tbx
->conf
, 24);
190 name
= (char**)malloc(sizeof(char*) * kh_size(d
));
191 for (k
= kh_begin(d
), l
= 0; k
!= kh_end(d
); ++k
) {
192 if (!kh_exist(d
, k
)) continue;
193 name
[kh_val(d
, k
)] = (char*)kh_key(d
, k
);
194 l
+= strlen(kh_key(d
, k
)) + 1; // +1 to include '\0'
197 meta
= (uint8_t*)malloc(l_nm
+ 28);
199 for (i
= 0; i
< 7; ++i
)
200 x
[i
] = ed_swap_4(x
[i
]);
202 for (l
= 28, i
= 0; i
< (int)kh_size(d
); ++i
) {
203 int x
= strlen(name
[i
]) + 1;
204 memcpy(meta
+ l
, name
[i
], x
);
208 hts_idx_set_meta(tbx
->idx
, l
, meta
, 0);
211 tbx_t
*tbx_index(BGZF
*fp
, int min_shift
, const tbx_conf_t
*conf
)
215 int ret
, first
= 0, n_lvls
, fmt
;
217 uint64_t last_off
= 0;
220 str
.s
= 0; str
.l
= str
.m
= 0;
221 tbx
= (tbx_t
*)calloc(1, sizeof(tbx_t
));
223 if (min_shift
> 0) n_lvls
= (TBX_MAX_SHIFT
- min_shift
+ 2) / 3, fmt
= HTS_FMT_CSI
;
224 else min_shift
= 14, n_lvls
= 5, fmt
= HTS_FMT_TBI
;
225 while ((ret
= bgzf_getline(fp
, '\n', &str
)) >= 0) {
227 if (lineno
<= tbx
->conf
.line_skip
|| str
.s
[0] == tbx
->conf
.meta_char
) {
228 last_off
= bgzf_tell(fp
);
232 tbx
->idx
= hts_idx_init(0, fmt
, last_off
, min_shift
, n_lvls
);
235 get_intv(tbx
, &str
, &intv
, 1);
236 ret
= hts_idx_push(tbx
->idx
, intv
.tid
, intv
.beg
, intv
.end
, bgzf_tell(fp
), 1);
244 if ( !tbx
->idx
) tbx
->idx
= hts_idx_init(0, fmt
, last_off
, min_shift
, n_lvls
); // empty file
245 if ( !tbx
->dict
) tbx
->dict
= kh_init(s2i
);
246 hts_idx_finish(tbx
->idx
, bgzf_tell(fp
));
252 void tbx_destroy(tbx_t
*tbx
)
254 khash_t(s2i
) *d
= (khash_t(s2i
)*)tbx
->dict
;
258 for (k
= kh_begin(d
); k
!= kh_end(d
); ++k
)
259 if (kh_exist(d
, k
)) free((char*)kh_key(d
, k
));
261 hts_idx_destroy(tbx
->idx
);
266 int tbx_index_build3(const char *fn
, const char *fnidx
, int min_shift
, int n_threads
, const tbx_conf_t
*conf
)
271 if ((fp
= bgzf_open(fn
, "r")) == 0) return -1;
272 if ( n_threads
) bgzf_mt(fp
, n_threads
, 256);
273 if ( bgzf_compression(fp
) != bgzf
) { bgzf_close(fp
); return -1; }
274 tbx
= tbx_index(fp
, min_shift
, conf
);
276 if ( !tbx
) return -1;
277 ret
= hts_idx_save_as(tbx
->idx
, fn
, fnidx
, min_shift
> 0? HTS_FMT_CSI
: HTS_FMT_TBI
);
282 int tbx_index_build2(const char *fn
, const char *fnidx
, int min_shift
, const tbx_conf_t
*conf
)
284 return tbx_index_build3(fn
, fnidx
, min_shift
, 0, conf
);
287 int tbx_index_build(const char *fn
, int min_shift
, const tbx_conf_t
*conf
)
289 return tbx_index_build3(fn
, NULL
, min_shift
, 0, conf
);
292 tbx_t
*tbx_index_load2(const char *fn
, const char *fnidx
)
297 uint32_t l_meta
, l_nm
;
298 tbx
= (tbx_t
*)calloc(1, sizeof(tbx_t
));
299 tbx
->idx
= fnidx
? hts_idx_load2(fn
, fnidx
) : hts_idx_load(fn
, HTS_FMT_TBI
);
305 meta
= hts_idx_get_meta(tbx
->idx
, &l_meta
);
306 if ( !meta
|| l_meta
< 28) goto invalid
;
308 tbx
->conf
.preset
= le_to_i32(&meta
[0]);
309 tbx
->conf
.sc
= le_to_i32(&meta
[4]);
310 tbx
->conf
.bc
= le_to_i32(&meta
[8]);
311 tbx
->conf
.ec
= le_to_i32(&meta
[12]);
312 tbx
->conf
.meta_char
= le_to_i32(&meta
[16]);
313 tbx
->conf
.line_skip
= le_to_i32(&meta
[20]);
314 l_nm
= le_to_u32(&meta
[24]);
315 if (l_nm
> l_meta
- 28) goto invalid
;
317 p
= nm
= (char*)meta
+ 28;
318 // This assumes meta is NUL-terminated, so we can merrily strlen away.
319 // hts_idx_load_local() assures this for us by adding a NUL on the end
320 // of whatever it reads.
321 for (; p
- nm
< l_nm
; p
+= strlen(p
) + 1) {
322 if (get_tid(tbx
, p
, 1) < 0) {
323 hts_log_error("%s", strerror(errno
));
330 hts_log_error("Invalid index header for %s", fnidx
? fnidx
: fn
);
337 tbx_t
*tbx_index_load(const char *fn
)
339 return tbx_index_load2(fn
, NULL
);
342 const char **tbx_seqnames(tbx_t
*tbx
, int *n
)
344 khash_t(s2i
) *d
= (khash_t(s2i
)*)tbx
->dict
;
350 int tid
, m
= kh_size(d
);
351 const char **names
= (const char**) calloc(m
,sizeof(const char*));
353 for (k
=kh_begin(d
); k
<kh_end(d
); k
++)
355 if ( !kh_exist(d
,k
) ) continue;
358 names
[tid
] = kh_key(d
,k
);
360 // sanity check: there should be no gaps
361 for (tid
=0; tid
<m
; tid
++)