modified: src1/input.c
[GalaxyCodeBases.git] / c_cpp / lib / htslib / tbx.c
blob1d4b6b50beb892075671c495b5621d00dbc499a9
1 /* tbx.c -- tabix API functions.
3 Copyright (C) 2009, 2010, 2012-2015, 2017 Genome Research Ltd.
4 Copyright (C) 2010-2012 Broad Institute.
6 Author: Heng Li <lh3@sanger.ac.uk>
8 Permission is hereby granted, free of charge, to any person obtaining a copy
9 of this software and associated documentation files (the "Software"), to deal
10 in the Software without restriction, including without limitation the rights
11 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12 copies of the Software, and to permit persons to whom the Software is
13 furnished to do so, subject to the following conditions:
15 The above copyright notice and this permission notice shall be included in
16 all copies or substantial portions of the Software.
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
21 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24 DEALINGS IN THE SOFTWARE. */
26 #include <config.h>
28 #include <stdlib.h>
29 #include <string.h>
30 #include <stdio.h>
31 #include <assert.h>
32 #include <errno.h>
33 #include "htslib/tbx.h"
34 #include "htslib/bgzf.h"
35 #include "htslib/hts_endian.h"
36 #include "hts_internal.h"
38 #include "htslib/khash.h"
39 KHASH_DECLARE(s2i, kh_cstr_t, int64_t)
41 const tbx_conf_t tbx_conf_gff = { 0, 1, 4, 5, '#', 0 };
42 const tbx_conf_t tbx_conf_bed = { TBX_UCSC, 1, 2, 3, '#', 0 };
43 const tbx_conf_t tbx_conf_psltbl = { TBX_UCSC, 15, 17, 18, '#', 0 };
44 const tbx_conf_t tbx_conf_sam = { TBX_SAM, 3, 4, 0, '@', 0 };
45 const tbx_conf_t tbx_conf_vcf = { TBX_VCF, 1, 2, 0, '#', 0 };
47 typedef struct {
48 int64_t beg, end;
49 char *ss, *se;
50 int tid;
51 } tbx_intv_t;
53 static inline int get_tid(tbx_t *tbx, const char *ss, int is_add)
55 khint_t k;
56 khash_t(s2i) *d;
57 if (tbx->dict == 0) tbx->dict = kh_init(s2i);
58 d = (khash_t(s2i)*)tbx->dict;
59 if (is_add) {
60 int absent;
61 k = kh_put(s2i, d, ss, &absent);
62 if (absent < 0) {
63 return -1; // Out of memory
64 } else if (absent) {
65 char *ss_dup = strdup(ss);
66 if (ss_dup) {
67 kh_key(d, k) = ss_dup;
68 kh_val(d, k) = kh_size(d) - 1;
69 } else {
70 kh_del(s2i, d, k);
71 return -1; // Out of memory
74 } else k = kh_get(s2i, d, ss);
75 return k == kh_end(d)? -1 : kh_val(d, k);
78 int tbx_name2id(tbx_t *tbx, const char *ss)
80 return get_tid(tbx, ss, 0);
83 int tbx_parse1(const tbx_conf_t *conf, int len, char *line, tbx_intv_t *intv)
85 int i, b = 0, id = 1, ncols = 0;
86 char *s;
87 intv->ss = intv->se = 0; intv->beg = intv->end = -1;
88 for (i = 0; i <= len; ++i) {
89 if (line[i] == '\t' || line[i] == 0) {
90 ++ncols;
91 if (id == conf->sc) {
92 intv->ss = line + b; intv->se = line + i;
93 } else if (id == conf->bc) {
94 // here ->beg is 0-based.
95 intv->beg = intv->end = strtol(line + b, &s, 0);
96 if ( s==line+b ) return -1; // expected int
97 if (!(conf->preset&TBX_UCSC)) --intv->beg;
98 else ++intv->end;
99 if (intv->beg < 0) intv->beg = 0;
100 if (intv->end < 1) intv->end = 1;
101 } else {
102 if ((conf->preset&0xffff) == TBX_GENERIC) {
103 if (id == conf->ec)
105 intv->end = strtol(line + b, &s, 0);
106 if ( s==line+b ) return -1; // expected int
108 } else if ((conf->preset&0xffff) == TBX_SAM) {
109 if (id == 6) { // CIGAR
110 int l = 0;
111 char *t;
112 for (s = line + b; s < line + i;) {
113 long x = strtol(s, &t, 10);
114 char op = toupper_c(*t);
115 if (op == 'M' || op == 'D' || op == 'N') l += x;
116 s = t + 1;
118 if (l == 0) l = 1;
119 intv->end = intv->beg + l;
121 } else if ((conf->preset&0xffff) == TBX_VCF) {
122 if (id == 4) {
123 if (b < i) intv->end = intv->beg + (i - b);
124 } else if (id == 8) { // look for "END="
125 int c = line[i];
126 line[i] = 0;
127 s = strstr(line + b, "END=");
128 if (s == line + b) s += 4;
129 else if (s) {
130 s = strstr(line + b, ";END=");
131 if (s) s += 5;
133 if (s) intv->end = strtol(s, &s, 0);
134 line[i] = c;
138 b = i + 1;
139 ++id;
142 if (intv->ss == 0 || intv->se == 0 || intv->beg < 0 || intv->end < 0) return -1;
143 return 0;
146 static inline int get_intv(tbx_t *tbx, kstring_t *str, tbx_intv_t *intv, int is_add)
148 if (tbx_parse1(&tbx->conf, str->l, str->s, intv) == 0) {
149 int c = *intv->se;
150 *intv->se = '\0'; intv->tid = get_tid(tbx, intv->ss, is_add); *intv->se = c;
151 return (intv->tid >= 0 && intv->beg >= 0 && intv->end >= 0)? 0 : -1;
152 } else {
153 char *type = NULL;
154 switch (tbx->conf.preset&0xffff)
156 case TBX_SAM: type = "TBX_SAM"; break;
157 case TBX_VCF: type = "TBX_VCF"; break;
158 case TBX_UCSC: type = "TBX_UCSC"; break;
159 default: type = "TBX_GENERIC"; break;
161 hts_log_error("Failed to parse %s, was wrong -p [type] used?\nThe offending line was: \"%s\"",
162 type, str->s);
163 return -1;
167 int tbx_readrec(BGZF *fp, void *tbxv, void *sv, int *tid, int *beg, int *end)
169 tbx_t *tbx = (tbx_t *) tbxv;
170 kstring_t *s = (kstring_t *) sv;
171 int ret;
172 if ((ret = bgzf_getline(fp, '\n', s)) >= 0) {
173 tbx_intv_t intv;
174 get_intv(tbx, s, &intv, 0);
175 *tid = intv.tid; *beg = intv.beg; *end = intv.end;
177 return ret;
180 void tbx_set_meta(tbx_t *tbx)
182 int i, l = 0, l_nm;
183 uint32_t x[7];
184 char **name;
185 uint8_t *meta;
186 khint_t k;
187 khash_t(s2i) *d = (khash_t(s2i)*)tbx->dict;
189 memcpy(x, &tbx->conf, 24);
190 name = (char**)malloc(sizeof(char*) * kh_size(d));
191 for (k = kh_begin(d), l = 0; k != kh_end(d); ++k) {
192 if (!kh_exist(d, k)) continue;
193 name[kh_val(d, k)] = (char*)kh_key(d, k);
194 l += strlen(kh_key(d, k)) + 1; // +1 to include '\0'
196 l_nm = x[6] = l;
197 meta = (uint8_t*)malloc(l_nm + 28);
198 if (ed_is_big())
199 for (i = 0; i < 7; ++i)
200 x[i] = ed_swap_4(x[i]);
201 memcpy(meta, x, 28);
202 for (l = 28, i = 0; i < (int)kh_size(d); ++i) {
203 int x = strlen(name[i]) + 1;
204 memcpy(meta + l, name[i], x);
205 l += x;
207 free(name);
208 hts_idx_set_meta(tbx->idx, l, meta, 0);
211 tbx_t *tbx_index(BGZF *fp, int min_shift, const tbx_conf_t *conf)
213 tbx_t *tbx;
214 kstring_t str;
215 int ret, first = 0, n_lvls, fmt;
216 int64_t lineno = 0;
217 uint64_t last_off = 0;
218 tbx_intv_t intv;
220 str.s = 0; str.l = str.m = 0;
221 tbx = (tbx_t*)calloc(1, sizeof(tbx_t));
222 tbx->conf = *conf;
223 if (min_shift > 0) n_lvls = (TBX_MAX_SHIFT - min_shift + 2) / 3, fmt = HTS_FMT_CSI;
224 else min_shift = 14, n_lvls = 5, fmt = HTS_FMT_TBI;
225 while ((ret = bgzf_getline(fp, '\n', &str)) >= 0) {
226 ++lineno;
227 if (lineno <= tbx->conf.line_skip || str.s[0] == tbx->conf.meta_char) {
228 last_off = bgzf_tell(fp);
229 continue;
231 if (first == 0) {
232 tbx->idx = hts_idx_init(0, fmt, last_off, min_shift, n_lvls);
233 first = 1;
235 get_intv(tbx, &str, &intv, 1);
236 ret = hts_idx_push(tbx->idx, intv.tid, intv.beg, intv.end, bgzf_tell(fp), 1);
237 if (ret < 0)
239 free(str.s);
240 tbx_destroy(tbx);
241 return NULL;
244 if ( !tbx->idx ) tbx->idx = hts_idx_init(0, fmt, last_off, min_shift, n_lvls); // empty file
245 if ( !tbx->dict ) tbx->dict = kh_init(s2i);
246 hts_idx_finish(tbx->idx, bgzf_tell(fp));
247 tbx_set_meta(tbx);
248 free(str.s);
249 return tbx;
252 void tbx_destroy(tbx_t *tbx)
254 khash_t(s2i) *d = (khash_t(s2i)*)tbx->dict;
255 if (d != NULL)
257 khint_t k;
258 for (k = kh_begin(d); k != kh_end(d); ++k)
259 if (kh_exist(d, k)) free((char*)kh_key(d, k));
261 hts_idx_destroy(tbx->idx);
262 kh_destroy(s2i, d);
263 free(tbx);
266 int tbx_index_build3(const char *fn, const char *fnidx, int min_shift, int n_threads, const tbx_conf_t *conf)
268 tbx_t *tbx;
269 BGZF *fp;
270 int ret;
271 if ((fp = bgzf_open(fn, "r")) == 0) return -1;
272 if ( n_threads ) bgzf_mt(fp, n_threads, 256);
273 if ( bgzf_compression(fp) != bgzf ) { bgzf_close(fp); return -1; }
274 tbx = tbx_index(fp, min_shift, conf);
275 bgzf_close(fp);
276 if ( !tbx ) return -1;
277 ret = hts_idx_save_as(tbx->idx, fn, fnidx, min_shift > 0? HTS_FMT_CSI : HTS_FMT_TBI);
278 tbx_destroy(tbx);
279 return ret;
282 int tbx_index_build2(const char *fn, const char *fnidx, int min_shift, const tbx_conf_t *conf)
284 return tbx_index_build3(fn, fnidx, min_shift, 0, conf);
287 int tbx_index_build(const char *fn, int min_shift, const tbx_conf_t *conf)
289 return tbx_index_build3(fn, NULL, min_shift, 0, conf);
292 tbx_t *tbx_index_load2(const char *fn, const char *fnidx)
294 tbx_t *tbx;
295 uint8_t *meta;
296 char *nm, *p;
297 uint32_t l_meta, l_nm;
298 tbx = (tbx_t*)calloc(1, sizeof(tbx_t));
299 tbx->idx = fnidx? hts_idx_load2(fn, fnidx) : hts_idx_load(fn, HTS_FMT_TBI);
300 if ( !tbx->idx )
302 free(tbx);
303 return NULL;
305 meta = hts_idx_get_meta(tbx->idx, &l_meta);
306 if ( !meta || l_meta < 28) goto invalid;
308 tbx->conf.preset = le_to_i32(&meta[0]);
309 tbx->conf.sc = le_to_i32(&meta[4]);
310 tbx->conf.bc = le_to_i32(&meta[8]);
311 tbx->conf.ec = le_to_i32(&meta[12]);
312 tbx->conf.meta_char = le_to_i32(&meta[16]);
313 tbx->conf.line_skip = le_to_i32(&meta[20]);
314 l_nm = le_to_u32(&meta[24]);
315 if (l_nm > l_meta - 28) goto invalid;
317 p = nm = (char*)meta + 28;
318 // This assumes meta is NUL-terminated, so we can merrily strlen away.
319 // hts_idx_load_local() assures this for us by adding a NUL on the end
320 // of whatever it reads.
321 for (; p - nm < l_nm; p += strlen(p) + 1) {
322 if (get_tid(tbx, p, 1) < 0) {
323 hts_log_error("%s", strerror(errno));
324 goto fail;
327 return tbx;
329 invalid:
330 hts_log_error("Invalid index header for %s", fnidx ? fnidx : fn);
332 fail:
333 tbx_destroy(tbx);
334 return NULL;
337 tbx_t *tbx_index_load(const char *fn)
339 return tbx_index_load2(fn, NULL);
342 const char **tbx_seqnames(tbx_t *tbx, int *n)
344 khash_t(s2i) *d = (khash_t(s2i)*)tbx->dict;
345 if (d == NULL)
347 *n = 0;
348 return NULL;
350 int tid, m = kh_size(d);
351 const char **names = (const char**) calloc(m,sizeof(const char*));
352 khint_t k;
353 for (k=kh_begin(d); k<kh_end(d); k++)
355 if ( !kh_exist(d,k) ) continue;
356 tid = kh_val(d,k);
357 assert( tid<m );
358 names[tid] = kh_key(d,k);
360 // sanity check: there should be no gaps
361 for (tid=0; tid<m; tid++)
362 assert(names[tid]);
363 *n = m;
364 return names;