modified: diffout.py
[GalaxyCodeBases.git] / c_cpp / lib / htslib / faidx.c
blobf85ffb5dcdbb6cdb59fb7822f4d5c9cfad866dff
1 /* faidx.c -- FASTA random access.
3 Copyright (C) 2008, 2009, 2013-2017 Genome Research Ltd.
4 Portions copyright (C) 2011 Broad Institute.
6 Author: Heng Li <lh3@sanger.ac.uk>
8 Permission is hereby granted, free of charge, to any person obtaining a copy
9 of this software and associated documentation files (the "Software"), to deal
10 in the Software without restriction, including without limitation the rights
11 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12 copies of the Software, and to permit persons to whom the Software is
13 furnished to do so, subject to the following conditions:
15 The above copyright notice and this permission notice shall be included in
16 all copies or substantial portions of the Software.
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
21 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24 DEALINGS IN THE SOFTWARE. */
26 #include <config.h>
28 #include <ctype.h>
29 #include <string.h>
30 #include <stdlib.h>
31 #include <stdio.h>
32 #include <inttypes.h>
33 #include <errno.h>
34 #include <limits.h>
35 #include <unistd.h>
36 #include <assert.h>
38 #include "htslib/bgzf.h"
39 #include "htslib/faidx.h"
40 #include "htslib/hfile.h"
41 #include "htslib/khash.h"
42 #include "htslib/kstring.h"
43 #include "hts_internal.h"
45 typedef struct {
46 int32_t line_len, line_blen;
47 int64_t len;
48 uint64_t offset;
49 } faidx1_t;
50 KHASH_MAP_INIT_STR(s, faidx1_t)
52 struct __faidx_t {
53 BGZF *bgzf;
54 int n, m;
55 char **name;
56 khash_t(s) *hash;
59 #ifndef kroundup32
60 #define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
61 #endif
63 static inline int fai_insert_index(faidx_t *idx, const char *name, int64_t len, int line_len, int line_blen, uint64_t offset)
65 if (!name) {
66 hts_log_error("Malformed line");
67 return -1;
70 char *name_key = strdup(name);
71 int absent;
72 khint_t k = kh_put(s, idx->hash, name_key, &absent);
73 faidx1_t *v = &kh_value(idx->hash, k);
75 if (! absent) {
76 hts_log_warning("Ignoring duplicate sequence \"%s\" at byte offset %"PRIu64"", name, offset);
77 free(name_key);
78 return 0;
81 if (idx->n == idx->m) {
82 char **tmp;
83 idx->m = idx->m? idx->m<<1 : 16;
84 if (!(tmp = (char**)realloc(idx->name, sizeof(char*) * idx->m))) {
85 hts_log_error("Out of memory");
86 return -1;
88 idx->name = tmp;
90 idx->name[idx->n++] = name_key;
91 v->len = len;
92 v->line_len = line_len;
93 v->line_blen = line_blen;
94 v->offset = offset;
96 return 0;
99 faidx_t *fai_build_core(BGZF *bgzf)
101 kstring_t name = { 0, 0, NULL };
102 int c;
103 int line_len, line_blen, state;
104 int l1, l2;
105 faidx_t *idx;
106 uint64_t offset;
107 int64_t len;
109 idx = (faidx_t*)calloc(1, sizeof(faidx_t));
110 idx->hash = kh_init(s);
111 len = line_len = line_blen = -1; state = 0; l1 = l2 = -1; offset = 0;
112 while ( (c=bgzf_getc(bgzf))>=0 ) {
113 if (c == '\n') { // an empty line
114 if (state == 1) {
115 offset = bgzf_utell(bgzf);
116 continue;
117 } else if ((state == 0 && len < 0) || state == 2) continue;
118 else if (state == 0) { state = 2; continue; }
120 if (c == '>') { // fasta header
121 if (len >= 0) {
122 if (fai_insert_index(idx, name.s, len, line_len, line_blen, offset) != 0)
123 goto fail;
126 name.l = 0;
127 while ((c = bgzf_getc(bgzf)) >= 0)
128 if (! isspace(c)) kputc_(c, &name);
129 else if (name.l > 0 || c == '\n') break;
130 kputsn("", 0, &name);
132 if ( c<0 ) {
133 hts_log_error("The last entry has no sequence");
134 goto fail;
136 if (c != '\n') while ( (c=bgzf_getc(bgzf))>=0 && c != '\n');
137 state = 1; len = 0;
138 offset = bgzf_utell(bgzf);
139 } else {
140 if (state == 3) {
141 hts_log_error("Inlined empty line is not allowed in sequence '%s'", name.s);
142 goto fail;
144 if (state == 2) state = 3;
145 l1 = l2 = 0;
146 do {
147 ++l1;
148 if (isgraph(c)) ++l2;
149 } while ( (c=bgzf_getc(bgzf))>=0 && c != '\n');
150 if (state == 3 && l2) {
151 hts_log_error("Different line length in sequence '%s'", name.s);
152 goto fail;
154 ++l1; len += l2;
155 if (state == 1) line_len = l1, line_blen = l2, state = 0;
156 else if (state == 0) {
157 if (l1 != line_len || l2 != line_blen) state = 2;
162 if (len >= 0) {
163 if (fai_insert_index(idx, name.s, len, line_len, line_blen, offset) != 0)
164 goto fail;
165 } else {
166 goto fail;
169 free(name.s);
170 return idx;
172 fail:
173 free(name.s);
174 fai_destroy(idx);
175 return NULL;
178 static int fai_save(const faidx_t *fai, hFILE *fp) {
179 khint_t k;
180 int i;
181 char buf[96]; // Must be big enough for format below.
183 for (i = 0; i < fai->n; ++i) {
184 faidx1_t x;
185 k = kh_get(s, fai->hash, fai->name[i]);
186 assert(k < kh_end(fai->hash));
187 x = kh_value(fai->hash, k);
188 snprintf(buf, sizeof(buf),
189 "\t%"PRId64"\t%"PRIu64"\t%"PRId32"\t%"PRId32"\n",
190 x.len, x.offset, x.line_blen, x.line_len);
191 if (hputs(fai->name[i], fp) != 0) return -1;
192 if (hputs(buf, fp) != 0) return -1;
194 return 0;
197 static faidx_t *fai_read(hFILE *fp, const char *fname)
199 faidx_t *fai;
200 char *buf = NULL, *p;
201 int line_len, line_blen, n;
202 int64_t len;
203 uint64_t offset;
204 ssize_t l, lnum = 1;
206 fai = (faidx_t*)calloc(1, sizeof(faidx_t));
207 if (!fai) return NULL;
209 fai->hash = kh_init(s);
210 if (!fai->hash) goto fail;
212 buf = (char*)calloc(0x10000, 1);
213 if (!buf) goto fail;
215 while ((l = hgetln(buf, 0x10000, fp)) > 0) {
216 for (p = buf; *p && !isspace_c(*p); ++p);
217 if (p - buf < l) {
218 *p = 0; ++p;
220 n = sscanf(p, "%"SCNd64"%"SCNu64"%d%d", &len, &offset, &line_blen, &line_len);
221 if (n != 4) {
222 hts_log_error("Could not understand FAI %s line %zd", fname, lnum);
223 goto fail;
225 if (fai_insert_index(fai, buf, len, line_len, line_blen, offset) != 0) {
226 goto fail;
228 if (buf[l - 1] == '\n') ++lnum;
231 if (l < 0) {
232 hts_log_error("Error while reading %s: %s", fname, strerror(errno));
233 goto fail;
235 free(buf);
236 return fai;
238 fail:
239 free(buf);
240 fai_destroy(fai);
241 return NULL;
244 void fai_destroy(faidx_t *fai)
246 int i;
247 if (!fai) return;
248 for (i = 0; i < fai->n; ++i) free(fai->name[i]);
249 free(fai->name);
250 kh_destroy(s, fai->hash);
251 if (fai->bgzf) bgzf_close(fai->bgzf);
252 free(fai);
255 int fai_build3(const char *fn, const char *fnfai, const char *fngzi)
257 kstring_t fai_kstr = { 0, 0, NULL };
258 kstring_t gzi_kstr = { 0, 0, NULL };
259 BGZF *bgzf = NULL;
260 hFILE *fp = NULL;
261 faidx_t *fai = NULL;
262 int save_errno, res;
264 if (!fnfai) {
265 if (ksprintf(&fai_kstr, "%s.fai", fn) < 0) goto fail;
266 fnfai = fai_kstr.s;
268 if (!fngzi) {
269 if (ksprintf(&gzi_kstr, "%s.gzi", fn) < 0) goto fail;
270 fngzi = gzi_kstr.s;
273 bgzf = bgzf_open(fn, "r");
274 if ( !bgzf ) {
275 hts_log_error("Failed to open the FASTA file %s", fn);
276 goto fail;
278 if ( bgzf->is_compressed ) {
279 if (bgzf_index_build_init(bgzf) != 0) {
280 hts_log_error("Failed to allocate bgzf index");
281 goto fail;
284 fai = fai_build_core(bgzf);
285 if ( !fai ) {
286 if (bgzf->is_compressed && bgzf->is_gzip) {
287 hts_log_error("Cannot index files compressed with gzip, please use bgzip");
289 goto fail;
291 if ( bgzf->is_compressed ) {
292 if (bgzf_index_dump(bgzf, fngzi, NULL) < 0) {
293 hts_log_error("Failed to make bgzf index %s", fngzi);
294 goto fail;
297 res = bgzf_close(bgzf);
298 bgzf = NULL;
299 if (res < 0) {
300 hts_log_error("Error on closing %s : %s", fn, strerror(errno));
301 goto fail;
303 fp = hopen(fnfai, "wb");
304 if ( !fp ) {
305 hts_log_error("Failed to open FASTA index %s : %s", fnfai, strerror(errno));
306 goto fail;
308 if (fai_save(fai, fp) != 0) {
309 hts_log_error("Failed to write FASTA index %s : %s", fnfai, strerror(errno));
310 goto fail;
312 if (hclose(fp) != 0) {
313 hts_log_error("Failed on closing FASTA index %s : %s", fnfai, strerror(errno));
314 goto fail;
317 free(fai_kstr.s);
318 free(gzi_kstr.s);
319 fai_destroy(fai);
320 return 0;
322 fail:
323 save_errno = errno;
324 free(fai_kstr.s);
325 free(gzi_kstr.s);
326 bgzf_close(bgzf);
327 fai_destroy(fai);
328 errno = save_errno;
329 return -1;
332 int fai_build(const char *fn) {
333 return fai_build3(fn, NULL, NULL);
336 faidx_t *fai_load3(const char *fn, const char *fnfai, const char *fngzi,
337 int flags)
339 kstring_t fai_kstr = { 0, 0, NULL };
340 kstring_t gzi_kstr = { 0, 0, NULL };
341 hFILE *fp = NULL;
342 faidx_t *fai = NULL;
343 int res;
345 if (fn == NULL)
346 return NULL;
348 if (fnfai == NULL) {
349 if (ksprintf(&fai_kstr, "%s.fai", fn) < 0) goto fail;
350 fnfai = fai_kstr.s;
352 if (fngzi == NULL) {
353 if (ksprintf(&gzi_kstr, "%s.gzi", fn) < 0) goto fail;
354 fngzi = gzi_kstr.s;
357 fp = hopen(fnfai, "rb");
359 if (fp == 0) {
360 if (!(flags & FAI_CREATE) || errno != ENOENT) {
361 hts_log_error("Failed to open FASTA index %s: %s", fnfai, strerror(errno));
362 goto fail;
365 hts_log_info("Build FASTA index");
367 if (fai_build3(fn, fnfai, fngzi) < 0) {
368 goto fail;
371 fp = hopen(fnfai, "rb");
372 if (fp == 0) {
373 hts_log_error("Failed to open FASTA index %s: %s", fnfai, strerror(errno));
374 goto fail;
378 fai = fai_read(fp, fnfai);
379 if (fai == NULL) {
380 hts_log_error("Failed to read FASTA index %s", fnfai);
381 goto fail;
384 res = hclose(fp);
385 fp = NULL;
386 if (res < 0) {
387 hts_log_error("Failed on closing FASTA index %s : %s", fnfai, strerror(errno));
388 goto fail;
391 fai->bgzf = bgzf_open(fn, "rb");
392 if (fai->bgzf == 0) {
393 hts_log_error("Failed to open FASTA file %s", fn);
394 goto fail;
396 if ( fai->bgzf->is_compressed==1 ) {
397 if ( bgzf_index_load(fai->bgzf, fngzi, NULL) < 0 ) {
398 hts_log_error("Failed to load .gzi index: %s", fngzi);
399 goto fail;
402 free(fai_kstr.s);
403 free(gzi_kstr.s);
404 return fai;
406 fail:
407 if (fai) fai_destroy(fai);
408 if (fp) hclose_abruptly(fp);
409 free(fai_kstr.s);
410 free(gzi_kstr.s);
411 return NULL;
414 faidx_t *fai_load(const char *fn)
416 return fai_load3(fn, NULL, NULL, FAI_CREATE);
419 static char *fai_retrieve(const faidx_t *fai, const faidx1_t *val,
420 long beg, long end, int *len) {
421 char *s;
422 size_t l;
423 int c = 0;
424 int ret = bgzf_useek(fai->bgzf,
425 val->offset
426 + beg / val->line_blen * val->line_len
427 + beg % val->line_blen, SEEK_SET);
429 if (ret < 0) {
430 *len = -1;
431 hts_log_error("Failed to retrieve block. (Seeking in a compressed, .gzi unindexed, file?)");
432 return NULL;
435 l = 0;
436 s = (char*)malloc((size_t) end - beg + 2);
437 if (!s) {
438 *len = -1;
439 return NULL;
442 while ( l < end - beg && (c=bgzf_getc(fai->bgzf))>=0 )
443 if (isgraph(c)) s[l++] = c;
444 if (c < 0) {
445 hts_log_error("Failed to retrieve block: %s",
446 c == -1 ? "unexpected end of file" : "error reading file");
447 free(s);
448 *len = -1;
449 return NULL;
452 s[l] = '\0';
453 *len = l < INT_MAX ? l : INT_MAX;
454 return s;
457 char *fai_fetch(const faidx_t *fai, const char *str, int *len)
459 char *s, *ep;
460 size_t i, l, k, name_end;
461 khiter_t iter;
462 faidx1_t val;
463 khash_t(s) *h;
464 long beg, end;
466 beg = end = -1;
467 h = fai->hash;
468 name_end = l = strlen(str);
469 s = (char*)malloc(l+1);
470 if (!s) {
471 *len = -1;
472 return NULL;
475 // remove space
476 for (i = k = 0; i < l; ++i)
477 if (!isspace_c(str[i])) s[k++] = str[i];
478 s[k] = 0;
479 name_end = l = k;
480 // determine the sequence name
481 for (i = l; i > 0; --i) if (s[i - 1] == ':') break; // look for colon from the end
482 if (i > 0) name_end = i - 1;
483 if (name_end < l) { // check if this is really the end
484 int n_hyphen = 0;
485 for (i = name_end + 1; i < l; ++i) {
486 if (s[i] == '-') ++n_hyphen;
487 else if (!isdigit_c(s[i]) && s[i] != ',') break;
489 if (i < l || n_hyphen > 1) name_end = l; // malformated region string; then take str as the name
490 s[name_end] = 0;
491 iter = kh_get(s, h, s);
492 if (iter == kh_end(h)) { // cannot find the sequence name
493 iter = kh_get(s, h, str); // try str as the name
494 if (iter != kh_end(h)) {
495 s[name_end] = ':';
496 name_end = l;
499 } else iter = kh_get(s, h, str);
500 if(iter == kh_end(h)) {
501 hts_log_warning("Reference %s not found in FASTA file, returning empty sequence", str);
502 free(s);
503 *len = -2;
504 return 0;
506 val = kh_value(h, iter);
507 // parse the interval
508 if (name_end < l) {
509 int save_errno = errno;
510 errno = 0;
511 for (i = k = name_end + 1; i < l; ++i)
512 if (s[i] != ',') s[k++] = s[i];
513 s[k] = 0;
514 if (s[name_end + 1] == '-') {
515 beg = 0;
516 i = name_end + 2;
517 } else {
518 beg = strtol(s + name_end + 1, &ep, 10);
519 for (i = ep - s; i < k;) if (s[i++] == '-') break;
521 end = i < k? strtol(s + i, &ep, 10) : val.len;
522 if (beg > 0) --beg;
523 // Check for out of range numbers. Only going to be a problem on
524 // 32-bit platforms with >2Gb sequence length.
525 if (errno == ERANGE && (uint64_t) val.len > LONG_MAX) {
526 hts_log_error("Positions in range %s are too large for this platform", s);
527 free(s);
528 *len = -2;
529 return NULL;
531 errno = save_errno;
532 } else beg = 0, end = val.len;
533 if (beg >= val.len) beg = val.len;
534 if (end >= val.len) end = val.len;
535 if (beg > end) beg = end;
536 free(s);
538 // now retrieve the sequence
539 return fai_retrieve(fai, &val, beg, end, len);
542 int faidx_fetch_nseq(const faidx_t *fai)
544 return fai->n;
547 int faidx_nseq(const faidx_t *fai)
549 return fai->n;
552 const char *faidx_iseq(const faidx_t *fai, int i)
554 return fai->name[i];
557 int faidx_seq_len(const faidx_t *fai, const char *seq)
559 khint_t k = kh_get(s, fai->hash, seq);
560 if ( k == kh_end(fai->hash) ) return -1;
561 return kh_val(fai->hash, k).len;
564 char *faidx_fetch_seq(const faidx_t *fai, const char *c_name, int p_beg_i, int p_end_i, int *len)
566 khiter_t iter;
567 faidx1_t val;
569 // Adjust position
570 iter = kh_get(s, fai->hash, c_name);
571 if (iter == kh_end(fai->hash))
573 *len = -2;
574 hts_log_error("The sequence \"%s\" not found", c_name);
575 return NULL;
577 val = kh_value(fai->hash, iter);
578 if(p_end_i < p_beg_i) p_beg_i = p_end_i;
579 if(p_beg_i < 0) p_beg_i = 0;
580 else if(val.len <= p_beg_i) p_beg_i = val.len - 1;
581 if(p_end_i < 0) p_end_i = 0;
582 else if(val.len <= p_end_i) p_end_i = val.len - 1;
584 // Now retrieve the sequence
585 return fai_retrieve(fai, &val, p_beg_i, (long) p_end_i + 1, len);
588 int faidx_has_seq(const faidx_t *fai, const char *seq)
590 khiter_t iter = kh_get(s, fai->hash, seq);
591 if (iter == kh_end(fai->hash)) return 0;
592 return 1;