1 /* faidx.c -- FASTA random access.
3 Copyright (C) 2008, 2009, 2013-2017 Genome Research Ltd.
4 Portions copyright (C) 2011 Broad Institute.
6 Author: Heng Li <lh3@sanger.ac.uk>
8 Permission is hereby granted, free of charge, to any person obtaining a copy
9 of this software and associated documentation files (the "Software"), to deal
10 in the Software without restriction, including without limitation the rights
11 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12 copies of the Software, and to permit persons to whom the Software is
13 furnished to do so, subject to the following conditions:
15 The above copyright notice and this permission notice shall be included in
16 all copies or substantial portions of the Software.
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
21 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24 DEALINGS IN THE SOFTWARE. */
38 #include "htslib/bgzf.h"
39 #include "htslib/faidx.h"
40 #include "htslib/hfile.h"
41 #include "htslib/khash.h"
42 #include "htslib/kstring.h"
43 #include "hts_internal.h"
46 int32_t line_len
, line_blen
;
50 KHASH_MAP_INIT_STR(s
, faidx1_t
)
60 #define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
63 static inline int fai_insert_index(faidx_t
*idx
, const char *name
, int64_t len
, int line_len
, int line_blen
, uint64_t offset
)
66 hts_log_error("Malformed line");
70 char *name_key
= strdup(name
);
72 khint_t k
= kh_put(s
, idx
->hash
, name_key
, &absent
);
73 faidx1_t
*v
= &kh_value(idx
->hash
, k
);
76 hts_log_warning("Ignoring duplicate sequence \"%s\" at byte offset %"PRIu64
"", name
, offset
);
81 if (idx
->n
== idx
->m
) {
83 idx
->m
= idx
->m
? idx
->m
<<1 : 16;
84 if (!(tmp
= (char**)realloc(idx
->name
, sizeof(char*) * idx
->m
))) {
85 hts_log_error("Out of memory");
90 idx
->name
[idx
->n
++] = name_key
;
92 v
->line_len
= line_len
;
93 v
->line_blen
= line_blen
;
99 faidx_t
*fai_build_core(BGZF
*bgzf
)
101 kstring_t name
= { 0, 0, NULL
};
103 int line_len
, line_blen
, state
;
109 idx
= (faidx_t
*)calloc(1, sizeof(faidx_t
));
110 idx
->hash
= kh_init(s
);
111 len
= line_len
= line_blen
= -1; state
= 0; l1
= l2
= -1; offset
= 0;
112 while ( (c
=bgzf_getc(bgzf
))>=0 ) {
113 if (c
== '\n') { // an empty line
115 offset
= bgzf_utell(bgzf
);
117 } else if ((state
== 0 && len
< 0) || state
== 2) continue;
118 else if (state
== 0) { state
= 2; continue; }
120 if (c
== '>') { // fasta header
122 if (fai_insert_index(idx
, name
.s
, len
, line_len
, line_blen
, offset
) != 0)
127 while ((c
= bgzf_getc(bgzf
)) >= 0)
128 if (! isspace(c
)) kputc_(c
, &name
);
129 else if (name
.l
> 0 || c
== '\n') break;
130 kputsn("", 0, &name
);
133 hts_log_error("The last entry has no sequence");
136 if (c
!= '\n') while ( (c
=bgzf_getc(bgzf
))>=0 && c
!= '\n');
138 offset
= bgzf_utell(bgzf
);
141 hts_log_error("Inlined empty line is not allowed in sequence '%s'", name
.s
);
144 if (state
== 2) state
= 3;
148 if (isgraph(c
)) ++l2
;
149 } while ( (c
=bgzf_getc(bgzf
))>=0 && c
!= '\n');
150 if (state
== 3 && l2
) {
151 hts_log_error("Different line length in sequence '%s'", name
.s
);
155 if (state
== 1) line_len
= l1
, line_blen
= l2
, state
= 0;
156 else if (state
== 0) {
157 if (l1
!= line_len
|| l2
!= line_blen
) state
= 2;
163 if (fai_insert_index(idx
, name
.s
, len
, line_len
, line_blen
, offset
) != 0)
178 static int fai_save(const faidx_t
*fai
, hFILE
*fp
) {
181 char buf
[96]; // Must be big enough for format below.
183 for (i
= 0; i
< fai
->n
; ++i
) {
185 k
= kh_get(s
, fai
->hash
, fai
->name
[i
]);
186 assert(k
< kh_end(fai
->hash
));
187 x
= kh_value(fai
->hash
, k
);
188 snprintf(buf
, sizeof(buf
),
189 "\t%"PRId64
"\t%"PRIu64
"\t%"PRId32
"\t%"PRId32
"\n",
190 x
.len
, x
.offset
, x
.line_blen
, x
.line_len
);
191 if (hputs(fai
->name
[i
], fp
) != 0) return -1;
192 if (hputs(buf
, fp
) != 0) return -1;
197 static faidx_t
*fai_read(hFILE
*fp
, const char *fname
)
200 char *buf
= NULL
, *p
;
201 int line_len
, line_blen
, n
;
206 fai
= (faidx_t
*)calloc(1, sizeof(faidx_t
));
207 if (!fai
) return NULL
;
209 fai
->hash
= kh_init(s
);
210 if (!fai
->hash
) goto fail
;
212 buf
= (char*)calloc(0x10000, 1);
215 while ((l
= hgetln(buf
, 0x10000, fp
)) > 0) {
216 for (p
= buf
; *p
&& !isspace_c(*p
); ++p
);
220 n
= sscanf(p
, "%"SCNd64
"%"SCNu64
"%d%d", &len
, &offset
, &line_blen
, &line_len
);
222 hts_log_error("Could not understand FAI %s line %zd", fname
, lnum
);
225 if (fai_insert_index(fai
, buf
, len
, line_len
, line_blen
, offset
) != 0) {
228 if (buf
[l
- 1] == '\n') ++lnum
;
232 hts_log_error("Error while reading %s: %s", fname
, strerror(errno
));
244 void fai_destroy(faidx_t
*fai
)
248 for (i
= 0; i
< fai
->n
; ++i
) free(fai
->name
[i
]);
250 kh_destroy(s
, fai
->hash
);
251 if (fai
->bgzf
) bgzf_close(fai
->bgzf
);
255 int fai_build3(const char *fn
, const char *fnfai
, const char *fngzi
)
257 kstring_t fai_kstr
= { 0, 0, NULL
};
258 kstring_t gzi_kstr
= { 0, 0, NULL
};
265 if (ksprintf(&fai_kstr
, "%s.fai", fn
) < 0) goto fail
;
269 if (ksprintf(&gzi_kstr
, "%s.gzi", fn
) < 0) goto fail
;
273 bgzf
= bgzf_open(fn
, "r");
275 hts_log_error("Failed to open the FASTA file %s", fn
);
278 if ( bgzf
->is_compressed
) {
279 if (bgzf_index_build_init(bgzf
) != 0) {
280 hts_log_error("Failed to allocate bgzf index");
284 fai
= fai_build_core(bgzf
);
286 if (bgzf
->is_compressed
&& bgzf
->is_gzip
) {
287 hts_log_error("Cannot index files compressed with gzip, please use bgzip");
291 if ( bgzf
->is_compressed
) {
292 if (bgzf_index_dump(bgzf
, fngzi
, NULL
) < 0) {
293 hts_log_error("Failed to make bgzf index %s", fngzi
);
297 res
= bgzf_close(bgzf
);
300 hts_log_error("Error on closing %s : %s", fn
, strerror(errno
));
303 fp
= hopen(fnfai
, "wb");
305 hts_log_error("Failed to open FASTA index %s : %s", fnfai
, strerror(errno
));
308 if (fai_save(fai
, fp
) != 0) {
309 hts_log_error("Failed to write FASTA index %s : %s", fnfai
, strerror(errno
));
312 if (hclose(fp
) != 0) {
313 hts_log_error("Failed on closing FASTA index %s : %s", fnfai
, strerror(errno
));
332 int fai_build(const char *fn
) {
333 return fai_build3(fn
, NULL
, NULL
);
336 faidx_t
*fai_load3(const char *fn
, const char *fnfai
, const char *fngzi
,
339 kstring_t fai_kstr
= { 0, 0, NULL
};
340 kstring_t gzi_kstr
= { 0, 0, NULL
};
349 if (ksprintf(&fai_kstr
, "%s.fai", fn
) < 0) goto fail
;
353 if (ksprintf(&gzi_kstr
, "%s.gzi", fn
) < 0) goto fail
;
357 fp
= hopen(fnfai
, "rb");
360 if (!(flags
& FAI_CREATE
) || errno
!= ENOENT
) {
361 hts_log_error("Failed to open FASTA index %s: %s", fnfai
, strerror(errno
));
365 hts_log_info("Build FASTA index");
367 if (fai_build3(fn
, fnfai
, fngzi
) < 0) {
371 fp
= hopen(fnfai
, "rb");
373 hts_log_error("Failed to open FASTA index %s: %s", fnfai
, strerror(errno
));
378 fai
= fai_read(fp
, fnfai
);
380 hts_log_error("Failed to read FASTA index %s", fnfai
);
387 hts_log_error("Failed on closing FASTA index %s : %s", fnfai
, strerror(errno
));
391 fai
->bgzf
= bgzf_open(fn
, "rb");
392 if (fai
->bgzf
== 0) {
393 hts_log_error("Failed to open FASTA file %s", fn
);
396 if ( fai
->bgzf
->is_compressed
==1 ) {
397 if ( bgzf_index_load(fai
->bgzf
, fngzi
, NULL
) < 0 ) {
398 hts_log_error("Failed to load .gzi index: %s", fngzi
);
407 if (fai
) fai_destroy(fai
);
408 if (fp
) hclose_abruptly(fp
);
414 faidx_t
*fai_load(const char *fn
)
416 return fai_load3(fn
, NULL
, NULL
, FAI_CREATE
);
419 static char *fai_retrieve(const faidx_t
*fai
, const faidx1_t
*val
,
420 long beg
, long end
, int *len
) {
424 int ret
= bgzf_useek(fai
->bgzf
,
426 + beg
/ val
->line_blen
* val
->line_len
427 + beg
% val
->line_blen
, SEEK_SET
);
431 hts_log_error("Failed to retrieve block. (Seeking in a compressed, .gzi unindexed, file?)");
436 s
= (char*)malloc((size_t) end
- beg
+ 2);
442 while ( l
< end
- beg
&& (c
=bgzf_getc(fai
->bgzf
))>=0 )
443 if (isgraph(c
)) s
[l
++] = c
;
445 hts_log_error("Failed to retrieve block: %s",
446 c
== -1 ? "unexpected end of file" : "error reading file");
453 *len
= l
< INT_MAX
? l
: INT_MAX
;
457 char *fai_fetch(const faidx_t
*fai
, const char *str
, int *len
)
460 size_t i
, l
, k
, name_end
;
468 name_end
= l
= strlen(str
);
469 s
= (char*)malloc(l
+1);
476 for (i
= k
= 0; i
< l
; ++i
)
477 if (!isspace_c(str
[i
])) s
[k
++] = str
[i
];
480 // determine the sequence name
481 for (i
= l
; i
> 0; --i
) if (s
[i
- 1] == ':') break; // look for colon from the end
482 if (i
> 0) name_end
= i
- 1;
483 if (name_end
< l
) { // check if this is really the end
485 for (i
= name_end
+ 1; i
< l
; ++i
) {
486 if (s
[i
] == '-') ++n_hyphen
;
487 else if (!isdigit_c(s
[i
]) && s
[i
] != ',') break;
489 if (i
< l
|| n_hyphen
> 1) name_end
= l
; // malformated region string; then take str as the name
491 iter
= kh_get(s
, h
, s
);
492 if (iter
== kh_end(h
)) { // cannot find the sequence name
493 iter
= kh_get(s
, h
, str
); // try str as the name
494 if (iter
!= kh_end(h
)) {
499 } else iter
= kh_get(s
, h
, str
);
500 if(iter
== kh_end(h
)) {
501 hts_log_warning("Reference %s not found in FASTA file, returning empty sequence", str
);
506 val
= kh_value(h
, iter
);
507 // parse the interval
509 int save_errno
= errno
;
511 for (i
= k
= name_end
+ 1; i
< l
; ++i
)
512 if (s
[i
] != ',') s
[k
++] = s
[i
];
514 if (s
[name_end
+ 1] == '-') {
518 beg
= strtol(s
+ name_end
+ 1, &ep
, 10);
519 for (i
= ep
- s
; i
< k
;) if (s
[i
++] == '-') break;
521 end
= i
< k
? strtol(s
+ i
, &ep
, 10) : val
.len
;
523 // Check for out of range numbers. Only going to be a problem on
524 // 32-bit platforms with >2Gb sequence length.
525 if (errno
== ERANGE
&& (uint64_t) val
.len
> LONG_MAX
) {
526 hts_log_error("Positions in range %s are too large for this platform", s
);
532 } else beg
= 0, end
= val
.len
;
533 if (beg
>= val
.len
) beg
= val
.len
;
534 if (end
>= val
.len
) end
= val
.len
;
535 if (beg
> end
) beg
= end
;
538 // now retrieve the sequence
539 return fai_retrieve(fai
, &val
, beg
, end
, len
);
542 int faidx_fetch_nseq(const faidx_t
*fai
)
547 int faidx_nseq(const faidx_t
*fai
)
552 const char *faidx_iseq(const faidx_t
*fai
, int i
)
557 int faidx_seq_len(const faidx_t
*fai
, const char *seq
)
559 khint_t k
= kh_get(s
, fai
->hash
, seq
);
560 if ( k
== kh_end(fai
->hash
) ) return -1;
561 return kh_val(fai
->hash
, k
).len
;
564 char *faidx_fetch_seq(const faidx_t
*fai
, const char *c_name
, int p_beg_i
, int p_end_i
, int *len
)
570 iter
= kh_get(s
, fai
->hash
, c_name
);
571 if (iter
== kh_end(fai
->hash
))
574 hts_log_error("The sequence \"%s\" not found", c_name
);
577 val
= kh_value(fai
->hash
, iter
);
578 if(p_end_i
< p_beg_i
) p_beg_i
= p_end_i
;
579 if(p_beg_i
< 0) p_beg_i
= 0;
580 else if(val
.len
<= p_beg_i
) p_beg_i
= val
.len
- 1;
581 if(p_end_i
< 0) p_end_i
= 0;
582 else if(val
.len
<= p_end_i
) p_end_i
= val
.len
- 1;
584 // Now retrieve the sequence
585 return fai_retrieve(fai
, &val
, p_beg_i
, (long) p_end_i
+ 1, len
);
588 int faidx_has_seq(const faidx_t
*fai
, const char *seq
)
590 khiter_t iter
= kh_get(s
, fai
->hash
, seq
);
591 if (iter
== kh_end(fai
->hash
)) return 0;