2 Copyright (c) 2013-2014 Genome Research Ltd.
3 Author: James Bonfield <jkb@sanger.ac.uk>
5 Redistribution and use in source and binary forms, with or without
6 modification, are permitted provided that the following conditions are met:
8 1. Redistributions of source code must retain the above copyright notice,
9 this list of conditions and the following disclaimer.
11 2. Redistributions in binary form must reproduce the above copyright notice,
12 this list of conditions and the following disclaimer in the documentation
13 and/or other materials provided with the distribution.
15 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
16 Institute nor the names of its contributors may be used to endorse or promote
17 products derived from this software without specific prior written permission.
19 THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
20 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22 DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
23 FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 * The index is a gzipped tab-delimited text file with one line per slice.
34 * 1: reference number (0 to N-1, as per BAM ref_id)
35 * 2: reference position of 1st read in slice (1..?)
36 * 3: number of reads in slice
37 * 4: offset of container start (relative to end of SAM header, so 1st
38 * container is offset 0).
39 * 5: slice number within container (ie which landmark).
41 * In memory, we hold this in a nested containment list. Each list element is
42 * a cram_index struct. Each element in turn can contain its own list of
45 * Any start..end range which is entirely contained within another (and
46 * earlier as it is sorted) range will be held within it. This ensures that
47 * the outer list will never have containments and we can safely do a
48 * binary search to find the first range which overlaps any given coordinate.
59 #include <sys/types.h>
63 #include "htslib/bgzf.h"
64 #include "htslib/hfile.h"
65 #include "hts_internal.h"
66 #include "cram/cram.h"
70 static void dump_index_(cram_index
*e
, int level
) {
72 n
= printf("%*s%d / %d .. %d, ", level
*4, "", e
->refid
, e
->start
, e
->end
);
73 printf("%*soffset %"PRId64
"\n", MAX(0,50-n
), "", e
->offset
);
74 for (i
= 0; i
< e
->nslice
; i
++) {
75 dump_index_(&e
->e
[i
], level
+1);
79 static void dump_index(cram_fd
*fd
) {
81 for (i
= 0; i
< fd
->index_sz
; i
++) {
82 dump_index_(&fd
->index
[i
], 0);
87 static int kget_int32(kstring_t
*k
, size_t *pos
, int32_t *val_p
) {
92 while (p
< k
->l
&& (k
->s
[p
] == ' ' || k
->s
[p
] == '\t'))
95 if (p
< k
->l
&& k
->s
[p
] == '-')
98 if (p
>= k
->l
|| !(k
->s
[p
] >= '0' && k
->s
[p
] <= '9'))
101 while (p
< k
->l
&& k
->s
[p
] >= '0' && k
->s
[p
] <= '9')
102 val
= val
*10 + k
->s
[p
++]-'0';
110 static int kget_int64(kstring_t
*k
, size_t *pos
, int64_t *val_p
) {
115 while (p
< k
->l
&& (k
->s
[p
] == ' ' || k
->s
[p
] == '\t'))
118 if (p
< k
->l
&& k
->s
[p
] == '-')
121 if (p
>= k
->l
|| !(k
->s
[p
] >= '0' && k
->s
[p
] <= '9'))
124 while (p
< k
->l
&& k
->s
[p
] >= '0' && k
->s
[p
] <= '9')
125 val
= val
*10 + k
->s
[p
++]-'0';
134 * Loads a CRAM .crai index into memory.
136 * Returns 0 for success
139 int cram_index_load(cram_fd
*fd
, const char *fn
, const char *fn_idx
) {
143 kstring_t kstr
= {0};
146 cram_index
**idx_stack
= NULL
, *ep
, e
;
147 int idx_stack_alloc
= 0, idx_stack_ptr
= 0;
150 /* Check if already loaded */
154 fd
->index
= calloc((fd
->index_sz
= 1), sizeof(*fd
->index
));
160 idx
->start
= INT_MIN
;
163 idx_stack
= calloc(++idx_stack_alloc
, sizeof(*idx_stack
));
167 idx_stack
[idx_stack_ptr
] = idx
;
170 fn2
= hts_idx_getfn(fn
, ".crai");
177 if (!(fp
= hopen(fn_idx
, "r"))) {
182 // Load the file into memory
183 while ((len
= hread(fp
, buf
, sizeof(buf
))) > 0) {
184 if (kputsn(buf
, len
, &kstr
) < 0)
188 if (len
< 0 || kstr
.l
< 2)
194 // Uncompress if required
195 if (kstr
.s
[0] == 31 && (uc
)kstr
.s
[1] == 139) {
197 char *s
= zlib_mem_inflate(kstr
.s
, kstr
.l
, &l
);
204 kstr
.m
= l
; // conservative estimate of the size allocated
205 if (kputsn("", 0, &kstr
) < 0) // ensure kstr.s is NUL-terminated
210 // Parse it line at a time
213 if (kget_int32(&kstr
, &pos
, &e
.refid
) == -1)
216 if (kget_int32(&kstr
, &pos
, &e
.start
) == -1)
219 if (kget_int32(&kstr
, &pos
, &e
.end
) == -1)
222 if (kget_int64(&kstr
, &pos
, &e
.offset
) == -1)
225 if (kget_int32(&kstr
, &pos
, &e
.slice
) == -1)
228 if (kget_int32(&kstr
, &pos
, &e
.len
) == -1)
232 //printf("%d/%d..%d\n", e.refid, e.start, e.end);
235 fprintf(stderr
, "Malformed index file, refid %d\n", e
.refid
);
239 if (e
.refid
!= idx
->refid
) {
240 if (fd
->index_sz
< e
.refid
+2) {
242 int new_sz
= e
.refid
+2;
243 size_t index_end
= fd
->index_sz
* sizeof(*fd
->index
);
244 new_idx
= realloc(fd
->index
,
245 new_sz
* sizeof(*fd
->index
));
250 fd
->index_sz
= new_sz
;
251 memset(((char *)fd
->index
) + index_end
, 0,
252 fd
->index_sz
* sizeof(*fd
->index
) - index_end
);
254 idx
= &fd
->index
[e
.refid
+1];
255 idx
->refid
= e
.refid
;
256 idx
->start
= INT_MIN
;
258 idx
->nslice
= idx
->nalloc
= 0;
260 idx_stack
[(idx_stack_ptr
= 0)] = idx
;
263 while (!(e
.start
>= idx
->start
&& e
.end
<= idx
->end
) || idx
->end
== 0) {
264 idx
= idx_stack
[--idx_stack_ptr
];
267 // Now contains, so append
268 if (idx
->nslice
+1 >= idx
->nalloc
) {
270 idx
->nalloc
= idx
->nalloc
? idx
->nalloc
*2 : 16;
271 new_e
= realloc(idx
->e
, idx
->nalloc
* sizeof(*idx
->e
));
278 e
.nalloc
= e
.nslice
= 0; e
.e
= NULL
;
279 *(ep
= &idx
->e
[idx
->nslice
++]) = e
;
282 if (++idx_stack_ptr
>= idx_stack_alloc
) {
283 cram_index
**new_stack
;
284 idx_stack_alloc
*= 2;
285 new_stack
= realloc(idx_stack
, idx_stack_alloc
*sizeof(*idx_stack
));
288 idx_stack
= new_stack
;
290 idx_stack
[idx_stack_ptr
] = idx
;
292 while (pos
< kstr
.l
&& kstr
.s
[pos
] != '\n')
295 } while (pos
< kstr
.l
);
309 cram_index_free(fd
); // Also sets fd->index = NULL
313 static void cram_index_free_recurse(cram_index
*e
) {
316 for (i
= 0; i
< e
->nslice
; i
++) {
317 cram_index_free_recurse(&e
->e
[i
]);
323 void cram_index_free(cram_fd
*fd
) {
329 for (i
= 0; i
< fd
->index_sz
; i
++) {
330 cram_index_free_recurse(&fd
->index
[i
]);
338 * Searches the index for the first slice overlapping a reference ID
339 * and position, or one immediately preceding it if none is found in
340 * the index to overlap this position. (Our index may have missing
341 * entries, but we require at least one per reference.)
343 * If the index finds multiple slices overlapping this position we
344 * return the first one only. Subsequent calls should specifying
345 * "from" as the last slice we checked to find the next one. Otherwise
346 * set "from" to be NULL to find the first one.
348 * Returns the cram_index pointer on sucess
351 cram_index
*cram_index_query(cram_fd
*fd
, int refid
, int pos
,
356 if (refid
+1 < 0 || refid
+1 >= fd
->index_sz
)
360 from
= &fd
->index
[refid
+1];
362 // Ref with nothing aligned against it.
366 // This sequence is covered by the index, so binary search to find
367 // the optimal starting block.
368 i
= 0, j
= fd
->index
[refid
+1].nslice
-1;
369 for (k
= j
/2; k
!= i
; k
= (j
-i
)/2 + i
) {
370 if (from
->e
[k
].refid
> refid
) {
375 if (from
->e
[k
].refid
< refid
) {
380 if (from
->e
[k
].start
>= pos
) {
385 if (from
->e
[k
].start
< pos
) {
390 // i==j or i==j-1. Check if j is better.
391 if (j
>= 0 && from
->e
[j
].start
< pos
&& from
->e
[j
].refid
== refid
)
394 /* The above found *a* bin overlapping, but not necessarily the first */
395 while (i
> 0 && from
->e
[i
-1].end
>= pos
)
398 /* We may be one bin before the optimum, so check */
399 while (i
+1 < from
->nslice
&&
400 (from
->e
[i
].refid
< refid
||
401 from
->e
[i
].end
< pos
))
411 * Skips to a container overlapping the start coordinate listed in
414 * In theory we call cram_index_query multiple times, once per slice
415 * overlapping the range. However slices may be absent from the index
416 * which makes this problematic. Instead we find the left-most slice
417 * and then read from then on, skipping decoding of slices and/or
418 * whole containers when they don't overlap the specified cram_range.
420 * Returns 0 on success
421 * -1 on general failure
422 * -2 on no-data (empty chromosome)
424 int cram_seek_to_refpos(cram_fd
*fd
, cram_range
*r
) {
427 // Ideally use an index, so see if we have one.
428 if ((e
= cram_index_query(fd
, r
->refid
, r
->start
, NULL
))) {
429 if (0 != cram_seek(fd
, e
->offset
, SEEK_SET
))
430 if (0 != cram_seek(fd
, e
->offset
- fd
->first_container
, SEEK_CUR
))
433 // Absent from index, but this most likely means it simply has no data.
438 cram_free_container(fd
->ctr
);
448 * A specialised form of cram_index_build (below) that deals with slices
449 * having multiple references in this (ref_id -2). In this scenario we
450 * decode the slice to look at the RI data series instead.
452 * Returns 0 on success
454 * -4 on write failure
456 static int cram_index_build_multiref(cram_fd
*fd
,
463 int i
, ref
= -2, ref_start
= 0, ref_end
;
466 if (0 != cram_decode_slice(fd
, c
, s
, fd
->header
))
470 for (i
= 0; i
< s
->hdr
->num_records
; i
++) {
471 if (s
->crecs
[i
].ref_id
== ref
) {
472 if (ref_end
< s
->crecs
[i
].aend
)
473 ref_end
= s
->crecs
[i
].aend
;
478 sprintf(buf
, "%d\t%d\t%d\t%"PRId64
"\t%d\t%d\n",
479 ref
, ref_start
, ref_end
- ref_start
+ 1,
480 (int64_t)cpos
, landmark
, sz
);
481 if (bgzf_write(fp
, buf
, strlen(buf
)) < 0)
485 ref
= s
->crecs
[i
].ref_id
;
486 ref_start
= s
->crecs
[i
].apos
;
487 ref_end
= s
->crecs
[i
].aend
;
491 sprintf(buf
, "%d\t%d\t%d\t%"PRId64
"\t%d\t%d\n",
492 ref
, ref_start
, ref_end
- ref_start
+ 1,
493 (int64_t)cpos
, landmark
, sz
);
494 if (bgzf_write(fp
, buf
, strlen(buf
)) < 0)
502 * Builds an index file.
504 * fd is a newly opened cram file that we wish to index.
505 * fn_base is the filename of the associated CRAM file.
506 * fn_idx is the filename of the index file to be written;
507 * if NULL, we add ".crai" to fn_base to get the index filename.
509 * Returns 0 on success,
510 * negative on failure (-1 for read failure, -4 for write failure)
512 int cram_index_build(cram_fd
*fd
, const char *fn_base
, const char *fn_idx
) {
514 off_t cpos
, spos
, hpos
;
516 kstring_t fn_idx_str
= {0};
519 kputs(fn_base
, &fn_idx_str
);
520 kputs(".crai", &fn_idx_str
);
521 fn_idx
= fn_idx_str
.s
;
524 if (!(fp
= bgzf_open(fn_idx
, "wg"))) {
532 cpos
= htell(fd
->fp
);
533 while ((c
= cram_read_container(fd
))) {
537 perror("Cram container read");
541 hpos
= htell(fd
->fp
);
543 if (!(c
->comp_hdr_block
= cram_read_block(fd
)))
545 assert(c
->comp_hdr_block
->content_type
== COMPRESSION_HEADER
);
547 c
->comp_hdr
= cram_decode_compression_header(fd
, c
->comp_hdr_block
);
552 for (j
= 0; j
< c
->num_landmarks
; j
++) {
557 spos
= htell(fd
->fp
);
558 assert(spos
- cpos
- c
->offset
== c
->landmark
[j
]);
560 if (!(s
= cram_read_slice(fd
))) {
565 sz
= (int)(htell(fd
->fp
) - spos
);
567 if (s
->hdr
->ref_seq_id
== -2) {
568 ret
= cram_index_build_multiref(fd
, c
, s
, fp
,
569 cpos
, c
->landmark
[j
], sz
);
571 sprintf(buf
, "%d\t%d\t%d\t%"PRId64
"\t%d\t%d\n",
572 s
->hdr
->ref_seq_id
, s
->hdr
->ref_seq_start
,
573 s
->hdr
->ref_seq_span
, (int64_t)cpos
,
575 ret
= (bgzf_write(fp
, buf
, strlen(buf
)) >= 0)? 0 : -4;
586 cpos
= htell(fd
->fp
);
587 assert(cpos
== hpos
+ c
->length
);
589 cram_free_container(c
);
596 return (bgzf_close(fp
) >= 0)? 0 : -4;