modified: myjupyterlab.sh
[GalaxyCodeBases.git] / c_cpp / lib / htslib / cram / cram_index.c
blob53f178871dc541eab8ea3066800b64b83eb8cfeb
1 /*
2 Copyright (c) 2013-2014 Genome Research Ltd.
3 Author: James Bonfield <jkb@sanger.ac.uk>
5 Redistribution and use in source and binary forms, with or without
6 modification, are permitted provided that the following conditions are met:
8 1. Redistributions of source code must retain the above copyright notice,
9 this list of conditions and the following disclaimer.
11 2. Redistributions in binary form must reproduce the above copyright notice,
12 this list of conditions and the following disclaimer in the documentation
13 and/or other materials provided with the distribution.
15 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
16 Institute nor the names of its contributors may be used to endorse or promote
17 products derived from this software without specific prior written permission.
19 THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
20 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22 DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
23 FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 * The index is a gzipped tab-delimited text file with one line per slice.
33 * The columns are:
34 * 1: reference number (0 to N-1, as per BAM ref_id)
35 * 2: reference position of 1st read in slice (1..?)
36 * 3: number of reads in slice
37 * 4: offset of container start (relative to end of SAM header, so 1st
38 * container is offset 0).
39 * 5: slice number within container (ie which landmark).
41 * In memory, we hold this in a nested containment list. Each list element is
42 * a cram_index struct. Each element in turn can contain its own list of
43 * cram_index structs.
45 * Any start..end range which is entirely contained within another (and
46 * earlier as it is sorted) range will be held within it. This ensures that
47 * the outer list will never have containments and we can safely do a
48 * binary search to find the first range which overlaps any given coordinate.
51 #include <config.h>
53 #include <stdio.h>
54 #include <errno.h>
55 #include <assert.h>
56 #include <stdlib.h>
57 #include <string.h>
58 #include <zlib.h>
59 #include <sys/types.h>
60 #include <sys/stat.h>
61 #include <math.h>
63 #include "htslib/bgzf.h"
64 #include "htslib/hfile.h"
65 #include "hts_internal.h"
66 #include "cram/cram.h"
67 #include "cram/os.h"
69 #if 0
70 static void dump_index_(cram_index *e, int level) {
71 int i, n;
72 n = printf("%*s%d / %d .. %d, ", level*4, "", e->refid, e->start, e->end);
73 printf("%*soffset %"PRId64"\n", MAX(0,50-n), "", e->offset);
74 for (i = 0; i < e->nslice; i++) {
75 dump_index_(&e->e[i], level+1);
79 static void dump_index(cram_fd *fd) {
80 int i;
81 for (i = 0; i < fd->index_sz; i++) {
82 dump_index_(&fd->index[i], 0);
85 #endif
87 static int kget_int32(kstring_t *k, size_t *pos, int32_t *val_p) {
88 int sign = 1;
89 int32_t val = 0;
90 size_t p = *pos;
92 while (p < k->l && (k->s[p] == ' ' || k->s[p] == '\t'))
93 p++;
95 if (p < k->l && k->s[p] == '-')
96 sign = -1, p++;
98 if (p >= k->l || !(k->s[p] >= '0' && k->s[p] <= '9'))
99 return -1;
101 while (p < k->l && k->s[p] >= '0' && k->s[p] <= '9')
102 val = val*10 + k->s[p++]-'0';
104 *pos = p;
105 *val_p = sign*val;
107 return 0;
110 static int kget_int64(kstring_t *k, size_t *pos, int64_t *val_p) {
111 int sign = 1;
112 int64_t val = 0;
113 size_t p = *pos;
115 while (p < k->l && (k->s[p] == ' ' || k->s[p] == '\t'))
116 p++;
118 if (p < k->l && k->s[p] == '-')
119 sign = -1, p++;
121 if (p >= k->l || !(k->s[p] >= '0' && k->s[p] <= '9'))
122 return -1;
124 while (p < k->l && k->s[p] >= '0' && k->s[p] <= '9')
125 val = val*10 + k->s[p++]-'0';
127 *pos = p;
128 *val_p = sign*val;
130 return 0;
134 * Loads a CRAM .crai index into memory.
136 * Returns 0 for success
137 * -1 for failure
139 int cram_index_load(cram_fd *fd, const char *fn, const char *fn_idx) {
140 char *fn2 = NULL;
141 char buf[65536];
142 ssize_t len;
143 kstring_t kstr = {0};
144 hFILE *fp;
145 cram_index *idx;
146 cram_index **idx_stack = NULL, *ep, e;
147 int idx_stack_alloc = 0, idx_stack_ptr = 0;
148 size_t pos = 0;
150 /* Check if already loaded */
151 if (fd->index)
152 return 0;
154 fd->index = calloc((fd->index_sz = 1), sizeof(*fd->index));
155 if (!fd->index)
156 return -1;
158 idx = &fd->index[0];
159 idx->refid = -1;
160 idx->start = INT_MIN;
161 idx->end = INT_MAX;
163 idx_stack = calloc(++idx_stack_alloc, sizeof(*idx_stack));
164 if (!idx_stack)
165 goto fail;
167 idx_stack[idx_stack_ptr] = idx;
169 if (!fn_idx) {
170 fn2 = hts_idx_getfn(fn, ".crai");
171 if (!fn2)
172 goto fail;
174 fn_idx = fn2;
177 if (!(fp = hopen(fn_idx, "r"))) {
178 perror(fn_idx);
179 goto fail;
182 // Load the file into memory
183 while ((len = hread(fp, buf, sizeof(buf))) > 0) {
184 if (kputsn(buf, len, &kstr) < 0)
185 goto fail;
188 if (len < 0 || kstr.l < 2)
189 goto fail;
191 if (hclose(fp) < 0)
192 goto fail;
194 // Uncompress if required
195 if (kstr.s[0] == 31 && (uc)kstr.s[1] == 139) {
196 size_t l;
197 char *s = zlib_mem_inflate(kstr.s, kstr.l, &l);
198 if (!s)
199 goto fail;
201 free(kstr.s);
202 kstr.s = s;
203 kstr.l = l;
204 kstr.m = l; // conservative estimate of the size allocated
205 if (kputsn("", 0, &kstr) < 0) // ensure kstr.s is NUL-terminated
206 goto fail;
210 // Parse it line at a time
211 do {
212 /* 1.1 layout */
213 if (kget_int32(&kstr, &pos, &e.refid) == -1)
214 goto fail;
216 if (kget_int32(&kstr, &pos, &e.start) == -1)
217 goto fail;
219 if (kget_int32(&kstr, &pos, &e.end) == -1)
220 goto fail;
222 if (kget_int64(&kstr, &pos, &e.offset) == -1)
223 goto fail;
225 if (kget_int32(&kstr, &pos, &e.slice) == -1)
226 goto fail;
228 if (kget_int32(&kstr, &pos, &e.len) == -1)
229 goto fail;
231 e.end += e.start-1;
232 //printf("%d/%d..%d\n", e.refid, e.start, e.end);
234 if (e.refid < -1) {
235 fprintf(stderr, "Malformed index file, refid %d\n", e.refid);
236 goto fail;
239 if (e.refid != idx->refid) {
240 if (fd->index_sz < e.refid+2) {
241 cram_index *new_idx;
242 int new_sz = e.refid+2;
243 size_t index_end = fd->index_sz * sizeof(*fd->index);
244 new_idx = realloc(fd->index,
245 new_sz * sizeof(*fd->index));
246 if (!new_idx)
247 goto fail;
249 fd->index = new_idx;
250 fd->index_sz = new_sz;
251 memset(((char *)fd->index) + index_end, 0,
252 fd->index_sz * sizeof(*fd->index) - index_end);
254 idx = &fd->index[e.refid+1];
255 idx->refid = e.refid;
256 idx->start = INT_MIN;
257 idx->end = INT_MAX;
258 idx->nslice = idx->nalloc = 0;
259 idx->e = NULL;
260 idx_stack[(idx_stack_ptr = 0)] = idx;
263 while (!(e.start >= idx->start && e.end <= idx->end) || idx->end == 0) {
264 idx = idx_stack[--idx_stack_ptr];
267 // Now contains, so append
268 if (idx->nslice+1 >= idx->nalloc) {
269 cram_index *new_e;
270 idx->nalloc = idx->nalloc ? idx->nalloc*2 : 16;
271 new_e = realloc(idx->e, idx->nalloc * sizeof(*idx->e));
272 if (!new_e)
273 goto fail;
275 idx->e = new_e;
278 e.nalloc = e.nslice = 0; e.e = NULL;
279 *(ep = &idx->e[idx->nslice++]) = e;
280 idx = ep;
282 if (++idx_stack_ptr >= idx_stack_alloc) {
283 cram_index **new_stack;
284 idx_stack_alloc *= 2;
285 new_stack = realloc(idx_stack, idx_stack_alloc*sizeof(*idx_stack));
286 if (!new_stack)
287 goto fail;
288 idx_stack = new_stack;
290 idx_stack[idx_stack_ptr] = idx;
292 while (pos < kstr.l && kstr.s[pos] != '\n')
293 pos++;
294 pos++;
295 } while (pos < kstr.l);
297 free(idx_stack);
298 free(kstr.s);
299 free(fn2);
301 // dump_index(fd);
303 return 0;
305 fail:
306 free(kstr.s);
307 free(idx_stack);
308 free(fn2);
309 cram_index_free(fd); // Also sets fd->index = NULL
310 return -1;
313 static void cram_index_free_recurse(cram_index *e) {
314 if (e->e) {
315 int i;
316 for (i = 0; i < e->nslice; i++) {
317 cram_index_free_recurse(&e->e[i]);
319 free(e->e);
323 void cram_index_free(cram_fd *fd) {
324 int i;
326 if (!fd->index)
327 return;
329 for (i = 0; i < fd->index_sz; i++) {
330 cram_index_free_recurse(&fd->index[i]);
332 free(fd->index);
334 fd->index = NULL;
338 * Searches the index for the first slice overlapping a reference ID
339 * and position, or one immediately preceding it if none is found in
340 * the index to overlap this position. (Our index may have missing
341 * entries, but we require at least one per reference.)
343 * If the index finds multiple slices overlapping this position we
344 * return the first one only. Subsequent calls should specifying
345 * "from" as the last slice we checked to find the next one. Otherwise
346 * set "from" to be NULL to find the first one.
348 * Returns the cram_index pointer on sucess
349 * NULL on failure
351 cram_index *cram_index_query(cram_fd *fd, int refid, int pos,
352 cram_index *from) {
353 int i, j, k;
354 cram_index *e;
356 if (refid+1 < 0 || refid+1 >= fd->index_sz)
357 return NULL;
359 if (!from)
360 from = &fd->index[refid+1];
362 // Ref with nothing aligned against it.
363 if (!from->e)
364 return NULL;
366 // This sequence is covered by the index, so binary search to find
367 // the optimal starting block.
368 i = 0, j = fd->index[refid+1].nslice-1;
369 for (k = j/2; k != i; k = (j-i)/2 + i) {
370 if (from->e[k].refid > refid) {
371 j = k;
372 continue;
375 if (from->e[k].refid < refid) {
376 i = k;
377 continue;
380 if (from->e[k].start >= pos) {
381 j = k;
382 continue;
385 if (from->e[k].start < pos) {
386 i = k;
387 continue;
390 // i==j or i==j-1. Check if j is better.
391 if (j >= 0 && from->e[j].start < pos && from->e[j].refid == refid)
392 i = j;
394 /* The above found *a* bin overlapping, but not necessarily the first */
395 while (i > 0 && from->e[i-1].end >= pos)
396 i--;
398 /* We may be one bin before the optimum, so check */
399 while (i+1 < from->nslice &&
400 (from->e[i].refid < refid ||
401 from->e[i].end < pos))
402 i++;
404 e = &from->e[i];
406 return e;
411 * Skips to a container overlapping the start coordinate listed in
412 * cram_range.
414 * In theory we call cram_index_query multiple times, once per slice
415 * overlapping the range. However slices may be absent from the index
416 * which makes this problematic. Instead we find the left-most slice
417 * and then read from then on, skipping decoding of slices and/or
418 * whole containers when they don't overlap the specified cram_range.
420 * Returns 0 on success
421 * -1 on general failure
422 * -2 on no-data (empty chromosome)
424 int cram_seek_to_refpos(cram_fd *fd, cram_range *r) {
425 cram_index *e;
427 // Ideally use an index, so see if we have one.
428 if ((e = cram_index_query(fd, r->refid, r->start, NULL))) {
429 if (0 != cram_seek(fd, e->offset, SEEK_SET))
430 if (0 != cram_seek(fd, e->offset - fd->first_container, SEEK_CUR))
431 return -1;
432 } else {
433 // Absent from index, but this most likely means it simply has no data.
434 return -2;
437 if (fd->ctr) {
438 cram_free_container(fd->ctr);
439 fd->ctr = NULL;
440 fd->ooc = 0;
443 return 0;
448 * A specialised form of cram_index_build (below) that deals with slices
449 * having multiple references in this (ref_id -2). In this scenario we
450 * decode the slice to look at the RI data series instead.
452 * Returns 0 on success
453 * -1 on read failure
454 * -4 on write failure
456 static int cram_index_build_multiref(cram_fd *fd,
457 cram_container *c,
458 cram_slice *s,
459 BGZF *fp,
460 off_t cpos,
461 int32_t landmark,
462 int sz) {
463 int i, ref = -2, ref_start = 0, ref_end;
464 char buf[1024];
466 if (0 != cram_decode_slice(fd, c, s, fd->header))
467 return -1;
469 ref_end = INT_MIN;
470 for (i = 0; i < s->hdr->num_records; i++) {
471 if (s->crecs[i].ref_id == ref) {
472 if (ref_end < s->crecs[i].aend)
473 ref_end = s->crecs[i].aend;
474 continue;
477 if (ref != -2) {
478 sprintf(buf, "%d\t%d\t%d\t%"PRId64"\t%d\t%d\n",
479 ref, ref_start, ref_end - ref_start + 1,
480 (int64_t)cpos, landmark, sz);
481 if (bgzf_write(fp, buf, strlen(buf)) < 0)
482 return -4;
485 ref = s->crecs[i].ref_id;
486 ref_start = s->crecs[i].apos;
487 ref_end = s->crecs[i].aend;
490 if (ref != -2) {
491 sprintf(buf, "%d\t%d\t%d\t%"PRId64"\t%d\t%d\n",
492 ref, ref_start, ref_end - ref_start + 1,
493 (int64_t)cpos, landmark, sz);
494 if (bgzf_write(fp, buf, strlen(buf)) < 0)
495 return -4;
498 return 0;
502 * Builds an index file.
504 * fd is a newly opened cram file that we wish to index.
505 * fn_base is the filename of the associated CRAM file.
506 * fn_idx is the filename of the index file to be written;
507 * if NULL, we add ".crai" to fn_base to get the index filename.
509 * Returns 0 on success,
510 * negative on failure (-1 for read failure, -4 for write failure)
512 int cram_index_build(cram_fd *fd, const char *fn_base, const char *fn_idx) {
513 cram_container *c;
514 off_t cpos, spos, hpos;
515 BGZF *fp;
516 kstring_t fn_idx_str = {0};
518 if (! fn_idx) {
519 kputs(fn_base, &fn_idx_str);
520 kputs(".crai", &fn_idx_str);
521 fn_idx = fn_idx_str.s;
524 if (!(fp = bgzf_open(fn_idx, "wg"))) {
525 perror(fn_idx);
526 free(fn_idx_str.s);
527 return -4;
530 free(fn_idx_str.s);
532 cpos = htell(fd->fp);
533 while ((c = cram_read_container(fd))) {
534 int j;
536 if (fd->err) {
537 perror("Cram container read");
538 return -1;
541 hpos = htell(fd->fp);
543 if (!(c->comp_hdr_block = cram_read_block(fd)))
544 return -1;
545 assert(c->comp_hdr_block->content_type == COMPRESSION_HEADER);
547 c->comp_hdr = cram_decode_compression_header(fd, c->comp_hdr_block);
548 if (!c->comp_hdr)
549 return -1;
551 // 2.0 format
552 for (j = 0; j < c->num_landmarks; j++) {
553 char buf[1024];
554 cram_slice *s;
555 int sz, ret;
557 spos = htell(fd->fp);
558 assert(spos - cpos - c->offset == c->landmark[j]);
560 if (!(s = cram_read_slice(fd))) {
561 bgzf_close(fp);
562 return -1;
565 sz = (int)(htell(fd->fp) - spos);
567 if (s->hdr->ref_seq_id == -2) {
568 ret = cram_index_build_multiref(fd, c, s, fp,
569 cpos, c->landmark[j], sz);
570 } else {
571 sprintf(buf, "%d\t%d\t%d\t%"PRId64"\t%d\t%d\n",
572 s->hdr->ref_seq_id, s->hdr->ref_seq_start,
573 s->hdr->ref_seq_span, (int64_t)cpos,
574 c->landmark[j], sz);
575 ret = (bgzf_write(fp, buf, strlen(buf)) >= 0)? 0 : -4;
578 cram_free_slice(s);
580 if (ret < 0) {
581 bgzf_close(fp);
582 return ret;
586 cpos = htell(fd->fp);
587 assert(cpos == hpos + c->length);
589 cram_free_container(c);
591 if (fd->err) {
592 bgzf_close(fp);
593 return -1;
596 return (bgzf_close(fp) >= 0)? 0 : -4;