c_cpp/lib/htslib/cram/cram_index.c

   1 /*
   2 Copyright (c) 2013-2014 Genome Research Ltd.
   3 Author: James Bonfield <jkb@sanger.ac.uk>
   4
   5 Redistribution and use in source and binary forms, with or without
   6 modification, are permitted provided that the following conditions are met:
   7
   8    1. Redistributions of source code must retain the above copyright notice,
   9 this list of conditions and the following disclaimer.
  10
  11    2. Redistributions in binary form must reproduce the above copyright notice,
  12 this list of conditions and the following disclaimer in the documentation
  13 and/or other materials provided with the distribution.
  14
  15    3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
  16 Institute nor the names of its contributors may be used to endorse or promote
  17 products derived from this software without specific prior written permission.
  18
  19 THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
  20 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  21 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  22 DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
  23 FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  24 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  25 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  26 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  27 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  28 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  29 */
  30
  31 /*
  32  * The index is a gzipped tab-delimited text file with one line per slice.
  33  * The columns are:
  34  * 1: reference number (0 to N-1, as per BAM ref_id)
  35  * 2: reference position of 1st read in slice (1..?)
  36  * 3: number of reads in slice
  37  * 4: offset of container start (relative to end of SAM header, so 1st
  38  *    container is offset 0).
  39  * 5: slice number within container (ie which landmark).
  40  *
  41  * In memory, we hold this in a nested containment list. Each list element is
  42  * a cram_index struct. Each element in turn can contain its own list of
  43  * cram_index structs.
  44  *
  45  * Any start..end range which is entirely contained within another (and
  46  * earlier as it is sorted) range will be held within it. This ensures that
  47  * the outer list will never have containments and we can safely do a
  48  * binary search to find the first range which overlaps any given coordinate.
  49  */
  50
  51 #include <config.h>
  52
  53 #include <stdio.h>
  54 #include <errno.h>
  55 #include <assert.h>
  56 #include <stdlib.h>
  57 #include <string.h>
  58 #include <zlib.h>
  59 #include <sys/types.h>
  60 #include <sys/stat.h>
  61 #include <math.h>
  62
  63 #include "htslib/bgzf.h"
  64 #include "htslib/hfile.h"
  65 #include "hts_internal.h"
  66 #include "cram/cram.h"
  67 #include "cram/os.h"
  68
  69 #if 0
  70 static void dump_index_(cram_index *e, int level) {
  71     int i, n;
  72     n = printf("%*s%d / %d .. %d, ", level*4, "", e->refid, e->start, e->end);
  73     printf("%*soffset %"PRId64"\n", MAX(0,50-n), "", e->offset);
  74     for (i = 0; i < e->nslice; i++) {
  75         dump_index_(&e->e[i], level+1);
  76     }
  77 }
  78
  79 static void dump_index(cram_fd *fd) {
  80     int i;
  81     for (i = 0; i < fd->index_sz; i++) {
  82         dump_index_(&fd->index[i], 0);
  83     }
  84 }
  85 #endif
  86
  87 static int kget_int32(kstring_t *k, size_t *pos, int32_t *val_p) {
  88     int sign = 1;
  89     int32_t val = 0;
  90     size_t p = *pos;
  91
  92     while (p < k->l && (k->s[p] == ' ' || k->s[p] == '\t'))
  93            p++;
  94
  95     if (p < k->l && k->s[p] == '-')
  96         sign = -1, p++;
  97
  98     if (p >= k->l || !(k->s[p] >= '0' && k->s[p] <= '9'))
  99         return -1;
 100
 101     while (p < k->l && k->s[p] >= '0' && k->s[p] <= '9')
 102         val = val*10 + k->s[p++]-'0';
 103
 104     *pos = p;
 105     *val_p = sign*val;
 106
 107     return 0;
 108 }
 109
 110 static int kget_int64(kstring_t *k, size_t *pos, int64_t *val_p) {
 111     int sign = 1;
 112     int64_t val = 0;
 113     size_t p = *pos;
 114
 115     while (p < k->l && (k->s[p] == ' ' || k->s[p] == '\t'))
 116            p++;
 117
 118     if (p < k->l && k->s[p] == '-')
 119         sign = -1, p++;
 120
 121     if (p >= k->l || !(k->s[p] >= '0' && k->s[p] <= '9'))
 122         return -1;
 123
 124     while (p < k->l && k->s[p] >= '0' && k->s[p] <= '9')
 125         val = val*10 + k->s[p++]-'0';
 126
 127     *pos = p;
 128     *val_p = sign*val;
 129
 130     return 0;
 131 }
 132
 133 /*
 134  * Loads a CRAM .crai index into memory.
 135  *
 136  * Returns 0 for success
 137  *        -1 for failure
 138  */
 139 int cram_index_load(cram_fd *fd, const char *fn, const char *fn_idx) {
 140     char *fn2 = NULL;
 141     char buf[65536];
 142     ssize_t len;
 143     kstring_t kstr = {0};
 144     hFILE *fp;
 145     cram_index *idx;
 146     cram_index **idx_stack = NULL, *ep, e;
 147     int idx_stack_alloc = 0, idx_stack_ptr = 0;
 148     size_t pos = 0;
 149
 150     /* Check if already loaded */
 151     if (fd->index)
 152         return 0;
 153
 154     fd->index = calloc((fd->index_sz = 1), sizeof(*fd->index));
 155     if (!fd->index)
 156         return -1;
 157
 158     idx = &fd->index[0];
 159     idx->refid = -1;
 160     idx->start = INT_MIN;
 161     idx->end   = INT_MAX;
 162
 163     idx_stack = calloc(++idx_stack_alloc, sizeof(*idx_stack));
 164     if (!idx_stack)
 165         goto fail;
 166
 167     idx_stack[idx_stack_ptr] = idx;
 168
 169     if (!fn_idx) {
 170         fn2 = hts_idx_getfn(fn, ".crai");
 171         if (!fn2)
 172             goto fail;
 173
 174         fn_idx = fn2;
 175     }
 176
 177     if (!(fp = hopen(fn_idx, "r"))) {
 178         perror(fn_idx);
 179         goto fail;
 180     }
 181
 182     // Load the file into memory
 183     while ((len = hread(fp, buf, sizeof(buf))) > 0) {
 184         if (kputsn(buf, len, &kstr) < 0)
 185             goto fail;
 186     }
 187
 188     if (len < 0 || kstr.l < 2)
 189         goto fail;
 190
 191     if (hclose(fp) < 0)
 192         goto fail;
 193
 194     // Uncompress if required
 195     if (kstr.s[0] == 31 && (uc)kstr.s[1] == 139) {
 196         size_t l;
 197         char *s = zlib_mem_inflate(kstr.s, kstr.l, &l);
 198         if (!s)
 199             goto fail;
 200
 201         free(kstr.s);
 202         kstr.s = s;
 203         kstr.l = l;
 204         kstr.m = l; // conservative estimate of the size allocated
 205         if (kputsn("", 0, &kstr) < 0) // ensure kstr.s is NUL-terminated
 206             goto fail;
 207     }
 208
 209
 210     // Parse it line at a time
 211     do {
 212         /* 1.1 layout */
 213         if (kget_int32(&kstr, &pos, &e.refid) == -1)
 214             goto fail;
 215
 216         if (kget_int32(&kstr, &pos, &e.start) == -1)
 217             goto fail;
 218
 219         if (kget_int32(&kstr, &pos, &e.end) == -1)
 220             goto fail;
 221
 222         if (kget_int64(&kstr, &pos, &e.offset) == -1)
 223             goto fail;
 224
 225         if (kget_int32(&kstr, &pos, &e.slice) == -1)
 226             goto fail;
 227
 228         if (kget_int32(&kstr, &pos, &e.len) == -1)
 229             goto fail;
 230
 231         e.end += e.start-1;
 232         //printf("%d/%d..%d\n", e.refid, e.start, e.end);
 233
 234         if (e.refid < -1) {
 235             fprintf(stderr, "Malformed index file, refid %d\n", e.refid);
 236             goto fail;
 237         }
 238
 239         if (e.refid != idx->refid) {
 240             if (fd->index_sz < e.refid+2) {
 241                 cram_index *new_idx;
 242                 int new_sz = e.refid+2;
 243                 size_t index_end = fd->index_sz * sizeof(*fd->index);
 244                 new_idx = realloc(fd->index,
 245                                   new_sz * sizeof(*fd->index));
 246                 if (!new_idx)
 247                     goto fail;
 248
 249                 fd->index = new_idx;
 250                 fd->index_sz = new_sz;
 251                 memset(((char *)fd->index) + index_end, 0,
 252                        fd->index_sz * sizeof(*fd->index) - index_end);
 253             }
 254             idx = &fd->index[e.refid+1];
 255             idx->refid = e.refid;
 256             idx->start = INT_MIN;
 257             idx->end   = INT_MAX;
 258             idx->nslice = idx->nalloc = 0;
 259             idx->e = NULL;
 260             idx_stack[(idx_stack_ptr = 0)] = idx;
 261         }
 262
 263         while (!(e.start >= idx->start && e.end <= idx->end) || idx->end == 0) {
 264             idx = idx_stack[--idx_stack_ptr];
 265         }
 266
 267         // Now contains, so append
 268         if (idx->nslice+1 >= idx->nalloc) {
 269             cram_index *new_e;
 270             idx->nalloc = idx->nalloc ? idx->nalloc*2 : 16;
 271             new_e = realloc(idx->e, idx->nalloc * sizeof(*idx->e));
 272             if (!new_e)
 273                 goto fail;
 274
 275             idx->e = new_e;
 276         }
 277
 278         e.nalloc = e.nslice = 0; e.e = NULL;
 279         *(ep = &idx->e[idx->nslice++]) = e;
 280         idx = ep;
 281
 282         if (++idx_stack_ptr >= idx_stack_alloc) {
 283             cram_index **new_stack;
 284             idx_stack_alloc *= 2;
 285             new_stack = realloc(idx_stack, idx_stack_alloc*sizeof(*idx_stack));
 286             if (!new_stack)
 287                 goto fail;
 288             idx_stack = new_stack;
 289         }
 290         idx_stack[idx_stack_ptr] = idx;
 291
 292         while (pos < kstr.l && kstr.s[pos] != '\n')
 293             pos++;
 294         pos++;
 295     } while (pos < kstr.l);
 296
 297     free(idx_stack);
 298     free(kstr.s);
 299     free(fn2);
 300
 301     // dump_index(fd);
 302
 303     return 0;
 304
 305  fail:
 306     free(kstr.s);
 307     free(idx_stack);
 308     free(fn2);
 309     cram_index_free(fd); // Also sets fd->index = NULL
 310     return -1;
 311 }
 312
 313 static void cram_index_free_recurse(cram_index *e) {
 314     if (e->e) {
 315         int i;
 316         for (i = 0; i < e->nslice; i++) {
 317             cram_index_free_recurse(&e->e[i]);
 318         }
 319         free(e->e);
 320     }
 321 }
 322
 323 void cram_index_free(cram_fd *fd) {
 324     int i;
 325
 326     if (!fd->index)
 327         return;
 328
 329     for (i = 0; i < fd->index_sz; i++) {
 330         cram_index_free_recurse(&fd->index[i]);
 331     }
 332     free(fd->index);
 333
 334     fd->index = NULL;
 335 }
 336
 337 /*
 338  * Searches the index for the first slice overlapping a reference ID
 339  * and position, or one immediately preceding it if none is found in
 340  * the index to overlap this position. (Our index may have missing
 341  * entries, but we require at least one per reference.)
 342  *
 343  * If the index finds multiple slices overlapping this position we
 344  * return the first one only. Subsequent calls should specifying
 345  * "from" as the last slice we checked to find the next one. Otherwise
 346  * set "from" to be NULL to find the first one.
 347  *
 348  * Returns the cram_index pointer on sucess
 349  *         NULL on failure
 350  */
 351 cram_index *cram_index_query(cram_fd *fd, int refid, int pos,
 352                              cram_index *from) {
 353     int i, j, k;
 354     cram_index *e;
 355
 356     if (refid+1 < 0 || refid+1 >= fd->index_sz)
 357         return NULL;
 358
 359     if (!from)
 360         from = &fd->index[refid+1];
 361
 362     // Ref with nothing aligned against it.
 363     if (!from->e)
 364         return NULL;
 365
 366     // This sequence is covered by the index, so binary search to find
 367     // the optimal starting block.
 368     i = 0, j = fd->index[refid+1].nslice-1;
 369     for (k = j/2; k != i; k = (j-i)/2 + i) {
 370         if (from->e[k].refid > refid) {
 371             j = k;
 372             continue;
 373         }
 374
 375         if (from->e[k].refid < refid) {
 376             i = k;
 377             continue;
 378         }
 379
 380         if (from->e[k].start >= pos) {
 381             j = k;
 382             continue;
 383         }
 384
 385         if (from->e[k].start < pos) {
 386             i = k;
 387             continue;
 388         }
 389     }
 390     // i==j or i==j-1. Check if j is better.
 391     if (j >= 0 && from->e[j].start < pos && from->e[j].refid == refid)
 392         i = j;
 393
 394     /* The above found *a* bin overlapping, but not necessarily the first */
 395     while (i > 0 && from->e[i-1].end >= pos)
 396         i--;
 397
 398     /* We may be one bin before the optimum, so check */
 399     while (i+1 < from->nslice &&
 400            (from->e[i].refid < refid ||
 401             from->e[i].end < pos))
 402         i++;
 403
 404     e = &from->e[i];
 405
 406     return e;
 407 }
 408
 409
 410 /*
 411  * Skips to a container overlapping the start coordinate listed in
 412  * cram_range.
 413  *
 414  * In theory we call cram_index_query multiple times, once per slice
 415  * overlapping the range. However slices may be absent from the index
 416  * which makes this problematic. Instead we find the left-most slice
 417  * and then read from then on, skipping decoding of slices and/or
 418  * whole containers when they don't overlap the specified cram_range.
 419  *
 420  * Returns 0 on success
 421  *        -1 on general failure
 422  *        -2 on no-data (empty chromosome)
 423  */
 424 int cram_seek_to_refpos(cram_fd *fd, cram_range *r) {
 425     cram_index *e;
 426
 427     // Ideally use an index, so see if we have one.
 428     if ((e = cram_index_query(fd, r->refid, r->start, NULL))) {
 429         if (0 != cram_seek(fd, e->offset, SEEK_SET))
 430             if (0 != cram_seek(fd, e->offset - fd->first_container, SEEK_CUR))
 431                 return -1;
 432     } else {
 433         // Absent from index, but this most likely means it simply has no data.
 434         return -2;
 435     }
 436
 437     if (fd->ctr) {
 438         cram_free_container(fd->ctr);
 439         fd->ctr = NULL;
 440         fd->ooc = 0;
 441     }
 442
 443     return 0;
 444 }
 445
 446
 447 /*
 448  * A specialised form of cram_index_build (below) that deals with slices
 449  * having multiple references in this (ref_id -2). In this scenario we
 450  * decode the slice to look at the RI data series instead.
 451  *
 452  * Returns 0 on success
 453  *        -1 on read failure
 454  *        -4 on write failure
 455  */
 456 static int cram_index_build_multiref(cram_fd *fd,
 457                                      cram_container *c,
 458                                      cram_slice *s,
 459                                      BGZF *fp,
 460                                      off_t cpos,
 461                                      int32_t landmark,
 462                                      int sz) {
 463     int i, ref = -2, ref_start = 0, ref_end;
 464     char buf[1024];
 465
 466     if (0 != cram_decode_slice(fd, c, s, fd->header))
 467         return -1;
 468
 469     ref_end = INT_MIN;
 470     for (i = 0; i < s->hdr->num_records; i++) {
 471         if (s->crecs[i].ref_id == ref) {
 472             if (ref_end < s->crecs[i].aend)
 473                 ref_end = s->crecs[i].aend;
 474             continue;
 475         }
 476
 477         if (ref != -2) {
 478             sprintf(buf, "%d\t%d\t%d\t%"PRId64"\t%d\t%d\n",
 479                     ref, ref_start, ref_end - ref_start + 1,
 480                     (int64_t)cpos, landmark, sz);
 481             if (bgzf_write(fp, buf, strlen(buf)) < 0)
 482                 return -4;
 483         }
 484
 485         ref = s->crecs[i].ref_id;
 486         ref_start = s->crecs[i].apos;
 487         ref_end   = s->crecs[i].aend;
 488     }
 489
 490     if (ref != -2) {
 491         sprintf(buf, "%d\t%d\t%d\t%"PRId64"\t%d\t%d\n",
 492                 ref, ref_start, ref_end - ref_start + 1,
 493                 (int64_t)cpos, landmark, sz);
 494         if (bgzf_write(fp, buf, strlen(buf)) < 0)
 495             return -4;
 496     }
 497
 498     return 0;
 499 }
 500
 501 /*
 502  * Builds an index file.
 503  *
 504  * fd is a newly opened cram file that we wish to index.
 505  * fn_base is the filename of the associated CRAM file.
 506  * fn_idx is the filename of the index file to be written;
 507  * if NULL, we add ".crai" to fn_base to get the index filename.
 508  *
 509  * Returns 0 on success,
 510  *         negative on failure (-1 for read failure, -4 for write failure)
 511  */
 512 int cram_index_build(cram_fd *fd, const char *fn_base, const char *fn_idx) {
 513     cram_container *c;
 514     off_t cpos, spos, hpos;
 515     BGZF *fp;
 516     kstring_t fn_idx_str = {0};
 517
 518     if (! fn_idx) {
 519         kputs(fn_base, &fn_idx_str);
 520         kputs(".crai", &fn_idx_str);
 521         fn_idx = fn_idx_str.s;
 522     }
 523
 524     if (!(fp = bgzf_open(fn_idx, "wg"))) {
 525         perror(fn_idx);
 526         free(fn_idx_str.s);
 527         return -4;
 528     }
 529
 530     free(fn_idx_str.s);
 531
 532     cpos = htell(fd->fp);
 533     while ((c = cram_read_container(fd))) {
 534         int j;
 535
 536         if (fd->err) {
 537             perror("Cram container read");
 538             return -1;
 539         }
 540
 541         hpos = htell(fd->fp);
 542
 543         if (!(c->comp_hdr_block = cram_read_block(fd)))
 544             return -1;
 545         assert(c->comp_hdr_block->content_type == COMPRESSION_HEADER);
 546
 547         c->comp_hdr = cram_decode_compression_header(fd, c->comp_hdr_block);
 548         if (!c->comp_hdr)
 549             return -1;
 550
 551         // 2.0 format
 552         for (j = 0; j < c->num_landmarks; j++) {
 553             char buf[1024];
 554             cram_slice *s;
 555             int sz, ret;
 556
 557             spos = htell(fd->fp);
 558             assert(spos - cpos - c->offset == c->landmark[j]);
 559
 560             if (!(s = cram_read_slice(fd))) {
 561                 bgzf_close(fp);
 562                 return -1;
 563             }
 564
 565             sz = (int)(htell(fd->fp) - spos);
 566
 567             if (s->hdr->ref_seq_id == -2) {
 568                 ret = cram_index_build_multiref(fd, c, s, fp,
 569                                                 cpos, c->landmark[j], sz);
 570             } else {
 571                 sprintf(buf, "%d\t%d\t%d\t%"PRId64"\t%d\t%d\n",
 572                         s->hdr->ref_seq_id, s->hdr->ref_seq_start,
 573                         s->hdr->ref_seq_span, (int64_t)cpos,
 574                         c->landmark[j], sz);
 575                 ret = (bgzf_write(fp, buf, strlen(buf)) >= 0)? 0 : -4;
 576             }
 577
 578             cram_free_slice(s);
 579
 580             if (ret < 0) {
 581                 bgzf_close(fp);
 582                 return ret;
 583             }
 584         }
 585
 586         cpos = htell(fd->fp);
 587         assert(cpos == hpos + c->length);
 588
 589         cram_free_container(c);
 590     }
 591     if (fd->err) {
 592         bgzf_close(fp);
 593         return -1;
 594     }
 595
 596     return (bgzf_close(fp) >= 0)? 0 : -4;
 597 }