modified: src1/input.c
[GalaxyCodeBases.git] / c_cpp / lib / htslib / sam.c
blob5e9c20de29ce225d61204bb89f7e4e443547c539
1 /* sam.c -- SAM and BAM file I/O and manipulation.
3 Copyright (C) 2008-2010, 2012-2017 Genome Research Ltd.
4 Copyright (C) 2010, 2012, 2013 Broad Institute.
6 Author: Heng Li <lh3@sanger.ac.uk>
8 Permission is hereby granted, free of charge, to any person obtaining a copy
9 of this software and associated documentation files (the "Software"), to deal
10 in the Software without restriction, including without limitation the rights
11 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12 copies of the Software, and to permit persons to whom the Software is
13 furnished to do so, subject to the following conditions:
15 The above copyright notice and this permission notice shall be included in
16 all copies or substantial portions of the Software.
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
21 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24 DEALINGS IN THE SOFTWARE. */
26 #include <config.h>
28 #include <strings.h>
29 #include <stdio.h>
30 #include <stdlib.h>
31 #include <string.h>
32 #include <errno.h>
33 #include <zlib.h>
34 #include <assert.h>
35 #include "htslib/sam.h"
36 #include "htslib/bgzf.h"
37 #include "cram/cram.h"
38 #include "hts_internal.h"
39 #include "htslib/hfile.h"
40 #include "htslib/hts_endian.h"
42 #include "htslib/khash.h"
43 KHASH_DECLARE(s2i, kh_cstr_t, int64_t)
45 typedef khash_t(s2i) sdict_t;
47 #ifndef EOVERFLOW
48 #define EOVERFLOW ERANGE
49 #endif
51 /**********************
52 *** BAM header I/O ***
53 **********************/
55 bam_hdr_t *bam_hdr_init()
57 return (bam_hdr_t*)calloc(1, sizeof(bam_hdr_t));
60 void bam_hdr_destroy(bam_hdr_t *h)
62 int32_t i;
63 if (h == NULL) return;
64 if (h->target_name) {
65 for (i = 0; i < h->n_targets; ++i)
66 free(h->target_name[i]);
67 free(h->target_name);
68 free(h->target_len);
70 free(h->text); free(h->cigar_tab);
71 if (h->sdict) kh_destroy(s2i, (sdict_t*)h->sdict);
72 free(h);
75 bam_hdr_t *bam_hdr_dup(const bam_hdr_t *h0)
77 if (h0 == NULL) return NULL;
78 bam_hdr_t *h;
79 if ((h = bam_hdr_init()) == NULL) return NULL;
80 // copy the simple data
81 h->n_targets = h0->n_targets;
82 h->ignore_sam_err = h0->ignore_sam_err;
83 h->l_text = h0->l_text;
84 // Then the pointery stuff
85 h->cigar_tab = NULL;
86 h->sdict = NULL;
87 // TODO Check for memory allocation failures
88 h->text = (char*)calloc(h->l_text + 1, 1);
89 memcpy(h->text, h0->text, h->l_text);
90 h->target_len = (uint32_t*)calloc(h->n_targets, sizeof(uint32_t));
91 h->target_name = (char**)calloc(h->n_targets, sizeof(char*));
92 int i;
93 for (i = 0; i < h->n_targets; ++i) {
94 h->target_len[i] = h0->target_len[i];
95 h->target_name[i] = strdup(h0->target_name[i]);
97 return h;
101 static bam_hdr_t *hdr_from_dict(sdict_t *d)
103 bam_hdr_t *h;
104 khint_t k;
105 h = bam_hdr_init();
106 h->sdict = d;
107 h->n_targets = kh_size(d);
108 // TODO Check for memory allocation failures
109 h->target_len = (uint32_t*)malloc(sizeof(uint32_t) * h->n_targets);
110 h->target_name = (char**)malloc(sizeof(char*) * h->n_targets);
111 for (k = kh_begin(d); k != kh_end(d); ++k) {
112 if (!kh_exist(d, k)) continue;
113 h->target_name[kh_val(d, k)>>32] = (char*)kh_key(d, k);
114 h->target_len[kh_val(d, k)>>32] = kh_val(d, k) & 0xffffffffUL;
115 kh_val(d, k) >>= 32;
117 return h;
120 bam_hdr_t *bam_hdr_read(BGZF *fp)
122 bam_hdr_t *h;
123 char buf[4];
124 int magic_len, has_EOF;
125 int32_t i, name_len, num_names = 0;
126 size_t bufsize;
127 ssize_t bytes;
128 // check EOF
129 has_EOF = bgzf_check_EOF(fp);
130 if (has_EOF < 0) {
131 perror("[W::bam_hdr_read] bgzf_check_EOF");
132 } else if (has_EOF == 0) {
133 hts_log_warning("EOF marker is absent. The input is probably truncated");
135 // read "BAM1"
136 magic_len = bgzf_read(fp, buf, 4);
137 if (magic_len != 4 || strncmp(buf, "BAM\1", 4)) {
138 hts_log_error("Invalid BAM binary header");
139 return 0;
141 h = bam_hdr_init();
142 if (!h) goto nomem;
144 // read plain text and the number of reference sequences
145 bytes = bgzf_read(fp, &h->l_text, 4);
146 if (bytes != 4) goto read_err;
147 if (fp->is_be) ed_swap_4p(&h->l_text);
149 bufsize = ((size_t) h->l_text) + 1;
150 if (bufsize < h->l_text) goto nomem; // so large that adding 1 overflowed
151 h->text = (char*)malloc(bufsize);
152 if (!h->text) goto nomem;
153 h->text[h->l_text] = 0; // make sure it is NULL terminated
154 bytes = bgzf_read(fp, h->text, h->l_text);
155 if (bytes != h->l_text) goto read_err;
157 bytes = bgzf_read(fp, &h->n_targets, 4);
158 if (bytes != 4) goto read_err;
159 if (fp->is_be) ed_swap_4p(&h->n_targets);
161 if (h->n_targets < 0) goto invalid;
163 // read reference sequence names and lengths
164 if (h->n_targets > 0) {
165 h->target_name = (char**)calloc(h->n_targets, sizeof(char*));
166 if (!h->target_name) goto nomem;
167 h->target_len = (uint32_t*)calloc(h->n_targets, sizeof(uint32_t));
168 if (!h->target_len) goto nomem;
170 else {
171 h->target_name = NULL;
172 h->target_len = NULL;
175 for (i = 0; i != h->n_targets; ++i) {
176 bytes = bgzf_read(fp, &name_len, 4);
177 if (bytes != 4) goto read_err;
178 if (fp->is_be) ed_swap_4p(&name_len);
179 if (name_len <= 0) goto invalid;
181 h->target_name[i] = (char*)malloc(name_len);
182 if (!h->target_name[i]) goto nomem;
183 num_names++;
185 bytes = bgzf_read(fp, h->target_name[i], name_len);
186 if (bytes != name_len) goto read_err;
188 if (h->target_name[i][name_len - 1] != '\0') {
189 /* Fix missing NUL-termination. Is this being too nice?
190 We could alternatively bail out with an error. */
191 char *new_name;
192 if (name_len == INT32_MAX) goto invalid;
193 new_name = realloc(h->target_name[i], name_len + 1);
194 if (new_name == NULL) goto nomem;
195 h->target_name[i] = new_name;
196 h->target_name[i][name_len] = '\0';
199 bytes = bgzf_read(fp, &h->target_len[i], 4);
200 if (bytes != 4) goto read_err;
201 if (fp->is_be) ed_swap_4p(&h->target_len[i]);
203 return h;
205 nomem:
206 hts_log_error("Out of memory");
207 goto clean;
209 read_err:
210 if (bytes < 0) {
211 hts_log_error("Error reading BGZF stream");
212 } else {
213 hts_log_error("Truncated BAM header");
215 goto clean;
217 invalid:
218 hts_log_error("Invalid BAM binary header");
220 clean:
221 if (h != NULL) {
222 h->n_targets = num_names; // ensure we free only allocated target_names
223 bam_hdr_destroy(h);
225 return NULL;
228 int bam_hdr_write(BGZF *fp, const bam_hdr_t *h)
230 char buf[4];
231 int32_t i, name_len, x;
232 // write "BAM1"
233 strncpy(buf, "BAM\1", 4);
234 if (bgzf_write(fp, buf, 4) < 0) return -1;
235 // write plain text and the number of reference sequences
236 if (fp->is_be) {
237 x = ed_swap_4(h->l_text);
238 if (bgzf_write(fp, &x, 4) < 0) return -1;
239 if (h->l_text) {
240 if (bgzf_write(fp, h->text, h->l_text) < 0) return -1;
242 x = ed_swap_4(h->n_targets);
243 if (bgzf_write(fp, &x, 4) < 0) return -1;
244 } else {
245 if (bgzf_write(fp, &h->l_text, 4) < 0) return -1;
246 if (h->l_text) {
247 if (bgzf_write(fp, h->text, h->l_text) < 0) return -1;
249 if (bgzf_write(fp, &h->n_targets, 4) < 0) return -1;
251 // write sequence names and lengths
252 for (i = 0; i != h->n_targets; ++i) {
253 char *p = h->target_name[i];
254 name_len = strlen(p) + 1;
255 if (fp->is_be) {
256 x = ed_swap_4(name_len);
257 if (bgzf_write(fp, &x, 4) < 0) return -1;
258 } else {
259 if (bgzf_write(fp, &name_len, 4) < 0) return -1;
261 if (bgzf_write(fp, p, name_len) < 0) return -1;
262 if (fp->is_be) {
263 x = ed_swap_4(h->target_len[i]);
264 if (bgzf_write(fp, &x, 4) < 0) return -1;
265 } else {
266 if (bgzf_write(fp, &h->target_len[i], 4) < 0) return -1;
269 if (bgzf_flush(fp) < 0) return -1;
270 return 0;
273 int bam_name2id(bam_hdr_t *h, const char *ref)
275 sdict_t *d = (sdict_t*)h->sdict;
276 khint_t k;
277 if (h->sdict == 0) {
278 int i, absent;
279 d = kh_init(s2i);
280 for (i = 0; i < h->n_targets; ++i) {
281 k = kh_put(s2i, d, h->target_name[i], &absent);
282 kh_val(d, k) = i;
284 h->sdict = d;
286 k = kh_get(s2i, d, ref);
287 return k == kh_end(d)? -1 : kh_val(d, k);
290 /*************************
291 *** BAM alignment I/O ***
292 *************************/
294 bam1_t *bam_init1()
296 return (bam1_t*)calloc(1, sizeof(bam1_t));
299 void bam_destroy1(bam1_t *b)
301 if (b == 0) return;
302 free(b->data); free(b);
305 bam1_t *bam_copy1(bam1_t *bdst, const bam1_t *bsrc)
307 uint8_t *data = bdst->data;
308 int m_data = bdst->m_data; // backup data and m_data
309 if (m_data < bsrc->l_data) { // double the capacity
310 m_data = bsrc->l_data; kroundup32(m_data);
311 data = (uint8_t*)realloc(data, m_data);
313 memcpy(data, bsrc->data, bsrc->l_data); // copy var-len data
314 *bdst = *bsrc; // copy the rest
315 // restore the backup
316 bdst->m_data = m_data;
317 bdst->data = data;
318 return bdst;
321 bam1_t *bam_dup1(const bam1_t *bsrc)
323 if (bsrc == NULL) return NULL;
324 bam1_t *bdst = bam_init1();
325 if (bdst == NULL) return NULL;
326 return bam_copy1(bdst, bsrc);
329 int bam_cigar2qlen(int n_cigar, const uint32_t *cigar)
331 int k, l;
332 for (k = l = 0; k < n_cigar; ++k)
333 if (bam_cigar_type(bam_cigar_op(cigar[k]))&1)
334 l += bam_cigar_oplen(cigar[k]);
335 return l;
338 int bam_cigar2rlen(int n_cigar, const uint32_t *cigar)
340 int k, l;
341 for (k = l = 0; k < n_cigar; ++k)
342 if (bam_cigar_type(bam_cigar_op(cigar[k]))&2)
343 l += bam_cigar_oplen(cigar[k]);
344 return l;
347 int32_t bam_endpos(const bam1_t *b)
349 if (!(b->core.flag & BAM_FUNMAP) && b->core.n_cigar > 0)
350 return b->core.pos + bam_cigar2rlen(b->core.n_cigar, bam_get_cigar(b));
351 else
352 return b->core.pos + 1;
355 static inline int aux_type2size(uint8_t type)
357 switch (type) {
358 case 'A': case 'c': case 'C':
359 return 1;
360 case 's': case 'S':
361 return 2;
362 case 'i': case 'I': case 'f':
363 return 4;
364 case 'd':
365 return 8;
366 case 'Z': case 'H': case 'B':
367 return type;
368 default:
369 return 0;
373 static void swap_data(const bam1_core_t *c, int l_data, uint8_t *data, int is_host)
375 uint32_t *cigar = (uint32_t*)(data + c->l_qname);
376 uint32_t i;
377 for (i = 0; i < c->n_cigar; ++i) ed_swap_4p(&cigar[i]);
380 int bam_read1(BGZF *fp, bam1_t *b)
382 bam1_core_t *c = &b->core;
383 int32_t block_len, ret, i;
384 uint32_t x[8];
385 if ((ret = bgzf_read(fp, &block_len, 4)) != 4) {
386 if (ret == 0) return -1; // normal end-of-file
387 else return -2; // truncated
389 if (bgzf_read(fp, x, 32) != 32) return -3;
390 if (fp->is_be) {
391 ed_swap_4p(&block_len);
392 for (i = 0; i < 8; ++i) ed_swap_4p(x + i);
394 c->tid = x[0]; c->pos = x[1];
395 c->bin = x[2]>>16; c->qual = x[2]>>8&0xff; c->l_qname = x[2]&0xff;
396 c->l_extranul = (c->l_qname%4 != 0)? (4 - c->l_qname%4) : 0;
397 if ((uint32_t) c->l_qname + c->l_extranul > 255) // l_qname would overflow
398 return -4;
399 c->flag = x[3]>>16; c->n_cigar = x[3]&0xffff;
400 c->l_qseq = x[4];
401 c->mtid = x[5]; c->mpos = x[6]; c->isize = x[7];
402 b->l_data = block_len - 32 + c->l_extranul;
403 if (b->l_data < 0 || c->l_qseq < 0 || c->l_qname < 1) return -4;
404 if (((uint64_t) c->n_cigar << 2) + c->l_qname + c->l_extranul
405 + (((uint64_t) c->l_qseq + 1) >> 1) + c->l_qseq > (uint64_t) b->l_data)
406 return -4;
407 if (b->m_data < b->l_data) {
408 uint8_t *new_data;
409 uint32_t new_m = b->l_data;
410 kroundup32(new_m);
411 new_data = (uint8_t*)realloc(b->data, new_m);
412 if (!new_data)
413 return -4;
414 b->data = new_data;
415 b->m_data = new_m;
417 if (bgzf_read(fp, b->data, c->l_qname) != c->l_qname) return -4;
418 for (i = 0; i < c->l_extranul; ++i) b->data[c->l_qname+i] = '\0';
419 c->l_qname += c->l_extranul;
420 if (b->l_data < c->l_qname ||
421 bgzf_read(fp, b->data + c->l_qname, b->l_data - c->l_qname) != b->l_data - c->l_qname)
422 return -4;
423 if (fp->is_be) swap_data(c, b->l_data, b->data, 0);
424 return 4 + block_len;
427 int bam_write1(BGZF *fp, const bam1_t *b)
429 const bam1_core_t *c = &b->core;
430 uint32_t x[8], block_len = b->l_data - c->l_extranul + 32, y;
431 int i, ok;
432 if (c->n_cigar >= 65536) {
433 hts_log_error("Too many CIGAR operations (%d >= 64K for QNAME \"%s\")", c->n_cigar, bam_get_qname(b));
434 errno = EOVERFLOW;
435 return -1;
437 x[0] = c->tid;
438 x[1] = c->pos;
439 x[2] = (uint32_t)c->bin<<16 | c->qual<<8 | (c->l_qname - c->l_extranul);
440 x[3] = (uint32_t)c->flag<<16 | c->n_cigar;
441 x[4] = c->l_qseq;
442 x[5] = c->mtid;
443 x[6] = c->mpos;
444 x[7] = c->isize;
445 ok = (bgzf_flush_try(fp, 4 + block_len) >= 0);
446 if (fp->is_be) {
447 for (i = 0; i < 8; ++i) ed_swap_4p(x + i);
448 y = block_len;
449 if (ok) ok = (bgzf_write(fp, ed_swap_4p(&y), 4) >= 0);
450 swap_data(c, b->l_data, b->data, 1);
451 } else {
452 if (ok) ok = (bgzf_write(fp, &block_len, 4) >= 0);
454 if (ok) ok = (bgzf_write(fp, x, 32) >= 0);
455 if (ok) ok = (bgzf_write(fp, b->data, c->l_qname - c->l_extranul) >= 0);
456 if (ok) ok = (bgzf_write(fp, b->data + c->l_qname, b->l_data - c->l_qname) >= 0);
457 if (fp->is_be) swap_data(c, b->l_data, b->data, 0);
458 return ok? 4 + block_len : -1;
461 /********************
462 *** BAM indexing ***
463 ********************/
465 static hts_idx_t *bam_index(BGZF *fp, int min_shift)
467 int n_lvls, i, fmt, ret;
468 bam1_t *b;
469 hts_idx_t *idx;
470 bam_hdr_t *h;
471 h = bam_hdr_read(fp);
472 if (h == NULL) return NULL;
473 if (min_shift > 0) {
474 int64_t max_len = 0, s;
475 for (i = 0; i < h->n_targets; ++i)
476 if (max_len < h->target_len[i]) max_len = h->target_len[i];
477 max_len += 256;
478 for (n_lvls = 0, s = 1<<min_shift; max_len > s; ++n_lvls, s <<= 3);
479 fmt = HTS_FMT_CSI;
480 } else min_shift = 14, n_lvls = 5, fmt = HTS_FMT_BAI;
481 idx = hts_idx_init(h->n_targets, fmt, bgzf_tell(fp), min_shift, n_lvls);
482 bam_hdr_destroy(h);
483 b = bam_init1();
484 while ((ret = bam_read1(fp, b)) >= 0) {
485 ret = hts_idx_push(idx, b->core.tid, b->core.pos, bam_endpos(b), bgzf_tell(fp), !(b->core.flag&BAM_FUNMAP));
486 if (ret < 0) goto err; // unsorted
488 if (ret < -1) goto err; // corrupted BAM file
490 hts_idx_finish(idx, bgzf_tell(fp));
491 bam_destroy1(b);
492 return idx;
494 err:
495 bam_destroy1(b);
496 hts_idx_destroy(idx);
497 return NULL;
500 int sam_index_build3(const char *fn, const char *fnidx, int min_shift, int nthreads)
502 hts_idx_t *idx;
503 htsFile *fp;
504 int ret = 0;
506 if ((fp = hts_open(fn, "r")) == 0) return -2;
507 if (nthreads)
508 hts_set_threads(fp, nthreads);
510 switch (fp->format.format) {
511 case cram:
512 ret = cram_index_build(fp->fp.cram, fn, fnidx);
513 break;
515 case bam:
516 idx = bam_index(fp->fp.bgzf, min_shift);
517 if (idx) {
518 ret = hts_idx_save_as(idx, fn, fnidx, (min_shift > 0)? HTS_FMT_CSI : HTS_FMT_BAI);
519 if (ret < 0) ret = -4;
520 hts_idx_destroy(idx);
522 else ret = -1;
523 break;
525 default:
526 ret = -3;
527 break;
529 hts_close(fp);
531 return ret;
534 int sam_index_build2(const char *fn, const char *fnidx, int min_shift)
536 return sam_index_build3(fn, fnidx, min_shift, 0);
539 int sam_index_build(const char *fn, int min_shift)
541 return sam_index_build3(fn, NULL, min_shift, 0);
544 // Provide bam_index_build() symbol for binary compability with earlier HTSlib
545 #undef bam_index_build
546 int bam_index_build(const char *fn, int min_shift)
548 return sam_index_build2(fn, NULL, min_shift);
551 static int bam_readrec(BGZF *fp, void *ignored, void *bv, int *tid, int *beg, int *end)
553 bam1_t *b = bv;
554 int ret;
555 if ((ret = bam_read1(fp, b)) >= 0) {
556 *tid = b->core.tid;
557 *beg = b->core.pos;
558 *end = bam_endpos(b);
560 return ret;
563 // This is used only with read_rest=1 iterators, so need not set tid/beg/end.
564 static int cram_readrec(BGZF *ignored, void *fpv, void *bv, int *tid, int *beg, int *end)
566 htsFile *fp = fpv;
567 bam1_t *b = bv;
568 return cram_get_bam_seq(fp->fp.cram, &b);
571 // This is used only with read_rest=1 iterators, so need not set tid/beg/end.
572 static int sam_bam_cram_readrec(BGZF *bgzfp, void *fpv, void *bv, int *tid, int *beg, int *end)
574 htsFile *fp = fpv;
575 bam1_t *b = bv;
576 switch (fp->format.format) {
577 case bam: return bam_read1(bgzfp, b);
578 case cram: return cram_get_bam_seq(fp->fp.cram, &b);
579 default:
580 // TODO Need headers available to implement this for SAM files
581 hts_log_error("Not implemented for SAM files");
582 abort();
586 hts_idx_t *sam_index_load2(htsFile *fp, const char *fn, const char *fnidx)
588 switch (fp->format.format) {
589 case bam:
590 return fnidx? hts_idx_load2(fn, fnidx) : hts_idx_load(fn, HTS_FMT_BAI);
592 case cram: {
593 if (cram_index_load(fp->fp.cram, fn, fnidx) < 0) return NULL;
594 // Cons up a fake "index" just pointing at the associated cram_fd:
595 hts_cram_idx_t *idx = malloc(sizeof (hts_cram_idx_t));
596 if (idx == NULL) return NULL;
597 idx->fmt = HTS_FMT_CRAI;
598 idx->cram = fp->fp.cram;
599 return (hts_idx_t *) idx;
602 default:
603 return NULL; // TODO Would use tbx_index_load if it returned hts_idx_t
607 hts_idx_t *sam_index_load(htsFile *fp, const char *fn)
609 return sam_index_load2(fp, fn, NULL);
612 static hts_itr_t *cram_itr_query(const hts_idx_t *idx, int tid, int beg, int end, hts_readrec_func *readrec)
614 const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx;
615 hts_itr_t *iter = (hts_itr_t *) calloc(1, sizeof(hts_itr_t));
616 if (iter == NULL) return NULL;
618 // Cons up a dummy iterator for which hts_itr_next() will simply invoke
619 // the readrec function:
620 iter->is_cram = 1;
621 iter->read_rest = 1;
622 iter->off = NULL;
623 iter->bins.a = NULL;
624 iter->readrec = readrec;
626 if (tid >= 0 || tid == HTS_IDX_NOCOOR) {
627 cram_range r = { tid == HTS_IDX_NOCOOR ? -1 : tid, beg+1, end };
628 int ret = cram_set_option(cidx->cram, CRAM_OPT_RANGE, &r);
630 iter->curr_off = 0;
631 // The following fields are not required by hts_itr_next(), but are
632 // filled in in case user code wants to look at them.
633 iter->tid = tid;
634 iter->beg = beg;
635 iter->end = end;
637 switch (ret) {
638 case 0:
639 break;
641 case -2:
642 // No data vs this ref, so mark iterator as completed.
643 // Same as HTS_IDX_NONE.
644 iter->finished = 1;
645 break;
647 default:
648 free(iter);
649 return NULL;
652 else switch (tid) {
653 case HTS_IDX_REST:
654 iter->curr_off = 0;
655 break;
656 case HTS_IDX_NONE:
657 iter->curr_off = 0;
658 iter->finished = 1;
659 break;
660 default:
661 hts_log_error("Query with tid=%d not implemented for CRAM files", tid);
662 abort();
663 break;
666 return iter;
669 hts_itr_t *sam_itr_queryi(const hts_idx_t *idx, int tid, int beg, int end)
671 const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx;
672 if (idx == NULL)
673 return hts_itr_query(NULL, tid, beg, end, sam_bam_cram_readrec);
674 else if (cidx->fmt == HTS_FMT_CRAI)
675 return cram_itr_query(idx, tid, beg, end, cram_readrec);
676 else
677 return hts_itr_query(idx, tid, beg, end, bam_readrec);
680 static int cram_name2id(void *fdv, const char *ref)
682 cram_fd *fd = (cram_fd *) fdv;
683 return sam_hdr_name2ref(fd->header, ref);
686 hts_itr_t *sam_itr_querys(const hts_idx_t *idx, bam_hdr_t *hdr, const char *region)
688 const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx;
689 if (cidx->fmt == HTS_FMT_CRAI)
690 return hts_itr_querys(idx, region, cram_name2id, cidx->cram, cram_itr_query, cram_readrec);
691 else
692 return hts_itr_querys(idx, region, (hts_name2id_f)(bam_name2id), hdr, hts_itr_query, bam_readrec);
695 /**********************
696 *** SAM header I/O ***
697 **********************/
699 #include "htslib/kseq.h"
700 #include "htslib/kstring.h"
702 bam_hdr_t *sam_hdr_parse(int l_text, const char *text)
704 const char *q, *r, *p;
705 khash_t(s2i) *d;
706 d = kh_init(s2i);
707 for (p = text; *p; ++p) {
708 if (strncmp(p, "@SQ\t", 4) == 0) {
709 char *sn = 0;
710 int ln = -1;
711 for (q = p + 4;; ++q) {
712 if (strncmp(q, "SN:", 3) == 0) {
713 q += 3;
714 for (r = q; *r != '\t' && *r != '\n' && *r != '\0'; ++r);
715 sn = (char*)calloc(r - q + 1, 1);
716 strncpy(sn, q, r - q);
717 q = r;
718 } else if (strncmp(q, "LN:", 3) == 0)
719 ln = strtol(q + 3, (char**)&q, 10);
720 while (*q != '\t' && *q != '\n' && *q != '\0') ++q;
721 if (*q == '\0' || *q == '\n') break;
723 p = q;
724 if (sn && ln >= 0) {
725 khint_t k;
726 int absent;
727 k = kh_put(s2i, d, sn, &absent);
728 if (!absent) {
729 hts_log_warning("Duplicated sequence '%s'", sn);
730 free(sn);
731 } else kh_val(d, k) = (int64_t)(kh_size(d) - 1)<<32 | ln;
734 while (*p != '\0' && *p != '\n') ++p;
736 return hdr_from_dict(d);
739 // Minimal sanitisation of a header to ensure.
740 // - null terminated string.
741 // - all lines start with @ (also implies no blank lines).
743 // Much more could be done, but currently is not, including:
744 // - checking header types are known (HD, SQ, etc).
745 // - syntax (eg checking tab separated fields).
746 // - validating n_targets matches @SQ records.
747 // - validating target lengths against @SQ records.
748 static bam_hdr_t *sam_hdr_sanitise(bam_hdr_t *h) {
749 if (!h)
750 return NULL;
752 // Special case for empty headers.
753 if (h->l_text == 0)
754 return h;
756 uint32_t i, lnum = 0;
757 char *cp = h->text, last = '\n';
758 for (i = 0; i < h->l_text; i++) {
759 // NB: l_text excludes terminating nul. This finds early ones.
760 if (cp[i] == 0)
761 break;
763 // Error on \n[^@], including duplicate newlines
764 if (last == '\n') {
765 lnum++;
766 if (cp[i] != '@') {
767 hts_log_error("Malformed SAM header at line %u", lnum);
768 bam_hdr_destroy(h);
769 return NULL;
773 last = cp[i];
776 if (i < h->l_text) { // Early nul found. Complain if not just padding.
777 uint32_t j = i;
778 while (j < h->l_text && cp[j] == '\0') j++;
779 if (j < h->l_text)
780 hts_log_warning("Unexpected NUL character in header. Possibly truncated");
783 // Add trailing newline and/or trailing nul if required.
784 if (last != '\n') {
785 hts_log_warning("Missing trailing newline on SAM header. Possibly truncated");
787 if (h->l_text == UINT32_MAX) {
788 hts_log_error("No room for extra newline");
789 bam_hdr_destroy(h);
790 return NULL;
793 if (i >= h->l_text - 1) {
794 cp = realloc(h->text, (size_t) h->l_text+2);
795 if (!cp) {
796 bam_hdr_destroy(h);
797 return NULL;
799 h->text = cp;
801 cp[i++] = '\n';
803 // l_text may be larger already due to multiple nul padding
804 if (h->l_text < i)
805 h->l_text = i;
806 cp[h->l_text] = '\0';
809 return h;
812 bam_hdr_t *sam_hdr_read(htsFile *fp)
814 switch (fp->format.format) {
815 case bam:
816 return sam_hdr_sanitise(bam_hdr_read(fp->fp.bgzf));
818 case cram:
819 return sam_hdr_sanitise(cram_header_to_bam(fp->fp.cram->header));
821 case sam: {
822 kstring_t str = { 0, 0, NULL };
823 bam_hdr_t *h = NULL;
824 int ret, has_SQ = 0;
825 while ((ret = hts_getline(fp, KS_SEP_LINE, &fp->line)) >= 0) {
826 if (fp->line.s[0] != '@') break;
827 if (fp->line.l > 3 && strncmp(fp->line.s,"@SQ",3) == 0) has_SQ = 1;
828 kputsn(fp->line.s, fp->line.l, &str);
829 kputc('\n', &str);
831 if (ret < -1) goto error;
832 if (! has_SQ && fp->fn_aux) {
833 kstring_t line = { 0, 0, NULL };
834 hFILE *f = hopen(fp->fn_aux, "r");
835 if (f == NULL) goto error;
836 while (line.l = 0, kgetline(&line, (kgets_func *) hgets, f) >= 0) {
837 char *tab = strchr(line.s, '\t');
838 if (tab == NULL) continue;
839 kputs("@SQ\tSN:", &str);
840 kputsn(line.s, tab - line.s, &str);
841 kputs("\tLN:", &str);
842 kputl(atol(tab), &str);
843 kputc('\n', &str);
845 free(line.s);
846 if (hclose(f) != 0) {
847 hts_log_warning("Failed to close %s", fp->fn_aux);
850 if (str.l == 0) kputsn("", 0, &str);
851 h = sam_hdr_parse(str.l, str.s);
852 h->l_text = str.l; h->text = str.s;
853 return sam_hdr_sanitise(h);
855 error:
856 bam_hdr_destroy(h);
857 free(str.s);
858 return NULL;
861 default:
862 abort();
866 int sam_hdr_write(htsFile *fp, const bam_hdr_t *h)
868 if (!h) {
869 errno = EINVAL;
870 return -1;
873 switch (fp->format.format) {
874 case binary_format:
875 fp->format.category = sequence_data;
876 fp->format.format = bam;
877 /* fall-through */
878 case bam:
879 if (bam_hdr_write(fp->fp.bgzf, h) < 0) return -1;
880 break;
882 case cram: {
883 cram_fd *fd = fp->fp.cram;
884 SAM_hdr *hdr = bam_header_to_cram((bam_hdr_t *)h);
885 if (! hdr) return -1;
886 if (cram_set_header(fd, hdr) < 0) return -1;
887 if (fp->fn_aux)
888 cram_load_reference(fd, fp->fn_aux);
889 if (cram_write_SAM_hdr(fd, fd->header) < 0) return -1;
891 break;
893 case text_format:
894 fp->format.category = sequence_data;
895 fp->format.format = sam;
896 /* fall-through */
897 case sam: {
898 char *p;
899 hputs(h->text, fp->fp.hfile);
900 p = strstr(h->text, "@SQ\t"); // FIXME: we need a loop to make sure "@SQ\t" does not match something unwanted!!!
901 if (p == 0) {
902 int i;
903 for (i = 0; i < h->n_targets; ++i) {
904 fp->line.l = 0;
905 kputsn("@SQ\tSN:", 7, &fp->line); kputs(h->target_name[i], &fp->line);
906 kputsn("\tLN:", 4, &fp->line); kputw(h->target_len[i], &fp->line); kputc('\n', &fp->line);
907 if ( hwrite(fp->fp.hfile, fp->line.s, fp->line.l) != fp->line.l ) return -1;
910 if ( hflush(fp->fp.hfile) != 0 ) return -1;
912 break;
914 default:
915 abort();
917 return 0;
920 /**********************
921 *** SAM record I/O ***
922 **********************/
924 int sam_parse1(kstring_t *s, bam_hdr_t *h, bam1_t *b)
926 #define _read_token(_p) (_p); for (; *(_p) && *(_p) != '\t'; ++(_p)); if (*(_p) != '\t') goto err_ret; *(_p)++ = 0
927 #define _read_token_aux(_p) (_p); for (; *(_p) && *(_p) != '\t'; ++(_p)); *(_p)++ = 0 // this is different in that it does not test *(_p)=='\t'
928 #define _get_mem(type_t, _x, _s, _l) ks_resize((_s), (_s)->l + (_l)); *(_x) = (type_t*)((_s)->s + (_s)->l); (_s)->l += (_l)
929 #define _parse_err(cond, msg) do { if (cond) { hts_log_error(msg); goto err_ret; } } while (0)
930 #define _parse_err_param(cond, msg, param) do { if (cond) { hts_log_error(msg, param); goto err_ret; } } while (0)
931 #define _parse_warn(cond, msg) do { if (cond) { hts_log_warning(msg); } } while (0)
933 uint8_t *t;
934 char *p = s->s, *q;
935 int i;
936 kstring_t str;
937 bam1_core_t *c = &b->core;
939 str.l = b->l_data = 0;
940 str.s = (char*)b->data; str.m = b->m_data;
941 memset(c, 0, 32);
942 if (h->cigar_tab == 0) {
943 h->cigar_tab = (int8_t*) malloc(128);
944 for (i = 0; i < 128; ++i)
945 h->cigar_tab[i] = -1;
946 for (i = 0; BAM_CIGAR_STR[i]; ++i)
947 h->cigar_tab[(int)BAM_CIGAR_STR[i]] = i;
949 // qname
950 q = _read_token(p);
951 _parse_warn(p - q <= 1, "empty query name");
952 _parse_err(p - q > 252, "query name too long");
953 kputsn_(q, p - q, &str);
954 for (c->l_extranul = 0; str.l % 4 != 0; c->l_extranul++)
955 kputc_('\0', &str);
956 c->l_qname = p - q + c->l_extranul;
957 // flag
958 c->flag = strtol(p, &p, 0);
959 if (*p++ != '\t') goto err_ret; // malformated flag
960 // chr
961 q = _read_token(p);
962 if (strcmp(q, "*")) {
963 _parse_err(h->n_targets == 0, "missing SAM header");
964 c->tid = bam_name2id(h, q);
965 _parse_warn(c->tid < 0, "urecognized reference name; treated as unmapped");
966 } else c->tid = -1;
967 // pos
968 c->pos = strtol(p, &p, 10) - 1;
969 if (*p++ != '\t') goto err_ret;
970 if (c->pos < 0 && c->tid >= 0) {
971 _parse_warn(1, "mapped query cannot have zero coordinate; treated as unmapped");
972 c->tid = -1;
974 if (c->tid < 0) c->flag |= BAM_FUNMAP;
975 // mapq
976 c->qual = strtol(p, &p, 10);
977 if (*p++ != '\t') goto err_ret;
978 // cigar
979 if (*p != '*') {
980 uint32_t *cigar;
981 size_t n_cigar = 0;
982 for (q = p; *p && *p != '\t'; ++p)
983 if (!isdigit_c(*p)) ++n_cigar;
984 if (*p++ != '\t') goto err_ret;
985 _parse_err(n_cigar == 0, "no CIGAR operations");
986 _parse_err(n_cigar >= 2147483647, "too many CIGAR operations");
987 c->n_cigar = n_cigar;
988 _get_mem(uint32_t, &cigar, &str, c->n_cigar * sizeof(uint32_t));
989 for (i = 0; i < c->n_cigar; ++i, ++q) {
990 int op;
991 cigar[i] = strtol(q, &q, 10)<<BAM_CIGAR_SHIFT;
992 op = (uint8_t)*q >= 128? -1 : h->cigar_tab[(int)*q];
993 _parse_err(op < 0, "unrecognized CIGAR operator");
994 cigar[i] |= op;
996 // can't use bam_endpos() directly as some fields not yet set up
997 i = (!(c->flag&BAM_FUNMAP))? bam_cigar2rlen(c->n_cigar, cigar) : 1;
998 } else {
999 _parse_warn(!(c->flag&BAM_FUNMAP), "mapped query must have a CIGAR; treated as unmapped");
1000 c->flag |= BAM_FUNMAP;
1001 q = _read_token(p);
1002 i = 1;
1004 c->bin = hts_reg2bin(c->pos, c->pos + i, 14, 5);
1005 // mate chr
1006 q = _read_token(p);
1007 if (strcmp(q, "=") == 0) {
1008 c->mtid = c->tid;
1009 } else if (strcmp(q, "*") == 0) {
1010 c->mtid = -1;
1011 } else {
1012 c->mtid = bam_name2id(h, q);
1013 _parse_warn(c->mtid < 0, "urecognized mate reference name; treated as unmapped");
1015 // mpos
1016 c->mpos = strtol(p, &p, 10) - 1;
1017 if (*p++ != '\t') goto err_ret;
1018 if (c->mpos < 0 && c->mtid >= 0) {
1019 _parse_warn(1, "mapped mate cannot have zero coordinate; treated as unmapped");
1020 c->mtid = -1;
1022 // tlen
1023 c->isize = strtol(p, &p, 10);
1024 if (*p++ != '\t') goto err_ret;
1025 // seq
1026 q = _read_token(p);
1027 if (strcmp(q, "*")) {
1028 c->l_qseq = p - q - 1;
1029 i = bam_cigar2qlen(c->n_cigar, (uint32_t*)(str.s + c->l_qname));
1030 _parse_err(c->n_cigar && i != c->l_qseq, "CIGAR and query sequence are of different length");
1031 i = (c->l_qseq + 1) >> 1;
1032 _get_mem(uint8_t, &t, &str, i);
1033 memset(t, 0, i);
1034 for (i = 0; i < c->l_qseq; ++i)
1035 t[i>>1] |= seq_nt16_table[(int)q[i]] << ((~i&1)<<2);
1036 } else c->l_qseq = 0;
1037 // qual
1038 q = _read_token_aux(p);
1039 _get_mem(uint8_t, &t, &str, c->l_qseq);
1040 if (strcmp(q, "*")) {
1041 _parse_err(p - q - 1 != c->l_qseq, "SEQ and QUAL are of different length");
1042 for (i = 0; i < c->l_qseq; ++i) t[i] = q[i] - 33;
1043 } else memset(t, 0xff, c->l_qseq);
1044 // aux
1045 while (p < s->s + s->l) {
1046 uint8_t type;
1047 q = _read_token_aux(p); // FIXME: can be accelerated for long 'B' arrays
1048 _parse_err(p - q - 1 < 5, "incomplete aux field");
1049 kputsn_(q, 2, &str);
1050 q += 3; type = *q++; ++q; // q points to value
1051 if (type != 'Z' && type != 'H') // the only zero length acceptable fields
1052 _parse_err(p - q - 1 < 1, "incomplete aux field");
1054 // Ensure str has enough space for a double + type allocated.
1055 // This is so we can stuff bigger integers and floats directly into
1056 // the kstring. Sorry.
1057 _parse_err(ks_resize(&str, str.l + 16), "out of memory");
1059 if (type == 'A' || type == 'a' || type == 'c' || type == 'C') {
1060 kputc_('A', &str);
1061 kputc_(*q, &str);
1062 } else if (type == 'i' || type == 'I') {
1063 if (*q == '-') {
1064 long x = strtol(q, &q, 10);
1065 if (x >= INT8_MIN) {
1066 kputc_('c', &str); kputc_(x, &str);
1067 } else if (x >= INT16_MIN) {
1068 str.s[str.l++] = 's';
1069 i16_to_le(x, (uint8_t *) str.s + str.l);
1070 str.l += 2;
1071 } else {
1072 str.s[str.l++] = 'i';
1073 i32_to_le(x, (uint8_t *) str.s + str.l);
1074 str.l += 4;
1076 } else {
1077 unsigned long x = strtoul(q, &q, 10);
1078 if (x <= UINT8_MAX) {
1079 kputc_('C', &str); kputc_(x, &str);
1080 } else if (x <= UINT16_MAX) {
1081 str.s[str.l++] = 'S';
1082 u16_to_le(x, (uint8_t *) str.s + str.l);
1083 str.l += 2;
1084 } else {
1085 str.s[str.l++] = 'I';
1086 u32_to_le(x, (uint8_t *) str.s + str.l);
1087 str.l += 4;
1090 } else if (type == 'f') {
1091 str.s[str.l++] = 'f';
1092 float_to_le(strtod(q, &q), (uint8_t *) str.s + str.l);
1093 str.l += sizeof(float);
1094 } else if (type == 'd') {
1095 str.s[str.l++] = 'd';
1096 double_to_le(strtod(q, &q), (uint8_t *) str.s + str.l);
1097 str.l += sizeof(double);
1098 } else if (type == 'Z' || type == 'H') {
1099 _parse_err(type == 'H' && !((p-q)&1),
1100 "hex field does not have an even number of digits");
1101 kputc_(type, &str);kputsn_(q, p - q, &str); // note that this include the trailing NULL
1102 } else if (type == 'B') {
1103 int32_t n, size;
1104 size_t bytes;
1105 char *r;
1106 _parse_err(p - q - 1 < 3, "incomplete B-typed aux field");
1107 type = *q++; // q points to the first ',' following the typing byte
1109 size = aux_type2size(type);
1110 _parse_err_param(size <= 0 || size > 4,
1111 "unrecognized type B:%c", type);
1112 _parse_err(*q && *q != ',', "B aux field type not followed by ','");
1114 for (r = q, n = 0; *r; ++r)
1115 if (*r == ',') ++n;
1117 // Ensure space for type + values
1118 bytes = (size_t) n * (size_t) size;
1119 _parse_err(bytes / size != n
1120 || ks_resize(&str, str.l + bytes + 2 + sizeof(uint32_t)),
1121 "out of memory");
1122 str.s[str.l++] = 'B';
1123 str.s[str.l++] = type;
1124 i32_to_le(n, (uint8_t *) str.s + str.l);
1125 str.l += sizeof(uint32_t);
1127 // This ensures that q always ends up at the next comma after
1128 // reading a number even if it's followed by junk. It
1129 // prevents the possibility of trying to read more than n items.
1130 #define _skip_to_comma(q, p) do { while ((q) < (p) && *(q) != ',') (q)++; } while (0)
1132 if (type == 'c') while (q + 1 < p) { int8_t x = strtol(q + 1, &q, 0); kputc_(x, &str); }
1133 else if (type == 'C') while (q + 1 < p) { uint8_t x = strtoul(q + 1, &q, 0); kputc_(x, &str); }
1134 else if (type == 's') while (q + 1 < p) { i16_to_le(strtol(q + 1, &q, 0), (uint8_t *) str.s + str.l); str.l += 2; _skip_to_comma(q, p); }
1135 else if (type == 'S') while (q + 1 < p) { u16_to_le(strtoul(q + 1, &q, 0), (uint8_t *) str.s + str.l); str.l += 2; _skip_to_comma(q, p); }
1136 else if (type == 'i') while (q + 1 < p) { i32_to_le(strtol(q + 1, &q, 0), (uint8_t *) str.s + str.l); str.l += 4; _skip_to_comma(q, p); }
1137 else if (type == 'I') while (q + 1 < p) { u32_to_le(strtoul(q + 1, &q, 0), (uint8_t *) str.s + str.l); str.l += 4; _skip_to_comma(q, p); }
1138 else if (type == 'f') while (q + 1 < p) { float_to_le(strtod(q + 1, &q), (uint8_t *) str.s + str.l); str.l += 4; _skip_to_comma(q, p); }
1139 else _parse_err_param(1, "unrecognized type B:%c", type);
1141 #undef _skip_to_comma
1143 } else _parse_err_param(1, "unrecognized type %c", type);
1145 b->data = (uint8_t*)str.s; b->l_data = str.l; b->m_data = str.m;
1146 return 0;
1148 #undef _parse_warn
1149 #undef _parse_err
1150 #undef _parse_err_param
1151 #undef _get_mem
1152 #undef _read_token_aux
1153 #undef _read_token
1154 err_ret:
1155 b->data = (uint8_t*)str.s; b->l_data = str.l; b->m_data = str.m;
1156 return -2;
1159 int sam_read1(htsFile *fp, bam_hdr_t *h, bam1_t *b)
1161 switch (fp->format.format) {
1162 case bam: {
1163 int r = bam_read1(fp->fp.bgzf, b);
1164 if (r >= 0) {
1165 if (b->core.tid >= h->n_targets || b->core.tid < -1 ||
1166 b->core.mtid >= h->n_targets || b->core.mtid < -1)
1167 return -3;
1169 return r;
1172 case cram: {
1173 int ret = cram_get_bam_seq(fp->fp.cram, &b);
1174 return ret >= 0
1175 ? ret
1176 : (cram_eof(fp->fp.cram) ? -1 : -2);
1179 case sam: {
1180 int ret;
1181 err_recover:
1182 if (fp->line.l == 0) {
1183 ret = hts_getline(fp, KS_SEP_LINE, &fp->line);
1184 if (ret < 0) return ret;
1186 ret = sam_parse1(&fp->line, h, b);
1187 fp->line.l = 0;
1188 if (ret < 0) {
1189 hts_log_warning("Parse error at line %lld", (long long)fp->lineno);
1190 if (h->ignore_sam_err) goto err_recover;
1192 return ret;
1195 default:
1196 abort();
1200 int sam_format1(const bam_hdr_t *h, const bam1_t *b, kstring_t *str)
1202 int i;
1203 uint8_t *s, *end;
1204 const bam1_core_t *c = &b->core;
1206 str->l = 0;
1207 kputsn(bam_get_qname(b), c->l_qname-1-c->l_extranul, str); kputc('\t', str); // query name
1208 kputw(c->flag, str); kputc('\t', str); // flag
1209 if (c->tid >= 0) { // chr
1210 kputs(h->target_name[c->tid] , str);
1211 kputc('\t', str);
1212 } else kputsn("*\t", 2, str);
1213 kputw(c->pos + 1, str); kputc('\t', str); // pos
1214 kputw(c->qual, str); kputc('\t', str); // qual
1215 if (c->n_cigar) { // cigar
1216 uint32_t *cigar = bam_get_cigar(b);
1217 for (i = 0; i < c->n_cigar; ++i) {
1218 kputw(bam_cigar_oplen(cigar[i]), str);
1219 kputc(bam_cigar_opchr(cigar[i]), str);
1221 } else kputc('*', str);
1222 kputc('\t', str);
1223 if (c->mtid < 0) kputsn("*\t", 2, str); // mate chr
1224 else if (c->mtid == c->tid) kputsn("=\t", 2, str);
1225 else {
1226 kputs(h->target_name[c->mtid], str);
1227 kputc('\t', str);
1229 kputw(c->mpos + 1, str); kputc('\t', str); // mate pos
1230 kputw(c->isize, str); kputc('\t', str); // template len
1231 if (c->l_qseq) { // seq and qual
1232 uint8_t *s = bam_get_seq(b);
1233 for (i = 0; i < c->l_qseq; ++i) kputc("=ACMGRSVTWYHKDBN"[bam_seqi(s, i)], str);
1234 kputc('\t', str);
1235 s = bam_get_qual(b);
1236 if (s[0] == 0xff) kputc('*', str);
1237 else for (i = 0; i < c->l_qseq; ++i) kputc(s[i] + 33, str);
1238 } else kputsn("*\t*", 3, str);
1240 s = bam_get_aux(b); // aux
1241 end = b->data + b->l_data;
1242 while (end - s >= 4) {
1243 uint8_t type, key[2];
1244 key[0] = s[0]; key[1] = s[1];
1245 s += 2; type = *s++;
1246 kputc('\t', str); kputsn((char*)key, 2, str); kputc(':', str);
1247 if (type == 'A') {
1248 kputsn("A:", 2, str);
1249 kputc(*s, str);
1250 ++s;
1251 } else if (type == 'C') {
1252 kputsn("i:", 2, str);
1253 kputw(*s, str);
1254 ++s;
1255 } else if (type == 'c') {
1256 kputsn("i:", 2, str);
1257 kputw(*(int8_t*)s, str);
1258 ++s;
1259 } else if (type == 'S') {
1260 if (end - s >= 2) {
1261 kputsn("i:", 2, str);
1262 kputuw(le_to_u16(s), str);
1263 s += 2;
1264 } else goto bad_aux;
1265 } else if (type == 's') {
1266 if (end - s >= 2) {
1267 kputsn("i:", 2, str);
1268 kputw(le_to_i16(s), str);
1269 s += 2;
1270 } else goto bad_aux;
1271 } else if (type == 'I') {
1272 if (end - s >= 4) {
1273 kputsn("i:", 2, str);
1274 kputuw(le_to_u32(s), str);
1275 s += 4;
1276 } else goto bad_aux;
1277 } else if (type == 'i') {
1278 if (end - s >= 4) {
1279 kputsn("i:", 2, str);
1280 kputw(le_to_i32(s), str);
1281 s += 4;
1282 } else goto bad_aux;
1283 } else if (type == 'f') {
1284 if (end - s >= 4) {
1285 ksprintf(str, "f:%g", le_to_float(s));
1286 s += 4;
1287 } else goto bad_aux;
1289 } else if (type == 'd') {
1290 if (end - s >= 8) {
1291 ksprintf(str, "d:%g", le_to_double(s));
1292 s += 8;
1293 } else goto bad_aux;
1294 } else if (type == 'Z' || type == 'H') {
1295 kputc(type, str); kputc(':', str);
1296 while (s < end && *s) kputc(*s++, str);
1297 if (s >= end)
1298 goto bad_aux;
1299 ++s;
1300 } else if (type == 'B') {
1301 uint8_t sub_type = *(s++);
1302 int sub_type_size = aux_type2size(sub_type);
1303 uint32_t n;
1304 if (sub_type_size == 0 || end - s < 4)
1305 goto bad_aux;
1306 n = le_to_u32(s);
1307 s += 4; // now points to the start of the array
1308 if ((end - s) / sub_type_size < n)
1309 goto bad_aux;
1310 kputsn("B:", 2, str); kputc(sub_type, str); // write the typing
1311 for (i = 0; i < n; ++i) { // FIXME: for better performance, put the loop after "if"
1312 kputc(',', str);
1313 if ('c' == sub_type) { kputw(*(int8_t*)s, str); ++s; }
1314 else if ('C' == sub_type) { kputw(*(uint8_t*)s, str); ++s; }
1315 else if ('s' == sub_type) { kputw(le_to_i16(s), str); s += 2; }
1316 else if ('S' == sub_type) { kputw(le_to_u16(s), str); s += 2; }
1317 else if ('i' == sub_type) { kputw(le_to_i32(s), str); s += 4; }
1318 else if ('I' == sub_type) { kputuw(le_to_u32(s), str); s += 4; }
1319 else if ('f' == sub_type) { kputd(le_to_float(s), str); s += 4; }
1320 else goto bad_aux; // Unknown sub-type
1322 } else { // Unknown type
1323 goto bad_aux;
1326 return str->l;
1328 bad_aux:
1329 hts_log_error("Corrupted aux data for read %.*s",
1330 b->core.l_qname, bam_get_qname(b));
1331 errno = EINVAL;
1332 return -1;
1335 int sam_write1(htsFile *fp, const bam_hdr_t *h, const bam1_t *b)
1337 switch (fp->format.format) {
1338 case binary_format:
1339 fp->format.category = sequence_data;
1340 fp->format.format = bam;
1341 /* fall-through */
1342 case bam:
1343 return bam_write1(fp->fp.bgzf, b);
1345 case cram:
1346 return cram_put_bam_seq(fp->fp.cram, (bam1_t *)b);
1348 case text_format:
1349 fp->format.category = sequence_data;
1350 fp->format.format = sam;
1351 /* fall-through */
1352 case sam:
1353 if (sam_format1(h, b, &fp->line) < 0) return -1;
1354 kputc('\n', &fp->line);
1355 if ( hwrite(fp->fp.hfile, fp->line.s, fp->line.l) != fp->line.l ) return -1;
1356 return fp->line.l;
1358 default:
1359 abort();
1363 /************************
1364 *** Auxiliary fields ***
1365 ************************/
1366 #ifndef HTS_LITTLE_ENDIAN
1367 static int aux_to_le(char type, uint8_t *out, const uint8_t *in, size_t len) {
1368 int tsz = aux_type2size(type);
1370 if (tsz >= 2 && tsz <= 8 && (len & (tsz - 1)) != 0) return -1;
1372 switch (tsz) {
1373 case 'H': case 'Z': case 1: // Trivial
1374 memcpy(out, in, len);
1375 break;
1377 #define aux_val_to_le(type_t, store_le) do { \
1378 type_t v; \
1379 size_t i; \
1380 for (i = 0; i < len; i += sizeof(type_t), out += sizeof(type_t)) { \
1381 memcpy(&v, in + i, sizeof(type_t)); \
1382 store_le(v, out); \
1384 } while (0)
1386 case 2: aux_val_to_le(uint16_t, u16_to_le); break;
1387 case 4: aux_val_to_le(uint32_t, u32_to_le); break;
1388 case 8: aux_val_to_le(uint64_t, u64_to_le); break;
1390 #undef aux_val_to_le
1392 case 'B': { // Recurse!
1393 uint32_t n;
1394 if (len < 5) return -1;
1395 memcpy(&n, in + 1, 4);
1396 out[0] = in[0];
1397 u32_to_le(n, out + 1);
1398 return aux_to_le(in[0], out + 5, in + 5, len - 5);
1401 default: // Unknown type code
1402 return -1;
1407 return 0;
1409 #endif
1411 int bam_aux_append(bam1_t *b, const char tag[2], char type, int len, const uint8_t *data)
1413 uint32_t new_len;
1415 assert(b->l_data >= 0);
1416 new_len = b->l_data + 3 + len;
1417 if (new_len > INT32_MAX || new_len < b->l_data) goto nomem;
1419 if (b->m_data < new_len) {
1420 uint32_t new_size = new_len;
1421 uint8_t *new_data;
1422 kroundup32(new_size);
1423 new_data = realloc(b->data, new_size);
1424 if (new_data == NULL) goto nomem;
1425 b->m_data = new_size;
1426 b->data = new_data;
1429 b->data[b->l_data] = tag[0];
1430 b->data[b->l_data + 1] = tag[1];
1431 b->data[b->l_data + 2] = type;
1433 #ifdef HTS_LITTLE_ENDIAN
1434 memcpy(b->data + b->l_data + 3, data, len);
1435 #else
1436 if (aux_to_le(type, b->data + b->l_data + 3, data, len) != 0) {
1437 errno = EINVAL;
1438 return -1;
1440 #endif
1442 b->l_data = new_len;
1444 return 0;
1446 nomem:
1447 errno = ENOMEM;
1448 return -1;
1451 static inline uint8_t *skip_aux(uint8_t *s, uint8_t *end)
1453 int size;
1454 uint32_t n;
1455 if (s >= end) return end;
1456 size = aux_type2size(*s); ++s; // skip type
1457 switch (size) {
1458 case 'Z':
1459 case 'H':
1460 while (*s && s < end) ++s;
1461 return s < end ? s + 1 : end;
1462 case 'B':
1463 if (end - s < 5) return NULL;
1464 size = aux_type2size(*s); ++s;
1465 n = le_to_u32(s);
1466 s += 4;
1467 if (size == 0 || end - s < size * n) return NULL;
1468 return s + size * n;
1469 case 0:
1470 return NULL;
1471 default:
1472 if (end - s < size) return NULL;
1473 return s + size;
1477 uint8_t *bam_aux_get(const bam1_t *b, const char tag[2])
1479 uint8_t *s, *end, *t = (uint8_t *) tag;
1480 uint16_t y = (uint16_t) t[0]<<8 | t[1];
1481 s = bam_get_aux(b);
1482 end = b->data + b->l_data;
1483 while (s != NULL && end - s >= 3) {
1484 uint16_t x = (uint16_t) s[0]<<8 | s[1];
1485 s += 2;
1486 if (x == y) {
1487 // Check the tag value is valid and complete
1488 uint8_t *e = skip_aux(s, end);
1489 if ((*s == 'Z' || *s == 'H') && *(e - 1) != '\0') {
1490 goto bad_aux; // Unterminated string
1492 if (e != NULL) {
1493 return s;
1494 } else {
1495 goto bad_aux;
1498 s = skip_aux(s, end);
1500 if (s == NULL) goto bad_aux;
1501 errno = ENOENT;
1502 return NULL;
1504 bad_aux:
1505 hts_log_error("Corrupted aux data for read %s", bam_get_qname(b));
1506 errno = EINVAL;
1507 return NULL;
1509 // s MUST BE returned by bam_aux_get()
1510 int bam_aux_del(bam1_t *b, uint8_t *s)
1512 uint8_t *p, *aux;
1513 int l_aux = bam_get_l_aux(b);
1514 aux = bam_get_aux(b);
1515 p = s - 2;
1516 s = skip_aux(s, aux + l_aux);
1517 if (s == NULL) goto bad_aux;
1518 memmove(p, s, l_aux - (s - aux));
1519 b->l_data -= s - p;
1520 return 0;
1522 bad_aux:
1523 hts_log_error("Corrupted aux data for read %s", bam_get_qname(b));
1524 errno = EINVAL;
1525 return -1;
1528 int bam_aux_update_str(bam1_t *b, const char tag[2], int len, const char *data)
1530 // FIXME: This is not at all efficient!
1531 uint8_t *s = bam_aux_get(b,tag);
1532 if (!s) {
1533 if (errno == ENOENT) { // Tag doesn't exist - add a new one
1534 return bam_aux_append(b, tag, 'Z', len, (const uint8_t *) data);
1535 } else { // Invalid aux data, give up.
1536 return -1;
1539 char type = *s;
1540 if (type != 'Z') {
1541 hts_log_error("Called bam_aux_update_str for type '%c' instead of 'Z'", type);
1542 errno = EINVAL;
1543 return -1;
1546 bam_aux_del(b,s);
1547 s -= 2;
1548 int l_aux = bam_get_l_aux(b);
1550 b->l_data += 3 + len;
1551 if (b->m_data < b->l_data) {
1552 ptrdiff_t s_offset = s - b->data;
1553 b->m_data = b->l_data;
1554 kroundup32(b->m_data);
1555 b->data = (uint8_t*)realloc(b->data, b->m_data);
1556 s = b->data + s_offset;
1558 memmove(s+3+len, s, l_aux - (s - bam_get_aux(b)));
1559 s[0] = tag[0];
1560 s[1] = tag[1];
1561 s[2] = type;
1562 memmove(s+3,data,len);
1563 return 0;
1566 static inline int64_t get_int_aux_val(uint8_t type, const uint8_t *s,
1567 uint32_t idx)
1569 switch (type) {
1570 case 'c': return le_to_i8(s + idx);
1571 case 'C': return s[idx];
1572 case 's': return le_to_i16(s + 2 * idx);
1573 case 'S': return le_to_u16(s + 2 * idx);
1574 case 'i': return le_to_i32(s + 4 * idx);
1575 case 'I': return le_to_u32(s + 4 * idx);
1576 default:
1577 errno = EINVAL;
1578 return 0;
1582 int64_t bam_aux2i(const uint8_t *s)
1584 int type;
1585 type = *s++;
1586 return get_int_aux_val(type, s, 0);
1589 double bam_aux2f(const uint8_t *s)
1591 int type;
1592 type = *s++;
1593 if (type == 'd') return le_to_double(s);
1594 else if (type == 'f') return le_to_float(s);
1595 else return get_int_aux_val(type, s, 0);
1598 char bam_aux2A(const uint8_t *s)
1600 int type;
1601 type = *s++;
1602 if (type == 'A') return *(char*)s;
1603 errno = EINVAL;
1604 return 0;
1607 char *bam_aux2Z(const uint8_t *s)
1609 int type;
1610 type = *s++;
1611 if (type == 'Z' || type == 'H') return (char*)s;
1612 errno = EINVAL;
1613 return 0;
1616 uint32_t bam_auxB_len(const uint8_t *s)
1618 if (s[0] != 'B') {
1619 errno = EINVAL;
1620 return 0;
1622 return le_to_u32(s + 2);
1625 int64_t bam_auxB2i(const uint8_t *s, uint32_t idx)
1627 uint32_t len = bam_auxB_len(s);
1628 if (idx >= len) {
1629 errno = ERANGE;
1630 return 0;
1632 return get_int_aux_val(s[1], s + 6, idx);
1635 double bam_auxB2f(const uint8_t *s, uint32_t idx)
1637 uint32_t len = bam_auxB_len(s);
1638 if (idx >= len) {
1639 errno = ERANGE;
1640 return 0.0;
1642 if (s[1] == 'f') return le_to_float(s + 6 + 4 * idx);
1643 else return get_int_aux_val(s[1], s + 6, idx);
1646 int sam_open_mode(char *mode, const char *fn, const char *format)
1648 // TODO Parse "bam5" etc for compression level
1649 if (format == NULL) {
1650 // Try to pick a format based on the filename extension
1651 const char *ext = fn? strrchr(fn, '.') : NULL;
1652 if (ext == NULL || strchr(ext, '/')) return -1;
1653 return sam_open_mode(mode, fn, ext+1);
1655 else if (strcmp(format, "bam") == 0) strcpy(mode, "b");
1656 else if (strcmp(format, "cram") == 0) strcpy(mode, "c");
1657 else if (strcmp(format, "sam") == 0) strcpy(mode, "");
1658 else return -1;
1660 return 0;
1663 // A version of sam_open_mode that can handle ,key=value options.
1664 // The format string is allocated and returned, to be freed by the caller.
1665 // Prefix should be "r" or "w",
1666 char *sam_open_mode_opts(const char *fn,
1667 const char *mode,
1668 const char *format)
1670 char *mode_opts = malloc((format ? strlen(format) : 1) +
1671 (mode ? strlen(mode) : 1) + 12);
1672 char *opts, *cp;
1673 int format_len;
1675 if (!mode_opts)
1676 return NULL;
1678 strcpy(mode_opts, mode ? mode : "r");
1679 cp = mode_opts + strlen(mode_opts);
1681 if (format == NULL) {
1682 // Try to pick a format based on the filename extension
1683 const char *ext = fn? strrchr(fn, '.') : NULL;
1684 if (ext == NULL || strchr(ext, '/')) {
1685 free(mode_opts);
1686 return NULL;
1688 return sam_open_mode(cp, fn, ext+1)
1689 ? (free(mode_opts), NULL)
1690 : mode_opts;
1693 if ((opts = strchr(format, ','))) {
1694 format_len = opts-format;
1695 } else {
1696 opts="";
1697 format_len = strlen(format);
1700 if (strncmp(format, "bam", format_len) == 0) {
1701 *cp++ = 'b';
1702 } else if (strncmp(format, "cram", format_len) == 0) {
1703 *cp++ = 'c';
1704 } else if (strncmp(format, "cram2", format_len) == 0) {
1705 *cp++ = 'c';
1706 strcpy(cp, ",VERSION=2.1");
1707 cp += 12;
1708 } else if (strncmp(format, "cram3", format_len) == 0) {
1709 *cp++ = 'c';
1710 strcpy(cp, ",VERSION=3.0");
1711 cp += 12;
1712 } else if (strncmp(format, "sam", format_len) == 0) {
1713 ; // format mode=""
1714 } else {
1715 free(mode_opts);
1716 return NULL;
1719 strcpy(cp, opts);
1721 return mode_opts;
1724 #define STRNCMP(a,b,n) (strncasecmp((a),(b),(n)) || strlen(a)!=(n))
1725 int bam_str2flag(const char *str)
1727 char *end, *beg = (char*) str;
1728 long int flag = strtol(str, &end, 0);
1729 if ( end!=str ) return flag; // the conversion was successful
1730 flag = 0;
1731 while ( *str )
1733 end = beg;
1734 while ( *end && *end!=',' ) end++;
1735 if ( !STRNCMP("PAIRED",beg,end-beg) ) flag |= BAM_FPAIRED;
1736 else if ( !STRNCMP("PROPER_PAIR",beg,end-beg) ) flag |= BAM_FPROPER_PAIR;
1737 else if ( !STRNCMP("UNMAP",beg,end-beg) ) flag |= BAM_FUNMAP;
1738 else if ( !STRNCMP("MUNMAP",beg,end-beg) ) flag |= BAM_FMUNMAP;
1739 else if ( !STRNCMP("REVERSE",beg,end-beg) ) flag |= BAM_FREVERSE;
1740 else if ( !STRNCMP("MREVERSE",beg,end-beg) ) flag |= BAM_FMREVERSE;
1741 else if ( !STRNCMP("READ1",beg,end-beg) ) flag |= BAM_FREAD1;
1742 else if ( !STRNCMP("READ2",beg,end-beg) ) flag |= BAM_FREAD2;
1743 else if ( !STRNCMP("SECONDARY",beg,end-beg) ) flag |= BAM_FSECONDARY;
1744 else if ( !STRNCMP("QCFAIL",beg,end-beg) ) flag |= BAM_FQCFAIL;
1745 else if ( !STRNCMP("DUP",beg,end-beg) ) flag |= BAM_FDUP;
1746 else if ( !STRNCMP("SUPPLEMENTARY",beg,end-beg) ) flag |= BAM_FSUPPLEMENTARY;
1747 else return -1;
1748 if ( !*end ) break;
1749 beg = end + 1;
1751 return flag;
1754 char *bam_flag2str(int flag)
1756 kstring_t str = {0,0,0};
1757 if ( flag&BAM_FPAIRED ) ksprintf(&str,"%s%s", str.l?",":"","PAIRED");
1758 if ( flag&BAM_FPROPER_PAIR ) ksprintf(&str,"%s%s", str.l?",":"","PROPER_PAIR");
1759 if ( flag&BAM_FUNMAP ) ksprintf(&str,"%s%s", str.l?",":"","UNMAP");
1760 if ( flag&BAM_FMUNMAP ) ksprintf(&str,"%s%s", str.l?",":"","MUNMAP");
1761 if ( flag&BAM_FREVERSE ) ksprintf(&str,"%s%s", str.l?",":"","REVERSE");
1762 if ( flag&BAM_FMREVERSE ) ksprintf(&str,"%s%s", str.l?",":"","MREVERSE");
1763 if ( flag&BAM_FREAD1 ) ksprintf(&str,"%s%s", str.l?",":"","READ1");
1764 if ( flag&BAM_FREAD2 ) ksprintf(&str,"%s%s", str.l?",":"","READ2");
1765 if ( flag&BAM_FSECONDARY ) ksprintf(&str,"%s%s", str.l?",":"","SECONDARY");
1766 if ( flag&BAM_FQCFAIL ) ksprintf(&str,"%s%s", str.l?",":"","QCFAIL");
1767 if ( flag&BAM_FDUP ) ksprintf(&str,"%s%s", str.l?",":"","DUP");
1768 if ( flag&BAM_FSUPPLEMENTARY ) ksprintf(&str,"%s%s", str.l?",":"","SUPPLEMENTARY");
1769 if ( str.l == 0 ) kputsn("", 0, &str);
1770 return str.s;
1774 /**************************
1775 *** Pileup and Mpileup ***
1776 **************************/
1778 #if !defined(BAM_NO_PILEUP)
1780 #include <assert.h>
1782 /*******************
1783 *** Memory pool ***
1784 *******************/
1786 typedef struct {
1787 int k, x, y, end;
1788 } cstate_t;
1790 static cstate_t g_cstate_null = { -1, 0, 0, 0 };
1792 typedef struct __linkbuf_t {
1793 bam1_t b;
1794 int32_t beg, end;
1795 cstate_t s;
1796 struct __linkbuf_t *next;
1797 bam_pileup_cd cd;
1798 } lbnode_t;
1800 typedef struct {
1801 int cnt, n, max;
1802 lbnode_t **buf;
1803 } mempool_t;
1805 static mempool_t *mp_init(void)
1807 mempool_t *mp;
1808 mp = (mempool_t*)calloc(1, sizeof(mempool_t));
1809 return mp;
1811 static void mp_destroy(mempool_t *mp)
1813 int k;
1814 for (k = 0; k < mp->n; ++k) {
1815 free(mp->buf[k]->b.data);
1816 free(mp->buf[k]);
1818 free(mp->buf);
1819 free(mp);
1821 static inline lbnode_t *mp_alloc(mempool_t *mp)
1823 ++mp->cnt;
1824 if (mp->n == 0) return (lbnode_t*)calloc(1, sizeof(lbnode_t));
1825 else return mp->buf[--mp->n];
1827 static inline void mp_free(mempool_t *mp, lbnode_t *p)
1829 --mp->cnt; p->next = 0; // clear lbnode_t::next here
1830 if (mp->n == mp->max) {
1831 mp->max = mp->max? mp->max<<1 : 256;
1832 mp->buf = (lbnode_t**)realloc(mp->buf, sizeof(lbnode_t*) * mp->max);
1834 mp->buf[mp->n++] = p;
1837 /**********************
1838 *** CIGAR resolver ***
1839 **********************/
1841 /* s->k: the index of the CIGAR operator that has just been processed.
1842 s->x: the reference coordinate of the start of s->k
1843 s->y: the query coordiante of the start of s->k
1845 static inline int resolve_cigar2(bam_pileup1_t *p, int32_t pos, cstate_t *s)
1847 #define _cop(c) ((c)&BAM_CIGAR_MASK)
1848 #define _cln(c) ((c)>>BAM_CIGAR_SHIFT)
1850 bam1_t *b = p->b;
1851 bam1_core_t *c = &b->core;
1852 uint32_t *cigar = bam_get_cigar(b);
1853 int k;
1854 // determine the current CIGAR operation
1855 // fprintf(stderr, "%s\tpos=%d\tend=%d\t(%d,%d,%d)\n", bam_get_qname(b), pos, s->end, s->k, s->x, s->y);
1856 if (s->k == -1) { // never processed
1857 if (c->n_cigar == 1) { // just one operation, save a loop
1858 if (_cop(cigar[0]) == BAM_CMATCH || _cop(cigar[0]) == BAM_CEQUAL || _cop(cigar[0]) == BAM_CDIFF) s->k = 0, s->x = c->pos, s->y = 0;
1859 } else { // find the first match or deletion
1860 for (k = 0, s->x = c->pos, s->y = 0; k < c->n_cigar; ++k) {
1861 int op = _cop(cigar[k]);
1862 int l = _cln(cigar[k]);
1863 if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CEQUAL || op == BAM_CDIFF) break;
1864 else if (op == BAM_CREF_SKIP) s->x += l;
1865 else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) s->y += l;
1867 assert(k < c->n_cigar);
1868 s->k = k;
1870 } else { // the read has been processed before
1871 int op, l = _cln(cigar[s->k]);
1872 if (pos - s->x >= l) { // jump to the next operation
1873 assert(s->k < c->n_cigar); // otherwise a bug: this function should not be called in this case
1874 op = _cop(cigar[s->k+1]);
1875 if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP || op == BAM_CEQUAL || op == BAM_CDIFF) { // jump to the next without a loop
1876 if (_cop(cigar[s->k]) == BAM_CMATCH|| _cop(cigar[s->k]) == BAM_CEQUAL || _cop(cigar[s->k]) == BAM_CDIFF) s->y += l;
1877 s->x += l;
1878 ++s->k;
1879 } else { // find the next M/D/N/=/X
1880 if (_cop(cigar[s->k]) == BAM_CMATCH|| _cop(cigar[s->k]) == BAM_CEQUAL || _cop(cigar[s->k]) == BAM_CDIFF) s->y += l;
1881 s->x += l;
1882 for (k = s->k + 1; k < c->n_cigar; ++k) {
1883 op = _cop(cigar[k]), l = _cln(cigar[k]);
1884 if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP || op == BAM_CEQUAL || op == BAM_CDIFF) break;
1885 else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) s->y += l;
1887 s->k = k;
1889 assert(s->k < c->n_cigar); // otherwise a bug
1890 } // else, do nothing
1892 { // collect pileup information
1893 int op, l;
1894 op = _cop(cigar[s->k]); l = _cln(cigar[s->k]);
1895 p->is_del = p->indel = p->is_refskip = 0;
1896 if (s->x + l - 1 == pos && s->k + 1 < c->n_cigar) { // peek the next operation
1897 int op2 = _cop(cigar[s->k+1]);
1898 int l2 = _cln(cigar[s->k+1]);
1899 if (op2 == BAM_CDEL) p->indel = -(int)l2;
1900 else if (op2 == BAM_CINS) p->indel = l2;
1901 else if (op2 == BAM_CPAD && s->k + 2 < c->n_cigar) { // no working for adjacent padding
1902 int l3 = 0;
1903 for (k = s->k + 2; k < c->n_cigar; ++k) {
1904 op2 = _cop(cigar[k]); l2 = _cln(cigar[k]);
1905 if (op2 == BAM_CINS) l3 += l2;
1906 else if (op2 == BAM_CDEL || op2 == BAM_CMATCH || op2 == BAM_CREF_SKIP || op2 == BAM_CEQUAL || op2 == BAM_CDIFF) break;
1908 if (l3 > 0) p->indel = l3;
1911 if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
1912 p->qpos = s->y + (pos - s->x);
1913 } else if (op == BAM_CDEL || op == BAM_CREF_SKIP) {
1914 p->is_del = 1; p->qpos = s->y; // FIXME: distinguish D and N!!!!!
1915 p->is_refskip = (op == BAM_CREF_SKIP);
1916 } // cannot be other operations; otherwise a bug
1917 p->is_head = (pos == c->pos); p->is_tail = (pos == s->end);
1919 return 1;
1922 /***********************
1923 *** Pileup iterator ***
1924 ***********************/
1926 // Dictionary of overlapping reads
1927 KHASH_MAP_INIT_STR(olap_hash, lbnode_t *)
1928 typedef khash_t(olap_hash) olap_hash_t;
1930 struct __bam_plp_t {
1931 mempool_t *mp;
1932 lbnode_t *head, *tail;
1933 int32_t tid, pos, max_tid, max_pos;
1934 int is_eof, max_plp, error, maxcnt;
1935 uint64_t id;
1936 bam_pileup1_t *plp;
1937 // for the "auto" interface only
1938 bam1_t *b;
1939 bam_plp_auto_f func;
1940 void *data;
1941 olap_hash_t *overlaps;
1943 // For notification of creation and destruction events
1944 // and associated client-owned pointer.
1945 int (*plp_construct)(void *data, const bam1_t *b, bam_pileup_cd *cd);
1946 int (*plp_destruct )(void *data, const bam1_t *b, bam_pileup_cd *cd);
1949 bam_plp_t bam_plp_init(bam_plp_auto_f func, void *data)
1951 bam_plp_t iter;
1952 iter = (bam_plp_t)calloc(1, sizeof(struct __bam_plp_t));
1953 iter->mp = mp_init();
1954 iter->head = iter->tail = mp_alloc(iter->mp);
1955 iter->max_tid = iter->max_pos = -1;
1956 iter->maxcnt = 8000;
1957 if (func) {
1958 iter->func = func;
1959 iter->data = data;
1960 iter->b = bam_init1();
1962 return iter;
1965 void bam_plp_init_overlaps(bam_plp_t iter)
1967 iter->overlaps = kh_init(olap_hash); // hash for tweaking quality of bases in overlapping reads
1970 void bam_plp_destroy(bam_plp_t iter)
1972 lbnode_t *p, *pnext;
1973 if ( iter->overlaps ) kh_destroy(olap_hash, iter->overlaps);
1974 for (p = iter->head; p != NULL; p = pnext) {
1975 pnext = p->next;
1976 mp_free(iter->mp, p);
1978 mp_destroy(iter->mp);
1979 if (iter->b) bam_destroy1(iter->b);
1980 free(iter->plp);
1981 free(iter);
1984 void bam_plp_constructor(bam_plp_t plp,
1985 int (*func)(void *data, const bam1_t *b, bam_pileup_cd *cd)) {
1986 plp->plp_construct = func;
1989 void bam_plp_destructor(bam_plp_t plp,
1990 int (*func)(void *data, const bam1_t *b, bam_pileup_cd *cd)) {
1991 plp->plp_destruct = func;
1994 //---------------------------------
1995 //--- Tweak overlapping reads
1996 //---------------------------------
1999 * cigar_iref2iseq_set() - find the first CMATCH setting the ref and the read index
2000 * cigar_iref2iseq_next() - get the next CMATCH base
2001 * @cigar: pointer to current cigar block (rw)
2002 * @cigar_max: pointer just beyond the last cigar block
2003 * @icig: position within the current cigar block (rw)
2004 * @iseq: position in the sequence (rw)
2005 * @iref: position with respect to the beginning of the read (iref_pos - b->core.pos) (rw)
2007 * Returns BAM_CMATCH or -1 when there is no more cigar to process or the requested position is not covered.
2009 static inline int cigar_iref2iseq_set(uint32_t **cigar, uint32_t *cigar_max, int *icig, int *iseq, int *iref)
2011 int pos = *iref;
2012 if ( pos < 0 ) return -1;
2013 *icig = 0;
2014 *iseq = 0;
2015 *iref = 0;
2016 while ( *cigar<cigar_max )
2018 int cig = (**cigar) & BAM_CIGAR_MASK;
2019 int ncig = (**cigar) >> BAM_CIGAR_SHIFT;
2021 if ( cig==BAM_CSOFT_CLIP ) { (*cigar)++; *iseq += ncig; *icig = 0; continue; }
2022 if ( cig==BAM_CHARD_CLIP || cig==BAM_CPAD ) { (*cigar)++; *icig = 0; continue; }
2023 if ( cig==BAM_CMATCH || cig==BAM_CEQUAL || cig==BAM_CDIFF )
2025 pos -= ncig;
2026 if ( pos < 0 ) { *icig = ncig + pos; *iseq += *icig; *iref += *icig; return BAM_CMATCH; }
2027 (*cigar)++; *iseq += ncig; *icig = 0; *iref += ncig;
2028 continue;
2030 if ( cig==BAM_CINS ) { (*cigar)++; *iseq += ncig; *icig = 0; continue; }
2031 if ( cig==BAM_CDEL || cig==BAM_CREF_SKIP )
2033 pos -= ncig;
2034 if ( pos<0 ) pos = 0;
2035 (*cigar)++; *icig = 0; *iref += ncig;
2036 continue;
2038 hts_log_error("Unexpected cigar %d", cig);
2039 assert(0);
2041 *iseq = -1;
2042 return -1;
2044 static inline int cigar_iref2iseq_next(uint32_t **cigar, uint32_t *cigar_max, int *icig, int *iseq, int *iref)
2046 while ( *cigar < cigar_max )
2048 int cig = (**cigar) & BAM_CIGAR_MASK;
2049 int ncig = (**cigar) >> BAM_CIGAR_SHIFT;
2051 if ( cig==BAM_CMATCH || cig==BAM_CEQUAL || cig==BAM_CDIFF )
2053 if ( *icig >= ncig - 1 ) { *icig = 0; (*cigar)++; continue; }
2054 (*iseq)++; (*icig)++; (*iref)++;
2055 return BAM_CMATCH;
2057 if ( cig==BAM_CDEL || cig==BAM_CREF_SKIP ) { (*cigar)++; (*iref) += ncig; *icig = 0; continue; }
2058 if ( cig==BAM_CINS ) { (*cigar)++; *iseq += ncig; *icig = 0; continue; }
2059 if ( cig==BAM_CSOFT_CLIP ) { (*cigar)++; *iseq += ncig; *icig = 0; continue; }
2060 if ( cig==BAM_CHARD_CLIP || cig==BAM_CPAD ) { (*cigar)++; *icig = 0; continue; }
2061 hts_log_error("Unexpected cigar %d", cig);
2062 assert(0);
2064 *iseq = -1;
2065 *iref = -1;
2066 return -1;
2069 static void tweak_overlap_quality(bam1_t *a, bam1_t *b)
2071 uint32_t *a_cigar = bam_get_cigar(a), *a_cigar_max = a_cigar + a->core.n_cigar;
2072 uint32_t *b_cigar = bam_get_cigar(b), *b_cigar_max = b_cigar + b->core.n_cigar;
2073 int a_icig = 0, a_iseq = 0;
2074 int b_icig = 0, b_iseq = 0;
2075 uint8_t *a_qual = bam_get_qual(a), *b_qual = bam_get_qual(b);
2076 uint8_t *a_seq = bam_get_seq(a), *b_seq = bam_get_seq(b);
2078 int iref = b->core.pos;
2079 int a_iref = iref - a->core.pos;
2080 int b_iref = iref - b->core.pos;
2081 int a_ret = cigar_iref2iseq_set(&a_cigar, a_cigar_max, &a_icig, &a_iseq, &a_iref);
2082 if ( a_ret<0 ) return; // no overlap
2083 int b_ret = cigar_iref2iseq_set(&b_cigar, b_cigar_max, &b_icig, &b_iseq, &b_iref);
2084 if ( b_ret<0 ) return; // no overlap
2086 #if DBG
2087 fprintf(stderr,"tweak %s n_cigar=%d %d .. %d-%d vs %d-%d\n", bam_get_qname(a), a->core.n_cigar, b->core.n_cigar,
2088 a->core.pos+1,a->core.pos+bam_cigar2rlen(a->core.n_cigar,bam_get_cigar(a)), b->core.pos+1, b->core.pos+bam_cigar2rlen(b->core.n_cigar,bam_get_cigar(b)));
2089 #endif
2091 while ( 1 )
2093 // Increment reference position
2094 while ( a_iref>=0 && a_iref < iref - a->core.pos )
2095 a_ret = cigar_iref2iseq_next(&a_cigar, a_cigar_max, &a_icig, &a_iseq, &a_iref);
2096 if ( a_ret<0 ) break; // done
2097 if ( iref < a_iref + a->core.pos ) iref = a_iref + a->core.pos;
2099 while ( b_iref>=0 && b_iref < iref - b->core.pos )
2100 b_ret = cigar_iref2iseq_next(&b_cigar, b_cigar_max, &b_icig, &b_iseq, &b_iref);
2101 if ( b_ret<0 ) break; // done
2102 if ( iref < b_iref + b->core.pos ) iref = b_iref + b->core.pos;
2104 iref++;
2105 if ( a_iref+a->core.pos != b_iref+b->core.pos ) continue; // only CMATCH positions, don't know what to do with indels
2107 if ( bam_seqi(a_seq,a_iseq) == bam_seqi(b_seq,b_iseq) )
2109 #if DBG
2110 fprintf(stderr,"%c",seq_nt16_str[bam_seqi(a_seq,a_iseq)]);
2111 #endif
2112 // we are very confident about this base
2113 int qual = a_qual[a_iseq] + b_qual[b_iseq];
2114 a_qual[a_iseq] = qual>200 ? 200 : qual;
2115 b_qual[b_iseq] = 0;
2117 else
2119 if ( a_qual[a_iseq] >= b_qual[b_iseq] )
2121 #if DBG
2122 fprintf(stderr,"[%c/%c]",seq_nt16_str[bam_seqi(a_seq,a_iseq)],tolower_c(seq_nt16_str[bam_seqi(b_seq,b_iseq)]));
2123 #endif
2124 a_qual[a_iseq] = 0.8 * a_qual[a_iseq]; // not so confident about a_qual anymore given the mismatch
2125 b_qual[b_iseq] = 0;
2127 else
2129 #if DBG
2130 fprintf(stderr,"[%c/%c]",tolower_c(seq_nt16_str[bam_seqi(a_seq,a_iseq)]),seq_nt16_str[bam_seqi(b_seq,b_iseq)]);
2131 #endif
2132 b_qual[b_iseq] = 0.8 * b_qual[b_iseq];
2133 a_qual[a_iseq] = 0;
2137 #if DBG
2138 fprintf(stderr,"\n");
2139 #endif
2142 // Fix overlapping reads. Simple soft-clipping did not give good results.
2143 // Lowering qualities of unwanted bases is more selective and works better.
2145 static void overlap_push(bam_plp_t iter, lbnode_t *node)
2147 if ( !iter->overlaps ) return;
2149 // mapped mates and paired reads only
2150 if ( node->b.core.flag&BAM_FMUNMAP || !(node->b.core.flag&BAM_FPROPER_PAIR) ) return;
2152 // no overlap possible, unless some wild cigar
2153 if ( abs(node->b.core.isize) >= 2*node->b.core.l_qseq ) return;
2155 khiter_t kitr = kh_get(olap_hash, iter->overlaps, bam_get_qname(&node->b));
2156 if ( kitr==kh_end(iter->overlaps) )
2158 int ret;
2159 kitr = kh_put(olap_hash, iter->overlaps, bam_get_qname(&node->b), &ret);
2160 kh_value(iter->overlaps, kitr) = node;
2162 else
2164 lbnode_t *a = kh_value(iter->overlaps, kitr);
2165 tweak_overlap_quality(&a->b, &node->b);
2166 kh_del(olap_hash, iter->overlaps, kitr);
2167 assert(a->end-1 == a->s.end);
2168 a->end = bam_endpos(&a->b);
2169 a->s.end = a->end - 1;
2173 static void overlap_remove(bam_plp_t iter, const bam1_t *b)
2175 if ( !iter->overlaps ) return;
2177 khiter_t kitr;
2178 if ( b )
2180 kitr = kh_get(olap_hash, iter->overlaps, bam_get_qname(b));
2181 if ( kitr!=kh_end(iter->overlaps) )
2182 kh_del(olap_hash, iter->overlaps, kitr);
2184 else
2186 // remove all
2187 for (kitr = kh_begin(iter->overlaps); kitr<kh_end(iter->overlaps); kitr++)
2188 if ( kh_exist(iter->overlaps, kitr) ) kh_del(olap_hash, iter->overlaps, kitr);
2194 // Prepares next pileup position in bam records collected by bam_plp_auto -> user func -> bam_plp_push. Returns
2195 // pointer to the piled records if next position is ready or NULL if there is not enough records in the
2196 // buffer yet (the current position is still the maximum position across all buffered reads).
2197 const bam_pileup1_t *bam_plp_next(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp)
2199 if (iter->error) { *_n_plp = -1; return NULL; }
2200 *_n_plp = 0;
2201 if (iter->is_eof && iter->head == iter->tail) return NULL;
2202 while (iter->is_eof || iter->max_tid > iter->tid || (iter->max_tid == iter->tid && iter->max_pos > iter->pos)) {
2203 int n_plp = 0;
2204 // write iter->plp at iter->pos
2205 lbnode_t **pptr = &iter->head;
2206 while (*pptr != iter->tail) {
2207 lbnode_t *p = *pptr;
2208 if (p->b.core.tid < iter->tid || (p->b.core.tid == iter->tid && p->end <= iter->pos)) { // then remove
2209 overlap_remove(iter, &p->b);
2210 if (iter->plp_destruct)
2211 iter->plp_destruct(iter->data, &p->b, &p->cd);
2212 *pptr = p->next; mp_free(iter->mp, p);
2214 else {
2215 if (p->b.core.tid == iter->tid && p->beg <= iter->pos) { // here: p->end > pos; then add to pileup
2216 if (n_plp == iter->max_plp) { // then double the capacity
2217 iter->max_plp = iter->max_plp? iter->max_plp<<1 : 256;
2218 iter->plp = (bam_pileup1_t*)realloc(iter->plp, sizeof(bam_pileup1_t) * iter->max_plp);
2220 iter->plp[n_plp].b = &p->b;
2221 iter->plp[n_plp].cd = p->cd;
2222 if (resolve_cigar2(iter->plp + n_plp, iter->pos, &p->s)) ++n_plp; // actually always true...
2224 pptr = &(*pptr)->next;
2227 *_n_plp = n_plp; *_tid = iter->tid; *_pos = iter->pos;
2228 // update iter->tid and iter->pos
2229 if (iter->head != iter->tail) {
2230 if (iter->tid > iter->head->b.core.tid) {
2231 hts_log_error("Unsorted input. Pileup aborts");
2232 iter->error = 1;
2233 *_n_plp = -1;
2234 return NULL;
2237 if (iter->tid < iter->head->b.core.tid) { // come to a new reference sequence
2238 iter->tid = iter->head->b.core.tid; iter->pos = iter->head->beg; // jump to the next reference
2239 } else if (iter->pos < iter->head->beg) { // here: tid == head->b.core.tid
2240 iter->pos = iter->head->beg; // jump to the next position
2241 } else ++iter->pos; // scan contiguously
2242 // return
2243 if (n_plp) return iter->plp;
2244 if (iter->is_eof && iter->head == iter->tail) break;
2246 return NULL;
2249 int bam_plp_push(bam_plp_t iter, const bam1_t *b)
2251 if (iter->error) return -1;
2252 if (b) {
2253 if (b->core.tid < 0) { overlap_remove(iter, b); return 0; }
2254 // Skip only unmapped reads here, any additional filtering must be done in iter->func
2255 if (b->core.flag & BAM_FUNMAP) { overlap_remove(iter, b); return 0; }
2256 if (iter->tid == b->core.tid && iter->pos == b->core.pos && iter->mp->cnt > iter->maxcnt)
2258 overlap_remove(iter, b);
2259 return 0;
2261 bam_copy1(&iter->tail->b, b);
2262 overlap_push(iter, iter->tail);
2263 #ifndef BAM_NO_ID
2264 iter->tail->b.id = iter->id++;
2265 #endif
2266 iter->tail->beg = b->core.pos;
2267 iter->tail->end = bam_endpos(b);
2268 iter->tail->s = g_cstate_null; iter->tail->s.end = iter->tail->end - 1; // initialize cstate_t
2269 if (b->core.tid < iter->max_tid) {
2270 hts_log_error("The input is not sorted (chromosomes out of order)");
2271 iter->error = 1;
2272 return -1;
2274 if ((b->core.tid == iter->max_tid) && (iter->tail->beg < iter->max_pos)) {
2275 hts_log_error("The input is not sorted (reads out of order)");
2276 iter->error = 1;
2277 return -1;
2279 iter->max_tid = b->core.tid; iter->max_pos = iter->tail->beg;
2280 if (iter->tail->end > iter->pos || iter->tail->b.core.tid > iter->tid) {
2281 if (iter->plp_construct)
2282 iter->plp_construct(iter->data, b, &iter->tail->cd);
2283 iter->tail->next = mp_alloc(iter->mp);
2284 iter->tail = iter->tail->next;
2286 } else iter->is_eof = 1;
2287 return 0;
2290 const bam_pileup1_t *bam_plp_auto(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp)
2292 const bam_pileup1_t *plp;
2293 if (iter->func == 0 || iter->error) { *_n_plp = -1; return 0; }
2294 if ((plp = bam_plp_next(iter, _tid, _pos, _n_plp)) != 0) return plp;
2295 else { // no pileup line can be obtained; read alignments
2296 *_n_plp = 0;
2297 if (iter->is_eof) return 0;
2298 int ret;
2299 while ( (ret=iter->func(iter->data, iter->b)) >= 0) {
2300 if (bam_plp_push(iter, iter->b) < 0) {
2301 *_n_plp = -1;
2302 return 0;
2304 if ((plp = bam_plp_next(iter, _tid, _pos, _n_plp)) != 0) return plp;
2305 // otherwise no pileup line can be returned; read the next alignment.
2307 if ( ret < -1 ) { iter->error = ret; *_n_plp = -1; return 0; }
2308 bam_plp_push(iter, 0);
2309 if ((plp = bam_plp_next(iter, _tid, _pos, _n_plp)) != 0) return plp;
2310 return 0;
2314 void bam_plp_reset(bam_plp_t iter)
2316 overlap_remove(iter, NULL);
2317 iter->max_tid = iter->max_pos = -1;
2318 iter->tid = iter->pos = 0;
2319 iter->is_eof = 0;
2320 while (iter->head != iter->tail) {
2321 lbnode_t *p = iter->head;
2322 iter->head = p->next;
2323 mp_free(iter->mp, p);
2327 void bam_plp_set_maxcnt(bam_plp_t iter, int maxcnt)
2329 iter->maxcnt = maxcnt;
2332 /************************
2333 *** Mpileup iterator ***
2334 ************************/
2336 struct __bam_mplp_t {
2337 int n;
2338 uint64_t min, *pos;
2339 bam_plp_t *iter;
2340 int *n_plp;
2341 const bam_pileup1_t **plp;
2344 bam_mplp_t bam_mplp_init(int n, bam_plp_auto_f func, void **data)
2346 int i;
2347 bam_mplp_t iter;
2348 iter = (bam_mplp_t)calloc(1, sizeof(struct __bam_mplp_t));
2349 iter->pos = (uint64_t*)calloc(n, sizeof(uint64_t));
2350 iter->n_plp = (int*)calloc(n, sizeof(int));
2351 iter->plp = (const bam_pileup1_t**)calloc(n, sizeof(bam_pileup1_t*));
2352 iter->iter = (bam_plp_t*)calloc(n, sizeof(bam_plp_t));
2353 iter->n = n;
2354 iter->min = (uint64_t)-1;
2355 for (i = 0; i < n; ++i) {
2356 iter->iter[i] = bam_plp_init(func, data[i]);
2357 iter->pos[i] = iter->min;
2359 return iter;
2362 void bam_mplp_init_overlaps(bam_mplp_t iter)
2364 int i;
2365 for (i = 0; i < iter->n; ++i)
2366 bam_plp_init_overlaps(iter->iter[i]);
2369 void bam_mplp_set_maxcnt(bam_mplp_t iter, int maxcnt)
2371 int i;
2372 for (i = 0; i < iter->n; ++i)
2373 iter->iter[i]->maxcnt = maxcnt;
2376 void bam_mplp_destroy(bam_mplp_t iter)
2378 int i;
2379 for (i = 0; i < iter->n; ++i) bam_plp_destroy(iter->iter[i]);
2380 free(iter->iter); free(iter->pos); free(iter->n_plp); free(iter->plp);
2381 free(iter);
2384 int bam_mplp_auto(bam_mplp_t iter, int *_tid, int *_pos, int *n_plp, const bam_pileup1_t **plp)
2386 int i, ret = 0;
2387 uint64_t new_min = (uint64_t)-1;
2388 for (i = 0; i < iter->n; ++i) {
2389 if (iter->pos[i] == iter->min) {
2390 int tid, pos;
2391 iter->plp[i] = bam_plp_auto(iter->iter[i], &tid, &pos, &iter->n_plp[i]);
2392 if ( iter->iter[i]->error ) return -1;
2393 iter->pos[i] = iter->plp[i] ? (uint64_t)tid<<32 | pos : 0;
2395 if (iter->plp[i] && iter->pos[i] < new_min) new_min = iter->pos[i];
2397 iter->min = new_min;
2398 if (new_min == (uint64_t)-1) return 0;
2399 *_tid = new_min>>32; *_pos = (uint32_t)new_min;
2400 for (i = 0; i < iter->n; ++i) {
2401 if (iter->pos[i] == iter->min) { // FIXME: valgrind reports "uninitialised value(s) at this line"
2402 n_plp[i] = iter->n_plp[i], plp[i] = iter->plp[i];
2403 ++ret;
2404 } else n_plp[i] = 0, plp[i] = 0;
2406 return ret;
2409 void bam_mplp_reset(bam_mplp_t iter)
2411 int i;
2412 iter->min = (uint64_t)-1;
2413 for (i = 0; i < iter->n; ++i) {
2414 bam_plp_reset(iter->iter[i]);
2415 iter->pos[i] = (uint64_t)-1;
2416 iter->n_plp[i] = 0;
2417 iter->plp[i] = NULL;
2421 void bam_mplp_constructor(bam_mplp_t iter,
2422 int (*func)(void *arg, const bam1_t *b, bam_pileup_cd *cd)) {
2423 int i;
2424 for (i = 0; i < iter->n; ++i)
2425 bam_plp_constructor(iter->iter[i], func);
2428 void bam_mplp_destructor(bam_mplp_t iter,
2429 int (*func)(void *arg, const bam1_t *b, bam_pileup_cd *cd)) {
2430 int i;
2431 for (i = 0; i < iter->n; ++i)
2432 bam_plp_destructor(iter->iter[i], func);
2435 #endif // ~!defined(BAM_NO_PILEUP)