1 /* sam.c -- SAM and BAM file I/O and manipulation.
3 Copyright (C) 2008-2010, 2012-2017 Genome Research Ltd.
4 Copyright (C) 2010, 2012, 2013 Broad Institute.
6 Author: Heng Li <lh3@sanger.ac.uk>
8 Permission is hereby granted, free of charge, to any person obtaining a copy
9 of this software and associated documentation files (the "Software"), to deal
10 in the Software without restriction, including without limitation the rights
11 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12 copies of the Software, and to permit persons to whom the Software is
13 furnished to do so, subject to the following conditions:
15 The above copyright notice and this permission notice shall be included in
16 all copies or substantial portions of the Software.
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
21 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24 DEALINGS IN THE SOFTWARE. */
35 #include "htslib/sam.h"
36 #include "htslib/bgzf.h"
37 #include "cram/cram.h"
38 #include "hts_internal.h"
39 #include "htslib/hfile.h"
40 #include "htslib/hts_endian.h"
42 #include "htslib/khash.h"
43 KHASH_DECLARE(s2i
, kh_cstr_t
, int64_t)
45 typedef khash_t(s2i
) sdict_t
;
48 #define EOVERFLOW ERANGE
51 /**********************
52 *** BAM header I/O ***
53 **********************/
55 bam_hdr_t
*bam_hdr_init()
57 return (bam_hdr_t
*)calloc(1, sizeof(bam_hdr_t
));
60 void bam_hdr_destroy(bam_hdr_t
*h
)
63 if (h
== NULL
) return;
65 for (i
= 0; i
< h
->n_targets
; ++i
)
66 free(h
->target_name
[i
]);
70 free(h
->text
); free(h
->cigar_tab
);
71 if (h
->sdict
) kh_destroy(s2i
, (sdict_t
*)h
->sdict
);
75 bam_hdr_t
*bam_hdr_dup(const bam_hdr_t
*h0
)
77 if (h0
== NULL
) return NULL
;
79 if ((h
= bam_hdr_init()) == NULL
) return NULL
;
80 // copy the simple data
81 h
->n_targets
= h0
->n_targets
;
82 h
->ignore_sam_err
= h0
->ignore_sam_err
;
83 h
->l_text
= h0
->l_text
;
84 // Then the pointery stuff
87 // TODO Check for memory allocation failures
88 h
->text
= (char*)calloc(h
->l_text
+ 1, 1);
89 memcpy(h
->text
, h0
->text
, h
->l_text
);
90 h
->target_len
= (uint32_t*)calloc(h
->n_targets
, sizeof(uint32_t));
91 h
->target_name
= (char**)calloc(h
->n_targets
, sizeof(char*));
93 for (i
= 0; i
< h
->n_targets
; ++i
) {
94 h
->target_len
[i
] = h0
->target_len
[i
];
95 h
->target_name
[i
] = strdup(h0
->target_name
[i
]);
101 static bam_hdr_t
*hdr_from_dict(sdict_t
*d
)
107 h
->n_targets
= kh_size(d
);
108 // TODO Check for memory allocation failures
109 h
->target_len
= (uint32_t*)malloc(sizeof(uint32_t) * h
->n_targets
);
110 h
->target_name
= (char**)malloc(sizeof(char*) * h
->n_targets
);
111 for (k
= kh_begin(d
); k
!= kh_end(d
); ++k
) {
112 if (!kh_exist(d
, k
)) continue;
113 h
->target_name
[kh_val(d
, k
)>>32] = (char*)kh_key(d
, k
);
114 h
->target_len
[kh_val(d
, k
)>>32] = kh_val(d
, k
) & 0xffffffffUL
;
120 bam_hdr_t
*bam_hdr_read(BGZF
*fp
)
124 int magic_len
, has_EOF
;
125 int32_t i
, name_len
, num_names
= 0;
129 has_EOF
= bgzf_check_EOF(fp
);
131 perror("[W::bam_hdr_read] bgzf_check_EOF");
132 } else if (has_EOF
== 0) {
133 hts_log_warning("EOF marker is absent. The input is probably truncated");
136 magic_len
= bgzf_read(fp
, buf
, 4);
137 if (magic_len
!= 4 || strncmp(buf
, "BAM\1", 4)) {
138 hts_log_error("Invalid BAM binary header");
144 // read plain text and the number of reference sequences
145 bytes
= bgzf_read(fp
, &h
->l_text
, 4);
146 if (bytes
!= 4) goto read_err
;
147 if (fp
->is_be
) ed_swap_4p(&h
->l_text
);
149 bufsize
= ((size_t) h
->l_text
) + 1;
150 if (bufsize
< h
->l_text
) goto nomem
; // so large that adding 1 overflowed
151 h
->text
= (char*)malloc(bufsize
);
152 if (!h
->text
) goto nomem
;
153 h
->text
[h
->l_text
] = 0; // make sure it is NULL terminated
154 bytes
= bgzf_read(fp
, h
->text
, h
->l_text
);
155 if (bytes
!= h
->l_text
) goto read_err
;
157 bytes
= bgzf_read(fp
, &h
->n_targets
, 4);
158 if (bytes
!= 4) goto read_err
;
159 if (fp
->is_be
) ed_swap_4p(&h
->n_targets
);
161 if (h
->n_targets
< 0) goto invalid
;
163 // read reference sequence names and lengths
164 if (h
->n_targets
> 0) {
165 h
->target_name
= (char**)calloc(h
->n_targets
, sizeof(char*));
166 if (!h
->target_name
) goto nomem
;
167 h
->target_len
= (uint32_t*)calloc(h
->n_targets
, sizeof(uint32_t));
168 if (!h
->target_len
) goto nomem
;
171 h
->target_name
= NULL
;
172 h
->target_len
= NULL
;
175 for (i
= 0; i
!= h
->n_targets
; ++i
) {
176 bytes
= bgzf_read(fp
, &name_len
, 4);
177 if (bytes
!= 4) goto read_err
;
178 if (fp
->is_be
) ed_swap_4p(&name_len
);
179 if (name_len
<= 0) goto invalid
;
181 h
->target_name
[i
] = (char*)malloc(name_len
);
182 if (!h
->target_name
[i
]) goto nomem
;
185 bytes
= bgzf_read(fp
, h
->target_name
[i
], name_len
);
186 if (bytes
!= name_len
) goto read_err
;
188 if (h
->target_name
[i
][name_len
- 1] != '\0') {
189 /* Fix missing NUL-termination. Is this being too nice?
190 We could alternatively bail out with an error. */
192 if (name_len
== INT32_MAX
) goto invalid
;
193 new_name
= realloc(h
->target_name
[i
], name_len
+ 1);
194 if (new_name
== NULL
) goto nomem
;
195 h
->target_name
[i
] = new_name
;
196 h
->target_name
[i
][name_len
] = '\0';
199 bytes
= bgzf_read(fp
, &h
->target_len
[i
], 4);
200 if (bytes
!= 4) goto read_err
;
201 if (fp
->is_be
) ed_swap_4p(&h
->target_len
[i
]);
206 hts_log_error("Out of memory");
211 hts_log_error("Error reading BGZF stream");
213 hts_log_error("Truncated BAM header");
218 hts_log_error("Invalid BAM binary header");
222 h
->n_targets
= num_names
; // ensure we free only allocated target_names
228 int bam_hdr_write(BGZF
*fp
, const bam_hdr_t
*h
)
231 int32_t i
, name_len
, x
;
233 strncpy(buf
, "BAM\1", 4);
234 if (bgzf_write(fp
, buf
, 4) < 0) return -1;
235 // write plain text and the number of reference sequences
237 x
= ed_swap_4(h
->l_text
);
238 if (bgzf_write(fp
, &x
, 4) < 0) return -1;
240 if (bgzf_write(fp
, h
->text
, h
->l_text
) < 0) return -1;
242 x
= ed_swap_4(h
->n_targets
);
243 if (bgzf_write(fp
, &x
, 4) < 0) return -1;
245 if (bgzf_write(fp
, &h
->l_text
, 4) < 0) return -1;
247 if (bgzf_write(fp
, h
->text
, h
->l_text
) < 0) return -1;
249 if (bgzf_write(fp
, &h
->n_targets
, 4) < 0) return -1;
251 // write sequence names and lengths
252 for (i
= 0; i
!= h
->n_targets
; ++i
) {
253 char *p
= h
->target_name
[i
];
254 name_len
= strlen(p
) + 1;
256 x
= ed_swap_4(name_len
);
257 if (bgzf_write(fp
, &x
, 4) < 0) return -1;
259 if (bgzf_write(fp
, &name_len
, 4) < 0) return -1;
261 if (bgzf_write(fp
, p
, name_len
) < 0) return -1;
263 x
= ed_swap_4(h
->target_len
[i
]);
264 if (bgzf_write(fp
, &x
, 4) < 0) return -1;
266 if (bgzf_write(fp
, &h
->target_len
[i
], 4) < 0) return -1;
269 if (bgzf_flush(fp
) < 0) return -1;
273 int bam_name2id(bam_hdr_t
*h
, const char *ref
)
275 sdict_t
*d
= (sdict_t
*)h
->sdict
;
280 for (i
= 0; i
< h
->n_targets
; ++i
) {
281 k
= kh_put(s2i
, d
, h
->target_name
[i
], &absent
);
286 k
= kh_get(s2i
, d
, ref
);
287 return k
== kh_end(d
)? -1 : kh_val(d
, k
);
290 /*************************
291 *** BAM alignment I/O ***
292 *************************/
296 return (bam1_t
*)calloc(1, sizeof(bam1_t
));
299 void bam_destroy1(bam1_t
*b
)
302 free(b
->data
); free(b
);
305 bam1_t
*bam_copy1(bam1_t
*bdst
, const bam1_t
*bsrc
)
307 uint8_t *data
= bdst
->data
;
308 int m_data
= bdst
->m_data
; // backup data and m_data
309 if (m_data
< bsrc
->l_data
) { // double the capacity
310 m_data
= bsrc
->l_data
; kroundup32(m_data
);
311 data
= (uint8_t*)realloc(data
, m_data
);
313 memcpy(data
, bsrc
->data
, bsrc
->l_data
); // copy var-len data
314 *bdst
= *bsrc
; // copy the rest
315 // restore the backup
316 bdst
->m_data
= m_data
;
321 bam1_t
*bam_dup1(const bam1_t
*bsrc
)
323 if (bsrc
== NULL
) return NULL
;
324 bam1_t
*bdst
= bam_init1();
325 if (bdst
== NULL
) return NULL
;
326 return bam_copy1(bdst
, bsrc
);
329 int bam_cigar2qlen(int n_cigar
, const uint32_t *cigar
)
332 for (k
= l
= 0; k
< n_cigar
; ++k
)
333 if (bam_cigar_type(bam_cigar_op(cigar
[k
]))&1)
334 l
+= bam_cigar_oplen(cigar
[k
]);
338 int bam_cigar2rlen(int n_cigar
, const uint32_t *cigar
)
341 for (k
= l
= 0; k
< n_cigar
; ++k
)
342 if (bam_cigar_type(bam_cigar_op(cigar
[k
]))&2)
343 l
+= bam_cigar_oplen(cigar
[k
]);
347 int32_t bam_endpos(const bam1_t
*b
)
349 if (!(b
->core
.flag
& BAM_FUNMAP
) && b
->core
.n_cigar
> 0)
350 return b
->core
.pos
+ bam_cigar2rlen(b
->core
.n_cigar
, bam_get_cigar(b
));
352 return b
->core
.pos
+ 1;
355 static inline int aux_type2size(uint8_t type
)
358 case 'A': case 'c': case 'C':
362 case 'i': case 'I': case 'f':
366 case 'Z': case 'H': case 'B':
373 static void swap_data(const bam1_core_t
*c
, int l_data
, uint8_t *data
, int is_host
)
375 uint32_t *cigar
= (uint32_t*)(data
+ c
->l_qname
);
377 for (i
= 0; i
< c
->n_cigar
; ++i
) ed_swap_4p(&cigar
[i
]);
380 int bam_read1(BGZF
*fp
, bam1_t
*b
)
382 bam1_core_t
*c
= &b
->core
;
383 int32_t block_len
, ret
, i
;
385 if ((ret
= bgzf_read(fp
, &block_len
, 4)) != 4) {
386 if (ret
== 0) return -1; // normal end-of-file
387 else return -2; // truncated
389 if (bgzf_read(fp
, x
, 32) != 32) return -3;
391 ed_swap_4p(&block_len
);
392 for (i
= 0; i
< 8; ++i
) ed_swap_4p(x
+ i
);
394 c
->tid
= x
[0]; c
->pos
= x
[1];
395 c
->bin
= x
[2]>>16; c
->qual
= x
[2]>>8&0xff; c
->l_qname
= x
[2]&0xff;
396 c
->l_extranul
= (c
->l_qname
%4 != 0)? (4 - c
->l_qname
%4) : 0;
397 if ((uint32_t) c
->l_qname
+ c
->l_extranul
> 255) // l_qname would overflow
399 c
->flag
= x
[3]>>16; c
->n_cigar
= x
[3]&0xffff;
401 c
->mtid
= x
[5]; c
->mpos
= x
[6]; c
->isize
= x
[7];
402 b
->l_data
= block_len
- 32 + c
->l_extranul
;
403 if (b
->l_data
< 0 || c
->l_qseq
< 0 || c
->l_qname
< 1) return -4;
404 if (((uint64_t) c
->n_cigar
<< 2) + c
->l_qname
+ c
->l_extranul
405 + (((uint64_t) c
->l_qseq
+ 1) >> 1) + c
->l_qseq
> (uint64_t) b
->l_data
)
407 if (b
->m_data
< b
->l_data
) {
409 uint32_t new_m
= b
->l_data
;
411 new_data
= (uint8_t*)realloc(b
->data
, new_m
);
417 if (bgzf_read(fp
, b
->data
, c
->l_qname
) != c
->l_qname
) return -4;
418 for (i
= 0; i
< c
->l_extranul
; ++i
) b
->data
[c
->l_qname
+i
] = '\0';
419 c
->l_qname
+= c
->l_extranul
;
420 if (b
->l_data
< c
->l_qname
||
421 bgzf_read(fp
, b
->data
+ c
->l_qname
, b
->l_data
- c
->l_qname
) != b
->l_data
- c
->l_qname
)
423 if (fp
->is_be
) swap_data(c
, b
->l_data
, b
->data
, 0);
424 return 4 + block_len
;
427 int bam_write1(BGZF
*fp
, const bam1_t
*b
)
429 const bam1_core_t
*c
= &b
->core
;
430 uint32_t x
[8], block_len
= b
->l_data
- c
->l_extranul
+ 32, y
;
432 if (c
->n_cigar
>= 65536) {
433 hts_log_error("Too many CIGAR operations (%d >= 64K for QNAME \"%s\")", c
->n_cigar
, bam_get_qname(b
));
439 x
[2] = (uint32_t)c
->bin
<<16 | c
->qual
<<8 | (c
->l_qname
- c
->l_extranul
);
440 x
[3] = (uint32_t)c
->flag
<<16 | c
->n_cigar
;
445 ok
= (bgzf_flush_try(fp
, 4 + block_len
) >= 0);
447 for (i
= 0; i
< 8; ++i
) ed_swap_4p(x
+ i
);
449 if (ok
) ok
= (bgzf_write(fp
, ed_swap_4p(&y
), 4) >= 0);
450 swap_data(c
, b
->l_data
, b
->data
, 1);
452 if (ok
) ok
= (bgzf_write(fp
, &block_len
, 4) >= 0);
454 if (ok
) ok
= (bgzf_write(fp
, x
, 32) >= 0);
455 if (ok
) ok
= (bgzf_write(fp
, b
->data
, c
->l_qname
- c
->l_extranul
) >= 0);
456 if (ok
) ok
= (bgzf_write(fp
, b
->data
+ c
->l_qname
, b
->l_data
- c
->l_qname
) >= 0);
457 if (fp
->is_be
) swap_data(c
, b
->l_data
, b
->data
, 0);
458 return ok
? 4 + block_len
: -1;
461 /********************
463 ********************/
465 static hts_idx_t
*bam_index(BGZF
*fp
, int min_shift
)
467 int n_lvls
, i
, fmt
, ret
;
471 h
= bam_hdr_read(fp
);
472 if (h
== NULL
) return NULL
;
474 int64_t max_len
= 0, s
;
475 for (i
= 0; i
< h
->n_targets
; ++i
)
476 if (max_len
< h
->target_len
[i
]) max_len
= h
->target_len
[i
];
478 for (n_lvls
= 0, s
= 1<<min_shift
; max_len
> s
; ++n_lvls
, s
<<= 3);
480 } else min_shift
= 14, n_lvls
= 5, fmt
= HTS_FMT_BAI
;
481 idx
= hts_idx_init(h
->n_targets
, fmt
, bgzf_tell(fp
), min_shift
, n_lvls
);
484 while ((ret
= bam_read1(fp
, b
)) >= 0) {
485 ret
= hts_idx_push(idx
, b
->core
.tid
, b
->core
.pos
, bam_endpos(b
), bgzf_tell(fp
), !(b
->core
.flag
&BAM_FUNMAP
));
486 if (ret
< 0) goto err
; // unsorted
488 if (ret
< -1) goto err
; // corrupted BAM file
490 hts_idx_finish(idx
, bgzf_tell(fp
));
496 hts_idx_destroy(idx
);
500 int sam_index_build3(const char *fn
, const char *fnidx
, int min_shift
, int nthreads
)
506 if ((fp
= hts_open(fn
, "r")) == 0) return -2;
508 hts_set_threads(fp
, nthreads
);
510 switch (fp
->format
.format
) {
512 ret
= cram_index_build(fp
->fp
.cram
, fn
, fnidx
);
516 idx
= bam_index(fp
->fp
.bgzf
, min_shift
);
518 ret
= hts_idx_save_as(idx
, fn
, fnidx
, (min_shift
> 0)? HTS_FMT_CSI
: HTS_FMT_BAI
);
519 if (ret
< 0) ret
= -4;
520 hts_idx_destroy(idx
);
534 int sam_index_build2(const char *fn
, const char *fnidx
, int min_shift
)
536 return sam_index_build3(fn
, fnidx
, min_shift
, 0);
539 int sam_index_build(const char *fn
, int min_shift
)
541 return sam_index_build3(fn
, NULL
, min_shift
, 0);
544 // Provide bam_index_build() symbol for binary compability with earlier HTSlib
545 #undef bam_index_build
546 int bam_index_build(const char *fn
, int min_shift
)
548 return sam_index_build2(fn
, NULL
, min_shift
);
551 static int bam_readrec(BGZF
*fp
, void *ignored
, void *bv
, int *tid
, int *beg
, int *end
)
555 if ((ret
= bam_read1(fp
, b
)) >= 0) {
558 *end
= bam_endpos(b
);
563 // This is used only with read_rest=1 iterators, so need not set tid/beg/end.
564 static int cram_readrec(BGZF
*ignored
, void *fpv
, void *bv
, int *tid
, int *beg
, int *end
)
568 return cram_get_bam_seq(fp
->fp
.cram
, &b
);
571 // This is used only with read_rest=1 iterators, so need not set tid/beg/end.
572 static int sam_bam_cram_readrec(BGZF
*bgzfp
, void *fpv
, void *bv
, int *tid
, int *beg
, int *end
)
576 switch (fp
->format
.format
) {
577 case bam
: return bam_read1(bgzfp
, b
);
578 case cram
: return cram_get_bam_seq(fp
->fp
.cram
, &b
);
580 // TODO Need headers available to implement this for SAM files
581 hts_log_error("Not implemented for SAM files");
586 hts_idx_t
*sam_index_load2(htsFile
*fp
, const char *fn
, const char *fnidx
)
588 switch (fp
->format
.format
) {
590 return fnidx
? hts_idx_load2(fn
, fnidx
) : hts_idx_load(fn
, HTS_FMT_BAI
);
593 if (cram_index_load(fp
->fp
.cram
, fn
, fnidx
) < 0) return NULL
;
594 // Cons up a fake "index" just pointing at the associated cram_fd:
595 hts_cram_idx_t
*idx
= malloc(sizeof (hts_cram_idx_t
));
596 if (idx
== NULL
) return NULL
;
597 idx
->fmt
= HTS_FMT_CRAI
;
598 idx
->cram
= fp
->fp
.cram
;
599 return (hts_idx_t
*) idx
;
603 return NULL
; // TODO Would use tbx_index_load if it returned hts_idx_t
607 hts_idx_t
*sam_index_load(htsFile
*fp
, const char *fn
)
609 return sam_index_load2(fp
, fn
, NULL
);
612 static hts_itr_t
*cram_itr_query(const hts_idx_t
*idx
, int tid
, int beg
, int end
, hts_readrec_func
*readrec
)
614 const hts_cram_idx_t
*cidx
= (const hts_cram_idx_t
*) idx
;
615 hts_itr_t
*iter
= (hts_itr_t
*) calloc(1, sizeof(hts_itr_t
));
616 if (iter
== NULL
) return NULL
;
618 // Cons up a dummy iterator for which hts_itr_next() will simply invoke
619 // the readrec function:
624 iter
->readrec
= readrec
;
626 if (tid
>= 0 || tid
== HTS_IDX_NOCOOR
) {
627 cram_range r
= { tid
== HTS_IDX_NOCOOR
? -1 : tid
, beg
+1, end
};
628 int ret
= cram_set_option(cidx
->cram
, CRAM_OPT_RANGE
, &r
);
631 // The following fields are not required by hts_itr_next(), but are
632 // filled in in case user code wants to look at them.
642 // No data vs this ref, so mark iterator as completed.
643 // Same as HTS_IDX_NONE.
661 hts_log_error("Query with tid=%d not implemented for CRAM files", tid
);
669 hts_itr_t
*sam_itr_queryi(const hts_idx_t
*idx
, int tid
, int beg
, int end
)
671 const hts_cram_idx_t
*cidx
= (const hts_cram_idx_t
*) idx
;
673 return hts_itr_query(NULL
, tid
, beg
, end
, sam_bam_cram_readrec
);
674 else if (cidx
->fmt
== HTS_FMT_CRAI
)
675 return cram_itr_query(idx
, tid
, beg
, end
, cram_readrec
);
677 return hts_itr_query(idx
, tid
, beg
, end
, bam_readrec
);
680 static int cram_name2id(void *fdv
, const char *ref
)
682 cram_fd
*fd
= (cram_fd
*) fdv
;
683 return sam_hdr_name2ref(fd
->header
, ref
);
686 hts_itr_t
*sam_itr_querys(const hts_idx_t
*idx
, bam_hdr_t
*hdr
, const char *region
)
688 const hts_cram_idx_t
*cidx
= (const hts_cram_idx_t
*) idx
;
689 if (cidx
->fmt
== HTS_FMT_CRAI
)
690 return hts_itr_querys(idx
, region
, cram_name2id
, cidx
->cram
, cram_itr_query
, cram_readrec
);
692 return hts_itr_querys(idx
, region
, (hts_name2id_f
)(bam_name2id
), hdr
, hts_itr_query
, bam_readrec
);
695 /**********************
696 *** SAM header I/O ***
697 **********************/
699 #include "htslib/kseq.h"
700 #include "htslib/kstring.h"
702 bam_hdr_t
*sam_hdr_parse(int l_text
, const char *text
)
704 const char *q
, *r
, *p
;
707 for (p
= text
; *p
; ++p
) {
708 if (strncmp(p
, "@SQ\t", 4) == 0) {
711 for (q
= p
+ 4;; ++q
) {
712 if (strncmp(q
, "SN:", 3) == 0) {
714 for (r
= q
; *r
!= '\t' && *r
!= '\n' && *r
!= '\0'; ++r
);
715 sn
= (char*)calloc(r
- q
+ 1, 1);
716 strncpy(sn
, q
, r
- q
);
718 } else if (strncmp(q
, "LN:", 3) == 0)
719 ln
= strtol(q
+ 3, (char**)&q
, 10);
720 while (*q
!= '\t' && *q
!= '\n' && *q
!= '\0') ++q
;
721 if (*q
== '\0' || *q
== '\n') break;
727 k
= kh_put(s2i
, d
, sn
, &absent
);
729 hts_log_warning("Duplicated sequence '%s'", sn
);
731 } else kh_val(d
, k
) = (int64_t)(kh_size(d
) - 1)<<32 | ln
;
734 while (*p
!= '\0' && *p
!= '\n') ++p
;
736 return hdr_from_dict(d
);
739 // Minimal sanitisation of a header to ensure.
740 // - null terminated string.
741 // - all lines start with @ (also implies no blank lines).
743 // Much more could be done, but currently is not, including:
744 // - checking header types are known (HD, SQ, etc).
745 // - syntax (eg checking tab separated fields).
746 // - validating n_targets matches @SQ records.
747 // - validating target lengths against @SQ records.
748 static bam_hdr_t
*sam_hdr_sanitise(bam_hdr_t
*h
) {
752 // Special case for empty headers.
756 uint32_t i
, lnum
= 0;
757 char *cp
= h
->text
, last
= '\n';
758 for (i
= 0; i
< h
->l_text
; i
++) {
759 // NB: l_text excludes terminating nul. This finds early ones.
763 // Error on \n[^@], including duplicate newlines
767 hts_log_error("Malformed SAM header at line %u", lnum
);
776 if (i
< h
->l_text
) { // Early nul found. Complain if not just padding.
778 while (j
< h
->l_text
&& cp
[j
] == '\0') j
++;
780 hts_log_warning("Unexpected NUL character in header. Possibly truncated");
783 // Add trailing newline and/or trailing nul if required.
785 hts_log_warning("Missing trailing newline on SAM header. Possibly truncated");
787 if (h
->l_text
== UINT32_MAX
) {
788 hts_log_error("No room for extra newline");
793 if (i
>= h
->l_text
- 1) {
794 cp
= realloc(h
->text
, (size_t) h
->l_text
+2);
803 // l_text may be larger already due to multiple nul padding
806 cp
[h
->l_text
] = '\0';
812 bam_hdr_t
*sam_hdr_read(htsFile
*fp
)
814 switch (fp
->format
.format
) {
816 return sam_hdr_sanitise(bam_hdr_read(fp
->fp
.bgzf
));
819 return sam_hdr_sanitise(cram_header_to_bam(fp
->fp
.cram
->header
));
822 kstring_t str
= { 0, 0, NULL
};
825 while ((ret
= hts_getline(fp
, KS_SEP_LINE
, &fp
->line
)) >= 0) {
826 if (fp
->line
.s
[0] != '@') break;
827 if (fp
->line
.l
> 3 && strncmp(fp
->line
.s
,"@SQ",3) == 0) has_SQ
= 1;
828 kputsn(fp
->line
.s
, fp
->line
.l
, &str
);
831 if (ret
< -1) goto error
;
832 if (! has_SQ
&& fp
->fn_aux
) {
833 kstring_t line
= { 0, 0, NULL
};
834 hFILE
*f
= hopen(fp
->fn_aux
, "r");
835 if (f
== NULL
) goto error
;
836 while (line
.l
= 0, kgetline(&line
, (kgets_func
*) hgets
, f
) >= 0) {
837 char *tab
= strchr(line
.s
, '\t');
838 if (tab
== NULL
) continue;
839 kputs("@SQ\tSN:", &str
);
840 kputsn(line
.s
, tab
- line
.s
, &str
);
841 kputs("\tLN:", &str
);
842 kputl(atol(tab
), &str
);
846 if (hclose(f
) != 0) {
847 hts_log_warning("Failed to close %s", fp
->fn_aux
);
850 if (str
.l
== 0) kputsn("", 0, &str
);
851 h
= sam_hdr_parse(str
.l
, str
.s
);
852 h
->l_text
= str
.l
; h
->text
= str
.s
;
853 return sam_hdr_sanitise(h
);
866 int sam_hdr_write(htsFile
*fp
, const bam_hdr_t
*h
)
873 switch (fp
->format
.format
) {
875 fp
->format
.category
= sequence_data
;
876 fp
->format
.format
= bam
;
879 if (bam_hdr_write(fp
->fp
.bgzf
, h
) < 0) return -1;
883 cram_fd
*fd
= fp
->fp
.cram
;
884 SAM_hdr
*hdr
= bam_header_to_cram((bam_hdr_t
*)h
);
885 if (! hdr
) return -1;
886 if (cram_set_header(fd
, hdr
) < 0) return -1;
888 cram_load_reference(fd
, fp
->fn_aux
);
889 if (cram_write_SAM_hdr(fd
, fd
->header
) < 0) return -1;
894 fp
->format
.category
= sequence_data
;
895 fp
->format
.format
= sam
;
899 hputs(h
->text
, fp
->fp
.hfile
);
900 p
= strstr(h
->text
, "@SQ\t"); // FIXME: we need a loop to make sure "@SQ\t" does not match something unwanted!!!
903 for (i
= 0; i
< h
->n_targets
; ++i
) {
905 kputsn("@SQ\tSN:", 7, &fp
->line
); kputs(h
->target_name
[i
], &fp
->line
);
906 kputsn("\tLN:", 4, &fp
->line
); kputw(h
->target_len
[i
], &fp
->line
); kputc('\n', &fp
->line
);
907 if ( hwrite(fp
->fp
.hfile
, fp
->line
.s
, fp
->line
.l
) != fp
->line
.l
) return -1;
910 if ( hflush(fp
->fp
.hfile
) != 0 ) return -1;
920 /**********************
921 *** SAM record I/O ***
922 **********************/
924 int sam_parse1(kstring_t
*s
, bam_hdr_t
*h
, bam1_t
*b
)
926 #define _read_token(_p) (_p); for (; *(_p) && *(_p) != '\t'; ++(_p)); if (*(_p) != '\t') goto err_ret; *(_p)++ = 0
927 #define _read_token_aux(_p) (_p); for (; *(_p) && *(_p) != '\t'; ++(_p)); *(_p)++ = 0 // this is different in that it does not test *(_p)=='\t'
928 #define _get_mem(type_t, _x, _s, _l) ks_resize((_s), (_s)->l + (_l)); *(_x) = (type_t*)((_s)->s + (_s)->l); (_s)->l += (_l)
929 #define _parse_err(cond, msg) do { if (cond) { hts_log_error(msg); goto err_ret; } } while (0)
930 #define _parse_err_param(cond, msg, param) do { if (cond) { hts_log_error(msg, param); goto err_ret; } } while (0)
931 #define _parse_warn(cond, msg) do { if (cond) { hts_log_warning(msg); } } while (0)
937 bam1_core_t
*c
= &b
->core
;
939 str
.l
= b
->l_data
= 0;
940 str
.s
= (char*)b
->data
; str
.m
= b
->m_data
;
942 if (h
->cigar_tab
== 0) {
943 h
->cigar_tab
= (int8_t*) malloc(128);
944 for (i
= 0; i
< 128; ++i
)
945 h
->cigar_tab
[i
] = -1;
946 for (i
= 0; BAM_CIGAR_STR
[i
]; ++i
)
947 h
->cigar_tab
[(int)BAM_CIGAR_STR
[i
]] = i
;
951 _parse_warn(p
- q
<= 1, "empty query name");
952 _parse_err(p
- q
> 252, "query name too long");
953 kputsn_(q
, p
- q
, &str
);
954 for (c
->l_extranul
= 0; str
.l
% 4 != 0; c
->l_extranul
++)
956 c
->l_qname
= p
- q
+ c
->l_extranul
;
958 c
->flag
= strtol(p
, &p
, 0);
959 if (*p
++ != '\t') goto err_ret
; // malformated flag
962 if (strcmp(q
, "*")) {
963 _parse_err(h
->n_targets
== 0, "missing SAM header");
964 c
->tid
= bam_name2id(h
, q
);
965 _parse_warn(c
->tid
< 0, "urecognized reference name; treated as unmapped");
968 c
->pos
= strtol(p
, &p
, 10) - 1;
969 if (*p
++ != '\t') goto err_ret
;
970 if (c
->pos
< 0 && c
->tid
>= 0) {
971 _parse_warn(1, "mapped query cannot have zero coordinate; treated as unmapped");
974 if (c
->tid
< 0) c
->flag
|= BAM_FUNMAP
;
976 c
->qual
= strtol(p
, &p
, 10);
977 if (*p
++ != '\t') goto err_ret
;
982 for (q
= p
; *p
&& *p
!= '\t'; ++p
)
983 if (!isdigit_c(*p
)) ++n_cigar
;
984 if (*p
++ != '\t') goto err_ret
;
985 _parse_err(n_cigar
== 0, "no CIGAR operations");
986 _parse_err(n_cigar
>= 2147483647, "too many CIGAR operations");
987 c
->n_cigar
= n_cigar
;
988 _get_mem(uint32_t, &cigar
, &str
, c
->n_cigar
* sizeof(uint32_t));
989 for (i
= 0; i
< c
->n_cigar
; ++i
, ++q
) {
991 cigar
[i
] = strtol(q
, &q
, 10)<<BAM_CIGAR_SHIFT
;
992 op
= (uint8_t)*q
>= 128? -1 : h
->cigar_tab
[(int)*q
];
993 _parse_err(op
< 0, "unrecognized CIGAR operator");
996 // can't use bam_endpos() directly as some fields not yet set up
997 i
= (!(c
->flag
&BAM_FUNMAP
))? bam_cigar2rlen(c
->n_cigar
, cigar
) : 1;
999 _parse_warn(!(c
->flag
&BAM_FUNMAP
), "mapped query must have a CIGAR; treated as unmapped");
1000 c
->flag
|= BAM_FUNMAP
;
1004 c
->bin
= hts_reg2bin(c
->pos
, c
->pos
+ i
, 14, 5);
1007 if (strcmp(q
, "=") == 0) {
1009 } else if (strcmp(q
, "*") == 0) {
1012 c
->mtid
= bam_name2id(h
, q
);
1013 _parse_warn(c
->mtid
< 0, "urecognized mate reference name; treated as unmapped");
1016 c
->mpos
= strtol(p
, &p
, 10) - 1;
1017 if (*p
++ != '\t') goto err_ret
;
1018 if (c
->mpos
< 0 && c
->mtid
>= 0) {
1019 _parse_warn(1, "mapped mate cannot have zero coordinate; treated as unmapped");
1023 c
->isize
= strtol(p
, &p
, 10);
1024 if (*p
++ != '\t') goto err_ret
;
1027 if (strcmp(q
, "*")) {
1028 c
->l_qseq
= p
- q
- 1;
1029 i
= bam_cigar2qlen(c
->n_cigar
, (uint32_t*)(str
.s
+ c
->l_qname
));
1030 _parse_err(c
->n_cigar
&& i
!= c
->l_qseq
, "CIGAR and query sequence are of different length");
1031 i
= (c
->l_qseq
+ 1) >> 1;
1032 _get_mem(uint8_t, &t
, &str
, i
);
1034 for (i
= 0; i
< c
->l_qseq
; ++i
)
1035 t
[i
>>1] |= seq_nt16_table
[(int)q
[i
]] << ((~i
&1)<<2);
1036 } else c
->l_qseq
= 0;
1038 q
= _read_token_aux(p
);
1039 _get_mem(uint8_t, &t
, &str
, c
->l_qseq
);
1040 if (strcmp(q
, "*")) {
1041 _parse_err(p
- q
- 1 != c
->l_qseq
, "SEQ and QUAL are of different length");
1042 for (i
= 0; i
< c
->l_qseq
; ++i
) t
[i
] = q
[i
] - 33;
1043 } else memset(t
, 0xff, c
->l_qseq
);
1045 while (p
< s
->s
+ s
->l
) {
1047 q
= _read_token_aux(p
); // FIXME: can be accelerated for long 'B' arrays
1048 _parse_err(p
- q
- 1 < 5, "incomplete aux field");
1049 kputsn_(q
, 2, &str
);
1050 q
+= 3; type
= *q
++; ++q
; // q points to value
1051 if (type
!= 'Z' && type
!= 'H') // the only zero length acceptable fields
1052 _parse_err(p
- q
- 1 < 1, "incomplete aux field");
1054 // Ensure str has enough space for a double + type allocated.
1055 // This is so we can stuff bigger integers and floats directly into
1056 // the kstring. Sorry.
1057 _parse_err(ks_resize(&str
, str
.l
+ 16), "out of memory");
1059 if (type
== 'A' || type
== 'a' || type
== 'c' || type
== 'C') {
1062 } else if (type
== 'i' || type
== 'I') {
1064 long x
= strtol(q
, &q
, 10);
1065 if (x
>= INT8_MIN
) {
1066 kputc_('c', &str
); kputc_(x
, &str
);
1067 } else if (x
>= INT16_MIN
) {
1068 str
.s
[str
.l
++] = 's';
1069 i16_to_le(x
, (uint8_t *) str
.s
+ str
.l
);
1072 str
.s
[str
.l
++] = 'i';
1073 i32_to_le(x
, (uint8_t *) str
.s
+ str
.l
);
1077 unsigned long x
= strtoul(q
, &q
, 10);
1078 if (x
<= UINT8_MAX
) {
1079 kputc_('C', &str
); kputc_(x
, &str
);
1080 } else if (x
<= UINT16_MAX
) {
1081 str
.s
[str
.l
++] = 'S';
1082 u16_to_le(x
, (uint8_t *) str
.s
+ str
.l
);
1085 str
.s
[str
.l
++] = 'I';
1086 u32_to_le(x
, (uint8_t *) str
.s
+ str
.l
);
1090 } else if (type
== 'f') {
1091 str
.s
[str
.l
++] = 'f';
1092 float_to_le(strtod(q
, &q
), (uint8_t *) str
.s
+ str
.l
);
1093 str
.l
+= sizeof(float);
1094 } else if (type
== 'd') {
1095 str
.s
[str
.l
++] = 'd';
1096 double_to_le(strtod(q
, &q
), (uint8_t *) str
.s
+ str
.l
);
1097 str
.l
+= sizeof(double);
1098 } else if (type
== 'Z' || type
== 'H') {
1099 _parse_err(type
== 'H' && !((p
-q
)&1),
1100 "hex field does not have an even number of digits");
1101 kputc_(type
, &str
);kputsn_(q
, p
- q
, &str
); // note that this include the trailing NULL
1102 } else if (type
== 'B') {
1106 _parse_err(p
- q
- 1 < 3, "incomplete B-typed aux field");
1107 type
= *q
++; // q points to the first ',' following the typing byte
1109 size
= aux_type2size(type
);
1110 _parse_err_param(size
<= 0 || size
> 4,
1111 "unrecognized type B:%c", type
);
1112 _parse_err(*q
&& *q
!= ',', "B aux field type not followed by ','");
1114 for (r
= q
, n
= 0; *r
; ++r
)
1117 // Ensure space for type + values
1118 bytes
= (size_t) n
* (size_t) size
;
1119 _parse_err(bytes
/ size
!= n
1120 || ks_resize(&str
, str
.l
+ bytes
+ 2 + sizeof(uint32_t)),
1122 str
.s
[str
.l
++] = 'B';
1123 str
.s
[str
.l
++] = type
;
1124 i32_to_le(n
, (uint8_t *) str
.s
+ str
.l
);
1125 str
.l
+= sizeof(uint32_t);
1127 // This ensures that q always ends up at the next comma after
1128 // reading a number even if it's followed by junk. It
1129 // prevents the possibility of trying to read more than n items.
1130 #define _skip_to_comma(q, p) do { while ((q) < (p) && *(q) != ',') (q)++; } while (0)
1132 if (type
== 'c') while (q
+ 1 < p
) { int8_t x
= strtol(q
+ 1, &q
, 0); kputc_(x
, &str
); }
1133 else if (type
== 'C') while (q
+ 1 < p
) { uint8_t x
= strtoul(q
+ 1, &q
, 0); kputc_(x
, &str
); }
1134 else if (type
== 's') while (q
+ 1 < p
) { i16_to_le(strtol(q
+ 1, &q
, 0), (uint8_t *) str
.s
+ str
.l
); str
.l
+= 2; _skip_to_comma(q
, p
); }
1135 else if (type
== 'S') while (q
+ 1 < p
) { u16_to_le(strtoul(q
+ 1, &q
, 0), (uint8_t *) str
.s
+ str
.l
); str
.l
+= 2; _skip_to_comma(q
, p
); }
1136 else if (type
== 'i') while (q
+ 1 < p
) { i32_to_le(strtol(q
+ 1, &q
, 0), (uint8_t *) str
.s
+ str
.l
); str
.l
+= 4; _skip_to_comma(q
, p
); }
1137 else if (type
== 'I') while (q
+ 1 < p
) { u32_to_le(strtoul(q
+ 1, &q
, 0), (uint8_t *) str
.s
+ str
.l
); str
.l
+= 4; _skip_to_comma(q
, p
); }
1138 else if (type
== 'f') while (q
+ 1 < p
) { float_to_le(strtod(q
+ 1, &q
), (uint8_t *) str
.s
+ str
.l
); str
.l
+= 4; _skip_to_comma(q
, p
); }
1139 else _parse_err_param(1, "unrecognized type B:%c", type
);
1141 #undef _skip_to_comma
1143 } else _parse_err_param(1, "unrecognized type %c", type
);
1145 b
->data
= (uint8_t*)str
.s
; b
->l_data
= str
.l
; b
->m_data
= str
.m
;
1150 #undef _parse_err_param
1152 #undef _read_token_aux
1155 b
->data
= (uint8_t*)str
.s
; b
->l_data
= str
.l
; b
->m_data
= str
.m
;
1159 int sam_read1(htsFile
*fp
, bam_hdr_t
*h
, bam1_t
*b
)
1161 switch (fp
->format
.format
) {
1163 int r
= bam_read1(fp
->fp
.bgzf
, b
);
1165 if (b
->core
.tid
>= h
->n_targets
|| b
->core
.tid
< -1 ||
1166 b
->core
.mtid
>= h
->n_targets
|| b
->core
.mtid
< -1)
1173 int ret
= cram_get_bam_seq(fp
->fp
.cram
, &b
);
1176 : (cram_eof(fp
->fp
.cram
) ? -1 : -2);
1182 if (fp
->line
.l
== 0) {
1183 ret
= hts_getline(fp
, KS_SEP_LINE
, &fp
->line
);
1184 if (ret
< 0) return ret
;
1186 ret
= sam_parse1(&fp
->line
, h
, b
);
1189 hts_log_warning("Parse error at line %lld", (long long)fp
->lineno
);
1190 if (h
->ignore_sam_err
) goto err_recover
;
1200 int sam_format1(const bam_hdr_t
*h
, const bam1_t
*b
, kstring_t
*str
)
1204 const bam1_core_t
*c
= &b
->core
;
1207 kputsn(bam_get_qname(b
), c
->l_qname
-1-c
->l_extranul
, str
); kputc('\t', str
); // query name
1208 kputw(c
->flag
, str
); kputc('\t', str
); // flag
1209 if (c
->tid
>= 0) { // chr
1210 kputs(h
->target_name
[c
->tid
] , str
);
1212 } else kputsn("*\t", 2, str
);
1213 kputw(c
->pos
+ 1, str
); kputc('\t', str
); // pos
1214 kputw(c
->qual
, str
); kputc('\t', str
); // qual
1215 if (c
->n_cigar
) { // cigar
1216 uint32_t *cigar
= bam_get_cigar(b
);
1217 for (i
= 0; i
< c
->n_cigar
; ++i
) {
1218 kputw(bam_cigar_oplen(cigar
[i
]), str
);
1219 kputc(bam_cigar_opchr(cigar
[i
]), str
);
1221 } else kputc('*', str
);
1223 if (c
->mtid
< 0) kputsn("*\t", 2, str
); // mate chr
1224 else if (c
->mtid
== c
->tid
) kputsn("=\t", 2, str
);
1226 kputs(h
->target_name
[c
->mtid
], str
);
1229 kputw(c
->mpos
+ 1, str
); kputc('\t', str
); // mate pos
1230 kputw(c
->isize
, str
); kputc('\t', str
); // template len
1231 if (c
->l_qseq
) { // seq and qual
1232 uint8_t *s
= bam_get_seq(b
);
1233 for (i
= 0; i
< c
->l_qseq
; ++i
) kputc("=ACMGRSVTWYHKDBN"[bam_seqi(s
, i
)], str
);
1235 s
= bam_get_qual(b
);
1236 if (s
[0] == 0xff) kputc('*', str
);
1237 else for (i
= 0; i
< c
->l_qseq
; ++i
) kputc(s
[i
] + 33, str
);
1238 } else kputsn("*\t*", 3, str
);
1240 s
= bam_get_aux(b
); // aux
1241 end
= b
->data
+ b
->l_data
;
1242 while (end
- s
>= 4) {
1243 uint8_t type
, key
[2];
1244 key
[0] = s
[0]; key
[1] = s
[1];
1245 s
+= 2; type
= *s
++;
1246 kputc('\t', str
); kputsn((char*)key
, 2, str
); kputc(':', str
);
1248 kputsn("A:", 2, str
);
1251 } else if (type
== 'C') {
1252 kputsn("i:", 2, str
);
1255 } else if (type
== 'c') {
1256 kputsn("i:", 2, str
);
1257 kputw(*(int8_t*)s
, str
);
1259 } else if (type
== 'S') {
1261 kputsn("i:", 2, str
);
1262 kputuw(le_to_u16(s
), str
);
1264 } else goto bad_aux
;
1265 } else if (type
== 's') {
1267 kputsn("i:", 2, str
);
1268 kputw(le_to_i16(s
), str
);
1270 } else goto bad_aux
;
1271 } else if (type
== 'I') {
1273 kputsn("i:", 2, str
);
1274 kputuw(le_to_u32(s
), str
);
1276 } else goto bad_aux
;
1277 } else if (type
== 'i') {
1279 kputsn("i:", 2, str
);
1280 kputw(le_to_i32(s
), str
);
1282 } else goto bad_aux
;
1283 } else if (type
== 'f') {
1285 ksprintf(str
, "f:%g", le_to_float(s
));
1287 } else goto bad_aux
;
1289 } else if (type
== 'd') {
1291 ksprintf(str
, "d:%g", le_to_double(s
));
1293 } else goto bad_aux
;
1294 } else if (type
== 'Z' || type
== 'H') {
1295 kputc(type
, str
); kputc(':', str
);
1296 while (s
< end
&& *s
) kputc(*s
++, str
);
1300 } else if (type
== 'B') {
1301 uint8_t sub_type
= *(s
++);
1302 int sub_type_size
= aux_type2size(sub_type
);
1304 if (sub_type_size
== 0 || end
- s
< 4)
1307 s
+= 4; // now points to the start of the array
1308 if ((end
- s
) / sub_type_size
< n
)
1310 kputsn("B:", 2, str
); kputc(sub_type
, str
); // write the typing
1311 for (i
= 0; i
< n
; ++i
) { // FIXME: for better performance, put the loop after "if"
1313 if ('c' == sub_type
) { kputw(*(int8_t*)s
, str
); ++s
; }
1314 else if ('C' == sub_type
) { kputw(*(uint8_t*)s
, str
); ++s
; }
1315 else if ('s' == sub_type
) { kputw(le_to_i16(s
), str
); s
+= 2; }
1316 else if ('S' == sub_type
) { kputw(le_to_u16(s
), str
); s
+= 2; }
1317 else if ('i' == sub_type
) { kputw(le_to_i32(s
), str
); s
+= 4; }
1318 else if ('I' == sub_type
) { kputuw(le_to_u32(s
), str
); s
+= 4; }
1319 else if ('f' == sub_type
) { kputd(le_to_float(s
), str
); s
+= 4; }
1320 else goto bad_aux
; // Unknown sub-type
1322 } else { // Unknown type
1329 hts_log_error("Corrupted aux data for read %.*s",
1330 b
->core
.l_qname
, bam_get_qname(b
));
1335 int sam_write1(htsFile
*fp
, const bam_hdr_t
*h
, const bam1_t
*b
)
1337 switch (fp
->format
.format
) {
1339 fp
->format
.category
= sequence_data
;
1340 fp
->format
.format
= bam
;
1343 return bam_write1(fp
->fp
.bgzf
, b
);
1346 return cram_put_bam_seq(fp
->fp
.cram
, (bam1_t
*)b
);
1349 fp
->format
.category
= sequence_data
;
1350 fp
->format
.format
= sam
;
1353 if (sam_format1(h
, b
, &fp
->line
) < 0) return -1;
1354 kputc('\n', &fp
->line
);
1355 if ( hwrite(fp
->fp
.hfile
, fp
->line
.s
, fp
->line
.l
) != fp
->line
.l
) return -1;
1363 /************************
1364 *** Auxiliary fields ***
1365 ************************/
1366 #ifndef HTS_LITTLE_ENDIAN
1367 static int aux_to_le(char type
, uint8_t *out
, const uint8_t *in
, size_t len
) {
1368 int tsz
= aux_type2size(type
);
1370 if (tsz
>= 2 && tsz
<= 8 && (len
& (tsz
- 1)) != 0) return -1;
1373 case 'H': case 'Z': case 1: // Trivial
1374 memcpy(out
, in
, len
);
1377 #define aux_val_to_le(type_t, store_le) do { \
1380 for (i = 0; i < len; i += sizeof(type_t), out += sizeof(type_t)) { \
1381 memcpy(&v, in + i, sizeof(type_t)); \
1386 case 2: aux_val_to_le(uint16_t, u16_to_le
); break;
1387 case 4: aux_val_to_le(uint32_t, u32_to_le
); break;
1388 case 8: aux_val_to_le(uint64_t, u64_to_le
); break;
1390 #undef aux_val_to_le
1392 case 'B': { // Recurse!
1394 if (len
< 5) return -1;
1395 memcpy(&n
, in
+ 1, 4);
1397 u32_to_le(n
, out
+ 1);
1398 return aux_to_le(in
[0], out
+ 5, in
+ 5, len
- 5);
1401 default: // Unknown type code
1411 int bam_aux_append(bam1_t
*b
, const char tag
[2], char type
, int len
, const uint8_t *data
)
1415 assert(b
->l_data
>= 0);
1416 new_len
= b
->l_data
+ 3 + len
;
1417 if (new_len
> INT32_MAX
|| new_len
< b
->l_data
) goto nomem
;
1419 if (b
->m_data
< new_len
) {
1420 uint32_t new_size
= new_len
;
1422 kroundup32(new_size
);
1423 new_data
= realloc(b
->data
, new_size
);
1424 if (new_data
== NULL
) goto nomem
;
1425 b
->m_data
= new_size
;
1429 b
->data
[b
->l_data
] = tag
[0];
1430 b
->data
[b
->l_data
+ 1] = tag
[1];
1431 b
->data
[b
->l_data
+ 2] = type
;
1433 #ifdef HTS_LITTLE_ENDIAN
1434 memcpy(b
->data
+ b
->l_data
+ 3, data
, len
);
1436 if (aux_to_le(type
, b
->data
+ b
->l_data
+ 3, data
, len
) != 0) {
1442 b
->l_data
= new_len
;
1451 static inline uint8_t *skip_aux(uint8_t *s
, uint8_t *end
)
1455 if (s
>= end
) return end
;
1456 size
= aux_type2size(*s
); ++s
; // skip type
1460 while (*s
&& s
< end
) ++s
;
1461 return s
< end
? s
+ 1 : end
;
1463 if (end
- s
< 5) return NULL
;
1464 size
= aux_type2size(*s
); ++s
;
1467 if (size
== 0 || end
- s
< size
* n
) return NULL
;
1468 return s
+ size
* n
;
1472 if (end
- s
< size
) return NULL
;
1477 uint8_t *bam_aux_get(const bam1_t
*b
, const char tag
[2])
1479 uint8_t *s
, *end
, *t
= (uint8_t *) tag
;
1480 uint16_t y
= (uint16_t) t
[0]<<8 | t
[1];
1482 end
= b
->data
+ b
->l_data
;
1483 while (s
!= NULL
&& end
- s
>= 3) {
1484 uint16_t x
= (uint16_t) s
[0]<<8 | s
[1];
1487 // Check the tag value is valid and complete
1488 uint8_t *e
= skip_aux(s
, end
);
1489 if ((*s
== 'Z' || *s
== 'H') && *(e
- 1) != '\0') {
1490 goto bad_aux
; // Unterminated string
1498 s
= skip_aux(s
, end
);
1500 if (s
== NULL
) goto bad_aux
;
1505 hts_log_error("Corrupted aux data for read %s", bam_get_qname(b
));
1509 // s MUST BE returned by bam_aux_get()
1510 int bam_aux_del(bam1_t
*b
, uint8_t *s
)
1513 int l_aux
= bam_get_l_aux(b
);
1514 aux
= bam_get_aux(b
);
1516 s
= skip_aux(s
, aux
+ l_aux
);
1517 if (s
== NULL
) goto bad_aux
;
1518 memmove(p
, s
, l_aux
- (s
- aux
));
1523 hts_log_error("Corrupted aux data for read %s", bam_get_qname(b
));
1528 int bam_aux_update_str(bam1_t
*b
, const char tag
[2], int len
, const char *data
)
1530 // FIXME: This is not at all efficient!
1531 uint8_t *s
= bam_aux_get(b
,tag
);
1533 if (errno
== ENOENT
) { // Tag doesn't exist - add a new one
1534 return bam_aux_append(b
, tag
, 'Z', len
, (const uint8_t *) data
);
1535 } else { // Invalid aux data, give up.
1541 hts_log_error("Called bam_aux_update_str for type '%c' instead of 'Z'", type
);
1548 int l_aux
= bam_get_l_aux(b
);
1550 b
->l_data
+= 3 + len
;
1551 if (b
->m_data
< b
->l_data
) {
1552 ptrdiff_t s_offset
= s
- b
->data
;
1553 b
->m_data
= b
->l_data
;
1554 kroundup32(b
->m_data
);
1555 b
->data
= (uint8_t*)realloc(b
->data
, b
->m_data
);
1556 s
= b
->data
+ s_offset
;
1558 memmove(s
+3+len
, s
, l_aux
- (s
- bam_get_aux(b
)));
1562 memmove(s
+3,data
,len
);
1566 static inline int64_t get_int_aux_val(uint8_t type
, const uint8_t *s
,
1570 case 'c': return le_to_i8(s
+ idx
);
1571 case 'C': return s
[idx
];
1572 case 's': return le_to_i16(s
+ 2 * idx
);
1573 case 'S': return le_to_u16(s
+ 2 * idx
);
1574 case 'i': return le_to_i32(s
+ 4 * idx
);
1575 case 'I': return le_to_u32(s
+ 4 * idx
);
1582 int64_t bam_aux2i(const uint8_t *s
)
1586 return get_int_aux_val(type
, s
, 0);
1589 double bam_aux2f(const uint8_t *s
)
1593 if (type
== 'd') return le_to_double(s
);
1594 else if (type
== 'f') return le_to_float(s
);
1595 else return get_int_aux_val(type
, s
, 0);
1598 char bam_aux2A(const uint8_t *s
)
1602 if (type
== 'A') return *(char*)s
;
1607 char *bam_aux2Z(const uint8_t *s
)
1611 if (type
== 'Z' || type
== 'H') return (char*)s
;
1616 uint32_t bam_auxB_len(const uint8_t *s
)
1622 return le_to_u32(s
+ 2);
1625 int64_t bam_auxB2i(const uint8_t *s
, uint32_t idx
)
1627 uint32_t len
= bam_auxB_len(s
);
1632 return get_int_aux_val(s
[1], s
+ 6, idx
);
1635 double bam_auxB2f(const uint8_t *s
, uint32_t idx
)
1637 uint32_t len
= bam_auxB_len(s
);
1642 if (s
[1] == 'f') return le_to_float(s
+ 6 + 4 * idx
);
1643 else return get_int_aux_val(s
[1], s
+ 6, idx
);
1646 int sam_open_mode(char *mode
, const char *fn
, const char *format
)
1648 // TODO Parse "bam5" etc for compression level
1649 if (format
== NULL
) {
1650 // Try to pick a format based on the filename extension
1651 const char *ext
= fn
? strrchr(fn
, '.') : NULL
;
1652 if (ext
== NULL
|| strchr(ext
, '/')) return -1;
1653 return sam_open_mode(mode
, fn
, ext
+1);
1655 else if (strcmp(format
, "bam") == 0) strcpy(mode
, "b");
1656 else if (strcmp(format
, "cram") == 0) strcpy(mode
, "c");
1657 else if (strcmp(format
, "sam") == 0) strcpy(mode
, "");
1663 // A version of sam_open_mode that can handle ,key=value options.
1664 // The format string is allocated and returned, to be freed by the caller.
1665 // Prefix should be "r" or "w",
1666 char *sam_open_mode_opts(const char *fn
,
1670 char *mode_opts
= malloc((format
? strlen(format
) : 1) +
1671 (mode
? strlen(mode
) : 1) + 12);
1678 strcpy(mode_opts
, mode
? mode
: "r");
1679 cp
= mode_opts
+ strlen(mode_opts
);
1681 if (format
== NULL
) {
1682 // Try to pick a format based on the filename extension
1683 const char *ext
= fn
? strrchr(fn
, '.') : NULL
;
1684 if (ext
== NULL
|| strchr(ext
, '/')) {
1688 return sam_open_mode(cp
, fn
, ext
+1)
1689 ? (free(mode_opts
), NULL
)
1693 if ((opts
= strchr(format
, ','))) {
1694 format_len
= opts
-format
;
1697 format_len
= strlen(format
);
1700 if (strncmp(format
, "bam", format_len
) == 0) {
1702 } else if (strncmp(format
, "cram", format_len
) == 0) {
1704 } else if (strncmp(format
, "cram2", format_len
) == 0) {
1706 strcpy(cp
, ",VERSION=2.1");
1708 } else if (strncmp(format
, "cram3", format_len
) == 0) {
1710 strcpy(cp
, ",VERSION=3.0");
1712 } else if (strncmp(format
, "sam", format_len
) == 0) {
1724 #define STRNCMP(a,b,n) (strncasecmp((a),(b),(n)) || strlen(a)!=(n))
1725 int bam_str2flag(const char *str
)
1727 char *end
, *beg
= (char*) str
;
1728 long int flag
= strtol(str
, &end
, 0);
1729 if ( end
!=str
) return flag
; // the conversion was successful
1734 while ( *end
&& *end
!=',' ) end
++;
1735 if ( !STRNCMP("PAIRED",beg
,end
-beg
) ) flag
|= BAM_FPAIRED
;
1736 else if ( !STRNCMP("PROPER_PAIR",beg
,end
-beg
) ) flag
|= BAM_FPROPER_PAIR
;
1737 else if ( !STRNCMP("UNMAP",beg
,end
-beg
) ) flag
|= BAM_FUNMAP
;
1738 else if ( !STRNCMP("MUNMAP",beg
,end
-beg
) ) flag
|= BAM_FMUNMAP
;
1739 else if ( !STRNCMP("REVERSE",beg
,end
-beg
) ) flag
|= BAM_FREVERSE
;
1740 else if ( !STRNCMP("MREVERSE",beg
,end
-beg
) ) flag
|= BAM_FMREVERSE
;
1741 else if ( !STRNCMP("READ1",beg
,end
-beg
) ) flag
|= BAM_FREAD1
;
1742 else if ( !STRNCMP("READ2",beg
,end
-beg
) ) flag
|= BAM_FREAD2
;
1743 else if ( !STRNCMP("SECONDARY",beg
,end
-beg
) ) flag
|= BAM_FSECONDARY
;
1744 else if ( !STRNCMP("QCFAIL",beg
,end
-beg
) ) flag
|= BAM_FQCFAIL
;
1745 else if ( !STRNCMP("DUP",beg
,end
-beg
) ) flag
|= BAM_FDUP
;
1746 else if ( !STRNCMP("SUPPLEMENTARY",beg
,end
-beg
) ) flag
|= BAM_FSUPPLEMENTARY
;
1754 char *bam_flag2str(int flag
)
1756 kstring_t str
= {0,0,0};
1757 if ( flag
&BAM_FPAIRED
) ksprintf(&str
,"%s%s", str
.l
?",":"","PAIRED");
1758 if ( flag
&BAM_FPROPER_PAIR
) ksprintf(&str
,"%s%s", str
.l
?",":"","PROPER_PAIR");
1759 if ( flag
&BAM_FUNMAP
) ksprintf(&str
,"%s%s", str
.l
?",":"","UNMAP");
1760 if ( flag
&BAM_FMUNMAP
) ksprintf(&str
,"%s%s", str
.l
?",":"","MUNMAP");
1761 if ( flag
&BAM_FREVERSE
) ksprintf(&str
,"%s%s", str
.l
?",":"","REVERSE");
1762 if ( flag
&BAM_FMREVERSE
) ksprintf(&str
,"%s%s", str
.l
?",":"","MREVERSE");
1763 if ( flag
&BAM_FREAD1
) ksprintf(&str
,"%s%s", str
.l
?",":"","READ1");
1764 if ( flag
&BAM_FREAD2
) ksprintf(&str
,"%s%s", str
.l
?",":"","READ2");
1765 if ( flag
&BAM_FSECONDARY
) ksprintf(&str
,"%s%s", str
.l
?",":"","SECONDARY");
1766 if ( flag
&BAM_FQCFAIL
) ksprintf(&str
,"%s%s", str
.l
?",":"","QCFAIL");
1767 if ( flag
&BAM_FDUP
) ksprintf(&str
,"%s%s", str
.l
?",":"","DUP");
1768 if ( flag
&BAM_FSUPPLEMENTARY
) ksprintf(&str
,"%s%s", str
.l
?",":"","SUPPLEMENTARY");
1769 if ( str
.l
== 0 ) kputsn("", 0, &str
);
1774 /**************************
1775 *** Pileup and Mpileup ***
1776 **************************/
1778 #if !defined(BAM_NO_PILEUP)
1782 /*******************
1784 *******************/
1790 static cstate_t g_cstate_null
= { -1, 0, 0, 0 };
1792 typedef struct __linkbuf_t
{
1796 struct __linkbuf_t
*next
;
1805 static mempool_t
*mp_init(void)
1808 mp
= (mempool_t
*)calloc(1, sizeof(mempool_t
));
1811 static void mp_destroy(mempool_t
*mp
)
1814 for (k
= 0; k
< mp
->n
; ++k
) {
1815 free(mp
->buf
[k
]->b
.data
);
1821 static inline lbnode_t
*mp_alloc(mempool_t
*mp
)
1824 if (mp
->n
== 0) return (lbnode_t
*)calloc(1, sizeof(lbnode_t
));
1825 else return mp
->buf
[--mp
->n
];
1827 static inline void mp_free(mempool_t
*mp
, lbnode_t
*p
)
1829 --mp
->cnt
; p
->next
= 0; // clear lbnode_t::next here
1830 if (mp
->n
== mp
->max
) {
1831 mp
->max
= mp
->max
? mp
->max
<<1 : 256;
1832 mp
->buf
= (lbnode_t
**)realloc(mp
->buf
, sizeof(lbnode_t
*) * mp
->max
);
1834 mp
->buf
[mp
->n
++] = p
;
1837 /**********************
1838 *** CIGAR resolver ***
1839 **********************/
1841 /* s->k: the index of the CIGAR operator that has just been processed.
1842 s->x: the reference coordinate of the start of s->k
1843 s->y: the query coordiante of the start of s->k
1845 static inline int resolve_cigar2(bam_pileup1_t
*p
, int32_t pos
, cstate_t
*s
)
1847 #define _cop(c) ((c)&BAM_CIGAR_MASK)
1848 #define _cln(c) ((c)>>BAM_CIGAR_SHIFT)
1851 bam1_core_t
*c
= &b
->core
;
1852 uint32_t *cigar
= bam_get_cigar(b
);
1854 // determine the current CIGAR operation
1855 // fprintf(stderr, "%s\tpos=%d\tend=%d\t(%d,%d,%d)\n", bam_get_qname(b), pos, s->end, s->k, s->x, s->y);
1856 if (s
->k
== -1) { // never processed
1857 if (c
->n_cigar
== 1) { // just one operation, save a loop
1858 if (_cop(cigar
[0]) == BAM_CMATCH
|| _cop(cigar
[0]) == BAM_CEQUAL
|| _cop(cigar
[0]) == BAM_CDIFF
) s
->k
= 0, s
->x
= c
->pos
, s
->y
= 0;
1859 } else { // find the first match or deletion
1860 for (k
= 0, s
->x
= c
->pos
, s
->y
= 0; k
< c
->n_cigar
; ++k
) {
1861 int op
= _cop(cigar
[k
]);
1862 int l
= _cln(cigar
[k
]);
1863 if (op
== BAM_CMATCH
|| op
== BAM_CDEL
|| op
== BAM_CEQUAL
|| op
== BAM_CDIFF
) break;
1864 else if (op
== BAM_CREF_SKIP
) s
->x
+= l
;
1865 else if (op
== BAM_CINS
|| op
== BAM_CSOFT_CLIP
) s
->y
+= l
;
1867 assert(k
< c
->n_cigar
);
1870 } else { // the read has been processed before
1871 int op
, l
= _cln(cigar
[s
->k
]);
1872 if (pos
- s
->x
>= l
) { // jump to the next operation
1873 assert(s
->k
< c
->n_cigar
); // otherwise a bug: this function should not be called in this case
1874 op
= _cop(cigar
[s
->k
+1]);
1875 if (op
== BAM_CMATCH
|| op
== BAM_CDEL
|| op
== BAM_CREF_SKIP
|| op
== BAM_CEQUAL
|| op
== BAM_CDIFF
) { // jump to the next without a loop
1876 if (_cop(cigar
[s
->k
]) == BAM_CMATCH
|| _cop(cigar
[s
->k
]) == BAM_CEQUAL
|| _cop(cigar
[s
->k
]) == BAM_CDIFF
) s
->y
+= l
;
1879 } else { // find the next M/D/N/=/X
1880 if (_cop(cigar
[s
->k
]) == BAM_CMATCH
|| _cop(cigar
[s
->k
]) == BAM_CEQUAL
|| _cop(cigar
[s
->k
]) == BAM_CDIFF
) s
->y
+= l
;
1882 for (k
= s
->k
+ 1; k
< c
->n_cigar
; ++k
) {
1883 op
= _cop(cigar
[k
]), l
= _cln(cigar
[k
]);
1884 if (op
== BAM_CMATCH
|| op
== BAM_CDEL
|| op
== BAM_CREF_SKIP
|| op
== BAM_CEQUAL
|| op
== BAM_CDIFF
) break;
1885 else if (op
== BAM_CINS
|| op
== BAM_CSOFT_CLIP
) s
->y
+= l
;
1889 assert(s
->k
< c
->n_cigar
); // otherwise a bug
1890 } // else, do nothing
1892 { // collect pileup information
1894 op
= _cop(cigar
[s
->k
]); l
= _cln(cigar
[s
->k
]);
1895 p
->is_del
= p
->indel
= p
->is_refskip
= 0;
1896 if (s
->x
+ l
- 1 == pos
&& s
->k
+ 1 < c
->n_cigar
) { // peek the next operation
1897 int op2
= _cop(cigar
[s
->k
+1]);
1898 int l2
= _cln(cigar
[s
->k
+1]);
1899 if (op2
== BAM_CDEL
) p
->indel
= -(int)l2
;
1900 else if (op2
== BAM_CINS
) p
->indel
= l2
;
1901 else if (op2
== BAM_CPAD
&& s
->k
+ 2 < c
->n_cigar
) { // no working for adjacent padding
1903 for (k
= s
->k
+ 2; k
< c
->n_cigar
; ++k
) {
1904 op2
= _cop(cigar
[k
]); l2
= _cln(cigar
[k
]);
1905 if (op2
== BAM_CINS
) l3
+= l2
;
1906 else if (op2
== BAM_CDEL
|| op2
== BAM_CMATCH
|| op2
== BAM_CREF_SKIP
|| op2
== BAM_CEQUAL
|| op2
== BAM_CDIFF
) break;
1908 if (l3
> 0) p
->indel
= l3
;
1911 if (op
== BAM_CMATCH
|| op
== BAM_CEQUAL
|| op
== BAM_CDIFF
) {
1912 p
->qpos
= s
->y
+ (pos
- s
->x
);
1913 } else if (op
== BAM_CDEL
|| op
== BAM_CREF_SKIP
) {
1914 p
->is_del
= 1; p
->qpos
= s
->y
; // FIXME: distinguish D and N!!!!!
1915 p
->is_refskip
= (op
== BAM_CREF_SKIP
);
1916 } // cannot be other operations; otherwise a bug
1917 p
->is_head
= (pos
== c
->pos
); p
->is_tail
= (pos
== s
->end
);
1922 /***********************
1923 *** Pileup iterator ***
1924 ***********************/
1926 // Dictionary of overlapping reads
1927 KHASH_MAP_INIT_STR(olap_hash
, lbnode_t
*)
1928 typedef khash_t(olap_hash
) olap_hash_t
;
1930 struct __bam_plp_t
{
1932 lbnode_t
*head
, *tail
;
1933 int32_t tid
, pos
, max_tid
, max_pos
;
1934 int is_eof
, max_plp
, error
, maxcnt
;
1937 // for the "auto" interface only
1939 bam_plp_auto_f func
;
1941 olap_hash_t
*overlaps
;
1943 // For notification of creation and destruction events
1944 // and associated client-owned pointer.
1945 int (*plp_construct
)(void *data
, const bam1_t
*b
, bam_pileup_cd
*cd
);
1946 int (*plp_destruct
)(void *data
, const bam1_t
*b
, bam_pileup_cd
*cd
);
1949 bam_plp_t
bam_plp_init(bam_plp_auto_f func
, void *data
)
1952 iter
= (bam_plp_t
)calloc(1, sizeof(struct __bam_plp_t
));
1953 iter
->mp
= mp_init();
1954 iter
->head
= iter
->tail
= mp_alloc(iter
->mp
);
1955 iter
->max_tid
= iter
->max_pos
= -1;
1956 iter
->maxcnt
= 8000;
1960 iter
->b
= bam_init1();
1965 void bam_plp_init_overlaps(bam_plp_t iter
)
1967 iter
->overlaps
= kh_init(olap_hash
); // hash for tweaking quality of bases in overlapping reads
1970 void bam_plp_destroy(bam_plp_t iter
)
1972 lbnode_t
*p
, *pnext
;
1973 if ( iter
->overlaps
) kh_destroy(olap_hash
, iter
->overlaps
);
1974 for (p
= iter
->head
; p
!= NULL
; p
= pnext
) {
1976 mp_free(iter
->mp
, p
);
1978 mp_destroy(iter
->mp
);
1979 if (iter
->b
) bam_destroy1(iter
->b
);
1984 void bam_plp_constructor(bam_plp_t plp
,
1985 int (*func
)(void *data
, const bam1_t
*b
, bam_pileup_cd
*cd
)) {
1986 plp
->plp_construct
= func
;
1989 void bam_plp_destructor(bam_plp_t plp
,
1990 int (*func
)(void *data
, const bam1_t
*b
, bam_pileup_cd
*cd
)) {
1991 plp
->plp_destruct
= func
;
1994 //---------------------------------
1995 //--- Tweak overlapping reads
1996 //---------------------------------
1999 * cigar_iref2iseq_set() - find the first CMATCH setting the ref and the read index
2000 * cigar_iref2iseq_next() - get the next CMATCH base
2001 * @cigar: pointer to current cigar block (rw)
2002 * @cigar_max: pointer just beyond the last cigar block
2003 * @icig: position within the current cigar block (rw)
2004 * @iseq: position in the sequence (rw)
2005 * @iref: position with respect to the beginning of the read (iref_pos - b->core.pos) (rw)
2007 * Returns BAM_CMATCH or -1 when there is no more cigar to process or the requested position is not covered.
2009 static inline int cigar_iref2iseq_set(uint32_t **cigar
, uint32_t *cigar_max
, int *icig
, int *iseq
, int *iref
)
2012 if ( pos
< 0 ) return -1;
2016 while ( *cigar
<cigar_max
)
2018 int cig
= (**cigar
) & BAM_CIGAR_MASK
;
2019 int ncig
= (**cigar
) >> BAM_CIGAR_SHIFT
;
2021 if ( cig
==BAM_CSOFT_CLIP
) { (*cigar
)++; *iseq
+= ncig
; *icig
= 0; continue; }
2022 if ( cig
==BAM_CHARD_CLIP
|| cig
==BAM_CPAD
) { (*cigar
)++; *icig
= 0; continue; }
2023 if ( cig
==BAM_CMATCH
|| cig
==BAM_CEQUAL
|| cig
==BAM_CDIFF
)
2026 if ( pos
< 0 ) { *icig
= ncig
+ pos
; *iseq
+= *icig
; *iref
+= *icig
; return BAM_CMATCH
; }
2027 (*cigar
)++; *iseq
+= ncig
; *icig
= 0; *iref
+= ncig
;
2030 if ( cig
==BAM_CINS
) { (*cigar
)++; *iseq
+= ncig
; *icig
= 0; continue; }
2031 if ( cig
==BAM_CDEL
|| cig
==BAM_CREF_SKIP
)
2034 if ( pos
<0 ) pos
= 0;
2035 (*cigar
)++; *icig
= 0; *iref
+= ncig
;
2038 hts_log_error("Unexpected cigar %d", cig
);
2044 static inline int cigar_iref2iseq_next(uint32_t **cigar
, uint32_t *cigar_max
, int *icig
, int *iseq
, int *iref
)
2046 while ( *cigar
< cigar_max
)
2048 int cig
= (**cigar
) & BAM_CIGAR_MASK
;
2049 int ncig
= (**cigar
) >> BAM_CIGAR_SHIFT
;
2051 if ( cig
==BAM_CMATCH
|| cig
==BAM_CEQUAL
|| cig
==BAM_CDIFF
)
2053 if ( *icig
>= ncig
- 1 ) { *icig
= 0; (*cigar
)++; continue; }
2054 (*iseq
)++; (*icig
)++; (*iref
)++;
2057 if ( cig
==BAM_CDEL
|| cig
==BAM_CREF_SKIP
) { (*cigar
)++; (*iref
) += ncig
; *icig
= 0; continue; }
2058 if ( cig
==BAM_CINS
) { (*cigar
)++; *iseq
+= ncig
; *icig
= 0; continue; }
2059 if ( cig
==BAM_CSOFT_CLIP
) { (*cigar
)++; *iseq
+= ncig
; *icig
= 0; continue; }
2060 if ( cig
==BAM_CHARD_CLIP
|| cig
==BAM_CPAD
) { (*cigar
)++; *icig
= 0; continue; }
2061 hts_log_error("Unexpected cigar %d", cig
);
2069 static void tweak_overlap_quality(bam1_t
*a
, bam1_t
*b
)
2071 uint32_t *a_cigar
= bam_get_cigar(a
), *a_cigar_max
= a_cigar
+ a
->core
.n_cigar
;
2072 uint32_t *b_cigar
= bam_get_cigar(b
), *b_cigar_max
= b_cigar
+ b
->core
.n_cigar
;
2073 int a_icig
= 0, a_iseq
= 0;
2074 int b_icig
= 0, b_iseq
= 0;
2075 uint8_t *a_qual
= bam_get_qual(a
), *b_qual
= bam_get_qual(b
);
2076 uint8_t *a_seq
= bam_get_seq(a
), *b_seq
= bam_get_seq(b
);
2078 int iref
= b
->core
.pos
;
2079 int a_iref
= iref
- a
->core
.pos
;
2080 int b_iref
= iref
- b
->core
.pos
;
2081 int a_ret
= cigar_iref2iseq_set(&a_cigar
, a_cigar_max
, &a_icig
, &a_iseq
, &a_iref
);
2082 if ( a_ret
<0 ) return; // no overlap
2083 int b_ret
= cigar_iref2iseq_set(&b_cigar
, b_cigar_max
, &b_icig
, &b_iseq
, &b_iref
);
2084 if ( b_ret
<0 ) return; // no overlap
2087 fprintf(stderr
,"tweak %s n_cigar=%d %d .. %d-%d vs %d-%d\n", bam_get_qname(a
), a
->core
.n_cigar
, b
->core
.n_cigar
,
2088 a
->core
.pos
+1,a
->core
.pos
+bam_cigar2rlen(a
->core
.n_cigar
,bam_get_cigar(a
)), b
->core
.pos
+1, b
->core
.pos
+bam_cigar2rlen(b
->core
.n_cigar
,bam_get_cigar(b
)));
2093 // Increment reference position
2094 while ( a_iref
>=0 && a_iref
< iref
- a
->core
.pos
)
2095 a_ret
= cigar_iref2iseq_next(&a_cigar
, a_cigar_max
, &a_icig
, &a_iseq
, &a_iref
);
2096 if ( a_ret
<0 ) break; // done
2097 if ( iref
< a_iref
+ a
->core
.pos
) iref
= a_iref
+ a
->core
.pos
;
2099 while ( b_iref
>=0 && b_iref
< iref
- b
->core
.pos
)
2100 b_ret
= cigar_iref2iseq_next(&b_cigar
, b_cigar_max
, &b_icig
, &b_iseq
, &b_iref
);
2101 if ( b_ret
<0 ) break; // done
2102 if ( iref
< b_iref
+ b
->core
.pos
) iref
= b_iref
+ b
->core
.pos
;
2105 if ( a_iref
+a
->core
.pos
!= b_iref
+b
->core
.pos
) continue; // only CMATCH positions, don't know what to do with indels
2107 if ( bam_seqi(a_seq
,a_iseq
) == bam_seqi(b_seq
,b_iseq
) )
2110 fprintf(stderr
,"%c",seq_nt16_str
[bam_seqi(a_seq
,a_iseq
)]);
2112 // we are very confident about this base
2113 int qual
= a_qual
[a_iseq
] + b_qual
[b_iseq
];
2114 a_qual
[a_iseq
] = qual
>200 ? 200 : qual
;
2119 if ( a_qual
[a_iseq
] >= b_qual
[b_iseq
] )
2122 fprintf(stderr
,"[%c/%c]",seq_nt16_str
[bam_seqi(a_seq
,a_iseq
)],tolower_c(seq_nt16_str
[bam_seqi(b_seq
,b_iseq
)]));
2124 a_qual
[a_iseq
] = 0.8 * a_qual
[a_iseq
]; // not so confident about a_qual anymore given the mismatch
2130 fprintf(stderr
,"[%c/%c]",tolower_c(seq_nt16_str
[bam_seqi(a_seq
,a_iseq
)]),seq_nt16_str
[bam_seqi(b_seq
,b_iseq
)]);
2132 b_qual
[b_iseq
] = 0.8 * b_qual
[b_iseq
];
2138 fprintf(stderr
,"\n");
2142 // Fix overlapping reads. Simple soft-clipping did not give good results.
2143 // Lowering qualities of unwanted bases is more selective and works better.
2145 static void overlap_push(bam_plp_t iter
, lbnode_t
*node
)
2147 if ( !iter
->overlaps
) return;
2149 // mapped mates and paired reads only
2150 if ( node
->b
.core
.flag
&BAM_FMUNMAP
|| !(node
->b
.core
.flag
&BAM_FPROPER_PAIR
) ) return;
2152 // no overlap possible, unless some wild cigar
2153 if ( abs(node
->b
.core
.isize
) >= 2*node
->b
.core
.l_qseq
) return;
2155 khiter_t kitr
= kh_get(olap_hash
, iter
->overlaps
, bam_get_qname(&node
->b
));
2156 if ( kitr
==kh_end(iter
->overlaps
) )
2159 kitr
= kh_put(olap_hash
, iter
->overlaps
, bam_get_qname(&node
->b
), &ret
);
2160 kh_value(iter
->overlaps
, kitr
) = node
;
2164 lbnode_t
*a
= kh_value(iter
->overlaps
, kitr
);
2165 tweak_overlap_quality(&a
->b
, &node
->b
);
2166 kh_del(olap_hash
, iter
->overlaps
, kitr
);
2167 assert(a
->end
-1 == a
->s
.end
);
2168 a
->end
= bam_endpos(&a
->b
);
2169 a
->s
.end
= a
->end
- 1;
2173 static void overlap_remove(bam_plp_t iter
, const bam1_t
*b
)
2175 if ( !iter
->overlaps
) return;
2180 kitr
= kh_get(olap_hash
, iter
->overlaps
, bam_get_qname(b
));
2181 if ( kitr
!=kh_end(iter
->overlaps
) )
2182 kh_del(olap_hash
, iter
->overlaps
, kitr
);
2187 for (kitr
= kh_begin(iter
->overlaps
); kitr
<kh_end(iter
->overlaps
); kitr
++)
2188 if ( kh_exist(iter
->overlaps
, kitr
) ) kh_del(olap_hash
, iter
->overlaps
, kitr
);
2194 // Prepares next pileup position in bam records collected by bam_plp_auto -> user func -> bam_plp_push. Returns
2195 // pointer to the piled records if next position is ready or NULL if there is not enough records in the
2196 // buffer yet (the current position is still the maximum position across all buffered reads).
2197 const bam_pileup1_t
*bam_plp_next(bam_plp_t iter
, int *_tid
, int *_pos
, int *_n_plp
)
2199 if (iter
->error
) { *_n_plp
= -1; return NULL
; }
2201 if (iter
->is_eof
&& iter
->head
== iter
->tail
) return NULL
;
2202 while (iter
->is_eof
|| iter
->max_tid
> iter
->tid
|| (iter
->max_tid
== iter
->tid
&& iter
->max_pos
> iter
->pos
)) {
2204 // write iter->plp at iter->pos
2205 lbnode_t
**pptr
= &iter
->head
;
2206 while (*pptr
!= iter
->tail
) {
2207 lbnode_t
*p
= *pptr
;
2208 if (p
->b
.core
.tid
< iter
->tid
|| (p
->b
.core
.tid
== iter
->tid
&& p
->end
<= iter
->pos
)) { // then remove
2209 overlap_remove(iter
, &p
->b
);
2210 if (iter
->plp_destruct
)
2211 iter
->plp_destruct(iter
->data
, &p
->b
, &p
->cd
);
2212 *pptr
= p
->next
; mp_free(iter
->mp
, p
);
2215 if (p
->b
.core
.tid
== iter
->tid
&& p
->beg
<= iter
->pos
) { // here: p->end > pos; then add to pileup
2216 if (n_plp
== iter
->max_plp
) { // then double the capacity
2217 iter
->max_plp
= iter
->max_plp
? iter
->max_plp
<<1 : 256;
2218 iter
->plp
= (bam_pileup1_t
*)realloc(iter
->plp
, sizeof(bam_pileup1_t
) * iter
->max_plp
);
2220 iter
->plp
[n_plp
].b
= &p
->b
;
2221 iter
->plp
[n_plp
].cd
= p
->cd
;
2222 if (resolve_cigar2(iter
->plp
+ n_plp
, iter
->pos
, &p
->s
)) ++n_plp
; // actually always true...
2224 pptr
= &(*pptr
)->next
;
2227 *_n_plp
= n_plp
; *_tid
= iter
->tid
; *_pos
= iter
->pos
;
2228 // update iter->tid and iter->pos
2229 if (iter
->head
!= iter
->tail
) {
2230 if (iter
->tid
> iter
->head
->b
.core
.tid
) {
2231 hts_log_error("Unsorted input. Pileup aborts");
2237 if (iter
->tid
< iter
->head
->b
.core
.tid
) { // come to a new reference sequence
2238 iter
->tid
= iter
->head
->b
.core
.tid
; iter
->pos
= iter
->head
->beg
; // jump to the next reference
2239 } else if (iter
->pos
< iter
->head
->beg
) { // here: tid == head->b.core.tid
2240 iter
->pos
= iter
->head
->beg
; // jump to the next position
2241 } else ++iter
->pos
; // scan contiguously
2243 if (n_plp
) return iter
->plp
;
2244 if (iter
->is_eof
&& iter
->head
== iter
->tail
) break;
2249 int bam_plp_push(bam_plp_t iter
, const bam1_t
*b
)
2251 if (iter
->error
) return -1;
2253 if (b
->core
.tid
< 0) { overlap_remove(iter
, b
); return 0; }
2254 // Skip only unmapped reads here, any additional filtering must be done in iter->func
2255 if (b
->core
.flag
& BAM_FUNMAP
) { overlap_remove(iter
, b
); return 0; }
2256 if (iter
->tid
== b
->core
.tid
&& iter
->pos
== b
->core
.pos
&& iter
->mp
->cnt
> iter
->maxcnt
)
2258 overlap_remove(iter
, b
);
2261 bam_copy1(&iter
->tail
->b
, b
);
2262 overlap_push(iter
, iter
->tail
);
2264 iter
->tail
->b
.id
= iter
->id
++;
2266 iter
->tail
->beg
= b
->core
.pos
;
2267 iter
->tail
->end
= bam_endpos(b
);
2268 iter
->tail
->s
= g_cstate_null
; iter
->tail
->s
.end
= iter
->tail
->end
- 1; // initialize cstate_t
2269 if (b
->core
.tid
< iter
->max_tid
) {
2270 hts_log_error("The input is not sorted (chromosomes out of order)");
2274 if ((b
->core
.tid
== iter
->max_tid
) && (iter
->tail
->beg
< iter
->max_pos
)) {
2275 hts_log_error("The input is not sorted (reads out of order)");
2279 iter
->max_tid
= b
->core
.tid
; iter
->max_pos
= iter
->tail
->beg
;
2280 if (iter
->tail
->end
> iter
->pos
|| iter
->tail
->b
.core
.tid
> iter
->tid
) {
2281 if (iter
->plp_construct
)
2282 iter
->plp_construct(iter
->data
, b
, &iter
->tail
->cd
);
2283 iter
->tail
->next
= mp_alloc(iter
->mp
);
2284 iter
->tail
= iter
->tail
->next
;
2286 } else iter
->is_eof
= 1;
2290 const bam_pileup1_t
*bam_plp_auto(bam_plp_t iter
, int *_tid
, int *_pos
, int *_n_plp
)
2292 const bam_pileup1_t
*plp
;
2293 if (iter
->func
== 0 || iter
->error
) { *_n_plp
= -1; return 0; }
2294 if ((plp
= bam_plp_next(iter
, _tid
, _pos
, _n_plp
)) != 0) return plp
;
2295 else { // no pileup line can be obtained; read alignments
2297 if (iter
->is_eof
) return 0;
2299 while ( (ret
=iter
->func(iter
->data
, iter
->b
)) >= 0) {
2300 if (bam_plp_push(iter
, iter
->b
) < 0) {
2304 if ((plp
= bam_plp_next(iter
, _tid
, _pos
, _n_plp
)) != 0) return plp
;
2305 // otherwise no pileup line can be returned; read the next alignment.
2307 if ( ret
< -1 ) { iter
->error
= ret
; *_n_plp
= -1; return 0; }
2308 bam_plp_push(iter
, 0);
2309 if ((plp
= bam_plp_next(iter
, _tid
, _pos
, _n_plp
)) != 0) return plp
;
2314 void bam_plp_reset(bam_plp_t iter
)
2316 overlap_remove(iter
, NULL
);
2317 iter
->max_tid
= iter
->max_pos
= -1;
2318 iter
->tid
= iter
->pos
= 0;
2320 while (iter
->head
!= iter
->tail
) {
2321 lbnode_t
*p
= iter
->head
;
2322 iter
->head
= p
->next
;
2323 mp_free(iter
->mp
, p
);
2327 void bam_plp_set_maxcnt(bam_plp_t iter
, int maxcnt
)
2329 iter
->maxcnt
= maxcnt
;
2332 /************************
2333 *** Mpileup iterator ***
2334 ************************/
2336 struct __bam_mplp_t
{
2341 const bam_pileup1_t
**plp
;
2344 bam_mplp_t
bam_mplp_init(int n
, bam_plp_auto_f func
, void **data
)
2348 iter
= (bam_mplp_t
)calloc(1, sizeof(struct __bam_mplp_t
));
2349 iter
->pos
= (uint64_t*)calloc(n
, sizeof(uint64_t));
2350 iter
->n_plp
= (int*)calloc(n
, sizeof(int));
2351 iter
->plp
= (const bam_pileup1_t
**)calloc(n
, sizeof(bam_pileup1_t
*));
2352 iter
->iter
= (bam_plp_t
*)calloc(n
, sizeof(bam_plp_t
));
2354 iter
->min
= (uint64_t)-1;
2355 for (i
= 0; i
< n
; ++i
) {
2356 iter
->iter
[i
] = bam_plp_init(func
, data
[i
]);
2357 iter
->pos
[i
] = iter
->min
;
2362 void bam_mplp_init_overlaps(bam_mplp_t iter
)
2365 for (i
= 0; i
< iter
->n
; ++i
)
2366 bam_plp_init_overlaps(iter
->iter
[i
]);
2369 void bam_mplp_set_maxcnt(bam_mplp_t iter
, int maxcnt
)
2372 for (i
= 0; i
< iter
->n
; ++i
)
2373 iter
->iter
[i
]->maxcnt
= maxcnt
;
2376 void bam_mplp_destroy(bam_mplp_t iter
)
2379 for (i
= 0; i
< iter
->n
; ++i
) bam_plp_destroy(iter
->iter
[i
]);
2380 free(iter
->iter
); free(iter
->pos
); free(iter
->n_plp
); free(iter
->plp
);
2384 int bam_mplp_auto(bam_mplp_t iter
, int *_tid
, int *_pos
, int *n_plp
, const bam_pileup1_t
**plp
)
2387 uint64_t new_min
= (uint64_t)-1;
2388 for (i
= 0; i
< iter
->n
; ++i
) {
2389 if (iter
->pos
[i
] == iter
->min
) {
2391 iter
->plp
[i
] = bam_plp_auto(iter
->iter
[i
], &tid
, &pos
, &iter
->n_plp
[i
]);
2392 if ( iter
->iter
[i
]->error
) return -1;
2393 iter
->pos
[i
] = iter
->plp
[i
] ? (uint64_t)tid
<<32 | pos
: 0;
2395 if (iter
->plp
[i
] && iter
->pos
[i
] < new_min
) new_min
= iter
->pos
[i
];
2397 iter
->min
= new_min
;
2398 if (new_min
== (uint64_t)-1) return 0;
2399 *_tid
= new_min
>>32; *_pos
= (uint32_t)new_min
;
2400 for (i
= 0; i
< iter
->n
; ++i
) {
2401 if (iter
->pos
[i
] == iter
->min
) { // FIXME: valgrind reports "uninitialised value(s) at this line"
2402 n_plp
[i
] = iter
->n_plp
[i
], plp
[i
] = iter
->plp
[i
];
2404 } else n_plp
[i
] = 0, plp
[i
] = 0;
2409 void bam_mplp_reset(bam_mplp_t iter
)
2412 iter
->min
= (uint64_t)-1;
2413 for (i
= 0; i
< iter
->n
; ++i
) {
2414 bam_plp_reset(iter
->iter
[i
]);
2415 iter
->pos
[i
] = (uint64_t)-1;
2417 iter
->plp
[i
] = NULL
;
2421 void bam_mplp_constructor(bam_mplp_t iter
,
2422 int (*func
)(void *arg
, const bam1_t
*b
, bam_pileup_cd
*cd
)) {
2424 for (i
= 0; i
< iter
->n
; ++i
)
2425 bam_plp_constructor(iter
->iter
[i
], func
);
2428 void bam_mplp_destructor(bam_mplp_t iter
,
2429 int (*func
)(void *arg
, const bam1_t
*b
, bam_pileup_cd
*cd
)) {
2431 for (i
= 0; i
< iter
->n
; ++i
)
2432 bam_plp_destructor(iter
->iter
[i
], func
);
2435 #endif // ~!defined(BAM_NO_PILEUP)