modified: diffout.py
[GalaxyCodeBases.git] / c_cpp / lib / htslib / realn.c
blobd856f3c6af3cc37bcc8ef8fdc0504bfc7ced5c3a
1 /* realn.c -- BAQ calculation and realignment.
3 Copyright (C) 2009-2011, 2014-2015 Genome Research Ltd.
4 Portions copyright (C) 2009-2011 Broad Institute.
6 Author: Heng Li <lh3@sanger.ac.uk>
8 Permission is hereby granted, free of charge, to any person obtaining a copy
9 of this software and associated documentation files (the "Software"), to deal
10 in the Software without restriction, including without limitation the rights
11 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12 copies of the Software, and to permit persons to whom the Software is
13 furnished to do so, subject to the following conditions:
15 The above copyright notice and this permission notice shall be included in
16 all copies or substantial portions of the Software.
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
21 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24 DEALINGS IN THE SOFTWARE. */
26 #include <config.h>
28 #include <stdlib.h>
29 #include <string.h>
30 #include <math.h>
31 #include "htslib/hts.h"
32 #include "htslib/sam.h"
34 int sam_cap_mapq(bam1_t *b, const char *ref, int ref_len, int thres)
36 uint8_t *seq = bam_get_seq(b), *qual = bam_get_qual(b);
37 uint32_t *cigar = bam_get_cigar(b);
38 bam1_core_t *c = &b->core;
39 int i, x, y, mm, q, len, clip_l, clip_q;
40 double t;
41 if (thres < 0) thres = 40; // set the default
42 mm = q = len = clip_l = clip_q = 0;
43 for (i = y = 0, x = c->pos; i < c->n_cigar; ++i) {
44 int j, l = cigar[i]>>4, op = cigar[i]&0xf;
45 if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
46 for (j = 0; j < l; ++j) {
47 int c1, c2, z = y + j;
48 if (x+j >= ref_len || ref[x+j] == '\0') break; // out of bounds
49 c1 = bam_seqi(seq, z), c2 = seq_nt16_table[(int)ref[x+j]];
50 if (c2 != 15 && c1 != 15 && qual[z] >= 13) { // not ambiguous
51 ++len;
52 if (c1 && c1 != c2 && qual[z] >= 13) { // mismatch
53 ++mm;
54 q += qual[z] > 33? 33 : qual[z];
58 if (j < l) break;
59 x += l; y += l; len += l;
60 } else if (op == BAM_CDEL) {
61 for (j = 0; j < l; ++j)
62 if (x+j >= ref_len || ref[x+j] == '\0') break;
63 if (j < l) break;
64 x += l;
65 } else if (op == BAM_CSOFT_CLIP) {
66 for (j = 0; j < l; ++j) clip_q += qual[y+j];
67 clip_l += l;
68 y += l;
69 } else if (op == BAM_CHARD_CLIP) {
70 clip_q += 13 * l;
71 clip_l += l;
72 } else if (op == BAM_CINS) y += l;
73 else if (op == BAM_CREF_SKIP) x += l;
75 for (i = 0, t = 1; i < mm; ++i)
76 t *= (double)len / (i+1);
77 t = q - 4.343 * log(t) + clip_q / 5.;
78 if (t > thres) return -1;
79 if (t < 0) t = 0;
80 t = sqrt((thres - t) / thres) * thres;
81 // fprintf(stderr, "%s %lf %d\n", bam_get_qname(b), t, q);
82 return (int)(t + .499);
85 int sam_prob_realn(bam1_t *b, const char *ref, int ref_len, int flag)
87 int k, i, bw, x, y, yb, ye, xb, xe, apply_baq = flag&1, extend_baq = flag>>1&1, redo_baq = flag&4;
88 uint32_t *cigar = bam_get_cigar(b);
89 bam1_core_t *c = &b->core;
90 probaln_par_t conf = { 0.001, 0.1, 10 };
91 uint8_t *bq = 0, *zq = 0, *qual = bam_get_qual(b);
92 if ((c->flag & BAM_FUNMAP) || b->core.l_qseq == 0 || qual[0] == (uint8_t)-1)
93 return -1; // do nothing
95 // test if BQ or ZQ is present
96 if ((bq = bam_aux_get(b, "BQ")) != 0) ++bq;
97 if ((zq = bam_aux_get(b, "ZQ")) != 0 && *zq == 'Z') ++zq;
98 if (bq && redo_baq)
100 bam_aux_del(b, bq-1);
101 bq = 0;
103 if (bq && zq) { // remove the ZQ tag
104 bam_aux_del(b, zq-1);
105 zq = 0;
107 if (bq || zq) {
108 if ((apply_baq && zq) || (!apply_baq && bq)) return -3; // in both cases, do nothing
109 if (bq && apply_baq) { // then convert BQ to ZQ
110 for (i = 0; i < c->l_qseq; ++i)
111 qual[i] = qual[i] + 64 < bq[i]? 0 : qual[i] - ((int)bq[i] - 64);
112 *(bq - 3) = 'Z';
113 } else if (zq && !apply_baq) { // then convert ZQ to BQ
114 for (i = 0; i < c->l_qseq; ++i)
115 qual[i] += (int)zq[i] - 64;
116 *(zq - 3) = 'B';
118 return 0;
120 // find the start and end of the alignment
121 x = c->pos, y = 0, yb = ye = xb = xe = -1;
122 for (k = 0; k < c->n_cigar; ++k) {
123 int op, l;
124 op = cigar[k]&0xf; l = cigar[k]>>4;
125 if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
126 if (yb < 0) yb = y;
127 if (xb < 0) xb = x;
128 ye = y + l; xe = x + l;
129 x += l; y += l;
130 } else if (op == BAM_CSOFT_CLIP || op == BAM_CINS) y += l;
131 else if (op == BAM_CDEL) x += l;
132 else if (op == BAM_CREF_SKIP) return -1; // do nothing if there is a reference skip
134 // set bandwidth and the start and the end
135 bw = 7;
136 if (abs((xe - xb) - (ye - yb)) > bw)
137 bw = abs((xe - xb) - (ye - yb)) + 3;
138 conf.bw = bw;
139 xb -= yb + bw/2; if (xb < 0) xb = 0;
140 xe += c->l_qseq - ye + bw/2;
141 if (xe - xb - c->l_qseq > bw)
142 xb += (xe - xb - c->l_qseq - bw) / 2, xe -= (xe - xb - c->l_qseq - bw) / 2;
143 { // glocal
144 uint8_t *s, *r, *q, *seq = bam_get_seq(b), *bq;
145 int *state;
146 bq = calloc(c->l_qseq + 1, 1);
147 memcpy(bq, qual, c->l_qseq);
148 s = calloc(c->l_qseq, 1);
149 for (i = 0; i < c->l_qseq; ++i) s[i] = seq_nt16_int[bam_seqi(seq, i)];
150 r = calloc(xe - xb, 1);
151 for (i = xb; i < xe; ++i) {
152 if (i >= ref_len || ref[i] == '\0') { xe = i; break; }
153 r[i-xb] = seq_nt16_int[seq_nt16_table[(int)ref[i]]];
155 state = calloc(c->l_qseq, sizeof(int));
156 q = calloc(c->l_qseq, 1);
157 probaln_glocal(r, xe-xb, s, c->l_qseq, qual, &conf, state, q);
158 if (!extend_baq) { // in this block, bq[] is capped by base quality qual[]
159 for (k = 0, x = c->pos, y = 0; k < c->n_cigar; ++k) {
160 int op = cigar[k]&0xf, l = cigar[k]>>4;
161 if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
162 for (i = y; i < y + l; ++i) {
163 if ((state[i]&3) != 0 || state[i]>>2 != x - xb + (i - y)) bq[i] = 0;
164 else bq[i] = bq[i] < q[i]? bq[i] : q[i];
166 x += l; y += l;
167 } else if (op == BAM_CSOFT_CLIP || op == BAM_CINS) y += l;
168 else if (op == BAM_CDEL) x += l;
170 for (i = 0; i < c->l_qseq; ++i) bq[i] = qual[i] - bq[i] + 64; // finalize BQ
171 } else { // in this block, bq[] is BAQ that can be larger than qual[] (different from the above!)
172 uint8_t *left, *rght;
173 left = calloc(c->l_qseq, 1); rght = calloc(c->l_qseq, 1);
174 for (k = 0, x = c->pos, y = 0; k < c->n_cigar; ++k) {
175 int op = cigar[k]&0xf, l = cigar[k]>>4;
176 if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
177 for (i = y; i < y + l; ++i)
178 bq[i] = ((state[i]&3) != 0 || state[i]>>2 != x - xb + (i - y))? 0 : q[i];
179 for (left[y] = bq[y], i = y + 1; i < y + l; ++i)
180 left[i] = bq[i] > left[i-1]? bq[i] : left[i-1];
181 for (rght[y+l-1] = bq[y+l-1], i = y + l - 2; i >= y; --i)
182 rght[i] = bq[i] > rght[i+1]? bq[i] : rght[i+1];
183 for (i = y; i < y + l; ++i)
184 bq[i] = left[i] < rght[i]? left[i] : rght[i];
185 x += l; y += l;
186 } else if (op == BAM_CSOFT_CLIP || op == BAM_CINS) y += l;
187 else if (op == BAM_CDEL) x += l;
189 for (i = 0; i < c->l_qseq; ++i) bq[i] = 64 + (qual[i] <= bq[i]? 0 : qual[i] - bq[i]); // finalize BQ
190 free(left); free(rght);
192 if (apply_baq) {
193 for (i = 0; i < c->l_qseq; ++i) qual[i] -= bq[i] - 64; // modify qual
194 bam_aux_append(b, "ZQ", 'Z', c->l_qseq + 1, bq);
195 } else bam_aux_append(b, "BQ", 'Z', c->l_qseq + 1, bq);
196 free(bq); free(s); free(r); free(q); free(state);
198 return 0;