1 /* vcfutils.c -- allele-related utility functions.
3 Copyright (C) 2012-2016 Genome Research Ltd.
5 Author: Petr Danecek <pd3@sanger.ac.uk>
7 Permission is hereby granted, free of charge, to any person obtaining a copy
8 of this software and associated documentation files (the "Software"), to deal
9 in the Software without restriction, including without limitation the rights
10 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11 copies of the Software, and to permit persons to whom the Software is
12 furnished to do so, subject to the following conditions:
14 The above copyright notice and this permission notice shall be included in
15 all copies or substantial portions of the Software.
17 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
22 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
23 DEALINGS IN THE SOFTWARE. */
27 #include "htslib/vcfutils.h"
28 #include "htslib/kbitset.h"
30 int bcf_calc_ac(const bcf_hdr_t
*header
, bcf1_t
*line
, int *ac
, int which
)
33 for (i
=0; i
<line
->n_allele
; i
++) ac
[i
]=0;
35 // Use INFO/AC,AN field only when asked
36 if ( which
&BCF_UN_INFO
)
38 bcf_unpack(line
, BCF_UN_INFO
);
39 int an_id
= bcf_hdr_id2int(header
, BCF_DT_ID
, "AN");
40 int ac_id
= bcf_hdr_id2int(header
, BCF_DT_ID
, "AC");
41 int i
, an
=-1, ac_len
=0, ac_type
=0;
43 if ( an_id
>=0 && ac_id
>=0 )
45 for (i
=0; i
<line
->n_info
; i
++)
47 bcf_info_t
*z
= &line
->d
.info
[i
];
48 if ( z
->key
== an_id
) an
= z
->v1
.i
;
49 else if ( z
->key
== ac_id
) { ac_ptr
= z
->vptr
; ac_len
= z
->len
; ac_type
= z
->type
; }
52 if ( an
>=0 && ac_ptr
)
55 #define BRANCH_INT(type_t) { \
56 type_t *p = (type_t *) ac_ptr; \
57 for (i=0; i<ac_len; i++) \
64 case BCF_BT_INT8
: BRANCH_INT(int8_t); break;
65 case BCF_BT_INT16
: BRANCH_INT(int16_t); break;
66 case BCF_BT_INT32
: BRANCH_INT(int32_t); break;
67 default: hts_log_error("Unexpected type %d at %s:%d", ac_type
, header
->id
[BCF_DT_CTG
][line
->rid
].key
, line
->pos
+1); exit(1); break;
72 hts_log_error("Incorrect AN/AC counts at %s:%d", header
->id
[BCF_DT_CTG
][line
->rid
].key
, line
->pos
+1);
80 // Split genotype fields only when asked
81 if ( which
&BCF_UN_FMT
)
83 int i
, gt_id
= bcf_hdr_id2int(header
,BCF_DT_ID
,"GT");
84 if ( gt_id
<0 ) return 0;
85 bcf_unpack(line
, BCF_UN_FMT
);
86 bcf_fmt_t
*fmt_gt
= NULL
;
87 for (i
=0; i
<(int)line
->n_fmt
; i
++)
88 if ( line
->d
.fmt
[i
].id
==gt_id
) { fmt_gt
= &line
->d
.fmt
[i
]; break; }
89 if ( !fmt_gt
) return 0;
90 #define BRANCH_INT(type_t,vector_end) { \
91 for (i=0; i<line->n_sample; i++) \
93 type_t *p = (type_t*) (fmt_gt->p + i*fmt_gt->size); \
95 for (ial=0; ial<fmt_gt->n; ial++) \
97 if ( p[ial]==vector_end ) break; /* smaller ploidy */ \
98 if ( bcf_gt_is_missing(p[ial]) ) continue; /* missing allele */ \
99 if ( p[ial]>>1 > line->n_allele ) \
101 hts_log_error("Incorrect allele (\"%d\") in %s at %s:%d", (p[ial]>>1)-1, header->samples[i], header->id[BCF_DT_CTG][line->rid].key, line->pos+1); \
104 ac[(p[ial]>>1)-1]++; \
108 switch (fmt_gt
->type
) {
109 case BCF_BT_INT8
: BRANCH_INT(int8_t, bcf_int8_vector_end
); break;
110 case BCF_BT_INT16
: BRANCH_INT(int16_t, bcf_int16_vector_end
); break;
111 case BCF_BT_INT32
: BRANCH_INT(int32_t, bcf_int32_vector_end
); break;
112 default: hts_log_error("Unexpected type %d at %s:%d", fmt_gt
->type
, header
->id
[BCF_DT_CTG
][line
->rid
].key
, line
->pos
+1); exit(1); break;
120 int bcf_gt_type(bcf_fmt_t
*fmt_ptr
, int isample
, int *_ial
, int *_jal
)
122 int i
, nals
= 0, has_ref
= 0, has_alt
= 0, ial
= 0, jal
= 0;
123 #define BRANCH_INT(type_t,vector_end) { \
124 type_t *p = (type_t*) (fmt_ptr->p + isample*fmt_ptr->size); \
125 for (i=0; i<fmt_ptr->n; i++) \
127 if ( p[i] == vector_end ) break; /* smaller ploidy */ \
128 if ( bcf_gt_is_missing(p[i]) ) return GT_UNKN; /* missing allele */ \
132 if ( !ial ) { ial = tmp; has_alt = 1; } \
133 else if ( tmp!=ial ) \
151 switch (fmt_ptr
->type
) {
152 case BCF_BT_INT8
: BRANCH_INT(int8_t, bcf_int8_vector_end
); break;
153 case BCF_BT_INT16
: BRANCH_INT(int16_t, bcf_int16_vector_end
); break;
154 case BCF_BT_INT32
: BRANCH_INT(int32_t, bcf_int32_vector_end
); break;
155 default: hts_log_error("Unexpected type %d", fmt_ptr
->type
); exit(1); break;
159 if ( _ial
) *_ial
= ial
>0 ? ial
-1 : ial
;
160 if ( _jal
) *_jal
= jal
>0 ? jal
-1 : jal
;
161 if ( !nals
) return GT_UNKN
;
163 return has_ref
? GT_HAPL_R
: GT_HAPL_A
;
165 return has_alt
==1 ? GT_HOM_AA
: GT_HET_AA
;
171 int bcf_trim_alleles(const bcf_hdr_t
*header
, bcf1_t
*line
)
173 int i
, ret
= 0, nrm
= 0;
174 kbitset_t
*rm_set
= NULL
;
175 bcf_fmt_t
*gt
= bcf_get_fmt(header
, line
, "GT");
178 int *ac
= (int*) calloc(line
->n_allele
,sizeof(int));
180 // check if all alleles are populated
181 #define BRANCH(type_t,vector_end) { \
182 for (i=0; i<line->n_sample; i++) \
184 type_t *p = (type_t*) (gt->p + i*gt->size); \
186 for (ial=0; ial<gt->n; ial++) \
188 if ( p[ial]==vector_end ) break; /* smaller ploidy */ \
189 if ( bcf_gt_is_missing(p[ial]) ) continue; /* missing allele */ \
190 if ( (p[ial]>>1)-1 >= line->n_allele ) { \
191 hts_log_error("Allele index is out of bounds at %s:%d", header->id[BCF_DT_CTG][line->rid].key, line->pos+1); \
195 ac[(p[ial]>>1)-1]++; \
200 case BCF_BT_INT8
: BRANCH(int8_t, bcf_int8_vector_end
); break;
201 case BCF_BT_INT16
: BRANCH(int16_t, bcf_int16_vector_end
); break;
202 case BCF_BT_INT32
: BRANCH(int32_t, bcf_int32_vector_end
); break;
203 default: hts_log_error("Unexpected GT %d at %s:%d",
204 gt
->type
, header
->id
[BCF_DT_CTG
][line
->rid
].key
, line
->pos
+ 1);
209 rm_set
= kbs_init(line
->n_allele
);
210 for (i
=1; i
<line
->n_allele
; i
++) {
211 if ( !ac
[i
] ) { kbs_insert(rm_set
, i
); nrm
++; }
215 if (bcf_remove_allele_set(header
, line
, rm_set
))
221 if (rm_set
) kbs_destroy(rm_set
);
222 return ret
? ret
: nrm
;
225 void bcf_remove_alleles(const bcf_hdr_t
*header
, bcf1_t
*line
, int rm_mask
)
228 kbitset_t
*rm_set
= kbs_init(line
->n_allele
);
229 for (i
=1; i
<line
->n_allele
; i
++)
230 if ( rm_mask
& 1<<i
) kbs_insert(rm_set
, i
);
232 bcf_remove_allele_set(header
, line
, rm_set
);
236 int bcf_remove_allele_set(const bcf_hdr_t
*header
, bcf1_t
*line
, const struct kbitset_t
*rm_set
)
238 int *map
= (int*) calloc(line
->n_allele
, sizeof(int));
241 // create map of indexes from old to new ALT numbering and modify ALT
242 kstring_t str
= {0,0,0};
243 kputs(line
->d
.allele
[0], &str
);
245 int nrm
= 0, i
,j
; // i: ori alleles, j: new alleles
246 for (i
=1, j
=1; i
<line
->n_allele
; i
++)
248 if ( kbs_exists(rm_set
, i
) )
250 // remove this allele
251 line
->d
.allele
[i
] = NULL
;
256 kputs(line
->d
.allele
[i
], &str
);
260 if ( !nrm
) goto clean
;
262 int nR_ori
= line
->n_allele
;
263 int nR_new
= line
->n_allele
-nrm
;
264 if ( nR_new
<=0 ) // should not be able to remove reference allele
266 hts_log_error("Cannot remove reference allele at %s:%d [%d]",
267 bcf_seqname(header
,line
), line
->pos
+1, nR_new
);
270 int nA_ori
= nR_ori
-1;
271 int nA_new
= nR_new
-1;
273 int nG_ori
= nR_ori
*(nR_ori
+ 1)/2;
274 int nG_new
= nR_new
*(nR_new
+ 1)/2;
276 bcf_update_alleles_str(header
, line
, str
.s
);
278 // remove from Number=G, Number=R and Number=A INFO fields.
279 int mdat
= 0, ndat
= 0, mdat_bytes
= 0, nret
;
280 for (i
=0; i
<line
->n_info
; i
++)
282 bcf_info_t
*info
= &line
->d
.info
[i
];
283 int vlen
= bcf_hdr_id2length(header
,BCF_HL_INFO
,info
->key
);
285 if ( vlen
!=BCF_VL_A
&& vlen
!=BCF_VL_G
&& vlen
!=BCF_VL_R
) continue; // no need to change
287 int type
= bcf_hdr_id2type(header
,BCF_HL_INFO
,info
->key
);
288 if ( type
==BCF_HT_FLAG
) continue;
290 if ( type
==BCF_HT_REAL
|| type
==BCF_HT_INT
) size
= 4;
292 mdat
= mdat_bytes
/ size
;
293 nret
= bcf_get_info_values(header
, line
, bcf_hdr_int2id(header
,BCF_DT_ID
,info
->key
), (void**)&dat
, &mdat
, type
);
294 mdat_bytes
= mdat
* size
;
297 hts_log_error("Could not access INFO/%s at %s:%d [%d]",
298 bcf_hdr_int2id(header
,BCF_DT_ID
,info
->key
), bcf_seqname(header
,line
), line
->pos
+1, nret
);
301 if ( nret
==0 ) continue; // no data for this tag
303 if ( type
==BCF_HT_STR
)
306 char *ss
= (char*) dat
, *se
= (char*) dat
, s
= ss
[0];
307 if ( vlen
==BCF_VL_A
|| vlen
==BCF_VL_R
)
310 if ( vlen
==BCF_VL_A
)
317 for (j
=0; j
<nexp
; j
++)
320 while ( *se
&& *se
!=',' ) se
++;
321 if ( kbs_exists(rm_set
, j
+inc
) )
327 if ( str
.l
) kputc(',',&str
);
328 kputsn(ss
,se
-ss
,&str
);
332 if ( j
==1 && s
== '.' ) continue; // missing
335 hts_log_error("Unexpected number of values in INFO/%s at %s:%d; expected Number=%c=%d, but found %d",
336 bcf_hdr_int2id(header
,BCF_DT_ID
,info
->key
), bcf_seqname(header
,line
), line
->pos
+1, vlen
==BCF_VL_A
? 'A' : 'R', nexp
, j
);
340 else // Number=G, assuming diploid genotype
343 for (j
=0; j
<nR_ori
; j
++)
348 while ( *se
&& *se
!=',' ) se
++;
350 if ( kbs_exists(rm_set
, j
) || kbs_exists(rm_set
, k
) )
356 if ( str
.l
) kputc(',',&str
);
357 kputsn(ss
,se
-ss
,&str
);
363 if ( n
==1 && s
== '.' ) continue; // missing
366 hts_log_error("Unexpected number of values in INFO/%s at %s:%d; expected Number=G=%d, but found %d",
367 bcf_hdr_int2id(header
,BCF_DT_ID
,info
->key
), bcf_seqname(header
,line
), line
->pos
+1, nG_ori
, n
);
372 nret
= bcf_update_info(header
, line
, bcf_hdr_int2id(header
,BCF_DT_ID
,info
->key
), (void*)str
.s
, str
.l
, type
);
375 hts_log_error("Could not update INFO/%s at %s:%d [%d]",
376 bcf_hdr_int2id(header
,BCF_DT_ID
,info
->key
), bcf_seqname(header
,line
), line
->pos
+1, nret
);
382 if (nret
==1) // could be missing - check
385 #define BRANCH(type_t, is_missing) { \
386 type_t *p = (type_t *) info->vptr; \
387 if ( is_missing ) missing = 1; \
389 switch (info
->type
) {
390 case BCF_BT_INT8
: BRANCH(int8_t, p
[0]==bcf_int8_missing
); break;
391 case BCF_BT_INT16
: BRANCH(int16_t, p
[0]==bcf_int16_missing
); break;
392 case BCF_BT_INT32
: BRANCH(int32_t, p
[0]==bcf_int32_missing
); break;
393 case BCF_BT_FLOAT
: BRANCH(float, bcf_float_is_missing(p
[0])); break;
394 default: hts_log_error("Unexpected type %d", info
->type
); goto err
; break;
397 if (missing
) continue; // could remove this INFO tag?
400 if ( vlen
==BCF_VL_A
|| vlen
==BCF_VL_R
)
403 if ( vlen
==BCF_VL_A
)
407 hts_log_error("Unexpected number of values in INFO/%s at %s:%d; expected Number=A=%d, but found %d",
408 bcf_hdr_int2id(header
,BCF_DT_ID
,info
->key
), bcf_seqname(header
,line
), line
->pos
+1, nA_ori
, nret
);
419 hts_log_error("Unexpected number of values in INFO/%s at %s:%d; expected Number=R=%d, but found %d",
420 bcf_hdr_int2id(header
,BCF_DT_ID
,info
->key
), bcf_seqname(header
,line
), line
->pos
+1, nR_ori
, nret
);
428 #define BRANCH(type_t,is_vector_end) \
430 type_t *ptr = (type_t*) dat; \
431 int size = sizeof(type_t); \
432 for (j=0; j<ntop; j++) /* j:ori, k:new */ \
434 if ( is_vector_end ) { memcpy(dat+k*size, dat+j*size, size); break; } \
435 if ( kbs_exists(rm_set, j+inc) ) continue; \
436 if ( j!=k ) memcpy(dat+k*size, dat+j*size, size); \
442 case BCF_HT_INT
: BRANCH(int32_t,ptr
[j
]==bcf_int32_vector_end
); break;
443 case BCF_HT_REAL
: BRANCH(float,bcf_float_is_vector_end(ptr
[j
])); break;
451 hts_log_error("Unexpected number of values in INFO/%s at %s:%d; expected Number=R=%d, but found %d",
452 bcf_hdr_int2id(header
,BCF_DT_ID
,info
->key
), bcf_seqname(header
,line
), line
->pos
+1, nG_ori
, nret
);
455 int k
, l_ori
= -1, l_new
= 0;
458 #define BRANCH(type_t,is_vector_end) \
460 type_t *ptr = (type_t*) dat; \
461 int size = sizeof(type_t); \
462 for (j=0; j<nR_ori; j++) \
464 for (k=0; k<=j; k++) \
467 if ( is_vector_end ) { memcpy(dat+l_new*size, dat+l_ori*size, size); break; } \
468 if ( kbs_exists(rm_set, j) || kbs_exists(rm_set, k) ) continue; \
469 if ( l_ori!=l_new ) memcpy(dat+l_new*size, dat+l_ori*size, size); \
476 case BCF_HT_INT
: BRANCH(int32_t,ptr
[l_ori
]==bcf_int32_vector_end
); break;
477 case BCF_HT_REAL
: BRANCH(float,bcf_float_is_vector_end(ptr
[l_ori
])); break;
482 nret
= bcf_update_info(header
, line
, bcf_hdr_int2id(header
,BCF_DT_ID
,info
->key
), (void*)dat
, ndat
, type
);
485 hts_log_error("Could not update INFO/%s at %s:%d [%d]",
486 bcf_hdr_int2id(header
,BCF_DT_ID
,info
->key
), bcf_seqname(header
,line
), line
->pos
+1, nret
);
491 // Update GT fields, the allele indexes might have changed
492 for (i
=1; i
<line
->n_allele
; i
++) if ( map
[i
]!=i
) break;
493 if ( i
<line
->n_allele
)
495 mdat
= mdat_bytes
/ 4; // sizeof(int32_t)
496 nret
= bcf_get_genotypes(header
,line
,(void**)&dat
,&mdat
);
497 mdat_bytes
= mdat
* 4;
500 nret
/= line
->n_sample
;
501 int32_t *ptr
= (int32_t*) dat
;
502 for (i
=0; i
<line
->n_sample
; i
++)
504 for (j
=0; j
<nret
; j
++)
506 if ( bcf_gt_is_missing(ptr
[j
]) ) continue;
507 if ( ptr
[j
]==bcf_int32_vector_end
) break;
508 int al
= bcf_gt_allele(ptr
[j
]);
509 if ( !( al
<nR_ori
&& map
[al
]>=0 ) )
511 hts_log_error("Problem updating genotypes at %s:%d [ al<nR_ori && map[al]>=0 :: al=%d,nR_ori=%d,map[al]=%d ]",
512 bcf_seqname(header
,line
), line
->pos
+1, al
, nR_ori
, map
[al
]);
515 ptr
[j
] = (map
[al
]+1)<<1 | (ptr
[j
]&1);
519 nret
= bcf_update_genotypes(header
, line
, (void*)dat
, nret
*line
->n_sample
);
522 hts_log_error("Could not update FORMAT/GT at %s:%d [%d]",
523 bcf_seqname(header
,line
), line
->pos
+1, nret
);
529 // Remove from Number=G, Number=R and Number=A FORMAT fields.
530 // Assuming haploid or diploid GTs
531 for (i
=0; i
<line
->n_fmt
; i
++)
533 bcf_fmt_t
*fmt
= &line
->d
.fmt
[i
];
534 int vlen
= bcf_hdr_id2length(header
,BCF_HL_FMT
,fmt
->id
);
536 if ( vlen
!=BCF_VL_A
&& vlen
!=BCF_VL_G
&& vlen
!=BCF_VL_R
) continue; // no need to change
538 int type
= bcf_hdr_id2type(header
,BCF_HL_FMT
,fmt
->id
);
539 if ( type
==BCF_HT_FLAG
) continue;
542 if ( type
==BCF_HT_REAL
|| type
==BCF_HT_INT
) size
= 4;
544 mdat
= mdat_bytes
/ size
;
545 nret
= bcf_get_format_values(header
, line
, bcf_hdr_int2id(header
,BCF_DT_ID
,fmt
->id
), (void**)&dat
, &mdat
, type
);
546 mdat_bytes
= mdat
* size
;
549 hts_log_error("Could not access FORMAT/%s at %s:%d [%d]",
550 bcf_hdr_int2id(header
,BCF_DT_ID
,fmt
->id
), bcf_seqname(header
,line
), line
->pos
+1, nret
);
553 if ( nret
== 0 ) continue; // no data for this tag
555 if ( type
==BCF_HT_STR
)
557 int size
= nret
/line
->n_sample
; // number of bytes per sample
559 if ( vlen
==BCF_VL_A
|| vlen
==BCF_VL_R
)
562 if ( vlen
==BCF_VL_A
)
569 for (j
=0; j
<line
->n_sample
; j
++)
571 char *ss
= ((char*)dat
) + j
*size
, *se
= ss
+ size
, *ptr
= ss
, s
= ss
[0];
572 int k_src
= 0, k_dst
= 0, l
= str
.l
;
573 for (k_src
=0; k_src
<nexp
; k_src
++)
575 if ( ptr
>=se
|| !*ptr
) break;
576 while ( ptr
<se
&& *ptr
&& *ptr
!=',' ) ptr
++;
577 if ( kbs_exists(rm_set
, k_src
+inc
) )
582 if ( k_dst
) kputc(',',&str
);
583 kputsn(ss
,ptr
-ss
,&str
);
587 if ( k_src
==1 && s
== '.' ) continue; // missing
590 hts_log_error("Unexpected number of values in FORMAT/%s at %s:%d; expected Number=%c=%d, but found %d",
591 bcf_hdr_int2id(header
,BCF_DT_ID
,fmt
->id
), bcf_seqname(header
,line
), line
->pos
+1, vlen
==BCF_VL_A
? 'A' : 'R', nexp
, k_src
);
595 for (; l
<size
; l
++) kputc(0, &str
);
598 else // Number=G, diploid or haploid
600 for (j
=0; j
<line
->n_sample
; j
++)
602 char *ss
= ((char*)dat
) + j
*size
, *se
= ss
+ size
, *ptr
= ss
, s
= ss
[0];
603 int k_src
= 0, k_dst
= 0, l
= str
.l
;
604 int nexp
= 0; // diploid or haploid?
608 if ( *ptr
==',' ) nexp
++;
611 if ( ptr
!=ss
) nexp
++;
612 if ( nexp
==1 && s
== '.' ) continue; // missing
613 if ( nexp
!=nG_ori
&& nexp
!=nR_ori
)
615 hts_log_error("Unexpected number of values in FORMAT/%s at %s:%d; expected Number=G=%d(diploid) or %d(haploid), but found %d",
616 bcf_hdr_int2id(header
,BCF_DT_ID
,fmt
->id
), bcf_seqname(header
,line
), line
->pos
+1, nG_ori
, nR_ori
, nexp
);
620 if ( nexp
==nG_ori
) // diploid
623 for (ia
=0; ia
<nR_ori
; ia
++)
625 for (ib
=0; ib
<=ia
; ib
++)
627 if ( ptr
>=se
|| !*ptr
) break;
628 while ( ptr
<se
&& *ptr
&& *ptr
!=',' ) ptr
++;
629 if ( kbs_exists(rm_set
, ia
) || kbs_exists(rm_set
, ib
) )
634 if ( k_dst
) kputc(',',&str
);
635 kputsn(ss
,ptr
-ss
,&str
);
639 if ( ptr
>=se
|| !*ptr
) break;
644 for (k_src
=0; k_src
<nR_ori
; k_src
++)
646 if ( ptr
>=se
|| !*ptr
) break;
647 while ( ptr
<se
&& *ptr
&& *ptr
!=',' ) ptr
++;
648 if ( kbs_exists(rm_set
, k_src
) )
653 if ( k_dst
) kputc(',',&str
);
654 kputsn(ss
,ptr
-ss
,&str
);
660 hts_log_error("Unexpected number of values in FORMAT/%s at %s:%d; expected Number=G=%d(haploid), but found %d",
661 bcf_hdr_int2id(header
,BCF_DT_ID
,fmt
->id
), bcf_seqname(header
,line
), line
->pos
+1, nR_ori
, k_src
);
665 for (; l
<size
; l
++) kputc(0, &str
);
669 nret
= bcf_update_format(header
, line
, bcf_hdr_int2id(header
,BCF_DT_ID
,fmt
->id
), (void*)str
.s
, str
.l
, type
);
672 hts_log_error("Could not update FORMAT/%s at %s:%d [%d]",
673 bcf_hdr_int2id(header
,BCF_DT_ID
,fmt
->id
), bcf_seqname(header
,line
), line
->pos
+1, nret
);
679 int nori
= nret
/ line
->n_sample
;
680 if ( nori
==1 && !(vlen
==BCF_VL_A
&& nori
==nA_ori
) ) // all values may be missing - check
683 #define BRANCH(type_t, is_missing) { \
684 for (j=0; j<line->n_sample; j++) \
686 type_t *p = (type_t*) (fmt->p + j*fmt->size); \
687 if ( !(is_missing)) { all_missing = 0; break; } \
691 case BCF_BT_INT8
: BRANCH(int8_t, p
[0]==bcf_int8_missing
); break;
692 case BCF_BT_INT16
: BRANCH(int16_t, p
[0]==bcf_int16_missing
); break;
693 case BCF_BT_INT32
: BRANCH(int32_t, p
[0]==bcf_int32_missing
); break;
694 case BCF_BT_FLOAT
: BRANCH(float, bcf_float_is_missing(p
[0])); break;
695 default: hts_log_error("Unexpected type %d", fmt
->type
); goto err
; break;
698 if (all_missing
) continue; // could remove this FORMAT tag?
701 if ( vlen
==BCF_VL_A
|| vlen
==BCF_VL_R
|| (vlen
==BCF_VL_G
&& nori
==nR_ori
) ) // Number=A, R or haploid Number=G
704 if ( vlen
==BCF_VL_A
)
708 hts_log_error("Unexpected number of values in FORMAT/%s at %s:%d; expected Number=A=%d, but found %d",
709 bcf_hdr_int2id(header
,BCF_DT_ID
,fmt
->id
), bcf_seqname(header
,line
), line
->pos
+1, nA_ori
, nori
);
712 ndat
= nA_new
*line
->n_sample
;
720 hts_log_error("Unexpected number of values in FORMAT/%s at %s:%d; expected Number=R=%d, but found %d",
721 bcf_hdr_int2id(header
,BCF_DT_ID
,fmt
->id
), bcf_seqname(header
,line
), line
->pos
+1, nR_ori
, nori
);
724 ndat
= nR_new
*line
->n_sample
;
728 #define BRANCH(type_t,is_vector_end) \
730 for (j=0; j<line->n_sample; j++) \
732 type_t *ptr_src = ((type_t*)dat) + j*nori; \
733 type_t *ptr_dst = ((type_t*)dat) + j*nnew; \
734 int size = sizeof(type_t); \
735 int k_src, k_dst = 0; \
736 for (k_src=0; k_src<nori; k_src++) \
738 if ( is_vector_end ) { memcpy(ptr_dst+k_dst, ptr_src+k_src, size); break; } \
739 if ( kbs_exists(rm_set, k_src+inc) ) continue; \
740 memcpy(ptr_dst+k_dst, ptr_src+k_src, size); \
747 case BCF_HT_INT
: BRANCH(int32_t,ptr_src
[k_src
]==bcf_int32_vector_end
); break;
748 case BCF_HT_REAL
: BRANCH(float,bcf_float_is_vector_end(ptr_src
[k_src
])); break;
752 else // Number=G, diploid or mixture of haploid+diploid
756 hts_log_error("Unexpected number of values in FORMAT/%s at %s:%d; expected Number=G=%d, but found %d",
757 bcf_hdr_int2id(header
,BCF_DT_ID
,fmt
->id
), bcf_seqname(header
,line
), line
->pos
+1, nG_ori
, nori
);
760 ndat
= nG_new
*line
->n_sample
;
762 #define BRANCH(type_t,is_vector_end) \
764 for (j=0; j<line->n_sample; j++) \
766 type_t *ptr_src = ((type_t*)dat) + j*nori; \
767 type_t *ptr_dst = ((type_t*)dat) + j*nG_new; \
768 int size = sizeof(type_t); \
769 int ia, ib, k_dst = 0, k_src; \
770 int nset = 0; /* haploid or diploid? */ \
771 for (k_src=0; k_src<nG_ori; k_src++) { if ( is_vector_end ) break; nset++; } \
772 if ( nset==nR_ori ) /* haploid */ \
774 for (k_src=0; k_src<nR_ori; k_src++) \
776 if ( kbs_exists(rm_set, k_src) ) continue; \
777 memcpy(ptr_dst+k_dst, ptr_src+k_src, size); \
780 memcpy(ptr_dst+k_dst, ptr_src+k_src, size); \
785 for (ia=0; ia<nR_ori; ia++) \
787 for (ib=0; ib<=ia; ib++) \
790 if ( is_vector_end ) { memcpy(ptr_dst+k_dst, ptr_src+k_src, size); ia = nR_ori; break; } \
791 if ( kbs_exists(rm_set, ia) || kbs_exists(rm_set, ib) ) continue; \
792 memcpy(ptr_dst+k_dst, ptr_src+k_src, size); \
801 case BCF_HT_INT
: BRANCH(int32_t,ptr_src
[k_src
]==bcf_int32_vector_end
); break;
802 case BCF_HT_REAL
: BRANCH(float,bcf_float_is_vector_end(ptr_src
[k_src
])); break;
806 nret
= bcf_update_format(header
, line
, bcf_hdr_int2id(header
,BCF_DT_ID
,fmt
->id
), (void*)dat
, ndat
, type
);
809 hts_log_error("Could not update FORMAT/%s at %s:%d [%d]",
810 bcf_hdr_int2id(header
,BCF_DT_ID
,fmt
->id
), bcf_seqname(header
,line
), line
->pos
+1, nret
);