modified: diffout.py
[GalaxyCodeBases.git] / c_cpp / lib / htslib / vcfutils.c
blob4e1a6c978f140b1d75906473c84c97304e1701b3
1 /* vcfutils.c -- allele-related utility functions.
3 Copyright (C) 2012-2016 Genome Research Ltd.
5 Author: Petr Danecek <pd3@sanger.ac.uk>
7 Permission is hereby granted, free of charge, to any person obtaining a copy
8 of this software and associated documentation files (the "Software"), to deal
9 in the Software without restriction, including without limitation the rights
10 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11 copies of the Software, and to permit persons to whom the Software is
12 furnished to do so, subject to the following conditions:
14 The above copyright notice and this permission notice shall be included in
15 all copies or substantial portions of the Software.
17 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
22 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
23 DEALINGS IN THE SOFTWARE. */
25 #include <config.h>
27 #include "htslib/vcfutils.h"
28 #include "htslib/kbitset.h"
30 int bcf_calc_ac(const bcf_hdr_t *header, bcf1_t *line, int *ac, int which)
32 int i;
33 for (i=0; i<line->n_allele; i++) ac[i]=0;
35 // Use INFO/AC,AN field only when asked
36 if ( which&BCF_UN_INFO )
38 bcf_unpack(line, BCF_UN_INFO);
39 int an_id = bcf_hdr_id2int(header, BCF_DT_ID, "AN");
40 int ac_id = bcf_hdr_id2int(header, BCF_DT_ID, "AC");
41 int i, an=-1, ac_len=0, ac_type=0;
42 uint8_t *ac_ptr=NULL;
43 if ( an_id>=0 && ac_id>=0 )
45 for (i=0; i<line->n_info; i++)
47 bcf_info_t *z = &line->d.info[i];
48 if ( z->key == an_id ) an = z->v1.i;
49 else if ( z->key == ac_id ) { ac_ptr = z->vptr; ac_len = z->len; ac_type = z->type; }
52 if ( an>=0 && ac_ptr )
54 int nac = 0;
55 #define BRANCH_INT(type_t) { \
56 type_t *p = (type_t *) ac_ptr; \
57 for (i=0; i<ac_len; i++) \
58 { \
59 ac[i+1] = p[i]; \
60 nac += p[i]; \
61 } \
63 switch (ac_type) {
64 case BCF_BT_INT8: BRANCH_INT(int8_t); break;
65 case BCF_BT_INT16: BRANCH_INT(int16_t); break;
66 case BCF_BT_INT32: BRANCH_INT(int32_t); break;
67 default: hts_log_error("Unexpected type %d at %s:%d", ac_type, header->id[BCF_DT_CTG][line->rid].key, line->pos+1); exit(1); break;
69 #undef BRANCH_INT
70 if ( an<nac )
72 hts_log_error("Incorrect AN/AC counts at %s:%d", header->id[BCF_DT_CTG][line->rid].key, line->pos+1);
73 exit(1);
75 ac[0] = an - nac;
76 return 1;
80 // Split genotype fields only when asked
81 if ( which&BCF_UN_FMT )
83 int i, gt_id = bcf_hdr_id2int(header,BCF_DT_ID,"GT");
84 if ( gt_id<0 ) return 0;
85 bcf_unpack(line, BCF_UN_FMT);
86 bcf_fmt_t *fmt_gt = NULL;
87 for (i=0; i<(int)line->n_fmt; i++)
88 if ( line->d.fmt[i].id==gt_id ) { fmt_gt = &line->d.fmt[i]; break; }
89 if ( !fmt_gt ) return 0;
90 #define BRANCH_INT(type_t,vector_end) { \
91 for (i=0; i<line->n_sample; i++) \
92 { \
93 type_t *p = (type_t*) (fmt_gt->p + i*fmt_gt->size); \
94 int ial; \
95 for (ial=0; ial<fmt_gt->n; ial++) \
96 { \
97 if ( p[ial]==vector_end ) break; /* smaller ploidy */ \
98 if ( bcf_gt_is_missing(p[ial]) ) continue; /* missing allele */ \
99 if ( p[ial]>>1 > line->n_allele ) \
101 hts_log_error("Incorrect allele (\"%d\") in %s at %s:%d", (p[ial]>>1)-1, header->samples[i], header->id[BCF_DT_CTG][line->rid].key, line->pos+1); \
102 exit(1); \
104 ac[(p[ial]>>1)-1]++; \
108 switch (fmt_gt->type) {
109 case BCF_BT_INT8: BRANCH_INT(int8_t, bcf_int8_vector_end); break;
110 case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_vector_end); break;
111 case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_vector_end); break;
112 default: hts_log_error("Unexpected type %d at %s:%d", fmt_gt->type, header->id[BCF_DT_CTG][line->rid].key, line->pos+1); exit(1); break;
114 #undef BRANCH_INT
115 return 1;
117 return 0;
120 int bcf_gt_type(bcf_fmt_t *fmt_ptr, int isample, int *_ial, int *_jal)
122 int i, nals = 0, has_ref = 0, has_alt = 0, ial = 0, jal = 0;
123 #define BRANCH_INT(type_t,vector_end) { \
124 type_t *p = (type_t*) (fmt_ptr->p + isample*fmt_ptr->size); \
125 for (i=0; i<fmt_ptr->n; i++) \
127 if ( p[i] == vector_end ) break; /* smaller ploidy */ \
128 if ( bcf_gt_is_missing(p[i]) ) return GT_UNKN; /* missing allele */ \
129 int tmp = p[i]>>1; \
130 if ( tmp>1 ) \
132 if ( !ial ) { ial = tmp; has_alt = 1; } \
133 else if ( tmp!=ial ) \
135 if ( tmp<ial ) \
137 jal = ial; \
138 ial = tmp; \
140 else \
142 jal = tmp; \
144 has_alt = 2; \
147 else has_ref = 1; \
148 nals++; \
151 switch (fmt_ptr->type) {
152 case BCF_BT_INT8: BRANCH_INT(int8_t, bcf_int8_vector_end); break;
153 case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_vector_end); break;
154 case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_vector_end); break;
155 default: hts_log_error("Unexpected type %d", fmt_ptr->type); exit(1); break;
157 #undef BRANCH_INT
159 if ( _ial ) *_ial = ial>0 ? ial-1 : ial;
160 if ( _jal ) *_jal = jal>0 ? jal-1 : jal;
161 if ( !nals ) return GT_UNKN;
162 if ( nals==1 )
163 return has_ref ? GT_HAPL_R : GT_HAPL_A;
164 if ( !has_ref )
165 return has_alt==1 ? GT_HOM_AA : GT_HET_AA;
166 if ( !has_alt )
167 return GT_HOM_RR;
168 return GT_HET_RA;
171 int bcf_trim_alleles(const bcf_hdr_t *header, bcf1_t *line)
173 int i, ret = 0, nrm = 0;
174 kbitset_t *rm_set = NULL;
175 bcf_fmt_t *gt = bcf_get_fmt(header, line, "GT");
176 if ( !gt ) return 0;
178 int *ac = (int*) calloc(line->n_allele,sizeof(int));
180 // check if all alleles are populated
181 #define BRANCH(type_t,vector_end) { \
182 for (i=0; i<line->n_sample; i++) \
184 type_t *p = (type_t*) (gt->p + i*gt->size); \
185 int ial; \
186 for (ial=0; ial<gt->n; ial++) \
188 if ( p[ial]==vector_end ) break; /* smaller ploidy */ \
189 if ( bcf_gt_is_missing(p[ial]) ) continue; /* missing allele */ \
190 if ( (p[ial]>>1)-1 >= line->n_allele ) { \
191 hts_log_error("Allele index is out of bounds at %s:%d", header->id[BCF_DT_CTG][line->rid].key, line->pos+1); \
192 ret = -1; \
193 goto clean; \
195 ac[(p[ial]>>1)-1]++; \
199 switch (gt->type) {
200 case BCF_BT_INT8: BRANCH(int8_t, bcf_int8_vector_end); break;
201 case BCF_BT_INT16: BRANCH(int16_t, bcf_int16_vector_end); break;
202 case BCF_BT_INT32: BRANCH(int32_t, bcf_int32_vector_end); break;
203 default: hts_log_error("Unexpected GT %d at %s:%d",
204 gt->type, header->id[BCF_DT_CTG][line->rid].key, line->pos + 1);
205 goto clean; break;
207 #undef BRANCH
209 rm_set = kbs_init(line->n_allele);
210 for (i=1; i<line->n_allele; i++) {
211 if ( !ac[i] ) { kbs_insert(rm_set, i); nrm++; }
214 if (nrm) {
215 if (bcf_remove_allele_set(header, line, rm_set))
216 ret = -2;
219 clean:
220 free(ac);
221 if (rm_set) kbs_destroy(rm_set);
222 return ret ? ret : nrm;
225 void bcf_remove_alleles(const bcf_hdr_t *header, bcf1_t *line, int rm_mask)
227 int i;
228 kbitset_t *rm_set = kbs_init(line->n_allele);
229 for (i=1; i<line->n_allele; i++)
230 if ( rm_mask & 1<<i ) kbs_insert(rm_set, i);
232 bcf_remove_allele_set(header, line, rm_set);
233 kbs_destroy(rm_set);
236 int bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, const struct kbitset_t *rm_set)
238 int *map = (int*) calloc(line->n_allele, sizeof(int));
239 uint8_t *dat = NULL;
241 // create map of indexes from old to new ALT numbering and modify ALT
242 kstring_t str = {0,0,0};
243 kputs(line->d.allele[0], &str);
245 int nrm = 0, i,j; // i: ori alleles, j: new alleles
246 for (i=1, j=1; i<line->n_allele; i++)
248 if ( kbs_exists(rm_set, i) )
250 // remove this allele
251 line->d.allele[i] = NULL;
252 nrm++;
253 continue;
255 kputc(',', &str);
256 kputs(line->d.allele[i], &str);
257 map[i] = j;
258 j++;
260 if ( !nrm ) goto clean;
262 int nR_ori = line->n_allele;
263 int nR_new = line->n_allele-nrm;
264 if ( nR_new<=0 ) // should not be able to remove reference allele
266 hts_log_error("Cannot remove reference allele at %s:%d [%d]",
267 bcf_seqname(header,line), line->pos+1, nR_new);
268 goto err;
270 int nA_ori = nR_ori-1;
271 int nA_new = nR_new-1;
273 int nG_ori = nR_ori*(nR_ori + 1)/2;
274 int nG_new = nR_new*(nR_new + 1)/2;
276 bcf_update_alleles_str(header, line, str.s);
278 // remove from Number=G, Number=R and Number=A INFO fields.
279 int mdat = 0, ndat = 0, mdat_bytes = 0, nret;
280 for (i=0; i<line->n_info; i++)
282 bcf_info_t *info = &line->d.info[i];
283 int vlen = bcf_hdr_id2length(header,BCF_HL_INFO,info->key);
285 if ( vlen!=BCF_VL_A && vlen!=BCF_VL_G && vlen!=BCF_VL_R ) continue; // no need to change
287 int type = bcf_hdr_id2type(header,BCF_HL_INFO,info->key);
288 if ( type==BCF_HT_FLAG ) continue;
289 int size = 1;
290 if ( type==BCF_HT_REAL || type==BCF_HT_INT ) size = 4;
292 mdat = mdat_bytes / size;
293 nret = bcf_get_info_values(header, line, bcf_hdr_int2id(header,BCF_DT_ID,info->key), (void**)&dat, &mdat, type);
294 mdat_bytes = mdat * size;
295 if ( nret<0 )
297 hts_log_error("Could not access INFO/%s at %s:%d [%d]",
298 bcf_hdr_int2id(header,BCF_DT_ID,info->key), bcf_seqname(header,line), line->pos+1, nret);
299 goto err;
301 if ( nret==0 ) continue; // no data for this tag
303 if ( type==BCF_HT_STR )
305 str.l = 0;
306 char *ss = (char*) dat, *se = (char*) dat, s = ss[0];
307 if ( vlen==BCF_VL_A || vlen==BCF_VL_R )
309 int nexp, inc = 0;
310 if ( vlen==BCF_VL_A )
312 nexp = nA_ori;
313 inc = 1;
315 else
316 nexp = nR_ori;
317 for (j=0; j<nexp; j++)
319 if ( !*se ) break;
320 while ( *se && *se!=',' ) se++;
321 if ( kbs_exists(rm_set, j+inc) )
323 if ( *se ) se++;
324 ss = se;
325 continue;
327 if ( str.l ) kputc(',',&str);
328 kputsn(ss,se-ss,&str);
329 if ( *se ) se++;
330 ss = se;
332 if ( j==1 && s == '.' ) continue; // missing
333 if ( j!=nexp )
335 hts_log_error("Unexpected number of values in INFO/%s at %s:%d; expected Number=%c=%d, but found %d",
336 bcf_hdr_int2id(header,BCF_DT_ID,info->key), bcf_seqname(header,line), line->pos+1, vlen==BCF_VL_A ? 'A' : 'R', nexp, j);
337 goto err;
340 else // Number=G, assuming diploid genotype
342 int k = 0, n = 0;
343 for (j=0; j<nR_ori; j++)
345 for (k=0; k<=j; k++)
347 if ( !*se ) break;
348 while ( *se && *se!=',' ) se++;
349 n++;
350 if ( kbs_exists(rm_set, j) || kbs_exists(rm_set, k) )
352 if ( *se ) se++;
353 ss = se;
354 continue;
356 if ( str.l ) kputc(',',&str);
357 kputsn(ss,se-ss,&str);
358 if ( *se ) se++;
359 ss = se;
361 if ( !*se ) break;
363 if ( n==1 && s == '.' ) continue; // missing
364 if ( n!=nG_ori )
366 hts_log_error("Unexpected number of values in INFO/%s at %s:%d; expected Number=G=%d, but found %d",
367 bcf_hdr_int2id(header,BCF_DT_ID,info->key), bcf_seqname(header,line), line->pos+1, nG_ori, n);
368 goto err;
372 nret = bcf_update_info(header, line, bcf_hdr_int2id(header,BCF_DT_ID,info->key), (void*)str.s, str.l, type);
373 if ( nret<0 )
375 hts_log_error("Could not update INFO/%s at %s:%d [%d]",
376 bcf_hdr_int2id(header,BCF_DT_ID,info->key), bcf_seqname(header,line), line->pos+1, nret);
377 goto err;
379 continue;
382 if (nret==1) // could be missing - check
384 int missing = 0;
385 #define BRANCH(type_t, is_missing) { \
386 type_t *p = (type_t *) info->vptr; \
387 if ( is_missing ) missing = 1; \
389 switch (info->type) {
390 case BCF_BT_INT8: BRANCH(int8_t, p[0]==bcf_int8_missing); break;
391 case BCF_BT_INT16: BRANCH(int16_t, p[0]==bcf_int16_missing); break;
392 case BCF_BT_INT32: BRANCH(int32_t, p[0]==bcf_int32_missing); break;
393 case BCF_BT_FLOAT: BRANCH(float, bcf_float_is_missing(p[0])); break;
394 default: hts_log_error("Unexpected type %d", info->type); goto err; break;
396 #undef BRANCH
397 if (missing) continue; // could remove this INFO tag?
400 if ( vlen==BCF_VL_A || vlen==BCF_VL_R )
402 int inc = 0, ntop;
403 if ( vlen==BCF_VL_A )
405 if ( nret!=nA_ori )
407 hts_log_error("Unexpected number of values in INFO/%s at %s:%d; expected Number=A=%d, but found %d",
408 bcf_hdr_int2id(header,BCF_DT_ID,info->key), bcf_seqname(header,line), line->pos+1, nA_ori, nret);
409 goto err;
411 ntop = nA_ori;
412 ndat = nA_new;
413 inc = 1;
415 else
417 if ( nret!=nR_ori )
419 hts_log_error("Unexpected number of values in INFO/%s at %s:%d; expected Number=R=%d, but found %d",
420 bcf_hdr_int2id(header,BCF_DT_ID,info->key), bcf_seqname(header,line), line->pos+1, nR_ori, nret);
421 goto err;
423 ntop = nR_ori;
424 ndat = nR_new;
426 int k = 0;
428 #define BRANCH(type_t,is_vector_end) \
430 type_t *ptr = (type_t*) dat; \
431 int size = sizeof(type_t); \
432 for (j=0; j<ntop; j++) /* j:ori, k:new */ \
434 if ( is_vector_end ) { memcpy(dat+k*size, dat+j*size, size); break; } \
435 if ( kbs_exists(rm_set, j+inc) ) continue; \
436 if ( j!=k ) memcpy(dat+k*size, dat+j*size, size); \
437 k++; \
440 switch (type)
442 case BCF_HT_INT: BRANCH(int32_t,ptr[j]==bcf_int32_vector_end); break;
443 case BCF_HT_REAL: BRANCH(float,bcf_float_is_vector_end(ptr[j])); break;
445 #undef BRANCH
447 else // Number=G
449 if ( nret!=nG_ori )
451 hts_log_error("Unexpected number of values in INFO/%s at %s:%d; expected Number=R=%d, but found %d",
452 bcf_hdr_int2id(header,BCF_DT_ID,info->key), bcf_seqname(header,line), line->pos+1, nG_ori, nret);
453 goto err;
455 int k, l_ori = -1, l_new = 0;
456 ndat = nG_new;
458 #define BRANCH(type_t,is_vector_end) \
460 type_t *ptr = (type_t*) dat; \
461 int size = sizeof(type_t); \
462 for (j=0; j<nR_ori; j++) \
464 for (k=0; k<=j; k++) \
466 l_ori++; \
467 if ( is_vector_end ) { memcpy(dat+l_new*size, dat+l_ori*size, size); break; } \
468 if ( kbs_exists(rm_set, j) || kbs_exists(rm_set, k) ) continue; \
469 if ( l_ori!=l_new ) memcpy(dat+l_new*size, dat+l_ori*size, size); \
470 l_new++; \
474 switch (type)
476 case BCF_HT_INT: BRANCH(int32_t,ptr[l_ori]==bcf_int32_vector_end); break;
477 case BCF_HT_REAL: BRANCH(float,bcf_float_is_vector_end(ptr[l_ori])); break;
479 #undef BRANCH
482 nret = bcf_update_info(header, line, bcf_hdr_int2id(header,BCF_DT_ID,info->key), (void*)dat, ndat, type);
483 if ( nret<0 )
485 hts_log_error("Could not update INFO/%s at %s:%d [%d]",
486 bcf_hdr_int2id(header,BCF_DT_ID,info->key), bcf_seqname(header,line), line->pos+1, nret);
487 goto err;
491 // Update GT fields, the allele indexes might have changed
492 for (i=1; i<line->n_allele; i++) if ( map[i]!=i ) break;
493 if ( i<line->n_allele )
495 mdat = mdat_bytes / 4; // sizeof(int32_t)
496 nret = bcf_get_genotypes(header,line,(void**)&dat,&mdat);
497 mdat_bytes = mdat * 4;
498 if ( nret>0 )
500 nret /= line->n_sample;
501 int32_t *ptr = (int32_t*) dat;
502 for (i=0; i<line->n_sample; i++)
504 for (j=0; j<nret; j++)
506 if ( bcf_gt_is_missing(ptr[j]) ) continue;
507 if ( ptr[j]==bcf_int32_vector_end ) break;
508 int al = bcf_gt_allele(ptr[j]);
509 if ( !( al<nR_ori && map[al]>=0 ) )
511 hts_log_error("Problem updating genotypes at %s:%d [ al<nR_ori && map[al]>=0 :: al=%d,nR_ori=%d,map[al]=%d ]",
512 bcf_seqname(header,line), line->pos+1, al, nR_ori, map[al]);
513 goto err;
515 ptr[j] = (map[al]+1)<<1 | (ptr[j]&1);
517 ptr += nret;
519 nret = bcf_update_genotypes(header, line, (void*)dat, nret*line->n_sample);
520 if ( nret<0 )
522 hts_log_error("Could not update FORMAT/GT at %s:%d [%d]",
523 bcf_seqname(header,line), line->pos+1, nret);
524 goto err;
529 // Remove from Number=G, Number=R and Number=A FORMAT fields.
530 // Assuming haploid or diploid GTs
531 for (i=0; i<line->n_fmt; i++)
533 bcf_fmt_t *fmt = &line->d.fmt[i];
534 int vlen = bcf_hdr_id2length(header,BCF_HL_FMT,fmt->id);
536 if ( vlen!=BCF_VL_A && vlen!=BCF_VL_G && vlen!=BCF_VL_R ) continue; // no need to change
538 int type = bcf_hdr_id2type(header,BCF_HL_FMT,fmt->id);
539 if ( type==BCF_HT_FLAG ) continue;
541 int size = 1;
542 if ( type==BCF_HT_REAL || type==BCF_HT_INT ) size = 4;
544 mdat = mdat_bytes / size;
545 nret = bcf_get_format_values(header, line, bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), (void**)&dat, &mdat, type);
546 mdat_bytes = mdat * size;
547 if ( nret<0 )
549 hts_log_error("Could not access FORMAT/%s at %s:%d [%d]",
550 bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), bcf_seqname(header,line), line->pos+1, nret);
551 goto err;
553 if ( nret == 0 ) continue; // no data for this tag
555 if ( type==BCF_HT_STR )
557 int size = nret/line->n_sample; // number of bytes per sample
558 str.l = 0;
559 if ( vlen==BCF_VL_A || vlen==BCF_VL_R )
561 int nexp, inc = 0;
562 if ( vlen==BCF_VL_A )
564 nexp = nA_ori;
565 inc = 1;
567 else
568 nexp = nR_ori;
569 for (j=0; j<line->n_sample; j++)
571 char *ss = ((char*)dat) + j*size, *se = ss + size, *ptr = ss, s = ss[0];
572 int k_src = 0, k_dst = 0, l = str.l;
573 for (k_src=0; k_src<nexp; k_src++)
575 if ( ptr>=se || !*ptr) break;
576 while ( ptr<se && *ptr && *ptr!=',' ) ptr++;
577 if ( kbs_exists(rm_set, k_src+inc) )
579 ss = ++ptr;
580 continue;
582 if ( k_dst ) kputc(',',&str);
583 kputsn(ss,ptr-ss,&str);
584 ss = ++ptr;
585 k_dst++;
587 if ( k_src==1 && s == '.' ) continue; // missing
588 if ( k_src!=nexp )
590 hts_log_error("Unexpected number of values in FORMAT/%s at %s:%d; expected Number=%c=%d, but found %d",
591 bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), bcf_seqname(header,line), line->pos+1, vlen==BCF_VL_A ? 'A' : 'R', nexp, k_src);
592 goto err;
594 l = str.l - l;
595 for (; l<size; l++) kputc(0, &str);
598 else // Number=G, diploid or haploid
600 for (j=0; j<line->n_sample; j++)
602 char *ss = ((char*)dat) + j*size, *se = ss + size, *ptr = ss, s = ss[0];
603 int k_src = 0, k_dst = 0, l = str.l;
604 int nexp = 0; // diploid or haploid?
605 while ( ptr<se )
607 if ( !*ptr ) break;
608 if ( *ptr==',' ) nexp++;
609 ptr++;
611 if ( ptr!=ss ) nexp++;
612 if ( nexp==1 && s == '.' ) continue; // missing
613 if ( nexp!=nG_ori && nexp!=nR_ori )
615 hts_log_error("Unexpected number of values in FORMAT/%s at %s:%d; expected Number=G=%d(diploid) or %d(haploid), but found %d",
616 bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), bcf_seqname(header,line), line->pos+1, nG_ori, nR_ori, nexp);
617 goto err;
619 ptr = ss;
620 if ( nexp==nG_ori ) // diploid
622 int ia, ib;
623 for (ia=0; ia<nR_ori; ia++)
625 for (ib=0; ib<=ia; ib++)
627 if ( ptr>=se || !*ptr ) break;
628 while ( ptr<se && *ptr && *ptr!=',' ) ptr++;
629 if ( kbs_exists(rm_set, ia) || kbs_exists(rm_set, ib) )
631 ss = ++ptr;
632 continue;
634 if ( k_dst ) kputc(',',&str);
635 kputsn(ss,ptr-ss,&str);
636 ss = ++ptr;
637 k_dst++;
639 if ( ptr>=se || !*ptr ) break;
642 else // haploid
644 for (k_src=0; k_src<nR_ori; k_src++)
646 if ( ptr>=se || !*ptr ) break;
647 while ( ptr<se && *ptr && *ptr!=',' ) ptr++;
648 if ( kbs_exists(rm_set, k_src) )
650 ss = ++ptr;
651 continue;
653 if ( k_dst ) kputc(',',&str);
654 kputsn(ss,ptr-ss,&str);
655 ss = ++ptr;
656 k_dst++;
658 if ( k_src!=nR_ori )
660 hts_log_error("Unexpected number of values in FORMAT/%s at %s:%d; expected Number=G=%d(haploid), but found %d",
661 bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), bcf_seqname(header,line), line->pos+1, nR_ori, k_src);
662 goto err;
664 l = str.l - l;
665 for (; l<size; l++) kputc(0, &str);
669 nret = bcf_update_format(header, line, bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), (void*)str.s, str.l, type);
670 if ( nret<0 )
672 hts_log_error("Could not update FORMAT/%s at %s:%d [%d]",
673 bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), bcf_seqname(header,line), line->pos+1, nret);
674 goto err;
676 continue;
679 int nori = nret / line->n_sample;
680 if ( nori==1 && !(vlen==BCF_VL_A && nori==nA_ori) ) // all values may be missing - check
682 int all_missing = 1;
683 #define BRANCH(type_t, is_missing) { \
684 for (j=0; j<line->n_sample; j++) \
686 type_t *p = (type_t*) (fmt->p + j*fmt->size); \
687 if ( !(is_missing)) { all_missing = 0; break; } \
690 switch (fmt->type) {
691 case BCF_BT_INT8: BRANCH(int8_t, p[0]==bcf_int8_missing); break;
692 case BCF_BT_INT16: BRANCH(int16_t, p[0]==bcf_int16_missing); break;
693 case BCF_BT_INT32: BRANCH(int32_t, p[0]==bcf_int32_missing); break;
694 case BCF_BT_FLOAT: BRANCH(float, bcf_float_is_missing(p[0])); break;
695 default: hts_log_error("Unexpected type %d", fmt->type); goto err; break;
697 #undef BRANCH
698 if (all_missing) continue; // could remove this FORMAT tag?
701 if ( vlen==BCF_VL_A || vlen==BCF_VL_R || (vlen==BCF_VL_G && nori==nR_ori) ) // Number=A, R or haploid Number=G
703 int inc = 0, nnew;
704 if ( vlen==BCF_VL_A )
706 if ( nori!=nA_ori )
708 hts_log_error("Unexpected number of values in FORMAT/%s at %s:%d; expected Number=A=%d, but found %d",
709 bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), bcf_seqname(header,line), line->pos+1, nA_ori, nori);
710 goto err;
712 ndat = nA_new*line->n_sample;
713 nnew = nA_new;
714 inc = 1;
716 else
718 if ( nori!=nR_ori )
720 hts_log_error("Unexpected number of values in FORMAT/%s at %s:%d; expected Number=R=%d, but found %d",
721 bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), bcf_seqname(header,line), line->pos+1, nR_ori, nori);
722 goto err;
724 ndat = nR_new*line->n_sample;
725 nnew = nR_new;
728 #define BRANCH(type_t,is_vector_end) \
730 for (j=0; j<line->n_sample; j++) \
732 type_t *ptr_src = ((type_t*)dat) + j*nori; \
733 type_t *ptr_dst = ((type_t*)dat) + j*nnew; \
734 int size = sizeof(type_t); \
735 int k_src, k_dst = 0; \
736 for (k_src=0; k_src<nori; k_src++) \
738 if ( is_vector_end ) { memcpy(ptr_dst+k_dst, ptr_src+k_src, size); break; } \
739 if ( kbs_exists(rm_set, k_src+inc) ) continue; \
740 memcpy(ptr_dst+k_dst, ptr_src+k_src, size); \
741 k_dst++; \
745 switch (type)
747 case BCF_HT_INT: BRANCH(int32_t,ptr_src[k_src]==bcf_int32_vector_end); break;
748 case BCF_HT_REAL: BRANCH(float,bcf_float_is_vector_end(ptr_src[k_src])); break;
750 #undef BRANCH
752 else // Number=G, diploid or mixture of haploid+diploid
754 if ( nori!=nG_ori )
756 hts_log_error("Unexpected number of values in FORMAT/%s at %s:%d; expected Number=G=%d, but found %d",
757 bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), bcf_seqname(header,line), line->pos+1, nG_ori, nori);
758 goto err;
760 ndat = nG_new*line->n_sample;
762 #define BRANCH(type_t,is_vector_end) \
764 for (j=0; j<line->n_sample; j++) \
766 type_t *ptr_src = ((type_t*)dat) + j*nori; \
767 type_t *ptr_dst = ((type_t*)dat) + j*nG_new; \
768 int size = sizeof(type_t); \
769 int ia, ib, k_dst = 0, k_src; \
770 int nset = 0; /* haploid or diploid? */ \
771 for (k_src=0; k_src<nG_ori; k_src++) { if ( is_vector_end ) break; nset++; } \
772 if ( nset==nR_ori ) /* haploid */ \
774 for (k_src=0; k_src<nR_ori; k_src++) \
776 if ( kbs_exists(rm_set, k_src) ) continue; \
777 memcpy(ptr_dst+k_dst, ptr_src+k_src, size); \
778 k_dst++; \
780 memcpy(ptr_dst+k_dst, ptr_src+k_src, size); \
782 else /* diploid */ \
784 k_src = -1; \
785 for (ia=0; ia<nR_ori; ia++) \
787 for (ib=0; ib<=ia; ib++) \
789 k_src++; \
790 if ( is_vector_end ) { memcpy(ptr_dst+k_dst, ptr_src+k_src, size); ia = nR_ori; break; } \
791 if ( kbs_exists(rm_set, ia) || kbs_exists(rm_set, ib) ) continue; \
792 memcpy(ptr_dst+k_dst, ptr_src+k_src, size); \
793 k_dst++; \
799 switch (type)
801 case BCF_HT_INT: BRANCH(int32_t,ptr_src[k_src]==bcf_int32_vector_end); break;
802 case BCF_HT_REAL: BRANCH(float,bcf_float_is_vector_end(ptr_src[k_src])); break;
804 #undef BRANCH
806 nret = bcf_update_format(header, line, bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), (void*)dat, ndat, type);
807 if ( nret<0 )
809 hts_log_error("Could not update FORMAT/%s at %s:%d [%d]",
810 bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), bcf_seqname(header,line), line->pos+1, nret);
811 goto err;
815 clean:
816 free(str.s);
817 free(map);
818 free(dat);
819 return 0;
821 err:
822 free(str.s);
823 free(map);
824 free(dat);
825 return -1;