1 /* vcf.c -- VCF/BCF API functions.
3 Copyright (C) 2012, 2013 Broad Institute.
4 Copyright (C) 2012-2017 Genome Research Ltd.
5 Portions copyright (C) 2014 Intel Corporation.
7 Author: Heng Li <lh3@sanger.ac.uk>
9 Permission is hereby granted, free of charge, to any person obtaining a copy
10 of this software and associated documentation files (the "Software"), to deal
11 in the Software without restriction, including without limitation the rights
12 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
13 copies of the Software, and to permit persons to whom the Software is
14 furnished to do so, subject to the following conditions:
16 The above copyright notice and this permission notice shall be included in
17 all copies or substantial portions of the Software.
19 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
22 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
25 DEALINGS IN THE SOFTWARE. */
38 #include "htslib/vcf.h"
39 #include "htslib/bgzf.h"
40 #include "htslib/tbx.h"
41 #include "htslib/hfile.h"
42 #include "hts_internal.h"
43 #include "htslib/hts_endian.h"
44 #include "htslib/khash_str2int.h"
45 #include "htslib/kstring.h"
47 #include "htslib/khash.h"
48 KHASH_MAP_INIT_STR(vdict
, bcf_idinfo_t
)
49 typedef khash_t(vdict
) vdict_t
;
51 #include "htslib/kseq.h"
53 uint32_t bcf_float_missing
= 0x7F800001;
54 uint32_t bcf_float_vector_end
= 0x7F800002;
55 uint8_t bcf_type_shift
[] = { 0, 0, 1, 2, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
56 static bcf_idinfo_t bcf_idinfo_def
= { .info
= { 15, 15, 15 }, .hrec
= { NULL
, NULL
, NULL
}, .id
= -1 };
58 static const char *dump_char(char *buffer
, char c
)
61 case '\n': strcpy(buffer
, "\\n"); break;
62 case '\r': strcpy(buffer
, "\\r"); break;
63 case '\t': strcpy(buffer
, "\\t"); break;
67 sprintf(buffer
, "\\%c", c
);
70 if (isprint_c(c
)) sprintf(buffer
, "%c", c
);
71 else sprintf(buffer
, "\\x%02X", (unsigned char) c
);
77 static char *find_chrom_header_line(char *s
)
80 if (strncmp(s
, "#CHROM\t", 7) == 0) return s
;
81 else if ((nl
= strstr(s
, "\n#CHROM\t")) != NULL
) return nl
+1;
85 /*************************
86 *** VCF header parser ***
87 *************************/
89 int bcf_hdr_add_sample(bcf_hdr_t
*h
, const char *s
)
94 while ( !*ss
&& isspace_c(*ss
) ) ss
++;
97 hts_log_error("Empty sample name: trailing spaces/tabs in the header line?");
101 vdict_t
*d
= (vdict_t
*)h
->dict
[BCF_DT_SAMPLE
];
103 char *sdup
= strdup(s
);
104 int k
= kh_put(vdict
, d
, sdup
, &ret
);
106 kh_val(d
, k
) = bcf_idinfo_def
;
107 kh_val(d
, k
).id
= kh_size(d
) - 1;
109 hts_log_error("Duplicated sample name '%s'", s
);
114 h
->samples
= (char**) realloc(h
->samples
,sizeof(char*)*n
);
115 h
->samples
[n
-1] = sdup
;
120 int bcf_hdr_parse_sample_line(bcf_hdr_t
*h
, const char *str
)
126 for (p
= q
= str
;; ++q
) {
127 if (*q
!= '\t' && *q
!= 0 && *q
!= '\n') continue;
129 char *s
= (char*)malloc(q
- p
+ 1);
130 strncpy(s
, p
, q
- p
);
132 if ( bcf_hdr_add_sample(h
,s
) < 0 ) ret
= -1;
135 if (*q
== 0 || *q
== '\n') break;
138 bcf_hdr_add_sample(h
,NULL
);
142 int bcf_hdr_sync(bcf_hdr_t
*h
)
145 for (i
= 0; i
< 3; i
++)
147 vdict_t
*d
= (vdict_t
*)h
->dict
[i
];
149 if ( h
->n
[i
] < kh_size(d
) )
151 // this should be true only for i=2, BCF_DT_SAMPLE
152 h
->n
[i
] = kh_size(d
);
153 h
->id
[i
] = (bcf_idpair_t
*) realloc(h
->id
[i
], kh_size(d
)*sizeof(bcf_idpair_t
));
155 for (k
=kh_begin(d
); k
<kh_end(d
); k
++)
157 if (!kh_exist(d
,k
)) continue;
158 h
->id
[i
][kh_val(d
,k
).id
].key
= kh_key(d
,k
);
159 h
->id
[i
][kh_val(d
,k
).id
].val
= &kh_val(d
,k
);
166 void bcf_hrec_destroy(bcf_hrec_t
*hrec
)
169 if ( hrec
->value
) free(hrec
->value
);
171 for (i
=0; i
<hrec
->nkeys
; i
++)
181 // Copies all fields except IDX.
182 bcf_hrec_t
*bcf_hrec_dup(bcf_hrec_t
*hrec
)
184 bcf_hrec_t
*out
= (bcf_hrec_t
*) calloc(1,sizeof(bcf_hrec_t
));
185 out
->type
= hrec
->type
;
186 if ( hrec
->key
) out
->key
= strdup(hrec
->key
);
187 if ( hrec
->value
) out
->value
= strdup(hrec
->value
);
188 out
->nkeys
= hrec
->nkeys
;
189 out
->keys
= (char**) malloc(sizeof(char*)*hrec
->nkeys
);
190 out
->vals
= (char**) malloc(sizeof(char*)*hrec
->nkeys
);
192 for (i
=0; i
<hrec
->nkeys
; i
++)
194 if ( hrec
->keys
[i
] && !strcmp("IDX",hrec
->keys
[i
]) ) continue;
195 if ( hrec
->keys
[i
] ) out
->keys
[j
] = strdup(hrec
->keys
[i
]);
196 if ( hrec
->vals
[i
] ) out
->vals
[j
] = strdup(hrec
->vals
[i
]);
199 if ( i
!=j
) out
->nkeys
-= i
-j
; // IDX was omitted
203 void bcf_hrec_debug(FILE *fp
, bcf_hrec_t
*hrec
)
205 fprintf(fp
, "key=[%s] value=[%s]", hrec
->key
, hrec
->value
?hrec
->value
:"");
207 for (i
=0; i
<hrec
->nkeys
; i
++)
208 fprintf(fp
, "\t[%s]=[%s]", hrec
->keys
[i
],hrec
->vals
[i
]);
212 void bcf_header_debug(bcf_hdr_t
*hdr
)
215 for (i
=0; i
<hdr
->nhrec
; i
++)
217 if ( !hdr
->hrec
[i
]->value
)
219 fprintf(stderr
, "##%s=<", hdr
->hrec
[i
]->key
);
220 fprintf(stderr
,"%s=%s", hdr
->hrec
[i
]->keys
[0], hdr
->hrec
[i
]->vals
[0]);
221 for (j
=1; j
<hdr
->hrec
[i
]->nkeys
; j
++)
222 fprintf(stderr
,",%s=%s", hdr
->hrec
[i
]->keys
[j
], hdr
->hrec
[i
]->vals
[j
]);
223 fprintf(stderr
,">\n");
226 fprintf(stderr
,"##%s=%s\n", hdr
->hrec
[i
]->key
,hdr
->hrec
[i
]->value
);
230 void bcf_hrec_add_key(bcf_hrec_t
*hrec
, const char *str
, int len
)
232 int n
= ++hrec
->nkeys
;
233 hrec
->keys
= (char**) realloc(hrec
->keys
, sizeof(char*)*n
);
234 hrec
->vals
= (char**) realloc(hrec
->vals
, sizeof(char*)*n
);
236 hrec
->keys
[n
-1] = (char*) malloc((len
+1)*sizeof(char));
237 memcpy(hrec
->keys
[n
-1],str
,len
);
238 hrec
->keys
[n
-1][len
] = 0;
239 hrec
->vals
[n
-1] = NULL
;
242 void bcf_hrec_set_val(bcf_hrec_t
*hrec
, int i
, const char *str
, int len
, int is_quoted
)
244 if ( !str
) { hrec
->vals
[i
] = NULL
; return; }
245 if ( hrec
->vals
[i
] ) free(hrec
->vals
[i
]);
248 hrec
->vals
[i
] = (char*) malloc((len
+3)*sizeof(char));
249 hrec
->vals
[i
][0] = '"';
250 memcpy(&hrec
->vals
[i
][1],str
,len
);
251 hrec
->vals
[i
][len
+1] = '"';
252 hrec
->vals
[i
][len
+2] = 0;
256 hrec
->vals
[i
] = (char*) malloc((len
+1)*sizeof(char));
257 memcpy(hrec
->vals
[i
],str
,len
);
258 hrec
->vals
[i
][len
] = 0;
262 void hrec_add_idx(bcf_hrec_t
*hrec
, int idx
)
264 int n
= ++hrec
->nkeys
;
265 hrec
->keys
= (char**) realloc(hrec
->keys
, sizeof(char*)*n
);
266 hrec
->vals
= (char**) realloc(hrec
->vals
, sizeof(char*)*n
);
267 hrec
->keys
[n
-1] = strdup("IDX");
268 kstring_t str
= {0,0,0};
270 hrec
->vals
[n
-1] = str
.s
;
273 int bcf_hrec_find_key(bcf_hrec_t
*hrec
, const char *key
)
276 for (i
=0; i
<hrec
->nkeys
; i
++)
277 if ( !strcasecmp(key
,hrec
->keys
[i
]) ) return i
;
281 static inline int is_escaped(const char *min
, const char *str
)
284 while ( --str
>=min
&& *str
=='\\' ) n
++;
288 bcf_hrec_t
*bcf_hdr_parse_line(const bcf_hdr_t
*h
, const char *line
, int *len
)
290 const char *p
= line
;
291 if (p
[0] != '#' || p
[1] != '#') { *len
= 0; return NULL
; }
295 while ( *q
&& *q
!='=' && *q
!= '\n' ) q
++;
297 if ( *q
!='=' || !n
) { *len
= q
-line
+1; return NULL
; } // wrong format
299 bcf_hrec_t
*hrec
= (bcf_hrec_t
*) calloc(1,sizeof(bcf_hrec_t
));
300 hrec
->key
= (char*) malloc(sizeof(char)*(n
+1));
301 memcpy(hrec
->key
,p
,n
);
305 if ( *p
!='<' ) // generic field, e.g. ##samtoolsVersion=0.1.18-r579
307 while ( *q
&& *q
!='\n' ) q
++;
308 hrec
->value
= (char*) malloc((q
-p
+1)*sizeof(char));
309 memcpy(hrec
->value
, p
, q
-p
);
310 hrec
->value
[q
-p
] = 0;
311 *len
= q
- line
+ (*q
? 1 : 0); // Skip \n but not \0
315 // structured line, e.g.
316 // ##INFO=<ID=PV1,Number=1,Type=Float,Description="P-value for baseQ bias">
317 // ##PEDIGREE=<Name_0=G0-ID,Name_1=G1-ID,Name_3=GN-ID>
319 while ( *q
&& *q
!='\n' && nopen
>0 )
322 while ( *q
&& *q
==' ' ) { p
++; q
++; }
323 // ^[A-Za-z_][0-9A-Za-z_.]*$
324 if (p
==q
&& *q
&& (isalpha_c(*q
) || *q
=='_'))
327 while ( *q
&& (isalnum_c(*q
) || *q
=='_' || *q
=='.') ) q
++;
331 while ( *q
&& *q
==' ' ) { q
++; m
++; }
335 while ( *q
&& *q
!='\n' ) q
++;
336 hts_log_error("Could not parse the header line: \"%.*s\"",
337 (int) (q
- line
), line
);
338 *len
= q
- line
+ (*q
? 1 : 0);
339 bcf_hrec_destroy(hrec
);
342 bcf_hrec_add_key(hrec
, p
, q
-p
-m
);
344 while ( *q
&& *q
==' ' ) { p
++; q
++; }
345 int quoted
= *p
=='"' ? 1 : 0;
346 if ( quoted
) p
++, q
++;
347 while ( *q
&& *q
!= '\n' )
349 if ( quoted
) { if ( *q
=='"' && !is_escaped(p
,q
) ) break; }
352 if ( *q
=='<' ) nopen
++;
353 if ( *q
=='>' ) nopen
--;
355 if ( *q
==',' && nopen
==1 ) break;
360 while ( r
> p
&& r
[-1] == ' ' ) r
--;
361 bcf_hrec_set_val(hrec
, hrec
->nkeys
-1, p
, r
-p
, quoted
);
362 if ( quoted
&& *q
=='"' ) q
++;
363 if ( *q
=='>' ) { nopen
--; q
++; }
366 // Skip to end of line
369 while ( *q
&& *q
!='\n' ) { nonspace
|= !isspace(*q
); q
++; }
371 hts_log_warning("Dropped trailing junk from header line '%.*s'", (int) (q
- line
), line
);
374 *len
= q
- line
+ (*q
? 1 : 0);
378 static int bcf_hdr_set_idx(bcf_hdr_t
*hdr
, int dict_type
, const char *tag
, bcf_idinfo_t
*idinfo
)
380 // If available, preserve existing IDX
381 if ( idinfo
->id
==-1 )
382 idinfo
->id
= hdr
->n
[dict_type
]++;
383 else if ( idinfo
->id
< hdr
->n
[dict_type
] && hdr
->id
[dict_type
][idinfo
->id
].key
)
385 hts_log_error("Conflicting IDX=%d lines in the header dictionary, the new tag is %s",
390 if ( idinfo
->id
>= hdr
->n
[dict_type
] ) hdr
->n
[dict_type
] = idinfo
->id
+1;
391 hts_expand0(bcf_idpair_t
,hdr
->n
[dict_type
],hdr
->m
[dict_type
],hdr
->id
[dict_type
]);
393 // NB: the next kh_put call can invalidate the idinfo pointer, therefore
394 // we leave it unassigned here. It myst be set explicitly in bcf_hdr_sync.
395 hdr
->id
[dict_type
][idinfo
->id
].key
= tag
;
400 // returns: 1 when hdr needs to be synced, 0 otherwise
401 int bcf_hdr_register_hrec(bcf_hdr_t
*hdr
, bcf_hrec_t
*hrec
)
407 if ( !strcmp(hrec
->key
, "contig") )
409 hrec
->type
= BCF_HL_CTG
;
411 // Get the contig ID ($str) and length ($j)
412 i
= bcf_hrec_find_key(hrec
,"length");
414 else if ( sscanf(hrec
->vals
[i
],"%d",&j
)!=1 ) return 0;
416 i
= bcf_hrec_find_key(hrec
,"ID");
418 str
= strdup(hrec
->vals
[i
]);
420 // Register in the dictionary
421 vdict_t
*d
= (vdict_t
*)hdr
->dict
[BCF_DT_CTG
];
422 khint_t k
= kh_get(vdict
, d
, str
);
423 if ( k
!= kh_end(d
) ) { free(str
); return 0; } // already present
424 k
= kh_put(vdict
, d
, str
, &ret
);
426 int idx
= bcf_hrec_find_key(hrec
,"IDX");
429 char *tmp
= hrec
->vals
[idx
];
430 idx
= strtol(hrec
->vals
[idx
], &tmp
, 10);
431 if ( *tmp
|| idx
< 0 || idx
>= INT_MAX
- 1)
433 hts_log_warning("Error parsing the IDX tag, skipping");
438 kh_val(d
, k
) = bcf_idinfo_def
;
439 kh_val(d
, k
).id
= idx
;
440 kh_val(d
, k
).info
[0] = j
;
441 kh_val(d
, k
).hrec
[0] = hrec
;
442 bcf_hdr_set_idx(hdr
, BCF_DT_CTG
, kh_key(d
,k
), &kh_val(d
,k
));
443 if ( idx
==-1 ) hrec_add_idx(hrec
, kh_val(d
,k
).id
);
448 if ( !strcmp(hrec
->key
, "INFO") ) hrec
->type
= BCF_HL_INFO
;
449 else if ( !strcmp(hrec
->key
, "FILTER") ) hrec
->type
= BCF_HL_FLT
;
450 else if ( !strcmp(hrec
->key
, "FORMAT") ) hrec
->type
= BCF_HL_FMT
;
451 else if ( hrec
->nkeys
>0 ) { hrec
->type
= BCF_HL_STR
; return 1; }
454 // INFO/FILTER/FORMAT
456 uint32_t type
= -1, var
= -1;
457 int num
= -1, idx
= -1;
458 for (i
=0; i
<hrec
->nkeys
; i
++)
460 if ( !strcmp(hrec
->keys
[i
], "ID") ) id
= hrec
->vals
[i
];
461 else if ( !strcmp(hrec
->keys
[i
], "IDX") )
463 char *tmp
= hrec
->vals
[i
];
464 idx
= strtol(hrec
->vals
[i
], &tmp
, 10);
465 if ( *tmp
|| idx
< 0 || idx
>= INT_MAX
- 1)
467 hts_log_warning("Error parsing the IDX tag, skipping");
471 else if ( !strcmp(hrec
->keys
[i
], "Type") )
473 if ( !strcmp(hrec
->vals
[i
], "Integer") ) type
= BCF_HT_INT
;
474 else if ( !strcmp(hrec
->vals
[i
], "Float") ) type
= BCF_HT_REAL
;
475 else if ( !strcmp(hrec
->vals
[i
], "String") ) type
= BCF_HT_STR
;
476 else if ( !strcmp(hrec
->vals
[i
], "Character") ) type
= BCF_HT_STR
;
477 else if ( !strcmp(hrec
->vals
[i
], "Flag") ) type
= BCF_HT_FLAG
;
480 hts_log_warning("The type \"%s\" is not supported, assuming \"String\"", hrec
->vals
[i
]);
484 else if ( !strcmp(hrec
->keys
[i
], "Number") )
486 if ( !strcmp(hrec
->vals
[i
],"A") ) var
= BCF_VL_A
;
487 else if ( !strcmp(hrec
->vals
[i
],"R") ) var
= BCF_VL_R
;
488 else if ( !strcmp(hrec
->vals
[i
],"G") ) var
= BCF_VL_G
;
489 else if ( !strcmp(hrec
->vals
[i
],".") ) var
= BCF_VL_VAR
;
492 sscanf(hrec
->vals
[i
],"%d",&num
);
495 if (var
!= BCF_VL_FIXED
) num
= 0xfffff;
498 if (hrec
->type
== BCF_HL_INFO
|| hrec
->type
== BCF_HL_FMT
) {
500 hts_log_warning("%s %s field has no Type defined. Assuming String",
501 *hrec
->key
== 'I' ? "An" : "A", hrec
->key
);
505 hts_log_warning("%s %s field has no Number defined. Assuming '.'",
506 *hrec
->key
== 'I' ? "An" : "A", hrec
->key
);
510 uint32_t info
= ((((uint32_t)num
) & 0xfffff)<<12 |
513 (((uint32_t) hrec
->type
) & 0xf));
518 vdict_t
*d
= (vdict_t
*)hdr
->dict
[BCF_DT_ID
];
519 k
= kh_get(vdict
, d
, str
);
520 if ( k
!= kh_end(d
) )
524 if ( kh_val(d
, k
).hrec
[info
&0xf] ) return 0;
525 kh_val(d
, k
).info
[info
&0xf] = info
;
526 kh_val(d
, k
).hrec
[info
&0xf] = hrec
;
527 if ( idx
==-1 ) hrec_add_idx(hrec
, kh_val(d
, k
).id
);
530 k
= kh_put(vdict
, d
, str
, &ret
);
531 kh_val(d
, k
) = bcf_idinfo_def
;
532 kh_val(d
, k
).info
[info
&0xf] = info
;
533 kh_val(d
, k
).hrec
[info
&0xf] = hrec
;
534 kh_val(d
, k
).id
= idx
;
535 bcf_hdr_set_idx(hdr
, BCF_DT_ID
, kh_key(d
,k
), &kh_val(d
,k
));
536 if ( idx
==-1 ) hrec_add_idx(hrec
, kh_val(d
,k
).id
);
541 int bcf_hdr_add_hrec(bcf_hdr_t
*hdr
, bcf_hrec_t
*hrec
)
543 if ( !hrec
) return 0;
545 hrec
->type
= BCF_HL_GEN
;
546 if ( !bcf_hdr_register_hrec(hdr
,hrec
) )
548 // If one of the hashed field, then it is already present
549 if ( hrec
->type
!= BCF_HL_GEN
)
551 bcf_hrec_destroy(hrec
);
555 // Is one of the generic fields and already present?
557 for (i
=0; i
<hdr
->nhrec
; i
++)
559 if ( hdr
->hrec
[i
]->type
!=BCF_HL_GEN
) continue;
560 if ( !strcmp(hdr
->hrec
[i
]->key
,hrec
->key
) && !strcmp(hrec
->key
,"fileformat") ) break;
561 if ( !strcmp(hdr
->hrec
[i
]->key
,hrec
->key
) && !strcmp(hdr
->hrec
[i
]->value
,hrec
->value
) ) break;
565 bcf_hrec_destroy(hrec
);
570 // New record, needs to be added
571 int n
= ++hdr
->nhrec
;
572 hdr
->hrec
= (bcf_hrec_t
**) realloc(hdr
->hrec
, n
*sizeof(bcf_hrec_t
*));
573 hdr
->hrec
[n
-1] = hrec
;
576 return hrec
->type
==BCF_HL_GEN
? 0 : 1;
580 * Note that while querying of FLT,INFO,FMT,CTG lines is fast (the keys are hashed),
581 * the STR,GEN lines are searched for linearly in a linked list of all header lines.
582 * This may become a problem for VCFs with huge headers, we might need to build a
583 * dictionary for these lines as well.
585 bcf_hrec_t
*bcf_hdr_get_hrec(const bcf_hdr_t
*hdr
, int type
, const char *key
, const char *value
, const char *str_class
)
588 if ( type
==BCF_HL_GEN
)
590 for (i
=0; i
<hdr
->nhrec
; i
++)
592 if ( hdr
->hrec
[i
]->type
!=type
) continue;
593 if ( strcmp(hdr
->hrec
[i
]->key
,key
) ) continue;
594 if ( !value
|| !strcmp(hdr
->hrec
[i
]->value
,value
) ) return hdr
->hrec
[i
];
598 else if ( type
==BCF_HL_STR
)
600 for (i
=0; i
<hdr
->nhrec
; i
++)
602 if ( hdr
->hrec
[i
]->type
!=type
) continue;
603 if ( strcmp(hdr
->hrec
[i
]->key
,str_class
) ) continue;
604 int j
= bcf_hrec_find_key(hdr
->hrec
[i
],key
);
605 if ( j
>=0 && !strcmp(hdr
->hrec
[i
]->vals
[j
],value
) ) return hdr
->hrec
[i
];
609 vdict_t
*d
= type
==BCF_HL_CTG
? (vdict_t
*)hdr
->dict
[BCF_DT_CTG
] : (vdict_t
*)hdr
->dict
[BCF_DT_ID
];
610 khint_t k
= kh_get(vdict
, d
, value
);
611 if ( k
== kh_end(d
) ) return NULL
;
612 return kh_val(d
, k
).hrec
[type
==BCF_HL_CTG
?0:type
];
615 void bcf_hdr_check_sanity(bcf_hdr_t
*hdr
)
617 static int PL_warned
= 0, GL_warned
= 0;
621 int id
= bcf_hdr_id2int(hdr
, BCF_DT_ID
, "PL");
622 if ( bcf_hdr_idinfo_exists(hdr
,BCF_HL_FMT
,id
) && bcf_hdr_id2length(hdr
,BCF_HL_FMT
,id
)!=BCF_VL_G
)
624 hts_log_warning("PL should be declared as Number=G");
630 int id
= bcf_hdr_id2int(hdr
, BCF_DT_ID
, "GL");
631 if ( bcf_hdr_idinfo_exists(hdr
,BCF_HL_FMT
,id
) && bcf_hdr_id2length(hdr
,BCF_HL_FMT
,id
)!=BCF_VL_G
)
633 hts_log_warning("GL should be declared as Number=G");
639 int bcf_hdr_parse(bcf_hdr_t
*hdr
, char *htxt
)
641 int len
, needs_sync
= 0, done
= 0;
644 // Check sanity: "fileformat" string must come as first
645 bcf_hrec_t
*hrec
= bcf_hdr_parse_line(hdr
,p
,&len
);
646 if ( !hrec
|| !hrec
->key
|| strcasecmp(hrec
->key
,"fileformat") )
647 hts_log_warning("The first line should be ##fileformat; is the VCF/BCF header broken?");
648 needs_sync
+= bcf_hdr_add_hrec(hdr
, hrec
);
650 // The filter PASS must appear first in the dictionary
651 hrec
= bcf_hdr_parse_line(hdr
,"##FILTER=<ID=PASS,Description=\"All filters passed\">",&len
);
652 needs_sync
+= bcf_hdr_add_hrec(hdr
, hrec
);
654 // Parse the whole header
656 while (NULL
!= (hrec
= bcf_hdr_parse_line(hdr
, p
, &len
))) {
657 needs_sync
+= bcf_hdr_add_hrec(hdr
, hrec
);
661 // Next should be the sample line. If not, it was a malformed
662 // header, in which case print a warning and skip (many VCF
663 // operations do not really care about a few malformed lines).
664 // In the future we may want to add a strict mode that errors in
666 if ( strncmp("#CHROM\tPOS",p
,10) != 0 ) {
667 char *eol
= strchr(p
, '\n');
669 hts_log_warning("Could not parse header line: %.*s",
670 eol
? (int)(eol
- p
) : INT_MAX
, p
);
673 p
= eol
+ 1; // Try from the next line.
675 done
= -1; // No more lines left, give up.
678 done
= 1; // Sample line found
683 // No sample line is fatal.
684 hts_log_error("Could not parse the header, sample line not found");
688 int ret
= bcf_hdr_parse_sample_line(hdr
,p
);
690 bcf_hdr_check_sanity(hdr
);
694 int bcf_hdr_append(bcf_hdr_t
*hdr
, const char *line
)
697 bcf_hrec_t
*hrec
= bcf_hdr_parse_line(hdr
, (char*) line
, &len
);
698 if ( !hrec
) return -1;
699 bcf_hdr_add_hrec(hdr
, hrec
);
703 void bcf_hdr_remove(bcf_hdr_t
*hdr
, int type
, const char *key
)
709 while ( i
<hdr
->nhrec
)
711 if ( hdr
->hrec
[i
]->type
!=type
) { i
++; continue; }
714 if ( type
==BCF_HL_FLT
|| type
==BCF_HL_INFO
|| type
==BCF_HL_FMT
|| type
== BCF_HL_CTG
)
716 int j
= bcf_hrec_find_key(hdr
->hrec
[i
], "ID");
719 vdict_t
*d
= type
==BCF_HL_CTG
? (vdict_t
*)hdr
->dict
[BCF_DT_CTG
] : (vdict_t
*)hdr
->dict
[BCF_DT_ID
];
720 khint_t k
= kh_get(vdict
, d
, hdr
->hrec
[i
]->vals
[j
]);
721 kh_val(d
, k
).hrec
[type
==BCF_HL_CTG
?0:type
] = NULL
;
727 if ( i
< hdr
->nhrec
)
728 memmove(&hdr
->hrec
[i
],&hdr
->hrec
[i
+1],(hdr
->nhrec
-i
)*sizeof(bcf_hrec_t
*));
729 bcf_hrec_destroy(hrec
);
735 if ( type
==BCF_HL_FLT
|| type
==BCF_HL_INFO
|| type
==BCF_HL_FMT
|| type
== BCF_HL_CTG
)
737 hrec
= bcf_hdr_get_hrec(hdr
, type
, "ID", key
, NULL
);
740 for (i
=0; i
<hdr
->nhrec
; i
++)
741 if ( hdr
->hrec
[i
]==hrec
) break;
742 assert( i
<hdr
->nhrec
);
744 vdict_t
*d
= type
==BCF_HL_CTG
? (vdict_t
*)hdr
->dict
[BCF_DT_CTG
] : (vdict_t
*)hdr
->dict
[BCF_DT_ID
];
745 khint_t k
= kh_get(vdict
, d
, key
);
746 kh_val(d
, k
).hrec
[type
==BCF_HL_CTG
?0:type
] = NULL
;
750 for (i
=0; i
<hdr
->nhrec
; i
++)
752 if ( hdr
->hrec
[i
]->type
!=type
) continue;
753 if ( type
==BCF_HL_GEN
)
755 if ( !strcmp(hdr
->hrec
[i
]->key
,key
) ) break;
759 // not all structured lines have ID, we could be more sophisticated as in bcf_hdr_get_hrec()
760 int j
= bcf_hrec_find_key(hdr
->hrec
[i
], "ID");
761 if ( j
>=0 && !strcmp(hdr
->hrec
[i
]->vals
[j
],key
) ) break;
764 if ( i
==hdr
->nhrec
) return;
769 if ( i
< hdr
->nhrec
)
770 memmove(&hdr
->hrec
[i
],&hdr
->hrec
[i
+1],(hdr
->nhrec
-i
)*sizeof(bcf_hrec_t
*));
771 bcf_hrec_destroy(hrec
);
776 int bcf_hdr_printf(bcf_hdr_t
*hdr
, const char *fmt
, ...)
780 int n
= vsnprintf(NULL
, 0, fmt
, ap
) + 2;
783 char *line
= (char*)malloc(n
);
785 vsnprintf(line
, n
, fmt
, ap
);
788 int ret
= bcf_hdr_append(hdr
, line
);
795 /**********************
796 *** BCF header I/O ***
797 **********************/
799 const char *bcf_hdr_get_version(const bcf_hdr_t
*hdr
)
801 bcf_hrec_t
*hrec
= bcf_hdr_get_hrec(hdr
, BCF_HL_GEN
, "fileformat", NULL
, NULL
);
804 hts_log_warning("No version string found, assuming VCFv4.2");
810 void bcf_hdr_set_version(bcf_hdr_t
*hdr
, const char *version
)
812 bcf_hrec_t
*hrec
= bcf_hdr_get_hrec(hdr
, BCF_HL_GEN
, "fileformat", NULL
, NULL
);
816 kstring_t str
= {0,0,0};
817 ksprintf(&str
,"##fileformat=%s", version
);
818 hrec
= bcf_hdr_parse_line(hdr
, str
.s
, &len
);
824 hrec
->value
= strdup(version
);
829 bcf_hdr_t
*bcf_hdr_init(const char *mode
)
833 h
= (bcf_hdr_t
*)calloc(1, sizeof(bcf_hdr_t
));
835 for (i
= 0; i
< 3; ++i
)
836 if ((h
->dict
[i
] = kh_init(vdict
)) == NULL
) goto fail
;
837 if ( strchr(mode
,'w') )
839 bcf_hdr_append(h
, "##fileformat=VCFv4.2");
840 // The filter PASS must appear first in the dictionary
841 bcf_hdr_append(h
, "##FILTER=<ID=PASS,Description=\"All filters passed\">");
846 for (i
= 0; i
< 3; ++i
)
847 kh_destroy(vdict
, h
->dict
[i
]);
852 void bcf_hdr_destroy(bcf_hdr_t
*h
)
857 for (i
= 0; i
< 3; ++i
) {
858 vdict_t
*d
= (vdict_t
*)h
->dict
[i
];
859 if (d
== 0) continue;
860 for (k
= kh_begin(d
); k
!= kh_end(d
); ++k
)
861 if (kh_exist(d
, k
)) free((char*)kh_key(d
, k
));
862 kh_destroy(vdict
, d
);
865 for (i
=0; i
<h
->nhrec
; i
++)
866 bcf_hrec_destroy(h
->hrec
[i
]);
867 if (h
->nhrec
) free(h
->hrec
);
868 if (h
->samples
) free(h
->samples
);
869 free(h
->keep_samples
);
870 free(h
->transl
[0]); free(h
->transl
[1]);
875 bcf_hdr_t
*bcf_hdr_read(htsFile
*hfp
)
877 if (hfp
->format
.format
== vcf
)
878 return vcf_hdr_read(hfp
);
879 if (hfp
->format
.format
!= bcf
) {
880 hts_log_error("Input is not detected as bcf or vcf format");
884 assert(hfp
->is_bgzf
);
886 BGZF
*fp
= hfp
->fp
.bgzf
;
889 h
= bcf_hdr_init("r");
891 hts_log_error("Failed to allocate bcf header");
894 if (bgzf_read(fp
, magic
, 5) != 5)
896 hts_log_error("Failed to read the header (reading BCF in text mode?)");
900 if (strncmp((char*)magic
, "BCF\2\2", 5) != 0)
902 if (!strncmp((char*)magic
, "BCF", 3))
903 hts_log_error("Invalid BCF2 magic string: only BCFv2.2 is supported");
905 hts_log_error("Invalid BCF2 magic string");
912 if (bgzf_read(fp
, buf
, 4) != 4) goto fail
;
913 hlen
= buf
[0] | (buf
[1] << 8) | (buf
[2] << 16) | (buf
[3] << 24);
914 if (hlen
>= SIZE_MAX
) { errno
= ENOMEM
; goto fail
; }
915 htxt
= (char*)malloc(hlen
+ 1);
916 if (!htxt
) goto fail
;
917 if (bgzf_read(fp
, htxt
, hlen
) != hlen
) goto fail
;
918 htxt
[hlen
] = '\0'; // Ensure htxt is terminated
919 if ( bcf_hdr_parse(h
, htxt
) < 0 ) goto fail
;
923 hts_log_error("Failed to read BCF header");
929 int bcf_hdr_write(htsFile
*hfp
, bcf_hdr_t
*h
)
935 if ( h
->dirty
) bcf_hdr_sync(h
);
936 if (hfp
->format
.format
== vcf
|| hfp
->format
.format
== text_format
)
937 return vcf_hdr_write(hfp
, h
);
939 kstring_t htxt
= {0,0,0};
940 bcf_hdr_format(h
, 1, &htxt
);
941 kputc('\0', &htxt
); // include the \0 byte
943 BGZF
*fp
= hfp
->fp
.bgzf
;
944 if ( bgzf_write(fp
, "BCF\2\2", 5) !=5 ) return -1;
946 u32_to_le(htxt
.l
, hlen
);
947 if ( bgzf_write(fp
, hlen
, 4) !=4 ) return -1;
948 if ( bgzf_write(fp
, htxt
.s
, htxt
.l
) != htxt
.l
) return -1;
954 /********************
956 ********************/
961 v
= (bcf1_t
*)calloc(1, sizeof(bcf1_t
));
965 void bcf_clear(bcf1_t
*v
)
968 for (i
=0; i
<v
->d
.m_info
; i
++)
970 if ( v
->d
.info
[i
].vptr_free
)
972 free(v
->d
.info
[i
].vptr
- v
->d
.info
[i
].vptr_off
);
973 v
->d
.info
[i
].vptr_free
= 0;
976 for (i
=0; i
<v
->d
.m_fmt
; i
++)
978 if ( v
->d
.fmt
[i
].p_free
)
980 free(v
->d
.fmt
[i
].p
- v
->d
.fmt
[i
].p_off
);
981 v
->d
.fmt
[i
].p_free
= 0;
984 v
->rid
= v
->pos
= v
->rlen
= v
->unpacked
= 0;
985 bcf_float_set_missing(v
->qual
);
986 v
->n_info
= v
->n_allele
= v
->n_fmt
= v
->n_sample
= 0;
987 v
->shared
.l
= v
->indiv
.l
= 0;
989 v
->d
.shared_dirty
= 0;
990 v
->d
.indiv_dirty
= 0;
993 if (v
->d
.m_als
) v
->d
.als
[0] = 0;
994 if (v
->d
.m_id
) v
->d
.id
[0] = 0;
997 void bcf_empty(bcf1_t
*v
)
1002 free(v
->d
.allele
); free(v
->d
.flt
); free(v
->d
.info
); free(v
->d
.fmt
);
1003 if (v
->d
.var
) free(v
->d
.var
);
1004 free(v
->shared
.s
); free(v
->indiv
.s
);
1007 void bcf_destroy(bcf1_t
*v
)
1014 static inline int bcf_read1_core(BGZF
*fp
, bcf1_t
*v
)
1018 if ((ret
= bgzf_read(fp
, x
, 32)) != 32) {
1019 if (ret
== 0) return -1;
1023 x
[0] -= 24; // to exclude six 32-bit integers
1024 if (ks_resize(&v
->shared
, x
[0]) != 0) return -2;
1025 if (ks_resize(&v
->indiv
, x
[1]) != 0) return -2;
1026 memcpy(v
, x
+ 2, 16);
1027 v
->n_allele
= x
[6]>>16; v
->n_info
= x
[6]&0xffff;
1028 v
->n_fmt
= x
[7]>>24; v
->n_sample
= x
[7]&0xffffff;
1029 v
->shared
.l
= x
[0], v
->indiv
.l
= x
[1];
1030 // silent fix of broken BCFs produced by earlier versions of bcf_subset, prior to and including bd6ed8b4
1031 if ( (!v
->indiv
.l
|| !v
->n_sample
) && v
->n_fmt
) v
->n_fmt
= 0;
1033 if (bgzf_read(fp
, v
->shared
.s
, v
->shared
.l
) != v
->shared
.l
) return -2;
1034 if (bgzf_read(fp
, v
->indiv
.s
, v
->indiv
.l
) != v
->indiv
.l
) return -2;
1038 #define bit_array_size(n) ((n)/8+1)
1039 #define bit_array_set(a,i) ((a)[(i)/8] |= 1 << ((i)%8))
1040 #define bit_array_clear(a,i) ((a)[(i)/8] &= ~(1 << ((i)%8)))
1041 #define bit_array_test(a,i) ((a)[(i)/8] & (1 << ((i)%8)))
1043 static int bcf_dec_typed_int1_safe(uint8_t *p
, uint8_t *end
, uint8_t **q
,
1047 if (end
- p
< 2) return -1;
1049 /* Use if .. else if ... else instead of switch to force order. Assumption
1050 is that small integers are more frequent than big ones. */
1051 if (t
== BCF_BT_INT8
) {
1053 *val
= *(int8_t *) p
;
1054 } else if (t
== BCF_BT_INT16
) {
1055 if (end
- p
< 2) return -1;
1057 v
= p
[0] | (p
[1] << 8);
1058 *val
= v
< 0x8000 ? v
: -((int32_t) (0xffff - v
)) - 1;
1059 } else if (t
== BCF_BT_INT32
) {
1060 if (end
- p
< 4) return -1;
1062 v
= p
[0] | (p
[1] << 8) | (p
[2] << 16) | (p
[3] << 24);
1063 *val
= v
< 0x80000000UL
? v
: -((int32_t) (0xffffffffUL
- v
)) - 1;
1070 static int bcf_dec_size_safe(uint8_t *p
, uint8_t *end
, uint8_t **q
,
1071 int *num
, int *type
) {
1073 if (p
>= end
) return -1;
1080 r
= bcf_dec_typed_int1_safe(p
+ 1, end
, q
, num
);
1082 return *num
>= 0 ? 0 : -1;
1085 static const char *get_type_name(int type
) {
1086 const char *types
[9] = {
1087 "null", "int (8-bit)", "int (16 bit)", "int (32 bit)",
1088 "unknown", "float", "unknown", "char", "unknown"
1090 int t
= (type
>= 0 && type
< 8) ? type
: 8;
1094 static int bcf_record_check(const bcf_hdr_t
*hdr
, bcf1_t
*rec
) {
1100 uint32_t i
, reports
;
1101 const uint32_t is_integer
= ((1 << BCF_BT_INT8
) |
1102 (1 << BCF_BT_INT16
) |
1103 (1 << BCF_BT_INT32
));
1104 const uint32_t is_valid_type
= (is_integer
|
1105 (1 << BCF_BT_NULL
) |
1106 (1 << BCF_BT_FLOAT
) |
1107 (1 << BCF_BT_CHAR
));
1110 // Check for valid contig ID
1111 if (rec
->rid
< 0 || rec
->rid
>= hdr
->n
[BCF_DT_CTG
]) {
1112 hts_log_warning("Bad BCF record: Invalid %s id %d", "CONTIG", rec
->rid
);
1113 err
|= BCF_ERR_CTG_INVALID
;
1117 ptr
= (uint8_t *) rec
->shared
.s
;
1118 end
= ptr
+ rec
->shared
.l
;
1119 if (bcf_dec_size_safe(ptr
, end
, &ptr
, &num
, &type
) != 0) goto bad_shared
;
1120 if (type
!= BCF_BT_CHAR
) {
1121 hts_log_warning("Bad BCF record: Invalid %s type %d (%s)", "ID", type
, get_type_name(type
));
1122 err
|= BCF_ERR_TAG_INVALID
;
1124 bytes
= (size_t) num
<< bcf_type_shift
[type
];
1125 if (end
- ptr
< bytes
) goto bad_shared
;
1128 // Check REF and ALT
1130 for (i
= 0; i
< rec
->n_allele
; i
++) {
1131 if (bcf_dec_size_safe(ptr
, end
, &ptr
, &num
, &type
) != 0) goto bad_shared
;
1132 if (type
!= BCF_BT_CHAR
) {
1133 if (!reports
++ || hts_verbose
>= HTS_LOG_DEBUG
)
1134 hts_log_warning("Bad BCF record: Invalid %s type %d (%s)", "REF/ALT", type
, get_type_name(type
));
1135 err
|= BCF_ERR_CHAR
;
1137 bytes
= (size_t) num
<< bcf_type_shift
[type
];
1138 if (end
- ptr
< bytes
) goto bad_shared
;
1144 if (bcf_dec_size_safe(ptr
, end
, &ptr
, &num
, &type
) != 0) goto bad_shared
;
1146 if (((1 << type
) & is_integer
) == 0) {
1147 hts_log_warning("Bad BCF record: Invalid %s type %d (%s)", "FILTER", type
, get_type_name(type
));
1148 err
|= BCF_ERR_TAG_INVALID
;
1150 bytes
= (size_t) num
<< bcf_type_shift
[type
];
1151 if (end
- ptr
< bytes
) goto bad_shared
;
1152 for (i
= 0; i
< num
; i
++) {
1153 int32_t key
= bcf_dec_int1(ptr
, type
, &ptr
);
1154 if (key
< 0 || key
>= hdr
->n
[BCF_DT_ID
]) {
1155 if (!reports
++ || hts_verbose
>= HTS_LOG_DEBUG
)
1156 hts_log_warning("Bad BCF record: Invalid %s id %d", "FILTER", key
);
1157 err
|= BCF_ERR_TAG_UNDEF
;
1164 for (i
= 0; i
< rec
->n_info
; i
++) {
1166 if (bcf_dec_typed_int1_safe(ptr
, end
, &ptr
, &key
) != 0) goto bad_shared
;
1167 if (key
< 0 || key
>= hdr
->n
[BCF_DT_ID
]) {
1168 if (!reports
++ || hts_verbose
>= HTS_LOG_DEBUG
)
1169 hts_log_warning("Bad BCF record: Invalid %s id %d", "INFO", key
);
1170 err
|= BCF_ERR_TAG_UNDEF
;
1172 if (bcf_dec_size_safe(ptr
, end
, &ptr
, &num
, &type
) != 0) goto bad_shared
;
1173 if (((1 << type
) & is_valid_type
) == 0) {
1174 if (!reports
++ || hts_verbose
>= HTS_LOG_DEBUG
)
1175 hts_log_warning("Bad BCF record: Invalid %s type %d (%s)", "INFO", type
, get_type_name(type
));
1176 err
|= BCF_ERR_TAG_INVALID
;
1178 bytes
= (size_t) num
<< bcf_type_shift
[type
];
1179 if (end
- ptr
< bytes
) goto bad_shared
;
1183 // Check FORMAT and individual information
1184 ptr
= (uint8_t *) rec
->indiv
.s
;
1185 end
= ptr
+ rec
->indiv
.l
;
1187 for (i
= 0; i
< rec
->n_fmt
; i
++) {
1189 if (bcf_dec_typed_int1_safe(ptr
, end
, &ptr
, &key
) != 0) goto bad_indiv
;
1190 if (key
< 0 || key
>= hdr
->n
[BCF_DT_ID
]) {
1191 if (!reports
++ || hts_verbose
>= HTS_LOG_DEBUG
)
1192 hts_log_warning("Bad BCF record: Invalid %s id %d", "FORMAT", key
);
1193 err
|= BCF_ERR_TAG_UNDEF
;
1195 if (bcf_dec_size_safe(ptr
, end
, &ptr
, &num
, &type
) != 0) goto bad_indiv
;
1196 if (((1 << type
) & is_valid_type
) == 0) {
1197 if (!reports
++ || hts_verbose
>= HTS_LOG_DEBUG
)
1198 hts_log_warning("Bad BCF record: Invalid %s type %d (%s)", "FORMAT", type
, get_type_name(type
));
1199 err
|= BCF_ERR_TAG_INVALID
;
1201 bytes
= ((size_t) num
<< bcf_type_shift
[type
]) * rec
->n_sample
;
1202 if (end
- ptr
< bytes
) goto bad_indiv
;
1206 rec
->errcode
|= err
;
1208 return err
? -1 : 0;
1211 hts_log_error("Bad BCF record - shared section malformed or too short");
1215 hts_log_error("Bad BCF record - individuals section malformed or too short");
1219 static inline uint8_t *bcf_unpack_fmt_core1(uint8_t *ptr
, int n_sample
, bcf_fmt_t
*fmt
);
1220 int bcf_subset_format(const bcf_hdr_t
*hdr
, bcf1_t
*rec
)
1222 if ( !hdr
->keep_samples
) return 0;
1223 if ( !bcf_hdr_nsamples(hdr
) )
1225 rec
->indiv
.l
= rec
->n_sample
= 0;
1230 uint8_t *ptr
= (uint8_t*)rec
->indiv
.s
, *dst
= NULL
, *src
;
1231 bcf_dec_t
*dec
= &rec
->d
;
1232 hts_expand(bcf_fmt_t
, rec
->n_fmt
, dec
->m_fmt
, dec
->fmt
);
1233 for (i
=0; i
<dec
->m_fmt
; ++i
) dec
->fmt
[i
].p_free
= 0;
1235 for (i
=0; i
<rec
->n_fmt
; i
++)
1237 ptr
= bcf_unpack_fmt_core1(ptr
, rec
->n_sample
, &dec
->fmt
[i
]);
1238 src
= dec
->fmt
[i
].p
- dec
->fmt
[i
].size
;
1241 memmove(dec
->fmt
[i
-1].p
+ dec
->fmt
[i
-1].p_len
, dec
->fmt
[i
].p
- dec
->fmt
[i
].p_off
, dec
->fmt
[i
].p_off
);
1242 dec
->fmt
[i
].p
= dec
->fmt
[i
-1].p
+ dec
->fmt
[i
-1].p_len
+ dec
->fmt
[i
].p_off
;
1244 dst
= dec
->fmt
[i
].p
;
1245 for (j
=0; j
<hdr
->nsamples_ori
; j
++)
1247 src
+= dec
->fmt
[i
].size
;
1248 if ( !bit_array_test(hdr
->keep_samples
,j
) ) continue;
1249 memmove(dst
, src
, dec
->fmt
[i
].size
);
1250 dst
+= dec
->fmt
[i
].size
;
1252 rec
->indiv
.l
-= dec
->fmt
[i
].p_len
- (dst
- dec
->fmt
[i
].p
);
1253 dec
->fmt
[i
].p_len
= dst
- dec
->fmt
[i
].p
;
1255 rec
->unpacked
|= BCF_UN_FMT
;
1257 rec
->n_sample
= bcf_hdr_nsamples(hdr
);
1261 int bcf_read(htsFile
*fp
, const bcf_hdr_t
*h
, bcf1_t
*v
)
1263 if (fp
->format
.format
== vcf
) return vcf_read(fp
,h
,v
);
1264 int ret
= bcf_read1_core(fp
->fp
.bgzf
, v
);
1265 if (ret
== 0) ret
= bcf_record_check(h
, v
);
1266 if ( ret
!=0 || !h
->keep_samples
) return ret
;
1267 return bcf_subset_format(h
,v
);
1270 int bcf_readrec(BGZF
*fp
, void *null
, void *vv
, int *tid
, int *beg
, int *end
)
1272 bcf1_t
*v
= (bcf1_t
*) vv
;
1274 if ((ret
= bcf_read1_core(fp
, v
)) >= 0)
1275 *tid
= v
->rid
, *beg
= v
->pos
, *end
= v
->pos
+ v
->rlen
;
1279 static inline void bcf1_sync_id(bcf1_t
*line
, kstring_t
*str
)
1281 // single typed string
1282 if ( line
->d
.id
&& strcmp(line
->d
.id
, ".") ) bcf_enc_vchar(str
, strlen(line
->d
.id
), line
->d
.id
);
1283 else bcf_enc_size(str
, 0, BCF_BT_CHAR
);
1285 static inline void bcf1_sync_alleles(bcf1_t
*line
, kstring_t
*str
)
1287 // list of typed strings
1289 for (i
=0; i
<line
->n_allele
; i
++)
1290 bcf_enc_vchar(str
, strlen(line
->d
.allele
[i
]), line
->d
.allele
[i
]);
1291 if ( !line
->rlen
&& line
->n_allele
) line
->rlen
= strlen(line
->d
.allele
[0]);
1293 static inline void bcf1_sync_filter(bcf1_t
*line
, kstring_t
*str
)
1295 // typed vector of integers
1296 if ( line
->d
.n_flt
) bcf_enc_vint(str
, line
->d
.n_flt
, line
->d
.flt
, -1);
1297 else bcf_enc_vint(str
, 0, 0, -1);
1300 static inline void bcf1_sync_info(bcf1_t
*line
, kstring_t
*str
)
1302 // pairs of typed vectors
1304 for (i
=0; i
<line
->n_info
; i
++)
1306 bcf_info_t
*info
= &line
->d
.info
[i
];
1309 // marked for removal
1310 if ( irm
< 0 ) irm
= i
;
1313 kputsn_(info
->vptr
- info
->vptr_off
, info
->vptr_len
+ info
->vptr_off
, str
);
1316 bcf_info_t tmp
= line
->d
.info
[irm
]; line
->d
.info
[irm
] = line
->d
.info
[i
]; line
->d
.info
[i
] = tmp
;
1317 while ( irm
<=i
&& line
->d
.info
[irm
].vptr
) irm
++;
1320 if ( irm
>=0 ) line
->n_info
= irm
;
1323 static int bcf1_sync(bcf1_t
*line
)
1325 char *shared_ori
= line
->shared
.s
;
1328 kstring_t tmp
= {0,0,0};
1329 if ( !line
->shared
.l
)
1331 // New line created via API, BCF data blocks do not exist. Get it ready for BCF output
1333 bcf1_sync_id(line
, &tmp
);
1334 line
->unpack_size
[0] = tmp
.l
; prev_len
= tmp
.l
;
1336 bcf1_sync_alleles(line
, &tmp
);
1337 line
->unpack_size
[1] = tmp
.l
- prev_len
; prev_len
= tmp
.l
;
1339 bcf1_sync_filter(line
, &tmp
);
1340 line
->unpack_size
[2] = tmp
.l
- prev_len
;
1342 bcf1_sync_info(line
, &tmp
);
1345 else if ( line
->d
.shared_dirty
)
1347 // The line was edited, update the BCF data block.
1349 if ( !(line
->unpacked
& BCF_UN_STR
) ) bcf_unpack(line
,BCF_UN_STR
);
1351 // ptr_ori points to the original unchanged BCF data.
1352 uint8_t *ptr_ori
= (uint8_t *) line
->shared
.s
;
1354 // ID: single typed string
1355 if ( line
->d
.shared_dirty
& BCF1_DIRTY_ID
)
1356 bcf1_sync_id(line
, &tmp
);
1358 kputsn_(ptr_ori
, line
->unpack_size
[0], &tmp
);
1359 ptr_ori
+= line
->unpack_size
[0];
1360 line
->unpack_size
[0] = tmp
.l
; prev_len
= tmp
.l
;
1362 // REF+ALT: list of typed strings
1363 if ( line
->d
.shared_dirty
& BCF1_DIRTY_ALS
)
1364 bcf1_sync_alleles(line
, &tmp
);
1367 kputsn_(ptr_ori
, line
->unpack_size
[1], &tmp
);
1368 if ( !line
->rlen
&& line
->n_allele
) line
->rlen
= strlen(line
->d
.allele
[0]);
1370 ptr_ori
+= line
->unpack_size
[1];
1371 line
->unpack_size
[1] = tmp
.l
- prev_len
; prev_len
= tmp
.l
;
1373 if ( line
->unpacked
& BCF_UN_FLT
)
1375 // FILTER: typed vector of integers
1376 if ( line
->d
.shared_dirty
& BCF1_DIRTY_FLT
)
1377 bcf1_sync_filter(line
, &tmp
);
1378 else if ( line
->d
.n_flt
)
1379 kputsn_(ptr_ori
, line
->unpack_size
[2], &tmp
);
1381 bcf_enc_vint(&tmp
, 0, 0, -1);
1382 ptr_ori
+= line
->unpack_size
[2];
1383 line
->unpack_size
[2] = tmp
.l
- prev_len
;
1385 if ( line
->unpacked
& BCF_UN_INFO
)
1387 // INFO: pairs of typed vectors
1388 if ( line
->d
.shared_dirty
& BCF1_DIRTY_INF
)
1390 bcf1_sync_info(line
, &tmp
);
1391 ptr_ori
= (uint8_t*)line
->shared
.s
+ line
->shared
.l
;
1396 int size
= line
->shared
.l
- (size_t)ptr_ori
+ (size_t)line
->shared
.s
;
1397 if ( size
) kputsn_(ptr_ori
, size
, &tmp
);
1399 free(line
->shared
.s
);
1402 if ( line
->shared
.s
!= shared_ori
&& line
->unpacked
& BCF_UN_INFO
)
1404 // Reallocated line->shared.s block invalidated line->d.info[].vptr pointers
1405 size_t off_new
= line
->unpack_size
[0] + line
->unpack_size
[1] + line
->unpack_size
[2];
1407 for (i
=0; i
<line
->n_info
; i
++)
1409 uint8_t *vptr_free
= line
->d
.info
[i
].vptr_free
? line
->d
.info
[i
].vptr
- line
->d
.info
[i
].vptr_off
: NULL
;
1410 line
->d
.info
[i
].vptr
= (uint8_t*) line
->shared
.s
+ off_new
+ line
->d
.info
[i
].vptr_off
;
1411 off_new
+= line
->d
.info
[i
].vptr_len
+ line
->d
.info
[i
].vptr_off
;
1415 line
->d
.info
[i
].vptr_free
= 0;
1420 if ( line
->n_sample
&& line
->n_fmt
&& (!line
->indiv
.l
|| line
->d
.indiv_dirty
) )
1422 // The genotype fields changed or are not present
1423 tmp
.l
= tmp
.m
= 0; tmp
.s
= NULL
;
1425 for (i
=0; i
<line
->n_fmt
; i
++)
1427 bcf_fmt_t
*fmt
= &line
->d
.fmt
[i
];
1430 // marked for removal
1431 if ( irm
< 0 ) irm
= i
;
1434 kputsn_(fmt
->p
- fmt
->p_off
, fmt
->p_len
+ fmt
->p_off
, &tmp
);
1437 bcf_fmt_t tfmt
= line
->d
.fmt
[irm
]; line
->d
.fmt
[irm
] = line
->d
.fmt
[i
]; line
->d
.fmt
[i
] = tfmt
;
1438 while ( irm
<=i
&& line
->d
.fmt
[irm
].p
) irm
++;
1442 if ( irm
>=0 ) line
->n_fmt
= irm
;
1443 free(line
->indiv
.s
);
1446 // Reallocated line->indiv.s block invalidated line->d.fmt[].p pointers
1448 for (i
=0; i
<line
->n_fmt
; i
++)
1450 uint8_t *p_free
= line
->d
.fmt
[i
].p_free
? line
->d
.fmt
[i
].p
- line
->d
.fmt
[i
].p_off
: NULL
;
1451 line
->d
.fmt
[i
].p
= (uint8_t*) line
->indiv
.s
+ off_new
+ line
->d
.fmt
[i
].p_off
;
1452 off_new
+= line
->d
.fmt
[i
].p_len
+ line
->d
.fmt
[i
].p_off
;
1456 line
->d
.fmt
[i
].p_free
= 0;
1460 if ( !line
->n_sample
) line
->n_fmt
= 0;
1461 line
->d
.shared_dirty
= line
->d
.indiv_dirty
= 0;
1465 bcf1_t
*bcf_copy(bcf1_t
*dst
, bcf1_t
*src
)
1470 dst
->rid
= src
->rid
;
1471 dst
->pos
= src
->pos
;
1472 dst
->rlen
= src
->rlen
;
1473 dst
->qual
= src
->qual
;
1474 dst
->n_info
= src
->n_info
; dst
->n_allele
= src
->n_allele
;
1475 dst
->n_fmt
= src
->n_fmt
; dst
->n_sample
= src
->n_sample
;
1477 dst
->shared
.m
= dst
->shared
.l
= src
->shared
.l
;
1478 dst
->shared
.s
= (char*) malloc(dst
->shared
.l
);
1479 memcpy(dst
->shared
.s
,src
->shared
.s
,dst
->shared
.l
);
1481 dst
->indiv
.m
= dst
->indiv
.l
= src
->indiv
.l
;
1482 dst
->indiv
.s
= (char*) malloc(dst
->indiv
.l
);
1483 memcpy(dst
->indiv
.s
,src
->indiv
.s
,dst
->indiv
.l
);
1487 bcf1_t
*bcf_dup(bcf1_t
*src
)
1489 bcf1_t
*out
= bcf_init1();
1490 return bcf_copy(out
, src
);
1493 int bcf_write(htsFile
*hfp
, bcf_hdr_t
*h
, bcf1_t
*v
)
1495 if ( h
->dirty
) bcf_hdr_sync(h
);
1496 if ( bcf_hdr_nsamples(h
)!=v
->n_sample
)
1498 hts_log_error("Broken VCF record, the number of columns at %s:%d does not match the number of samples (%d vs %d)",
1499 bcf_seqname(h
,v
), v
->pos
+1, v
->n_sample
, bcf_hdr_nsamples(h
));
1503 if ( hfp
->format
.format
== vcf
|| hfp
->format
.format
== text_format
)
1504 return vcf_write(hfp
,h
,v
);
1508 // vcf_parse1() encountered a new contig or tag, undeclared in the
1509 // header. At this point, the header must have been printed,
1510 // proceeding would lead to a broken BCF file. Errors must be checked
1511 // and cleared by the caller before we can proceed.
1512 hts_log_error("Unchecked error (%d), exiting", v
->errcode
);
1515 bcf1_sync(v
); // check if the BCF record was modified
1517 BGZF
*fp
= hfp
->fp
.bgzf
;
1519 x
[0] = v
->shared
.l
+ 24; // to include six 32-bit integers
1521 memcpy(x
+ 2, v
, 16);
1522 x
[6] = (uint32_t)v
->n_allele
<<16 | v
->n_info
;
1523 x
[7] = (uint32_t)v
->n_fmt
<<24 | v
->n_sample
;
1524 if ( bgzf_write(fp
, x
, 32) != 32 ) return -1;
1525 if ( bgzf_write(fp
, v
->shared
.s
, v
->shared
.l
) != v
->shared
.l
) return -1;
1526 if ( bgzf_write(fp
, v
->indiv
.s
, v
->indiv
.l
) != v
->indiv
.l
) return -1;
1530 /**********************
1531 *** VCF header I/O ***
1532 **********************/
1534 bcf_hdr_t
*vcf_hdr_read(htsFile
*fp
)
1536 kstring_t txt
, *s
= &fp
->line
;
1539 h
= bcf_hdr_init("r");
1541 hts_log_error("Failed to allocate bcf header");
1544 txt
.l
= txt
.m
= 0; txt
.s
= 0;
1545 while ((ret
= hts_getline(fp
, KS_SEP_LINE
, s
)) >= 0) {
1546 if (s
->l
== 0) continue;
1547 if (s
->s
[0] != '#') {
1548 hts_log_error("No sample line");
1551 if (s
->s
[1] != '#' && fp
->fn_aux
) { // insert contigs here
1552 kstring_t tmp
= { 0, 0, NULL
};
1553 hFILE
*f
= hopen(fp
->fn_aux
, "r");
1555 hts_log_error("Couldn't open \"%s\"", fp
->fn_aux
);
1558 while (tmp
.l
= 0, kgetline(&tmp
, (kgets_func
*) hgets
, f
) >= 0) {
1559 char *tab
= strchr(tmp
.s
, '\t');
1560 if (tab
== NULL
) continue;
1561 kputs("##contig=<ID=", &txt
); kputsn(tmp
.s
, tab
- tmp
.s
, &txt
);
1562 kputs(",length=", &txt
); kputl(atol(tab
), &txt
);
1563 kputsn(">\n", 2, &txt
);
1566 if (hclose(f
) != 0) {
1567 hts_log_warning("Failed to close %s", fp
->fn_aux
);
1570 kputsn(s
->s
, s
->l
, &txt
);
1572 if (s
->s
[1] != '#') break;
1574 if ( ret
< -1 ) goto error
;
1577 hts_log_error("Could not read the header");
1580 if ( bcf_hdr_parse(h
, txt
.s
) < 0 ) goto error
;
1582 // check tabix index, are all contigs listed in the header? add the missing ones
1583 tbx_t
*idx
= tbx_index_load(fp
->fn
);
1586 int i
, n
, need_sync
= 0;
1587 const char **names
= tbx_seqnames(idx
, &n
);
1590 bcf_hrec_t
*hrec
= bcf_hdr_get_hrec(h
, BCF_HL_CTG
, "ID", (char*) names
[i
], NULL
);
1591 if ( hrec
) continue;
1592 hrec
= (bcf_hrec_t
*) calloc(1,sizeof(bcf_hrec_t
));
1593 hrec
->key
= strdup("contig");
1594 bcf_hrec_add_key(hrec
, "ID", strlen("ID"));
1595 bcf_hrec_set_val(hrec
, hrec
->nkeys
-1, (char*) names
[i
], strlen(names
[i
]), 0);
1596 bcf_hdr_add_hrec(h
, hrec
);
1609 if (h
) bcf_hdr_destroy(h
);
1613 int bcf_hdr_set(bcf_hdr_t
*hdr
, const char *fname
)
1616 char **lines
= hts_readlines(fname
, &n
);
1617 if ( !lines
) return 1;
1618 for (i
=0; i
<n
-1; i
++)
1621 bcf_hrec_t
*hrec
= bcf_hdr_parse_line(hdr
,lines
[i
],&k
);
1622 if ( hrec
) bcf_hdr_add_hrec(hdr
, hrec
);
1625 bcf_hdr_parse_sample_line(hdr
,lines
[n
-1]);
1632 static void _bcf_hrec_format(const bcf_hrec_t
*hrec
, int is_bcf
, kstring_t
*str
)
1637 ksprintf(str
, "##%s=<", hrec
->key
);
1638 for (j
=0; j
<hrec
->nkeys
; j
++)
1640 // do not output IDX if output is VCF
1641 if ( !is_bcf
&& !strcmp("IDX",hrec
->keys
[j
]) ) continue;
1642 if ( nout
) kputc(',',str
);
1643 ksprintf(str
,"%s=%s", hrec
->keys
[j
], hrec
->vals
[j
]);
1646 ksprintf(str
,">\n");
1649 ksprintf(str
,"##%s=%s\n", hrec
->key
,hrec
->value
);
1652 void bcf_hrec_format(const bcf_hrec_t
*hrec
, kstring_t
*str
)
1654 _bcf_hrec_format(hrec
,0,str
);
1657 int bcf_hdr_format(const bcf_hdr_t
*hdr
, int is_bcf
, kstring_t
*str
)
1660 for (i
=0; i
<hdr
->nhrec
; i
++)
1661 _bcf_hrec_format(hdr
->hrec
[i
], is_bcf
, str
);
1663 ksprintf(str
, "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO");
1664 if ( bcf_hdr_nsamples(hdr
) )
1666 ksprintf(str
, "\tFORMAT");
1667 for (i
=0; i
<bcf_hdr_nsamples(hdr
); i
++)
1668 ksprintf(str
, "\t%s", hdr
->samples
[i
]);
1670 ksprintf(str
, "\n");
1675 char *bcf_hdr_fmt_text(const bcf_hdr_t
*hdr
, int is_bcf
, int *len
)
1677 kstring_t txt
= {0,0,0};
1678 bcf_hdr_format(hdr
, is_bcf
, &txt
);
1679 if ( len
) *len
= txt
.l
;
1683 const char **bcf_hdr_seqnames(const bcf_hdr_t
*h
, int *n
)
1685 vdict_t
*d
= (vdict_t
*)h
->dict
[BCF_DT_CTG
];
1686 int tid
, m
= kh_size(d
);
1687 const char **names
= (const char**) calloc(m
,sizeof(const char*));
1689 for (k
=kh_begin(d
); k
<kh_end(d
); k
++)
1691 if ( !kh_exist(d
,k
) ) continue;
1692 tid
= kh_val(d
,k
).id
;
1694 names
[tid
] = kh_key(d
,k
);
1696 // sanity check: there should be no gaps
1697 for (tid
=0; tid
<m
; tid
++)
1703 int vcf_hdr_write(htsFile
*fp
, const bcf_hdr_t
*h
)
1705 kstring_t htxt
= {0,0,0};
1706 bcf_hdr_format(h
, 0, &htxt
);
1707 while (htxt
.l
&& htxt
.s
[htxt
.l
-1] == '\0') --htxt
.l
; // kill trailing zeros
1709 if ( fp
->format
.compression
!=no_compression
)
1710 ret
= bgzf_write(fp
->fp
.bgzf
, htxt
.s
, htxt
.l
);
1712 ret
= hwrite(fp
->fp
.hfile
, htxt
.s
, htxt
.l
);
1714 return ret
<0 ? -1 : 0;
1717 /***********************
1718 *** Typed value I/O ***
1719 ***********************/
1721 void bcf_enc_vint(kstring_t
*s
, int n
, int32_t *a
, int wsize
)
1723 int32_t max
= INT32_MIN
+ 1, min
= INT32_MAX
;
1725 if (n
<= 0) bcf_enc_size(s
, 0, BCF_BT_NULL
);
1726 else if (n
== 1) bcf_enc_int1(s
, a
[0]);
1728 if (wsize
<= 0) wsize
= n
;
1729 for (i
= 0; i
< n
; ++i
) {
1730 if (a
[i
] == bcf_int32_missing
|| a
[i
] == bcf_int32_vector_end
) continue;
1731 if (max
< a
[i
]) max
= a
[i
];
1732 if (min
> a
[i
]) min
= a
[i
];
1734 if (max
<= INT8_MAX
&& min
> bcf_int8_vector_end
) {
1735 bcf_enc_size(s
, wsize
, BCF_BT_INT8
);
1736 for (i
= 0; i
< n
; ++i
)
1737 if ( a
[i
]==bcf_int32_vector_end
) kputc(bcf_int8_vector_end
, s
);
1738 else if ( a
[i
]==bcf_int32_missing
) kputc(bcf_int8_missing
, s
);
1739 else kputc(a
[i
], s
);
1740 } else if (max
<= INT16_MAX
&& min
> bcf_int16_vector_end
) {
1742 bcf_enc_size(s
, wsize
, BCF_BT_INT16
);
1743 ks_resize(s
, s
->l
+ n
* sizeof(int16_t));
1744 p
= (uint8_t *) s
->s
+ s
->l
;
1745 for (i
= 0; i
< n
; ++i
)
1748 if ( a
[i
]==bcf_int32_vector_end
) x
= bcf_int16_vector_end
;
1749 else if ( a
[i
]==bcf_int32_missing
) x
= bcf_int16_missing
;
1752 p
+= sizeof(int16_t);
1754 s
->l
+= n
* sizeof(int16_t);
1757 bcf_enc_size(s
, wsize
, BCF_BT_INT32
);
1758 ks_resize(s
, s
->l
+ n
* sizeof(int32_t));
1759 p
= (uint8_t *) s
->s
+ s
->l
;
1760 for (i
= 0; i
< n
; ++i
) {
1762 p
+= sizeof(int32_t);
1764 s
->l
+= n
* sizeof(int32_t);
1769 static inline int serialize_float_array(kstring_t
*s
, size_t n
, const float *a
) {
1772 size_t bytes
= n
* sizeof(float);
1774 if (bytes
/ sizeof(float) != n
) return -1;
1775 if (ks_resize(s
, s
->l
+ bytes
) < 0) return -1;
1777 p
= (uint8_t *) s
->s
+ s
->l
;
1778 for (i
= 0; i
< n
; i
++) {
1779 float_to_le(a
[i
], p
);
1787 void bcf_enc_vfloat(kstring_t
*s
, int n
, float *a
)
1790 bcf_enc_size(s
, n
, BCF_BT_FLOAT
);
1791 serialize_float_array(s
, n
, a
);
1794 void bcf_enc_vchar(kstring_t
*s
, int l
, const char *a
)
1796 bcf_enc_size(s
, l
, BCF_BT_CHAR
);
1800 void bcf_fmt_array(kstring_t
*s
, int n
, int type
, void *data
)
1807 if (type
== BCF_BT_CHAR
)
1809 char *p
= (char*)data
;
1810 for (j
= 0; j
< n
&& *p
; ++j
, ++p
)
1812 if ( *p
==bcf_str_missing
) kputc('.', s
);
1818 #define BRANCH(type_t, convert, is_missing, is_vector_end, kprint) { \
1819 uint8_t *p = (uint8_t *) data; \
1820 for (j=0; j<n; j++, p += sizeof(type_t)) \
1822 type_t v = convert(p); \
1823 if ( is_vector_end ) break; \
1824 if ( j ) kputc(',', s); \
1825 if ( is_missing ) kputc('.', s); \
1830 case BCF_BT_INT8
: BRANCH(int8_t, le_to_i8
, v
==bcf_int8_missing
, v
==bcf_int8_vector_end
, kputw(v
, s
)); break;
1831 case BCF_BT_INT16
: BRANCH(int16_t, le_to_i16
, v
==bcf_int16_missing
, v
==bcf_int16_vector_end
, kputw(v
, s
)); break;
1832 case BCF_BT_INT32
: BRANCH(int32_t, le_to_i32
, v
==bcf_int32_missing
, v
==bcf_int32_vector_end
, kputw(v
, s
)); break;
1833 case BCF_BT_FLOAT
: BRANCH(uint32_t, le_to_u32
, v
==bcf_float_missing
, v
==bcf_float_vector_end
, kputd(le_to_float(p
), s
)); break;
1834 default: hts_log_error("Unexpected type %d", type
); exit(1); break;
1840 uint8_t *bcf_fmt_sized_array(kstring_t
*s
, uint8_t *ptr
)
1843 x
= bcf_dec_size(ptr
, &ptr
, &type
);
1844 bcf_fmt_array(s
, x
, type
, ptr
);
1845 return ptr
+ (x
<< bcf_type_shift
[type
]);
1848 /********************
1849 *** VCF site I/O ***
1850 ********************/
1853 int key
, max_m
, size
, offset
;
1854 uint64_t is_gt
:1, max_g
:31, max_l
:32;
1859 static inline void align_mem(kstring_t
*s
)
1863 int l
= ((s
->l
+ 7)>>3<<3) - s
->l
;
1864 kputsn((char*)&zero
, l
, s
);
1868 // p,q is the start and the end of the FORMAT field
1869 #define MAX_N_FMT 255 /* Limited by size of bcf1_t n_fmt field */
1870 static int vcf_parse_format(kstring_t
*s
, const bcf_hdr_t
*h
, bcf1_t
*v
, char *p
, char *q
)
1872 if ( !bcf_hdr_nsamples(h
) ) return 0;
1878 vdict_t
*d
= (vdict_t
*)h
->dict
[BCF_DT_ID
];
1879 kstring_t
*mem
= (kstring_t
*)&h
->mem
;
1880 fmt_aux_t fmt
[MAX_N_FMT
];
1883 char *end
= s
->s
+ s
->l
;
1886 hts_log_error("FORMAT column with no sample columns starting at %s:%d", s
->s
, v
->pos
+1);
1891 if ( p
[0]=='.' && p
[1]==0 ) // FORMAT field is empty "."
1893 v
->n_sample
= bcf_hdr_nsamples(h
);
1897 // get format information from the dictionary
1898 for (j
= 0, t
= kstrtok(p
, ":", &aux1
); t
; t
= kstrtok(0, 0, &aux1
), ++j
) {
1899 if (j
>= MAX_N_FMT
) {
1900 v
->errcode
|= BCF_ERR_LIMITS
;
1901 hts_log_error("FORMAT column at %s:%d lists more identifiers than htslib can handle",
1902 bcf_seqname(h
,v
), v
->pos
+1);
1907 k
= kh_get(vdict
, d
, t
);
1908 if (k
== kh_end(d
) || kh_val(d
, k
).info
[BCF_HL_FMT
] == 15) {
1909 if ( t
[0]=='.' && t
[1]==0 )
1911 hts_log_error("Invalid FORMAT tag name '.'");
1912 v
->errcode
|= BCF_ERR_TAG_INVALID
;
1915 hts_log_warning("FORMAT '%s' is not defined in the header, assuming Type=String", t
);
1916 kstring_t tmp
= {0,0,0};
1918 ksprintf(&tmp
, "##FORMAT=<ID=%s,Number=1,Type=String,Description=\"Dummy\">", t
);
1919 bcf_hrec_t
*hrec
= bcf_hdr_parse_line(h
,tmp
.s
,&l
);
1921 if ( bcf_hdr_add_hrec((bcf_hdr_t
*)h
, hrec
) ) bcf_hdr_sync((bcf_hdr_t
*)h
);
1922 k
= kh_get(vdict
, d
, t
);
1923 v
->errcode
= BCF_ERR_TAG_UNDEF
;
1924 if (k
== kh_end(d
)) {
1925 hts_log_error("Could not add dummy header for FORMAT '%s'", t
);
1926 v
->errcode
|= BCF_ERR_TAG_INVALID
;
1930 fmt
[j
].max_l
= fmt
[j
].max_m
= fmt
[j
].max_g
= 0;
1931 fmt
[j
].key
= kh_val(d
, k
).id
;
1932 fmt
[j
].is_gt
= !strcmp(t
, "GT");
1933 fmt
[j
].y
= h
->id
[0][fmt
[j
].key
].val
->info
[BCF_HL_FMT
];
1937 int n_sample_ori
= -1;
1938 r
= q
+ 1; // r: position in the format string
1939 l
= 0, m
= g
= 1, v
->n_sample
= 0; // m: max vector size, l: max field len, g: max number of alleles
1942 // can we skip some samples?
1943 if ( h
->keep_samples
)
1946 if ( !bit_array_test(h
->keep_samples
,n_sample_ori
) )
1948 while ( *r
!='\t' && r
<end
) r
++;
1949 if ( *r
=='\t' ) { *r
= 0; r
++; }
1954 // collect fmt stats: max vector size, length, number of alleles
1955 j
= 0; // j-th format field
1958 if ( *r
== '\t' ) *r
= 0;
1959 if ( *r
== ':' || !*r
) // end of field or end of sample
1961 if (fmt
[j
].max_m
< m
) fmt
[j
].max_m
= m
;
1962 if (fmt
[j
].max_l
< l
) fmt
[j
].max_l
= l
;
1963 if (fmt
[j
].is_gt
&& fmt
[j
].max_g
< g
) fmt
[j
].max_g
= g
;
1970 hts_log_error("Incorrect number of FORMAT fields at %s:%d",
1971 h
->id
[BCF_DT_CTG
][v
->rid
].key
, v
->pos
+1);
1977 else if ( *r
== ',' ) m
++;
1978 else if ( fmt
[j
].is_gt
&& (*r
== '|' || *r
== '/') ) g
++;
1979 if ( r
>=end
) break;
1983 if ( v
->n_sample
== bcf_hdr_nsamples(h
) ) break;
1987 // allocate memory for arrays
1988 for (j
= 0; j
< v
->n_fmt
; ++j
) {
1989 fmt_aux_t
*f
= &fmt
[j
];
1990 if ( !f
->max_m
) f
->max_m
= 1; // omitted trailing format field
1991 if ((f
->y
>>4&0xf) == BCF_HT_STR
) {
1992 f
->size
= f
->is_gt
? f
->max_g
<< 2 : f
->max_l
;
1993 } else if ((f
->y
>>4&0xf) == BCF_HT_REAL
|| (f
->y
>>4&0xf) == BCF_HT_INT
) {
1994 f
->size
= f
->max_m
<< 2;
1997 hts_log_error("The format type %d is currently not supported", f
->y
>>4&0xf);
1998 abort(); // I do not know how to do with Flag in the genotype fields
2002 ks_resize(mem
, mem
->l
+ v
->n_sample
* f
->size
);
2003 mem
->l
+= v
->n_sample
* f
->size
;
2005 for (j
= 0; j
< v
->n_fmt
; ++j
)
2006 fmt
[j
].buf
= (uint8_t*)mem
->s
+ fmt
[j
].offset
;
2007 // fill the sample fields; at beginning of the loop, t points to the first char of a format
2009 t
= q
+ 1; m
= 0; // m: sample id
2012 // can we skip some samples?
2013 if ( h
->keep_samples
)
2016 if ( !bit_array_test(h
->keep_samples
,n_sample_ori
) )
2018 while ( *t
&& t
<end
) t
++;
2023 if ( m
== bcf_hdr_nsamples(h
) ) break;
2025 j
= 0; // j-th format field, m-th sample
2028 fmt_aux_t
*z
= &fmt
[j
++];
2029 if ((z
->y
>>4&0xf) == BCF_HT_STR
) {
2030 if (z
->is_gt
) { // genotypes
2031 int32_t is_phased
= 0, *x
= (int32_t*)(z
->buf
+ z
->size
* m
);
2033 if (*t
== '.') ++t
, x
[l
++] = is_phased
;
2034 else x
[l
++] = (strtol(t
, &t
, 10) + 1) << 1 | is_phased
;
2035 #if THOROUGH_SANITY_CHECKS
2036 assert( 0 ); // success of strtol,strtod not checked
2038 is_phased
= (*t
== '|');
2039 if (*t
!= '|' && *t
!= '/') break;
2041 if ( !l
) x
[l
++] = 0; // An empty field, insert missing value
2042 for (; l
< z
->size
>>2; ++l
) x
[l
] = bcf_int32_vector_end
;
2044 char *x
= (char*)z
->buf
+ z
->size
* m
;
2045 for (r
= t
, l
= 0; *t
!= ':' && *t
; ++t
) x
[l
++] = *t
;
2046 for (; l
< z
->size
; ++l
) x
[l
] = 0;
2048 } else if ((z
->y
>>4&0xf) == BCF_HT_INT
) {
2049 int32_t *x
= (int32_t*)(z
->buf
+ z
->size
* m
);
2051 if (*t
== '.') x
[l
++] = bcf_int32_missing
, ++t
; // ++t to skip "."
2052 else x
[l
++] = strtol(t
, &t
, 10);
2053 if (*t
!= ',') break;
2055 if ( !l
) x
[l
++] = bcf_int32_missing
;
2056 for (; l
< z
->size
>>2; ++l
) x
[l
] = bcf_int32_vector_end
;
2057 } else if ((z
->y
>>4&0xf) == BCF_HT_REAL
) {
2058 float *x
= (float*)(z
->buf
+ z
->size
* m
);
2060 if (*t
== '.' && !isdigit_c(t
[1])) bcf_float_set_missing(x
[l
++]), ++t
; // ++t to skip "."
2061 else x
[l
++] = strtod(t
, &t
);
2062 if (*t
!= ',') break;
2064 if ( !l
) bcf_float_set_missing(x
[l
++]); // An empty field, insert missing value
2065 for (; l
< z
->size
>>2; ++l
) bcf_float_set_vector_end(x
[l
]);
2071 else if (*t
== ':') {
2076 hts_log_error("Invalid character '%s' in '%s' FORMAT field at %s:%d",
2077 dump_char(buffer
, *t
), h
->id
[BCF_DT_ID
][z
->key
].key
, bcf_seqname(h
,v
), v
->pos
+1);
2078 v
->errcode
|= BCF_ERR_CHAR
;
2083 for (; j
< v
->n_fmt
; ++j
) { // fill end-of-vector values
2084 fmt_aux_t
*z
= &fmt
[j
];
2085 if ((z
->y
>>4&0xf) == BCF_HT_STR
) {
2087 int32_t *x
= (int32_t*)(z
->buf
+ z
->size
* m
);
2088 if (z
->size
) x
[0] = bcf_int32_missing
;
2089 for (l
= 1; l
< z
->size
>>2; ++l
) x
[l
] = bcf_int32_vector_end
;
2091 char *x
= (char*)z
->buf
+ z
->size
* m
;
2092 if ( z
->size
) x
[0] = '.';
2093 for (l
= 1; l
< z
->size
; ++l
) x
[l
] = 0;
2095 } else if ((z
->y
>>4&0xf) == BCF_HT_INT
) {
2096 int32_t *x
= (int32_t*)(z
->buf
+ z
->size
* m
);
2097 x
[0] = bcf_int32_missing
;
2098 for (l
= 1; l
< z
->size
>>2; ++l
) x
[l
] = bcf_int32_vector_end
;
2099 } else if ((z
->y
>>4&0xf) == BCF_HT_REAL
) {
2100 float *x
= (float*)(z
->buf
+ z
->size
* m
);
2101 bcf_float_set_missing(x
[0]);
2102 for (l
= 1; l
< z
->size
>>2; ++l
) bcf_float_set_vector_end(x
[l
]);
2109 // write individual genotype information
2110 kstring_t
*str
= &v
->indiv
;
2112 if (v
->n_sample
> 0) {
2113 for (i
= 0; i
< v
->n_fmt
; ++i
) {
2114 fmt_aux_t
*z
= &fmt
[i
];
2115 bcf_enc_int1(str
, z
->key
);
2116 if ((z
->y
>>4&0xf) == BCF_HT_STR
&& !z
->is_gt
) {
2117 bcf_enc_size(str
, z
->size
, BCF_BT_CHAR
);
2118 kputsn((char*)z
->buf
, z
->size
* v
->n_sample
, str
);
2119 } else if ((z
->y
>>4&0xf) == BCF_HT_INT
|| z
->is_gt
) {
2120 bcf_enc_vint(str
, (z
->size
>>2) * v
->n_sample
, (int32_t*)z
->buf
, z
->size
>>2);
2122 bcf_enc_size(str
, z
->size
>>2, BCF_BT_FLOAT
);
2123 if (serialize_float_array(str
, (z
->size
>>2) * v
->n_sample
,
2124 (float *) z
->buf
) != 0) {
2125 v
->errcode
|= BCF_ERR_LIMITS
;
2126 hts_log_error("Out of memory");
2133 if ( v
->n_sample
!=bcf_hdr_nsamples(h
) )
2135 hts_log_error("Number of columns at %s:%d does not match the number of samples (%d vs %d)",
2136 bcf_seqname(h
,v
), v
->pos
+1, v
->n_sample
, bcf_hdr_nsamples(h
));
2137 v
->errcode
|= BCF_ERR_NCOLS
;
2140 if ( v
->indiv
.l
> 0xffffffff )
2142 hts_log_error("The FORMAT at %s:%d is too long", bcf_seqname(h
,v
), v
->pos
+1);
2143 v
->errcode
|= BCF_ERR_LIMITS
;
2145 // Error recovery: return -1 if this is a critical error or 0 if we want to ignore the FORMAT and proceed
2153 int vcf_parse(kstring_t
*s
, const bcf_hdr_t
*h
, bcf1_t
*v
)
2156 char *p
, *q
, *r
, *t
;
2160 int max_n_flt
= 0, max_n_val
= 0;
2161 int32_t *flt_a
= NULL
, *val_a
= NULL
;
2164 // Assumed in lots of places, but we may as well spot this early
2165 assert(sizeof(float) == sizeof(int32_t));
2169 memset(&aux
, 0, sizeof(ks_tokaux_t
));
2170 for (p
= kstrtok(s
->s
, "\t", &aux
), i
= 0; p
; p
= kstrtok(0, 0, &aux
), ++i
) {
2173 if (i
== 0) { // CHROM
2174 vdict_t
*d
= (vdict_t
*)h
->dict
[BCF_DT_CTG
];
2175 k
= kh_get(vdict
, d
, p
);
2178 // Simple error recovery for chromosomes not defined in the header. It will not help when VCF header has
2179 // been already printed, but will enable tools like vcfcheck to proceed.
2180 hts_log_warning("Contig '%s' is not defined in the header. (Quick workaround: index the file with tabix.)", p
);
2181 kstring_t tmp
= {0,0,0};
2183 ksprintf(&tmp
, "##contig=<ID=%s>", p
);
2184 bcf_hrec_t
*hrec
= bcf_hdr_parse_line(h
,tmp
.s
,&l
);
2186 if ( bcf_hdr_add_hrec((bcf_hdr_t
*)h
, hrec
) ) bcf_hdr_sync((bcf_hdr_t
*)h
);
2187 k
= kh_get(vdict
, d
, p
);
2188 v
->errcode
= BCF_ERR_CTG_UNDEF
;
2189 if (k
== kh_end(d
)) {
2190 hts_log_error("Could not add dummy header for contig '%s'", p
);
2191 v
->errcode
|= BCF_ERR_CTG_INVALID
;
2195 v
->rid
= kh_val(d
, k
).id
;
2196 } else if (i
== 1) { // POS
2197 v
->pos
= atoi(p
) - 1;
2198 } else if (i
== 2) { // ID
2199 if (strcmp(p
, ".")) bcf_enc_vchar(str
, q
- p
, p
);
2200 else bcf_enc_size(str
, 0, BCF_BT_CHAR
);
2201 } else if (i
== 3) { // REF
2202 bcf_enc_vchar(str
, q
- p
, p
);
2203 v
->n_allele
= 1, v
->rlen
= q
- p
;
2204 } else if (i
== 4) { // ALT
2205 if (strcmp(p
, ".")) {
2206 for (r
= t
= p
;; ++r
) {
2207 if (*r
== ',' || *r
== 0) {
2208 bcf_enc_vchar(str
, r
- t
, t
);
2215 } else if (i
== 5) { // QUAL
2216 if (strcmp(p
, ".")) v
->qual
= atof(p
);
2217 else bcf_float_set_missing(v
->qual
);
2218 if ( v
->max_unpack
&& !(v
->max_unpack
>>1) ) goto end
; // BCF_UN_STR
2219 } else if (i
== 6) { // FILTER
2220 if (strcmp(p
, ".")) {
2223 vdict_t
*d
= (vdict_t
*)h
->dict
[BCF_DT_ID
];
2224 // count the number of filters
2225 if (*(q
-1) == ';') *(q
-1) = 0;
2226 for (r
= p
; *r
; ++r
)
2227 if (*r
== ';') ++n_flt
;
2228 if (n_flt
> max_n_flt
) {
2229 int32_t *a
= realloc(flt_a
, n_flt
* sizeof(int32_t));
2231 hts_log_error("Could not allocate memory");
2232 v
->errcode
|= BCF_ERR_LIMITS
; // No appropriate code?
2239 for (t
= kstrtok(p
, ";", &aux1
), i
= 0; t
; t
= kstrtok(0, 0, &aux1
)) {
2241 k
= kh_get(vdict
, d
, t
);
2244 // Simple error recovery for FILTERs not defined in the header. It will not help when VCF header has
2245 // been already printed, but will enable tools like vcfcheck to proceed.
2246 hts_log_warning("FILTER '%s' is not defined in the header", t
);
2247 kstring_t tmp
= {0,0,0};
2249 ksprintf(&tmp
, "##FILTER=<ID=%s,Description=\"Dummy\">", t
);
2250 bcf_hrec_t
*hrec
= bcf_hdr_parse_line(h
,tmp
.s
,&l
);
2252 if ( bcf_hdr_add_hrec((bcf_hdr_t
*)h
, hrec
) ) bcf_hdr_sync((bcf_hdr_t
*)h
);
2253 k
= kh_get(vdict
, d
, t
);
2254 v
->errcode
= BCF_ERR_TAG_UNDEF
;
2255 if (k
== kh_end(d
)) {
2256 hts_log_error("Could not add dummy header for FILTER '%s'", t
);
2257 v
->errcode
|= BCF_ERR_TAG_INVALID
;
2261 flt_a
[i
++] = kh_val(d
, k
).id
;
2264 bcf_enc_vint(str
, n_flt
, flt_a
, -1);
2265 } else bcf_enc_vint(str
, 0, 0, -1);
2266 if ( v
->max_unpack
&& !(v
->max_unpack
>>2) ) goto end
; // BCF_UN_FLT
2267 } else if (i
== 7) { // INFO
2269 vdict_t
*d
= (vdict_t
*)h
->dict
[BCF_DT_ID
];
2271 if (strcmp(p
, ".")) {
2272 if (*(q
-1) == ';') *(q
-1) = 0;
2273 for (r
= key
= p
;; ++r
) {
2276 if (*r
!= ';' && *r
!= '=' && *r
!= 0) continue;
2281 for (end
= val
; *end
!= ';' && *end
!= 0; ++end
);
2284 if ( !*key
) { if (c
==0) break; r
= end
; key
= r
+ 1; continue; } // faulty VCF, ";;" in the INFO
2285 k
= kh_get(vdict
, d
, key
);
2286 if (k
== kh_end(d
) || kh_val(d
, k
).info
[BCF_HL_INFO
] == 15)
2288 hts_log_warning("INFO '%s' is not defined in the header, assuming Type=String", key
);
2289 kstring_t tmp
= {0,0,0};
2291 ksprintf(&tmp
, "##INFO=<ID=%s,Number=1,Type=String,Description=\"Dummy\">", key
);
2292 bcf_hrec_t
*hrec
= bcf_hdr_parse_line(h
,tmp
.s
,&l
);
2294 if ( bcf_hdr_add_hrec((bcf_hdr_t
*)h
, hrec
) ) bcf_hdr_sync((bcf_hdr_t
*)h
);
2295 k
= kh_get(vdict
, d
, key
);
2296 v
->errcode
= BCF_ERR_TAG_UNDEF
;
2297 if (k
== kh_end(d
)) {
2298 hts_log_error("Could not add dummy header for INFO '%s'", key
);
2299 v
->errcode
|= BCF_ERR_TAG_INVALID
;
2303 uint32_t y
= kh_val(d
, k
).info
[BCF_HL_INFO
];
2305 bcf_enc_int1(str
, kh_val(d
, k
).id
);
2307 bcf_enc_size(str
, 0, BCF_BT_NULL
);
2308 } else if ((y
>>4&0xf) == BCF_HT_FLAG
|| (y
>>4&0xf) == BCF_HT_STR
) { // if Flag has a value, treat it as a string
2309 bcf_enc_vchar(str
, end
- val
, val
);
2310 } else { // int/float value/array
2313 for (t
= val
, n_val
= 1; *t
; ++t
) // count the number of values
2314 if (*t
== ',') ++n_val
;
2315 // Check both int and float size in one step for simplicity
2316 if (n_val
> max_n_val
) {
2318 z
= (int32_t *)realloc((void *)val_a
, n_val
* sizeof(*z
));
2320 hts_log_error("Could not allocate memory");
2321 v
->errcode
|= BCF_ERR_LIMITS
; // No appropriate code?
2327 if ((y
>>4&0xf) == BCF_HT_INT
) {
2328 for (i
= 0, t
= val
; i
< n_val
; ++i
, ++t
)
2330 val_a
[i
] = strtol(t
, &te
, 10);
2331 if ( te
==t
) // conversion failed
2333 val_a
[i
] = bcf_int32_missing
;
2334 while ( *te
&& *te
!=',' ) te
++;
2338 bcf_enc_vint(str
, n_val
, val_a
, -1);
2339 if (strcmp(key
, "END") == 0) v
->rlen
= val_a
[0] - v
->pos
;
2340 } else if ((y
>>4&0xf) == BCF_HT_REAL
) {
2341 float *val_f
= (float *)val_a
;
2342 for (i
= 0, t
= val
; i
< n_val
; ++i
, ++t
)
2344 val_f
[i
] = strtod(t
, &te
);
2345 if ( te
==t
) // conversion failed
2347 bcf_float_set_missing(val_f
[i
]);
2348 while ( *te
&& *te
!=',' ) te
++;
2352 bcf_enc_vfloat(str
, n_val
, val_f
);
2360 if ( v
->max_unpack
&& !(v
->max_unpack
>>3) ) goto end
;
2361 } else if (i
== 8) {// FORMAT
2364 return vcf_parse_format(s
, h
, v
, p
, q
);
2377 int vcf_read(htsFile
*fp
, const bcf_hdr_t
*h
, bcf1_t
*v
)
2380 ret
= hts_getline(fp
, KS_SEP_LINE
, &fp
->line
);
2381 if (ret
< 0) return ret
;
2382 return vcf_parse1(&fp
->line
, h
, v
);
2385 static inline uint8_t *bcf_unpack_fmt_core1(uint8_t *ptr
, int n_sample
, bcf_fmt_t
*fmt
)
2387 uint8_t *ptr_start
= ptr
;
2388 fmt
->id
= bcf_dec_typed_int1(ptr
, &ptr
);
2389 fmt
->n
= bcf_dec_size(ptr
, &ptr
, &fmt
->type
);
2390 fmt
->size
= fmt
->n
<< bcf_type_shift
[fmt
->type
];
2392 fmt
->p_off
= ptr
- ptr_start
;
2394 ptr
+= n_sample
* fmt
->size
;
2395 fmt
->p_len
= ptr
- fmt
->p
;
2399 static inline uint8_t *bcf_unpack_info_core1(uint8_t *ptr
, bcf_info_t
*info
)
2401 uint8_t *ptr_start
= ptr
;
2402 info
->key
= bcf_dec_typed_int1(ptr
, &ptr
);
2403 info
->len
= bcf_dec_size(ptr
, &ptr
, &info
->type
);
2405 info
->vptr_off
= ptr
- ptr_start
;
2406 info
->vptr_free
= 0;
2408 if (info
->len
== 1) {
2409 if (info
->type
== BCF_BT_INT8
|| info
->type
== BCF_BT_CHAR
) info
->v1
.i
= *(int8_t*)ptr
;
2410 else if (info
->type
== BCF_BT_INT32
) info
->v1
.i
= le_to_i32(ptr
);
2411 else if (info
->type
== BCF_BT_FLOAT
) info
->v1
.f
= le_to_float(ptr
);
2412 else if (info
->type
== BCF_BT_INT16
) info
->v1
.i
= le_to_i16(ptr
);
2414 ptr
+= info
->len
<< bcf_type_shift
[info
->type
];
2415 info
->vptr_len
= ptr
- info
->vptr
;
2419 int bcf_unpack(bcf1_t
*b
, int which
)
2421 if ( !b
->shared
.l
) return 0; // Building a new BCF record from scratch
2422 uint8_t *ptr
= (uint8_t*)b
->shared
.s
, *ptr_ori
;
2424 bcf_dec_t
*d
= &b
->d
;
2425 if (which
& BCF_UN_FLT
) which
|= BCF_UN_STR
;
2426 if (which
& BCF_UN_INFO
) which
|= BCF_UN_SHR
;
2427 if ((which
&BCF_UN_STR
) && !(b
->unpacked
&BCF_UN_STR
))
2432 tmp
.l
= 0; tmp
.s
= d
->id
; tmp
.m
= d
->m_id
;
2434 ptr
= bcf_fmt_sized_array(&tmp
, ptr
);
2435 b
->unpack_size
[0] = ptr
- ptr_ori
;
2437 d
->id
= tmp
.s
; d
->m_id
= tmp
.m
;
2439 // REF and ALT are in a single block (d->als) and d->alleles are pointers into this block
2440 hts_expand(char*, b
->n_allele
, d
->m_allele
, d
->allele
); // NM: hts_expand() is a macro
2441 tmp
.l
= 0; tmp
.s
= d
->als
; tmp
.m
= d
->m_als
;
2444 for (i
= 0; i
< b
->n_allele
; ++i
) {
2445 d
->allele
[i
] = o
+ tmp
.l
;
2446 ptr
= bcf_fmt_sized_array(&tmp
, ptr
);
2449 b
->unpack_size
[1] = ptr
- ptr_ori
;
2450 d
->als
= tmp
.s
; d
->m_als
= tmp
.m
;
2452 for (i
= 0; i
< b
->n_allele
; ++i
)
2453 d
->allele
[i
] = d
->als
+ (d
->allele
[i
]-o
);
2454 b
->unpacked
|= BCF_UN_STR
;
2456 if ((which
&BCF_UN_FLT
) && !(b
->unpacked
&BCF_UN_FLT
)) { // FILTER
2457 ptr
= (uint8_t*)b
->shared
.s
+ b
->unpack_size
[0] + b
->unpack_size
[1];
2461 d
->n_flt
= bcf_dec_size(ptr
, &ptr
, &type
);
2462 hts_expand(int, d
->n_flt
, d
->m_flt
, d
->flt
);
2463 for (i
= 0; i
< d
->n_flt
; ++i
)
2464 d
->flt
[i
] = bcf_dec_int1(ptr
, type
, &ptr
);
2465 } else ++ptr
, d
->n_flt
= 0;
2466 b
->unpack_size
[2] = ptr
- ptr_ori
;
2467 b
->unpacked
|= BCF_UN_FLT
;
2469 if ((which
&BCF_UN_INFO
) && !(b
->unpacked
&BCF_UN_INFO
)) { // INFO
2470 ptr
= (uint8_t*)b
->shared
.s
+ b
->unpack_size
[0] + b
->unpack_size
[1] + b
->unpack_size
[2];
2471 hts_expand(bcf_info_t
, b
->n_info
, d
->m_info
, d
->info
);
2472 for (i
= 0; i
< d
->m_info
; ++i
) d
->info
[i
].vptr_free
= 0;
2473 for (i
= 0; i
< b
->n_info
; ++i
)
2474 ptr
= bcf_unpack_info_core1(ptr
, &d
->info
[i
]);
2475 b
->unpacked
|= BCF_UN_INFO
;
2477 if ((which
&BCF_UN_FMT
) && b
->n_sample
&& !(b
->unpacked
&BCF_UN_FMT
)) { // FORMAT
2478 ptr
= (uint8_t*)b
->indiv
.s
;
2479 hts_expand(bcf_fmt_t
, b
->n_fmt
, d
->m_fmt
, d
->fmt
);
2480 for (i
= 0; i
< d
->m_fmt
; ++i
) d
->fmt
[i
].p_free
= 0;
2481 for (i
= 0; i
< b
->n_fmt
; ++i
)
2482 ptr
= bcf_unpack_fmt_core1(ptr
, b
->n_sample
, &d
->fmt
[i
]);
2483 b
->unpacked
|= BCF_UN_FMT
;
2488 int vcf_format(const bcf_hdr_t
*h
, const bcf1_t
*v
, kstring_t
*s
)
2491 bcf_unpack((bcf1_t
*)v
, BCF_UN_ALL
);
2492 kputs(h
->id
[BCF_DT_CTG
][v
->rid
].key
, s
); // CHROM
2493 kputc('\t', s
); kputw(v
->pos
+ 1, s
); // POS
2494 kputc('\t', s
); kputs(v
->d
.id
? v
->d
.id
: ".", s
); // ID
2495 kputc('\t', s
); // REF
2496 if (v
->n_allele
> 0) kputs(v
->d
.allele
[0], s
);
2498 kputc('\t', s
); // ALT
2499 if (v
->n_allele
> 1) {
2500 for (i
= 1; i
< v
->n_allele
; ++i
) {
2501 if (i
> 1) kputc(',', s
);
2502 kputs(v
->d
.allele
[i
], s
);
2504 } else kputc('.', s
);
2505 kputc('\t', s
); // QUAL
2506 if ( bcf_float_is_missing(v
->qual
) ) kputc('.', s
); // QUAL
2507 else kputd(v
->qual
, s
);
2508 kputc('\t', s
); // FILTER
2510 for (i
= 0; i
< v
->d
.n_flt
; ++i
) {
2511 if (i
) kputc(';', s
);
2512 kputs(h
->id
[BCF_DT_ID
][v
->d
.flt
[i
]].key
, s
);
2514 } else kputc('.', s
);
2515 kputc('\t', s
); // INFO
2518 for (i
= 0; i
< v
->n_info
; ++i
) {
2519 bcf_info_t
*z
= &v
->d
.info
[i
];
2520 if ( !z
->vptr
) continue;
2521 if ( !first
) kputc(';', s
);
2523 if (z
->key
>= h
->n
[BCF_DT_ID
]) {
2524 hts_log_error("Invalid BCF, the INFO index is too large");
2528 kputs(h
->id
[BCF_DT_ID
][z
->key
].key
, s
);
2529 if (z
->len
<= 0) continue;
2535 case BCF_BT_INT8
: if ( z
->v1
.i
==bcf_int8_missing
) kputc('.', s
); else kputw(z
->v1
.i
, s
); break;
2536 case BCF_BT_INT16
: if ( z
->v1
.i
==bcf_int16_missing
) kputc('.', s
); else kputw(z
->v1
.i
, s
); break;
2537 case BCF_BT_INT32
: if ( z
->v1
.i
==bcf_int32_missing
) kputc('.', s
); else kputw(z
->v1
.i
, s
); break;
2538 case BCF_BT_FLOAT
: if ( bcf_float_is_missing(z
->v1
.f
) ) kputc('.', s
); else kputd(z
->v1
.f
, s
); break;
2539 case BCF_BT_CHAR
: kputc(z
->v1
.i
, s
); break;
2540 default: hts_log_error("Unexpected type %d", z
->type
); exit(1); break;
2543 else bcf_fmt_array(s
, z
->len
, z
->type
, z
->vptr
);
2545 if ( first
) kputc('.', s
);
2546 } else kputc('.', s
);
2547 // FORMAT and individual information
2554 bcf_fmt_t
*fmt
= v
->d
.fmt
;
2556 for (i
= 0; i
< (int)v
->n_fmt
; ++i
) {
2557 if ( !fmt
[i
].p
) continue;
2558 kputc(!first
? ':' : '\t', s
); first
= 0;
2559 if ( fmt
[i
].id
<0 ) //!bcf_hdr_idinfo_exists(h,BCF_HL_FMT,fmt[i].id) )
2561 hts_log_error("Invalid BCF, the FORMAT tag id=%d not present in the header", fmt
[i
].id
);
2564 kputs(h
->id
[BCF_DT_ID
][fmt
[i
].id
].key
, s
);
2565 if (strcmp(h
->id
[BCF_DT_ID
][fmt
[i
].id
].key
, "GT") == 0) gt_i
= i
;
2567 if ( first
) kputs("\t.", s
);
2568 for (j
= 0; j
< v
->n_sample
; ++j
) {
2571 for (i
= 0; i
< (int)v
->n_fmt
; ++i
) {
2572 bcf_fmt_t
*f
= &fmt
[i
];
2573 if ( !f
->p
) continue;
2574 if (!first
) kputc(':', s
);
2577 bcf_format_gt(f
,j
,s
);
2579 bcf_fmt_array(s
, f
->n
, f
->type
, f
->p
+ j
* f
->size
);
2581 if ( first
) kputc('.', s
);
2585 for (j
=0; j
<=v
->n_sample
; j
++)
2592 int vcf_write_line(htsFile
*fp
, kstring_t
*line
)
2595 if ( line
->s
[line
->l
-1]!='\n' ) kputc('\n',line
);
2596 if ( fp
->format
.compression
!=no_compression
)
2597 ret
= bgzf_write(fp
->fp
.bgzf
, line
->s
, line
->l
);
2599 ret
= hwrite(fp
->fp
.hfile
, line
->s
, line
->l
);
2600 return ret
==line
->l
? 0 : -1;
2603 int vcf_write(htsFile
*fp
, const bcf_hdr_t
*h
, bcf1_t
*v
)
2607 if (vcf_format1(h
, v
, &fp
->line
) != 0)
2609 if ( fp
->format
.compression
!=no_compression
)
2610 ret
= bgzf_write(fp
->fp
.bgzf
, fp
->line
.s
, fp
->line
.l
);
2612 ret
= hwrite(fp
->fp
.hfile
, fp
->line
.s
, fp
->line
.l
);
2613 return ret
==fp
->line
.l
? 0 : -1;
2616 /************************
2617 * Data access routines *
2618 ************************/
2620 int bcf_hdr_id2int(const bcf_hdr_t
*h
, int which
, const char *id
)
2623 vdict_t
*d
= (vdict_t
*)h
->dict
[which
];
2624 k
= kh_get(vdict
, d
, id
);
2625 return k
== kh_end(d
)? -1 : kh_val(d
, k
).id
;
2629 /********************
2630 *** BCF indexing ***
2631 ********************/
2633 hts_idx_t
*bcf_index(htsFile
*fp
, int min_shift
)
2637 hts_idx_t
*idx
= NULL
;
2639 int64_t max_len
= 0, s
;
2641 h
= bcf_hdr_read(fp
);
2642 if ( !h
) return NULL
;
2644 for (i
= 0; i
< h
->n
[BCF_DT_CTG
]; ++i
)
2646 if ( !h
->id
[BCF_DT_CTG
][i
].val
) continue;
2647 if ( max_len
< h
->id
[BCF_DT_CTG
][i
].val
->info
[0] ) max_len
= h
->id
[BCF_DT_CTG
][i
].val
->info
[0];
2650 if ( !max_len
) max_len
= ((int64_t)1<<31) - 1; // In case contig line is broken.
2652 for (n_lvls
= 0, s
= 1<<min_shift
; max_len
> s
; ++n_lvls
, s
<<= 3);
2653 idx
= hts_idx_init(nids
, HTS_FMT_CSI
, bgzf_tell(fp
->fp
.bgzf
), min_shift
, n_lvls
);
2654 if (!idx
) goto fail
;
2657 while ((r
= bcf_read1(fp
,h
, b
)) >= 0) {
2659 ret
= hts_idx_push(idx
, b
->rid
, b
->pos
, b
->pos
+ b
->rlen
, bgzf_tell(fp
->fp
.bgzf
), 1);
2660 if (ret
< 0) goto fail
;
2662 if (r
< -1) goto fail
;
2663 hts_idx_finish(idx
, bgzf_tell(fp
->fp
.bgzf
));
2669 hts_idx_destroy(idx
);
2675 hts_idx_t
*bcf_index_load2(const char *fn
, const char *fnidx
)
2677 return fnidx
? hts_idx_load2(fn
, fnidx
) : bcf_index_load(fn
);
2680 int bcf_index_build3(const char *fn
, const char *fnidx
, int min_shift
, int n_threads
)
2686 if ((fp
= hts_open(fn
, "rb")) == 0) return -2;
2688 hts_set_threads(fp
, n_threads
);
2689 if ( fp
->format
.compression
!=bgzf
) { hts_close(fp
); return -3; }
2690 switch (fp
->format
.format
) {
2692 idx
= bcf_index(fp
, min_shift
);
2694 ret
= hts_idx_save_as(idx
, fn
, fnidx
, HTS_FMT_CSI
);
2695 if (ret
< 0) ret
= -4;
2696 hts_idx_destroy(idx
);
2702 tbx
= tbx_index(hts_get_bgzfp(fp
), min_shift
, &tbx_conf_vcf
);
2704 ret
= hts_idx_save_as(tbx
->idx
, fn
, fnidx
, min_shift
> 0 ? HTS_FMT_CSI
: HTS_FMT_TBI
);
2705 if (ret
< 0) ret
= -4;
2719 int bcf_index_build2(const char *fn
, const char *fnidx
, int min_shift
)
2721 return bcf_index_build3(fn
, fnidx
, min_shift
, 0);
2724 int bcf_index_build(const char *fn
, int min_shift
)
2726 return bcf_index_build3(fn
, NULL
, min_shift
, 0);
2733 int bcf_hdr_combine(bcf_hdr_t
*dst
, const bcf_hdr_t
*src
)
2735 int i
, ndst_ori
= dst
->nhrec
, need_sync
= 0, ret
= 0;
2736 for (i
=0; i
<src
->nhrec
; i
++)
2738 if ( src
->hrec
[i
]->type
==BCF_HL_GEN
&& src
->hrec
[i
]->value
)
2741 for (j
=0; j
<ndst_ori
; j
++)
2743 if ( dst
->hrec
[j
]->type
!=BCF_HL_GEN
) continue;
2745 // Checking only the key part of generic lines, otherwise
2746 // the VCFs are too verbose. Should we perhaps add a flag
2747 // to bcf_hdr_combine() and make this optional?
2748 if ( !strcmp(src
->hrec
[i
]->key
,dst
->hrec
[j
]->key
) ) break;
2751 need_sync
+= bcf_hdr_add_hrec(dst
, bcf_hrec_dup(src
->hrec
[i
]));
2753 else if ( src
->hrec
[i
]->type
==BCF_HL_STR
)
2755 // NB: we are ignoring fields without ID
2756 int j
= bcf_hrec_find_key(src
->hrec
[i
],"ID");
2759 bcf_hrec_t
*rec
= bcf_hdr_get_hrec(dst
, src
->hrec
[i
]->type
, "ID", src
->hrec
[i
]->vals
[j
], src
->hrec
[i
]->key
);
2761 need_sync
+= bcf_hdr_add_hrec(dst
, bcf_hrec_dup(src
->hrec
[i
]));
2766 int j
= bcf_hrec_find_key(src
->hrec
[i
],"ID");
2767 assert( j
>=0 ); // this should always be true for valid VCFs
2769 bcf_hrec_t
*rec
= bcf_hdr_get_hrec(dst
, src
->hrec
[i
]->type
, "ID", src
->hrec
[i
]->vals
[j
], NULL
);
2771 need_sync
+= bcf_hdr_add_hrec(dst
, bcf_hrec_dup(src
->hrec
[i
]));
2772 else if ( src
->hrec
[i
]->type
==BCF_HL_INFO
|| src
->hrec
[i
]->type
==BCF_HL_FMT
)
2774 // Check that both records are of the same type. The bcf_hdr_id2length
2775 // macro cannot be used here because dst header is not synced yet.
2776 vdict_t
*d_src
= (vdict_t
*)src
->dict
[BCF_DT_ID
];
2777 vdict_t
*d_dst
= (vdict_t
*)dst
->dict
[BCF_DT_ID
];
2778 khint_t k_src
= kh_get(vdict
, d_src
, src
->hrec
[i
]->vals
[0]);
2779 khint_t k_dst
= kh_get(vdict
, d_dst
, src
->hrec
[i
]->vals
[0]);
2780 if ( (kh_val(d_src
,k_src
).info
[rec
->type
]>>8 & 0xf) != (kh_val(d_dst
,k_dst
).info
[rec
->type
]>>8 & 0xf) )
2782 hts_log_warning("Trying to combine \"%s\" tag definitions of different lengths",
2783 src
->hrec
[i
]->vals
[0]);
2786 if ( (kh_val(d_src
,k_src
).info
[rec
->type
]>>4 & 0xf) != (kh_val(d_dst
,k_dst
).info
[rec
->type
]>>4 & 0xf) )
2788 hts_log_warning("Trying to combine \"%s\" tag definitions of different types",
2789 src
->hrec
[i
]->vals
[0]);
2795 if ( need_sync
) bcf_hdr_sync(dst
);
2799 bcf_hdr_t
*bcf_hdr_merge(bcf_hdr_t
*dst
, const bcf_hdr_t
*src
)
2803 // this will effectively strip existing IDX attributes from src to become dst
2804 dst
= bcf_hdr_init("r");
2805 kstring_t htxt
= {0,0,0};
2806 bcf_hdr_format(src
, 0, &htxt
);
2807 if ( bcf_hdr_parse(dst
, htxt
.s
) < 0 ) {
2808 bcf_hdr_destroy(dst
);
2815 int i
, ndst_ori
= dst
->nhrec
, need_sync
= 0, ret
= 0;
2816 for (i
=0; i
<src
->nhrec
; i
++)
2818 if ( src
->hrec
[i
]->type
==BCF_HL_GEN
&& src
->hrec
[i
]->value
)
2821 for (j
=0; j
<ndst_ori
; j
++)
2823 if ( dst
->hrec
[j
]->type
!=BCF_HL_GEN
) continue;
2825 // Checking only the key part of generic lines, otherwise
2826 // the VCFs are too verbose. Should we perhaps add a flag
2827 // to bcf_hdr_combine() and make this optional?
2828 if ( !strcmp(src
->hrec
[i
]->key
,dst
->hrec
[j
]->key
) ) break;
2831 need_sync
+= bcf_hdr_add_hrec(dst
, bcf_hrec_dup(src
->hrec
[i
]));
2833 else if ( src
->hrec
[i
]->type
==BCF_HL_STR
)
2835 // NB: we are ignoring fields without ID
2836 int j
= bcf_hrec_find_key(src
->hrec
[i
],"ID");
2839 bcf_hrec_t
*rec
= bcf_hdr_get_hrec(dst
, src
->hrec
[i
]->type
, "ID", src
->hrec
[i
]->vals
[j
], src
->hrec
[i
]->key
);
2841 need_sync
+= bcf_hdr_add_hrec(dst
, bcf_hrec_dup(src
->hrec
[i
]));
2846 int j
= bcf_hrec_find_key(src
->hrec
[i
],"ID");
2847 assert( j
>=0 ); // this should always be true for valid VCFs
2849 bcf_hrec_t
*rec
= bcf_hdr_get_hrec(dst
, src
->hrec
[i
]->type
, "ID", src
->hrec
[i
]->vals
[j
], NULL
);
2851 need_sync
+= bcf_hdr_add_hrec(dst
, bcf_hrec_dup(src
->hrec
[i
]));
2852 else if ( src
->hrec
[i
]->type
==BCF_HL_INFO
|| src
->hrec
[i
]->type
==BCF_HL_FMT
)
2854 // Check that both records are of the same type. The bcf_hdr_id2length
2855 // macro cannot be used here because dst header is not synced yet.
2856 vdict_t
*d_src
= (vdict_t
*)src
->dict
[BCF_DT_ID
];
2857 vdict_t
*d_dst
= (vdict_t
*)dst
->dict
[BCF_DT_ID
];
2858 khint_t k_src
= kh_get(vdict
, d_src
, src
->hrec
[i
]->vals
[0]);
2859 khint_t k_dst
= kh_get(vdict
, d_dst
, src
->hrec
[i
]->vals
[0]);
2860 if ( (kh_val(d_src
,k_src
).info
[rec
->type
]>>8 & 0xf) != (kh_val(d_dst
,k_dst
).info
[rec
->type
]>>8 & 0xf) )
2862 hts_log_warning("Trying to combine \"%s\" tag definitions of different lengths",
2863 src
->hrec
[i
]->vals
[0]);
2866 if ( (kh_val(d_src
,k_src
).info
[rec
->type
]>>4 & 0xf) != (kh_val(d_dst
,k_dst
).info
[rec
->type
]>>4 & 0xf) )
2868 hts_log_warning("Trying to combine \"%s\" tag definitions of different types",
2869 src
->hrec
[i
]->vals
[0]);
2875 if ( need_sync
) bcf_hdr_sync(dst
);
2878 int bcf_translate(const bcf_hdr_t
*dst_hdr
, bcf_hdr_t
*src_hdr
, bcf1_t
*line
)
2881 if ( line
->errcode
)
2883 hts_log_error("Unchecked error (%d), exiting", line
->errcode
);
2886 if ( src_hdr
->ntransl
==-1 ) return 0; // no need to translate, all tags have the same id
2887 if ( !src_hdr
->ntransl
) // called for the first time, see what needs translating
2890 for (dict
=0; dict
<2; dict
++) // BCF_DT_ID and BCF_DT_CTG
2892 src_hdr
->transl
[dict
] = (int*) malloc(src_hdr
->n
[dict
]*sizeof(int));
2893 for (i
=0; i
<src_hdr
->n
[dict
]; i
++)
2895 if ( !src_hdr
->id
[dict
][i
].key
) // gap left after removed BCF header lines
2897 src_hdr
->transl
[dict
][i
] = -1;
2900 src_hdr
->transl
[dict
][i
] = bcf_hdr_id2int(dst_hdr
,dict
,src_hdr
->id
[dict
][i
].key
);
2901 if ( src_hdr
->transl
[dict
][i
]!=-1 && i
!=src_hdr
->transl
[dict
][i
] ) src_hdr
->ntransl
++;
2904 if ( !src_hdr
->ntransl
)
2906 free(src_hdr
->transl
[0]); src_hdr
->transl
[0] = NULL
;
2907 free(src_hdr
->transl
[1]); src_hdr
->transl
[1] = NULL
;
2908 src_hdr
->ntransl
= -1;
2910 if ( src_hdr
->ntransl
==-1 ) return 0;
2912 bcf_unpack(line
,BCF_UN_ALL
);
2915 if ( src_hdr
->transl
[BCF_DT_CTG
][line
->rid
] >=0 ) line
->rid
= src_hdr
->transl
[BCF_DT_CTG
][line
->rid
];
2918 for (i
=0; i
<line
->d
.n_flt
; i
++)
2920 int src_id
= line
->d
.flt
[i
];
2921 if ( src_hdr
->transl
[BCF_DT_ID
][src_id
] >=0 )
2922 line
->d
.flt
[i
] = src_hdr
->transl
[BCF_DT_ID
][src_id
];
2923 line
->d
.shared_dirty
|= BCF1_DIRTY_FLT
;
2927 for (i
=0; i
<line
->n_info
; i
++)
2929 int src_id
= line
->d
.info
[i
].key
;
2930 int dst_id
= src_hdr
->transl
[BCF_DT_ID
][src_id
];
2931 if ( dst_id
<0 ) continue;
2932 int src_size
= src_id
>>7 ? ( src_id
>>15 ? BCF_BT_INT32
: BCF_BT_INT16
) : BCF_BT_INT8
;
2933 int dst_size
= dst_id
>>7 ? ( dst_id
>>15 ? BCF_BT_INT32
: BCF_BT_INT16
) : BCF_BT_INT8
;
2934 if ( src_size
==dst_size
) // can overwrite
2936 line
->d
.info
[i
].key
= dst_id
;
2937 uint8_t *vptr
= line
->d
.info
[i
].vptr
- line
->d
.info
[i
].vptr_off
;
2938 if ( dst_size
==BCF_BT_INT8
) { vptr
[1] = (uint8_t)dst_id
; }
2939 else if ( dst_size
==BCF_BT_INT16
) { *(uint16_t*)vptr
= (uint16_t)dst_id
; }
2940 else { *(uint32_t*)vptr
= (uint32_t)dst_id
; }
2942 else // must realloc
2944 bcf_info_t
*info
= &line
->d
.info
[i
];
2945 assert( !info
->vptr_free
);
2946 kstring_t str
= {0,0,0};
2947 bcf_enc_int1(&str
, dst_id
);
2948 bcf_enc_size(&str
, info
->len
,info
->type
);
2949 info
->vptr_off
= str
.l
;
2950 kputsn((char*)info
->vptr
, info
->vptr_len
, &str
);
2951 info
->vptr
= (uint8_t*)str
.s
+ info
->vptr_off
;
2952 info
->vptr_free
= 1;
2954 line
->d
.shared_dirty
|= BCF1_DIRTY_INF
;
2959 for (i
=0; i
<line
->n_fmt
; i
++)
2961 int src_id
= line
->d
.fmt
[i
].id
;
2962 int dst_id
= src_hdr
->transl
[BCF_DT_ID
][src_id
];
2963 if ( dst_id
<0 ) continue;
2964 int src_size
= src_id
>>7 ? ( src_id
>>15 ? BCF_BT_INT32
: BCF_BT_INT16
) : BCF_BT_INT8
;
2965 int dst_size
= dst_id
>>7 ? ( dst_id
>>15 ? BCF_BT_INT32
: BCF_BT_INT16
) : BCF_BT_INT8
;
2966 if ( src_size
==dst_size
) // can overwrite
2968 line
->d
.fmt
[i
].id
= dst_id
;
2969 uint8_t *p
= line
->d
.fmt
[i
].p
- line
->d
.fmt
[i
].p_off
; // pointer to the vector size (4bits) and BT type (4bits)
2970 if ( dst_size
==BCF_BT_INT8
) { p
[1] = dst_id
; }
2971 else if ( dst_size
==BCF_BT_INT16
) { uint8_t *x
= (uint8_t*) &dst_id
; p
[1] = x
[0]; p
[2] = x
[1]; }
2972 else { uint8_t *x
= (uint8_t*) &dst_id
; p
[1] = x
[0]; p
[2] = x
[1]; p
[3] = x
[2]; p
[4] = x
[3]; }
2974 else // must realloc
2976 bcf_fmt_t
*fmt
= &line
->d
.fmt
[i
];
2977 assert( !fmt
->p_free
);
2978 kstring_t str
= {0,0,0};
2979 bcf_enc_int1(&str
, dst_id
);
2980 bcf_enc_size(&str
, fmt
->n
, fmt
->type
);
2982 kputsn((char*)fmt
->p
, fmt
->p_len
, &str
);
2983 fmt
->p
= (uint8_t*)str
.s
+ fmt
->p_off
;
2986 line
->d
.indiv_dirty
= 1;
2992 bcf_hdr_t
*bcf_hdr_dup(const bcf_hdr_t
*hdr
)
2994 bcf_hdr_t
*hout
= bcf_hdr_init("r");
2996 hts_log_error("Failed to allocate bcf header");
2999 kstring_t htxt
= {0,0,0};
3000 bcf_hdr_format(hdr
, 1, &htxt
);
3001 if ( bcf_hdr_parse(hout
, htxt
.s
) < 0 ) {
3002 bcf_hdr_destroy(hout
);
3009 bcf_hdr_t
*bcf_hdr_subset(const bcf_hdr_t
*h0
, int n
, char *const* samples
, int *imap
)
3011 void *names_hash
= khash_str2int_init();
3012 kstring_t htxt
= {0,0,0};
3013 kstring_t str
= {0,0,0};
3014 bcf_hdr_t
*h
= bcf_hdr_init("w");
3016 hts_log_error("Failed to allocate bcf header");
3019 bcf_hdr_format(h0
, 1, &htxt
);
3020 bcf_hdr_set_version(h
,bcf_hdr_get_version(h0
));
3022 for (j
=0; j
<n
; j
++) imap
[j
] = -1;
3023 if ( bcf_hdr_nsamples(h0
) > 0) {
3024 char *p
= find_chrom_header_line(htxt
.s
);
3025 int i
= 0, end
= n
? 8 : 7;
3026 while ((p
= strchr(p
, '\t')) != 0 && i
< end
) ++i
, ++p
;
3028 free(h
); free(str
.s
);
3029 return 0; // malformated header
3031 kputsn(htxt
.s
, p
- htxt
.s
, &str
);
3032 for (i
= 0; i
< n
; ++i
) {
3033 if ( khash_str2int_has_key(names_hash
,samples
[i
]) )
3035 hts_log_error("Duplicate sample name \"%s\"", samples
[i
]);
3038 khash_str2int_destroy(names_hash
);
3042 imap
[i
] = bcf_hdr_id2int(h0
, BCF_DT_SAMPLE
, samples
[i
]);
3043 if (imap
[i
] < 0) continue;
3045 kputs(samples
[i
], &str
);
3046 khash_str2int_inc(names_hash
,samples
[i
]);
3048 } else kputsn(htxt
.s
, htxt
.l
, &str
);
3049 while (str
.l
&& (!str
.s
[str
.l
-1] || str
.s
[str
.l
-1]=='\n') ) str
.l
--; // kill trailing zeros and newlines
3051 if ( bcf_hdr_parse(h
, str
.s
) < 0 ) {
3057 khash_str2int_destroy(names_hash
);
3061 int bcf_hdr_set_samples(bcf_hdr_t
*hdr
, const char *samples
, int is_file
)
3063 if ( samples
&& !strcmp("-",samples
) ) return 0; // keep all samples
3065 hdr
->nsamples_ori
= bcf_hdr_nsamples(hdr
);
3066 if ( !samples
) { bcf_hdr_nsamples(hdr
) = 0; return 0; } // exclude all samples
3068 int i
, narr
= bit_array_size(bcf_hdr_nsamples(hdr
));
3069 hdr
->keep_samples
= (uint8_t*) calloc(narr
,1);
3070 if ( samples
[0]=='^' )
3071 for (i
=0; i
<bcf_hdr_nsamples(hdr
); i
++) bit_array_set(hdr
->keep_samples
,i
);
3073 int idx
, n
, ret
= 0;
3074 char **smpls
= hts_readlist(samples
[0]=='^'?samples
+1:samples
, is_file
, &n
);
3075 if ( !smpls
) return -1;
3078 idx
= bcf_hdr_id2int(hdr
,BCF_DT_SAMPLE
,smpls
[i
]);
3081 if ( !ret
) ret
= i
+1;
3084 assert( idx
<bcf_hdr_nsamples(hdr
) );
3085 if ( samples
[0]=='^' )
3086 bit_array_clear(hdr
->keep_samples
, idx
);
3088 bit_array_set(hdr
->keep_samples
, idx
);
3090 for (i
=0; i
<n
; i
++) free(smpls
[i
]);
3093 bcf_hdr_nsamples(hdr
) = 0;
3094 for (i
=0; i
<hdr
->nsamples_ori
; i
++)
3095 if ( bit_array_test(hdr
->keep_samples
,i
) ) bcf_hdr_nsamples(hdr
)++;
3096 if ( !bcf_hdr_nsamples(hdr
) ) { free(hdr
->keep_samples
); hdr
->keep_samples
=NULL
; }
3099 char **samples
= (char**) malloc(sizeof(char*)*bcf_hdr_nsamples(hdr
));
3101 for (i
=0; i
<hdr
->nsamples_ori
; i
++)
3102 if ( bit_array_test(hdr
->keep_samples
,i
) ) samples
[idx
++] = strdup(hdr
->samples
[i
]);
3104 hdr
->samples
= samples
;
3106 // delete original samples from the dictionary
3107 vdict_t
*d
= (vdict_t
*)hdr
->dict
[BCF_DT_SAMPLE
];
3109 for (k
= kh_begin(d
); k
!= kh_end(d
); ++k
)
3110 if (kh_exist(d
, k
)) free((char*)kh_key(d
, k
));
3111 kh_destroy(vdict
, d
);
3113 // add the subset back
3114 hdr
->dict
[BCF_DT_SAMPLE
] = d
= kh_init(vdict
);
3115 for (i
=0; i
<bcf_hdr_nsamples(hdr
); i
++)
3117 int ignore
, k
= kh_put(vdict
, d
, hdr
->samples
[i
], &ignore
);
3118 kh_val(d
, k
) = bcf_idinfo_def
;
3119 kh_val(d
, k
).id
= kh_size(d
) - 1;
3127 int bcf_subset(const bcf_hdr_t
*h
, bcf1_t
*v
, int n
, int *imap
)
3130 ind
.s
= 0; ind
.l
= ind
.m
= 0;
3132 bcf_fmt_t fmt
[MAX_N_FMT
];
3134 uint8_t *ptr
= (uint8_t*)v
->indiv
.s
;
3135 for (i
= 0; i
< v
->n_fmt
; ++i
)
3136 ptr
= bcf_unpack_fmt_core1(ptr
, v
->n_sample
, &fmt
[i
]);
3137 for (i
= 0; i
< (int)v
->n_fmt
; ++i
) {
3138 bcf_fmt_t
*f
= &fmt
[i
];
3139 bcf_enc_int1(&ind
, f
->id
);
3140 bcf_enc_size(&ind
, f
->n
, f
->type
);
3141 for (j
= 0; j
< n
; ++j
)
3142 if (imap
[j
] >= 0) kputsn((char*)(f
->p
+ imap
[j
] * f
->size
), f
->size
, &ind
);
3144 for (i
= j
= 0; j
< n
; ++j
) if (imap
[j
] >= 0) ++i
;
3146 } else v
->n_sample
= 0;
3147 if ( !v
->n_sample
) v
->n_fmt
= 0;
3150 v
->unpacked
&= ~BCF_UN_FMT
; // only BCF is ready for output, VCF will need to unpack again
3154 int bcf_is_snp(bcf1_t
*v
)
3157 bcf_unpack(v
, BCF_UN_STR
);
3158 for (i
= 0; i
< v
->n_allele
; ++i
)
3160 if ( v
->d
.allele
[i
][1]==0 ) continue;
3162 // mpileup's <X> allele, see also below. This is not completely satisfactory,
3163 // a general library is here narrowly tailored to fit samtools.
3164 if ( v
->d
.allele
[i
][0]=='<' && v
->d
.allele
[i
][1]=='X' && v
->d
.allele
[i
][2]=='>' ) continue;
3165 if ( v
->d
.allele
[i
][0]=='<' && v
->d
.allele
[i
][1]=='*' && v
->d
.allele
[i
][2]=='>' ) continue;
3169 return i
== v
->n_allele
;
3172 static void bcf_set_variant_type(const char *ref
, const char *alt
, variant_t
*var
)
3174 // The most frequent case
3175 if ( !ref
[1] && !alt
[1] )
3177 if ( *alt
== '.' || *ref
==*alt
) { var
->n
= 0; var
->type
= VCF_REF
; return; }
3178 if ( *alt
== 'X' ) { var
->n
= 0; var
->type
= VCF_REF
; return; } // mpileup's X allele shouldn't be treated as variant
3179 if ( *alt
== '*' ) { var
->n
= 0; var
->type
= VCF_REF
; return; }
3180 var
->n
= 1; var
->type
= VCF_SNP
; return;
3184 if ( alt
[1]=='X' && alt
[2]=='>' ) { var
->n
= 0; var
->type
= VCF_REF
; return; } // mpileup's X allele shouldn't be treated as variant
3185 if ( alt
[1]=='*' && alt
[2]=='>' ) { var
->n
= 0; var
->type
= VCF_REF
; return; }
3186 var
->type
= VCF_OTHER
;
3190 const char *r
= ref
, *a
= alt
;
3191 while (*r
&& *a
&& toupper_c(*r
)==toupper_c(*a
) ) { r
++; a
++; } // unfortunately, matching REF,ALT case is not guaranteed
3195 if ( *a
==']' || *a
=='[' ) { var
->type
= VCF_BND
; return; }
3197 var
->n
= (a
-alt
)-(r
-ref
); var
->type
= VCF_INDEL
; return;
3199 else if ( *r
&& !*a
)
3202 var
->n
= (a
-alt
)-(r
-ref
); var
->type
= VCF_INDEL
; return;
3204 else if ( !*r
&& !*a
)
3206 var
->n
= 0; var
->type
= VCF_REF
; return;
3209 const char *re
= r
, *ae
= a
;
3210 while ( re
[1] ) re
++;
3211 while ( ae
[1] ) ae
++;
3212 while ( re
>r
&& ae
>a
&& toupper_c(*re
)==toupper_c(*ae
) ) { re
--; ae
--; }
3215 if ( re
==r
) { var
->n
= 1; var
->type
= VCF_SNP
; return; }
3217 if ( toupper_c(*re
)==toupper_c(*ae
) ) { var
->type
= VCF_INDEL
; return; }
3218 var
->type
= VCF_OTHER
; return;
3223 if ( toupper_c(*re
)==toupper_c(*ae
) ) { var
->type
= VCF_INDEL
; return; }
3224 var
->type
= VCF_OTHER
; return;
3227 var
->type
= ( re
-r
== ae
-a
) ? VCF_MNP
: VCF_OTHER
;
3228 var
->n
= ( re
-r
> ae
-a
) ? -(re
-r
+1) : ae
-a
+1;
3230 // should do also complex events, SVs, etc...
3233 static void bcf_set_variant_types(bcf1_t
*b
)
3235 if ( !(b
->unpacked
& BCF_UN_STR
) ) bcf_unpack(b
, BCF_UN_STR
);
3236 bcf_dec_t
*d
= &b
->d
;
3237 if ( d
->n_var
< b
->n_allele
)
3239 d
->var
= (variant_t
*) realloc(d
->var
, sizeof(variant_t
)*b
->n_allele
);
3240 d
->n_var
= b
->n_allele
;
3244 for (i
=1; i
<b
->n_allele
; i
++)
3246 bcf_set_variant_type(d
->allele
[0],d
->allele
[i
], &d
->var
[i
]);
3247 b
->d
.var_type
|= d
->var
[i
].type
;
3248 //fprintf(stderr,"[set_variant_type] %d %s %s -> %d %d .. %d\n", b->pos+1,d->allele[0],d->allele[i],d->var[i].type,d->var[i].n, b->d.var_type);
3252 int bcf_get_variant_types(bcf1_t
*rec
)
3254 if ( rec
->d
.var_type
==-1 ) bcf_set_variant_types(rec
);
3255 return rec
->d
.var_type
;
3257 int bcf_get_variant_type(bcf1_t
*rec
, int ith_allele
)
3259 if ( rec
->d
.var_type
==-1 ) bcf_set_variant_types(rec
);
3260 return rec
->d
.var
[ith_allele
].type
;
3263 int bcf_update_info(const bcf_hdr_t
*hdr
, bcf1_t
*line
, const char *key
, const void *values
, int n
, int type
)
3265 // Is the field already present?
3266 int i
, inf_id
= bcf_hdr_id2int(hdr
,BCF_DT_ID
,key
);
3267 if ( !bcf_hdr_idinfo_exists(hdr
,BCF_HL_INFO
,inf_id
) ) return -1; // No such INFO field in the header
3268 if ( !(line
->unpacked
& BCF_UN_INFO
) ) bcf_unpack(line
, BCF_UN_INFO
);
3270 for (i
=0; i
<line
->n_info
; i
++)
3271 if ( inf_id
==line
->d
.info
[i
].key
) break;
3272 bcf_info_t
*inf
= i
==line
->n_info
? NULL
: &line
->d
.info
[i
];
3274 if ( !n
|| (type
==BCF_HT_STR
&& !values
) )
3276 if ( n
==0 && !strcmp("END",key
) )
3277 line
->rlen
= line
->n_allele
? strlen(line
->d
.allele
[0]) : 0;
3280 // Mark the tag for removal, free existing memory if necessary
3281 if ( inf
->vptr_free
)
3283 free(inf
->vptr
- inf
->vptr_off
);
3286 line
->d
.shared_dirty
|= BCF1_DIRTY_INF
;
3288 inf
->vptr_off
= inf
->vptr_len
= 0;
3293 // Encode the values and determine the size required to accommodate the values
3294 kstring_t str
= {0,0,0};
3295 bcf_enc_int1(&str
, inf_id
);
3296 if ( type
==BCF_HT_INT
)
3297 bcf_enc_vint(&str
, n
, (int32_t*)values
, -1);
3298 else if ( type
==BCF_HT_REAL
)
3299 bcf_enc_vfloat(&str
, n
, (float*)values
);
3300 else if ( type
==BCF_HT_FLAG
|| type
==BCF_HT_STR
)
3303 bcf_enc_size(&str
, 0, BCF_BT_NULL
);
3305 bcf_enc_vchar(&str
, strlen((char*)values
), (char*)values
);
3309 hts_log_error("The type %d not implemented yet", type
);
3313 // Is the INFO tag already present
3316 // Is it big enough to accommodate new block?
3317 if ( str
.l
<= inf
->vptr_len
+ inf
->vptr_off
)
3319 if ( str
.l
!= inf
->vptr_len
+ inf
->vptr_off
) line
->d
.shared_dirty
|= BCF1_DIRTY_INF
;
3320 uint8_t *ptr
= inf
->vptr
- inf
->vptr_off
;
3321 memcpy(ptr
, str
.s
, str
.l
);
3323 int vptr_free
= inf
->vptr_free
;
3324 bcf_unpack_info_core1(ptr
, inf
);
3325 inf
->vptr_free
= vptr_free
;
3329 assert( !inf
->vptr_free
); // fix the caller or improve here: this has been modified before
3330 bcf_unpack_info_core1((uint8_t*)str
.s
, inf
);
3332 line
->d
.shared_dirty
|= BCF1_DIRTY_INF
;
3337 // The tag is not present, create new one
3339 hts_expand0(bcf_info_t
, line
->n_info
, line
->d
.m_info
, line
->d
.info
);
3340 inf
= &line
->d
.info
[line
->n_info
-1];
3341 bcf_unpack_info_core1((uint8_t*)str
.s
, inf
);
3343 line
->d
.shared_dirty
|= BCF1_DIRTY_INF
;
3345 line
->unpacked
|= BCF_UN_INFO
;
3347 if ( n
==1 && !strcmp("END",key
) ) line
->rlen
= ((int32_t*)values
)[0] - line
->pos
;
3351 int bcf_update_format_string(const bcf_hdr_t
*hdr
, bcf1_t
*line
, const char *key
, const char **values
, int n
)
3354 return bcf_update_format(hdr
,line
,key
,NULL
,0,BCF_HT_STR
);
3359 int len
= strlen(values
[i
]);
3360 if ( len
> max_len
) max_len
= len
;
3362 char *out
= (char*) malloc(max_len
*n
);
3363 if ( !out
) return -2;
3366 char *dst
= out
+i
*max_len
;
3367 const char *src
= values
[i
];
3369 while ( src
[j
] ) { dst
[j
] = src
[j
]; j
++; }
3370 for (; j
<max_len
; j
++) dst
[j
] = 0;
3372 int ret
= bcf_update_format(hdr
,line
,key
,out
,max_len
*n
,BCF_HT_STR
);
3377 int bcf_update_format(const bcf_hdr_t
*hdr
, bcf1_t
*line
, const char *key
, const void *values
, int n
, int type
)
3379 // Is the field already present?
3380 int i
, fmt_id
= bcf_hdr_id2int(hdr
,BCF_DT_ID
,key
);
3381 if ( !bcf_hdr_idinfo_exists(hdr
,BCF_HL_FMT
,fmt_id
) )
3384 return -1; // the key not present in the header
3387 if ( !(line
->unpacked
& BCF_UN_FMT
) ) bcf_unpack(line
, BCF_UN_FMT
);
3389 for (i
=0; i
<line
->n_fmt
; i
++)
3390 if ( line
->d
.fmt
[i
].id
==fmt_id
) break;
3391 bcf_fmt_t
*fmt
= i
==line
->n_fmt
? NULL
: &line
->d
.fmt
[i
];
3397 // Mark the tag for removal, free existing memory if necessary
3400 free(fmt
->p
- fmt
->p_off
);
3403 line
->d
.indiv_dirty
= 1;
3409 line
->n_sample
= bcf_hdr_nsamples(hdr
);
3410 int nps
= n
/ line
->n_sample
; // number of values per sample
3411 assert( nps
&& nps
*line
->n_sample
==n
); // must be divisible by n_sample
3413 // Encode the values and determine the size required to accommodate the values
3414 kstring_t str
= {0,0,0};
3415 bcf_enc_int1(&str
, fmt_id
);
3416 if ( type
==BCF_HT_INT
)
3417 bcf_enc_vint(&str
, n
, (int32_t*)values
, nps
);
3418 else if ( type
==BCF_HT_REAL
)
3420 bcf_enc_size(&str
, nps
, BCF_BT_FLOAT
);
3421 serialize_float_array(&str
, nps
*line
->n_sample
, (float *) values
);
3423 else if ( type
==BCF_HT_STR
)
3425 bcf_enc_size(&str
, nps
, BCF_BT_CHAR
);
3426 kputsn((char*)values
, nps
*line
->n_sample
, &str
);
3430 hts_log_error("The type %d not implemented yet", type
);
3436 // Not present, new format field
3438 hts_expand0(bcf_fmt_t
, line
->n_fmt
, line
->d
.m_fmt
, line
->d
.fmt
);
3440 // Special case: VCF specification requires that GT is always first
3441 if ( line
->n_fmt
> 1 && key
[0]=='G' && key
[1]=='T' && !key
[2] )
3443 for (i
=line
->n_fmt
-1; i
>0; i
--)
3444 line
->d
.fmt
[i
] = line
->d
.fmt
[i
-1];
3445 fmt
= &line
->d
.fmt
[0];
3448 fmt
= &line
->d
.fmt
[line
->n_fmt
-1];
3449 bcf_unpack_fmt_core1((uint8_t*)str
.s
, line
->n_sample
, fmt
);
3450 line
->d
.indiv_dirty
= 1;
3455 // The tag is already present, check if it is big enough to accomodate the new block
3456 if ( str
.l
<= fmt
->p_len
+ fmt
->p_off
)
3458 // good, the block is big enough
3459 if ( str
.l
!= fmt
->p_len
+ fmt
->p_off
) line
->d
.indiv_dirty
= 1;
3460 uint8_t *ptr
= fmt
->p
- fmt
->p_off
;
3461 memcpy(ptr
, str
.s
, str
.l
);
3463 int p_free
= fmt
->p_free
;
3464 bcf_unpack_fmt_core1(ptr
, line
->n_sample
, fmt
);
3465 fmt
->p_free
= p_free
;
3469 assert( !fmt
->p_free
); // fix the caller or improve here: this has been modified before
3470 bcf_unpack_fmt_core1((uint8_t*)str
.s
, line
->n_sample
, fmt
);
3472 line
->d
.indiv_dirty
= 1;
3475 line
->unpacked
|= BCF_UN_FMT
;
3480 int bcf_update_filter(const bcf_hdr_t
*hdr
, bcf1_t
*line
, int *flt_ids
, int n
)
3482 if ( !(line
->unpacked
& BCF_UN_FLT
) ) bcf_unpack(line
, BCF_UN_FLT
);
3483 line
->d
.shared_dirty
|= BCF1_DIRTY_FLT
;
3486 hts_expand(int, line
->d
.n_flt
, line
->d
.m_flt
, line
->d
.flt
);
3489 line
->d
.flt
[i
] = flt_ids
[i
];
3493 int bcf_add_filter(const bcf_hdr_t
*hdr
, bcf1_t
*line
, int flt_id
)
3495 if ( !(line
->unpacked
& BCF_UN_FLT
) ) bcf_unpack(line
, BCF_UN_FLT
);
3497 for (i
=0; i
<line
->d
.n_flt
; i
++)
3498 if ( flt_id
==line
->d
.flt
[i
] ) break;
3499 if ( i
<line
->d
.n_flt
) return 0; // this filter is already set
3500 line
->d
.shared_dirty
|= BCF1_DIRTY_FLT
;
3501 if ( flt_id
==0 ) // set to PASS
3503 else if ( line
->d
.n_flt
==1 && line
->d
.flt
[0]==0 )
3507 hts_expand(int, line
->d
.n_flt
, line
->d
.m_flt
, line
->d
.flt
);
3508 line
->d
.flt
[line
->d
.n_flt
-1] = flt_id
;
3511 int bcf_remove_filter(const bcf_hdr_t
*hdr
, bcf1_t
*line
, int flt_id
, int pass
)
3513 if ( !(line
->unpacked
& BCF_UN_FLT
) ) bcf_unpack(line
, BCF_UN_FLT
);
3515 for (i
=0; i
<line
->d
.n_flt
; i
++)
3516 if ( flt_id
==line
->d
.flt
[i
] ) break;
3517 if ( i
==line
->d
.n_flt
) return 0; // the filter is not present
3518 line
->d
.shared_dirty
|= BCF1_DIRTY_FLT
;
3519 if ( i
!=line
->d
.n_flt
-1 ) memmove(line
->d
.flt
+i
,line
->d
.flt
+i
+1,(line
->d
.n_flt
-i
-1)*sizeof(*line
->d
.flt
));
3521 if ( !line
->d
.n_flt
&& pass
) bcf_add_filter(hdr
,line
,0);
3525 int bcf_has_filter(const bcf_hdr_t
*hdr
, bcf1_t
*line
, char *filter
)
3527 if ( filter
[0]=='.' && !filter
[1] ) filter
= "PASS";
3528 int id
= bcf_hdr_id2int(hdr
, BCF_DT_ID
, filter
);
3529 if ( !bcf_hdr_idinfo_exists(hdr
,BCF_HL_FLT
,id
) ) return -1; // not defined in the header
3531 if ( !(line
->unpacked
& BCF_UN_FLT
) ) bcf_unpack(line
, BCF_UN_FLT
);
3532 if ( id
==0 && !line
->d
.n_flt
) return 1; // PASS
3535 for (i
=0; i
<line
->d
.n_flt
; i
++)
3536 if ( line
->d
.flt
[i
]==id
) return 1;
3540 static inline int _bcf1_sync_alleles(const bcf_hdr_t
*hdr
, bcf1_t
*line
, int nals
)
3542 line
->d
.shared_dirty
|= BCF1_DIRTY_ALS
;
3544 line
->n_allele
= nals
;
3545 hts_expand(char*, line
->n_allele
, line
->d
.m_allele
, line
->d
.allele
);
3547 char *als
= line
->d
.als
;
3551 line
->d
.allele
[n
] = als
;
3552 while ( *als
) als
++;
3557 // Update REF length
3558 bcf_info_t
*end_info
= bcf_get_info(hdr
,line
,"END");
3559 line
->rlen
= end_info
? end_info
->v1
.i
: strlen(line
->d
.allele
[0]);
3563 int bcf_update_alleles(const bcf_hdr_t
*hdr
, bcf1_t
*line
, const char **alleles
, int nals
)
3565 kstring_t tmp
= {0,0,0};
3566 char *free_old
= NULL
;
3568 // If the supplied alleles are not pointers to line->d.als, the existing block can be reused.
3570 for (i
=0; i
<nals
; i
++)
3571 if ( alleles
[i
]>=line
->d
.als
&& alleles
[i
]<line
->d
.als
+line
->d
.m_als
) break;
3574 // all alleles point elsewhere, reuse the existing block
3575 tmp
.l
= 0; tmp
.s
= line
->d
.als
; tmp
.m
= line
->d
.m_als
;
3578 free_old
= line
->d
.als
;
3580 for (i
=0; i
<nals
; i
++)
3582 kputs(alleles
[i
], &tmp
);
3585 line
->d
.als
= tmp
.s
; line
->d
.m_als
= tmp
.m
;
3587 return _bcf1_sync_alleles(hdr
,line
,nals
);
3590 int bcf_update_alleles_str(const bcf_hdr_t
*hdr
, bcf1_t
*line
, const char *alleles_string
)
3593 tmp
.l
= 0; tmp
.s
= line
->d
.als
; tmp
.m
= line
->d
.m_als
;
3594 kputs(alleles_string
, &tmp
);
3595 line
->d
.als
= tmp
.s
; line
->d
.m_als
= tmp
.m
;
3598 char *t
= line
->d
.als
;
3601 if ( *t
==',' ) { *t
= 0; nals
++; }
3604 return _bcf1_sync_alleles(hdr
, line
, nals
);
3607 int bcf_update_id(const bcf_hdr_t
*hdr
, bcf1_t
*line
, const char *id
)
3610 tmp
.l
= 0; tmp
.s
= line
->d
.id
; tmp
.m
= line
->d
.m_id
;
3615 line
->d
.id
= tmp
.s
; line
->d
.m_id
= tmp
.m
;
3616 line
->d
.shared_dirty
|= BCF1_DIRTY_ID
;
3620 int bcf_add_id(const bcf_hdr_t
*hdr
, bcf1_t
*line
, const char *id
)
3622 if ( !id
) return 0;
3625 tmp
.l
= 0; tmp
.s
= line
->d
.id
; tmp
.m
= line
->d
.m_id
;
3627 int len
= strlen(id
);
3628 char *dst
= line
->d
.id
;
3629 while ( *dst
&& (dst
=strstr(dst
,id
)) )
3631 if ( dst
[len
]!=0 && dst
[len
]!=';' ) dst
++; // a prefix, not a match
3632 else if ( dst
==line
->d
.id
|| dst
[-1]==';' ) return 0; // already present
3633 dst
++; // a suffix, not a match
3635 if ( line
->d
.id
&& (line
->d
.id
[0]!='.' || line
->d
.id
[1]) )
3637 tmp
.l
= strlen(line
->d
.id
);
3642 line
->d
.id
= tmp
.s
; line
->d
.m_id
= tmp
.m
;
3643 line
->d
.shared_dirty
|= BCF1_DIRTY_ID
;
3648 bcf_fmt_t
*bcf_get_fmt(const bcf_hdr_t
*hdr
, bcf1_t
*line
, const char *key
)
3650 int id
= bcf_hdr_id2int(hdr
, BCF_DT_ID
, key
);
3651 if ( !bcf_hdr_idinfo_exists(hdr
,BCF_HL_FMT
,id
) ) return NULL
; // no such FMT field in the header
3652 return bcf_get_fmt_id(line
, id
);
3655 bcf_info_t
*bcf_get_info(const bcf_hdr_t
*hdr
, bcf1_t
*line
, const char *key
)
3657 int id
= bcf_hdr_id2int(hdr
, BCF_DT_ID
, key
);
3658 if ( !bcf_hdr_idinfo_exists(hdr
,BCF_HL_INFO
,id
) ) return NULL
; // no such INFO field in the header
3659 return bcf_get_info_id(line
, id
);
3662 bcf_fmt_t
*bcf_get_fmt_id(bcf1_t
*line
, const int id
)
3665 if ( !(line
->unpacked
& BCF_UN_FMT
) ) bcf_unpack(line
, BCF_UN_FMT
);
3666 for (i
=0; i
<line
->n_fmt
; i
++)
3668 if ( line
->d
.fmt
[i
].id
==id
) return &line
->d
.fmt
[i
];
3673 bcf_info_t
*bcf_get_info_id(bcf1_t
*line
, const int id
)
3676 if ( !(line
->unpacked
& BCF_UN_INFO
) ) bcf_unpack(line
, BCF_UN_INFO
);
3677 for (i
=0; i
<line
->n_info
; i
++)
3679 if ( line
->d
.info
[i
].key
==id
) return &line
->d
.info
[i
];
3685 int bcf_get_info_values(const bcf_hdr_t
*hdr
, bcf1_t
*line
, const char *tag
, void **dst
, int *ndst
, int type
)
3687 int i
,j
, tag_id
= bcf_hdr_id2int(hdr
, BCF_DT_ID
, tag
);
3688 if ( !bcf_hdr_idinfo_exists(hdr
,BCF_HL_INFO
,tag_id
) ) return -1; // no such INFO field in the header
3689 if ( bcf_hdr_id2type(hdr
,BCF_HL_INFO
,tag_id
)!=type
) return -2; // expected different type
3691 if ( !(line
->unpacked
& BCF_UN_INFO
) ) bcf_unpack(line
, BCF_UN_INFO
);
3693 for (i
=0; i
<line
->n_info
; i
++)
3694 if ( line
->d
.info
[i
].key
==tag_id
) break;
3695 if ( i
==line
->n_info
) return ( type
==BCF_HT_FLAG
) ? 0 : -3; // the tag is not present in this record
3696 if ( type
==BCF_HT_FLAG
) return 1;
3698 bcf_info_t
*info
= &line
->d
.info
[i
];
3699 if ( !info
->vptr
) return -3; // the tag was marked for removal
3700 if ( type
==BCF_HT_STR
)
3702 if ( *ndst
< info
->len
+1 )
3704 *ndst
= info
->len
+ 1;
3705 *dst
= realloc(*dst
, *ndst
);
3707 memcpy(*dst
,info
->vptr
,info
->len
);
3708 ((uint8_t*)*dst
)[info
->len
] = 0;
3712 // Make sure the buffer is big enough
3713 int size1
= type
==BCF_HT_INT
? sizeof(int32_t) : sizeof(float);
3714 if ( *ndst
< info
->len
)
3717 *dst
= realloc(*dst
, *ndst
* size1
);
3720 #define BRANCH(type_t, convert, is_missing, is_vector_end, set_missing, set_regular, out_type_t) { \
3721 out_type_t *tmp = (out_type_t *) *dst; \
3722 for (j=0; j<info->len; j++) \
3724 type_t p = convert(info->vptr + j * sizeof(type_t)); \
3725 if ( is_vector_end ) return j; \
3726 if ( is_missing ) set_missing; \
3732 switch (info
->type
) {
3733 case BCF_BT_INT8
: BRANCH(int8_t, le_to_i8
, p
==bcf_int8_missing
, p
==bcf_int8_vector_end
, *tmp
=bcf_int32_missing
, *tmp
=p
, int32_t); break;
3734 case BCF_BT_INT16
: BRANCH(int16_t, le_to_i16
, p
==bcf_int16_missing
, p
==bcf_int16_vector_end
, *tmp
=bcf_int32_missing
, *tmp
=p
, int32_t); break;
3735 case BCF_BT_INT32
: BRANCH(int32_t, le_to_i32
, p
==bcf_int32_missing
, p
==bcf_int32_vector_end
, *tmp
=bcf_int32_missing
, *tmp
=p
, int32_t); break;
3736 case BCF_BT_FLOAT
: BRANCH(uint32_t, le_to_u32
, p
==bcf_float_missing
, p
==bcf_float_vector_end
, bcf_float_set_missing(*tmp
), bcf_float_set(tmp
, p
), float); break;
3737 default: hts_log_error("Unexpected type %d", info
->type
); exit(1);
3740 return -4; // this can never happen
3743 int bcf_get_format_string(const bcf_hdr_t
*hdr
, bcf1_t
*line
, const char *tag
, char ***dst
, int *ndst
)
3745 int i
,tag_id
= bcf_hdr_id2int(hdr
, BCF_DT_ID
, tag
);
3746 if ( !bcf_hdr_idinfo_exists(hdr
,BCF_HL_FMT
,tag_id
) ) return -1; // no such FORMAT field in the header
3747 if ( bcf_hdr_id2type(hdr
,BCF_HL_FMT
,tag_id
)!=BCF_HT_STR
) return -2; // expected different type
3749 if ( !(line
->unpacked
& BCF_UN_FMT
) ) bcf_unpack(line
, BCF_UN_FMT
);
3751 for (i
=0; i
<line
->n_fmt
; i
++)
3752 if ( line
->d
.fmt
[i
].id
==tag_id
) break;
3753 if ( i
==line
->n_fmt
) return -3; // the tag is not present in this record
3754 bcf_fmt_t
*fmt
= &line
->d
.fmt
[i
];
3755 if ( !fmt
->p
) return -3; // the tag was marked for removal
3757 int nsmpl
= bcf_hdr_nsamples(hdr
);
3760 *dst
= (char**) malloc(sizeof(char*)*nsmpl
);
3761 if ( !*dst
) return -4; // could not alloc
3764 int n
= (fmt
->n
+1)*nsmpl
;
3767 (*dst
)[0] = realloc((*dst
)[0], n
);
3768 if ( !(*dst
)[0] ) return -4; // could not alloc
3771 for (i
=0; i
<nsmpl
; i
++)
3773 uint8_t *src
= fmt
->p
+ i
*fmt
->n
;
3774 uint8_t *tmp
= (uint8_t*)(*dst
)[0] + i
*(fmt
->n
+1);
3775 memcpy(tmp
,src
,fmt
->n
);
3777 (*dst
)[i
] = (char*) tmp
;
3782 int bcf_get_format_values(const bcf_hdr_t
*hdr
, bcf1_t
*line
, const char *tag
, void **dst
, int *ndst
, int type
)
3784 int i
,j
, tag_id
= bcf_hdr_id2int(hdr
, BCF_DT_ID
, tag
);
3785 if ( !bcf_hdr_idinfo_exists(hdr
,BCF_HL_FMT
,tag_id
) ) return -1; // no such FORMAT field in the header
3786 if ( tag
[0]=='G' && tag
[1]=='T' && tag
[2]==0 )
3788 // Ugly: GT field is considered to be a string by the VCF header but BCF represents it as INT.
3789 if ( bcf_hdr_id2type(hdr
,BCF_HL_FMT
,tag_id
)!=BCF_HT_STR
) return -2;
3791 else if ( bcf_hdr_id2type(hdr
,BCF_HL_FMT
,tag_id
)!=type
) return -2; // expected different type
3793 if ( !(line
->unpacked
& BCF_UN_FMT
) ) bcf_unpack(line
, BCF_UN_FMT
);
3795 for (i
=0; i
<line
->n_fmt
; i
++)
3796 if ( line
->d
.fmt
[i
].id
==tag_id
) break;
3797 if ( i
==line
->n_fmt
) return -3; // the tag is not present in this record
3798 bcf_fmt_t
*fmt
= &line
->d
.fmt
[i
];
3799 if ( !fmt
->p
) return -3; // the tag was marked for removal
3801 if ( type
==BCF_HT_STR
)
3803 int n
= fmt
->n
*bcf_hdr_nsamples(hdr
);
3806 *dst
= realloc(*dst
, n
);
3807 if ( !*dst
) return -4; // could not alloc
3810 memcpy(*dst
,fmt
->p
,n
);
3814 // Make sure the buffer is big enough
3815 int nsmpl
= bcf_hdr_nsamples(hdr
);
3816 int size1
= type
==BCF_HT_INT
? sizeof(int32_t) : sizeof(float);
3817 if ( *ndst
< fmt
->n
*nsmpl
)
3819 *ndst
= fmt
->n
*nsmpl
;
3820 *dst
= realloc(*dst
, *ndst
*size1
);
3821 if ( !dst
) return -4; // could not alloc
3824 #define BRANCH(type_t, convert, is_missing, is_vector_end, set_missing, set_vector_end, set_regular, out_type_t) { \
3825 out_type_t *tmp = (out_type_t *) *dst; \
3826 uint8_t *fmt_p = fmt->p; \
3827 for (i=0; i<nsmpl; i++) \
3829 for (j=0; j<fmt->n; j++) \
3831 type_t p = convert(fmt_p + j * sizeof(type_t)); \
3832 if ( is_missing ) set_missing; \
3833 else if ( is_vector_end ) { set_vector_end; break; } \
3837 for (; j<fmt->n; j++) { set_vector_end; tmp++; } \
3838 fmt_p += fmt->size; \
3841 switch (fmt
->type
) {
3842 case BCF_BT_INT8
: BRANCH(int8_t, le_to_i8
, p
==bcf_int8_missing
, p
==bcf_int8_vector_end
, *tmp
=bcf_int32_missing
, *tmp
=bcf_int32_vector_end
, *tmp
=p
, int32_t); break;
3843 case BCF_BT_INT16
: BRANCH(int16_t, le_to_i16
, p
==bcf_int16_missing
, p
==bcf_int16_vector_end
, *tmp
=bcf_int32_missing
, *tmp
=bcf_int32_vector_end
, *tmp
=p
, int32_t); break;
3844 case BCF_BT_INT32
: BRANCH(int32_t, le_to_i32
, p
==bcf_int32_missing
, p
==bcf_int32_vector_end
, *tmp
=bcf_int32_missing
, *tmp
=bcf_int32_vector_end
, *tmp
=p
, int32_t); break;
3845 case BCF_BT_FLOAT
: BRANCH(uint32_t, le_to_u32
, p
==bcf_float_missing
, p
==bcf_float_vector_end
, bcf_float_set_missing(*tmp
), bcf_float_set_vector_end(*tmp
), bcf_float_set(tmp
, p
), float); break;
3846 default: hts_log_error("Unexpected type %d", fmt
->type
); exit(1);
3849 return nsmpl
*fmt
->n
;