modified: src1/input.c
[GalaxyCodeBases.git] / c_cpp / lib / htslib / vcf.c
blob6a7f16a86a02a7d9aef3ee8c1ace39e864a9985c
1 /* vcf.c -- VCF/BCF API functions.
3 Copyright (C) 2012, 2013 Broad Institute.
4 Copyright (C) 2012-2017 Genome Research Ltd.
5 Portions copyright (C) 2014 Intel Corporation.
7 Author: Heng Li <lh3@sanger.ac.uk>
9 Permission is hereby granted, free of charge, to any person obtaining a copy
10 of this software and associated documentation files (the "Software"), to deal
11 in the Software without restriction, including without limitation the rights
12 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
13 copies of the Software, and to permit persons to whom the Software is
14 furnished to do so, subject to the following conditions:
16 The above copyright notice and this permission notice shall be included in
17 all copies or substantial portions of the Software.
19 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
22 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
25 DEALINGS IN THE SOFTWARE. */
27 #include <config.h>
29 #include <stdio.h>
30 #include <assert.h>
31 #include <string.h>
32 #include <strings.h>
33 #include <stdlib.h>
34 #include <limits.h>
35 #include <stdint.h>
36 #include <errno.h>
38 #include "htslib/vcf.h"
39 #include "htslib/bgzf.h"
40 #include "htslib/tbx.h"
41 #include "htslib/hfile.h"
42 #include "hts_internal.h"
43 #include "htslib/hts_endian.h"
44 #include "htslib/khash_str2int.h"
45 #include "htslib/kstring.h"
47 #include "htslib/khash.h"
48 KHASH_MAP_INIT_STR(vdict, bcf_idinfo_t)
49 typedef khash_t(vdict) vdict_t;
51 #include "htslib/kseq.h"
53 uint32_t bcf_float_missing = 0x7F800001;
54 uint32_t bcf_float_vector_end = 0x7F800002;
55 uint8_t bcf_type_shift[] = { 0, 0, 1, 2, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
56 static bcf_idinfo_t bcf_idinfo_def = { .info = { 15, 15, 15 }, .hrec = { NULL, NULL, NULL}, .id = -1 };
58 static const char *dump_char(char *buffer, char c)
60 switch (c) {
61 case '\n': strcpy(buffer, "\\n"); break;
62 case '\r': strcpy(buffer, "\\r"); break;
63 case '\t': strcpy(buffer, "\\t"); break;
64 case '\'':
65 case '\"':
66 case '\\':
67 sprintf(buffer, "\\%c", c);
68 break;
69 default:
70 if (isprint_c(c)) sprintf(buffer, "%c", c);
71 else sprintf(buffer, "\\x%02X", (unsigned char) c);
72 break;
74 return buffer;
77 static char *find_chrom_header_line(char *s)
79 char *nl;
80 if (strncmp(s, "#CHROM\t", 7) == 0) return s;
81 else if ((nl = strstr(s, "\n#CHROM\t")) != NULL) return nl+1;
82 else return NULL;
85 /*************************
86 *** VCF header parser ***
87 *************************/
89 int bcf_hdr_add_sample(bcf_hdr_t *h, const char *s)
91 if ( !s ) return 0;
93 const char *ss = s;
94 while ( !*ss && isspace_c(*ss) ) ss++;
95 if ( !*ss )
97 hts_log_error("Empty sample name: trailing spaces/tabs in the header line?");
98 abort();
101 vdict_t *d = (vdict_t*)h->dict[BCF_DT_SAMPLE];
102 int ret;
103 char *sdup = strdup(s);
104 int k = kh_put(vdict, d, sdup, &ret);
105 if (ret) { // absent
106 kh_val(d, k) = bcf_idinfo_def;
107 kh_val(d, k).id = kh_size(d) - 1;
108 } else {
109 hts_log_error("Duplicated sample name '%s'", s);
110 free(sdup);
111 return -1;
113 int n = kh_size(d);
114 h->samples = (char**) realloc(h->samples,sizeof(char*)*n);
115 h->samples[n-1] = sdup;
116 h->dirty = 1;
117 return 0;
120 int bcf_hdr_parse_sample_line(bcf_hdr_t *h, const char *str)
122 int ret = 0;
123 int i = 0;
124 const char *p, *q;
125 // add samples
126 for (p = q = str;; ++q) {
127 if (*q != '\t' && *q != 0 && *q != '\n') continue;
128 if (++i > 9) {
129 char *s = (char*)malloc(q - p + 1);
130 strncpy(s, p, q - p);
131 s[q - p] = 0;
132 if ( bcf_hdr_add_sample(h,s) < 0 ) ret = -1;
133 free(s);
135 if (*q == 0 || *q == '\n') break;
136 p = q + 1;
138 bcf_hdr_add_sample(h,NULL);
139 return ret;
142 int bcf_hdr_sync(bcf_hdr_t *h)
144 int i;
145 for (i = 0; i < 3; i++)
147 vdict_t *d = (vdict_t*)h->dict[i];
148 khint_t k;
149 if ( h->n[i] < kh_size(d) )
151 // this should be true only for i=2, BCF_DT_SAMPLE
152 h->n[i] = kh_size(d);
153 h->id[i] = (bcf_idpair_t*) realloc(h->id[i], kh_size(d)*sizeof(bcf_idpair_t));
155 for (k=kh_begin(d); k<kh_end(d); k++)
157 if (!kh_exist(d,k)) continue;
158 h->id[i][kh_val(d,k).id].key = kh_key(d,k);
159 h->id[i][kh_val(d,k).id].val = &kh_val(d,k);
162 h->dirty = 0;
163 return 0;
166 void bcf_hrec_destroy(bcf_hrec_t *hrec)
168 free(hrec->key);
169 if ( hrec->value ) free(hrec->value);
170 int i;
171 for (i=0; i<hrec->nkeys; i++)
173 free(hrec->keys[i]);
174 free(hrec->vals[i]);
176 free(hrec->keys);
177 free(hrec->vals);
178 free(hrec);
181 // Copies all fields except IDX.
182 bcf_hrec_t *bcf_hrec_dup(bcf_hrec_t *hrec)
184 bcf_hrec_t *out = (bcf_hrec_t*) calloc(1,sizeof(bcf_hrec_t));
185 out->type = hrec->type;
186 if ( hrec->key ) out->key = strdup(hrec->key);
187 if ( hrec->value ) out->value = strdup(hrec->value);
188 out->nkeys = hrec->nkeys;
189 out->keys = (char**) malloc(sizeof(char*)*hrec->nkeys);
190 out->vals = (char**) malloc(sizeof(char*)*hrec->nkeys);
191 int i, j = 0;
192 for (i=0; i<hrec->nkeys; i++)
194 if ( hrec->keys[i] && !strcmp("IDX",hrec->keys[i]) ) continue;
195 if ( hrec->keys[i] ) out->keys[j] = strdup(hrec->keys[i]);
196 if ( hrec->vals[i] ) out->vals[j] = strdup(hrec->vals[i]);
197 j++;
199 if ( i!=j ) out->nkeys -= i-j; // IDX was omitted
200 return out;
203 void bcf_hrec_debug(FILE *fp, bcf_hrec_t *hrec)
205 fprintf(fp, "key=[%s] value=[%s]", hrec->key, hrec->value?hrec->value:"");
206 int i;
207 for (i=0; i<hrec->nkeys; i++)
208 fprintf(fp, "\t[%s]=[%s]", hrec->keys[i],hrec->vals[i]);
209 fprintf(fp, "\n");
212 void bcf_header_debug(bcf_hdr_t *hdr)
214 int i, j;
215 for (i=0; i<hdr->nhrec; i++)
217 if ( !hdr->hrec[i]->value )
219 fprintf(stderr, "##%s=<", hdr->hrec[i]->key);
220 fprintf(stderr,"%s=%s", hdr->hrec[i]->keys[0], hdr->hrec[i]->vals[0]);
221 for (j=1; j<hdr->hrec[i]->nkeys; j++)
222 fprintf(stderr,",%s=%s", hdr->hrec[i]->keys[j], hdr->hrec[i]->vals[j]);
223 fprintf(stderr,">\n");
225 else
226 fprintf(stderr,"##%s=%s\n", hdr->hrec[i]->key,hdr->hrec[i]->value);
230 void bcf_hrec_add_key(bcf_hrec_t *hrec, const char *str, int len)
232 int n = ++hrec->nkeys;
233 hrec->keys = (char**) realloc(hrec->keys, sizeof(char*)*n);
234 hrec->vals = (char**) realloc(hrec->vals, sizeof(char*)*n);
235 assert( len );
236 hrec->keys[n-1] = (char*) malloc((len+1)*sizeof(char));
237 memcpy(hrec->keys[n-1],str,len);
238 hrec->keys[n-1][len] = 0;
239 hrec->vals[n-1] = NULL;
242 void bcf_hrec_set_val(bcf_hrec_t *hrec, int i, const char *str, int len, int is_quoted)
244 if ( !str ) { hrec->vals[i] = NULL; return; }
245 if ( hrec->vals[i] ) free(hrec->vals[i]);
246 if ( is_quoted )
248 hrec->vals[i] = (char*) malloc((len+3)*sizeof(char));
249 hrec->vals[i][0] = '"';
250 memcpy(&hrec->vals[i][1],str,len);
251 hrec->vals[i][len+1] = '"';
252 hrec->vals[i][len+2] = 0;
254 else
256 hrec->vals[i] = (char*) malloc((len+1)*sizeof(char));
257 memcpy(hrec->vals[i],str,len);
258 hrec->vals[i][len] = 0;
262 void hrec_add_idx(bcf_hrec_t *hrec, int idx)
264 int n = ++hrec->nkeys;
265 hrec->keys = (char**) realloc(hrec->keys, sizeof(char*)*n);
266 hrec->vals = (char**) realloc(hrec->vals, sizeof(char*)*n);
267 hrec->keys[n-1] = strdup("IDX");
268 kstring_t str = {0,0,0};
269 kputw(idx, &str);
270 hrec->vals[n-1] = str.s;
273 int bcf_hrec_find_key(bcf_hrec_t *hrec, const char *key)
275 int i;
276 for (i=0; i<hrec->nkeys; i++)
277 if ( !strcasecmp(key,hrec->keys[i]) ) return i;
278 return -1;
281 static inline int is_escaped(const char *min, const char *str)
283 int n = 0;
284 while ( --str>=min && *str=='\\' ) n++;
285 return n%2;
288 bcf_hrec_t *bcf_hdr_parse_line(const bcf_hdr_t *h, const char *line, int *len)
290 const char *p = line;
291 if (p[0] != '#' || p[1] != '#') { *len = 0; return NULL; }
292 p += 2;
294 const char *q = p;
295 while ( *q && *q!='=' && *q != '\n' ) q++;
296 int n = q-p;
297 if ( *q!='=' || !n ) { *len = q-line+1; return NULL; } // wrong format
299 bcf_hrec_t *hrec = (bcf_hrec_t*) calloc(1,sizeof(bcf_hrec_t));
300 hrec->key = (char*) malloc(sizeof(char)*(n+1));
301 memcpy(hrec->key,p,n);
302 hrec->key[n] = 0;
304 p = ++q;
305 if ( *p!='<' ) // generic field, e.g. ##samtoolsVersion=0.1.18-r579
307 while ( *q && *q!='\n' ) q++;
308 hrec->value = (char*) malloc((q-p+1)*sizeof(char));
309 memcpy(hrec->value, p, q-p);
310 hrec->value[q-p] = 0;
311 *len = q - line + (*q ? 1 : 0); // Skip \n but not \0
312 return hrec;
315 // structured line, e.g.
316 // ##INFO=<ID=PV1,Number=1,Type=Float,Description="P-value for baseQ bias">
317 // ##PEDIGREE=<Name_0=G0-ID,Name_1=G1-ID,Name_3=GN-ID>
318 int nopen = 1;
319 while ( *q && *q!='\n' && nopen>0 )
321 p = ++q;
322 while ( *q && *q==' ' ) { p++; q++; }
323 // ^[A-Za-z_][0-9A-Za-z_.]*$
324 if (p==q && *q && (isalpha_c(*q) || *q=='_'))
326 q++;
327 while ( *q && (isalnum_c(*q) || *q=='_' || *q=='.') ) q++;
329 n = q-p;
330 int m = 0;
331 while ( *q && *q==' ' ) { q++; m++; }
332 if ( *q!='=' || !n )
334 // wrong format
335 while ( *q && *q!='\n' ) q++;
336 hts_log_error("Could not parse the header line: \"%.*s\"",
337 (int) (q - line), line);
338 *len = q - line + (*q ? 1 : 0);
339 bcf_hrec_destroy(hrec);
340 return NULL;
342 bcf_hrec_add_key(hrec, p, q-p-m);
343 p = ++q;
344 while ( *q && *q==' ' ) { p++; q++; }
345 int quoted = *p=='"' ? 1 : 0;
346 if ( quoted ) p++, q++;
347 while ( *q && *q != '\n' )
349 if ( quoted ) { if ( *q=='"' && !is_escaped(p,q) ) break; }
350 else
352 if ( *q=='<' ) nopen++;
353 if ( *q=='>' ) nopen--;
354 if ( !nopen ) break;
355 if ( *q==',' && nopen==1 ) break;
357 q++;
359 const char *r = q;
360 while ( r > p && r[-1] == ' ' ) r--;
361 bcf_hrec_set_val(hrec, hrec->nkeys-1, p, r-p, quoted);
362 if ( quoted && *q=='"' ) q++;
363 if ( *q=='>' ) { nopen--; q++; }
366 // Skip to end of line
367 int nonspace = 0;
368 p = q;
369 while ( *q && *q!='\n' ) { nonspace |= !isspace(*q); q++; }
370 if (nonspace) {
371 hts_log_warning("Dropped trailing junk from header line '%.*s'", (int) (q - line), line);
374 *len = q - line + (*q ? 1 : 0);
375 return hrec;
378 static int bcf_hdr_set_idx(bcf_hdr_t *hdr, int dict_type, const char *tag, bcf_idinfo_t *idinfo)
380 // If available, preserve existing IDX
381 if ( idinfo->id==-1 )
382 idinfo->id = hdr->n[dict_type]++;
383 else if ( idinfo->id < hdr->n[dict_type] && hdr->id[dict_type][idinfo->id].key )
385 hts_log_error("Conflicting IDX=%d lines in the header dictionary, the new tag is %s",
386 idinfo->id, tag);
387 exit(1);
390 if ( idinfo->id >= hdr->n[dict_type] ) hdr->n[dict_type] = idinfo->id+1;
391 hts_expand0(bcf_idpair_t,hdr->n[dict_type],hdr->m[dict_type],hdr->id[dict_type]);
393 // NB: the next kh_put call can invalidate the idinfo pointer, therefore
394 // we leave it unassigned here. It myst be set explicitly in bcf_hdr_sync.
395 hdr->id[dict_type][idinfo->id].key = tag;
397 return 0;
400 // returns: 1 when hdr needs to be synced, 0 otherwise
401 int bcf_hdr_register_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec)
403 // contig
404 int i,j, ret;
405 khint_t k;
406 char *str;
407 if ( !strcmp(hrec->key, "contig") )
409 hrec->type = BCF_HL_CTG;
411 // Get the contig ID ($str) and length ($j)
412 i = bcf_hrec_find_key(hrec,"length");
413 if ( i<0 ) j = 0;
414 else if ( sscanf(hrec->vals[i],"%d",&j)!=1 ) return 0;
416 i = bcf_hrec_find_key(hrec,"ID");
417 if ( i<0 ) return 0;
418 str = strdup(hrec->vals[i]);
420 // Register in the dictionary
421 vdict_t *d = (vdict_t*)hdr->dict[BCF_DT_CTG];
422 khint_t k = kh_get(vdict, d, str);
423 if ( k != kh_end(d) ) { free(str); return 0; } // already present
424 k = kh_put(vdict, d, str, &ret);
426 int idx = bcf_hrec_find_key(hrec,"IDX");
427 if ( idx!=-1 )
429 char *tmp = hrec->vals[idx];
430 idx = strtol(hrec->vals[idx], &tmp, 10);
431 if ( *tmp || idx < 0 || idx >= INT_MAX - 1)
433 hts_log_warning("Error parsing the IDX tag, skipping");
434 return 0;
438 kh_val(d, k) = bcf_idinfo_def;
439 kh_val(d, k).id = idx;
440 kh_val(d, k).info[0] = j;
441 kh_val(d, k).hrec[0] = hrec;
442 bcf_hdr_set_idx(hdr, BCF_DT_CTG, kh_key(d,k), &kh_val(d,k));
443 if ( idx==-1 ) hrec_add_idx(hrec, kh_val(d,k).id);
445 return 1;
448 if ( !strcmp(hrec->key, "INFO") ) hrec->type = BCF_HL_INFO;
449 else if ( !strcmp(hrec->key, "FILTER") ) hrec->type = BCF_HL_FLT;
450 else if ( !strcmp(hrec->key, "FORMAT") ) hrec->type = BCF_HL_FMT;
451 else if ( hrec->nkeys>0 ) { hrec->type = BCF_HL_STR; return 1; }
452 else return 0;
454 // INFO/FILTER/FORMAT
455 char *id = NULL;
456 uint32_t type = -1, var = -1;
457 int num = -1, idx = -1;
458 for (i=0; i<hrec->nkeys; i++)
460 if ( !strcmp(hrec->keys[i], "ID") ) id = hrec->vals[i];
461 else if ( !strcmp(hrec->keys[i], "IDX") )
463 char *tmp = hrec->vals[i];
464 idx = strtol(hrec->vals[i], &tmp, 10);
465 if ( *tmp || idx < 0 || idx >= INT_MAX - 1)
467 hts_log_warning("Error parsing the IDX tag, skipping");
468 return 0;
471 else if ( !strcmp(hrec->keys[i], "Type") )
473 if ( !strcmp(hrec->vals[i], "Integer") ) type = BCF_HT_INT;
474 else if ( !strcmp(hrec->vals[i], "Float") ) type = BCF_HT_REAL;
475 else if ( !strcmp(hrec->vals[i], "String") ) type = BCF_HT_STR;
476 else if ( !strcmp(hrec->vals[i], "Character") ) type = BCF_HT_STR;
477 else if ( !strcmp(hrec->vals[i], "Flag") ) type = BCF_HT_FLAG;
478 else
480 hts_log_warning("The type \"%s\" is not supported, assuming \"String\"", hrec->vals[i]);
481 type = BCF_HT_STR;
484 else if ( !strcmp(hrec->keys[i], "Number") )
486 if ( !strcmp(hrec->vals[i],"A") ) var = BCF_VL_A;
487 else if ( !strcmp(hrec->vals[i],"R") ) var = BCF_VL_R;
488 else if ( !strcmp(hrec->vals[i],"G") ) var = BCF_VL_G;
489 else if ( !strcmp(hrec->vals[i],".") ) var = BCF_VL_VAR;
490 else
492 sscanf(hrec->vals[i],"%d",&num);
493 var = BCF_VL_FIXED;
495 if (var != BCF_VL_FIXED) num = 0xfffff;
498 if (hrec->type == BCF_HL_INFO || hrec->type == BCF_HL_FMT) {
499 if (type == -1) {
500 hts_log_warning("%s %s field has no Type defined. Assuming String",
501 *hrec->key == 'I' ? "An" : "A", hrec->key);
502 type = BCF_HT_STR;
504 if (var == -1) {
505 hts_log_warning("%s %s field has no Number defined. Assuming '.'",
506 *hrec->key == 'I' ? "An" : "A", hrec->key);
507 var = BCF_VL_VAR;
510 uint32_t info = ((((uint32_t)num) & 0xfffff)<<12 |
511 (var & 0xf) << 8 |
512 (type & 0xf) << 4 |
513 (((uint32_t) hrec->type) & 0xf));
515 if ( !id ) return 0;
516 str = strdup(id);
518 vdict_t *d = (vdict_t*)hdr->dict[BCF_DT_ID];
519 k = kh_get(vdict, d, str);
520 if ( k != kh_end(d) )
522 // already present
523 free(str);
524 if ( kh_val(d, k).hrec[info&0xf] ) return 0;
525 kh_val(d, k).info[info&0xf] = info;
526 kh_val(d, k).hrec[info&0xf] = hrec;
527 if ( idx==-1 ) hrec_add_idx(hrec, kh_val(d, k).id);
528 return 1;
530 k = kh_put(vdict, d, str, &ret);
531 kh_val(d, k) = bcf_idinfo_def;
532 kh_val(d, k).info[info&0xf] = info;
533 kh_val(d, k).hrec[info&0xf] = hrec;
534 kh_val(d, k).id = idx;
535 bcf_hdr_set_idx(hdr, BCF_DT_ID, kh_key(d,k), &kh_val(d,k));
536 if ( idx==-1 ) hrec_add_idx(hrec, kh_val(d,k).id);
538 return 1;
541 int bcf_hdr_add_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec)
543 if ( !hrec ) return 0;
545 hrec->type = BCF_HL_GEN;
546 if ( !bcf_hdr_register_hrec(hdr,hrec) )
548 // If one of the hashed field, then it is already present
549 if ( hrec->type != BCF_HL_GEN )
551 bcf_hrec_destroy(hrec);
552 return 0;
555 // Is one of the generic fields and already present?
556 int i;
557 for (i=0; i<hdr->nhrec; i++)
559 if ( hdr->hrec[i]->type!=BCF_HL_GEN ) continue;
560 if ( !strcmp(hdr->hrec[i]->key,hrec->key) && !strcmp(hrec->key,"fileformat") ) break;
561 if ( !strcmp(hdr->hrec[i]->key,hrec->key) && !strcmp(hdr->hrec[i]->value,hrec->value) ) break;
563 if ( i<hdr->nhrec )
565 bcf_hrec_destroy(hrec);
566 return 0;
570 // New record, needs to be added
571 int n = ++hdr->nhrec;
572 hdr->hrec = (bcf_hrec_t**) realloc(hdr->hrec, n*sizeof(bcf_hrec_t*));
573 hdr->hrec[n-1] = hrec;
574 hdr->dirty = 1;
576 return hrec->type==BCF_HL_GEN ? 0 : 1;
580 * Note that while querying of FLT,INFO,FMT,CTG lines is fast (the keys are hashed),
581 * the STR,GEN lines are searched for linearly in a linked list of all header lines.
582 * This may become a problem for VCFs with huge headers, we might need to build a
583 * dictionary for these lines as well.
585 bcf_hrec_t *bcf_hdr_get_hrec(const bcf_hdr_t *hdr, int type, const char *key, const char *value, const char *str_class)
587 int i;
588 if ( type==BCF_HL_GEN )
590 for (i=0; i<hdr->nhrec; i++)
592 if ( hdr->hrec[i]->type!=type ) continue;
593 if ( strcmp(hdr->hrec[i]->key,key) ) continue;
594 if ( !value || !strcmp(hdr->hrec[i]->value,value) ) return hdr->hrec[i];
596 return NULL;
598 else if ( type==BCF_HL_STR )
600 for (i=0; i<hdr->nhrec; i++)
602 if ( hdr->hrec[i]->type!=type ) continue;
603 if ( strcmp(hdr->hrec[i]->key,str_class) ) continue;
604 int j = bcf_hrec_find_key(hdr->hrec[i],key);
605 if ( j>=0 && !strcmp(hdr->hrec[i]->vals[j],value) ) return hdr->hrec[i];
607 return NULL;
609 vdict_t *d = type==BCF_HL_CTG ? (vdict_t*)hdr->dict[BCF_DT_CTG] : (vdict_t*)hdr->dict[BCF_DT_ID];
610 khint_t k = kh_get(vdict, d, value);
611 if ( k == kh_end(d) ) return NULL;
612 return kh_val(d, k).hrec[type==BCF_HL_CTG?0:type];
615 void bcf_hdr_check_sanity(bcf_hdr_t *hdr)
617 static int PL_warned = 0, GL_warned = 0;
619 if ( !PL_warned )
621 int id = bcf_hdr_id2int(hdr, BCF_DT_ID, "PL");
622 if ( bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,id) && bcf_hdr_id2length(hdr,BCF_HL_FMT,id)!=BCF_VL_G )
624 hts_log_warning("PL should be declared as Number=G");
625 PL_warned = 1;
628 if ( !GL_warned )
630 int id = bcf_hdr_id2int(hdr, BCF_DT_ID, "GL");
631 if ( bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,id) && bcf_hdr_id2length(hdr,BCF_HL_FMT,id)!=BCF_VL_G )
633 hts_log_warning("GL should be declared as Number=G");
634 GL_warned = 1;
639 int bcf_hdr_parse(bcf_hdr_t *hdr, char *htxt)
641 int len, needs_sync = 0, done = 0;
642 char *p = htxt;
644 // Check sanity: "fileformat" string must come as first
645 bcf_hrec_t *hrec = bcf_hdr_parse_line(hdr,p,&len);
646 if ( !hrec || !hrec->key || strcasecmp(hrec->key,"fileformat") )
647 hts_log_warning("The first line should be ##fileformat; is the VCF/BCF header broken?");
648 needs_sync += bcf_hdr_add_hrec(hdr, hrec);
650 // The filter PASS must appear first in the dictionary
651 hrec = bcf_hdr_parse_line(hdr,"##FILTER=<ID=PASS,Description=\"All filters passed\">",&len);
652 needs_sync += bcf_hdr_add_hrec(hdr, hrec);
654 // Parse the whole header
655 do {
656 while (NULL != (hrec = bcf_hdr_parse_line(hdr, p, &len))) {
657 needs_sync += bcf_hdr_add_hrec(hdr, hrec);
658 p += len;
661 // Next should be the sample line. If not, it was a malformed
662 // header, in which case print a warning and skip (many VCF
663 // operations do not really care about a few malformed lines).
664 // In the future we may want to add a strict mode that errors in
665 // this case.
666 if ( strncmp("#CHROM\tPOS",p,10) != 0 ) {
667 char *eol = strchr(p, '\n');
668 if (*p != '\0') {
669 hts_log_warning("Could not parse header line: %.*s",
670 eol ? (int)(eol - p) : INT_MAX, p);
672 if (eol) {
673 p = eol + 1; // Try from the next line.
674 } else {
675 done = -1; // No more lines left, give up.
677 } else {
678 done = 1; // Sample line found
680 } while (!done);
682 if (done < 0) {
683 // No sample line is fatal.
684 hts_log_error("Could not parse the header, sample line not found");
685 return -1;
688 int ret = bcf_hdr_parse_sample_line(hdr,p);
689 bcf_hdr_sync(hdr);
690 bcf_hdr_check_sanity(hdr);
691 return ret;
694 int bcf_hdr_append(bcf_hdr_t *hdr, const char *line)
696 int len;
697 bcf_hrec_t *hrec = bcf_hdr_parse_line(hdr, (char*) line, &len);
698 if ( !hrec ) return -1;
699 bcf_hdr_add_hrec(hdr, hrec);
700 return 0;
703 void bcf_hdr_remove(bcf_hdr_t *hdr, int type, const char *key)
705 int i = 0;
706 bcf_hrec_t *hrec;
707 if ( !key )
709 while ( i<hdr->nhrec )
711 if ( hdr->hrec[i]->type!=type ) { i++; continue; }
712 hrec = hdr->hrec[i];
714 if ( type==BCF_HL_FLT || type==BCF_HL_INFO || type==BCF_HL_FMT || type== BCF_HL_CTG )
716 int j = bcf_hrec_find_key(hdr->hrec[i], "ID");
717 if ( j>=0 )
719 vdict_t *d = type==BCF_HL_CTG ? (vdict_t*)hdr->dict[BCF_DT_CTG] : (vdict_t*)hdr->dict[BCF_DT_ID];
720 khint_t k = kh_get(vdict, d, hdr->hrec[i]->vals[j]);
721 kh_val(d, k).hrec[type==BCF_HL_CTG?0:type] = NULL;
725 hdr->dirty = 1;
726 hdr->nhrec--;
727 if ( i < hdr->nhrec )
728 memmove(&hdr->hrec[i],&hdr->hrec[i+1],(hdr->nhrec-i)*sizeof(bcf_hrec_t*));
729 bcf_hrec_destroy(hrec);
731 return;
733 while (1)
735 if ( type==BCF_HL_FLT || type==BCF_HL_INFO || type==BCF_HL_FMT || type== BCF_HL_CTG )
737 hrec = bcf_hdr_get_hrec(hdr, type, "ID", key, NULL);
738 if ( !hrec ) return;
740 for (i=0; i<hdr->nhrec; i++)
741 if ( hdr->hrec[i]==hrec ) break;
742 assert( i<hdr->nhrec );
744 vdict_t *d = type==BCF_HL_CTG ? (vdict_t*)hdr->dict[BCF_DT_CTG] : (vdict_t*)hdr->dict[BCF_DT_ID];
745 khint_t k = kh_get(vdict, d, key);
746 kh_val(d, k).hrec[type==BCF_HL_CTG?0:type] = NULL;
748 else
750 for (i=0; i<hdr->nhrec; i++)
752 if ( hdr->hrec[i]->type!=type ) continue;
753 if ( type==BCF_HL_GEN )
755 if ( !strcmp(hdr->hrec[i]->key,key) ) break;
757 else
759 // not all structured lines have ID, we could be more sophisticated as in bcf_hdr_get_hrec()
760 int j = bcf_hrec_find_key(hdr->hrec[i], "ID");
761 if ( j>=0 && !strcmp(hdr->hrec[i]->vals[j],key) ) break;
764 if ( i==hdr->nhrec ) return;
765 hrec = hdr->hrec[i];
768 hdr->nhrec--;
769 if ( i < hdr->nhrec )
770 memmove(&hdr->hrec[i],&hdr->hrec[i+1],(hdr->nhrec-i)*sizeof(bcf_hrec_t*));
771 bcf_hrec_destroy(hrec);
772 hdr->dirty = 1;
776 int bcf_hdr_printf(bcf_hdr_t *hdr, const char *fmt, ...)
778 va_list ap;
779 va_start(ap, fmt);
780 int n = vsnprintf(NULL, 0, fmt, ap) + 2;
781 va_end(ap);
783 char *line = (char*)malloc(n);
784 va_start(ap, fmt);
785 vsnprintf(line, n, fmt, ap);
786 va_end(ap);
788 int ret = bcf_hdr_append(hdr, line);
790 free(line);
791 return ret;
795 /**********************
796 *** BCF header I/O ***
797 **********************/
799 const char *bcf_hdr_get_version(const bcf_hdr_t *hdr)
801 bcf_hrec_t *hrec = bcf_hdr_get_hrec(hdr, BCF_HL_GEN, "fileformat", NULL, NULL);
802 if ( !hrec )
804 hts_log_warning("No version string found, assuming VCFv4.2");
805 return "VCFv4.2";
807 return hrec->value;
810 void bcf_hdr_set_version(bcf_hdr_t *hdr, const char *version)
812 bcf_hrec_t *hrec = bcf_hdr_get_hrec(hdr, BCF_HL_GEN, "fileformat", NULL, NULL);
813 if ( !hrec )
815 int len;
816 kstring_t str = {0,0,0};
817 ksprintf(&str,"##fileformat=%s", version);
818 hrec = bcf_hdr_parse_line(hdr, str.s, &len);
819 free(str.s);
821 else
823 free(hrec->value);
824 hrec->value = strdup(version);
826 hdr->dirty = 1;
829 bcf_hdr_t *bcf_hdr_init(const char *mode)
831 int i;
832 bcf_hdr_t *h;
833 h = (bcf_hdr_t*)calloc(1, sizeof(bcf_hdr_t));
834 if (!h) return NULL;
835 for (i = 0; i < 3; ++i)
836 if ((h->dict[i] = kh_init(vdict)) == NULL) goto fail;
837 if ( strchr(mode,'w') )
839 bcf_hdr_append(h, "##fileformat=VCFv4.2");
840 // The filter PASS must appear first in the dictionary
841 bcf_hdr_append(h, "##FILTER=<ID=PASS,Description=\"All filters passed\">");
843 return h;
845 fail:
846 for (i = 0; i < 3; ++i)
847 kh_destroy(vdict, h->dict[i]);
848 free(h);
849 return NULL;
852 void bcf_hdr_destroy(bcf_hdr_t *h)
854 int i;
855 khint_t k;
856 if (!h) return;
857 for (i = 0; i < 3; ++i) {
858 vdict_t *d = (vdict_t*)h->dict[i];
859 if (d == 0) continue;
860 for (k = kh_begin(d); k != kh_end(d); ++k)
861 if (kh_exist(d, k)) free((char*)kh_key(d, k));
862 kh_destroy(vdict, d);
863 free(h->id[i]);
865 for (i=0; i<h->nhrec; i++)
866 bcf_hrec_destroy(h->hrec[i]);
867 if (h->nhrec) free(h->hrec);
868 if (h->samples) free(h->samples);
869 free(h->keep_samples);
870 free(h->transl[0]); free(h->transl[1]);
871 free(h->mem.s);
872 free(h);
875 bcf_hdr_t *bcf_hdr_read(htsFile *hfp)
877 if (hfp->format.format == vcf)
878 return vcf_hdr_read(hfp);
879 if (hfp->format.format != bcf) {
880 hts_log_error("Input is not detected as bcf or vcf format");
881 return NULL;
884 assert(hfp->is_bgzf);
886 BGZF *fp = hfp->fp.bgzf;
887 uint8_t magic[5];
888 bcf_hdr_t *h;
889 h = bcf_hdr_init("r");
890 if (!h) {
891 hts_log_error("Failed to allocate bcf header");
892 return NULL;
894 if (bgzf_read(fp, magic, 5) != 5)
896 hts_log_error("Failed to read the header (reading BCF in text mode?)");
897 bcf_hdr_destroy(h);
898 return NULL;
900 if (strncmp((char*)magic, "BCF\2\2", 5) != 0)
902 if (!strncmp((char*)magic, "BCF", 3))
903 hts_log_error("Invalid BCF2 magic string: only BCFv2.2 is supported");
904 else
905 hts_log_error("Invalid BCF2 magic string");
906 bcf_hdr_destroy(h);
907 return NULL;
909 uint8_t buf[4];
910 size_t hlen;
911 char *htxt = NULL;
912 if (bgzf_read(fp, buf, 4) != 4) goto fail;
913 hlen = buf[0] | (buf[1] << 8) | (buf[2] << 16) | (buf[3] << 24);
914 if (hlen >= SIZE_MAX) { errno = ENOMEM; goto fail; }
915 htxt = (char*)malloc(hlen + 1);
916 if (!htxt) goto fail;
917 if (bgzf_read(fp, htxt, hlen) != hlen) goto fail;
918 htxt[hlen] = '\0'; // Ensure htxt is terminated
919 if ( bcf_hdr_parse(h, htxt) < 0 ) goto fail;
920 free(htxt);
921 return h;
922 fail:
923 hts_log_error("Failed to read BCF header");
924 free(htxt);
925 bcf_hdr_destroy(h);
926 return NULL;
929 int bcf_hdr_write(htsFile *hfp, bcf_hdr_t *h)
931 if (!h) {
932 errno = EINVAL;
933 return -1;
935 if ( h->dirty ) bcf_hdr_sync(h);
936 if (hfp->format.format == vcf || hfp->format.format == text_format)
937 return vcf_hdr_write(hfp, h);
939 kstring_t htxt = {0,0,0};
940 bcf_hdr_format(h, 1, &htxt);
941 kputc('\0', &htxt); // include the \0 byte
943 BGZF *fp = hfp->fp.bgzf;
944 if ( bgzf_write(fp, "BCF\2\2", 5) !=5 ) return -1;
945 uint8_t hlen[4];
946 u32_to_le(htxt.l, hlen);
947 if ( bgzf_write(fp, hlen, 4) !=4 ) return -1;
948 if ( bgzf_write(fp, htxt.s, htxt.l) != htxt.l ) return -1;
950 free(htxt.s);
951 return 0;
954 /********************
955 *** BCF site I/O ***
956 ********************/
958 bcf1_t *bcf_init()
960 bcf1_t *v;
961 v = (bcf1_t*)calloc(1, sizeof(bcf1_t));
962 return v;
965 void bcf_clear(bcf1_t *v)
967 int i;
968 for (i=0; i<v->d.m_info; i++)
970 if ( v->d.info[i].vptr_free )
972 free(v->d.info[i].vptr - v->d.info[i].vptr_off);
973 v->d.info[i].vptr_free = 0;
976 for (i=0; i<v->d.m_fmt; i++)
978 if ( v->d.fmt[i].p_free )
980 free(v->d.fmt[i].p - v->d.fmt[i].p_off);
981 v->d.fmt[i].p_free = 0;
984 v->rid = v->pos = v->rlen = v->unpacked = 0;
985 bcf_float_set_missing(v->qual);
986 v->n_info = v->n_allele = v->n_fmt = v->n_sample = 0;
987 v->shared.l = v->indiv.l = 0;
988 v->d.var_type = -1;
989 v->d.shared_dirty = 0;
990 v->d.indiv_dirty = 0;
991 v->d.n_flt = 0;
992 v->errcode = 0;
993 if (v->d.m_als) v->d.als[0] = 0;
994 if (v->d.m_id) v->d.id[0] = 0;
997 void bcf_empty(bcf1_t *v)
999 bcf_clear1(v);
1000 free(v->d.id);
1001 free(v->d.als);
1002 free(v->d.allele); free(v->d.flt); free(v->d.info); free(v->d.fmt);
1003 if (v->d.var ) free(v->d.var);
1004 free(v->shared.s); free(v->indiv.s);
1007 void bcf_destroy(bcf1_t *v)
1009 if (!v) return;
1010 bcf_empty1(v);
1011 free(v);
1014 static inline int bcf_read1_core(BGZF *fp, bcf1_t *v)
1016 uint32_t x[8];
1017 ssize_t ret;
1018 if ((ret = bgzf_read(fp, x, 32)) != 32) {
1019 if (ret == 0) return -1;
1020 return -2;
1022 bcf_clear1(v);
1023 x[0] -= 24; // to exclude six 32-bit integers
1024 if (ks_resize(&v->shared, x[0]) != 0) return -2;
1025 if (ks_resize(&v->indiv, x[1]) != 0) return -2;
1026 memcpy(v, x + 2, 16);
1027 v->n_allele = x[6]>>16; v->n_info = x[6]&0xffff;
1028 v->n_fmt = x[7]>>24; v->n_sample = x[7]&0xffffff;
1029 v->shared.l = x[0], v->indiv.l = x[1];
1030 // silent fix of broken BCFs produced by earlier versions of bcf_subset, prior to and including bd6ed8b4
1031 if ( (!v->indiv.l || !v->n_sample) && v->n_fmt ) v->n_fmt = 0;
1033 if (bgzf_read(fp, v->shared.s, v->shared.l) != v->shared.l) return -2;
1034 if (bgzf_read(fp, v->indiv.s, v->indiv.l) != v->indiv.l) return -2;
1035 return 0;
1038 #define bit_array_size(n) ((n)/8+1)
1039 #define bit_array_set(a,i) ((a)[(i)/8] |= 1 << ((i)%8))
1040 #define bit_array_clear(a,i) ((a)[(i)/8] &= ~(1 << ((i)%8)))
1041 #define bit_array_test(a,i) ((a)[(i)/8] & (1 << ((i)%8)))
1043 static int bcf_dec_typed_int1_safe(uint8_t *p, uint8_t *end, uint8_t **q,
1044 int32_t *val) {
1045 uint32_t v;
1046 uint32_t t;
1047 if (end - p < 2) return -1;
1048 t = *p++ & 0xf;
1049 /* Use if .. else if ... else instead of switch to force order. Assumption
1050 is that small integers are more frequent than big ones. */
1051 if (t == BCF_BT_INT8) {
1052 *q = p + 1;
1053 *val = *(int8_t *) p;
1054 } else if (t == BCF_BT_INT16) {
1055 if (end - p < 2) return -1;
1056 *q = p + 2;
1057 v = p[0] | (p[1] << 8);
1058 *val = v < 0x8000 ? v : -((int32_t) (0xffff - v)) - 1;
1059 } else if (t == BCF_BT_INT32) {
1060 if (end - p < 4) return -1;
1061 *q = p + 4;
1062 v = p[0] | (p[1] << 8) | (p[2] << 16) | (p[3] << 24);
1063 *val = v < 0x80000000UL ? v : -((int32_t) (0xffffffffUL - v)) - 1;
1064 } else {
1065 return -1;
1067 return 0;
1070 static int bcf_dec_size_safe(uint8_t *p, uint8_t *end, uint8_t **q,
1071 int *num, int *type) {
1072 int r;
1073 if (p >= end) return -1;
1074 *type = *p & 0xf;
1075 if (*p>>4 != 15) {
1076 *q = p + 1;
1077 *num = *p >> 4;
1078 return 0;
1080 r = bcf_dec_typed_int1_safe(p + 1, end, q, num);
1081 if (r) return r;
1082 return *num >= 0 ? 0 : -1;
1085 static const char *get_type_name(int type) {
1086 const char *types[9] = {
1087 "null", "int (8-bit)", "int (16 bit)", "int (32 bit)",
1088 "unknown", "float", "unknown", "char", "unknown"
1090 int t = (type >= 0 && type < 8) ? type : 8;
1091 return types[t];
1094 static int bcf_record_check(const bcf_hdr_t *hdr, bcf1_t *rec) {
1095 uint8_t *ptr, *end;
1096 size_t bytes;
1097 uint32_t err = 0;
1098 int type = 0;
1099 int num = 0;
1100 uint32_t i, reports;
1101 const uint32_t is_integer = ((1 << BCF_BT_INT8) |
1102 (1 << BCF_BT_INT16) |
1103 (1 << BCF_BT_INT32));
1104 const uint32_t is_valid_type = (is_integer |
1105 (1 << BCF_BT_NULL) |
1106 (1 << BCF_BT_FLOAT) |
1107 (1 << BCF_BT_CHAR));
1110 // Check for valid contig ID
1111 if (rec->rid < 0 || rec->rid >= hdr->n[BCF_DT_CTG]) {
1112 hts_log_warning("Bad BCF record: Invalid %s id %d", "CONTIG", rec->rid);
1113 err |= BCF_ERR_CTG_INVALID;
1116 // Check ID
1117 ptr = (uint8_t *) rec->shared.s;
1118 end = ptr + rec->shared.l;
1119 if (bcf_dec_size_safe(ptr, end, &ptr, &num, &type) != 0) goto bad_shared;
1120 if (type != BCF_BT_CHAR) {
1121 hts_log_warning("Bad BCF record: Invalid %s type %d (%s)", "ID", type, get_type_name(type));
1122 err |= BCF_ERR_TAG_INVALID;
1124 bytes = (size_t) num << bcf_type_shift[type];
1125 if (end - ptr < bytes) goto bad_shared;
1126 ptr += bytes;
1128 // Check REF and ALT
1129 reports = 0;
1130 for (i = 0; i < rec->n_allele; i++) {
1131 if (bcf_dec_size_safe(ptr, end, &ptr, &num, &type) != 0) goto bad_shared;
1132 if (type != BCF_BT_CHAR) {
1133 if (!reports++ || hts_verbose >= HTS_LOG_DEBUG)
1134 hts_log_warning("Bad BCF record: Invalid %s type %d (%s)", "REF/ALT", type, get_type_name(type));
1135 err |= BCF_ERR_CHAR;
1137 bytes = (size_t) num << bcf_type_shift[type];
1138 if (end - ptr < bytes) goto bad_shared;
1139 ptr += bytes;
1142 // Check FILTER
1143 reports = 0;
1144 if (bcf_dec_size_safe(ptr, end, &ptr, &num, &type) != 0) goto bad_shared;
1145 if (num > 0) {
1146 if (((1 << type) & is_integer) == 0) {
1147 hts_log_warning("Bad BCF record: Invalid %s type %d (%s)", "FILTER", type, get_type_name(type));
1148 err |= BCF_ERR_TAG_INVALID;
1150 bytes = (size_t) num << bcf_type_shift[type];
1151 if (end - ptr < bytes) goto bad_shared;
1152 for (i = 0; i < num; i++) {
1153 int32_t key = bcf_dec_int1(ptr, type, &ptr);
1154 if (key < 0 || key >= hdr->n[BCF_DT_ID]) {
1155 if (!reports++ || hts_verbose >= HTS_LOG_DEBUG)
1156 hts_log_warning("Bad BCF record: Invalid %s id %d", "FILTER", key);
1157 err |= BCF_ERR_TAG_UNDEF;
1162 // Check INFO
1163 reports = 0;
1164 for (i = 0; i < rec->n_info; i++) {
1165 int32_t key = -1;
1166 if (bcf_dec_typed_int1_safe(ptr, end, &ptr, &key) != 0) goto bad_shared;
1167 if (key < 0 || key >= hdr->n[BCF_DT_ID]) {
1168 if (!reports++ || hts_verbose >= HTS_LOG_DEBUG)
1169 hts_log_warning("Bad BCF record: Invalid %s id %d", "INFO", key);
1170 err |= BCF_ERR_TAG_UNDEF;
1172 if (bcf_dec_size_safe(ptr, end, &ptr, &num, &type) != 0) goto bad_shared;
1173 if (((1 << type) & is_valid_type) == 0) {
1174 if (!reports++ || hts_verbose >= HTS_LOG_DEBUG)
1175 hts_log_warning("Bad BCF record: Invalid %s type %d (%s)", "INFO", type, get_type_name(type));
1176 err |= BCF_ERR_TAG_INVALID;
1178 bytes = (size_t) num << bcf_type_shift[type];
1179 if (end - ptr < bytes) goto bad_shared;
1180 ptr += bytes;
1183 // Check FORMAT and individual information
1184 ptr = (uint8_t *) rec->indiv.s;
1185 end = ptr + rec->indiv.l;
1186 reports = 0;
1187 for (i = 0; i < rec->n_fmt; i++) {
1188 int32_t key = -1;
1189 if (bcf_dec_typed_int1_safe(ptr, end, &ptr, &key) != 0) goto bad_indiv;
1190 if (key < 0 || key >= hdr->n[BCF_DT_ID]) {
1191 if (!reports++ || hts_verbose >= HTS_LOG_DEBUG)
1192 hts_log_warning("Bad BCF record: Invalid %s id %d", "FORMAT", key);
1193 err |= BCF_ERR_TAG_UNDEF;
1195 if (bcf_dec_size_safe(ptr, end, &ptr, &num, &type) != 0) goto bad_indiv;
1196 if (((1 << type) & is_valid_type) == 0) {
1197 if (!reports++ || hts_verbose >= HTS_LOG_DEBUG)
1198 hts_log_warning("Bad BCF record: Invalid %s type %d (%s)", "FORMAT", type, get_type_name(type));
1199 err |= BCF_ERR_TAG_INVALID;
1201 bytes = ((size_t) num << bcf_type_shift[type]) * rec->n_sample;
1202 if (end - ptr < bytes) goto bad_indiv;
1203 ptr += bytes;
1206 rec->errcode |= err;
1208 return err ? -1 : 0;
1210 bad_shared:
1211 hts_log_error("Bad BCF record - shared section malformed or too short");
1212 return -1;
1214 bad_indiv:
1215 hts_log_error("Bad BCF record - individuals section malformed or too short");
1216 return -1;
1219 static inline uint8_t *bcf_unpack_fmt_core1(uint8_t *ptr, int n_sample, bcf_fmt_t *fmt);
1220 int bcf_subset_format(const bcf_hdr_t *hdr, bcf1_t *rec)
1222 if ( !hdr->keep_samples ) return 0;
1223 if ( !bcf_hdr_nsamples(hdr) )
1225 rec->indiv.l = rec->n_sample = 0;
1226 return 0;
1229 int i, j;
1230 uint8_t *ptr = (uint8_t*)rec->indiv.s, *dst = NULL, *src;
1231 bcf_dec_t *dec = &rec->d;
1232 hts_expand(bcf_fmt_t, rec->n_fmt, dec->m_fmt, dec->fmt);
1233 for (i=0; i<dec->m_fmt; ++i) dec->fmt[i].p_free = 0;
1235 for (i=0; i<rec->n_fmt; i++)
1237 ptr = bcf_unpack_fmt_core1(ptr, rec->n_sample, &dec->fmt[i]);
1238 src = dec->fmt[i].p - dec->fmt[i].size;
1239 if ( dst )
1241 memmove(dec->fmt[i-1].p + dec->fmt[i-1].p_len, dec->fmt[i].p - dec->fmt[i].p_off, dec->fmt[i].p_off);
1242 dec->fmt[i].p = dec->fmt[i-1].p + dec->fmt[i-1].p_len + dec->fmt[i].p_off;
1244 dst = dec->fmt[i].p;
1245 for (j=0; j<hdr->nsamples_ori; j++)
1247 src += dec->fmt[i].size;
1248 if ( !bit_array_test(hdr->keep_samples,j) ) continue;
1249 memmove(dst, src, dec->fmt[i].size);
1250 dst += dec->fmt[i].size;
1252 rec->indiv.l -= dec->fmt[i].p_len - (dst - dec->fmt[i].p);
1253 dec->fmt[i].p_len = dst - dec->fmt[i].p;
1255 rec->unpacked |= BCF_UN_FMT;
1257 rec->n_sample = bcf_hdr_nsamples(hdr);
1258 return 0;
1261 int bcf_read(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
1263 if (fp->format.format == vcf) return vcf_read(fp,h,v);
1264 int ret = bcf_read1_core(fp->fp.bgzf, v);
1265 if (ret == 0) ret = bcf_record_check(h, v);
1266 if ( ret!=0 || !h->keep_samples ) return ret;
1267 return bcf_subset_format(h,v);
1270 int bcf_readrec(BGZF *fp, void *null, void *vv, int *tid, int *beg, int *end)
1272 bcf1_t *v = (bcf1_t *) vv;
1273 int ret;
1274 if ((ret = bcf_read1_core(fp, v)) >= 0)
1275 *tid = v->rid, *beg = v->pos, *end = v->pos + v->rlen;
1276 return ret;
1279 static inline void bcf1_sync_id(bcf1_t *line, kstring_t *str)
1281 // single typed string
1282 if ( line->d.id && strcmp(line->d.id, ".") ) bcf_enc_vchar(str, strlen(line->d.id), line->d.id);
1283 else bcf_enc_size(str, 0, BCF_BT_CHAR);
1285 static inline void bcf1_sync_alleles(bcf1_t *line, kstring_t *str)
1287 // list of typed strings
1288 int i;
1289 for (i=0; i<line->n_allele; i++)
1290 bcf_enc_vchar(str, strlen(line->d.allele[i]), line->d.allele[i]);
1291 if ( !line->rlen && line->n_allele ) line->rlen = strlen(line->d.allele[0]);
1293 static inline void bcf1_sync_filter(bcf1_t *line, kstring_t *str)
1295 // typed vector of integers
1296 if ( line->d.n_flt ) bcf_enc_vint(str, line->d.n_flt, line->d.flt, -1);
1297 else bcf_enc_vint(str, 0, 0, -1);
1300 static inline void bcf1_sync_info(bcf1_t *line, kstring_t *str)
1302 // pairs of typed vectors
1303 int i, irm = -1;
1304 for (i=0; i<line->n_info; i++)
1306 bcf_info_t *info = &line->d.info[i];
1307 if ( !info->vptr )
1309 // marked for removal
1310 if ( irm < 0 ) irm = i;
1311 continue;
1313 kputsn_(info->vptr - info->vptr_off, info->vptr_len + info->vptr_off, str);
1314 if ( irm >=0 )
1316 bcf_info_t tmp = line->d.info[irm]; line->d.info[irm] = line->d.info[i]; line->d.info[i] = tmp;
1317 while ( irm<=i && line->d.info[irm].vptr ) irm++;
1320 if ( irm>=0 ) line->n_info = irm;
1323 static int bcf1_sync(bcf1_t *line)
1325 char *shared_ori = line->shared.s;
1326 size_t prev_len;
1328 kstring_t tmp = {0,0,0};
1329 if ( !line->shared.l )
1331 // New line created via API, BCF data blocks do not exist. Get it ready for BCF output
1332 tmp = line->shared;
1333 bcf1_sync_id(line, &tmp);
1334 line->unpack_size[0] = tmp.l; prev_len = tmp.l;
1336 bcf1_sync_alleles(line, &tmp);
1337 line->unpack_size[1] = tmp.l - prev_len; prev_len = tmp.l;
1339 bcf1_sync_filter(line, &tmp);
1340 line->unpack_size[2] = tmp.l - prev_len;
1342 bcf1_sync_info(line, &tmp);
1343 line->shared = tmp;
1345 else if ( line->d.shared_dirty )
1347 // The line was edited, update the BCF data block.
1349 if ( !(line->unpacked & BCF_UN_STR) ) bcf_unpack(line,BCF_UN_STR);
1351 // ptr_ori points to the original unchanged BCF data.
1352 uint8_t *ptr_ori = (uint8_t *) line->shared.s;
1354 // ID: single typed string
1355 if ( line->d.shared_dirty & BCF1_DIRTY_ID )
1356 bcf1_sync_id(line, &tmp);
1357 else
1358 kputsn_(ptr_ori, line->unpack_size[0], &tmp);
1359 ptr_ori += line->unpack_size[0];
1360 line->unpack_size[0] = tmp.l; prev_len = tmp.l;
1362 // REF+ALT: list of typed strings
1363 if ( line->d.shared_dirty & BCF1_DIRTY_ALS )
1364 bcf1_sync_alleles(line, &tmp);
1365 else
1367 kputsn_(ptr_ori, line->unpack_size[1], &tmp);
1368 if ( !line->rlen && line->n_allele ) line->rlen = strlen(line->d.allele[0]);
1370 ptr_ori += line->unpack_size[1];
1371 line->unpack_size[1] = tmp.l - prev_len; prev_len = tmp.l;
1373 if ( line->unpacked & BCF_UN_FLT )
1375 // FILTER: typed vector of integers
1376 if ( line->d.shared_dirty & BCF1_DIRTY_FLT )
1377 bcf1_sync_filter(line, &tmp);
1378 else if ( line->d.n_flt )
1379 kputsn_(ptr_ori, line->unpack_size[2], &tmp);
1380 else
1381 bcf_enc_vint(&tmp, 0, 0, -1);
1382 ptr_ori += line->unpack_size[2];
1383 line->unpack_size[2] = tmp.l - prev_len;
1385 if ( line->unpacked & BCF_UN_INFO )
1387 // INFO: pairs of typed vectors
1388 if ( line->d.shared_dirty & BCF1_DIRTY_INF )
1390 bcf1_sync_info(line, &tmp);
1391 ptr_ori = (uint8_t*)line->shared.s + line->shared.l;
1396 int size = line->shared.l - (size_t)ptr_ori + (size_t)line->shared.s;
1397 if ( size ) kputsn_(ptr_ori, size, &tmp);
1399 free(line->shared.s);
1400 line->shared = tmp;
1402 if ( line->shared.s != shared_ori && line->unpacked & BCF_UN_INFO )
1404 // Reallocated line->shared.s block invalidated line->d.info[].vptr pointers
1405 size_t off_new = line->unpack_size[0] + line->unpack_size[1] + line->unpack_size[2];
1406 int i;
1407 for (i=0; i<line->n_info; i++)
1409 uint8_t *vptr_free = line->d.info[i].vptr_free ? line->d.info[i].vptr - line->d.info[i].vptr_off : NULL;
1410 line->d.info[i].vptr = (uint8_t*) line->shared.s + off_new + line->d.info[i].vptr_off;
1411 off_new += line->d.info[i].vptr_len + line->d.info[i].vptr_off;
1412 if ( vptr_free )
1414 free(vptr_free);
1415 line->d.info[i].vptr_free = 0;
1420 if ( line->n_sample && line->n_fmt && (!line->indiv.l || line->d.indiv_dirty) )
1422 // The genotype fields changed or are not present
1423 tmp.l = tmp.m = 0; tmp.s = NULL;
1424 int i, irm = -1;
1425 for (i=0; i<line->n_fmt; i++)
1427 bcf_fmt_t *fmt = &line->d.fmt[i];
1428 if ( !fmt->p )
1430 // marked for removal
1431 if ( irm < 0 ) irm = i;
1432 continue;
1434 kputsn_(fmt->p - fmt->p_off, fmt->p_len + fmt->p_off, &tmp);
1435 if ( irm >=0 )
1437 bcf_fmt_t tfmt = line->d.fmt[irm]; line->d.fmt[irm] = line->d.fmt[i]; line->d.fmt[i] = tfmt;
1438 while ( irm<=i && line->d.fmt[irm].p ) irm++;
1442 if ( irm>=0 ) line->n_fmt = irm;
1443 free(line->indiv.s);
1444 line->indiv = tmp;
1446 // Reallocated line->indiv.s block invalidated line->d.fmt[].p pointers
1447 size_t off_new = 0;
1448 for (i=0; i<line->n_fmt; i++)
1450 uint8_t *p_free = line->d.fmt[i].p_free ? line->d.fmt[i].p - line->d.fmt[i].p_off : NULL;
1451 line->d.fmt[i].p = (uint8_t*) line->indiv.s + off_new + line->d.fmt[i].p_off;
1452 off_new += line->d.fmt[i].p_len + line->d.fmt[i].p_off;
1453 if ( p_free )
1455 free(p_free);
1456 line->d.fmt[i].p_free = 0;
1460 if ( !line->n_sample ) line->n_fmt = 0;
1461 line->d.shared_dirty = line->d.indiv_dirty = 0;
1462 return 0;
1465 bcf1_t *bcf_copy(bcf1_t *dst, bcf1_t *src)
1467 bcf1_sync(src);
1469 bcf_clear(dst);
1470 dst->rid = src->rid;
1471 dst->pos = src->pos;
1472 dst->rlen = src->rlen;
1473 dst->qual = src->qual;
1474 dst->n_info = src->n_info; dst->n_allele = src->n_allele;
1475 dst->n_fmt = src->n_fmt; dst->n_sample = src->n_sample;
1477 dst->shared.m = dst->shared.l = src->shared.l;
1478 dst->shared.s = (char*) malloc(dst->shared.l);
1479 memcpy(dst->shared.s,src->shared.s,dst->shared.l);
1481 dst->indiv.m = dst->indiv.l = src->indiv.l;
1482 dst->indiv.s = (char*) malloc(dst->indiv.l);
1483 memcpy(dst->indiv.s,src->indiv.s,dst->indiv.l);
1485 return dst;
1487 bcf1_t *bcf_dup(bcf1_t *src)
1489 bcf1_t *out = bcf_init1();
1490 return bcf_copy(out, src);
1493 int bcf_write(htsFile *hfp, bcf_hdr_t *h, bcf1_t *v)
1495 if ( h->dirty ) bcf_hdr_sync(h);
1496 if ( bcf_hdr_nsamples(h)!=v->n_sample )
1498 hts_log_error("Broken VCF record, the number of columns at %s:%d does not match the number of samples (%d vs %d)",
1499 bcf_seqname(h,v), v->pos+1, v->n_sample, bcf_hdr_nsamples(h));
1500 return -1;
1503 if ( hfp->format.format == vcf || hfp->format.format == text_format )
1504 return vcf_write(hfp,h,v);
1506 if ( v->errcode )
1508 // vcf_parse1() encountered a new contig or tag, undeclared in the
1509 // header. At this point, the header must have been printed,
1510 // proceeding would lead to a broken BCF file. Errors must be checked
1511 // and cleared by the caller before we can proceed.
1512 hts_log_error("Unchecked error (%d), exiting", v->errcode);
1513 exit(1);
1515 bcf1_sync(v); // check if the BCF record was modified
1517 BGZF *fp = hfp->fp.bgzf;
1518 uint32_t x[8];
1519 x[0] = v->shared.l + 24; // to include six 32-bit integers
1520 x[1] = v->indiv.l;
1521 memcpy(x + 2, v, 16);
1522 x[6] = (uint32_t)v->n_allele<<16 | v->n_info;
1523 x[7] = (uint32_t)v->n_fmt<<24 | v->n_sample;
1524 if ( bgzf_write(fp, x, 32) != 32 ) return -1;
1525 if ( bgzf_write(fp, v->shared.s, v->shared.l) != v->shared.l ) return -1;
1526 if ( bgzf_write(fp, v->indiv.s, v->indiv.l) != v->indiv.l ) return -1;
1527 return 0;
1530 /**********************
1531 *** VCF header I/O ***
1532 **********************/
1534 bcf_hdr_t *vcf_hdr_read(htsFile *fp)
1536 kstring_t txt, *s = &fp->line;
1537 int ret;
1538 bcf_hdr_t *h;
1539 h = bcf_hdr_init("r");
1540 if (!h) {
1541 hts_log_error("Failed to allocate bcf header");
1542 return NULL;
1544 txt.l = txt.m = 0; txt.s = 0;
1545 while ((ret = hts_getline(fp, KS_SEP_LINE, s)) >= 0) {
1546 if (s->l == 0) continue;
1547 if (s->s[0] != '#') {
1548 hts_log_error("No sample line");
1549 goto error;
1551 if (s->s[1] != '#' && fp->fn_aux) { // insert contigs here
1552 kstring_t tmp = { 0, 0, NULL };
1553 hFILE *f = hopen(fp->fn_aux, "r");
1554 if (f == NULL) {
1555 hts_log_error("Couldn't open \"%s\"", fp->fn_aux);
1556 goto error;
1558 while (tmp.l = 0, kgetline(&tmp, (kgets_func *) hgets, f) >= 0) {
1559 char *tab = strchr(tmp.s, '\t');
1560 if (tab == NULL) continue;
1561 kputs("##contig=<ID=", &txt); kputsn(tmp.s, tab - tmp.s, &txt);
1562 kputs(",length=", &txt); kputl(atol(tab), &txt);
1563 kputsn(">\n", 2, &txt);
1565 free(tmp.s);
1566 if (hclose(f) != 0) {
1567 hts_log_warning("Failed to close %s", fp->fn_aux);
1570 kputsn(s->s, s->l, &txt);
1571 kputc('\n', &txt);
1572 if (s->s[1] != '#') break;
1574 if ( ret < -1 ) goto error;
1575 if ( !txt.s )
1577 hts_log_error("Could not read the header");
1578 goto error;
1580 if ( bcf_hdr_parse(h, txt.s) < 0 ) goto error;
1582 // check tabix index, are all contigs listed in the header? add the missing ones
1583 tbx_t *idx = tbx_index_load(fp->fn);
1584 if ( idx )
1586 int i, n, need_sync = 0;
1587 const char **names = tbx_seqnames(idx, &n);
1588 for (i=0; i<n; i++)
1590 bcf_hrec_t *hrec = bcf_hdr_get_hrec(h, BCF_HL_CTG, "ID", (char*) names[i], NULL);
1591 if ( hrec ) continue;
1592 hrec = (bcf_hrec_t*) calloc(1,sizeof(bcf_hrec_t));
1593 hrec->key = strdup("contig");
1594 bcf_hrec_add_key(hrec, "ID", strlen("ID"));
1595 bcf_hrec_set_val(hrec, hrec->nkeys-1, (char*) names[i], strlen(names[i]), 0);
1596 bcf_hdr_add_hrec(h, hrec);
1597 need_sync = 1;
1599 free(names);
1600 tbx_destroy(idx);
1601 if ( need_sync )
1602 bcf_hdr_sync(h);
1604 free(txt.s);
1605 return h;
1607 error:
1608 free(txt.s);
1609 if (h) bcf_hdr_destroy(h);
1610 return NULL;
1613 int bcf_hdr_set(bcf_hdr_t *hdr, const char *fname)
1615 int i, n;
1616 char **lines = hts_readlines(fname, &n);
1617 if ( !lines ) return 1;
1618 for (i=0; i<n-1; i++)
1620 int k;
1621 bcf_hrec_t *hrec = bcf_hdr_parse_line(hdr,lines[i],&k);
1622 if ( hrec ) bcf_hdr_add_hrec(hdr, hrec);
1623 free(lines[i]);
1625 bcf_hdr_parse_sample_line(hdr,lines[n-1]);
1626 free(lines[n-1]);
1627 free(lines);
1628 bcf_hdr_sync(hdr);
1629 return 0;
1632 static void _bcf_hrec_format(const bcf_hrec_t *hrec, int is_bcf, kstring_t *str)
1634 if ( !hrec->value )
1636 int j, nout = 0;
1637 ksprintf(str, "##%s=<", hrec->key);
1638 for (j=0; j<hrec->nkeys; j++)
1640 // do not output IDX if output is VCF
1641 if ( !is_bcf && !strcmp("IDX",hrec->keys[j]) ) continue;
1642 if ( nout ) kputc(',',str);
1643 ksprintf(str,"%s=%s", hrec->keys[j], hrec->vals[j]);
1644 nout++;
1646 ksprintf(str,">\n");
1648 else
1649 ksprintf(str,"##%s=%s\n", hrec->key,hrec->value);
1652 void bcf_hrec_format(const bcf_hrec_t *hrec, kstring_t *str)
1654 _bcf_hrec_format(hrec,0,str);
1657 int bcf_hdr_format(const bcf_hdr_t *hdr, int is_bcf, kstring_t *str)
1659 int i;
1660 for (i=0; i<hdr->nhrec; i++)
1661 _bcf_hrec_format(hdr->hrec[i], is_bcf, str);
1663 ksprintf(str, "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO");
1664 if ( bcf_hdr_nsamples(hdr) )
1666 ksprintf(str, "\tFORMAT");
1667 for (i=0; i<bcf_hdr_nsamples(hdr); i++)
1668 ksprintf(str, "\t%s", hdr->samples[i]);
1670 ksprintf(str, "\n");
1672 return 0;
1675 char *bcf_hdr_fmt_text(const bcf_hdr_t *hdr, int is_bcf, int *len)
1677 kstring_t txt = {0,0,0};
1678 bcf_hdr_format(hdr, is_bcf, &txt);
1679 if ( len ) *len = txt.l;
1680 return txt.s;
1683 const char **bcf_hdr_seqnames(const bcf_hdr_t *h, int *n)
1685 vdict_t *d = (vdict_t*)h->dict[BCF_DT_CTG];
1686 int tid, m = kh_size(d);
1687 const char **names = (const char**) calloc(m,sizeof(const char*));
1688 khint_t k;
1689 for (k=kh_begin(d); k<kh_end(d); k++)
1691 if ( !kh_exist(d,k) ) continue;
1692 tid = kh_val(d,k).id;
1693 assert( tid<m );
1694 names[tid] = kh_key(d,k);
1696 // sanity check: there should be no gaps
1697 for (tid=0; tid<m; tid++)
1698 assert(names[tid]);
1699 *n = m;
1700 return names;
1703 int vcf_hdr_write(htsFile *fp, const bcf_hdr_t *h)
1705 kstring_t htxt = {0,0,0};
1706 bcf_hdr_format(h, 0, &htxt);
1707 while (htxt.l && htxt.s[htxt.l-1] == '\0') --htxt.l; // kill trailing zeros
1708 int ret;
1709 if ( fp->format.compression!=no_compression )
1710 ret = bgzf_write(fp->fp.bgzf, htxt.s, htxt.l);
1711 else
1712 ret = hwrite(fp->fp.hfile, htxt.s, htxt.l);
1713 free(htxt.s);
1714 return ret<0 ? -1 : 0;
1717 /***********************
1718 *** Typed value I/O ***
1719 ***********************/
1721 void bcf_enc_vint(kstring_t *s, int n, int32_t *a, int wsize)
1723 int32_t max = INT32_MIN + 1, min = INT32_MAX;
1724 int i;
1725 if (n <= 0) bcf_enc_size(s, 0, BCF_BT_NULL);
1726 else if (n == 1) bcf_enc_int1(s, a[0]);
1727 else {
1728 if (wsize <= 0) wsize = n;
1729 for (i = 0; i < n; ++i) {
1730 if (a[i] == bcf_int32_missing || a[i] == bcf_int32_vector_end ) continue;
1731 if (max < a[i]) max = a[i];
1732 if (min > a[i]) min = a[i];
1734 if (max <= INT8_MAX && min > bcf_int8_vector_end) {
1735 bcf_enc_size(s, wsize, BCF_BT_INT8);
1736 for (i = 0; i < n; ++i)
1737 if ( a[i]==bcf_int32_vector_end ) kputc(bcf_int8_vector_end, s);
1738 else if ( a[i]==bcf_int32_missing ) kputc(bcf_int8_missing, s);
1739 else kputc(a[i], s);
1740 } else if (max <= INT16_MAX && min > bcf_int16_vector_end) {
1741 uint8_t *p;
1742 bcf_enc_size(s, wsize, BCF_BT_INT16);
1743 ks_resize(s, s->l + n * sizeof(int16_t));
1744 p = (uint8_t *) s->s + s->l;
1745 for (i = 0; i < n; ++i)
1747 int16_t x;
1748 if ( a[i]==bcf_int32_vector_end ) x = bcf_int16_vector_end;
1749 else if ( a[i]==bcf_int32_missing ) x = bcf_int16_missing;
1750 else x = a[i];
1751 i16_to_le(x, p);
1752 p += sizeof(int16_t);
1754 s->l += n * sizeof(int16_t);
1755 } else {
1756 uint8_t *p;
1757 bcf_enc_size(s, wsize, BCF_BT_INT32);
1758 ks_resize(s, s->l + n * sizeof(int32_t));
1759 p = (uint8_t *) s->s + s->l;
1760 for (i = 0; i < n; ++i) {
1761 i32_to_le(a[i], p);
1762 p += sizeof(int32_t);
1764 s->l += n * sizeof(int32_t);
1769 static inline int serialize_float_array(kstring_t *s, size_t n, const float *a) {
1770 uint8_t *p;
1771 size_t i;
1772 size_t bytes = n * sizeof(float);
1774 if (bytes / sizeof(float) != n) return -1;
1775 if (ks_resize(s, s->l + bytes) < 0) return -1;
1777 p = (uint8_t *) s->s + s->l;
1778 for (i = 0; i < n; i++) {
1779 float_to_le(a[i], p);
1780 p += sizeof(float);
1782 s->l += bytes;
1784 return 0;
1787 void bcf_enc_vfloat(kstring_t *s, int n, float *a)
1789 assert(n >= 0);
1790 bcf_enc_size(s, n, BCF_BT_FLOAT);
1791 serialize_float_array(s, n, a);
1794 void bcf_enc_vchar(kstring_t *s, int l, const char *a)
1796 bcf_enc_size(s, l, BCF_BT_CHAR);
1797 kputsn(a, l, s);
1800 void bcf_fmt_array(kstring_t *s, int n, int type, void *data)
1802 int j = 0;
1803 if (n == 0) {
1804 kputc('.', s);
1805 return;
1807 if (type == BCF_BT_CHAR)
1809 char *p = (char*)data;
1810 for (j = 0; j < n && *p; ++j, ++p)
1812 if ( *p==bcf_str_missing ) kputc('.', s);
1813 else kputc(*p, s);
1816 else
1818 #define BRANCH(type_t, convert, is_missing, is_vector_end, kprint) { \
1819 uint8_t *p = (uint8_t *) data; \
1820 for (j=0; j<n; j++, p += sizeof(type_t)) \
1822 type_t v = convert(p); \
1823 if ( is_vector_end ) break; \
1824 if ( j ) kputc(',', s); \
1825 if ( is_missing ) kputc('.', s); \
1826 else kprint; \
1829 switch (type) {
1830 case BCF_BT_INT8: BRANCH(int8_t, le_to_i8, v==bcf_int8_missing, v==bcf_int8_vector_end, kputw(v, s)); break;
1831 case BCF_BT_INT16: BRANCH(int16_t, le_to_i16, v==bcf_int16_missing, v==bcf_int16_vector_end, kputw(v, s)); break;
1832 case BCF_BT_INT32: BRANCH(int32_t, le_to_i32, v==bcf_int32_missing, v==bcf_int32_vector_end, kputw(v, s)); break;
1833 case BCF_BT_FLOAT: BRANCH(uint32_t, le_to_u32, v==bcf_float_missing, v==bcf_float_vector_end, kputd(le_to_float(p), s)); break;
1834 default: hts_log_error("Unexpected type %d", type); exit(1); break;
1836 #undef BRANCH
1840 uint8_t *bcf_fmt_sized_array(kstring_t *s, uint8_t *ptr)
1842 int x, type;
1843 x = bcf_dec_size(ptr, &ptr, &type);
1844 bcf_fmt_array(s, x, type, ptr);
1845 return ptr + (x << bcf_type_shift[type]);
1848 /********************
1849 *** VCF site I/O ***
1850 ********************/
1852 typedef struct {
1853 int key, max_m, size, offset;
1854 uint64_t is_gt:1, max_g:31, max_l:32;
1855 uint32_t y;
1856 uint8_t *buf;
1857 } fmt_aux_t;
1859 static inline void align_mem(kstring_t *s)
1861 if (s->l&7) {
1862 uint64_t zero = 0;
1863 int l = ((s->l + 7)>>3<<3) - s->l;
1864 kputsn((char*)&zero, l, s);
1868 // p,q is the start and the end of the FORMAT field
1869 #define MAX_N_FMT 255 /* Limited by size of bcf1_t n_fmt field */
1870 static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p, char *q)
1872 if ( !bcf_hdr_nsamples(h) ) return 0;
1874 char *r, *t;
1875 int j, l, m, g;
1876 khint_t k;
1877 ks_tokaux_t aux1;
1878 vdict_t *d = (vdict_t*)h->dict[BCF_DT_ID];
1879 kstring_t *mem = (kstring_t*)&h->mem;
1880 fmt_aux_t fmt[MAX_N_FMT];
1881 mem->l = 0;
1883 char *end = s->s + s->l;
1884 if ( q>=end )
1886 hts_log_error("FORMAT column with no sample columns starting at %s:%d", s->s, v->pos+1);
1887 return -1;
1890 v->n_fmt = 0;
1891 if ( p[0]=='.' && p[1]==0 ) // FORMAT field is empty "."
1893 v->n_sample = bcf_hdr_nsamples(h);
1894 return 0;
1897 // get format information from the dictionary
1898 for (j = 0, t = kstrtok(p, ":", &aux1); t; t = kstrtok(0, 0, &aux1), ++j) {
1899 if (j >= MAX_N_FMT) {
1900 v->errcode |= BCF_ERR_LIMITS;
1901 hts_log_error("FORMAT column at %s:%d lists more identifiers than htslib can handle",
1902 bcf_seqname(h,v), v->pos+1);
1903 return -1;
1906 *(char*)aux1.p = 0;
1907 k = kh_get(vdict, d, t);
1908 if (k == kh_end(d) || kh_val(d, k).info[BCF_HL_FMT] == 15) {
1909 if ( t[0]=='.' && t[1]==0 )
1911 hts_log_error("Invalid FORMAT tag name '.'");
1912 v->errcode |= BCF_ERR_TAG_INVALID;
1913 return -1;
1915 hts_log_warning("FORMAT '%s' is not defined in the header, assuming Type=String", t);
1916 kstring_t tmp = {0,0,0};
1917 int l;
1918 ksprintf(&tmp, "##FORMAT=<ID=%s,Number=1,Type=String,Description=\"Dummy\">", t);
1919 bcf_hrec_t *hrec = bcf_hdr_parse_line(h,tmp.s,&l);
1920 free(tmp.s);
1921 if ( bcf_hdr_add_hrec((bcf_hdr_t*)h, hrec) ) bcf_hdr_sync((bcf_hdr_t*)h);
1922 k = kh_get(vdict, d, t);
1923 v->errcode = BCF_ERR_TAG_UNDEF;
1924 if (k == kh_end(d)) {
1925 hts_log_error("Could not add dummy header for FORMAT '%s'", t);
1926 v->errcode |= BCF_ERR_TAG_INVALID;
1927 return -1;
1930 fmt[j].max_l = fmt[j].max_m = fmt[j].max_g = 0;
1931 fmt[j].key = kh_val(d, k).id;
1932 fmt[j].is_gt = !strcmp(t, "GT");
1933 fmt[j].y = h->id[0][fmt[j].key].val->info[BCF_HL_FMT];
1934 v->n_fmt++;
1936 // compute max
1937 int n_sample_ori = -1;
1938 r = q + 1; // r: position in the format string
1939 l = 0, m = g = 1, v->n_sample = 0; // m: max vector size, l: max field len, g: max number of alleles
1940 while ( r<end )
1942 // can we skip some samples?
1943 if ( h->keep_samples )
1945 n_sample_ori++;
1946 if ( !bit_array_test(h->keep_samples,n_sample_ori) )
1948 while ( *r!='\t' && r<end ) r++;
1949 if ( *r=='\t' ) { *r = 0; r++; }
1950 continue;
1954 // collect fmt stats: max vector size, length, number of alleles
1955 j = 0; // j-th format field
1956 for (;;)
1958 if ( *r == '\t' ) *r = 0;
1959 if ( *r == ':' || !*r ) // end of field or end of sample
1961 if (fmt[j].max_m < m) fmt[j].max_m = m;
1962 if (fmt[j].max_l < l) fmt[j].max_l = l;
1963 if (fmt[j].is_gt && fmt[j].max_g < g) fmt[j].max_g = g;
1964 l = 0, m = g = 1;
1965 if ( *r==':' )
1967 j++;
1968 if ( j>=v->n_fmt )
1970 hts_log_error("Incorrect number of FORMAT fields at %s:%d",
1971 h->id[BCF_DT_CTG][v->rid].key, v->pos+1);
1972 exit(1);
1975 else break;
1977 else if ( *r== ',' ) m++;
1978 else if ( fmt[j].is_gt && (*r == '|' || *r == '/') ) g++;
1979 if ( r>=end ) break;
1980 r++; l++;
1982 v->n_sample++;
1983 if ( v->n_sample == bcf_hdr_nsamples(h) ) break;
1984 r++;
1987 // allocate memory for arrays
1988 for (j = 0; j < v->n_fmt; ++j) {
1989 fmt_aux_t *f = &fmt[j];
1990 if ( !f->max_m ) f->max_m = 1; // omitted trailing format field
1991 if ((f->y>>4&0xf) == BCF_HT_STR) {
1992 f->size = f->is_gt? f->max_g << 2 : f->max_l;
1993 } else if ((f->y>>4&0xf) == BCF_HT_REAL || (f->y>>4&0xf) == BCF_HT_INT) {
1994 f->size = f->max_m << 2;
1995 } else
1997 hts_log_error("The format type %d is currently not supported", f->y>>4&0xf);
1998 abort(); // I do not know how to do with Flag in the genotype fields
2000 align_mem(mem);
2001 f->offset = mem->l;
2002 ks_resize(mem, mem->l + v->n_sample * f->size);
2003 mem->l += v->n_sample * f->size;
2005 for (j = 0; j < v->n_fmt; ++j)
2006 fmt[j].buf = (uint8_t*)mem->s + fmt[j].offset;
2007 // fill the sample fields; at beginning of the loop, t points to the first char of a format
2008 n_sample_ori = -1;
2009 t = q + 1; m = 0; // m: sample id
2010 while ( t<end )
2012 // can we skip some samples?
2013 if ( h->keep_samples )
2015 n_sample_ori++;
2016 if ( !bit_array_test(h->keep_samples,n_sample_ori) )
2018 while ( *t && t<end ) t++;
2019 t++;
2020 continue;
2023 if ( m == bcf_hdr_nsamples(h) ) break;
2025 j = 0; // j-th format field, m-th sample
2026 while ( t < end )
2028 fmt_aux_t *z = &fmt[j++];
2029 if ((z->y>>4&0xf) == BCF_HT_STR) {
2030 if (z->is_gt) { // genotypes
2031 int32_t is_phased = 0, *x = (int32_t*)(z->buf + z->size * m);
2032 for (l = 0;; ++t) {
2033 if (*t == '.') ++t, x[l++] = is_phased;
2034 else x[l++] = (strtol(t, &t, 10) + 1) << 1 | is_phased;
2035 #if THOROUGH_SANITY_CHECKS
2036 assert( 0 ); // success of strtol,strtod not checked
2037 #endif
2038 is_phased = (*t == '|');
2039 if (*t != '|' && *t != '/') break;
2041 if ( !l ) x[l++] = 0; // An empty field, insert missing value
2042 for (; l < z->size>>2; ++l) x[l] = bcf_int32_vector_end;
2043 } else {
2044 char *x = (char*)z->buf + z->size * m;
2045 for (r = t, l = 0; *t != ':' && *t; ++t) x[l++] = *t;
2046 for (; l < z->size; ++l) x[l] = 0;
2048 } else if ((z->y>>4&0xf) == BCF_HT_INT) {
2049 int32_t *x = (int32_t*)(z->buf + z->size * m);
2050 for (l = 0;; ++t) {
2051 if (*t == '.') x[l++] = bcf_int32_missing, ++t; // ++t to skip "."
2052 else x[l++] = strtol(t, &t, 10);
2053 if (*t != ',') break;
2055 if ( !l ) x[l++] = bcf_int32_missing;
2056 for (; l < z->size>>2; ++l) x[l] = bcf_int32_vector_end;
2057 } else if ((z->y>>4&0xf) == BCF_HT_REAL) {
2058 float *x = (float*)(z->buf + z->size * m);
2059 for (l = 0;; ++t) {
2060 if (*t == '.' && !isdigit_c(t[1])) bcf_float_set_missing(x[l++]), ++t; // ++t to skip "."
2061 else x[l++] = strtod(t, &t);
2062 if (*t != ',') break;
2064 if ( !l ) bcf_float_set_missing(x[l++]); // An empty field, insert missing value
2065 for (; l < z->size>>2; ++l) bcf_float_set_vector_end(x[l]);
2066 } else abort();
2068 if (*t == '\0') {
2069 break;
2071 else if (*t == ':') {
2072 t++;
2074 else {
2075 char buffer[8];
2076 hts_log_error("Invalid character '%s' in '%s' FORMAT field at %s:%d",
2077 dump_char(buffer, *t), h->id[BCF_DT_ID][z->key].key, bcf_seqname(h,v), v->pos+1);
2078 v->errcode |= BCF_ERR_CHAR;
2079 return -1;
2083 for (; j < v->n_fmt; ++j) { // fill end-of-vector values
2084 fmt_aux_t *z = &fmt[j];
2085 if ((z->y>>4&0xf) == BCF_HT_STR) {
2086 if (z->is_gt) {
2087 int32_t *x = (int32_t*)(z->buf + z->size * m);
2088 if (z->size) x[0] = bcf_int32_missing;
2089 for (l = 1; l < z->size>>2; ++l) x[l] = bcf_int32_vector_end;
2090 } else {
2091 char *x = (char*)z->buf + z->size * m;
2092 if ( z->size ) x[0] = '.';
2093 for (l = 1; l < z->size; ++l) x[l] = 0;
2095 } else if ((z->y>>4&0xf) == BCF_HT_INT) {
2096 int32_t *x = (int32_t*)(z->buf + z->size * m);
2097 x[0] = bcf_int32_missing;
2098 for (l = 1; l < z->size>>2; ++l) x[l] = bcf_int32_vector_end;
2099 } else if ((z->y>>4&0xf) == BCF_HT_REAL) {
2100 float *x = (float*)(z->buf + z->size * m);
2101 bcf_float_set_missing(x[0]);
2102 for (l = 1; l < z->size>>2; ++l) bcf_float_set_vector_end(x[l]);
2106 m++; t++;
2109 // write individual genotype information
2110 kstring_t *str = &v->indiv;
2111 int i;
2112 if (v->n_sample > 0) {
2113 for (i = 0; i < v->n_fmt; ++i) {
2114 fmt_aux_t *z = &fmt[i];
2115 bcf_enc_int1(str, z->key);
2116 if ((z->y>>4&0xf) == BCF_HT_STR && !z->is_gt) {
2117 bcf_enc_size(str, z->size, BCF_BT_CHAR);
2118 kputsn((char*)z->buf, z->size * v->n_sample, str);
2119 } else if ((z->y>>4&0xf) == BCF_HT_INT || z->is_gt) {
2120 bcf_enc_vint(str, (z->size>>2) * v->n_sample, (int32_t*)z->buf, z->size>>2);
2121 } else {
2122 bcf_enc_size(str, z->size>>2, BCF_BT_FLOAT);
2123 if (serialize_float_array(str, (z->size>>2) * v->n_sample,
2124 (float *) z->buf) != 0) {
2125 v->errcode |= BCF_ERR_LIMITS;
2126 hts_log_error("Out of memory");
2127 return -1;
2133 if ( v->n_sample!=bcf_hdr_nsamples(h) )
2135 hts_log_error("Number of columns at %s:%d does not match the number of samples (%d vs %d)",
2136 bcf_seqname(h,v), v->pos+1, v->n_sample, bcf_hdr_nsamples(h));
2137 v->errcode |= BCF_ERR_NCOLS;
2138 return -1;
2140 if ( v->indiv.l > 0xffffffff )
2142 hts_log_error("The FORMAT at %s:%d is too long", bcf_seqname(h,v), v->pos+1);
2143 v->errcode |= BCF_ERR_LIMITS;
2145 // Error recovery: return -1 if this is a critical error or 0 if we want to ignore the FORMAT and proceed
2146 v->n_fmt = 0;
2147 return -1;
2150 return 0;
2153 int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v)
2155 int i = 0;
2156 char *p, *q, *r, *t;
2157 kstring_t *str;
2158 khint_t k;
2159 ks_tokaux_t aux;
2160 int max_n_flt = 0, max_n_val = 0;
2161 int32_t *flt_a = NULL, *val_a = NULL;
2162 int ret = -1;
2164 // Assumed in lots of places, but we may as well spot this early
2165 assert(sizeof(float) == sizeof(int32_t));
2167 bcf_clear1(v);
2168 str = &v->shared;
2169 memset(&aux, 0, sizeof(ks_tokaux_t));
2170 for (p = kstrtok(s->s, "\t", &aux), i = 0; p; p = kstrtok(0, 0, &aux), ++i) {
2171 q = (char*)aux.p;
2172 *q = 0;
2173 if (i == 0) { // CHROM
2174 vdict_t *d = (vdict_t*)h->dict[BCF_DT_CTG];
2175 k = kh_get(vdict, d, p);
2176 if (k == kh_end(d))
2178 // Simple error recovery for chromosomes not defined in the header. It will not help when VCF header has
2179 // been already printed, but will enable tools like vcfcheck to proceed.
2180 hts_log_warning("Contig '%s' is not defined in the header. (Quick workaround: index the file with tabix.)", p);
2181 kstring_t tmp = {0,0,0};
2182 int l;
2183 ksprintf(&tmp, "##contig=<ID=%s>", p);
2184 bcf_hrec_t *hrec = bcf_hdr_parse_line(h,tmp.s,&l);
2185 free(tmp.s);
2186 if ( bcf_hdr_add_hrec((bcf_hdr_t*)h, hrec) ) bcf_hdr_sync((bcf_hdr_t*)h);
2187 k = kh_get(vdict, d, p);
2188 v->errcode = BCF_ERR_CTG_UNDEF;
2189 if (k == kh_end(d)) {
2190 hts_log_error("Could not add dummy header for contig '%s'", p);
2191 v->errcode |= BCF_ERR_CTG_INVALID;
2192 goto err;
2195 v->rid = kh_val(d, k).id;
2196 } else if (i == 1) { // POS
2197 v->pos = atoi(p) - 1;
2198 } else if (i == 2) { // ID
2199 if (strcmp(p, ".")) bcf_enc_vchar(str, q - p, p);
2200 else bcf_enc_size(str, 0, BCF_BT_CHAR);
2201 } else if (i == 3) { // REF
2202 bcf_enc_vchar(str, q - p, p);
2203 v->n_allele = 1, v->rlen = q - p;
2204 } else if (i == 4) { // ALT
2205 if (strcmp(p, ".")) {
2206 for (r = t = p;; ++r) {
2207 if (*r == ',' || *r == 0) {
2208 bcf_enc_vchar(str, r - t, t);
2209 t = r + 1;
2210 ++v->n_allele;
2212 if (r == q) break;
2215 } else if (i == 5) { // QUAL
2216 if (strcmp(p, ".")) v->qual = atof(p);
2217 else bcf_float_set_missing(v->qual);
2218 if ( v->max_unpack && !(v->max_unpack>>1) ) goto end; // BCF_UN_STR
2219 } else if (i == 6) { // FILTER
2220 if (strcmp(p, ".")) {
2221 int n_flt = 1, i;
2222 ks_tokaux_t aux1;
2223 vdict_t *d = (vdict_t*)h->dict[BCF_DT_ID];
2224 // count the number of filters
2225 if (*(q-1) == ';') *(q-1) = 0;
2226 for (r = p; *r; ++r)
2227 if (*r == ';') ++n_flt;
2228 if (n_flt > max_n_flt) {
2229 int32_t *a = realloc(flt_a, n_flt * sizeof(int32_t));
2230 if (!a) {
2231 hts_log_error("Could not allocate memory");
2232 v->errcode |= BCF_ERR_LIMITS; // No appropriate code?
2233 goto err;
2235 max_n_flt = n_flt;
2236 flt_a = a;
2238 // add filters
2239 for (t = kstrtok(p, ";", &aux1), i = 0; t; t = kstrtok(0, 0, &aux1)) {
2240 *(char*)aux1.p = 0;
2241 k = kh_get(vdict, d, t);
2242 if (k == kh_end(d))
2244 // Simple error recovery for FILTERs not defined in the header. It will not help when VCF header has
2245 // been already printed, but will enable tools like vcfcheck to proceed.
2246 hts_log_warning("FILTER '%s' is not defined in the header", t);
2247 kstring_t tmp = {0,0,0};
2248 int l;
2249 ksprintf(&tmp, "##FILTER=<ID=%s,Description=\"Dummy\">", t);
2250 bcf_hrec_t *hrec = bcf_hdr_parse_line(h,tmp.s,&l);
2251 free(tmp.s);
2252 if ( bcf_hdr_add_hrec((bcf_hdr_t*)h, hrec) ) bcf_hdr_sync((bcf_hdr_t*)h);
2253 k = kh_get(vdict, d, t);
2254 v->errcode = BCF_ERR_TAG_UNDEF;
2255 if (k == kh_end(d)) {
2256 hts_log_error("Could not add dummy header for FILTER '%s'", t);
2257 v->errcode |= BCF_ERR_TAG_INVALID;
2258 goto err;
2261 flt_a[i++] = kh_val(d, k).id;
2263 n_flt = i;
2264 bcf_enc_vint(str, n_flt, flt_a, -1);
2265 } else bcf_enc_vint(str, 0, 0, -1);
2266 if ( v->max_unpack && !(v->max_unpack>>2) ) goto end; // BCF_UN_FLT
2267 } else if (i == 7) { // INFO
2268 char *key;
2269 vdict_t *d = (vdict_t*)h->dict[BCF_DT_ID];
2270 v->n_info = 0;
2271 if (strcmp(p, ".")) {
2272 if (*(q-1) == ';') *(q-1) = 0;
2273 for (r = key = p;; ++r) {
2274 int c;
2275 char *val, *end;
2276 if (*r != ';' && *r != '=' && *r != 0) continue;
2277 val = end = 0;
2278 c = *r; *r = 0;
2279 if (c == '=') {
2280 val = r + 1;
2281 for (end = val; *end != ';' && *end != 0; ++end);
2282 c = *end; *end = 0;
2283 } else end = r;
2284 if ( !*key ) { if (c==0) break; r = end; key = r + 1; continue; } // faulty VCF, ";;" in the INFO
2285 k = kh_get(vdict, d, key);
2286 if (k == kh_end(d) || kh_val(d, k).info[BCF_HL_INFO] == 15)
2288 hts_log_warning("INFO '%s' is not defined in the header, assuming Type=String", key);
2289 kstring_t tmp = {0,0,0};
2290 int l;
2291 ksprintf(&tmp, "##INFO=<ID=%s,Number=1,Type=String,Description=\"Dummy\">", key);
2292 bcf_hrec_t *hrec = bcf_hdr_parse_line(h,tmp.s,&l);
2293 free(tmp.s);
2294 if ( bcf_hdr_add_hrec((bcf_hdr_t*)h, hrec) ) bcf_hdr_sync((bcf_hdr_t*)h);
2295 k = kh_get(vdict, d, key);
2296 v->errcode = BCF_ERR_TAG_UNDEF;
2297 if (k == kh_end(d)) {
2298 hts_log_error("Could not add dummy header for INFO '%s'", key);
2299 v->errcode |= BCF_ERR_TAG_INVALID;
2300 goto err;
2303 uint32_t y = kh_val(d, k).info[BCF_HL_INFO];
2304 ++v->n_info;
2305 bcf_enc_int1(str, kh_val(d, k).id);
2306 if (val == 0) {
2307 bcf_enc_size(str, 0, BCF_BT_NULL);
2308 } else if ((y>>4&0xf) == BCF_HT_FLAG || (y>>4&0xf) == BCF_HT_STR) { // if Flag has a value, treat it as a string
2309 bcf_enc_vchar(str, end - val, val);
2310 } else { // int/float value/array
2311 int i, n_val;
2312 char *t, *te;
2313 for (t = val, n_val = 1; *t; ++t) // count the number of values
2314 if (*t == ',') ++n_val;
2315 // Check both int and float size in one step for simplicity
2316 if (n_val > max_n_val) {
2317 int32_t *z;
2318 z = (int32_t *)realloc((void *)val_a, n_val * sizeof(*z));
2319 if (!z) {
2320 hts_log_error("Could not allocate memory");
2321 v->errcode |= BCF_ERR_LIMITS; // No appropriate code?
2322 goto err;
2324 max_n_val = n_val;
2325 val_a = z;
2327 if ((y>>4&0xf) == BCF_HT_INT) {
2328 for (i = 0, t = val; i < n_val; ++i, ++t)
2330 val_a[i] = strtol(t, &te, 10);
2331 if ( te==t ) // conversion failed
2333 val_a[i] = bcf_int32_missing;
2334 while ( *te && *te!=',' ) te++;
2336 t = te;
2338 bcf_enc_vint(str, n_val, val_a, -1);
2339 if (strcmp(key, "END") == 0) v->rlen = val_a[0] - v->pos;
2340 } else if ((y>>4&0xf) == BCF_HT_REAL) {
2341 float *val_f = (float *)val_a;
2342 for (i = 0, t = val; i < n_val; ++i, ++t)
2344 val_f[i] = strtod(t, &te);
2345 if ( te==t ) // conversion failed
2347 bcf_float_set_missing(val_f[i]);
2348 while ( *te && *te!=',' ) te++;
2350 t = te;
2352 bcf_enc_vfloat(str, n_val, val_f);
2355 if (c == 0) break;
2356 r = end;
2357 key = r + 1;
2360 if ( v->max_unpack && !(v->max_unpack>>3) ) goto end;
2361 } else if (i == 8) {// FORMAT
2362 free(flt_a);
2363 free(val_a);
2364 return vcf_parse_format(s, h, v, p, q);
2368 end:
2369 ret = 0;
2371 err:
2372 free(flt_a);
2373 free(val_a);
2374 return ret;
2377 int vcf_read(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
2379 int ret;
2380 ret = hts_getline(fp, KS_SEP_LINE, &fp->line);
2381 if (ret < 0) return ret;
2382 return vcf_parse1(&fp->line, h, v);
2385 static inline uint8_t *bcf_unpack_fmt_core1(uint8_t *ptr, int n_sample, bcf_fmt_t *fmt)
2387 uint8_t *ptr_start = ptr;
2388 fmt->id = bcf_dec_typed_int1(ptr, &ptr);
2389 fmt->n = bcf_dec_size(ptr, &ptr, &fmt->type);
2390 fmt->size = fmt->n << bcf_type_shift[fmt->type];
2391 fmt->p = ptr;
2392 fmt->p_off = ptr - ptr_start;
2393 fmt->p_free = 0;
2394 ptr += n_sample * fmt->size;
2395 fmt->p_len = ptr - fmt->p;
2396 return ptr;
2399 static inline uint8_t *bcf_unpack_info_core1(uint8_t *ptr, bcf_info_t *info)
2401 uint8_t *ptr_start = ptr;
2402 info->key = bcf_dec_typed_int1(ptr, &ptr);
2403 info->len = bcf_dec_size(ptr, &ptr, &info->type);
2404 info->vptr = ptr;
2405 info->vptr_off = ptr - ptr_start;
2406 info->vptr_free = 0;
2407 info->v1.i = 0;
2408 if (info->len == 1) {
2409 if (info->type == BCF_BT_INT8 || info->type == BCF_BT_CHAR) info->v1.i = *(int8_t*)ptr;
2410 else if (info->type == BCF_BT_INT32) info->v1.i = le_to_i32(ptr);
2411 else if (info->type == BCF_BT_FLOAT) info->v1.f = le_to_float(ptr);
2412 else if (info->type == BCF_BT_INT16) info->v1.i = le_to_i16(ptr);
2414 ptr += info->len << bcf_type_shift[info->type];
2415 info->vptr_len = ptr - info->vptr;
2416 return ptr;
2419 int bcf_unpack(bcf1_t *b, int which)
2421 if ( !b->shared.l ) return 0; // Building a new BCF record from scratch
2422 uint8_t *ptr = (uint8_t*)b->shared.s, *ptr_ori;
2423 int i;
2424 bcf_dec_t *d = &b->d;
2425 if (which & BCF_UN_FLT) which |= BCF_UN_STR;
2426 if (which & BCF_UN_INFO) which |= BCF_UN_SHR;
2427 if ((which&BCF_UN_STR) && !(b->unpacked&BCF_UN_STR))
2429 kstring_t tmp;
2431 // ID
2432 tmp.l = 0; tmp.s = d->id; tmp.m = d->m_id;
2433 ptr_ori = ptr;
2434 ptr = bcf_fmt_sized_array(&tmp, ptr);
2435 b->unpack_size[0] = ptr - ptr_ori;
2436 kputc('\0', &tmp);
2437 d->id = tmp.s; d->m_id = tmp.m;
2439 // REF and ALT are in a single block (d->als) and d->alleles are pointers into this block
2440 hts_expand(char*, b->n_allele, d->m_allele, d->allele); // NM: hts_expand() is a macro
2441 tmp.l = 0; tmp.s = d->als; tmp.m = d->m_als;
2442 ptr_ori = ptr;
2443 char *o = "";
2444 for (i = 0; i < b->n_allele; ++i) {
2445 d->allele[i] = o + tmp.l;
2446 ptr = bcf_fmt_sized_array(&tmp, ptr);
2447 kputc('\0', &tmp);
2449 b->unpack_size[1] = ptr - ptr_ori;
2450 d->als = tmp.s; d->m_als = tmp.m;
2452 for (i = 0; i < b->n_allele; ++i)
2453 d->allele[i] = d->als + (d->allele[i]-o);
2454 b->unpacked |= BCF_UN_STR;
2456 if ((which&BCF_UN_FLT) && !(b->unpacked&BCF_UN_FLT)) { // FILTER
2457 ptr = (uint8_t*)b->shared.s + b->unpack_size[0] + b->unpack_size[1];
2458 ptr_ori = ptr;
2459 if (*ptr>>4) {
2460 int type;
2461 d->n_flt = bcf_dec_size(ptr, &ptr, &type);
2462 hts_expand(int, d->n_flt, d->m_flt, d->flt);
2463 for (i = 0; i < d->n_flt; ++i)
2464 d->flt[i] = bcf_dec_int1(ptr, type, &ptr);
2465 } else ++ptr, d->n_flt = 0;
2466 b->unpack_size[2] = ptr - ptr_ori;
2467 b->unpacked |= BCF_UN_FLT;
2469 if ((which&BCF_UN_INFO) && !(b->unpacked&BCF_UN_INFO)) { // INFO
2470 ptr = (uint8_t*)b->shared.s + b->unpack_size[0] + b->unpack_size[1] + b->unpack_size[2];
2471 hts_expand(bcf_info_t, b->n_info, d->m_info, d->info);
2472 for (i = 0; i < d->m_info; ++i) d->info[i].vptr_free = 0;
2473 for (i = 0; i < b->n_info; ++i)
2474 ptr = bcf_unpack_info_core1(ptr, &d->info[i]);
2475 b->unpacked |= BCF_UN_INFO;
2477 if ((which&BCF_UN_FMT) && b->n_sample && !(b->unpacked&BCF_UN_FMT)) { // FORMAT
2478 ptr = (uint8_t*)b->indiv.s;
2479 hts_expand(bcf_fmt_t, b->n_fmt, d->m_fmt, d->fmt);
2480 for (i = 0; i < d->m_fmt; ++i) d->fmt[i].p_free = 0;
2481 for (i = 0; i < b->n_fmt; ++i)
2482 ptr = bcf_unpack_fmt_core1(ptr, b->n_sample, &d->fmt[i]);
2483 b->unpacked |= BCF_UN_FMT;
2485 return 0;
2488 int vcf_format(const bcf_hdr_t *h, const bcf1_t *v, kstring_t *s)
2490 int i;
2491 bcf_unpack((bcf1_t*)v, BCF_UN_ALL);
2492 kputs(h->id[BCF_DT_CTG][v->rid].key, s); // CHROM
2493 kputc('\t', s); kputw(v->pos + 1, s); // POS
2494 kputc('\t', s); kputs(v->d.id ? v->d.id : ".", s); // ID
2495 kputc('\t', s); // REF
2496 if (v->n_allele > 0) kputs(v->d.allele[0], s);
2497 else kputc('.', s);
2498 kputc('\t', s); // ALT
2499 if (v->n_allele > 1) {
2500 for (i = 1; i < v->n_allele; ++i) {
2501 if (i > 1) kputc(',', s);
2502 kputs(v->d.allele[i], s);
2504 } else kputc('.', s);
2505 kputc('\t', s); // QUAL
2506 if ( bcf_float_is_missing(v->qual) ) kputc('.', s); // QUAL
2507 else kputd(v->qual, s);
2508 kputc('\t', s); // FILTER
2509 if (v->d.n_flt) {
2510 for (i = 0; i < v->d.n_flt; ++i) {
2511 if (i) kputc(';', s);
2512 kputs(h->id[BCF_DT_ID][v->d.flt[i]].key, s);
2514 } else kputc('.', s);
2515 kputc('\t', s); // INFO
2516 if (v->n_info) {
2517 int first = 1;
2518 for (i = 0; i < v->n_info; ++i) {
2519 bcf_info_t *z = &v->d.info[i];
2520 if ( !z->vptr ) continue;
2521 if ( !first ) kputc(';', s);
2522 first = 0;
2523 if (z->key >= h->n[BCF_DT_ID]) {
2524 hts_log_error("Invalid BCF, the INFO index is too large");
2525 errno = EINVAL;
2526 return -1;
2528 kputs(h->id[BCF_DT_ID][z->key].key, s);
2529 if (z->len <= 0) continue;
2530 kputc('=', s);
2531 if (z->len == 1)
2533 switch (z->type)
2535 case BCF_BT_INT8: if ( z->v1.i==bcf_int8_missing ) kputc('.', s); else kputw(z->v1.i, s); break;
2536 case BCF_BT_INT16: if ( z->v1.i==bcf_int16_missing ) kputc('.', s); else kputw(z->v1.i, s); break;
2537 case BCF_BT_INT32: if ( z->v1.i==bcf_int32_missing ) kputc('.', s); else kputw(z->v1.i, s); break;
2538 case BCF_BT_FLOAT: if ( bcf_float_is_missing(z->v1.f) ) kputc('.', s); else kputd(z->v1.f, s); break;
2539 case BCF_BT_CHAR: kputc(z->v1.i, s); break;
2540 default: hts_log_error("Unexpected type %d", z->type); exit(1); break;
2543 else bcf_fmt_array(s, z->len, z->type, z->vptr);
2545 if ( first ) kputc('.', s);
2546 } else kputc('.', s);
2547 // FORMAT and individual information
2548 if (v->n_sample)
2550 int i,j;
2551 if ( v->n_fmt)
2553 int gt_i = -1;
2554 bcf_fmt_t *fmt = v->d.fmt;
2555 int first = 1;
2556 for (i = 0; i < (int)v->n_fmt; ++i) {
2557 if ( !fmt[i].p ) continue;
2558 kputc(!first ? ':' : '\t', s); first = 0;
2559 if ( fmt[i].id<0 ) //!bcf_hdr_idinfo_exists(h,BCF_HL_FMT,fmt[i].id) )
2561 hts_log_error("Invalid BCF, the FORMAT tag id=%d not present in the header", fmt[i].id);
2562 abort();
2564 kputs(h->id[BCF_DT_ID][fmt[i].id].key, s);
2565 if (strcmp(h->id[BCF_DT_ID][fmt[i].id].key, "GT") == 0) gt_i = i;
2567 if ( first ) kputs("\t.", s);
2568 for (j = 0; j < v->n_sample; ++j) {
2569 kputc('\t', s);
2570 first = 1;
2571 for (i = 0; i < (int)v->n_fmt; ++i) {
2572 bcf_fmt_t *f = &fmt[i];
2573 if ( !f->p ) continue;
2574 if (!first) kputc(':', s);
2575 first = 0;
2576 if (gt_i == i)
2577 bcf_format_gt(f,j,s);
2578 else
2579 bcf_fmt_array(s, f->n, f->type, f->p + j * f->size);
2581 if ( first ) kputc('.', s);
2584 else
2585 for (j=0; j<=v->n_sample; j++)
2586 kputs("\t.", s);
2588 kputc('\n', s);
2589 return 0;
2592 int vcf_write_line(htsFile *fp, kstring_t *line)
2594 int ret;
2595 if ( line->s[line->l-1]!='\n' ) kputc('\n',line);
2596 if ( fp->format.compression!=no_compression )
2597 ret = bgzf_write(fp->fp.bgzf, line->s, line->l);
2598 else
2599 ret = hwrite(fp->fp.hfile, line->s, line->l);
2600 return ret==line->l ? 0 : -1;
2603 int vcf_write(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
2605 int ret;
2606 fp->line.l = 0;
2607 if (vcf_format1(h, v, &fp->line) != 0)
2608 return -1;
2609 if ( fp->format.compression!=no_compression )
2610 ret = bgzf_write(fp->fp.bgzf, fp->line.s, fp->line.l);
2611 else
2612 ret = hwrite(fp->fp.hfile, fp->line.s, fp->line.l);
2613 return ret==fp->line.l ? 0 : -1;
2616 /************************
2617 * Data access routines *
2618 ************************/
2620 int bcf_hdr_id2int(const bcf_hdr_t *h, int which, const char *id)
2622 khint_t k;
2623 vdict_t *d = (vdict_t*)h->dict[which];
2624 k = kh_get(vdict, d, id);
2625 return k == kh_end(d)? -1 : kh_val(d, k).id;
2629 /********************
2630 *** BCF indexing ***
2631 ********************/
2633 hts_idx_t *bcf_index(htsFile *fp, int min_shift)
2635 int n_lvls, i;
2636 bcf1_t *b = NULL;
2637 hts_idx_t *idx = NULL;
2638 bcf_hdr_t *h;
2639 int64_t max_len = 0, s;
2640 int r;
2641 h = bcf_hdr_read(fp);
2642 if ( !h ) return NULL;
2643 int nids = 0;
2644 for (i = 0; i < h->n[BCF_DT_CTG]; ++i)
2646 if ( !h->id[BCF_DT_CTG][i].val ) continue;
2647 if ( max_len < h->id[BCF_DT_CTG][i].val->info[0] ) max_len = h->id[BCF_DT_CTG][i].val->info[0];
2648 nids++;
2650 if ( !max_len ) max_len = ((int64_t)1<<31) - 1; // In case contig line is broken.
2651 max_len += 256;
2652 for (n_lvls = 0, s = 1<<min_shift; max_len > s; ++n_lvls, s <<= 3);
2653 idx = hts_idx_init(nids, HTS_FMT_CSI, bgzf_tell(fp->fp.bgzf), min_shift, n_lvls);
2654 if (!idx) goto fail;
2655 b = bcf_init1();
2656 if (!b) goto fail;
2657 while ((r = bcf_read1(fp,h, b)) >= 0) {
2658 int ret;
2659 ret = hts_idx_push(idx, b->rid, b->pos, b->pos + b->rlen, bgzf_tell(fp->fp.bgzf), 1);
2660 if (ret < 0) goto fail;
2662 if (r < -1) goto fail;
2663 hts_idx_finish(idx, bgzf_tell(fp->fp.bgzf));
2664 bcf_destroy1(b);
2665 bcf_hdr_destroy(h);
2666 return idx;
2668 fail:
2669 hts_idx_destroy(idx);
2670 bcf_destroy1(b);
2671 bcf_hdr_destroy(h);
2672 return NULL;
2675 hts_idx_t *bcf_index_load2(const char *fn, const char *fnidx)
2677 return fnidx? hts_idx_load2(fn, fnidx) : bcf_index_load(fn);
2680 int bcf_index_build3(const char *fn, const char *fnidx, int min_shift, int n_threads)
2682 htsFile *fp;
2683 hts_idx_t *idx;
2684 tbx_t *tbx;
2685 int ret;
2686 if ((fp = hts_open(fn, "rb")) == 0) return -2;
2687 if (n_threads)
2688 hts_set_threads(fp, n_threads);
2689 if ( fp->format.compression!=bgzf ) { hts_close(fp); return -3; }
2690 switch (fp->format.format) {
2691 case bcf:
2692 idx = bcf_index(fp, min_shift);
2693 if (idx) {
2694 ret = hts_idx_save_as(idx, fn, fnidx, HTS_FMT_CSI);
2695 if (ret < 0) ret = -4;
2696 hts_idx_destroy(idx);
2698 else ret = -1;
2699 break;
2701 case vcf:
2702 tbx = tbx_index(hts_get_bgzfp(fp), min_shift, &tbx_conf_vcf);
2703 if (tbx) {
2704 ret = hts_idx_save_as(tbx->idx, fn, fnidx, min_shift > 0 ? HTS_FMT_CSI : HTS_FMT_TBI);
2705 if (ret < 0) ret = -4;
2706 tbx_destroy(tbx);
2708 else ret = -1;
2709 break;
2711 default:
2712 ret = -3;
2713 break;
2715 hts_close(fp);
2716 return ret;
2719 int bcf_index_build2(const char *fn, const char *fnidx, int min_shift)
2721 return bcf_index_build3(fn, fnidx, min_shift, 0);
2724 int bcf_index_build(const char *fn, int min_shift)
2726 return bcf_index_build3(fn, NULL, min_shift, 0);
2729 /*****************
2730 *** Utilities ***
2731 *****************/
2733 int bcf_hdr_combine(bcf_hdr_t *dst, const bcf_hdr_t *src)
2735 int i, ndst_ori = dst->nhrec, need_sync = 0, ret = 0;
2736 for (i=0; i<src->nhrec; i++)
2738 if ( src->hrec[i]->type==BCF_HL_GEN && src->hrec[i]->value )
2740 int j;
2741 for (j=0; j<ndst_ori; j++)
2743 if ( dst->hrec[j]->type!=BCF_HL_GEN ) continue;
2745 // Checking only the key part of generic lines, otherwise
2746 // the VCFs are too verbose. Should we perhaps add a flag
2747 // to bcf_hdr_combine() and make this optional?
2748 if ( !strcmp(src->hrec[i]->key,dst->hrec[j]->key) ) break;
2750 if ( j>=ndst_ori )
2751 need_sync += bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i]));
2753 else if ( src->hrec[i]->type==BCF_HL_STR )
2755 // NB: we are ignoring fields without ID
2756 int j = bcf_hrec_find_key(src->hrec[i],"ID");
2757 if ( j>=0 )
2759 bcf_hrec_t *rec = bcf_hdr_get_hrec(dst, src->hrec[i]->type, "ID", src->hrec[i]->vals[j], src->hrec[i]->key);
2760 if ( !rec )
2761 need_sync += bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i]));
2764 else
2766 int j = bcf_hrec_find_key(src->hrec[i],"ID");
2767 assert( j>=0 ); // this should always be true for valid VCFs
2769 bcf_hrec_t *rec = bcf_hdr_get_hrec(dst, src->hrec[i]->type, "ID", src->hrec[i]->vals[j], NULL);
2770 if ( !rec )
2771 need_sync += bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i]));
2772 else if ( src->hrec[i]->type==BCF_HL_INFO || src->hrec[i]->type==BCF_HL_FMT )
2774 // Check that both records are of the same type. The bcf_hdr_id2length
2775 // macro cannot be used here because dst header is not synced yet.
2776 vdict_t *d_src = (vdict_t*)src->dict[BCF_DT_ID];
2777 vdict_t *d_dst = (vdict_t*)dst->dict[BCF_DT_ID];
2778 khint_t k_src = kh_get(vdict, d_src, src->hrec[i]->vals[0]);
2779 khint_t k_dst = kh_get(vdict, d_dst, src->hrec[i]->vals[0]);
2780 if ( (kh_val(d_src,k_src).info[rec->type]>>8 & 0xf) != (kh_val(d_dst,k_dst).info[rec->type]>>8 & 0xf) )
2782 hts_log_warning("Trying to combine \"%s\" tag definitions of different lengths",
2783 src->hrec[i]->vals[0]);
2784 ret |= 1;
2786 if ( (kh_val(d_src,k_src).info[rec->type]>>4 & 0xf) != (kh_val(d_dst,k_dst).info[rec->type]>>4 & 0xf) )
2788 hts_log_warning("Trying to combine \"%s\" tag definitions of different types",
2789 src->hrec[i]->vals[0]);
2790 ret |= 1;
2795 if ( need_sync ) bcf_hdr_sync(dst);
2796 return ret;
2799 bcf_hdr_t *bcf_hdr_merge(bcf_hdr_t *dst, const bcf_hdr_t *src)
2801 if ( !dst )
2803 // this will effectively strip existing IDX attributes from src to become dst
2804 dst = bcf_hdr_init("r");
2805 kstring_t htxt = {0,0,0};
2806 bcf_hdr_format(src, 0, &htxt);
2807 if ( bcf_hdr_parse(dst, htxt.s) < 0 ) {
2808 bcf_hdr_destroy(dst);
2809 dst = NULL;
2811 free(htxt.s);
2812 return dst;
2815 int i, ndst_ori = dst->nhrec, need_sync = 0, ret = 0;
2816 for (i=0; i<src->nhrec; i++)
2818 if ( src->hrec[i]->type==BCF_HL_GEN && src->hrec[i]->value )
2820 int j;
2821 for (j=0; j<ndst_ori; j++)
2823 if ( dst->hrec[j]->type!=BCF_HL_GEN ) continue;
2825 // Checking only the key part of generic lines, otherwise
2826 // the VCFs are too verbose. Should we perhaps add a flag
2827 // to bcf_hdr_combine() and make this optional?
2828 if ( !strcmp(src->hrec[i]->key,dst->hrec[j]->key) ) break;
2830 if ( j>=ndst_ori )
2831 need_sync += bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i]));
2833 else if ( src->hrec[i]->type==BCF_HL_STR )
2835 // NB: we are ignoring fields without ID
2836 int j = bcf_hrec_find_key(src->hrec[i],"ID");
2837 if ( j>=0 )
2839 bcf_hrec_t *rec = bcf_hdr_get_hrec(dst, src->hrec[i]->type, "ID", src->hrec[i]->vals[j], src->hrec[i]->key);
2840 if ( !rec )
2841 need_sync += bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i]));
2844 else
2846 int j = bcf_hrec_find_key(src->hrec[i],"ID");
2847 assert( j>=0 ); // this should always be true for valid VCFs
2849 bcf_hrec_t *rec = bcf_hdr_get_hrec(dst, src->hrec[i]->type, "ID", src->hrec[i]->vals[j], NULL);
2850 if ( !rec )
2851 need_sync += bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i]));
2852 else if ( src->hrec[i]->type==BCF_HL_INFO || src->hrec[i]->type==BCF_HL_FMT )
2854 // Check that both records are of the same type. The bcf_hdr_id2length
2855 // macro cannot be used here because dst header is not synced yet.
2856 vdict_t *d_src = (vdict_t*)src->dict[BCF_DT_ID];
2857 vdict_t *d_dst = (vdict_t*)dst->dict[BCF_DT_ID];
2858 khint_t k_src = kh_get(vdict, d_src, src->hrec[i]->vals[0]);
2859 khint_t k_dst = kh_get(vdict, d_dst, src->hrec[i]->vals[0]);
2860 if ( (kh_val(d_src,k_src).info[rec->type]>>8 & 0xf) != (kh_val(d_dst,k_dst).info[rec->type]>>8 & 0xf) )
2862 hts_log_warning("Trying to combine \"%s\" tag definitions of different lengths",
2863 src->hrec[i]->vals[0]);
2864 ret |= 1;
2866 if ( (kh_val(d_src,k_src).info[rec->type]>>4 & 0xf) != (kh_val(d_dst,k_dst).info[rec->type]>>4 & 0xf) )
2868 hts_log_warning("Trying to combine \"%s\" tag definitions of different types",
2869 src->hrec[i]->vals[0]);
2870 ret |= 1;
2875 if ( need_sync ) bcf_hdr_sync(dst);
2876 return dst;
2878 int bcf_translate(const bcf_hdr_t *dst_hdr, bcf_hdr_t *src_hdr, bcf1_t *line)
2880 int i;
2881 if ( line->errcode )
2883 hts_log_error("Unchecked error (%d), exiting", line->errcode);
2884 exit(1);
2886 if ( src_hdr->ntransl==-1 ) return 0; // no need to translate, all tags have the same id
2887 if ( !src_hdr->ntransl ) // called for the first time, see what needs translating
2889 int dict;
2890 for (dict=0; dict<2; dict++) // BCF_DT_ID and BCF_DT_CTG
2892 src_hdr->transl[dict] = (int*) malloc(src_hdr->n[dict]*sizeof(int));
2893 for (i=0; i<src_hdr->n[dict]; i++)
2895 if ( !src_hdr->id[dict][i].key ) // gap left after removed BCF header lines
2897 src_hdr->transl[dict][i] = -1;
2898 continue;
2900 src_hdr->transl[dict][i] = bcf_hdr_id2int(dst_hdr,dict,src_hdr->id[dict][i].key);
2901 if ( src_hdr->transl[dict][i]!=-1 && i!=src_hdr->transl[dict][i] ) src_hdr->ntransl++;
2904 if ( !src_hdr->ntransl )
2906 free(src_hdr->transl[0]); src_hdr->transl[0] = NULL;
2907 free(src_hdr->transl[1]); src_hdr->transl[1] = NULL;
2908 src_hdr->ntransl = -1;
2910 if ( src_hdr->ntransl==-1 ) return 0;
2912 bcf_unpack(line,BCF_UN_ALL);
2914 // CHROM
2915 if ( src_hdr->transl[BCF_DT_CTG][line->rid] >=0 ) line->rid = src_hdr->transl[BCF_DT_CTG][line->rid];
2917 // FILTER
2918 for (i=0; i<line->d.n_flt; i++)
2920 int src_id = line->d.flt[i];
2921 if ( src_hdr->transl[BCF_DT_ID][src_id] >=0 )
2922 line->d.flt[i] = src_hdr->transl[BCF_DT_ID][src_id];
2923 line->d.shared_dirty |= BCF1_DIRTY_FLT;
2926 // INFO
2927 for (i=0; i<line->n_info; i++)
2929 int src_id = line->d.info[i].key;
2930 int dst_id = src_hdr->transl[BCF_DT_ID][src_id];
2931 if ( dst_id<0 ) continue;
2932 int src_size = src_id>>7 ? ( src_id>>15 ? BCF_BT_INT32 : BCF_BT_INT16) : BCF_BT_INT8;
2933 int dst_size = dst_id>>7 ? ( dst_id>>15 ? BCF_BT_INT32 : BCF_BT_INT16) : BCF_BT_INT8;
2934 if ( src_size==dst_size ) // can overwrite
2936 line->d.info[i].key = dst_id;
2937 uint8_t *vptr = line->d.info[i].vptr - line->d.info[i].vptr_off;
2938 if ( dst_size==BCF_BT_INT8 ) { vptr[1] = (uint8_t)dst_id; }
2939 else if ( dst_size==BCF_BT_INT16 ) { *(uint16_t*)vptr = (uint16_t)dst_id; }
2940 else { *(uint32_t*)vptr = (uint32_t)dst_id; }
2942 else // must realloc
2944 bcf_info_t *info = &line->d.info[i];
2945 assert( !info->vptr_free );
2946 kstring_t str = {0,0,0};
2947 bcf_enc_int1(&str, dst_id);
2948 bcf_enc_size(&str, info->len,info->type);
2949 info->vptr_off = str.l;
2950 kputsn((char*)info->vptr, info->vptr_len, &str);
2951 info->vptr = (uint8_t*)str.s + info->vptr_off;
2952 info->vptr_free = 1;
2953 info->key = dst_id;
2954 line->d.shared_dirty |= BCF1_DIRTY_INF;
2958 // FORMAT
2959 for (i=0; i<line->n_fmt; i++)
2961 int src_id = line->d.fmt[i].id;
2962 int dst_id = src_hdr->transl[BCF_DT_ID][src_id];
2963 if ( dst_id<0 ) continue;
2964 int src_size = src_id>>7 ? ( src_id>>15 ? BCF_BT_INT32 : BCF_BT_INT16) : BCF_BT_INT8;
2965 int dst_size = dst_id>>7 ? ( dst_id>>15 ? BCF_BT_INT32 : BCF_BT_INT16) : BCF_BT_INT8;
2966 if ( src_size==dst_size ) // can overwrite
2968 line->d.fmt[i].id = dst_id;
2969 uint8_t *p = line->d.fmt[i].p - line->d.fmt[i].p_off; // pointer to the vector size (4bits) and BT type (4bits)
2970 if ( dst_size==BCF_BT_INT8 ) { p[1] = dst_id; }
2971 else if ( dst_size==BCF_BT_INT16 ) { uint8_t *x = (uint8_t*) &dst_id; p[1] = x[0]; p[2] = x[1]; }
2972 else { uint8_t *x = (uint8_t*) &dst_id; p[1] = x[0]; p[2] = x[1]; p[3] = x[2]; p[4] = x[3]; }
2974 else // must realloc
2976 bcf_fmt_t *fmt = &line->d.fmt[i];
2977 assert( !fmt->p_free );
2978 kstring_t str = {0,0,0};
2979 bcf_enc_int1(&str, dst_id);
2980 bcf_enc_size(&str, fmt->n, fmt->type);
2981 fmt->p_off = str.l;
2982 kputsn((char*)fmt->p, fmt->p_len, &str);
2983 fmt->p = (uint8_t*)str.s + fmt->p_off;
2984 fmt->p_free = 1;
2985 fmt->id = dst_id;
2986 line->d.indiv_dirty = 1;
2989 return 0;
2992 bcf_hdr_t *bcf_hdr_dup(const bcf_hdr_t *hdr)
2994 bcf_hdr_t *hout = bcf_hdr_init("r");
2995 if (!hout) {
2996 hts_log_error("Failed to allocate bcf header");
2997 return NULL;
2999 kstring_t htxt = {0,0,0};
3000 bcf_hdr_format(hdr, 1, &htxt);
3001 if ( bcf_hdr_parse(hout, htxt.s) < 0 ) {
3002 bcf_hdr_destroy(hout);
3003 hout = NULL;
3005 free(htxt.s);
3006 return hout;
3009 bcf_hdr_t *bcf_hdr_subset(const bcf_hdr_t *h0, int n, char *const* samples, int *imap)
3011 void *names_hash = khash_str2int_init();
3012 kstring_t htxt = {0,0,0};
3013 kstring_t str = {0,0,0};
3014 bcf_hdr_t *h = bcf_hdr_init("w");
3015 if (!h) {
3016 hts_log_error("Failed to allocate bcf header");
3017 return NULL;
3019 bcf_hdr_format(h0, 1, &htxt);
3020 bcf_hdr_set_version(h,bcf_hdr_get_version(h0));
3021 int j;
3022 for (j=0; j<n; j++) imap[j] = -1;
3023 if ( bcf_hdr_nsamples(h0) > 0) {
3024 char *p = find_chrom_header_line(htxt.s);
3025 int i = 0, end = n? 8 : 7;
3026 while ((p = strchr(p, '\t')) != 0 && i < end) ++i, ++p;
3027 if (i != end) {
3028 free(h); free(str.s);
3029 return 0; // malformated header
3031 kputsn(htxt.s, p - htxt.s, &str);
3032 for (i = 0; i < n; ++i) {
3033 if ( khash_str2int_has_key(names_hash,samples[i]) )
3035 hts_log_error("Duplicate sample name \"%s\"", samples[i]);
3036 free(str.s);
3037 free(htxt.s);
3038 khash_str2int_destroy(names_hash);
3039 bcf_hdr_destroy(h);
3040 return NULL;
3042 imap[i] = bcf_hdr_id2int(h0, BCF_DT_SAMPLE, samples[i]);
3043 if (imap[i] < 0) continue;
3044 kputc('\t', &str);
3045 kputs(samples[i], &str);
3046 khash_str2int_inc(names_hash,samples[i]);
3048 } else kputsn(htxt.s, htxt.l, &str);
3049 while (str.l && (!str.s[str.l-1] || str.s[str.l-1]=='\n') ) str.l--; // kill trailing zeros and newlines
3050 kputc('\n',&str);
3051 if ( bcf_hdr_parse(h, str.s) < 0 ) {
3052 bcf_hdr_destroy(h);
3053 h = NULL;
3055 free(str.s);
3056 free(htxt.s);
3057 khash_str2int_destroy(names_hash);
3058 return h;
3061 int bcf_hdr_set_samples(bcf_hdr_t *hdr, const char *samples, int is_file)
3063 if ( samples && !strcmp("-",samples) ) return 0; // keep all samples
3065 hdr->nsamples_ori = bcf_hdr_nsamples(hdr);
3066 if ( !samples ) { bcf_hdr_nsamples(hdr) = 0; return 0; } // exclude all samples
3068 int i, narr = bit_array_size(bcf_hdr_nsamples(hdr));
3069 hdr->keep_samples = (uint8_t*) calloc(narr,1);
3070 if ( samples[0]=='^' )
3071 for (i=0; i<bcf_hdr_nsamples(hdr); i++) bit_array_set(hdr->keep_samples,i);
3073 int idx, n, ret = 0;
3074 char **smpls = hts_readlist(samples[0]=='^'?samples+1:samples, is_file, &n);
3075 if ( !smpls ) return -1;
3076 for (i=0; i<n; i++)
3078 idx = bcf_hdr_id2int(hdr,BCF_DT_SAMPLE,smpls[i]);
3079 if ( idx<0 )
3081 if ( !ret ) ret = i+1;
3082 continue;
3084 assert( idx<bcf_hdr_nsamples(hdr) );
3085 if ( samples[0]=='^' )
3086 bit_array_clear(hdr->keep_samples, idx);
3087 else
3088 bit_array_set(hdr->keep_samples, idx);
3090 for (i=0; i<n; i++) free(smpls[i]);
3091 free(smpls);
3093 bcf_hdr_nsamples(hdr) = 0;
3094 for (i=0; i<hdr->nsamples_ori; i++)
3095 if ( bit_array_test(hdr->keep_samples,i) ) bcf_hdr_nsamples(hdr)++;
3096 if ( !bcf_hdr_nsamples(hdr) ) { free(hdr->keep_samples); hdr->keep_samples=NULL; }
3097 else
3099 char **samples = (char**) malloc(sizeof(char*)*bcf_hdr_nsamples(hdr));
3100 idx = 0;
3101 for (i=0; i<hdr->nsamples_ori; i++)
3102 if ( bit_array_test(hdr->keep_samples,i) ) samples[idx++] = strdup(hdr->samples[i]);
3103 free(hdr->samples);
3104 hdr->samples = samples;
3106 // delete original samples from the dictionary
3107 vdict_t *d = (vdict_t*)hdr->dict[BCF_DT_SAMPLE];
3108 int k;
3109 for (k = kh_begin(d); k != kh_end(d); ++k)
3110 if (kh_exist(d, k)) free((char*)kh_key(d, k));
3111 kh_destroy(vdict, d);
3113 // add the subset back
3114 hdr->dict[BCF_DT_SAMPLE] = d = kh_init(vdict);
3115 for (i=0; i<bcf_hdr_nsamples(hdr); i++)
3117 int ignore, k = kh_put(vdict, d, hdr->samples[i], &ignore);
3118 kh_val(d, k) = bcf_idinfo_def;
3119 kh_val(d, k).id = kh_size(d) - 1;
3121 bcf_hdr_sync(hdr);
3124 return ret;
3127 int bcf_subset(const bcf_hdr_t *h, bcf1_t *v, int n, int *imap)
3129 kstring_t ind;
3130 ind.s = 0; ind.l = ind.m = 0;
3131 if (n) {
3132 bcf_fmt_t fmt[MAX_N_FMT];
3133 int i, j;
3134 uint8_t *ptr = (uint8_t*)v->indiv.s;
3135 for (i = 0; i < v->n_fmt; ++i)
3136 ptr = bcf_unpack_fmt_core1(ptr, v->n_sample, &fmt[i]);
3137 for (i = 0; i < (int)v->n_fmt; ++i) {
3138 bcf_fmt_t *f = &fmt[i];
3139 bcf_enc_int1(&ind, f->id);
3140 bcf_enc_size(&ind, f->n, f->type);
3141 for (j = 0; j < n; ++j)
3142 if (imap[j] >= 0) kputsn((char*)(f->p + imap[j] * f->size), f->size, &ind);
3144 for (i = j = 0; j < n; ++j) if (imap[j] >= 0) ++i;
3145 v->n_sample = i;
3146 } else v->n_sample = 0;
3147 if ( !v->n_sample ) v->n_fmt = 0;
3148 free(v->indiv.s);
3149 v->indiv = ind;
3150 v->unpacked &= ~BCF_UN_FMT; // only BCF is ready for output, VCF will need to unpack again
3151 return 0;
3154 int bcf_is_snp(bcf1_t *v)
3156 int i;
3157 bcf_unpack(v, BCF_UN_STR);
3158 for (i = 0; i < v->n_allele; ++i)
3160 if ( v->d.allele[i][1]==0 ) continue;
3162 // mpileup's <X> allele, see also below. This is not completely satisfactory,
3163 // a general library is here narrowly tailored to fit samtools.
3164 if ( v->d.allele[i][0]=='<' && v->d.allele[i][1]=='X' && v->d.allele[i][2]=='>' ) continue;
3165 if ( v->d.allele[i][0]=='<' && v->d.allele[i][1]=='*' && v->d.allele[i][2]=='>' ) continue;
3167 break;
3169 return i == v->n_allele;
3172 static void bcf_set_variant_type(const char *ref, const char *alt, variant_t *var)
3174 // The most frequent case
3175 if ( !ref[1] && !alt[1] )
3177 if ( *alt == '.' || *ref==*alt ) { var->n = 0; var->type = VCF_REF; return; }
3178 if ( *alt == 'X' ) { var->n = 0; var->type = VCF_REF; return; } // mpileup's X allele shouldn't be treated as variant
3179 if ( *alt == '*' ) { var->n = 0; var->type = VCF_REF; return; }
3180 var->n = 1; var->type = VCF_SNP; return;
3182 if ( alt[0]=='<' )
3184 if ( alt[1]=='X' && alt[2]=='>' ) { var->n = 0; var->type = VCF_REF; return; } // mpileup's X allele shouldn't be treated as variant
3185 if ( alt[1]=='*' && alt[2]=='>' ) { var->n = 0; var->type = VCF_REF; return; }
3186 var->type = VCF_OTHER;
3187 return;
3190 const char *r = ref, *a = alt;
3191 while (*r && *a && toupper_c(*r)==toupper_c(*a) ) { r++; a++; } // unfortunately, matching REF,ALT case is not guaranteed
3193 if ( *a && !*r )
3195 if ( *a==']' || *a=='[' ) { var->type = VCF_BND; return; }
3196 while ( *a ) a++;
3197 var->n = (a-alt)-(r-ref); var->type = VCF_INDEL; return;
3199 else if ( *r && !*a )
3201 while ( *r ) r++;
3202 var->n = (a-alt)-(r-ref); var->type = VCF_INDEL; return;
3204 else if ( !*r && !*a )
3206 var->n = 0; var->type = VCF_REF; return;
3209 const char *re = r, *ae = a;
3210 while ( re[1] ) re++;
3211 while ( ae[1] ) ae++;
3212 while ( re>r && ae>a && toupper_c(*re)==toupper_c(*ae) ) { re--; ae--; }
3213 if ( ae==a )
3215 if ( re==r ) { var->n = 1; var->type = VCF_SNP; return; }
3216 var->n = -(re-r);
3217 if ( toupper_c(*re)==toupper_c(*ae) ) { var->type = VCF_INDEL; return; }
3218 var->type = VCF_OTHER; return;
3220 else if ( re==r )
3222 var->n = ae-a;
3223 if ( toupper_c(*re)==toupper_c(*ae) ) { var->type = VCF_INDEL; return; }
3224 var->type = VCF_OTHER; return;
3227 var->type = ( re-r == ae-a ) ? VCF_MNP : VCF_OTHER;
3228 var->n = ( re-r > ae-a ) ? -(re-r+1) : ae-a+1;
3230 // should do also complex events, SVs, etc...
3233 static void bcf_set_variant_types(bcf1_t *b)
3235 if ( !(b->unpacked & BCF_UN_STR) ) bcf_unpack(b, BCF_UN_STR);
3236 bcf_dec_t *d = &b->d;
3237 if ( d->n_var < b->n_allele )
3239 d->var = (variant_t *) realloc(d->var, sizeof(variant_t)*b->n_allele);
3240 d->n_var = b->n_allele;
3242 int i;
3243 b->d.var_type = 0;
3244 for (i=1; i<b->n_allele; i++)
3246 bcf_set_variant_type(d->allele[0],d->allele[i], &d->var[i]);
3247 b->d.var_type |= d->var[i].type;
3248 //fprintf(stderr,"[set_variant_type] %d %s %s -> %d %d .. %d\n", b->pos+1,d->allele[0],d->allele[i],d->var[i].type,d->var[i].n, b->d.var_type);
3252 int bcf_get_variant_types(bcf1_t *rec)
3254 if ( rec->d.var_type==-1 ) bcf_set_variant_types(rec);
3255 return rec->d.var_type;
3257 int bcf_get_variant_type(bcf1_t *rec, int ith_allele)
3259 if ( rec->d.var_type==-1 ) bcf_set_variant_types(rec);
3260 return rec->d.var[ith_allele].type;
3263 int bcf_update_info(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const void *values, int n, int type)
3265 // Is the field already present?
3266 int i, inf_id = bcf_hdr_id2int(hdr,BCF_DT_ID,key);
3267 if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_INFO,inf_id) ) return -1; // No such INFO field in the header
3268 if ( !(line->unpacked & BCF_UN_INFO) ) bcf_unpack(line, BCF_UN_INFO);
3270 for (i=0; i<line->n_info; i++)
3271 if ( inf_id==line->d.info[i].key ) break;
3272 bcf_info_t *inf = i==line->n_info ? NULL : &line->d.info[i];
3274 if ( !n || (type==BCF_HT_STR && !values) )
3276 if ( n==0 && !strcmp("END",key) )
3277 line->rlen = line->n_allele ? strlen(line->d.allele[0]) : 0;
3278 if ( inf )
3280 // Mark the tag for removal, free existing memory if necessary
3281 if ( inf->vptr_free )
3283 free(inf->vptr - inf->vptr_off);
3284 inf->vptr_free = 0;
3286 line->d.shared_dirty |= BCF1_DIRTY_INF;
3287 inf->vptr = NULL;
3288 inf->vptr_off = inf->vptr_len = 0;
3290 return 0;
3293 // Encode the values and determine the size required to accommodate the values
3294 kstring_t str = {0,0,0};
3295 bcf_enc_int1(&str, inf_id);
3296 if ( type==BCF_HT_INT )
3297 bcf_enc_vint(&str, n, (int32_t*)values, -1);
3298 else if ( type==BCF_HT_REAL )
3299 bcf_enc_vfloat(&str, n, (float*)values);
3300 else if ( type==BCF_HT_FLAG || type==BCF_HT_STR )
3302 if ( values==NULL )
3303 bcf_enc_size(&str, 0, BCF_BT_NULL);
3304 else
3305 bcf_enc_vchar(&str, strlen((char*)values), (char*)values);
3307 else
3309 hts_log_error("The type %d not implemented yet", type);
3310 abort();
3313 // Is the INFO tag already present
3314 if ( inf )
3316 // Is it big enough to accommodate new block?
3317 if ( str.l <= inf->vptr_len + inf->vptr_off )
3319 if ( str.l != inf->vptr_len + inf->vptr_off ) line->d.shared_dirty |= BCF1_DIRTY_INF;
3320 uint8_t *ptr = inf->vptr - inf->vptr_off;
3321 memcpy(ptr, str.s, str.l);
3322 free(str.s);
3323 int vptr_free = inf->vptr_free;
3324 bcf_unpack_info_core1(ptr, inf);
3325 inf->vptr_free = vptr_free;
3327 else
3329 assert( !inf->vptr_free ); // fix the caller or improve here: this has been modified before
3330 bcf_unpack_info_core1((uint8_t*)str.s, inf);
3331 inf->vptr_free = 1;
3332 line->d.shared_dirty |= BCF1_DIRTY_INF;
3335 else
3337 // The tag is not present, create new one
3338 line->n_info++;
3339 hts_expand0(bcf_info_t, line->n_info, line->d.m_info , line->d.info);
3340 inf = &line->d.info[line->n_info-1];
3341 bcf_unpack_info_core1((uint8_t*)str.s, inf);
3342 inf->vptr_free = 1;
3343 line->d.shared_dirty |= BCF1_DIRTY_INF;
3345 line->unpacked |= BCF_UN_INFO;
3347 if ( n==1 && !strcmp("END",key) ) line->rlen = ((int32_t*)values)[0] - line->pos;
3348 return 0;
3351 int bcf_update_format_string(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const char **values, int n)
3353 if ( !n )
3354 return bcf_update_format(hdr,line,key,NULL,0,BCF_HT_STR);
3356 int i, max_len = 0;
3357 for (i=0; i<n; i++)
3359 int len = strlen(values[i]);
3360 if ( len > max_len ) max_len = len;
3362 char *out = (char*) malloc(max_len*n);
3363 if ( !out ) return -2;
3364 for (i=0; i<n; i++)
3366 char *dst = out+i*max_len;
3367 const char *src = values[i];
3368 int j = 0;
3369 while ( src[j] ) { dst[j] = src[j]; j++; }
3370 for (; j<max_len; j++) dst[j] = 0;
3372 int ret = bcf_update_format(hdr,line,key,out,max_len*n,BCF_HT_STR);
3373 free(out);
3374 return ret;
3377 int bcf_update_format(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const void *values, int n, int type)
3379 // Is the field already present?
3380 int i, fmt_id = bcf_hdr_id2int(hdr,BCF_DT_ID,key);
3381 if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,fmt_id) )
3383 if ( !n ) return 0;
3384 return -1; // the key not present in the header
3387 if ( !(line->unpacked & BCF_UN_FMT) ) bcf_unpack(line, BCF_UN_FMT);
3389 for (i=0; i<line->n_fmt; i++)
3390 if ( line->d.fmt[i].id==fmt_id ) break;
3391 bcf_fmt_t *fmt = i==line->n_fmt ? NULL : &line->d.fmt[i];
3393 if ( !n )
3395 if ( fmt )
3397 // Mark the tag for removal, free existing memory if necessary
3398 if ( fmt->p_free )
3400 free(fmt->p - fmt->p_off);
3401 fmt->p_free = 0;
3403 line->d.indiv_dirty = 1;
3404 fmt->p = NULL;
3406 return 0;
3409 line->n_sample = bcf_hdr_nsamples(hdr);
3410 int nps = n / line->n_sample; // number of values per sample
3411 assert( nps && nps*line->n_sample==n ); // must be divisible by n_sample
3413 // Encode the values and determine the size required to accommodate the values
3414 kstring_t str = {0,0,0};
3415 bcf_enc_int1(&str, fmt_id);
3416 if ( type==BCF_HT_INT )
3417 bcf_enc_vint(&str, n, (int32_t*)values, nps);
3418 else if ( type==BCF_HT_REAL )
3420 bcf_enc_size(&str, nps, BCF_BT_FLOAT);
3421 serialize_float_array(&str, nps*line->n_sample, (float *) values);
3423 else if ( type==BCF_HT_STR )
3425 bcf_enc_size(&str, nps, BCF_BT_CHAR);
3426 kputsn((char*)values, nps*line->n_sample, &str);
3428 else
3430 hts_log_error("The type %d not implemented yet", type);
3431 abort();
3434 if ( !fmt )
3436 // Not present, new format field
3437 line->n_fmt++;
3438 hts_expand0(bcf_fmt_t, line->n_fmt, line->d.m_fmt, line->d.fmt);
3440 // Special case: VCF specification requires that GT is always first
3441 if ( line->n_fmt > 1 && key[0]=='G' && key[1]=='T' && !key[2] )
3443 for (i=line->n_fmt-1; i>0; i--)
3444 line->d.fmt[i] = line->d.fmt[i-1];
3445 fmt = &line->d.fmt[0];
3447 else
3448 fmt = &line->d.fmt[line->n_fmt-1];
3449 bcf_unpack_fmt_core1((uint8_t*)str.s, line->n_sample, fmt);
3450 line->d.indiv_dirty = 1;
3451 fmt->p_free = 1;
3453 else
3455 // The tag is already present, check if it is big enough to accomodate the new block
3456 if ( str.l <= fmt->p_len + fmt->p_off )
3458 // good, the block is big enough
3459 if ( str.l != fmt->p_len + fmt->p_off ) line->d.indiv_dirty = 1;
3460 uint8_t *ptr = fmt->p - fmt->p_off;
3461 memcpy(ptr, str.s, str.l);
3462 free(str.s);
3463 int p_free = fmt->p_free;
3464 bcf_unpack_fmt_core1(ptr, line->n_sample, fmt);
3465 fmt->p_free = p_free;
3467 else
3469 assert( !fmt->p_free ); // fix the caller or improve here: this has been modified before
3470 bcf_unpack_fmt_core1((uint8_t*)str.s, line->n_sample, fmt);
3471 fmt->p_free = 1;
3472 line->d.indiv_dirty = 1;
3475 line->unpacked |= BCF_UN_FMT;
3476 return 0;
3480 int bcf_update_filter(const bcf_hdr_t *hdr, bcf1_t *line, int *flt_ids, int n)
3482 if ( !(line->unpacked & BCF_UN_FLT) ) bcf_unpack(line, BCF_UN_FLT);
3483 line->d.shared_dirty |= BCF1_DIRTY_FLT;
3484 line->d.n_flt = n;
3485 if ( !n ) return 0;
3486 hts_expand(int, line->d.n_flt, line->d.m_flt, line->d.flt);
3487 int i;
3488 for (i=0; i<n; i++)
3489 line->d.flt[i] = flt_ids[i];
3490 return 0;
3493 int bcf_add_filter(const bcf_hdr_t *hdr, bcf1_t *line, int flt_id)
3495 if ( !(line->unpacked & BCF_UN_FLT) ) bcf_unpack(line, BCF_UN_FLT);
3496 int i;
3497 for (i=0; i<line->d.n_flt; i++)
3498 if ( flt_id==line->d.flt[i] ) break;
3499 if ( i<line->d.n_flt ) return 0; // this filter is already set
3500 line->d.shared_dirty |= BCF1_DIRTY_FLT;
3501 if ( flt_id==0 ) // set to PASS
3502 line->d.n_flt = 1;
3503 else if ( line->d.n_flt==1 && line->d.flt[0]==0 )
3504 line->d.n_flt = 1;
3505 else
3506 line->d.n_flt++;
3507 hts_expand(int, line->d.n_flt, line->d.m_flt, line->d.flt);
3508 line->d.flt[line->d.n_flt-1] = flt_id;
3509 return 1;
3511 int bcf_remove_filter(const bcf_hdr_t *hdr, bcf1_t *line, int flt_id, int pass)
3513 if ( !(line->unpacked & BCF_UN_FLT) ) bcf_unpack(line, BCF_UN_FLT);
3514 int i;
3515 for (i=0; i<line->d.n_flt; i++)
3516 if ( flt_id==line->d.flt[i] ) break;
3517 if ( i==line->d.n_flt ) return 0; // the filter is not present
3518 line->d.shared_dirty |= BCF1_DIRTY_FLT;
3519 if ( i!=line->d.n_flt-1 ) memmove(line->d.flt+i,line->d.flt+i+1,(line->d.n_flt-i-1)*sizeof(*line->d.flt));
3520 line->d.n_flt--;
3521 if ( !line->d.n_flt && pass ) bcf_add_filter(hdr,line,0);
3522 return 0;
3525 int bcf_has_filter(const bcf_hdr_t *hdr, bcf1_t *line, char *filter)
3527 if ( filter[0]=='.' && !filter[1] ) filter = "PASS";
3528 int id = bcf_hdr_id2int(hdr, BCF_DT_ID, filter);
3529 if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_FLT,id) ) return -1; // not defined in the header
3531 if ( !(line->unpacked & BCF_UN_FLT) ) bcf_unpack(line, BCF_UN_FLT);
3532 if ( id==0 && !line->d.n_flt) return 1; // PASS
3534 int i;
3535 for (i=0; i<line->d.n_flt; i++)
3536 if ( line->d.flt[i]==id ) return 1;
3537 return 0;
3540 static inline int _bcf1_sync_alleles(const bcf_hdr_t *hdr, bcf1_t *line, int nals)
3542 line->d.shared_dirty |= BCF1_DIRTY_ALS;
3544 line->n_allele = nals;
3545 hts_expand(char*, line->n_allele, line->d.m_allele, line->d.allele);
3547 char *als = line->d.als;
3548 int n = 0;
3549 while (n<nals)
3551 line->d.allele[n] = als;
3552 while ( *als ) als++;
3553 als++;
3554 n++;
3557 // Update REF length
3558 bcf_info_t *end_info = bcf_get_info(hdr,line,"END");
3559 line->rlen = end_info ? end_info->v1.i : strlen(line->d.allele[0]);
3561 return 0;
3563 int bcf_update_alleles(const bcf_hdr_t *hdr, bcf1_t *line, const char **alleles, int nals)
3565 kstring_t tmp = {0,0,0};
3566 char *free_old = NULL;
3568 // If the supplied alleles are not pointers to line->d.als, the existing block can be reused.
3569 int i;
3570 for (i=0; i<nals; i++)
3571 if ( alleles[i]>=line->d.als && alleles[i]<line->d.als+line->d.m_als ) break;
3572 if ( i==nals )
3574 // all alleles point elsewhere, reuse the existing block
3575 tmp.l = 0; tmp.s = line->d.als; tmp.m = line->d.m_als;
3577 else
3578 free_old = line->d.als;
3580 for (i=0; i<nals; i++)
3582 kputs(alleles[i], &tmp);
3583 kputc(0, &tmp);
3585 line->d.als = tmp.s; line->d.m_als = tmp.m;
3586 free(free_old);
3587 return _bcf1_sync_alleles(hdr,line,nals);
3590 int bcf_update_alleles_str(const bcf_hdr_t *hdr, bcf1_t *line, const char *alleles_string)
3592 kstring_t tmp;
3593 tmp.l = 0; tmp.s = line->d.als; tmp.m = line->d.m_als;
3594 kputs(alleles_string, &tmp);
3595 line->d.als = tmp.s; line->d.m_als = tmp.m;
3597 int nals = 1;
3598 char *t = line->d.als;
3599 while (*t)
3601 if ( *t==',' ) { *t = 0; nals++; }
3602 t++;
3604 return _bcf1_sync_alleles(hdr, line, nals);
3607 int bcf_update_id(const bcf_hdr_t *hdr, bcf1_t *line, const char *id)
3609 kstring_t tmp;
3610 tmp.l = 0; tmp.s = line->d.id; tmp.m = line->d.m_id;
3611 if ( id )
3612 kputs(id, &tmp);
3613 else
3614 kputs(".", &tmp);
3615 line->d.id = tmp.s; line->d.m_id = tmp.m;
3616 line->d.shared_dirty |= BCF1_DIRTY_ID;
3617 return 0;
3620 int bcf_add_id(const bcf_hdr_t *hdr, bcf1_t *line, const char *id)
3622 if ( !id ) return 0;
3624 kstring_t tmp;
3625 tmp.l = 0; tmp.s = line->d.id; tmp.m = line->d.m_id;
3627 int len = strlen(id);
3628 char *dst = line->d.id;
3629 while ( *dst && (dst=strstr(dst,id)) )
3631 if ( dst[len]!=0 && dst[len]!=';' ) dst++; // a prefix, not a match
3632 else if ( dst==line->d.id || dst[-1]==';' ) return 0; // already present
3633 dst++; // a suffix, not a match
3635 if ( line->d.id && (line->d.id[0]!='.' || line->d.id[1]) )
3637 tmp.l = strlen(line->d.id);
3638 kputc(';',&tmp);
3640 kputs(id,&tmp);
3642 line->d.id = tmp.s; line->d.m_id = tmp.m;
3643 line->d.shared_dirty |= BCF1_DIRTY_ID;
3644 return 0;
3648 bcf_fmt_t *bcf_get_fmt(const bcf_hdr_t *hdr, bcf1_t *line, const char *key)
3650 int id = bcf_hdr_id2int(hdr, BCF_DT_ID, key);
3651 if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,id) ) return NULL; // no such FMT field in the header
3652 return bcf_get_fmt_id(line, id);
3655 bcf_info_t *bcf_get_info(const bcf_hdr_t *hdr, bcf1_t *line, const char *key)
3657 int id = bcf_hdr_id2int(hdr, BCF_DT_ID, key);
3658 if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_INFO,id) ) return NULL; // no such INFO field in the header
3659 return bcf_get_info_id(line, id);
3662 bcf_fmt_t *bcf_get_fmt_id(bcf1_t *line, const int id)
3664 int i;
3665 if ( !(line->unpacked & BCF_UN_FMT) ) bcf_unpack(line, BCF_UN_FMT);
3666 for (i=0; i<line->n_fmt; i++)
3668 if ( line->d.fmt[i].id==id ) return &line->d.fmt[i];
3670 return NULL;
3673 bcf_info_t *bcf_get_info_id(bcf1_t *line, const int id)
3675 int i;
3676 if ( !(line->unpacked & BCF_UN_INFO) ) bcf_unpack(line, BCF_UN_INFO);
3677 for (i=0; i<line->n_info; i++)
3679 if ( line->d.info[i].key==id ) return &line->d.info[i];
3681 return NULL;
3685 int bcf_get_info_values(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, void **dst, int *ndst, int type)
3687 int i,j, tag_id = bcf_hdr_id2int(hdr, BCF_DT_ID, tag);
3688 if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_INFO,tag_id) ) return -1; // no such INFO field in the header
3689 if ( bcf_hdr_id2type(hdr,BCF_HL_INFO,tag_id)!=type ) return -2; // expected different type
3691 if ( !(line->unpacked & BCF_UN_INFO) ) bcf_unpack(line, BCF_UN_INFO);
3693 for (i=0; i<line->n_info; i++)
3694 if ( line->d.info[i].key==tag_id ) break;
3695 if ( i==line->n_info ) return ( type==BCF_HT_FLAG ) ? 0 : -3; // the tag is not present in this record
3696 if ( type==BCF_HT_FLAG ) return 1;
3698 bcf_info_t *info = &line->d.info[i];
3699 if ( !info->vptr ) return -3; // the tag was marked for removal
3700 if ( type==BCF_HT_STR )
3702 if ( *ndst < info->len+1 )
3704 *ndst = info->len + 1;
3705 *dst = realloc(*dst, *ndst);
3707 memcpy(*dst,info->vptr,info->len);
3708 ((uint8_t*)*dst)[info->len] = 0;
3709 return info->len;
3712 // Make sure the buffer is big enough
3713 int size1 = type==BCF_HT_INT ? sizeof(int32_t) : sizeof(float);
3714 if ( *ndst < info->len )
3716 *ndst = info->len;
3717 *dst = realloc(*dst, *ndst * size1);
3720 #define BRANCH(type_t, convert, is_missing, is_vector_end, set_missing, set_regular, out_type_t) { \
3721 out_type_t *tmp = (out_type_t *) *dst; \
3722 for (j=0; j<info->len; j++) \
3724 type_t p = convert(info->vptr + j * sizeof(type_t)); \
3725 if ( is_vector_end ) return j; \
3726 if ( is_missing ) set_missing; \
3727 else set_regular; \
3728 tmp++; \
3730 return j; \
3732 switch (info->type) {
3733 case BCF_BT_INT8: BRANCH(int8_t, le_to_i8, p==bcf_int8_missing, p==bcf_int8_vector_end, *tmp=bcf_int32_missing, *tmp=p, int32_t); break;
3734 case BCF_BT_INT16: BRANCH(int16_t, le_to_i16, p==bcf_int16_missing, p==bcf_int16_vector_end, *tmp=bcf_int32_missing, *tmp=p, int32_t); break;
3735 case BCF_BT_INT32: BRANCH(int32_t, le_to_i32, p==bcf_int32_missing, p==bcf_int32_vector_end, *tmp=bcf_int32_missing, *tmp=p, int32_t); break;
3736 case BCF_BT_FLOAT: BRANCH(uint32_t, le_to_u32, p==bcf_float_missing, p==bcf_float_vector_end, bcf_float_set_missing(*tmp), bcf_float_set(tmp, p), float); break;
3737 default: hts_log_error("Unexpected type %d", info->type); exit(1);
3739 #undef BRANCH
3740 return -4; // this can never happen
3743 int bcf_get_format_string(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, char ***dst, int *ndst)
3745 int i,tag_id = bcf_hdr_id2int(hdr, BCF_DT_ID, tag);
3746 if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,tag_id) ) return -1; // no such FORMAT field in the header
3747 if ( bcf_hdr_id2type(hdr,BCF_HL_FMT,tag_id)!=BCF_HT_STR ) return -2; // expected different type
3749 if ( !(line->unpacked & BCF_UN_FMT) ) bcf_unpack(line, BCF_UN_FMT);
3751 for (i=0; i<line->n_fmt; i++)
3752 if ( line->d.fmt[i].id==tag_id ) break;
3753 if ( i==line->n_fmt ) return -3; // the tag is not present in this record
3754 bcf_fmt_t *fmt = &line->d.fmt[i];
3755 if ( !fmt->p ) return -3; // the tag was marked for removal
3757 int nsmpl = bcf_hdr_nsamples(hdr);
3758 if ( !*dst )
3760 *dst = (char**) malloc(sizeof(char*)*nsmpl);
3761 if ( !*dst ) return -4; // could not alloc
3762 (*dst)[0] = NULL;
3764 int n = (fmt->n+1)*nsmpl;
3765 if ( *ndst < n )
3767 (*dst)[0] = realloc((*dst)[0], n);
3768 if ( !(*dst)[0] ) return -4; // could not alloc
3769 *ndst = n;
3771 for (i=0; i<nsmpl; i++)
3773 uint8_t *src = fmt->p + i*fmt->n;
3774 uint8_t *tmp = (uint8_t*)(*dst)[0] + i*(fmt->n+1);
3775 memcpy(tmp,src,fmt->n);
3776 tmp[fmt->n] = 0;
3777 (*dst)[i] = (char*) tmp;
3779 return n;
3782 int bcf_get_format_values(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, void **dst, int *ndst, int type)
3784 int i,j, tag_id = bcf_hdr_id2int(hdr, BCF_DT_ID, tag);
3785 if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,tag_id) ) return -1; // no such FORMAT field in the header
3786 if ( tag[0]=='G' && tag[1]=='T' && tag[2]==0 )
3788 // Ugly: GT field is considered to be a string by the VCF header but BCF represents it as INT.
3789 if ( bcf_hdr_id2type(hdr,BCF_HL_FMT,tag_id)!=BCF_HT_STR ) return -2;
3791 else if ( bcf_hdr_id2type(hdr,BCF_HL_FMT,tag_id)!=type ) return -2; // expected different type
3793 if ( !(line->unpacked & BCF_UN_FMT) ) bcf_unpack(line, BCF_UN_FMT);
3795 for (i=0; i<line->n_fmt; i++)
3796 if ( line->d.fmt[i].id==tag_id ) break;
3797 if ( i==line->n_fmt ) return -3; // the tag is not present in this record
3798 bcf_fmt_t *fmt = &line->d.fmt[i];
3799 if ( !fmt->p ) return -3; // the tag was marked for removal
3801 if ( type==BCF_HT_STR )
3803 int n = fmt->n*bcf_hdr_nsamples(hdr);
3804 if ( *ndst < n )
3806 *dst = realloc(*dst, n);
3807 if ( !*dst ) return -4; // could not alloc
3808 *ndst = n;
3810 memcpy(*dst,fmt->p,n);
3811 return n;
3814 // Make sure the buffer is big enough
3815 int nsmpl = bcf_hdr_nsamples(hdr);
3816 int size1 = type==BCF_HT_INT ? sizeof(int32_t) : sizeof(float);
3817 if ( *ndst < fmt->n*nsmpl )
3819 *ndst = fmt->n*nsmpl;
3820 *dst = realloc(*dst, *ndst*size1);
3821 if ( !dst ) return -4; // could not alloc
3824 #define BRANCH(type_t, convert, is_missing, is_vector_end, set_missing, set_vector_end, set_regular, out_type_t) { \
3825 out_type_t *tmp = (out_type_t *) *dst; \
3826 uint8_t *fmt_p = fmt->p; \
3827 for (i=0; i<nsmpl; i++) \
3829 for (j=0; j<fmt->n; j++) \
3831 type_t p = convert(fmt_p + j * sizeof(type_t)); \
3832 if ( is_missing ) set_missing; \
3833 else if ( is_vector_end ) { set_vector_end; break; } \
3834 else set_regular; \
3835 tmp++; \
3837 for (; j<fmt->n; j++) { set_vector_end; tmp++; } \
3838 fmt_p += fmt->size; \
3841 switch (fmt->type) {
3842 case BCF_BT_INT8: BRANCH(int8_t, le_to_i8, p==bcf_int8_missing, p==bcf_int8_vector_end, *tmp=bcf_int32_missing, *tmp=bcf_int32_vector_end, *tmp=p, int32_t); break;
3843 case BCF_BT_INT16: BRANCH(int16_t, le_to_i16, p==bcf_int16_missing, p==bcf_int16_vector_end, *tmp=bcf_int32_missing, *tmp=bcf_int32_vector_end, *tmp=p, int32_t); break;
3844 case BCF_BT_INT32: BRANCH(int32_t, le_to_i32, p==bcf_int32_missing, p==bcf_int32_vector_end, *tmp=bcf_int32_missing, *tmp=bcf_int32_vector_end, *tmp=p, int32_t); break;
3845 case BCF_BT_FLOAT: BRANCH(uint32_t, le_to_u32, p==bcf_float_missing, p==bcf_float_vector_end, bcf_float_set_missing(*tmp), bcf_float_set_vector_end(*tmp), bcf_float_set(tmp, p), float); break;
3846 default: hts_log_error("Unexpected type %d", fmt->type); exit(1);
3848 #undef BRANCH
3849 return nsmpl*fmt->n;