1 /* test/test-vcf-api.c -- VCF test harness.
3 Copyright (C) 2013, 2014 Genome Research Ltd.
5 Author: Petr Danecek <pd3@sanger.ac.uk>
7 Permission is hereby granted, free of charge, to any person obtaining a copy
8 of this software and associated documentation files (the "Software"), to deal
9 in the Software without restriction, including without limitation the rights
10 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11 copies of the Software, and to permit persons to whom the Software is
12 furnished to do so, subject to the following conditions:
14 The above copyright notice and this permission notice shall be included in
15 all copies or substantial portions of the Software.
17 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
22 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
23 DEALINGS IN THE SOFTWARE. */
28 #include <htslib/hts.h>
29 #include <htslib/vcf.h>
30 #include <htslib/kstring.h>
31 #include <htslib/kseq.h>
33 void write_bcf(char *fname
)
36 htsFile
*fp
= hts_open(fname
,"wb");
37 bcf_hdr_t
*hdr
= bcf_hdr_init("w");
38 bcf1_t
*rec
= bcf_init1();
41 kstring_t str
= {0,0,0};
42 bcf_hdr_append(hdr
, "##fileDate=20090805");
43 bcf_hdr_append(hdr
, "##FORMAT=<ID=UF,Number=1,Type=Integer,Description=\"Unused FORMAT\">");
44 bcf_hdr_append(hdr
, "##INFO=<ID=UI,Number=1,Type=Integer,Description=\"Unused INFO\">");
45 bcf_hdr_append(hdr
, "##FILTER=<ID=Flt,Description=\"Unused FILTER\">");
46 bcf_hdr_append(hdr
, "##unused=<XX=AA,Description=\"Unused generic\">");
47 bcf_hdr_append(hdr
, "##unused=unformatted text 1");
48 bcf_hdr_append(hdr
, "##unused=unformatted text 2");
49 bcf_hdr_append(hdr
, "##contig=<ID=Unused,length=62435964>");
50 bcf_hdr_append(hdr
, "##source=myImputationProgramV3.1");
51 bcf_hdr_append(hdr
, "##reference=file:///seq/references/1000GenomesPilot-NCBI36.fasta");
52 bcf_hdr_append(hdr
, "##contig=<ID=20,length=62435964,assembly=B36,md5=f126cdf8a6e0c7f379d618ff66beb2da,species=\"Homo sapiens\",taxonomy=x>");
53 bcf_hdr_append(hdr
, "##phasing=partial");
54 bcf_hdr_append(hdr
, "##INFO=<ID=NS,Number=1,Type=Integer,Description=\"Number of Samples With Data\">");
55 bcf_hdr_append(hdr
, "##INFO=<ID=DP,Number=1,Type=Integer,Description=\"Total Depth\">");
56 bcf_hdr_append(hdr
, "##INFO=<ID=AF,Number=A,Type=Float,Description=\"Allele Frequency\">");
57 bcf_hdr_append(hdr
, "##INFO=<ID=AA,Number=1,Type=String,Description=\"Ancestral Allele\">");
58 bcf_hdr_append(hdr
, "##INFO=<ID=DB,Number=0,Type=Flag,Description=\"dbSNP membership, build 129\">");
59 bcf_hdr_append(hdr
, "##INFO=<ID=H2,Number=0,Type=Flag,Description=\"HapMap2 membership\">");
60 bcf_hdr_append(hdr
, "##FILTER=<ID=q10,Description=\"Quality below 10\">");
61 bcf_hdr_append(hdr
, "##FILTER=<ID=s50,Description=\"Less than 50% of samples have data\">");
62 bcf_hdr_append(hdr
, "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">");
63 bcf_hdr_append(hdr
, "##FORMAT=<ID=GQ,Number=1,Type=Integer,Description=\"Genotype Quality\">");
64 bcf_hdr_append(hdr
, "##FORMAT=<ID=DP,Number=1,Type=Integer,Description=\"Read Depth\">");
65 bcf_hdr_append(hdr
, "##FORMAT=<ID=HQ,Number=2,Type=Integer,Description=\"Haplotype Quality\">");
66 bcf_hdr_append(hdr
, "##FORMAT=<ID=TS,Number=1,Type=String,Description=\"Test String\">");
68 bcf_hdr_add_sample(hdr
, "NA00001");
69 bcf_hdr_add_sample(hdr
, "NA00002");
70 bcf_hdr_add_sample(hdr
, "NA00003");
71 bcf_hdr_add_sample(hdr
, NULL
); // to update internal structures
72 bcf_hdr_write(fp
, hdr
);
76 // 20 14370 rs6054257 G A 29 PASS NS=3;DP=14;AF=0.5;DB;H2 GT:GQ:DP:HQ 0|0:48:1:51,51 1|0:48:8:51,51 1/1:43:5:.,.
78 rec
->rid
= bcf_hdr_name2id(hdr
, "20");
82 bcf_update_id(hdr
, rec
, "rs6054257");
84 bcf_update_alleles_str(hdr
, rec
, "G,A");
88 int32_t tmpi
= bcf_hdr_id2int(hdr
, BCF_DT_ID
, "PASS");
89 bcf_update_filter(hdr
, rec
, &tmpi
, 1);
92 bcf_update_info_int32(hdr
, rec
, "NS", &tmpi
, 1);
94 bcf_update_info_int32(hdr
, rec
, "DP", &tmpi
, 1);
96 bcf_update_info_float(hdr
, rec
, "AF", &tmpf
, 1);
97 bcf_update_info_flag(hdr
, rec
, "DB", NULL
, 1);
98 bcf_update_info_flag(hdr
, rec
, "H2", NULL
, 1);
100 int32_t *tmpia
= (int*)malloc(bcf_hdr_nsamples(hdr
)*2*sizeof(int));
101 tmpia
[0] = bcf_gt_phased(0);
102 tmpia
[1] = bcf_gt_phased(0);
103 tmpia
[2] = bcf_gt_phased(1);
104 tmpia
[3] = bcf_gt_phased(0);
105 tmpia
[4] = bcf_gt_unphased(1);
106 tmpia
[5] = bcf_gt_unphased(1);
107 bcf_update_genotypes(hdr
, rec
, tmpia
, bcf_hdr_nsamples(hdr
)*2);
111 bcf_update_format_int32(hdr
, rec
, "GQ", tmpia
, bcf_hdr_nsamples(hdr
));
115 bcf_update_format_int32(hdr
, rec
, "DP", tmpia
, bcf_hdr_nsamples(hdr
));
120 tmpia
[4] = bcf_int32_missing
;
121 tmpia
[5] = bcf_int32_missing
;
122 bcf_update_format_int32(hdr
, rec
, "HQ", tmpia
, bcf_hdr_nsamples(hdr
)*2);
123 char *tmp_str
[] = {"String1","SomeOtherString2","YetAnotherString3"};
124 bcf_update_format_string(hdr
, rec
, "TS", (const char**)tmp_str
, 3);
125 bcf_write1(fp
, hdr
, rec
);
127 // 20 1110696 . A G,T 67 . NS=2;DP=10;AF=0.333,.;AA=T;DB GT 2 1 ./.
129 rec
->rid
= bcf_hdr_name2id(hdr
, "20");
131 bcf_update_alleles_str(hdr
, rec
, "A,G,T");
134 bcf_update_info_int32(hdr
, rec
, "NS", &tmpi
, 1);
136 bcf_update_info_int32(hdr
, rec
, "DP", &tmpi
, 1);
137 float *tmpfa
= (float*)malloc(2*sizeof(float));
139 bcf_float_set_missing(tmpfa
[1]);
140 bcf_update_info_float(hdr
, rec
, "AF", tmpfa
, 2);
141 bcf_update_info_string(hdr
, rec
, "AA", "T");
142 bcf_update_info_flag(hdr
, rec
, "DB", NULL
, 1);
143 tmpia
[0] = bcf_gt_phased(2);
144 tmpia
[1] = bcf_int32_vector_end
;
145 tmpia
[2] = bcf_gt_phased(1);
146 tmpia
[3] = bcf_int32_vector_end
;
147 tmpia
[4] = bcf_gt_missing
;
148 tmpia
[5] = bcf_gt_missing
;
149 bcf_update_genotypes(hdr
, rec
, tmpia
, bcf_hdr_nsamples(hdr
)*2);
150 bcf_write1(fp
, hdr
, rec
);
158 bcf_hdr_destroy(hdr
);
160 if ( (ret
=hts_close(fp
)) )
162 fprintf(stderr
,"hts_close(%s): non-zero status %d\n",fname
,ret
);
167 void bcf_to_vcf(char *fname
)
169 htsFile
*fp
= hts_open(fname
,"rb");
170 bcf_hdr_t
*hdr
= bcf_hdr_read(fp
);
171 bcf1_t
*rec
= bcf_init1();
173 char *gz_fname
= (char*) malloc(strlen(fname
)+4);
174 snprintf(gz_fname
,strlen(fname
)+4,"%s.gz",fname
);
175 htsFile
*out
= hts_open(gz_fname
,"wg");
177 bcf_hdr_t
*hdr_out
= bcf_hdr_dup(hdr
);
178 bcf_hdr_remove(hdr_out
,BCF_HL_STR
,"unused");
179 bcf_hdr_remove(hdr_out
,BCF_HL_GEN
,"unused");
180 bcf_hdr_remove(hdr_out
,BCF_HL_FLT
,"Flt");
181 bcf_hdr_remove(hdr_out
,BCF_HL_INFO
,"UI");
182 bcf_hdr_remove(hdr_out
,BCF_HL_FMT
,"UF");
183 bcf_hdr_remove(hdr_out
,BCF_HL_CTG
,"Unused");
184 bcf_hdr_write(out
, hdr_out
);
186 while ( bcf_read1(fp
, hdr
, rec
)>=0 )
188 bcf_write1(out
, hdr_out
, rec
);
190 // Test problems caused by bcf1_sync: the data block
191 // may be realloced, also the unpacked structures must
193 bcf_unpack(rec
, BCF_UN_STR
);
194 bcf_update_id(hdr
, rec
, 0);
195 bcf_update_format_int32(hdr
, rec
, "GQ", NULL
, 0);
197 bcf1_t
*dup
= bcf_dup(rec
); // force bcf1_sync call
198 bcf_write1(out
, hdr_out
, dup
);
201 bcf_update_alleles_str(hdr_out
, rec
, "G,A");
203 bcf_update_info_int32(hdr_out
, rec
, "DP", &tmpi
, 1);
204 int32_t tmpia
[] = {9,9,9};
205 bcf_update_format_int32(hdr_out
, rec
, "DP", tmpia
, 3);
207 bcf_write1(out
, hdr_out
, rec
);
211 bcf_hdr_destroy(hdr
);
212 bcf_hdr_destroy(hdr_out
);
214 if ( (ret
=hts_close(fp
)) )
216 fprintf(stderr
,"hts_close(%s): non-zero status %d\n",fname
,ret
);
219 if ( (ret
=hts_close(out
)) )
221 fprintf(stderr
,"hts_close(%s): non-zero status %d\n",gz_fname
,ret
);
226 // read gzip, write stdout
227 htsFile
*gz_in
= hts_open(gz_fname
, "r");
230 fprintf(stderr
,"Could not read: %s\n", gz_fname
);
234 kstring_t line
= {0,0,0};
235 while ( hts_getline(gz_in
, KS_SEP_LINE
, &line
)>0 )
238 fwrite(line
.s
,1,line
.l
,stdout
);
241 if ( (ret
=hts_close(gz_in
)) )
243 fprintf(stderr
,"hts_close(%s): non-zero status %d\n",gz_fname
,ret
);
250 void iterator(const char *fname
)
252 htsFile
*fp
= hts_open(fname
, "r");
253 bcf_hdr_t
*hdr
= bcf_hdr_read(fp
);
257 bcf_index_build(fname
, 0);
258 idx
= bcf_index_load(fname
);
260 iter
= bcf_itr_queryi(idx
, bcf_hdr_name2id(hdr
, "20"), 1110600, 1110800);
261 bcf_itr_destroy(iter
);
263 iter
= bcf_itr_querys(idx
, hdr
, "20:1110600-1110800");
264 bcf_itr_destroy(iter
);
266 hts_idx_destroy(idx
);
267 bcf_hdr_destroy(hdr
);
269 if ( (ret
=hts_close(fp
)) )
271 fprintf(stderr
,"hts_close(%s): non-zero status %d\n",fname
,ret
);
276 void test_get_info_values(const char *fname
)
278 htsFile
*fp
= hts_open(fname
, "r");
279 bcf_hdr_t
*hdr
= bcf_hdr_read(fp
);
280 bcf1_t
*line
= bcf_init();
282 while (bcf_read(fp
, hdr
, line
) == 0)
286 int ret
= bcf_get_info_float(hdr
, line
, "AF", &afs
, &count
);
288 if (line
->pos
== 14369)
290 if (ret
!= 1 || afs
[0] != 0.5f
)
292 fprintf(stderr
, "AF on position 14370 should be 0.5\n");
298 if (ret
!= 2 || afs
[0] != 0.333f
|| !bcf_float_is_missing(afs
[1]))
300 fprintf(stderr
, "AF on position 1110696 should be 0.333, missing\n");
309 bcf_hdr_destroy(hdr
);
313 void write_format_values(const char *fname
)
316 htsFile
*fp
= hts_open(fname
, "wb");
317 bcf_hdr_t
*hdr
= bcf_hdr_init("w");
318 bcf1_t
*rec
= bcf_init1();
321 bcf_hdr_append(hdr
, "##contig=<ID=1>");
322 bcf_hdr_append(hdr
, "##FORMAT=<ID=TF,Number=1,Type=Float,Description=\"Test Float\">");
323 bcf_hdr_add_sample(hdr
, "S");
324 bcf_hdr_add_sample(hdr
, NULL
); // to update internal structures
325 bcf_hdr_write(fp
, hdr
);
330 bcf_float_set_missing(test
[0]);
332 bcf_float_set_vector_end(test
[2]);
333 bcf_update_format_float(hdr
, rec
, "TF", test
, 4);
334 bcf_write1(fp
, hdr
, rec
);
337 bcf_hdr_destroy(hdr
);
339 if ((ret
= hts_close(fp
)))
341 fprintf(stderr
, "hts_close(%s): non-zero status %d\n", fname
, ret
);
346 void check_format_values(const char *fname
)
348 htsFile
*fp
= hts_open(fname
, "r");
349 bcf_hdr_t
*hdr
= bcf_hdr_read(fp
);
350 bcf1_t
*line
= bcf_init();
352 while (bcf_read(fp
, hdr
, line
) == 0)
356 int ret
= bcf_get_format_float(hdr
, line
, "TF", &values
, &count
);
358 // NOTE the return value from bcf_get_format_float is different from
359 // bcf_get_info_float in the sense that vector-end markers also count.
362 !bcf_float_is_missing(values
[0]) ||
363 values
[1] != 47.11f
||
364 !bcf_float_is_vector_end(values
[2]) ||
365 !bcf_float_is_vector_end(values
[3]))
367 fprintf(stderr
, "bcf_get_format_float didn't produce the expected output.\n");
375 bcf_hdr_destroy(hdr
);
379 void test_get_format_values(const char *fname
)
381 write_format_values(fname
);
382 check_format_values(fname
);
385 int main(int argc
, char **argv
)
387 char *fname
= argc
>1 ? argv
[1] : "rmme.bcf";
389 // format test. quiet unless there's a failure
390 test_get_format_values(fname
);
392 // main test. writes to stdout
397 // additional tests. quiet unless there's a failure.
398 test_get_info_values(fname
);