modified: src1/input.c
[GalaxyCodeBases.git] / c_cpp / lib / htslib / test / test-vcf-api.c
blob0e1bdca70eb1d69248718dbdd985f786c9c03aea
1 /* test/test-vcf-api.c -- VCF test harness.
3 Copyright (C) 2013, 2014 Genome Research Ltd.
5 Author: Petr Danecek <pd3@sanger.ac.uk>
7 Permission is hereby granted, free of charge, to any person obtaining a copy
8 of this software and associated documentation files (the "Software"), to deal
9 in the Software without restriction, including without limitation the rights
10 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11 copies of the Software, and to permit persons to whom the Software is
12 furnished to do so, subject to the following conditions:
14 The above copyright notice and this permission notice shall be included in
15 all copies or substantial portions of the Software.
17 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
22 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
23 DEALINGS IN THE SOFTWARE. */
25 #include <config.h>
27 #include <stdio.h>
28 #include <htslib/hts.h>
29 #include <htslib/vcf.h>
30 #include <htslib/kstring.h>
31 #include <htslib/kseq.h>
33 void write_bcf(char *fname)
35 // Init
36 htsFile *fp = hts_open(fname,"wb");
37 bcf_hdr_t *hdr = bcf_hdr_init("w");
38 bcf1_t *rec = bcf_init1();
40 // Create VCF header
41 kstring_t str = {0,0,0};
42 bcf_hdr_append(hdr, "##fileDate=20090805");
43 bcf_hdr_append(hdr, "##FORMAT=<ID=UF,Number=1,Type=Integer,Description=\"Unused FORMAT\">");
44 bcf_hdr_append(hdr, "##INFO=<ID=UI,Number=1,Type=Integer,Description=\"Unused INFO\">");
45 bcf_hdr_append(hdr, "##FILTER=<ID=Flt,Description=\"Unused FILTER\">");
46 bcf_hdr_append(hdr, "##unused=<XX=AA,Description=\"Unused generic\">");
47 bcf_hdr_append(hdr, "##unused=unformatted text 1");
48 bcf_hdr_append(hdr, "##unused=unformatted text 2");
49 bcf_hdr_append(hdr, "##contig=<ID=Unused,length=62435964>");
50 bcf_hdr_append(hdr, "##source=myImputationProgramV3.1");
51 bcf_hdr_append(hdr, "##reference=file:///seq/references/1000GenomesPilot-NCBI36.fasta");
52 bcf_hdr_append(hdr, "##contig=<ID=20,length=62435964,assembly=B36,md5=f126cdf8a6e0c7f379d618ff66beb2da,species=\"Homo sapiens\",taxonomy=x>");
53 bcf_hdr_append(hdr, "##phasing=partial");
54 bcf_hdr_append(hdr, "##INFO=<ID=NS,Number=1,Type=Integer,Description=\"Number of Samples With Data\">");
55 bcf_hdr_append(hdr, "##INFO=<ID=DP,Number=1,Type=Integer,Description=\"Total Depth\">");
56 bcf_hdr_append(hdr, "##INFO=<ID=AF,Number=A,Type=Float,Description=\"Allele Frequency\">");
57 bcf_hdr_append(hdr, "##INFO=<ID=AA,Number=1,Type=String,Description=\"Ancestral Allele\">");
58 bcf_hdr_append(hdr, "##INFO=<ID=DB,Number=0,Type=Flag,Description=\"dbSNP membership, build 129\">");
59 bcf_hdr_append(hdr, "##INFO=<ID=H2,Number=0,Type=Flag,Description=\"HapMap2 membership\">");
60 bcf_hdr_append(hdr, "##FILTER=<ID=q10,Description=\"Quality below 10\">");
61 bcf_hdr_append(hdr, "##FILTER=<ID=s50,Description=\"Less than 50% of samples have data\">");
62 bcf_hdr_append(hdr, "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">");
63 bcf_hdr_append(hdr, "##FORMAT=<ID=GQ,Number=1,Type=Integer,Description=\"Genotype Quality\">");
64 bcf_hdr_append(hdr, "##FORMAT=<ID=DP,Number=1,Type=Integer,Description=\"Read Depth\">");
65 bcf_hdr_append(hdr, "##FORMAT=<ID=HQ,Number=2,Type=Integer,Description=\"Haplotype Quality\">");
66 bcf_hdr_append(hdr, "##FORMAT=<ID=TS,Number=1,Type=String,Description=\"Test String\">");
68 bcf_hdr_add_sample(hdr, "NA00001");
69 bcf_hdr_add_sample(hdr, "NA00002");
70 bcf_hdr_add_sample(hdr, "NA00003");
71 bcf_hdr_add_sample(hdr, NULL); // to update internal structures
72 bcf_hdr_write(fp, hdr);
75 // Add a record
76 // 20 14370 rs6054257 G A 29 PASS NS=3;DP=14;AF=0.5;DB;H2 GT:GQ:DP:HQ 0|0:48:1:51,51 1|0:48:8:51,51 1/1:43:5:.,.
77 // .. CHROM
78 rec->rid = bcf_hdr_name2id(hdr, "20");
79 // .. POS
80 rec->pos = 14369;
81 // .. ID
82 bcf_update_id(hdr, rec, "rs6054257");
83 // .. REF and ALT
84 bcf_update_alleles_str(hdr, rec, "G,A");
85 // .. QUAL
86 rec->qual = 29;
87 // .. FILTER
88 int32_t tmpi = bcf_hdr_id2int(hdr, BCF_DT_ID, "PASS");
89 bcf_update_filter(hdr, rec, &tmpi, 1);
90 // .. INFO
91 tmpi = 3;
92 bcf_update_info_int32(hdr, rec, "NS", &tmpi, 1);
93 tmpi = 14;
94 bcf_update_info_int32(hdr, rec, "DP", &tmpi, 1);
95 float tmpf = 0.5;
96 bcf_update_info_float(hdr, rec, "AF", &tmpf, 1);
97 bcf_update_info_flag(hdr, rec, "DB", NULL, 1);
98 bcf_update_info_flag(hdr, rec, "H2", NULL, 1);
99 // .. FORMAT
100 int32_t *tmpia = (int*)malloc(bcf_hdr_nsamples(hdr)*2*sizeof(int));
101 tmpia[0] = bcf_gt_phased(0);
102 tmpia[1] = bcf_gt_phased(0);
103 tmpia[2] = bcf_gt_phased(1);
104 tmpia[3] = bcf_gt_phased(0);
105 tmpia[4] = bcf_gt_unphased(1);
106 tmpia[5] = bcf_gt_unphased(1);
107 bcf_update_genotypes(hdr, rec, tmpia, bcf_hdr_nsamples(hdr)*2);
108 tmpia[0] = 48;
109 tmpia[1] = 48;
110 tmpia[2] = 43;
111 bcf_update_format_int32(hdr, rec, "GQ", tmpia, bcf_hdr_nsamples(hdr));
112 tmpia[0] = 1;
113 tmpia[1] = 8;
114 tmpia[2] = 5;
115 bcf_update_format_int32(hdr, rec, "DP", tmpia, bcf_hdr_nsamples(hdr));
116 tmpia[0] = 51;
117 tmpia[1] = 51;
118 tmpia[2] = 51;
119 tmpia[3] = 51;
120 tmpia[4] = bcf_int32_missing;
121 tmpia[5] = bcf_int32_missing;
122 bcf_update_format_int32(hdr, rec, "HQ", tmpia, bcf_hdr_nsamples(hdr)*2);
123 char *tmp_str[] = {"String1","SomeOtherString2","YetAnotherString3"};
124 bcf_update_format_string(hdr, rec, "TS", (const char**)tmp_str, 3);
125 bcf_write1(fp, hdr, rec);
127 // 20 1110696 . A G,T 67 . NS=2;DP=10;AF=0.333,.;AA=T;DB GT 2 1 ./.
128 bcf_clear1(rec);
129 rec->rid = bcf_hdr_name2id(hdr, "20");
130 rec->pos = 1110695;
131 bcf_update_alleles_str(hdr, rec, "A,G,T");
132 rec->qual = 67;
133 tmpi = 2;
134 bcf_update_info_int32(hdr, rec, "NS", &tmpi, 1);
135 tmpi = 10;
136 bcf_update_info_int32(hdr, rec, "DP", &tmpi, 1);
137 float *tmpfa = (float*)malloc(2*sizeof(float));
138 tmpfa[0] = 0.333;
139 bcf_float_set_missing(tmpfa[1]);
140 bcf_update_info_float(hdr, rec, "AF", tmpfa, 2);
141 bcf_update_info_string(hdr, rec, "AA", "T");
142 bcf_update_info_flag(hdr, rec, "DB", NULL, 1);
143 tmpia[0] = bcf_gt_phased(2);
144 tmpia[1] = bcf_int32_vector_end;
145 tmpia[2] = bcf_gt_phased(1);
146 tmpia[3] = bcf_int32_vector_end;
147 tmpia[4] = bcf_gt_missing;
148 tmpia[5] = bcf_gt_missing;
149 bcf_update_genotypes(hdr, rec, tmpia, bcf_hdr_nsamples(hdr)*2);
150 bcf_write1(fp, hdr, rec);
152 free(tmpia);
153 free(tmpfa);
155 // Clean
156 free(str.s);
157 bcf_destroy1(rec);
158 bcf_hdr_destroy(hdr);
159 int ret;
160 if ( (ret=hts_close(fp)) )
162 fprintf(stderr,"hts_close(%s): non-zero status %d\n",fname,ret);
163 exit(ret);
167 void bcf_to_vcf(char *fname)
169 htsFile *fp = hts_open(fname,"rb");
170 bcf_hdr_t *hdr = bcf_hdr_read(fp);
171 bcf1_t *rec = bcf_init1();
173 char *gz_fname = (char*) malloc(strlen(fname)+4);
174 snprintf(gz_fname,strlen(fname)+4,"%s.gz",fname);
175 htsFile *out = hts_open(gz_fname,"wg");
177 bcf_hdr_t *hdr_out = bcf_hdr_dup(hdr);
178 bcf_hdr_remove(hdr_out,BCF_HL_STR,"unused");
179 bcf_hdr_remove(hdr_out,BCF_HL_GEN,"unused");
180 bcf_hdr_remove(hdr_out,BCF_HL_FLT,"Flt");
181 bcf_hdr_remove(hdr_out,BCF_HL_INFO,"UI");
182 bcf_hdr_remove(hdr_out,BCF_HL_FMT,"UF");
183 bcf_hdr_remove(hdr_out,BCF_HL_CTG,"Unused");
184 bcf_hdr_write(out, hdr_out);
186 while ( bcf_read1(fp, hdr, rec)>=0 )
188 bcf_write1(out, hdr_out, rec);
190 // Test problems caused by bcf1_sync: the data block
191 // may be realloced, also the unpacked structures must
192 // get updated.
193 bcf_unpack(rec, BCF_UN_STR);
194 bcf_update_id(hdr, rec, 0);
195 bcf_update_format_int32(hdr, rec, "GQ", NULL, 0);
197 bcf1_t *dup = bcf_dup(rec); // force bcf1_sync call
198 bcf_write1(out, hdr_out, dup);
199 bcf_destroy1(dup);
201 bcf_update_alleles_str(hdr_out, rec, "G,A");
202 int32_t tmpi = 99;
203 bcf_update_info_int32(hdr_out, rec, "DP", &tmpi, 1);
204 int32_t tmpia[] = {9,9,9};
205 bcf_update_format_int32(hdr_out, rec, "DP", tmpia, 3);
207 bcf_write1(out, hdr_out, rec);
210 bcf_destroy1(rec);
211 bcf_hdr_destroy(hdr);
212 bcf_hdr_destroy(hdr_out);
213 int ret;
214 if ( (ret=hts_close(fp)) )
216 fprintf(stderr,"hts_close(%s): non-zero status %d\n",fname,ret);
217 exit(ret);
219 if ( (ret=hts_close(out)) )
221 fprintf(stderr,"hts_close(%s): non-zero status %d\n",gz_fname,ret);
222 exit(ret);
226 // read gzip, write stdout
227 htsFile *gz_in = hts_open(gz_fname, "r");
228 if ( !gz_in )
230 fprintf(stderr,"Could not read: %s\n", gz_fname);
231 exit(1);
234 kstring_t line = {0,0,0};
235 while ( hts_getline(gz_in, KS_SEP_LINE, &line)>0 )
237 kputc('\n',&line);
238 fwrite(line.s,1,line.l,stdout);
241 if ( (ret=hts_close(gz_in)) )
243 fprintf(stderr,"hts_close(%s): non-zero status %d\n",gz_fname,ret);
244 exit(ret);
246 free(line.s);
247 free(gz_fname);
250 void iterator(const char *fname)
252 htsFile *fp = hts_open(fname, "r");
253 bcf_hdr_t *hdr = bcf_hdr_read(fp);
254 hts_idx_t *idx;
255 hts_itr_t *iter;
257 bcf_index_build(fname, 0);
258 idx = bcf_index_load(fname);
260 iter = bcf_itr_queryi(idx, bcf_hdr_name2id(hdr, "20"), 1110600, 1110800);
261 bcf_itr_destroy(iter);
263 iter = bcf_itr_querys(idx, hdr, "20:1110600-1110800");
264 bcf_itr_destroy(iter);
266 hts_idx_destroy(idx);
267 bcf_hdr_destroy(hdr);
268 int ret;
269 if ( (ret=hts_close(fp)) )
271 fprintf(stderr,"hts_close(%s): non-zero status %d\n",fname,ret);
272 exit(ret);
276 void test_get_info_values(const char *fname)
278 htsFile *fp = hts_open(fname, "r");
279 bcf_hdr_t *hdr = bcf_hdr_read(fp);
280 bcf1_t *line = bcf_init();
282 while (bcf_read(fp, hdr, line) == 0)
284 float *afs = 0;
285 int count = 0;
286 int ret = bcf_get_info_float(hdr, line, "AF", &afs, &count);
288 if (line->pos == 14369)
290 if (ret != 1 || afs[0] != 0.5f)
292 fprintf(stderr, "AF on position 14370 should be 0.5\n");
293 exit(-1);
296 else
298 if (ret != 2 || afs[0] != 0.333f || !bcf_float_is_missing(afs[1]))
300 fprintf(stderr, "AF on position 1110696 should be 0.333, missing\n");
301 exit(-1);
305 free(afs);
308 bcf_destroy(line);
309 bcf_hdr_destroy(hdr);
310 hts_close(fp);
313 void write_format_values(const char *fname)
315 // Init
316 htsFile *fp = hts_open(fname, "wb");
317 bcf_hdr_t *hdr = bcf_hdr_init("w");
318 bcf1_t *rec = bcf_init1();
320 // Create VCF header
321 bcf_hdr_append(hdr, "##contig=<ID=1>");
322 bcf_hdr_append(hdr, "##FORMAT=<ID=TF,Number=1,Type=Float,Description=\"Test Float\">");
323 bcf_hdr_add_sample(hdr, "S");
324 bcf_hdr_add_sample(hdr, NULL); // to update internal structures
325 bcf_hdr_write(fp, hdr);
327 // Add a record
328 // .. FORMAT
329 float test[4];
330 bcf_float_set_missing(test[0]);
331 test[1] = 47.11f;
332 bcf_float_set_vector_end(test[2]);
333 bcf_update_format_float(hdr, rec, "TF", test, 4);
334 bcf_write1(fp, hdr, rec);
336 bcf_destroy1(rec);
337 bcf_hdr_destroy(hdr);
338 int ret;
339 if ((ret = hts_close(fp)))
341 fprintf(stderr, "hts_close(%s): non-zero status %d\n", fname, ret);
342 exit(ret);
346 void check_format_values(const char *fname)
348 htsFile *fp = hts_open(fname, "r");
349 bcf_hdr_t *hdr = bcf_hdr_read(fp);
350 bcf1_t *line = bcf_init();
352 while (bcf_read(fp, hdr, line) == 0)
354 float *values = 0;
355 int count = 0;
356 int ret = bcf_get_format_float(hdr, line, "TF", &values, &count);
358 // NOTE the return value from bcf_get_format_float is different from
359 // bcf_get_info_float in the sense that vector-end markers also count.
360 if (ret != 4 ||
361 count < ret ||
362 !bcf_float_is_missing(values[0]) ||
363 values[1] != 47.11f ||
364 !bcf_float_is_vector_end(values[2]) ||
365 !bcf_float_is_vector_end(values[3]))
367 fprintf(stderr, "bcf_get_format_float didn't produce the expected output.\n");
368 exit(-1);
371 free(values);
374 bcf_destroy(line);
375 bcf_hdr_destroy(hdr);
376 hts_close(fp);
379 void test_get_format_values(const char *fname)
381 write_format_values(fname);
382 check_format_values(fname);
385 int main(int argc, char **argv)
387 char *fname = argc>1 ? argv[1] : "rmme.bcf";
389 // format test. quiet unless there's a failure
390 test_get_format_values(fname);
392 // main test. writes to stdout
393 write_bcf(fname);
394 bcf_to_vcf(fname);
395 iterator(fname);
397 // additional tests. quiet unless there's a failure.
398 test_get_info_values(fname);
400 return 0;