modified: src1/input.c
[GalaxyCodeBases.git] / c_cpp / lib / htslib / tabix.c
blob681f8a0b5d514df19326f6bae940cf858a3b2b3f
1 /* tabix.c -- Generic indexer for TAB-delimited genome position files.
3 Copyright (C) 2009-2011 Broad Institute.
4 Copyright (C) 2010-2012, 2014-2017 Genome Research Ltd.
6 Author: Heng Li <lh3@sanger.ac.uk>
8 Permission is hereby granted, free of charge, to any person obtaining a copy
9 of this software and associated documentation files (the "Software"), to deal
10 in the Software without restriction, including without limitation the rights
11 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12 copies of the Software, and to permit persons to whom the Software is
13 furnished to do so, subject to the following conditions:
15 The above copyright notice and this permission notice shall be included in
16 all copies or substantial portions of the Software.
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
21 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24 DEALINGS IN THE SOFTWARE. */
26 #include <config.h>
28 #include <stdio.h>
29 #include <stdlib.h>
30 #include <unistd.h>
31 #include <string.h>
32 #include <getopt.h>
33 #include <sys/types.h>
34 #include <sys/stat.h>
35 #include <errno.h>
36 #include "htslib/tbx.h"
37 #include "htslib/sam.h"
38 #include "htslib/vcf.h"
39 #include "htslib/kseq.h"
40 #include "htslib/bgzf.h"
41 #include "htslib/hts.h"
42 #include "htslib/regidx.h"
44 typedef struct
46 char *regions_fname, *targets_fname;
47 int print_header, header_only;
49 args_t;
51 static void error(const char *format, ...)
53 va_list ap;
54 va_start(ap, format);
55 vfprintf(stderr, format, ap);
56 va_end(ap);
57 exit(EXIT_FAILURE);
60 #define IS_GFF (1<<0)
61 #define IS_BED (1<<1)
62 #define IS_SAM (1<<2)
63 #define IS_VCF (1<<3)
64 #define IS_BCF (1<<4)
65 #define IS_BAM (1<<5)
66 #define IS_CRAM (1<<6)
67 #define IS_TXT (IS_GFF|IS_BED|IS_SAM|IS_VCF)
69 int file_type(const char *fname)
71 int l = strlen(fname);
72 int strcasecmp(const char *s1, const char *s2);
73 if (l>=7 && strcasecmp(fname+l-7, ".gff.gz") == 0) return IS_GFF;
74 else if (l>=7 && strcasecmp(fname+l-7, ".bed.gz") == 0) return IS_BED;
75 else if (l>=7 && strcasecmp(fname+l-7, ".sam.gz") == 0) return IS_SAM;
76 else if (l>=7 && strcasecmp(fname+l-7, ".vcf.gz") == 0) return IS_VCF;
77 else if (l>=4 && strcasecmp(fname+l-4, ".bcf") == 0) return IS_BCF;
78 else if (l>=4 && strcasecmp(fname+l-4, ".bam") == 0) return IS_BAM;
79 else if (l>=4 && strcasecmp(fname+l-5, ".cram") == 0) return IS_CRAM;
81 htsFile *fp = hts_open(fname,"r");
82 enum htsExactFormat format = fp->format.format;
83 hts_close(fp);
84 if ( format == bcf ) return IS_BCF;
85 if ( format == bam ) return IS_BAM;
86 if ( format == cram ) return IS_CRAM;
87 if ( format == vcf ) return IS_VCF;
89 return 0;
92 static char **parse_regions(char *regions_fname, char **argv, int argc, int *nregs)
94 kstring_t str = {0,0,0};
95 int iseq = 0, ireg = 0;
96 char **regs = NULL;
97 *nregs = argc;
99 if ( regions_fname )
101 // improve me: this is a too heavy machinery for parsing regions...
103 regidx_t *idx = regidx_init(regions_fname, NULL, NULL, 0, NULL);
104 if ( !idx ) error("Could not read %s\n", regions_fname);
106 (*nregs) += regidx_nregs(idx);
107 regs = (char**) malloc(sizeof(char*)*(*nregs));
109 int nseq;
110 char **seqs = regidx_seq_names(idx, &nseq);
111 for (iseq=0; iseq<nseq; iseq++)
113 regitr_t itr;
114 regidx_overlap(idx, seqs[iseq], 0, UINT32_MAX, &itr);
115 while ( itr.i < itr.n )
117 str.l = 0;
118 ksprintf(&str, "%s:%d-%d", seqs[iseq], REGITR_START(itr)+1, REGITR_END(itr)+1);
119 regs[ireg++] = strdup(str.s);
120 itr.i++;
123 regidx_destroy(idx);
125 free(str.s);
127 if ( !ireg )
129 if ( argc )
130 regs = (char**) malloc(sizeof(char*)*argc);
131 else
133 regs = (char**) malloc(sizeof(char*));
134 regs[0] = strdup(".");
135 *nregs = 1;
139 for (iseq=0; iseq<argc; iseq++) regs[ireg++] = strdup(argv[iseq]);
140 return regs;
142 static int query_regions(args_t *args, char *fname, char **regs, int nregs)
144 int i;
145 htsFile *fp = hts_open(fname,"r");
146 if ( !fp ) error("Could not read %s\n", fname);
147 enum htsExactFormat format = hts_get_format(fp)->format;
149 regidx_t *reg_idx = NULL;
150 if ( args->targets_fname )
152 reg_idx = regidx_init(args->targets_fname, NULL, NULL, 0, NULL);
153 if ( !reg_idx ) error("Could not read %s\n", args->targets_fname);
156 if ( format == bcf )
158 htsFile *out = hts_open("-","w");
159 if ( !out ) error("Could not open stdout\n", fname);
160 hts_idx_t *idx = bcf_index_load(fname);
161 if ( !idx ) error("Could not load .csi index of %s\n", fname);
162 bcf_hdr_t *hdr = bcf_hdr_read(fp);
163 if ( !hdr ) error("Could not read the header: %s\n", fname);
164 if ( args->print_header )
165 bcf_hdr_write(out,hdr);
166 if ( !args->header_only )
168 bcf1_t *rec = bcf_init();
169 for (i=0; i<nregs; i++)
171 hts_itr_t *itr = bcf_itr_querys(idx,hdr,regs[i]);
172 while ( bcf_itr_next(fp, itr, rec) >=0 )
174 if ( reg_idx && !regidx_overlap(reg_idx, bcf_seqname(hdr,rec),rec->pos,rec->pos+rec->rlen-1, NULL) ) continue;
175 bcf_write(out,hdr,rec);
177 tbx_itr_destroy(itr);
179 bcf_destroy(rec);
181 if ( hts_close(out) ) error("hts_close returned non-zero status for stdout\n");
182 bcf_hdr_destroy(hdr);
183 hts_idx_destroy(idx);
185 else if ( format==vcf || format==sam || format==unknown_format )
187 tbx_t *tbx = tbx_index_load(fname);
188 if ( !tbx ) error("Could not load .tbi/.csi index of %s\n", fname);
189 kstring_t str = {0,0,0};
190 if ( args->print_header )
192 while ( hts_getline(fp, KS_SEP_LINE, &str) >= 0 )
194 if ( !str.l || str.s[0]!=tbx->conf.meta_char ) break;
195 puts(str.s);
198 if ( !args->header_only )
200 int nseq;
201 const char **seq = NULL;
202 if ( reg_idx ) seq = tbx_seqnames(tbx, &nseq);
203 for (i=0; i<nregs; i++)
205 hts_itr_t *itr = tbx_itr_querys(tbx, regs[i]);
206 if ( !itr ) continue;
207 while (tbx_itr_next(fp, tbx, itr, &str) >= 0)
209 if ( reg_idx && !regidx_overlap(reg_idx,seq[itr->curr_tid],itr->curr_beg,itr->curr_end, NULL) ) continue;
210 puts(str.s);
212 tbx_itr_destroy(itr);
214 free(seq);
216 free(str.s);
217 tbx_destroy(tbx);
219 else if ( format==bam )
220 error("Please use \"samtools view\" for querying BAM files.\n");
222 if ( reg_idx ) regidx_destroy(reg_idx);
223 if ( hts_close(fp) ) error("hts_close returned non-zero status: %s\n", fname);
225 for (i=0; i<nregs; i++) free(regs[i]);
226 free(regs);
227 return 0;
229 static int query_chroms(char *fname)
231 const char **seq;
232 int i, nseq, ftype = file_type(fname);
233 if ( ftype & IS_TXT || !ftype )
235 tbx_t *tbx = tbx_index_load(fname);
236 if ( !tbx ) error("Could not load .tbi index of %s\n", fname);
237 seq = tbx_seqnames(tbx, &nseq);
238 for (i=0; i<nseq; i++)
239 printf("%s\n", seq[i]);
240 free(seq);
241 tbx_destroy(tbx);
243 else if ( ftype==IS_BCF )
245 htsFile *fp = hts_open(fname,"r");
246 if ( !fp ) error("Could not read %s\n", fname);
247 bcf_hdr_t *hdr = bcf_hdr_read(fp);
248 if ( !hdr ) error("Could not read the header: %s\n", fname);
249 hts_close(fp);
250 hts_idx_t *idx = bcf_index_load(fname);
251 if ( !idx ) error("Could not load .csi index of %s\n", fname);
252 seq = bcf_index_seqnames(idx, hdr, &nseq);
253 for (i=0; i<nseq; i++)
254 printf("%s\n", seq[i]);
255 free(seq);
256 bcf_hdr_destroy(hdr);
257 hts_idx_destroy(idx);
259 else if ( ftype==IS_BAM ) // todo: BAM
260 error("BAM: todo\n");
261 return 0;
264 int reheader_file(const char *fname, const char *header, int ftype, tbx_conf_t *conf)
266 if ( ftype & IS_TXT || !ftype )
268 BGZF *fp = bgzf_open(fname,"r");
269 if ( !fp || bgzf_read_block(fp) != 0 || !fp->block_length ) return -1;
271 char *buffer = fp->uncompressed_block;
272 int skip_until = 0;
274 // Skip the header: find out the position of the data block
275 if ( buffer[0]==conf->meta_char )
277 skip_until = 1;
278 while (1)
280 if ( buffer[skip_until]=='\n' )
282 skip_until++;
283 if ( skip_until>=fp->block_length )
285 if ( bgzf_read_block(fp) != 0 || !fp->block_length ) error("FIXME: No body in the file: %s\n", fname);
286 skip_until = 0;
288 // The header has finished
289 if ( buffer[skip_until]!=conf->meta_char ) break;
291 skip_until++;
292 if ( skip_until>=fp->block_length )
294 if (bgzf_read_block(fp) != 0 || !fp->block_length) error("FIXME: No body in the file: %s\n", fname);
295 skip_until = 0;
300 // Output the new header
301 FILE *hdr = fopen(header,"r");
302 if ( !hdr ) error("%s: %s", header,strerror(errno));
303 const size_t page_size = 32768;
304 char *buf = malloc(page_size);
305 BGZF *bgzf_out = bgzf_open("-", "w");
306 ssize_t nread;
307 while ( (nread=fread(buf,1,page_size-1,hdr))>0 )
309 if ( nread<page_size-1 && buf[nread-1]!='\n' ) buf[nread++] = '\n';
310 if (bgzf_write(bgzf_out, buf, nread) < 0) error("Error: %d\n",bgzf_out->errcode);
312 if ( fclose(hdr) ) error("close failed: %s\n", header);
314 // Output all remainig data read with the header block
315 if ( fp->block_length - skip_until > 0 )
317 if (bgzf_write(bgzf_out, buffer+skip_until, fp->block_length-skip_until) < 0) error("Error: %d\n",fp->errcode);
319 if (bgzf_flush(bgzf_out) < 0) error("Error: %d\n",bgzf_out->errcode);
321 while (1)
323 nread = bgzf_raw_read(fp, buf, page_size);
324 if ( nread<=0 ) break;
326 int count = bgzf_raw_write(bgzf_out, buf, nread);
327 if (count != nread) error("Write failed, wrote %d instead of %d bytes.\n", count,(int)nread);
329 if (bgzf_close(bgzf_out) < 0) error("Error: %d\n",bgzf_out->errcode);
330 if (bgzf_close(fp) < 0) error("Error: %d\n",fp->errcode);
331 free(buf);
333 else
334 error("todo: reheader BCF, BAM\n"); // BCF is difficult, records contain pointers to the header.
335 return 0;
338 static int usage(void)
340 fprintf(stderr, "\n");
341 fprintf(stderr, "Version: %s\n", hts_version());
342 fprintf(stderr, "Usage: tabix [OPTIONS] [FILE] [REGION [...]]\n");
343 fprintf(stderr, "\n");
344 fprintf(stderr, "Indexing Options:\n");
345 fprintf(stderr, " -0, --zero-based coordinates are zero-based\n");
346 fprintf(stderr, " -b, --begin INT column number for region start [4]\n");
347 fprintf(stderr, " -c, --comment CHAR skip comment lines starting with CHAR [null]\n");
348 fprintf(stderr, " -C, --csi generate CSI index for VCF (default is TBI)\n");
349 fprintf(stderr, " -e, --end INT column number for region end (if no end, set INT to -b) [5]\n");
350 fprintf(stderr, " -f, --force overwrite existing index without asking\n");
351 fprintf(stderr, " -m, --min-shift INT set minimal interval size for CSI indices to 2^INT [14]\n");
352 fprintf(stderr, " -p, --preset STR gff, bed, sam, vcf\n");
353 fprintf(stderr, " -s, --sequence INT column number for sequence names (suppressed by -p) [1]\n");
354 fprintf(stderr, " -S, --skip-lines INT skip first INT lines [0]\n");
355 fprintf(stderr, "\n");
356 fprintf(stderr, "Querying and other options:\n");
357 fprintf(stderr, " -h, --print-header print also the header lines\n");
358 fprintf(stderr, " -H, --only-header print only the header lines\n");
359 fprintf(stderr, " -l, --list-chroms list chromosome names\n");
360 fprintf(stderr, " -r, --reheader FILE replace the header with the content of FILE\n");
361 fprintf(stderr, " -R, --regions FILE restrict to regions listed in the file\n");
362 fprintf(stderr, " -T, --targets FILE similar to -R but streams rather than index-jumps\n");
363 fprintf(stderr, "\n");
364 return 1;
367 int main(int argc, char *argv[])
369 int c, detect = 1, min_shift = 0, is_force = 0, list_chroms = 0, do_csi = 0;
370 tbx_conf_t conf = tbx_conf_gff;
371 char *reheader = NULL;
372 args_t args;
373 memset(&args,0,sizeof(args_t));
375 static const struct option loptions[] =
377 {"help", no_argument, NULL, 2},
378 {"regions", required_argument, NULL, 'R'},
379 {"targets", required_argument, NULL, 'T'},
380 {"csi", no_argument, NULL, 'C'},
381 {"zero-based", no_argument, NULL, '0'},
382 {"print-header", no_argument, NULL, 'h'},
383 {"only-header", no_argument, NULL, 'H'},
384 {"begin", required_argument, NULL, 'b'},
385 {"comment", required_argument, NULL, 'c'},
386 {"end", required_argument, NULL, 'e'},
387 {"force", no_argument, NULL, 'f'},
388 {"preset", required_argument, NULL, 'p'},
389 {"sequence", required_argument, NULL, 's'},
390 {"skip-lines", required_argument, NULL, 'S'},
391 {"list-chroms", no_argument, NULL, 'l'},
392 {"reheader", required_argument, NULL, 'r'},
393 {"version", no_argument, NULL, 1},
394 {NULL, 0, NULL, 0}
397 char *tmp;
398 while ((c = getopt_long(argc, argv, "hH?0b:c:e:fm:p:s:S:lr:CR:T:", loptions,NULL)) >= 0)
400 switch (c)
402 case 'R': args.regions_fname = optarg; break;
403 case 'T': args.targets_fname = optarg; break;
404 case 'C': do_csi = 1; break;
405 case 'r': reheader = optarg; break;
406 case 'h': args.print_header = 1; break;
407 case 'H': args.print_header = 1; args.header_only = 1; break;
408 case 'l': list_chroms = 1; break;
409 case '0': conf.preset |= TBX_UCSC; detect = 0; break;
410 case 'b':
411 conf.bc = strtol(optarg,&tmp,10);
412 if ( *tmp ) error("Could not parse argument: -b %s\n", optarg);
413 detect = 0;
414 break;
415 case 'e':
416 conf.ec = strtol(optarg,&tmp,10);
417 if ( *tmp ) error("Could not parse argument: -e %s\n", optarg);
418 detect = 0;
419 break;
420 case 'c': conf.meta_char = *optarg; detect = 0; break;
421 case 'f': is_force = 1; break;
422 case 'm':
423 min_shift = strtol(optarg,&tmp,10);
424 if ( *tmp ) error("Could not parse argument: -m %s\n", optarg);
425 break;
426 case 'p':
427 detect = 0;
428 if (strcmp(optarg, "gff") == 0) conf = tbx_conf_gff;
429 else if (strcmp(optarg, "bed") == 0) conf = tbx_conf_bed;
430 else if (strcmp(optarg, "sam") == 0) conf = tbx_conf_sam;
431 else if (strcmp(optarg, "vcf") == 0) conf = tbx_conf_vcf;
432 else if (strcmp(optarg, "bcf") == 0) detect = 1; // bcf is autodetected, preset is not needed
433 else if (strcmp(optarg, "bam") == 0) detect = 1; // same as bcf
434 else error("The preset string not recognised: '%s'\n", optarg);
435 break;
436 case 's':
437 conf.sc = strtol(optarg,&tmp,10);
438 if ( *tmp ) error("Could not parse argument: -s %s\n", optarg);
439 detect = 0;
440 break;
441 case 'S':
442 conf.line_skip = strtol(optarg,&tmp,10);
443 if ( *tmp ) error("Could not parse argument: -S %s\n", optarg);
444 detect = 0;
445 break;
446 case 1:
447 printf(
448 "tabix (htslib) %s\n"
449 "Copyright (C) 2017 Genome Research Ltd.\n", hts_version());
450 return EXIT_SUCCESS;
451 default: return usage();
455 if ( optind==argc ) return usage();
457 if ( list_chroms )
458 return query_chroms(argv[optind]);
460 if ( argc > optind+1 || args.header_only || args.regions_fname || args.targets_fname )
462 int nregs = 0;
463 char **regs = NULL;
464 if ( !args.header_only )
465 regs = parse_regions(args.regions_fname, argv+optind+1, argc-optind-1, &nregs);
466 return query_regions(&args, argv[optind], regs, nregs);
469 char *fname = argv[optind];
470 int ftype = file_type(fname);
471 if ( detect ) // no preset given
473 if ( ftype==IS_GFF ) conf = tbx_conf_gff;
474 else if ( ftype==IS_BED ) conf = tbx_conf_bed;
475 else if ( ftype==IS_SAM ) conf = tbx_conf_sam;
476 else if ( ftype==IS_VCF )
478 conf = tbx_conf_vcf;
479 if ( !min_shift && do_csi ) min_shift = 14;
481 else if ( ftype==IS_BCF )
483 if ( !min_shift ) min_shift = 14;
485 else if ( ftype==IS_BAM )
487 if ( !min_shift ) min_shift = 14;
490 if ( do_csi )
492 if ( !min_shift ) min_shift = 14;
493 min_shift *= do_csi; // positive for CSIv2, negative for CSIv1
495 if ( min_shift!=0 && !do_csi ) do_csi = 1;
497 if ( reheader )
498 return reheader_file(fname, reheader, ftype, &conf);
500 char *suffix = ".tbi";
501 if ( do_csi ) suffix = ".csi";
502 else if ( ftype==IS_BAM ) suffix = ".bai";
503 else if ( ftype==IS_CRAM ) suffix = ".crai";
505 char *idx_fname = calloc(strlen(fname) + 5, 1);
506 strcat(strcpy(idx_fname, fname), suffix);
508 struct stat stat_tbi, stat_file;
509 if ( !is_force && stat(idx_fname, &stat_tbi)==0 )
511 // Before complaining about existing index, check if the VCF file isn't
512 // newer. This is a common source of errors, people tend not to notice
513 // that tabix failed
514 stat(fname, &stat_file);
515 if ( stat_file.st_mtime <= stat_tbi.st_mtime )
516 error("[tabix] the index file exists. Please use '-f' to overwrite.\n");
518 free(idx_fname);
520 if ( ftype==IS_CRAM )
522 if ( bam_index_build(fname, min_shift)!=0 ) error("bam_index_build failed: %s\n", fname);
523 return 0;
525 else if ( do_csi )
527 if ( ftype==IS_BCF )
529 if ( bcf_index_build(fname, min_shift)!=0 ) error("bcf_index_build failed: %s\n", fname);
530 return 0;
532 if ( ftype==IS_BAM )
534 if ( bam_index_build(fname, min_shift)!=0 ) error("bam_index_build failed: %s\n", fname);
535 return 0;
537 if ( tbx_index_build(fname, min_shift, &conf)!=0 ) error("tbx_index_build failed: %s\n", fname);
538 return 0;
540 else // TBI index
542 if ( tbx_index_build(fname, min_shift, &conf) ) error("tbx_index_build failed: %s\n", fname);
543 return 0;
545 return 0;