2 Copyright (C) 2014 Genome Research Ltd.
4 Author: Petr Danecek <pd3@sanger.ac.uk>
6 Permission is hereby granted, free of charge, to any person obtaining a copy
7 of this software and associated documentation files (the "Software"), to deal
8 in the Software without restriction, including without limitation the rights
9 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 copies of the Software, and to permit persons to whom the Software is
11 furnished to do so, subject to the following conditions:
13 The above copyright notice and this permission notice shall be included in
14 all copies or substantial portions of the Software.
16 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
28 #include "htslib/hts.h"
29 #include "htslib/kstring.h"
30 #include "htslib/kseq.h"
31 #include "htslib/khash_str2int.h"
32 #include "htslib/regidx.h"
33 #include "hts_internal.h"
35 #define LIDX_SHIFT 13 // number of insignificant index bits
37 // List of regions for one chromosome
41 int nregs
, mregs
; // n:used, m:alloced
47 // Container of all sequences
50 int nseq
, mseq
; // n:used, m:alloced
51 reglist_t
*seq
; // regions for each sequence
52 void *seq2regs
; // hash for fast lookup from chr name to regions
54 regidx_free_f free
; // function to free any data allocated by regidx_parse_f
55 regidx_parse_f parse
; // parse one input line
56 void *usr
; // user data to pass to regidx_parse_f
58 // temporary data for index initialization
60 int rid_prev
, start_prev
, end_prev
;
65 int regidx_seq_nregs(regidx_t
*idx
, const char *seq
)
68 if ( khash_str2int_get(idx
->seq2regs
, seq
, &iseq
)!=0 ) return 0; // no such sequence
69 return idx
->seq
[iseq
].nregs
;
72 int regidx_nregs(regidx_t
*idx
)
75 for (i
=0; i
<idx
->nseq
; i
++) nregs
+= idx
->seq
[i
].nregs
;
79 char **regidx_seq_names(regidx_t
*idx
, int *n
)
82 return idx
->seq_names
;
85 int _regidx_build_index(regidx_t
*idx
)
88 for (iseq
=0; iseq
<idx
->nseq
; iseq
++)
90 reglist_t
*list
= &idx
->seq
[iseq
];
91 int j
,k
, imax
= 0; // max index bin
92 for (j
=0; j
<list
->nregs
; j
++)
94 int ibeg
= list
->regs
[j
].start
>> LIDX_SHIFT
;
95 int iend
= list
->regs
[j
].end
>> LIDX_SHIFT
;
96 if ( imax
< iend
+ 1 )
101 list
->idx
= (int*) realloc(list
->idx
, imax
*sizeof(int));
102 for (k
=old_imax
; k
<imax
; k
++) list
->idx
[k
] = -1;
106 if ( list
->idx
[ibeg
]<0 ) list
->idx
[ibeg
] = j
;
110 for (k
=ibeg
; k
<=iend
; k
++)
111 if ( list
->idx
[k
]<0 ) list
->idx
[k
] = j
;
113 list
->nidx
= iend
+ 1;
119 int regidx_insert(regidx_t
*idx
, char *line
)
122 return _regidx_build_index(idx
);
124 char *chr_from
, *chr_to
;
126 int ret
= idx
->parse(line
,&chr_from
,&chr_to
,®
,idx
->payload
,idx
->usr
);
127 if ( ret
==-2 ) return -1; // error
128 if ( ret
==-1 ) return 0; // skip the line
132 kputsn(chr_from
, chr_to
-chr_from
+1, &idx
->str
);
133 if ( khash_str2int_get(idx
->seq2regs
, idx
->str
.s
, &rid
)!=0 )
136 int m_prev
= idx
->mseq
;
137 hts_expand0(reglist_t
,idx
->nseq
,idx
->mseq
,idx
->seq
);
138 hts_expand0(char*,idx
->nseq
,m_prev
,idx
->seq_names
);
139 idx
->seq_names
[idx
->nseq
-1] = strdup(idx
->str
.s
);
140 rid
= khash_str2int_inc(idx
->seq2regs
, idx
->seq_names
[idx
->nseq
-1]);
143 reglist_t
*list
= &idx
->seq
[rid
];
145 int m_prev
= list
->mregs
;
146 hts_expand(reg_t
,list
->nregs
,list
->mregs
,list
->regs
);
147 list
->regs
[list
->nregs
-1] = reg
;
148 if ( idx
->payload_size
)
150 if ( m_prev
< list
->mregs
) list
->payload
= realloc(list
->payload
,idx
->payload_size
*list
->mregs
);
151 memcpy((char*)list
->payload
+ idx
->payload_size
*(list
->nregs
-1), idx
->payload
, idx
->payload_size
);
154 if ( idx
->rid_prev
==rid
)
156 if ( idx
->start_prev
> reg
.start
|| (idx
->start_prev
==reg
.start
&& idx
->end_prev
>reg
.end
) )
158 hts_log_error("The regions are not sorted: %s:%d-%d is before %s:%d-%d",
159 idx
->str
.s
,idx
->start_prev
+1,idx
->end_prev
+1,idx
->str
.s
,reg
.start
+1,reg
.end
+1);
164 idx
->start_prev
= reg
.start
;
165 idx
->end_prev
= reg
.end
;
169 regidx_t
*regidx_init(const char *fname
, regidx_parse_f parser
, regidx_free_f free_f
, size_t payload_size
, void *usr_dat
)
173 if ( !fname
) parser
= regidx_parse_tab
;
176 int len
= strlen(fname
);
177 if ( len
>=7 && !strcasecmp(".bed.gz",fname
+len
-7) )
178 parser
= regidx_parse_bed
;
179 else if ( len
>=8 && !strcasecmp(".bed.bgz",fname
+len
-8) )
180 parser
= regidx_parse_bed
;
181 else if ( len
>=4 && !strcasecmp(".bed",fname
+len
-4) )
182 parser
= regidx_parse_bed
;
184 parser
= regidx_parse_tab
;
188 regidx_t
*idx
= (regidx_t
*) calloc(1,sizeof(regidx_t
));
192 idx
->seq2regs
= khash_str2int_init();
194 idx
->start_prev
= -1;
196 idx
->payload_size
= payload_size
;
197 if ( payload_size
) idx
->payload
= malloc(payload_size
);
199 if ( !fname
) return idx
;
201 kstring_t str
= {0,0,0};
203 htsFile
*fp
= hts_open(fname
,"r");
204 if ( !fp
) goto error
;
206 while ( hts_getline(fp
, KS_SEP_LINE
, &str
) > 0 )
208 if ( regidx_insert(idx
, str
.s
) ) goto error
;
210 regidx_insert(idx
, NULL
);
218 if ( fp
) hts_close(fp
);
223 void regidx_destroy(regidx_t
*idx
)
226 for (i
=0; i
<idx
->nseq
; i
++)
228 reglist_t
*list
= &idx
->seq
[i
];
231 for (j
=0; j
<list
->nregs
; j
++)
232 idx
->free((char*)list
->payload
+ idx
->payload_size
*j
);
238 free(idx
->seq_names
);
242 khash_str2int_destroy_free(idx
->seq2regs
);
246 int regidx_overlap(regidx_t
*idx
, const char *chr
, uint32_t from
, uint32_t to
, regitr_t
*itr
)
248 if ( itr
) itr
->i
= itr
->n
= 0;
251 if ( khash_str2int_get(idx
->seq2regs
, chr
, &iseq
)!=0 ) return 0; // no such sequence
253 reglist_t
*list
= &idx
->seq
[iseq
];
254 if ( !list
->nregs
) return 0;
256 int i
, ibeg
= from
>>LIDX_SHIFT
;
257 int ireg
= ibeg
< list
->nidx
? list
->idx
[ibeg
] : list
->idx
[ list
->nidx
- 1 ];
260 // linear search; if slow, replace with binary search
261 if ( ibeg
> list
->nidx
) ibeg
= list
->nidx
;
262 for (i
=ibeg
- 1; i
>=0; i
--)
263 if ( list
->idx
[i
] >=0 ) break;
264 ireg
= i
>=0 ? list
->idx
[i
] : 0;
266 for (i
=ireg
; i
<list
->nregs
; i
++)
268 if ( list
->regs
[i
].start
> to
) return 0; // no match
269 if ( list
->regs
[i
].end
>= from
&& list
->regs
[i
].start
<= to
) break; // found
272 if ( i
>=list
->nregs
) return 0; // no match
274 if ( !itr
) return 1;
277 itr
->n
= list
->nregs
- i
;
278 itr
->reg
= &idx
->seq
[iseq
].regs
[i
];
279 if ( idx
->payload_size
)
280 itr
->payload
= (char*)idx
->seq
[iseq
].payload
+ i
*idx
->payload_size
;
287 int regidx_parse_bed(const char *line
, char **chr_beg
, char **chr_end
, reg_t
*reg
, void *payload
, void *usr
)
289 char *ss
= (char*) line
;
290 while ( *ss
&& isspace_c(*ss
) ) ss
++;
291 if ( !*ss
) return -1; // skip blank lines
292 if ( *ss
=='#' ) return -1; // skip comments
295 while ( *se
&& !isspace_c(*se
) ) se
++;
296 if ( !*se
) { hts_log_error("Could not parse bed line: %s", line
); return -2; }
302 reg
->start
= hts_parse_decimal(ss
, &se
, 0);
303 if ( ss
==se
) { hts_log_error("Could not parse bed line: %s", line
); return -2; }
306 reg
->end
= hts_parse_decimal(ss
, &se
, 0) - 1;
307 if ( ss
==se
) { hts_log_error("Could not parse bed line: %s", line
); return -2; }
312 int regidx_parse_tab(const char *line
, char **chr_beg
, char **chr_end
, reg_t
*reg
, void *payload
, void *usr
)
314 char *ss
= (char*) line
;
315 while ( *ss
&& isspace_c(*ss
) ) ss
++;
316 if ( !*ss
) return -1; // skip blank lines
317 if ( *ss
=='#' ) return -1; // skip comments
320 while ( *se
&& !isspace_c(*se
) ) se
++;
321 if ( !*se
) { hts_log_error("Could not parse bed line: %s", line
); return -2; }
327 reg
->start
= hts_parse_decimal(ss
, &se
, 0) - 1;
328 if ( ss
==se
) { hts_log_error("Could not parse bed line: %s", line
); return -2; }
330 if ( !se
[0] || !se
[1] )
331 reg
->end
= reg
->start
;
335 reg
->end
= hts_parse_decimal(ss
, &se
, 0);
336 if ( ss
==se
) reg
->end
= reg
->start
;