2 Copyright (c) 2012-2016 Genome Research Ltd.
3 Author: James Bonfield <jkb@sanger.ac.uk>
5 Redistribution and use in source and binary forms, with or without
6 modification, are permitted provided that the following conditions are met:
8 1. Redistributions of source code must retain the above copyright notice,
9 this list of conditions and the following disclaimer.
11 2. Redistributions in binary form must reproduce the above copyright notice,
12 this list of conditions and the following disclaimer in the documentation
13 and/or other materials provided with the distribution.
15 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
16 Institute nor the names of its contributors may be used to endorse or promote
17 products derived from this software without specific prior written permission.
19 THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
20 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22 DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
23 FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 * CRAM I/O primitives.
34 * - ITF8 encoding and decoding.
36 * - Zlib inflating and deflating (memory)
37 * - CRAM basic data structure reading and writing
38 * - File opening / closing
39 * - Reference sequence handling
43 * TODO: BLOCK_GROW, BLOCK_RESIZE, BLOCK_APPEND and itf8_put_blk all need
44 * a way to return errors for when malloc fails.
62 #include <sys/types.h>
68 #include "cram/cram.h"
70 #include "htslib/hts.h"
71 #include "cram/open_trace_file.h"
72 #include "cram/rANS_static.h"
77 #include <sys/syscall.h>
78 #define gettid() (int)syscall(SYS_gettid)
80 #define RP(...) fprintf (stderr, __VA_ARGS__)
85 #include "htslib/hfile.h"
86 #include "htslib/bgzf.h"
87 #include "htslib/faidx.h"
88 #include "hts_internal.h"
91 #define PATH_MAX FILENAME_MAX
98 /* ----------------------------------------------------------------------
99 * ITF8 encoding and decoding.
101 * Also see the itf8_get and itf8_put macros in cram_io.h
105 * LEGACY: consider using itf8_decode_crc.
107 * Reads an integer in ITF-8 encoding from 'cp' and stores it in
110 * Returns the number of bytes read on success
113 int itf8_decode(cram_fd
*fd
, int32_t *val_p
) {
114 static int nbytes
[16] = {
115 0,0,0,0, 0,0,0,0, // 0000xxxx - 0111xxxx
116 1,1,1,1, // 1000xxxx - 1011xxxx
117 2,2, // 1100xxxx - 1101xxxx
122 static int nbits
[16] = {
123 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, // 0000xxxx - 0111xxxx
124 0x3f, 0x3f, 0x3f, 0x3f, // 1000xxxx - 1011xxxx
125 0x1f, 0x1f, // 1100xxxx - 1101xxxx
130 int32_t val
= hgetc(fd
->fp
);
134 int i
= nbytes
[val
>>4];
135 val
&= nbits
[val
>>4];
143 val
= (val
<<8) | (unsigned char)hgetc(fd
->fp
);
148 val
= (val
<<8) | (unsigned char)hgetc(fd
->fp
);
149 val
= (val
<<8) | (unsigned char)hgetc(fd
->fp
);
154 val
= (val
<<8) | (unsigned char)hgetc(fd
->fp
);
155 val
= (val
<<8) | (unsigned char)hgetc(fd
->fp
);
156 val
= (val
<<8) | (unsigned char)hgetc(fd
->fp
);
160 case 4: // really 3.5 more, why make it different?
161 val
= (val
<<8) | (unsigned char)hgetc(fd
->fp
);
162 val
= (val
<<8) | (unsigned char)hgetc(fd
->fp
);
163 val
= (val
<<8) | (unsigned char)hgetc(fd
->fp
);
164 val
= (val
<<4) | (((unsigned char)hgetc(fd
->fp
)) & 0x0f);
171 int itf8_decode_crc(cram_fd
*fd
, int32_t *val_p
, uint32_t *crc
) {
172 static int nbytes
[16] = {
173 0,0,0,0, 0,0,0,0, // 0000xxxx - 0111xxxx
174 1,1,1,1, // 1000xxxx - 1011xxxx
175 2,2, // 1100xxxx - 1101xxxx
180 static int nbits
[16] = {
181 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, // 0000xxxx - 0111xxxx
182 0x3f, 0x3f, 0x3f, 0x3f, // 1000xxxx - 1011xxxx
183 0x1f, 0x1f, // 1100xxxx - 1101xxxx
189 int32_t val
= hgetc(fd
->fp
);
195 int i
= nbytes
[val
>>4];
196 val
&= nbits
[val
>>4];
201 *crc
= crc32(*crc
, c
, 1);
205 val
= (val
<<8) | (c
[1]=hgetc(fd
->fp
));
207 *crc
= crc32(*crc
, c
, 2);
211 val
= (val
<<8) | (c
[1]=hgetc(fd
->fp
));
212 val
= (val
<<8) | (c
[2]=hgetc(fd
->fp
));
214 *crc
= crc32(*crc
, c
, 3);
218 val
= (val
<<8) | (c
[1]=hgetc(fd
->fp
));
219 val
= (val
<<8) | (c
[2]=hgetc(fd
->fp
));
220 val
= (val
<<8) | (c
[3]=hgetc(fd
->fp
));
222 *crc
= crc32(*crc
, c
, 4);
225 case 4: // really 3.5 more, why make it different?
228 uv
= (uv
<<8) | (c
[1]=hgetc(fd
->fp
));
229 uv
= (uv
<<8) | (c
[2]=hgetc(fd
->fp
));
230 uv
= (uv
<<8) | (c
[3]=hgetc(fd
->fp
));
231 uv
= (uv
<<4) | (((c
[4]=hgetc(fd
->fp
))) & 0x0f);
232 *val_p
= uv
< 0x80000000UL
? uv
: -((int32_t) (0xffffffffUL
- uv
)) - 1;
233 *crc
= crc32(*crc
, c
, 5);
241 * Encodes and writes a single integer in ITF-8 format.
242 * Returns 0 on success
245 int itf8_encode(cram_fd
*fd
, int32_t val
) {
247 int len
= itf8_put(buf
, val
);
248 return hwrite(fd
->fp
, buf
, len
) == len
? 0 : -1;
251 const int itf8_bytes
[16] = {
252 1, 1, 1, 1, 1, 1, 1, 1,
253 2, 2, 2, 2, 3, 3, 4, 5
256 const int ltf8_bytes
[256] = {
257 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
258 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
259 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
260 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
262 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
263 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
264 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
265 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
267 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
268 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
269 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
270 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
272 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
273 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
275 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
277 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 8, 9
281 * LEGACY: consider using ltf8_decode_crc.
283 int ltf8_decode(cram_fd
*fd
, int64_t *val_p
) {
284 int c
= hgetc(fd
->fp
);
285 int64_t val
= (unsigned char)c
;
293 } else if (val
< 0xc0) {
294 val
= (val
<<8) | (unsigned char)hgetc(fd
->fp
);
295 *val_p
= val
& (((1LL<<(6+8)))-1);
298 } else if (val
< 0xe0) {
299 val
= (val
<<8) | (unsigned char)hgetc(fd
->fp
);
300 val
= (val
<<8) | (unsigned char)hgetc(fd
->fp
);
301 *val_p
= val
& ((1LL<<(5+2*8))-1);
304 } else if (val
< 0xf0) {
305 val
= (val
<<8) | (unsigned char)hgetc(fd
->fp
);
306 val
= (val
<<8) | (unsigned char)hgetc(fd
->fp
);
307 val
= (val
<<8) | (unsigned char)hgetc(fd
->fp
);
308 *val_p
= val
& ((1LL<<(4+3*8))-1);
311 } else if (val
< 0xf8) {
312 val
= (val
<<8) | (unsigned char)hgetc(fd
->fp
);
313 val
= (val
<<8) | (unsigned char)hgetc(fd
->fp
);
314 val
= (val
<<8) | (unsigned char)hgetc(fd
->fp
);
315 val
= (val
<<8) | (unsigned char)hgetc(fd
->fp
);
316 *val_p
= val
& ((1LL<<(3+4*8))-1);
319 } else if (val
< 0xfc) {
320 val
= (val
<<8) | (unsigned char)hgetc(fd
->fp
);
321 val
= (val
<<8) | (unsigned char)hgetc(fd
->fp
);
322 val
= (val
<<8) | (unsigned char)hgetc(fd
->fp
);
323 val
= (val
<<8) | (unsigned char)hgetc(fd
->fp
);
324 val
= (val
<<8) | (unsigned char)hgetc(fd
->fp
);
325 *val_p
= val
& ((1LL<<(2+5*8))-1);
328 } else if (val
< 0xfe) {
329 val
= (val
<<8) | (unsigned char)hgetc(fd
->fp
);
330 val
= (val
<<8) | (unsigned char)hgetc(fd
->fp
);
331 val
= (val
<<8) | (unsigned char)hgetc(fd
->fp
);
332 val
= (val
<<8) | (unsigned char)hgetc(fd
->fp
);
333 val
= (val
<<8) | (unsigned char)hgetc(fd
->fp
);
334 val
= (val
<<8) | (unsigned char)hgetc(fd
->fp
);
335 *val_p
= val
& ((1LL<<(1+6*8))-1);
338 } else if (val
< 0xff) {
339 val
= (val
<<8) | (unsigned char)hgetc(fd
->fp
);
340 val
= (val
<<8) | (unsigned char)hgetc(fd
->fp
);
341 val
= (val
<<8) | (unsigned char)hgetc(fd
->fp
);
342 val
= (val
<<8) | (unsigned char)hgetc(fd
->fp
);
343 val
= (val
<<8) | (unsigned char)hgetc(fd
->fp
);
344 val
= (val
<<8) | (unsigned char)hgetc(fd
->fp
);
345 val
= (val
<<8) | (unsigned char)hgetc(fd
->fp
);
346 *val_p
= val
& ((1LL<<(7*8))-1);
350 val
= (val
<<8) | (unsigned char)hgetc(fd
->fp
);
351 val
= (val
<<8) | (unsigned char)hgetc(fd
->fp
);
352 val
= (val
<<8) | (unsigned char)hgetc(fd
->fp
);
353 val
= (val
<<8) | (unsigned char)hgetc(fd
->fp
);
354 val
= (val
<<8) | (unsigned char)hgetc(fd
->fp
);
355 val
= (val
<<8) | (unsigned char)hgetc(fd
->fp
);
356 val
= (val
<<8) | (unsigned char)hgetc(fd
->fp
);
357 val
= (val
<<8) | (unsigned char)hgetc(fd
->fp
);
364 int ltf8_decode_crc(cram_fd
*fd
, int64_t *val_p
, uint32_t *crc
) {
366 int64_t val
= (unsigned char)hgetc(fd
->fp
);
374 *crc
= crc32(*crc
, c
, 1);
377 } else if (val
< 0xc0) {
378 val
= (val
<<8) | (c
[1]=hgetc(fd
->fp
));;
379 *val_p
= val
& (((1LL<<(6+8)))-1);
380 *crc
= crc32(*crc
, c
, 2);
383 } else if (val
< 0xe0) {
384 val
= (val
<<8) | (c
[1]=hgetc(fd
->fp
));;
385 val
= (val
<<8) | (c
[2]=hgetc(fd
->fp
));;
386 *val_p
= val
& ((1LL<<(5+2*8))-1);
387 *crc
= crc32(*crc
, c
, 3);
390 } else if (val
< 0xf0) {
391 val
= (val
<<8) | (c
[1]=hgetc(fd
->fp
));;
392 val
= (val
<<8) | (c
[2]=hgetc(fd
->fp
));;
393 val
= (val
<<8) | (c
[3]=hgetc(fd
->fp
));;
394 *val_p
= val
& ((1LL<<(4+3*8))-1);
395 *crc
= crc32(*crc
, c
, 4);
398 } else if (val
< 0xf8) {
399 val
= (val
<<8) | (c
[1]=hgetc(fd
->fp
));;
400 val
= (val
<<8) | (c
[2]=hgetc(fd
->fp
));;
401 val
= (val
<<8) | (c
[3]=hgetc(fd
->fp
));;
402 val
= (val
<<8) | (c
[4]=hgetc(fd
->fp
));;
403 *val_p
= val
& ((1LL<<(3+4*8))-1);
404 *crc
= crc32(*crc
, c
, 5);
407 } else if (val
< 0xfc) {
408 val
= (val
<<8) | (c
[1]=hgetc(fd
->fp
));;
409 val
= (val
<<8) | (c
[2]=hgetc(fd
->fp
));;
410 val
= (val
<<8) | (c
[3]=hgetc(fd
->fp
));;
411 val
= (val
<<8) | (c
[4]=hgetc(fd
->fp
));;
412 val
= (val
<<8) | (c
[5]=hgetc(fd
->fp
));;
413 *val_p
= val
& ((1LL<<(2+5*8))-1);
414 *crc
= crc32(*crc
, c
, 6);
417 } else if (val
< 0xfe) {
418 val
= (val
<<8) | (c
[1]=hgetc(fd
->fp
));;
419 val
= (val
<<8) | (c
[2]=hgetc(fd
->fp
));;
420 val
= (val
<<8) | (c
[3]=hgetc(fd
->fp
));;
421 val
= (val
<<8) | (c
[4]=hgetc(fd
->fp
));;
422 val
= (val
<<8) | (c
[5]=hgetc(fd
->fp
));;
423 val
= (val
<<8) | (c
[6]=hgetc(fd
->fp
));;
424 *val_p
= val
& ((1LL<<(1+6*8))-1);
425 *crc
= crc32(*crc
, c
, 7);
428 } else if (val
< 0xff) {
429 val
= (val
<<8) | (c
[1]=hgetc(fd
->fp
));;
430 val
= (val
<<8) | (c
[2]=hgetc(fd
->fp
));;
431 val
= (val
<<8) | (c
[3]=hgetc(fd
->fp
));;
432 val
= (val
<<8) | (c
[4]=hgetc(fd
->fp
));;
433 val
= (val
<<8) | (c
[5]=hgetc(fd
->fp
));;
434 val
= (val
<<8) | (c
[6]=hgetc(fd
->fp
));;
435 val
= (val
<<8) | (c
[7]=hgetc(fd
->fp
));;
436 *val_p
= val
& ((1LL<<(7*8))-1);
437 *crc
= crc32(*crc
, c
, 8);
441 val
= (val
<<8) | (c
[1]=hgetc(fd
->fp
));;
442 val
= (val
<<8) | (c
[2]=hgetc(fd
->fp
));;
443 val
= (val
<<8) | (c
[3]=hgetc(fd
->fp
));;
444 val
= (val
<<8) | (c
[4]=hgetc(fd
->fp
));;
445 val
= (val
<<8) | (c
[5]=hgetc(fd
->fp
));;
446 val
= (val
<<8) | (c
[6]=hgetc(fd
->fp
));;
447 val
= (val
<<8) | (c
[7]=hgetc(fd
->fp
));;
448 val
= (val
<<8) | (c
[8]=hgetc(fd
->fp
));;
449 *crc
= crc32(*crc
, c
, 9);
457 * Pushes a value in ITF8 format onto the end of a block.
458 * This shouldn't be used for high-volume data as it is not the fastest
461 * Returns the number of bytes written
463 int itf8_put_blk(cram_block
*blk
, int val
) {
467 sz
= itf8_put(buf
, val
);
468 BLOCK_APPEND(blk
, buf
, sz
);
473 * Decodes a 32-bit little endian value from fd and stores in val.
475 * Returns the number of bytes read on success
478 int int32_decode(cram_fd
*fd
, int32_t *val
) {
480 if (4 != hread(fd
->fp
, &i
, 4))
488 * Encodes a 32-bit little endian value 'val' and writes to fd.
490 * Returns the number of bytes written on success
493 int int32_encode(cram_fd
*fd
, int32_t val
) {
495 if (4 != hwrite(fd
->fp
, &val
, 4))
501 /* As int32_decoded/encode, but from/to blocks instead of cram_fd */
502 int int32_get_blk(cram_block
*b
, int32_t *val
) {
503 if (b
->uncomp_size
- BLOCK_SIZE(b
) < 4)
508 (b
->data
[b
->byte
+1] << 8) |
509 (b
->data
[b
->byte
+2] << 16) |
510 (b
->data
[b
->byte
+3] << 24);
515 /* As int32_decoded/encode, but from/to blocks instead of cram_fd */
516 int int32_put_blk(cram_block
*b
, int32_t val
) {
518 cp
[0] = ( val
& 0xff);
519 cp
[1] = ((val
>>8) & 0xff);
520 cp
[2] = ((val
>>16) & 0xff);
521 cp
[3] = ((val
>>24) & 0xff);
523 BLOCK_APPEND(b
, cp
, 4);
524 return b
->data
? 0 : -1;
527 /* ----------------------------------------------------------------------
528 * zlib compression code - from Gap5's tg_iface_g.c
529 * They're static here as they're only used within the cram_compress_block
530 * and cram_uncompress_block functions, which are the external interface.
532 char *zlib_mem_inflate(char *cdata
, size_t csize
, size_t *size
) {
534 unsigned char *data
= NULL
; /* Uncompressed output */
538 /* Starting point at uncompressed size, and scale after that */
539 data
= malloc(data_alloc
= csize
*1.2+100);
543 /* Initialise zlib stream */
544 s
.zalloc
= Z_NULL
; /* use default allocation functions */
547 s
.next_in
= (unsigned char *)cdata
;
551 s
.avail_out
= data_alloc
;
554 //err = inflateInit(&s);
555 err
= inflateInit2(&s
, 15 + 32);
557 fprintf(stderr
, "zlib inflateInit error: %s\n", s
.msg
);
562 /* Decode to 'data' array */
564 unsigned char *data_tmp
;
567 s
.next_out
= &data
[s
.total_out
];
568 err
= inflate(&s
, Z_NO_FLUSH
);
569 if (err
== Z_STREAM_END
)
573 fprintf(stderr
, "zlib inflate error: %s\n", s
.msg
);
579 /* More to come, so realloc based on growth so far */
580 alloc_inc
= (double)s
.avail_in
/s
.total_in
* s
.total_out
+ 100;
581 data
= realloc((data_tmp
= data
), data_alloc
+= alloc_inc
);
586 s
.avail_out
+= alloc_inc
;
594 static char *zlib_mem_deflate(char *data
, size_t size
, size_t *cdata_size
,
595 int level
, int strat
) {
597 unsigned char *cdata
= NULL
; /* Compressed output */
602 cdata
= malloc(cdata_alloc
= size
*1.05+100);
607 /* Initialise zlib stream */
608 s
.zalloc
= Z_NULL
; /* use default allocation functions */
611 s
.next_in
= (unsigned char *)data
;
615 s
.avail_out
= cdata_alloc
;
617 s
.data_type
= Z_BINARY
;
619 err
= deflateInit2(&s
, level
, Z_DEFLATED
, 15|16, 9, strat
);
621 fprintf(stderr
, "zlib deflateInit2 error: %s\n", s
.msg
);
625 /* Encode to 'cdata' array */
627 s
.next_out
= &cdata
[cdata_pos
];
628 s
.avail_out
= cdata_alloc
- cdata_pos
;
629 if (cdata_alloc
- cdata_pos
<= 0) {
630 fprintf(stderr
, "Deflate produced larger output than expected. Abort\n");
633 err
= deflate(&s
, Z_NO_FLUSH
);
634 cdata_pos
= cdata_alloc
- s
.avail_out
;
636 fprintf(stderr
, "zlib deflate error: %s\n", s
.msg
);
640 if (deflate(&s
, Z_FINISH
) != Z_STREAM_END
) {
641 fprintf(stderr
, "zlib deflate error: %s\n", s
.msg
);
643 *cdata_size
= s
.total_out
;
645 if (deflateEnd(&s
) != Z_OK
) {
646 fprintf(stderr
, "zlib deflate error: %s\n", s
.msg
);
648 return (char *)cdata
;
652 /* ------------------------------------------------------------------------ */
654 * Data compression routines using liblzma (xz)
656 * On a test set this shrunk the main db from 136157104 bytes to 114796168, but
657 * caused tg_index to grow from 2m43.707s to 15m3.961s. Exporting as bfastq
658 * went from 18.3s to 36.3s. So decompression suffers too, but not as bad
659 * as compression times.
661 * For now we disable this functionality. If it's to be reenabled make sure you
662 * improve the mem_inflate implementation as it's just a test hack at the
666 static char *lzma_mem_deflate(char *data
, size_t size
, size_t *cdata_size
,
669 size_t out_size
= lzma_stream_buffer_bound(size
);
672 out
= malloc(out_size
);
674 /* Single call compression */
675 if (LZMA_OK
!= lzma_easy_buffer_encode(level
, LZMA_CHECK_CRC32
, NULL
,
676 (uint8_t *)data
, size
,
677 (uint8_t *)out
, cdata_size
,
684 static char *lzma_mem_inflate(char *cdata
, size_t csize
, size_t *size
) {
685 lzma_stream strm
= LZMA_STREAM_INIT
;
686 size_t out_size
= 0, out_pos
= 0;
690 /* Initiate the decoder */
691 if (LZMA_OK
!= lzma_stream_decoder(&strm
, lzma_easy_decoder_memusage(9), 0))
695 strm
.avail_in
= csize
;
696 strm
.next_in
= (uint8_t *)cdata
;
698 for (;strm
.avail_in
;) {
699 if (strm
.avail_in
> out_size
- out_pos
) {
700 out_size
+= strm
.avail_in
* 4 + 32768;
701 out
= realloc(out
, out_size
);
703 strm
.avail_out
= out_size
- out_pos
;
704 strm
.next_out
= (uint8_t *)&out
[out_pos
];
706 r
= lzma_code(&strm
, LZMA_RUN
);
707 if (LZMA_OK
!= r
&& LZMA_STREAM_END
!= r
) {
708 fprintf(stderr
, "[E::%s] LZMA decode failure (error %d)\n", __func__
, r
);
712 out_pos
= strm
.total_out
;
714 if (r
== LZMA_STREAM_END
)
718 /* finish up any unflushed data; necessary? */
719 r
= lzma_code(&strm
, LZMA_FINISH
);
720 if (r
!= LZMA_OK
&& r
!= LZMA_STREAM_END
) {
721 fprintf(stderr
, "r=%d\n", r
);
725 out
= realloc(out
, strm
.total_out
);
726 *size
= strm
.total_out
;
734 /* ----------------------------------------------------------------------
735 * CRAM blocks - the dynamically growable data block. We have code to
736 * create, update, (un)compress and read/write.
738 * These are derived from the deflate_interlaced.c blocks, but with the
739 * CRAM extension of content types and IDs.
743 * Allocates a new cram_block structure with a specified content_type and
746 * Returns block pointer on success
749 cram_block
*cram_new_block(enum cram_content_type content_type
,
751 cram_block
*b
= malloc(sizeof(*b
));
754 b
->method
= b
->orig_method
= RAW
;
755 b
->content_type
= content_type
;
756 b
->content_id
= content_id
;
768 * Reads a block from a cram file.
769 * Returns cram_block pointer on success.
772 cram_block
*cram_read_block(cram_fd
*fd
) {
773 cram_block
*b
= malloc(sizeof(*b
));
779 //fprintf(stderr, "Block at %d\n", (int)ftell(fd->fp));
781 if (-1 == (b
->method
= hgetc(fd
->fp
))) { free(b
); return NULL
; }
782 c
= b
->method
; crc
= crc32(crc
, &c
, 1);
783 if (-1 == (b
->content_type
= hgetc(fd
->fp
))) { free(b
); return NULL
; }
784 c
= b
->content_type
; crc
= crc32(crc
, &c
, 1);
785 if (-1 == itf8_decode_crc(fd
, &b
->content_id
, &crc
)) { free(b
); return NULL
; }
786 if (-1 == itf8_decode_crc(fd
, &b
->comp_size
, &crc
)) { free(b
); return NULL
; }
787 if (-1 == itf8_decode_crc(fd
, &b
->uncomp_size
, &crc
)) { free(b
); return NULL
; }
789 // fprintf(stderr, " method %d, ctype %d, cid %d, csize %d, ucsize %d\n",
790 // b->method, b->content_type, b->content_id, b->comp_size, b->uncomp_size);
792 if (b
->method
== RAW
) {
793 if (b
->uncomp_size
< 0) { free(b
); return NULL
; }
794 b
->alloc
= b
->uncomp_size
;
795 if (!(b
->data
= malloc(b
->uncomp_size
))){ free(b
); return NULL
; }
796 if (b
->uncomp_size
!= hread(fd
->fp
, b
->data
, b
->uncomp_size
)) {
802 if (b
->comp_size
< 0) { free(b
); return NULL
; }
803 b
->alloc
= b
->comp_size
;
804 if (!(b
->data
= malloc(b
->comp_size
))) { free(b
); return NULL
; }
805 if (b
->comp_size
!= hread(fd
->fp
, b
->data
, b
->comp_size
)) {
812 if (CRAM_MAJOR_VERS(fd
->version
) >= 3) {
813 if (-1 == int32_decode(fd
, (int32_t *)&b
->crc32
)) {
818 crc
= crc32(crc
, b
->data
? b
->data
: (uc
*)"", b
->alloc
);
819 if (crc
!= b
->crc32
) {
820 fprintf(stderr
, "Block CRC32 failure\n");
827 b
->orig_method
= b
->method
;
837 * Computes the size of a cram block, including the block
840 uint32_t cram_block_size(cram_block
*b
) {
841 unsigned char dat
[100], *cp
= dat
;;
845 *cp
++ = b
->content_type
;
846 cp
+= itf8_put((char*)cp
, b
->content_id
);
847 cp
+= itf8_put((char*)cp
, b
->comp_size
);
848 cp
+= itf8_put((char*)cp
, b
->uncomp_size
);
851 sz
+= b
->method
== RAW
? b
->uncomp_size
: b
->comp_size
;
857 * Writes a CRAM block.
858 * Returns 0 on success
861 int cram_write_block(cram_fd
*fd
, cram_block
*b
) {
862 assert(b
->method
!= RAW
|| (b
->comp_size
== b
->uncomp_size
));
864 if (hputc(b
->method
, fd
->fp
) == EOF
) return -1;
865 if (hputc(b
->content_type
, fd
->fp
) == EOF
) return -1;
866 if (itf8_encode(fd
, b
->content_id
) == -1) return -1;
867 if (itf8_encode(fd
, b
->comp_size
) == -1) return -1;
868 if (itf8_encode(fd
, b
->uncomp_size
) == -1) return -1;
871 if (b
->method
== RAW
) {
872 if (b
->uncomp_size
!= hwrite(fd
->fp
, b
->data
, b
->uncomp_size
))
875 if (b
->comp_size
!= hwrite(fd
->fp
, b
->data
, b
->comp_size
))
879 // Absent blocks should be size 0
880 assert(b
->method
== RAW
&& b
->uncomp_size
== 0);
883 if (CRAM_MAJOR_VERS(fd
->version
) >= 3) {
884 unsigned char dat
[100], *cp
= dat
;;
888 *cp
++ = b
->content_type
;
889 cp
+= itf8_put((char*)cp
, b
->content_id
);
890 cp
+= itf8_put((char*)cp
, b
->comp_size
);
891 cp
+= itf8_put((char*)cp
, b
->uncomp_size
);
892 crc
= crc32(0L, dat
, cp
-dat
);
894 if (b
->method
== RAW
) {
895 b
->crc32
= crc32(crc
, b
->data
? b
->data
: (uc
*)"", b
->uncomp_size
);
897 b
->crc32
= crc32(crc
, b
->data
? b
->data
: (uc
*)"", b
->comp_size
);
900 if (-1 == int32_encode(fd
, b
->crc32
))
908 * Frees a CRAM block, deallocating internal data too.
910 void cram_free_block(cram_block
*b
) {
919 * Uncompresses a CRAM block, if compressed.
921 int cram_uncompress_block(cram_block
*b
) {
923 size_t uncomp_size
= 0;
925 if (b
->uncomp_size
== 0) {
936 uncomp
= zlib_mem_inflate((char *)b
->data
, b
->comp_size
, &uncomp_size
);
939 if ((int)uncomp_size
!= b
->uncomp_size
) {
944 b
->data
= (unsigned char *)uncomp
;
945 b
->alloc
= uncomp_size
;
951 unsigned int usize
= b
->uncomp_size
;
952 if (!(uncomp
= malloc(usize
)))
954 if (BZ_OK
!= BZ2_bzBuffToBuffDecompress(uncomp
, &usize
,
955 (char *)b
->data
, b
->comp_size
,
961 b
->data
= (unsigned char *)uncomp
;
964 b
->uncomp_size
= usize
; // Just incase it differs
969 fprintf(stderr
, "Bzip2 compression is not compiled into this "
970 "version.\nPlease rebuild and try again.\n");
976 uncomp
= lzma_mem_inflate((char *)b
->data
, b
->comp_size
, &uncomp_size
);
979 if ((int)uncomp_size
!= b
->uncomp_size
)
982 b
->data
= (unsigned char *)uncomp
;
983 b
->alloc
= uncomp_size
;
988 fprintf(stderr
, "Lzma compression is not compiled into this "
989 "version.\nPlease rebuild and try again.\n");
995 unsigned int usize
= b
->uncomp_size
, usize2
;
996 uncomp
= (char *)rans_uncompress(b
->data
, b
->comp_size
, &usize2
);
997 if (!uncomp
|| usize
!= usize2
)
1000 b
->data
= (unsigned char *)uncomp
;
1003 b
->uncomp_size
= usize2
; // Just incase it differs
1004 //fprintf(stderr, "Expanded %d to %d\n", b->comp_size, b->uncomp_size);
1015 static char *cram_compress_by_method(char *in
, size_t in_size
,
1017 enum cram_block_method method
,
1018 int level
, int strat
) {
1021 return zlib_mem_deflate(in
, in_size
, out_size
, level
, strat
);
1025 unsigned int comp_size
= in_size
*1.01 + 600;
1026 char *comp
= malloc(comp_size
);
1030 if (BZ_OK
!= BZ2_bzBuffToBuffCompress(comp
, &comp_size
,
1036 *out_size
= comp_size
;
1045 return lzma_mem_deflate(in
, in_size
, out_size
, level
);
1051 unsigned int out_size_i
;
1053 cp
= rans_compress((unsigned char *)in
, in_size
, &out_size_i
, 0);
1054 *out_size
= out_size_i
;
1059 unsigned int out_size_i
;
1062 cp
= rans_compress((unsigned char *)in
, in_size
, &out_size_i
, 1);
1063 *out_size
= out_size_i
;
1079 * Compresses a block using one of two different zlib strategies. If we only
1080 * want one choice set strat2 to be -1.
1082 * The logic here is that sometimes Z_RLE does a better job than Z_FILTERED
1083 * or Z_DEFAULT_STRATEGY on quality data. If so, we'd rather use it as it is
1084 * significantly faster.
1086 * Method and level -1 implies defaults, as specified in cram_fd.
1088 int cram_compress_block(cram_fd
*fd
, cram_block
*b
, cram_metrics
*metrics
,
1089 int method
, int level
) {
1092 size_t comp_size
= 0;
1095 if (b
->method
!= RAW
) {
1096 // Maybe already compressed if s->block[0] was compressed and
1097 // we have e.g. s->block[DS_BA] set to s->block[0] due to only
1098 // one base type present and hence using E_HUFFMAN on block 0.
1099 // A second explicit attempt to compress the same block then
1115 //fprintf(stderr, "IN: block %d, sz %d\n", b->content_id, b->uncomp_size);
1117 if (method
== RAW
|| level
== 0 || b
->uncomp_size
== 0) {
1119 b
->comp_size
= b
->uncomp_size
;
1120 //fprintf(stderr, "Skip block id %d\n", b->content_id);
1125 pthread_mutex_lock(&fd
->metrics_lock
);
1126 if (metrics
->trial
> 0 || --metrics
->next_trial
<= 0) {
1127 size_t sz_best
= INT_MAX
;
1128 size_t sz_gz_rle
= 0;
1129 size_t sz_gz_def
= 0;
1130 size_t sz_rans0
= 0;
1131 size_t sz_rans1
= 0;
1132 size_t sz_bzip2
= 0;
1134 int method_best
= 0;
1135 char *c_best
= NULL
, *c
= NULL
;
1137 if (metrics
->revised_method
)
1138 method
= metrics
->revised_method
;
1140 metrics
->revised_method
= method
;
1142 if (metrics
->next_trial
<= 0) {
1143 metrics
->next_trial
= TRIAL_SPAN
;
1144 metrics
->trial
= NTRIALS
;
1145 metrics
->sz_gz_rle
/= 2;
1146 metrics
->sz_gz_def
/= 2;
1147 metrics
->sz_rans0
/= 2;
1148 metrics
->sz_rans1
/= 2;
1149 metrics
->sz_bzip2
/= 2;
1150 metrics
->sz_lzma
/= 2;
1153 pthread_mutex_unlock(&fd
->metrics_lock
);
1155 if (method
& (1<<GZIP_RLE
)) {
1156 c
= cram_compress_by_method((char *)b
->data
, b
->uncomp_size
,
1157 &sz_gz_rle
, GZIP
, 1, Z_RLE
);
1158 if (c
&& sz_best
> sz_gz_rle
) {
1159 sz_best
= sz_gz_rle
;
1160 method_best
= GZIP_RLE
;
1167 sz_gz_rle
= b
->uncomp_size
*2+1000;
1170 //fprintf(stderr, "Block %d; %d->%d\n", b->content_id, b->uncomp_size, sz_gz_rle);
1173 if (method
& (1<<GZIP
)) {
1174 c
= cram_compress_by_method((char *)b
->data
, b
->uncomp_size
,
1175 &sz_gz_def
, GZIP
, level
,
1177 if (c
&& sz_best
> sz_gz_def
) {
1178 sz_best
= sz_gz_def
;
1186 sz_gz_def
= b
->uncomp_size
*2+1000;
1189 //fprintf(stderr, "Block %d; %d->%d\n", b->content_id, b->uncomp_size, sz_gz_def);
1192 if (method
& (1<<RANS0
)) {
1193 c
= cram_compress_by_method((char *)b
->data
, b
->uncomp_size
,
1194 &sz_rans0
, RANS0
, 0, 0);
1195 if (c
&& sz_best
> sz_rans0
) {
1197 method_best
= RANS0
;
1204 sz_rans0
= b
->uncomp_size
*2+1000;
1208 if (method
& (1<<RANS1
)) {
1209 c
= cram_compress_by_method((char *)b
->data
, b
->uncomp_size
,
1210 &sz_rans1
, RANS1
, 0, 0);
1211 if (c
&& sz_best
> sz_rans1
) {
1213 method_best
= RANS1
;
1220 sz_rans1
= b
->uncomp_size
*2+1000;
1224 if (method
& (1<<BZIP2
)) {
1225 c
= cram_compress_by_method((char *)b
->data
, b
->uncomp_size
,
1226 &sz_bzip2
, BZIP2
, level
, 0);
1227 if (c
&& sz_best
> sz_bzip2
) {
1229 method_best
= BZIP2
;
1236 sz_bzip2
= b
->uncomp_size
*2+1000;
1240 if (method
& (1<<LZMA
)) {
1241 c
= cram_compress_by_method((char *)b
->data
, b
->uncomp_size
,
1242 &sz_lzma
, LZMA
, level
, 0);
1243 if (c
&& sz_best
> sz_lzma
) {
1252 sz_lzma
= b
->uncomp_size
*2+1000;
1256 //fprintf(stderr, "sz_best = %d\n", sz_best);
1259 b
->data
= (unsigned char *)c_best
;
1260 //printf("method_best = %s\n", cram_block_method2str(method_best));
1261 b
->method
= method_best
== GZIP_RLE
? GZIP
: method_best
;
1262 b
->comp_size
= sz_best
;
1264 pthread_mutex_lock(&fd
->metrics_lock
);
1265 metrics
->sz_gz_rle
+= sz_gz_rle
;
1266 metrics
->sz_gz_def
+= sz_gz_def
;
1267 metrics
->sz_rans0
+= sz_rans0
;
1268 metrics
->sz_rans1
+= sz_rans1
;
1269 metrics
->sz_bzip2
+= sz_bzip2
;
1270 metrics
->sz_lzma
+= sz_lzma
;
1271 if (--metrics
->trial
== 0) {
1272 int best_method
= RAW
;
1273 int best_sz
= INT_MAX
;
1275 // Scale methods by cost
1276 if (fd
->level
<= 3) {
1277 metrics
->sz_rans1
*= 1.02;
1278 metrics
->sz_gz_def
*= 1.04;
1279 metrics
->sz_bzip2
*= 1.08;
1280 metrics
->sz_lzma
*= 1.10;
1281 } else if (fd
->level
<= 6) {
1282 metrics
->sz_rans1
*= 1.01;
1283 metrics
->sz_gz_def
*= 1.02;
1284 metrics
->sz_bzip2
*= 1.03;
1285 metrics
->sz_lzma
*= 1.05;
1288 if (method
& (1<<GZIP_RLE
) && best_sz
> metrics
->sz_gz_rle
)
1289 best_sz
= metrics
->sz_gz_rle
, best_method
= GZIP_RLE
;
1291 if (method
& (1<<GZIP
) && best_sz
> metrics
->sz_gz_def
)
1292 best_sz
= metrics
->sz_gz_def
, best_method
= GZIP
;
1294 if (method
& (1<<RANS0
) && best_sz
> metrics
->sz_rans0
)
1295 best_sz
= metrics
->sz_rans0
, best_method
= RANS0
;
1297 if (method
& (1<<RANS1
) && best_sz
> metrics
->sz_rans1
)
1298 best_sz
= metrics
->sz_rans1
, best_method
= RANS1
;
1300 if (method
& (1<<BZIP2
) && best_sz
> metrics
->sz_bzip2
)
1301 best_sz
= metrics
->sz_bzip2
, best_method
= BZIP2
;
1303 if (method
& (1<<LZMA
) && best_sz
> metrics
->sz_lzma
)
1304 best_sz
= metrics
->sz_lzma
, best_method
= LZMA
;
1306 if (best_method
== GZIP_RLE
) {
1307 metrics
->method
= GZIP
;
1308 metrics
->strat
= Z_RLE
;
1310 metrics
->method
= best_method
;
1311 metrics
->strat
= Z_FILTERED
;
1314 // If we see at least MAXFAIL trials in a row for a specific
1315 // compression method with more than MAXDELTA aggregate
1316 // size then we drop this from the list of methods used
1317 // for this block type.
1318 #define MAXDELTA 0.20
1320 if (best_method
== GZIP_RLE
) {
1321 metrics
->gz_rle_cnt
= 0;
1322 metrics
->gz_rle_extra
= 0;
1323 } else if (best_sz
< metrics
->sz_gz_rle
) {
1324 double r
= (double)metrics
->sz_gz_rle
/ best_sz
- 1;
1325 if (++metrics
->gz_rle_cnt
>= MAXFAILS
&&
1326 (metrics
->gz_rle_extra
+= r
) >= MAXDELTA
)
1327 method
&= ~(1<<GZIP_RLE
);
1330 if (best_method
== GZIP
) {
1331 metrics
->gz_def_cnt
= 0;
1332 metrics
->gz_def_extra
= 0;
1333 } else if (best_sz
< metrics
->sz_gz_def
) {
1334 double r
= (double)metrics
->sz_gz_def
/ best_sz
- 1;
1335 if (++metrics
->gz_def_cnt
>= MAXFAILS
&&
1336 (metrics
->gz_def_extra
+= r
) >= MAXDELTA
)
1337 method
&= ~(1<<GZIP
);
1340 if (best_method
== RANS0
) {
1341 metrics
->rans0_cnt
= 0;
1342 metrics
->rans0_extra
= 0;
1343 } else if (best_sz
< metrics
->sz_rans0
) {
1344 double r
= (double)metrics
->sz_rans0
/ best_sz
- 1;
1345 if (++metrics
->rans0_cnt
>= MAXFAILS
&&
1346 (metrics
->rans0_extra
+= r
) >= MAXDELTA
)
1347 method
&= ~(1<<RANS0
);
1350 if (best_method
== RANS1
) {
1351 metrics
->rans1_cnt
= 0;
1352 metrics
->rans1_extra
= 0;
1353 } else if (best_sz
< metrics
->sz_rans1
) {
1354 double r
= (double)metrics
->sz_rans1
/ best_sz
- 1;
1355 if (++metrics
->rans1_cnt
>= MAXFAILS
&&
1356 (metrics
->rans1_extra
+= r
) >= MAXDELTA
)
1357 method
&= ~(1<<RANS1
);
1360 if (best_method
== BZIP2
) {
1361 metrics
->bzip2_cnt
= 0;
1362 metrics
->bzip2_extra
= 0;
1363 } else if (best_sz
< metrics
->sz_bzip2
) {
1364 double r
= (double)metrics
->sz_bzip2
/ best_sz
- 1;
1365 if (++metrics
->bzip2_cnt
>= MAXFAILS
&&
1366 (metrics
->bzip2_extra
+= r
) >= MAXDELTA
)
1367 method
&= ~(1<<BZIP2
);
1370 if (best_method
== LZMA
) {
1371 metrics
->lzma_cnt
= 0;
1372 metrics
->lzma_extra
= 0;
1373 } else if (best_sz
< metrics
->sz_lzma
) {
1374 double r
= (double)metrics
->sz_lzma
/ best_sz
- 1;
1375 if (++metrics
->lzma_cnt
>= MAXFAILS
&&
1376 (metrics
->lzma_extra
+= r
) >= MAXDELTA
)
1377 method
&= ~(1<<LZMA
);
1380 //if (method != metrics->revised_method)
1381 // fprintf(stderr, "%d: method from %x to %x\n",
1382 // b->content_id, metrics->revised_method, method);
1383 metrics
->revised_method
= method
;
1385 pthread_mutex_unlock(&fd
->metrics_lock
);
1387 strat
= metrics
->strat
;
1388 method
= metrics
->method
;
1390 pthread_mutex_unlock(&fd
->metrics_lock
);
1391 comp
= cram_compress_by_method((char *)b
->data
, b
->uncomp_size
,
1397 b
->data
= (unsigned char *)comp
;
1398 b
->comp_size
= comp_size
;
1403 // no cached metrics, so just do zlib?
1404 comp
= cram_compress_by_method((char *)b
->data
, b
->uncomp_size
,
1405 &comp_size
, GZIP
, level
, Z_FILTERED
);
1407 fprintf(stderr
, "Compression failed!\n");
1411 b
->data
= (unsigned char *)comp
;
1412 b
->comp_size
= comp_size
;
1417 fprintf(stderr
, "Compressed block ID %d from %d to %d by method %s\n",
1418 b
->content_id
, b
->uncomp_size
, b
->comp_size
,
1419 cram_block_method2str(b
->method
));
1421 if (b
->method
== RANS1
)
1422 b
->method
= RANS0
; // Spec just has RANS (not 0/1) with auto-sensing
1427 cram_metrics
*cram_new_metrics(void) {
1428 cram_metrics
*m
= calloc(1, sizeof(*m
));
1431 m
->trial
= NTRIALS
-1;
1432 m
->next_trial
= TRIAL_SPAN
;
1435 m
->revised_method
= 0;
1440 char *cram_block_method2str(enum cram_block_method m
) {
1442 case RAW
: return "RAW";
1443 case GZIP
: return "GZIP";
1444 case BZIP2
: return "BZIP2";
1445 case LZMA
: return "LZMA";
1446 case RANS0
: return "RANS0";
1447 case RANS1
: return "RANS1";
1448 case GZIP_RLE
: return "GZIP_RLE";
1454 char *cram_content_type2str(enum cram_content_type t
) {
1456 case FILE_HEADER
: return "FILE_HEADER";
1457 case COMPRESSION_HEADER
: return "COMPRESSION_HEADER";
1458 case MAPPED_SLICE
: return "MAPPED_SLICE";
1459 case UNMAPPED_SLICE
: return "UNMAPPED_SLICE";
1460 case EXTERNAL
: return "EXTERNAL";
1461 case CORE
: return "CORE";
1462 case CT_ERROR
: break;
1467 /* ----------------------------------------------------------------------
1468 * Reference sequence handling
1470 * These revolve around the refs_t structure, which may potentially be
1471 * shared between multiple cram_fd.
1473 * We start with refs_create() to allocate an empty refs_t and then
1474 * populate it with @SQ line data using refs_from_header(). This is done on
1475 * cram_open(). Also at start up we can call cram_load_reference() which
1476 * is used with "scramble -r foo.fa". This replaces the fd->refs with the
1477 * new one specified. In either case refs2id() is then called which
1478 * maps ref_entry names to @SQ ids (refs_t->ref_id[]).
1480 * Later, possibly within a thread, we will want to know the actual ref
1481 * seq itself, obtained by calling cram_get_ref(). This may use the
1482 * UR: or M5: fields or the filename specified in the original
1483 * cram_load_reference() call.
1485 * Given the potential for multi-threaded reference usage, we have
1486 * reference counting (sorry for the confusing double use of "ref") to
1487 * track the number of callers interested in any specific reference.
1491 * Frees/unmaps a reference sequence and associated file handles.
1493 static void ref_entry_free_seq(ref_entry
*e
) {
1496 if (e
->seq
&& !e
->mf
)
1503 void refs_free(refs_t
*r
) {
1504 RP("refs_free()\n");
1513 string_pool_destroy(r
->pool
);
1518 for (k
= kh_begin(r
->h_meta
); k
!= kh_end(r
->h_meta
); k
++) {
1521 if (!kh_exist(r
->h_meta
, k
))
1523 if (!(e
= kh_val(r
->h_meta
, k
)))
1525 ref_entry_free_seq(e
);
1529 kh_destroy(refs
, r
->h_meta
);
1538 pthread_mutex_destroy(&r
->lock
);
1543 static refs_t
*refs_create(void) {
1544 refs_t
*r
= calloc(1, sizeof(*r
));
1546 RP("refs_create()\n");
1551 if (!(r
->pool
= string_pool_create(8192)))
1554 r
->ref_id
= NULL
; // see refs2id() to populate.
1559 if (!(r
->h_meta
= kh_init(refs
)))
1562 pthread_mutex_init(&r
->lock
, NULL
);
1572 * Opens a reference fasta file as a BGZF stream, allowing for
1573 * compressed files. It automatically builds a .fai file if
1574 * required and if compressed a .gzi bgzf index too.
1576 * Returns a BGZF handle on success;
1579 static BGZF
*bgzf_open_ref(char *fn
, char *mode
, int is_md5
) {
1583 char fai_file
[PATH_MAX
];
1585 snprintf(fai_file
, PATH_MAX
, "%s.fai", fn
);
1586 if (access(fai_file
, R_OK
) != 0)
1587 if (fai_build(fn
) != 0)
1591 if (!(fp
= bgzf_open(fn
, mode
))) {
1596 if (fp
->is_compressed
== 1 && bgzf_index_load(fp
, fn
, ".gzi") < 0) {
1597 fprintf(stderr
, "Unable to load .gzi index '%s.gzi'\n", fn
);
1606 * Loads a FAI file for a reference.fasta.
1607 * "is_err" indicates whether failure to load is worthy of emitting an
1608 * error message. In some cases (eg with embedded references) we
1609 * speculatively load, just incase, and silently ignore errors.
1611 * Returns the refs_t struct on success (maybe newly allocated);
1614 static refs_t
*refs_load_fai(refs_t
*r_orig
, char *fn
, int is_err
) {
1617 char fai_fn
[PATH_MAX
];
1620 size_t fn_l
= strlen(fn
);
1621 int id
= 0, id_alloc
= 0;
1623 RP("refs_load_fai %s\n", fn
);
1626 if (!(r
= refs_create()))
1629 /* Open reference, for later use */
1630 if (stat(fn
, &sb
) != 0) {
1637 if (bgzf_close(r
->fp
) != 0)
1641 if (!(r
->fn
= string_dup(r
->pool
, fn
)))
1644 if (fn_l
> 4 && strcmp(&fn
[fn_l
-4], ".fai") == 0)
1647 if (!(r
->fp
= bgzf_open_ref(r
->fn
, "r", 0)))
1650 /* Parse .fai file and load meta-data */
1651 sprintf(fai_fn
, "%.*s.fai", PATH_MAX
-5, r
->fn
);
1653 if (stat(fai_fn
, &sb
) != 0) {
1658 if (!(fp
= fopen(fai_fn
, "r"))) {
1663 while (fgets(line
, 8192, fp
) != NULL
) {
1664 ref_entry
*e
= malloc(sizeof(*e
));
1673 for (cp
= line
; *cp
&& !isspace_c(*cp
); cp
++)
1676 e
->name
= string_dup(r
->pool
, line
);
1679 while (*cp
&& isspace_c(*cp
))
1681 e
->length
= strtoll(cp
, &cp
, 10);
1684 while (*cp
&& isspace_c(*cp
))
1686 e
->offset
= strtoll(cp
, &cp
, 10);
1689 while (*cp
&& isspace_c(*cp
))
1691 e
->bases_per_line
= strtol(cp
, &cp
, 10);
1694 while (*cp
&& isspace_c(*cp
))
1696 e
->line_length
= strtol(cp
, &cp
, 10);
1706 k
= kh_put(refs
, r
->h_meta
, e
->name
, &n
);
1713 kh_val(r
->h_meta
, k
) = e
;
1715 ref_entry
*re
= kh_val(r
->h_meta
, k
);
1716 if (re
&& (re
->count
!= 0 || re
->length
!= 0)) {
1723 kh_val(r
->h_meta
, k
) = e
;
1727 if (id
>= id_alloc
) {
1730 id_alloc
= id_alloc
?id_alloc
*2 : 16;
1731 r
->ref_id
= realloc(r
->ref_id
, id_alloc
* sizeof(*r
->ref_id
));
1733 for (x
= id
; x
< id_alloc
; x
++)
1734 r
->ref_id
[x
] = NULL
;
1754 * Verifies that the CRAM @SQ lines and .fai files match.
1756 static void sanitise_SQ_lines(cram_fd
*fd
) {
1762 if (!fd
->refs
|| !fd
->refs
->h_meta
)
1765 for (i
= 0; i
< fd
->header
->nref
; i
++) {
1766 char *name
= fd
->header
->ref
[i
].name
;
1767 khint_t k
= kh_get(refs
, fd
->refs
->h_meta
, name
);
1770 // We may have @SQ lines which have no known .fai, but do not
1771 // in themselves pose a problem because they are unused in the file.
1772 if (k
== kh_end(fd
->refs
->h_meta
))
1775 if (!(r
= (ref_entry
*)kh_val(fd
->refs
->h_meta
, k
)))
1778 if (r
->length
&& r
->length
!= fd
->header
->ref
[i
].len
) {
1779 assert(strcmp(r
->name
, fd
->header
->ref
[i
].name
) == 0);
1781 // Should we also check MD5sums here to ensure the correct
1782 // reference was given?
1783 fprintf(stderr
, "WARNING: Header @SQ length mismatch for "
1784 "ref %s, %d vs %d\n",
1785 r
->name
, fd
->header
->ref
[i
].len
, (int)r
->length
);
1787 // Fixing the parsed @SQ header will make MD:Z: strings work
1788 // and also stop it producing N for the sequence.
1789 fd
->header
->ref
[i
].len
= r
->length
;
1795 * Indexes references by the order they appear in a BAM file. This may not
1796 * necessarily be the same order they appear in the fasta reference file.
1798 * Returns 0 on success
1801 int refs2id(refs_t
*r
, SAM_hdr
*h
) {
1809 r
->ref_id
= calloc(h
->nref
, sizeof(*r
->ref_id
));
1814 for (i
= 0; i
< h
->nref
; i
++) {
1815 khint_t k
= kh_get(refs
, r
->h_meta
, h
->ref
[i
].name
);
1816 if (k
!= kh_end(r
->h_meta
)) {
1817 r
->ref_id
[i
] = kh_val(r
->h_meta
, k
);
1819 fprintf(stderr
, "Unable to find ref name '%s'\n",
1828 * Generates refs_t entries based on @SQ lines in the header.
1829 * Returns 0 on success
1832 static int refs_from_header(refs_t
*r
, cram_fd
*fd
, SAM_hdr
*h
) {
1838 if (!h
|| h
->nref
== 0)
1841 //fprintf(stderr, "refs_from_header for %p mode %c\n", fd, fd->mode);
1843 /* Existing refs are fine, as long as they're compatible with the hdr. */
1844 if (!(r
->ref_id
= realloc(r
->ref_id
, (r
->nref
+ h
->nref
) * sizeof(*r
->ref_id
))))
1847 /* Copy info from h->ref[i] over to r */
1848 for (i
= 0, j
= r
->nref
; i
< h
->nref
; i
++) {
1854 k
= kh_get(refs
, r
->h_meta
, h
->ref
[i
].name
);
1855 if (k
!= kh_end(r
->h_meta
))
1856 // Ref already known about
1859 if (!(r
->ref_id
[j
] = calloc(1, sizeof(ref_entry
))))
1862 if (!h
->ref
[i
].name
)
1865 r
->ref_id
[j
]->name
= string_dup(r
->pool
, h
->ref
[i
].name
);
1866 r
->ref_id
[j
]->length
= 0; // marker for not yet loaded
1868 /* Initialise likely filename if known */
1869 if ((ty
= sam_hdr_find(h
, "SQ", "SN", h
->ref
[i
].name
))) {
1870 if ((tag
= sam_hdr_find_key(h
, ty
, "M5", NULL
))) {
1871 r
->ref_id
[j
]->fn
= string_dup(r
->pool
, tag
->str
+3);
1872 //fprintf(stderr, "Tagging @SQ %s / %s\n", r->ref_id[h]->name, r->ref_id[h]->fn);
1876 k
= kh_put(refs
, r
->h_meta
, r
->ref_id
[j
]->name
, &n
);
1877 if (n
<= 0) // already exists or error
1879 kh_val(r
->h_meta
, k
) = r
->ref_id
[j
];
1889 * Attaches a header to a cram_fd.
1891 * This should be used when creating a new cram_fd for writing where
1892 * we have an SAM_hdr already constructed (eg from a file we've read
1895 int cram_set_header(cram_fd
*fd
, SAM_hdr
*hdr
) {
1897 sam_hdr_free(fd
->header
);
1899 return refs_from_header(fd
->refs
, fd
, hdr
);
1903 * Converts a directory and a filename into an expanded path, replacing %s
1904 * in directory with the filename and %[0-9]+s with portions of the filename
1905 * Any remaining parts of filename are added to the end with /%s.
1907 void expand_cache_path(char *path
, char *dir
, char *fn
) {
1910 while ((cp
= strchr(dir
, '%'))) {
1911 strncpy(path
, dir
, cp
-dir
);
1919 } else if (*cp
>= '0' && *cp
<= '9') {
1923 l
= strtol(cp
, &endp
, 10);
1924 l
= MIN(l
, strlen(fn
));
1926 strncpy(path
, fn
, l
);
1942 path
+= strlen(dir
);
1943 if (*fn
&& path
[-1] != '/')
1949 * Make the directory containing path and any prefix directories.
1951 void mkdir_prefix(char *path
, int mode
) {
1952 char *cp
= strrchr(path
, '/');
1957 if (is_directory(path
)) {
1962 if (mkdir(path
, mode
) == 0) {
1968 mkdir_prefix(path
, mode
);
1975 * Return the cache directory to use, based on the first of these
1976 * environment variables to be set to a non-empty value.
1978 static const char *get_cache_basedir(const char **extra
) {
1983 base
= getenv("XDG_CACHE_HOME");
1984 if (base
&& *base
) return base
;
1986 base
= getenv("HOME");
1987 if (base
&& *base
) { *extra
= "/.cache"; return base
; }
1989 base
= getenv("TMPDIR");
1990 if (base
&& *base
) return base
;
1992 base
= getenv("TEMP");
1993 if (base
&& *base
) return base
;
1999 * Return an integer representation of pthread_self().
2001 static unsigned get_int_threadid() {
2002 pthread_t pt
= pthread_self();
2003 unsigned char *s
= (unsigned char *) &pt
;
2006 for (i
= 0; i
< sizeof(pthread_t
); i
++)
2007 h
= (h
<< 5) - h
+ s
[i
];
2012 * Queries the M5 string from the header and attempts to populate the
2013 * reference from this using the REF_PATH environment.
2015 * Returns 0 on sucess
2018 static int cram_populate_ref(cram_fd
*fd
, int id
, ref_entry
*r
) {
2019 char *ref_path
= getenv("REF_PATH");
2022 char path
[PATH_MAX
], path_tmp
[PATH_MAX
];
2023 char cache
[PATH_MAX
], cache_root
[PATH_MAX
];
2024 char *local_cache
= getenv("REF_CACHE");
2029 fprintf(stderr
, "cram_populate_ref on fd %p, id %d\n", (void *)fd
, id
);
2031 cache_root
[0] = '\0';
2033 if (!ref_path
|| *ref_path
== '\0') {
2035 * If we have no ref path, we use the EBI server.
2036 * However to avoid spamming it we require a local ref cache too.
2038 ref_path
= "http://www.ebi.ac.uk:80/ena/cram/md5/%s";
2039 if (!local_cache
|| *local_cache
== '\0') {
2041 const char *base
= get_cache_basedir(&extra
);
2042 snprintf(cache_root
, PATH_MAX
, "%s%s/hts-ref", base
, extra
);
2043 snprintf(cache
,PATH_MAX
, "%s%s/hts-ref/%%2s/%%2s/%%s", base
, extra
);
2044 local_cache
= cache
;
2046 fprintf(stderr
, "Populating local cache: %s\n", local_cache
);
2053 if (!(ty
= sam_hdr_find(fd
->header
, "SQ", "SN", r
->name
)))
2056 if (!(tag
= sam_hdr_find_key(fd
->header
, ty
, "M5", NULL
)))
2060 fprintf(stderr
, "Querying ref %s\n", tag
->str
+3);
2062 /* Use cache if available */
2063 if (local_cache
&& *local_cache
) {
2064 expand_cache_path(path
, local_cache
, tag
->str
+3);
2070 /* Search local files in REF_PATH; we can open them and return as above */
2071 if (!local_path
&& (path2
= find_path(tag
->str
+3, ref_path
))) {
2072 strncpy(path
, path2
, PATH_MAX
);
2074 if (is_file(path
)) // incase it's too long
2079 /* Found via REF_CACHE or local REF_PATH file */
2084 if (0 == stat(path
, &sb
) && (fp
= bgzf_open(path
, "r"))) {
2085 r
->length
= sb
.st_size
;
2086 r
->offset
= r
->line_length
= r
->bases_per_line
= 0;
2088 r
->fn
= string_dup(fd
->refs
->pool
, path
);
2091 if (bgzf_close(fd
->refs
->fp
) != 0)
2094 fd
->refs
->fn
= r
->fn
;
2097 // Fall back to cram_get_ref() where it'll do the actual
2098 // reading of the file.
2104 /* Otherwise search full REF_PATH; slower as loads entire file */
2105 if ((mf
= open_path_mfile(tag
->str
+3, ref_path
, NULL
))) {
2107 r
->seq
= mfsteal(mf
, &sz
);
2111 // keep mf around as we couldn't detach
2122 /* Failed to find in search path or M5 cache, see if @SQ UR: tag? */
2123 if (!(tag
= sam_hdr_find_key(fd
->header
, ty
, "UR", NULL
)))
2126 fn
= (strncmp(tag
->str
+3, "file:", 5) == 0)
2131 if (bgzf_close(fd
->refs
->fp
) != 0)
2133 fd
->refs
->fp
= NULL
;
2135 if (!(refs
= refs_load_fai(fd
->refs
, fn
, 0)))
2137 sanitise_SQ_lines(fd
);
2141 if (bgzf_close(fd
->refs
->fp
) != 0)
2143 fd
->refs
->fp
= NULL
;
2149 if (-1 == refs2id(fd
->refs
, fd
->header
))
2151 if (!fd
->refs
->ref_id
|| !fd
->refs
->ref_id
[id
])
2154 // Local copy already, so fall back to cram_get_ref().
2158 /* Populate the local disk cache if required */
2159 if (local_cache
&& *local_cache
) {
2160 int pid
= (int) getpid();
2161 unsigned thrid
= get_int_threadid();
2164 if (*cache_root
&& !is_directory(cache_root
)) {
2165 hts_log_warning("Creating reference cache directory %s\n"
2166 "This may become large; see the samtools(1) manual page REF_CACHE discussion",
2170 expand_cache_path(path
, local_cache
, tag
->str
+3);
2172 fprintf(stderr
, "Writing cache file '%s'\n", path
);
2173 mkdir_prefix(path
, 01777);
2176 // Attempt to further uniquify the temporary filename
2177 unsigned t
= ((unsigned) time(NULL
)) ^ ((unsigned) clock());
2178 thrid
++; // Ensure filename changes even if time/clock haven't
2180 sprintf(path_tmp
, "%s.tmp_%d_%u_%u", path
, pid
, thrid
, t
);
2181 fp
= hopen(path_tmp
, "wx");
2182 } while (fp
== NULL
&& errno
== EEXIST
);
2186 // Not fatal - we have the data already so keep going.
2191 hts_md5_context
*md5
;
2192 char unsigned md5_buf1
[16];
2195 if (!(md5
= hts_md5_init())) {
2196 hclose_abruptly(fp
);
2200 hts_md5_update(md5
, r
->seq
, r
->length
);
2201 hts_md5_final(md5_buf1
, md5
);
2202 hts_md5_destroy(md5
);
2203 hts_md5_hex(md5_buf2
, md5_buf1
);
2205 if (strncmp(tag
->str
+3, md5_buf2
, 32) != 0) {
2206 fprintf(stderr
, "[E::%s] mismatching md5sum for downloaded reference.\n", __func__
);
2207 hclose_abruptly(fp
);
2212 if (hwrite(fp
, r
->seq
, r
->length
) != r
->length
) {
2215 if (hclose(fp
) < 0) {
2218 if (0 == chmod(path_tmp
, 0444))
2219 rename(path_tmp
, path
);
2228 static void cram_ref_incr_locked(refs_t
*r
, int id
) {
2229 RP("%d INC REF %d, %d %p\n", gettid(), id
, (int)(id
>=0?r
->ref_id
[id
]->count
+1:-999), id
>=0?r
->ref_id
[id
]->seq
:(char *)1);
2231 if (id
< 0 || !r
->ref_id
[id
] || !r
->ref_id
[id
]->seq
)
2234 if (r
->last_id
== id
)
2237 ++r
->ref_id
[id
]->count
;
2240 void cram_ref_incr(refs_t
*r
, int id
) {
2241 pthread_mutex_lock(&r
->lock
);
2242 cram_ref_incr_locked(r
, id
);
2243 pthread_mutex_unlock(&r
->lock
);
2246 static void cram_ref_decr_locked(refs_t
*r
, int id
) {
2247 RP("%d DEC REF %d, %d %p\n", gettid(), id
, (int)(id
>=0?r
->ref_id
[id
]->count
-1:-999), id
>=0?r
->ref_id
[id
]->seq
:(char *)1);
2249 if (id
< 0 || !r
->ref_id
[id
] || !r
->ref_id
[id
]->seq
) {
2250 assert(r
->ref_id
[id
]->count
>= 0);
2254 if (--r
->ref_id
[id
]->count
<= 0) {
2255 assert(r
->ref_id
[id
]->count
== 0);
2256 if (r
->last_id
>= 0) {
2257 if (r
->ref_id
[r
->last_id
]->count
<= 0 &&
2258 r
->ref_id
[r
->last_id
]->seq
) {
2259 RP("%d FREE REF %d (%p)\n", gettid(),
2260 r
->last_id
, r
->ref_id
[r
->last_id
]->seq
);
2261 ref_entry_free_seq(r
->ref_id
[r
->last_id
]);
2262 r
->ref_id
[r
->last_id
]->length
= 0;
2269 void cram_ref_decr(refs_t
*r
, int id
) {
2270 pthread_mutex_lock(&r
->lock
);
2271 cram_ref_decr_locked(r
, id
);
2272 pthread_mutex_unlock(&r
->lock
);
2276 * Used by cram_ref_load and cram_ref_get. The file handle will have
2277 * already been opened, so we can catch it. The ref_entry *e informs us
2278 * of whether this is a multi-line fasta file or a raw MD5 style file.
2279 * Either way we create a single contiguous sequence.
2281 * Returns all or part of a reference sequence on success (malloced);
2284 static char *load_ref_portion(BGZF
*fp
, ref_entry
*e
, int start
, int end
) {
2292 * Compute locations in file. This is trivial for the MD5 files, but
2293 * is still necessary for the fasta variants.
2295 offset
= e
->line_length
2296 ? e
->offset
+ (start
-1)/e
->bases_per_line
* e
->line_length
+
2297 (start
-1) % e
->bases_per_line
2300 len
= (e
->line_length
2301 ? e
->offset
+ (end
-1)/e
->bases_per_line
* e
->line_length
+
2302 (end
-1) % e
->bases_per_line
2303 : end
-1) - offset
+ 1;
2305 if (bgzf_useek(fp
, offset
, SEEK_SET
) < 0) {
2306 perror("bgzf_useek() on reference file");
2310 if (len
== 0 || !(seq
= malloc(len
))) {
2314 if (len
!= bgzf_read(fp
, seq
, len
)) {
2315 perror("bgzf_read() on reference file");
2320 /* Strip white-space if required. */
2321 if (len
!= end
-start
+1) {
2326 for (i
= j
= 0; i
< len
; i
++) {
2327 if (cp
[i
] >= '!' && cp
[i
] <= '~')
2328 cp
[j
++] = toupper_c(cp
[i
]);
2332 if (cp_to
- seq
!= end
-start
+1) {
2333 fprintf(stderr
, "Malformed reference file?\n");
2339 for (i
= 0; i
< len
; i
++) {
2340 seq
[i
] = toupper_c(seq
[i
]);
2348 * Load the entire reference 'id'.
2349 * This also increments the reference count by 1.
2351 * Returns ref_entry on success;
2354 ref_entry
*cram_ref_load(refs_t
*r
, int id
, int is_md5
) {
2355 ref_entry
*e
= r
->ref_id
[id
];
2356 int start
= 1, end
= e
->length
;
2363 assert(e
->count
== 0);
2368 for (idx
= 0; idx
< r
->nref
; idx
++)
2369 if (r
->last
== r
->ref_id
[idx
])
2371 RP("%d cram_ref_load DECR %d\n", gettid(), idx
);
2373 assert(r
->last
->count
> 0);
2374 if (--r
->last
->count
<= 0) {
2375 RP("%d FREE REF %d (%p)\n", gettid(), id
, r
->ref_id
[id
]->seq
);
2377 ref_entry_free_seq(r
->last
);
2381 /* Open file if it's not already the current open reference */
2382 if (strcmp(r
->fn
, e
->fn
) || r
->fp
== NULL
) {
2384 if (bgzf_close(r
->fp
) != 0)
2387 if (!(r
->fp
= bgzf_open_ref(r
->fn
, "r", is_md5
)))
2391 RP("%d Loading ref %d (%d..%d)\n", gettid(), id
, start
, end
);
2393 if (!(seq
= load_ref_portion(r
->fp
, e
, start
, end
))) {
2397 RP("%d Loaded ref %d (%d..%d) = %p\n", gettid(), id
, start
, end
, seq
);
2399 RP("%d INC REF %d, %d\n", gettid(), id
, (int)(e
->count
+1));
2405 * Also keep track of last used ref so incr/decr loops on the same
2406 * sequence don't cause load/free loops.
2408 RP("%d cram_ref_load INCR %d => %d\n", gettid(), id
, e
->count
+1);
2416 * Returns a portion of a reference sequence from start to end inclusive.
2417 * The returned pointer is owned by either the cram_file fd or by the
2418 * internal refs_t structure and should not be freed by the caller.
2420 * The difference is whether or not this refs_t is in use by just the one
2421 * cram_fd or by multiples, or whether we have multiple threads accessing
2422 * references. In either case fd->shared will be true and we start using
2423 * reference counting to track the number of users of a specific reference
2426 * Otherwise the ref seq returned is allocated as part of cram_fd itself
2427 * and will be freed up on the next call to cram_get_ref or cram_close.
2429 * To return the entire reference sequence, specify start as 1 and end
2432 * To cease using a reference, call cram_ref_decr().
2434 * Returns reference on success,
2437 char *cram_get_ref(cram_fd
*fd
, int id
, int start
, int end
) {
2445 /* FIXME: axiomatic query of r->seq being true?
2446 * Or shortcut for unsorted data where we load once and never free?
2449 //fd->shared_ref = 1; // hard code for now to simplify things
2451 pthread_mutex_lock(&fd
->ref_lock
);
2453 RP("%d cram_get_ref on fd %p, id %d, range %d..%d\n", gettid(), fd
, id
, start
, end
);
2456 * Unsorted data implies we want to fetch an entire reference at a time.
2457 * We just deal with this at the moment by claiming we're sharing
2458 * references instead, which has the same requirement.
2464 /* Sanity checking: does this ID exist? */
2465 if (id
>= fd
->refs
->nref
) {
2466 fprintf(stderr
, "No reference found for id %d\n", id
);
2467 pthread_mutex_unlock(&fd
->ref_lock
);
2471 if (!fd
->refs
|| !fd
->refs
->ref_id
[id
]) {
2472 fprintf(stderr
, "No reference found for id %d\n", id
);
2473 pthread_mutex_unlock(&fd
->ref_lock
);
2477 if (!(r
= fd
->refs
->ref_id
[id
])) {
2478 fprintf(stderr
, "No reference found for id %d\n", id
);
2479 pthread_mutex_unlock(&fd
->ref_lock
);
2485 * It has an entry, but may not have been populated yet.
2486 * Any manually loaded .fai files have their lengths known.
2487 * A ref entry computed from @SQ lines (M5 or UR field) will have
2488 * r->length == 0 unless it's been loaded once and verified that we have
2489 * an on-disk filename for it.
2491 * 19 Sep 2013: Moved the lock here as the cram_populate_ref code calls
2492 * open_path_mfile and libcurl, which isn't multi-thread safe unless I
2493 * rewrite my code to have one curl handle per thread.
2495 pthread_mutex_lock(&fd
->refs
->lock
);
2496 if (r
->length
== 0) {
2497 if (cram_populate_ref(fd
, id
, r
) == -1) {
2498 fprintf(stderr
, "Failed to populate reference for id %d\n", id
);
2499 pthread_mutex_unlock(&fd
->refs
->lock
);
2500 pthread_mutex_unlock(&fd
->ref_lock
);
2503 r
= fd
->refs
->ref_id
[id
];
2505 cram_ref_incr_locked(fd
->refs
, id
);
2510 * We now know that we the filename containing the reference, so check
2511 * for limits. If it's over half the reference we'll load all of it in
2512 * memory as this will speed up subsequent calls.
2516 if (end
>= r
->length
)
2521 if (end
- start
>= 0.5*r
->length
|| fd
->shared_ref
) {
2527 * Maybe we have it cached already? If so use it.
2529 * Alternatively if we don't have the sequence but we're sharing
2530 * references and/or are asking for the entire length of it, then
2531 * load the full reference into the refs structure and return
2532 * a pointer to that one instead.
2534 if (fd
->shared_ref
|| r
->seq
|| (start
== 1 && end
== r
->length
)) {
2539 cram_ref_incr_locked(fd
->refs
, id
);
2542 if (!(e
= cram_ref_load(fd
->refs
, id
, r
->is_md5
))) {
2543 pthread_mutex_unlock(&fd
->refs
->lock
);
2544 pthread_mutex_unlock(&fd
->ref_lock
);
2548 /* unsorted data implies cache ref indefinitely, to avoid
2549 * continually loading and unloading.
2552 cram_ref_incr_locked(fd
->refs
, id
);
2555 fd
->ref
= NULL
; /* We never access it directly */
2557 fd
->ref_end
= r
->length
;
2560 cp
= fd
->refs
->ref_id
[id
]->seq
+ ostart
-1;
2566 RP("%d cram_get_ref returning for id %d, count %d\n", gettid(), id
, (int)r
->count
);
2568 pthread_mutex_unlock(&fd
->refs
->lock
);
2569 pthread_mutex_unlock(&fd
->ref_lock
);
2574 * Otherwise we're not sharing, we don't have a copy of it already and
2575 * we're only asking for a small portion of it.
2577 * In this case load up just that segment ourselves, freeing any old
2578 * small segments in the process.
2581 /* Unmapped ref ID */
2585 fd
->ref_free
= NULL
;
2589 pthread_mutex_unlock(&fd
->refs
->lock
);
2590 pthread_mutex_unlock(&fd
->ref_lock
);
2594 /* Open file if it's not already the current open reference */
2595 if (strcmp(fd
->refs
->fn
, r
->fn
) || fd
->refs
->fp
== NULL
) {
2597 if (bgzf_close(fd
->refs
->fp
) != 0)
2599 fd
->refs
->fn
= r
->fn
;
2600 if (!(fd
->refs
->fp
= bgzf_open_ref(fd
->refs
->fn
, "r", r
->is_md5
))) {
2601 pthread_mutex_unlock(&fd
->refs
->lock
);
2602 pthread_mutex_unlock(&fd
->ref_lock
);
2607 if (!(fd
->ref
= load_ref_portion(fd
->refs
->fp
, r
, start
, end
))) {
2608 pthread_mutex_unlock(&fd
->refs
->lock
);
2609 pthread_mutex_unlock(&fd
->ref_lock
);
2617 fd
->ref_start
= start
;
2619 fd
->ref_free
= fd
->ref
;
2622 pthread_mutex_unlock(&fd
->refs
->lock
);
2623 pthread_mutex_unlock(&fd
->ref_lock
);
2625 return seq
+ ostart
- start
;
2629 * If fd has been opened for reading, it may be permitted to specify 'fn'
2630 * as NULL and let the code auto-detect the reference by parsing the
2631 * SAM header @SQ lines.
2633 int cram_load_reference(cram_fd
*fd
, char *fn
) {
2637 fd
->refs
= refs_load_fai(fd
->refs
, fn
,
2638 !(fd
->embed_ref
&& fd
->mode
== 'r'));
2639 fn
= fd
->refs
? fd
->refs
->fn
: NULL
;
2642 sanitise_SQ_lines(fd
);
2646 if ((!fd
->refs
|| (fd
->refs
->nref
== 0 && !fn
)) && fd
->header
) {
2648 refs_free(fd
->refs
);
2649 if (!(fd
->refs
= refs_create()))
2651 if (-1 == refs_from_header(fd
->refs
, fd
, fd
->header
))
2656 if (-1 == refs2id(fd
->refs
, fd
->header
))
2662 /* ----------------------------------------------------------------------
2667 * Creates a new container, specifying the maximum number of slices
2668 * and records permitted.
2670 * Returns cram_container ptr on success
2673 cram_container
*cram_new_container(int nrec
, int nslice
) {
2674 cram_container
*c
= calloc(1, sizeof(*c
));
2682 c
->max_c_rec
= nrec
* nslice
;
2686 c
->record_counter
= 0;
2690 c
->max_slice
= nslice
;
2699 if (!(c
->slices
= (cram_slice
**)calloc(nslice
, sizeof(cram_slice
*))))
2703 if (!(c
->comp_hdr
= cram_new_compression_header()))
2705 c
->comp_hdr_block
= NULL
;
2707 for (id
= DS_RN
; id
< DS_TN
; id
++)
2708 if (!(c
->stats
[id
] = cram_stats_create())) goto err
;
2710 //c->aux_B_stats = cram_stats_create();
2712 if (!(c
->tags_used
= kh_init(m_tagmap
)))
2727 void cram_free_container(cram_container
*c
) {
2741 cram_free_compression_header(c
->comp_hdr
);
2743 if (c
->comp_hdr_block
)
2744 cram_free_block(c
->comp_hdr_block
);
2747 for (i
= 0; i
< c
->max_slice
; i
++)
2749 cram_free_slice(c
->slices
[i
]);
2753 for (id
= DS_RN
; id
< DS_TN
; id
++)
2754 if (c
->stats
[id
]) cram_stats_free(c
->stats
[id
]);
2756 //if (c->aux_B_stats) cram_stats_free(c->aux_B_stats);
2761 for (k
= kh_begin(c
->tags_used
); k
!= kh_end(c
->tags_used
); k
++) {
2762 if (!kh_exist(c
->tags_used
, k
))
2765 cram_tag_map
*tm
= (cram_tag_map
*)kh_val(c
->tags_used
, k
);
2766 cram_codec
*c
= tm
->codec
;
2772 kh_destroy(m_tagmap
, c
->tags_used
);
2779 * Reads a container header.
2781 * Returns cram_container on success
2782 * NULL on failure or no container left (fd->err == 0).
2784 cram_container
*cram_read_container(cram_fd
*fd
) {
2785 cram_container c2
, *c
;
2793 memset(&c2
, 0, sizeof(c2
));
2794 if (CRAM_MAJOR_VERS(fd
->version
) == 1) {
2795 if ((s
= itf8_decode_crc(fd
, &c2
.length
, &crc
)) == -1) {
2796 fd
->eof
= fd
->empty_container
? 1 : 2;
2803 if ((s
= int32_decode(fd
, &c2
.length
)) == -1) {
2804 if (CRAM_MAJOR_VERS(fd
->version
) == 2 &&
2805 CRAM_MINOR_VERS(fd
->version
) == 0)
2806 fd
->eof
= 1; // EOF blocks arrived in v2.1
2808 fd
->eof
= fd
->empty_container
? 1 : 2;
2813 len
= le_int4(c2
.length
);
2814 crc
= crc32(0L, (unsigned char *)&len
, 4);
2816 if ((s
= itf8_decode_crc(fd
, &c2
.ref_seq_id
, &crc
)) == -1) return NULL
; else rd
+=s
;
2817 if ((s
= itf8_decode_crc(fd
, &c2
.ref_seq_start
, &crc
))== -1) return NULL
; else rd
+=s
;
2818 if ((s
= itf8_decode_crc(fd
, &c2
.ref_seq_span
, &crc
)) == -1) return NULL
; else rd
+=s
;
2819 if ((s
= itf8_decode_crc(fd
, &c2
.num_records
, &crc
)) == -1) return NULL
; else rd
+=s
;
2821 if (CRAM_MAJOR_VERS(fd
->version
) == 1) {
2822 c2
.record_counter
= 0;
2825 if (CRAM_MAJOR_VERS(fd
->version
) >= 3) {
2826 if ((s
= ltf8_decode_crc(fd
, &c2
.record_counter
, &crc
)) == -1)
2832 if ((s
= itf8_decode_crc(fd
, &i32
, &crc
)) == -1)
2836 c2
.record_counter
= i32
;
2839 if ((s
= ltf8_decode_crc(fd
, &c2
.num_bases
, &crc
))== -1)
2844 if ((s
= itf8_decode_crc(fd
, &c2
.num_blocks
, &crc
)) == -1) return NULL
; else rd
+=s
;
2845 if ((s
= itf8_decode_crc(fd
, &c2
.num_landmarks
, &crc
))== -1) return NULL
; else rd
+=s
;
2847 if (c2
.num_landmarks
< 0 || c2
.num_landmarks
>= SIZE_MAX
/ sizeof(int32_t))
2850 if (!(c
= calloc(1, sizeof(*c
))))
2855 if (!(c
->landmark
= malloc(c
->num_landmarks
* sizeof(int32_t))) &&
2858 cram_free_container(c
);
2861 for (i
= 0; i
< c
->num_landmarks
; i
++) {
2862 if ((s
= itf8_decode_crc(fd
, &c
->landmark
[i
], &crc
)) == -1) {
2863 cram_free_container(c
);
2870 if (CRAM_MAJOR_VERS(fd
->version
) >= 3) {
2871 if (-1 == int32_decode(fd
, (int32_t *)&c
->crc32
))
2876 if (crc
!= c
->crc32
) {
2877 fprintf(stderr
, "Container header CRC32 failure\n");
2878 cram_free_container(c
);
2886 c
->max_slice
= c
->num_landmarks
;
2891 if (c
->ref_seq_id
== -2) {
2896 fd
->empty_container
=
2897 (c
->num_records
== 0 &&
2898 c
->ref_seq_id
== -1 &&
2899 c
->ref_seq_start
== 0x454f46 /* EOF */) ? 1 : 0;
2905 /* MAXIMUM storage size needed for the container. */
2906 int cram_container_size(cram_container
*c
) {
2907 return 55 + 5*c
->num_landmarks
;
2912 * Stores the container structure in dat and returns *size as the
2913 * number of bytes written to dat[]. The input size of dat is also
2914 * held in *size and should be initialised to cram_container_size(c).
2916 * Returns 0 on success;
2919 int cram_store_container(cram_fd
*fd
, cram_container
*c
, char *dat
, int *size
)
2921 unsigned char *cp
= (unsigned char *)dat
;
2924 // Check the input buffer is large enough according to our stated
2925 // requirements. (NOTE: it may actually take less.)
2926 if (cram_container_size(c
) > *size
)
2929 if (CRAM_MAJOR_VERS(fd
->version
) == 1) {
2930 cp
+= itf8_put((char*)cp
, c
->length
);
2932 *(int32_t *)cp
= le_int4(c
->length
);
2936 cp
+= itf8_put((char*)cp
, -2);
2937 cp
+= itf8_put((char*)cp
, 0);
2938 cp
+= itf8_put((char*)cp
, 0);
2940 cp
+= itf8_put((char*)cp
, c
->ref_seq_id
);
2941 cp
+= itf8_put((char*)cp
, c
->ref_seq_start
);
2942 cp
+= itf8_put((char*)cp
, c
->ref_seq_span
);
2944 cp
+= itf8_put((char*)cp
, c
->num_records
);
2945 if (CRAM_MAJOR_VERS(fd
->version
) == 2) {
2946 cp
+= itf8_put((char*)cp
, c
->record_counter
);
2947 cp
+= ltf8_put((char*)cp
, c
->num_bases
);
2948 } else if (CRAM_MAJOR_VERS(fd
->version
) >= 3) {
2949 cp
+= ltf8_put((char*)cp
, c
->record_counter
);
2950 cp
+= ltf8_put((char*)cp
, c
->num_bases
);
2953 cp
+= itf8_put((char*)cp
, c
->num_blocks
);
2954 cp
+= itf8_put((char*)cp
, c
->num_landmarks
);
2955 for (i
= 0; i
< c
->num_landmarks
; i
++)
2956 cp
+= itf8_put((char*)cp
, c
->landmark
[i
]);
2958 if (CRAM_MAJOR_VERS(fd
->version
) >= 3) {
2959 c
->crc32
= crc32(0L, (uc
*)dat
, (char*)cp
-dat
);
2960 cp
[0] = c
->crc32
& 0xff;
2961 cp
[1] = (c
->crc32
>> 8) & 0xff;
2962 cp
[2] = (c
->crc32
>> 16) & 0xff;
2963 cp
[3] = (c
->crc32
>> 24) & 0xff;
2967 *size
= (char *)cp
-dat
; // actual used size
2974 * Writes a container structure.
2976 * Returns 0 on success
2979 int cram_write_container(cram_fd
*fd
, cram_container
*c
) {
2980 char buf_a
[1024], *buf
= buf_a
;
2984 if (55 + c
->num_landmarks
* 5 >= 1024)
2985 buf
= malloc(55 + c
->num_landmarks
* 5);
2986 cp
= (unsigned char *)buf
;
2988 if (CRAM_MAJOR_VERS(fd
->version
) == 1) {
2989 cp
+= itf8_put((char*)cp
, c
->length
);
2991 *(int32_t *)cp
= le_int4(c
->length
);
2995 cp
+= itf8_put((char*)cp
, -2);
2996 cp
+= itf8_put((char*)cp
, 0);
2997 cp
+= itf8_put((char*)cp
, 0);
2999 cp
+= itf8_put((char*)cp
, c
->ref_seq_id
);
3000 cp
+= itf8_put((char*)cp
, c
->ref_seq_start
);
3001 cp
+= itf8_put((char*)cp
, c
->ref_seq_span
);
3003 cp
+= itf8_put((char*)cp
, c
->num_records
);
3004 if (CRAM_MAJOR_VERS(fd
->version
) == 2) {
3005 cp
+= itf8_put((char*)cp
, c
->record_counter
);
3006 cp
+= ltf8_put((char*)cp
, c
->num_bases
);
3007 } else if (CRAM_MAJOR_VERS(fd
->version
) >= 3) {
3008 cp
+= ltf8_put((char*)cp
, c
->record_counter
);
3009 cp
+= ltf8_put((char*)cp
, c
->num_bases
);
3012 cp
+= itf8_put((char*)cp
, c
->num_blocks
);
3013 cp
+= itf8_put((char*)cp
, c
->num_landmarks
);
3014 for (i
= 0; i
< c
->num_landmarks
; i
++)
3015 cp
+= itf8_put((char*)cp
, c
->landmark
[i
]);
3017 if (CRAM_MAJOR_VERS(fd
->version
) >= 3) {
3018 c
->crc32
= crc32(0L, (uc
*)buf
, (char*)cp
-buf
);
3019 cp
[0] = c
->crc32
& 0xff;
3020 cp
[1] = (c
->crc32
>> 8) & 0xff;
3021 cp
[2] = (c
->crc32
>> 16) & 0xff;
3022 cp
[3] = (c
->crc32
>> 24) & 0xff;
3026 if ((char*)cp
-buf
!= hwrite(fd
->fp
, buf
, (char*)cp
-buf
)) {
3038 // common component shared by cram_flush_container{,_mt}
3039 static int cram_flush_container2(cram_fd
*fd
, cram_container
*c
) {
3042 if (c
->curr_slice
> 0 && !c
->slices
)
3045 //fprintf(stderr, "Writing container %d, sum %u\n", c->record_counter, sum);
3047 /* Write the container struct itself */
3048 if (0 != cram_write_container(fd
, c
))
3051 /* And the compression header */
3052 if (0 != cram_write_block(fd
, c
->comp_hdr_block
))
3055 /* Followed by the slice blocks */
3056 for (i
= 0; i
< c
->curr_slice
; i
++) {
3057 cram_slice
*s
= c
->slices
[i
];
3059 if (0 != cram_write_block(fd
, s
->hdr_block
))
3062 for (j
= 0; j
< s
->hdr
->num_blocks
; j
++) {
3063 if (0 != cram_write_block(fd
, s
->block
[j
]))
3068 return hflush(fd
->fp
) == 0 ? 0 : -1;
3072 * Flushes a completely or partially full container to disk, writing
3073 * container structure, header and blocks. This also calls the encoder
3076 * Returns 0 on success
3079 int cram_flush_container(cram_fd
*fd
, cram_container
*c
) {
3080 /* Encode the container blocks and generate compression header */
3081 if (0 != cram_encode_container(fd
, c
))
3084 return cram_flush_container2(fd
, c
);
3092 void *cram_flush_thread(void *arg
) {
3093 cram_job
*j
= (cram_job
*)arg
;
3095 /* Encode the container blocks and generate compression header */
3096 if (0 != cram_encode_container(j
->fd
, j
->c
)) {
3097 fprintf(stderr
, "cram_encode_container failed\n");
3104 static int cram_flush_result(cram_fd
*fd
) {
3106 hts_tpool_result
*r
;
3108 while ((r
= hts_tpool_next_result(fd
->rqueue
))) {
3109 cram_job
*j
= (cram_job
*)hts_tpool_result_data(r
);
3113 hts_tpool_delete_result(r
, 0);
3120 if (fd
->mode
== 'w')
3121 if (0 != cram_flush_container2(fd
, c
))
3124 /* Free the container */
3125 for (i
= 0; i
< c
->max_slice
; i
++) {
3126 if (c
->slices
&& c
->slices
[i
]) {
3127 cram_free_slice(c
->slices
[i
]);
3128 c
->slices
[i
] = NULL
;
3135 cram_free_container(c
);
3137 ret
|= hflush(fd
->fp
) == 0 ? 0 : -1;
3139 hts_tpool_delete_result(r
, 1);
3145 int cram_flush_container_mt(cram_fd
*fd
, cram_container
*c
) {
3149 return cram_flush_container(fd
, c
);
3151 if (!(j
= malloc(sizeof(*j
))))
3156 // Flush the job. Note our encoder queue may be full, so we
3157 // either have to keep trying in non-blocking mode (what we do) or
3158 // use a dedicated separate thread for draining the queue.
3161 hts_tpool_dispatch2(fd
->pool
, fd
->rqueue
, cram_flush_thread
, j
, 1);
3162 int pending
= (errno
== EAGAIN
);
3163 if (cram_flush_result(fd
) != 0)
3174 /* ----------------------------------------------------------------------
3175 * Compression headers; the first part of the container
3179 * Creates a new blank container compression header
3181 * Returns header ptr on success
3184 cram_block_compression_hdr
*cram_new_compression_header(void) {
3185 cram_block_compression_hdr
*hdr
= calloc(1, sizeof(*hdr
));
3189 if (!(hdr
->TD_blk
= cram_new_block(CORE
, 0))) {
3194 if (!(hdr
->TD_hash
= kh_init(m_s2i
))) {
3195 cram_free_block(hdr
->TD_blk
);
3200 if (!(hdr
->TD_keys
= string_pool_create(8192))) {
3201 kh_destroy(m_s2i
, hdr
->TD_hash
);
3202 cram_free_block(hdr
->TD_blk
);
3210 void cram_free_compression_header(cram_block_compression_hdr
*hdr
) {
3214 free(hdr
->landmark
);
3216 if (hdr
->preservation_map
)
3217 kh_destroy(map
, hdr
->preservation_map
);
3219 for (i
= 0; i
< CRAM_MAP_HASH
; i
++) {
3221 for (m
= hdr
->rec_encoding_map
[i
]; m
; m
= m2
) {
3224 m
->codec
->free(m
->codec
);
3229 for (i
= 0; i
< CRAM_MAP_HASH
; i
++) {
3231 for (m
= hdr
->tag_encoding_map
[i
]; m
; m
= m2
) {
3234 m
->codec
->free(m
->codec
);
3239 for (i
= 0; i
< DS_END
; i
++) {
3241 hdr
->codecs
[i
]->free(hdr
->codecs
[i
]);
3247 cram_free_block(hdr
->TD_blk
);
3249 kh_destroy(m_s2i
, hdr
->TD_hash
);
3251 string_pool_destroy(hdr
->TD_keys
);
3257 /* ----------------------------------------------------------------------
3258 * Slices and slice headers
3261 void cram_free_slice_header(cram_block_slice_hdr
*hdr
) {
3265 if (hdr
->block_content_ids
)
3266 free(hdr
->block_content_ids
);
3273 void cram_free_slice(cram_slice
*s
) {
3278 cram_free_block(s
->hdr_block
);
3284 for (i
= 0; i
< s
->hdr
->num_blocks
; i
++) {
3285 cram_free_block(s
->block
[i
]);
3292 free(s
->block_by_id
);
3295 cram_free_slice_header(s
->hdr
);
3298 cram_free_block(s
->seqs_blk
);
3301 cram_free_block(s
->qual_blk
);
3304 cram_free_block(s
->name_blk
);
3307 cram_free_block(s
->aux_blk
);
3310 cram_free_block(s
->base_blk
);
3313 cram_free_block(s
->soft_blk
);
3328 string_pool_destroy(s
->pair_keys
);
3331 kh_destroy(m_s2i
, s
->pair
[0]);
3333 kh_destroy(m_s2i
, s
->pair
[1]);
3342 * Creates a new empty slice in memory, for subsequent writing to
3345 * Returns cram_slice ptr on success
3348 cram_slice
*cram_new_slice(enum cram_content_type type
, int nrecs
) {
3349 cram_slice
*s
= calloc(1, sizeof(*s
));
3353 if (!(s
->hdr
= (cram_block_slice_hdr
*)calloc(1, sizeof(*s
->hdr
))))
3355 s
->hdr
->content_type
= type
;
3357 s
->hdr_block
= NULL
;
3359 s
->block_by_id
= NULL
;
3361 if (!(s
->crecs
= malloc(nrecs
* sizeof(cram_record
)))) goto err
;
3366 if (!(s
->seqs_blk
= cram_new_block(EXTERNAL
, 0))) goto err
;
3367 if (!(s
->qual_blk
= cram_new_block(EXTERNAL
, DS_QS
))) goto err
;
3368 if (!(s
->name_blk
= cram_new_block(EXTERNAL
, DS_RN
))) goto err
;
3369 if (!(s
->aux_blk
= cram_new_block(EXTERNAL
, DS_aux
))) goto err
;
3370 if (!(s
->base_blk
= cram_new_block(EXTERNAL
, DS_IN
))) goto err
;
3371 if (!(s
->soft_blk
= cram_new_block(EXTERNAL
, DS_SC
))) goto err
;
3374 s
->nfeatures
= s
->afeatures
= 0;
3378 s
->nTN
= s
->aTN
= 0;
3381 // Volatile keys as we do realloc in dstring
3382 if (!(s
->pair_keys
= string_pool_create(8192))) goto err
;
3383 if (!(s
->pair
[0] = kh_init(m_s2i
))) goto err
;
3384 if (!(s
->pair
[1] = kh_init(m_s2i
))) goto err
;
3400 * Loads an entire slice.
3401 * FIXME: In 1.0 the native unit of slices within CRAM is broken
3402 * as slices contain references to objects in other slices.
3403 * To work around this while keeping the slice oriented outer loop
3404 * we read all slices and stitch them together into a fake large
3407 * Returns cram_slice ptr on success
3410 cram_slice
*cram_read_slice(cram_fd
*fd
) {
3411 cram_block
*b
= cram_read_block(fd
);
3412 cram_slice
*s
= calloc(1, sizeof(*s
));
3413 int i
, n
, max_id
, min_id
;
3419 switch (b
->content_type
) {
3421 case UNMAPPED_SLICE
:
3422 if (!(s
->hdr
= cram_decode_slice_header(fd
, b
)))
3427 fprintf(stderr
, "Unexpected block of type %s\n",
3428 cram_content_type2str(b
->content_type
));
3432 if (s
->hdr
->num_blocks
< 1) {
3433 fprintf(stderr
, "Slice does not include any data blocks.\n");
3437 s
->block
= calloc(n
= s
->hdr
->num_blocks
, sizeof(*s
->block
));
3441 for (max_id
= i
= 0, min_id
= INT_MAX
; i
< n
; i
++) {
3442 if (!(s
->block
[i
] = cram_read_block(fd
)))
3445 if (s
->block
[i
]->content_type
== EXTERNAL
) {
3446 if (max_id
< s
->block
[i
]->content_id
)
3447 max_id
= s
->block
[i
]->content_id
;
3448 if (min_id
> s
->block
[i
]->content_id
)
3449 min_id
= s
->block
[i
]->content_id
;
3452 if (min_id
>= 0 && max_id
< 1024) {
3453 if (!(s
->block_by_id
= calloc(1024, sizeof(s
->block
[0]))))
3456 for (i
= 0; i
< n
; i
++) {
3457 if (s
->block
[i
]->content_type
!= EXTERNAL
)
3459 s
->block_by_id
[s
->block
[i
]->content_id
] = s
->block
[i
];
3463 /* Initialise encoding/decoding tables */
3468 if (!(s
->seqs_blk
= cram_new_block(EXTERNAL
, 0))) goto err
;
3469 if (!(s
->qual_blk
= cram_new_block(EXTERNAL
, DS_QS
))) goto err
;
3470 if (!(s
->name_blk
= cram_new_block(EXTERNAL
, DS_RN
))) goto err
;
3471 if (!(s
->aux_blk
= cram_new_block(EXTERNAL
, DS_aux
))) goto err
;
3472 if (!(s
->base_blk
= cram_new_block(EXTERNAL
, DS_IN
))) goto err
;
3473 if (!(s
->soft_blk
= cram_new_block(EXTERNAL
, DS_SC
))) goto err
;
3477 s
->last_apos
= s
->hdr
->ref_seq_start
;
3485 s
->hdr_block
= NULL
;
3492 /* ----------------------------------------------------------------------
3493 * CRAM file definition (header)
3497 * Reads a CRAM file definition structure.
3498 * Returns file_def ptr on success
3501 cram_file_def
*cram_read_file_def(cram_fd
*fd
) {
3502 cram_file_def
*def
= malloc(sizeof(*def
));
3506 if (26 != hread(fd
->fp
, &def
->magic
[0], 26)) {
3511 if (memcmp(def
->magic
, "CRAM", 4) != 0) {
3516 if (def
->major_version
> 3) {
3517 fprintf(stderr
, "CRAM version number mismatch\n"
3518 "Expected 1.x, 2.x or 3.x, got %d.%d\n",
3519 def
->major_version
, def
->minor_version
);
3524 fd
->first_container
+= 26;
3531 * Writes a cram_file_def structure to cram_fd.
3532 * Returns 0 on success
3535 int cram_write_file_def(cram_fd
*fd
, cram_file_def
*def
) {
3536 return (hwrite(fd
->fp
, &def
->magic
[0], 26) == 26) ? 0 : -1;
3539 void cram_free_file_def(cram_file_def
*def
) {
3543 /* ----------------------------------------------------------------------
3549 * Reads the SAM header from the first CRAM data block.
3550 * Also performs minimal parsing to extract read-group
3551 * and sample information.
3553 * Returns SAM hdr ptr on success
3556 SAM_hdr
*cram_read_SAM_hdr(cram_fd
*fd
) {
3561 /* 1.1 onwards stores the header in the first block of a container */
3562 if (CRAM_MAJOR_VERS(fd
->version
) == 1) {
3564 if (-1 == int32_decode(fd
, &header_len
))
3567 /* Alloc and read */
3568 if (header_len
< 0 || NULL
== (header
= malloc((size_t) header_len
+1)))
3571 if (header_len
!= hread(fd
->fp
, header
, header_len
))
3573 header
[header_len
] = '\0';
3575 fd
->first_container
+= 4 + header_len
;
3577 cram_container
*c
= cram_read_container(fd
);
3584 fd
->first_container
+= c
->length
+ c
->offset
;
3586 if (c
->num_blocks
< 1) {
3587 cram_free_container(c
);
3591 if (!(b
= cram_read_block(fd
))) {
3592 cram_free_container(c
);
3595 if (cram_uncompress_block(b
) != 0) {
3596 cram_free_container(c
);
3601 len
= b
->comp_size
+ 2 + 4*(CRAM_MAJOR_VERS(fd
->version
) >= 3) +
3602 itf8_size(b
->content_id
) +
3603 itf8_size(b
->uncomp_size
) +
3604 itf8_size(b
->comp_size
);
3606 /* Extract header from 1st block */
3607 if (-1 == int32_get_blk(b
, &header_len
) ||
3608 header_len
< 0 || /* Spec. says signed... why? */
3609 b
->uncomp_size
- 4 < header_len
) {
3610 cram_free_container(c
);
3614 if (NULL
== (header
= malloc((size_t) header_len
+1))) {
3615 cram_free_container(c
);
3619 memcpy(header
, BLOCK_END(b
), header_len
);
3620 header
[header_len
] = '\0';
3623 /* Consume any remaining blocks */
3624 for (i
= 1; i
< c
->num_blocks
; i
++) {
3625 if (!(b
= cram_read_block(fd
))) {
3626 cram_free_container(c
);
3629 len
+= b
->comp_size
+ 2 + 4*(CRAM_MAJOR_VERS(fd
->version
) >= 3) +
3630 itf8_size(b
->content_id
) +
3631 itf8_size(b
->uncomp_size
) +
3632 itf8_size(b
->comp_size
);
3636 if (c
->length
> 0 && len
> 0 && c
->length
> len
) {
3638 char *pads
= malloc(c
->length
- len
);
3640 cram_free_container(c
);
3644 if (c
->length
- len
!= hread(fd
->fp
, pads
, c
->length
- len
)) {
3645 cram_free_container(c
);
3651 cram_free_container(c
);
3655 hdr
= sam_hdr_parse_(header
, header_len
);
3662 * Converts 'in' to a full pathname to store in out.
3663 * Out must be at least PATH_MAX bytes long.
3665 static void full_path(char *out
, char *in
) {
3667 strncpy(out
, in
, PATH_MAX
);
3668 out
[PATH_MAX
-1] = 0;
3672 // unable to get dir or out+in is too long
3673 if (!getcwd(out
, PATH_MAX
) ||
3674 (len
= strlen(out
))+1+strlen(in
) >= PATH_MAX
) {
3675 strncpy(out
, in
, PATH_MAX
);
3676 out
[PATH_MAX
-1] = 0;
3680 sprintf(out
+len
, "/%.*s", PATH_MAX
- len
, in
);
3682 // FIXME: cope with `pwd`/../../../foo.fa ?
3687 * Writes a CRAM SAM header.
3688 * Returns 0 on success
3691 int cram_write_SAM_hdr(cram_fd
*fd
, SAM_hdr
*hdr
) {
3693 int blank_block
= (CRAM_MAJOR_VERS(fd
->version
) >= 3);
3695 /* Write CRAM MAGIC if not yet written. */
3696 if (fd
->file_def
->major_version
== 0) {
3697 fd
->file_def
->major_version
= CRAM_MAJOR_VERS(fd
->version
);
3698 fd
->file_def
->minor_version
= CRAM_MINOR_VERS(fd
->version
);
3699 if (0 != cram_write_file_def(fd
, fd
->file_def
))
3703 /* 1.0 requires an UNKNOWN read-group */
3704 if (CRAM_MAJOR_VERS(fd
->version
) == 1) {
3705 if (!sam_hdr_find_rg(hdr
, "UNKNOWN"))
3706 if (sam_hdr_add(hdr
, "RG",
3707 "ID", "UNKNOWN", "SM", "UNKNOWN", NULL
))
3711 /* Fix M5 strings */
3712 if (fd
->refs
&& !fd
->no_ref
) {
3714 for (i
= 0; i
< hdr
->nref
; i
++) {
3718 if (!(ty
= sam_hdr_find(hdr
, "SQ", "SN", hdr
->ref
[i
].name
)))
3721 if (!sam_hdr_find_key(hdr
, ty
, "M5", NULL
)) {
3722 char unsigned buf
[16];
3725 hts_md5_context
*md5
;
3728 !fd
->refs
->ref_id
||
3729 !fd
->refs
->ref_id
[i
]) {
3732 rlen
= fd
->refs
->ref_id
[i
]->length
;
3733 if (!(md5
= hts_md5_init()))
3735 ref
= cram_get_ref(fd
, i
, 1, rlen
);
3736 if (NULL
== ref
) return -1;
3737 rlen
= fd
->refs
->ref_id
[i
]->length
; /* In case it just loaded */
3738 hts_md5_update(md5
, ref
, rlen
);
3739 hts_md5_final(buf
, md5
);
3740 hts_md5_destroy(md5
);
3741 cram_ref_decr(fd
->refs
, i
);
3743 hts_md5_hex(buf2
, buf
);
3744 if (sam_hdr_update(hdr
, ty
, "M5", buf2
, NULL
))
3749 char ref_fn
[PATH_MAX
];
3750 full_path(ref_fn
, fd
->ref_fn
);
3751 if (sam_hdr_update(hdr
, ty
, "UR", ref_fn
, NULL
))
3757 if (sam_hdr_rebuild(hdr
))
3761 header_len
= sam_hdr_length(hdr
);
3762 if (CRAM_MAJOR_VERS(fd
->version
) == 1) {
3763 if (-1 == int32_encode(fd
, header_len
))
3767 if (header_len
!= hwrite(fd
->fp
, sam_hdr_str(hdr
), header_len
))
3770 /* Create block(s) inside a container */
3771 cram_block
*b
= cram_new_block(FILE_HEADER
, 0);
3772 cram_container
*c
= cram_new_container(0, 0);
3775 int is_cram_3
= (CRAM_MAJOR_VERS(fd
->version
) >= 3);
3778 if (b
) cram_free_block(b
);
3779 if (c
) cram_free_container(c
);
3783 int32_put_blk(b
, header_len
);
3785 BLOCK_APPEND(b
, sam_hdr_str(hdr
), header_len
);
3788 // Compress header block if V3.0 and above
3789 if (CRAM_MAJOR_VERS(fd
->version
) >= 3)
3790 cram_compress_block(fd
, b
, NULL
, -1, -1);
3793 c
->length
= b
->comp_size
+ 2 + 4*is_cram_3
+
3794 itf8_size(b
->content_id
) +
3795 itf8_size(b
->uncomp_size
) +
3796 itf8_size(b
->comp_size
);
3799 c
->num_landmarks
= 2;
3800 if (!(c
->landmark
= malloc(2*sizeof(*c
->landmark
)))) {
3802 cram_free_container(c
);
3806 c
->landmark
[1] = c
->length
;
3808 // Plus extra storage for uncompressed secondary blank block
3809 padded_length
= MIN(c
->length
*.5, 10000);
3810 c
->length
+= padded_length
+ 2 + 4*is_cram_3
+
3811 itf8_size(b
->content_id
) +
3812 itf8_size(padded_length
)*2;
3814 // Pad the block instead.
3816 c
->num_landmarks
= 1;
3817 if (!(c
->landmark
= malloc(sizeof(*c
->landmark
))))
3821 padded_length
= MAX(c
->length
*1.5, 10000) - c
->length
;
3823 c
->length
= b
->comp_size
+ padded_length
+
3825 itf8_size(b
->content_id
) +
3826 itf8_size(b
->uncomp_size
) +
3827 itf8_size(b
->comp_size
);
3829 if (NULL
== (pads
= calloc(1, padded_length
))) {
3831 cram_free_container(c
);
3834 BLOCK_APPEND(b
, pads
, padded_length
);
3839 if (-1 == cram_write_container(fd
, c
)) {
3841 cram_free_container(c
);
3845 if (-1 == cram_write_block(fd
, b
)) {
3847 cram_free_container(c
);
3852 BLOCK_RESIZE(b
, padded_length
);
3853 memset(BLOCK_DATA(b
), 0, padded_length
);
3854 BLOCK_SIZE(b
) = padded_length
;
3857 if (-1 == cram_write_block(fd
, b
)) {
3859 cram_free_container(c
);
3865 cram_free_container(c
);
3868 if (-1 == refs_from_header(fd
->refs
, fd
, fd
->header
))
3870 if (-1 == refs2id(fd
->refs
, fd
->header
))
3873 if (0 != hflush(fd
->fp
))
3876 RP("=== Finishing saving header ===\n");
3881 /* ----------------------------------------------------------------------
3882 * The top-level cram opening, closing and option handling
3886 * Initialises the lookup tables. These could be global statics, but they're
3887 * clumsy to setup in a multi-threaded environment unless we generate
3888 * verbatim code and include that.
3890 static void cram_init_tables(cram_fd
*fd
) {
3893 memset(fd
->L1
, 4, 256);
3894 fd
->L1
['A'] = 0; fd
->L1
['a'] = 0;
3895 fd
->L1
['C'] = 1; fd
->L1
['c'] = 1;
3896 fd
->L1
['G'] = 2; fd
->L1
['g'] = 2;
3897 fd
->L1
['T'] = 3; fd
->L1
['t'] = 3;
3899 memset(fd
->L2
, 5, 256);
3900 fd
->L2
['A'] = 0; fd
->L2
['a'] = 0;
3901 fd
->L2
['C'] = 1; fd
->L2
['c'] = 1;
3902 fd
->L2
['G'] = 2; fd
->L2
['g'] = 2;
3903 fd
->L2
['T'] = 3; fd
->L2
['t'] = 3;
3904 fd
->L2
['N'] = 4; fd
->L2
['n'] = 4;
3906 if (CRAM_MAJOR_VERS(fd
->version
) == 1) {
3907 for (i
= 0; i
< 0x200; i
++) {
3910 if (i
& CRAM_FPAIRED
) f
|= BAM_FPAIRED
;
3911 if (i
& CRAM_FPROPER_PAIR
) f
|= BAM_FPROPER_PAIR
;
3912 if (i
& CRAM_FUNMAP
) f
|= BAM_FUNMAP
;
3913 if (i
& CRAM_FREVERSE
) f
|= BAM_FREVERSE
;
3914 if (i
& CRAM_FREAD1
) f
|= BAM_FREAD1
;
3915 if (i
& CRAM_FREAD2
) f
|= BAM_FREAD2
;
3916 if (i
& CRAM_FSECONDARY
) f
|= BAM_FSECONDARY
;
3917 if (i
& CRAM_FQCFAIL
) f
|= BAM_FQCFAIL
;
3918 if (i
& CRAM_FDUP
) f
|= BAM_FDUP
;
3920 fd
->bam_flag_swap
[i
] = f
;
3923 for (i
= 0; i
< 0x1000; i
++) {
3926 if (i
& BAM_FPAIRED
) g
|= CRAM_FPAIRED
;
3927 if (i
& BAM_FPROPER_PAIR
) g
|= CRAM_FPROPER_PAIR
;
3928 if (i
& BAM_FUNMAP
) g
|= CRAM_FUNMAP
;
3929 if (i
& BAM_FREVERSE
) g
|= CRAM_FREVERSE
;
3930 if (i
& BAM_FREAD1
) g
|= CRAM_FREAD1
;
3931 if (i
& BAM_FREAD2
) g
|= CRAM_FREAD2
;
3932 if (i
& BAM_FSECONDARY
) g
|= CRAM_FSECONDARY
;
3933 if (i
& BAM_FQCFAIL
) g
|= CRAM_FQCFAIL
;
3934 if (i
& BAM_FDUP
) g
|= CRAM_FDUP
;
3936 fd
->cram_flag_swap
[i
] = g
;
3940 for (i
= 0; i
< 0x1000; i
++)
3941 fd
->bam_flag_swap
[i
] = i
;
3942 for (i
= 0; i
< 0x1000; i
++)
3943 fd
->cram_flag_swap
[i
] = i
;
3946 memset(fd
->cram_sub_matrix
, 4, 32*32);
3947 for (i
= 0; i
< 32; i
++) {
3948 fd
->cram_sub_matrix
[i
]['A'&0x1f]=0;
3949 fd
->cram_sub_matrix
[i
]['C'&0x1f]=1;
3950 fd
->cram_sub_matrix
[i
]['G'&0x1f]=2;
3951 fd
->cram_sub_matrix
[i
]['T'&0x1f]=3;
3952 fd
->cram_sub_matrix
[i
]['N'&0x1f]=4;
3954 for (i
= 0; i
< 20; i
+=4) {
3956 for (j
= 0; j
< 20; j
++) {
3957 fd
->cram_sub_matrix
["ACGTN"[i
>>2]&0x1f][j
]=3;
3958 fd
->cram_sub_matrix
["ACGTN"[i
>>2]&0x1f][j
]=3;
3959 fd
->cram_sub_matrix
["ACGTN"[i
>>2]&0x1f][j
]=3;
3960 fd
->cram_sub_matrix
["ACGTN"[i
>>2]&0x1f][j
]=3;
3962 fd
->cram_sub_matrix
["ACGTN"[i
>>2]&0x1f][CRAM_SUBST_MATRIX
[i
+0]&0x1f]=0;
3963 fd
->cram_sub_matrix
["ACGTN"[i
>>2]&0x1f][CRAM_SUBST_MATRIX
[i
+1]&0x1f]=1;
3964 fd
->cram_sub_matrix
["ACGTN"[i
>>2]&0x1f][CRAM_SUBST_MATRIX
[i
+2]&0x1f]=2;
3965 fd
->cram_sub_matrix
["ACGTN"[i
>>2]&0x1f][CRAM_SUBST_MATRIX
[i
+3]&0x1f]=3;
3969 // Default version numbers for CRAM
3970 static int major_version
= 3;
3971 static int minor_version
= 0;
3974 * Opens a CRAM file for read (mode "rb") or write ("wb").
3975 * The filename may be "-" to indicate stdin or stdout.
3977 * Returns file handle on success
3980 cram_fd
*cram_open(const char *filename
, const char *mode
) {
3983 char fmode
[3]= { mode
[0], '\0', '\0' };
3985 if (strlen(mode
) > 1 && (mode
[1] == 'b' || mode
[1] == 'c')) {
3989 fp
= hopen(filename
, fmode
);
3993 fd
= cram_dopen(fp
, filename
, mode
);
3995 hclose_abruptly(fp
);
4000 /* Opens an existing stream for reading or writing.
4002 * Returns file handle on success;
4005 cram_fd
*cram_dopen(hFILE
*fp
, const char *filename
, const char *mode
) {
4008 cram_fd
*fd
= calloc(1, sizeof(*fd
));
4013 for (i
= 0; mode
[i
]; i
++) {
4014 if (mode
[i
] >= '0' && mode
[i
] <= '9') {
4015 fd
->level
= mode
[i
] - '0';
4022 fd
->first_container
= 0;
4024 if (fd
->mode
== 'r') {
4027 if (!(fd
->file_def
= cram_read_file_def(fd
)))
4030 fd
->version
= fd
->file_def
->major_version
* 256 +
4031 fd
->file_def
->minor_version
;
4033 if (!(fd
->header
= cram_read_SAM_hdr(fd
))) {
4034 cram_free_file_def(fd
->file_def
);
4040 cram_file_def
*def
= calloc(1, sizeof(*def
));
4046 def
->magic
[0] = 'C';
4047 def
->magic
[1] = 'R';
4048 def
->magic
[2] = 'A';
4049 def
->magic
[3] = 'M';
4050 def
->major_version
= 0; // Indicator to write file def later.
4051 def
->minor_version
= 0;
4052 memset(def
->file_id
, 0, 20);
4053 strncpy(def
->file_id
, filename
, 20);
4055 fd
->version
= major_version
* 256 + minor_version
;
4057 /* SAM header written later along with this file_def */
4060 cram_init_tables(fd
);
4062 fd
->prefix
= strdup((cp
= strrchr(filename
, '/')) ? cp
+1 : filename
);
4065 fd
->first_base
= fd
->last_base
= -1;
4066 fd
->record_counter
= 0;
4069 fd
->refs
= refs_create();
4077 fd
->seqs_per_slice
= SEQS_PER_SLICE
;
4078 fd
->bases_per_slice
= BASES_PER_SLICE
;
4079 fd
->slices_per_container
= SLICE_PER_CNT
;
4083 fd
->lossy_read_names
= 0;
4085 fd
->use_rans
= (CRAM_MAJOR_VERS(fd
->version
) >= 3);
4095 fd
->job_pending
= NULL
;
4097 fd
->required_fields
= INT_MAX
;
4099 for (i
= 0; i
< DS_END
; i
++)
4100 fd
->m
[i
] = cram_new_metrics();
4102 if (!(fd
->tags_used
= kh_init(m_metrics
)))
4105 fd
->range
.refid
= -2; // no ref.
4106 fd
->eof
= 1; // See samtools issue #150
4111 /* Initialise dummy refs from the @SQ headers */
4112 if (-1 == refs_from_header(fd
->refs
, fd
, fd
->header
))
4125 * Seek within a CRAM file.
4127 * Returns 0 on success
4130 int cram_seek(cram_fd
*fd
, off_t offset
, int whence
) {
4135 if (hseek(fd
->fp
, offset
, whence
) >= 0)
4138 if (!(whence
== SEEK_CUR
&& offset
>= 0))
4141 /* Couldn't fseek, but we're in SEEK_CUR mode so read instead */
4142 while (offset
> 0) {
4143 int len
= MIN(65536, offset
);
4144 if (len
!= hread(fd
->fp
, buf
, len
))
4153 * Flushes a CRAM file.
4154 * Useful for when writing to stdout without wishing to close the stream.
4156 * Returns 0 on success
4159 int cram_flush(cram_fd
*fd
) {
4163 if (fd
->mode
== 'w' && fd
->ctr
) {
4165 cram_update_curr_slice(fd
->ctr
);
4167 if (-1 == cram_flush_container_mt(fd
, fd
->ctr
))
4175 * Closes a CRAM file.
4176 * Returns 0 on success
4179 int cram_close(cram_fd
*fd
) {
4180 spare_bams
*bl
, *next
;
4186 if (fd
->mode
== 'w' && fd
->ctr
) {
4188 cram_update_curr_slice(fd
->ctr
);
4190 if (-1 == cram_flush_container_mt(fd
, fd
->ctr
))
4194 if (fd
->pool
&& fd
->eof
>= 0) {
4195 hts_tpool_process_flush(fd
->rqueue
);
4197 if (0 != cram_flush_result(fd
))
4200 pthread_mutex_destroy(&fd
->metrics_lock
);
4201 pthread_mutex_destroy(&fd
->ref_lock
);
4202 pthread_mutex_destroy(&fd
->bam_list_lock
);
4204 fd
->ctr
= NULL
; // prevent double freeing
4206 //fprintf(stderr, "CRAM: destroy queue %p\n", fd->rqueue);
4208 hts_tpool_process_destroy(fd
->rqueue
);
4211 if (fd
->mode
== 'w') {
4212 /* Write EOF block */
4213 if (CRAM_MAJOR_VERS(fd
->version
) == 3) {
4214 if (38 != hwrite(fd
->fp
,
4215 "\x0f\x00\x00\x00\xff\xff\xff\xff" // Cont HDR
4216 "\x0f\xe0\x45\x4f\x46\x00\x00\x00" // Cont HDR
4217 "\x00\x01\x00" // Cont HDR
4218 "\x05\xbd\xd9\x4f" // CRC32
4219 "\x00\x01\x00\x06\x06" // Comp.HDR blk
4220 "\x01\x00\x01\x00\x01\x00" // Comp.HDR blk
4221 "\xee\x63\x01\x4b", // CRC32
4225 if (30 != hwrite(fd
->fp
,
4226 "\x0b\x00\x00\x00\xff\xff\xff\xff"
4227 "\x0f\xe0\x45\x4f\x46\x00\x00\x00"
4228 "\x00\x01\x00\x00\x01\x00\x06\x06"
4229 "\x01\x00\x01\x00\x01\x00", 30))
4234 for (bl
= fd
->bl
; bl
; bl
= next
) {
4235 int i
, max_rec
= fd
->seqs_per_slice
* fd
->slices_per_container
;
4238 for (i
= 0; i
< max_rec
; i
++) {
4240 bam_free(bl
->bams
[i
]);
4246 if (hclose(fd
->fp
) != 0)
4250 cram_free_file_def(fd
->file_def
);
4253 sam_hdr_free(fd
->header
);
4258 cram_free_container(fd
->ctr
);
4261 refs_free(fd
->refs
);
4265 for (i
= 0; i
< DS_END
; i
++)
4269 if (fd
->tags_used
) {
4272 for (k
= kh_begin(fd
->tags_used
); k
!= kh_end(fd
->tags_used
); k
++) {
4273 if (kh_exist(fd
->tags_used
, k
))
4274 free(kh_val(fd
->tags_used
, k
));
4277 kh_destroy(m_metrics
, fd
->tags_used
);
4281 cram_index_free(fd
);
4283 if (fd
->own_pool
&& fd
->pool
)
4284 hts_tpool_destroy(fd
->pool
);
4291 * Returns 1 if we hit an EOF while reading.
4293 int cram_eof(cram_fd
*fd
) {
4299 * Sets options on the cram_fd. See CRAM_OPT_* definitions in cram_structs.h.
4300 * Use this immediately after opening.
4302 * Returns 0 on success
4305 int cram_set_option(cram_fd
*fd
, enum hts_fmt_option opt
, ...) {
4309 va_start(args
, opt
);
4310 r
= cram_set_voption(fd
, opt
, args
);
4317 * Sets options on the cram_fd. See CRAM_OPT_* definitions in cram_structs.h.
4318 * Use this immediately after opening.
4320 * Returns 0 on success
4323 int cram_set_voption(cram_fd
*fd
, enum hts_fmt_option opt
, va_list args
) {
4332 case CRAM_OPT_DECODE_MD
:
4333 fd
->decode_md
= va_arg(args
, int);
4336 case CRAM_OPT_PREFIX
:
4339 if (!(fd
->prefix
= strdup(va_arg(args
, char *))))
4343 case CRAM_OPT_VERBOSITY
:
4344 fd
->verbose
= va_arg(args
, int);
4347 case CRAM_OPT_SEQS_PER_SLICE
:
4348 fd
->seqs_per_slice
= va_arg(args
, int);
4351 case CRAM_OPT_BASES_PER_SLICE
:
4352 fd
->bases_per_slice
= va_arg(args
, int);
4355 case CRAM_OPT_SLICES_PER_CONTAINER
:
4356 fd
->slices_per_container
= va_arg(args
, int);
4359 case CRAM_OPT_EMBED_REF
:
4360 fd
->embed_ref
= va_arg(args
, int);
4363 case CRAM_OPT_NO_REF
:
4364 fd
->no_ref
= va_arg(args
, int);
4367 case CRAM_OPT_IGNORE_MD5
:
4368 fd
->ignore_md5
= va_arg(args
, int);
4371 case CRAM_OPT_LOSSY_NAMES
:
4372 fd
->lossy_read_names
= va_arg(args
, int);
4373 // Currently lossy read names required paired (attached) reads.
4374 // TLEN 0 or being 1 out causes read pairs to be detached, breaking
4375 // the lossy read name compression, so we have extra options to
4376 // slacken the exact TLEN round-trip checks.
4377 fd
->tlen_approx
= fd
->lossy_read_names
;
4378 fd
->tlen_zero
= fd
->lossy_read_names
;
4381 case CRAM_OPT_USE_BZIP2
:
4382 fd
->use_bz2
= va_arg(args
, int);
4385 case CRAM_OPT_USE_RANS
:
4386 fd
->use_rans
= va_arg(args
, int);
4389 case CRAM_OPT_USE_LZMA
:
4390 fd
->use_lzma
= va_arg(args
, int);
4393 case CRAM_OPT_SHARED_REF
:
4395 refs
= va_arg(args
, refs_t
*);
4396 if (refs
!= fd
->refs
) {
4398 refs_free(fd
->refs
);
4404 case CRAM_OPT_RANGE
:
4405 fd
->range
= *va_arg(args
, cram_range
*);
4406 return cram_seek_to_refpos(fd
, &fd
->range
);
4408 case CRAM_OPT_REFERENCE
:
4409 return cram_load_reference(fd
, va_arg(args
, char *));
4411 case CRAM_OPT_VERSION
: {
4413 char *s
= va_arg(args
, char *);
4414 if (2 != sscanf(s
, "%d.%d", &major
, &minor
)) {
4415 fprintf(stderr
, "Malformed version string %s\n", s
);
4418 if (!((major
== 1 && minor
== 0) ||
4419 (major
== 2 && (minor
== 0 || minor
== 1)) ||
4420 (major
== 3 && minor
== 0))) {
4421 fprintf(stderr
, "Unknown version string; "
4422 "use 1.0, 2.0, 2.1 or 3.0\n");
4426 fd
->version
= major
*256 + minor
;
4428 if (CRAM_MAJOR_VERS(fd
->version
) >= 3)
4433 case CRAM_OPT_MULTI_SEQ_PER_SLICE
:
4434 fd
->multi_seq
= va_arg(args
, int);
4437 case CRAM_OPT_NTHREADS
: {
4438 int nthreads
= va_arg(args
, int);
4440 if (!(fd
->pool
= hts_tpool_init(nthreads
)))
4443 fd
->rqueue
= hts_tpool_process_init(fd
->pool
, nthreads
*2, 0);
4444 pthread_mutex_init(&fd
->metrics_lock
, NULL
);
4445 pthread_mutex_init(&fd
->ref_lock
, NULL
);
4446 pthread_mutex_init(&fd
->bam_list_lock
, NULL
);
4453 case CRAM_OPT_THREAD_POOL
: {
4454 htsThreadPool
*p
= va_arg(args
, htsThreadPool
*);
4455 fd
->pool
= p
? p
->pool
: NULL
;
4457 fd
->rqueue
= hts_tpool_process_init(fd
->pool
,
4458 p
->qsize
? p
->qsize
: hts_tpool_size(fd
->pool
)*2,
4460 pthread_mutex_init(&fd
->metrics_lock
, NULL
);
4461 pthread_mutex_init(&fd
->ref_lock
, NULL
);
4462 pthread_mutex_init(&fd
->bam_list_lock
, NULL
);
4464 fd
->shared_ref
= 1; // Needed to avoid clobbering ref between threads
4468 //fd->decoded = calloc(fd->qsize, sizeof(cram_container *));
4469 //hts_tpool_dispatch(fd->pool, cram_decoder_thread, fd);
4473 case CRAM_OPT_REQUIRED_FIELDS
:
4474 fd
->required_fields
= va_arg(args
, int);
4477 case HTS_OPT_COMPRESSION_LEVEL
:
4478 fd
->level
= va_arg(args
, int);
4482 fprintf(stderr
, "Unknown CRAM option code %d\n", opt
);
4490 int cram_check_EOF(cram_fd
*fd
)
4492 // Byte 9 in these templates is & with 0x0f to resolve differences
4493 // between ITF-8 interpretations between early Java and C
4494 // implementations of CRAM
4495 static const unsigned char TEMPLATE_2_1
[30] = {
4496 0x0b, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0x0f, 0xe0,
4497 0x45, 0x4f, 0x46, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00,
4498 0x01, 0x00, 0x06, 0x06, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00
4500 static const unsigned char TEMPLATE_3
[38] = {
4501 0x0f, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0x0f, 0xe0,
4502 0x45, 0x4f, 0x46, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x05,
4503 0xbd, 0xd9, 0x4f, 0x00, 0x01, 0x00, 0x06, 0x06, 0x01, 0x00,
4504 0x01, 0x00, 0x01, 0x00, 0xee, 0x63, 0x01, 0x4b
4507 unsigned char buf
[38]; // max(sizeof TEMPLATE_*)
4509 uint8_t major
= CRAM_MAJOR_VERS(fd
->version
);
4510 uint8_t minor
= CRAM_MINOR_VERS(fd
->version
);
4512 const unsigned char *template;
4513 ssize_t template_len
;
4515 (major
== 2 && minor
== 0)) {
4516 return 3; // No EOF support in cram versions less than 2.1
4517 } else if (major
== 2 && minor
== 1) {
4518 template = TEMPLATE_2_1
;
4519 template_len
= sizeof TEMPLATE_2_1
;
4521 template = TEMPLATE_3
;
4522 template_len
= sizeof TEMPLATE_3
;
4525 off_t offset
= htell(fd
->fp
);
4526 if (hseek(fd
->fp
, -template_len
, SEEK_END
) < 0) {
4527 if (errno
== ESPIPE
) {
4535 if (hread(fd
->fp
, buf
, template_len
) != template_len
) return -1;
4536 if (hseek(fd
->fp
, offset
, SEEK_SET
) < 0) return -1;
4538 return (memcmp(template, buf
, template_len
) == 0)? 1 : 0;