modified: myjupyterlab.sh
[GalaxyCodeBases.git] / c_cpp / lib / htslib / cram / cram_io.c
blob8e2f1ea5899c8e95a1d641992e3a1746cd7fb4ac
1 /*
2 Copyright (c) 2012-2016 Genome Research Ltd.
3 Author: James Bonfield <jkb@sanger.ac.uk>
5 Redistribution and use in source and binary forms, with or without
6 modification, are permitted provided that the following conditions are met:
8 1. Redistributions of source code must retain the above copyright notice,
9 this list of conditions and the following disclaimer.
11 2. Redistributions in binary form must reproduce the above copyright notice,
12 this list of conditions and the following disclaimer in the documentation
13 and/or other materials provided with the distribution.
15 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
16 Institute nor the names of its contributors may be used to endorse or promote
17 products derived from this software without specific prior written permission.
19 THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
20 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22 DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
23 FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 * CRAM I/O primitives.
34 * - ITF8 encoding and decoding.
35 * - Block based I/O
36 * - Zlib inflating and deflating (memory)
37 * - CRAM basic data structure reading and writing
38 * - File opening / closing
39 * - Reference sequence handling
43 * TODO: BLOCK_GROW, BLOCK_RESIZE, BLOCK_APPEND and itf8_put_blk all need
44 * a way to return errors for when malloc fails.
47 #include <config.h>
49 #include <stdio.h>
50 #include <errno.h>
51 #include <assert.h>
52 #include <stdlib.h>
53 #include <string.h>
54 #include <unistd.h>
55 #include <zlib.h>
56 #ifdef HAVE_LIBBZ2
57 #include <bzlib.h>
58 #endif
59 #ifdef HAVE_LIBLZMA
60 #include <lzma.h>
61 #endif
62 #include <sys/types.h>
63 #include <sys/stat.h>
64 #include <math.h>
65 #include <time.h>
66 #include <stdint.h>
68 #include "cram/cram.h"
69 #include "cram/os.h"
70 #include "htslib/hts.h"
71 #include "cram/open_trace_file.h"
72 #include "cram/rANS_static.h"
74 //#define REF_DEBUG
76 #ifdef REF_DEBUG
77 #include <sys/syscall.h>
78 #define gettid() (int)syscall(SYS_gettid)
80 #define RP(...) fprintf (stderr, __VA_ARGS__)
81 #else
82 #define RP(...)
83 #endif
85 #include "htslib/hfile.h"
86 #include "htslib/bgzf.h"
87 #include "htslib/faidx.h"
88 #include "hts_internal.h"
90 #ifndef PATH_MAX
91 #define PATH_MAX FILENAME_MAX
92 #endif
94 #define TRIAL_SPAN 50
95 #define NTRIALS 3
98 /* ----------------------------------------------------------------------
99 * ITF8 encoding and decoding.
101 * Also see the itf8_get and itf8_put macros in cram_io.h
105 * LEGACY: consider using itf8_decode_crc.
107 * Reads an integer in ITF-8 encoding from 'cp' and stores it in
108 * *val.
110 * Returns the number of bytes read on success
111 * -1 on failure
113 int itf8_decode(cram_fd *fd, int32_t *val_p) {
114 static int nbytes[16] = {
115 0,0,0,0, 0,0,0,0, // 0000xxxx - 0111xxxx
116 1,1,1,1, // 1000xxxx - 1011xxxx
117 2,2, // 1100xxxx - 1101xxxx
118 3, // 1110xxxx
119 4, // 1111xxxx
122 static int nbits[16] = {
123 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, // 0000xxxx - 0111xxxx
124 0x3f, 0x3f, 0x3f, 0x3f, // 1000xxxx - 1011xxxx
125 0x1f, 0x1f, // 1100xxxx - 1101xxxx
126 0x0f, // 1110xxxx
127 0x0f, // 1111xxxx
130 int32_t val = hgetc(fd->fp);
131 if (val == -1)
132 return -1;
134 int i = nbytes[val>>4];
135 val &= nbits[val>>4];
137 switch(i) {
138 case 0:
139 *val_p = val;
140 return 1;
142 case 1:
143 val = (val<<8) | (unsigned char)hgetc(fd->fp);
144 *val_p = val;
145 return 2;
147 case 2:
148 val = (val<<8) | (unsigned char)hgetc(fd->fp);
149 val = (val<<8) | (unsigned char)hgetc(fd->fp);
150 *val_p = val;
151 return 3;
153 case 3:
154 val = (val<<8) | (unsigned char)hgetc(fd->fp);
155 val = (val<<8) | (unsigned char)hgetc(fd->fp);
156 val = (val<<8) | (unsigned char)hgetc(fd->fp);
157 *val_p = val;
158 return 4;
160 case 4: // really 3.5 more, why make it different?
161 val = (val<<8) | (unsigned char)hgetc(fd->fp);
162 val = (val<<8) | (unsigned char)hgetc(fd->fp);
163 val = (val<<8) | (unsigned char)hgetc(fd->fp);
164 val = (val<<4) | (((unsigned char)hgetc(fd->fp)) & 0x0f);
165 *val_p = val;
168 return 5;
171 int itf8_decode_crc(cram_fd *fd, int32_t *val_p, uint32_t *crc) {
172 static int nbytes[16] = {
173 0,0,0,0, 0,0,0,0, // 0000xxxx - 0111xxxx
174 1,1,1,1, // 1000xxxx - 1011xxxx
175 2,2, // 1100xxxx - 1101xxxx
176 3, // 1110xxxx
177 4, // 1111xxxx
180 static int nbits[16] = {
181 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, // 0000xxxx - 0111xxxx
182 0x3f, 0x3f, 0x3f, 0x3f, // 1000xxxx - 1011xxxx
183 0x1f, 0x1f, // 1100xxxx - 1101xxxx
184 0x0f, // 1110xxxx
185 0x0f, // 1111xxxx
187 unsigned char c[5];
189 int32_t val = hgetc(fd->fp);
190 if (val == -1)
191 return -1;
193 c[0]=val;
195 int i = nbytes[val>>4];
196 val &= nbits[val>>4];
198 switch(i) {
199 case 0:
200 *val_p = val;
201 *crc = crc32(*crc, c, 1);
202 return 1;
204 case 1:
205 val = (val<<8) | (c[1]=hgetc(fd->fp));
206 *val_p = val;
207 *crc = crc32(*crc, c, 2);
208 return 2;
210 case 2:
211 val = (val<<8) | (c[1]=hgetc(fd->fp));
212 val = (val<<8) | (c[2]=hgetc(fd->fp));
213 *val_p = val;
214 *crc = crc32(*crc, c, 3);
215 return 3;
217 case 3:
218 val = (val<<8) | (c[1]=hgetc(fd->fp));
219 val = (val<<8) | (c[2]=hgetc(fd->fp));
220 val = (val<<8) | (c[3]=hgetc(fd->fp));
221 *val_p = val;
222 *crc = crc32(*crc, c, 4);
223 return 4;
225 case 4: // really 3.5 more, why make it different?
227 uint32_t uv = val;
228 uv = (uv<<8) | (c[1]=hgetc(fd->fp));
229 uv = (uv<<8) | (c[2]=hgetc(fd->fp));
230 uv = (uv<<8) | (c[3]=hgetc(fd->fp));
231 uv = (uv<<4) | (((c[4]=hgetc(fd->fp))) & 0x0f);
232 *val_p = uv < 0x80000000UL ? uv : -((int32_t) (0xffffffffUL - uv)) - 1;
233 *crc = crc32(*crc, c, 5);
237 return 5;
241 * Encodes and writes a single integer in ITF-8 format.
242 * Returns 0 on success
243 * -1 on failure
245 int itf8_encode(cram_fd *fd, int32_t val) {
246 char buf[5];
247 int len = itf8_put(buf, val);
248 return hwrite(fd->fp, buf, len) == len ? 0 : -1;
251 const int itf8_bytes[16] = {
252 1, 1, 1, 1, 1, 1, 1, 1,
253 2, 2, 2, 2, 3, 3, 4, 5
256 const int ltf8_bytes[256] = {
257 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
258 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
259 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
260 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
262 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
263 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
264 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
265 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
267 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
268 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
269 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
270 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
272 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
273 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
275 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
277 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 8, 9
281 * LEGACY: consider using ltf8_decode_crc.
283 int ltf8_decode(cram_fd *fd, int64_t *val_p) {
284 int c = hgetc(fd->fp);
285 int64_t val = (unsigned char)c;
286 if (c == -1)
287 return -1;
289 if (val < 0x80) {
290 *val_p = val;
291 return 1;
293 } else if (val < 0xc0) {
294 val = (val<<8) | (unsigned char)hgetc(fd->fp);
295 *val_p = val & (((1LL<<(6+8)))-1);
296 return 2;
298 } else if (val < 0xe0) {
299 val = (val<<8) | (unsigned char)hgetc(fd->fp);
300 val = (val<<8) | (unsigned char)hgetc(fd->fp);
301 *val_p = val & ((1LL<<(5+2*8))-1);
302 return 3;
304 } else if (val < 0xf0) {
305 val = (val<<8) | (unsigned char)hgetc(fd->fp);
306 val = (val<<8) | (unsigned char)hgetc(fd->fp);
307 val = (val<<8) | (unsigned char)hgetc(fd->fp);
308 *val_p = val & ((1LL<<(4+3*8))-1);
309 return 4;
311 } else if (val < 0xf8) {
312 val = (val<<8) | (unsigned char)hgetc(fd->fp);
313 val = (val<<8) | (unsigned char)hgetc(fd->fp);
314 val = (val<<8) | (unsigned char)hgetc(fd->fp);
315 val = (val<<8) | (unsigned char)hgetc(fd->fp);
316 *val_p = val & ((1LL<<(3+4*8))-1);
317 return 5;
319 } else if (val < 0xfc) {
320 val = (val<<8) | (unsigned char)hgetc(fd->fp);
321 val = (val<<8) | (unsigned char)hgetc(fd->fp);
322 val = (val<<8) | (unsigned char)hgetc(fd->fp);
323 val = (val<<8) | (unsigned char)hgetc(fd->fp);
324 val = (val<<8) | (unsigned char)hgetc(fd->fp);
325 *val_p = val & ((1LL<<(2+5*8))-1);
326 return 6;
328 } else if (val < 0xfe) {
329 val = (val<<8) | (unsigned char)hgetc(fd->fp);
330 val = (val<<8) | (unsigned char)hgetc(fd->fp);
331 val = (val<<8) | (unsigned char)hgetc(fd->fp);
332 val = (val<<8) | (unsigned char)hgetc(fd->fp);
333 val = (val<<8) | (unsigned char)hgetc(fd->fp);
334 val = (val<<8) | (unsigned char)hgetc(fd->fp);
335 *val_p = val & ((1LL<<(1+6*8))-1);
336 return 7;
338 } else if (val < 0xff) {
339 val = (val<<8) | (unsigned char)hgetc(fd->fp);
340 val = (val<<8) | (unsigned char)hgetc(fd->fp);
341 val = (val<<8) | (unsigned char)hgetc(fd->fp);
342 val = (val<<8) | (unsigned char)hgetc(fd->fp);
343 val = (val<<8) | (unsigned char)hgetc(fd->fp);
344 val = (val<<8) | (unsigned char)hgetc(fd->fp);
345 val = (val<<8) | (unsigned char)hgetc(fd->fp);
346 *val_p = val & ((1LL<<(7*8))-1);
347 return 8;
349 } else {
350 val = (val<<8) | (unsigned char)hgetc(fd->fp);
351 val = (val<<8) | (unsigned char)hgetc(fd->fp);
352 val = (val<<8) | (unsigned char)hgetc(fd->fp);
353 val = (val<<8) | (unsigned char)hgetc(fd->fp);
354 val = (val<<8) | (unsigned char)hgetc(fd->fp);
355 val = (val<<8) | (unsigned char)hgetc(fd->fp);
356 val = (val<<8) | (unsigned char)hgetc(fd->fp);
357 val = (val<<8) | (unsigned char)hgetc(fd->fp);
358 *val_p = val;
361 return 9;
364 int ltf8_decode_crc(cram_fd *fd, int64_t *val_p, uint32_t *crc) {
365 unsigned char c[9];
366 int64_t val = (unsigned char)hgetc(fd->fp);
367 if (val == -1)
368 return -1;
370 c[0] = val;
372 if (val < 0x80) {
373 *val_p = val;
374 *crc = crc32(*crc, c, 1);
375 return 1;
377 } else if (val < 0xc0) {
378 val = (val<<8) | (c[1]=hgetc(fd->fp));;
379 *val_p = val & (((1LL<<(6+8)))-1);
380 *crc = crc32(*crc, c, 2);
381 return 2;
383 } else if (val < 0xe0) {
384 val = (val<<8) | (c[1]=hgetc(fd->fp));;
385 val = (val<<8) | (c[2]=hgetc(fd->fp));;
386 *val_p = val & ((1LL<<(5+2*8))-1);
387 *crc = crc32(*crc, c, 3);
388 return 3;
390 } else if (val < 0xf0) {
391 val = (val<<8) | (c[1]=hgetc(fd->fp));;
392 val = (val<<8) | (c[2]=hgetc(fd->fp));;
393 val = (val<<8) | (c[3]=hgetc(fd->fp));;
394 *val_p = val & ((1LL<<(4+3*8))-1);
395 *crc = crc32(*crc, c, 4);
396 return 4;
398 } else if (val < 0xf8) {
399 val = (val<<8) | (c[1]=hgetc(fd->fp));;
400 val = (val<<8) | (c[2]=hgetc(fd->fp));;
401 val = (val<<8) | (c[3]=hgetc(fd->fp));;
402 val = (val<<8) | (c[4]=hgetc(fd->fp));;
403 *val_p = val & ((1LL<<(3+4*8))-1);
404 *crc = crc32(*crc, c, 5);
405 return 5;
407 } else if (val < 0xfc) {
408 val = (val<<8) | (c[1]=hgetc(fd->fp));;
409 val = (val<<8) | (c[2]=hgetc(fd->fp));;
410 val = (val<<8) | (c[3]=hgetc(fd->fp));;
411 val = (val<<8) | (c[4]=hgetc(fd->fp));;
412 val = (val<<8) | (c[5]=hgetc(fd->fp));;
413 *val_p = val & ((1LL<<(2+5*8))-1);
414 *crc = crc32(*crc, c, 6);
415 return 6;
417 } else if (val < 0xfe) {
418 val = (val<<8) | (c[1]=hgetc(fd->fp));;
419 val = (val<<8) | (c[2]=hgetc(fd->fp));;
420 val = (val<<8) | (c[3]=hgetc(fd->fp));;
421 val = (val<<8) | (c[4]=hgetc(fd->fp));;
422 val = (val<<8) | (c[5]=hgetc(fd->fp));;
423 val = (val<<8) | (c[6]=hgetc(fd->fp));;
424 *val_p = val & ((1LL<<(1+6*8))-1);
425 *crc = crc32(*crc, c, 7);
426 return 7;
428 } else if (val < 0xff) {
429 val = (val<<8) | (c[1]=hgetc(fd->fp));;
430 val = (val<<8) | (c[2]=hgetc(fd->fp));;
431 val = (val<<8) | (c[3]=hgetc(fd->fp));;
432 val = (val<<8) | (c[4]=hgetc(fd->fp));;
433 val = (val<<8) | (c[5]=hgetc(fd->fp));;
434 val = (val<<8) | (c[6]=hgetc(fd->fp));;
435 val = (val<<8) | (c[7]=hgetc(fd->fp));;
436 *val_p = val & ((1LL<<(7*8))-1);
437 *crc = crc32(*crc, c, 8);
438 return 8;
440 } else {
441 val = (val<<8) | (c[1]=hgetc(fd->fp));;
442 val = (val<<8) | (c[2]=hgetc(fd->fp));;
443 val = (val<<8) | (c[3]=hgetc(fd->fp));;
444 val = (val<<8) | (c[4]=hgetc(fd->fp));;
445 val = (val<<8) | (c[5]=hgetc(fd->fp));;
446 val = (val<<8) | (c[6]=hgetc(fd->fp));;
447 val = (val<<8) | (c[7]=hgetc(fd->fp));;
448 val = (val<<8) | (c[8]=hgetc(fd->fp));;
449 *crc = crc32(*crc, c, 9);
450 *val_p = val;
453 return 9;
457 * Pushes a value in ITF8 format onto the end of a block.
458 * This shouldn't be used for high-volume data as it is not the fastest
459 * method.
461 * Returns the number of bytes written
463 int itf8_put_blk(cram_block *blk, int val) {
464 char buf[5];
465 int sz;
467 sz = itf8_put(buf, val);
468 BLOCK_APPEND(blk, buf, sz);
469 return sz;
473 * Decodes a 32-bit little endian value from fd and stores in val.
475 * Returns the number of bytes read on success
476 * -1 on failure
478 int int32_decode(cram_fd *fd, int32_t *val) {
479 int32_t i;
480 if (4 != hread(fd->fp, &i, 4))
481 return -1;
483 *val = le_int4(i);
484 return 4;
488 * Encodes a 32-bit little endian value 'val' and writes to fd.
490 * Returns the number of bytes written on success
491 * -1 on failure
493 int int32_encode(cram_fd *fd, int32_t val) {
494 val = le_int4(val);
495 if (4 != hwrite(fd->fp, &val, 4))
496 return -1;
498 return 4;
501 /* As int32_decoded/encode, but from/to blocks instead of cram_fd */
502 int int32_get_blk(cram_block *b, int32_t *val) {
503 if (b->uncomp_size - BLOCK_SIZE(b) < 4)
504 return -1;
506 *val =
507 b->data[b->byte ] |
508 (b->data[b->byte+1] << 8) |
509 (b->data[b->byte+2] << 16) |
510 (b->data[b->byte+3] << 24);
511 BLOCK_SIZE(b) += 4;
512 return 4;
515 /* As int32_decoded/encode, but from/to blocks instead of cram_fd */
516 int int32_put_blk(cram_block *b, int32_t val) {
517 unsigned char cp[4];
518 cp[0] = ( val & 0xff);
519 cp[1] = ((val>>8) & 0xff);
520 cp[2] = ((val>>16) & 0xff);
521 cp[3] = ((val>>24) & 0xff);
523 BLOCK_APPEND(b, cp, 4);
524 return b->data ? 0 : -1;
527 /* ----------------------------------------------------------------------
528 * zlib compression code - from Gap5's tg_iface_g.c
529 * They're static here as they're only used within the cram_compress_block
530 * and cram_uncompress_block functions, which are the external interface.
532 char *zlib_mem_inflate(char *cdata, size_t csize, size_t *size) {
533 z_stream s;
534 unsigned char *data = NULL; /* Uncompressed output */
535 int data_alloc = 0;
536 int err;
538 /* Starting point at uncompressed size, and scale after that */
539 data = malloc(data_alloc = csize*1.2+100);
540 if (!data)
541 return NULL;
543 /* Initialise zlib stream */
544 s.zalloc = Z_NULL; /* use default allocation functions */
545 s.zfree = Z_NULL;
546 s.opaque = Z_NULL;
547 s.next_in = (unsigned char *)cdata;
548 s.avail_in = csize;
549 s.total_in = 0;
550 s.next_out = data;
551 s.avail_out = data_alloc;
552 s.total_out = 0;
554 //err = inflateInit(&s);
555 err = inflateInit2(&s, 15 + 32);
556 if (err != Z_OK) {
557 fprintf(stderr, "zlib inflateInit error: %s\n", s.msg);
558 free(data);
559 return NULL;
562 /* Decode to 'data' array */
563 for (;s.avail_in;) {
564 unsigned char *data_tmp;
565 int alloc_inc;
567 s.next_out = &data[s.total_out];
568 err = inflate(&s, Z_NO_FLUSH);
569 if (err == Z_STREAM_END)
570 break;
572 if (err != Z_OK) {
573 fprintf(stderr, "zlib inflate error: %s\n", s.msg);
574 if (data)
575 free(data);
576 return NULL;
579 /* More to come, so realloc based on growth so far */
580 alloc_inc = (double)s.avail_in/s.total_in * s.total_out + 100;
581 data = realloc((data_tmp = data), data_alloc += alloc_inc);
582 if (!data) {
583 free(data_tmp);
584 return NULL;
586 s.avail_out += alloc_inc;
588 inflateEnd(&s);
590 *size = s.total_out;
591 return (char *)data;
594 static char *zlib_mem_deflate(char *data, size_t size, size_t *cdata_size,
595 int level, int strat) {
596 z_stream s;
597 unsigned char *cdata = NULL; /* Compressed output */
598 int cdata_alloc = 0;
599 int cdata_pos = 0;
600 int err;
602 cdata = malloc(cdata_alloc = size*1.05+100);
603 if (!cdata)
604 return NULL;
605 cdata_pos = 0;
607 /* Initialise zlib stream */
608 s.zalloc = Z_NULL; /* use default allocation functions */
609 s.zfree = Z_NULL;
610 s.opaque = Z_NULL;
611 s.next_in = (unsigned char *)data;
612 s.avail_in = size;
613 s.total_in = 0;
614 s.next_out = cdata;
615 s.avail_out = cdata_alloc;
616 s.total_out = 0;
617 s.data_type = Z_BINARY;
619 err = deflateInit2(&s, level, Z_DEFLATED, 15|16, 9, strat);
620 if (err != Z_OK) {
621 fprintf(stderr, "zlib deflateInit2 error: %s\n", s.msg);
622 return NULL;
625 /* Encode to 'cdata' array */
626 for (;s.avail_in;) {
627 s.next_out = &cdata[cdata_pos];
628 s.avail_out = cdata_alloc - cdata_pos;
629 if (cdata_alloc - cdata_pos <= 0) {
630 fprintf(stderr, "Deflate produced larger output than expected. Abort\n");
631 return NULL;
633 err = deflate(&s, Z_NO_FLUSH);
634 cdata_pos = cdata_alloc - s.avail_out;
635 if (err != Z_OK) {
636 fprintf(stderr, "zlib deflate error: %s\n", s.msg);
637 break;
640 if (deflate(&s, Z_FINISH) != Z_STREAM_END) {
641 fprintf(stderr, "zlib deflate error: %s\n", s.msg);
643 *cdata_size = s.total_out;
645 if (deflateEnd(&s) != Z_OK) {
646 fprintf(stderr, "zlib deflate error: %s\n", s.msg);
648 return (char *)cdata;
651 #ifdef HAVE_LIBLZMA
652 /* ------------------------------------------------------------------------ */
654 * Data compression routines using liblzma (xz)
656 * On a test set this shrunk the main db from 136157104 bytes to 114796168, but
657 * caused tg_index to grow from 2m43.707s to 15m3.961s. Exporting as bfastq
658 * went from 18.3s to 36.3s. So decompression suffers too, but not as bad
659 * as compression times.
661 * For now we disable this functionality. If it's to be reenabled make sure you
662 * improve the mem_inflate implementation as it's just a test hack at the
663 * moment.
666 static char *lzma_mem_deflate(char *data, size_t size, size_t *cdata_size,
667 int level) {
668 char *out;
669 size_t out_size = lzma_stream_buffer_bound(size);
670 *cdata_size = 0;
672 out = malloc(out_size);
674 /* Single call compression */
675 if (LZMA_OK != lzma_easy_buffer_encode(level, LZMA_CHECK_CRC32, NULL,
676 (uint8_t *)data, size,
677 (uint8_t *)out, cdata_size,
678 out_size))
679 return NULL;
681 return out;
684 static char *lzma_mem_inflate(char *cdata, size_t csize, size_t *size) {
685 lzma_stream strm = LZMA_STREAM_INIT;
686 size_t out_size = 0, out_pos = 0;
687 char *out = NULL;
688 int r;
690 /* Initiate the decoder */
691 if (LZMA_OK != lzma_stream_decoder(&strm, lzma_easy_decoder_memusage(9), 0))
692 return NULL;
694 /* Decode loop */
695 strm.avail_in = csize;
696 strm.next_in = (uint8_t *)cdata;
698 for (;strm.avail_in;) {
699 if (strm.avail_in > out_size - out_pos) {
700 out_size += strm.avail_in * 4 + 32768;
701 out = realloc(out, out_size);
703 strm.avail_out = out_size - out_pos;
704 strm.next_out = (uint8_t *)&out[out_pos];
706 r = lzma_code(&strm, LZMA_RUN);
707 if (LZMA_OK != r && LZMA_STREAM_END != r) {
708 fprintf(stderr, "[E::%s] LZMA decode failure (error %d)\n", __func__, r);
709 return NULL;
712 out_pos = strm.total_out;
714 if (r == LZMA_STREAM_END)
715 break;
718 /* finish up any unflushed data; necessary? */
719 r = lzma_code(&strm, LZMA_FINISH);
720 if (r != LZMA_OK && r != LZMA_STREAM_END) {
721 fprintf(stderr, "r=%d\n", r);
722 return NULL;
725 out = realloc(out, strm.total_out);
726 *size = strm.total_out;
728 lzma_end(&strm);
730 return out;
732 #endif
734 /* ----------------------------------------------------------------------
735 * CRAM blocks - the dynamically growable data block. We have code to
736 * create, update, (un)compress and read/write.
738 * These are derived from the deflate_interlaced.c blocks, but with the
739 * CRAM extension of content types and IDs.
743 * Allocates a new cram_block structure with a specified content_type and
744 * id.
746 * Returns block pointer on success
747 * NULL on failure
749 cram_block *cram_new_block(enum cram_content_type content_type,
750 int content_id) {
751 cram_block *b = malloc(sizeof(*b));
752 if (!b)
753 return NULL;
754 b->method = b->orig_method = RAW;
755 b->content_type = content_type;
756 b->content_id = content_id;
757 b->comp_size = 0;
758 b->uncomp_size = 0;
759 b->data = NULL;
760 b->alloc = 0;
761 b->byte = 0;
762 b->bit = 7; // MSB
764 return b;
768 * Reads a block from a cram file.
769 * Returns cram_block pointer on success.
770 * NULL on failure
772 cram_block *cram_read_block(cram_fd *fd) {
773 cram_block *b = malloc(sizeof(*b));
774 unsigned char c;
775 uint32_t crc = 0;
776 if (!b)
777 return NULL;
779 //fprintf(stderr, "Block at %d\n", (int)ftell(fd->fp));
781 if (-1 == (b->method = hgetc(fd->fp))) { free(b); return NULL; }
782 c = b->method; crc = crc32(crc, &c, 1);
783 if (-1 == (b->content_type= hgetc(fd->fp))) { free(b); return NULL; }
784 c = b->content_type; crc = crc32(crc, &c, 1);
785 if (-1 == itf8_decode_crc(fd, &b->content_id, &crc)) { free(b); return NULL; }
786 if (-1 == itf8_decode_crc(fd, &b->comp_size, &crc)) { free(b); return NULL; }
787 if (-1 == itf8_decode_crc(fd, &b->uncomp_size, &crc)) { free(b); return NULL; }
789 // fprintf(stderr, " method %d, ctype %d, cid %d, csize %d, ucsize %d\n",
790 // b->method, b->content_type, b->content_id, b->comp_size, b->uncomp_size);
792 if (b->method == RAW) {
793 if (b->uncomp_size < 0) { free(b); return NULL; }
794 b->alloc = b->uncomp_size;
795 if (!(b->data = malloc(b->uncomp_size))){ free(b); return NULL; }
796 if (b->uncomp_size != hread(fd->fp, b->data, b->uncomp_size)) {
797 free(b->data);
798 free(b);
799 return NULL;
801 } else {
802 if (b->comp_size < 0) { free(b); return NULL; }
803 b->alloc = b->comp_size;
804 if (!(b->data = malloc(b->comp_size))) { free(b); return NULL; }
805 if (b->comp_size != hread(fd->fp, b->data, b->comp_size)) {
806 free(b->data);
807 free(b);
808 return NULL;
812 if (CRAM_MAJOR_VERS(fd->version) >= 3) {
813 if (-1 == int32_decode(fd, (int32_t *)&b->crc32)) {
814 free(b);
815 return NULL;
818 crc = crc32(crc, b->data ? b->data : (uc *)"", b->alloc);
819 if (crc != b->crc32) {
820 fprintf(stderr, "Block CRC32 failure\n");
821 free(b->data);
822 free(b);
823 return NULL;
827 b->orig_method = b->method;
828 b->idx = 0;
829 b->byte = 0;
830 b->bit = 7; // MSB
832 return b;
837 * Computes the size of a cram block, including the block
838 * header itself.
840 uint32_t cram_block_size(cram_block *b) {
841 unsigned char dat[100], *cp = dat;;
842 uint32_t sz;
844 *cp++ = b->method;
845 *cp++ = b->content_type;
846 cp += itf8_put((char*)cp, b->content_id);
847 cp += itf8_put((char*)cp, b->comp_size);
848 cp += itf8_put((char*)cp, b->uncomp_size);
850 sz = cp-dat + 4;
851 sz += b->method == RAW ? b->uncomp_size : b->comp_size;
853 return sz;
857 * Writes a CRAM block.
858 * Returns 0 on success
859 * -1 on failure
861 int cram_write_block(cram_fd *fd, cram_block *b) {
862 assert(b->method != RAW || (b->comp_size == b->uncomp_size));
864 if (hputc(b->method, fd->fp) == EOF) return -1;
865 if (hputc(b->content_type, fd->fp) == EOF) return -1;
866 if (itf8_encode(fd, b->content_id) == -1) return -1;
867 if (itf8_encode(fd, b->comp_size) == -1) return -1;
868 if (itf8_encode(fd, b->uncomp_size) == -1) return -1;
870 if (b->data) {
871 if (b->method == RAW) {
872 if (b->uncomp_size != hwrite(fd->fp, b->data, b->uncomp_size))
873 return -1;
874 } else {
875 if (b->comp_size != hwrite(fd->fp, b->data, b->comp_size))
876 return -1;
878 } else {
879 // Absent blocks should be size 0
880 assert(b->method == RAW && b->uncomp_size == 0);
883 if (CRAM_MAJOR_VERS(fd->version) >= 3) {
884 unsigned char dat[100], *cp = dat;;
885 uint32_t crc;
887 *cp++ = b->method;
888 *cp++ = b->content_type;
889 cp += itf8_put((char*)cp, b->content_id);
890 cp += itf8_put((char*)cp, b->comp_size);
891 cp += itf8_put((char*)cp, b->uncomp_size);
892 crc = crc32(0L, dat, cp-dat);
894 if (b->method == RAW) {
895 b->crc32 = crc32(crc, b->data ? b->data : (uc*)"", b->uncomp_size);
896 } else {
897 b->crc32 = crc32(crc, b->data ? b->data : (uc*)"", b->comp_size);
900 if (-1 == int32_encode(fd, b->crc32))
901 return -1;
904 return 0;
908 * Frees a CRAM block, deallocating internal data too.
910 void cram_free_block(cram_block *b) {
911 if (!b)
912 return;
913 if (b->data)
914 free(b->data);
915 free(b);
919 * Uncompresses a CRAM block, if compressed.
921 int cram_uncompress_block(cram_block *b) {
922 char *uncomp;
923 size_t uncomp_size = 0;
925 if (b->uncomp_size == 0) {
926 // blank block
927 b->method = RAW;
928 return 0;
931 switch (b->method) {
932 case RAW:
933 return 0;
935 case GZIP:
936 uncomp = zlib_mem_inflate((char *)b->data, b->comp_size, &uncomp_size);
937 if (!uncomp)
938 return -1;
939 if ((int)uncomp_size != b->uncomp_size) {
940 free(uncomp);
941 return -1;
943 free(b->data);
944 b->data = (unsigned char *)uncomp;
945 b->alloc = uncomp_size;
946 b->method = RAW;
947 break;
949 #ifdef HAVE_LIBBZ2
950 case BZIP2: {
951 unsigned int usize = b->uncomp_size;
952 if (!(uncomp = malloc(usize)))
953 return -1;
954 if (BZ_OK != BZ2_bzBuffToBuffDecompress(uncomp, &usize,
955 (char *)b->data, b->comp_size,
956 0, 0)) {
957 free(uncomp);
958 return -1;
960 free(b->data);
961 b->data = (unsigned char *)uncomp;
962 b->alloc = usize;
963 b->method = RAW;
964 b->uncomp_size = usize; // Just incase it differs
965 break;
967 #else
968 case BZIP2:
969 fprintf(stderr, "Bzip2 compression is not compiled into this "
970 "version.\nPlease rebuild and try again.\n");
971 return -1;
972 #endif
974 #ifdef HAVE_LIBLZMA
975 case LZMA:
976 uncomp = lzma_mem_inflate((char *)b->data, b->comp_size, &uncomp_size);
977 if (!uncomp)
978 return -1;
979 if ((int)uncomp_size != b->uncomp_size)
980 return -1;
981 free(b->data);
982 b->data = (unsigned char *)uncomp;
983 b->alloc = uncomp_size;
984 b->method = RAW;
985 break;
986 #else
987 case LZMA:
988 fprintf(stderr, "Lzma compression is not compiled into this "
989 "version.\nPlease rebuild and try again.\n");
990 return -1;
991 break;
992 #endif
994 case RANS: {
995 unsigned int usize = b->uncomp_size, usize2;
996 uncomp = (char *)rans_uncompress(b->data, b->comp_size, &usize2);
997 if (!uncomp || usize != usize2)
998 return -1;
999 free(b->data);
1000 b->data = (unsigned char *)uncomp;
1001 b->alloc = usize2;
1002 b->method = RAW;
1003 b->uncomp_size = usize2; // Just incase it differs
1004 //fprintf(stderr, "Expanded %d to %d\n", b->comp_size, b->uncomp_size);
1005 break;
1008 default:
1009 return -1;
1012 return 0;
1015 static char *cram_compress_by_method(char *in, size_t in_size,
1016 size_t *out_size,
1017 enum cram_block_method method,
1018 int level, int strat) {
1019 switch (method) {
1020 case GZIP:
1021 return zlib_mem_deflate(in, in_size, out_size, level, strat);
1023 case BZIP2: {
1024 #ifdef HAVE_LIBBZ2
1025 unsigned int comp_size = in_size*1.01 + 600;
1026 char *comp = malloc(comp_size);
1027 if (!comp)
1028 return NULL;
1030 if (BZ_OK != BZ2_bzBuffToBuffCompress(comp, &comp_size,
1031 in, in_size,
1032 level, 0, 30)) {
1033 free(comp);
1034 return NULL;
1036 *out_size = comp_size;
1037 return comp;
1038 #else
1039 return NULL;
1040 #endif
1043 case LZMA:
1044 #ifdef HAVE_LIBLZMA
1045 return lzma_mem_deflate(in, in_size, out_size, level);
1046 #else
1047 return NULL;
1048 #endif
1050 case RANS0: {
1051 unsigned int out_size_i;
1052 unsigned char *cp;
1053 cp = rans_compress((unsigned char *)in, in_size, &out_size_i, 0);
1054 *out_size = out_size_i;
1055 return (char *)cp;
1058 case RANS1: {
1059 unsigned int out_size_i;
1060 unsigned char *cp;
1062 cp = rans_compress((unsigned char *)in, in_size, &out_size_i, 1);
1063 *out_size = out_size_i;
1064 return (char *)cp;
1067 case RAW:
1068 break;
1070 default:
1071 return NULL;
1074 return NULL;
1079 * Compresses a block using one of two different zlib strategies. If we only
1080 * want one choice set strat2 to be -1.
1082 * The logic here is that sometimes Z_RLE does a better job than Z_FILTERED
1083 * or Z_DEFAULT_STRATEGY on quality data. If so, we'd rather use it as it is
1084 * significantly faster.
1086 * Method and level -1 implies defaults, as specified in cram_fd.
1088 int cram_compress_block(cram_fd *fd, cram_block *b, cram_metrics *metrics,
1089 int method, int level) {
1091 char *comp = NULL;
1092 size_t comp_size = 0;
1093 int strat;
1095 if (b->method != RAW) {
1096 // Maybe already compressed if s->block[0] was compressed and
1097 // we have e.g. s->block[DS_BA] set to s->block[0] due to only
1098 // one base type present and hence using E_HUFFMAN on block 0.
1099 // A second explicit attempt to compress the same block then
1100 // occurs.
1101 return 0;
1104 if (method == -1) {
1105 method = 1<<GZIP;
1106 if (fd->use_bz2)
1107 method |= 1<<BZIP2;
1108 if (fd->use_lzma)
1109 method |= 1<<LZMA;
1112 if (level == -1)
1113 level = fd->level;
1115 //fprintf(stderr, "IN: block %d, sz %d\n", b->content_id, b->uncomp_size);
1117 if (method == RAW || level == 0 || b->uncomp_size == 0) {
1118 b->method = RAW;
1119 b->comp_size = b->uncomp_size;
1120 //fprintf(stderr, "Skip block id %d\n", b->content_id);
1121 return 0;
1124 if (metrics) {
1125 pthread_mutex_lock(&fd->metrics_lock);
1126 if (metrics->trial > 0 || --metrics->next_trial <= 0) {
1127 size_t sz_best = INT_MAX;
1128 size_t sz_gz_rle = 0;
1129 size_t sz_gz_def = 0;
1130 size_t sz_rans0 = 0;
1131 size_t sz_rans1 = 0;
1132 size_t sz_bzip2 = 0;
1133 size_t sz_lzma = 0;
1134 int method_best = 0;
1135 char *c_best = NULL, *c = NULL;
1137 if (metrics->revised_method)
1138 method = metrics->revised_method;
1139 else
1140 metrics->revised_method = method;
1142 if (metrics->next_trial <= 0) {
1143 metrics->next_trial = TRIAL_SPAN;
1144 metrics->trial = NTRIALS;
1145 metrics->sz_gz_rle /= 2;
1146 metrics->sz_gz_def /= 2;
1147 metrics->sz_rans0 /= 2;
1148 metrics->sz_rans1 /= 2;
1149 metrics->sz_bzip2 /= 2;
1150 metrics->sz_lzma /= 2;
1153 pthread_mutex_unlock(&fd->metrics_lock);
1155 if (method & (1<<GZIP_RLE)) {
1156 c = cram_compress_by_method((char *)b->data, b->uncomp_size,
1157 &sz_gz_rle, GZIP, 1, Z_RLE);
1158 if (c && sz_best > sz_gz_rle) {
1159 sz_best = sz_gz_rle;
1160 method_best = GZIP_RLE;
1161 if (c_best)
1162 free(c_best);
1163 c_best = c;
1164 } else if (c) {
1165 free(c);
1166 } else {
1167 sz_gz_rle = b->uncomp_size*2+1000;
1170 //fprintf(stderr, "Block %d; %d->%d\n", b->content_id, b->uncomp_size, sz_gz_rle);
1173 if (method & (1<<GZIP)) {
1174 c = cram_compress_by_method((char *)b->data, b->uncomp_size,
1175 &sz_gz_def, GZIP, level,
1176 Z_FILTERED);
1177 if (c && sz_best > sz_gz_def) {
1178 sz_best = sz_gz_def;
1179 method_best = GZIP;
1180 if (c_best)
1181 free(c_best);
1182 c_best = c;
1183 } else if (c) {
1184 free(c);
1185 } else {
1186 sz_gz_def = b->uncomp_size*2+1000;
1189 //fprintf(stderr, "Block %d; %d->%d\n", b->content_id, b->uncomp_size, sz_gz_def);
1192 if (method & (1<<RANS0)) {
1193 c = cram_compress_by_method((char *)b->data, b->uncomp_size,
1194 &sz_rans0, RANS0, 0, 0);
1195 if (c && sz_best > sz_rans0) {
1196 sz_best = sz_rans0;
1197 method_best = RANS0;
1198 if (c_best)
1199 free(c_best);
1200 c_best = c;
1201 } else if (c) {
1202 free(c);
1203 } else {
1204 sz_rans0 = b->uncomp_size*2+1000;
1208 if (method & (1<<RANS1)) {
1209 c = cram_compress_by_method((char *)b->data, b->uncomp_size,
1210 &sz_rans1, RANS1, 0, 0);
1211 if (c && sz_best > sz_rans1) {
1212 sz_best = sz_rans1;
1213 method_best = RANS1;
1214 if (c_best)
1215 free(c_best);
1216 c_best = c;
1217 } else if (c) {
1218 free(c);
1219 } else {
1220 sz_rans1 = b->uncomp_size*2+1000;
1224 if (method & (1<<BZIP2)) {
1225 c = cram_compress_by_method((char *)b->data, b->uncomp_size,
1226 &sz_bzip2, BZIP2, level, 0);
1227 if (c && sz_best > sz_bzip2) {
1228 sz_best = sz_bzip2;
1229 method_best = BZIP2;
1230 if (c_best)
1231 free(c_best);
1232 c_best = c;
1233 } else if (c) {
1234 free(c);
1235 } else {
1236 sz_bzip2 = b->uncomp_size*2+1000;
1240 if (method & (1<<LZMA)) {
1241 c = cram_compress_by_method((char *)b->data, b->uncomp_size,
1242 &sz_lzma, LZMA, level, 0);
1243 if (c && sz_best > sz_lzma) {
1244 sz_best = sz_lzma;
1245 method_best = LZMA;
1246 if (c_best)
1247 free(c_best);
1248 c_best = c;
1249 } else if (c) {
1250 free(c);
1251 } else {
1252 sz_lzma = b->uncomp_size*2+1000;
1256 //fprintf(stderr, "sz_best = %d\n", sz_best);
1258 free(b->data);
1259 b->data = (unsigned char *)c_best;
1260 //printf("method_best = %s\n", cram_block_method2str(method_best));
1261 b->method = method_best == GZIP_RLE ? GZIP : method_best;
1262 b->comp_size = sz_best;
1264 pthread_mutex_lock(&fd->metrics_lock);
1265 metrics->sz_gz_rle += sz_gz_rle;
1266 metrics->sz_gz_def += sz_gz_def;
1267 metrics->sz_rans0 += sz_rans0;
1268 metrics->sz_rans1 += sz_rans1;
1269 metrics->sz_bzip2 += sz_bzip2;
1270 metrics->sz_lzma += sz_lzma;
1271 if (--metrics->trial == 0) {
1272 int best_method = RAW;
1273 int best_sz = INT_MAX;
1275 // Scale methods by cost
1276 if (fd->level <= 3) {
1277 metrics->sz_rans1 *= 1.02;
1278 metrics->sz_gz_def *= 1.04;
1279 metrics->sz_bzip2 *= 1.08;
1280 metrics->sz_lzma *= 1.10;
1281 } else if (fd->level <= 6) {
1282 metrics->sz_rans1 *= 1.01;
1283 metrics->sz_gz_def *= 1.02;
1284 metrics->sz_bzip2 *= 1.03;
1285 metrics->sz_lzma *= 1.05;
1288 if (method & (1<<GZIP_RLE) && best_sz > metrics->sz_gz_rle)
1289 best_sz = metrics->sz_gz_rle, best_method = GZIP_RLE;
1291 if (method & (1<<GZIP) && best_sz > metrics->sz_gz_def)
1292 best_sz = metrics->sz_gz_def, best_method = GZIP;
1294 if (method & (1<<RANS0) && best_sz > metrics->sz_rans0)
1295 best_sz = metrics->sz_rans0, best_method = RANS0;
1297 if (method & (1<<RANS1) && best_sz > metrics->sz_rans1)
1298 best_sz = metrics->sz_rans1, best_method = RANS1;
1300 if (method & (1<<BZIP2) && best_sz > metrics->sz_bzip2)
1301 best_sz = metrics->sz_bzip2, best_method = BZIP2;
1303 if (method & (1<<LZMA) && best_sz > metrics->sz_lzma)
1304 best_sz = metrics->sz_lzma, best_method = LZMA;
1306 if (best_method == GZIP_RLE) {
1307 metrics->method = GZIP;
1308 metrics->strat = Z_RLE;
1309 } else {
1310 metrics->method = best_method;
1311 metrics->strat = Z_FILTERED;
1314 // If we see at least MAXFAIL trials in a row for a specific
1315 // compression method with more than MAXDELTA aggregate
1316 // size then we drop this from the list of methods used
1317 // for this block type.
1318 #define MAXDELTA 0.20
1319 #define MAXFAILS 4
1320 if (best_method == GZIP_RLE) {
1321 metrics->gz_rle_cnt = 0;
1322 metrics->gz_rle_extra = 0;
1323 } else if (best_sz < metrics->sz_gz_rle) {
1324 double r = (double)metrics->sz_gz_rle / best_sz - 1;
1325 if (++metrics->gz_rle_cnt >= MAXFAILS &&
1326 (metrics->gz_rle_extra += r) >= MAXDELTA)
1327 method &= ~(1<<GZIP_RLE);
1330 if (best_method == GZIP) {
1331 metrics->gz_def_cnt = 0;
1332 metrics->gz_def_extra = 0;
1333 } else if (best_sz < metrics->sz_gz_def) {
1334 double r = (double)metrics->sz_gz_def / best_sz - 1;
1335 if (++metrics->gz_def_cnt >= MAXFAILS &&
1336 (metrics->gz_def_extra += r) >= MAXDELTA)
1337 method &= ~(1<<GZIP);
1340 if (best_method == RANS0) {
1341 metrics->rans0_cnt = 0;
1342 metrics->rans0_extra = 0;
1343 } else if (best_sz < metrics->sz_rans0) {
1344 double r = (double)metrics->sz_rans0 / best_sz - 1;
1345 if (++metrics->rans0_cnt >= MAXFAILS &&
1346 (metrics->rans0_extra += r) >= MAXDELTA)
1347 method &= ~(1<<RANS0);
1350 if (best_method == RANS1) {
1351 metrics->rans1_cnt = 0;
1352 metrics->rans1_extra = 0;
1353 } else if (best_sz < metrics->sz_rans1) {
1354 double r = (double)metrics->sz_rans1 / best_sz - 1;
1355 if (++metrics->rans1_cnt >= MAXFAILS &&
1356 (metrics->rans1_extra += r) >= MAXDELTA)
1357 method &= ~(1<<RANS1);
1360 if (best_method == BZIP2) {
1361 metrics->bzip2_cnt = 0;
1362 metrics->bzip2_extra = 0;
1363 } else if (best_sz < metrics->sz_bzip2) {
1364 double r = (double)metrics->sz_bzip2 / best_sz - 1;
1365 if (++metrics->bzip2_cnt >= MAXFAILS &&
1366 (metrics->bzip2_extra += r) >= MAXDELTA)
1367 method &= ~(1<<BZIP2);
1370 if (best_method == LZMA) {
1371 metrics->lzma_cnt = 0;
1372 metrics->lzma_extra = 0;
1373 } else if (best_sz < metrics->sz_lzma) {
1374 double r = (double)metrics->sz_lzma / best_sz - 1;
1375 if (++metrics->lzma_cnt >= MAXFAILS &&
1376 (metrics->lzma_extra += r) >= MAXDELTA)
1377 method &= ~(1<<LZMA);
1380 //if (method != metrics->revised_method)
1381 // fprintf(stderr, "%d: method from %x to %x\n",
1382 // b->content_id, metrics->revised_method, method);
1383 metrics->revised_method = method;
1385 pthread_mutex_unlock(&fd->metrics_lock);
1386 } else {
1387 strat = metrics->strat;
1388 method = metrics->method;
1390 pthread_mutex_unlock(&fd->metrics_lock);
1391 comp = cram_compress_by_method((char *)b->data, b->uncomp_size,
1392 &comp_size, method,
1393 level, strat);
1394 if (!comp)
1395 return -1;
1396 free(b->data);
1397 b->data = (unsigned char *)comp;
1398 b->comp_size = comp_size;
1399 b->method = method;
1402 } else {
1403 // no cached metrics, so just do zlib?
1404 comp = cram_compress_by_method((char *)b->data, b->uncomp_size,
1405 &comp_size, GZIP, level, Z_FILTERED);
1406 if (!comp) {
1407 fprintf(stderr, "Compression failed!\n");
1408 return -1;
1410 free(b->data);
1411 b->data = (unsigned char *)comp;
1412 b->comp_size = comp_size;
1413 b->method = GZIP;
1416 if (fd->verbose)
1417 fprintf(stderr, "Compressed block ID %d from %d to %d by method %s\n",
1418 b->content_id, b->uncomp_size, b->comp_size,
1419 cram_block_method2str(b->method));
1421 if (b->method == RANS1)
1422 b->method = RANS0; // Spec just has RANS (not 0/1) with auto-sensing
1424 return 0;
1427 cram_metrics *cram_new_metrics(void) {
1428 cram_metrics *m = calloc(1, sizeof(*m));
1429 if (!m)
1430 return NULL;
1431 m->trial = NTRIALS-1;
1432 m->next_trial = TRIAL_SPAN;
1433 m->method = RAW;
1434 m->strat = 0;
1435 m->revised_method = 0;
1437 return m;
1440 char *cram_block_method2str(enum cram_block_method m) {
1441 switch(m) {
1442 case RAW: return "RAW";
1443 case GZIP: return "GZIP";
1444 case BZIP2: return "BZIP2";
1445 case LZMA: return "LZMA";
1446 case RANS0: return "RANS0";
1447 case RANS1: return "RANS1";
1448 case GZIP_RLE: return "GZIP_RLE";
1449 case ERROR: break;
1451 return "?";
1454 char *cram_content_type2str(enum cram_content_type t) {
1455 switch (t) {
1456 case FILE_HEADER: return "FILE_HEADER";
1457 case COMPRESSION_HEADER: return "COMPRESSION_HEADER";
1458 case MAPPED_SLICE: return "MAPPED_SLICE";
1459 case UNMAPPED_SLICE: return "UNMAPPED_SLICE";
1460 case EXTERNAL: return "EXTERNAL";
1461 case CORE: return "CORE";
1462 case CT_ERROR: break;
1464 return "?";
1467 /* ----------------------------------------------------------------------
1468 * Reference sequence handling
1470 * These revolve around the refs_t structure, which may potentially be
1471 * shared between multiple cram_fd.
1473 * We start with refs_create() to allocate an empty refs_t and then
1474 * populate it with @SQ line data using refs_from_header(). This is done on
1475 * cram_open(). Also at start up we can call cram_load_reference() which
1476 * is used with "scramble -r foo.fa". This replaces the fd->refs with the
1477 * new one specified. In either case refs2id() is then called which
1478 * maps ref_entry names to @SQ ids (refs_t->ref_id[]).
1480 * Later, possibly within a thread, we will want to know the actual ref
1481 * seq itself, obtained by calling cram_get_ref(). This may use the
1482 * UR: or M5: fields or the filename specified in the original
1483 * cram_load_reference() call.
1485 * Given the potential for multi-threaded reference usage, we have
1486 * reference counting (sorry for the confusing double use of "ref") to
1487 * track the number of callers interested in any specific reference.
1491 * Frees/unmaps a reference sequence and associated file handles.
1493 static void ref_entry_free_seq(ref_entry *e) {
1494 if (e->mf)
1495 mfclose(e->mf);
1496 if (e->seq && !e->mf)
1497 free(e->seq);
1499 e->seq = NULL;
1500 e->mf = NULL;
1503 void refs_free(refs_t *r) {
1504 RP("refs_free()\n");
1506 if (--r->count > 0)
1507 return;
1509 if (!r)
1510 return;
1512 if (r->pool)
1513 string_pool_destroy(r->pool);
1515 if (r->h_meta) {
1516 khint_t k;
1518 for (k = kh_begin(r->h_meta); k != kh_end(r->h_meta); k++) {
1519 ref_entry *e;
1521 if (!kh_exist(r->h_meta, k))
1522 continue;
1523 if (!(e = kh_val(r->h_meta, k)))
1524 continue;
1525 ref_entry_free_seq(e);
1526 free(e);
1529 kh_destroy(refs, r->h_meta);
1532 if (r->ref_id)
1533 free(r->ref_id);
1535 if (r->fp)
1536 bgzf_close(r->fp);
1538 pthread_mutex_destroy(&r->lock);
1540 free(r);
1543 static refs_t *refs_create(void) {
1544 refs_t *r = calloc(1, sizeof(*r));
1546 RP("refs_create()\n");
1548 if (!r)
1549 return NULL;
1551 if (!(r->pool = string_pool_create(8192)))
1552 goto err;
1554 r->ref_id = NULL; // see refs2id() to populate.
1555 r->count = 1;
1556 r->last = NULL;
1557 r->last_id = -1;
1559 if (!(r->h_meta = kh_init(refs)))
1560 goto err;
1562 pthread_mutex_init(&r->lock, NULL);
1564 return r;
1566 err:
1567 refs_free(r);
1568 return NULL;
1572 * Opens a reference fasta file as a BGZF stream, allowing for
1573 * compressed files. It automatically builds a .fai file if
1574 * required and if compressed a .gzi bgzf index too.
1576 * Returns a BGZF handle on success;
1577 * NULL on failure.
1579 static BGZF *bgzf_open_ref(char *fn, char *mode, int is_md5) {
1580 BGZF *fp;
1582 if (!is_md5) {
1583 char fai_file[PATH_MAX];
1585 snprintf(fai_file, PATH_MAX, "%s.fai", fn);
1586 if (access(fai_file, R_OK) != 0)
1587 if (fai_build(fn) != 0)
1588 return NULL;
1591 if (!(fp = bgzf_open(fn, mode))) {
1592 perror(fn);
1593 return NULL;
1596 if (fp->is_compressed == 1 && bgzf_index_load(fp, fn, ".gzi") < 0) {
1597 fprintf(stderr, "Unable to load .gzi index '%s.gzi'\n", fn);
1598 bgzf_close(fp);
1599 return NULL;
1602 return fp;
1606 * Loads a FAI file for a reference.fasta.
1607 * "is_err" indicates whether failure to load is worthy of emitting an
1608 * error message. In some cases (eg with embedded references) we
1609 * speculatively load, just incase, and silently ignore errors.
1611 * Returns the refs_t struct on success (maybe newly allocated);
1612 * NULL on failure
1614 static refs_t *refs_load_fai(refs_t *r_orig, char *fn, int is_err) {
1615 struct stat sb;
1616 FILE *fp = NULL;
1617 char fai_fn[PATH_MAX];
1618 char line[8192];
1619 refs_t *r = r_orig;
1620 size_t fn_l = strlen(fn);
1621 int id = 0, id_alloc = 0;
1623 RP("refs_load_fai %s\n", fn);
1625 if (!r)
1626 if (!(r = refs_create()))
1627 goto err;
1629 /* Open reference, for later use */
1630 if (stat(fn, &sb) != 0) {
1631 if (is_err)
1632 perror(fn);
1633 goto err;
1636 if (r->fp)
1637 if (bgzf_close(r->fp) != 0)
1638 goto err;
1639 r->fp = NULL;
1641 if (!(r->fn = string_dup(r->pool, fn)))
1642 goto err;
1644 if (fn_l > 4 && strcmp(&fn[fn_l-4], ".fai") == 0)
1645 r->fn[fn_l-4] = 0;
1647 if (!(r->fp = bgzf_open_ref(r->fn, "r", 0)))
1648 goto err;
1650 /* Parse .fai file and load meta-data */
1651 sprintf(fai_fn, "%.*s.fai", PATH_MAX-5, r->fn);
1653 if (stat(fai_fn, &sb) != 0) {
1654 if (is_err)
1655 perror(fai_fn);
1656 goto err;
1658 if (!(fp = fopen(fai_fn, "r"))) {
1659 if (is_err)
1660 perror(fai_fn);
1661 goto err;
1663 while (fgets(line, 8192, fp) != NULL) {
1664 ref_entry *e = malloc(sizeof(*e));
1665 char *cp;
1666 int n;
1667 khint_t k;
1669 if (!e)
1670 return NULL;
1672 // id
1673 for (cp = line; *cp && !isspace_c(*cp); cp++)
1675 *cp++ = 0;
1676 e->name = string_dup(r->pool, line);
1678 // length
1679 while (*cp && isspace_c(*cp))
1680 cp++;
1681 e->length = strtoll(cp, &cp, 10);
1683 // offset
1684 while (*cp && isspace_c(*cp))
1685 cp++;
1686 e->offset = strtoll(cp, &cp, 10);
1688 // bases per line
1689 while (*cp && isspace_c(*cp))
1690 cp++;
1691 e->bases_per_line = strtol(cp, &cp, 10);
1693 // line length
1694 while (*cp && isspace_c(*cp))
1695 cp++;
1696 e->line_length = strtol(cp, &cp, 10);
1698 // filename
1699 e->fn = r->fn;
1701 e->count = 0;
1702 e->seq = NULL;
1703 e->mf = NULL;
1704 e->is_md5 = 0;
1706 k = kh_put(refs, r->h_meta, e->name, &n);
1707 if (-1 == n) {
1708 free(e);
1709 return NULL;
1712 if (n) {
1713 kh_val(r->h_meta, k) = e;
1714 } else {
1715 ref_entry *re = kh_val(r->h_meta, k);
1716 if (re && (re->count != 0 || re->length != 0)) {
1717 /* Keep old */
1718 free(e);
1719 } else {
1720 /* Replace old */
1721 if (re)
1722 free(re);
1723 kh_val(r->h_meta, k) = e;
1727 if (id >= id_alloc) {
1728 int x;
1730 id_alloc = id_alloc ?id_alloc*2 : 16;
1731 r->ref_id = realloc(r->ref_id, id_alloc * sizeof(*r->ref_id));
1733 for (x = id; x < id_alloc; x++)
1734 r->ref_id[x] = NULL;
1736 r->ref_id[id] = e;
1737 r->nref = ++id;
1740 fclose(fp);
1741 return r;
1743 err:
1744 if (fp)
1745 fclose(fp);
1747 if (!r_orig)
1748 refs_free(r);
1750 return NULL;
1754 * Verifies that the CRAM @SQ lines and .fai files match.
1756 static void sanitise_SQ_lines(cram_fd *fd) {
1757 int i;
1759 if (!fd->header)
1760 return;
1762 if (!fd->refs || !fd->refs->h_meta)
1763 return;
1765 for (i = 0; i < fd->header->nref; i++) {
1766 char *name = fd->header->ref[i].name;
1767 khint_t k = kh_get(refs, fd->refs->h_meta, name);
1768 ref_entry *r;
1770 // We may have @SQ lines which have no known .fai, but do not
1771 // in themselves pose a problem because they are unused in the file.
1772 if (k == kh_end(fd->refs->h_meta))
1773 continue;
1775 if (!(r = (ref_entry *)kh_val(fd->refs->h_meta, k)))
1776 continue;
1778 if (r->length && r->length != fd->header->ref[i].len) {
1779 assert(strcmp(r->name, fd->header->ref[i].name) == 0);
1781 // Should we also check MD5sums here to ensure the correct
1782 // reference was given?
1783 fprintf(stderr, "WARNING: Header @SQ length mismatch for "
1784 "ref %s, %d vs %d\n",
1785 r->name, fd->header->ref[i].len, (int)r->length);
1787 // Fixing the parsed @SQ header will make MD:Z: strings work
1788 // and also stop it producing N for the sequence.
1789 fd->header->ref[i].len = r->length;
1795 * Indexes references by the order they appear in a BAM file. This may not
1796 * necessarily be the same order they appear in the fasta reference file.
1798 * Returns 0 on success
1799 * -1 on failure
1801 int refs2id(refs_t *r, SAM_hdr *h) {
1802 int i;
1804 if (r->ref_id)
1805 free(r->ref_id);
1806 if (r->last)
1807 r->last = NULL;
1809 r->ref_id = calloc(h->nref, sizeof(*r->ref_id));
1810 if (!r->ref_id)
1811 return -1;
1813 r->nref = h->nref;
1814 for (i = 0; i < h->nref; i++) {
1815 khint_t k = kh_get(refs, r->h_meta, h->ref[i].name);
1816 if (k != kh_end(r->h_meta)) {
1817 r->ref_id[i] = kh_val(r->h_meta, k);
1818 } else {
1819 fprintf(stderr, "Unable to find ref name '%s'\n",
1820 h->ref[i].name);
1824 return 0;
1828 * Generates refs_t entries based on @SQ lines in the header.
1829 * Returns 0 on success
1830 * -1 on failure
1832 static int refs_from_header(refs_t *r, cram_fd *fd, SAM_hdr *h) {
1833 int i, j;
1835 if (!r)
1836 return -1;
1838 if (!h || h->nref == 0)
1839 return 0;
1841 //fprintf(stderr, "refs_from_header for %p mode %c\n", fd, fd->mode);
1843 /* Existing refs are fine, as long as they're compatible with the hdr. */
1844 if (!(r->ref_id = realloc(r->ref_id, (r->nref + h->nref) * sizeof(*r->ref_id))))
1845 return -1;
1847 /* Copy info from h->ref[i] over to r */
1848 for (i = 0, j = r->nref; i < h->nref; i++) {
1849 SAM_hdr_type *ty;
1850 SAM_hdr_tag *tag;
1851 khint_t k;
1852 int n;
1854 k = kh_get(refs, r->h_meta, h->ref[i].name);
1855 if (k != kh_end(r->h_meta))
1856 // Ref already known about
1857 continue;
1859 if (!(r->ref_id[j] = calloc(1, sizeof(ref_entry))))
1860 return -1;
1862 if (!h->ref[i].name)
1863 return -1;
1865 r->ref_id[j]->name = string_dup(r->pool, h->ref[i].name);
1866 r->ref_id[j]->length = 0; // marker for not yet loaded
1868 /* Initialise likely filename if known */
1869 if ((ty = sam_hdr_find(h, "SQ", "SN", h->ref[i].name))) {
1870 if ((tag = sam_hdr_find_key(h, ty, "M5", NULL))) {
1871 r->ref_id[j]->fn = string_dup(r->pool, tag->str+3);
1872 //fprintf(stderr, "Tagging @SQ %s / %s\n", r->ref_id[h]->name, r->ref_id[h]->fn);
1876 k = kh_put(refs, r->h_meta, r->ref_id[j]->name, &n);
1877 if (n <= 0) // already exists or error
1878 return -1;
1879 kh_val(r->h_meta, k) = r->ref_id[j];
1881 j++;
1883 r->nref = j;
1885 return 0;
1889 * Attaches a header to a cram_fd.
1891 * This should be used when creating a new cram_fd for writing where
1892 * we have an SAM_hdr already constructed (eg from a file we've read
1893 * in).
1895 int cram_set_header(cram_fd *fd, SAM_hdr *hdr) {
1896 if (fd->header)
1897 sam_hdr_free(fd->header);
1898 fd->header = hdr;
1899 return refs_from_header(fd->refs, fd, hdr);
1903 * Converts a directory and a filename into an expanded path, replacing %s
1904 * in directory with the filename and %[0-9]+s with portions of the filename
1905 * Any remaining parts of filename are added to the end with /%s.
1907 void expand_cache_path(char *path, char *dir, char *fn) {
1908 char *cp;
1910 while ((cp = strchr(dir, '%'))) {
1911 strncpy(path, dir, cp-dir);
1912 path += cp-dir;
1914 if (*++cp == 's') {
1915 strcpy(path, fn);
1916 path += strlen(fn);
1917 fn += strlen(fn);
1918 cp++;
1919 } else if (*cp >= '0' && *cp <= '9') {
1920 char *endp;
1921 long l;
1923 l = strtol(cp, &endp, 10);
1924 l = MIN(l, strlen(fn));
1925 if (*endp == 's') {
1926 strncpy(path, fn, l);
1927 path += l;
1928 fn += l;
1929 *path = 0;
1930 cp = endp+1;
1931 } else {
1932 *path++ = '%';
1933 *path++ = *cp++;
1935 } else {
1936 *path++ = '%';
1937 *path++ = *cp++;
1939 dir = cp;
1941 strcpy(path, dir);
1942 path += strlen(dir);
1943 if (*fn && path[-1] != '/')
1944 *path++ = '/';
1945 strcpy(path, fn);
1949 * Make the directory containing path and any prefix directories.
1951 void mkdir_prefix(char *path, int mode) {
1952 char *cp = strrchr(path, '/');
1953 if (!cp)
1954 return;
1956 *cp = 0;
1957 if (is_directory(path)) {
1958 *cp = '/';
1959 return;
1962 if (mkdir(path, mode) == 0) {
1963 chmod(path, mode);
1964 *cp = '/';
1965 return;
1968 mkdir_prefix(path, mode);
1969 mkdir(path, mode);
1970 chmod(path, mode);
1971 *cp = '/';
1975 * Return the cache directory to use, based on the first of these
1976 * environment variables to be set to a non-empty value.
1978 static const char *get_cache_basedir(const char **extra) {
1979 char *base;
1981 *extra = "";
1983 base = getenv("XDG_CACHE_HOME");
1984 if (base && *base) return base;
1986 base = getenv("HOME");
1987 if (base && *base) { *extra = "/.cache"; return base; }
1989 base = getenv("TMPDIR");
1990 if (base && *base) return base;
1992 base = getenv("TEMP");
1993 if (base && *base) return base;
1995 return "/tmp";
1999 * Return an integer representation of pthread_self().
2001 static unsigned get_int_threadid() {
2002 pthread_t pt = pthread_self();
2003 unsigned char *s = (unsigned char *) &pt;
2004 size_t i;
2005 unsigned h = 0;
2006 for (i = 0; i < sizeof(pthread_t); i++)
2007 h = (h << 5) - h + s[i];
2008 return h;
2012 * Queries the M5 string from the header and attempts to populate the
2013 * reference from this using the REF_PATH environment.
2015 * Returns 0 on sucess
2016 * -1 on failure
2018 static int cram_populate_ref(cram_fd *fd, int id, ref_entry *r) {
2019 char *ref_path = getenv("REF_PATH");
2020 SAM_hdr_type *ty;
2021 SAM_hdr_tag *tag;
2022 char path[PATH_MAX], path_tmp[PATH_MAX];
2023 char cache[PATH_MAX], cache_root[PATH_MAX];
2024 char *local_cache = getenv("REF_CACHE");
2025 mFILE *mf;
2026 int local_path = 0;
2028 if (fd->verbose)
2029 fprintf(stderr, "cram_populate_ref on fd %p, id %d\n", (void *)fd, id);
2031 cache_root[0] = '\0';
2033 if (!ref_path || *ref_path == '\0') {
2035 * If we have no ref path, we use the EBI server.
2036 * However to avoid spamming it we require a local ref cache too.
2038 ref_path = "http://www.ebi.ac.uk:80/ena/cram/md5/%s";
2039 if (!local_cache || *local_cache == '\0') {
2040 const char *extra;
2041 const char *base = get_cache_basedir(&extra);
2042 snprintf(cache_root, PATH_MAX, "%s%s/hts-ref", base, extra);
2043 snprintf(cache,PATH_MAX, "%s%s/hts-ref/%%2s/%%2s/%%s", base, extra);
2044 local_cache = cache;
2045 if (fd->verbose)
2046 fprintf(stderr, "Populating local cache: %s\n", local_cache);
2050 if (!r->name)
2051 return -1;
2053 if (!(ty = sam_hdr_find(fd->header, "SQ", "SN", r->name)))
2054 return -1;
2056 if (!(tag = sam_hdr_find_key(fd->header, ty, "M5", NULL)))
2057 goto no_M5;
2059 if (fd->verbose)
2060 fprintf(stderr, "Querying ref %s\n", tag->str+3);
2062 /* Use cache if available */
2063 if (local_cache && *local_cache) {
2064 expand_cache_path(path, local_cache, tag->str+3);
2065 local_path = 1;
2068 #ifndef HAVE_MMAP
2069 char *path2;
2070 /* Search local files in REF_PATH; we can open them and return as above */
2071 if (!local_path && (path2 = find_path(tag->str+3, ref_path))) {
2072 strncpy(path, path2, PATH_MAX);
2073 free(path2);
2074 if (is_file(path)) // incase it's too long
2075 local_path = 1;
2077 #endif
2079 /* Found via REF_CACHE or local REF_PATH file */
2080 if (local_path) {
2081 struct stat sb;
2082 BGZF *fp;
2084 if (0 == stat(path, &sb) && (fp = bgzf_open(path, "r"))) {
2085 r->length = sb.st_size;
2086 r->offset = r->line_length = r->bases_per_line = 0;
2088 r->fn = string_dup(fd->refs->pool, path);
2090 if (fd->refs->fp)
2091 if (bgzf_close(fd->refs->fp) != 0)
2092 return -1;
2093 fd->refs->fp = fp;
2094 fd->refs->fn = r->fn;
2095 r->is_md5 = 1;
2097 // Fall back to cram_get_ref() where it'll do the actual
2098 // reading of the file.
2099 return 0;
2104 /* Otherwise search full REF_PATH; slower as loads entire file */
2105 if ((mf = open_path_mfile(tag->str+3, ref_path, NULL))) {
2106 size_t sz;
2107 r->seq = mfsteal(mf, &sz);
2108 if (r->seq) {
2109 r->mf = NULL;
2110 } else {
2111 // keep mf around as we couldn't detach
2112 r->seq = mf->data;
2113 r->mf = mf;
2115 r->length = sz;
2116 r->is_md5 = 1;
2117 } else {
2118 refs_t *refs;
2119 char *fn;
2121 no_M5:
2122 /* Failed to find in search path or M5 cache, see if @SQ UR: tag? */
2123 if (!(tag = sam_hdr_find_key(fd->header, ty, "UR", NULL)))
2124 return -1;
2126 fn = (strncmp(tag->str+3, "file:", 5) == 0)
2127 ? tag->str+8
2128 : tag->str+3;
2130 if (fd->refs->fp) {
2131 if (bgzf_close(fd->refs->fp) != 0)
2132 return -1;
2133 fd->refs->fp = NULL;
2135 if (!(refs = refs_load_fai(fd->refs, fn, 0)))
2136 return -1;
2137 sanitise_SQ_lines(fd);
2139 fd->refs = refs;
2140 if (fd->refs->fp) {
2141 if (bgzf_close(fd->refs->fp) != 0)
2142 return -1;
2143 fd->refs->fp = NULL;
2146 if (!fd->refs->fn)
2147 return -1;
2149 if (-1 == refs2id(fd->refs, fd->header))
2150 return -1;
2151 if (!fd->refs->ref_id || !fd->refs->ref_id[id])
2152 return -1;
2154 // Local copy already, so fall back to cram_get_ref().
2155 return 0;
2158 /* Populate the local disk cache if required */
2159 if (local_cache && *local_cache) {
2160 int pid = (int) getpid();
2161 unsigned thrid = get_int_threadid();
2162 hFILE *fp;
2164 if (*cache_root && !is_directory(cache_root)) {
2165 hts_log_warning("Creating reference cache directory %s\n"
2166 "This may become large; see the samtools(1) manual page REF_CACHE discussion",
2167 cache_root);
2170 expand_cache_path(path, local_cache, tag->str+3);
2171 if (fd->verbose)
2172 fprintf(stderr, "Writing cache file '%s'\n", path);
2173 mkdir_prefix(path, 01777);
2175 do {
2176 // Attempt to further uniquify the temporary filename
2177 unsigned t = ((unsigned) time(NULL)) ^ ((unsigned) clock());
2178 thrid++; // Ensure filename changes even if time/clock haven't
2180 sprintf(path_tmp, "%s.tmp_%d_%u_%u", path, pid, thrid, t);
2181 fp = hopen(path_tmp, "wx");
2182 } while (fp == NULL && errno == EEXIST);
2183 if (!fp) {
2184 perror(path_tmp);
2186 // Not fatal - we have the data already so keep going.
2187 return 0;
2190 // Check md5sum
2191 hts_md5_context *md5;
2192 char unsigned md5_buf1[16];
2193 char md5_buf2[33];
2195 if (!(md5 = hts_md5_init())) {
2196 hclose_abruptly(fp);
2197 unlink(path_tmp);
2198 return -1;
2200 hts_md5_update(md5, r->seq, r->length);
2201 hts_md5_final(md5_buf1, md5);
2202 hts_md5_destroy(md5);
2203 hts_md5_hex(md5_buf2, md5_buf1);
2205 if (strncmp(tag->str+3, md5_buf2, 32) != 0) {
2206 fprintf(stderr, "[E::%s] mismatching md5sum for downloaded reference.\n", __func__);
2207 hclose_abruptly(fp);
2208 unlink(path_tmp);
2209 return -1;
2212 if (hwrite(fp, r->seq, r->length) != r->length) {
2213 perror(path);
2215 if (hclose(fp) < 0) {
2216 unlink(path_tmp);
2217 } else {
2218 if (0 == chmod(path_tmp, 0444))
2219 rename(path_tmp, path);
2220 else
2221 unlink(path_tmp);
2225 return 0;
2228 static void cram_ref_incr_locked(refs_t *r, int id) {
2229 RP("%d INC REF %d, %d %p\n", gettid(), id, (int)(id>=0?r->ref_id[id]->count+1:-999), id>=0?r->ref_id[id]->seq:(char *)1);
2231 if (id < 0 || !r->ref_id[id] || !r->ref_id[id]->seq)
2232 return;
2234 if (r->last_id == id)
2235 r->last_id = -1;
2237 ++r->ref_id[id]->count;
2240 void cram_ref_incr(refs_t *r, int id) {
2241 pthread_mutex_lock(&r->lock);
2242 cram_ref_incr_locked(r, id);
2243 pthread_mutex_unlock(&r->lock);
2246 static void cram_ref_decr_locked(refs_t *r, int id) {
2247 RP("%d DEC REF %d, %d %p\n", gettid(), id, (int)(id>=0?r->ref_id[id]->count-1:-999), id>=0?r->ref_id[id]->seq:(char *)1);
2249 if (id < 0 || !r->ref_id[id] || !r->ref_id[id]->seq) {
2250 assert(r->ref_id[id]->count >= 0);
2251 return;
2254 if (--r->ref_id[id]->count <= 0) {
2255 assert(r->ref_id[id]->count == 0);
2256 if (r->last_id >= 0) {
2257 if (r->ref_id[r->last_id]->count <= 0 &&
2258 r->ref_id[r->last_id]->seq) {
2259 RP("%d FREE REF %d (%p)\n", gettid(),
2260 r->last_id, r->ref_id[r->last_id]->seq);
2261 ref_entry_free_seq(r->ref_id[r->last_id]);
2262 r->ref_id[r->last_id]->length = 0;
2265 r->last_id = id;
2269 void cram_ref_decr(refs_t *r, int id) {
2270 pthread_mutex_lock(&r->lock);
2271 cram_ref_decr_locked(r, id);
2272 pthread_mutex_unlock(&r->lock);
2276 * Used by cram_ref_load and cram_ref_get. The file handle will have
2277 * already been opened, so we can catch it. The ref_entry *e informs us
2278 * of whether this is a multi-line fasta file or a raw MD5 style file.
2279 * Either way we create a single contiguous sequence.
2281 * Returns all or part of a reference sequence on success (malloced);
2282 * NULL on failure.
2284 static char *load_ref_portion(BGZF *fp, ref_entry *e, int start, int end) {
2285 off_t offset, len;
2286 char *seq;
2288 if (end < start)
2289 end = start;
2292 * Compute locations in file. This is trivial for the MD5 files, but
2293 * is still necessary for the fasta variants.
2295 offset = e->line_length
2296 ? e->offset + (start-1)/e->bases_per_line * e->line_length +
2297 (start-1) % e->bases_per_line
2298 : start-1;
2300 len = (e->line_length
2301 ? e->offset + (end-1)/e->bases_per_line * e->line_length +
2302 (end-1) % e->bases_per_line
2303 : end-1) - offset + 1;
2305 if (bgzf_useek(fp, offset, SEEK_SET) < 0) {
2306 perror("bgzf_useek() on reference file");
2307 return NULL;
2310 if (len == 0 || !(seq = malloc(len))) {
2311 return NULL;
2314 if (len != bgzf_read(fp, seq, len)) {
2315 perror("bgzf_read() on reference file");
2316 free(seq);
2317 return NULL;
2320 /* Strip white-space if required. */
2321 if (len != end-start+1) {
2322 int i, j;
2323 char *cp = seq;
2324 char *cp_to;
2326 for (i = j = 0; i < len; i++) {
2327 if (cp[i] >= '!' && cp[i] <= '~')
2328 cp[j++] = toupper_c(cp[i]);
2330 cp_to = cp+j;
2332 if (cp_to - seq != end-start+1) {
2333 fprintf(stderr, "Malformed reference file?\n");
2334 free(seq);
2335 return NULL;
2337 } else {
2338 int i;
2339 for (i = 0; i < len; i++) {
2340 seq[i] = toupper_c(seq[i]);
2344 return seq;
2348 * Load the entire reference 'id'.
2349 * This also increments the reference count by 1.
2351 * Returns ref_entry on success;
2352 * NULL on failure
2354 ref_entry *cram_ref_load(refs_t *r, int id, int is_md5) {
2355 ref_entry *e = r->ref_id[id];
2356 int start = 1, end = e->length;
2357 char *seq;
2359 if (e->seq) {
2360 return e;
2363 assert(e->count == 0);
2365 if (r->last) {
2366 #ifdef REF_DEBUG
2367 int idx = 0;
2368 for (idx = 0; idx < r->nref; idx++)
2369 if (r->last == r->ref_id[idx])
2370 break;
2371 RP("%d cram_ref_load DECR %d\n", gettid(), idx);
2372 #endif
2373 assert(r->last->count > 0);
2374 if (--r->last->count <= 0) {
2375 RP("%d FREE REF %d (%p)\n", gettid(), id, r->ref_id[id]->seq);
2376 if (r->last->seq)
2377 ref_entry_free_seq(r->last);
2381 /* Open file if it's not already the current open reference */
2382 if (strcmp(r->fn, e->fn) || r->fp == NULL) {
2383 if (r->fp)
2384 if (bgzf_close(r->fp) != 0)
2385 return NULL;
2386 r->fn = e->fn;
2387 if (!(r->fp = bgzf_open_ref(r->fn, "r", is_md5)))
2388 return NULL;
2391 RP("%d Loading ref %d (%d..%d)\n", gettid(), id, start, end);
2393 if (!(seq = load_ref_portion(r->fp, e, start, end))) {
2394 return NULL;
2397 RP("%d Loaded ref %d (%d..%d) = %p\n", gettid(), id, start, end, seq);
2399 RP("%d INC REF %d, %d\n", gettid(), id, (int)(e->count+1));
2400 e->seq = seq;
2401 e->mf = NULL;
2402 e->count++;
2405 * Also keep track of last used ref so incr/decr loops on the same
2406 * sequence don't cause load/free loops.
2408 RP("%d cram_ref_load INCR %d => %d\n", gettid(), id, e->count+1);
2409 r->last = e;
2410 e->count++;
2412 return e;
2416 * Returns a portion of a reference sequence from start to end inclusive.
2417 * The returned pointer is owned by either the cram_file fd or by the
2418 * internal refs_t structure and should not be freed by the caller.
2420 * The difference is whether or not this refs_t is in use by just the one
2421 * cram_fd or by multiples, or whether we have multiple threads accessing
2422 * references. In either case fd->shared will be true and we start using
2423 * reference counting to track the number of users of a specific reference
2424 * sequence.
2426 * Otherwise the ref seq returned is allocated as part of cram_fd itself
2427 * and will be freed up on the next call to cram_get_ref or cram_close.
2429 * To return the entire reference sequence, specify start as 1 and end
2430 * as 0.
2432 * To cease using a reference, call cram_ref_decr().
2434 * Returns reference on success,
2435 * NULL on failure
2437 char *cram_get_ref(cram_fd *fd, int id, int start, int end) {
2438 ref_entry *r;
2439 char *seq;
2440 int ostart = start;
2442 if (id == -1)
2443 return NULL;
2445 /* FIXME: axiomatic query of r->seq being true?
2446 * Or shortcut for unsorted data where we load once and never free?
2449 //fd->shared_ref = 1; // hard code for now to simplify things
2451 pthread_mutex_lock(&fd->ref_lock);
2453 RP("%d cram_get_ref on fd %p, id %d, range %d..%d\n", gettid(), fd, id, start, end);
2456 * Unsorted data implies we want to fetch an entire reference at a time.
2457 * We just deal with this at the moment by claiming we're sharing
2458 * references instead, which has the same requirement.
2460 if (fd->unsorted)
2461 fd->shared_ref = 1;
2464 /* Sanity checking: does this ID exist? */
2465 if (id >= fd->refs->nref) {
2466 fprintf(stderr, "No reference found for id %d\n", id);
2467 pthread_mutex_unlock(&fd->ref_lock);
2468 return NULL;
2471 if (!fd->refs || !fd->refs->ref_id[id]) {
2472 fprintf(stderr, "No reference found for id %d\n", id);
2473 pthread_mutex_unlock(&fd->ref_lock);
2474 return NULL;
2477 if (!(r = fd->refs->ref_id[id])) {
2478 fprintf(stderr, "No reference found for id %d\n", id);
2479 pthread_mutex_unlock(&fd->ref_lock);
2480 return NULL;
2485 * It has an entry, but may not have been populated yet.
2486 * Any manually loaded .fai files have their lengths known.
2487 * A ref entry computed from @SQ lines (M5 or UR field) will have
2488 * r->length == 0 unless it's been loaded once and verified that we have
2489 * an on-disk filename for it.
2491 * 19 Sep 2013: Moved the lock here as the cram_populate_ref code calls
2492 * open_path_mfile and libcurl, which isn't multi-thread safe unless I
2493 * rewrite my code to have one curl handle per thread.
2495 pthread_mutex_lock(&fd->refs->lock);
2496 if (r->length == 0) {
2497 if (cram_populate_ref(fd, id, r) == -1) {
2498 fprintf(stderr, "Failed to populate reference for id %d\n", id);
2499 pthread_mutex_unlock(&fd->refs->lock);
2500 pthread_mutex_unlock(&fd->ref_lock);
2501 return NULL;
2503 r = fd->refs->ref_id[id];
2504 if (fd->unsorted)
2505 cram_ref_incr_locked(fd->refs, id);
2510 * We now know that we the filename containing the reference, so check
2511 * for limits. If it's over half the reference we'll load all of it in
2512 * memory as this will speed up subsequent calls.
2514 if (end < 1)
2515 end = r->length;
2516 if (end >= r->length)
2517 end = r->length;
2518 if (start < 1)
2519 return NULL;
2521 if (end - start >= 0.5*r->length || fd->shared_ref) {
2522 start = 1;
2523 end = r->length;
2527 * Maybe we have it cached already? If so use it.
2529 * Alternatively if we don't have the sequence but we're sharing
2530 * references and/or are asking for the entire length of it, then
2531 * load the full reference into the refs structure and return
2532 * a pointer to that one instead.
2534 if (fd->shared_ref || r->seq || (start == 1 && end == r->length)) {
2535 char *cp;
2537 if (id >= 0) {
2538 if (r->seq) {
2539 cram_ref_incr_locked(fd->refs, id);
2540 } else {
2541 ref_entry *e;
2542 if (!(e = cram_ref_load(fd->refs, id, r->is_md5))) {
2543 pthread_mutex_unlock(&fd->refs->lock);
2544 pthread_mutex_unlock(&fd->ref_lock);
2545 return NULL;
2548 /* unsorted data implies cache ref indefinitely, to avoid
2549 * continually loading and unloading.
2551 if (fd->unsorted)
2552 cram_ref_incr_locked(fd->refs, id);
2555 fd->ref = NULL; /* We never access it directly */
2556 fd->ref_start = 1;
2557 fd->ref_end = r->length;
2558 fd->ref_id = id;
2560 cp = fd->refs->ref_id[id]->seq + ostart-1;
2561 } else {
2562 fd->ref = NULL;
2563 cp = NULL;
2566 RP("%d cram_get_ref returning for id %d, count %d\n", gettid(), id, (int)r->count);
2568 pthread_mutex_unlock(&fd->refs->lock);
2569 pthread_mutex_unlock(&fd->ref_lock);
2570 return cp;
2574 * Otherwise we're not sharing, we don't have a copy of it already and
2575 * we're only asking for a small portion of it.
2577 * In this case load up just that segment ourselves, freeing any old
2578 * small segments in the process.
2581 /* Unmapped ref ID */
2582 if (id < 0) {
2583 if (fd->ref_free) {
2584 free(fd->ref_free);
2585 fd->ref_free = NULL;
2587 fd->ref = NULL;
2588 fd->ref_id = id;
2589 pthread_mutex_unlock(&fd->refs->lock);
2590 pthread_mutex_unlock(&fd->ref_lock);
2591 return NULL;
2594 /* Open file if it's not already the current open reference */
2595 if (strcmp(fd->refs->fn, r->fn) || fd->refs->fp == NULL) {
2596 if (fd->refs->fp)
2597 if (bgzf_close(fd->refs->fp) != 0)
2598 return NULL;
2599 fd->refs->fn = r->fn;
2600 if (!(fd->refs->fp = bgzf_open_ref(fd->refs->fn, "r", r->is_md5))) {
2601 pthread_mutex_unlock(&fd->refs->lock);
2602 pthread_mutex_unlock(&fd->ref_lock);
2603 return NULL;
2607 if (!(fd->ref = load_ref_portion(fd->refs->fp, r, start, end))) {
2608 pthread_mutex_unlock(&fd->refs->lock);
2609 pthread_mutex_unlock(&fd->ref_lock);
2610 return NULL;
2613 if (fd->ref_free)
2614 free(fd->ref_free);
2616 fd->ref_id = id;
2617 fd->ref_start = start;
2618 fd->ref_end = end;
2619 fd->ref_free = fd->ref;
2620 seq = fd->ref;
2622 pthread_mutex_unlock(&fd->refs->lock);
2623 pthread_mutex_unlock(&fd->ref_lock);
2625 return seq + ostart - start;
2629 * If fd has been opened for reading, it may be permitted to specify 'fn'
2630 * as NULL and let the code auto-detect the reference by parsing the
2631 * SAM header @SQ lines.
2633 int cram_load_reference(cram_fd *fd, char *fn) {
2634 int ret = 0;
2636 if (fn) {
2637 fd->refs = refs_load_fai(fd->refs, fn,
2638 !(fd->embed_ref && fd->mode == 'r'));
2639 fn = fd->refs ? fd->refs->fn : NULL;
2640 if (!fn)
2641 ret = -1;
2642 sanitise_SQ_lines(fd);
2644 fd->ref_fn = fn;
2646 if ((!fd->refs || (fd->refs->nref == 0 && !fn)) && fd->header) {
2647 if (fd->refs)
2648 refs_free(fd->refs);
2649 if (!(fd->refs = refs_create()))
2650 return -1;
2651 if (-1 == refs_from_header(fd->refs, fd, fd->header))
2652 return -1;
2655 if (fd->header)
2656 if (-1 == refs2id(fd->refs, fd->header))
2657 return -1;
2659 return ret;
2662 /* ----------------------------------------------------------------------
2663 * Containers
2667 * Creates a new container, specifying the maximum number of slices
2668 * and records permitted.
2670 * Returns cram_container ptr on success
2671 * NULL on failure
2673 cram_container *cram_new_container(int nrec, int nslice) {
2674 cram_container *c = calloc(1, sizeof(*c));
2675 enum cram_DS_ID id;
2677 if (!c)
2678 return NULL;
2680 c->curr_ref = -2;
2682 c->max_c_rec = nrec * nslice;
2683 c->curr_c_rec = 0;
2685 c->max_rec = nrec;
2686 c->record_counter = 0;
2687 c->num_bases = 0;
2688 c->s_num_bases = 0;
2690 c->max_slice = nslice;
2691 c->curr_slice = 0;
2693 c->pos_sorted = 1;
2694 c->max_apos = 0;
2695 c->multi_seq = 0;
2697 c->bams = NULL;
2699 if (!(c->slices = (cram_slice **)calloc(nslice, sizeof(cram_slice *))))
2700 goto err;
2701 c->slice = NULL;
2703 if (!(c->comp_hdr = cram_new_compression_header()))
2704 goto err;
2705 c->comp_hdr_block = NULL;
2707 for (id = DS_RN; id < DS_TN; id++)
2708 if (!(c->stats[id] = cram_stats_create())) goto err;
2710 //c->aux_B_stats = cram_stats_create();
2712 if (!(c->tags_used = kh_init(m_tagmap)))
2713 goto err;
2714 c->refs_used = 0;
2716 return c;
2718 err:
2719 if (c) {
2720 if (c->slices)
2721 free(c->slices);
2722 free(c);
2724 return NULL;
2727 void cram_free_container(cram_container *c) {
2728 enum cram_DS_ID id;
2729 int i;
2731 if (!c)
2732 return;
2734 if (c->refs_used)
2735 free(c->refs_used);
2737 if (c->landmark)
2738 free(c->landmark);
2740 if (c->comp_hdr)
2741 cram_free_compression_header(c->comp_hdr);
2743 if (c->comp_hdr_block)
2744 cram_free_block(c->comp_hdr_block);
2746 if (c->slices) {
2747 for (i = 0; i < c->max_slice; i++)
2748 if (c->slices[i])
2749 cram_free_slice(c->slices[i]);
2750 free(c->slices);
2753 for (id = DS_RN; id < DS_TN; id++)
2754 if (c->stats[id]) cram_stats_free(c->stats[id]);
2756 //if (c->aux_B_stats) cram_stats_free(c->aux_B_stats);
2758 if (c->tags_used) {
2759 khint_t k;
2761 for (k = kh_begin(c->tags_used); k != kh_end(c->tags_used); k++) {
2762 if (!kh_exist(c->tags_used, k))
2763 continue;
2765 cram_tag_map *tm = (cram_tag_map *)kh_val(c->tags_used, k);
2766 cram_codec *c = tm->codec;
2768 if (c) c->free(c);
2769 free(tm);
2772 kh_destroy(m_tagmap, c->tags_used);
2775 free(c);
2779 * Reads a container header.
2781 * Returns cram_container on success
2782 * NULL on failure or no container left (fd->err == 0).
2784 cram_container *cram_read_container(cram_fd *fd) {
2785 cram_container c2, *c;
2786 int i, s;
2787 size_t rd = 0;
2788 uint32_t crc = 0;
2790 fd->err = 0;
2791 fd->eof = 0;
2793 memset(&c2, 0, sizeof(c2));
2794 if (CRAM_MAJOR_VERS(fd->version) == 1) {
2795 if ((s = itf8_decode_crc(fd, &c2.length, &crc)) == -1) {
2796 fd->eof = fd->empty_container ? 1 : 2;
2797 return NULL;
2798 } else {
2799 rd+=s;
2801 } else {
2802 uint32_t len;
2803 if ((s = int32_decode(fd, &c2.length)) == -1) {
2804 if (CRAM_MAJOR_VERS(fd->version) == 2 &&
2805 CRAM_MINOR_VERS(fd->version) == 0)
2806 fd->eof = 1; // EOF blocks arrived in v2.1
2807 else
2808 fd->eof = fd->empty_container ? 1 : 2;
2809 return NULL;
2810 } else {
2811 rd+=s;
2813 len = le_int4(c2.length);
2814 crc = crc32(0L, (unsigned char *)&len, 4);
2816 if ((s = itf8_decode_crc(fd, &c2.ref_seq_id, &crc)) == -1) return NULL; else rd+=s;
2817 if ((s = itf8_decode_crc(fd, &c2.ref_seq_start, &crc))== -1) return NULL; else rd+=s;
2818 if ((s = itf8_decode_crc(fd, &c2.ref_seq_span, &crc)) == -1) return NULL; else rd+=s;
2819 if ((s = itf8_decode_crc(fd, &c2.num_records, &crc)) == -1) return NULL; else rd+=s;
2821 if (CRAM_MAJOR_VERS(fd->version) == 1) {
2822 c2.record_counter = 0;
2823 c2.num_bases = 0;
2824 } else {
2825 if (CRAM_MAJOR_VERS(fd->version) >= 3) {
2826 if ((s = ltf8_decode_crc(fd, &c2.record_counter, &crc)) == -1)
2827 return NULL;
2828 else
2829 rd += s;
2830 } else {
2831 int32_t i32;
2832 if ((s = itf8_decode_crc(fd, &i32, &crc)) == -1)
2833 return NULL;
2834 else
2835 rd += s;
2836 c2.record_counter = i32;
2839 if ((s = ltf8_decode_crc(fd, &c2.num_bases, &crc))== -1)
2840 return NULL;
2841 else
2842 rd += s;
2844 if ((s = itf8_decode_crc(fd, &c2.num_blocks, &crc)) == -1) return NULL; else rd+=s;
2845 if ((s = itf8_decode_crc(fd, &c2.num_landmarks, &crc))== -1) return NULL; else rd+=s;
2847 if (c2.num_landmarks < 0 || c2.num_landmarks >= SIZE_MAX / sizeof(int32_t))
2848 return NULL;
2850 if (!(c = calloc(1, sizeof(*c))))
2851 return NULL;
2853 *c = c2;
2855 if (!(c->landmark = malloc(c->num_landmarks * sizeof(int32_t))) &&
2856 c->num_landmarks) {
2857 fd->err = errno;
2858 cram_free_container(c);
2859 return NULL;
2861 for (i = 0; i < c->num_landmarks; i++) {
2862 if ((s = itf8_decode_crc(fd, &c->landmark[i], &crc)) == -1) {
2863 cram_free_container(c);
2864 return NULL;
2865 } else {
2866 rd += s;
2870 if (CRAM_MAJOR_VERS(fd->version) >= 3) {
2871 if (-1 == int32_decode(fd, (int32_t *)&c->crc32))
2872 return NULL;
2873 else
2874 rd+=4;
2876 if (crc != c->crc32) {
2877 fprintf(stderr, "Container header CRC32 failure\n");
2878 cram_free_container(c);
2879 return NULL;
2883 c->offset = rd;
2884 c->slices = NULL;
2885 c->curr_slice = 0;
2886 c->max_slice = c->num_landmarks;
2887 c->slice_rec = 0;
2888 c->curr_rec = 0;
2889 c->max_rec = 0;
2891 if (c->ref_seq_id == -2) {
2892 c->multi_seq = 1;
2893 fd->multi_seq = 1;
2896 fd->empty_container =
2897 (c->num_records == 0 &&
2898 c->ref_seq_id == -1 &&
2899 c->ref_seq_start == 0x454f46 /* EOF */) ? 1 : 0;
2901 return c;
2905 /* MAXIMUM storage size needed for the container. */
2906 int cram_container_size(cram_container *c) {
2907 return 55 + 5*c->num_landmarks;
2912 * Stores the container structure in dat and returns *size as the
2913 * number of bytes written to dat[]. The input size of dat is also
2914 * held in *size and should be initialised to cram_container_size(c).
2916 * Returns 0 on success;
2917 * -1 on failure
2919 int cram_store_container(cram_fd *fd, cram_container *c, char *dat, int *size)
2921 unsigned char *cp = (unsigned char *)dat;
2922 int i;
2924 // Check the input buffer is large enough according to our stated
2925 // requirements. (NOTE: it may actually take less.)
2926 if (cram_container_size(c) > *size)
2927 return -1;
2929 if (CRAM_MAJOR_VERS(fd->version) == 1) {
2930 cp += itf8_put((char*)cp, c->length);
2931 } else {
2932 *(int32_t *)cp = le_int4(c->length);
2933 cp += 4;
2935 if (c->multi_seq) {
2936 cp += itf8_put((char*)cp, -2);
2937 cp += itf8_put((char*)cp, 0);
2938 cp += itf8_put((char*)cp, 0);
2939 } else {
2940 cp += itf8_put((char*)cp, c->ref_seq_id);
2941 cp += itf8_put((char*)cp, c->ref_seq_start);
2942 cp += itf8_put((char*)cp, c->ref_seq_span);
2944 cp += itf8_put((char*)cp, c->num_records);
2945 if (CRAM_MAJOR_VERS(fd->version) == 2) {
2946 cp += itf8_put((char*)cp, c->record_counter);
2947 cp += ltf8_put((char*)cp, c->num_bases);
2948 } else if (CRAM_MAJOR_VERS(fd->version) >= 3) {
2949 cp += ltf8_put((char*)cp, c->record_counter);
2950 cp += ltf8_put((char*)cp, c->num_bases);
2953 cp += itf8_put((char*)cp, c->num_blocks);
2954 cp += itf8_put((char*)cp, c->num_landmarks);
2955 for (i = 0; i < c->num_landmarks; i++)
2956 cp += itf8_put((char*)cp, c->landmark[i]);
2958 if (CRAM_MAJOR_VERS(fd->version) >= 3) {
2959 c->crc32 = crc32(0L, (uc *)dat, (char*)cp-dat);
2960 cp[0] = c->crc32 & 0xff;
2961 cp[1] = (c->crc32 >> 8) & 0xff;
2962 cp[2] = (c->crc32 >> 16) & 0xff;
2963 cp[3] = (c->crc32 >> 24) & 0xff;
2964 cp += 4;
2967 *size = (char *)cp-dat; // actual used size
2969 return 0;
2974 * Writes a container structure.
2976 * Returns 0 on success
2977 * -1 on failure
2979 int cram_write_container(cram_fd *fd, cram_container *c) {
2980 char buf_a[1024], *buf = buf_a;
2981 unsigned char *cp;
2982 int i;
2984 if (55 + c->num_landmarks * 5 >= 1024)
2985 buf = malloc(55 + c->num_landmarks * 5);
2986 cp = (unsigned char *)buf;
2988 if (CRAM_MAJOR_VERS(fd->version) == 1) {
2989 cp += itf8_put((char*)cp, c->length);
2990 } else {
2991 *(int32_t *)cp = le_int4(c->length);
2992 cp += 4;
2994 if (c->multi_seq) {
2995 cp += itf8_put((char*)cp, -2);
2996 cp += itf8_put((char*)cp, 0);
2997 cp += itf8_put((char*)cp, 0);
2998 } else {
2999 cp += itf8_put((char*)cp, c->ref_seq_id);
3000 cp += itf8_put((char*)cp, c->ref_seq_start);
3001 cp += itf8_put((char*)cp, c->ref_seq_span);
3003 cp += itf8_put((char*)cp, c->num_records);
3004 if (CRAM_MAJOR_VERS(fd->version) == 2) {
3005 cp += itf8_put((char*)cp, c->record_counter);
3006 cp += ltf8_put((char*)cp, c->num_bases);
3007 } else if (CRAM_MAJOR_VERS(fd->version) >= 3) {
3008 cp += ltf8_put((char*)cp, c->record_counter);
3009 cp += ltf8_put((char*)cp, c->num_bases);
3012 cp += itf8_put((char*)cp, c->num_blocks);
3013 cp += itf8_put((char*)cp, c->num_landmarks);
3014 for (i = 0; i < c->num_landmarks; i++)
3015 cp += itf8_put((char*)cp, c->landmark[i]);
3017 if (CRAM_MAJOR_VERS(fd->version) >= 3) {
3018 c->crc32 = crc32(0L, (uc *)buf, (char*)cp-buf);
3019 cp[0] = c->crc32 & 0xff;
3020 cp[1] = (c->crc32 >> 8) & 0xff;
3021 cp[2] = (c->crc32 >> 16) & 0xff;
3022 cp[3] = (c->crc32 >> 24) & 0xff;
3023 cp += 4;
3026 if ((char*)cp-buf != hwrite(fd->fp, buf, (char*)cp-buf)) {
3027 if (buf != buf_a)
3028 free(buf);
3029 return -1;
3032 if (buf != buf_a)
3033 free(buf);
3035 return 0;
3038 // common component shared by cram_flush_container{,_mt}
3039 static int cram_flush_container2(cram_fd *fd, cram_container *c) {
3040 int i, j;
3042 if (c->curr_slice > 0 && !c->slices)
3043 return -1;
3045 //fprintf(stderr, "Writing container %d, sum %u\n", c->record_counter, sum);
3047 /* Write the container struct itself */
3048 if (0 != cram_write_container(fd, c))
3049 return -1;
3051 /* And the compression header */
3052 if (0 != cram_write_block(fd, c->comp_hdr_block))
3053 return -1;
3055 /* Followed by the slice blocks */
3056 for (i = 0; i < c->curr_slice; i++) {
3057 cram_slice *s = c->slices[i];
3059 if (0 != cram_write_block(fd, s->hdr_block))
3060 return -1;
3062 for (j = 0; j < s->hdr->num_blocks; j++) {
3063 if (0 != cram_write_block(fd, s->block[j]))
3064 return -1;
3068 return hflush(fd->fp) == 0 ? 0 : -1;
3072 * Flushes a completely or partially full container to disk, writing
3073 * container structure, header and blocks. This also calls the encoder
3074 * functions.
3076 * Returns 0 on success
3077 * -1 on failure
3079 int cram_flush_container(cram_fd *fd, cram_container *c) {
3080 /* Encode the container blocks and generate compression header */
3081 if (0 != cram_encode_container(fd, c))
3082 return -1;
3084 return cram_flush_container2(fd, c);
3087 typedef struct {
3088 cram_fd *fd;
3089 cram_container *c;
3090 } cram_job;
3092 void *cram_flush_thread(void *arg) {
3093 cram_job *j = (cram_job *)arg;
3095 /* Encode the container blocks and generate compression header */
3096 if (0 != cram_encode_container(j->fd, j->c)) {
3097 fprintf(stderr, "cram_encode_container failed\n");
3098 return NULL;
3101 return arg;
3104 static int cram_flush_result(cram_fd *fd) {
3105 int i, ret = 0;
3106 hts_tpool_result *r;
3108 while ((r = hts_tpool_next_result(fd->rqueue))) {
3109 cram_job *j = (cram_job *)hts_tpool_result_data(r);
3110 cram_container *c;
3112 if (!j) {
3113 hts_tpool_delete_result(r, 0);
3114 return -1;
3117 fd = j->fd;
3118 c = j->c;
3120 if (fd->mode == 'w')
3121 if (0 != cram_flush_container2(fd, c))
3122 return -1;
3124 /* Free the container */
3125 for (i = 0; i < c->max_slice; i++) {
3126 if (c->slices && c->slices[i]) {
3127 cram_free_slice(c->slices[i]);
3128 c->slices[i] = NULL;
3132 c->slice = NULL;
3133 c->curr_slice = 0;
3135 cram_free_container(c);
3137 ret |= hflush(fd->fp) == 0 ? 0 : -1;
3139 hts_tpool_delete_result(r, 1);
3142 return ret;
3145 int cram_flush_container_mt(cram_fd *fd, cram_container *c) {
3146 cram_job *j;
3148 if (!fd->pool)
3149 return cram_flush_container(fd, c);
3151 if (!(j = malloc(sizeof(*j))))
3152 return -1;
3153 j->fd = fd;
3154 j->c = c;
3156 // Flush the job. Note our encoder queue may be full, so we
3157 // either have to keep trying in non-blocking mode (what we do) or
3158 // use a dedicated separate thread for draining the queue.
3159 for (;;) {
3160 errno = 0;
3161 hts_tpool_dispatch2(fd->pool, fd->rqueue, cram_flush_thread, j, 1);
3162 int pending = (errno == EAGAIN);
3163 if (cram_flush_result(fd) != 0)
3164 return -1;
3165 if (!pending)
3166 break;
3168 usleep(1000);
3171 return 0;
3174 /* ----------------------------------------------------------------------
3175 * Compression headers; the first part of the container
3179 * Creates a new blank container compression header
3181 * Returns header ptr on success
3182 * NULL on failure
3184 cram_block_compression_hdr *cram_new_compression_header(void) {
3185 cram_block_compression_hdr *hdr = calloc(1, sizeof(*hdr));
3186 if (!hdr)
3187 return NULL;
3189 if (!(hdr->TD_blk = cram_new_block(CORE, 0))) {
3190 free(hdr);
3191 return NULL;
3194 if (!(hdr->TD_hash = kh_init(m_s2i))) {
3195 cram_free_block(hdr->TD_blk);
3196 free(hdr);
3197 return NULL;
3200 if (!(hdr->TD_keys = string_pool_create(8192))) {
3201 kh_destroy(m_s2i, hdr->TD_hash);
3202 cram_free_block(hdr->TD_blk);
3203 free(hdr);
3204 return NULL;
3207 return hdr;
3210 void cram_free_compression_header(cram_block_compression_hdr *hdr) {
3211 int i;
3213 if (hdr->landmark)
3214 free(hdr->landmark);
3216 if (hdr->preservation_map)
3217 kh_destroy(map, hdr->preservation_map);
3219 for (i = 0; i < CRAM_MAP_HASH; i++) {
3220 cram_map *m, *m2;
3221 for (m = hdr->rec_encoding_map[i]; m; m = m2) {
3222 m2 = m->next;
3223 if (m->codec)
3224 m->codec->free(m->codec);
3225 free(m);
3229 for (i = 0; i < CRAM_MAP_HASH; i++) {
3230 cram_map *m, *m2;
3231 for (m = hdr->tag_encoding_map[i]; m; m = m2) {
3232 m2 = m->next;
3233 if (m->codec)
3234 m->codec->free(m->codec);
3235 free(m);
3239 for (i = 0; i < DS_END; i++) {
3240 if (hdr->codecs[i])
3241 hdr->codecs[i]->free(hdr->codecs[i]);
3244 if (hdr->TL)
3245 free(hdr->TL);
3246 if (hdr->TD_blk)
3247 cram_free_block(hdr->TD_blk);
3248 if (hdr->TD_hash)
3249 kh_destroy(m_s2i, hdr->TD_hash);
3250 if (hdr->TD_keys)
3251 string_pool_destroy(hdr->TD_keys);
3253 free(hdr);
3257 /* ----------------------------------------------------------------------
3258 * Slices and slice headers
3261 void cram_free_slice_header(cram_block_slice_hdr *hdr) {
3262 if (!hdr)
3263 return;
3265 if (hdr->block_content_ids)
3266 free(hdr->block_content_ids);
3268 free(hdr);
3270 return;
3273 void cram_free_slice(cram_slice *s) {
3274 if (!s)
3275 return;
3277 if (s->hdr_block)
3278 cram_free_block(s->hdr_block);
3280 if (s->block) {
3281 int i;
3283 if (s->hdr) {
3284 for (i = 0; i < s->hdr->num_blocks; i++) {
3285 cram_free_block(s->block[i]);
3288 free(s->block);
3291 if (s->block_by_id)
3292 free(s->block_by_id);
3294 if (s->hdr)
3295 cram_free_slice_header(s->hdr);
3297 if (s->seqs_blk)
3298 cram_free_block(s->seqs_blk);
3300 if (s->qual_blk)
3301 cram_free_block(s->qual_blk);
3303 if (s->name_blk)
3304 cram_free_block(s->name_blk);
3306 if (s->aux_blk)
3307 cram_free_block(s->aux_blk);
3309 if (s->base_blk)
3310 cram_free_block(s->base_blk);
3312 if (s->soft_blk)
3313 cram_free_block(s->soft_blk);
3315 if (s->cigar)
3316 free(s->cigar);
3318 if (s->crecs)
3319 free(s->crecs);
3321 if (s->features)
3322 free(s->features);
3324 if (s->TN)
3325 free(s->TN);
3327 if (s->pair_keys)
3328 string_pool_destroy(s->pair_keys);
3330 if (s->pair[0])
3331 kh_destroy(m_s2i, s->pair[0]);
3332 if (s->pair[1])
3333 kh_destroy(m_s2i, s->pair[1]);
3335 if (s->aux_block)
3336 free(s->aux_block);
3338 free(s);
3342 * Creates a new empty slice in memory, for subsequent writing to
3343 * disk.
3345 * Returns cram_slice ptr on success
3346 * NULL on failure
3348 cram_slice *cram_new_slice(enum cram_content_type type, int nrecs) {
3349 cram_slice *s = calloc(1, sizeof(*s));
3350 if (!s)
3351 return NULL;
3353 if (!(s->hdr = (cram_block_slice_hdr *)calloc(1, sizeof(*s->hdr))))
3354 goto err;
3355 s->hdr->content_type = type;
3357 s->hdr_block = NULL;
3358 s->block = NULL;
3359 s->block_by_id = NULL;
3360 s->last_apos = 0;
3361 if (!(s->crecs = malloc(nrecs * sizeof(cram_record)))) goto err;
3362 s->cigar = NULL;
3363 s->cigar_alloc = 0;
3364 s->ncigar = 0;
3366 if (!(s->seqs_blk = cram_new_block(EXTERNAL, 0))) goto err;
3367 if (!(s->qual_blk = cram_new_block(EXTERNAL, DS_QS))) goto err;
3368 if (!(s->name_blk = cram_new_block(EXTERNAL, DS_RN))) goto err;
3369 if (!(s->aux_blk = cram_new_block(EXTERNAL, DS_aux))) goto err;
3370 if (!(s->base_blk = cram_new_block(EXTERNAL, DS_IN))) goto err;
3371 if (!(s->soft_blk = cram_new_block(EXTERNAL, DS_SC))) goto err;
3373 s->features = NULL;
3374 s->nfeatures = s->afeatures = 0;
3376 #ifndef TN_external
3377 s->TN = NULL;
3378 s->nTN = s->aTN = 0;
3379 #endif
3381 // Volatile keys as we do realloc in dstring
3382 if (!(s->pair_keys = string_pool_create(8192))) goto err;
3383 if (!(s->pair[0] = kh_init(m_s2i))) goto err;
3384 if (!(s->pair[1] = kh_init(m_s2i))) goto err;
3386 #ifdef BA_external
3387 s->BA_len = 0;
3388 #endif
3390 return s;
3392 err:
3393 if (s)
3394 cram_free_slice(s);
3396 return NULL;
3400 * Loads an entire slice.
3401 * FIXME: In 1.0 the native unit of slices within CRAM is broken
3402 * as slices contain references to objects in other slices.
3403 * To work around this while keeping the slice oriented outer loop
3404 * we read all slices and stitch them together into a fake large
3405 * slice instead.
3407 * Returns cram_slice ptr on success
3408 * NULL on failure
3410 cram_slice *cram_read_slice(cram_fd *fd) {
3411 cram_block *b = cram_read_block(fd);
3412 cram_slice *s = calloc(1, sizeof(*s));
3413 int i, n, max_id, min_id;
3415 if (!b || !s)
3416 goto err;
3418 s->hdr_block = b;
3419 switch (b->content_type) {
3420 case MAPPED_SLICE:
3421 case UNMAPPED_SLICE:
3422 if (!(s->hdr = cram_decode_slice_header(fd, b)))
3423 goto err;
3424 break;
3426 default:
3427 fprintf(stderr, "Unexpected block of type %s\n",
3428 cram_content_type2str(b->content_type));
3429 goto err;
3432 if (s->hdr->num_blocks < 1) {
3433 fprintf(stderr, "Slice does not include any data blocks.\n");
3434 goto err;
3437 s->block = calloc(n = s->hdr->num_blocks, sizeof(*s->block));
3438 if (!s->block)
3439 goto err;
3441 for (max_id = i = 0, min_id = INT_MAX; i < n; i++) {
3442 if (!(s->block[i] = cram_read_block(fd)))
3443 goto err;
3445 if (s->block[i]->content_type == EXTERNAL) {
3446 if (max_id < s->block[i]->content_id)
3447 max_id = s->block[i]->content_id;
3448 if (min_id > s->block[i]->content_id)
3449 min_id = s->block[i]->content_id;
3452 if (min_id >= 0 && max_id < 1024) {
3453 if (!(s->block_by_id = calloc(1024, sizeof(s->block[0]))))
3454 goto err;
3456 for (i = 0; i < n; i++) {
3457 if (s->block[i]->content_type != EXTERNAL)
3458 continue;
3459 s->block_by_id[s->block[i]->content_id] = s->block[i];
3463 /* Initialise encoding/decoding tables */
3464 s->cigar = NULL;
3465 s->cigar_alloc = 0;
3466 s->ncigar = 0;
3468 if (!(s->seqs_blk = cram_new_block(EXTERNAL, 0))) goto err;
3469 if (!(s->qual_blk = cram_new_block(EXTERNAL, DS_QS))) goto err;
3470 if (!(s->name_blk = cram_new_block(EXTERNAL, DS_RN))) goto err;
3471 if (!(s->aux_blk = cram_new_block(EXTERNAL, DS_aux))) goto err;
3472 if (!(s->base_blk = cram_new_block(EXTERNAL, DS_IN))) goto err;
3473 if (!(s->soft_blk = cram_new_block(EXTERNAL, DS_SC))) goto err;
3475 s->crecs = NULL;
3477 s->last_apos = s->hdr->ref_seq_start;
3479 return s;
3481 err:
3482 if (b)
3483 cram_free_block(b);
3484 if (s) {
3485 s->hdr_block = NULL;
3486 cram_free_slice(s);
3488 return NULL;
3492 /* ----------------------------------------------------------------------
3493 * CRAM file definition (header)
3497 * Reads a CRAM file definition structure.
3498 * Returns file_def ptr on success
3499 * NULL on failure
3501 cram_file_def *cram_read_file_def(cram_fd *fd) {
3502 cram_file_def *def = malloc(sizeof(*def));
3503 if (!def)
3504 return NULL;
3506 if (26 != hread(fd->fp, &def->magic[0], 26)) {
3507 free(def);
3508 return NULL;
3511 if (memcmp(def->magic, "CRAM", 4) != 0) {
3512 free(def);
3513 return NULL;
3516 if (def->major_version > 3) {
3517 fprintf(stderr, "CRAM version number mismatch\n"
3518 "Expected 1.x, 2.x or 3.x, got %d.%d\n",
3519 def->major_version, def->minor_version);
3520 free(def);
3521 return NULL;
3524 fd->first_container += 26;
3525 fd->last_slice = 0;
3527 return def;
3531 * Writes a cram_file_def structure to cram_fd.
3532 * Returns 0 on success
3533 * -1 on failure
3535 int cram_write_file_def(cram_fd *fd, cram_file_def *def) {
3536 return (hwrite(fd->fp, &def->magic[0], 26) == 26) ? 0 : -1;
3539 void cram_free_file_def(cram_file_def *def) {
3540 if (def) free(def);
3543 /* ----------------------------------------------------------------------
3544 * SAM header I/O
3549 * Reads the SAM header from the first CRAM data block.
3550 * Also performs minimal parsing to extract read-group
3551 * and sample information.
3553 * Returns SAM hdr ptr on success
3554 * NULL on failure
3556 SAM_hdr *cram_read_SAM_hdr(cram_fd *fd) {
3557 int32_t header_len;
3558 char *header;
3559 SAM_hdr *hdr;
3561 /* 1.1 onwards stores the header in the first block of a container */
3562 if (CRAM_MAJOR_VERS(fd->version) == 1) {
3563 /* Length */
3564 if (-1 == int32_decode(fd, &header_len))
3565 return NULL;
3567 /* Alloc and read */
3568 if (header_len < 0 || NULL == (header = malloc((size_t) header_len+1)))
3569 return NULL;
3571 if (header_len != hread(fd->fp, header, header_len))
3572 return NULL;
3573 header[header_len] = '\0';
3575 fd->first_container += 4 + header_len;
3576 } else {
3577 cram_container *c = cram_read_container(fd);
3578 cram_block *b;
3579 int i, len;
3581 if (!c)
3582 return NULL;
3584 fd->first_container += c->length + c->offset;
3586 if (c->num_blocks < 1) {
3587 cram_free_container(c);
3588 return NULL;
3591 if (!(b = cram_read_block(fd))) {
3592 cram_free_container(c);
3593 return NULL;
3595 if (cram_uncompress_block(b) != 0) {
3596 cram_free_container(c);
3597 cram_free_block(b);
3598 return NULL;
3601 len = b->comp_size + 2 + 4*(CRAM_MAJOR_VERS(fd->version) >= 3) +
3602 itf8_size(b->content_id) +
3603 itf8_size(b->uncomp_size) +
3604 itf8_size(b->comp_size);
3606 /* Extract header from 1st block */
3607 if (-1 == int32_get_blk(b, &header_len) ||
3608 header_len < 0 || /* Spec. says signed... why? */
3609 b->uncomp_size - 4 < header_len) {
3610 cram_free_container(c);
3611 cram_free_block(b);
3612 return NULL;
3614 if (NULL == (header = malloc((size_t) header_len+1))) {
3615 cram_free_container(c);
3616 cram_free_block(b);
3617 return NULL;
3619 memcpy(header, BLOCK_END(b), header_len);
3620 header[header_len] = '\0';
3621 cram_free_block(b);
3623 /* Consume any remaining blocks */
3624 for (i = 1; i < c->num_blocks; i++) {
3625 if (!(b = cram_read_block(fd))) {
3626 cram_free_container(c);
3627 return NULL;
3629 len += b->comp_size + 2 + 4*(CRAM_MAJOR_VERS(fd->version) >= 3) +
3630 itf8_size(b->content_id) +
3631 itf8_size(b->uncomp_size) +
3632 itf8_size(b->comp_size);
3633 cram_free_block(b);
3636 if (c->length > 0 && len > 0 && c->length > len) {
3637 // Consume padding
3638 char *pads = malloc(c->length - len);
3639 if (!pads) {
3640 cram_free_container(c);
3641 return NULL;
3644 if (c->length - len != hread(fd->fp, pads, c->length - len)) {
3645 cram_free_container(c);
3646 return NULL;
3648 free(pads);
3651 cram_free_container(c);
3654 /* Parse */
3655 hdr = sam_hdr_parse_(header, header_len);
3656 free(header);
3658 return hdr;
3662 * Converts 'in' to a full pathname to store in out.
3663 * Out must be at least PATH_MAX bytes long.
3665 static void full_path(char *out, char *in) {
3666 if (*in == '/') {
3667 strncpy(out, in, PATH_MAX);
3668 out[PATH_MAX-1] = 0;
3669 } else {
3670 int len;
3672 // unable to get dir or out+in is too long
3673 if (!getcwd(out, PATH_MAX) ||
3674 (len = strlen(out))+1+strlen(in) >= PATH_MAX) {
3675 strncpy(out, in, PATH_MAX);
3676 out[PATH_MAX-1] = 0;
3677 return;
3680 sprintf(out+len, "/%.*s", PATH_MAX - len, in);
3682 // FIXME: cope with `pwd`/../../../foo.fa ?
3687 * Writes a CRAM SAM header.
3688 * Returns 0 on success
3689 * -1 on failure
3691 int cram_write_SAM_hdr(cram_fd *fd, SAM_hdr *hdr) {
3692 int header_len;
3693 int blank_block = (CRAM_MAJOR_VERS(fd->version) >= 3);
3695 /* Write CRAM MAGIC if not yet written. */
3696 if (fd->file_def->major_version == 0) {
3697 fd->file_def->major_version = CRAM_MAJOR_VERS(fd->version);
3698 fd->file_def->minor_version = CRAM_MINOR_VERS(fd->version);
3699 if (0 != cram_write_file_def(fd, fd->file_def))
3700 return -1;
3703 /* 1.0 requires an UNKNOWN read-group */
3704 if (CRAM_MAJOR_VERS(fd->version) == 1) {
3705 if (!sam_hdr_find_rg(hdr, "UNKNOWN"))
3706 if (sam_hdr_add(hdr, "RG",
3707 "ID", "UNKNOWN", "SM", "UNKNOWN", NULL))
3708 return -1;
3711 /* Fix M5 strings */
3712 if (fd->refs && !fd->no_ref) {
3713 int i;
3714 for (i = 0; i < hdr->nref; i++) {
3715 SAM_hdr_type *ty;
3716 char *ref;
3718 if (!(ty = sam_hdr_find(hdr, "SQ", "SN", hdr->ref[i].name)))
3719 return -1;
3721 if (!sam_hdr_find_key(hdr, ty, "M5", NULL)) {
3722 char unsigned buf[16];
3723 char buf2[33];
3724 int rlen;
3725 hts_md5_context *md5;
3727 if (!fd->refs ||
3728 !fd->refs->ref_id ||
3729 !fd->refs->ref_id[i]) {
3730 return -1;
3732 rlen = fd->refs->ref_id[i]->length;
3733 if (!(md5 = hts_md5_init()))
3734 return -1;
3735 ref = cram_get_ref(fd, i, 1, rlen);
3736 if (NULL == ref) return -1;
3737 rlen = fd->refs->ref_id[i]->length; /* In case it just loaded */
3738 hts_md5_update(md5, ref, rlen);
3739 hts_md5_final(buf, md5);
3740 hts_md5_destroy(md5);
3741 cram_ref_decr(fd->refs, i);
3743 hts_md5_hex(buf2, buf);
3744 if (sam_hdr_update(hdr, ty, "M5", buf2, NULL))
3745 return -1;
3748 if (fd->ref_fn) {
3749 char ref_fn[PATH_MAX];
3750 full_path(ref_fn, fd->ref_fn);
3751 if (sam_hdr_update(hdr, ty, "UR", ref_fn, NULL))
3752 return -1;
3757 if (sam_hdr_rebuild(hdr))
3758 return -1;
3760 /* Length */
3761 header_len = sam_hdr_length(hdr);
3762 if (CRAM_MAJOR_VERS(fd->version) == 1) {
3763 if (-1 == int32_encode(fd, header_len))
3764 return -1;
3766 /* Text data */
3767 if (header_len != hwrite(fd->fp, sam_hdr_str(hdr), header_len))
3768 return -1;
3769 } else {
3770 /* Create block(s) inside a container */
3771 cram_block *b = cram_new_block(FILE_HEADER, 0);
3772 cram_container *c = cram_new_container(0, 0);
3773 int padded_length;
3774 char *pads;
3775 int is_cram_3 = (CRAM_MAJOR_VERS(fd->version) >= 3);
3777 if (!b || !c) {
3778 if (b) cram_free_block(b);
3779 if (c) cram_free_container(c);
3780 return -1;
3783 int32_put_blk(b, header_len);
3784 if (header_len)
3785 BLOCK_APPEND(b, sam_hdr_str(hdr), header_len);
3786 BLOCK_UPLEN(b);
3788 // Compress header block if V3.0 and above
3789 if (CRAM_MAJOR_VERS(fd->version) >= 3)
3790 cram_compress_block(fd, b, NULL, -1, -1);
3792 if (blank_block) {
3793 c->length = b->comp_size + 2 + 4*is_cram_3 +
3794 itf8_size(b->content_id) +
3795 itf8_size(b->uncomp_size) +
3796 itf8_size(b->comp_size);
3798 c->num_blocks = 2;
3799 c->num_landmarks = 2;
3800 if (!(c->landmark = malloc(2*sizeof(*c->landmark)))) {
3801 cram_free_block(b);
3802 cram_free_container(c);
3803 return -1;
3805 c->landmark[0] = 0;
3806 c->landmark[1] = c->length;
3808 // Plus extra storage for uncompressed secondary blank block
3809 padded_length = MIN(c->length*.5, 10000);
3810 c->length += padded_length + 2 + 4*is_cram_3 +
3811 itf8_size(b->content_id) +
3812 itf8_size(padded_length)*2;
3813 } else {
3814 // Pad the block instead.
3815 c->num_blocks = 1;
3816 c->num_landmarks = 1;
3817 if (!(c->landmark = malloc(sizeof(*c->landmark))))
3818 return -1;
3819 c->landmark[0] = 0;
3821 padded_length = MAX(c->length*1.5, 10000) - c->length;
3823 c->length = b->comp_size + padded_length +
3824 2 + 4*is_cram_3 +
3825 itf8_size(b->content_id) +
3826 itf8_size(b->uncomp_size) +
3827 itf8_size(b->comp_size);
3829 if (NULL == (pads = calloc(1, padded_length))) {
3830 cram_free_block(b);
3831 cram_free_container(c);
3832 return -1;
3834 BLOCK_APPEND(b, pads, padded_length);
3835 BLOCK_UPLEN(b);
3836 free(pads);
3839 if (-1 == cram_write_container(fd, c)) {
3840 cram_free_block(b);
3841 cram_free_container(c);
3842 return -1;
3845 if (-1 == cram_write_block(fd, b)) {
3846 cram_free_block(b);
3847 cram_free_container(c);
3848 return -1;
3851 if (blank_block) {
3852 BLOCK_RESIZE(b, padded_length);
3853 memset(BLOCK_DATA(b), 0, padded_length);
3854 BLOCK_SIZE(b) = padded_length;
3855 BLOCK_UPLEN(b);
3856 b->method = RAW;
3857 if (-1 == cram_write_block(fd, b)) {
3858 cram_free_block(b);
3859 cram_free_container(c);
3860 return -1;
3864 cram_free_block(b);
3865 cram_free_container(c);
3868 if (-1 == refs_from_header(fd->refs, fd, fd->header))
3869 return -1;
3870 if (-1 == refs2id(fd->refs, fd->header))
3871 return -1;
3873 if (0 != hflush(fd->fp))
3874 return -1;
3876 RP("=== Finishing saving header ===\n");
3878 return 0;
3881 /* ----------------------------------------------------------------------
3882 * The top-level cram opening, closing and option handling
3886 * Initialises the lookup tables. These could be global statics, but they're
3887 * clumsy to setup in a multi-threaded environment unless we generate
3888 * verbatim code and include that.
3890 static void cram_init_tables(cram_fd *fd) {
3891 int i;
3893 memset(fd->L1, 4, 256);
3894 fd->L1['A'] = 0; fd->L1['a'] = 0;
3895 fd->L1['C'] = 1; fd->L1['c'] = 1;
3896 fd->L1['G'] = 2; fd->L1['g'] = 2;
3897 fd->L1['T'] = 3; fd->L1['t'] = 3;
3899 memset(fd->L2, 5, 256);
3900 fd->L2['A'] = 0; fd->L2['a'] = 0;
3901 fd->L2['C'] = 1; fd->L2['c'] = 1;
3902 fd->L2['G'] = 2; fd->L2['g'] = 2;
3903 fd->L2['T'] = 3; fd->L2['t'] = 3;
3904 fd->L2['N'] = 4; fd->L2['n'] = 4;
3906 if (CRAM_MAJOR_VERS(fd->version) == 1) {
3907 for (i = 0; i < 0x200; i++) {
3908 int f = 0;
3910 if (i & CRAM_FPAIRED) f |= BAM_FPAIRED;
3911 if (i & CRAM_FPROPER_PAIR) f |= BAM_FPROPER_PAIR;
3912 if (i & CRAM_FUNMAP) f |= BAM_FUNMAP;
3913 if (i & CRAM_FREVERSE) f |= BAM_FREVERSE;
3914 if (i & CRAM_FREAD1) f |= BAM_FREAD1;
3915 if (i & CRAM_FREAD2) f |= BAM_FREAD2;
3916 if (i & CRAM_FSECONDARY) f |= BAM_FSECONDARY;
3917 if (i & CRAM_FQCFAIL) f |= BAM_FQCFAIL;
3918 if (i & CRAM_FDUP) f |= BAM_FDUP;
3920 fd->bam_flag_swap[i] = f;
3923 for (i = 0; i < 0x1000; i++) {
3924 int g = 0;
3926 if (i & BAM_FPAIRED) g |= CRAM_FPAIRED;
3927 if (i & BAM_FPROPER_PAIR) g |= CRAM_FPROPER_PAIR;
3928 if (i & BAM_FUNMAP) g |= CRAM_FUNMAP;
3929 if (i & BAM_FREVERSE) g |= CRAM_FREVERSE;
3930 if (i & BAM_FREAD1) g |= CRAM_FREAD1;
3931 if (i & BAM_FREAD2) g |= CRAM_FREAD2;
3932 if (i & BAM_FSECONDARY) g |= CRAM_FSECONDARY;
3933 if (i & BAM_FQCFAIL) g |= CRAM_FQCFAIL;
3934 if (i & BAM_FDUP) g |= CRAM_FDUP;
3936 fd->cram_flag_swap[i] = g;
3938 } else {
3939 /* NOP */
3940 for (i = 0; i < 0x1000; i++)
3941 fd->bam_flag_swap[i] = i;
3942 for (i = 0; i < 0x1000; i++)
3943 fd->cram_flag_swap[i] = i;
3946 memset(fd->cram_sub_matrix, 4, 32*32);
3947 for (i = 0; i < 32; i++) {
3948 fd->cram_sub_matrix[i]['A'&0x1f]=0;
3949 fd->cram_sub_matrix[i]['C'&0x1f]=1;
3950 fd->cram_sub_matrix[i]['G'&0x1f]=2;
3951 fd->cram_sub_matrix[i]['T'&0x1f]=3;
3952 fd->cram_sub_matrix[i]['N'&0x1f]=4;
3954 for (i = 0; i < 20; i+=4) {
3955 int j;
3956 for (j = 0; j < 20; j++) {
3957 fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][j]=3;
3958 fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][j]=3;
3959 fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][j]=3;
3960 fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][j]=3;
3962 fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][CRAM_SUBST_MATRIX[i+0]&0x1f]=0;
3963 fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][CRAM_SUBST_MATRIX[i+1]&0x1f]=1;
3964 fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][CRAM_SUBST_MATRIX[i+2]&0x1f]=2;
3965 fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][CRAM_SUBST_MATRIX[i+3]&0x1f]=3;
3969 // Default version numbers for CRAM
3970 static int major_version = 3;
3971 static int minor_version = 0;
3974 * Opens a CRAM file for read (mode "rb") or write ("wb").
3975 * The filename may be "-" to indicate stdin or stdout.
3977 * Returns file handle on success
3978 * NULL on failure.
3980 cram_fd *cram_open(const char *filename, const char *mode) {
3981 hFILE *fp;
3982 cram_fd *fd;
3983 char fmode[3]= { mode[0], '\0', '\0' };
3985 if (strlen(mode) > 1 && (mode[1] == 'b' || mode[1] == 'c')) {
3986 fmode[1] = 'b';
3989 fp = hopen(filename, fmode);
3990 if (!fp)
3991 return NULL;
3993 fd = cram_dopen(fp, filename, mode);
3994 if (!fd)
3995 hclose_abruptly(fp);
3997 return fd;
4000 /* Opens an existing stream for reading or writing.
4002 * Returns file handle on success;
4003 * NULL on failure.
4005 cram_fd *cram_dopen(hFILE *fp, const char *filename, const char *mode) {
4006 int i;
4007 char *cp;
4008 cram_fd *fd = calloc(1, sizeof(*fd));
4009 if (!fd)
4010 return NULL;
4012 fd->level = 5;
4013 for (i = 0; mode[i]; i++) {
4014 if (mode[i] >= '0' && mode[i] <= '9') {
4015 fd->level = mode[i] - '0';
4016 break;
4020 fd->fp = fp;
4021 fd->mode = *mode;
4022 fd->first_container = 0;
4024 if (fd->mode == 'r') {
4025 /* Reader */
4027 if (!(fd->file_def = cram_read_file_def(fd)))
4028 goto err;
4030 fd->version = fd->file_def->major_version * 256 +
4031 fd->file_def->minor_version;
4033 if (!(fd->header = cram_read_SAM_hdr(fd))) {
4034 cram_free_file_def(fd->file_def);
4035 goto err;
4038 } else {
4039 /* Writer */
4040 cram_file_def *def = calloc(1, sizeof(*def));
4041 if (!def)
4042 return NULL;
4044 fd->file_def = def;
4046 def->magic[0] = 'C';
4047 def->magic[1] = 'R';
4048 def->magic[2] = 'A';
4049 def->magic[3] = 'M';
4050 def->major_version = 0; // Indicator to write file def later.
4051 def->minor_version = 0;
4052 memset(def->file_id, 0, 20);
4053 strncpy(def->file_id, filename, 20);
4055 fd->version = major_version * 256 + minor_version;
4057 /* SAM header written later along with this file_def */
4060 cram_init_tables(fd);
4062 fd->prefix = strdup((cp = strrchr(filename, '/')) ? cp+1 : filename);
4063 if (!fd->prefix)
4064 goto err;
4065 fd->first_base = fd->last_base = -1;
4066 fd->record_counter = 0;
4068 fd->ctr = NULL;
4069 fd->refs = refs_create();
4070 if (!fd->refs)
4071 goto err;
4072 fd->ref_id = -2;
4073 fd->ref = NULL;
4075 fd->decode_md = 0;
4076 fd->verbose = 0;
4077 fd->seqs_per_slice = SEQS_PER_SLICE;
4078 fd->bases_per_slice = BASES_PER_SLICE;
4079 fd->slices_per_container = SLICE_PER_CNT;
4080 fd->embed_ref = 0;
4081 fd->no_ref = 0;
4082 fd->ignore_md5 = 0;
4083 fd->lossy_read_names = 0;
4084 fd->use_bz2 = 0;
4085 fd->use_rans = (CRAM_MAJOR_VERS(fd->version) >= 3);
4086 fd->use_lzma = 0;
4087 fd->multi_seq = -1;
4088 fd->unsorted = 0;
4089 fd->shared_ref = 0;
4091 fd->index = NULL;
4092 fd->own_pool = 0;
4093 fd->pool = NULL;
4094 fd->rqueue = NULL;
4095 fd->job_pending = NULL;
4096 fd->ooc = 0;
4097 fd->required_fields = INT_MAX;
4099 for (i = 0; i < DS_END; i++)
4100 fd->m[i] = cram_new_metrics();
4102 if (!(fd->tags_used = kh_init(m_metrics)))
4103 goto err;
4105 fd->range.refid = -2; // no ref.
4106 fd->eof = 1; // See samtools issue #150
4107 fd->ref_fn = NULL;
4109 fd->bl = NULL;
4111 /* Initialise dummy refs from the @SQ headers */
4112 if (-1 == refs_from_header(fd->refs, fd, fd->header))
4113 goto err;
4115 return fd;
4117 err:
4118 if (fd)
4119 free(fd);
4121 return NULL;
4125 * Seek within a CRAM file.
4127 * Returns 0 on success
4128 * -1 on failure
4130 int cram_seek(cram_fd *fd, off_t offset, int whence) {
4131 char buf[65536];
4133 fd->ooc = 0;
4135 if (hseek(fd->fp, offset, whence) >= 0)
4136 return 0;
4138 if (!(whence == SEEK_CUR && offset >= 0))
4139 return -1;
4141 /* Couldn't fseek, but we're in SEEK_CUR mode so read instead */
4142 while (offset > 0) {
4143 int len = MIN(65536, offset);
4144 if (len != hread(fd->fp, buf, len))
4145 return -1;
4146 offset -= len;
4149 return 0;
4153 * Flushes a CRAM file.
4154 * Useful for when writing to stdout without wishing to close the stream.
4156 * Returns 0 on success
4157 * -1 on failure
4159 int cram_flush(cram_fd *fd) {
4160 if (!fd)
4161 return -1;
4163 if (fd->mode == 'w' && fd->ctr) {
4164 if(fd->ctr->slice)
4165 cram_update_curr_slice(fd->ctr);
4167 if (-1 == cram_flush_container_mt(fd, fd->ctr))
4168 return -1;
4171 return 0;
4175 * Closes a CRAM file.
4176 * Returns 0 on success
4177 * -1 on failure
4179 int cram_close(cram_fd *fd) {
4180 spare_bams *bl, *next;
4181 int i;
4183 if (!fd)
4184 return -1;
4186 if (fd->mode == 'w' && fd->ctr) {
4187 if(fd->ctr->slice)
4188 cram_update_curr_slice(fd->ctr);
4190 if (-1 == cram_flush_container_mt(fd, fd->ctr))
4191 return -1;
4194 if (fd->pool && fd->eof >= 0) {
4195 hts_tpool_process_flush(fd->rqueue);
4197 if (0 != cram_flush_result(fd))
4198 return -1;
4200 pthread_mutex_destroy(&fd->metrics_lock);
4201 pthread_mutex_destroy(&fd->ref_lock);
4202 pthread_mutex_destroy(&fd->bam_list_lock);
4204 fd->ctr = NULL; // prevent double freeing
4206 //fprintf(stderr, "CRAM: destroy queue %p\n", fd->rqueue);
4208 hts_tpool_process_destroy(fd->rqueue);
4211 if (fd->mode == 'w') {
4212 /* Write EOF block */
4213 if (CRAM_MAJOR_VERS(fd->version) == 3) {
4214 if (38 != hwrite(fd->fp,
4215 "\x0f\x00\x00\x00\xff\xff\xff\xff" // Cont HDR
4216 "\x0f\xe0\x45\x4f\x46\x00\x00\x00" // Cont HDR
4217 "\x00\x01\x00" // Cont HDR
4218 "\x05\xbd\xd9\x4f" // CRC32
4219 "\x00\x01\x00\x06\x06" // Comp.HDR blk
4220 "\x01\x00\x01\x00\x01\x00" // Comp.HDR blk
4221 "\xee\x63\x01\x4b", // CRC32
4222 38))
4223 return -1;
4224 } else {
4225 if (30 != hwrite(fd->fp,
4226 "\x0b\x00\x00\x00\xff\xff\xff\xff"
4227 "\x0f\xe0\x45\x4f\x46\x00\x00\x00"
4228 "\x00\x01\x00\x00\x01\x00\x06\x06"
4229 "\x01\x00\x01\x00\x01\x00", 30))
4230 return -1;
4234 for (bl = fd->bl; bl; bl = next) {
4235 int i, max_rec = fd->seqs_per_slice * fd->slices_per_container;
4237 next = bl->next;
4238 for (i = 0; i < max_rec; i++) {
4239 if (bl->bams[i])
4240 bam_free(bl->bams[i]);
4242 free(bl->bams);
4243 free(bl);
4246 if (hclose(fd->fp) != 0)
4247 return -1;
4249 if (fd->file_def)
4250 cram_free_file_def(fd->file_def);
4252 if (fd->header)
4253 sam_hdr_free(fd->header);
4255 free(fd->prefix);
4257 if (fd->ctr)
4258 cram_free_container(fd->ctr);
4260 if (fd->refs)
4261 refs_free(fd->refs);
4262 if (fd->ref_free)
4263 free(fd->ref_free);
4265 for (i = 0; i < DS_END; i++)
4266 if (fd->m[i])
4267 free(fd->m[i]);
4269 if (fd->tags_used) {
4270 khint_t k;
4272 for (k = kh_begin(fd->tags_used); k != kh_end(fd->tags_used); k++) {
4273 if (kh_exist(fd->tags_used, k))
4274 free(kh_val(fd->tags_used, k));
4277 kh_destroy(m_metrics, fd->tags_used);
4280 if (fd->index)
4281 cram_index_free(fd);
4283 if (fd->own_pool && fd->pool)
4284 hts_tpool_destroy(fd->pool);
4286 free(fd);
4287 return 0;
4291 * Returns 1 if we hit an EOF while reading.
4293 int cram_eof(cram_fd *fd) {
4294 return fd->eof;
4299 * Sets options on the cram_fd. See CRAM_OPT_* definitions in cram_structs.h.
4300 * Use this immediately after opening.
4302 * Returns 0 on success
4303 * -1 on failure
4305 int cram_set_option(cram_fd *fd, enum hts_fmt_option opt, ...) {
4306 int r;
4307 va_list args;
4309 va_start(args, opt);
4310 r = cram_set_voption(fd, opt, args);
4311 va_end(args);
4313 return r;
4317 * Sets options on the cram_fd. See CRAM_OPT_* definitions in cram_structs.h.
4318 * Use this immediately after opening.
4320 * Returns 0 on success
4321 * -1 on failure
4323 int cram_set_voption(cram_fd *fd, enum hts_fmt_option opt, va_list args) {
4324 refs_t *refs;
4326 if (!fd) {
4327 errno = EBADF;
4328 return -1;
4331 switch (opt) {
4332 case CRAM_OPT_DECODE_MD:
4333 fd->decode_md = va_arg(args, int);
4334 break;
4336 case CRAM_OPT_PREFIX:
4337 if (fd->prefix)
4338 free(fd->prefix);
4339 if (!(fd->prefix = strdup(va_arg(args, char *))))
4340 return -1;
4341 break;
4343 case CRAM_OPT_VERBOSITY:
4344 fd->verbose = va_arg(args, int);
4345 break;
4347 case CRAM_OPT_SEQS_PER_SLICE:
4348 fd->seqs_per_slice = va_arg(args, int);
4349 break;
4351 case CRAM_OPT_BASES_PER_SLICE:
4352 fd->bases_per_slice = va_arg(args, int);
4353 break;
4355 case CRAM_OPT_SLICES_PER_CONTAINER:
4356 fd->slices_per_container = va_arg(args, int);
4357 break;
4359 case CRAM_OPT_EMBED_REF:
4360 fd->embed_ref = va_arg(args, int);
4361 break;
4363 case CRAM_OPT_NO_REF:
4364 fd->no_ref = va_arg(args, int);
4365 break;
4367 case CRAM_OPT_IGNORE_MD5:
4368 fd->ignore_md5 = va_arg(args, int);
4369 break;
4371 case CRAM_OPT_LOSSY_NAMES:
4372 fd->lossy_read_names = va_arg(args, int);
4373 // Currently lossy read names required paired (attached) reads.
4374 // TLEN 0 or being 1 out causes read pairs to be detached, breaking
4375 // the lossy read name compression, so we have extra options to
4376 // slacken the exact TLEN round-trip checks.
4377 fd->tlen_approx = fd->lossy_read_names;
4378 fd->tlen_zero = fd->lossy_read_names;
4379 break;
4381 case CRAM_OPT_USE_BZIP2:
4382 fd->use_bz2 = va_arg(args, int);
4383 break;
4385 case CRAM_OPT_USE_RANS:
4386 fd->use_rans = va_arg(args, int);
4387 break;
4389 case CRAM_OPT_USE_LZMA:
4390 fd->use_lzma = va_arg(args, int);
4391 break;
4393 case CRAM_OPT_SHARED_REF:
4394 fd->shared_ref = 1;
4395 refs = va_arg(args, refs_t *);
4396 if (refs != fd->refs) {
4397 if (fd->refs)
4398 refs_free(fd->refs);
4399 fd->refs = refs;
4400 fd->refs->count++;
4402 break;
4404 case CRAM_OPT_RANGE:
4405 fd->range = *va_arg(args, cram_range *);
4406 return cram_seek_to_refpos(fd, &fd->range);
4408 case CRAM_OPT_REFERENCE:
4409 return cram_load_reference(fd, va_arg(args, char *));
4411 case CRAM_OPT_VERSION: {
4412 int major, minor;
4413 char *s = va_arg(args, char *);
4414 if (2 != sscanf(s, "%d.%d", &major, &minor)) {
4415 fprintf(stderr, "Malformed version string %s\n", s);
4416 return -1;
4418 if (!((major == 1 && minor == 0) ||
4419 (major == 2 && (minor == 0 || minor == 1)) ||
4420 (major == 3 && minor == 0))) {
4421 fprintf(stderr, "Unknown version string; "
4422 "use 1.0, 2.0, 2.1 or 3.0\n");
4423 errno = EINVAL;
4424 return -1;
4426 fd->version = major*256 + minor;
4428 if (CRAM_MAJOR_VERS(fd->version) >= 3)
4429 fd->use_rans = 1;
4430 break;
4433 case CRAM_OPT_MULTI_SEQ_PER_SLICE:
4434 fd->multi_seq = va_arg(args, int);
4435 break;
4437 case CRAM_OPT_NTHREADS: {
4438 int nthreads = va_arg(args, int);
4439 if (nthreads > 1) {
4440 if (!(fd->pool = hts_tpool_init(nthreads)))
4441 return -1;
4443 fd->rqueue = hts_tpool_process_init(fd->pool, nthreads*2, 0);
4444 pthread_mutex_init(&fd->metrics_lock, NULL);
4445 pthread_mutex_init(&fd->ref_lock, NULL);
4446 pthread_mutex_init(&fd->bam_list_lock, NULL);
4447 fd->shared_ref = 1;
4448 fd->own_pool = 1;
4450 break;
4453 case CRAM_OPT_THREAD_POOL: {
4454 htsThreadPool *p = va_arg(args, htsThreadPool *);
4455 fd->pool = p ? p->pool : NULL;
4456 if (fd->pool) {
4457 fd->rqueue = hts_tpool_process_init(fd->pool,
4458 p->qsize ? p->qsize : hts_tpool_size(fd->pool)*2,
4460 pthread_mutex_init(&fd->metrics_lock, NULL);
4461 pthread_mutex_init(&fd->ref_lock, NULL);
4462 pthread_mutex_init(&fd->bam_list_lock, NULL);
4464 fd->shared_ref = 1; // Needed to avoid clobbering ref between threads
4465 fd->own_pool = 0;
4467 //fd->qsize = 1;
4468 //fd->decoded = calloc(fd->qsize, sizeof(cram_container *));
4469 //hts_tpool_dispatch(fd->pool, cram_decoder_thread, fd);
4470 break;
4473 case CRAM_OPT_REQUIRED_FIELDS:
4474 fd->required_fields = va_arg(args, int);
4475 break;
4477 case HTS_OPT_COMPRESSION_LEVEL:
4478 fd->level = va_arg(args, int);
4479 break;
4481 default:
4482 fprintf(stderr, "Unknown CRAM option code %d\n", opt);
4483 errno = EINVAL;
4484 return -1;
4487 return 0;
4490 int cram_check_EOF(cram_fd *fd)
4492 // Byte 9 in these templates is & with 0x0f to resolve differences
4493 // between ITF-8 interpretations between early Java and C
4494 // implementations of CRAM
4495 static const unsigned char TEMPLATE_2_1[30] = {
4496 0x0b, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0x0f, 0xe0,
4497 0x45, 0x4f, 0x46, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00,
4498 0x01, 0x00, 0x06, 0x06, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00
4500 static const unsigned char TEMPLATE_3[38] = {
4501 0x0f, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0x0f, 0xe0,
4502 0x45, 0x4f, 0x46, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x05,
4503 0xbd, 0xd9, 0x4f, 0x00, 0x01, 0x00, 0x06, 0x06, 0x01, 0x00,
4504 0x01, 0x00, 0x01, 0x00, 0xee, 0x63, 0x01, 0x4b
4507 unsigned char buf[38]; // max(sizeof TEMPLATE_*)
4509 uint8_t major = CRAM_MAJOR_VERS(fd->version);
4510 uint8_t minor = CRAM_MINOR_VERS(fd->version);
4512 const unsigned char *template;
4513 ssize_t template_len;
4514 if ((major < 2) ||
4515 (major == 2 && minor == 0)) {
4516 return 3; // No EOF support in cram versions less than 2.1
4517 } else if (major == 2 && minor == 1) {
4518 template = TEMPLATE_2_1;
4519 template_len = sizeof TEMPLATE_2_1;
4520 } else {
4521 template = TEMPLATE_3;
4522 template_len = sizeof TEMPLATE_3;
4525 off_t offset = htell(fd->fp);
4526 if (hseek(fd->fp, -template_len, SEEK_END) < 0) {
4527 if (errno == ESPIPE) {
4528 hclearerr(fd->fp);
4529 return 2;
4531 else {
4532 return -1;
4535 if (hread(fd->fp, buf, template_len) != template_len) return -1;
4536 if (hseek(fd->fp, offset, SEEK_SET) < 0) return -1;
4537 buf[8] &= 0x0f;
4538 return (memcmp(template, buf, template_len) == 0)? 1 : 0;