2 Copyright (c) 2012-2013 Genome Research Ltd.
3 Author: James Bonfield <jkb@sanger.ac.uk>
5 Redistribution and use in source and binary forms, with or without
6 modification, are permitted provided that the following conditions are met:
8 1. Redistributions of source code must retain the above copyright notice,
9 this list of conditions and the following disclaimer.
11 2. Redistributions in binary form must reproduce the above copyright notice,
12 this list of conditions and the following disclaimer in the documentation
13 and/or other materials provided with the distribution.
15 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
16 Institute nor the names of its contributors may be used to endorse or promote
17 products derived from this software without specific prior written permission.
19 THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
20 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22 DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
23 FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 #ifndef _CRAM_STRUCTS_H_
32 #define _CRAM_STRUCTS_H_
35 * Defines in-memory structs for the basic file-format objects in the
38 * The basic file format is:
39 * File-def SAM-hdr Container Container ...
42 * Service-block data-block data-block ...
44 * Multiple blocks in a container are grouped together as slices,
45 * also sometimes referred to as landmarks in the spec.
51 #include <sys/types.h>
53 #include "htslib/thread_pool.h"
54 #include "cram/string_alloc.h"
55 #include "cram/mFILE.h"
56 #include "htslib/khash.h"
62 // Generic hash-map integer -> integer
63 KHASH_MAP_INIT_INT(m_i2i
, int)
65 // Generic hash-set integer -> (existance)
66 KHASH_SET_INIT_INT(s_i2i
)
69 typedef unsigned char uc
;
72 * A union for the preservation map. Required for khash.
79 // Generates static functions here which isn't ideal, but we have no way
80 // currently to declare the kh_map_t structure here without also declaring a
81 // duplicate in the .c files due to the nature of the KHASH macros.
82 KHASH_MAP_INIT_STR(map
, pmap_t
)
86 #define SEQS_PER_SLICE 10000
87 #define BASES_PER_SLICE (SEQS_PER_SLICE*500)
88 #define SLICE_PER_CNT 1
90 #define CRAM_SUBST_MATRIX "CGTNAGTNACTNACGNACGT"
92 #define MAX_STAT_VAL 1024
93 //#define MAX_STAT_VAL 16
94 typedef struct cram_stats
{
95 int freqs
[MAX_STAT_VAL
];
97 int nsamp
; // total number of values added
98 int nvals
; // total number of unique values added
101 /* NB: matches java impl, not the spec */
107 E_BYTE_ARRAY_LEN
= 4,
108 E_BYTE_ARRAY_STOP
= 5,
113 E_NUM_CODECS
= 10, /* Number of codecs, not a real one. */
116 enum cram_external_type
{
121 E_BYTE_ARRAY_BLOCK
= 5,
124 /* External IDs used by this implementation (only assumed during writing) */
127 DS_aux
= 1, // aux_blk
132 DS_aux_FZ
= 6, // also ZM:B
133 DS_aux_oq
= 7, // other qualities
134 DS_aux_os
= 8, // other sequences
135 DS_aux_oz
= 9, // other strings
174 DS_TC
, // CRAM v1.0 tags
181 /* "File Definition Structure" */
182 typedef struct cram_file_def
{
184 uint8_t major_version
;
185 uint8_t minor_version
;
186 char file_id
[20]; // Filename or SHA1 checksum
189 #define CRAM_MAJOR_VERS(v) ((v) >> 8)
190 #define CRAM_MINOR_VERS(v) ((v) & 0xff)
194 enum cram_block_method
{
200 RANS
= 4, // Generic; either order
202 RANS1
= 10, // Not externalised; stored as RANS (generic)
203 GZIP_RLE
= 11, // NB: not externalised in CRAM
206 enum cram_content_type
{
209 COMPRESSION_HEADER
= 1,
211 UNMAPPED_SLICE
= 3, // CRAM V1.0 only
216 /* Compression metrics */
218 // number of trials and time to next trial
222 // aggregate sizes during trials
230 // resultant method from trials
234 // Revisions of method, to allow culling of continually failing ones.
251 // Hash aux key (XX:i) to cram_metrics
252 KHASH_MAP_INIT_INT(m_metrics
, cram_metrics
*)
256 typedef struct cram_block
{
257 enum cram_block_method method
, orig_method
;
258 enum cram_content_type content_type
;
263 int32_t idx
; /* offset into data */
271 // To aid compression
272 cram_metrics
*m
; // used to track aux block compression only
275 struct cram_codec
; /* defined in cram_codecs.h */
278 #define CRAM_MAP_HASH 32
279 #define CRAM_MAP(a,b) (((a)*3+(b))&(CRAM_MAP_HASH-1))
281 /* Compression header block */
282 typedef struct cram_block_compression_hdr
{
284 int32_t ref_seq_start
;
285 int32_t ref_seq_span
;
287 int32_t num_landmarks
;
290 /* Flags from preservation map */
291 int mapped_qs_included
;
292 int unmapped_qs_included
;
295 int read_names_included
;
297 // indexed by ref-base and subst. code
298 char substitution_matrix
[5][4];
300 // TD Dictionary as a concatenated block
301 cram_block
*TD_blk
; // Tag Dictionary
302 int nTL
; // number of TL entries in TD
303 unsigned char **TL
; // array of size nTL, pointer into TD_blk.
304 khash_t(m_s2i
) *TD_hash
; // Keyed on TD strings, map to TL[] indices
305 string_alloc_t
*TD_keys
; // Pooled keys for TD hash.
307 khash_t(map
) *preservation_map
;
308 struct cram_map
*rec_encoding_map
[CRAM_MAP_HASH
];
309 struct cram_map
*tag_encoding_map
[CRAM_MAP_HASH
];
311 struct cram_codec
*codecs
[DS_END
];
313 char *uncomp
; // A single block of uncompressed data
314 size_t uncomp_size
, uncomp_alloc
;
316 unsigned int data_series
; // See cram_fields enum below
317 } cram_block_compression_hdr
;
319 typedef struct cram_map
{
320 int key
; /* 0xe0 + 3 bytes */
321 enum cram_encoding encoding
;
322 int offset
; /* Offset into a single block of memory */
324 struct cram_codec
*codec
;
325 struct cram_map
*next
; // for noddy internal hash
328 typedef struct cram_tag_map
{
329 struct cram_codec
*codec
;
334 // Hash aux key (XX:i) to cram_tag_map
335 KHASH_MAP_INIT_INT(m_tagmap
, cram_tag_map
*)
337 /* Mapped or unmapped slice header block */
338 typedef struct cram_block_slice_hdr
{
339 enum cram_content_type content_type
;
340 int32_t ref_seq_id
; /* if content_type == MAPPED_SLICE */
341 int32_t ref_seq_start
; /* if content_type == MAPPED_SLICE */
342 int32_t ref_seq_span
; /* if content_type == MAPPED_SLICE */
344 int64_t record_counter
;
346 int32_t num_content_ids
;
347 int32_t *block_content_ids
;
348 int32_t ref_base_id
; /* if content_type == MAPPED_SLICE */
349 unsigned char md5
[16];
350 } cram_block_slice_hdr
;
357 * Conceptually a container is split into slices, and slices into blocks.
358 * However on disk it's just a list of blocks and we need to query the
359 * block types to identify the start/end points of the slices.
361 * OR... are landmarks the start/end points of slices?
363 typedef struct cram_container
{
366 int32_t ref_seq_start
;
367 int32_t ref_seq_span
;
368 int64_t record_counter
;
372 int32_t num_landmarks
;
375 /* Size of container header above */
378 /* Compression header is always the first block? */
379 cram_block_compression_hdr
*comp_hdr
;
380 cram_block
*comp_hdr_block
;
382 /* For construction purposes */
383 int max_slice
, curr_slice
; // maximum number of slices
384 int max_rec
, curr_rec
; // current and max recs per slice
385 int max_c_rec
, curr_c_rec
; // current and max recs per container
386 int slice_rec
; // rec no. for start of this slice
387 int curr_ref
; // current ref ID. -2 for no previous
388 int last_pos
; // last record position
389 struct cram_slice
**slices
, *slice
;
390 int pos_sorted
; // boolean, 1=>position sorted data
391 int max_apos
; // maximum position, used if pos_sorted==0
392 int last_slice
; // number of reads in last slice (0 for 1st)
393 int multi_seq
; // true if packing multi seqs per cont/slice
394 int unsorted
; // true is AP_delta is 0.
396 /* Copied from fd before encoding, to allow multi-threading */
397 int ref_start
, first_base
, last_base
, ref_id
, ref_end
;
399 //struct ref_entry *ref;
401 /* For multi-threading */
404 /* Statistics for encoding */
405 cram_stats
*stats
[DS_END
];
407 khash_t(m_tagmap
) *tags_used
; // set of tag types in use, for tag encoding map
408 int *refs_used
; // array of frequency of ref seq IDs
410 uint32_t crc32
; // CRC32
412 uint64_t s_num_bases
; // number of bases in this slice
416 * A single cram record
418 typedef struct cram_record
{
419 struct cram_slice
*s
; // Filled out by cram_decode only
421 int32_t ref_id
; // fixed for all recs in slice?
423 int32_t cram_flags
; // CF
427 int32_t name
; // RN; idx to s->names_blk
429 int32_t mate_line
; // index to another cram_record
431 int32_t mate_pos
; // NP
436 int32_t aux
; // idx to s->aux_blk
437 int32_t aux_size
; // total size of packed ntags in aux_blk
439 int32_t TN_idx
; // TN; idx to s->TN;
441 int32_t tn
; // idx to s->tn_blk
445 int32_t seq
; // idx to s->seqs_blk
446 int32_t qual
; // idx to s->qual_blk
447 int32_t cigar
; // idx to s->cigar
449 int32_t aend
; // alignment end
452 int32_t feature
; // idx to s->feature
453 int32_t nfeature
; // number of features
454 int32_t mate_flags
; // MF
457 // Accessor macros as an analogue of the bam ones
458 #define cram_qname(c) (&(c)->s->name_blk->data[(c)->name])
459 #define cram_seq(c) (&(c)->s->seqs_blk->data[(c)->seq])
460 #define cram_qual(c) (&(c)->s->qual_blk->data[(c)->qual])
461 #define cram_aux(c) (&(c)->s->aux_blk->data[(c)->aux])
462 #define cram_seqi(c,i) (cram_seq((c))[(i)])
463 #define cram_name_len(c) ((c)->name_len)
464 #define cram_strand(c) (((c)->flags & BAM_FREVERSE) != 0)
465 #define cram_mstrand(c) (((c)->flags & BAM_FMREVERSE) != 0)
466 #define cram_cigar(c) (&((cr)->s->cigar)[(c)->cigar])
469 * A feature is a base difference, used for the sequence reference encoding.
470 * (We generate these internally when writing CRAM.)
472 typedef struct cram_feature
{
477 int base
; // substitution code
482 int base
; // actual base & qual
488 int seq_idx
; // index to s->seqs_blk
500 int seq_idx
; // soft-clip multiple bases
506 int seq_idx
; // insertion multiple bases
511 int base
; // insertion single base
537 * A slice is really just a set of blocks, but it
538 * is the logical unit for decoding a number of
541 typedef struct cram_slice
{
542 cram_block_slice_hdr
*hdr
;
543 cram_block
*hdr_block
;
545 cram_block
**block_by_id
;
547 /* State used during encoding/decoding */
548 int last_apos
, max_apos
;
550 /* Array of decoded cram records */
553 /* An dynamically growing buffers for data pointed
554 * to by crecs[] array.
557 uint32_t cigar_alloc
;
560 cram_feature
*features
;
562 int afeatures
; // allocated size of features
565 // TN field (Tag Name)
567 int nTN
, aTN
; // used and allocated size for TN[]
573 // For variable sized elements which are always external blocks.
574 cram_block
*name_blk
;
575 cram_block
*seqs_blk
;
576 cram_block
*qual_blk
;
577 cram_block
*base_blk
;
578 cram_block
*soft_blk
;
579 cram_block
*aux_blk
; // BAM aux block, created while decoding CRAM
581 string_alloc_t
*pair_keys
; // Pooled keys for pair hash.
582 khash_t(m_s2i
) *pair
[2]; // for identifying read-pairs in this slice.
584 char *ref
; // slice of current reference
585 int ref_start
; // start position of current reference;
586 int ref_end
; // end position of current reference;
589 // For going from BAM to CRAM; an array of auxiliary blocks per type
591 cram_block
**aux_block
;
594 /*-----------------------------------------------------------------------------
595 * Consider moving reference handling to cram_refs.[ch]
597 // from fa.fai / samtools faidx files
598 typedef struct ref_entry
{
605 int64_t count
; // for shared references so we know to dealloc seq
608 int is_md5
; // Reference comes from a raw seq found by MD5
611 KHASH_MAP_INIT_STR(refs
, ref_entry
*)
613 // References structure.
615 string_alloc_t
*pool
; // String pool for holding filenames and SN vals
617 khash_t(refs
) *h_meta
; // ref_entry*, index by name
618 ref_entry
**ref_id
; // ref_entry*, index by ID
619 int nref
; // number of ref_entry
621 char *fn
; // current file opened
622 BGZF
*fp
; // and the hFILE* to go with it.
624 int count
; // how many cram_fd sharing this refs struct
626 pthread_mutex_t lock
; // Mutex for multi-threaded updating
627 ref_entry
*last
; // Last queried sequence
628 int last_id
; // Used in cram_ref_decr_locked to delay free
631 /*-----------------------------------------------------------------------------
634 * Detect format by number of entries per line.
635 * 5 => 1.0 (refid, start, nseq, C offset, slice)
636 * 6 => 1.1 (refid, start, span, C offset, S offset, S size)
638 * Indices are stored in a nested containment list, which is trivial to set
639 * up as the indices are on sorted data so we're appending to the nclist
640 * in sorted order. Basically if a slice entirely fits within a previous
641 * slice then we append to that slices list. This is done recursively.
643 * Lists are sorted on two dimensions: ref id + slice coords.
645 typedef struct cram_index
{
646 int nslice
, nalloc
; // total number of slices
647 struct cram_index
*e
; // array of size nslice
649 int refid
; // 1.0 1.1
650 int start
; // 1.0 1.1
652 int nseq
; // 1.0 - undocumented
653 int slice
; // 1.0 landmark index, 1.1 landmark value
654 int len
; // 1.1 - size of slice in bytes
655 int64_t offset
; // 1.0 1.1
664 /*-----------------------------------------------------------------------------
666 /* CRAM File handle */
668 typedef struct spare_bams
{
670 struct spare_bams
*next
;
673 typedef struct cram_fd
{
675 int mode
; // 'r' or 'w'
677 cram_file_def
*file_def
;
681 int64_t record_counter
;
684 // Most recent compression header decoded
685 //cram_block_compression_hdr *comp_hdr;
686 //cram_block_slice_hdr *slice_hdr;
688 // Current container being processed.
691 // positions for encoding or decoding
692 int first_base
, last_base
;
694 // cached reference portion
695 refs_t
*refs
; // ref meta-data structure
696 char *ref
, *ref_free
; // current portion held in memory
700 char *ref_fn
; // reference fasta filename
702 // compression level and metrics
704 cram_metrics
*m
[DS_END
];
705 khash_t(m_metrics
) *tags_used
; // cram_metrics[], per tag types in use.
708 int decode_md
; // Whether to export MD and NM tags
712 int slices_per_container
;
720 unsigned int required_fields
;
723 // lookup tables, stored here so we can be trivially multi-threaded
724 unsigned int bam_flag_swap
[0x1000]; // cram -> bam flags
725 unsigned int cram_flag_swap
[0x1000];// bam -> cram flags
726 unsigned char L1
[256]; // ACGT{*} ->0123{4}
727 unsigned char L2
[256]; // ACGTN{*}->01234{5}
728 char cram_sub_matrix
[32][32]; // base substituion codes
731 cram_index
*index
; // array, sizeof index_sz
732 off_t first_container
;
734 int last_slice
; // number of recs encoded in last slice
737 int empty_container
; // Marker for EOF block
742 hts_tpool_process
*rqueue
;
743 pthread_mutex_t metrics_lock
;
744 pthread_mutex_t ref_lock
;
746 pthread_mutex_t bam_list_lock
;
748 int ooc
; // out of containers.
750 int lossy_read_names
; // boolean
751 int tlen_approx
; // max TLEN calculation offset.
752 int tlen_zero
; // If true, permit tlen 0 (=> tlen calculated)
755 // Translation of required fields to cram data series
757 CRAM_BF
= 0x00000001,
758 CRAM_AP
= 0x00000002,
759 CRAM_FP
= 0x00000004,
760 CRAM_RL
= 0x00000008,
761 CRAM_DL
= 0x00000010,
762 CRAM_NF
= 0x00000020,
763 CRAM_BA
= 0x00000040,
764 CRAM_QS
= 0x00000080,
765 CRAM_FC
= 0x00000100,
766 CRAM_FN
= 0x00000200,
767 CRAM_BS
= 0x00000400,
768 CRAM_IN
= 0x00000800,
769 CRAM_RG
= 0x00001000,
770 CRAM_MQ
= 0x00002000,
771 CRAM_TL
= 0x00004000,
772 CRAM_RN
= 0x00008000,
773 CRAM_NS
= 0x00010000,
774 CRAM_NP
= 0x00020000,
775 CRAM_TS
= 0x00040000,
776 CRAM_MF
= 0x00080000,
777 CRAM_CF
= 0x00100000,
778 CRAM_RI
= 0x00200000,
779 CRAM_RS
= 0x00400000,
780 CRAM_PD
= 0x00800000,
781 CRAM_HC
= 0x01000000,
782 CRAM_SC
= 0x02000000,
783 CRAM_BB
= 0x04000000,
784 CRAM_BB_len
= 0x08000000,
785 CRAM_QQ
= 0x10000000,
786 CRAM_QQ_len
= 0x20000000,
787 CRAM_aux
= 0x40000000,
788 CRAM_ALL
= 0x7fffffff,
791 // A CIGAR opcode, but not necessarily the implications of it. Eg FC/FP may
792 // encode a base difference, but we don't need to know what it is for CIGAR.
793 // If we have a soft-clip or insertion, we do need SC/IN though to know how
794 // long that array is.
795 #define CRAM_CIGAR (CRAM_FN | CRAM_FP | CRAM_FC | CRAM_DL | CRAM_IN | \
796 CRAM_SC | CRAM_HC | CRAM_PD | CRAM_RS | CRAM_RL | CRAM_BF)
798 #define CRAM_SEQ (CRAM_CIGAR | CRAM_BA | CRAM_BS | \
799 CRAM_RL | CRAM_AP | CRAM_BB)
801 #define CRAM_QUAL (CRAM_CIGAR | CRAM_RL | CRAM_AP | CRAM_QS | CRAM_QQ)
804 /* Corrected in 1.1. Use bam_flag_swap[bf] and BAM_* macros for 1.0 & 1.1 */
805 #define CRAM_FPAIRED 256
806 #define CRAM_FPROPER_PAIR 128
807 #define CRAM_FUNMAP 64
808 #define CRAM_FREVERSE 32
809 #define CRAM_FREAD1 16
810 #define CRAM_FREAD2 8
811 #define CRAM_FSECONDARY 4
812 #define CRAM_FQCFAIL 2
815 #define DS_aux_S "\001"
816 #define DS_aux_OQ_S "\002"
817 #define DS_aux_BQ_S "\003"
818 #define DS_aux_BD_S "\004"
819 #define DS_aux_BI_S "\005"
820 #define DS_aux_FZ_S "\006"
821 #define DS_aux_oq_S "\007"
822 #define DS_aux_os_S "\010"
823 #define DS_aux_oz_S "\011"
825 #define CRAM_M_REVERSE 1
826 #define CRAM_M_UNMAP 2
830 #define CRAM_FLAG_PRESERVE_QUAL_SCORES (1<<0)
831 #define CRAM_FLAG_DETACHED (1<<1)
832 #define CRAM_FLAG_MATE_DOWNSTREAM (1<<2)
833 #define CRAM_FLAG_NO_SEQ (1<<3)
834 #define CRAM_FLAG_MASK ((1<<4)-1)
837 #define CRAM_FLAG_STATS_ADDED (1<<30)
838 #define CRAM_FLAG_DISCARD_NAME (1<<31)
844 #endif /* _CRAM_STRUCTS_H_ */