2 Copyright (c) 2013 Genome Research Ltd.
3 Author: James Bonfield <jkb@sanger.ac.uk>
5 Redistribution and use in source and binary forms, with or without
6 modification, are permitted provided that the following conditions are met:
8 1. Redistributions of source code must retain the above copyright notice,
9 this list of conditions and the following disclaimer.
11 2. Redistributions in binary form must reproduce the above copyright notice,
12 this list of conditions and the following disclaimer in the documentation
13 and/or other materials provided with the distribution.
15 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
16 Institute nor the names of its contributors may be used to endorse or promote
17 products derived from this software without specific prior written permission.
19 THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
20 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22 DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
23 FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 #include "cram/sam_header.h"
37 #include "cram/string_alloc.h"
39 static void sam_hdr_error(char *msg
, char *line
, int len
, int lno
) {
42 for (j
= 0; j
< len
&& line
[j
] != '\n'; j
++)
44 fprintf(stderr
, "%s at line %d: \"%.*s\"\n", msg
, lno
, j
, line
);
47 void sam_hdr_dump(SAM_hdr
*hdr
) {
51 printf("===DUMP===\n");
52 for (k
= kh_begin(hdr
->h
); k
!= kh_end(hdr
->h
); k
++) {
53 SAM_hdr_type
*t1
, *t2
;
56 if (!kh_exist(hdr
->h
, k
))
59 t1
= t2
= kh_val(hdr
->h
, k
);
60 c
[0] = kh_key(hdr
->h
, k
)>>8;
61 c
[1] = kh_key(hdr
->h
, k
)&0xff;
62 printf("Type %.2s, count %d\n", c
, t1
->prev
->order
+1);
66 printf(">>>%d ", t1
->order
);
67 for (tag
= t1
->tag
; tag
; tag
=tag
->next
) {
68 printf("\"%.2s\":\"%.*s\"\t",
69 tag
->str
, tag
->len
-3, tag
->str
+3);
76 /* Dump out PG chains */
77 printf("\n@PG chains:\n");
78 for (i
= 0; i
< hdr
->npg_end
; i
++) {
81 for (j
= hdr
->pg_end
[i
]; j
!= -1; j
= hdr
->pg
[j
].prev_id
) {
83 j
== hdr
->pg_end
[i
] ? " " : "->",
84 j
, hdr
->pg
[j
].name_len
, hdr
->pg
[j
].name
);
89 puts("===END DUMP===");
92 /* Updates the hash tables in the SAM_hdr structure.
94 * Returns 0 on success;
97 static int sam_hdr_update_hashes(SAM_hdr
*sh
,
99 SAM_hdr_type
*h_type
) {
100 /* Add to reference hash? */
101 if ((type
>>8) == 'S' && (type
&0xff) == 'Q') {
106 new_ref
= realloc(sh
->ref
, (sh
->nref
+1)*sizeof(*sh
->ref
));
112 sh
->ref
[nref
].name
= NULL
;
113 sh
->ref
[nref
].len
= 0;
114 sh
->ref
[nref
].ty
= h_type
;
115 sh
->ref
[nref
].tag
= tag
;
118 if (tag
->str
[0] == 'S' && tag
->str
[1] == 'N') {
119 if (!(sh
->ref
[nref
].name
= malloc(tag
->len
)))
121 strncpy(sh
->ref
[nref
].name
, tag
->str
+3, tag
->len
-3);
122 sh
->ref
[nref
].name
[tag
->len
-3] = 0;
123 } else if (tag
->str
[0] == 'L' && tag
->str
[1] == 'N') {
124 sh
->ref
[nref
].len
= atoi(tag
->str
+3);
129 if (sh
->ref
[nref
].name
) {
132 k
= kh_put(m_s2i
, sh
->ref_hash
, sh
->ref
[nref
].name
, &r
);
133 if (-1 == r
) return -1;
134 kh_val(sh
->ref_hash
, k
) = nref
;
136 return -1; // SN should be present, according to spec.
142 /* Add to read-group hash? */
143 if ((type
>>8) == 'R' && (type
&0xff) == 'G') {
148 new_rg
= realloc(sh
->rg
, (sh
->nrg
+1)*sizeof(*sh
->rg
));
154 sh
->rg
[nrg
].name
= NULL
;
155 sh
->rg
[nrg
].name_len
= 0;
156 sh
->rg
[nrg
].ty
= h_type
;
157 sh
->rg
[nrg
].tag
= tag
;
158 sh
->rg
[nrg
].id
= nrg
;
161 if (tag
->str
[0] == 'I' && tag
->str
[1] == 'D') {
162 if (!(sh
->rg
[nrg
].name
= malloc(tag
->len
)))
164 strncpy(sh
->rg
[nrg
].name
, tag
->str
+3, tag
->len
-3);
165 sh
->rg
[nrg
].name
[tag
->len
-3] = 0;
166 sh
->rg
[nrg
].name_len
= strlen(sh
->rg
[nrg
].name
);
171 if (sh
->rg
[nrg
].name
) {
174 k
= kh_put(m_s2i
, sh
->rg_hash
, sh
->rg
[nrg
].name
, &r
);
175 if (-1 == r
) return -1;
176 kh_val(sh
->rg_hash
, k
) = nrg
;
178 return -1; // ID should be present, according to spec.
184 /* Add to program hash? */
185 if ((type
>>8) == 'P' && (type
&0xff) == 'G') {
190 new_pg
= realloc(sh
->pg
, (sh
->npg
+1)*sizeof(*sh
->pg
));
196 sh
->pg
[npg
].name
= NULL
;
197 sh
->pg
[npg
].name_len
= 0;
198 sh
->pg
[npg
].ty
= h_type
;
199 sh
->pg
[npg
].tag
= tag
;
200 sh
->pg
[npg
].id
= npg
;
201 sh
->pg
[npg
].prev_id
= -1;
204 if (tag
->str
[0] == 'I' && tag
->str
[1] == 'D') {
205 if (!(sh
->pg
[npg
].name
= malloc(tag
->len
)))
207 strncpy(sh
->pg
[npg
].name
, tag
->str
+3, tag
->len
-3);
208 sh
->pg
[npg
].name
[tag
->len
-3] = 0;
209 sh
->pg
[npg
].name_len
= strlen(sh
->pg
[npg
].name
);
210 } else if (tag
->str
[0] == 'P' && tag
->str
[1] == 'P') {
211 // Resolve later if needed
213 char tmp
= tag
->str
[tag
->len
]; tag
->str
[tag
->len
] = 0;
214 k
= kh_get(m_s2i
, sh
->pg_hash
, tag
->str
+3);
215 tag
->str
[tag
->len
] = tmp
;
217 if (k
!= kh_end(sh
->pg_hash
)) {
218 int p_id
= kh_val(sh
->pg_hash
, k
);
219 sh
->pg
[npg
].prev_id
= sh
->pg
[p_id
].id
;
221 /* Unmark previous entry as a PG termination */
222 if (sh
->npg_end
> 0 &&
223 sh
->pg_end
[sh
->npg_end
-1] == p_id
) {
227 for (i
= 0; i
< sh
->npg_end
; i
++) {
228 if (sh
->pg_end
[i
] == p_id
) {
229 memmove(&sh
->pg_end
[i
], &sh
->pg_end
[i
+1],
230 (sh
->npg_end
-i
-1)*sizeof(*sh
->pg_end
));
236 sh
->pg
[npg
].prev_id
= -1;
242 if (sh
->pg
[npg
].name
) {
245 k
= kh_put(m_s2i
, sh
->pg_hash
, sh
->pg
[npg
].name
, &r
);
246 if (-1 == r
) return -1;
247 kh_val(sh
->pg_hash
, k
) = npg
;
249 return -1; // ID should be present, according to spec.
252 /* Add to npg_end[] array. Remove later if we find a PP line */
253 if (sh
->npg_end
>= sh
->npg_end_alloc
) {
255 int new_alloc
= sh
->npg_end_alloc
? sh
->npg_end_alloc
*2 : 4;
257 new_pg_end
= realloc(sh
->pg_end
, new_alloc
* sizeof(int));
260 sh
->npg_end_alloc
= new_alloc
;
261 sh
->pg_end
= new_pg_end
;
263 sh
->pg_end
[sh
->npg_end
++] = npg
;
272 * Appends a formatted line to an existing SAM header.
273 * Line is a full SAM header record, eg "@SQ\tSN:foo\tLN:100", with
274 * optional new-line. If it contains more than 1 line then multiple lines
275 * will be added in order.
277 * Input text is of maximum length len or as terminated earlier by a NUL.
278 * Len may be 0 if unknown, in which case lines must be NUL-terminated.
280 * Returns 0 on success
283 int sam_hdr_add_lines(SAM_hdr
*sh
, const char *lines
, int len
) {
284 int i
, lno
, text_offset
;
290 text_offset
= ks_len(&sh
->text
);
291 if (EOF
== kputsn(lines
, len
, &sh
->text
))
293 hdr
= ks_str(&sh
->text
) + text_offset
;
295 for (i
= 0, lno
= 1; i
< len
&& hdr
[i
] != '\0'; i
++, lno
++) {
299 int l_start
= i
, new;
300 SAM_hdr_type
*h_type
;
301 SAM_hdr_tag
*h_tag
, *last
;
305 for (j
= i
; j
< len
&& hdr
[j
] != '\0' && hdr
[j
] != '\n'; j
++)
307 sam_hdr_error("Header line does not start with '@'",
308 &hdr
[l_start
], len
- l_start
, lno
);
312 type
= (hdr
[i
+1]<<8) | hdr
[i
+2];
313 if (hdr
[i
+1] < 'A' || hdr
[i
+1] > 'z' ||
314 hdr
[i
+2] < 'A' || hdr
[i
+2] > 'z') {
315 sam_hdr_error("Header line does not have a two character key",
316 &hdr
[l_start
], len
- l_start
, lno
);
324 // Add the header line type
325 if (!(h_type
= pool_alloc(sh
->type_pool
)))
327 if (-1 == (k
= kh_put(sam_hdr
, sh
->h
, type
, &new)))
330 // Form the ring, either with self or other lines of this type
332 SAM_hdr_type
*t
= kh_val(sh
->h
, k
), *p
;
335 assert(p
->next
== t
);
341 h_type
->order
= p
->order
+1;
343 kh_val(sh
->h
, k
) = h_type
;
344 h_type
->prev
= h_type
->next
= h_type
;
348 // Parse the tags on this line
350 if ((type
>>8) == 'C' && (type
&0xff) == 'O') {
352 if (hdr
[i
] != '\t') {
353 sam_hdr_error("Missing tab",
354 &hdr
[l_start
], len
- l_start
, lno
);
358 for (j
= ++i
; j
< len
&& hdr
[j
] != '\0' && hdr
[j
] != '\n'; j
++)
361 if (!(h_type
->tag
= h_tag
= pool_alloc(sh
->tag_pool
)))
363 h_tag
->str
= string_ndup(sh
->str_pool
, &hdr
[i
], j
-i
);
374 if (hdr
[i
] != '\t') {
375 sam_hdr_error("Missing tab",
376 &hdr
[l_start
], len
- l_start
, lno
);
380 for (j
= ++i
; j
< len
&& hdr
[j
] != '\0' && hdr
[j
] != '\n' && hdr
[j
] != '\t'; j
++)
383 if (!(h_tag
= pool_alloc(sh
->tag_pool
)))
385 h_tag
->str
= string_ndup(sh
->str_pool
, &hdr
[i
], j
-i
);
391 if (h_tag
->len
< 3 || h_tag
->str
[2] != ':') {
392 sam_hdr_error("Malformed key:value pair",
393 &hdr
[l_start
], len
- l_start
, lno
);
404 } while (i
< len
&& hdr
[i
] != '\0' && hdr
[i
] != '\n');
407 /* Update RG/SQ hashes */
408 if (-1 == sam_hdr_update_hashes(sh
, type
, h_type
))
416 * Adds a single line to a SAM header.
417 * Specify type and one or more key,value pairs, ending with the NULL key.
418 * Eg. sam_hdr_add(h, "SQ", "ID", "foo", "LN", "100", NULL).
420 * Returns index for specific entry on success (eg 2nd SQ, 4th RG)
423 int sam_hdr_add(SAM_hdr
*sh
, const char *type
, ...) {
425 va_start(args
, type
);
426 return sam_hdr_vadd(sh
, type
, args
, NULL
);
430 * sam_hdr_add with a va_list interface.
432 * Note: this function invokes va_arg at least once, making the value
433 * of ap indeterminate after the return. The caller should call
434 * va_start/va_end before/after calling this function or use va_copy.
436 int sam_hdr_vadd(SAM_hdr
*sh
, const char *type
, va_list ap
, ...) {
438 SAM_hdr_type
*h_type
;
439 SAM_hdr_tag
*h_tag
, *last
;
441 khint32_t type_i
= (type
[0]<<8) | type
[1], k
;
443 if (EOF
== kputc_('@', &sh
->text
))
445 if (EOF
== kputsn(type
, 2, &sh
->text
))
448 if (!(h_type
= pool_alloc(sh
->type_pool
)))
450 if (-1 == (k
= kh_put(sam_hdr
, sh
->h
, type_i
, &new)))
453 // Form the ring, either with self or other lines of this type
455 SAM_hdr_type
*t
= kh_val(sh
->h
, k
), *p
;
458 assert(p
->next
== t
);
464 h_type
->order
= p
->order
+ 1;
466 kh_val(sh
->h
, k
) = h_type
;
467 h_type
->prev
= h_type
->next
= h_type
;
479 if (!(k
= (char *)va_arg(args
, char *)))
481 v
= va_arg(args
, char *);
483 if (EOF
== kputc_('\t', &sh
->text
))
486 if (!(h_tag
= pool_alloc(sh
->tag_pool
)))
488 idx
= ks_len(&sh
->text
);
490 if (EOF
== kputs(k
, &sh
->text
))
492 if (EOF
== kputc_(':', &sh
->text
))
494 if (EOF
== kputs(v
, &sh
->text
))
497 h_tag
->len
= ks_len(&sh
->text
) - idx
;
498 h_tag
->str
= string_ndup(sh
->str_pool
,
499 ks_str(&sh
->text
) + idx
,
514 // Plus the specified va_list params
519 if (!(k
= (char *)va_arg(ap
, char *)))
521 v
= va_arg(ap
, char *);
523 if (EOF
== kputc_('\t', &sh
->text
))
526 if (!(h_tag
= pool_alloc(sh
->tag_pool
)))
528 idx
= ks_len(&sh
->text
);
530 if (EOF
== kputs(k
, &sh
->text
))
532 if (EOF
== kputc_(':', &sh
->text
))
534 if (EOF
== kputs(v
, &sh
->text
))
537 h_tag
->len
= ks_len(&sh
->text
) - idx
;
538 h_tag
->str
= string_ndup(sh
->str_pool
,
539 ks_str(&sh
->text
) + idx
,
554 if (EOF
== kputc('\n', &sh
->text
))
557 int itype
= (type
[0]<<8) | type
[1];
558 if (-1 == sam_hdr_update_hashes(sh
, itype
, h_type
))
561 return h_type
->order
;
565 * Returns the first header item matching 'type'. If ID is non-NULL it checks
566 * for the tag ID: and compares against the specified ID.
568 * Returns NULL if no type/ID is found
570 SAM_hdr_type
*sam_hdr_find(SAM_hdr
*hdr
, char *type
,
571 char *ID_key
, char *ID_value
) {
572 SAM_hdr_type
*t1
, *t2
;
573 int itype
= (type
[0]<<8)|(type
[1]);
576 /* Special case for types we have prebuilt hashes on */
578 if (type
[0] == 'S' && type
[1] == 'Q' &&
579 ID_key
[0] == 'S' && ID_key
[1] == 'N') {
580 k
= kh_get(m_s2i
, hdr
->ref_hash
, ID_value
);
581 return k
!= kh_end(hdr
->ref_hash
)
582 ? hdr
->ref
[kh_val(hdr
->ref_hash
, k
)].ty
586 if (type
[0] == 'R' && type
[1] == 'G' &&
587 ID_key
[0] == 'I' && ID_key
[1] == 'D') {
588 k
= kh_get(m_s2i
, hdr
->rg_hash
, ID_value
);
589 return k
!= kh_end(hdr
->rg_hash
)
590 ? hdr
->rg
[kh_val(hdr
->rg_hash
, k
)].ty
594 if (type
[0] == 'P' && type
[1] == 'G' &&
595 ID_key
[0] == 'I' && ID_key
[1] == 'D') {
596 k
= kh_get(m_s2i
, hdr
->pg_hash
, ID_value
);
597 return k
!= kh_end(hdr
->pg_hash
)
598 ? hdr
->pg
[kh_val(hdr
->pg_hash
, k
)].ty
603 k
= kh_get(sam_hdr
, hdr
->h
, itype
);
604 if (k
== kh_end(hdr
->h
))
608 return kh_val(hdr
->h
, k
);
610 t1
= t2
= kh_val(hdr
->h
, k
);
613 for (tag
= t1
->tag
; tag
; tag
= tag
->next
) {
614 if (tag
->str
[0] == ID_key
[0] && tag
->str
[1] == ID_key
[1]) {
615 char *cp1
= tag
->str
+3;
616 char *cp2
= ID_value
;
617 while (*cp1
&& *cp1
== *cp2
)
631 * As per SAM_hdr_type, but returns a complete line of formatted text
632 * for a specific head type/ID combination. If ID is NULL then it returns
633 * the first line of the specified type.
635 * The returned string is malloced and should be freed by the calling
636 * function with free().
638 * Returns NULL if no type/ID is found.
640 char *sam_hdr_find_line(SAM_hdr
*hdr
, char *type
,
641 char *ID_key
, char *ID_value
) {
642 SAM_hdr_type
*ty
= sam_hdr_find(hdr
, type
, ID_key
, ID_value
);
643 kstring_t ks
= KS_INITIALIZER
;
650 // Paste together the line from the hashed copy
651 r
|= (kputc_('@', &ks
) == EOF
);
652 r
|= (kputs(type
, &ks
) == EOF
);
653 for (tag
= ty
->tag
; tag
; tag
= tag
->next
) {
654 r
|= (kputc_('\t', &ks
) == EOF
);
655 r
|= (kputsn(tag
->str
, tag
->len
, &ks
) == EOF
);
668 * Looks for a specific key in a single sam header line.
669 * If prev is non-NULL it also fills this out with the previous tag, to
670 * permit use in key removal. *prev is set to NULL when the tag is the first
671 * key in the list. When a tag isn't found, prev (if non NULL) will be the last
672 * tag in the existing list.
674 * Returns the tag pointer on success
677 SAM_hdr_tag
*sam_hdr_find_key(SAM_hdr
*sh
,
680 SAM_hdr_tag
**prev
) {
681 SAM_hdr_tag
*tag
, *p
= NULL
;
683 for (tag
= type
->tag
; tag
; p
= tag
, tag
= tag
->next
) {
684 if (tag
->str
[0] == key
[0] && tag
->str
[1] == key
[1]) {
699 * Adds or updates tag key,value pairs in a header line.
700 * Eg for adding M5 tags to @SQ lines or updating sort order for the
701 * @HD line (although use the sam_hdr_sort_order() function for
702 * HD manipulation, which is a wrapper around this funuction).
704 * Specify multiple key,value pairs ending in NULL.
706 * Returns 0 on success
709 int sam_hdr_update(SAM_hdr
*hdr
, SAM_hdr_type
*type
, ...) {
717 SAM_hdr_tag
*tag
, *prev
;
719 if (!(k
= (char *)va_arg(ap
, char *)))
721 v
= va_arg(ap
, char *);
723 tag
= sam_hdr_find_key(hdr
, type
, k
, &prev
);
725 if (!(tag
= pool_alloc(hdr
->tag_pool
)))
735 idx
= ks_len(&hdr
->text
);
736 if (ksprintf(&hdr
->text
, "%2.2s:%s", k
, v
) < 0)
738 tag
->len
= ks_len(&hdr
->text
) - idx
;
739 tag
->str
= string_ndup(hdr
->str_pool
,
740 ks_str(&hdr
->text
) + idx
,
751 #define K(a) (((a)[0]<<8)|((a)[1]))
754 * Returns the sort order:
756 enum sam_sort_order
sam_hdr_sort_order(SAM_hdr
*hdr
) {
757 return hdr
->sort_order
;
760 static enum sam_sort_order
sam_hdr_parse_sort_order(SAM_hdr
*hdr
) {
762 enum sam_sort_order so
;
765 k
= kh_get(sam_hdr
, hdr
->h
, K("HD"));
766 if (k
!= kh_end(hdr
->h
)) {
767 SAM_hdr_type
*ty
= kh_val(hdr
->h
, k
);
769 for (tag
= ty
->tag
; tag
; tag
= tag
->next
) {
770 if (tag
->str
[0] == 'S' && tag
->str
[1] == 'O') {
771 if (strcmp(tag
->str
+3, "unsorted") == 0)
773 else if (strcmp(tag
->str
+3, "queryname") == 0)
775 else if (strcmp(tag
->str
+3, "coordinate") == 0)
777 else if (strcmp(tag
->str
+3, "unknown") != 0)
778 fprintf(stderr
, "Unknown sort order field: %s\n",
789 * Reconstructs the kstring from the header hash table.
790 * Returns 0 on success
793 int sam_hdr_rebuild(SAM_hdr
*hdr
) {
794 /* Order: HD then others */
795 kstring_t ks
= KS_INITIALIZER
;
799 k
= kh_get(sam_hdr
, hdr
->h
, K("HD"));
800 if (k
!= kh_end(hdr
->h
)) {
801 SAM_hdr_type
*ty
= kh_val(hdr
->h
, k
);
803 if (EOF
== kputs("@HD", &ks
))
805 for (tag
= ty
->tag
; tag
; tag
= tag
->next
) {
806 if (EOF
== kputc_('\t', &ks
))
808 if (EOF
== kputsn_(tag
->str
, tag
->len
, &ks
))
811 if (EOF
== kputc('\n', &ks
))
815 for (k
= kh_begin(hdr
->h
); k
!= kh_end(hdr
->h
); k
++) {
816 SAM_hdr_type
*t1
, *t2
;
818 if (!kh_exist(hdr
->h
, k
))
821 if (kh_key(hdr
->h
, k
) == K("HD"))
824 t1
= t2
= kh_val(hdr
->h
, k
);
829 if (EOF
== kputc_('@', &ks
))
831 c
[0] = kh_key(hdr
->h
, k
)>>8;
832 c
[1] = kh_key(hdr
->h
, k
)&0xff;
833 if (EOF
== kputsn_(c
, 2, &ks
))
835 for (tag
= t1
->tag
; tag
; tag
=tag
->next
) {
836 if (EOF
== kputc_('\t', &ks
))
838 if (EOF
== kputsn_(tag
->str
, tag
->len
, &ks
))
841 if (EOF
== kputc('\n', &ks
))
847 if (ks_str(&hdr
->text
))
857 * Creates an empty SAM header, ready to be populated.
859 * Returns a SAM_hdr struct on success (free with sam_hdr_free())
862 SAM_hdr
*sam_hdr_new() {
863 SAM_hdr
*sh
= calloc(1, sizeof(*sh
));
868 sh
->h
= kh_init(sam_hdr
);
877 if (!(sh
->ref_hash
= kh_init(m_s2i
)))
882 if (!(sh
->rg_hash
= kh_init(m_s2i
)))
887 sh
->npg_end
= sh
->npg_end_alloc
= 0;
889 if (!(sh
->pg_hash
= kh_init(m_s2i
)))
894 if (!(sh
->tag_pool
= pool_create(sizeof(SAM_hdr_tag
))))
897 if (!(sh
->type_pool
= pool_create(sizeof(SAM_hdr_type
))))
900 if (!(sh
->str_pool
= string_pool_create(8192)))
907 kh_destroy(sam_hdr
, sh
->h
);
910 pool_destroy(sh
->tag_pool
);
913 pool_destroy(sh
->type_pool
);
916 string_pool_destroy(sh
->str_pool
);
925 * Tokenises a SAM header into a hash table.
926 * Also extracts a few bits on specific data types, such as @RG lines.
928 * Returns a SAM_hdr struct on success (free with sam_hdr_free())
931 SAM_hdr
*sam_hdr_parse_(const char *hdr
, int len
) {
932 /* Make an empty SAM_hdr */
936 if (NULL
== sh
) return NULL
;
938 if (NULL
== hdr
) return sh
; // empty header is permitted
940 /* Parse the header, line by line */
941 if (-1 == sam_hdr_add_lines(sh
, hdr
, len
)) {
946 /* Obtain sort order */
947 sh
->sort_order
= sam_hdr_parse_sort_order(sh
);
950 //sam_hdr_add(sh, "RG", "ID", "foo", "SM", "bar", NULL);
951 //sam_hdr_rebuild(sh);
952 //printf(">>%s<<", ks_str(sh->text));
954 //parse_references(sh);
955 //parse_read_groups(sh);
964 * Produces a duplicate copy of hdr and returns it.
965 * Returns NULL on failure
967 SAM_hdr
*sam_hdr_dup(SAM_hdr
*hdr
) {
968 if (-1 == sam_hdr_rebuild(hdr
))
971 return sam_hdr_parse_(sam_hdr_str(hdr
), sam_hdr_length(hdr
));
974 /*! Increments a reference count on hdr.
976 * This permits multiple files to share the same header, all calling
977 * sam_hdr_free when done, without causing errors for other open files.
979 void sam_hdr_incr_ref(SAM_hdr
*hdr
) {
983 /*! Increments a reference count on hdr.
985 * This permits multiple files to share the same header, all calling
986 * sam_hdr_free when done, without causing errors for other open files.
988 * If the reference count hits zero then the header is automatically
989 * freed. This makes it a synonym for sam_hdr_free().
991 void sam_hdr_decr_ref(SAM_hdr
*hdr
) {
995 /*! Deallocates all storage used by a SAM_hdr struct.
997 * This also decrements the header reference count. If after decrementing
998 * it is still non-zero then the header is assumed to be in use by another
999 * caller and the free is not done.
1001 * This is a synonym for sam_hdr_dec_ref().
1003 void sam_hdr_free(SAM_hdr
*hdr
) {
1007 if (--hdr
->ref_count
> 0)
1010 if (ks_str(&hdr
->text
))
1011 KS_FREE(&hdr
->text
);
1014 kh_destroy(sam_hdr
, hdr
->h
);
1017 kh_destroy(m_s2i
, hdr
->ref_hash
);
1021 for (i
= 0; i
< hdr
->nref
; i
++)
1022 if (hdr
->ref
[i
].name
)
1023 free(hdr
->ref
[i
].name
);
1028 kh_destroy(m_s2i
, hdr
->rg_hash
);
1032 for (i
= 0; i
< hdr
->nrg
; i
++)
1033 if (hdr
->rg
[i
].name
)
1034 free(hdr
->rg
[i
].name
);
1039 kh_destroy(m_s2i
, hdr
->pg_hash
);
1043 for (i
= 0; i
< hdr
->npg
; i
++)
1044 if (hdr
->pg
[i
].name
)
1045 free(hdr
->pg
[i
].name
);
1053 pool_destroy(hdr
->type_pool
);
1056 pool_destroy(hdr
->tag_pool
);
1059 string_pool_destroy(hdr
->str_pool
);
1064 int sam_hdr_length(SAM_hdr
*hdr
) {
1065 return ks_len(&hdr
->text
);
1068 char *sam_hdr_str(SAM_hdr
*hdr
) {
1069 return ks_str(&hdr
->text
);
1073 * Looks up a reference sequence by name and returns the numerical ID.
1074 * Returns -1 if unknown reference.
1076 int sam_hdr_name2ref(SAM_hdr
*hdr
, const char *ref
) {
1077 khint_t k
= kh_get(m_s2i
, hdr
->ref_hash
, ref
);
1078 return k
== kh_end(hdr
->ref_hash
) ? -1 : kh_val(hdr
->ref_hash
, k
);
1082 * Looks up a read-group by name and returns a pointer to the start of the
1083 * associated tag list.
1085 * Returns NULL on failure
1087 SAM_RG
*sam_hdr_find_rg(SAM_hdr
*hdr
, const char *rg
) {
1088 khint_t k
= kh_get(m_s2i
, hdr
->rg_hash
, rg
);
1089 return k
== kh_end(hdr
->rg_hash
)
1091 : &hdr
->rg
[kh_val(hdr
->rg_hash
, k
)];
1096 * Fixes any PP links in @PG headers.
1097 * If the entries are in order then this doesn't need doing, but incase
1098 * our header is out of order this goes through the sh->pg[] array
1099 * setting the prev_id field.
1101 * Note we can have multiple complete chains. This code should identify the
1102 * tails of these chains as these are the entries we have to link to in
1103 * subsequent PP records.
1105 * Returns 0 on sucess
1106 * -1 on failure (indicating broken PG/PP records)
1108 int sam_hdr_link_pg(SAM_hdr
*hdr
) {
1111 hdr
->npg_end_alloc
= hdr
->npg
;
1112 hdr
->pg_end
= realloc(hdr
->pg_end
, hdr
->npg
* sizeof(*hdr
->pg_end
));
1116 for (i
= 0; i
< hdr
->npg
; i
++)
1119 for (i
= 0; i
< hdr
->npg
; i
++) {
1124 for (tag
= hdr
->pg
[i
].tag
; tag
; tag
= tag
->next
) {
1125 if (tag
->str
[0] == 'P' && tag
->str
[1] == 'P')
1129 /* Chain start points */
1133 tmp
= tag
->str
[tag
->len
]; tag
->str
[tag
->len
] = 0;
1134 k
= kh_get(m_s2i
, hdr
->pg_hash
, tag
->str
+3);
1135 tag
->str
[tag
->len
] = tmp
;
1137 if (k
== kh_end(hdr
->pg_hash
)) {
1142 hdr
->pg
[i
].prev_id
= hdr
->pg
[kh_val(hdr
->pg_hash
, k
)].id
;
1143 hdr
->pg_end
[kh_val(hdr
->pg_hash
, k
)] = -1;
1146 for (i
= j
= 0; i
< hdr
->npg
; i
++) {
1147 if (hdr
->pg_end
[i
] != -1)
1148 hdr
->pg_end
[j
++] = hdr
->pg_end
[i
];
1156 * Returns a unique ID from a base name.
1158 * The value returned is valid until the next call to
1161 const char *sam_hdr_PG_ID(SAM_hdr
*sh
, const char *name
) {
1162 khint_t k
= kh_get(m_s2i
, sh
->pg_hash
, name
);
1163 if (k
== kh_end(sh
->pg_hash
))
1167 sprintf(sh
->ID_buf
, "%.1000s.%d", name
, sh
->ID_cnt
++);
1168 k
= kh_get(m_s2i
, sh
->pg_hash
, sh
->ID_buf
);
1169 } while (k
!= kh_end(sh
->pg_hash
));
1177 * If we wish complete control over this use sam_hdr_add() directly. This
1178 * function uses that, but attempts to do a lot of tedious house work for
1181 * - It will generate a suitable ID if the supplied one clashes.
1182 * - It will generate multiple @PG records if we have multiple PG chains.
1184 * Call it as per sam_hdr_add() with a series of key,value pairs ending
1187 * Returns 0 on success
1190 int sam_hdr_add_PG(SAM_hdr
*sh
, const char *name
, ...) {
1194 /* Copy ends array to avoid us looping while modifying it */
1195 int *end
= malloc(sh
->npg_end
* sizeof(int));
1196 int i
, nends
= sh
->npg_end
;
1201 memcpy(end
, sh
->pg_end
, nends
* sizeof(*end
));
1203 for (i
= 0; i
< nends
; i
++) {
1204 va_start(args
, name
);
1205 if (-1 == sam_hdr_vadd(sh
, "PG", args
,
1206 "ID", sam_hdr_PG_ID(sh
, name
),
1208 "PP", sh
->pg
[end
[i
]].name
,
1218 va_start(args
, name
);
1219 if (-1 == sam_hdr_vadd(sh
, "PG", args
,
1220 "ID", sam_hdr_PG_ID(sh
, name
),
1233 * A function to help with construction of CL tags in @PG records.
1234 * Takes an argc, argv pair and returns a single space-separated string.
1235 * This string should be deallocated by the calling function.
1237 * Returns malloced char * on success
1240 char *stringify_argv(int argc
, char *argv
[]) {
1246 for (i
= 0; i
< argc
; i
++) {
1247 if (i
> 0) nbytes
+= 1;
1248 nbytes
+= strlen(argv
[i
]);
1250 if (!(str
= malloc(nbytes
)))
1255 for (i
= 0; i
< argc
; i
++) {
1256 if (i
> 0) *cp
++ = ' ';
1258 while (argv
[i
][j
]) {
1259 if (argv
[i
][j
] == '\t')