modified: myjupyterlab.sh
[GalaxyCodeBases.git] / c_cpp / lib / htslib / cram / sam_header.c
blobe2cb55d365f3f8d7504c23e31e83ecf1acb9dd94
1 /*
2 Copyright (c) 2013 Genome Research Ltd.
3 Author: James Bonfield <jkb@sanger.ac.uk>
5 Redistribution and use in source and binary forms, with or without
6 modification, are permitted provided that the following conditions are met:
8 1. Redistributions of source code must retain the above copyright notice,
9 this list of conditions and the following disclaimer.
11 2. Redistributions in binary form must reproduce the above copyright notice,
12 this list of conditions and the following disclaimer in the documentation
13 and/or other materials provided with the distribution.
15 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
16 Institute nor the names of its contributors may be used to endorse or promote
17 products derived from this software without specific prior written permission.
19 THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
20 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22 DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
23 FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 #include <config.h>
33 #include <string.h>
34 #include <assert.h>
36 #include "cram/sam_header.h"
37 #include "cram/string_alloc.h"
39 static void sam_hdr_error(char *msg, char *line, int len, int lno) {
40 int j;
42 for (j = 0; j < len && line[j] != '\n'; j++)
44 fprintf(stderr, "%s at line %d: \"%.*s\"\n", msg, lno, j, line);
47 void sam_hdr_dump(SAM_hdr *hdr) {
48 khint_t k;
49 int i;
51 printf("===DUMP===\n");
52 for (k = kh_begin(hdr->h); k != kh_end(hdr->h); k++) {
53 SAM_hdr_type *t1, *t2;
54 char c[2];
56 if (!kh_exist(hdr->h, k))
57 continue;
59 t1 = t2 = kh_val(hdr->h, k);
60 c[0] = kh_key(hdr->h, k)>>8;
61 c[1] = kh_key(hdr->h, k)&0xff;
62 printf("Type %.2s, count %d\n", c, t1->prev->order+1);
64 do {
65 SAM_hdr_tag *tag;
66 printf(">>>%d ", t1->order);
67 for (tag = t1->tag; tag; tag=tag->next) {
68 printf("\"%.2s\":\"%.*s\"\t",
69 tag->str, tag->len-3, tag->str+3);
71 putchar('\n');
72 t1 = t1->next;
73 } while (t1 != t2);
76 /* Dump out PG chains */
77 printf("\n@PG chains:\n");
78 for (i = 0; i < hdr->npg_end; i++) {
79 int j;
80 printf(" %d:", i);
81 for (j = hdr->pg_end[i]; j != -1; j = hdr->pg[j].prev_id) {
82 printf("%s%d(%.*s)",
83 j == hdr->pg_end[i] ? " " : "->",
84 j, hdr->pg[j].name_len, hdr->pg[j].name);
86 printf("\n");
89 puts("===END DUMP===");
92 /* Updates the hash tables in the SAM_hdr structure.
94 * Returns 0 on success;
95 * -1 on failure
97 static int sam_hdr_update_hashes(SAM_hdr *sh,
98 int type,
99 SAM_hdr_type *h_type) {
100 /* Add to reference hash? */
101 if ((type>>8) == 'S' && (type&0xff) == 'Q') {
102 SAM_hdr_tag *tag;
103 SAM_SQ *new_ref;
104 int nref = sh->nref;
106 new_ref = realloc(sh->ref, (sh->nref+1)*sizeof(*sh->ref));
107 if (!new_ref)
108 return -1;
109 sh->ref = new_ref;
111 tag = h_type->tag;
112 sh->ref[nref].name = NULL;
113 sh->ref[nref].len = 0;
114 sh->ref[nref].ty = h_type;
115 sh->ref[nref].tag = tag;
117 while (tag) {
118 if (tag->str[0] == 'S' && tag->str[1] == 'N') {
119 if (!(sh->ref[nref].name = malloc(tag->len)))
120 return -1;
121 strncpy(sh->ref[nref].name, tag->str+3, tag->len-3);
122 sh->ref[nref].name[tag->len-3] = 0;
123 } else if (tag->str[0] == 'L' && tag->str[1] == 'N') {
124 sh->ref[nref].len = atoi(tag->str+3);
126 tag = tag->next;
129 if (sh->ref[nref].name) {
130 khint_t k;
131 int r;
132 k = kh_put(m_s2i, sh->ref_hash, sh->ref[nref].name, &r);
133 if (-1 == r) return -1;
134 kh_val(sh->ref_hash, k) = nref;
135 } else {
136 return -1; // SN should be present, according to spec.
139 sh->nref++;
142 /* Add to read-group hash? */
143 if ((type>>8) == 'R' && (type&0xff) == 'G') {
144 SAM_hdr_tag *tag;
145 SAM_RG *new_rg;
146 int nrg = sh->nrg;
148 new_rg = realloc(sh->rg, (sh->nrg+1)*sizeof(*sh->rg));
149 if (!new_rg)
150 return -1;
151 sh->rg = new_rg;
153 tag = h_type->tag;
154 sh->rg[nrg].name = NULL;
155 sh->rg[nrg].name_len = 0;
156 sh->rg[nrg].ty = h_type;
157 sh->rg[nrg].tag = tag;
158 sh->rg[nrg].id = nrg;
160 while (tag) {
161 if (tag->str[0] == 'I' && tag->str[1] == 'D') {
162 if (!(sh->rg[nrg].name = malloc(tag->len)))
163 return -1;
164 strncpy(sh->rg[nrg].name, tag->str+3, tag->len-3);
165 sh->rg[nrg].name[tag->len-3] = 0;
166 sh->rg[nrg].name_len = strlen(sh->rg[nrg].name);
168 tag = tag->next;
171 if (sh->rg[nrg].name) {
172 khint_t k;
173 int r;
174 k = kh_put(m_s2i, sh->rg_hash, sh->rg[nrg].name, &r);
175 if (-1 == r) return -1;
176 kh_val(sh->rg_hash, k) = nrg;
177 } else {
178 return -1; // ID should be present, according to spec.
181 sh->nrg++;
184 /* Add to program hash? */
185 if ((type>>8) == 'P' && (type&0xff) == 'G') {
186 SAM_hdr_tag *tag;
187 SAM_PG *new_pg;
188 int npg = sh->npg;
190 new_pg = realloc(sh->pg, (sh->npg+1)*sizeof(*sh->pg));
191 if (!new_pg)
192 return -1;
193 sh->pg = new_pg;
195 tag = h_type->tag;
196 sh->pg[npg].name = NULL;
197 sh->pg[npg].name_len = 0;
198 sh->pg[npg].ty = h_type;
199 sh->pg[npg].tag = tag;
200 sh->pg[npg].id = npg;
201 sh->pg[npg].prev_id = -1;
203 while (tag) {
204 if (tag->str[0] == 'I' && tag->str[1] == 'D') {
205 if (!(sh->pg[npg].name = malloc(tag->len)))
206 return -1;
207 strncpy(sh->pg[npg].name, tag->str+3, tag->len-3);
208 sh->pg[npg].name[tag->len-3] = 0;
209 sh->pg[npg].name_len = strlen(sh->pg[npg].name);
210 } else if (tag->str[0] == 'P' && tag->str[1] == 'P') {
211 // Resolve later if needed
212 khint_t k;
213 char tmp = tag->str[tag->len]; tag->str[tag->len] = 0;
214 k = kh_get(m_s2i, sh->pg_hash, tag->str+3);
215 tag->str[tag->len] = tmp;
217 if (k != kh_end(sh->pg_hash)) {
218 int p_id = kh_val(sh->pg_hash, k);
219 sh->pg[npg].prev_id = sh->pg[p_id].id;
221 /* Unmark previous entry as a PG termination */
222 if (sh->npg_end > 0 &&
223 sh->pg_end[sh->npg_end-1] == p_id) {
224 sh->npg_end--;
225 } else {
226 int i;
227 for (i = 0; i < sh->npg_end; i++) {
228 if (sh->pg_end[i] == p_id) {
229 memmove(&sh->pg_end[i], &sh->pg_end[i+1],
230 (sh->npg_end-i-1)*sizeof(*sh->pg_end));
231 sh->npg_end--;
235 } else {
236 sh->pg[npg].prev_id = -1;
239 tag = tag->next;
242 if (sh->pg[npg].name) {
243 khint_t k;
244 int r;
245 k = kh_put(m_s2i, sh->pg_hash, sh->pg[npg].name, &r);
246 if (-1 == r) return -1;
247 kh_val(sh->pg_hash, k) = npg;
248 } else {
249 return -1; // ID should be present, according to spec.
252 /* Add to npg_end[] array. Remove later if we find a PP line */
253 if (sh->npg_end >= sh->npg_end_alloc) {
254 int *new_pg_end;
255 int new_alloc = sh->npg_end_alloc ? sh->npg_end_alloc*2 : 4;
257 new_pg_end = realloc(sh->pg_end, new_alloc * sizeof(int));
258 if (!new_pg_end)
259 return -1;
260 sh->npg_end_alloc = new_alloc;
261 sh->pg_end = new_pg_end;
263 sh->pg_end[sh->npg_end++] = npg;
265 sh->npg++;
268 return 0;
272 * Appends a formatted line to an existing SAM header.
273 * Line is a full SAM header record, eg "@SQ\tSN:foo\tLN:100", with
274 * optional new-line. If it contains more than 1 line then multiple lines
275 * will be added in order.
277 * Input text is of maximum length len or as terminated earlier by a NUL.
278 * Len may be 0 if unknown, in which case lines must be NUL-terminated.
280 * Returns 0 on success
281 * -1 on failure
283 int sam_hdr_add_lines(SAM_hdr *sh, const char *lines, int len) {
284 int i, lno, text_offset;
285 char *hdr;
287 if (!len)
288 len = strlen(lines);
290 text_offset = ks_len(&sh->text);
291 if (EOF == kputsn(lines, len, &sh->text))
292 return -1;
293 hdr = ks_str(&sh->text) + text_offset;
295 for (i = 0, lno = 1; i < len && hdr[i] != '\0'; i++, lno++) {
296 khint32_t type;
297 khint_t k;
299 int l_start = i, new;
300 SAM_hdr_type *h_type;
301 SAM_hdr_tag *h_tag, *last;
303 if (hdr[i] != '@') {
304 int j;
305 for (j = i; j < len && hdr[j] != '\0' && hdr[j] != '\n'; j++)
307 sam_hdr_error("Header line does not start with '@'",
308 &hdr[l_start], len - l_start, lno);
309 return -1;
312 type = (hdr[i+1]<<8) | hdr[i+2];
313 if (hdr[i+1] < 'A' || hdr[i+1] > 'z' ||
314 hdr[i+2] < 'A' || hdr[i+2] > 'z') {
315 sam_hdr_error("Header line does not have a two character key",
316 &hdr[l_start], len - l_start, lno);
317 return -1;
320 i += 3;
321 if (hdr[i] == '\n')
322 continue;
324 // Add the header line type
325 if (!(h_type = pool_alloc(sh->type_pool)))
326 return -1;
327 if (-1 == (k = kh_put(sam_hdr, sh->h, type, &new)))
328 return -1;
330 // Form the ring, either with self or other lines of this type
331 if (!new) {
332 SAM_hdr_type *t = kh_val(sh->h, k), *p;
333 p = t->prev;
335 assert(p->next == t);
336 p->next = h_type;
337 h_type->prev = p;
339 t->prev = h_type;
340 h_type->next = t;
341 h_type->order = p->order+1;
342 } else {
343 kh_val(sh->h, k) = h_type;
344 h_type->prev = h_type->next = h_type;
345 h_type->order = 0;
348 // Parse the tags on this line
349 last = NULL;
350 if ((type>>8) == 'C' && (type&0xff) == 'O') {
351 int j;
352 if (hdr[i] != '\t') {
353 sam_hdr_error("Missing tab",
354 &hdr[l_start], len - l_start, lno);
355 return -1;
358 for (j = ++i; j < len && hdr[j] != '\0' && hdr[j] != '\n'; j++)
361 if (!(h_type->tag = h_tag = pool_alloc(sh->tag_pool)))
362 return -1;
363 h_tag->str = string_ndup(sh->str_pool, &hdr[i], j-i);
364 h_tag->len = j-i;
365 h_tag->next = NULL;
366 if (!h_tag->str)
367 return -1;
369 i = j;
371 } else {
372 do {
373 int j;
374 if (hdr[i] != '\t') {
375 sam_hdr_error("Missing tab",
376 &hdr[l_start], len - l_start, lno);
377 return -1;
380 for (j = ++i; j < len && hdr[j] != '\0' && hdr[j] != '\n' && hdr[j] != '\t'; j++)
383 if (!(h_tag = pool_alloc(sh->tag_pool)))
384 return -1;
385 h_tag->str = string_ndup(sh->str_pool, &hdr[i], j-i);
386 h_tag->len = j-i;
387 h_tag->next = NULL;
388 if (!h_tag->str)
389 return -1;
391 if (h_tag->len < 3 || h_tag->str[2] != ':') {
392 sam_hdr_error("Malformed key:value pair",
393 &hdr[l_start], len - l_start, lno);
394 return -1;
397 if (last)
398 last->next = h_tag;
399 else
400 h_type->tag = h_tag;
402 last = h_tag;
403 i = j;
404 } while (i < len && hdr[i] != '\0' && hdr[i] != '\n');
407 /* Update RG/SQ hashes */
408 if (-1 == sam_hdr_update_hashes(sh, type, h_type))
409 return -1;
412 return 0;
416 * Adds a single line to a SAM header.
417 * Specify type and one or more key,value pairs, ending with the NULL key.
418 * Eg. sam_hdr_add(h, "SQ", "ID", "foo", "LN", "100", NULL).
420 * Returns index for specific entry on success (eg 2nd SQ, 4th RG)
421 * -1 on failure
423 int sam_hdr_add(SAM_hdr *sh, const char *type, ...) {
424 va_list args;
425 va_start(args, type);
426 return sam_hdr_vadd(sh, type, args, NULL);
430 * sam_hdr_add with a va_list interface.
432 * Note: this function invokes va_arg at least once, making the value
433 * of ap indeterminate after the return. The caller should call
434 * va_start/va_end before/after calling this function or use va_copy.
436 int sam_hdr_vadd(SAM_hdr *sh, const char *type, va_list ap, ...) {
437 va_list args;
438 SAM_hdr_type *h_type;
439 SAM_hdr_tag *h_tag, *last;
440 int new;
441 khint32_t type_i = (type[0]<<8) | type[1], k;
443 if (EOF == kputc_('@', &sh->text))
444 return -1;
445 if (EOF == kputsn(type, 2, &sh->text))
446 return -1;
448 if (!(h_type = pool_alloc(sh->type_pool)))
449 return -1;
450 if (-1 == (k = kh_put(sam_hdr, sh->h, type_i, &new)))
451 return -1;
453 // Form the ring, either with self or other lines of this type
454 if (!new) {
455 SAM_hdr_type *t = kh_val(sh->h, k), *p;
456 p = t->prev;
458 assert(p->next == t);
459 p->next = h_type;
460 h_type->prev = p;
462 t->prev = h_type;
463 h_type->next = t;
464 h_type->order = p->order + 1;
465 } else {
466 kh_val(sh->h, k) = h_type;
467 h_type->prev = h_type->next = h_type;
468 h_type->order = 0;
471 last = NULL;
473 // Any ... varargs
474 va_start(args, ap);
475 for (;;) {
476 char *k, *v;
477 int idx;
479 if (!(k = (char *)va_arg(args, char *)))
480 break;
481 v = va_arg(args, char *);
483 if (EOF == kputc_('\t', &sh->text))
484 return -1;
486 if (!(h_tag = pool_alloc(sh->tag_pool)))
487 return -1;
488 idx = ks_len(&sh->text);
490 if (EOF == kputs(k, &sh->text))
491 return -1;
492 if (EOF == kputc_(':', &sh->text))
493 return -1;
494 if (EOF == kputs(v, &sh->text))
495 return -1;
497 h_tag->len = ks_len(&sh->text) - idx;
498 h_tag->str = string_ndup(sh->str_pool,
499 ks_str(&sh->text) + idx,
500 h_tag->len);
501 h_tag->next = NULL;
502 if (!h_tag->str)
503 return -1;
505 if (last)
506 last->next = h_tag;
507 else
508 h_type->tag = h_tag;
510 last = h_tag;
512 va_end(args);
514 // Plus the specified va_list params
515 for (;;) {
516 char *k, *v;
517 int idx;
519 if (!(k = (char *)va_arg(ap, char *)))
520 break;
521 v = va_arg(ap, char *);
523 if (EOF == kputc_('\t', &sh->text))
524 return -1;
526 if (!(h_tag = pool_alloc(sh->tag_pool)))
527 return -1;
528 idx = ks_len(&sh->text);
530 if (EOF == kputs(k, &sh->text))
531 return -1;
532 if (EOF == kputc_(':', &sh->text))
533 return -1;
534 if (EOF == kputs(v, &sh->text))
535 return -1;
537 h_tag->len = ks_len(&sh->text) - idx;
538 h_tag->str = string_ndup(sh->str_pool,
539 ks_str(&sh->text) + idx,
540 h_tag->len);
541 h_tag->next = NULL;
542 if (!h_tag->str)
543 return -1;
545 if (last)
546 last->next = h_tag;
547 else
548 h_type->tag = h_tag;
550 last = h_tag;
552 va_end(ap);
554 if (EOF == kputc('\n', &sh->text))
555 return -1;
557 int itype = (type[0]<<8) | type[1];
558 if (-1 == sam_hdr_update_hashes(sh, itype, h_type))
559 return -1;
561 return h_type->order;
565 * Returns the first header item matching 'type'. If ID is non-NULL it checks
566 * for the tag ID: and compares against the specified ID.
568 * Returns NULL if no type/ID is found
570 SAM_hdr_type *sam_hdr_find(SAM_hdr *hdr, char *type,
571 char *ID_key, char *ID_value) {
572 SAM_hdr_type *t1, *t2;
573 int itype = (type[0]<<8)|(type[1]);
574 khint_t k;
576 /* Special case for types we have prebuilt hashes on */
577 if (ID_key) {
578 if (type[0] == 'S' && type[1] == 'Q' &&
579 ID_key[0] == 'S' && ID_key[1] == 'N') {
580 k = kh_get(m_s2i, hdr->ref_hash, ID_value);
581 return k != kh_end(hdr->ref_hash)
582 ? hdr->ref[kh_val(hdr->ref_hash, k)].ty
583 : NULL;
586 if (type[0] == 'R' && type[1] == 'G' &&
587 ID_key[0] == 'I' && ID_key[1] == 'D') {
588 k = kh_get(m_s2i, hdr->rg_hash, ID_value);
589 return k != kh_end(hdr->rg_hash)
590 ? hdr->rg[kh_val(hdr->rg_hash, k)].ty
591 : NULL;
594 if (type[0] == 'P' && type[1] == 'G' &&
595 ID_key[0] == 'I' && ID_key[1] == 'D') {
596 k = kh_get(m_s2i, hdr->pg_hash, ID_value);
597 return k != kh_end(hdr->pg_hash)
598 ? hdr->pg[kh_val(hdr->pg_hash, k)].ty
599 : NULL;
603 k = kh_get(sam_hdr, hdr->h, itype);
604 if (k == kh_end(hdr->h))
605 return NULL;
607 if (!ID_key)
608 return kh_val(hdr->h, k);
610 t1 = t2 = kh_val(hdr->h, k);
611 do {
612 SAM_hdr_tag *tag;
613 for (tag = t1->tag; tag; tag = tag->next) {
614 if (tag->str[0] == ID_key[0] && tag->str[1] == ID_key[1]) {
615 char *cp1 = tag->str+3;
616 char *cp2 = ID_value;
617 while (*cp1 && *cp1 == *cp2)
618 cp1++, cp2++;
619 if (*cp2 || *cp1)
620 continue;
621 return t1;
624 t1 = t1->next;
625 } while (t1 != t2);
627 return NULL;
631 * As per SAM_hdr_type, but returns a complete line of formatted text
632 * for a specific head type/ID combination. If ID is NULL then it returns
633 * the first line of the specified type.
635 * The returned string is malloced and should be freed by the calling
636 * function with free().
638 * Returns NULL if no type/ID is found.
640 char *sam_hdr_find_line(SAM_hdr *hdr, char *type,
641 char *ID_key, char *ID_value) {
642 SAM_hdr_type *ty = sam_hdr_find(hdr, type, ID_key, ID_value);
643 kstring_t ks = KS_INITIALIZER;
644 SAM_hdr_tag *tag;
645 int r = 0;
647 if (!ty)
648 return NULL;
650 // Paste together the line from the hashed copy
651 r |= (kputc_('@', &ks) == EOF);
652 r |= (kputs(type, &ks) == EOF);
653 for (tag = ty->tag; tag; tag = tag->next) {
654 r |= (kputc_('\t', &ks) == EOF);
655 r |= (kputsn(tag->str, tag->len, &ks) == EOF);
658 if (r) {
659 KS_FREE(&ks);
660 return NULL;
663 return ks_str(&ks);
668 * Looks for a specific key in a single sam header line.
669 * If prev is non-NULL it also fills this out with the previous tag, to
670 * permit use in key removal. *prev is set to NULL when the tag is the first
671 * key in the list. When a tag isn't found, prev (if non NULL) will be the last
672 * tag in the existing list.
674 * Returns the tag pointer on success
675 * NULL on failure
677 SAM_hdr_tag *sam_hdr_find_key(SAM_hdr *sh,
678 SAM_hdr_type *type,
679 char *key,
680 SAM_hdr_tag **prev) {
681 SAM_hdr_tag *tag, *p = NULL;
683 for (tag = type->tag; tag; p = tag, tag = tag->next) {
684 if (tag->str[0] == key[0] && tag->str[1] == key[1]) {
685 if (prev)
686 *prev = p;
687 return tag;
691 if (prev)
692 *prev = p;
694 return NULL;
699 * Adds or updates tag key,value pairs in a header line.
700 * Eg for adding M5 tags to @SQ lines or updating sort order for the
701 * @HD line (although use the sam_hdr_sort_order() function for
702 * HD manipulation, which is a wrapper around this funuction).
704 * Specify multiple key,value pairs ending in NULL.
706 * Returns 0 on success
707 * -1 on failure
709 int sam_hdr_update(SAM_hdr *hdr, SAM_hdr_type *type, ...) {
710 va_list ap;
712 va_start(ap, type);
714 for (;;) {
715 char *k, *v;
716 int idx;
717 SAM_hdr_tag *tag, *prev;
719 if (!(k = (char *)va_arg(ap, char *)))
720 break;
721 v = va_arg(ap, char *);
723 tag = sam_hdr_find_key(hdr, type, k, &prev);
724 if (!tag) {
725 if (!(tag = pool_alloc(hdr->tag_pool)))
726 return -1;
727 if (prev)
728 prev->next = tag;
729 else
730 type->tag = tag;
732 tag->next = NULL;
735 idx = ks_len(&hdr->text);
736 if (ksprintf(&hdr->text, "%2.2s:%s", k, v) < 0)
737 return -1;
738 tag->len = ks_len(&hdr->text) - idx;
739 tag->str = string_ndup(hdr->str_pool,
740 ks_str(&hdr->text) + idx,
741 tag->len);
742 if (!tag->str)
743 return -1;
746 va_end(ap);
748 return 0;
751 #define K(a) (((a)[0]<<8)|((a)[1]))
754 * Returns the sort order:
756 enum sam_sort_order sam_hdr_sort_order(SAM_hdr *hdr) {
757 return hdr->sort_order;
760 static enum sam_sort_order sam_hdr_parse_sort_order(SAM_hdr *hdr) {
761 khint_t k;
762 enum sam_sort_order so;
764 so = ORDER_UNKNOWN;
765 k = kh_get(sam_hdr, hdr->h, K("HD"));
766 if (k != kh_end(hdr->h)) {
767 SAM_hdr_type *ty = kh_val(hdr->h, k);
768 SAM_hdr_tag *tag;
769 for (tag = ty->tag; tag; tag = tag->next) {
770 if (tag->str[0] == 'S' && tag->str[1] == 'O') {
771 if (strcmp(tag->str+3, "unsorted") == 0)
772 so = ORDER_UNSORTED;
773 else if (strcmp(tag->str+3, "queryname") == 0)
774 so = ORDER_NAME;
775 else if (strcmp(tag->str+3, "coordinate") == 0)
776 so = ORDER_COORD;
777 else if (strcmp(tag->str+3, "unknown") != 0)
778 fprintf(stderr, "Unknown sort order field: %s\n",
779 tag->str+3);
784 return so;
789 * Reconstructs the kstring from the header hash table.
790 * Returns 0 on success
791 * -1 on failure
793 int sam_hdr_rebuild(SAM_hdr *hdr) {
794 /* Order: HD then others */
795 kstring_t ks = KS_INITIALIZER;
796 khint_t k;
799 k = kh_get(sam_hdr, hdr->h, K("HD"));
800 if (k != kh_end(hdr->h)) {
801 SAM_hdr_type *ty = kh_val(hdr->h, k);
802 SAM_hdr_tag *tag;
803 if (EOF == kputs("@HD", &ks))
804 return -1;
805 for (tag = ty->tag; tag; tag = tag->next) {
806 if (EOF == kputc_('\t', &ks))
807 return -1;
808 if (EOF == kputsn_(tag->str, tag->len, &ks))
809 return -1;
811 if (EOF == kputc('\n', &ks))
812 return -1;
815 for (k = kh_begin(hdr->h); k != kh_end(hdr->h); k++) {
816 SAM_hdr_type *t1, *t2;
818 if (!kh_exist(hdr->h, k))
819 continue;
821 if (kh_key(hdr->h, k) == K("HD"))
822 continue;
824 t1 = t2 = kh_val(hdr->h, k);
825 do {
826 SAM_hdr_tag *tag;
827 char c[2];
829 if (EOF == kputc_('@', &ks))
830 return -1;
831 c[0] = kh_key(hdr->h, k)>>8;
832 c[1] = kh_key(hdr->h, k)&0xff;
833 if (EOF == kputsn_(c, 2, &ks))
834 return -1;
835 for (tag = t1->tag; tag; tag=tag->next) {
836 if (EOF == kputc_('\t', &ks))
837 return -1;
838 if (EOF == kputsn_(tag->str, tag->len, &ks))
839 return -1;
841 if (EOF == kputc('\n', &ks))
842 return -1;
843 t1 = t1->next;
844 } while (t1 != t2);
847 if (ks_str(&hdr->text))
848 KS_FREE(&hdr->text);
850 hdr->text = ks;
852 return 0;
857 * Creates an empty SAM header, ready to be populated.
859 * Returns a SAM_hdr struct on success (free with sam_hdr_free())
860 * NULL on failure
862 SAM_hdr *sam_hdr_new() {
863 SAM_hdr *sh = calloc(1, sizeof(*sh));
865 if (!sh)
866 return NULL;
868 sh->h = kh_init(sam_hdr);
869 if (!sh->h)
870 goto err;
872 sh->ID_cnt = 1;
873 sh->ref_count = 1;
875 sh->nref = 0;
876 sh->ref = NULL;
877 if (!(sh->ref_hash = kh_init(m_s2i)))
878 goto err;
880 sh->nrg = 0;
881 sh->rg = NULL;
882 if (!(sh->rg_hash = kh_init(m_s2i)))
883 goto err;
885 sh->npg = 0;
886 sh->pg = NULL;
887 sh->npg_end = sh->npg_end_alloc = 0;
888 sh->pg_end = NULL;
889 if (!(sh->pg_hash = kh_init(m_s2i)))
890 goto err;
892 KS_INIT(&sh->text);
894 if (!(sh->tag_pool = pool_create(sizeof(SAM_hdr_tag))))
895 goto err;
897 if (!(sh->type_pool = pool_create(sizeof(SAM_hdr_type))))
898 goto err;
900 if (!(sh->str_pool = string_pool_create(8192)))
901 goto err;
903 return sh;
905 err:
906 if (sh->h)
907 kh_destroy(sam_hdr, sh->h);
909 if (sh->tag_pool)
910 pool_destroy(sh->tag_pool);
912 if (sh->type_pool)
913 pool_destroy(sh->type_pool);
915 if (sh->str_pool)
916 string_pool_destroy(sh->str_pool);
918 free(sh);
920 return NULL;
925 * Tokenises a SAM header into a hash table.
926 * Also extracts a few bits on specific data types, such as @RG lines.
928 * Returns a SAM_hdr struct on success (free with sam_hdr_free())
929 * NULL on failure
931 SAM_hdr *sam_hdr_parse_(const char *hdr, int len) {
932 /* Make an empty SAM_hdr */
933 SAM_hdr *sh;
935 sh = sam_hdr_new();
936 if (NULL == sh) return NULL;
938 if (NULL == hdr) return sh; // empty header is permitted
940 /* Parse the header, line by line */
941 if (-1 == sam_hdr_add_lines(sh, hdr, len)) {
942 sam_hdr_free(sh);
943 return NULL;
946 /* Obtain sort order */
947 sh->sort_order = sam_hdr_parse_sort_order(sh);
949 //sam_hdr_dump(sh);
950 //sam_hdr_add(sh, "RG", "ID", "foo", "SM", "bar", NULL);
951 //sam_hdr_rebuild(sh);
952 //printf(">>%s<<", ks_str(sh->text));
954 //parse_references(sh);
955 //parse_read_groups(sh);
957 sam_hdr_link_pg(sh);
958 //sam_hdr_dump(sh);
960 return sh;
964 * Produces a duplicate copy of hdr and returns it.
965 * Returns NULL on failure
967 SAM_hdr *sam_hdr_dup(SAM_hdr *hdr) {
968 if (-1 == sam_hdr_rebuild(hdr))
969 return NULL;
971 return sam_hdr_parse_(sam_hdr_str(hdr), sam_hdr_length(hdr));
974 /*! Increments a reference count on hdr.
976 * This permits multiple files to share the same header, all calling
977 * sam_hdr_free when done, without causing errors for other open files.
979 void sam_hdr_incr_ref(SAM_hdr *hdr) {
980 hdr->ref_count++;
983 /*! Increments a reference count on hdr.
985 * This permits multiple files to share the same header, all calling
986 * sam_hdr_free when done, without causing errors for other open files.
988 * If the reference count hits zero then the header is automatically
989 * freed. This makes it a synonym for sam_hdr_free().
991 void sam_hdr_decr_ref(SAM_hdr *hdr) {
992 sam_hdr_free(hdr);
995 /*! Deallocates all storage used by a SAM_hdr struct.
997 * This also decrements the header reference count. If after decrementing
998 * it is still non-zero then the header is assumed to be in use by another
999 * caller and the free is not done.
1001 * This is a synonym for sam_hdr_dec_ref().
1003 void sam_hdr_free(SAM_hdr *hdr) {
1004 if (!hdr)
1005 return;
1007 if (--hdr->ref_count > 0)
1008 return;
1010 if (ks_str(&hdr->text))
1011 KS_FREE(&hdr->text);
1013 if (hdr->h)
1014 kh_destroy(sam_hdr, hdr->h);
1016 if (hdr->ref_hash)
1017 kh_destroy(m_s2i, hdr->ref_hash);
1019 if (hdr->ref) {
1020 int i;
1021 for (i = 0; i < hdr->nref; i++)
1022 if (hdr->ref[i].name)
1023 free(hdr->ref[i].name);
1024 free(hdr->ref);
1027 if (hdr->rg_hash)
1028 kh_destroy(m_s2i, hdr->rg_hash);
1030 if (hdr->rg) {
1031 int i;
1032 for (i = 0; i < hdr->nrg; i++)
1033 if (hdr->rg[i].name)
1034 free(hdr->rg[i].name);
1035 free(hdr->rg);
1038 if (hdr->pg_hash)
1039 kh_destroy(m_s2i, hdr->pg_hash);
1041 if (hdr->pg) {
1042 int i;
1043 for (i = 0; i < hdr->npg; i++)
1044 if (hdr->pg[i].name)
1045 free(hdr->pg[i].name);
1046 free(hdr->pg);
1049 if (hdr->pg_end)
1050 free(hdr->pg_end);
1052 if (hdr->type_pool)
1053 pool_destroy(hdr->type_pool);
1055 if (hdr->tag_pool)
1056 pool_destroy(hdr->tag_pool);
1058 if (hdr->str_pool)
1059 string_pool_destroy(hdr->str_pool);
1061 free(hdr);
1064 int sam_hdr_length(SAM_hdr *hdr) {
1065 return ks_len(&hdr->text);
1068 char *sam_hdr_str(SAM_hdr *hdr) {
1069 return ks_str(&hdr->text);
1073 * Looks up a reference sequence by name and returns the numerical ID.
1074 * Returns -1 if unknown reference.
1076 int sam_hdr_name2ref(SAM_hdr *hdr, const char *ref) {
1077 khint_t k = kh_get(m_s2i, hdr->ref_hash, ref);
1078 return k == kh_end(hdr->ref_hash) ? -1 : kh_val(hdr->ref_hash, k);
1082 * Looks up a read-group by name and returns a pointer to the start of the
1083 * associated tag list.
1085 * Returns NULL on failure
1087 SAM_RG *sam_hdr_find_rg(SAM_hdr *hdr, const char *rg) {
1088 khint_t k = kh_get(m_s2i, hdr->rg_hash, rg);
1089 return k == kh_end(hdr->rg_hash)
1090 ? NULL
1091 : &hdr->rg[kh_val(hdr->rg_hash, k)];
1096 * Fixes any PP links in @PG headers.
1097 * If the entries are in order then this doesn't need doing, but incase
1098 * our header is out of order this goes through the sh->pg[] array
1099 * setting the prev_id field.
1101 * Note we can have multiple complete chains. This code should identify the
1102 * tails of these chains as these are the entries we have to link to in
1103 * subsequent PP records.
1105 * Returns 0 on sucess
1106 * -1 on failure (indicating broken PG/PP records)
1108 int sam_hdr_link_pg(SAM_hdr *hdr) {
1109 int i, j, ret = 0;
1111 hdr->npg_end_alloc = hdr->npg;
1112 hdr->pg_end = realloc(hdr->pg_end, hdr->npg * sizeof(*hdr->pg_end));
1113 if (!hdr->pg_end)
1114 return -1;
1116 for (i = 0; i < hdr->npg; i++)
1117 hdr->pg_end[i] = i;
1119 for (i = 0; i < hdr->npg; i++) {
1120 khint_t k;
1121 SAM_hdr_tag *tag;
1122 char tmp;
1124 for (tag = hdr->pg[i].tag; tag; tag = tag->next) {
1125 if (tag->str[0] == 'P' && tag->str[1] == 'P')
1126 break;
1128 if (!tag) {
1129 /* Chain start points */
1130 continue;
1133 tmp = tag->str[tag->len]; tag->str[tag->len] = 0;
1134 k = kh_get(m_s2i, hdr->pg_hash, tag->str+3);
1135 tag->str[tag->len] = tmp;
1137 if (k == kh_end(hdr->pg_hash)) {
1138 ret = -1;
1139 continue;
1142 hdr->pg[i].prev_id = hdr->pg[kh_val(hdr->pg_hash, k)].id;
1143 hdr->pg_end[kh_val(hdr->pg_hash, k)] = -1;
1146 for (i = j = 0; i < hdr->npg; i++) {
1147 if (hdr->pg_end[i] != -1)
1148 hdr->pg_end[j++] = hdr->pg_end[i];
1150 hdr->npg_end = j;
1152 return ret;
1156 * Returns a unique ID from a base name.
1158 * The value returned is valid until the next call to
1159 * this function.
1161 const char *sam_hdr_PG_ID(SAM_hdr *sh, const char *name) {
1162 khint_t k = kh_get(m_s2i, sh->pg_hash, name);
1163 if (k == kh_end(sh->pg_hash))
1164 return name;
1166 do {
1167 sprintf(sh->ID_buf, "%.1000s.%d", name, sh->ID_cnt++);
1168 k = kh_get(m_s2i, sh->pg_hash, sh->ID_buf);
1169 } while (k != kh_end(sh->pg_hash));
1171 return sh->ID_buf;
1175 * Add an @PG line.
1177 * If we wish complete control over this use sam_hdr_add() directly. This
1178 * function uses that, but attempts to do a lot of tedious house work for
1179 * you too.
1181 * - It will generate a suitable ID if the supplied one clashes.
1182 * - It will generate multiple @PG records if we have multiple PG chains.
1184 * Call it as per sam_hdr_add() with a series of key,value pairs ending
1185 * in NULL.
1187 * Returns 0 on success
1188 * -1 on failure
1190 int sam_hdr_add_PG(SAM_hdr *sh, const char *name, ...) {
1191 va_list args;
1193 if (sh->npg_end) {
1194 /* Copy ends array to avoid us looping while modifying it */
1195 int *end = malloc(sh->npg_end * sizeof(int));
1196 int i, nends = sh->npg_end;
1198 if (!end)
1199 return -1;
1201 memcpy(end, sh->pg_end, nends * sizeof(*end));
1203 for (i = 0; i < nends; i++) {
1204 va_start(args, name);
1205 if (-1 == sam_hdr_vadd(sh, "PG", args,
1206 "ID", sam_hdr_PG_ID(sh, name),
1207 "PN", name,
1208 "PP", sh->pg[end[i]].name,
1209 NULL)) {
1210 free(end);
1211 return -1;
1213 va_end(args);
1216 free(end);
1217 } else {
1218 va_start(args, name);
1219 if (-1 == sam_hdr_vadd(sh, "PG", args,
1220 "ID", sam_hdr_PG_ID(sh, name),
1221 "PN", name,
1222 NULL))
1223 return -1;
1224 va_end(args);
1227 //sam_hdr_dump(sh);
1229 return 0;
1233 * A function to help with construction of CL tags in @PG records.
1234 * Takes an argc, argv pair and returns a single space-separated string.
1235 * This string should be deallocated by the calling function.
1237 * Returns malloced char * on success
1238 * NULL on failure
1240 char *stringify_argv(int argc, char *argv[]) {
1241 char *str, *cp;
1242 size_t nbytes = 1;
1243 int i, j;
1245 /* Allocate */
1246 for (i = 0; i < argc; i++) {
1247 if (i > 0) nbytes += 1;
1248 nbytes += strlen(argv[i]);
1250 if (!(str = malloc(nbytes)))
1251 return NULL;
1253 /* Copy */
1254 cp = str;
1255 for (i = 0; i < argc; i++) {
1256 if (i > 0) *cp++ = ' ';
1257 j = 0;
1258 while (argv[i][j]) {
1259 if (argv[i][j] == '\t')
1260 *cp++ = ' ';
1261 else
1262 *cp++ = argv[i][j];
1263 j++;
1266 *cp++ = 0;
1268 return str;