modified: src1/input.c
[GalaxyCodeBases.git] / c_cpp / faststater / gFileIO.c
blob2361d4db5f988bbf75ae075a1087572f138491b7
1 #include <sys/types.h>
2 #include <sys/stat.h>
3 #include <fcntl.h> // open, above 3
4 #include <unistd.h> // read, close
5 #include <err.h>
6 #include <string.h> // memcmp
7 #include <stdlib.h> //calloc
8 #include <stdio.h> // puts
9 #include <stdint.h> // uint_fast8_t
10 //#include <zlib.h> // already in gkseq.h
11 #include "gkseq.h" // No more `kseq.h` with tons of Macros !!!
12 #include "gFileIO.h"
13 #include "2bitseq.h"
14 #include "chrseq.h"
16 ssize_t read_kseq_no2bit(SeqFileObj * const seqObj) {
17 size_t seqlen; // in fact size_t, but minus values are meanful.
18 int_fast8_t rvalue = kseq_read(seqObj->fobj);
19 if (rvalue>0) {
20 uint_fast8_t type = rvalue; // 1 or 3. No need to &3 now.
21 kseq_t *kseq;
22 kseq = seqObj->fobj;
23 seqlen = kseq->seq.l;
24 seqObj->name = kseq->name.s;
25 seqObj->comment = kseq->comment.s;
26 seqObj->seq = kseq->seq.s;
27 NormalizeChrSeq(kseq->seq.s); // to /[ATCGN]*/
28 if (rvalue&2) { // withQ
29 //encodeQ;
30 type |= 8u;
31 seqObj->qual = kseq->qual.s;
32 } else {
33 seqObj->qual = NULL;
35 seqObj->readlength = seqlen;
36 seqObj->type = type;
37 return seqlen;
38 } else return rvalue;
40 ssize_t read_kseq_with2bit(SeqFileObj * const seqObj) {
41 size_t seqlen; // in fact size_t, but minus values are meanful.
42 int_fast8_t rvalue = kseq_read(seqObj->fobj);
43 if (rvalue>0) {
44 uint_fast8_t type = rvalue; // 1 or 3. No need to &3 now.
45 kseq_t *kseq;
46 kseq = seqObj->fobj;
47 seqlen = kseq->seq.l;
48 seqObj->name = kseq->name.s;
49 seqObj->comment = kseq->comment.s;
50 seqObj->seq = kseq->seq.s;
51 if (rvalue&2) { // withQ
52 //encodeQ;
53 type |= 8u;
54 seqObj->qual = kseq->qual.s;
55 } else {
56 seqObj->qual = NULL;
58 size_t needtomallocQQW = (seqlen+31u)>>5; // 1 "QQWord" = 4 QWord = 32 bp. Well, I say there is QQW.
59 if (needtomallocQQW > seqObj->binMallocedQQWord) {
60 KROUNDUP32(needtomallocQQW);
61 seqObj->binMallocedQQWord = needtomallocQQW;
62 seqObj->diBseq = realloc(seqObj->diBseq,needtomallocQQW<<3); // 2^3=8
63 seqObj->hexBQ = realloc(seqObj->hexBQ,needtomallocQQW<<5); // 4*2^3=32
65 seqObj->binNcount = base2dbit(seqlen, kseq->seq.s, seqObj->qual, seqObj->diBseq, seqObj->hexBQ);
66 // printf("-[%s]<%s><%zx>[%s]-\n",kseq->seq.s, qstr, seqObj->diBseq[0], unit2basechr(seqObj->diBseq[0]));
67 // Well, how to deal with smallcase masking ? Not using this information yet.
68 NormalizeChrSeq(kseq->seq.s); // to /[ATCGN]*/
69 seqObj->readlength = seqlen;
70 seqObj->type = type;
71 return seqlen;
72 } else return rvalue;
74 void close_kseq(SeqFileObj * const seqObj){
75 gzFile fp = ((kseq_t*)seqObj->fobj)->f->f;
76 gzclose(fp);
77 kseq_destroy(seqObj->fobj);
80 // binmode: 1=base char, 2=base 2bit
81 SeqFileObj * inSeqFinit(const char * const filename, unsigned char binmode) {
82 int fd;
83 ssize_t fdstat;
84 char FileID[G_HEADER_LENGTH];
85 fd = open(filename, O_RDONLY);
86 if (fd == -1) {warn("\n[x]Cannot open Sequence_file [%s]", filename); return NULL;}
87 fdstat = read(fd, FileID, G_HEADER_LENGTH);
88 if ( fdstat != G_HEADER_LENGTH ) {
89 warn("[!]Too short (%zd) for sequence_file [%s].", fdstat, filename);
90 return NULL;
92 fdstat = close(fd);
93 if ( fdstat )
94 warn("[!]ErrNo:[%zd] while closing file [%s].", fdstat, filename);
96 SeqFileObj * const seqObj = malloc(sizeof(SeqFileObj)); // no more calloc(), just {.datePos[1]=0} is needed for kseq way.
97 //CANNOT use `&(SeqFileObj) {.datePos={0}};`, which is of a short lifetime !
99 if ( memcmp(G_TYPE_FAQC, FileID, G_HEADER_LENGTH) ) { // Not G_TYPE_FAQC, thus to kseq.h
100 gzFile fp;
101 kseq_t *seq;
103 fp = gzopen(filename, "r");
104 if (! fp) err(EXIT_FAILURE, "\n[x]Cannot open with zlib for [%s]", filename);
105 seq = kseq_init(fp); // calloc, thus safe
106 seqObj->datePos[0] = -1; // seeking not available here.
107 seqObj->datePos[1] = 0;
108 seqObj->fobj = seq;
109 if (binmode & GFIODIBBASE) {
110 seqObj->getNextSeq = read_kseq_with2bit; // (ssize_t (*)(void*))
111 } else {
112 seqObj->getNextSeq = read_kseq_no2bit;
114 seqObj->closefh = close_kseq;
115 seqObj->diBseq = NULL; // We need NULL to free ...
116 seqObj->hexBQ = NULL;
117 seqObj->binMallocedQQWord = 0;
118 seqObj->binNcount = 0;
119 } else { // is G_TYPE_FAQC
120 errx(EXIT_FAILURE, "\n[:(]FAQC file not supported now.");
123 return seqObj;
126 void inSeqFdestroy(SeqFileObj * const seqObj) {
127 (*seqObj->closefh)(seqObj);
128 free(seqObj->diBseq);
129 free(seqObj->hexBQ);
130 free(seqObj);