modified: nfig1.py
[GalaxyCodeBases.git] / c_cpp / readscorr / gFileIO.c
blob45b8b12c15958903f9d1ae57dcb9a9ad47bf1a33
1 #include <sys/types.h>
2 #include <sys/stat.h>
3 #include <fcntl.h> // open, above 3
4 #include <unistd.h> // read, close
5 #include <err.h>
6 #include <string.h> // memcmp
7 #include <stdlib.h> //calloc
8 #include <stdio.h> // puts
9 #include <stdint.h> // uint_fast8_t
10 //#include <zlib.h> // already in gkseq.h
11 #include "gkseq.h" // No more `kseq.h` with tons of Macros !!!
12 #include "gFileIO.h"
13 #include "2bitseq.h"
14 #include "chrseq.h"
15 //KSEQ_INIT(gzFile, gzread) // [kseq.h] Just like include, to inline some static inline functions.
18 A 41h 0100 0001 65
19 a 61h 0110 0001 97
20 T 54h 0101 0100 84
21 t 74h 0111 0100 116
22 C 43h 0100 0011 67
23 c 63h 0110 0011 99
24 G 47h 0100 0111 71
25 g 67h 0110 0111 103
26 N 4eh 0100 1110 78
27 n 6eh 0110 1110 110
30 ssize_t read_kseq_no2bit(SeqFileObj * const seqObj) {
31 size_t seqlen; // in fact size_t, but minus values are meanful.
32 int_fast8_t rvalue = kseq_read(seqObj->fobj);
33 //fputs("<--->", stderr);
34 if (rvalue>0) {
35 uint_fast8_t type = rvalue; // 1 or 3. No need to &3 now.
36 kseq_t *kseq;
37 kseq = seqObj->fobj;
38 seqlen = kseq->seq.l;
39 seqObj->name = kseq->name.s;
40 if (! kseq->comment.l) seqObj->comment = NULL;
41 else seqObj->comment = kseq->comment.s;
42 seqObj->seq = kseq->seq.s;
43 NormalizeChrSeq(kseq->seq.s); // to /[ATCGN]*/
44 if (rvalue&2) { // withQ
45 //encodeQ;
46 type |= 8u;
47 seqObj->qual = kseq->qual.s;
48 } else {
49 seqObj->qual = NULL;
51 seqObj->readlength = seqlen;
52 seqObj->type = type;
53 return seqlen;
54 } else return rvalue;
56 ssize_t read_kseq_with2bit(SeqFileObj * const seqObj) {
57 size_t seqlen; // in fact size_t, but minus values are meanful.
58 int_fast8_t rvalue = kseq_read(seqObj->fobj);
59 if (rvalue>0) {
60 uint_fast8_t type = rvalue; // 1 or 3. No need to &3 now.
61 kseq_t *kseq;
62 kseq = seqObj->fobj;
63 seqlen = kseq->seq.l;
64 seqObj->name = kseq->name.s;
65 if (! kseq->comment.l) seqObj->comment = NULL;
66 else seqObj->comment = kseq->comment.s;
67 seqObj->seq = kseq->seq.s;
68 if (rvalue&2) { // withQ
69 //encodeQ;
70 type |= 8u;
71 seqObj->qual = kseq->qual.s;
72 } else {
73 seqObj->qual = NULL;
75 size_t needtomallocQQW = (seqlen+31u)>>5; // 1 "QQWord" = 4 QWord = 32 bp. Well, I say there is QQW.
76 if (needtomallocQQW > seqObj->binMallocedQQWord) {
77 KROUNDUP32(needtomallocQQW);
78 seqObj->binMallocedQQWord = needtomallocQQW;
79 seqObj->diBseq = realloc(seqObj->diBseq,needtomallocQQW<<3); // 2^3=8
80 seqObj->hexBQ = realloc(seqObj->hexBQ,needtomallocQQW<<5); // 4*2^3=32
82 seqObj->binNcount = base2dbit(seqlen, kseq->seq.s, seqObj->qual, seqObj->diBseq, seqObj->hexBQ);
83 // printf("-[%s]<%s><%zx>[%s]-\n",kseq->seq.s, qstr, seqObj->diBseq[0], unit2basechr(seqObj->diBseq[0]));
84 // Well, how to deal with smallcase masking ? Not using this information yet.
85 NormalizeChrSeq(kseq->seq.s); // to /[ATCGN]*/
86 seqObj->readlength = seqlen;
87 seqObj->type = type;
88 return seqlen;
89 } else return rvalue;
91 void close_kseq(SeqFileObj * const seqObj){
92 gzFile fp = ((kseq_t*)seqObj->fobj)->f->f;
93 gzclose(fp);
94 kseq_destroy(seqObj->fobj);
97 // binmode: 1=base char, 2=base 2bit
98 SeqFileObj * inSeqFinit(const char * const filename, unsigned char binmode) {
99 int fd;
100 ssize_t fdstat;
101 char FileID[G_HEADER_LENGTH];
102 fd = open(filename, O_RDONLY);
103 if (fd == -1) {warn("\n[x]Cannot open Sequence_file [%s]", filename); return NULL;}
104 fdstat = read(fd, FileID, G_HEADER_LENGTH);
105 if ( fdstat != G_HEADER_LENGTH ) {
106 warn("[!]Too short (%zd) for sequence_file [%s].", fdstat, filename);
107 return NULL;
109 fdstat = close(fd);
110 if ( fdstat )
111 warn("[!]ErrNo:[%zd] while closing file [%s].", fdstat, filename);
113 SeqFileObj * const seqObj = malloc(sizeof(SeqFileObj)); // no more calloc(), just {.datePos[1]=0} is needed for kseq way.
114 //CANNOT use `&(SeqFileObj) {.datePos={0}};`, which is of a short lifetime !
116 if ( memcmp(G_TYPE_FAQC, FileID, G_HEADER_LENGTH) ) { // Not G_TYPE_FAQC, thus to kseq.h
117 gzFile fp;
118 kseq_t *seq;
120 fp = gzopen(filename, "r");
121 if (! fp) err(EXIT_FAILURE, "\n[x]Cannot open with zlib for [%s]", filename);
122 seq = kseq_init(fp); // calloc, thus safe
123 seqObj->datePos[0] = -1; // seeking not available here.
124 seqObj->datePos[1] = 0;
125 //seqObj->hasQ = 0;
126 //seqObj->name = &seq->name.s;
127 //seqObj->comment = &seq->comment.s;
128 //seqObj->seq = &seq->seq.s;
129 //seqObj->qual = &seq->qual.s;
130 //seqObj->readlength = &seq->seq.l;
131 seqObj->fobj = seq;
132 if (binmode & GFIODIBBASE) {
133 seqObj->getNextSeq = read_kseq_with2bit; // (ssize_t (*)(void*))
134 } else {
135 seqObj->getNextSeq = read_kseq_no2bit;
137 seqObj->closefh = close_kseq;
138 seqObj->diBseq = NULL; // We need NULL to free ...
139 seqObj->hexBQ = NULL;
140 seqObj->binMallocedQQWord = 0;
141 seqObj->binNcount = 0;
142 //seqObj->readlength = 0;
143 //int seqlen; // TEST ONLY !
144 //seqlen = (*seqObj->getNextSeq)(seqObj->fh); //seqlen = kseq_read(seq); // TEST ONLY !
146 } else { // is G_TYPE_FAQC
147 errx(EXIT_FAILURE, "\n[:(]FAQC file not supported now.");
150 return seqObj;
154 int_fast8_t read_kseq_no2bit()
155 int_fast8_t read_kseq_with2bit()
156 int_fast8_t read_faqc_nobasechar()
157 int_fast8_t read_faqc_withbasechar()
160 ssize_t inSeqFreadNext(SeqFileObj * const seqObj) {
161 ssize_t seqlen; // in fact size_t, but minus values are meanful.
162 seqlen = (*seqObj->getNextSeq)(seqObj);
163 //seqObj->hasQ = (*seqObj->qual != 0);
164 return seqlen;
167 void inSeqFdestroy(SeqFileObj * const seqObj) {
168 (*seqObj->closefh)(seqObj);
169 free(seqObj->diBseq);
170 free(seqObj->hexBQ);
171 free(seqObj);