modified: nfig1.py
[GalaxyCodeBases.git] / c_cpp / faststater / 2bitseqinline.h
blob7468a3bcfdd854046c6db3729b710dac7b72db8a
1 #ifndef _G_DBSEQ_INLINE_H
2 #define _G_DBSEQ_INLINE_H
4 #include "gtypendef.h"
5 #include <stdint.h> // int_fast8_t
6 #include <stdlib.h> // malloc
7 //#include "2bitseq.h"
8 #ifdef DEBUG
9 #include <stdio.h>
10 #endif
12 FORCE_INLINE uint64_t *dibmalloc(size_t len){
13 size_t needtomallocQQW = (len+31u)>>5;
14 uint64_t *outseq = malloc(needtomallocQQW*8);
15 return outseq;
18 FORCE_INLINE uint64_t singlebase2dbitplus(const char *const base, unsigned char *const hexBQchr, size_t *const Ncountp) {
19 switch (*base) {
20 case 'a': case 'A':
21 return 0;
22 break;
23 case 't': case 'T':
24 return 3;
25 break;
26 case 'c': case 'C':
27 return 1;
28 break;
29 case 'g': case 'G':
30 return 2;
31 break;
32 default:
33 *hexBQchr = 1u<<7; // 128 for N
34 ++(*Ncountp);
35 // DONOT use `|=` since the memory is just malloced
36 return 0;
37 break;
38 } // Whether a looking-up array[64] can be faster ?
41 // for gFileIO only
42 FORCE_INLINE size_t base2dbit(size_t seqlen,
43 const char *const baseschr, const char *const qstr,
44 uint64_t *diBseq, unsigned char *hexBQ) {
45 // First, do the bases, overwrite output arrays since we have just malloc without setting to zero.
46 size_t Ncount=0;
47 size_t i,j;
48 const size_t seqlenDowntoDW = seqlen & ~((2u<<4)-1);
49 #ifdef DEBUG
50 printf("[b2d]%zu %zu\n",seqlen,seqlenDowntoDW);
51 #endif
52 uint64_t tmpqdbase;
53 for (i=0;i<seqlenDowntoDW;i+=32u) {
54 tmpqdbase=0;
55 for (j=0;j<32;j++) {
56 //tmpqdbase |= singlebase2dbit(baseschr+i+j,hexBQ+i+j)<<(62u-j);
57 tmpqdbase |= singlebase2dbitplus(baseschr+i+j,hexBQ+i+j,&Ncount)<<(j*2);
59 *diBseq++ = tmpqdbase;
61 tmpqdbase = 0;
62 for (j=0;j<seqlen-i;j++) { // seqlen starts from 0.
63 // Cannot use 'j<=seqlen-i-1' here since 'size_t j' cannot be negative.
64 tmpqdbase |= singlebase2dbitplus(baseschr+i+j,hexBQ+i+j,&Ncount)<<(j*2);
66 *diBseq++ = tmpqdbase;
67 // Everything should done in for-cycle when seqlen%4 == 0.
68 // No need to add '/0' since we have got seqlen, and there may be poly-A.
69 // Then, for the Q values ...
70 if (qstr != NULL) {
71 for (i=0;i<=seqlen;i++) {
72 if ( qstr[i] >= '@' && qstr[i] <= 'h' )
73 tmpqdbase = qstr[i] - '?'; // @=1, A=2, B=3
74 else tmpqdbase = 0;
75 hexBQ[i] |= tmpqdbase; // lower 7bit already Zero-ed.
78 // Eamss-masking will be considered later.
79 return Ncount;
82 FORCE_INLINE uint64_t unitReverseComp(uint64_t seq32mer){
83 seq32mer = ~seq32mer;
84 seq32mer = ((seq32mer & 0x3333333333333333LLU)<< 2) | ((seq32mer & 0xCCCCCCCCCCCCCCCCLLU)>> 2);
85 seq32mer = ((seq32mer & 0x0F0F0F0F0F0F0F0FLLU)<< 4) | ((seq32mer & 0xF0F0F0F0F0F0F0F0LLU)>> 4);
86 #if !defined(__GNUC__) || (__GNUC__ < 4) || (__GNUC__ == 4 && __GNUC_MINOR__ < 3)
87 seq32mer = ((seq32mer & 0x00FF00FF00FF00FFLLU)<< 8) | ((seq32mer & 0xFF00FF00FF00FF00LLU)>> 8);
88 seq32mer = ((seq32mer & 0x0000FFFF0000FFFFLLU)<<16) | ((seq32mer & 0xFFFF0000FFFF0000LLU)>>16);
89 seq32mer = ((seq32mer & 0x00000000FFFFFFFFLLU)<<32) | ((seq32mer & 0xFFFFFFFF00000000LLU)>>32);
90 #else
91 seq32mer = __builtin_bswap64(seq32mer);
92 #endif
93 return seq32mer;
96 0xABCDEF0123456789 -> cgagtcgcccactagacaaattgtctattggg
97 revcomp -> cccaatagacaatttgtctagtgggcgactcg
99 //FORCE_INLINE uint64_t QQWkmerMovHigher(uint64_t *seq32mer, unsigned char bphigher, uint64_t ){}
101 // *base must have been Normalized to /[ATCGN]*/
102 FORCE_INLINE uint64_t singlebase2dbit(const char *const base, size_t *const Ncountp) {
103 switch (*base) {
104 case 'A':
105 return 0;
106 break;
107 case 'T':
108 return 3;
109 break;
110 case 'C':
111 return 1;
112 break;
113 case 'G':
114 return 2;
115 break;
116 default:
117 ++(*Ncountp);
118 // DONOT use `|=` since the memory is just malloced
119 return 0;
120 break;
121 } // Whether a looking-up array[32] can be faster ?
123 FORCE_INLINE size_t ChrSeq2dib(const char *const baseschr, size_t seqlen, uint64_t **diBseqp, size_t *const puint64cnt){
124 size_t needtomallocQQW = (seqlen+31u)>>5; // in fact length in sizeof(uint64_t)
125 if (needtomallocQQW > *puint64cnt) {
126 *diBseqp = realloc(*diBseqp, needtomallocQQW*8);
127 *puint64cnt = needtomallocQQW;
129 uint64_t *tmpdiBseqp = *diBseqp;
130 size_t Ncount = 0;
131 size_t i,j;
132 const size_t seqlenDowntoDW = seqlen & ~((2u<<4)-1);
133 uint64_t tmpqdbase;
134 for (i=0;i<seqlenDowntoDW;i+=32u) {
135 tmpqdbase=0;
136 for (j=0;j<32;j++) {
137 tmpqdbase |= singlebase2dbit(baseschr+i+j,&Ncount)<<(j*2);
139 *tmpdiBseqp++ = tmpqdbase;
141 tmpqdbase = 0;
142 for (j=0;j<seqlen-i;j++) { // seqlen starts from 0.
143 tmpqdbase |= singlebase2dbit(baseschr+i+j,&Ncount)<<(j*2);
145 *tmpdiBseqp++ = tmpqdbase;
146 #ifdef EP
147 while (tmpdiBseqp < (*diBseqp)+*puint64cnt) {
148 *tmpdiBseqp++ = 0LLU;
150 #endif // not nessary since we already have seqlen
151 #ifdef DEBUG
152 printf("[s2d]%zu %zu [%lx] [%.*s]\n",seqlen,*puint64cnt,**diBseqp,(int)seqlen,baseschr); // just show the 1st one.
153 #endif
154 return Ncount;
157 #endif // 2bitseqinline.h