modified: Makefile
[GalaxyCodeBases.git] / c_cpp / readscorr / bihash.h
blob0601748ba27fdeeaac0982c590fc2f4f6103202a
1 // by Hu Xuesong
2 #ifndef _G_BIHASH_H
3 #define _G_BIHASH_H
5 #include "gtypendef.h"
6 #include <stddef.h> //size_t
7 #include <stdint.h> //uint64_t
8 #include <stdio.h> //FILE
9 #include "gFileIO.h"
10 #include "cfgparser.h"
/*
 * Compile-time configuration constants.
 *
 * NOTE(review): the descriptions below are inferred from the macro names and
 * the neighbouring comments only -- confirm against the implementation file.
 */
#define MAX_KMER_LEN (127)              /* presumably the largest supported k-mer length */
#define HASH_LENB (128u)                /* presumably the hash width in bits */
#define SDL_SUBARRAY_UNIT (8u)
//#define SDLA_ITEMARRAY 32u
#define SUBARRAY_SIZE (32u*1024u)       /* 32 KiB; see the L1 cache query below */
//gcc -### -march=native -E /usr/include/stdlib.h 2>&1 | grep l1-cache-size
/*
 * No trailing semicolon: the macro must expand to a plain expression so that
 * it can be used inside larger expressions, argument lists and conditions.
 */
#define ENCODEDBIT_ENTROPY_PAD (1u)
/*
http://math.stackexchange.com/questions/53353/is-there-some-reversible-mapping-that-as-uniform-as-a-hash
Let $H$ be a hash function taking an arbitrary string as input and producing an $n$-bit string as output.
Given an input string $s$, let $s_0$ denote the first $n$ bits of $s$ and $s_1$ the rest
(i.e. $s = s_0\;||\;s_1$, where $s_0$ is $n$ bits long and $||$ denotes concatenation).
Then define $$f(s) = (s_0 \oplus H(s_1))\;||\;s_1,$$ where $\oplus$ means bitwise XOR.
It's easy to see that $f$ is its own inverse, i.e. $f(f(s)) = s$,
so that the input string $s$ can be recovered from $f(s)$ just by running it through $f$ again.

f(s)=(s0 XOR H(s1))||s1, length(s0)=EncodedBit, length(s1)=rBit

EncodedBit < ArrayBit < rBit
*/
32 typedef struct __SDLeftArray_t {
33 unsigned char CountBit, rBit, ArrayBit, EncodedBit;
34 unsigned char itemByte; //, HashCnt;
35 uint16_t SubItemCount,SubItemByUnit; //max(SubItemCount) should be 32768
36 size_t ArraySize,SDLAbyte;
37 //uint64_t maxCount; == Item_CountBitMask
38 //unsigned char ArrayCount;
39 uint64_t ItemInsideAll, CellOverflowCount, CountBitOverflow; // ItemInsideAll = ItemInsideArray + CellOverflowCount
40 double FalsePositiveRatio;
41 void *pDLA, *pextree;
42 uint64_t maxCountSeen;
43 //uint64_t *outhash;
44 uint64_t outhash[2]; // both ArrayBit and rBit is (0,64], so HashCnt==1 for MurmurHash3_x64_128
45 uint128_t Item_rBitMask;
46 uint64_t Hash_ArrayBitMask, Hash_rBitMask, Item_CountBitMask;
47 } SDLeftArray_t;
/*
 * Packed header record handed to dleft_dump().
 *
 * The structure is declared packed, so members are laid out back to back in
 * declaration order with no padding; do not reorder or resize members.
 * Several members share their names with members of SDLeftArray_t and
 * SDLeftStat_t.
 *
 * NOTE(review): presumably this is the on-disk header of a dump file --
 * confirm against dleft_dump().
 */
typedef struct __SDLdumpHead_t {
    char FileID[4];                 // "GDSD"
    unsigned char FileVersion[2];   // 0,1
    uint16_t kmersize;
    unsigned char CountBit;
    unsigned char rBit;
    uint16_t SubItemCount;

    /* Sizes. */
    uint64_t ArraySize;
    uint64_t SDLAbyte;
    uint64_t extreebyte;

    /* Counters. */
    uint64_t ItemInsideAll;
    uint64_t CellOverflowCount;
    uint64_t CountBitOverflow;
    uint64_t maxCountSeen;

    /* Histogram summary (same member names as SDLeftStat_t). */
    uint64_t HistMaxCntVal;
    uint64_t HistMaxHistVal;
    double HistMean;
    double HistSStd;

    uint32_t crc32c;
} __attribute__ ((packed)) SDLdumpHead;
65 SDLeftArray_t *dleft_arraynew(unsigned char CountBit, const SDLConfig * const psdlcfg, int kmer);
66 SDLeftArray_t *dleft_arrayinit(unsigned char CountBit, size_t ArraySize, uint16_t SubItemCount, int kmer);
67 size_t dleft_insert_read(unsigned int k, char const *const inseq, size_t len, SDLeftArray_t *dleftobj);
69 unsigned char GETitemByte_PADrBit_trimSubItemCount(const unsigned char CountBit, unsigned char *prBit, uint16_t *pSubItemCount);
71 void fprintSDLAnfo(FILE *stream, const SDLeftArray_t * dleftobj);
72 void dleft_arraydestroy(SDLeftArray_t * const dleftobj);
73 void dleft_dump(const SDLeftArray_t * const, SDLdumpHead * const, FILE *);
75 //#include "sdleftTF.h"
/*
 * Histogram summary returned (by pointer) from dleft_stat().
 * The same four member names also appear in SDLdumpHead.
 *
 * NOTE(review): exact definitions of the four values are not visible from
 * this header -- confirm against dleft_stat().
 */
typedef struct __SDLeftStat_t SDLeftStat_t;

struct __SDLeftStat_t {
    uint64_t HistMaxCntVal;
    uint64_t HistMaxHistVal;
    double   HistMean;
    double   HistSStd;
};
83 typedef SDLeftStat_t *(G_SDLeftArray_IN)(SDLeftArray_t * const, FILE *); //*G_SDLeftArray_IN() is OK,too .
85 #ifdef TEST
86 SDLeftStat_t * dleft_stat(SDLeftArray_t * const dleftobj, FILE *stream, FILE *fpdat);
87 #else
88 SDLeftStat_t * dleft_stat(SDLeftArray_t * const dleftobj, FILE *stream);
89 #endif
91 #endif /* bihash.h */
93 /* reprobe(i)=i(i+1)/2
94 pos(m,i)=(hash(m)+reprobe(i)) mod M