modified: Makefile
[GalaxyCodeBases.git] / c_cpp / readscorr / bihash.c
blobaf3d07975e2612ebbad40b1f96355d3f0932f033
1 #include <stdlib.h> //calloc
2 #include <stdint.h> // uint_fast8_t
3 #include <math.h> //log2, ceil
4 #include <stdio.h> //fprintf, fseek
5 #include <err.h>
6 #include <string.h> //strcmp, strncpy
7 #include <sys/mman.h>
8 #include <endian.h> //BYTE_ORDER, LITTLE_ENDIAN 1234
10 #ifdef PTHREAD
11 #include <pthread.h>
12 #endif
14 #include "bihash.h"
15 #include "MurmurHash3.h"
16 #include "2bitseq.h"
17 #include "chrseq.h"
19 #define HASHSEED (0x3ab2ae35)
21 unsigned char GETitemByte_PADrBit_trimSubItemCount(unsigned char CountBit, unsigned char *prBit, uint16_t *pSubItemCount){
22 unsigned char itemByte = (CountBit+*prBit+7u) >> 3; // 2^3=8
23 *prBit = (itemByte<<3) - CountBit;
24 #ifndef TEST /* SubItemCount is trimed on rBit only */
25 double maxSubItemByR = floor(pow(2.0,(double)*prBit));
26 if ( *pSubItemCount > maxSubItemByR ) *pSubItemCount = (uint16_t)maxSubItemByR; // safe since *pSubItemCount was bigger
27 #endif
28 return itemByte;
31 // the smarter one
32 SDLeftArray_t *dleft_arraynew(const unsigned char CountBit, const SDLConfig * const psdlcfg, int kmer){
33 unsigned char rBit;
34 size_t ArraySize;
35 uint16_t SubItemCount;
37 return dleft_arrayinit(CountBit, ArraySize, SubItemCount, kmer);
40 // the native one
41 SDLeftArray_t *dleft_arrayinit(unsigned char CountBit, size_t ArraySize, uint16_t SubItemCount, int kmer) {
42 if (ArraySize<2u || ArraySize>(1LLU<<63) || CountBit<3u || CountBit>8u*sizeof(uint64_t) || SubItemCount<1u ) {
43 err(EXIT_FAILURE, "[x]Wrong D Left Array Parameters:(%d)[%u]x%zd ",CountBit,SubItemCount,ArraySize);
44 } // CountBit+rBit >= 9, makes uint16_t always OK
45 unsigned char ArrayBit = ceil(log2(ArraySize));
46 unsigned char rBit = ArrayBit + ENCODEDBIT_ENTROPY_PAD;
47 int toEncode = kmer*2 - rBit;
48 if (toEncode > ArrayBit) {
49 toEncode = ArrayBit;
50 rBit = kmer*2 - ArrayBit;
52 unsigned char EncodedBit=(toEncode>0)?toEncode:0;
54 #ifdef TEST /* Test mode, keep rBit, pad CountBit */
55 unsigned char itemByte = GETitemByte_PADrBit_trimSubItemCount(rBit,&CountBit,&SubItemCount);
56 #else /* Normal, keep CountBit, pad rBit */
57 unsigned char itemByte = GETitemByte_PADrBit_trimSubItemCount(CountBit,&rBit,&SubItemCount);
58 #endif
59 SDLeftArray_t *dleftobj = calloc(1,sizeof(SDLeftArray_t)); // set other int to 0
60 dleftobj->SDLAbyte = (SubItemCount*itemByte*ArraySize+127u)&(~(size_t)127u); // We are reading in uint128_t now.
61 dleftobj->pDLA = calloc(1,dleftobj->SDLAbyte);
62 int mlock_r = mlock(dleftobj->pDLA,dleftobj->SDLAbyte);
63 if (mlock_r) warn("[!]Cannot lock SDL array in memory. Performance maybe reduced.");
64 dleftobj->CountBit = CountBit;
65 dleftobj->rBit = rBit;
66 dleftobj->ArrayBit = ArrayBit;
67 dleftobj->EncodedBit = EncodedBit;
68 dleftobj->itemByte = itemByte;
69 dleftobj->ArraySize = ArraySize;
70 dleftobj->SubItemCount = SubItemCount;
71 dleftobj->SubItemByUnit = SubItemCount/SDL_SUBARRAY_UNIT;
72 dleftobj->maxCountSeen = 1; //if SDLA is not empty, maxCountSeen>=1.
73 dleftobj->FalsePositiveRatio = exp2(-rBit)/(double)ArraySize;
74 //dleftobj->ItemInsideAll=0;
75 //dleftobj->CellOverflowCount=0;
76 dleftobj->Item_rBitMask=(uint128_t)((1LLU<<rBit)-1u) << CountBit;
77 dleftobj->Item_CountBitMask=(1LLU<<CountBit)-1u; // == maxCount
78 dleftobj->Hash_ArrayBitMask=(1LLU<<ArrayBit)-1u;
79 dleftobj->Hash_rBitMask=(1LLU<<rBit)-1u;
80 return dleftobj;
83 void fprintSDLAnfo(FILE *stream, const SDLeftArray_t * dleftobj){
84 fprintf(stream,"SDLA(%#zx) -> {\n\
85 Size:[r:%uB+cnt:%uB]*Item:%zd(%.3f~%uB[enc:%u])*subArray:%u = %g MiB\n\
86 Hash:%u*%uB ItemByte:%u MaxCountSeen:%lu%s\n\
87 Designed Capacity:%lu ItemCount:%lu, with Overflow:%lu\n\
88 FP:%g, estimated FP item count:%.10g\n\
89 Mem:%zu bytes\n\
91 (size_t)dleftobj,
92 dleftobj->rBit,dleftobj->CountBit,dleftobj->ArraySize,log2(dleftobj->ArraySize),
93 dleftobj->ArrayBit,dleftobj->EncodedBit,dleftobj->SubItemCount,(double)dleftobj->SubItemCount*dleftobj->itemByte*dleftobj->ArraySize/1048576,
94 1,HASH_LENB,dleftobj->itemByte,dleftobj->maxCountSeen,(dleftobj->ItemInsideAll)?"":"(=0, as SDLA is empty)",
95 dleftobj->ArraySize*(dleftobj->SubItemCount*3/4),
96 dleftobj->ItemInsideAll,dleftobj->CellOverflowCount,
97 dleftobj->FalsePositiveRatio,dleftobj->ItemInsideAll*dleftobj->FalsePositiveRatio,
98 dleftobj->SDLAbyte);
100 fprintf(stream," Item_rBitMask:[%016lX %016lX]\n", (uint64_t)(dleftobj->Item_rBitMask>>64), (uint64_t)dleftobj->Item_rBitMask);
101 fprintf(stream," Item_CountBitMask:[%016lX]\n", dleftobj->Item_CountBitMask);
102 uint128_t check = dleftobj->Item_rBitMask + dleftobj->Item_CountBitMask;
103 fprintf(stream," Sum:[%016lX %016lX]\n", (uint64_t)(check>>64), (uint64_t)check);
104 fprintf(stream," Hash_ArrayBitMask:[%016lX]\n", dleftobj->Hash_ArrayBitMask);
105 fprintf(stream," Hash_rBitMask:[%016lX]\n", dleftobj->Hash_rBitMask);
107 fputs("}\n", stream);