limit fstBC to 30bp in Python3 ver.
[GalaxyCodeBases.git] / BGI / SOAPdenovo2 / standardPregraph / kmerhash.c
blob8266dd0335c8d1b09d73276095aca998f5bbf0ff
1 /*
2 * kmerhash.c
4 * Copyright (c) 2008-2012 BGI-Shenzhen <soap at genomics dot org dot cn>.
6 * This file is part of SOAPdenovo.
8 * SOAPdenovo is free software: you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation, either version 3 of the License, or
11 * (at your option) any later version.
13 * SOAPdenovo is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
18 * You should have received a copy of the GNU General Public License
19 * along with SOAPdenovo. If not, see <http://www.gnu.org/licenses/>.
23 #include "stdinc.h"
24 #include "newhash.h"
25 #include "kmerhash.h"
26 #include "extfunc.h"
27 #include "extvab.h"
28 #include "def.h"
30 #define PUBLIC_FUNC
31 #define PROTECTED_FUNC
33 #define EDGEIDBLOCKSIZE 10000
35 #ifdef MER127
36 static const kmer_t2 empty_kmer2 = {{0, 0, 0, 0}, 0, 0, 0, 0};
37 //Get the hash key.
38 static inline ubyte8 modular ( KmerSet2 * set, Kmer seq )
40 ubyte8 temp;
41 temp = ( seq.high1 % set->size ) << 32 | ( seq.low1 >> 32 & 0xffffffff );
42 temp = ( temp % set->size ) << 32 | ( seq.low1 & 0xffffffff );
43 temp = ( temp % set->size ) << 32 | ( seq.high2 >> 32 & 0xffffffff );
44 temp = ( temp % set->size ) << 32 | ( seq.high2 & 0xffffffff );
45 temp = ( temp % set->size ) << 32 | ( seq.low2 >> 32 & 0xffffffff );
46 temp = ( temp % set->size ) << 32 | ( seq.low2 & 0xffffffff );
47 temp = ( ubyte8 ) ( temp % set->size );
48 return temp;
50 #else
51 static const kmer_t2 empty_kmer2 = {{0, 0}, 0, 0, 0, 0};
52 static inline ubyte8 modular ( KmerSet2 * set, Kmer seq )
54 ubyte8 hc;
55 __uint128_t temp;
56 temp = Kmer2int128 ( seq );
57 hc = temp % set->size;
58 return hc;
60 #endif
62 /*************************************************
63 Function:
64 update_kmer2
65 Description:
66 Updates the id and flag of the kmer.
67 Input:
68 1. mer: the current kmer
69 2. id: the id of edge
70 3. flag: the type of kmer(00: big "to kmer"; 01: big "from kmer"; 10: small "to kmer"; 11: small "from kmer")
71 Output:
72 None.
73 Return:
74 None.
75 *************************************************/
76 PUBLIC_FUNC void update_kmer2 ( kmer_t2 * mer, int id, char flag )
78 struct edgeID * edgeid;
79 edgeid = ( struct edgeID * ) malloc ( sizeof ( struct edgeID ) );
80 edgeid->edge = id;
81 edgeid->flag = flag;
82 edgeid->next = NULL;
84 if ( mer->edgeId )
85 { edgeid->next = mer->edgeId; }
87 mer->edgeId = edgeid;
88 mer->count++;
91 /*************************************************
92 Function:
93 set_new_kmer2
94 Description:
95 Initializes the new kmer.
96 Input:
97 1. mer: the point of the new kmer
98 2. seq: the sequence of the kmer
99 3. id: the id of edge
100 4. flag: the type of kmer(00: big "to kmer"; 01: big "from kmer"; 10: small "to kmer"; 11: small "from kmer")
101 Output:
102 None.
103 Return:
104 None.
105 *************************************************/
106 PUBLIC_FUNC void set_new_kmer2 ( kmer_t2 * mer, Kmer seq, int id, char flag )
108 *mer = empty_kmer2;
109 set_kmer_seq ( *mer, seq );
110 update_kmer2 ( mer, id, flag );
113 //Whether it's a prime number.
114 static inline int is_prime_kh ( ubyte8 num )
116 ubyte8 i, max;
118 if ( num < 4 ) { return 1; }
120 if ( num % 2 == 0 ) { return 0; }
122 max = ( ubyte8 ) sqrt ( ( float ) num );
124 for ( i = 3; i < max; i += 2 ) { if ( num % i == 0 ) { return 0; } }
126 return 1;
129 //Find next prime number.
130 static inline ubyte8 find_next_prime_kh ( ubyte8 num )
132 if ( num % 2 == 0 ) { num ++; }
134 while ( 1 ) { if ( is_prime_kh ( num ) ) { return num; } num += 2; }
137 /*************************************************
138 Function:
139 init_kmerset2
140 Description:
141 Initializes the kmerset.
142 Input:
143 1. init_size: the initial size of the kmerset
144 2. load_factor: load factor of hash
145 Output:
146 None.
147 Return:
148 The initial kmerset.
149 *************************************************/
150 PUBLIC_FUNC KmerSet2 * init_kmerset2 ( ubyte8 init_size, float load_factor )
152 KmerSet2 * set;
154 if ( init_size < 3 ) { init_size = 3; }
155 else { init_size = find_next_prime_kh ( init_size ); }
157 set = ( KmerSet2 * ) malloc ( sizeof ( KmerSet2 ) );
158 set->size = init_size;
159 set->count = 0;
160 set->max = set->size * load_factor;
162 if ( load_factor <= 0 ) { load_factor = 0.25f; }
163 else if ( load_factor >= 1 ) { load_factor = 0.75f; }
165 set->load_factor = load_factor;
166 set->iter_ptr = 0;
167 set->array = calloc ( set->size, sizeof ( kmer_t2 ) );
168 set->flags = malloc ( ( set->size + 15 ) / 16 * 4 );
169 memset ( set->flags, 0x55, ( set->size + 15 ) / 16 * 4 );
170 return set;
173 PROTECTED_FUNC static inline ubyte8 get_kmerset2 ( KmerSet2 * set, Kmer seq )
175 ubyte8 hc;
176 // hc = modular (set, seq);
177 #ifdef MER127
178 hc = modular ( set, seq );
179 #else
180 __uint128_t temp;
181 temp = Kmer2int128 ( seq );
182 hc = temp % set->size;
183 #endif
185 while ( 1 )
187 if ( is_kmer_entity_null ( set->flags, hc ) )
189 return hc;
191 else
193 if ( KmerEqual ( get_kmer_seq ( set->array[hc] ), seq ) )
194 { return hc; }
197 hc ++;
199 if ( hc == set->size ) { hc = 0; }
202 return set->size;
205 /*************************************************
206 Function:
207 search_kmerset2
208 Description:
209 Search kmer in kmerset.
210 Input:
211 1. set: the kmerset
212 2. seq: the kmer
213 3. rs: the record node
214 Output:
215 None.
216 Return:
217 1 if found.
218 *************************************************/
219 PUBLIC_FUNC int search_kmerset2 ( KmerSet2 * set, Kmer seq, kmer_t2 ** rs )
221 ubyte8 hc;
222 // hc = modular (set, seq);
223 #ifdef MER127
224 hc = modular ( set, seq );
225 #else
226 __uint128_t temp;
227 temp = Kmer2int128 ( seq );
228 hc = temp % set->size;
229 #endif
231 while ( 1 )
233 if ( is_kmer_entity_null ( set->flags, hc ) )
235 return 0;
237 else
239 if ( KmerEqual ( get_kmer_seq ( set->array[hc] ), seq ) )
241 *rs = set->array + hc;
242 return 1;
246 hc ++;
248 if ( hc == set->size ) { hc = 0; }
251 return 0;
254 /*************************************************
255 Function:
256 exists_kmerset
257 Description:
258 Whether the kmer exists in kmerset.
259 Input:
260 1. set: the kmerset
261 2. seq: the kmer
262 Output:
263 None.
264 Return:
265 1 if it exists.
266 *************************************************/
267 PUBLIC_FUNC static inline int exists_kmerset ( KmerSet2 * set, Kmer seq )
269 ubyte8 idx;
270 idx = get_kmerset2 ( set, seq );
271 return !is_kmer_entity_null ( set->flags, idx );
274 /*************************************************
275 Function:
276 encap_kmerset2
277 Description:
278 Enlarges the kmerset if necessary.
279 Input:
280 1. set: the hash of kmerset
281 2. num: the element number add to kmerset
282 Output:
283 None.
284 Return:
285 None.
286 *************************************************/
287 PROTECTED_FUNC static inline void encap_kmerset2 ( KmerSet2 * set, ubyte8 num )
289 ubyte4 * flags, *f;
290 ubyte8 i, n, size, hc;
291 kmer_t2 key, tmp;
293 if ( set->count + num <= set->max ) { return; }
295 if ( initKmerSetSize != 0 )
297 if ( set->load_factor < 0.88 )
299 set->load_factor = 0.88;
300 set->max = set->size * set->load_factor;
301 return;
303 else
305 fprintf ( stderr, "-- Static memory pool exploded, please define a larger value. --\nloadFactor\t%f\nsize\t%llu\ncnt\t%llu\n", set->load_factor, set->size, set->count );
306 abort();
310 n = set->size;
314 if ( n < 0xFFFFFFFU )
315 { n <<= 1; }
316 else
317 { n += 0xFFFFFFU; }
319 n = find_next_prime_kh ( n );
321 while ( n * set->load_factor < set->count + num );
323 set->array = realloc ( set->array, n * sizeof ( kmer_t2 ) );
325 if ( set->array == NULL )
327 fprintf ( stderr, "-- Out of memory --\n" );
328 abort();
331 flags = malloc ( ( n + 15 ) / 16 * 4 );
332 memset ( flags, 0x55, ( n + 15 ) / 16 * 4 );
333 size = set->size;
334 set->size = n;
335 set->max = n * set->load_factor;
336 f = set->flags;
337 set->flags = flags;
338 flags = f;
339 __uint128_t temp;
341 for ( i = 0; i < size; i++ )
343 if ( !exists_kmer_entity ( flags, i ) ) { continue; }
345 key = set->array[i];
346 set_kmer_entity_del ( flags, i );
348 while ( 1 )
350 hc = modular ( set, get_kmer_seq ( key ) );
351 #ifdef MER127
352 hc = modular ( set, get_kmer_seq ( key ) );
353 #else
354 temp = Kmer2int128 ( get_kmer_seq ( key ) );
355 hc = temp % set->size;
356 #endif
358 while ( !is_kmer_entity_null ( set->flags, hc ) ) { hc ++; if ( hc == set->size ) { hc = 0; } }
360 clear_kmer_entity_null ( set->flags, hc );
362 if ( hc < size && exists_kmer_entity ( flags, hc ) )
364 tmp = key;
365 key = set->array[hc];
366 set->array[hc] = tmp;
367 set_kmer_entity_del ( flags, hc );
369 else
371 set->array[hc] = key;
372 break;
377 free ( flags );
380 /*************************************************
381 Function:
382 put_kmerset2
383 Description:
384 Puts kmer into the kmerset.
385 Input:
386 1. set: the hash of kmerset
387 2. seq: the sequence of the kmer
388 3. id: the id of edge
389 4. flag: the type of kmer(00: big "to kmer"; 01: big "from kmer"; 10: small "to kmer"; 11: small "from kmer")
390 5. kmer_p: used to record the info of the kmer
391 Output:
392 None.
393 Return:
394 1 if it's successfully put kmer into kmerset.
395 *************************************************/
396 PUBLIC_FUNC int put_kmerset2 ( KmerSet2 * set, Kmer seq, int id, char flag, kmer_t2 ** kmer_p )
398 ubyte8 hc;
400 if ( set->count + 1 > set->max )
402 encap_kmerset2 ( set, 1 );
405 // hc = modular (set, seq);
406 #ifdef MER127
407 hc = modular ( set, seq );
408 #else
409 __uint128_t temp;
410 temp = Kmer2int128 ( seq );
411 hc = temp % set->size;
412 #endif
416 if ( is_kmer_entity_null ( set->flags, hc ) )
418 clear_kmer_entity_null ( set->flags, hc );
419 set_new_kmer2 ( set->array + hc, seq, id, flag );
420 set->count ++;
421 *kmer_p = set->array + hc;
422 return 0;
424 else
426 if ( KmerEqual ( get_kmer_seq ( set->array[hc] ), seq ) )
428 update_kmer2 ( set->array + hc, id, flag );
429 *kmer_p = set->array + hc;
430 return 1;
434 hc ++;
436 if ( hc == set->size ) { hc = 0; }
438 while ( 1 );
440 *kmer_p = NULL;
441 return 0;
444 /*************************************************
445 Function:
446 count_kmerset2
447 Description:
448 Returns the kmer number of the kmerset.
449 Input:
450 None.
451 Output:
452 None.
453 Return:
454 The kmer number of the kmerset.
455 *************************************************/
456 PUBLIC_FUNC byte8 count_kmerset2 ( KmerSet2 * set ) { return set->count; }
458 PUBLIC_FUNC static inline void reset_iter_kmerset2 ( KmerSet2 * set ) { set->iter_ptr = 0; }
460 PUBLIC_FUNC static inline ubyte8 iter_kmerset2 ( KmerSet2 * set, kmer_t2 ** rs )
462 while ( set->iter_ptr < set->size )
464 if ( !is_kmer_entity_null ( set->flags, set->iter_ptr ) )
466 *rs = set->array + set->iter_ptr;
467 set->iter_ptr ++;
468 return 1;
471 set->iter_ptr ++;
474 return 0;
477 //Free.
478 PUBLIC_FUNC void free_kmerset2 ( KmerSet2 * set )
480 int i;
481 struct edgeID * temp, *temp_next;
482 kmer_t2 * node;
483 set->iter_ptr = 0;
485 while ( set->iter_ptr < set->size )
487 if ( !is_kmer_entity_null ( set->flags, set->iter_ptr ) )
489 node = set->array + set->iter_ptr;
491 if ( node )
493 temp = node->edgeId;
495 while ( temp )
497 temp_next = temp->next;
498 free ( ( void * ) temp );
499 temp = temp_next;
504 set->iter_ptr ++;
507 free ( set->array );
508 set->array = NULL;
509 free ( set->flags );
510 set->flags = NULL;
511 free ( set );
512 set = NULL;
515 /*************************************************
516 Function:
517 free_Sets2
518 Description:
519 Free all the kmersets.
520 Input:
521 1. sets: the array of the kmerset
522 2. num: the number of kmerset
523 Output:
524 None.
525 Return:
526 None.
527 *************************************************/
528 PUBLIC_FUNC void free_Sets2 ( KmerSet2 ** sets, int num )
530 int i;
532 for ( i = 0; i < num; ++i )
534 free_kmerset2 ( sets[i] );
535 sets[i] = NULL;
538 free ( ( void * ) sets );
539 sets = NULL;