modified: nfig1.py
[GalaxyCodeBases.git] / BGI / SOAPdenovo2 / sparsePregraph / inc / seq_util.h
blob650976324bac0a8e8352ad6ddf9df6c80789066b
1 /*
2 * inc/seq_util.h
4 * This file is part of SOAPdenovo.
6 * Part of this file is refered and modified from SparseAssembler
7 * (See <http://sourceforge.net/projects/sparseassembler/>).
9 */
11 #ifndef _SEQ_UTIL_H
12 #define _SEQ_UTIL_H
14 #include "stdinc.h"
15 #include "core.h"
18 extern inline void Init_Read ( string & seq, struct read_t & read );
20 extern inline uint64_t * str2bitsarr ( const char * c_str, int len, uint64_t * b_str, int arr_sz );
22 extern inline char * bitsarr2str ( uint64_t * b_seq, int len, char * c_str, int arr_sz );
24 extern inline void get_sub_arr ( uint64_t * bitsarr_in, int bitsarr_len, int begin_pos, int sub_sz, uint64_t * bitsarr_out );
26 extern inline void L_shift_NC ( uint64_t * bitsarr, int shift_sz, int arr_sz );
28 extern inline void R_shift_NC ( uint64_t * bitsarr, int shift_sz, int arr_sz );
30 extern inline int uint64_t_cmp ( uint64_t * A, uint64_t * B, int Kmer_arr_sz );
32 extern inline uint64_t * get_rev_comp_seq_arr ( uint64_t * seq_arr, int seq_size, int arr_sz );
34 extern inline uint64_t get_rev_comp_seq ( uint64_t seq, int seq_size );
36 extern inline uint64_t MurmurHash64A ( const void * key, int len, unsigned int seed );
41 inline void Init_Read ( string & seq, struct read_t & read )
43 read.readLen = ( int ) seq.size();
44 int Read_arr_sz = read.readLen / 32 + 1;
45 int rem = read.readLen % 32;
47 if ( rem == 0 )
48 {Read_arr_sz--;}
50 str2bitsarr ( seq.c_str(), ( int ) seq.size(), read.read_bits, Read_arr_sz );
54 inline uint64_t * str2bitsarr ( const char * c_str, int len, uint64_t * b_str, int arr_sz )
56 for ( int k = 0; k < arr_sz; ++k )
58 b_str[k] = 0;
61 int arr_sz_needed = len / 32 + 1;
62 int rem = len % 32;
64 if ( rem == 0 )
65 {arr_sz_needed--;}
67 int beg_arr_idx = arr_sz - arr_sz_needed;
69 if ( rem == 0 && arr_sz_needed > 0 )
70 {rem = 32;}
72 for ( int k = 0; k < len; k++ )
74 if ( rem == 0 )
75 {beg_arr_idx++; rem = 32;}
77 switch ( c_str[k] )
79 case ( 'A' ) :
80 case ( 'a' ) :
81 case ( '0' ) :
82 b_str[beg_arr_idx] <<= 2;
83 rem--;
84 break;
85 case ( 'C' ) :
86 case ( 'c' ) :
87 case ( '1' ) :
88 b_str[beg_arr_idx] <<= 2;
89 ++b_str[beg_arr_idx];
90 rem--;
91 break;
92 case 'G':
93 case 'g':
94 case '2':
95 b_str[beg_arr_idx] <<= 1;
96 ++b_str[beg_arr_idx];
97 b_str[beg_arr_idx] <<= 1;
98 rem--;
99 break;
100 case 'T':
101 case 't':
102 case '3':
103 b_str[beg_arr_idx] <<= 1;
104 ++b_str[beg_arr_idx];
105 b_str[beg_arr_idx] <<= 1;
106 ++b_str[beg_arr_idx];
107 rem--;
108 break;
109 default:
110 return b_str;
114 return b_str;
117 inline char * bitsarr2str ( uint64_t * b_seq, int len, char * c_str, int arr_sz )
119 int tot_bits = arr_sz * 64;
121 for ( int i = 0; i < len; ++i )
123 uint64_t temp, temp2[100];
125 for ( int k = 0; k < arr_sz; ++k )
127 temp2[k] = b_seq[k];
130 L_shift_NC ( temp2, tot_bits - ( len - i ) * 2, arr_sz );
131 R_shift_NC ( temp2, tot_bits - 2, arr_sz );
132 temp = temp2[arr_sz - 1];
134 switch ( temp )
136 case 0:
137 c_str[i] = 'A';
138 break;
139 case 1:
140 c_str[i] = 'C';
141 break;
142 case 2:
143 c_str[i] = 'G';
144 break;
145 case 3:
146 c_str[i] = 'T';
147 break;
151 c_str[len] = '\0';
152 return c_str;
157 inline void get_sub_arr ( uint64_t * bitsarr_in, int bitsarr_len, int begin_pos, int sub_sz, uint64_t * bitsarr_out )
159 if ( bitsarr_len < sub_sz )
160 {cout << "Error! Input kmer too short." << bitsarr_len << " " << sub_sz << endl; return;}
162 int arr_sz_in = bitsarr_len / 32 + 1;
163 int rem = bitsarr_len % 32;
165 if ( rem == 0 )
166 {arr_sz_in--;}
168 int arr_sz_out = sub_sz / 32 + 1;
170 if ( sub_sz % 32 == 0 )
171 {arr_sz_out--;}
173 uint64_t temp_arr[10];
174 memset ( temp_arr, 0, sizeof ( temp_arr ) );
175 memset ( bitsarr_out, 0, sizeof ( uint64_t ) *arr_sz_out );
176 int rem2 = ( 32 - rem + begin_pos ) % 32;
177 int block_beg = ( 32 - rem + begin_pos ) / 32;
179 if ( rem == 0 )
180 {block_beg--;}
182 int rem3 = ( 32 - rem + begin_pos + sub_sz ) % 32;
183 int block_end = ( 32 - rem + begin_pos + sub_sz ) / 32;
185 if ( rem3 != 0 )
186 {rem3 = 32 - rem3;}
187 else
189 block_end--;
192 if ( rem == 0 )
193 {block_end--;}
195 int orig_sz = ( block_end - block_beg + 1 );
196 memcpy ( temp_arr, &bitsarr_in[block_beg], orig_sz * sizeof ( uint64_t ) );
197 L_shift_NC ( temp_arr, rem2 * 2, orig_sz );
198 R_shift_NC ( temp_arr, ( rem2 + rem3 ) % 32 * 2, arr_sz_out );
199 memcpy ( bitsarr_out, temp_arr, arr_sz_out * sizeof ( uint64_t ) );
202 inline void L_shift_NC ( uint64_t * bitsarr, int shift_sz, int arr_sz )
204 uint64_t temp_arr[100];
206 for ( int i = 0; i < arr_sz; ++i )
208 temp_arr[i] = 0;
211 int jmp = shift_sz / 64;
212 int offset = shift_sz % 64;
214 for ( int i = 0; i < arr_sz; ++i )
216 if ( i + jmp + 1 < arr_sz )
218 uint64_t tt = 0;
220 if ( offset == 0 )
222 tt = 0;
224 else
226 tt = ( bitsarr[i + jmp + 1] >> ( 64 - offset ) );
229 temp_arr[i] = ( ( bitsarr[i + jmp] << offset ) | tt );
232 if ( i + jmp + 1 == arr_sz )
233 {temp_arr[i] = bitsarr[i + jmp] << offset;}
235 if ( i + jmp + 1 > arr_sz )
236 {temp_arr[i] = 0;}
239 for ( int i = 0; i < arr_sz; ++i )
241 bitsarr[i] = temp_arr[i];
247 inline void R_shift_NC ( uint64_t * bitsarr, int shift_sz, int arr_sz )
249 uint64_t temp_arr[100];
251 for ( int i = 0; i < arr_sz; ++i )
253 temp_arr[i] = 0;
256 int jmp = shift_sz / 64;
257 int offset = shift_sz % 64;
259 if ( offset == 0 ) //to fix the move 64bit bug
261 for ( int i = arr_sz - 1; i >= 0; --i )
263 if ( i - jmp > 0 )
264 {temp_arr[i] = bitsarr[i - jmp];}
266 if ( i - jmp == 0 )
267 {temp_arr[i] = bitsarr[i - jmp];}
269 if ( i - jmp < 0 )
270 {temp_arr[i] = 0;}
273 else
275 for ( int i = arr_sz - 1; i >= 0; --i )
277 if ( i - jmp > 0 )
278 {temp_arr[i] = ( bitsarr[i - jmp] >> offset ) | ( bitsarr[i - jmp - 1] << ( 64 - offset ) );}
280 if ( i - jmp == 0 )
281 {temp_arr[i] = ( bitsarr[i - jmp] >> offset );}
283 if ( i - jmp < 0 )
284 {temp_arr[i] = 0;}
288 for ( int i = 0; i < arr_sz; ++i )
290 bitsarr[i] = temp_arr[i];
295 inline int uint64_t_cmp ( uint64_t * A, uint64_t * B, int Kmer_arr_sz )
297 int flag = 0;
299 for ( int jj = 0; jj < Kmer_arr_sz; ++jj )
301 if ( A[jj] > B[jj] )
303 flag = 1;
304 break;
307 if ( A[jj] < B[jj] )
309 flag = -1;
310 break;
313 if ( A[jj] == B[jj] )
315 continue;
319 return flag;
323 //for 63mer
324 inline uint64_t * get_rev_comp_seq_arr ( uint64_t * seq_arr, int seq_size, int arr_sz )
326 if ( seq_size < 32 && arr_sz == 2 )
328 seq_arr[1] = get_rev_comp_seq ( seq_arr[1], seq_size );
330 if ( seq_arr[0] != 0 )
332 fprintf ( stderr, "ERROR: in get_rev_comp_seq_arr \n" );
333 exit ( -1 );
336 return seq_arr;
339 int tot_bits = arr_sz * 64;
341 for ( int i = 0; i < arr_sz; ++i )
343 seq_arr[i] = ~seq_arr[i];
344 seq_arr[i] = ( ( seq_arr[i] & 0x3333333333333333 ) << 2 ) | ( ( seq_arr[i] & 0xCCCCCCCCCCCCCCCC ) >> 2 );
345 seq_arr[i] = ( ( seq_arr[i] & 0x0F0F0F0F0F0F0F0F ) << 4 ) | ( ( seq_arr[i] & 0xF0F0F0F0F0F0F0F0 ) >> 4 );
346 seq_arr[i] = ( ( seq_arr[i] & 0x00FF00FF00FF00FF ) << 8 ) | ( ( seq_arr[i] & 0xFF00FF00FF00FF00 ) >> 8 );
347 seq_arr[i] = ( ( seq_arr[i] & 0x0000FFFF0000FFFF ) << 16 ) | ( ( seq_arr[i] & 0xFFFF0000FFFF0000 ) >> 16 );
348 seq_arr[i] = ( ( seq_arr[i] & 0x00000000FFFFFFFF ) << 32 ) | ( ( seq_arr[i] & 0xFFFFFFFF00000000 ) >> 32 );
351 int j = 0, k = arr_sz - 1;
353 for ( ; j < k; ++j, --k )
355 uint64_t temp;
356 temp = seq_arr[j];
357 seq_arr[j] = seq_arr[k];
358 seq_arr[k] = temp;
361 R_shift_NC ( seq_arr, tot_bits - ( seq_size * 2 ), arr_sz );
362 return seq_arr;
366 inline uint64_t get_rev_comp_seq ( uint64_t seq, int seq_size )
368 seq = ~seq;
369 seq = ( ( seq & 0x3333333333333333 ) << 2 ) | ( ( seq & 0xCCCCCCCCCCCCCCCC ) >> 2 );
370 seq = ( ( seq & 0x0F0F0F0F0F0F0F0F ) << 4 ) | ( ( seq & 0xF0F0F0F0F0F0F0F0 ) >> 4 );
371 seq = ( ( seq & 0x00FF00FF00FF00FF ) << 8 ) | ( ( seq & 0xFF00FF00FF00FF00 ) >> 8 );
372 seq = ( ( seq & 0x0000FFFF0000FFFF ) << 16 ) | ( ( seq & 0xFFFF0000FFFF0000 ) >> 16 );
373 seq = ( ( seq & 0x00000000FFFFFFFF ) << 32 ) | ( ( seq & 0xFFFFFFFF00000000 ) >> 32 );
374 return seq >> ( 64 - ( seq_size * 2 ) );
378 //for 64bit platform
379 inline uint64_t MurmurHash64A ( const void * key, int len, unsigned int seed )
381 const uint64_t m = 0xc6a4a7935bd1e995;
382 const int r = 47;
383 uint64_t h = seed ^ ( len * m );
384 const uint64_t * data = ( const uint64_t * ) key;
385 const uint64_t * end = data + ( len / 8 );
387 while ( data != end )
389 uint64_t k = *data++;
390 k *= m;
391 k ^= k >> r;
392 k *= m;
393 h ^= k;
394 h *= m;
397 const unsigned char * data2 = ( const unsigned char * ) data;
399 switch ( len & 7 )
401 case 7:
402 h ^= uint64_t ( data2[6] ) << 48;
403 case 6:
404 h ^= uint64_t ( data2[5] ) << 40;
405 case 5:
406 h ^= uint64_t ( data2[4] ) << 32;
407 case 4:
408 h ^= uint64_t ( data2[3] ) << 24;
409 case 3:
410 h ^= uint64_t ( data2[2] ) << 16;
411 case 2:
412 h ^= uint64_t ( data2[1] ) << 8;
413 case 1:
414 h ^= uint64_t ( data2[0] );
415 h *= m;
418 h ^= h >> r;
420 h *= m;
422 h ^= h >> r;
424 return h;
428 //for 32bit platform
429 inline uint64_t MurmurHash64B ( const void * key, int len, unsigned int seed )
431 const unsigned int m = 0x5bd1e995;
432 const int r = 24;
433 unsigned int h1 = seed ^ len;
434 unsigned int h2 = 0;
435 const unsigned int * data = ( const unsigned int * ) key;
437 while ( len >= 8 )
439 unsigned int k1 = *data++;
440 k1 *= m;
441 k1 ^= k1 >> r;
442 k1 *= m;
443 h1 *= m;
444 h1 ^= k1;
445 len -= 4;
446 unsigned int k2 = *data++;
447 k2 *= m;
448 k2 ^= k2 >> r;
449 k2 *= m;
450 h2 *= m;
451 h2 ^= k2;
452 len -= 4;
455 if ( len >= 4 )
457 unsigned int k1 = *data++;
458 k1 *= m;
459 k1 ^= k1 >> r;
460 k1 *= m;
461 h1 *= m;
462 h1 ^= k1;
463 len -= 4;
466 switch ( len )
468 case 3:
469 h2 ^= ( ( unsigned char * ) data ) [2] << 16;
470 case 2:
471 h2 ^= ( ( unsigned char * ) data ) [1] << 8;
472 case 1:
473 h2 ^= ( ( unsigned char * ) data ) [0];
474 h2 *= m;
477 h1 ^= h2 >> 18;
479 h1 *= m;
481 h2 ^= h1 >> 22;
483 h2 *= m;
485 h1 ^= h2 >> 17;
487 h1 *= m;
489 h2 ^= h1 >> 19;
491 h2 *= m;
493 uint64_t h = h1;
495 h = ( h << 32 ) | h2;
497 return h;
500 #endif