 * This file is part of SOAPdenovo.
 * Part of this file is adapted and modified from SparseAssembler
 * (See <http://sourceforge.net/projects/sparseassembler/>).
18 extern inline void Init_Read ( string
& seq
, struct read_t
& read
);
20 extern inline uint64_t * str2bitsarr ( const char * c_str
, int len
, uint64_t * b_str
, int arr_sz
);
22 extern inline char * bitsarr2str ( uint64_t * b_seq
, int len
, char * c_str
, int arr_sz
);
24 extern inline void get_sub_arr ( uint64_t * bitsarr_in
, int bitsarr_len
, int begin_pos
, int sub_sz
, uint64_t * bitsarr_out
);
26 extern inline void L_shift_NC ( uint64_t * bitsarr
, int shift_sz
, int arr_sz
);
28 extern inline void R_shift_NC ( uint64_t * bitsarr
, int shift_sz
, int arr_sz
);
30 extern inline int uint64_t_cmp ( uint64_t * A
, uint64_t * B
, int Kmer_arr_sz
);
32 extern inline uint64_t * get_rev_comp_seq_arr ( uint64_t * seq_arr
, int seq_size
, int arr_sz
);
34 extern inline uint64_t get_rev_comp_seq ( uint64_t seq
, int seq_size
);
36 extern inline uint64_t MurmurHash64A ( const void * key
, int len
, unsigned int seed
);
41 inline void Init_Read ( string
& seq
, struct read_t
& read
)
43 read
.readLen
= ( int ) seq
.size();
44 int Read_arr_sz
= read
.readLen
/ 32 + 1;
45 int rem
= read
.readLen
% 32;
50 str2bitsarr ( seq
.c_str(), ( int ) seq
.size(), read
.read_bits
, Read_arr_sz
);
// Pack the first `len` bases of the ASCII string c_str into b_str as 2-bit
// codes (A=00, C=01, G=10, T=11), right-aligned in the arr_sz-word array
// (word 0 most significant, bases appended into the low end of each word).
// Returns b_str.
inline uint64_t * str2bitsarr ( const char * c_str, int len, uint64_t * b_str, int arr_sz )
{
    for ( int k = 0; k < arr_sz; ++k )
    { b_str[k] = 0; }
    // Words actually used, and number of bases in the first (partial) word.
    int arr_sz_needed = len / 32 + 1;
    int rem = len % 32;
    if ( rem == 0 )
    { arr_sz_needed--; }
    int beg_arr_idx = arr_sz - arr_sz_needed;
    // A multiple-of-32 length means the first used word is completely full.
    if ( rem == 0 && arr_sz_needed > 0 )
    { rem = 32; }
    for ( int k = 0; k < len; k++ )
    {
        if ( rem == 0 )
        {beg_arr_idx++; rem = 32;}
        // Append one base: shift the current word left two bits, OR in the code.
        switch ( c_str[k] )
        {
            case 'A':
                b_str[beg_arr_idx] <<= 2;                           // 00
                break;
            case 'C':
                b_str[beg_arr_idx] <<= 2;                           // 01
                ++b_str[beg_arr_idx];
                break;
            case 'G':
                b_str[beg_arr_idx] <<= 1;                           // 10
                ++b_str[beg_arr_idx];
                b_str[beg_arr_idx] <<= 1;
                break;
            case 'T':
                b_str[beg_arr_idx] <<= 1;                           // 11
                ++b_str[beg_arr_idx];
                b_str[beg_arr_idx] <<= 1;
                ++b_str[beg_arr_idx];
                break;
            default:
                // NOTE(review): non-ACGT bases encoded as 'A' to keep word
                // alignment — original default handling not visible, confirm.
                b_str[beg_arr_idx] <<= 2;
                break;
        }
        rem--;
    }
    return b_str;
}
117 inline char * bitsarr2str ( uint64_t * b_seq
, int len
, char * c_str
, int arr_sz
)
119 int tot_bits
= arr_sz
* 64;
121 for ( int i
= 0; i
< len
; ++i
)
123 uint64_t temp
, temp2
[100];
125 for ( int k
= 0; k
< arr_sz
; ++k
)
130 L_shift_NC ( temp2
, tot_bits
- ( len
- i
) * 2, arr_sz
);
131 R_shift_NC ( temp2
, tot_bits
- 2, arr_sz
);
132 temp
= temp2
[arr_sz
- 1];
// Extract a window of sub_sz bases starting at base begin_pos from the packed
// 2-bit array bitsarr_in (bitsarr_len bases, right-aligned) into bitsarr_out.
// NOTE(review): the original-line-number gaps below (164-167, 169, 171-172,
// 178-181, 184-194) show statements were lost in extraction — the function
// braces and the rem==0 / rem3==0 index corrections. The surviving tokens are
// kept byte-identical; do not compile this fragment as-is.
157 inline void get_sub_arr ( uint64_t * bitsarr_in
, int bitsarr_len
, int begin_pos
, int sub_sz
, uint64_t * bitsarr_out
)
// Guard: cannot extract a window longer than the input kmer.
159 if ( bitsarr_len
< sub_sz
)
160 {cout
<< "Error! Input kmer too short." << bitsarr_len
<< " " << sub_sz
<< endl
; return;}
// Word count of the input and bases occupying its first (partial) word.
162 int arr_sz_in
= bitsarr_len
/ 32 + 1;
163 int rem
= bitsarr_len
% 32;
// Word count of the output window.
168 int arr_sz_out
= sub_sz
/ 32 + 1;
170 if ( sub_sz
% 32 == 0 )
// Scratch buffer for the copied word span (bounds the span to 10 words).
173 uint64_t temp_arr
[10];
174 memset ( temp_arr
, 0, sizeof ( temp_arr
) );
175 memset ( bitsarr_out
, 0, sizeof ( uint64_t ) *arr_sz_out
);
// Bit offset and word index of the window's first base within the input.
176 int rem2
= ( 32 - rem
+ begin_pos
) % 32;
177 int block_beg
= ( 32 - rem
+ begin_pos
) / 32;
// Bit offset and word index just past the window's last base.
182 int rem3
= ( 32 - rem
+ begin_pos
+ sub_sz
) % 32;
183 int block_end
= ( 32 - rem
+ begin_pos
+ sub_sz
) / 32;
// Number of input words the window spans.
195 int orig_sz
= ( block_end
- block_beg
+ 1 );
196 memcpy ( temp_arr
, &bitsarr_in
[block_beg
], orig_sz
* sizeof ( uint64_t ) );
// Align the window's first base to the top of the scratch span ...
197 L_shift_NC ( temp_arr
, rem2
* 2, orig_sz
);
// ... then shift it down so the window is right-aligned in arr_sz_out words.
198 R_shift_NC ( temp_arr
, ( rem2
+ rem3
) % 32 * 2, arr_sz_out
);
199 memcpy ( bitsarr_out
, temp_arr
, arr_sz_out
* sizeof ( uint64_t ) );
// Logical left shift of a multi-word bit array by shift_sz bits.
// Word 0 is most significant; vacated low bits become zero.
// NOTE(review): caller must guarantee arr_sz <= 100 (fixed scratch size).
inline void L_shift_NC ( uint64_t * bitsarr, int shift_sz, int arr_sz )
{
    uint64_t temp_arr[100];
    for ( int i = 0; i < arr_sz; ++i )
    { temp_arr[i] = 0; }
    int jmp = shift_sz / 64;    // whole-word part of the shift
    int offset = shift_sz % 64; // intra-word part of the shift
    for ( int i = 0; i < arr_sz; ++i )
    {
        if ( i + jmp + 1 < arr_sz )
        {
            uint64_t tt = 0;
            // offset == 0 guard: ">> (64 - offset)" would shift by the full
            // 64-bit width, which is undefined behavior (same fix R_shift_NC
            // documents as "the move 64bit bug").
            if ( offset != 0 )
            { tt = ( bitsarr[i + jmp + 1] >> ( 64 - offset ) ); }
            temp_arr[i] = ( ( bitsarr[i + jmp] << offset ) | tt );
        }
        if ( i + jmp + 1 == arr_sz )
        {
            // Last source word: no lower word to carry bits in from.
            temp_arr[i] = bitsarr[i + jmp] << offset;
        }
        if ( i + jmp + 1 > arr_sz )
        {
            // Entirely shifted out of range: zero fill.
            temp_arr[i] = 0;
        }
    }
    for ( int i = 0; i < arr_sz; ++i )
    { bitsarr[i] = temp_arr[i]; }
}
// Logical right shift of a multi-word bit array by shift_sz bits.
// Word 0 is most significant; vacated high bits become zero.
// NOTE(review): caller must guarantee arr_sz <= 100 (fixed scratch size).
inline void R_shift_NC ( uint64_t * bitsarr, int shift_sz, int arr_sz )
{
    uint64_t temp_arr[100];
    for ( int i = 0; i < arr_sz; ++i )
    { temp_arr[i] = 0; }
    int jmp = shift_sz / 64;    // whole-word part of the shift
    int offset = shift_sz % 64; // intra-word part of the shift
    if ( offset == 0 ) //to fix the move 64bit bug
    {
        // Pure word move: "<< (64 - offset)" below would shift by 64 (UB).
        for ( int i = arr_sz - 1; i >= 0; --i )
        {
            if ( i - jmp >= 0 )
            {temp_arr[i] = bitsarr[i - jmp];}
        }
    }
    else
    {
        for ( int i = arr_sz - 1; i >= 0; --i )
        {
            if ( i - jmp - 1 >= 0 )
            {
                // Interior word: combine bits from the source word and the
                // next more-significant word.
                temp_arr[i] = ( bitsarr[i - jmp] >> offset ) | ( bitsarr[i - jmp - 1] << ( 64 - offset ) );
            }
            else if ( i - jmp >= 0 )
            {
                // Topmost surviving word: nothing above it to carry in.
                temp_arr[i] = ( bitsarr[i - jmp] >> offset );
            }
            // i - jmp < 0: shifted entirely out of range, stays zero.
        }
    }
    for ( int i = 0; i < arr_sz; ++i )
    { bitsarr[i] = temp_arr[i]; }
}
// Lexicographic comparison of two Kmer_arr_sz-word big-endian numbers.
// Returns 1 if A > B, -1 if A < B, 0 if equal (most-significant word first).
inline int uint64_t_cmp ( uint64_t * A, uint64_t * B, int Kmer_arr_sz )
{
    for ( int jj = 0; jj < Kmer_arr_sz; ++jj )
    {
        if ( A[jj] > B[jj] )
        { return 1; }
        if ( A[jj] < B[jj] )
        { return -1; }
        // A[jj] == B[jj]: keep scanning the less-significant words.
    }
    return 0;
}
324 inline uint64_t * get_rev_comp_seq_arr ( uint64_t * seq_arr
, int seq_size
, int arr_sz
)
326 if ( seq_size
< 32 && arr_sz
== 2 )
328 seq_arr
[1] = get_rev_comp_seq ( seq_arr
[1], seq_size
);
330 if ( seq_arr
[0] != 0 )
332 fprintf ( stderr
, "ERROR: in get_rev_comp_seq_arr \n" );
339 int tot_bits
= arr_sz
* 64;
341 for ( int i
= 0; i
< arr_sz
; ++i
)
343 seq_arr
[i
] = ~seq_arr
[i
];
344 seq_arr
[i
] = ( ( seq_arr
[i
] & 0x3333333333333333 ) << 2 ) | ( ( seq_arr
[i
] & 0xCCCCCCCCCCCCCCCC ) >> 2 );
345 seq_arr
[i
] = ( ( seq_arr
[i
] & 0x0F0F0F0F0F0F0F0F ) << 4 ) | ( ( seq_arr
[i
] & 0xF0F0F0F0F0F0F0F0 ) >> 4 );
346 seq_arr
[i
] = ( ( seq_arr
[i
] & 0x00FF00FF00FF00FF ) << 8 ) | ( ( seq_arr
[i
] & 0xFF00FF00FF00FF00 ) >> 8 );
347 seq_arr
[i
] = ( ( seq_arr
[i
] & 0x0000FFFF0000FFFF ) << 16 ) | ( ( seq_arr
[i
] & 0xFFFF0000FFFF0000 ) >> 16 );
348 seq_arr
[i
] = ( ( seq_arr
[i
] & 0x00000000FFFFFFFF ) << 32 ) | ( ( seq_arr
[i
] & 0xFFFFFFFF00000000 ) >> 32 );
351 int j
= 0, k
= arr_sz
- 1;
353 for ( ; j
< k
; ++j
, --k
)
357 seq_arr
[j
] = seq_arr
[k
];
361 R_shift_NC ( seq_arr
, tot_bits
- ( seq_size
* 2 ), arr_sz
);
// Reverse complement of a single-word packed kmer of seq_size bases
// (2-bit code A=00, C=01, G=10, T=11; kmer right-aligned in the low bits).
// seq_size must be in [1, 32]: seq_size == 0 would shift by 64 (UB).
inline uint64_t get_rev_comp_seq ( uint64_t seq, int seq_size )
{
    // Complement: with this code A<->T and C<->G is exactly bitwise NOT.
    // (Restored — the extraction dropped this line; without it the function
    // would only reverse, not reverse-complement.)
    seq = ~seq;
    // Reverse the order of the 32 2-bit bases with a log-step swap network.
    seq = ( ( seq & 0x3333333333333333 ) << 2 ) | ( ( seq & 0xCCCCCCCCCCCCCCCC ) >> 2 );
    seq = ( ( seq & 0x0F0F0F0F0F0F0F0F ) << 4 ) | ( ( seq & 0xF0F0F0F0F0F0F0F0 ) >> 4 );
    seq = ( ( seq & 0x00FF00FF00FF00FF ) << 8 ) | ( ( seq & 0xFF00FF00FF00FF00 ) >> 8 );
    seq = ( ( seq & 0x0000FFFF0000FFFF ) << 16 ) | ( ( seq & 0xFFFF0000FFFF0000 ) >> 16 );
    seq = ( ( seq & 0x00000000FFFFFFFF ) << 32 ) | ( ( seq & 0xFFFFFFFF00000000 ) >> 32 );
    // The reversed kmer now occupies the high bits; shift it back down.
    return seq >> ( 64 - ( seq_size * 2 ) );
}
// MurmurHash2, 64-bit version for 64-bit platforms, by Austin Appleby
// (public domain). Hashes `len` bytes of `key` with the given seed.
// NOTE(review): reads the key through uint64_t* — key should be 8-byte
// aligned on strict-alignment platforms.
inline uint64_t MurmurHash64A ( const void * key, int len, unsigned int seed )
{
    const uint64_t m = 0xc6a4a7935bd1e995;
    const int r = 47;
    uint64_t h = seed ^ ( len * m );
    const uint64_t * data = ( const uint64_t * ) key;
    const uint64_t * end = data + ( len / 8 );
    // Mix eight bytes at a time.
    while ( data != end )
    {
        uint64_t k = *data++;
        k *= m;
        k ^= k >> r;
        k *= m;
        h ^= k;
        h *= m;
    }
    // Fold in the remaining 0-7 bytes; cases deliberately fall through.
    const unsigned char * data2 = ( const unsigned char * ) data;
    switch ( len & 7 )
    {
        case 7: h ^= ( uint64_t ) ( data2[6] ) << 48; /* fallthrough */
        case 6: h ^= ( uint64_t ) ( data2[5] ) << 40; /* fallthrough */
        case 5: h ^= ( uint64_t ) ( data2[4] ) << 32; /* fallthrough */
        case 4: h ^= ( uint64_t ) ( data2[3] ) << 24; /* fallthrough */
        case 3: h ^= ( uint64_t ) ( data2[2] ) << 16; /* fallthrough */
        case 2: h ^= ( uint64_t ) ( data2[1] ) << 8;  /* fallthrough */
        case 1: h ^= ( uint64_t ) ( data2[0] );
            h *= m;
    };
    // Final avalanche.
    h ^= h >> r;
    h *= m;
    h ^= h >> r;
    return h;
}
429 inline uint64_t MurmurHash64B ( const void * key
, int len
, unsigned int seed
)
431 const unsigned int m
= 0x5bd1e995;
433 unsigned int h1
= seed
^ len
;
435 const unsigned int * data
= ( const unsigned int * ) key
;
439 unsigned int k1
= *data
++;
446 unsigned int k2
= *data
++;
457 unsigned int k1
= *data
++;
469 h2
^= ( ( unsigned char * ) data
) [2] << 16;
471 h2
^= ( ( unsigned char * ) data
) [1] << 8;
473 h2
^= ( ( unsigned char * ) data
) [0];
495 h
= ( h
<< 32 ) | h2
;