4 * Copyright (c) 2008-2012 BGI-Shenzhen <soap at genomics dot org dot cn>.
6 * This file is part of SOAPdenovo.
8 * SOAPdenovo is free software: you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation, either version 3 of the License, or
11 * (at your option) any later version.
13 * SOAPdenovo is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
18 * You should have received a copy of the GNU General Public License
19 * along with SOAPdenovo. If not, see <http://www.gnu.org/licenses/>.
30 void PrintKmer ( FILE * fp
, Kmer kmer
)
32 fprintf ( fp
, "%llx %llx %llx %llx", kmer
.high1
, kmer
.low1
, kmer
.high2
, kmer
.low2
);
35 /*************************************************
39 Checks if kmer1 is smaller than kmer2. if no, returns 0.
41 1. kmer1: the first kmer
42 2. kmer2: the second kmer
46 1 if kmer1 is smaller than kmer2.
47 *************************************************/
48 boolean
KmerSmaller ( Kmer kmer1
, Kmer kmer2
)
50 if ( kmer1
.high1
!= kmer2
.high1
)
52 return ( kmer1
.high1
< kmer2
.high1
);
56 if ( kmer1
.low1
!= kmer2
.low1
)
58 return ( kmer1
.low1
< kmer2
.low1
);
62 if ( kmer1
.high2
!= kmer2
.high2
)
64 return ( kmer1
.high2
< kmer2
.high2
);
68 return ( kmer1
.low2
< kmer2
.low2
);
74 /*************************************************
78 Checks if kmer1 is larger than kmer2. if no, returns 0.
80 1. kmer1: the first kmer
81 2. kmer2: the second kmer
85 1 if kmer1 is larger than kmer2.
86 *************************************************/
87 boolean
KmerLarger ( Kmer kmer1
, Kmer kmer2
)
89 if ( kmer1
.high1
!= kmer2
.high1
)
91 return ( kmer1
.high1
> kmer2
.high1
);
95 if ( kmer1
.low1
!= kmer2
.low1
)
97 return ( kmer1
.low1
> kmer2
.low1
);
101 if ( kmer1
.high2
!= kmer2
.high2
)
103 return ( kmer1
.high2
> kmer2
.high2
);
107 return ( kmer1
.low2
> kmer2
.low2
);
113 /*************************************************
117 Checks if kmer1 is same as kmer2. if no, returns 0.
119 1. kmer1: the first kmer
120 2. kmer2: the second kmer
124 1 if if kmer1 is same as kmer2.
125 *************************************************/
126 boolean
KmerEqual ( Kmer kmer1
, Kmer kmer2
)
128 if ( kmer1
.low2
!= kmer2
.low2
|| kmer1
.high2
!= kmer2
.high2
|| kmer1
.low1
!= kmer2
.low1
|| kmer1
.high1
!= kmer2
.high1
)
138 /*************************************************
144 1. kmer1: the first kmer
145 2. kmer2: the second kmer
150 *************************************************/
151 Kmer
KmerAnd ( Kmer kmer1
, Kmer kmer2
)
153 kmer1
.high1
&= kmer2
.high1
;
154 kmer1
.low1
&= kmer2
.low1
;
155 kmer1
.high2
&= kmer2
.high2
;
156 kmer1
.low2
&= kmer2
.low2
;
160 /*************************************************
171 *************************************************/
172 Kmer
KmerLeftBitMoveBy2 ( Kmer word
)
174 word
.high1
= ( word
.high1
<< 2 ) | ( word
.low1
>> 62 );
175 word
.low1
= ( word
.low1
<< 2 ) | ( word
.high2
>> 62 );
176 word
.high2
= ( word
.high2
<< 2 ) | ( word
.low2
>> 62 );
181 /*************************************************
192 *************************************************/
193 Kmer
KmerRightBitMoveBy2 ( Kmer word
)
195 word
.low2
= ( word
.low2
>> 2 ) | ( word
.high2
& 0x3 ) << 62;
196 word
.high2
= ( word
.high2
>> 2 ) | ( word
.low1
& 0x3 ) << 62;
197 word
.low1
= ( word
.low1
>> 2 ) | ( word
.high1
& 0x3 ) << 62;
202 /*************************************************
206 Appends a base to kmer to get (k+1)mer.
209 2. ch: the downstream
214 *************************************************/
215 Kmer
KmerPlus ( Kmer prev
, char ch
)
217 Kmer word
= KmerLeftBitMoveBy2 ( prev
);
222 /*************************************************
226 Combines the last (k-1) bases of a kmer with another base to get new kmer.
229 2. ch: the appened base
234 *************************************************/
235 Kmer
nextKmer ( Kmer prev
, char ch
)
237 Kmer word
= KmerLeftBitMoveBy2 ( prev
);
238 word
= KmerAnd ( word
, WORDFILTER
);
243 /*************************************************
247 Combines a base with the first (k-1) bases of a kmer to get new kmer.
250 2. ch: the first base of new kmer
255 *************************************************/
256 Kmer
prevKmer ( Kmer next
, char ch
)
258 Kmer word
= KmerRightBitMoveBy2 ( next
);
260 switch ( overlaplen
)
263 word
.low2
|= ( ( ( ubyte8
) ch
) << 2 * ( overlaplen
- 1 ) );
266 word
.high2
|= ( ( ubyte8
) ch
) << ( 2 * ( overlaplen
- 1 ) - 64 );
269 word
.low1
|= ( ( ubyte8
) ch
) << ( 2 * ( overlaplen
- 1 ) - 128 );
272 word
.high1
|= ( ( ubyte8
) ch
) << ( 2 * ( overlaplen
- 1 ) - 192 );
279 /*************************************************
283 Gets the last char of a kmer, and returns it.
289 The last char of kmer.
290 *************************************************/
291 char lastCharInKmer ( Kmer kmer
)
293 return ( char ) ( kmer
.low2
& 0x3 );
296 /*************************************************
300 Gets the first char of a kmer, and returns it.
306 The first char of kmer.
307 *************************************************/
308 char firstCharInKmer ( Kmer kmer
)
310 switch ( overlaplen
)
313 kmer
.low2
>>= 2 * ( overlaplen
- 1 );
314 return kmer
.low2
; // & 3;
316 kmer
.high2
>>= 2 * ( overlaplen
- 1 ) - 64;
317 return kmer
.high2
; // & 3;
319 kmer
.low1
>>= 2 * ( overlaplen
- 1 ) - 128;
322 kmer
.high1
>>= 2 * ( overlaplen
- 1 ) - 192;
327 /*************************************************
331 Creates filter for masking.
333 1. overlaplen: the kmer value
338 *************************************************/
339 Kmer
createFilter ( int overlaplen
)
342 word
.high1
= word
.low1
= word
.high2
= word
.low2
= 0;
344 switch ( overlaplen
)
347 word
.low2
= ( ( ( ubyte8
) 1 ) << ( 2 * overlaplen
) ) - 1;
350 word
.low2
= ~word
.low2
;
351 word
.high2
= ( ( ( ubyte8
) 1 ) << ( 2 * overlaplen
- 64 ) ) - 1;
354 word
.high2
= word
.low2
= ~word
.low2
;
355 word
.low1
= ( ( ( ubyte8
) 1 ) << ( 2 * overlaplen
- 128 ) ) - 1;
358 word
.low1
= word
.high2
= word
.low2
= ~word
.low2
;
359 word
.high1
= ( ( ( ubyte8
) 1 ) << ( 2 * overlaplen
- 192 ) ) - 1;
366 /*************************************************
370 Makes right shifts to a kmer by specified bits and returns the new kmer.
373 2. dis: the shifted bits
378 *************************************************/
379 Kmer
KmerRightBitMove ( Kmer word
, int dis
)
386 mask
= ( ( ( ubyte8
) 1 ) << dis
) - 1;
387 word
.low2
= ( word
.low2
>> dis
) | ( word
.high2
& mask
) << ( 64 - dis
);
388 word
.high2
= ( word
.high2
>> dis
) | ( word
.low1
& mask
) << ( 64 - dis
);
389 word
.low1
= ( word
.low1
>> dis
) | ( word
.high1
& mask
) << ( 64 - dis
);
393 mask
= ( ( ( ubyte8
) 1 ) << ( dis
- 64 ) ) - 1;
394 word
.low2
= word
.high2
>> ( dis
- 64 ) | ( word
.low1
& mask
) << ( 128 - dis
);
395 word
.high2
= word
.low1
>> ( dis
- 64 ) | ( word
.high1
& mask
) << ( 128 - dis
);
396 word
.low1
= word
.high1
>> ( dis
- 64 );
400 mask
= ( ( ( ubyte8
) 1 ) << ( dis
- 128 ) ) - 1;
401 word
.low2
= word
.low1
>> ( dis
- 128 ) | ( word
.high1
& mask
) << ( 192 - dis
);
402 word
.high2
= word
.high1
>> ( dis
- 128 );
403 word
.high1
= word
.low1
= 0;
406 word
.low2
= word
.high1
>> ( dis
- 192 );
407 word
.high1
= word
.low1
= word
.high2
= 0;
412 /*************************************************
419 2. kmer: the output kmer
424 *************************************************/
425 void printKmerSeq ( FILE * fp
, Kmer kmer
)
427 int i
, bit1
, bit2
, bit3
, bit4
;
428 bit4
= bit3
= bit2
= bit1
= 0;
431 switch ( overlaplen
)
438 bit3
= overlaplen
- 32;
442 bit2
= overlaplen
- 64;
445 bit4
= bit3
= bit2
= 32;
446 bit1
= overlaplen
- 96;
450 for ( i
= bit1
- 1; i
>= 0; i
-- )
452 kmerSeq
[i
] = kmer
.high1
& 0x3;
456 for ( i
= bit2
- 1; i
>= 0; i
-- )
458 kmerSeq
[i
+ bit1
] = kmer
.low1
& 0x3;
462 for ( i
= bit3
- 1; i
>= 0; i
-- )
464 kmerSeq
[i
+ bit1
+ bit2
] = kmer
.high2
& 0x3;
468 for ( i
= bit4
- 1; i
>= 0; i
-- )
470 kmerSeq
[i
+ bit1
+ bit2
+ bit3
] = kmer
.low2
& 0x3;
474 for ( i
= 0; i
< overlaplen
; i
++ )
476 fprintf ( fp
, "%c", int2base ( ( int ) kmerSeq
[i
] ) );
480 void print_kmer ( FILE * fp
, Kmer kmer
, char c
)
482 fprintf ( fp
, "%llx %llx %llx %llx", kmer
.high1
, kmer
.low1
, kmer
.high2
, kmer
.low2
);
483 fprintf ( fp
, "%c", c
);
486 void print_kmer_gz ( gzFile
* fp
, Kmer kmer
, char c
)
488 gzprintf ( fp
, "%llx %llx %llx %llx", kmer
.high1
, kmer
.low1
, kmer
.high2
, kmer
.low2
);
489 gzprintf ( fp
, "%c", c
);
492 static const ubyte2 BitReverseTable
[65536] =
494 # define R2(n) n, n + 1*16384, n + 2*16384, n + 3*16384
495 # define R4(n) R2(n), R2(n + 1*4096), R2(n + 2*4096), R2(n + 3*4096)
496 # define R6(n) R4(n), R4(n + 1*1024), R4(n + 2*1024), R4(n + 3*1024)
497 # define R8(n) R6(n), R6(n + 1*256 ), R6(n + 2*256 ), R6(n + 3*256 )
498 # define R10(n) R8(n), R8(n + 1*64 ), R8(n + 2*64 ), R8(n + 3*64 )
499 # define R12(n) R10(n),R10(n + 1*16), R10(n + 2*16 ), R10(n + 3*16 )
500 # define R14(n) R12(n),R12(n + 1*4 ), R12(n + 2*4 ), R12(n + 3*4 )
501 R14 ( 0 ), R14 ( 1 ), R14 ( 2 ), R14 ( 3 )
504 /*************************************************
508 Gets the reverse complement of a kmer.
511 2. seq_size: the length of the kmer
515 The reverse complement of kmer.
516 *************************************************/
517 static Kmer
fastReverseComp ( Kmer seq
, char seq_size
)
519 seq
.low2
^= 0xAAAAAAAAAAAAAAAALLU
;
520 seq
.low2
= ( ( ubyte8
) BitReverseTable
[seq
.low2
& 0xffff] << 48 ) |
521 ( ( ubyte8
) BitReverseTable
[ ( seq
.low2
>> 16 ) & 0xffff] << 32 ) | ( ( ubyte8
) BitReverseTable
[ ( seq
.low2
>> 32 ) & 0xffff] << 16 ) | ( ( ubyte8
) BitReverseTable
[ ( seq
.low2
>> 48 ) & 0xffff] );
525 seq
.low2
>>= ( 64 - ( seq_size
<< 1 ) );
529 seq
.high2
^= 0xAAAAAAAAAAAAAAAALLU
;
530 seq
.high2
= ( ( ubyte8
) BitReverseTable
[seq
.high2
& 0xffff] << 48 ) |
531 ( ( ubyte8
) BitReverseTable
[ ( seq
.high2
>> 16 ) & 0xffff] << 32 ) | ( ( ubyte8
) BitReverseTable
[ ( seq
.high2
>> 32 ) & 0xffff] << 16 ) | ( ( ubyte8
) BitReverseTable
[ ( seq
.high2
>> 48 ) & 0xffff] );
535 seq
.high2
= seq
.high2
^ seq
.low2
;
536 seq
.low2
= seq
.high2
^ seq
.low2
;
537 seq
.high2
= seq
.high2
^ seq
.low2
;
538 seq
= KmerRightBitMove ( seq
, 128 - ( seq_size
<< 1 ) );
542 seq
.low1
^= 0xAAAAAAAAAAAAAAAALLU
;
543 seq
.low1
= ( ( ubyte8
) BitReverseTable
[seq
.low1
& 0xffff] << 48 ) |
544 ( ( ubyte8
) BitReverseTable
[ ( seq
.low1
>> 16 ) & 0xffff] << 32 ) | ( ( ubyte8
) BitReverseTable
[ ( seq
.low1
>> 32 ) & 0xffff] << 16 ) | ( ( ubyte8
) BitReverseTable
[ ( seq
.low1
>> 48 ) & 0xffff] );
548 seq
.low1
= seq
.low1
^ seq
.low2
;
549 seq
.low2
= seq
.low1
^ seq
.low2
;
550 seq
.low1
= seq
.low1
^ seq
.low2
;
551 seq
= KmerRightBitMove ( seq
, 192 - ( seq_size
<< 1 ) );
555 seq
.high1
^= 0xAAAAAAAAAAAAAAAALLU
;
556 seq
.high1
= ( ( ubyte8
) BitReverseTable
[seq
.high1
& 0xffff] << 48 ) |
557 ( ( ubyte8
) BitReverseTable
[ ( seq
.high1
>> 16 ) & 0xffff] << 32 ) | ( ( ubyte8
) BitReverseTable
[ ( seq
.high1
>> 32 ) & 0xffff] << 16 ) | ( ( ubyte8
) BitReverseTable
[ ( seq
.high1
>> 48 ) & 0xffff] );
558 seq
.low1
= seq
.low1
^ seq
.high2
;
559 seq
.high2
= seq
.low1
^ seq
.high2
;
560 seq
.low1
= seq
.low1
^ seq
.high2
;
561 seq
.low2
= seq
.low2
^ seq
.high1
;
562 seq
.high1
= seq
.low2
^ seq
.high1
;
563 seq
.low2
= seq
.low2
^ seq
.high1
;
564 seq
= KmerRightBitMove ( seq
, 256 - ( seq_size
<< 1 ) );
568 Kmer
reverseComplementVerbose ( Kmer word
, int overlap
)
570 return fastReverseComp ( word
, overlap
);
573 Kmer
reverseComplement ( Kmer word
, int overlap
)
575 return fastReverseComp ( word
, overlap
);
579 void PrintKmer ( FILE * fp
, Kmer kmer
)
581 fprintf ( fp
, "%llx %llx", kmer
.high
, kmer
.low
);
585 __uint128_t
Kmer2int128 ( Kmer seq
)
593 boolean
KmerSmaller ( Kmer kmer1
, Kmer kmer2
)
595 if ( kmer1
.high
< kmer2
.high
)
599 else if ( kmer1
.high
== kmer2
.high
)
601 if ( kmer1
.low
< kmer2
.low
)
616 boolean
KmerLarger ( Kmer kmer1
, Kmer kmer2
)
618 if ( kmer1
.high
> kmer2
.high
)
622 else if ( kmer1
.high
== kmer2
.high
)
624 if ( kmer1
.low
> kmer2
.low
)
638 boolean
KmerEqual ( Kmer kmer1
, Kmer kmer2
)
640 if ( kmer1
.high
== kmer2
.high
&& kmer1
.low
== kmer2
.low
)
650 Kmer
KmerAnd ( Kmer kmer1
, Kmer kmer2
)
652 kmer1
.high
&= kmer2
.high
;
653 kmer1
.low
&= kmer2
.low
;
657 Kmer
KmerLeftBitMoveBy2 ( Kmer word
)
659 ubyte8 temp
= word
.low
>> 62;
666 Kmer
KmerRightBitMoveBy2 ( Kmer word
)
668 ubyte8 temp
= ( word
.high
& 0x3 ) << 62;
675 Kmer
KmerPlus ( Kmer prev
, char ch
)
677 Kmer word
= KmerLeftBitMoveBy2 ( prev
);
681 Kmer
nextKmer ( Kmer prev
, char ch
)
683 Kmer word
= KmerLeftBitMoveBy2 ( prev
);
684 word
= KmerAnd ( word
, WORDFILTER
);
689 Kmer
prevKmer ( Kmer next
, char ch
)
691 Kmer word
= KmerRightBitMoveBy2 ( next
);
693 if ( 2 * ( overlaplen
- 1 ) < 64 )
695 word
.low
|= ( ( ( ubyte8
) ch
) << 2 * ( overlaplen
- 1 ) );
699 word
.high
|= ( ( ubyte8
) ch
) << ( 2 * ( overlaplen
- 1 ) - 64 );
705 char lastCharInKmer ( Kmer kmer
)
707 return ( char ) ( kmer
.low
& 0x3 );
709 char firstCharInKmer ( Kmer kmer
)
711 if ( 2 * ( overlaplen
- 1 ) < 64 )
713 kmer
.low
>>= 2 * ( overlaplen
- 1 );
714 return kmer
.low
; // & 3;
718 kmer
.high
>>= 2 * ( overlaplen
- 1 ) - 64;
719 return kmer
.high
; // & 3;
723 Kmer
createFilter ( int overlaplen
)
726 word
.high
= word
.low
= 0;
728 if ( 2 * overlaplen
< 64 )
730 word
.low
= ( ( ( ubyte8
) 1 ) << ( 2 * overlaplen
) ) - 1;
734 word
.low
= ~word
.low
;
736 if ( 2 * overlaplen
> 64 )
738 word
.high
= ( ( ( ubyte8
) 1 ) << ( 2 * overlaplen
- 64 ) ) - 1;
745 Kmer
KmerRightBitMove ( Kmer word
, int dis
)
749 ubyte8 mask
= ( ( ( ubyte8
) 1 ) << dis
) - 1;
750 ubyte8 temp
= ( word
.high
& mask
) << ( 64 - dis
);
757 word
.high
>>= ( dis
- 64 );
758 word
.low
= word
.high
;
764 void printKmerSeq ( FILE * fp
, Kmer kmer
)
769 bit2
= overlaplen
> 32 ? 32 : overlaplen
;
770 bit1
= overlaplen
> 32 ? overlaplen
- 32 : 0;
772 for ( i
= bit1
- 1; i
>= 0; i
-- )
774 ch
= kmer
.high
& 0x3;
779 for ( i
= bit2
- 1; i
>= 0; i
-- )
783 kmerSeq
[i
+ bit1
] = ch
;
786 for ( i
= 0; i
< overlaplen
; i
++ )
788 fprintf ( fp
, "%c", int2base ( ( int ) kmerSeq
[i
] ) );
792 void print_kmer ( FILE * fp
, Kmer kmer
, char c
)
794 fprintf ( fp
, "%llx %llx", kmer
.high
, kmer
.low
);
795 fprintf ( fp
, "%c", c
);
798 void print_kmer_gz ( gzFile
* fp
, Kmer kmer
, char c
)
800 gzprintf ( fp
, "%llx %llx", kmer
.high
, kmer
.low
);
801 gzprintf ( fp
, "%c", c
);
804 static Kmer
fastReverseComp ( Kmer seq
, char seq_size
)
806 seq
.low
^= 0xAAAAAAAAAAAAAAAALLU
;
807 seq
.low
= ( ( seq
.low
& 0x3333333333333333LLU
) << 2 ) | ( ( seq
.low
& 0xCCCCCCCCCCCCCCCCLLU
) >> 2 );
808 seq
.low
= ( ( seq
.low
& 0x0F0F0F0F0F0F0F0FLLU
) << 4 ) | ( ( seq
.low
& 0xF0F0F0F0F0F0F0F0LLU
) >> 4 );
809 seq
.low
= ( ( seq
.low
& 0x00FF00FF00FF00FFLLU
) << 8 ) | ( ( seq
.low
& 0xFF00FF00FF00FF00LLU
) >> 8 );
810 seq
.low
= ( ( seq
.low
& 0x0000FFFF0000FFFFLLU
) << 16 ) | ( ( seq
.low
& 0xFFFF0000FFFF0000LLU
) >> 16 );
811 seq
.low
= ( ( seq
.low
& 0x00000000FFFFFFFFLLU
) << 32 ) | ( ( seq
.low
& 0xFFFFFFFF00000000LLU
) >> 32 );
815 seq
.low
>>= ( 64 - ( seq_size
<< 1 ) );
819 seq
.high
^= 0xAAAAAAAAAAAAAAAALLU
;
820 seq
.high
= ( ( seq
.high
& 0x3333333333333333LLU
) << 2 ) | ( ( seq
.high
& 0xCCCCCCCCCCCCCCCCLLU
) >> 2 );
821 seq
.high
= ( ( seq
.high
& 0x0F0F0F0F0F0F0F0FLLU
) << 4 ) | ( ( seq
.high
& 0xF0F0F0F0F0F0F0F0LLU
) >> 4 );
822 seq
.high
= ( ( seq
.high
& 0x00FF00FF00FF00FFLLU
) << 8 ) | ( ( seq
.high
& 0xFF00FF00FF00FF00LLU
) >> 8 );
823 seq
.high
= ( ( seq
.high
& 0x0000FFFF0000FFFFLLU
) << 16 ) | ( ( seq
.high
& 0xFFFF0000FFFF0000LLU
) >> 16 );
824 seq
.high
= ( ( seq
.high
& 0x00000000FFFFFFFFLLU
) << 32 ) | ( ( seq
.high
& 0xFFFFFFFF00000000LLU
) >> 32 );
825 ubyte8 temp
= seq
.high
;
828 seq
= KmerRightBitMove ( seq
, 128 - ( seq_size
<< 1 ) );
832 Kmer
reverseComplementVerbose ( Kmer word
, int overlap
)
834 return fastReverseComp ( word
, overlap
);
837 Kmer
reverseComplement ( Kmer word
, int overlap
)
839 return fastReverseComp ( word
, overlap
);