4 * Copyright (c) 2008-2012 BGI-Shenzhen <soap at genomics dot org dot cn>.
6 * This file is part of SOAPdenovo.
8 * SOAPdenovo is free software: you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation, either version 3 of the License, or
11 * (at your option) any later version.
13 * SOAPdenovo is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
18 * You should have received a copy of the GNU General Public License
19 * along with SOAPdenovo. If not, see <http://www.gnu.org/licenses/>.
23 #ifndef _SPARSE_KMER_H
24 #define _SPARSE_KMER_H
28 inline void initKmerFilter ( int K_size
, kmer_t2
* kmer_filter
)
31 ( kmer_filter
->kmer
) [0] = 0;
32 ( kmer_filter
->kmer
) [1] = 1LU;
33 L_shift_NC ( kmer_filter
->kmer
, K_size
* 2, 2 );
37 ( kmer_filter
->kmer
) [1] -= 1; //
41 ( kmer_filter
->kmer
) [0] -= 1; // K_size = 32 is also ok ..
42 ( kmer_filter
->kmer
) [1] = -1; //fff..
47 memset ( kmer_filter
->kmer
, 0, sizeof ( kmer_t2
) );
48 ( kmer_filter
->kmer
) [3] = 1LU;
49 L_shift_NC ( kmer_filter
->kmer
, K_size
* 2, 4 );
53 ( kmer_filter
->kmer
) [3] -= 1;
55 else if ( K_size
<= 63 )
57 ( kmer_filter
->kmer
) [2] -= 1;
58 ( kmer_filter
->kmer
) [3] = -1;
60 else if ( K_size
<= 95 )
62 ( kmer_filter
->kmer
) [1] -= 1;
63 ( kmer_filter
->kmer
) [2] = -1;
64 ( kmer_filter
->kmer
) [3] = -1;
66 else if ( K_size
<= 127 )
68 ( kmer_filter
->kmer
) [0] -= 1;
69 ( kmer_filter
->kmer
) [1] = -1;
70 ( kmer_filter
->kmer
) [2] = -1;
71 ( kmer_filter
->kmer
) [3] = -1;
77 inline void kmerAnd ( kmer_t2
* k1
, kmer_t2
* k2
) //change k1
80 ( k1
->kmer
) [0] &= ( k2
->kmer
) [0];
81 ( k1
->kmer
) [1] &= ( k2
->kmer
) [1];
84 ( k1
->kmer
) [0] &= ( k2
->kmer
) [0];
85 ( k1
->kmer
) [1] &= ( k2
->kmer
) [1];
86 ( k1
->kmer
) [2] &= ( k2
->kmer
) [2];
87 ( k1
->kmer
) [3] &= ( k2
->kmer
) [3];
91 inline void kmerOr ( kmer_t2
* k1
, kmer_t2
* k2
) //change k1
94 ( k1
->kmer
) [0] |= ( k2
->kmer
) [0];
95 ( k1
->kmer
) [1] |= ( k2
->kmer
) [1];
98 ( k1
->kmer
) [0] |= ( k2
->kmer
) [0];
99 ( k1
->kmer
) [1] |= ( k2
->kmer
) [1];
100 ( k1
->kmer
) [2] |= ( k2
->kmer
) [2];
101 ( k1
->kmer
) [3] |= ( k2
->kmer
) [3];
105 inline void kmerMoveRight ( kmer_t2
* kmer
, int base_num
)
108 R_shift_NC ( kmer
->kmer
, base_num
* 2, 2 );
111 R_shift_NC ( kmer
->kmer
, base_num
* 2, 4 ); // has move 32 bug
116 inline void kmerMoveLeft ( kmer_t2
* kmer
, int base_num
)
119 L_shift_NC ( kmer
->kmer
, base_num
* 2, 2 );
122 L_shift_NC ( kmer
->kmer
, base_num
* 2, 4 );
126 inline int kmerCompare ( kmer_t2
* k1
, kmer_t2
* k2
)
129 return uint64_t_cmp ( k1
->kmer
, k2
->kmer
, 2 );
132 return uint64_t_cmp ( k1
->kmer
, k2
->kmer
, 4 );
136 inline void reverseCompKmer ( kmer_t2
* kmer
, int K_size
) //result stored in *kmer ...
139 get_rev_comp_seq_arr ( kmer
->kmer
, K_size
, 2 );
142 get_rev_comp_seq_arr ( kmer
->kmer
, K_size
, 4 );
146 inline bool isSmallerKmer ( kmer_t2
* kmer
, int K_size
)
150 reverseCompKmer ( &f_kmer
, K_size
);
152 if ( kmerCompare ( kmer
, &f_kmer
) < 0 )
160 inline void get_kmer_from_seq ( const char * seq
, int len
, int K_size
, int pos
, kmer_t2
* kmer
)
162 if ( pos
+ K_size
> len
)
164 fprintf ( stderr
, "ERROR: get_kmer position is invalid!\n" );
168 int start
= pos
, end
= pos
+ K_size
, index
;
169 uint64_t * arr_ptr
= kmer
->kmer
;
170 memset ( arr_ptr
, 0, sizeof ( kmer_t2
) );
171 int arr_sz
= sizeof ( kmer_t2
) / sizeof ( uint64_t );
175 for ( index
= start
, i
= 0; index
< end
; index
++, i
++ )
177 switch ( seq
[index
] )
183 tmp
= ( tmp
<< 2 ) | 1;
186 tmp
= ( tmp
<< 2 ) | 2;
189 tmp
= ( tmp
<< 2 ) | 3;
192 tmp
= ( tmp
<< 2 ) | 2; //treat N as G
195 tmp
= ( tmp
<< 2 ) | 2; // treat unknown char as G, 'S'
196 fprintf ( stderr
, "WARNING: get_kmer_from_seq process unknown char %c\n", seq
[index
] );
200 if ( ( i
+ 1 ) % 32 == 0 ) //tmp is full, tmp has stored 32 bases
202 arr_ptr
[i
/ 32] = tmp
;
209 fprintf ( stderr
, "ERROR: i %d is K_size \n", i
);
214 arr_ptr
[arr_sz
- 1] = tmp
;
216 else //if(K_size%32 != 0){ //absolute ..because K is odd
218 int left_bits
= ( 32 - K_size
% 32 ) * 2;
219 tmp
= tmp
<< left_bits
;
220 arr_ptr
[K_size
/ 32] = tmp
;
221 kmerMoveRight ( kmer
, 32 * arr_sz
- K_size
);
226 uint64_t high=0,low=0;
227 int high_start=0,high_end=0;
228 int low_satrt=0,low_end=0;
232 high_end = pos+K_size-32;
234 low_satrt = high_end;
235 low_end = low_satrt + 32;
237 high_start = high_end = pos;
239 low_end = pos+K_size;
243 for(int i=high_start;i<high_end;++i){ //dif from soapdenovo
250 high = (high << 2)|1;
253 high = (high << 2)|2;
256 high = (high << 2)|3;
259 high = (high << 2)|2;//treat N as G as same as soapdenovo
260 //debug_build<<"N occured at "<<i<<endl;
263 printf("error in process unknown char %c\n",seq[i]);
270 for(int i=low_satrt;i<low_end;++i){ //dif from soapdenovo
286 low = (low << 2)|2;//treat N as G as same as soapdenovo
287 //debug_build<<"N occured at "<<i<<endl;
290 printf("error in process unknown char %c\n",seq[i]);
301 inline void printKmer ( const kmer_t2
* kmer
, FILE * fp
)
304 fprintf ( fp
, "%llx %llx ,\n", ( kmer
->kmer
) [0], ( kmer
->kmer
) [1] );
307 fprintf ( fp
, "%llx %llx %llx %llx,\n", ( kmer
->kmer
) [0], ( kmer
->kmer
) [1], ( kmer
->kmer
) [2], ( kmer
->kmer
) [3] );
311 inline void printKmerSeq ( kmer_t2
* kmer
, int K_size
, FILE * fp
) //TODO printf ATCG
314 bitsarr2str ( kmer
->kmer
, K_size
, str
, sizeof ( kmer_t2
) / sizeof ( uint64_t ) );
315 fprintf ( fp
, "%s ,\n", str
);