modified: nfig1.py
[GalaxyCodeBases.git] / BGI / SOAPdenovo2 / sparsePregraph / inc / sparse_kmer.h
blobd013dedf64ece79af36ce0f0d495498f9c1d518b
1 /*
2 * inc/sparse_kmer.h
4 * Copyright (c) 2008-2012 BGI-Shenzhen <soap at genomics dot org dot cn>.
6 * This file is part of SOAPdenovo.
8 * SOAPdenovo is free software: you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation, either version 3 of the License, or
11 * (at your option) any later version.
13 * SOAPdenovo is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
18 * You should have received a copy of the GNU General Public License
19 * along with SOAPdenovo. If not, see <http://www.gnu.org/licenses/>.
23 #ifndef _SPARSE_KMER_H
24 #define _SPARSE_KMER_H
26 #include "seq_util.h"
28 inline void initKmerFilter ( int K_size, kmer_t2 * kmer_filter )
30 #ifdef _63MER_
31 ( kmer_filter->kmer ) [0] = 0;
32 ( kmer_filter->kmer ) [1] = 1LU;
33 L_shift_NC ( kmer_filter->kmer, K_size * 2, 2 );
35 if ( K_size <= 31 )
37 ( kmer_filter->kmer ) [1] -= 1; //
39 else
41 ( kmer_filter->kmer ) [0] -= 1; // K_size = 32 is also ok ..
42 ( kmer_filter->kmer ) [1] = -1; //fff..
45 #endif
46 #ifdef _127MER_
47 memset ( kmer_filter->kmer, 0, sizeof ( kmer_t2 ) );
48 ( kmer_filter->kmer ) [3] = 1LU;
49 L_shift_NC ( kmer_filter->kmer, K_size * 2, 4 );
51 if ( K_size <= 31 )
53 ( kmer_filter->kmer ) [3] -= 1;
55 else if ( K_size <= 63 )
57 ( kmer_filter->kmer ) [2] -= 1;
58 ( kmer_filter->kmer ) [3] = -1;
60 else if ( K_size <= 95 )
62 ( kmer_filter->kmer ) [1] -= 1;
63 ( kmer_filter->kmer ) [2] = -1;
64 ( kmer_filter->kmer ) [3] = -1;
66 else if ( K_size <= 127 )
68 ( kmer_filter->kmer ) [0] -= 1;
69 ( kmer_filter->kmer ) [1] = -1;
70 ( kmer_filter->kmer ) [2] = -1;
71 ( kmer_filter->kmer ) [3] = -1;
74 #endif
77 inline void kmerAnd ( kmer_t2 * k1, kmer_t2 * k2 ) //change k1
79 #ifdef _63MER_
80 ( k1->kmer ) [0] &= ( k2->kmer ) [0];
81 ( k1->kmer ) [1] &= ( k2->kmer ) [1];
82 #endif
83 #ifdef _127MER_
84 ( k1->kmer ) [0] &= ( k2->kmer ) [0];
85 ( k1->kmer ) [1] &= ( k2->kmer ) [1];
86 ( k1->kmer ) [2] &= ( k2->kmer ) [2];
87 ( k1->kmer ) [3] &= ( k2->kmer ) [3];
88 #endif
91 inline void kmerOr ( kmer_t2 * k1, kmer_t2 * k2 ) //change k1
93 #ifdef _63MER_
94 ( k1->kmer ) [0] |= ( k2->kmer ) [0];
95 ( k1->kmer ) [1] |= ( k2->kmer ) [1];
96 #endif
97 #ifdef _127MER_
98 ( k1->kmer ) [0] |= ( k2->kmer ) [0];
99 ( k1->kmer ) [1] |= ( k2->kmer ) [1];
100 ( k1->kmer ) [2] |= ( k2->kmer ) [2];
101 ( k1->kmer ) [3] |= ( k2->kmer ) [3];
102 #endif
105 inline void kmerMoveRight ( kmer_t2 * kmer, int base_num )
107 #ifdef _63MER_
108 R_shift_NC ( kmer->kmer, base_num * 2, 2 );
109 #endif
110 #ifdef _127MER_
111 R_shift_NC ( kmer->kmer, base_num * 2, 4 ); // has move 32 bug
112 #endif
116 inline void kmerMoveLeft ( kmer_t2 * kmer, int base_num )
118 #ifdef _63MER_
119 L_shift_NC ( kmer->kmer, base_num * 2, 2 );
120 #endif
121 #ifdef _127MER_
122 L_shift_NC ( kmer->kmer, base_num * 2, 4 );
123 #endif
126 inline int kmerCompare ( kmer_t2 * k1, kmer_t2 * k2 )
128 #ifdef _63MER_
129 return uint64_t_cmp ( k1->kmer, k2->kmer, 2 );
130 #endif
131 #ifdef _127MER_
132 return uint64_t_cmp ( k1->kmer, k2->kmer, 4 );
133 #endif
136 inline void reverseCompKmer ( kmer_t2 * kmer , int K_size ) //result stored in *kmer ...
138 #ifdef _63MER_
139 get_rev_comp_seq_arr ( kmer->kmer, K_size, 2 );
140 #endif
141 #ifdef _127MER_
142 get_rev_comp_seq_arr ( kmer->kmer, K_size, 4 );
143 #endif
146 inline bool isSmallerKmer ( kmer_t2 * kmer, int K_size )
148 kmer_t2 f_kmer;
149 f_kmer = *kmer;
150 reverseCompKmer ( &f_kmer, K_size );
152 if ( kmerCompare ( kmer, &f_kmer ) < 0 )
154 return 1;
157 return 0;
160 inline void get_kmer_from_seq ( const char * seq, int len, int K_size, int pos, kmer_t2 * kmer )
162 if ( pos + K_size > len )
164 fprintf ( stderr, "ERROR: get_kmer position is invalid!\n" );
165 exit ( 1 );
168 int start = pos, end = pos + K_size, index;
169 uint64_t * arr_ptr = kmer->kmer;
170 memset ( arr_ptr, 0, sizeof ( kmer_t2 ) );
171 int arr_sz = sizeof ( kmer_t2 ) / sizeof ( uint64_t );
172 int i = 0;
173 uint64_t tmp = 0;
175 for ( index = start, i = 0; index < end; index++, i++ )
177 switch ( seq[index] )
179 case 'A':
180 tmp = tmp << 2;
181 break;
182 case 'C':
183 tmp = ( tmp << 2 ) | 1;
184 break;
185 case 'G':
186 tmp = ( tmp << 2 ) | 2;
187 break;
188 case 'T':
189 tmp = ( tmp << 2 ) | 3;
190 break;
191 case 'N':
192 tmp = ( tmp << 2 ) | 2; //treat N as G
193 break;
194 default:
195 tmp = ( tmp << 2 ) | 2; // treat unknown char as G, 'S'
196 fprintf ( stderr, "WARNING: get_kmer_from_seq process unknown char %c\n", seq[index] );
197 //exit(1);
200 if ( ( i + 1 ) % 32 == 0 ) //tmp is full, tmp has stored 32 bases
202 arr_ptr[i / 32] = tmp;
203 tmp = 0;
207 if ( i != K_size )
209 fprintf ( stderr, "ERROR: i %d is K_size \n", i );
212 if ( K_size <= 31 )
214 arr_ptr[arr_sz - 1] = tmp;
216 else //if(K_size%32 != 0){ //absolute ..because K is odd
218 int left_bits = ( 32 - K_size % 32 ) * 2;
219 tmp = tmp << left_bits;
220 arr_ptr[K_size / 32] = tmp;
221 kmerMoveRight ( kmer, 32 * arr_sz - K_size );
226 uint64_t high=0,low=0;
227 int high_start=0,high_end=0;
228 int low_satrt=0,low_end=0;
230 if(K_size >= 33){
231 high_start = pos;
232 high_end = pos+K_size-32;
234 low_satrt = high_end;
235 low_end = low_satrt + 32;
236 }else{
237 high_start = high_end = pos;
238 low_satrt = pos;
239 low_end = pos+K_size;
242 //debug<<"kmer ";
243 for(int i=high_start;i<high_end;++i){ //dif from soapdenovo
244 //debug<<seq[i];
245 switch(seq[i]){
246 case 'A':
247 high = high<< 2;
248 break;
249 case 'C':
250 high = (high << 2)|1;
251 break;
252 case 'G':
253 high = (high << 2)|2;
254 break;
255 case 'T':
256 high = (high << 2)|3;
257 break;
258 case 'N':
259 high = (high << 2)|2;//treat N as G as same as soapdenovo
260 //debug_build<<"N occured at "<<i<<endl;
261 break;
262 default:
263 printf("error in process unknown char %c\n",seq[i]);
264 exit(1);
268 //debug<<" ";
270 for(int i=low_satrt;i<low_end;++i){ //dif from soapdenovo
271 //debug<<seq[i];
272 switch(seq[i]){
273 case 'A':
274 low= low<< 2;
275 break;
276 case 'C':
277 low = (low << 2)|1;
278 break;
279 case 'G':
280 low = (low << 2)|2;
281 break;
282 case 'T':
283 low = (low << 2)|3;
284 break;
285 case 'N':
286 low = (low << 2)|2;//treat N as G as same as soapdenovo
287 //debug_build<<"N occured at "<<i<<endl;
288 break;
289 default:
290 printf("error in process unknown char %c\n",seq[i]);
291 exit(1);
295 kmer[0]=high;
296 kmer[1]=low;
301 inline void printKmer ( const kmer_t2 * kmer, FILE * fp )
303 #ifdef _63MER_
304 fprintf ( fp, "%llx %llx ,\n", ( kmer->kmer ) [0], ( kmer->kmer ) [1] );
305 #endif
306 #ifdef _127MER_
307 fprintf ( fp, "%llx %llx %llx %llx,\n", ( kmer->kmer ) [0], ( kmer->kmer ) [1], ( kmer->kmer ) [2], ( kmer->kmer ) [3] );
308 #endif
311 inline void printKmerSeq ( kmer_t2 * kmer, int K_size, FILE * fp ) //TODO printf ATCG
313 char str[128];
314 bitsarr2str ( kmer->kmer, K_size, str, sizeof ( kmer_t2 ) / sizeof ( uint64_t ) );
315 fprintf ( fp, "%s ,\n", str );
317 #endif