modified: nfig1.py
[GalaxyCodeBases.git] / BGI / SOAPdenovo2 / sparsePregraph / convert_soapdenovo.cpp
blob89852c88370f04ee898e296afadf073dc4fe7bf7
1 /*
2 * convert_soapdenovo.cpp
4 * Copyright (c) 2008-2012 BGI-Shenzhen <soap at genomics dot org dot cn>.
6 * This file is part of SOAPdenovo.
8 * SOAPdenovo is free software: you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation, either version 3 of the License, or
11 * (at your option) any later version.
13 * SOAPdenovo is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
18 * You should have received a copy of the GNU General Public License
19 * along with SOAPdenovo. If not, see <http://www.gnu.org/licenses/>.
22 #include "global.h"
23 #include "stdinc.h"
24 #include "core.h"
27 #include "convert_soapdenovo.h"
28 #include "sparse_kmer.h"
29 #include "zlib.h"
31 // can be optimized ...
32 /*************************************************
33 Function:
34 convert_kmer
35 Description:
36 convert sparse kmer to soapdenovo format ..
37 A 0 A 0
38 C 1 --> C 1
39 T 3 T 2
40 G 2 G 3
41 Input:
42 1. sparse_kmer: sparse-kmer
43 2. K_size: kmer size
44 Output:
45 1. sparse_kmer soapdenovo format kmer
46 Return:
47 None.
48 *************************************************/
49 void convert_kmer ( kmer_t2 * sparse_kmer, int K_size )
51 uint64_t tmp, tmp_res;
52 int i, j, index, arr_sz, base;
53 index = K_size / 32;
54 arr_sz = sizeof ( kmer_t2 ) / sizeof ( uint64_t );
56 for ( i = 0; i <= index; i++ )
58 tmp = ( sparse_kmer->kmer ) [arr_sz - 1 - i];
59 tmp_res = 0;
61 for ( j = 0; j < 32; j++ )
63 base = tmp & 3;
65 switch ( base )
67 case 0 :
68 break;
69 case 1 :
70 tmp_res |= ( 1LLU << 2 * j );
71 break;
72 case 2:
73 tmp_res |= ( 3LLU << 2 * j );
74 break;
75 case 3:
76 tmp_res |= ( 2LLU << 2 * j );
77 break;
80 tmp = tmp >> 2;
83 ( sparse_kmer->kmer ) [arr_sz - 1 - i] = tmp_res;
87 uint64_t high=0,low=0;
88 int chr;
89 if(K_size>=33){
90 for(int i=0;i<K_size-32;++i){
91 chr = sparse_kmer[0] & 3;
92 switch(chr){
93 case 0:
94 break;
95 case 1:
96 high|=(1LLU << 2 * i);
97 break;
98 case 2:
99 high|=(3LLU << 2 * i);
100 break;
101 case 3:
102 high|=(2LLU << 2 * i);
103 break;
106 sparse_kmer[0] = sparse_kmer[0]>>2;
109 for(int i=0;i<32;++i){
110 chr = sparse_kmer[1] & 3;
111 switch(chr){
112 case 0:
113 break;
114 case 1:
115 low|=(1LLU << 2 * i);
116 break;
117 case 2:
118 low|=(3LLU << 2 * i);
119 break;
120 case 3:
121 low|=(2LLU << 2 * i);
122 break;
125 sparse_kmer[1] = sparse_kmer[1]>>2;
127 }else{
128 for(int i=0;i<K_size;++i){
129 chr = sparse_kmer[1] & 3;
130 switch(chr){
131 case 0:
132 break;
133 case 1:
134 low|=(1LLU << 2 * i);
135 break;
136 case 2:
137 low|=(3LLU << 2 * i);
138 break;
139 case 3:
140 low|=(2LLU << 2 * i);
141 break;
144 sparse_kmer[1] = sparse_kmer[1]>>2;
147 sparse_kmer[0] = high;
148 sparse_kmer[1] = low;
153 /*************************************************
154 Function:
155 fastReverseComp
156 Description:
157 fastReverseComp soapdenovo format kmer
158 Input:
159 1. kmer2: soapdenovo fomat kmer
160 2. seq_size: kmer size
161 Output:
162 1. kmer2: reversed compliment kmer
163 Return:
164 None.
165 *************************************************/
166 static void fastReverseComp ( kmer_t2 * kmer2, int seq_size )
168 int arr_sz;
169 uint64_t * seq_arr;
170 arr_sz = sizeof ( kmer_t2 ) / sizeof ( uint64_t ); //= 2 or 4
171 seq_arr = kmer2->kmer;
172 int tot_bits = arr_sz * 64;
174 for ( int i = 0; i < arr_sz; ++i )
176 seq_arr[i] ^= 0xAAAAAAAAAAAAAAAALLU;
177 seq_arr[i] = ( ( seq_arr[i] & 0x3333333333333333 ) << 2 ) | ( ( seq_arr[i] & 0xCCCCCCCCCCCCCCCC ) >> 2 );
178 seq_arr[i] = ( ( seq_arr[i] & 0x0F0F0F0F0F0F0F0F ) << 4 ) | ( ( seq_arr[i] & 0xF0F0F0F0F0F0F0F0 ) >> 4 );
179 seq_arr[i] = ( ( seq_arr[i] & 0x00FF00FF00FF00FF ) << 8 ) | ( ( seq_arr[i] & 0xFF00FF00FF00FF00 ) >> 8 );
180 seq_arr[i] = ( ( seq_arr[i] & 0x0000FFFF0000FFFF ) << 16 ) | ( ( seq_arr[i] & 0xFFFF0000FFFF0000 ) >> 16 );
181 seq_arr[i] = ( ( seq_arr[i] & 0x00000000FFFFFFFF ) << 32 ) | ( ( seq_arr[i] & 0xFFFFFFFF00000000 ) >> 32 );
184 int j = 0, k = arr_sz - 1;
186 for ( ; j < k; ++j, --k )
188 uint64_t temp;
189 temp = seq_arr[j];
190 seq_arr[j] = seq_arr[k];
191 seq_arr[k] = temp;
194 R_shift_NC ( seq_arr, tot_bits - ( seq_size * 2 ), arr_sz );
197 struct classcomp
199 bool operator() ( const kmer_t2 & t1, const kmer_t2 & t2 ) const
201 int Kmer_arr_sz = sizeof ( kmer_t2 ) / sizeof ( uint64_t );
203 for ( int jj = 0; jj < Kmer_arr_sz; ++jj )
205 if ( ( t1.kmer ) [jj] < ( t2.kmer ) [jj] )
207 return 1;
209 else if ( ( t1.kmer ) [jj] > ( t2.kmer ) [jj] )
211 return 0;
214 continue;
217 return 0;
218 /* old
219 if((t1.kmer)[0] < (t2.kmer)[0]){
220 return 1;
221 }else if((t1.kmer)[0] == (t2.kmer)[0]){
222 return (t1.kmer)[1] < (t2.kmer)[1];
223 }else{
224 return 0;
231 /*************************************************
232 Function:
233 convert
234 Description:
235 converts *.sparse.edge to *.edge.gz *.vertex
236 Input:
237 1. sparse_edge_file: sparse edge file
238 2. K_size: kmer size
239 3. output_prefix output prefix
240 Output:
241 None.
242 Return:
243 None.
244 *************************************************/
245 void convert ( char * sparse_edge_file, int K_size, char * output_prefix )
247 if ( run_mode != 0 )
249 char temp[256];
250 sprintf ( temp, "%s.preGraphBasic", output_prefix );
251 FILE * fp = fopen ( temp, "r" );
252 char line[1024];
253 fgets ( line, 1024, fp );
254 fgets ( line, 1024, fp );
255 fgets ( line, 1024, fp );
256 fclose ( fp );
257 sscanf ( line, "%s %d %s %d", temp, &max_rd_len, temp, &min_rd_len );
260 FILE * fin, *fout2, *fout3;
261 fin = fopen ( sparse_edge_file, "r" );
262 gzFile fout;
263 char temp[256];
264 sprintf ( temp, "%s.edge.gz", output_prefix );
265 fout = gzopen ( temp, "w" );
266 //fout= fopen(temp, "w");//edge
267 //write as gzip file
268 sprintf ( temp, "%s.vertex", output_prefix );
269 fout2 = fopen ( temp, "w" );
270 sprintf ( temp, "%s.preGraphBasic", output_prefix );
271 fout3 = fopen ( temp, "w" );
273 if ( !fin || !fout || !fout2 || !fout3 )
275 fprintf ( stderr, "can't open file %s\n", sparse_edge_file );
276 exit ( 1 );
279 //cout << "right 0"<<endl;
280 kmer_t2 from_kmer, to_kmer;
281 size_t line_len, edge_len_left;
282 int edge_len;
283 int cvg;
284 int bal_ed;//»ØÎÄΪ0
285 char str[32];
286 const int BUFF_LEN = 1024;
287 char line[BUFF_LEN];
288 int start = 0;
289 int cutoff = 100;
290 map<kmer_t2, int, classcomp> vertex_nodes;
291 size_t edge_counter = 0, vertex_counter = 0;
292 int j = 0;
294 //cout << "right 1"<<endl;
296 while ( fgets ( line, BUFF_LEN, fin ) != NULL )
298 //cout << "right 2"<<endl;
299 if ( line[0] == '>' ) //get one edge length, from vertex, to vertex,cvg,bal
301 edge_counter++;
302 #ifdef _63MER_
303 sscanf ( line + 7, "%d,%llx %llx,%llx %llx,cvg %d,%d", &edge_len,
304 & ( from_kmer.kmer ) [0], & ( from_kmer.kmer ) [1], & ( to_kmer.kmer ) [0], & ( to_kmer.kmer ) [1], &cvg, &bal_ed ); // from_kmer to_kmer is of no use here
305 #endif
306 #ifdef _127MER_
307 sscanf ( line + 7, "%d,%llx %llx %llx %llx,%llx %llx %llx %llx,cvg %d,%d",
308 &edge_len, & ( from_kmer.kmer ) [0], & ( from_kmer.kmer ) [1], & ( from_kmer.kmer ) [2], & ( from_kmer.kmer ) [3],
309 & ( to_kmer.kmer ) [0], & ( to_kmer.kmer ) [1], & ( to_kmer.kmer ) [2], & ( to_kmer.kmer ) [3], &cvg, &bal_ed ); // from_kmer to_kmer is of no use here
310 #endif
312 if ( edge_len == 1 )
314 cvg = 0;
316 else
318 cvg *= 10;
321 convert_kmer ( &from_kmer, K_size );
322 convert_kmer ( &to_kmer, K_size );
323 #ifdef _63MER_
324 gzprintf ( fout, ">length %d,%llx %llx,%llx %llx,cvg %d,%d\n", edge_len,
325 ( from_kmer.kmer ) [0], ( from_kmer.kmer ) [1], ( to_kmer.kmer ) [0], ( to_kmer.kmer ) [1], cvg, bal_ed );
326 #endif
327 #ifdef _127MER_
328 gzprintf ( fout, ">length %d,%llx %llx %llx %llx,%llx %llx %llx %llx,cvg %d,%d\n", edge_len,
329 ( from_kmer.kmer ) [0], ( from_kmer.kmer ) [1], ( from_kmer.kmer ) [2], ( from_kmer.kmer ) [3],
330 ( to_kmer.kmer ) [0], ( to_kmer.kmer ) [1], ( to_kmer.kmer ) [2], ( to_kmer.kmer ) [3], cvg, bal_ed );
331 #endif
333 if ( bal_ed ) { edge_counter++; }
335 kmer_t2 f_kmer = from_kmer;
336 fastReverseComp ( &f_kmer, K_size );
338 if ( kmerCompare ( &f_kmer, &from_kmer ) < 0 )
340 from_kmer = f_kmer;
343 vertex_nodes[from_kmer]++;
344 f_kmer = to_kmer;
345 fastReverseComp ( &f_kmer, K_size );
347 if ( kmerCompare ( &f_kmer, &to_kmer ) < 0 )
349 to_kmer = f_kmer;
352 vertex_nodes[to_kmer]++;
353 start = 1;
354 j = 0;
356 else
358 //print the sequence
359 if ( start == 1 )
361 //skip the first kmer
362 int len = strlen ( line );
364 if ( line[len - 1] == '\n' )
366 line[len - 1] == '\0';
367 len --;
370 for ( int i = K_size; i < len; i++ )
372 j++;
373 gzprintf ( fout, "%c", line[i] );
375 if ( j % 100 == 0 )
377 gzprintf ( fout, "\n" );
381 edge_len -= ( len - K_size );
383 if ( edge_len == 0 && j % 100 != 0 )
385 gzprintf ( fout, "\n" );
388 start = 2;
390 else //start = 2
392 if ( line[0] == '\n' ) { continue; }
394 int len = strlen ( line );
396 if ( line[len - 1] == '\n' )
398 line[len - 1] == '\0';
399 len --;
402 for ( int i = 0; i < len; i++ )
404 j++;
405 gzprintf ( fout, "%c", line[i] );
407 if ( j % 100 == 0 )
409 gzprintf ( fout, "\n" );
413 edge_len -= len;
415 if ( edge_len == 0 && j % 100 != 0 )
417 gzprintf ( fout, "\n" );
423 //fprintf(stderr,"size of map: %llu\n",vertex_nodes.size());
424 map<kmer_t2, int>::iterator it;
426 for ( it = vertex_nodes.begin(); it != vertex_nodes.end(); ++it )
428 vertex_counter++;
429 #ifdef _63MER_
430 fprintf ( fout2, "%llx %llx ", ( ( *it ).first.kmer ) [0], ( ( *it ).first.kmer ) [1] );
431 #endif
432 #ifdef _127MER_
433 fprintf ( fout2, "%llx %llx %llx %llx ", ( ( *it ).first.kmer ) [0], ( ( *it ).first.kmer ) [1],
434 ( ( *it ).first.kmer ) [2], ( ( *it ).first.kmer ) [3] );
435 #endif
437 if ( vertex_counter % 8 == 0 ) { fprintf ( fout2, "\n" ); }
440 fprintf ( fout3, "VERTEX %lu K %d\n", vertex_counter, K_size );
441 fprintf ( fout3, "EDGEs %lu\n", edge_counter );
442 fprintf ( stderr, "%llu edges and %llu vertexes constructed.\n", edge_counter, vertex_counter );
443 fprintf ( fout3, "MaxReadLen %d MinReadLen %d MaxNameLen 256\n", max_rd_len, min_rd_len );
444 fclose ( fin );
445 gzclose ( fout );
446 fclose ( fout2 );
447 fclose ( fout3 );