2 * convert_soapdenovo.cpp
4 * Copyright (c) 2008-2012 BGI-Shenzhen <soap at genomics dot org dot cn>.
6 * This file is part of SOAPdenovo.
8 * SOAPdenovo is free software: you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation, either version 3 of the License, or
11 * (at your option) any later version.
13 * SOAPdenovo is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
18 * You should have received a copy of the GNU General Public License
19 * along with SOAPdenovo. If not, see <http://www.gnu.org/licenses/>.
27 #include "convert_soapdenovo.h"
28 #include "sparse_kmer.h"
31 // can be optimized ...
32 /*************************************************
36 convert sparse kmer to soapdenovo format ..
42 1. sparse_kmer: sparse-kmer
45 1. sparse_kmer soapdenovo format kmer
48 *************************************************/
49 void convert_kmer ( kmer_t2
* sparse_kmer
, int K_size
)
51 uint64_t tmp
, tmp_res
;
52 int i
, j
, index
, arr_sz
, base
;
54 arr_sz
= sizeof ( kmer_t2
) / sizeof ( uint64_t );
56 for ( i
= 0; i
<= index
; i
++ )
58 tmp
= ( sparse_kmer
->kmer
) [arr_sz
- 1 - i
];
61 for ( j
= 0; j
< 32; j
++ )
70 tmp_res
|= ( 1LLU << 2 * j
);
73 tmp_res
|= ( 3LLU << 2 * j
);
76 tmp_res
|= ( 2LLU << 2 * j
);
83 ( sparse_kmer
->kmer
) [arr_sz
- 1 - i
] = tmp_res
;
87 uint64_t high=0,low=0;
90 for(int i=0;i<K_size-32;++i){
91 chr = sparse_kmer[0] & 3;
96 high|=(1LLU << 2 * i);
99 high|=(3LLU << 2 * i);
102 high|=(2LLU << 2 * i);
106 sparse_kmer[0] = sparse_kmer[0]>>2;
109 for(int i=0;i<32;++i){
110 chr = sparse_kmer[1] & 3;
115 low|=(1LLU << 2 * i);
118 low|=(3LLU << 2 * i);
121 low|=(2LLU << 2 * i);
125 sparse_kmer[1] = sparse_kmer[1]>>2;
128 for(int i=0;i<K_size;++i){
129 chr = sparse_kmer[1] & 3;
134 low|=(1LLU << 2 * i);
137 low|=(3LLU << 2 * i);
140 low|=(2LLU << 2 * i);
144 sparse_kmer[1] = sparse_kmer[1]>>2;
147 sparse_kmer[0] = high;
148 sparse_kmer[1] = low;
153 /*************************************************
157 fastReverseComp soapdenovo format kmer
159 1. kmer2: soapdenovo fomat kmer
160 2. seq_size: kmer size
162 1. kmer2: reversed compliment kmer
165 *************************************************/
166 static void fastReverseComp ( kmer_t2
* kmer2
, int seq_size
)
170 arr_sz
= sizeof ( kmer_t2
) / sizeof ( uint64_t ); //= 2 or 4
171 seq_arr
= kmer2
->kmer
;
172 int tot_bits
= arr_sz
* 64;
174 for ( int i
= 0; i
< arr_sz
; ++i
)
176 seq_arr
[i
] ^= 0xAAAAAAAAAAAAAAAALLU
;
177 seq_arr
[i
] = ( ( seq_arr
[i
] & 0x3333333333333333 ) << 2 ) | ( ( seq_arr
[i
] & 0xCCCCCCCCCCCCCCCC ) >> 2 );
178 seq_arr
[i
] = ( ( seq_arr
[i
] & 0x0F0F0F0F0F0F0F0F ) << 4 ) | ( ( seq_arr
[i
] & 0xF0F0F0F0F0F0F0F0 ) >> 4 );
179 seq_arr
[i
] = ( ( seq_arr
[i
] & 0x00FF00FF00FF00FF ) << 8 ) | ( ( seq_arr
[i
] & 0xFF00FF00FF00FF00 ) >> 8 );
180 seq_arr
[i
] = ( ( seq_arr
[i
] & 0x0000FFFF0000FFFF ) << 16 ) | ( ( seq_arr
[i
] & 0xFFFF0000FFFF0000 ) >> 16 );
181 seq_arr
[i
] = ( ( seq_arr
[i
] & 0x00000000FFFFFFFF ) << 32 ) | ( ( seq_arr
[i
] & 0xFFFFFFFF00000000 ) >> 32 );
184 int j
= 0, k
= arr_sz
- 1;
186 for ( ; j
< k
; ++j
, --k
)
190 seq_arr
[j
] = seq_arr
[k
];
194 R_shift_NC ( seq_arr
, tot_bits
- ( seq_size
* 2 ), arr_sz
);
199 bool operator() ( const kmer_t2
& t1
, const kmer_t2
& t2
) const
201 int Kmer_arr_sz
= sizeof ( kmer_t2
) / sizeof ( uint64_t );
203 for ( int jj
= 0; jj
< Kmer_arr_sz
; ++jj
)
205 if ( ( t1
.kmer
) [jj
] < ( t2
.kmer
) [jj
] )
209 else if ( ( t1
.kmer
) [jj
] > ( t2
.kmer
) [jj
] )
219 if((t1.kmer)[0] < (t2.kmer)[0]){
221 }else if((t1.kmer)[0] == (t2.kmer)[0]){
222 return (t1.kmer)[1] < (t2.kmer)[1];
231 /*************************************************
235 converts *.sparse.edge to *.edge.gz *.vertex
237 1. sparse_edge_file: sparse edge file
239 3. output_prefix output prefix
244 *************************************************/
245 void convert ( char * sparse_edge_file
, int K_size
, char * output_prefix
)
250 sprintf ( temp
, "%s.preGraphBasic", output_prefix
);
251 FILE * fp
= fopen ( temp
, "r" );
253 fgets ( line
, 1024, fp
);
254 fgets ( line
, 1024, fp
);
255 fgets ( line
, 1024, fp
);
257 sscanf ( line
, "%s %d %s %d", temp
, &max_rd_len
, temp
, &min_rd_len
);
260 FILE * fin
, *fout2
, *fout3
;
261 fin
= fopen ( sparse_edge_file
, "r" );
264 sprintf ( temp
, "%s.edge.gz", output_prefix
);
265 fout
= gzopen ( temp
, "w" );
266 //fout= fopen(temp, "w");//edge
268 sprintf ( temp
, "%s.vertex", output_prefix
);
269 fout2
= fopen ( temp
, "w" );
270 sprintf ( temp
, "%s.preGraphBasic", output_prefix
);
271 fout3
= fopen ( temp
, "w" );
273 if ( !fin
|| !fout
|| !fout2
|| !fout3
)
275 fprintf ( stderr
, "can't open file %s\n", sparse_edge_file
);
279 //cout << "right 0"<<endl;
280 kmer_t2 from_kmer
, to_kmer
;
281 size_t line_len
, edge_len_left
;
286 const int BUFF_LEN
= 1024;
290 map
<kmer_t2
, int, classcomp
> vertex_nodes
;
291 size_t edge_counter
= 0, vertex_counter
= 0;
294 //cout << "right 1"<<endl;
296 while ( fgets ( line
, BUFF_LEN
, fin
) != NULL
)
298 //cout << "right 2"<<endl;
299 if ( line
[0] == '>' ) //get one edge length, from vertex, to vertex,cvg,bal
303 sscanf ( line
+ 7, "%d,%llx %llx,%llx %llx,cvg %d,%d", &edge_len
,
304 & ( from_kmer
.kmer
) [0], & ( from_kmer
.kmer
) [1], & ( to_kmer
.kmer
) [0], & ( to_kmer
.kmer
) [1], &cvg
, &bal_ed
); // from_kmer to_kmer is of no use here
307 sscanf ( line
+ 7, "%d,%llx %llx %llx %llx,%llx %llx %llx %llx,cvg %d,%d",
308 &edge_len
, & ( from_kmer
.kmer
) [0], & ( from_kmer
.kmer
) [1], & ( from_kmer
.kmer
) [2], & ( from_kmer
.kmer
) [3],
309 & ( to_kmer
.kmer
) [0], & ( to_kmer
.kmer
) [1], & ( to_kmer
.kmer
) [2], & ( to_kmer
.kmer
) [3], &cvg
, &bal_ed
); // from_kmer to_kmer is of no use here
321 convert_kmer ( &from_kmer
, K_size
);
322 convert_kmer ( &to_kmer
, K_size
);
324 gzprintf ( fout
, ">length %d,%llx %llx,%llx %llx,cvg %d,%d\n", edge_len
,
325 ( from_kmer
.kmer
) [0], ( from_kmer
.kmer
) [1], ( to_kmer
.kmer
) [0], ( to_kmer
.kmer
) [1], cvg
, bal_ed
);
328 gzprintf ( fout
, ">length %d,%llx %llx %llx %llx,%llx %llx %llx %llx,cvg %d,%d\n", edge_len
,
329 ( from_kmer
.kmer
) [0], ( from_kmer
.kmer
) [1], ( from_kmer
.kmer
) [2], ( from_kmer
.kmer
) [3],
330 ( to_kmer
.kmer
) [0], ( to_kmer
.kmer
) [1], ( to_kmer
.kmer
) [2], ( to_kmer
.kmer
) [3], cvg
, bal_ed
);
333 if ( bal_ed
) { edge_counter
++; }
335 kmer_t2 f_kmer
= from_kmer
;
336 fastReverseComp ( &f_kmer
, K_size
);
338 if ( kmerCompare ( &f_kmer
, &from_kmer
) < 0 )
343 vertex_nodes
[from_kmer
]++;
345 fastReverseComp ( &f_kmer
, K_size
);
347 if ( kmerCompare ( &f_kmer
, &to_kmer
) < 0 )
352 vertex_nodes
[to_kmer
]++;
361 //skip the first kmer
362 int len
= strlen ( line
);
364 if ( line
[len
- 1] == '\n' )
366 line
[len
- 1] == '\0';
370 for ( int i
= K_size
; i
< len
; i
++ )
373 gzprintf ( fout
, "%c", line
[i
] );
377 gzprintf ( fout
, "\n" );
381 edge_len
-= ( len
- K_size
);
383 if ( edge_len
== 0 && j
% 100 != 0 )
385 gzprintf ( fout
, "\n" );
392 if ( line
[0] == '\n' ) { continue; }
394 int len
= strlen ( line
);
396 if ( line
[len
- 1] == '\n' )
398 line
[len
- 1] == '\0';
402 for ( int i
= 0; i
< len
; i
++ )
405 gzprintf ( fout
, "%c", line
[i
] );
409 gzprintf ( fout
, "\n" );
415 if ( edge_len
== 0 && j
% 100 != 0 )
417 gzprintf ( fout
, "\n" );
423 //fprintf(stderr,"size of map: %llu\n",vertex_nodes.size());
424 map
<kmer_t2
, int>::iterator it
;
426 for ( it
= vertex_nodes
.begin(); it
!= vertex_nodes
.end(); ++it
)
430 fprintf ( fout2
, "%llx %llx ", ( ( *it
).first
.kmer
) [0], ( ( *it
).first
.kmer
) [1] );
433 fprintf ( fout2
, "%llx %llx %llx %llx ", ( ( *it
).first
.kmer
) [0], ( ( *it
).first
.kmer
) [1],
434 ( ( *it
).first
.kmer
) [2], ( ( *it
).first
.kmer
) [3] );
437 if ( vertex_counter
% 8 == 0 ) { fprintf ( fout2
, "\n" ); }
440 fprintf ( fout3
, "VERTEX %lu K %d\n", vertex_counter
, K_size
);
441 fprintf ( fout3
, "EDGEs %lu\n", edge_counter
);
442 fprintf ( stderr
, "%llu edges and %llu vertexes constructed.\n", edge_counter
, vertex_counter
);
443 fprintf ( fout3
, "MaxReadLen %d MinReadLen %d MaxNameLen 256\n", max_rd_len
, min_rd_len
);