modified: makefile
[GalaxyCodeBases.git] / BGI / SOAPdenovo2 / standardPregraph / loadPreGraph.c
blob1422eab60755aff2fa621c5ca27ba91f93938c4e
1 /*
2 * loadPreGraph.c
4 * Copyright (c) 2008-2012 BGI-Shenzhen <soap at genomics dot org dot cn>.
6 * This file is part of SOAPdenovo.
8 * SOAPdenovo is free software: you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation, either version 3 of the License, or
11 * (at your option) any later version.
13 * SOAPdenovo is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
18 * You should have received a copy of the GNU General Public License
19 * along with SOAPdenovo. If not, see <http://www.gnu.org/licenses/>.
23 #include "stdinc.h"
24 #include "newhash.h"
25 #include "kmerhash.h"
26 #include "extfunc.h"
27 #include "extvab.h"
28 #include "zlib.h"
30 static void loadPreArcs ( char * graphfile );
32 int cmp_vertex ( const void * a, const void * b )
34 VERTEX * A, *B;
35 A = ( VERTEX * ) a;
36 B = ( VERTEX * ) b;
38 if ( KmerLarger ( A->kmer, B->kmer ) )
40 return 1;
42 else if ( KmerEqual ( A->kmer, B->kmer ) )
44 return 0;
46 else
48 return -1;
52 /*************************************************
53 Function:
54 loadVertex
55 Description:
56 1. Loads first kmer and the last kmer of the edge.
57 2. Sorts kmers.
58 Input:
59 1. graphfile: the input prefix
60 Output:
61 None.
62 Return:
63 None.
64 *************************************************/
65 void loadVertex ( char * graphfile )
67 char name[256], line[256];
68 FILE * fp;
69 Kmer word, bal_word, temp;
70 int num_kmer, i;
71 char ch;
72 sprintf ( name, "%s.preGraphBasic", graphfile );
73 fp = ckopen ( name, "r" );
75 while ( fgets ( line, sizeof ( line ), fp ) != NULL )
77 if ( line[0] == 'V' )
79 sscanf ( line + 6, "%d %c %d", &num_kmer, &ch, &overlaplen );
80 fprintf ( stderr, "There are %d kmer(s) in vertex file.\n", num_kmer );
82 else if ( line[0] == 'E' )
84 sscanf ( line + 5, "%d", &num_ed );
85 fprintf ( stderr, "There are %d edge(s) in edge file.\n", num_ed );
87 else if ( line[0] == 'M' )
89 sscanf ( line, "MaxReadLen %d MinReadLen %d MaxNameLen %d", &maxReadLen, &minReadLen, &maxNameLen );
91 else if ( line[0] == 'B' )
93 if ( line[7] == 'V' )
95 sscanf ( line, "Backup VERTEX %d %c %d", &num_kmer, &ch, &overlaplen );
96 fprintf ( stderr, "Backup there are %d kmer(s) in vertex file.\n", num_kmer );
98 else if ( line[7] == 'E' )
100 sscanf ( line, "Backup EDGEs %d", &num_ed );
101 fprintf ( stderr, "Backup there are %d edge(s) in edge file.\n", num_ed );
103 else if ( line[7] == 'M' )
105 sscanf ( line, "Backup MaxReadLen %d MinReadLen %d MaxNameLen %d", &maxReadLen, &minReadLen, &maxNameLen );
110 fclose ( fp );
111 vt_array = ( VERTEX * ) ckalloc ( ( 4 * num_kmer ) * sizeof ( VERTEX ) );
112 num_kmer_limit = 4 * num_kmer;
113 sprintf ( name, "%s.vertex", graphfile );
114 fp = ckopen ( name, "r" );
116 for ( i = 0; i < num_kmer; i++ )
118 #ifdef MER127
119 fscanf ( fp, "%llx %llx %llx %llx", & ( word.high1 ), & ( word.low1 ), & ( word.high2 ), & ( word.low2 ) );
120 #else
121 fscanf ( fp, "%llx %llx", & ( word.high ), & ( word.low ) );
122 #endif
123 bal_word = reverseComplement ( word, overlaplen );
125 if ( KmerSmaller ( word, bal_word ) )
127 vt_array[i].kmer = word;
129 else
131 vt_array[i].kmer = bal_word;
135 temp = vt_array[num_kmer - 1].kmer;
136 qsort ( &vt_array[0], num_kmer, sizeof ( vt_array[0] ), cmp_vertex );
137 fprintf ( stderr, "Kmers sorted.\n" );
138 fclose ( fp );
140 for ( i = 0; i < num_kmer; i++ )
142 bal_word = reverseComplement ( vt_array[i].kmer, overlaplen );
143 vt_array[i + num_kmer].kmer = bal_word;
146 num_vt = num_kmer;
149 /*************************************************
150 Function:
151 bisearch
152 Description:
153 Uses binary search to get the index of a kmer in array.
154 Input:
155 1. vts: the array of the kmer
156 2. num: the count of the total kmers
157 3. target: the kmer to search
158 Output:
159 None.
160 Return:
161 The kmer's index in array.
162 *************************************************/
163 int bisearch ( VERTEX * vts, int num, Kmer target )
165 int mid, low, high;
166 low = 0;
167 high = num - 1;
169 while ( low <= high )
171 mid = ( low + high ) / 2;
173 if ( KmerEqual ( vts[mid].kmer, target ) )
175 break;
177 else if ( KmerLarger ( target, vts[mid].kmer ) )
179 low = mid + 1;
181 else
183 high = mid - 1;
187 if ( low <= high )
189 return mid;
191 else
193 return -1;
197 /*************************************************
198 Function:
199 kmer2vt
200 Description:
201 Searchs the index of a kmer in array.
202 Input:
203 1. kmer: the kmer
204 Output:
205 None.
206 Return:
207 The index of the kmer in the array.
208 *************************************************/
209 int kmer2vt ( Kmer kmer )
211 Kmer bal_word;
212 int vt_id;
213 bal_word = reverseComplement ( kmer, overlaplen );
215 if ( KmerSmaller ( kmer, bal_word ) )
217 vt_id = bisearch ( &vt_array[0], num_vt, kmer );
219 if ( vt_id < 0 )
221 fprintf ( stderr, "There is no vertex for kmer " );
222 PrintKmer ( stderr, kmer );
223 fprintf ( stderr, " .\n" );
225 #ifdef MER127
226 fprintf (stderr,"There is not the vertex for kmer %llx %llx %llx %llx.\n", kmer.high1, kmer.low1, kmer.high2, kmer.low2);
227 #else
228 fprintf (stderr,"There is not the vertex for kmer %llx %llx.\n", kmer.high, kmer.low);
229 #endif
233 return vt_id;
235 else
237 vt_id = bisearch ( &vt_array[0], num_vt, bal_word );
239 if ( vt_id >= 0 )
241 vt_id += num_vt;
243 else
245 fprintf ( stderr, "There is no vertex for kmer " );
246 PrintKmer ( stderr, kmer );
247 fprintf ( stderr, " .\n" );
249 #ifdef MER127
250 fprintf (stderr,"There is not the vertex for kmer %llx %llx %llx %llx.\n", kmer.high1, kmer.low1, kmer.high2, kmer.low2);
251 #else
252 fprintf (stderr,"There is not the vertex for kmer %llx %llx.\n", kmer.high, kmer.low);
253 #endif
257 return vt_id;
261 /*************************************************
262 Function:
263 buildReverseComplementEdge
264 Description:
265 Creates reverse complementary edge of a edge.
266 Input:
267 1. edgeno: the edge index
268 Output:
269 None.
270 Return:
271 None.
272 *************************************************/
273 #ifdef MER127
274 static void buildReverseComplementEdge ( unsigned int edgeno )
276 int length = edge_array[edgeno].length;
277 int i, index = 0;
278 char * sequence, ch, *tightSeq;
279 Kmer kmer = vt_array[edge_array[edgeno].from_vt].kmer;
280 sequence = ( char * ) ckalloc ( ( overlaplen + length ) * sizeof ( char ) );
281 int bit1, bit2, bit3, bit4;
283 if ( overlaplen < 32 )
285 bit4 = overlaplen;
286 bit3 = 0;
287 bit2 = 0;
288 bit1 = 0;
291 if ( overlaplen >= 32 && overlaplen < 64 )
293 bit4 = 32;
294 bit3 = overlaplen - 32;
295 bit2 = 0;
296 bit1 = 0;
299 if ( overlaplen >= 64 && overlaplen < 96 )
301 bit4 = 32;
302 bit3 = 32;
303 bit2 = overlaplen - 64;
304 bit1 = 0;
307 if ( overlaplen >= 96 && overlaplen < 128 )
309 bit4 = 32;
310 bit3 = 32;
311 bit2 = 32;
312 bit1 = overlaplen - 96;
315 for ( i = bit1 - 1; i >= 0; i-- )
317 ch = kmer.high1 & 0x3;
318 kmer.high1 >>= 2;
319 sequence[i] = ch;
322 for ( i = bit2 - 1; i >= 0; i-- )
324 ch = kmer.low1 & 0x3;
325 kmer.low1 >>= 2;
326 sequence[i + bit1] = ch;
329 for ( i = bit3 - 1; i >= 0; i-- )
331 ch = kmer.high2 & 0x3;
332 kmer.high2 >>= 2;
333 sequence[i + bit1 + bit2] = ch;
336 for ( i = bit4 - 1; i >= 0; i-- )
338 ch = kmer.low2 & 0x3;
339 kmer.low2 >>= 2;
340 sequence[i + bit1 + bit2 + bit3] = ch;
343 for ( i = 0; i < length; i++ )
345 sequence[i + overlaplen] = getCharInTightString ( edge_array[edgeno].seq, i );
348 tightSeq = ( char * ) ckalloc ( ( length / 4 + 1 ) * sizeof ( char ) );
350 for ( i = length - 1; i >= 0; i-- )
352 writeChar2tightString ( int_comp ( sequence[i] ), tightSeq, index++ );
355 edge_array[edgeno + 1].length = length;
356 edge_array[edgeno + 1].cvg = edge_array[edgeno].cvg;
357 kmer = vt_array[edge_array[edgeno].from_vt].kmer;
358 edge_array[edgeno + 1].to_vt = kmer2vt ( reverseComplement ( kmer, overlaplen ) );
359 kmer = vt_array[edge_array[edgeno].to_vt].kmer;
360 edge_array[edgeno + 1].from_vt = kmer2vt ( reverseComplement ( kmer, overlaplen ) );
361 edge_array[edgeno + 1].seq = tightSeq;
362 edge_array[edgeno + 1].bal_edge = 0;
363 edge_array[edgeno + 1].rv = NULL;
364 edge_array[edgeno + 1].arcs = NULL;
365 edge_array[edgeno + 1].flag = 0;
366 edge_array[edgeno + 1].deleted = 0;
367 free ( ( void * ) sequence );
369 #else
371 /*************************************************
372 Function:
373 buildReverseComplementEdge
374 Description:
375 Creates reverse complementary edge of a edge.
376 Input:
377 1. edgeno: the edge index
378 Output:
379 None.
380 Return:
381 None.
382 *************************************************/
383 static void buildReverseComplementEdge ( unsigned int edgeno )
385 int length = edge_array[edgeno].length;
386 int i, index = 0;
387 char * sequence, ch, *tightSeq;
388 Kmer kmer = vt_array[edge_array[edgeno].from_vt].kmer;
389 sequence = ( char * ) ckalloc ( ( overlaplen + length ) * sizeof ( char ) );
390 int bit2 = overlaplen > 32 ? 32 : overlaplen;
391 int bit1 = overlaplen > 32 ? overlaplen - 32 : 0;
393 for ( i = bit1 - 1; i >= 0; i-- )
395 ch = kmer.high & 0x3;
396 kmer.high >>= 2;
397 sequence[i] = ch;
400 for ( i = bit2 - 1; i >= 0; i-- )
402 ch = kmer.low & 0x3;
403 kmer.low >>= 2;
404 sequence[i + bit1] = ch;
407 for ( i = 0; i < length; i++ )
409 sequence[i + overlaplen] = getCharInTightString ( edge_array[edgeno].seq, i );
412 tightSeq = ( char * ) ckalloc ( ( length / 4 + 1 ) * sizeof ( char ) );
414 for ( i = length - 1; i >= 0; i-- )
416 writeChar2tightString ( int_comp ( sequence[i] ), tightSeq, index++ );
419 edge_array[edgeno + 1].length = length;
420 edge_array[edgeno + 1].cvg = edge_array[edgeno].cvg;
421 kmer = vt_array[edge_array[edgeno].from_vt].kmer;
422 edge_array[edgeno + 1].to_vt = kmer2vt ( reverseComplement ( kmer, overlaplen ) );
423 kmer = vt_array[edge_array[edgeno].to_vt].kmer;
424 edge_array[edgeno + 1].from_vt = kmer2vt ( reverseComplement ( kmer, overlaplen ) );
425 edge_array[edgeno + 1].seq = tightSeq;
426 edge_array[edgeno + 1].bal_edge = 0;
427 edge_array[edgeno + 1].rv = NULL;
428 edge_array[edgeno + 1].arcs = NULL;
429 edge_array[edgeno + 1].flag = 0;
430 edge_array[edgeno + 1].deleted = 0;
431 free ( ( void * ) sequence );
433 #endif
435 /*************************************************
436 Function:
437 loadEdge
438 Description:
439 1. Loads the info of the edges.
440 2. Loads the links between the edges.
441 Input:
442 1. graphfile: the input prefix
443 Output:
444 None.
445 Return:
446 None.
447 *************************************************/
448 void loadEdge ( char * graphfile )
450 char c, name[256], line[1024], str[32];
451 char * tightSeq = NULL;
452 gzFile * fp;
453 Kmer from_kmer, to_kmer;
454 int n = 0, i, length, cvg, index = -1, bal_ed, edgeno;
455 int linelen;
456 unsigned int j;
457 sprintf ( name, "%s.edge.gz", graphfile );
458 fp = gzopen ( name, "r" );
459 num_ed_limit = 1.2 * num_ed;
460 edge_array = ( EDGE * ) ckalloc ( ( num_ed_limit + 1 ) * sizeof ( EDGE ) );
462 for ( j = num_ed + 1; j <= num_ed_limit; j++ )
464 edge_array[j].seq = NULL;
467 while ( gzgets ( fp, line, sizeof ( line ) ) != NULL )
469 if ( line[0] == '>' )
471 if ( index >= 0 )
473 edgeno = index + 1;
474 edge_array[edgeno].length = length;
475 edge_array[edgeno].cvg = cvg;
476 edge_array[edgeno].from_vt = kmer2vt ( from_kmer );
477 edge_array[edgeno].to_vt = kmer2vt ( to_kmer );
478 edge_array[edgeno].seq = tightSeq;
479 edge_array[edgeno].bal_edge = bal_ed + 1;
480 edge_array[edgeno].rv = NULL;
481 edge_array[edgeno].arcs = NULL;
482 edge_array[edgeno].flag = 0;
483 edge_array[edgeno].deleted = 0;
485 if ( bal_ed )
487 buildReverseComplementEdge ( edgeno );
488 index++;
492 n = 0;
493 index++;
494 #ifdef MER127
495 sscanf ( line + 7, "%d,%llx %llx %llx %llx,%llx %llx %llx %llx,%s %d,%d",
496 &length, & ( from_kmer.high1 ), & ( from_kmer.low1 ), & ( from_kmer.high2 ), & ( from_kmer.low2 ), & ( to_kmer.high1 ), & ( to_kmer.low1 ),
497 & ( to_kmer.high2 ), & ( to_kmer.low2 ), str, &cvg, &bal_ed );
498 #else
499 sscanf ( line + 7, "%d,%llx %llx,%llx %llx,%s %d,%d", &length, & ( from_kmer.high ), & ( from_kmer.low ), & ( to_kmer.high ), & ( to_kmer.low ), str, &cvg, &bal_ed );
500 #endif
501 tightSeq = ( char * ) ckalloc ( ( length / 4 + 1 ) * sizeof ( char ) );
503 else
505 linelen = strlen ( line );
507 for ( i = 0; i < linelen; i++ )
509 if ( line[i] >= 'a' && line[i] <= 'z' )
511 c = base2int ( line[i] - 'a' + 'A' );
512 writeChar2tightString ( c, tightSeq, n++ );
514 else if ( line[i] >= 'A' && line[i] <= 'Z' )
516 c = base2int ( line[i] );
517 writeChar2tightString ( c, tightSeq, n++ );
523 if ( index >= 0 )
525 edgeno = index + 1;
526 edge_array[edgeno].length = length;
527 edge_array[edgeno].cvg = cvg;
528 edge_array[edgeno].from_vt = kmer2vt ( from_kmer );
529 edge_array[edgeno].to_vt = kmer2vt ( to_kmer );
530 edge_array[edgeno].seq = tightSeq;
531 edge_array[edgeno].bal_edge = bal_ed + 1;
533 if ( bal_ed )
535 buildReverseComplementEdge ( edgeno );
536 index++;
540 fprintf ( stderr, "%d edge(s) input.\n", index + 1 );
541 gzclose ( fp );
542 createArcMemo ();
543 loadPreArcs ( graphfile );
546 unsigned int getTwinEdge ( unsigned int edgeno )
548 return edgeno + edge_array[edgeno].bal_edge - 1;
551 boolean EdSmallerThanTwin ( unsigned int edgeno )
553 return edge_array[edgeno].bal_edge > 1;
556 boolean EdLargerThanTwin ( unsigned int edgeno )
558 return edge_array[edgeno].bal_edge < 1;
561 boolean EdSameAsTwin ( unsigned int edgeno )
563 return edge_array[edgeno].bal_edge == 1;
566 /*************************************************
567 Function:
568 add1Arc
569 Description:
570 Updates the arc information between two edges.
571 Input:
572 1. from_ed: the first edge
573 2. to_ed: the second edge
574 3. weight: the weight of the arc to add
575 Output:
576 None.
577 Return:
578 None.
579 *************************************************/
580 static void add1Arc ( unsigned int from_ed, unsigned int to_ed, unsigned int weight )
582 if ( edge_array[from_ed].to_vt != edge_array[to_ed].from_vt )
584 //fprintf(stderr,"add1Arc: inconsistant joins\n");
585 return;
588 unsigned int bal_fe = getTwinEdge ( from_ed );
589 unsigned int bal_te = getTwinEdge ( to_ed );
591 if ( from_ed > num_ed || to_ed > num_ed || bal_fe > num_ed || bal_te > num_ed )
593 return;
596 ARC * parc, *bal_parc;
597 //both arcs already exist
598 parc = getArcBetween ( from_ed, to_ed );
600 if ( parc )
602 bal_parc = parc->bal_arc;
603 parc->multiplicity += weight;
604 bal_parc->multiplicity += weight;
605 return;
608 //create new arcs
609 parc = allocateArc ( to_ed );
610 parc->multiplicity = weight;
611 parc->prev = NULL;
613 if ( edge_array[from_ed].arcs )
615 edge_array[from_ed].arcs->prev = parc;
618 parc->next = edge_array[from_ed].arcs;
619 edge_array[from_ed].arcs = parc;
621 // A->A'
622 if ( bal_te == from_ed )
624 //printf("preArc from A to A'\n");
625 parc->bal_arc = parc;
626 parc->multiplicity += weight;
627 return;
630 bal_parc = allocateArc ( bal_fe );
631 bal_parc->multiplicity = weight;
632 bal_parc->prev = NULL;
634 if ( edge_array[bal_te].arcs )
636 edge_array[bal_te].arcs->prev = bal_parc;
639 bal_parc->next = edge_array[bal_te].arcs;
640 edge_array[bal_te].arcs = bal_parc;
641 //link them to each other
642 parc->bal_arc = bal_parc;
643 bal_parc->bal_arc = parc;
646 /*************************************************
647 Function:
648 loadPreArcs
649 Description:
650 Loads the info of pre-arcs.
651 Input:
652 1. graphfile: the input prefix
653 Output:
654 None.
655 Return:
656 None.
657 *************************************************/
658 void loadPreArcs ( char * graphfile )
660 FILE * fp;
661 char name[256], line[1024];
662 unsigned int target, weight;
663 unsigned int from_ed;
664 char * seg;
665 sprintf ( name, "%s.preArc", graphfile );
666 fp = ckopen ( name, "r" );
667 arcCounter = 0;
669 while ( fgets ( line, sizeof ( line ), fp ) != NULL )
671 seg = strtok ( line, " " );
672 from_ed = atoi ( seg );
674 while ( ( seg = strtok ( NULL, " " ) ) != NULL )
676 target = atoi ( seg );
677 seg = strtok ( NULL, " " );
678 weight = atoi ( seg );
679 add1Arc ( from_ed, target, weight );
683 fprintf ( stderr, "%lli pre-arcs loaded.\n", arcCounter );
684 fclose ( fp );
687 void free_edge_array ( EDGE * ed_array, int ed_num )
689 int i;
691 for ( i = 1; i <= ed_num; i++ )
692 if ( ed_array[i].seq )
694 free ( ( void * ) ed_array[i].seq );
697 free ( ( void * ) ed_array );