4 * Copyright (c) 2008-2012 BGI-Shenzhen <soap at genomics dot org dot cn>.
6 * This file is part of SOAPdenovo.
8 * SOAPdenovo is free software: you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation, either version 3 of the License, or
11 * (at your option) any later version.
13 * SOAPdenovo is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
18 * You should have received a copy of the GNU General Public License
19 * along with SOAPdenovo. If not, see <http://www.gnu.org/licenses/>.
30 static void loadPreArcs ( char * graphfile
);
32 int cmp_vertex ( const void * a
, const void * b
)
38 if ( KmerLarger ( A
->kmer
, B
->kmer
) )
42 else if ( KmerEqual ( A
->kmer
, B
->kmer
) )
52 /*************************************************
56 1. Loads first kmer and the last kmer of the edge.
59 1. graphfile: the input prefix
64 *************************************************/
65 void loadVertex ( char * graphfile
)
67 char name
[256], line
[256];
69 Kmer word
, bal_word
, temp
;
72 sprintf ( name
, "%s.preGraphBasic", graphfile
);
73 fp
= ckopen ( name
, "r" );
75 while ( fgets ( line
, sizeof ( line
), fp
) != NULL
)
79 sscanf ( line
+ 6, "%d %c %d", &num_kmer
, &ch
, &overlaplen
);
80 fprintf ( stderr
, "There are %d kmer(s) in vertex file.\n", num_kmer
);
82 else if ( line
[0] == 'E' )
84 sscanf ( line
+ 5, "%d", &num_ed
);
85 fprintf ( stderr
, "There are %d edge(s) in edge file.\n", num_ed
);
87 else if ( line
[0] == 'M' )
89 sscanf ( line
, "MaxReadLen %d MinReadLen %d MaxNameLen %d", &maxReadLen
, &minReadLen
, &maxNameLen
);
91 else if ( line
[0] == 'B' )
95 sscanf ( line
, "Backup VERTEX %d %c %d", &num_kmer
, &ch
, &overlaplen
);
96 fprintf ( stderr
, "Backup there are %d kmer(s) in vertex file.\n", num_kmer
);
98 else if ( line
[7] == 'E' )
100 sscanf ( line
, "Backup EDGEs %d", &num_ed
);
101 fprintf ( stderr
, "Backup there are %d edge(s) in edge file.\n", num_ed
);
103 else if ( line
[7] == 'M' )
105 sscanf ( line
, "Backup MaxReadLen %d MinReadLen %d MaxNameLen %d", &maxReadLen
, &minReadLen
, &maxNameLen
);
111 vt_array
= ( VERTEX
* ) ckalloc ( ( 4 * num_kmer
) * sizeof ( VERTEX
) );
112 num_kmer_limit
= 4 * num_kmer
;
113 sprintf ( name
, "%s.vertex", graphfile
);
114 fp
= ckopen ( name
, "r" );
116 for ( i
= 0; i
< num_kmer
; i
++ )
119 fscanf ( fp
, "%llx %llx %llx %llx", & ( word
.high1
), & ( word
.low1
), & ( word
.high2
), & ( word
.low2
) );
121 fscanf ( fp
, "%llx %llx", & ( word
.high
), & ( word
.low
) );
123 bal_word
= reverseComplement ( word
, overlaplen
);
125 if ( KmerSmaller ( word
, bal_word
) )
127 vt_array
[i
].kmer
= word
;
131 vt_array
[i
].kmer
= bal_word
;
135 temp
= vt_array
[num_kmer
- 1].kmer
;
136 qsort ( &vt_array
[0], num_kmer
, sizeof ( vt_array
[0] ), cmp_vertex
);
137 fprintf ( stderr
, "Kmers sorted.\n" );
140 for ( i
= 0; i
< num_kmer
; i
++ )
142 bal_word
= reverseComplement ( vt_array
[i
].kmer
, overlaplen
);
143 vt_array
[i
+ num_kmer
].kmer
= bal_word
;
149 /*************************************************
153 Uses binary search to get the index of a kmer in array.
155 1. vts: the array of the kmer
156 2. num: the count of the total kmers
157 3. target: the kmer to search
161 The kmer's index in array.
162 *************************************************/
163 int bisearch ( VERTEX
* vts
, int num
, Kmer target
)
169 while ( low
<= high
)
171 mid
= ( low
+ high
) / 2;
173 if ( KmerEqual ( vts
[mid
].kmer
, target
) )
177 else if ( KmerLarger ( target
, vts
[mid
].kmer
) )
197 /*************************************************
201 Searchs the index of a kmer in array.
207 The index of the kmer in the array.
208 *************************************************/
209 int kmer2vt ( Kmer kmer
)
213 bal_word
= reverseComplement ( kmer
, overlaplen
);
215 if ( KmerSmaller ( kmer
, bal_word
) )
217 vt_id
= bisearch ( &vt_array
[0], num_vt
, kmer
);
221 fprintf ( stderr
, "There is no vertex for kmer " );
222 PrintKmer ( stderr
, kmer
);
223 fprintf ( stderr
, " .\n" );
226 fprintf (stderr,"There is not the vertex for kmer %llx %llx %llx %llx.\n", kmer.high1, kmer.low1, kmer.high2, kmer.low2);
228 fprintf (stderr,"There is not the vertex for kmer %llx %llx.\n", kmer.high, kmer.low);
237 vt_id
= bisearch ( &vt_array
[0], num_vt
, bal_word
);
245 fprintf ( stderr
, "There is no vertex for kmer " );
246 PrintKmer ( stderr
, kmer
);
247 fprintf ( stderr
, " .\n" );
250 fprintf (stderr,"There is not the vertex for kmer %llx %llx %llx %llx.\n", kmer.high1, kmer.low1, kmer.high2, kmer.low2);
252 fprintf (stderr,"There is not the vertex for kmer %llx %llx.\n", kmer.high, kmer.low);
261 /*************************************************
263 buildReverseComplementEdge
265 Creates reverse complementary edge of a edge.
267 1. edgeno: the edge index
272 *************************************************/
274 static void buildReverseComplementEdge ( unsigned int edgeno
)
276 int length
= edge_array
[edgeno
].length
;
278 char * sequence
, ch
, *tightSeq
;
279 Kmer kmer
= vt_array
[edge_array
[edgeno
].from_vt
].kmer
;
280 sequence
= ( char * ) ckalloc ( ( overlaplen
+ length
) * sizeof ( char ) );
281 int bit1
, bit2
, bit3
, bit4
;
283 if ( overlaplen
< 32 )
291 if ( overlaplen
>= 32 && overlaplen
< 64 )
294 bit3
= overlaplen
- 32;
299 if ( overlaplen
>= 64 && overlaplen
< 96 )
303 bit2
= overlaplen
- 64;
307 if ( overlaplen
>= 96 && overlaplen
< 128 )
312 bit1
= overlaplen
- 96;
315 for ( i
= bit1
- 1; i
>= 0; i
-- )
317 ch
= kmer
.high1
& 0x3;
322 for ( i
= bit2
- 1; i
>= 0; i
-- )
324 ch
= kmer
.low1
& 0x3;
326 sequence
[i
+ bit1
] = ch
;
329 for ( i
= bit3
- 1; i
>= 0; i
-- )
331 ch
= kmer
.high2
& 0x3;
333 sequence
[i
+ bit1
+ bit2
] = ch
;
336 for ( i
= bit4
- 1; i
>= 0; i
-- )
338 ch
= kmer
.low2
& 0x3;
340 sequence
[i
+ bit1
+ bit2
+ bit3
] = ch
;
343 for ( i
= 0; i
< length
; i
++ )
345 sequence
[i
+ overlaplen
] = getCharInTightString ( edge_array
[edgeno
].seq
, i
);
348 tightSeq
= ( char * ) ckalloc ( ( length
/ 4 + 1 ) * sizeof ( char ) );
350 for ( i
= length
- 1; i
>= 0; i
-- )
352 writeChar2tightString ( int_comp ( sequence
[i
] ), tightSeq
, index
++ );
355 edge_array
[edgeno
+ 1].length
= length
;
356 edge_array
[edgeno
+ 1].cvg
= edge_array
[edgeno
].cvg
;
357 kmer
= vt_array
[edge_array
[edgeno
].from_vt
].kmer
;
358 edge_array
[edgeno
+ 1].to_vt
= kmer2vt ( reverseComplement ( kmer
, overlaplen
) );
359 kmer
= vt_array
[edge_array
[edgeno
].to_vt
].kmer
;
360 edge_array
[edgeno
+ 1].from_vt
= kmer2vt ( reverseComplement ( kmer
, overlaplen
) );
361 edge_array
[edgeno
+ 1].seq
= tightSeq
;
362 edge_array
[edgeno
+ 1].bal_edge
= 0;
363 edge_array
[edgeno
+ 1].rv
= NULL
;
364 edge_array
[edgeno
+ 1].arcs
= NULL
;
365 edge_array
[edgeno
+ 1].flag
= 0;
366 edge_array
[edgeno
+ 1].deleted
= 0;
367 free ( ( void * ) sequence
);
371 /*************************************************
373 buildReverseComplementEdge
375 Creates reverse complementary edge of a edge.
377 1. edgeno: the edge index
382 *************************************************/
383 static void buildReverseComplementEdge ( unsigned int edgeno
)
385 int length
= edge_array
[edgeno
].length
;
387 char * sequence
, ch
, *tightSeq
;
388 Kmer kmer
= vt_array
[edge_array
[edgeno
].from_vt
].kmer
;
389 sequence
= ( char * ) ckalloc ( ( overlaplen
+ length
) * sizeof ( char ) );
390 int bit2
= overlaplen
> 32 ? 32 : overlaplen
;
391 int bit1
= overlaplen
> 32 ? overlaplen
- 32 : 0;
393 for ( i
= bit1
- 1; i
>= 0; i
-- )
395 ch
= kmer
.high
& 0x3;
400 for ( i
= bit2
- 1; i
>= 0; i
-- )
404 sequence
[i
+ bit1
] = ch
;
407 for ( i
= 0; i
< length
; i
++ )
409 sequence
[i
+ overlaplen
] = getCharInTightString ( edge_array
[edgeno
].seq
, i
);
412 tightSeq
= ( char * ) ckalloc ( ( length
/ 4 + 1 ) * sizeof ( char ) );
414 for ( i
= length
- 1; i
>= 0; i
-- )
416 writeChar2tightString ( int_comp ( sequence
[i
] ), tightSeq
, index
++ );
419 edge_array
[edgeno
+ 1].length
= length
;
420 edge_array
[edgeno
+ 1].cvg
= edge_array
[edgeno
].cvg
;
421 kmer
= vt_array
[edge_array
[edgeno
].from_vt
].kmer
;
422 edge_array
[edgeno
+ 1].to_vt
= kmer2vt ( reverseComplement ( kmer
, overlaplen
) );
423 kmer
= vt_array
[edge_array
[edgeno
].to_vt
].kmer
;
424 edge_array
[edgeno
+ 1].from_vt
= kmer2vt ( reverseComplement ( kmer
, overlaplen
) );
425 edge_array
[edgeno
+ 1].seq
= tightSeq
;
426 edge_array
[edgeno
+ 1].bal_edge
= 0;
427 edge_array
[edgeno
+ 1].rv
= NULL
;
428 edge_array
[edgeno
+ 1].arcs
= NULL
;
429 edge_array
[edgeno
+ 1].flag
= 0;
430 edge_array
[edgeno
+ 1].deleted
= 0;
431 free ( ( void * ) sequence
);
435 /*************************************************
439 1. Loads the info of the edges.
440 2. Loads the links between the edges.
442 1. graphfile: the input prefix
447 *************************************************/
448 void loadEdge ( char * graphfile
)
450 char c
, name
[256], line
[1024], str
[32];
451 char * tightSeq
= NULL
;
453 Kmer from_kmer
, to_kmer
;
454 int n
= 0, i
, length
, cvg
, index
= -1, bal_ed
, edgeno
;
457 sprintf ( name
, "%s.edge.gz", graphfile
);
458 fp
= gzopen ( name
, "r" );
459 num_ed_limit
= 1.2 * num_ed
;
460 edge_array
= ( EDGE
* ) ckalloc ( ( num_ed_limit
+ 1 ) * sizeof ( EDGE
) );
462 for ( j
= num_ed
+ 1; j
<= num_ed_limit
; j
++ )
464 edge_array
[j
].seq
= NULL
;
467 while ( gzgets ( fp
, line
, sizeof ( line
) ) != NULL
)
469 if ( line
[0] == '>' )
474 edge_array
[edgeno
].length
= length
;
475 edge_array
[edgeno
].cvg
= cvg
;
476 edge_array
[edgeno
].from_vt
= kmer2vt ( from_kmer
);
477 edge_array
[edgeno
].to_vt
= kmer2vt ( to_kmer
);
478 edge_array
[edgeno
].seq
= tightSeq
;
479 edge_array
[edgeno
].bal_edge
= bal_ed
+ 1;
480 edge_array
[edgeno
].rv
= NULL
;
481 edge_array
[edgeno
].arcs
= NULL
;
482 edge_array
[edgeno
].flag
= 0;
483 edge_array
[edgeno
].deleted
= 0;
487 buildReverseComplementEdge ( edgeno
);
495 sscanf ( line
+ 7, "%d,%llx %llx %llx %llx,%llx %llx %llx %llx,%s %d,%d",
496 &length
, & ( from_kmer
.high1
), & ( from_kmer
.low1
), & ( from_kmer
.high2
), & ( from_kmer
.low2
), & ( to_kmer
.high1
), & ( to_kmer
.low1
),
497 & ( to_kmer
.high2
), & ( to_kmer
.low2
), str
, &cvg
, &bal_ed
);
499 sscanf ( line
+ 7, "%d,%llx %llx,%llx %llx,%s %d,%d", &length
, & ( from_kmer
.high
), & ( from_kmer
.low
), & ( to_kmer
.high
), & ( to_kmer
.low
), str
, &cvg
, &bal_ed
);
501 tightSeq
= ( char * ) ckalloc ( ( length
/ 4 + 1 ) * sizeof ( char ) );
505 linelen
= strlen ( line
);
507 for ( i
= 0; i
< linelen
; i
++ )
509 if ( line
[i
] >= 'a' && line
[i
] <= 'z' )
511 c
= base2int ( line
[i
] - 'a' + 'A' );
512 writeChar2tightString ( c
, tightSeq
, n
++ );
514 else if ( line
[i
] >= 'A' && line
[i
] <= 'Z' )
516 c
= base2int ( line
[i
] );
517 writeChar2tightString ( c
, tightSeq
, n
++ );
526 edge_array
[edgeno
].length
= length
;
527 edge_array
[edgeno
].cvg
= cvg
;
528 edge_array
[edgeno
].from_vt
= kmer2vt ( from_kmer
);
529 edge_array
[edgeno
].to_vt
= kmer2vt ( to_kmer
);
530 edge_array
[edgeno
].seq
= tightSeq
;
531 edge_array
[edgeno
].bal_edge
= bal_ed
+ 1;
535 buildReverseComplementEdge ( edgeno
);
540 fprintf ( stderr
, "%d edge(s) input.\n", index
+ 1 );
543 loadPreArcs ( graphfile
);
546 unsigned int getTwinEdge ( unsigned int edgeno
)
548 return edgeno
+ edge_array
[edgeno
].bal_edge
- 1;
551 boolean
EdSmallerThanTwin ( unsigned int edgeno
)
553 return edge_array
[edgeno
].bal_edge
> 1;
556 boolean
EdLargerThanTwin ( unsigned int edgeno
)
558 return edge_array
[edgeno
].bal_edge
< 1;
561 boolean
EdSameAsTwin ( unsigned int edgeno
)
563 return edge_array
[edgeno
].bal_edge
== 1;
566 /*************************************************
570 Updates the arc information between two edges.
572 1. from_ed: the first edge
573 2. to_ed: the second edge
574 3. weight: the weight of the arc to add
579 *************************************************/
580 static void add1Arc ( unsigned int from_ed
, unsigned int to_ed
, unsigned int weight
)
582 if ( edge_array
[from_ed
].to_vt
!= edge_array
[to_ed
].from_vt
)
584 //fprintf(stderr,"add1Arc: inconsistant joins\n");
588 unsigned int bal_fe
= getTwinEdge ( from_ed
);
589 unsigned int bal_te
= getTwinEdge ( to_ed
);
591 if ( from_ed
> num_ed
|| to_ed
> num_ed
|| bal_fe
> num_ed
|| bal_te
> num_ed
)
596 ARC
* parc
, *bal_parc
;
597 //both arcs already exist
598 parc
= getArcBetween ( from_ed
, to_ed
);
602 bal_parc
= parc
->bal_arc
;
603 parc
->multiplicity
+= weight
;
604 bal_parc
->multiplicity
+= weight
;
609 parc
= allocateArc ( to_ed
);
610 parc
->multiplicity
= weight
;
613 if ( edge_array
[from_ed
].arcs
)
615 edge_array
[from_ed
].arcs
->prev
= parc
;
618 parc
->next
= edge_array
[from_ed
].arcs
;
619 edge_array
[from_ed
].arcs
= parc
;
622 if ( bal_te
== from_ed
)
624 //printf("preArc from A to A'\n");
625 parc
->bal_arc
= parc
;
626 parc
->multiplicity
+= weight
;
630 bal_parc
= allocateArc ( bal_fe
);
631 bal_parc
->multiplicity
= weight
;
632 bal_parc
->prev
= NULL
;
634 if ( edge_array
[bal_te
].arcs
)
636 edge_array
[bal_te
].arcs
->prev
= bal_parc
;
639 bal_parc
->next
= edge_array
[bal_te
].arcs
;
640 edge_array
[bal_te
].arcs
= bal_parc
;
641 //link them to each other
642 parc
->bal_arc
= bal_parc
;
643 bal_parc
->bal_arc
= parc
;
646 /*************************************************
650 Loads the info of pre-arcs.
652 1. graphfile: the input prefix
657 *************************************************/
658 void loadPreArcs ( char * graphfile
)
661 char name
[256], line
[1024];
662 unsigned int target
, weight
;
663 unsigned int from_ed
;
665 sprintf ( name
, "%s.preArc", graphfile
);
666 fp
= ckopen ( name
, "r" );
669 while ( fgets ( line
, sizeof ( line
), fp
) != NULL
)
671 seg
= strtok ( line
, " " );
672 from_ed
= atoi ( seg
);
674 while ( ( seg
= strtok ( NULL
, " " ) ) != NULL
)
676 target
= atoi ( seg
);
677 seg
= strtok ( NULL
, " " );
678 weight
= atoi ( seg
);
679 add1Arc ( from_ed
, target
, weight
);
683 fprintf ( stderr
, "%lli pre-arcs loaded.\n", arcCounter
);
687 void free_edge_array ( EDGE
* ed_array
, int ed_num
)
691 for ( i
= 1; i
<= ed_num
; i
++ )
692 if ( ed_array
[i
].seq
)
694 free ( ( void * ) ed_array
[i
].seq
);
697 free ( ( void * ) ed_array
);