4 * Copyright (c) 2008-2012 BGI-Shenzhen <soap at genomics dot org dot cn>.
6 * This file is part of SOAPdenovo.
8 * SOAPdenovo is free software: you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation, either version 3 of the License, or
11 * (at your option) any later version.
13 * SOAPdenovo is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
18 * You should have received a copy of the GNU General Public License
19 * along with SOAPdenovo. If not, see <http://www.gnu.org/licenses/>.
29 //Note: branches number is greater than 4 for sparse
30 static const int MAX_BRANCH_NUM
= 1024; // the limit for the branch.
31 static unsigned int involved
[2049]; //record the upstream and downstream edge
32 static unsigned int lefts
[1024]; //record the upstream edge
33 static unsigned int rights
[1024]; //record the downstream edge
34 static unsigned char gothrough
[1024][1024]; //record the path from upstream edges to downstream edges
36 /*************************************************
40 1. Records the upsteam and downstream edges of current edge.
41 2. Checks if any of the upstream/downstream edge (or its twin edge) is the upstream edge
42 and the downstream edge at the same time.
44 1. edgeno: the current edge
45 2. repTimes: the in-degree (or out-degree)
49 1 if the repeat is not suitable to solve.
50 *************************************************/
51 static boolean
interferingCheck ( unsigned int edgeno
, int repTimes
)
58 for ( j
= 0; j
< repTimes
; j
++ )
60 involved
[i
++] = lefts
[j
];
63 for ( j
= 0; j
< repTimes
; j
++ )
65 involved
[i
++] = rights
[j
];
68 for ( j
= 0; j
< i
- 1; j
++ )
69 for ( t
= j
+ 1; t
< i
; t
++ )
70 if ( involved
[j
] == involved
[t
] )
75 for ( j
= 0; j
< i
; j
++ )
77 bal_ed
= getTwinEdge ( involved
[j
] );
79 for ( t
= 0; t
< i
; t
++ )
80 if ( bal_ed
== involved
[t
] )
89 /*************************************************
93 Calculates the num of the downstream edges.
95 1. edgeid: the edge index
97 1. num: the number of downstream edges
99 The first downstream arc.
100 *************************************************/
101 static ARC
* arcCounts ( unsigned int edgeid
, unsigned int * num
)
104 ARC
* firstValidArc
= NULL
;
105 unsigned int count
= 0;
106 arc
= edge_array
[edgeid
].arcs
;
110 if ( arc
->to_ed
> 0 )
124 return firstValidArc
;
127 /*************************************************
131 Checks if the read ID exists in the marker array of a edge.
133 1. readid: the read ID
138 1 if the read ID exists in array.
139 *************************************************/
140 static boolean
readOnEdge ( long long readid
, unsigned int edge
)
144 long long * marklist
;
146 if ( edge_array
[edge
].markers
)
148 markNum
= edge_array
[edge
].multi
;
149 marklist
= edge_array
[edge
].markers
;
156 for ( index
= 0; index
< markNum
; index
++ )
158 if ( readid
== marklist
[index
] )
167 /*************************************************
171 Checks if the read supports the path acrossing three edges.
173 1. left: the left edge
174 2. middle: the middle edge
175 3. right: the right edge
179 1 if the read supports the path.
180 *************************************************/
181 static long long cntByReads ( unsigned int left
, unsigned int middle
, unsigned int right
)
184 long long * marklist
;
186 if ( edge_array
[left
].markers
)
188 markNum
= edge_array
[left
].multi
;
189 marklist
= edge_array
[left
].markers
;
201 printf("%d markers on %d\n",markNum,left);
203 for ( index
= 0; index
< markNum
; index
++ )
205 readid
= marklist
[index
];
207 if ( readOnEdge ( readid
, middle
) && readOnEdge ( readid
, right
) )
221 /*************************************************
225 Checks if a edge is within a solvable repeat structure according to the following criterions:
226 1. in-degree equals to the out-degree.
227 2. the in-degree of the downstream edge is 1.
228 3. the out-degree of the upsteam edge is 1.
229 4. no interleaved read path between upstream and downstream edges.
236 *************************************************/
237 unsigned int solvable ( unsigned int edgeno
)
239 if ( EdSameAsTwin ( edgeno
) || edge_array
[edgeno
].multi
== 255 )
244 unsigned int bal_ed
= getTwinEdge ( edgeno
);
245 unsigned int arcRight_n
, arcLeft_n
;
246 unsigned int counter
;
248 unsigned int branch
, bal_branch
;
250 parcL
= arcCounts ( bal_ed
, &arcLeft_n
);
257 parcR
= arcCounts ( edgeno
, &arcRight_n
);
259 if ( arcLeft_n
!= arcRight_n
)
264 // check each right branch only has one upsteam connection
267 for(i=0;i<arcLeft_n;i++)
268 printf("%d,",lefts[i]);
269 printf("__left to %d\n",edgeno);
270 for(j=0;j<arcRight_n;j++)
271 printf("%d,",rights[j]);
272 printf("__right to %d\n",edgeno);
279 if ( parcR
->to_ed
== 0 )
285 branch
= parcR
->to_ed
;
287 if ( EdSameAsTwin ( branch
) || edge_array
[branch
].multi
== 255 )
292 rights
[arcRight_n
++] = branch
;
294 if ( arcRight_n
>= MAX_BRANCH_NUM
)
296 fprintf ( stderr
, "ERROR: right arc number is bigger than the max %d.\n", MAX_BRANCH_NUM
);
300 bal_branch
= getTwinEdge ( branch
);
301 arcCounts ( bal_branch
, &counter
);
311 // check if each left branch only has one downsteam connection
316 if ( parcL
->to_ed
== 0 )
322 branch
= parcL
->to_ed
;
324 if ( EdSameAsTwin ( branch
) || edge_array
[branch
].multi
== 255 )
329 bal_branch
= getTwinEdge ( branch
);
330 lefts
[arcLeft_n
++] = bal_branch
;
332 if ( arcLeft_n
>= MAX_BRANCH_NUM
)
334 fprintf ( stderr
, "ERROR: left arc number is bigger than the max %d.\n", MAX_BRANCH_NUM
);
338 arcCounts ( bal_branch
, &counter
);
348 //check if reads indicate one to one connection between upsteam and downstream edges
350 for ( i
= 0; i
< arcLeft_n
; i
++ )
354 for ( j
= 0; j
< arcRight_n
; j
++ )
356 gothrough
[i
][j
] = cntByReads ( lefts
[i
], edgeno
, rights
[j
] ) == 0 ? 0 : 1;
357 counter
+= gothrough
[i
][j
];
371 for ( j
= 0; j
< arcRight_n
; j
++ )
375 for ( i
= 0; i
< arcLeft_n
; i
++ )
377 counter
+= gothrough
[i
][j
];
389 /*************************************************
393 Copies the info of one edge to the other.
395 1. source: the source edge
396 2. target: the target edge
401 *************************************************/
402 static unsigned int cp1edge ( unsigned int source
, unsigned int target
)
404 int length
= edge_array
[source
].length
;
407 unsigned int bal_source
= getTwinEdge ( source
);
408 unsigned int bal_target
;
410 if ( bal_source
> source
)
412 bal_target
= target
+ 1;
420 tightSeq
= ( char * ) ckalloc ( ( length
/ 4 + 1 ) * sizeof ( char ) );
422 for ( index
= 0; index
< length
/ 4 + 1; index
++ )
424 tightSeq
[index
] = edge_array
[source
].seq
[index
];
427 edge_array
[target
].length
= length
;
428 edge_array
[target
].cvg
= edge_array
[source
].cvg
;
429 edge_array
[target
].to_vt
= edge_array
[source
].to_vt
;
430 edge_array
[target
].from_vt
= edge_array
[source
].from_vt
;
431 edge_array
[target
].seq
= tightSeq
;
432 edge_array
[target
].bal_edge
= edge_array
[source
].bal_edge
;
433 edge_array
[target
].rv
= NULL
;
434 edge_array
[target
].arcs
= NULL
;
435 edge_array
[target
].markers
= NULL
;
436 edge_array
[target
].flag
= 0;
437 edge_array
[target
].deleted
= 0;
438 tightSeq
= ( char * ) ckalloc ( ( length
/ 4 + 1 ) * sizeof ( char ) );
440 for ( index
= 0; index
< length
/ 4 + 1; index
++ )
442 tightSeq
[index
] = edge_array
[bal_source
].seq
[index
];
445 edge_array
[bal_target
].length
= length
;
446 edge_array
[bal_target
].cvg
= edge_array
[bal_source
].cvg
;
447 edge_array
[bal_target
].to_vt
= edge_array
[bal_source
].to_vt
;
448 edge_array
[bal_target
].from_vt
= edge_array
[bal_source
].from_vt
;
449 edge_array
[bal_target
].seq
= tightSeq
;
450 edge_array
[bal_target
].bal_edge
= edge_array
[bal_source
].bal_edge
;
451 edge_array
[bal_target
].rv
= NULL
;
452 edge_array
[bal_target
].arcs
= NULL
;
453 edge_array
[bal_target
].markers
= NULL
;
454 edge_array
[bal_target
].flag
= 0;
455 edge_array
[bal_target
].deleted
= 0;
459 /*************************************************
463 Moves the upstream and downstream arc information of source edge to target edge.
465 1. leftEd: upstream edge
466 2. rightEd: dowmstream edge
467 3. source: the source edge
468 4. target: the target edge
473 *************************************************/
474 static void moveArc2cp ( unsigned int leftEd
, unsigned int rightEd
, unsigned int source
, unsigned int target
)
476 unsigned int bal_left
= getTwinEdge ( leftEd
);
477 unsigned int bal_right
= getTwinEdge ( rightEd
);
478 unsigned int bal_source
= getTwinEdge ( source
);
479 unsigned int bal_target
= getTwinEdge ( target
);
481 ARC
* newArc
, *twinArc
;
482 //between left and source
483 arc
= getArcBetween ( leftEd
, source
);
485 newArc
= allocateArc ( target
);
486 newArc
->multiplicity
= arc
->multiplicity
;
488 newArc
->next
= edge_array
[leftEd
].arcs
;
490 if ( edge_array
[leftEd
].arcs
)
492 edge_array
[leftEd
].arcs
->prev
= newArc
;
495 edge_array
[leftEd
].arcs
= newArc
;
496 arc
= getArcBetween ( bal_source
, bal_left
);
498 twinArc
= allocateArc ( bal_left
);
499 twinArc
->multiplicity
= arc
->multiplicity
;
500 twinArc
->prev
= NULL
;
501 twinArc
->next
= NULL
;
502 edge_array
[bal_target
].arcs
= twinArc
;
503 newArc
->bal_arc
= twinArc
;
504 twinArc
->bal_arc
= newArc
;
505 //between source and right
506 arc
= getArcBetween ( source
, rightEd
);
508 newArc
= allocateArc ( rightEd
);
509 newArc
->multiplicity
= arc
->multiplicity
;
512 edge_array
[target
].arcs
= newArc
;
513 arc
= getArcBetween ( bal_right
, bal_source
);
515 twinArc
= allocateArc ( bal_target
);
516 twinArc
->multiplicity
= arc
->multiplicity
;
517 twinArc
->prev
= NULL
;
518 twinArc
->next
= edge_array
[bal_right
].arcs
;
520 if ( edge_array
[bal_right
].arcs
)
522 edge_array
[bal_right
].arcs
->prev
= twinArc
;
525 edge_array
[bal_right
].arcs
= twinArc
;
526 newArc
->bal_arc
= twinArc
;
527 twinArc
->bal_arc
= newArc
;
530 /*************************************************
534 Copies the center edge to a new edge and updates related arc information if the center edge is in a solvable repeat sturcture.
536 1. edgeno: the edge index , which is the repeat
537 2. repTimes: the in-degree( or out-degree)
542 *************************************************/
543 static void split1edge ( unsigned int edgeno
, int repTimes
)
548 for ( i
= 1; i
< repTimes
; i
++ )
550 for ( j
= 0; j
< repTimes
; j
++ )
551 if ( gothrough
[i
][j
] > 0 ) // a path supported by read
556 target
= cp1edge ( edgeno
, extraEdgeNum
);
557 moveArc2cp ( lefts
[i
], rights
[j
], edgeno
, target
);
562 static void debugging ( unsigned int i
)
565 parc
= edge_array
[i
].arcs
;
569 fprintf ( stderr
, "No downward connection for %d.\n", i
);
574 fprintf ( stderr
, "%d -> %d\n", i
, parc
->to_ed
);
579 /*************************************************
583 Uses the read path information to solve the repeats.
590 *************************************************/
594 unsigned int repTime
;
598 extraEdgeNum
= num_ed
+ 1;
600 for ( i
= 1; i
<= num_ed
; i
++ )
602 repTime
= solvable ( i
);
609 flag
= interferingCheck ( i
, repTime
);
616 split1edge ( i
, repTime
);
617 counter
++; //+= 2*(repTime-1);
619 if ( EdSmallerThanTwin ( i
) )
625 fprintf ( stderr
, "%d repeat(s) are solvable, %d more edge(s).\n", counter
, extraEdgeNum
- 1 - num_ed
);
626 num_ed
= extraEdgeNum
- 1;
631 free ( ( void * ) markersArray
);