modified: makefile
[GalaxyCodeBases.git] / BGI / SOAPdenovo2 / standardPregraph / splitReps.c
blob46ebf6dafe7fb70a03979aca10efe71a2adc6468
1 /*
2 * splitReps.c
4 * Copyright (c) 2008-2012 BGI-Shenzhen <soap at genomics dot org dot cn>.
6 * This file is part of SOAPdenovo.
8 * SOAPdenovo is free software: you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation, either version 3 of the License, or
11 * (at your option) any later version.
13 * SOAPdenovo is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
18 * You should have received a copy of the GNU General Public License
19 * along with SOAPdenovo. If not, see <http://www.gnu.org/licenses/>.
23 #include "stdinc.h"
24 #include "newhash.h"
25 #include "kmerhash.h"
26 #include "extfunc.h"
27 #include "extvab.h"
29 //Note: branches number is greater than 4 for sparse
30 static const int MAX_BRANCH_NUM = 1024; // the limit for the branch.
31 static unsigned int involved[2049]; //record the upstream and downstream edge
32 static unsigned int lefts[1024]; //record the upstream edge
33 static unsigned int rights[1024]; //record the downstream edge
34 static unsigned char gothrough[1024][1024]; //record the path from upstream edges to downstream edges
36 /*************************************************
37 Function:
38 interferingCheck
39 Description:
40 1. Records the upsteam and downstream edges of current edge.
41 2. Checks if any of the upstream/downstream edge (or its twin edge) is the upstream edge
42 and the downstream edge at the same time.
43 Input:
44 1. edgeno: the current edge
45 2. repTimes: the in-degree (or out-degree)
46 Output:
47 None.
48 Return:
49 1 if the repeat is not suitable to solve.
50 *************************************************/
51 static boolean interferingCheck ( unsigned int edgeno, int repTimes )
53 int i, j, t;
54 unsigned int bal_ed;
55 involved[0] = edgeno;
56 i = 1;
58 for ( j = 0; j < repTimes; j++ )
60 involved[i++] = lefts[j];
63 for ( j = 0; j < repTimes; j++ )
65 involved[i++] = rights[j];
68 for ( j = 0; j < i - 1; j++ )
69 for ( t = j + 1; t < i; t++ )
70 if ( involved[j] == involved[t] )
72 return 1;
75 for ( j = 0; j < i; j++ )
77 bal_ed = getTwinEdge ( involved[j] );
79 for ( t = 0; t < i; t++ )
80 if ( bal_ed == involved[t] )
82 return 1;
86 return 0;
89 /*************************************************
90 Function:
91 arcCounts
92 Description:
93 Calculates the num of the downstream edges.
94 Input:
95 1. edgeid: the edge index
96 Output:
97 1. num: the number of downstream edges
98 Return:
99 The first downstream arc.
100 *************************************************/
101 static ARC * arcCounts ( unsigned int edgeid, unsigned int * num )
103 ARC * arc;
104 ARC * firstValidArc = NULL;
105 unsigned int count = 0;
106 arc = edge_array[edgeid].arcs;
108 while ( arc )
110 if ( arc->to_ed > 0 )
112 count++;
115 if ( count == 1 )
117 firstValidArc = arc;
120 arc = arc->next;
123 *num = count;
124 return firstValidArc;
127 /*************************************************
128 Function:
129 readOnEdge
130 Description:
131 Checks if the read ID exists in the marker array of a edge.
132 Input:
133 1. readid: the read ID
134 2. edge: the edge
135 Output:
136 None.
137 Return:
138 1 if the read ID exists in array.
139 *************************************************/
140 static boolean readOnEdge ( long long readid, unsigned int edge )
142 int index;
143 int markNum;
144 long long * marklist;
146 if ( edge_array[edge].markers )
148 markNum = edge_array[edge].multi;
149 marklist = edge_array[edge].markers;
151 else
153 return 0;
156 for ( index = 0; index < markNum; index++ )
158 if ( readid == marklist[index] )
160 return 1;
164 return 0;
167 /*************************************************
168 Function:
169 cntByReads
170 Description:
171 Checks if the read supports the path acrossing three edges.
172 Input:
173 1. left: the left edge
174 2. middle: the middle edge
175 3. right: the right edge
176 Output:
177 None.
178 Return:
179 1 if the read supports the path.
180 *************************************************/
181 static long long cntByReads ( unsigned int left, unsigned int middle, unsigned int right )
183 int markNum;
184 long long * marklist;
186 if ( edge_array[left].markers )
188 markNum = edge_array[left].multi;
189 marklist = edge_array[left].markers;
191 else
193 return 0;
196 int index;
197 long long readid;
200 if(middle==8553)
201 printf("%d markers on %d\n",markNum,left);
203 for ( index = 0; index < markNum; index++ )
205 readid = marklist[index];
207 if ( readOnEdge ( readid, middle ) && readOnEdge ( readid, right ) )
209 return readid;
213 return 0;
218 > - <
221 /*************************************************
222 Function:
223 solvable
224 Description:
225 Checks if a edge is within a solvable repeat structure according to the following criterions:
226 1. in-degree equals to the out-degree.
227 2. the in-degree of the downstream edge is 1.
228 3. the out-degree of the upsteam edge is 1.
229 4. no interleaved read path between upstream and downstream edges.
230 Input:
231 1. edgeno: the edge
232 Output:
233 None.
234 Return:
235 0 if it is not.
236 *************************************************/
237 unsigned int solvable ( unsigned int edgeno )
239 if ( EdSameAsTwin ( edgeno ) || edge_array[edgeno].multi == 255 )
241 return 0;
244 unsigned int bal_ed = getTwinEdge ( edgeno );
245 unsigned int arcRight_n, arcLeft_n;
246 unsigned int counter;
247 unsigned int i, j;
248 unsigned int branch, bal_branch;
249 ARC * parcL, *parcR;
250 parcL = arcCounts ( bal_ed, &arcLeft_n );
252 if ( arcLeft_n < 2 )
254 return 0;
257 parcR = arcCounts ( edgeno, &arcRight_n );
259 if ( arcLeft_n != arcRight_n )
261 return 0;
264 // check each right branch only has one upsteam connection
266 if(edgeno==2551){
267 for(i=0;i<arcLeft_n;i++)
268 printf("%d,",lefts[i]);
269 printf("__left to %d\n",edgeno);
270 for(j=0;j<arcRight_n;j++)
271 printf("%d,",rights[j]);
272 printf("__right to %d\n",edgeno);
275 arcRight_n = 0;
277 while ( parcR )
279 if ( parcR->to_ed == 0 )
281 parcR = parcR->next;
282 continue;
285 branch = parcR->to_ed;
287 if ( EdSameAsTwin ( branch ) || edge_array[branch].multi == 255 )
289 return 0;
292 rights[arcRight_n++] = branch;
294 if ( arcRight_n >= MAX_BRANCH_NUM )
296 fprintf ( stderr, "ERROR: right arc number is bigger than the max %d.\n", MAX_BRANCH_NUM );
297 exit ( -1 );
300 bal_branch = getTwinEdge ( branch );
301 arcCounts ( bal_branch, &counter );
303 if ( counter != 1 )
305 return 0;
308 parcR = parcR->next;
311 // check if each left branch only has one downsteam connection
312 arcLeft_n = 0;
314 while ( parcL )
316 if ( parcL->to_ed == 0 )
318 parcL = parcL->next;
319 continue;
322 branch = parcL->to_ed;
324 if ( EdSameAsTwin ( branch ) || edge_array[branch].multi == 255 )
326 return 0;
329 bal_branch = getTwinEdge ( branch );
330 lefts[arcLeft_n++] = bal_branch;
332 if ( arcLeft_n >= MAX_BRANCH_NUM )
334 fprintf ( stderr, "ERROR: left arc number is bigger than the max %d.\n", MAX_BRANCH_NUM );
335 exit ( -1 );
338 arcCounts ( bal_branch, &counter );
340 if ( counter != 1 )
342 return 0;
345 parcL = parcL->next;
348 //check if reads indicate one to one connection between upsteam and downstream edges
350 for ( i = 0; i < arcLeft_n; i++ )
352 counter = 0;
354 for ( j = 0; j < arcRight_n; j++ )
356 gothrough[i][j] = cntByReads ( lefts[i], edgeno, rights[j] ) == 0 ? 0 : 1;
357 counter += gothrough[i][j];
359 if ( counter > 1 )
361 return 0;
365 if ( counter != 1 )
367 return 0;
371 for ( j = 0; j < arcRight_n; j++ )
373 counter = 0;
375 for ( i = 0; i < arcLeft_n; i++ )
377 counter += gothrough[i][j];
380 if ( counter != 1 )
382 return 0;
386 return arcLeft_n;
389 /*************************************************
390 Function:
391 cp1edge
392 Description:
393 Copies the info of one edge to the other.
394 Input:
395 1. source: the source edge
396 2. target: the target edge
397 Output:
398 None.
399 Return:
400 The target edge.
401 *************************************************/
402 static unsigned int cp1edge ( unsigned int source, unsigned int target )
404 int length = edge_array[source].length;
405 char * tightSeq;
406 int index;
407 unsigned int bal_source = getTwinEdge ( source );
408 unsigned int bal_target;
410 if ( bal_source > source )
412 bal_target = target + 1;
414 else
416 bal_target = target;
417 target = target + 1;
420 tightSeq = ( char * ) ckalloc ( ( length / 4 + 1 ) * sizeof ( char ) );
422 for ( index = 0; index < length / 4 + 1; index++ )
424 tightSeq[index] = edge_array[source].seq[index];
427 edge_array[target].length = length;
428 edge_array[target].cvg = edge_array[source].cvg;
429 edge_array[target].to_vt = edge_array[source].to_vt;
430 edge_array[target].from_vt = edge_array[source].from_vt;
431 edge_array[target].seq = tightSeq;
432 edge_array[target].bal_edge = edge_array[source].bal_edge;
433 edge_array[target].rv = NULL;
434 edge_array[target].arcs = NULL;
435 edge_array[target].markers = NULL;
436 edge_array[target].flag = 0;
437 edge_array[target].deleted = 0;
438 tightSeq = ( char * ) ckalloc ( ( length / 4 + 1 ) * sizeof ( char ) );
440 for ( index = 0; index < length / 4 + 1; index++ )
442 tightSeq[index] = edge_array[bal_source].seq[index];
445 edge_array[bal_target].length = length;
446 edge_array[bal_target].cvg = edge_array[bal_source].cvg;
447 edge_array[bal_target].to_vt = edge_array[bal_source].to_vt;
448 edge_array[bal_target].from_vt = edge_array[bal_source].from_vt;
449 edge_array[bal_target].seq = tightSeq;
450 edge_array[bal_target].bal_edge = edge_array[bal_source].bal_edge;
451 edge_array[bal_target].rv = NULL;
452 edge_array[bal_target].arcs = NULL;
453 edge_array[bal_target].markers = NULL;
454 edge_array[bal_target].flag = 0;
455 edge_array[bal_target].deleted = 0;
456 return target;
459 /*************************************************
460 Function:
461 moveArc2cp
462 Description:
463 Moves the upstream and downstream arc information of source edge to target edge.
464 Input:
465 1. leftEd: upstream edge
466 2. rightEd: dowmstream edge
467 3. source: the source edge
468 4. target: the target edge
469 Output:
470 None.
471 Return:
472 None.
473 *************************************************/
474 static void moveArc2cp ( unsigned int leftEd, unsigned int rightEd, unsigned int source, unsigned int target )
476 unsigned int bal_left = getTwinEdge ( leftEd );
477 unsigned int bal_right = getTwinEdge ( rightEd );
478 unsigned int bal_source = getTwinEdge ( source );
479 unsigned int bal_target = getTwinEdge ( target );
480 ARC * arc;
481 ARC * newArc, *twinArc;
482 //between left and source
483 arc = getArcBetween ( leftEd, source );
484 arc->to_ed = 0;
485 newArc = allocateArc ( target );
486 newArc->multiplicity = arc->multiplicity;
487 newArc->prev = NULL;
488 newArc->next = edge_array[leftEd].arcs;
490 if ( edge_array[leftEd].arcs )
492 edge_array[leftEd].arcs->prev = newArc;
495 edge_array[leftEd].arcs = newArc;
496 arc = getArcBetween ( bal_source, bal_left );
497 arc->to_ed = 0;
498 twinArc = allocateArc ( bal_left );
499 twinArc->multiplicity = arc->multiplicity;
500 twinArc->prev = NULL;
501 twinArc->next = NULL;
502 edge_array[bal_target].arcs = twinArc;
503 newArc->bal_arc = twinArc;
504 twinArc->bal_arc = newArc;
505 //between source and right
506 arc = getArcBetween ( source, rightEd );
507 arc->to_ed = 0;
508 newArc = allocateArc ( rightEd );
509 newArc->multiplicity = arc->multiplicity;
510 newArc->prev = NULL;
511 newArc->next = NULL;
512 edge_array[target].arcs = newArc;
513 arc = getArcBetween ( bal_right, bal_source );
514 arc->to_ed = 0;
515 twinArc = allocateArc ( bal_target );
516 twinArc->multiplicity = arc->multiplicity;
517 twinArc->prev = NULL;
518 twinArc->next = edge_array[bal_right].arcs;
520 if ( edge_array[bal_right].arcs )
522 edge_array[bal_right].arcs->prev = twinArc;
525 edge_array[bal_right].arcs = twinArc;
526 newArc->bal_arc = twinArc;
527 twinArc->bal_arc = newArc;
530 /*************************************************
531 Function:
532 split1edge
533 Description:
534 Copies the center edge to a new edge and updates related arc information if the center edge is in a solvable repeat sturcture.
535 Input:
536 1. edgeno: the edge index , which is the repeat
537 2. repTimes: the in-degree( or out-degree)
538 Output:
539 None.
540 Return:
541 None.
542 *************************************************/
543 static void split1edge ( unsigned int edgeno, int repTimes )
545 int i, j;
546 unsigned int target;
548 for ( i = 1; i < repTimes; i++ )
550 for ( j = 0; j < repTimes; j++ )
551 if ( gothrough[i][j] > 0 ) // a path supported by read
553 break;
556 target = cp1edge ( edgeno, extraEdgeNum );
557 moveArc2cp ( lefts[i], rights[j], edgeno, target );
558 extraEdgeNum += 2;
562 static void debugging ( unsigned int i )
564 ARC * parc;
565 parc = edge_array[i].arcs;
567 if ( !parc )
569 fprintf ( stderr, "No downward connection for %d.\n", i );
572 while ( parc )
574 fprintf ( stderr, "%d -> %d\n", i, parc->to_ed );
575 parc = parc->next;
579 /*************************************************
580 Function:
581 solveReps
582 Description:
583 Uses the read path information to solve the repeats.
584 Input:
585 None.
586 Output:
587 None.
588 Return:
589 None.
590 *************************************************/
591 void solveReps ()
593 unsigned int i;
594 unsigned int repTime;
595 int counter = 0;
596 boolean flag;
597 //debugging(30514);
598 extraEdgeNum = num_ed + 1;
600 for ( i = 1; i <= num_ed; i++ )
602 repTime = solvable ( i );
604 if ( repTime == 0 )
606 continue;
609 flag = interferingCheck ( i, repTime );
611 if ( flag )
613 continue;
616 split1edge ( i, repTime );
617 counter++; //+= 2*(repTime-1);
619 if ( EdSmallerThanTwin ( i ) )
621 i++;
625 fprintf ( stderr, "%d repeat(s) are solvable, %d more edge(s).\n", counter, extraEdgeNum - 1 - num_ed );
626 num_ed = extraEdgeNum - 1;
627 removeDeadArcs ();
629 if ( markersArray )
631 free ( ( void * ) markersArray );
632 markersArray = NULL;