modified: makefile
[GalaxyCodeBases.git] / BGI / SOAPdenovo2 / standardPregraph / attachPEinfo.c
blob77105d5f7bb48b623b5ad89c528416dbd6a52a69
1 /*
2 * attachPEinfo.c
4 * Copyright (c) 2008-2012 BGI-Shenzhen <soap at genomics dot org dot cn>.
6 * This file is part of SOAPdenovo.
8 * SOAPdenovo is free software: you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation, either version 3 of the License, or
11 * (at your option) any later version.
13 * SOAPdenovo is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
18 * You should have received a copy of the GNU General Public License
19 * along with SOAPdenovo. If not, see <http://www.gnu.org/licenses/>.
23 #include "stdinc.h"
24 #include "newhash.h"
25 #include "kmerhash.h"
26 #include "extfunc.h"
27 #include "extvab.h"
28 #include "stack.h"
29 #include "zlib.h"
31 #define CNBLOCKSIZE 10000
32 static STACK * isStack;
33 static int ignorePE1, ignorePE2, ignorePE3, static_flag;
34 static int onsameCtgPE;
35 static unsigned long long peSUM;
37 static boolean staticF;
39 static int existCounter;
41 static int calcuIS ( STACK * intStack );
43 static int cmp_pe ( const void * a, const void * b )
45 PE_INFO * A, *B;
46 A = ( PE_INFO * ) a;
47 B = ( PE_INFO * ) b;
49 if ( A->rank > B->rank )
51 return 1;
53 else if ( A->rank == B->rank )
55 return 0;
57 else
59 return -1;
63 /*************************************************
64 Function:
65 loadPEgrads
66 Description:
67 Loads general mapped information of paired-end reads.
68 Input:
69 1. infile: prefix of output graph file name
70 Output:
71 None.
72 Return:
73 None.
74 *************************************************/
75 void loadPEgrads ( char * infile )
77 FILE * fp;
78 char name[256], line[1024];
79 int i;
80 boolean rankSet = 1;
81 sprintf ( name, "%s.peGrads", infile );
82 fp = fopen ( name, "r" );
84 if ( !fp )
86 fprintf ( stderr, "Can not open file %s.\n", name );
87 gradsCounter = 0;
88 return;
91 while ( fgets ( line, sizeof ( line ), fp ) != NULL )
93 if ( line[0] == 'g' )
95 sscanf ( line + 10, "%d %lld %d", &gradsCounter, &n_solexa, &maxReadLen );
96 fprintf ( stderr, "There are %d grad(s), %lld read(s), max read len %d.\n", gradsCounter, n_solexa, maxReadLen );
97 break;
101 alloc_pe_mem ( gradsCounter );
103 for ( i = 0; i < gradsCounter; i++ )
105 fgets ( line, sizeof ( line ), fp );
106 pes[i].rank = 0;
107 sscanf ( line, "%d %lld %d %d", & ( pes[i].insertS ), & ( pes[i].PE_bound ), & ( pes[i].rank ), & ( pes[i].pair_num_cut ) );
109 if ( pes[i].rank < 1 )
111 rankSet = 0;
115 fclose ( fp );
117 if ( rankSet )
119 qsort ( &pes[0], gradsCounter, sizeof ( PE_INFO ), cmp_pe );
120 return;
123 int lastRank = 0;
125 for ( i = 0; i < gradsCounter; i++ )
127 if ( i == 0 )
129 pes[i].rank = ++lastRank;
131 else if ( pes[i].insertS < 300 )
133 pes[i].rank = lastRank;
135 else if ( pes[i].insertS < 800 )
137 if ( pes[i - 1].insertS < 300 )
139 pes[i].rank = ++lastRank;
141 else
143 pes[i].rank = lastRank;
146 else if ( pes[i].insertS < 3000 )
148 if ( pes[i - 1].insertS < 800 )
150 pes[i].rank = ++lastRank;
152 else
154 pes[i].rank = lastRank;
157 else if ( pes[i].insertS < 7000 )
159 if ( pes[i - 1].insertS < 3000 )
161 pes[i].rank = ++lastRank;
163 else
165 pes[i].rank = lastRank;
168 else
170 if ( pes[i - 1].insertS < 7000 )
172 pes[i].rank = ++lastRank;
174 else
176 pes[i].rank = lastRank;
182 /*************************************************
183 Function:
184 add1Connect
185 Description:
186 Updates the connection between two contigs.
187 Input:
188 1. e1: left contig
189 2. e2: right contig
190 3. gap: distance between two contigs
191 4. weight: weight of connection
192 5. inherit: inheritance flag of connection
193 Output:
194 None.
195 Return:
196 NULL if failed adding, otherwise the pointer to new or updated connection.
197 *************************************************/
198 CONNECT * add1Connect ( unsigned int e1, unsigned int e2, int gap, int weight, boolean inherit )
200 if ( e1 == e2 || e1 == getTwinCtg ( e2 ) )
202 return NULL;
205 CONNECT * connect = NULL, *bal_connect = NULL;
206 long long sum;
208 if ( weight > 255 )
210 weight = 255;
213 connect = getCntBetween ( e1, e2 );
214 bal_connect = getCntBetween ( e2, e1 );
216 if ( connect )
218 if ( !weight )
220 return connect;
223 existCounter++;
225 if ( !inherit )
227 sum = connect->weightNotInherit * connect->gapLen + gap * weight;
228 connect->gapLen = sum / ( connect->weightNotInherit + weight );
230 if ( connect->weightNotInherit + weight <= 255 )
232 connect->weightNotInherit += weight;
234 else if ( connect->weightNotInherit < 255 )
236 connect->weightNotInherit = 255;
239 else
241 sum = connect->weight * connect->gapLen + gap * weight;
242 connect->gapLen = sum / ( connect->weight + weight );
244 if ( !connect->inherit )
246 connect->maxSingleWeight = connect->weightNotInherit;
249 connect->inherit = 1;
250 connect->maxSingleWeight = connect->maxSingleWeight > weight ? connect->maxSingleWeight : weight;
253 if ( connect->weight + weight <= 255 )
255 connect->weight += weight;
257 else if ( connect->weight < 255 )
259 connect->weight = 255;
262 else
264 newCntCounter++;
265 connect = allocateCN ( e2, gap );
267 if ( cntLookupTable )
269 putCnt2LookupTable ( e1, connect );
272 connect->weight = weight;
274 if ( contig_array[e1].mask || contig_array[e2].mask )
276 connect->mask = 1;
279 connect->next = contig_array[e1].downwardConnect;
280 contig_array[e1].downwardConnect = connect;
282 if ( !inherit )
284 connect->weightNotInherit = weight;
286 else
288 connect->weightNotInherit = 0;
289 connect->inherit = 1;
290 connect->maxSingleWeight = weight;
294 return connect;
297 /*************************************************
298 Function:
299 attach1PE
300 Description:
301 Checks paired-end relation between two contigs, and updates the connection between
302 contigs if the relation is fine.
303 Input:
304 1. e1: left contig
305 2. pre_pos: read1's start position on left contig
306 3. bal_e2: right contig in reversed direction
307 4. pos: read2's start position on right contig
308 5. insert_size: insert size of paired-end reads
309 Output:
310 None.
311 Return:
312 -1 if left contig and the reversed right contig were the same contig .
313 0 if the calculated distance between contigs was abnormal.
314 1 if paired-end relation was normal.
315 2 if read1 and read2 were aligned to the same contig.
316 *************************************************/
317 int attach1PE ( unsigned int e1, int pre_pos, unsigned int bal_e2, int pos, int insert_size )
319 int gap, realpeSize;
320 unsigned int bal_e1, e2;
322 if ( e1 == bal_e2 )
324 ignorePE1++;
325 return -1; //orientation wrong
328 bal_e1 = getTwinCtg ( e1 );
329 e2 = getTwinCtg ( bal_e2 );
331 if ( e1 == e2 )
333 realpeSize = contig_array[e1].length + overlaplen - pre_pos - pos;
335 if ( realpeSize > 0 )
337 peSUM += realpeSize;
338 onsameCtgPE++;
340 if ( ( int ) contig_array[e1].length > insert_size )
342 int * item = ( int * ) stackPush ( isStack );
343 ( *item ) = realpeSize;
347 return 2;
350 gap = insert_size - overlaplen + pre_pos + pos - contig_array[e1].length - contig_array[e2].length;
352 if ( gap < - ( insert_size / 10 ) )
354 ignorePE2++;
355 return 0;
358 if ( gap > insert_size )
360 ignorePE3++;
361 return 0;
364 add1Connect ( e1, e2, gap, 1, 0 );
365 add1Connect ( bal_e2, bal_e1, gap, 1, 0 );
366 return 1;
369 /*************************************************
370 Function:
371 connectByPE_grad
372 Description:
373 Loads alignment information of paired-end reads of specified LIB
374 and updates connection between contigs.
375 Input:
376 1. fp: alignment information file
377 2. peGrad: LIB number
378 3. line: buffer to store one alignment record
379 Output:
380 None.
381 Return:
382 Loaded alignment record number.
383 *************************************************/
384 int connectByPE_grad ( FILE * fp, int peGrad, char * line )
386 long long pre_readno, readno, minno, maxno;
387 int pre_pos, pos, flag, PE, count = 0, Total_PE = 0;
388 unsigned int pre_contigno, contigno, newIndex;
390 if ( peGrad < 0 || peGrad > gradsCounter )
392 fprintf ( stderr, "Specified pe grad is out of bound.\n" );
393 return 0;
396 maxno = pes[peGrad].PE_bound;
398 if ( peGrad == 0 )
400 minno = 0;
402 else
404 minno = pes[peGrad - 1].PE_bound;
407 onsameCtgPE = peSUM = 0;
408 PE = pes[peGrad].insertS;
410 if ( strlen ( line ) )
412 sscanf ( line, "%lld %d %d", &pre_readno, &pre_contigno, &pre_pos );
414 if ( pre_readno <= minno )
416 pre_readno = -1;
419 else
421 pre_readno = -1;
424 ignorePE1 = ignorePE2 = ignorePE3 = 0;
425 static_flag = 1;
426 isStack = ( STACK * ) createStack ( CNBLOCKSIZE, sizeof ( int ) );
428 while ( fgets ( line, lineLen, fp ) != NULL )
430 sscanf ( line, "%lld %d %d", &readno, &contigno, &pos );
432 if ( readno > maxno )
434 break;
437 if ( readno <= minno )
439 continue;
442 newIndex = index_array[contigno];
444 if ( isSameAsTwin ( newIndex ) )
446 continue;
449 if ( PE && ( readno % 2 == 0 ) && ( pre_readno == readno - 1 ) ) // they are a pair of reads
451 Total_PE++;
452 flag = attach1PE ( pre_contigno, pre_pos, newIndex, pos, PE );
454 if ( flag == 1 )
456 count++;
460 pre_readno = readno;
461 pre_contigno = newIndex;
462 pre_pos = pos;
465 fprintf ( stderr, "For insert size: %d\n", PE );
466 fprintf ( stderr, " Total PE links %d\n", Total_PE );
467 fprintf ( stderr, " Normal PE links on same contig %d\n", onsameCtgPE );
468 fprintf ( stderr, " Abnormal PE links on same contig %d\n", ignorePE1 );
469 fprintf ( stderr, " PE links of minus distance %d\n", ignorePE2 );
470 fprintf ( stderr, " PE links of plus distance %d\n", ignorePE3 );
471 fprintf ( stderr, " Correct PE links %d\n", count );
472 fprintf ( stderr, " Accumulated connections %d\n", newCntCounter );
473 fprintf ( stderr, "Use contigs longer than %d to estimate insert size: \n", PE );
474 fprintf ( stderr, " PE links %d\n", isStack->item_c );
475 calcuIS ( isStack );
476 freeStack ( isStack );
477 return count;
480 /*************************************************
481 Function:
482 connectByPE_grad_gz
483 Description:
484 Loads alignment information of paired-end reads of specified LIB
485 and updates connection between contigs.
486 Input:
487 1. fp: alignment information file in gz format
488 2. peGrad: LIB number
489 3. line: buffer to store one alignment record
490 Output:
491 None.
492 Return:
493 Loaded alignment record number.
494 *************************************************/
495 int connectByPE_grad_gz ( gzFile * fp, int peGrad, char * line )
497 long long pre_readno, readno, minno, maxno;
498 int pre_pos, pos, flag, PE, count = 0, Total_PE = 0;
499 unsigned int pre_contigno, contigno, newIndex;
501 if ( peGrad < 0 || peGrad > gradsCounter )
503 fprintf ( stderr, "Specified pe grad is out of bound.\n" );
504 return 0;
507 maxno = pes[peGrad].PE_bound;
509 if ( peGrad == 0 )
511 minno = 0;
513 else
515 minno = pes[peGrad - 1].PE_bound;
518 onsameCtgPE = peSUM = 0;
519 PE = pes[peGrad].insertS;
521 if ( strlen ( line ) )
523 sscanf ( line, "%lld %d %d", &pre_readno, &pre_contigno, &pre_pos );
525 if ( pre_readno <= minno )
527 pre_readno = -1;
530 else
532 pre_readno = -1;
535 ignorePE1 = ignorePE2 = ignorePE3 = 0;
536 static_flag = 1;
537 isStack = ( STACK * ) createStack ( CNBLOCKSIZE, sizeof ( int ) );
539 while ( gzgets ( fp, line, lineLen ) != NULL )
541 sscanf ( line, "%lld %d %d", &readno, &contigno, &pos );
543 if ( readno > maxno )
545 break;
548 if ( readno <= minno )
550 continue;
553 newIndex = index_array[contigno];
555 if ( isSameAsTwin ( newIndex ) )
557 continue;
560 if ( PE && ( readno % 2 == 0 ) && ( pre_readno == readno - 1 ) ) // they are a pair of reads
562 Total_PE++;
563 flag = attach1PE ( pre_contigno, pre_pos, newIndex, pos, PE );
565 if ( flag == 1 )
567 count++;
571 pre_readno = readno;
572 pre_contigno = newIndex;
573 pre_pos = pos;
576 fprintf ( stderr, "For insert size: %d\n", PE );
577 fprintf ( stderr, " Total PE links %d\n", Total_PE );
578 fprintf ( stderr, " Normal PE links on same contig %d\n", onsameCtgPE );
579 fprintf ( stderr, " Incorrect oriented PE links %d\n", ignorePE1 );
580 fprintf ( stderr, " PE links of too small insert size %d\n", ignorePE2 );
581 fprintf ( stderr, " PE links of too large insert size %d\n", ignorePE3 );
582 fprintf ( stderr, " Correct PE links %d\n", count );
583 fprintf ( stderr, " Accumulated connections %d\n", newCntCounter );
584 fprintf ( stderr, "Use contigs longer than %d to estimate insert size: \n", PE );
585 fprintf ( stderr, " PE links %d\n", isStack->item_c );
586 calcuIS ( isStack );
587 freeStack ( isStack );
588 return count;
591 /*************************************************
592 Function:
593 calcuIS
594 Description:
595 Calculates insert size of LIB using paired-end reads that both
596 reads were aligned to the same contig.
597 Input:
598 1. intStack: stack storing the alignment information
599 Output:
600 None.
601 Return:
602 Calculated insert size.
603 *************************************************/
604 static int calcuIS ( STACK * intStack )
606 long long sum = 0;
607 int avg = 0;
608 int * item;
609 int num = intStack->item_c;
611 if ( num < 100 )
613 fprintf ( stderr, "Too few PE links.\n" );
614 return avg;
617 stackBackup ( intStack );
619 while ( ( item = ( int * ) stackPop ( intStack ) ) != NULL )
621 sum += *item;
624 stackRecover ( intStack );
625 num = intStack->item_c;
626 avg = sum / num;
627 sum = 0;
628 stackBackup ( intStack );
630 while ( ( item = ( int * ) stackPop ( intStack ) ) != NULL )
632 sum += ( ( long long ) * item - avg ) * ( ( long long ) * item - avg );
635 int SD = sqrt ( sum / ( num - 1 ) );
637 if ( SD == 0 )
639 fprintf ( stderr, " Average insert size %d\n SD %d\n", avg, SD );
640 return avg;
643 stackRecover ( intStack );
644 sum = num = 0;
646 while ( ( item = ( int * ) stackPop ( intStack ) ) != NULL )
647 if ( abs ( *item - avg ) < 3 * SD )
649 sum += *item;
650 num++;
653 if ( num == 0 ) { avg = 0; }
654 else { avg = sum / num; }
656 fprintf ( stderr, " Average insert size %d\n SD %d\n", avg, SD );
657 return avg;
660 unsigned int getTwinCtg ( unsigned int ctg )
662 return ctg + contig_array[ctg].bal_edge - 1;
665 boolean isSmallerThanTwin ( unsigned int ctg )
667 return contig_array[ctg].bal_edge > 1;
670 boolean isLargerThanTwin ( unsigned int ctg )
672 return contig_array[ctg].bal_edge < 1;
675 boolean isSameAsTwin ( unsigned int ctg )
677 return contig_array[ctg].bal_edge == 1;