4 * Copyright (c) 2008-2012 BGI-Shenzhen <soap at genomics dot org dot cn>.
6 * This file is part of SOAPdenovo.
8 * SOAPdenovo is free software: you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation, either version 3 of the License, or
11 * (at your option) any later version.
13 * SOAPdenovo is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
18 * You should have received a copy of the GNU General Public License
19 * along with SOAPdenovo. If not, see <http://www.gnu.org/licenses/>.
31 #define CNBLOCKSIZE 10000
32 static STACK
* isStack
;
33 static int ignorePE1
, ignorePE2
, ignorePE3
, static_flag
;
34 static int onsameCtgPE
;
35 static unsigned long long peSUM
;
37 static boolean staticF
;
39 static int existCounter
;
41 static int calcuIS ( STACK
* intStack
);
43 static int cmp_pe ( const void * a
, const void * b
)
49 if ( A
->rank
> B
->rank
)
53 else if ( A
->rank
== B
->rank
)
63 /*************************************************
67 Loads general mapped information of paired-end reads.
69 1. infile: prefix of output graph file name
74 *************************************************/
75 void loadPEgrads ( char * infile
)
78 char name
[256], line
[1024];
81 sprintf ( name
, "%s.peGrads", infile
);
82 fp
= fopen ( name
, "r" );
86 fprintf ( stderr
, "Can not open file %s.\n", name
);
91 while ( fgets ( line
, sizeof ( line
), fp
) != NULL
)
95 sscanf ( line
+ 10, "%d %lld %d", &gradsCounter
, &n_solexa
, &maxReadLen
);
96 fprintf ( stderr
, "There are %d grad(s), %lld read(s), max read len %d.\n", gradsCounter
, n_solexa
, maxReadLen
);
101 alloc_pe_mem ( gradsCounter
);
103 for ( i
= 0; i
< gradsCounter
; i
++ )
105 fgets ( line
, sizeof ( line
), fp
);
107 sscanf ( line
, "%d %lld %d %d", & ( pes
[i
].insertS
), & ( pes
[i
].PE_bound
), & ( pes
[i
].rank
), & ( pes
[i
].pair_num_cut
) );
109 if ( pes
[i
].rank
< 1 )
119 qsort ( &pes
[0], gradsCounter
, sizeof ( PE_INFO
), cmp_pe
);
125 for ( i
= 0; i
< gradsCounter
; i
++ )
129 pes
[i
].rank
= ++lastRank
;
131 else if ( pes
[i
].insertS
< 300 )
133 pes
[i
].rank
= lastRank
;
135 else if ( pes
[i
].insertS
< 800 )
137 if ( pes
[i
- 1].insertS
< 300 )
139 pes
[i
].rank
= ++lastRank
;
143 pes
[i
].rank
= lastRank
;
146 else if ( pes
[i
].insertS
< 3000 )
148 if ( pes
[i
- 1].insertS
< 800 )
150 pes
[i
].rank
= ++lastRank
;
154 pes
[i
].rank
= lastRank
;
157 else if ( pes
[i
].insertS
< 7000 )
159 if ( pes
[i
- 1].insertS
< 3000 )
161 pes
[i
].rank
= ++lastRank
;
165 pes
[i
].rank
= lastRank
;
170 if ( pes
[i
- 1].insertS
< 7000 )
172 pes
[i
].rank
= ++lastRank
;
176 pes
[i
].rank
= lastRank
;
182 /*************************************************
186 Updates the connection between two contigs.
190 3. gap: distance between two contigs
191 4. weight: weight of connection
192 5. inherit: inheritance flag of connection
196 NULL if failed adding, otherwise the pointer to new or updated connection.
197 *************************************************/
198 CONNECT
* add1Connect ( unsigned int e1
, unsigned int e2
, int gap
, int weight
, boolean inherit
)
200 if ( e1
== e2
|| e1
== getTwinCtg ( e2
) )
205 CONNECT
* connect
= NULL
, *bal_connect
= NULL
;
213 connect
= getCntBetween ( e1
, e2
);
214 bal_connect
= getCntBetween ( e2
, e1
);
227 sum
= connect
->weightNotInherit
* connect
->gapLen
+ gap
* weight
;
228 connect
->gapLen
= sum
/ ( connect
->weightNotInherit
+ weight
);
230 if ( connect
->weightNotInherit
+ weight
<= 255 )
232 connect
->weightNotInherit
+= weight
;
234 else if ( connect
->weightNotInherit
< 255 )
236 connect
->weightNotInherit
= 255;
241 sum
= connect
->weight
* connect
->gapLen
+ gap
* weight
;
242 connect
->gapLen
= sum
/ ( connect
->weight
+ weight
);
244 if ( !connect
->inherit
)
246 connect
->maxSingleWeight
= connect
->weightNotInherit
;
249 connect
->inherit
= 1;
250 connect
->maxSingleWeight
= connect
->maxSingleWeight
> weight
? connect
->maxSingleWeight
: weight
;
253 if ( connect
->weight
+ weight
<= 255 )
255 connect
->weight
+= weight
;
257 else if ( connect
->weight
< 255 )
259 connect
->weight
= 255;
265 connect
= allocateCN ( e2
, gap
);
267 if ( cntLookupTable
)
269 putCnt2LookupTable ( e1
, connect
);
272 connect
->weight
= weight
;
274 if ( contig_array
[e1
].mask
|| contig_array
[e2
].mask
)
279 connect
->next
= contig_array
[e1
].downwardConnect
;
280 contig_array
[e1
].downwardConnect
= connect
;
284 connect
->weightNotInherit
= weight
;
288 connect
->weightNotInherit
= 0;
289 connect
->inherit
= 1;
290 connect
->maxSingleWeight
= weight
;
297 /*************************************************
301 Checks paired-end relation between two contigs, and updates the connection between
302 contigs if the relation is fine.
305 2. pre_pos: read1's start position on left contig
306 3. bal_e2: right contig in reversed direction
307 4. pos: read2's start position on right contig
308 5. insert_size: insert size of paired-end reads
312 -1 if left contig and the reversed right contig were the same contig .
313 0 if the calculated distance between contigs was abnormal.
314 1 if paired-end relation was normal.
315 2 if read1 and read2 were aligned to the same contig.
316 *************************************************/
317 int attach1PE ( unsigned int e1
, int pre_pos
, unsigned int bal_e2
, int pos
, int insert_size
)
320 unsigned int bal_e1
, e2
;
325 return -1; //orientation wrong
328 bal_e1
= getTwinCtg ( e1
);
329 e2
= getTwinCtg ( bal_e2
);
333 realpeSize
= contig_array
[e1
].length
+ overlaplen
- pre_pos
- pos
;
335 if ( realpeSize
> 0 )
340 if ( ( int ) contig_array
[e1
].length
> insert_size
)
342 int * item
= ( int * ) stackPush ( isStack
);
343 ( *item
) = realpeSize
;
350 gap
= insert_size
- overlaplen
+ pre_pos
+ pos
- contig_array
[e1
].length
- contig_array
[e2
].length
;
352 if ( gap
< - ( insert_size
/ 10 ) )
358 if ( gap
> insert_size
)
364 add1Connect ( e1
, e2
, gap
, 1, 0 );
365 add1Connect ( bal_e2
, bal_e1
, gap
, 1, 0 );
369 /*************************************************
373 Loads alignment information of paired-end reads of specified LIB
374 and updates connection between contigs.
376 1. fp: alignment information file
377 2. peGrad: LIB number
378 3. line: buffer to store one alignment record
382 Loaded alignment record number.
383 *************************************************/
384 int connectByPE_grad ( FILE * fp
, int peGrad
, char * line
)
386 long long pre_readno
, readno
, minno
, maxno
;
387 int pre_pos
, pos
, flag
, PE
, count
= 0, Total_PE
= 0;
388 unsigned int pre_contigno
, contigno
, newIndex
;
390 if ( peGrad
< 0 || peGrad
> gradsCounter
)
392 fprintf ( stderr
, "Specified pe grad is out of bound.\n" );
396 maxno
= pes
[peGrad
].PE_bound
;
404 minno
= pes
[peGrad
- 1].PE_bound
;
407 onsameCtgPE
= peSUM
= 0;
408 PE
= pes
[peGrad
].insertS
;
410 if ( strlen ( line
) )
412 sscanf ( line
, "%lld %d %d", &pre_readno
, &pre_contigno
, &pre_pos
);
414 if ( pre_readno
<= minno
)
424 ignorePE1
= ignorePE2
= ignorePE3
= 0;
426 isStack
= ( STACK
* ) createStack ( CNBLOCKSIZE
, sizeof ( int ) );
428 while ( fgets ( line
, lineLen
, fp
) != NULL
)
430 sscanf ( line
, "%lld %d %d", &readno
, &contigno
, &pos
);
432 if ( readno
> maxno
)
437 if ( readno
<= minno
)
442 newIndex
= index_array
[contigno
];
444 if ( isSameAsTwin ( newIndex
) )
449 if ( PE
&& ( readno
% 2 == 0 ) && ( pre_readno
== readno
- 1 ) ) // they are a pair of reads
452 flag
= attach1PE ( pre_contigno
, pre_pos
, newIndex
, pos
, PE
);
461 pre_contigno
= newIndex
;
465 fprintf ( stderr
, "For insert size: %d\n", PE
);
466 fprintf ( stderr
, " Total PE links %d\n", Total_PE
);
467 fprintf ( stderr
, " Normal PE links on same contig %d\n", onsameCtgPE
);
468 fprintf ( stderr
, " Abnormal PE links on same contig %d\n", ignorePE1
);
469 fprintf ( stderr
, " PE links of minus distance %d\n", ignorePE2
);
470 fprintf ( stderr
, " PE links of plus distance %d\n", ignorePE3
);
471 fprintf ( stderr
, " Correct PE links %d\n", count
);
472 fprintf ( stderr
, " Accumulated connections %d\n", newCntCounter
);
473 fprintf ( stderr
, "Use contigs longer than %d to estimate insert size: \n", PE
);
474 fprintf ( stderr
, " PE links %d\n", isStack
->item_c
);
476 freeStack ( isStack
);
480 /*************************************************
484 Loads alignment information of paired-end reads of specified LIB
485 and updates connection between contigs.
487 1. fp: alignment information file in gz format
488 2. peGrad: LIB number
489 3. line: buffer to store one alignment record
493 Loaded alignment record number.
494 *************************************************/
495 int connectByPE_grad_gz ( gzFile
* fp
, int peGrad
, char * line
)
497 long long pre_readno
, readno
, minno
, maxno
;
498 int pre_pos
, pos
, flag
, PE
, count
= 0, Total_PE
= 0;
499 unsigned int pre_contigno
, contigno
, newIndex
;
501 if ( peGrad
< 0 || peGrad
> gradsCounter
)
503 fprintf ( stderr
, "Specified pe grad is out of bound.\n" );
507 maxno
= pes
[peGrad
].PE_bound
;
515 minno
= pes
[peGrad
- 1].PE_bound
;
518 onsameCtgPE
= peSUM
= 0;
519 PE
= pes
[peGrad
].insertS
;
521 if ( strlen ( line
) )
523 sscanf ( line
, "%lld %d %d", &pre_readno
, &pre_contigno
, &pre_pos
);
525 if ( pre_readno
<= minno
)
535 ignorePE1
= ignorePE2
= ignorePE3
= 0;
537 isStack
= ( STACK
* ) createStack ( CNBLOCKSIZE
, sizeof ( int ) );
539 while ( gzgets ( fp
, line
, lineLen
) != NULL
)
541 sscanf ( line
, "%lld %d %d", &readno
, &contigno
, &pos
);
543 if ( readno
> maxno
)
548 if ( readno
<= minno
)
553 newIndex
= index_array
[contigno
];
555 if ( isSameAsTwin ( newIndex
) )
560 if ( PE
&& ( readno
% 2 == 0 ) && ( pre_readno
== readno
- 1 ) ) // they are a pair of reads
563 flag
= attach1PE ( pre_contigno
, pre_pos
, newIndex
, pos
, PE
);
572 pre_contigno
= newIndex
;
576 fprintf ( stderr
, "For insert size: %d\n", PE
);
577 fprintf ( stderr
, " Total PE links %d\n", Total_PE
);
578 fprintf ( stderr
, " Normal PE links on same contig %d\n", onsameCtgPE
);
579 fprintf ( stderr
, " Incorrect oriented PE links %d\n", ignorePE1
);
580 fprintf ( stderr
, " PE links of too small insert size %d\n", ignorePE2
);
581 fprintf ( stderr
, " PE links of too large insert size %d\n", ignorePE3
);
582 fprintf ( stderr
, " Correct PE links %d\n", count
);
583 fprintf ( stderr
, " Accumulated connections %d\n", newCntCounter
);
584 fprintf ( stderr
, "Use contigs longer than %d to estimate insert size: \n", PE
);
585 fprintf ( stderr
, " PE links %d\n", isStack
->item_c
);
587 freeStack ( isStack
);
591 /*************************************************
595 Calculates insert size of LIB using paired-end reads that both
596 reads were aligned to the same contig.
598 1. intStack: stack storing the alignment information
602 Calculated insert size.
603 *************************************************/
604 static int calcuIS ( STACK
* intStack
)
609 int num
= intStack
->item_c
;
613 fprintf ( stderr
, "Too few PE links.\n" );
617 stackBackup ( intStack
);
619 while ( ( item
= ( int * ) stackPop ( intStack
) ) != NULL
)
624 stackRecover ( intStack
);
625 num
= intStack
->item_c
;
628 stackBackup ( intStack
);
630 while ( ( item
= ( int * ) stackPop ( intStack
) ) != NULL
)
632 sum
+= ( ( long long ) * item
- avg
) * ( ( long long ) * item
- avg
);
635 int SD
= sqrt ( sum
/ ( num
- 1 ) );
639 fprintf ( stderr
, " Average insert size %d\n SD %d\n", avg
, SD
);
643 stackRecover ( intStack
);
646 while ( ( item
= ( int * ) stackPop ( intStack
) ) != NULL
)
647 if ( abs ( *item
- avg
) < 3 * SD
)
653 if ( num
== 0 ) { avg
= 0; }
654 else { avg
= sum
/ num
; }
656 fprintf ( stderr
, " Average insert size %d\n SD %d\n", avg
, SD
);
660 unsigned int getTwinCtg ( unsigned int ctg
)
662 return ctg
+ contig_array
[ctg
].bal_edge
- 1;
665 boolean
isSmallerThanTwin ( unsigned int ctg
)
667 return contig_array
[ctg
].bal_edge
> 1;
670 boolean
isLargerThanTwin ( unsigned int ctg
)
672 return contig_array
[ctg
].bal_edge
< 1;
675 boolean
isSameAsTwin ( unsigned int ctg
)
677 return contig_array
[ctg
].bal_edge
== 1;