3 extend.c extend in one direction of BASE.
5 # Copyright (C) 2015, The University of Hong Kong.
7 # This program is free software; you can redistribute it and/or
8 # modify it under the terms of the GNU General Public License
9 # as published by the Free Software Foundation; either version 3
10 # of the License, or (at your option) any later version.
12 # This program is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU General Public License for more details.
17 # You should have received a copy of the GNU General Public License
18 # along with this program; if not, write to the Free Software
19 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 03110-1301, USA.
22 Author : Binghang Liu, bhliu@cs.hku.hk
28 Get reads id and keep them in reads.
31 int set_read_ids_plus(ULL l
, ULL r
, int* tCount
, int level
, int node
, int size
, int cons
, int threadId
)
38 fprintf(stderr
, "Index PLus Error: left: %d right %d\n", l
, r
);
43 ULL id
= idx2BWT
[size
]->readIDtable
[l
+ i
- idx2BWT
[size
]->bwt
->cumulativeFreq
[4]];
44 g_reads
[threadId
][c
+i
].id
= id
;
45 g_reads
[threadId
][c
+i
].strand
= 1;
46 g_reads
[threadId
][c
+i
].size
= size
;
47 g_reads
[threadId
][c
+i
].level
= level
;
48 g_reads
[threadId
][c
+i
].node
= node
;
49 g_reads
[threadId
][c
+i
].cons
= cons
;
55 int set_read_ids_minus(ULL l
, ULL r
, int* tCount
, int level
, int node
, int size
, int cons
, int threadId
)
58 er
.saL
= l
; er
.saR
= r
; er
.strand
= 0;
61 fprintf(stderr
, "Index Minus Error: left: %d right %d\n", l
, r
);
65 int add_num
= extractReadInf(idx2BWT
[size
], er
, ri
, r
-l
+2);
68 for( i
=0; i
< add_num
; i
++)
70 g_reads
[threadId
][b
+i
].id
= ri
[i
].read_id
;
71 g_reads
[threadId
][b
+i
].level
= level
;
72 g_reads
[threadId
][b
+i
].strand
= 0;
73 g_reads
[threadId
][b
+i
].node
= node
;
74 g_reads
[threadId
][b
+i
].size
= size
;
75 g_reads
[threadId
][b
+i
].cons
= cons
;
77 (*tCount
) = b
+ add_num
;
82 Add new bases, including ACGT$.
84 int add_base(Base
* b
, int base
, Base
*s
)
86 ULL l
, r
, ll
, rr
, ll_rev
, rr_rev
, depth
=0;
89 for(i
=0; i
< bwtCount
; i
++)
91 s
[i
].plus
= 0; s
[i
].minus
= 0;
92 s
[i
].saL
= 0; s
[i
].saR
= 0; s
[i
].saLL
= 0; s
[i
].saRR
= 0; s
[i
].saLLrev
= 0; s
[i
].saRRrev
= 0;
93 if(b
[i
].plus
> 0 && b
[i
].saR
>= b
[i
].saL
&& b
[i
].saR
- b
[i
].saL
< MAXDEPTH
)
95 l
= b
[i
].saL
; r
= b
[i
].saR
;
96 BWTSARangeBackwardFor(idx2BWT
[i
], base
,&l
, &r
);
97 if(l
<= r
&& r
-l
<= b
[i
].saR
- b
[i
].saL
)
101 s
[i
].plus
= r
- l
+ 1;
104 if(b
[i
].minus
> 0 && b
[i
].saRR
>= b
[i
].saLL
&& b
[i
].saRR
- b
[i
].saLL
< MAXDEPTH
)
106 ll
= b
[i
].saLL
; rr
= b
[i
].saRR
; ll_rev
= b
[i
].saLLrev
; rr_rev
= b
[i
].saRRrev
;
107 BWTSARangeBackwardRev(idx2BWT
[i
], 3-base
,&ll
, &rr
, &ll_rev
, &rr_rev
);
108 if(ll
<= rr
&& rr
-ll
<= b
[i
].saRR
- b
[i
].saLL
)
112 s
[i
].saLLrev
= ll_rev
;
113 s
[i
].saRRrev
= rr_rev
;
114 s
[i
].minus
= rr
-ll
+ 1;
117 depth
+= s
[i
].plus
+ s
[i
].minus
;
123 int add_end(Base
* b
, int* read_num
, int level
, int node
, int cons
, int threadId
)
125 ULL l
, r
, ll
, rr
, ll_rev
, rr_rev
, total
=0;
128 for(i
=0; i
< bwtCount
; i
++)
130 if(b
[i
].plus
> 0 && b
[i
].saR
>= b
[i
].saL
&& b
[i
].saR
- b
[i
].saL
< MAXDEPTH
)
132 l
= b
[i
].saL
; r
= b
[i
].saR
;
133 BWTSARangeBackward(idx2BWT
[i
], 4 ,&l
, &r
);
134 if(l
<= r
&& r
-l
<= b
[i
].saR
- b
[i
].saL
)
136 flag
= set_read_ids_plus(l
, r
, read_num
, level
, node
, i
, cons
, threadId
);
141 if(b
[i
].minus
> 0 && b
[i
].saRR
>= b
[i
].saLL
&& b
[i
].saRR
- b
[i
].saLL
< MAXDEPTH
)
143 ll
= b
[i
].saLL
; rr
= b
[i
].saRR
; ll_rev
= b
[i
].saLLrev
; rr_rev
= b
[i
].saRRrev
;
144 BWTSARangeBackwardRev(idx2BWT
[i
], 4 ,&ll
, &rr
, &ll_rev
, &rr_rev
);
145 if(ll
<= rr
&& rr
-ll
<= b
[i
].saRR
- b
[i
].saLL
)
147 flag
= set_read_ids_minus(ll
, rr
, read_num
, level
, node
, i
, cons
, threadId
);
149 total
+= rr
- ll
+ 1;
156 int get_real_depth(Base
* b
)
160 for(i
=0; i
< bwtCount
; i
++)
161 depth
+= b
[i
].plus
+ b
[i
].minus
;
165 int get_strand_depth(Base
* b
, int strand
)
169 for(i
=0; i
< bwtCount
; i
++)
178 void set_blank_base(Base
* to
)
181 for(i
=0; i
< bwtCount
; i
++)
188 void print_inf(FILE* fp
, Node
* nodes
, Lev
* levs
, ReadId
* reads
, int* s
, int cur_len
);
189 void print_base(FILE* fp
, Base
* b
)
192 for(i
=0; i
< bwtCount
; i
++)
194 fprintf(fp
, "i: %d l %lld r %lld ll %lld rr %lld plus %d minus %d\n", i
, b
->saL
, b
->saR
, b
->saLL
, b
->saRR
, b
->plus
, b
->minus
);
198 construct the backward extension tree.
200 int backward_search(FILE* logs
, int* ss
, int threadId
)
202 int num
[3]={ss
[0], ss
[1], ss
[2]};
204 ULL l
, r
, ll
, rr
, ll_rev
, rr_rev
;
206 //define left to speed up.
207 unsigned long long left
, flag
;
209 g_levs
[threadId
][num
[1]].nnum
=0;
211 int max_i
=0, max_j
=0, max_depth
=0, count
, cons
, node_max_i
= -1, node_max_j
= 0, node_max_depth
=0;
213 for(j
=num
[0]-1; j
>=0; j
--)
215 if(g_nodes
[threadId
][j
].is_end
== 1)
217 if(g_nodes
[threadId
][j
].level
!= num
[1]-1)
220 left
= get_real_depth(g_nodes
[threadId
][j
].b
);
223 fprintf(stderr
, "j=%d, b[0].l=%d, b[0].r=%d, b[0].ll=%d, b[0].rr=%d\n", j
, g_nodes
[threadId
][j
].b
[0].saL
, g_nodes
[threadId
][j
].b
[0].saR
, g_nodes
[threadId
][j
].b
[0].saLL
, g_nodes
[threadId
][j
].b
[0].saRR
);
227 if(j
== 0 && num
[1] == 1)
229 if(j
== g_levs
[threadId
][num
[1]-1].node
&& g_levs
[threadId
][ num
[1] - g_nodes
[threadId
][j
].len
].node
== j
)
232 flag
= add_end(g_nodes
[threadId
][j
].b
, &(num
[2]), num
[1], j
, cons
, threadId
);
234 g_levs
[threadId
][num
[1]].c
[4] += flag
;
238 g_nodes
[threadId
][j
].is_end
= 1;
239 if(max_depth
== 0 || max_depth
< flag
)
249 int now_max_depth
=0, now_max_i
=0, now_noend_depth
= left
;
254 flag
= add_base(g_nodes
[threadId
][j
].b
, i
, sbase
[threadId
][i
]);
259 g_levs
[threadId
][num
[1]].c
[i
] += flag
;
261 if(now_max_depth
< flag
)
264 now_max_depth
= flag
;
269 fprintf(stderr
, "Error: total %d, left %d, c[0] %d c[1] %d c[2] %d c[3] %d\n", get_real_depth(g_nodes
[threadId
][j
].b
), left
, g_levs
[threadId
][num
[1]].c
[0], g_levs
[threadId
][num
[1]].c
[1], g_levs
[threadId
][num
[1]].c
[2], g_levs
[threadId
][num
[1]].c
[3]);
270 print_base(stderr
, g_nodes
[threadId
][j
].b
);
275 g_nodes
[threadId
][j
].seq
[g_nodes
[threadId
][j
].len
] = dnaChar
[now_max_i
];
276 g_nodes
[threadId
][j
].len
++;
277 g_nodes
[threadId
][j
].seq
[g_nodes
[threadId
][j
].len
] = '\0';
278 g_nodes
[threadId
][j
].level
= num
[1];
279 set_base(g_nodes
[threadId
][j
].b
, sbase
[threadId
][now_max_i
]);
280 g_levs
[threadId
][num
[1]].nnum
++;
282 if(max_depth
< now_max_depth
)
284 max_i
= now_max_i
; max_j
= j
;
285 max_depth
= now_max_depth
;
286 }else if(max_depth
== now_max_depth
)
287 { if(max_i
!= now_max_i
)
289 if(g_levs
[threadId
][ss
[1]-1].node
== j
)
291 max_i
= now_max_i
; max_j
= j
;
294 }else if(max_i
== now_max_i
)
296 if(g_levs
[threadId
][ss
[1]-1].node
== j
)
301 if(j
== g_levs
[threadId
][ss
[1]-1].node
)
303 node_max_i
= now_max_i
; node_max_j
= j
;
304 node_max_depth
= now_max_depth
;
311 int real_depth
= get_real_depth(sbase
[threadId
][k
]);
314 set_base(g_nodes
[threadId
][num
[0]].b
, sbase
[threadId
][k
]);
315 g_nodes
[threadId
][num
[0]].seq
[0] = dnaChar
[k
];
316 g_nodes
[threadId
][num
[0]].seq
[1] = '\0';
317 g_nodes
[threadId
][num
[0]].len
= 1;
318 g_nodes
[threadId
][num
[0]].level
= num
[1];
319 g_nodes
[threadId
][num
[0]].parent
= j
;
320 g_nodes
[threadId
][num
[0]].depth
= real_depth
;
321 g_nodes
[threadId
][num
[0]].is_end
= 0;
322 g_nodes
[threadId
][num
[0]].conf
= 0;
324 if(max_depth
< real_depth
)
326 max_i
= k
; max_j
= num
[0];
327 max_depth
= g_nodes
[threadId
][num
[0]].depth
;
328 }else if(max_depth
== real_depth
)
331 if(g_levs
[threadId
][ss
[1]-1].node
== j
&& g_levs
[threadId
][ss
[1]-1].node
!= g_nodes
[threadId
][max_j
].parent
)
333 max_i
= k
; max_j
= num
[0];
337 if(max_i
== k
&& g_levs
[threadId
][ss
[1]-1].node
== j
&& g_levs
[threadId
][ss
[1]-1].node
!= g_nodes
[threadId
][max_j
].parent
)
344 g_levs
[threadId
][num
[1]].nnum
++;
348 if(now_noend_depth
* 0.75 > now_max_depth
)
350 g_nodes
[threadId
][j
].conf
= 1;
354 //improve the treatment of heterozygous.
355 if(ss
[1] > 1 && num
[0] == ss
[0] && max_j
!= g_levs
[threadId
][ss
[1]-1].node
) //no change of node number.
357 if(node_max_i
== max_i
)
359 max_j
= g_levs
[threadId
][ss
[1]-1].node
;
360 max_depth
= node_max_depth
;
364 if(g_levs
[threadId
][num
[1]].nnum
== 0)
366 if(g_levs
[threadId
][num
[1]].c
[4] > 0)
368 g_levs
[threadId
][num
[1]].node
= max_j
;
369 g_levs
[threadId
][num
[1]].base
= max_i
;
370 g_levs
[threadId
][num
[1]].depth
= 0;
378 g_levs
[threadId
][num
[1]].base
= max_i
;
379 g_levs
[threadId
][num
[1]].depth
= max_depth
;
380 g_levs
[threadId
][num
[1]].node
= max_j
;
390 call consensus of the extended region.
393 int pe_check_pure(ULL id
, int size
, int pos
, int strand
, int *real_size
, int threadId
)
396 LL pid
= get_pair_id(id
);
399 fprintf(stdout
, "Check %lld,%lld,%d,%d ", id
, pid
, strand
, pos
);
403 if(DEBUG
) fprintf(stdout
, "Minus\n");
406 if(is_used_bwt_sparse(pid
, size
) == 0)
408 if(DEBUG
) fprintf(stdout
, "NotUsed\n");
411 int i
= tmpCount
[threadId
]-1;
412 int lowerbound
= pos
- (InsertSize
[size
] - Read_length
) - 3*SD
[size
];
415 if(tmpRead
[threadId
][i
].size
!= size
)
417 if(tmpRead
[threadId
][i
].pos
< lowerbound
)
419 if(DEBUG
) fprintf(stdout
, "OutLowerBound\n");
422 if(tmpRead
[threadId
][i
].id
== pid
)
424 *real_size
= pos
- tmpRead
[threadId
][i
].pos
+ 1 + Read_length
;
425 if(DEBUG
) fprintf(stdout
, "Find %d ", *real_size
);
426 if(tmpRead
[threadId
][i
].cons
!= 1 || tmpRead
[threadId
][i
].unique
!= 1)
428 if(DEBUG
) fprintf(stdout
, "CONS%dUnique%d\n", tmpRead
[threadId
][i
].cons
, tmpRead
[threadId
][i
].unique
);
431 *real_size
= pos
- tmpRead
[threadId
][i
].pos
+ 1 + Read_length
;
432 if(DEBUG
) fprintf(stdout
, "\n");
436 if(DEBUG
) fprintf(stdout
, "NotFound\n");
460 int get_subopt_brother(Node
* nodes
, int* bs
, int start
, int bi
, int level
, int num
)
463 for(i
= start
-1; i
>= 0; i
--)
465 if(nodes
[i
].level
< level
)
467 if(bs
[i
] == bi
|| i
== bi
)
473 if(nodes
[sub_i
].depth
< nodes
[i
].depth
)
478 for(i
=start
+1; i
< num
; i
++)
480 if(nodes
[i
].level
- nodes
[i
].len
> level
)
482 if(bs
[i
] == bi
|| i
== bi
)
487 if(nodes
[sub_i
].depth
< nodes
[i
].depth
)
495 void clean_inf(FILE* logs
, Node
* nodes
, Lev
* levs
, ReadId
* reads
, int* s
, int* bs
, int bi
)
497 //only update the levs, s[1], reads.node
499 for(i
=0; i
< s
[2]; i
++)
501 if(bs
[reads
[i
].node
] != bi
)
509 for(i
=1; i
< s
[1]; i
++)
511 if(bs
[levs
[i
].node
] != bi
)
513 j
= get_subopt_brother(nodes
, bs
, levs
[i
].node
, bi
, i
, s
[0]);
517 fprintf(stderr
, "Cut directly %d->%d\n", s
[1], i
);
521 int p
= nodes
[j
].len
- (nodes
[j
].level
- i
+ 1) ;
522 int newbase
= base2int(nodes
[j
].seq
[p
]);
523 int oldnode
= levs
[i
].node
;
524 int oldbase
= levs
[i
].base
;
527 if(newbase
== levs
[i
].base
)
529 levs
[i
].depth
= levs
[i
].c
[levs
[i
].base
] - levs
[i
].depth
;
530 levs
[i
].c
[levs
[i
].base
] = levs
[i
].depth
;
531 if(levs
[i
].depth
> nodes
[j
].depth
)
533 levs
[i
].depth
= nodes
[j
].depth
; //would be overestimated.
542 levs
[i
].base
= newbase
;
543 levs
[i
].depth
= levs
[i
].c
[levs
[i
].base
];
544 if(levs
[i
].depth
> nodes
[j
].depth
)
545 levs
[i
].depth
= nodes
[j
].depth
; //would be overestimated.
548 break; //solve one error per time.
550 if(levs
[i
].depth
< Error_depth
)
559 if(k
!= levs
[i
].base
)
569 int get_ancestor(Node
* nodes
, int i
)
572 while(nodes
[nowi
].parent
!= 0)
573 nowi
= nodes
[nowi
].parent
;
578 int fill_branches(Node
* nodes
, int n
, int* bs
)
584 if(nodes
[i
].parent
!= 0)
595 bs
[i
] = get_ancestor(nodes
, i
);
601 void clean_path(FILE* logs
, Node
* nodes
, Lev
* levs
, ReadId
* reads
, int* s
, int ancestor
)
606 for(i
=0; i
< s
[0]; i
++)
609 int branch_count
= fill_branches(nodes
, s
[0], bs
);
611 clean_inf(logs
, nodes
, levs
, reads
, s
, bs
, ancestor
);
614 int solve_conf(FILE* logs
, Node
* nodes
, Lev
* levs
, ReadId
* reads
, int* s
, int seed_len
, int ctg_len
, int threadId
, int diff
)
616 int i
, total_pe_count
=0, real_size
;
619 for(i
=0; i
< s
[0]; i
++)
622 int branch_count
= fill_branches(nodes
, s
[0], bs
);
623 int count
[branch_count
];
624 int pecount
[branch_count
];
626 for(i
=0; i
< branch_count
; i
++)
628 count
[i
]=0; pecount
[i
]=0;
631 for(i
=0; i
< s
[2]; i
++)
633 if(reads
[i
].node
== 0)
635 count
[bs
[reads
[i
].node
]]++;
636 if(pe_check_pure(reads
[i
].id
, reads
[i
].size
, reads
[i
].level
+ ctg_len
, reads
[i
].strand
, &real_size
, threadId
) == 1)
638 if(DEBUG
) fprintf(logs
, "Got: %d %d %d %d\n", reads
[i
].id
, reads
[i
].node
, bs
[reads
[i
].node
], real_size
);
639 pecount
[bs
[reads
[i
].node
]]++;
644 int sup_i
=-1, sup_c
=0, c
=0, total_c
=0, total_pe_c
= 0;
645 for(i
=1; i
< branch_count
; i
++)
647 total_pe_c
+= pecount
[i
];
649 if(count
[i
] >= Error_depth
&& pecount
[i
] > 0)
652 if(DEBUG
) fprintf(logs
, "confs: %d:%d:%d\n", i
, count
[i
], pecount
[i
]);
653 //total_c += count[i];
654 //total_pe_c += pecount[i];
655 if(sup_c
< pecount
[i
])
666 if(DEBUG
) fprintf(logs
, "CONF:%d,%d,%d,%d,%d,%d,%d;\n", c
, branch_count
, sup_i
, sup_c
, count
[sup_i
], total_c
, total_pe_c
);
667 if( (c
== 1 && (sup_c
> Error_depth
|| (sup_c
== Error_depth
&& total_pe_c
== sup_c
))&& sup_c
>= count
[sup_i
]/10)//&& (sup_c >= 2 * Consensus_depth || (sup_c < 2 * Consensus_depth && sup_c == total_pe_count )))// && total_pe_count == sup_c)
668 || (c
> 1 && sup_c
> Error_depth
&& sup_c
>= count
[sup_i
]/5 && sup_c
> double(total_pe_c
) * 0.9 && total_pe_c
- sup_c
< Error_depth
)
671 clean_inf(logs
, nodes
, levs
, reads
, s
, bs
, sup_i
);
675 //try to solve heterozygous bubble.
676 if(Solve_Hete
== 1 && diff
< 5 && c
== 2 && sup_c
> 2 && seed_len
<= depth_half_length
&& nodes
[0].depth
<= Upper_bound
* double(Expect_depth
*(Read_length
-seed_len
+1))/double(Read_length
))
678 clean_inf(logs
, nodes
, levs
, reads
, s
, bs
, sup_i
);
684 void get_check_seq(char* seq
, char* ctg_seq
, Lev
* levs
, int lnum
, int ctg_len
)
688 for(i
=ctg_len
-1; i
>=0 &&k
< Read_length
; k
++, i
--)
694 if(DEBUG
) fprintf(stdout
, "check seq:\n%s\n", seq
);
698 int is_full_cons(ULL read_id
, int size
, int read_strand
, int level
, char* check_seq
, int check_seq_len
, int seed_len
, int lnum
)
700 int start
, end
, startp
;
704 start
= level
+ seed_len
-1;
707 return minus_extract_and_compare(idx2BWT
[size
], read_id
, start
, end
, check_seq
, check_seq_len
, startp
);
709 start
= Read_length
- (level
+ seed_len
);
711 startp
= seed_len
+ start
;
712 if(startp
== Read_length
)
714 return plus_extract_and_compare(idx2BWT
[size
], read_id
, start
, end
, check_seq
, check_seq_len
, startp
);
718 int is_full_consensus(ULL read_id
, int size
, int read_strand
, int read_level
, char* check_seq
, int check_seq_len
, int seed_len
, int lnum
)
720 char seq
[Read_length
+ 1];
721 int qual
[Read_length
+ 1];
722 //is_full_cons(read_id, size, read_strand, read_level, check_seq, check_seq_len, seed_len, lnum);
723 get_id_seq(seq
, qual
, read_id
, size
);
728 int rqual
[Read_length
+ 1];
729 char rcseq
[Read_length
+ 1];
730 reverse_com(seq
, rcseq
, Read_length
);
732 for(i
=0; i
< Read_length
; i
++)
733 rqual
[i
] = qual
[Read_length
-i
-1];
735 i
= read_level
+seed_len
-1; j
= seed_len
;
736 if(DEBUG
) fprintf(stdout
, "Minu Read: %s, pos %d,%c; Cons %d,%c\n", rcseq
, i
, rcseq
[i
], j
, check_seq
[j
]);
738 for(; i
< Read_length
; i
++, j
++)
740 if(rcseq
[i
] != check_seq
[j
] && rqual
[i
] == 1)
742 if(DEBUG
) fprintf(stdout
, "i: %d,%c->%c,%d\n", i
, check_seq
[j
], rcseq
[i
], rqual
[i
]);
748 i
= read_level
+seed_len
-1; j
= seed_len
;
749 if(DEBUG
) fprintf(stdout
, "Plus Read: %s, pos %d,%c; Cons %d,%c\n", seq
, i
, seq
[i
], j
, check_seq
[j
]);
751 for(; i
< Read_length
; i
++, j
++)
753 if(seq
[i
] != check_seq
[j
] && qual
[i
] == 1)
755 if(DEBUG
) fprintf(stdout
, "i: %d,%c->%c,%d\n", i
, check_seq
[j
], seq
[i
], qual
[i
]);
760 if(DEBUG
) fflush(stdout
);
761 // is_full_cons(read_id, size, read_strand, read_level, check_seq, check_seq_len, seed_len, lnum);
765 int mask_last_reads(FILE* logs
, int cut_level
, int* s
, char* ctg_seq
, int keep_used
, int* keep_used_extension
, int *has_used_read
, int ctg_len
, int iter
, int unique
, int threadId
, int has_repeat_break
, int seed_len
)
767 int i
, j
=tmpCount
[threadId
], used
, cons
, cut
= cut_level
, tid
=-1, check_read
=0, use_read
=0;
768 char check_seq
[2*Read_length
+ 2];
770 get_check_seq(check_seq
, ctg_seq
, g_levs
[threadId
], s
[1], ctg_len
);
771 int check_seq_len
= strlen(check_seq
), cons_check
;
773 for(i
=0; i
< s
[2]; i
++)
775 if(i
> 0 && g_reads
[threadId
][i
].node
== 0 && g_reads
[threadId
][i
].level
== 0)
777 if(g_reads
[threadId
][i
].level
> cut_level
)
780 if(j
>= MaxReadCount
)
782 fprintf(stderr
, "Too many reads %d\n", j
);
785 tmpRead
[threadId
][j
].id
= g_reads
[threadId
][i
].id
;
786 tmpRead
[threadId
][j
].strand
= g_reads
[threadId
][i
].strand
;
787 tmpRead
[threadId
][j
].size
= g_reads
[threadId
][i
].size
;
788 tmpRead
[threadId
][j
].pos
= g_reads
[threadId
][i
].level
- 1 + ctg_len
; //start from 1 for level.
789 tmpRead
[threadId
][j
].iter
= iter
;
790 tmpRead
[threadId
][j
].unique
= unique
;
791 tmpRead
[threadId
][j
].cons
= g_reads
[threadId
][i
].cons
;
792 tmpRead
[threadId
][j
].keep
= 0;
794 cons_check
= double(Read_length
)/double(g_reads
[threadId
][i
].level
+ seed_len
) > Upper_bound
? 1 : 0;
795 if(DEBUG
) fprintf(logs
, "read %d cons %d pos %d \n", g_reads
[threadId
][i
].id
, g_reads
[threadId
][i
].cons
, tmpRead
[threadId
][j
].pos
);
796 if(g_reads
[threadId
][i
].cons
== 1 && tmpRead
[threadId
][j
].pos
> Read_length
)
799 //if(has_repeat_break == 1 &&
800 if(cons_check
== 1 && is_full_cons(g_reads
[threadId
][i
].id
, g_reads
[threadId
][i
].size
, g_reads
[threadId
][i
].strand
, g_reads
[threadId
][i
].level
, check_seq
, check_seq_len
, seed_len
, s
[1]) == 0)
802 tmpRead
[threadId
][j
].cons
= 0;
803 g_reads
[threadId
][i
].cons
= 0;
808 if(is_used_bwt_sparse(g_reads
[threadId
][i
].id
, g_reads
[threadId
][i
].size
) == 0)
810 endInf
[threadId
].ur
++;
811 tmpRead
[threadId
][j
].keep
= 1;
812 if(DEBUG
) fprintf(logs
, "read %d cons %d pos %d \n", g_reads
[threadId
][i
].id
, g_reads
[threadId
][i
].cons
, tmpRead
[threadId
][j
].pos
);
813 check_and_set_bwt_sparse_thread_and_used(g_reads
[threadId
][i
].id
, g_reads
[threadId
][i
].size
, threadId
);
814 // fprintf(stderr, "UR %d %lld %d %d\n", threadId, reads[i].id, is_used_bwt_sparse(reads[i].id, reads[i].size), get_threadId_bwt_sparse(reads[i].id, reads[i].size));
816 if(cons_check
== 0 && is_full_cons(g_reads
[threadId
][i
].id
, g_reads
[threadId
][i
].size
, g_reads
[threadId
][i
].strand
, g_reads
[threadId
][i
].level
, check_seq
, check_seq_len
, seed_len
, s
[1]) == 0)
818 g_reads
[threadId
][i
].cons
= 0;
819 tmpRead
[threadId
][j
].cons
= 0;
823 tid
= get_threadId_bwt_sparse(g_reads
[threadId
][i
].id
, g_reads
[threadId
][i
].size
);
824 if(tid
!= -1 && tid
!= threadId
)
826 tmpCount
[threadId
] = j
;
827 //fprintf(stderr, "CONFMeet:%d %d %d %d %d %d\n", reads[i].id, reads[i].size, threadId, j, tid, is_used_bwt_sparse(reads[i].id, reads[i].size));
828 free_thread_tmpRead(threadId
, 1);
831 if(DEBUG
) fprintf(logs
, " (%d,%d,%d)",g_reads
[threadId
][i
].id
, g_reads
[threadId
][i
].level
, tid
);
833 *keep_used_extension
= 1;
835 cut
= g_reads
[threadId
][i
].level
- 1;
838 tmpCount
[threadId
] = j
;
847 tmpCount
[threadId
] = j
;
851 if(four_iter
[threadId
] > 0)
853 four_iter
[threadId
] = 0; four_iter_checked_read
[threadId
]=0; four_iter_used_read
[threadId
] = 0;
859 if(DEBUG
) fprintf(stdout
, "Maybe Same seed problem. check read %d and use read %d\n", check_read
, use_read
);
863 four_iter
[threadId
] ++;
864 four_iter_checked_read
[threadId
] += check_read
;
865 if(four_iter
[threadId
] >= 4 && four_iter_checked_read
[threadId
] >= 10 && four_iter_used_read
[threadId
] == 0)
867 if(DEBUG
) fprintf(stdout
, "Maybe Same seed problem. iter check number %d check read %d and use read %d\n", four_iter
[threadId
], check_read
, use_read
);
870 if(four_iter
[threadId
] >= 10)
872 if(DEBUG
) fprintf(stdout
, "Maybe Same seed problem. iter check number %d check read %d\n", four_iter
[threadId
], four_iter_checked_read
[threadId
], use_read
);
878 //fit for the cut level.
884 for(j
=s
[0]-1; j
>=0; j
--)
886 if(g_nodes
[threadId
][j
].level
<= cut
)
888 //fprintf(stderr, "Cut: %d: %d, level%d, len %d\n", j, cut, nodes[j].level, nodes[j].len);
890 int left
= int(g_nodes
[threadId
][j
].level
) - cut
;
893 if(left
>= g_nodes
[threadId
][j
].len
)
894 left
= g_nodes
[threadId
][j
].len
;
895 g_nodes
[threadId
][j
].len
-= left
;
896 g_nodes
[threadId
][j
].seq
[ g_nodes
[threadId
][j
].len
]='\0';
897 //fprintf(stderr, "[%d;%d]",j,left);
904 int is_quick_decrease(int d1
, int d2
)
911 int has_branches(Lev
&lev
)
913 return lev
.c
[0] + lev
.c
[1] + lev
.c
[2] + lev
.c
[3] > lev
.depth
;
917 int get_sub_node(Node
* nodes
, int level
, int p_node
, char base
, int num
)
919 int i
, max_i
=-1, max
=0, high_num
=0;
921 for(i
=p_node
+1; i
< num
; i
++)
923 if(nodes
[i
].level
< level
-1)
925 if(nodes
[i
].level
- nodes
[i
].len
== level
-1 && nodes
[i
].seq
[0] != base
)
927 if(nodes
[i
].depth
> Error_depth
)
929 if(nodes
[i
].depth
> max
)
931 max
= nodes
[i
].depth
;
943 void get_sub_seq(Node
* nodes
, Lev
* levels
, int p
, int num
, char* subseq
)
945 int type
[num
], max_depth
[num
], max_i
[num
], max_same
[num
], type_count
[num
], nodes_count
[num
];
947 for(i
=0; i
< num
; i
++)
958 for(i
=p
+1; i
< num
; i
++)
960 if(type
[nodes
[i
].parent
] == 1)
963 type_count
[nodes
[i
].parent
]++;
964 nodes_count
[nodes
[i
].parent
] += nodes
[i
].depth
;
967 (nodes
[i
].depth
> max_depth
[nodes
[i
].parent
])
968 || (nodes
[i
].depth
== max_depth
[nodes
[i
].parent
] && max_same
[nodes
[i
].parent
] == 0 && nodes
[i
].seq
[0] == dnaChar
[levels
[nodes
[i
].level
+1 - nodes
[i
].len
].base
])
969 || (nodes
[i
].depth
== max_depth
[nodes
[i
].parent
] && max_same
[nodes
[i
].parent
] == 0 && nodes
[i
].len
> nodes
[ max_i
[nodes
[i
].parent
] ].len
)
973 max_depth
[nodes
[i
].parent
] = nodes
[i
].depth
;
974 max_i
[nodes
[i
].parent
] = i
;
975 if(nodes
[i
].seq
[0] == dnaChar
[levels
[nodes
[i
].level
+1 - nodes
[i
].len
].base
])
976 max_same
[nodes
[i
].parent
] = 1;
982 for(i
=p
; i
< num
; i
++)
984 if(i
> p
&& type
[i
] == 1 && (type
[nodes
[i
].parent
] == 0 || max_i
[nodes
[i
].parent
] != i
))
989 if(i
>p
&& type_count
[i
] > 0 && type_count
[i
] == nodes_count
[i
])
991 //fprintf(stdout, "i %d: type count %d, node count %d\n", i, type_count[i], nodes_count[i]);
994 //fprintf(stdout, ">%d,%d", i, nodes[i].len);
995 for(k
=0; k
< nodes
[i
].len
; k
++, j
++)
996 subseq
[j
] = nodes
[i
].seq
[k
];
997 if(nodes
[i
].conf
== 1)
1000 //fprintf(stderr, "\n");
1005 int get_first_branch(Node
* nodes
, int num
, int now_branch_node
, int current_node
, char* s1
)
1007 int flag
[num
], type
[num
];
1009 flag
[current_node
]=1;
1010 for(i
=0; i
< num
; i
++)
1012 flag
[i
]=0; type
[i
]=0;
1014 int node
=current_node
;
1015 while(!(node
== 0 && nodes
[node
].parent
==0) )
1017 node
= nodes
[node
].parent
;
1021 node
= now_branch_node
;
1022 while(flag
[node
] == 0)
1025 if(flag
[nodes
[node
].parent
] == 1)
1027 node
= nodes
[node
].parent
;
1030 for(i
=node
; i
< num
; i
++)
1034 for(k
=0; k
< nodes
[i
].len
; k
++, j
++)
1036 s1
[j
] = nodes
[i
].seq
[k
];
1043 int get_branch_node(FILE* logs
, Node
* nodes
, Lev
* levs
, int p
, int b
, int num
)
1045 int p_node
= nodes
[levs
[p
].node
].parent
;
1048 if(p_node
!= levs
[p
-1].node
)
1052 char base
= dnaChar
[b
];
1053 for(i
=p_node
+1; i
< num
; i
++)
1055 if(nodes
[i
].level
< p
-1 || nodes
[i
].parent
!= p_node
)
1057 if(nodes
[i
].level
- nodes
[i
].len
+ 1 == p
&& nodes
[i
].seq
[0] == base
)
1066 int get_sub_node_id2(FILE* logs
, Node
* nodes
, Lev
* levs
, int p
, int b
, int* s
, char* s1
)
1068 int p_node
= nodes
[levs
[p
].node
].parent
;
1069 int i
, max_i
=-1, max
=0, high_num
=0;
1070 char base
= dnaChar
[b
];//dnaChar[levs[p].base];
1072 for(i
=1; i
< s
[0]; i
++)
1074 //fprintf(stderr, "%d %d %d %d %d\n", i, p, nodes[i].level, nodes[i].parent, nodes[nodes[i].parent].level);
1075 if(nodes
[i
].level
< p
|| i
== levs
[p
].node
|| nodes
[nodes
[i
].parent
].level
> p
- 1)
1077 //fprintf(stderr, "%d %d %d %c \n", i, p, nodes[nodes[i].parent].level, nodes[i].seq[p - nodes[nodes[i].parent].level -1]);
1078 if(nodes
[i
].seq
[p
- nodes
[nodes
[i
].parent
].level
-1] == base
)
1080 if(nodes
[i
].depth
> Error_depth
)
1082 if(nodes
[i
].depth
> max
)
1084 max
= nodes
[i
].depth
;
1089 int n_node
= max_i
; //now branch node.
1093 fprintf(stderr
, "Subnode: %d, %d, %c\n", p_node
, p
, base
);
1097 int c_node
= levs
[p
].node
; //current node.
1098 int fb_node
= get_first_branch(nodes
, s
[0], n_node
, c_node
, s1
); //first branch node.
1100 //fprintf(logs, "nobranch %d, nonode %d, firstbranch %d\n", n_node, c_node, fb_node);
1104 int get_sub_node_id(Node
* nodes
, Lev
* levs
, int p
, int* s
)
1106 int p_node
= levs
[p
-1].node
;
1107 if(nodes
[p_node
].level
!= p
-1)
1110 int node
= get_sub_node(nodes
, p
, p_node
, dnaChar
[levs
[p
].base
], s
[0]);
1116 int get_pair_seq(FILE* logs
, Node
* nodes
, Lev
* levs
, int p
, int* s
, char* s1
, char* s2
, int sub_node
)
1120 get_sub_seq(nodes
, levs
, sub_node
, s
[0], s1
);
1122 int i
, j
, dif
=0, node_id
, k
;
1124 // fprintf(logs, "s1 %s\n", s1);
1125 for(i
=p
, j
=0; i
< s
[1] && j
<strlen(s1
); i
++, j
++)
1127 if( levs
[i
].base
== 4)
1129 if(levs
[i
].node
!= levs
[i
-1].node
&& (nodes
[levs
[i
].node
].parent
!= levs
[i
-1].node
|| (nodes
[levs
[i
-1].node
].conf
== 1 && i
> p
)))
1133 if(i
<= nodes
[levs
[i
-1].node
].level
)
1135 node_id
= levs
[i
-1].node
;
1137 for(k
= i
; k
<= nodes
[node_id
].level
&& j
< strlen(s1
); k
++, i
++, j
++)
1139 s2
[i
-p
] = nodes
[node_id
].seq
[nodes
[node_id
].len
- (nodes
[node_id
].level
-k
)-1];
1140 if(s2
[i
-p
] != s1
[j
])
1146 s2
[i
-p
] = dnaChar
[ levs
[i
].base
];
1147 if(s2
[i
-p
] != s1
[j
])
1152 //fprintf(logs, "%s, %s, %d, %d\n", s1, s2, p, sub_node);
1158 int has_branches2(int level
, Node
* nodes
, int nnum
, int node
)
1161 for(i
=0; i
< nnum
; i
++)
1165 if(nodes
[i
].level
< level
)
1167 if(nodes
[i
].level
- nodes
[i
].len
>= level
)
1169 if(nodes
[i
].level
>= level
&& nodes
[i
].level
- nodes
[i
].len
< level
&& nodes
[i
].depth
> 1)
1170 return nodes
[i
].level
- nodes
[i
].len
;
1175 int quick_decrease(Node
* nodes
, Lev
* levs
, int p
, int seed_len
)
1177 double base_depth
= double(nodes
[0].depth
) / double(Read_length
- seed_len
+ 1 ) * Read_length
;
1178 double lev_dec
= base_depth
/ double(Read_length
);
1179 double node_dec
= double(nodes
[levs
[p
].node
].len
- (nodes
[levs
[p
].node
].level
- p
)) * lev_dec
;
1180 int real_lev_dec
= levs
[p
-1].depth
- levs
[p
].depth
;
1181 int real_node_dec
= nodes
[levs
[p
].node
].depth
- levs
[p
].depth
;
1183 //fprintf(logs, " %.2f:[%.2f,%d;%.2f,%d]\n", base_depth, lev_dec, real_lev_dec, node_dec, real_node_dec);
1184 //decreasing too fast.
1185 if(real_lev_dec
- lev_dec
> Error_depth
*2 )
1187 if(real_node_dec
- node_dec
> Error_depth
*2 )// || (levs[p].depth <= 2* Error_depth && real_node_dec - node_dec > real_node_dec/2))
1190 //extending single node too long.
1191 if(p
+ seed_len
>= depth_9_length
&& levs
[p
].depth
< int(nodes
[levs
[p
].node
].depth
/3))
1193 if(p
+ seed_len
>= depth_6_length
&& levs
[p
].depth
<= 2* Error_depth
&& levs
[p
].depth
< int(nodes
[levs
[p
].node
].depth
/2))
1199 int is_short_cons(int len
, int seed_len
)
1201 return (len
< seed_len
/2 && len
< MINOVERLAP
) ? 1 : 0;
1204 int get_consist_seq(int ii
, int kk
, char* s1
, char* s2
, char* seq
, int seed_len
)
1208 for(j
=0; i
>=1 && k
>=1; i
--, j
++, k
--)
1212 if(is_short_cons(j
, seed_len
) == 1)
1225 int is_hete_bubble(char* s1
, char* s2
, int now_base_depth
, int diff
, int seed_len
, int threadId
)
1227 int nlen
= strlen(s1
);
1230 int i
=nlen
-1, j
, k
=i
;
1231 j
= get_consist_seq(i
, k
, s1
, s2
, seq
, seed_len
);
1233 if(is_short_cons(j
, seed_len
) == 1)
1235 if(s1
[i
] == s2
[k
-1])
1237 j
= get_consist_seq(i
, k
, s1
, s2
, seq
, seed_len
);
1241 if(is_short_cons(j
, seed_len
) == 1 && s1
[i
-1] == s2
[k
])
1244 j
= get_consist_seq(i
, k
, s1
, s2
, seq
, seed_len
);
1249 if(is_short_cons(j
, seed_len
) == 1)
1251 return dual_bubble_check(seq
, now_base_depth
, threadId
);
1254 int solve_by_seed(FILE* logs
, char* cseq
, char* st
, char* s1
, char* s2
, int p
, double d1
, int d2
, double diff
, int seed_len
, int max_level
, int threadId
)
1257 int tlen
= Read_length
- 1 - nlen
;//strlen(s1);
1259 int i
,j
, slen
= strlen(s1
);
1260 for(i
=1; i
< slen
; i
++)
1264 if(g_levs
[threadId
][p
+i
-1].c
[ charMap
[s1
[i
]] ] < Error_depth
)
1271 if(i
== slen
|| i
+p
-1 >= max_level
)
1276 // fprintf(logs, "i %d, p %d, max_level %d, slen %d\n", i, p, max_level, slen);
1277 tlen
= Read_length
- nlen
- slen
;
1278 if(strlen(cseq
) < tlen
)
1279 tlen
= strlen(cseq
);
1281 char seq1
[tlen
+ nlen
+ slen
+1];
1282 char seq2
[tlen
+ nlen
+ slen
+1];
1284 for(i
=0, j
=strlen(cseq
)-tlen
; i
<tlen
; i
++, j
++)
1286 seq1
[i
] = complementMap
[ cseq
[j
] ];
1287 seq2
[i
] = complementMap
[ cseq
[j
] ];
1290 //st starting from 1.
1291 for(j
=1; j
< p
; j
++, i
++)
1293 seq1
[i
] = complementMap
[ st
[j
] ];
1294 seq2
[i
] = complementMap
[ st
[j
] ];
1297 for(j
=0; j
< slen
; j
++, i
++)
1299 seq1
[i
] = complementMap
[ s1
[j
] ];
1300 seq2
[i
] = complementMap
[ s2
[j
] ];
1302 seq1
[i
]='\0'; seq2
[i
]='\0';
1303 //fprintf(logs, "seq1: %s seq2: %s\n", seq1, seq2);
1304 int type
= dual_seq_check(logs
, seq1
, seq2
, nlen
+ seed_len
+ slen
, d1
, strlen(s1
), diff
, threadId
);
1309 int is_long_repeat_seed(int depth
, int len
);
1311 int high_confident_err(int seed_len
, int depth0
, int p
, int prev_depth
, int p_depth
, int c_depth
, int diff
)
1313 if(seed_len
< depth_half_length
&& seed_len
+ p
< depth_9_length
1314 && depth0
< Expect_depth
*(Read_length
-seed_len
+1)/Read_length
&& c_depth
== 1
1315 && diff
== 1 && prev_depth
- p_depth
< Error_depth
)
1320 int set_reads_cons(int *s
, int p
, int flag
, int threadId
)
1323 for(i
=0; i
< s
[2]; i
++)
1325 if(g_reads
[threadId
][i
].level
== p
&& g_reads
[threadId
][i
].node
== g_levs
[threadId
][p
-1].node
&& g_reads
[threadId
][i
].cons
!= flag
)
1327 //fprintf(stdout, "set cons %d\n", g_reads[threadId][i].id);
1328 g_reads
[threadId
][i
].cons
= flag
;
1333 int is_risk_tag(char* seed_seq
, int flag
)
1338 if(seed_seq
[0] == 'C' && seed_seq
[1] == 'G' && seed_seq
[2] == 'G')
1340 for(i
=0; i
< 10; i
++)
1342 if(seed_seq
[i
] == 'C')
1344 for(j
=i
+1; j
<12; j
++)
1346 if(seed_seq
[j
] != 'G')
1356 if(seed_seq
[0] == 'G' && seed_seq
[1] == 'C' && seed_seq
[2] == 'C')
1358 for(i
=0; i
< 10; i
++)
1360 if(seed_seq
[i
] == 'G')
1362 for(j
=i
+1; j
<12; j
++)
1364 if(seed_seq
[j
] != 'C')
1376 int is_strand_bias(ULL plus
, ULL minus
)
1378 if((plus
== 0 && minus
> 3* Consensus_depth
)|| (plus
> 0 && plus
< Error_depth
/2 && plus
* 3* Consensus_depth
< minus
))
1381 if((minus
== 0 && plus
> 3 * Consensus_depth
)|| (minus
> 0 && minus
< Error_depth
/2 && minus
*3* Consensus_depth
< plus
))
1386 int is_illumina_error(FILE* logs
, char* seed_seq
, Node
&n
, Node
&nc
, int now_level_depth
, int branch_depth
)
1388 int l
= strlen(seed_seq
);
1391 int risk_plus
= is_risk_tag(seed_seq
, 1);
1392 int risk_minus
= is_risk_tag(seed_seq
, 0);
1394 if(risk_plus
== 0 && risk_minus
== 0)
1397 ULL plus_depth
= get_strand_depth(n
.b
, 1);
1398 ULL minus_depth
= get_strand_depth(n
.b
, 0);
1400 ULL cons_plus_depth
= get_strand_depth(nc
.b
, 1);
1401 ULL cons_minus_depth
= get_strand_depth(nc
.b
, 0);
1403 int strand_bias
= is_strand_bias(plus_depth
, minus_depth
);
1404 int cons_strand_bias
= is_strand_bias(cons_plus_depth
, cons_minus_depth
);
1406 // fprintf(logs, "risk plus %d risk minus %d; strand bias %d cons strand bias %d, level depth %d branch depth %d; nc depth %d nc len %d.\n", risk_plus, risk_minus, strand_bias, cons_strand_bias, now_level_depth, branch_depth, nc.depth, nc.len);
1408 if(cons_strand_bias
== strand_bias
)
1411 int risk_conf
=0, risk_cons
=0;
1412 if((risk_plus
== 1 && strand_bias
== 2) || (risk_minus
== 1 && strand_bias
== 1))
1415 if((risk_plus
== 1 && cons_strand_bias
== 2) || (risk_minus
== 1 && cons_strand_bias
== 1))
1418 if(risk_conf
== 1 && risk_cons
== 1)
1420 if((plus_depth
+ minus_depth
) == 1 && branch_depth
> Error_depth
&& now_level_depth
- (cons_plus_depth
+ cons_minus_depth
) < Error_depth
/2)
1427 if(risk_cons
== 1 && now_level_depth
- (cons_plus_depth
+ cons_minus_depth
) < Error_depth
/2 && (now_level_depth
> Error_depth
|| nc
.len
== 1))
1433 int find_last_lev(FILE* logs
, int* s
, char* ctg_seq
, char* seed_seq
, int seed_len
, int ctg_len
, int initial_depth
, int* is_unique
, int* has_repeat_break
, int threadId
)
1435 int i
, j
, k
, is_depth
= 0, has_other
=0, has_other_depth
=0;
1438 int back_len
= 0, sub_node
=0, sub_node_level
=0;
1439 char s1
[Read_length
], s2
[Read_length
], st
[Read_length
];
1440 int is_tip
, is_solved
;
1441 int is_low_depth
= initial_depth
< (double(Read_length
-seed_len
+1)*Expect_depth
)/double(Read_length
)/2.5 ? 1 : 0;
1442 int is_high_depth
= initial_depth
> (double(Read_length
-seed_len
+1)*Expect_depth
)/double(Read_length
) ? 1 : 0;
1443 double current_base_depth
= double(initial_depth
* Read_length
)/double(Read_length
-seed_len
+1);
1444 if(current_base_depth
> Expect_depth
)
1445 current_base_depth
= Expect_depth
;
1447 for(i
=1; i
<s
[1]; i
++)
1450 double curr_depth
= double(current_base_depth
*(Read_length
-(seed_len
+i
)+1))/double(Read_length
);
1451 //checking no-branch issue.
1452 if(g_levs
[threadId
][i
].depth
< Error_depth
//low depth.
1453 || (g_levs
[threadId
][i
].nnum
<= 3 && 3 * g_levs
[threadId
][i
].depth
<= initial_depth
) //too many branches with high depth.
1456 (g_levs
[threadId
][i
].node
!= g_levs
[threadId
][i
-1].node
&& g_nodes
[threadId
][ g_levs
[threadId
][i
].node
].parent
!= g_levs
[threadId
][i
-1].node
) //path changed.
1457 || (g_levs
[threadId
][i
].nnum
== 1 && curr_depth
< 2 * Expect_depth
&& g_levs
[threadId
][i
].depth
< 2 * Error_depth
) //low depth region, base by base extension.
1458 || (g_levs
[threadId
][i
].nnum
<= 2 && //has confliction here.
1459 ( quick_decrease(g_nodes
[threadId
], g_levs
[threadId
], i
, seed_len
) == 1 //quick decrease of depth.
1460 || (g_levs
[threadId
][i
].depth
<= 3 * Error_depth
&& curr_depth
<= 3*Error_depth
) //depth too low to solve this branch easily.
1461 || (g_levs
[threadId
][i
].depth
<= 3 * Error_depth
&& 2 * g_levs
[threadId
][i
].depth
<= initial_depth
) //clear branch and low depth.
1462 || (is_low_depth
== 1 && 2 * g_levs
[threadId
][i
].depth
<= initial_depth
)
1473 int changed_consensus
= 0;
1474 //checking branching issue.
1477 if(g_levs
[threadId
][i
].c
[j
] == 0 || j
== g_levs
[threadId
][i
].base
)
1479 //having one branch.
1481 if(DEBUG
) fprintf(logs
, "seed_len %d, i %d, total len %d, depth %d, conf_depth %d, tobase %d, cur depth %.1f\n", seed_len
, i
, seed_len
+ i
, g_levs
[threadId
][i
].depth
, g_levs
[threadId
][i
].c
[j
], j
, curr_depth
);
1483 //long extension break, reduce mismatches.
1484 if( i
> 1 && (g_levs
[threadId
][i
].depth
+ g_levs
[threadId
][i
].c
[j
]) * 2 < initial_depth
&& g_levs
[threadId
][i
].depth
<= 3* Error_depth
)
1486 if(DEBUG
) fprintf(logs
, "Decrease to low.\n");
1490 //careful solving cases, increase length.
1491 if( is_low_depth
== 1 && i
> 1 && initial_depth
- g_levs
[threadId
][i
].depth
> g_levs
[threadId
][i
].depth
/2 && initial_depth
- g_levs
[threadId
][i
].depth
> 2*Error_depth
&& g_levs
[threadId
][i
].c
[j
] >= Error_depth
)
1493 if(DEBUG
) fprintf(logs
, "Branches in low coverage regions.\n");
1498 //having been solved.
1499 if(g_nodes
[threadId
][ g_levs
[threadId
][i
].node
].level
- g_nodes
[threadId
][ g_levs
[threadId
][i
].node
].len
+ 1 != i
)
1501 if(DEBUG
) fprintf(logs
, "Innode.\n");
1503 if(curr_depth
<= 2* Error_depth
|| g_levs
[threadId
][i
].depth
<= 2*Error_depth
)
1510 has_other
= 0; has_other_depth
=0;
1511 for(k
=j
+1; k
< 4; k
++)
1513 if(k
== g_levs
[threadId
][i
].base
)
1515 if(g_levs
[threadId
][i
].c
[k
] >= Error_depth
)
1517 has_other
= 1; has_other_depth
+= g_levs
[threadId
][i
].c
[k
];
1520 if(has_other
== 1 && has_other_depth
+ g_levs
[threadId
][i
].c
[j
] > g_levs
[threadId
][i
].depth
/2 && curr_depth
< Expect_depth
/5 && curr_depth
< current_base_depth
/3)
1522 if(DEBUG
) fprintf(logs
, "Too much confliction and too long extension.\n");
1525 //get current branch caused sub-node.
1527 sub_node
= get_branch_node(logs
, g_nodes
[threadId
], g_levs
[threadId
], i
, j
, s
[0]);
1530 //no new sub-node from the main branch.
1533 s1
[0]=dnaChar
[j
]; s1
[1]='\0';
1534 s2
[0]=dnaChar
[g_levs
[threadId
][i
].base
]; s2
[1]='\0';
1535 is_solved
= solve_by_seed(logs
, ctg_seq
, st
, s1
, s2
, i
, current_base_depth
, g_levs
[threadId
][i
].depth
, 0, seed_len
, g_nodes
[threadId
][g_levs
[threadId
][i
].node
].level
, threadId
);
1536 if(DEBUG
) fprintf(logs
, "No sub node.\n");
1538 if(is_solved
== -1 && has_other
== 1)
1540 changed_consensus
= 1;
1541 if(DEBUG
) fprintf(logs
, "ChangeCons3 at %d with num %d\n", i
, g_levs
[threadId
][i
].nnum
);
1542 set_reads_cons(s
, i
, 0, threadId
);
1543 g_levs
[threadId
][i
].base
= j
;
1544 g_levs
[threadId
][i
].depth
= g_levs
[threadId
][i
].c
[j
];
1545 g_levs
[threadId
][i
].node
= sub_node
;
1549 if(is_solved
!= 0 && is_solved
< 3)
1554 *has_repeat_break
= 1;
1555 if(is_solved
== 1 && i
> 1 && g_levs
[threadId
][i
].c
[j
] == 1 && g_levs
[threadId
][i
].depth
<= 3 * Consensus_depth
)
1559 if(DEBUG
) fprintf(logs
, "ChangeCons1 at %d with num %d\n", i
, g_levs
[threadId
][i
].nnum
);
1560 set_reads_cons(s
, i
, 0, threadId
);
1561 g_levs
[threadId
][i
].base
= j
;
1562 g_levs
[threadId
][i
].depth
= g_levs
[threadId
][i
].c
[j
];
1563 g_levs
[threadId
][i
].node
= sub_node
;
1564 //set_reads_cons(s, i, 1, threadId);
1575 if(i
== 1 && Expect_depth
> Error_depth
* 10 && g_levs
[threadId
][i
].depth
+ g_levs
[threadId
][i
].c
[j
] < curr_depth
* Upper_bound
)
1577 //if(is_high_depth_sequencing_error(logs, g_nodes[threadId], g_levs[threadId], i, s, sub_node) == 1)
1578 //print_base(logs, g_nodes[threadId][sub_node].b);
1579 //print_base(logs, g_nodes[threadId][g_levs[threadId][i].node].b);
1580 int i_err
= is_illumina_error(logs
, seed_seq
, g_nodes
[threadId
][sub_node
], g_nodes
[threadId
][g_levs
[threadId
][i
].node
], g_levs
[threadId
][i
].depth
, g_levs
[threadId
][i
].c
[j
]);
1583 if(DEBUG
) fprintf(logs
, "Illumina sequencing error: %s.\n", seed_seq
);
1587 if(i_err
== 2 && is_low_depth
== 0)
1589 if(DEBUG
) fprintf(logs
, "ChangeCons3 at %d with num %d seed %s.\n", i
, g_levs
[threadId
][i
].nnum
, seed_seq
);
1590 set_reads_cons(s
, i
, 0, threadId
);
1591 g_levs
[threadId
][i
].base
= j
;
1592 g_levs
[threadId
][i
].depth
= g_levs
[threadId
][i
].c
[j
];
1593 g_levs
[threadId
][i
].node
= sub_node
;
1598 //get the sequences of two branches, one is main, the other is the new branch.
1599 int diff
= get_pair_seq(logs
, g_nodes
[threadId
], g_levs
[threadId
], i
, s
, s1
, s2
, sub_node
);
1602 if(DEBUG
) fprintf(logs
, "treat as sequencing error.\n");
1606 double dif_ratio
= double(diff
)/double(strlen(s1
));
1607 if(DEBUG
) fprintf(logs
, "s1 %s, s2 %s diff %d len %d\n", s1
, s2
, diff
, strlen(s1
));
1610 if(dif_ratio
< 0.2 && high_confident_err(seed_len
, initial_depth
, i
, g_levs
[threadId
][i
-1].depth
, g_levs
[threadId
][i
].depth
, g_levs
[threadId
][i
].c
[j
], diff
) == 1)
1615 if(DEBUG
) fprintf(logs
, "lowconf\n");
1617 //double curr_depth = double(Expect_depth*(Read_length-(seed_len+i)+1))/double(Read_length);
1618 if(g_levs
[threadId
][i
].c
[j
] <= Error_depth
&& g_levs
[threadId
][i
].depth
> 2*Error_depth
&& curr_depth
> 2 * Error_depth
1619 && g_levs
[threadId
][i
].c
[j
] < g_levs
[threadId
][i
].depth
* 0.1
1620 && !((seed_len
> depth_half_length
&& (curr_depth
* 2 < g_levs
[threadId
][i
].depth
|| (dif_ratio
> 0.05))))// || (dif_ratio > 0.1 )))
1627 if(DEBUG
) fprintf(logs
, "NotLargeDif&LowBranch\n");
1629 //checking whether this is a tip or solved by increasing seed length.
1630 is_solved
= solve_by_seed(logs
, ctg_seq
, st
, s1
, s2
, i
, current_base_depth
, g_levs
[threadId
][i
].depth
, dif_ratio
, seed_len
, g_nodes
[threadId
][g_levs
[threadId
][i
].node
].level
, threadId
);
1632 if(is_solved
== -1 && has_other
== 1)
1634 changed_consensus
= 1;
1635 if(DEBUG
) fprintf(logs
, "ChangeCons4 at %d with nnum %d \n", i
, g_levs
[threadId
][i
].nnum
);
1636 set_reads_cons(s
, i
, 0, threadId
);
1637 g_levs
[threadId
][i
].base
= j
;
1638 g_levs
[threadId
][i
].depth
= g_levs
[threadId
][i
].c
[j
];
1639 g_levs
[threadId
][i
].node
= sub_node
;
1642 if(is_solved
!= 0 && is_solved
< 3 && !(is_solved
== -1 && has_other
== 1))
1648 *has_repeat_break
= 1;
1649 if(is_solved
== 1 && i
> 1 && g_levs
[threadId
][i
].c
[j
] == 1 && g_levs
[threadId
][i
].depth
<= 3 * Consensus_depth
)
1653 if(DEBUG
) fprintf(logs
, "ChangeCons2 at %d with nnum %d \n", i
, g_levs
[threadId
][i
].nnum
);
1654 set_reads_cons(s
, i
, 0, threadId
);
1655 g_levs
[threadId
][i
].base
= j
;
1656 g_levs
[threadId
][i
].depth
= g_levs
[threadId
][i
].c
[j
];
1657 g_levs
[threadId
][i
].node
= sub_node
;
1658 //set_reads_cons(s, i, 1, threadId);
1666 if(is_solved
== 5 && has_other
== 1)
1671 if(DEBUG
) fprintf(logs
, "Unsolved.\n");
1673 if(Solve_Hete
== 1 && curr_depth
> 2 * Error_depth
//&& is_low_depth == 0
1674 && (g_levs
[threadId
][i
].depth
+ g_levs
[threadId
][i
].c
[j
] <= Upper_bound
* curr_depth
|| strlen(s1
) > seed_len
* 2)
1675 && is_short_cons(strlen(s1
), seed_len
) == 0
1678 if(is_hete_bubble(s1
, s2
, current_base_depth
, diff
, seed_len
, threadId
) == 1)
1682 }else if(diff
== 1 && i
+ seed_len
<= MINOVERLAP
+ 1 && strlen(s1
) >= MINOVERLAP
&& g_levs
[threadId
][i
].depth
+ g_levs
[threadId
][i
].c
[j
] <= Upper_bound
* curr_depth
)
1689 if(DEBUG
) fprintf(logs
, "NotHete\n");
1690 //masked at Sep 18.2014 for YH assembly.
1692 && (is_solved
>= 3 && is_solved
<5)
1693 && ((diff
< 3 && dif_ratio
< 0.2) || (diff
< 5 && dif_ratio
<= 0.05))
1694 && g_levs
[threadId
][i
].depth
+ g_levs
[threadId
][i
].c
[j
] <= curr_depth
1697 if(is_solved
== 4 && (g_levs
[threadId
][i
].c
[j
] == g_levs
[threadId
][i
].depth
|| g_levs
[threadId
][i
].c
[j
] > Error_depth
))
1699 if(DEBUG
) fprintf(logs
, "ChangeCons3 at %d with nnum %d \n", i
, g_levs
[threadId
][i
].nnum
);
1700 set_reads_cons(s
, i
, 0, threadId
);
1701 g_levs
[threadId
][i
].base
= j
;
1702 g_levs
[threadId
][i
].depth
= g_levs
[threadId
][i
].c
[j
];
1703 g_levs
[threadId
][i
].node
= sub_node
;
1710 if(is_solved
!= 3 && is_solved
!= 4)
1711 *has_repeat_break
= 1;
1713 if(DEBUG
) fprintf(logs
, "NotHete2\n");
1714 //solving repeat by Pair end information.
1715 if(Solve_Conf
== 1 && i
== 1 )//&& !(g_levs[threadId][i].c[j] < Error_depth && g_levs[threadId][i].depth > 2 * Error_depth && g_levs[threadId][i].depth + g_levs[threadId][i].c[j] < 4 * Error_depth && diff >= 5))
1717 is_solved
= solve_conf(logs
, g_nodes
[threadId
], g_levs
[threadId
], g_reads
[threadId
], s
, seed_len
, ctg_len
, threadId
, diff
);
1720 if(DEBUG
) fprintf(logs
, "conf_solved.\n");
1723 //return find_last_lev(logs, s, ctg_seq, seed_seq, seed_len, ctg_len, initial_depth, is_unique, threadId);
1727 if(DEBUG
) fprintf(logs
, "CONF fail.\n");
1730 //fprintf(logs, " %d,%d,%d",seed_len, g_levs[threadId][i].depth, g_levs[threadId][i].c[j]);
1737 //checking have branch or not.
1738 if(solved
[i
] == 0)// || seed_len + i >= depth_6_length || levs[i].depth <= 2 * Error_depth)
1742 st
[i
] = dnaChar
[g_levs
[threadId
][i
].base
];
1748 int iter_consensus(FILE* logs
, int* s
, int *len
, char* ctg_seq
, char* seed_seq
, int seed_len
, int ctg_len
, int keep_used
, int initial_depth
, int iter
, int* is_unique
, int threadId
)
1750 int i
, j
=0, k
, keep_used_extension
=0, has_used
= 0, has_repeat_break
= 0;
1752 i
= find_last_lev(logs
, s
, ctg_seq
, seed_seq
, seed_len
, ctg_len
, initial_depth
, is_unique
, &has_repeat_break
, threadId
);
1753 if( i
<= 0) //lowdepth2.
1755 s
[0]=0; s
[1]=0; s
[2]=0; *len
= 0;
1760 //s is updated here. Used reads.
1761 i
= mask_last_reads(logs
, i
, s
, ctg_seq
, keep_used
, &keep_used_extension
, &has_used
, ctg_len
, iter
, *is_unique
, threadId
, has_repeat_break
, seed_len
);
1769 if(i
== 0 && keep_used_extension
== 0)
1774 i
= g_levs
[threadId
][i
].node
;
1779 for(k
=g_nodes
[threadId
][i
].len
-1; k
>=0; k
--, j
++)
1780 g_seq
[threadId
][j
] = g_nodes
[threadId
][i
].seq
[k
];
1782 if(i
==0 && i
== g_nodes
[threadId
][i
].parent
)
1784 i
= g_nodes
[threadId
][i
].parent
;
1790 fprintf(stderr
, "[%d,%d]",i
, g_nodes
[threadId
][i
].len
);
1793 g_seq
[threadId
][j
]='\0';
1795 if(keep_used_extension
== 1)
1804 Update reads used in this contigs.
1807 ULL
get_pair_id(ULL id
)
1815 int is_unique(int depth
, int len
)
1817 return depth
> double(Read_length
-len
+1)/double(Read_length
)*double(Expect_depth
)*Upper_bound
? 0 : 1;
1820 int is_repeat(int depth
, int len
)
1822 return depth
> (3- Upper_bound
)*double(Read_length
-len
+1)/double(Read_length
)*double(Expect_depth
) ? 1 : 0;
1825 int is_long_repeat_seed(int depth
, int len
)
1827 return (len
>= depth_9_length
&& depth
>= 2 * double(Read_length
-len
+1)/double(Read_length
)*double(Expect_depth
));
1830 void free_thread_tmpRead(int threadId
, int flag
)
1835 for(i
=0; i
< tmpCount
[threadId
]; i
++)
1837 if(tmpRead
[threadId
][i
].keep
== 0)
1839 clean_use_bwt_sparse(tmpRead
[threadId
][i
].id
, tmpRead
[threadId
][i
].size
);
1840 endInf
[threadId
].ur
--;
1841 tmpRead
[threadId
][i
].keep
= 0;
1844 for(i
=0; i
< tmpCount
[threadId
]; i
++)
1846 if(tmpRead
[threadId
][i
].keep
== 0)
1848 int flag
= clean_current_bwt_sparse(tmpRead
[threadId
][i
].id
, tmpRead
[threadId
][i
].size
);
1851 tmpCount
[threadId
] = 0;
1855 Update contig and get the new iteration seed using the extended sequence.
1858 void str_copy(char* seq
, char* from
, int start
, int len
)
1861 for(i
=start
,j
=0,l
=0; i
< strlen(from
) && l
<len
; i
++,j
++,l
++)
1868 void reverse_com(char* from
, char* to
, int len
)
1871 for(i
=0; i
<len
; i
++)
1873 to
[i
] = complementMap
[ from
[len
-i
-1]];
1878 void clean_trimed_reads(int ctg_len
, int read_num
, int threadId
)
1881 for(i
= tmpCount
[threadId
] -1; i
>=0 && j
< read_num
; i
--, j
++)
1883 if(tmpRead
[threadId
][i
].pos
< ctg_len
)
1885 if(tmpRead
[threadId
][i
].keep
== 0)
1888 clean_use_bwt_sparse(tmpRead
[threadId
][i
].id
, tmpRead
[threadId
][i
].size
);
1889 endInf
[threadId
].ur
--;
1890 tmpRead
[threadId
][i
].keep
= 0;
1895 int update_ctg(int len
, int cur_iter
, int read_num
, int threadId
)
1897 //update the sequence of ctg.
1899 if(len
+ ctgs
[threadId
][ctg_num
[threadId
]].len
>= MaxCtgLen
)
1901 fprintf(stderr
, "Too long contigs with length %d, %d\n", ctgs
[threadId
][ctg_num
[threadId
]].len
, len
);
1904 int i
=ctgs
[threadId
][ctg_num
[threadId
]].len
, j
=0;
1906 for(; j
< len
; i
++, j
++)
1908 ctgs
[threadId
][ctg_num
[threadId
]].seq
[i
] = g_seq
[threadId
][len
- 1 - j
];
1910 ctgs
[threadId
][ctg_num
[threadId
]].seq
[i
]='\0';
1911 ctgs
[threadId
][ctg_num
[threadId
]].len
= i
;
1916 fprintf(stderr
, "Please check: %d\n", cur_iter
);
1919 char tmpseq
[len
+ g_iters
[threadId
][cur_iter
-1].slen
];
1920 char rctmpseq
[len
+ g_iters
[threadId
][cur_iter
-1].slen
];
1923 for(j
=0; j
<len
; i
++, j
++)
1924 tmpseq
[i
] = g_seq
[threadId
][j
];
1926 for(j
=0; j
< g_iters
[threadId
][cur_iter
-1].slen
; i
++, j
++)
1927 tmpseq
[i
] = g_iters
[threadId
][cur_iter
-1].seq
[j
];
1931 reverse_com(tmpseq
, rctmpseq
, tmplen
);
1932 int end
= tmplen
- 1;
1933 for(i
=0; i
< bwtCount
; i
++)
1935 tmpbase
[threadId
][i
].saL
= 0; tmpbase
[threadId
][i
].saR
= 0;
1936 tmpbase
[threadId
][i
].plus
= 0; tmpbase
[threadId
][i
].minus
= 0;
1937 tmpbase
[threadId
][i
].saLL
= 0; tmpbase
[threadId
][i
].saRR
= 0;
1939 unsigned long long depth
=0, tdepth
;
1941 int prev_repeat
= is_repeat(g_iters
[threadId
][cur_iter
-1].depth
, g_iters
[threadId
][cur_iter
-1].slen
);
1942 int trim_len
= 0, rep_start
= -1;
1944 if(DEBUG
) fprintf(stdout
, "[iter] %d\n", cur_iter
-1);
1945 int start
= extend_backwards_rep(rctmpseq
, 0, end
, tmpbase
[threadId
], &(depth
), &rep_start
);
1946 if(DEBUG
) fprintf(stdout
, "[iter] %d,d=%d,l=%d, r=%d, start=%d, depth=%d\n", cur_iter
-1, g_iters
[threadId
][cur_iter
-1].depth
, g_iters
[threadId
][cur_iter
-1].slen
, prev_repeat
, start
, depth
);
1948 if((start
< 0 && depth
> Error_depth
) || prev_repeat
==1)
1953 end
= tmplen
- rep_start
-1;
1955 end
= strlen(tmpseq
) - 1;
1957 if(end
>= strlen(tmpseq
))
1958 fprintf(stdout
, "T1: %s, %d; new seq %s; iter seq %s; newlen %d, seedlen %d, end %d; iter %d\n", tmpseq
, strlen(tmpseq
), g_seq
[threadId
], g_iters
[threadId
][cur_iter
-1].seq
, len
, g_iters
[threadId
][cur_iter
-1].slen
, end
, cur_iter
-1);
1959 set_SAranges(tmpseq
, 0, end
, tmpbase
[threadId
], &(depth
));
1962 if(prev_repeat
== 1)
1965 fprintf(stdout
, "Iter %d PrevRepeat: %d, %d; ", cur_iter
, g_iters
[threadId
][cur_iter
-1].depth
, g_iters
[threadId
][cur_iter
-1].slen
);
1966 while(start
>= 0 && end
> g_iters
[threadId
][cur_iter
-1].slen
)
1969 start
= extend_backwards(rctmpseq
, 0, end
, tmpbase
[threadId
], &(depth
), &break_start
);
1971 //fprintf(stdout, "Findstart %d ", start);
1975 trim_len
= tmplen
- 1 - end
;
1976 end
= MINOVERLAP
- 1 + trim_len
;
1977 set_SAranges(tmpseq
, trim_len
, end
, tmpbase
[threadId
], &(depth
));
1978 while(is_repeat(depth
, MINOVERLAP
) == 0 && end
< tmplen
-1 && trim_len
< len
)
1982 set_SAranges(tmpseq
, trim_len
, end
, tmpbase
[threadId
], &(depth
));
1984 if(end
== tmplen
-1 || trim_len
== len
)
1986 fprintf(stdout
, "all trimed0.\n");
1987 ctgs
[threadId
][ctg_num
[threadId
]].len
= oldctglen
;
1988 ctgs
[threadId
][ctg_num
[threadId
]].seq
[ ctgs
[threadId
][ctg_num
[threadId
]].len
] = '\0';
1991 fprintf(stdout
, "FinalTrim%d and %d\n", trim_len
, end
);
1992 ctgs
[threadId
][ctg_num
[threadId
]].len
-= trim_len
;
1993 ctgs
[threadId
][ctg_num
[threadId
]].seq
[ ctgs
[threadId
][ctg_num
[threadId
]].len
] = '\0';
1995 fprintf(stdout
, "all trimed1.\n");
1996 ctgs
[threadId
][ctg_num
[threadId
]].len
= oldctglen
;
1997 ctgs
[threadId
][ctg_num
[threadId
]].seq
[ ctgs
[threadId
][ctg_num
[threadId
]].len
] = '\0';
2001 //find a unique seed for the new iteration.
2002 if(depth
<= Error_depth
&& g_iters
[threadId
][cur_iter
-1].depth
> Error_depth
+ 1)
2004 trim_len
= 1; int start_trim
=0, now_rep_start
= -1;
2005 if(DEBUG
) fprintf(stdout
, "Iter %d PrevRepeat: %d, %d; ", cur_iter
, g_iters
[threadId
][cur_iter
-1].depth
, g_iters
[threadId
][cur_iter
-1].slen
);
2006 while(depth
<= Error_depth
&& trim_len
< len
)
2009 if(DEBUG
) fprintf(stdout
, "Ending at %d\n", end
-trim_len
);
2010 start_trim
= extend_backwards_rep(rctmpseq
, 0, end
-trim_len
, tmpbase
[threadId
], &(depth
), &now_rep_start
);
2011 if(DEBUG
) fprintf(stdout
, "End got depth %d, start %d\n", depth
, start_trim
);
2015 if(trim_len
<= len
&& depth
> Error_depth
)
2018 if(DEBUG
) fprintf(stdout
, "LowTrim%d \n", trim_len
);
2019 ctgs
[threadId
][ctg_num
[threadId
]].len
-= trim_len
;
2020 ctgs
[threadId
][ctg_num
[threadId
]].seq
[ ctgs
[threadId
][ctg_num
[threadId
]].len
] = '\0';
2022 end
= tmplen
-1 - start_trim
;
2024 if(now_rep_start
!= -1)
2025 end
= tmplen
-1 - now_rep_start
;
2027 end
= strlen(tmpseq
) -1;
2029 clean_trimed_reads(ctgs
[threadId
][ctg_num
[threadId
]].len
, read_num
, threadId
);
2031 if(DEBUG
) fprintf(stdout
, "Lowtrim fail with trim_len %d and start_trim %d, final depth %lld\n", trim_len
, start_trim
, depth
);
2033 trim_len
= 0; end
= tmplen
- 1 - start
;
2037 end
= tmplen
-1 - rep_start
;
2039 end
= strlen(tmpseq
) -1;
2043 trim_len
= 0; end
= tmplen
- 1 - start
;
2048 end
= tmplen
- rep_start
-1;
2049 // fprintf(stdout, "New seed length2: %d for iter %d\n", end+1, cur_iter);
2051 end
= strlen(tmpseq
) -1;
2054 if(end
>= strlen(tmpseq
))
2055 fprintf(stdout
, "T2: %s, %d %d; iter %d\n", tmpseq
, strlen(tmpseq
), end
, cur_iter
-1);
2056 set_SAranges(tmpseq
, trim_len
, end
, tmpbase
[threadId
], &(depth
));
2060 set_base(g_iters
[threadId
][cur_iter
].b
, tmpbase
[threadId
]);
2062 if(depth
> MAXDEPTH
)
2063 g_iters
[threadId
][cur_iter
].depth
= MAXDEPTH
;
2065 g_iters
[threadId
][cur_iter
].depth
= depth
;
2067 g_iters
[threadId
][cur_iter
].slen
= end
- trim_len
+ 1;
2068 str_copy(g_iters
[threadId
][cur_iter
].seq
, tmpseq
, trim_len
, g_iters
[threadId
][cur_iter
].slen
);
2070 g_iters
[threadId
][cur_iter
].rnum
= 0;
2071 g_iters
[threadId
][cur_iter
].end
= 0;
2072 g_iters
[threadId
][cur_iter
].pe
= 0;
2073 g_iters
[threadId
][cur_iter
].len
= 0;
2079 Using PE information to improve iteration.
2080 iter_num >=0, read_num >=0.
2083 Fill in ctg, tmpRead, iters.
2085 void print_inf(FILE* fp
, Node
* nodes
, Lev
* levs
, ReadId
* reads
, int* s
, int cur_len
)
2087 fprintf(fp
, "current contig length: %d\n", cur_len
);
2088 fprintf(fp
, "Node:\n");
2090 for(i
=0; i
< s
[0]; i
++)
2091 fprintf(fp
, "%d\t%s,%d,%d,%d,%d,%d\n",i
, nodes
[i
].seq
, nodes
[i
].len
, nodes
[i
].level
, nodes
[i
].depth
, nodes
[i
].parent
, nodes
[i
].is_end
);
2093 fprintf(fp
, "Read:\n");
2094 for(i
=0; i
< s
[2]; i
++)
2095 fprintf(fp
, "%d\t%d,%d,%d,%d,%d\n",i
, reads
[i
].id
, reads
[i
].node
, reads
[i
].level
, reads
[i
].cons
, reads
[i
].strand
);
2097 fprintf(fp
, "Lev:\n");
2098 for(i
=1; i
< s
[1]; i
++)
2099 fprintf(fp
, "%d\t%d,%d,%d,%d\n",i
, levs
[i
].base
, levs
[i
].depth
, levs
[i
].node
, levs
[i
].nnum
);
2103 void mask_branch_reads( Node
* nodes
, Lev
* levs
, ReadId
* reads
, int* s
)
2106 int node_type
[s
[0]];
2107 for(i
=0; i
< s
[0]; i
++)
2110 for(i
=1; i
< s
[1]; i
++)
2112 node_type
[ levs
[i
].node
]=1;
2115 for(i
=0; i
< s
[2]; i
++)
2117 if(node_type
[ reads
[i
].node
] == 0)
2119 reads
[i
].node
=0; reads
[i
].level
=0;
2124 int backward_extending_bwtpe(FILE* logs
, int is_first
, int threadId
)
2126 int cur_iter
= iterCount
[threadId
] + 1;
2127 //fprintf(stderr, "thread%d and cur %d\n", threadId, *iter_num);
2128 if(cur_iter
> MaxIterNum
-2)
2130 if(DEBUG
) fprintf(logs
, " TooManyIteration");
2131 g_iters
[threadId
][cur_iter
-1].end
= 4; iterCount
[threadId
] = cur_iter
;
2135 if(g_iters
[threadId
][cur_iter
-1].depth
< Error_depth
)
2137 if(DEBUG
) fprintf(logs
, " LowDepth1");
2138 g_iters
[threadId
][cur_iter
-1].end
= 0; iterCount
[threadId
] = cur_iter
;
2142 if(g_iters
[threadId
][cur_iter
-1].depth
>= MAXDEPTH
)
2144 if(DEBUG
) fprintf(logs
, " HighDepth%d,%d", g_iters
[threadId
][cur_iter
-1].depth
, g_iters
[threadId
][cur_iter
-1].slen
);
2145 g_iters
[threadId
][cur_iter
-1].end
= 2; iterCount
[threadId
] = cur_iter
;
2149 if((cur_iter
> 1 && strcmp(g_iters
[threadId
][cur_iter
-1].seq
, g_iters
[threadId
][cur_iter
-2].seq
) == 0)
2150 || (cur_iter
>2 && strcmp(g_iters
[threadId
][cur_iter
-1].seq
, g_iters
[threadId
][cur_iter
-3].seq
) == 0)
2151 || (cur_iter
>3 && strcmp(g_iters
[threadId
][cur_iter
-1].seq
, g_iters
[threadId
][cur_iter
-4].seq
) == 0)
2154 if(DEBUG
) fprintf(logs
, " SameSeed: now seed %s, old1 %s\n", g_iters
[threadId
][cur_iter
-1].seq
, g_iters
[threadId
][cur_iter
-2].seq
);
2155 g_iters
[threadId
][cur_iter
-1].end
= 5; iterCount
[threadId
] = cur_iter
;
2161 double depth1
= double(Read_length
*g_iters
[threadId
][cur_iter
-1].depth
)/double(Read_length
-g_iters
[threadId
][cur_iter
-1].slen
+1);
2162 double depth2
= double(Read_length
*g_iters
[threadId
][cur_iter
-2].depth
)/double(Read_length
-g_iters
[threadId
][cur_iter
-2].slen
+1);
2163 if(depth1
* 6 < depth2
&& depth1
< Expect_depth
/3)
2165 if(DEBUG
) fprintf(logs
, " LargeDecrease%.1f,%.1f",depth1
, depth2
);
2166 g_iters
[threadId
][cur_iter
-1].end
= 2; iterCount
[threadId
] = cur_iter
;
2170 //define and initialize extending structures.
2172 int max_node_num
= g_iters
[threadId
][ iterCount
[threadId
] ].depth
* Read_length
;
2178 for(i
=0; i
< max_node_num
; i
++)
2180 g_nodes
[threadId
][i
].seq
[0] = '\0';
2181 g_nodes
[threadId
][i
].len
= 0;
2182 g_nodes
[threadId
][i
].level
= 0;
2183 g_nodes
[threadId
][i
].parent
= 0;
2184 g_nodes
[threadId
][i
].is_end
= 0;
2185 g_nodes
[threadId
][i
].conf
= 0;
2186 set_blank_base( g_nodes
[threadId
][i
].b
);
2188 set_base(g_nodes
[threadId
][0].b
, g_iters
[threadId
][ cur_iter
- 1].b
);
2190 g_nodes
[threadId
][0].depth
= g_iters
[threadId
][ cur_iter
- 1 ].depth
;
2191 g_seq
[threadId
][0]='\0';
2194 for(i
=0; i
< Read_length
; i
++)
2196 g_levs
[threadId
][i
].nnum
= 0;
2198 g_levs
[threadId
][i
].c
[j
]=0;
2200 g_levs
[threadId
][0].node
= 0;
2203 //fprintf(stderr, "Iter %d is fisrt %d with given depth %d from depth %d and calculated %d\n", cur_iter - 1, is_first, get_real_depth(g_nodes[threadId][0].b), get_real_depth(g_iters[threadId][cur_iter - 1].b), g_iters[threadId][cur_iter - 1].depth);
2205 iterCount
[threadId
] = cur_iter
;
2206 int is_continue
= 1;
2207 start_time
= clock();
2208 while(is_continue
== 1 && s
[1] + g_iters
[threadId
][cur_iter
-1].slen
<= Read_length
+ 1)
2210 is_continue
= backward_search(logs
, s
, threadId
);
2215 fprintf(logs
, "thread=%d;%d,%d,%d,%d,%d,%d\n",threadId
, cur_iter
-1, g_iters
[threadId
][cur_iter
-1].slen
, g_iters
[threadId
][cur_iter
-1].depth
, s
[0], s
[1], s
[2]);
2219 fprintf(logs
, "\nIter %d, depth %.1f\n", cur_iter
-1, double(Read_length
*g_iters
[threadId
][cur_iter
-1].depth
)/double(Read_length
-g_iters
[threadId
][cur_iter
-1].slen
+1));
2221 print_inf(logs
, g_nodes
[threadId
], g_levs
[threadId
], g_reads
[threadId
], s
, ctgs
[threadId
][ctg_num
[threadId
]].len
);
2227 int extension_type
=1;
2228 int unique
= is_unique(g_iters
[threadId
][cur_iter
-1].depth
, g_iters
[threadId
][cur_iter
-1].slen
);
2229 int repeat
= is_repeat(g_iters
[threadId
][cur_iter
-1].depth
, g_iters
[threadId
][cur_iter
-1].slen
);
2230 int initial_read_num
=s
[2], is_now_unique
= unique
;
2232 extension_type
= iter_consensus(logs
, s
, &len
, ctgs
[threadId
][ctg_num
[threadId
]].seq
, g_iters
[threadId
][cur_iter
-1].seq
, g_iters
[threadId
][cur_iter
-1].slen
, ctgs
[threadId
][ctg_num
[threadId
]].len
, 0, g_iters
[threadId
][cur_iter
-1].depth
, cur_iter
-1, &is_now_unique
, threadId
);
2234 if(extension_type
== 10)
2236 //meet thread conflition.
2240 if(DEBUG
) fprintf(logs
, "thread%d Seq: %s, len %d, %d, %d, %d\n", threadId
, g_seq
[threadId
], len
, s
[0], s
[1], s
[2]);
2243 if(ctgs
[threadId
][ctg_num
[threadId
]].len
> Read_length
&& extension_type
== 4 && (is_now_unique
== 1 || repeat
== 0))// && ctgs[threadId][ctg_num[threadId]].len >= 2* Read_length)
2245 ctgs
[threadId
][ctg_num
[threadId
]].len
= ctgs
[threadId
][ctg_num
[threadId
]].len
- Read_length
;
2246 ctgs
[threadId
][ctg_num
[threadId
]].seq
[ctgs
[threadId
][ctg_num
[threadId
]].len
] = '\0';
2247 clean_trimed_reads(ctgs
[threadId
][ctg_num
[threadId
]].len
, tmpCount
[threadId
], threadId
);
2252 trim_len
= update_ctg(len
, cur_iter
, s
[2], threadId
);
2260 }else if(extension_type
== 1)
2263 //update ctg and iters.
2264 g_iters
[threadId
][cur_iter
-1].len
= len
- trim_len
;
2265 g_iters
[threadId
][cur_iter
-1].rnum
= s
[2];
2266 g_iters
[threadId
][cur_iter
-1].pe
= s
[0]>0;
2267 g_iters
[threadId
][cur_iter
-1].end
= 0;
2269 //further extending.
2270 switch (extension_type
)
2273 //fprintf(logs, " NOEXTEND");
2274 g_iters
[threadId
][cur_iter
-1].end
= 4;
2277 //fprintf(stderr, "SIN");
2278 return backward_extending_bwtpe(logs
, 0, threadId
);
2281 if(DEBUG
) fprintf(logs
, "UsedRead");
2282 g_iters
[threadId
][cur_iter
-1].end
= 3;
2283 //backward_extending_bwtpe(ctg, tmpRead, iters, iter_num, read_num);
2286 if(DEBUG
) fprintf(logs
, "LowDepth2");
2287 g_iters
[threadId
][cur_iter
-1].end
= 1;
2290 if(DEBUG
) fprintf(logs
, "UsedRead2");
2291 g_iters
[threadId
][cur_iter
-1].end
= 6;
2294 if(DEBUG
) fprintf(logs
, "CB");
2295 g_iters
[threadId
][cur_iter
-1].end
= 7;
2298 if(DEBUG
) fprintf(logs
, " R2U");
2299 g_iters
[threadId
][cur_iter
-1].end
= 8;
2302 if(DEBUG
) fprintf(logs
, "CurrentUse");
2303 g_iters
[threadId
][cur_iter
-1].end
= 9;
2306 if(DEBUG
) fprintf(logs
, "Cut");
2307 g_iters
[threadId
][cur_iter
-1].end
= 10;
2310 if(DEBUG
) fprintf(logs
, "CurrentUse0");
2311 g_iters
[threadId
][cur_iter
-1].end
= 11;