modified: nfig1.py
[GalaxyCodeBases.git] / BGI / BASE / src / extend.c
blob4e4b173318bd1e3203ff64b69c0c15e0cb0e43e0
1 /*
3 extend.c extend in one direction of BASE.
5 # Copyright (C) 2015, The University of Hong Kong.
7 # This program is free software; you can redistribute it and/or
8 # modify it under the terms of the GNU General Public License
9 # as published by the Free Software Foundation; either version 3
10 # of the License, or (at your option) any later version.
12 # This program is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU General Public License for more details.
17 # You should have received a copy of the GNU General Public License
18 # along with this program; if not, write to the Free Software
19 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 03110-1301, USA.
21 Date : 28th Jan 2016
22 Author : Binghang Liu, bhliu@cs.hku.hk
25 #include "extend.h"
28 Get reads id and keep them in reads.
31 int set_read_ids_plus(ULL l, ULL r, int* tCount, int level, int node, int size, int cons, int threadId)
33 ULL num = r-l+1;
34 int c = *tCount;
35 ULL i;
36 if(num > MAXDEPTH)
38 fprintf(stderr, "Index PLus Error: left: %d right %d\n", l, r);
39 return 0;
41 for(i=0; i< num; ++i)
43 ULL id = idx2BWT[size]->readIDtable[l + i - idx2BWT[size]->bwt->cumulativeFreq[4]];
44 g_reads[threadId][c+i].id = id;
45 g_reads[threadId][c+i].strand = 1;
46 g_reads[threadId][c+i].size = size;
47 g_reads[threadId][c+i].level = level;
48 g_reads[threadId][c+i].node = node;
49 g_reads[threadId][c+i].cons = cons;
51 *tCount = c + num;
52 return 1;
55 int set_read_ids_minus(ULL l, ULL r, int* tCount, int level, int node, int size, int cons, int threadId)
57 ExactRange er;
58 er.saL = l; er.saR = r; er.strand = 0;
59 if(r-l+1 > MAXDEPTH)
61 fprintf(stderr, "Index Minus Error: left: %d right %d\n", l, r);
62 return 0;
64 ReadInf ri[r-l+2];
65 int add_num = extractReadInf(idx2BWT[size], er, ri, r-l+2);
66 int i, b = (*tCount);
68 for( i=0; i< add_num; i++)
70 g_reads[threadId][b+i].id = ri[i].read_id;
71 g_reads[threadId][b+i].level = level;
72 g_reads[threadId][b+i].strand = 0;
73 g_reads[threadId][b+i].node = node;
74 g_reads[threadId][b+i].size = size;
75 g_reads[threadId][b+i].cons = cons;
77 (*tCount) = b + add_num;
78 return 1;
82 Add new bases, including ACGT$.
84 int add_base(Base* b, int base, Base *s)
86 ULL l, r, ll, rr, ll_rev, rr_rev, depth=0;
87 int i;
89 for(i=0; i< bwtCount; i++)
91 s[i].plus = 0; s[i].minus = 0;
92 s[i].saL = 0; s[i].saR = 0; s[i].saLL = 0; s[i].saRR = 0; s[i].saLLrev = 0; s[i].saRRrev = 0;
93 if(b[i].plus > 0 && b[i].saR >= b[i].saL && b[i].saR - b[i].saL < MAXDEPTH)
95 l = b[i].saL; r = b[i].saR;
96 BWTSARangeBackwardFor(idx2BWT[i], base ,&l, &r);
97 if(l <= r && r-l <= b[i].saR - b[i].saL)
99 s[i].saL = l;
100 s[i].saR = r;
101 s[i].plus = r - l + 1;
104 if(b[i].minus > 0 && b[i].saRR >= b[i].saLL && b[i].saRR - b[i].saLL < MAXDEPTH)
106 ll = b[i].saLL; rr = b[i].saRR; ll_rev = b[i].saLLrev; rr_rev = b[i].saRRrev;
107 BWTSARangeBackwardRev(idx2BWT[i], 3-base ,&ll, &rr, &ll_rev, &rr_rev);
108 if(ll <= rr && rr-ll <= b[i].saRR - b[i].saLL)
110 s[i].saLL = ll;
111 s[i].saRR = rr;
112 s[i].saLLrev = ll_rev;
113 s[i].saRRrev = rr_rev;
114 s[i].minus = rr -ll + 1;
117 depth += s[i].plus + s[i].minus;
120 return depth;
123 int add_end(Base* b, int* read_num, int level, int node, int cons, int threadId)
125 ULL l, r, ll, rr, ll_rev, rr_rev, total=0;
126 int i, flag;
128 for(i=0; i< bwtCount; i++)
130 if(b[i].plus > 0 && b[i].saR >= b[i].saL && b[i].saR - b[i].saL < MAXDEPTH)
132 l = b[i].saL; r = b[i].saR;
133 BWTSARangeBackward(idx2BWT[i], 4 ,&l, &r);
134 if(l <= r && r-l <= b[i].saR - b[i].saL)
136 flag = set_read_ids_plus(l, r, read_num, level, node, i, cons, threadId);
137 if(flag == 1)
138 total += r - l + 1;
141 if(b[i].minus > 0 && b[i].saRR >= b[i].saLL && b[i].saRR - b[i].saLL < MAXDEPTH)
143 ll = b[i].saLL; rr = b[i].saRR; ll_rev = b[i].saLLrev; rr_rev = b[i].saRRrev;
144 BWTSARangeBackwardRev(idx2BWT[i], 4 ,&ll, &rr, &ll_rev, &rr_rev);
145 if(ll <= rr && rr-ll <= b[i].saRR - b[i].saLL)
147 flag = set_read_ids_minus(ll, rr, read_num, level, node, i, cons, threadId);
148 if(flag == 1)
149 total += rr - ll + 1;
153 return total;
156 int get_real_depth(Base* b)
158 int i;
159 ULL depth = 0;
160 for(i=0; i< bwtCount; i++)
161 depth += b[i].plus + b[i].minus;
162 return depth;
165 int get_strand_depth(Base* b, int strand)
167 int i;
168 ULL depth = 0;
169 for(i=0; i< bwtCount; i++)
171 if(strand == 1)
172 depth += b[i].plus;
173 else
174 depth += b[i].minus;
176 return depth;
178 void set_blank_base(Base* to)
180 int i;
181 for(i=0; i< bwtCount; i++)
183 to[i].plus = 0;
184 to[i].minus = 0;
188 void print_inf(FILE* fp, Node* nodes, Lev* levs, ReadId* reads, int* s, int cur_len);
189 void print_base(FILE* fp, Base* b)
191 int i;
192 for(i=0; i< bwtCount; i++)
194 fprintf(fp, "i: %d l %lld r %lld ll %lld rr %lld plus %d minus %d\n", i, b->saL, b->saR, b->saLL, b->saRR, b->plus, b->minus);
198 construct the backward extension tree.
200 int backward_search(FILE* logs, int* ss, int threadId)
202 int num[3]={ss[0], ss[1], ss[2]};
203 int i,j, k;
204 ULL l, r, ll, rr, ll_rev, rr_rev;
206 //define left to speed up.
207 unsigned long long left, flag;
209 g_levs[threadId][num[1]].nnum=0;
211 int max_i=0, max_j=0, max_depth=0, count, cons, node_max_i = -1, node_max_j = 0, node_max_depth =0;
212 //check ACGT.
213 for(j=num[0]-1; j>=0; j--)
215 if(g_nodes[threadId][j].is_end == 1)
216 continue;
217 if(g_nodes[threadId][j].level != num[1]-1)
218 continue;
220 left = get_real_depth(g_nodes[threadId][j].b);
221 if(left == 0)
223 fprintf(stderr, "j=%d, b[0].l=%d, b[0].r=%d, b[0].ll=%d, b[0].rr=%d\n", j, g_nodes[threadId][j].b[0].saL, g_nodes[threadId][j].b[0].saR, g_nodes[threadId][j].b[0].saLL, g_nodes[threadId][j].b[0].saRR);
224 exit(7);
226 cons = 0;
227 if(j == 0 && num[1] == 1)
228 cons = 1;
229 if(j == g_levs[threadId][num[1]-1].node && g_levs[threadId][ num[1] - g_nodes[threadId][j].len ].node == j)
230 cons = 1;
232 flag = add_end(g_nodes[threadId][j].b, &(num[2]), num[1], j, cons, threadId);
233 left -= flag;
234 g_levs[threadId][num[1]].c[4] += flag;
236 if(left <= 0)
238 g_nodes[threadId][j].is_end = 1;
239 if(max_depth == 0 || max_depth < flag)
241 max_j = j;
242 max_i = 4;
243 max_depth = flag;
245 continue;
248 count=0;
249 int now_max_depth=0, now_max_i=0, now_noend_depth = left;
250 for(i=0; i<4; i++)
252 if(left == 0)
253 break;
254 flag = add_base(g_nodes[threadId][j].b, i, sbase[threadId][i]);
255 if(flag == 0)
256 continue;
258 left -= flag;
259 g_levs[threadId][num[1]].c[i] += flag;
260 count++;
261 if(now_max_depth < flag)
263 now_max_i = i;
264 now_max_depth = flag;
267 if(count == 0)
269 fprintf(stderr, "Error: total %d, left %d, c[0] %d c[1] %d c[2] %d c[3] %d\n", get_real_depth(g_nodes[threadId][j].b), left, g_levs[threadId][num[1]].c[0], g_levs[threadId][num[1]].c[1], g_levs[threadId][num[1]].c[2], g_levs[threadId][num[1]].c[3]);
270 print_base(stderr, g_nodes[threadId][j].b);
271 exit(2);
273 if(count == 1)
275 g_nodes[threadId][j].seq[g_nodes[threadId][j].len] = dnaChar[now_max_i];
276 g_nodes[threadId][j].len ++;
277 g_nodes[threadId][j].seq[g_nodes[threadId][j].len] = '\0';
278 g_nodes[threadId][j].level = num[1];
279 set_base(g_nodes[threadId][j].b, sbase[threadId][now_max_i]);
280 g_levs[threadId][num[1]].nnum++;
282 if(max_depth < now_max_depth)
284 max_i = now_max_i; max_j = j;
285 max_depth = now_max_depth;
286 }else if(max_depth == now_max_depth)
287 { if(max_i != now_max_i)
289 if(g_levs[threadId][ss[1]-1].node == j)
291 max_i = now_max_i; max_j = j;
294 }else if(max_i == now_max_i)
296 if(g_levs[threadId][ss[1]-1].node == j)
298 max_j = j;
301 if(j == g_levs[threadId][ss[1]-1].node)
303 node_max_i = now_max_i; node_max_j = j;
304 node_max_depth = now_max_depth;
306 continue;
309 for(k=0; k < i; k++)
311 int real_depth = get_real_depth(sbase[threadId][k]);
312 if(real_depth > 0)
314 set_base(g_nodes[threadId][num[0]].b, sbase[threadId][k]);
315 g_nodes[threadId][num[0]].seq[0] = dnaChar[k];
316 g_nodes[threadId][num[0]].seq[1] = '\0';
317 g_nodes[threadId][num[0]].len = 1;
318 g_nodes[threadId][num[0]].level = num[1];
319 g_nodes[threadId][num[0]].parent = j;
320 g_nodes[threadId][num[0]].depth = real_depth;
321 g_nodes[threadId][num[0]].is_end = 0;
322 g_nodes[threadId][num[0]].conf = 0;
324 if(max_depth < real_depth)
326 max_i = k; max_j = num[0];
327 max_depth = g_nodes[threadId][num[0]].depth;
328 }else if(max_depth == real_depth)
329 { if(max_i != k)
331 if(g_levs[threadId][ss[1]-1].node == j && g_levs[threadId][ss[1]-1].node != g_nodes[threadId][max_j].parent)
333 max_i = k; max_j = num[0];
336 }else{
337 if(max_i == k && g_levs[threadId][ss[1]-1].node == j && g_levs[threadId][ss[1]-1].node != g_nodes[threadId][max_j].parent)
339 max_j = num[0];
343 num[0]++;
344 g_levs[threadId][num[1]].nnum++;
348 if(now_noend_depth * 0.75 > now_max_depth)
350 g_nodes[threadId][j].conf = 1;
354 //improve the treatment of heterozygous.
355 if(ss[1] > 1 && num[0] == ss[0] && max_j != g_levs[threadId][ss[1]-1].node) //no change of node number.
357 if(node_max_i == max_i)
359 max_j = g_levs[threadId][ss[1]-1].node;
360 max_depth = node_max_depth;
364 if(g_levs[threadId][num[1]].nnum == 0)
366 if(g_levs[threadId][num[1]].c[4] > 0)
368 g_levs[threadId][num[1]].node = max_j;
369 g_levs[threadId][num[1]].base = max_i;
370 g_levs[threadId][num[1]].depth = 0;
372 ss[0] = num[0];
373 ss[2] = num[2];
374 ss[1] = ss[1] + 1;
376 return 0;
378 g_levs[threadId][num[1]].base = max_i;
379 g_levs[threadId][num[1]].depth = max_depth;
380 g_levs[threadId][num[1]].node = max_j;
382 ss[0] = num[0];
383 ss[2] = num[2];
384 ss[1] = ss[1] + 1;
386 return 1;
390 call consensus of the extended region.
393 int pe_check_pure(ULL id, int size, int pos, int strand, int *real_size, int threadId)
395 *real_size = 0;
396 LL pid = get_pair_id(id);
397 if(DEBUG)
399 fprintf(stdout, "Check %lld,%lld,%d,%d ", id, pid, strand, pos);
401 if(strand == 0)
403 if(DEBUG) fprintf(stdout, "Minus\n");
404 return 0;
406 if(is_used_bwt_sparse(pid, size) == 0)
408 if(DEBUG) fprintf(stdout, "NotUsed\n");
409 return 0;
411 int i = tmpCount[threadId]-1;
412 int lowerbound = pos - (InsertSize[size] - Read_length) - 3*SD[size];
413 for(; i>=0; i--)
415 if(tmpRead[threadId][i].size != size)
416 continue;
417 if(tmpRead[threadId][i].pos < lowerbound)
419 if(DEBUG) fprintf(stdout, "OutLowerBound\n");
420 return 0;
422 if(tmpRead[threadId][i].id == pid)
424 *real_size = pos - tmpRead[threadId][i].pos + 1 + Read_length;
425 if(DEBUG) fprintf(stdout, "Find %d ", *real_size);
426 if(tmpRead[threadId][i].cons != 1 || tmpRead[threadId][i].unique != 1)
428 if(DEBUG) fprintf(stdout, "CONS%dUnique%d\n", tmpRead[threadId][i].cons, tmpRead[threadId][i].unique);
429 return 0;
431 *real_size = pos - tmpRead[threadId][i].pos + 1 + Read_length;
432 if(DEBUG) fprintf(stdout, "\n");
433 return 1;
436 if(DEBUG) fprintf(stdout, "NotFound\n");
437 return 0;
440 int base2int(char b)
442 if(b > 'T')
443 b = b - ('T' - 't');
445 switch (b)
447 case 'A':
448 return 0;
449 case 'C':
450 return 1;
451 case 'G':
452 return 2;
453 case 'T':
454 return 3;
455 return 0;
457 return 0;
460 int get_subopt_brother(Node* nodes, int* bs, int start, int bi, int level, int num)
462 int i, sub_i=-1;
463 for(i = start-1; i>= 0; i--)
465 if(nodes[i].level < level)
466 break;
467 if(bs[i] == bi || i == bi)
469 if(sub_i == -1)
470 sub_i = i;
471 else
473 if(nodes[sub_i].depth < nodes[i].depth)
474 sub_i = i;
478 for(i=start +1; i< num; i++)
480 if(nodes[i].level - nodes[i].len > level)
481 break;
482 if(bs[i] == bi || i == bi)
484 if(sub_i == -1)
485 sub_i = i;
486 else{
487 if(nodes[sub_i].depth < nodes[i].depth)
488 sub_i = i;
492 return sub_i;
495 void clean_inf(FILE* logs, Node* nodes, Lev* levs, ReadId* reads, int* s, int* bs, int bi)
497 //only update the levs, s[1], reads.node
498 int i, j;
499 for(i=0; i< s[2]; i++)
501 if(bs[reads[i].node] != bi)
503 reads[i].node = 0;
504 reads[i].level = 0;
508 //update s[1]
509 for(i=1; i< s[1]; i++)
511 if(bs[levs[i].node] != bi)
513 j = get_subopt_brother(nodes, bs, levs[i].node, bi, i, s[0]);
515 if(j == -1)
517 fprintf(stderr, "Cut directly %d->%d\n", s[1], i);
518 s[1] = i;
519 break;
521 int p = nodes[j].len - (nodes[j].level - i + 1) ;
522 int newbase = base2int(nodes[j].seq[p]);
523 int oldnode = levs[i].node;
524 int oldbase = levs[i].base;
525 levs[i].node = j;
527 if(newbase == levs[i].base)
529 levs[i].depth = levs[i].c[levs[i].base] - levs[i].depth;
530 levs[i].c[levs[i].base] = levs[i].depth;
531 if(levs[i].depth > nodes[j].depth)
533 levs[i].depth = nodes[j].depth; //would be overestimated.
535 }else{
536 int k ;
537 for(k=0; k<4; k++)
539 if(k != newbase)
540 levs[i].c[k]=0;
542 levs[i].base = newbase;
543 levs[i].depth = levs[i].c[levs[i].base];
544 if(levs[i].depth > nodes[j].depth)
545 levs[i].depth = nodes[j].depth; //would be overestimated.
547 s[1]=i+1;
548 break; //solve one error per time.
550 if(levs[i].depth < Error_depth)
552 s[1] = i+1;
553 break;
555 }else{
556 int k ;
557 for(k=0; k<4; k++)
559 if(k != levs[i].base)
560 levs[i].c[k]=0;
562 s[1]=i+1;
563 break;
569 int get_ancestor(Node* nodes, int i)
571 int nowi=i;
572 while(nodes[nowi].parent != 0)
573 nowi = nodes[nowi].parent;
575 return nowi;
578 int fill_branches(Node* nodes, int n, int* bs)
580 int branch_count=1;
581 int i;
582 for(i=1; i< n; i++)
584 if(nodes[i].parent != 0)
586 i--;
587 break;
589 branch_count++;
590 bs[i] = i;
593 for(; i< n; i++)
595 bs[i] = get_ancestor(nodes, i);
598 return branch_count;
601 void clean_path(FILE* logs, Node* nodes, Lev* levs, ReadId* reads, int* s, int ancestor )
603 int i;
604 int bs[s[0]];
606 for(i=0; i< s[0]; i++)
607 bs[i]=0;
609 int branch_count = fill_branches(nodes, s[0], bs);
611 clean_inf(logs, nodes, levs, reads, s, bs, ancestor);
614 int solve_conf(FILE* logs, Node* nodes, Lev* levs, ReadId* reads, int* s, int seed_len, int ctg_len, int threadId, int diff)
616 int i, total_pe_count=0, real_size;
617 int bs[s[0]];
619 for(i=0; i< s[0]; i++)
620 bs[i]=0;
622 int branch_count = fill_branches(nodes, s[0], bs);
623 int count[branch_count];
624 int pecount[branch_count];
626 for(i=0; i< branch_count; i++)
628 count[i]=0; pecount[i]=0;
631 for(i=0; i< s[2]; i++)
633 if(reads[i].node == 0)
634 continue;
635 count[bs[reads[i].node]]++;
636 if(pe_check_pure(reads[i].id, reads[i].size, reads[i].level + ctg_len, reads[i].strand, &real_size, threadId) == 1)
638 if(DEBUG) fprintf(logs, "Got: %d %d %d %d\n", reads[i].id, reads[i].node, bs[reads[i].node], real_size);
639 pecount[bs[reads[i].node]]++;
640 total_pe_count++;
644 int sup_i=-1, sup_c=0, c=0, total_c=0, total_pe_c = 0;
645 for(i=1; i< branch_count; i++)
647 total_pe_c += pecount[i];
648 total_c += count[i];
649 if(count[i] >= Error_depth && pecount[i] > 0)
651 c++;
652 if(DEBUG) fprintf(logs, "confs: %d:%d:%d\n", i, count[i], pecount[i]);
653 //total_c += count[i];
654 //total_pe_c += pecount[i];
655 if(sup_c < pecount[i])
657 sup_c = pecount[i];
658 sup_i = i;
663 if(c == 0)
664 return 0;
666 if(DEBUG) fprintf(logs, "CONF:%d,%d,%d,%d,%d,%d,%d;\n", c, branch_count, sup_i, sup_c, count[sup_i], total_c, total_pe_c);
667 if( (c == 1 && (sup_c > Error_depth || (sup_c == Error_depth && total_pe_c == sup_c))&& sup_c >= count[sup_i]/10)//&& (sup_c >= 2 * Consensus_depth || (sup_c < 2 * Consensus_depth && sup_c == total_pe_count )))// && total_pe_count == sup_c)
668 || (c > 1 && sup_c > Error_depth && sup_c >= count[sup_i]/5 && sup_c > double(total_pe_c) * 0.9 && total_pe_c - sup_c < Error_depth)
671 clean_inf(logs, nodes, levs, reads, s, bs, sup_i);
672 return 1;
675 //try to solve heterozygous bubble.
676 if(Solve_Hete == 1 && diff < 5 && c == 2 && sup_c > 2 && seed_len <= depth_half_length && nodes[0].depth <= Upper_bound * double(Expect_depth*(Read_length-seed_len+1))/double(Read_length))
678 clean_inf(logs, nodes, levs, reads, s, bs, sup_i);
679 return 1;
681 return 0;
684 void get_check_seq(char* seq, char* ctg_seq, Lev* levs, int lnum, int ctg_len)
686 int i, j=0, k=0;
688 for(i=ctg_len-1; i>=0 &&k < Read_length; k++, i--)
690 seq[k] = ctg_seq[i];
692 seq[k]='\0';
694 if(DEBUG) fprintf(stdout, "check seq:\n%s\n", seq);
698 int is_full_cons(ULL read_id, int size, int read_strand, int level, char* check_seq, int check_seq_len, int seed_len, int lnum)
700 int start, end, startp;
702 if(read_strand == 0)
704 start = level + seed_len -1;
705 end = Read_length-1;
706 startp = seed_len;
707 return minus_extract_and_compare(idx2BWT[size], read_id, start, end, check_seq, check_seq_len, startp);
708 }else{
709 start = Read_length - (level + seed_len);
710 end = 0;
711 startp = seed_len + start;
712 if(startp == Read_length)
713 return 1;
714 return plus_extract_and_compare(idx2BWT[size], read_id, start, end, check_seq, check_seq_len, startp);
718 int is_full_consensus(ULL read_id, int size, int read_strand, int read_level, char* check_seq, int check_seq_len, int seed_len, int lnum)
720 char seq[Read_length + 1];
721 int qual[Read_length + 1];
722 //is_full_cons(read_id, size, read_strand, read_level, check_seq, check_seq_len, seed_len, lnum);
723 get_id_seq(seq, qual, read_id, size);
725 int i, j;
726 if(read_strand == 0)
728 int rqual[Read_length + 1];
729 char rcseq[Read_length + 1];
730 reverse_com(seq, rcseq, Read_length);
732 for(i=0; i< Read_length; i++)
733 rqual[i] = qual[Read_length-i-1];
735 i = read_level+seed_len-1; j = seed_len;
736 if(DEBUG) fprintf(stdout, "Minu Read: %s, pos %d,%c; Cons %d,%c\n", rcseq, i, rcseq[i], j, check_seq[j]);
738 for(; i< Read_length; i++, j++)
740 if(rcseq[i] != check_seq[j] && rqual[i] == 1)
742 if(DEBUG) fprintf(stdout, "i: %d,%c->%c,%d\n", i, check_seq[j], rcseq[i], rqual[i]);
743 return 0;
746 }else{
748 i = read_level+seed_len-1; j = seed_len;
749 if(DEBUG) fprintf(stdout, "Plus Read: %s, pos %d,%c; Cons %d,%c\n", seq, i, seq[i], j, check_seq[j]);
751 for(; i< Read_length; i++, j++)
753 if(seq[i] != check_seq[j] && qual[i] == 1)
755 if(DEBUG) fprintf(stdout, "i: %d,%c->%c,%d\n", i, check_seq[j], seq[i], qual[i]);
756 return 0;
760 if(DEBUG) fflush(stdout);
761 // is_full_cons(read_id, size, read_strand, read_level, check_seq, check_seq_len, seed_len, lnum);
762 return 1;
765 int mask_last_reads(FILE* logs, int cut_level, int* s, char* ctg_seq, int keep_used, int* keep_used_extension, int *has_used_read, int ctg_len, int iter, int unique, int threadId, int has_repeat_break, int seed_len)
767 int i, j=tmpCount[threadId], used, cons, cut = cut_level, tid=-1, check_read=0, use_read=0;
768 char check_seq[2*Read_length + 2];
770 get_check_seq(check_seq, ctg_seq, g_levs[threadId], s[1], ctg_len);
771 int check_seq_len = strlen(check_seq), cons_check;
773 for(i=0; i< s[2]; i++)
775 if(i > 0 && g_reads[threadId][i].node == 0 && g_reads[threadId][i].level == 0)
776 continue;
777 if(g_reads[threadId][i].level > cut_level)
778 break;
780 if(j >= MaxReadCount)
782 fprintf(stderr, "Too many reads %d\n", j);
783 return -3;
785 tmpRead[threadId][j].id = g_reads[threadId][i].id;
786 tmpRead[threadId][j].strand = g_reads[threadId][i].strand;
787 tmpRead[threadId][j].size = g_reads[threadId][i].size;
788 tmpRead[threadId][j].pos = g_reads[threadId][i].level - 1 + ctg_len; //start from 1 for level.
789 tmpRead[threadId][j].iter = iter;
790 tmpRead[threadId][j].unique = unique;
791 tmpRead[threadId][j].cons = g_reads[threadId][i].cons;
792 tmpRead[threadId][j].keep = 0;
794 cons_check = double(Read_length)/double(g_reads[threadId][i].level + seed_len) > Upper_bound ? 1 : 0;
795 if(DEBUG) fprintf(logs, "read %d cons %d pos %d \n", g_reads[threadId][i].id, g_reads[threadId][i].cons, tmpRead[threadId][j].pos);
796 if(g_reads[threadId][i].cons == 1 && tmpRead[threadId][j].pos > Read_length)
798 check_read++;
799 //if(has_repeat_break == 1 &&
800 if(cons_check == 1 && is_full_cons(g_reads[threadId][i].id, g_reads[threadId][i].size, g_reads[threadId][i].strand, g_reads[threadId][i].level, check_seq, check_seq_len, seed_len, s[1]) == 0)
802 tmpRead[threadId][j].cons = 0;
803 g_reads[threadId][i].cons = 0;
804 continue;
806 use_read++;
808 if(is_used_bwt_sparse(g_reads[threadId][i].id, g_reads[threadId][i].size) == 0)
810 endInf[threadId].ur ++;
811 tmpRead[threadId][j].keep = 1;
812 if(DEBUG) fprintf(logs, "read %d cons %d pos %d \n", g_reads[threadId][i].id, g_reads[threadId][i].cons, tmpRead[threadId][j].pos);
813 check_and_set_bwt_sparse_thread_and_used(g_reads[threadId][i].id, g_reads[threadId][i].size, threadId);
814 // fprintf(stderr, "UR %d %lld %d %d\n", threadId, reads[i].id, is_used_bwt_sparse(reads[i].id, reads[i].size), get_threadId_bwt_sparse(reads[i].id, reads[i].size));
815 }else{
816 if(cons_check == 0 && is_full_cons(g_reads[threadId][i].id, g_reads[threadId][i].size, g_reads[threadId][i].strand, g_reads[threadId][i].level, check_seq, check_seq_len, seed_len, s[1]) == 0)
818 g_reads[threadId][i].cons = 0;
819 tmpRead[threadId][j].cons = 0;
820 use_read--;
821 continue;
823 tid = get_threadId_bwt_sparse(g_reads[threadId][i].id, g_reads[threadId][i].size);
824 if(tid != -1 && tid != threadId)
826 tmpCount[threadId] = j;
827 //fprintf(stderr, "CONFMeet:%d %d %d %d %d %d\n", reads[i].id, reads[i].size, threadId, j, tid, is_used_bwt_sparse(reads[i].id, reads[i].size));
828 free_thread_tmpRead(threadId, 1);
829 return -100;
831 if(DEBUG) fprintf(logs, " (%d,%d,%d)",g_reads[threadId][i].id, g_reads[threadId][i].level, tid);
832 if(tid == threadId)
833 *keep_used_extension = 1;
835 cut = g_reads[threadId][i].level - 1;
836 if(cut <= 0)
838 tmpCount[threadId] = j;
839 return cut;
841 *has_used_read = 1;
842 break;
845 j++;
847 tmpCount[threadId] = j;
849 if(use_read > 0)
851 if(four_iter[threadId] > 0)
853 four_iter[threadId] = 0; four_iter_checked_read[threadId]=0; four_iter_used_read[threadId] = 0;
855 }else
857 if(check_read > 5)
859 if(DEBUG) fprintf(stdout, "Maybe Same seed problem. check read %d and use read %d\n", check_read, use_read);
860 return -2;
863 four_iter[threadId] ++;
864 four_iter_checked_read[threadId] += check_read;
865 if(four_iter[threadId] >= 4 && four_iter_checked_read[threadId] >= 10 && four_iter_used_read[threadId] == 0)
867 if(DEBUG) fprintf(stdout, "Maybe Same seed problem. iter check number %d check read %d and use read %d\n", four_iter[threadId], check_read, use_read);
868 return -2;
870 if(four_iter[threadId] >= 10)
872 if(DEBUG) fprintf(stdout, "Maybe Same seed problem. iter check number %d check read %d\n", four_iter[threadId], four_iter_checked_read[threadId], use_read);
873 return -2;
878 //fit for the cut level.
879 if(i < s[2])
881 s[2] = i;
884 for(j=s[0]-1; j>=0; j--)
886 if(g_nodes[threadId][j].level <= cut)
887 continue;
888 //fprintf(stderr, "Cut: %d: %d, level%d, len %d\n", j, cut, nodes[j].level, nodes[j].len);
890 int left = int(g_nodes[threadId][j].level) - cut ;
891 if(left <=0)
892 continue;
893 if(left >= g_nodes[threadId][j].len)
894 left = g_nodes[threadId][j].len;
895 g_nodes[threadId][j].len -= left;
896 g_nodes[threadId][j].seq[ g_nodes[threadId][j].len ]='\0';
897 //fprintf(stderr, "[%d;%d]",j,left);
899 s[1] = cut + 1;
901 return cut;
904 int is_quick_decrease(int d1, int d2)
906 if(d2 < 5)
907 return d2*2-1 <= d1;
908 return d2*1.5 <= d1;
911 int has_branches(Lev &lev)
913 return lev.c[0] + lev.c[1] + lev.c[2] + lev.c[3] > lev.depth;
917 int get_sub_node(Node* nodes, int level, int p_node, char base, int num)
919 int i, max_i=-1, max=0, high_num=0;
921 for(i=p_node+1; i< num; i++)
923 if(nodes[i].level < level-1)
924 continue;
925 if(nodes[i].level - nodes[i].len == level-1 && nodes[i].seq[0] != base)
927 if(nodes[i].depth > Error_depth)
928 high_num++;
929 if(nodes[i].depth > max)
931 max = nodes[i].depth;
932 max_i = i;
937 if(high_num > 1)
938 return 0;
940 return max_i;
943 void get_sub_seq(Node* nodes, Lev* levels, int p, int num, char* subseq)
945 int type[num], max_depth[num], max_i[num], max_same[num], type_count[num], nodes_count[num];
946 int i, j, k;
947 for(i=0; i< num; i++)
949 type[i]=0;
950 type_count[i]=0;
951 nodes_count[i]=0;
952 max_depth[i]=0;
953 max_i[i]=0;
954 max_same[i]=0;
957 type[p]=1;
958 for(i=p+1; i< num; i++)
960 if(type[nodes[i].parent] == 1)
962 type[i]=1;
963 type_count[nodes[i].parent]++;
964 nodes_count[nodes[i].parent] += nodes[i].depth;
965 if(
967 (nodes[i].depth > max_depth[nodes[i].parent])
968 || (nodes[i].depth == max_depth[nodes[i].parent] && max_same[nodes[i].parent] == 0 && nodes[i].seq[0] == dnaChar[levels[nodes[i].level +1 - nodes[i].len].base])
969 || (nodes[i].depth == max_depth[nodes[i].parent] && max_same[nodes[i].parent] == 0 && nodes[i].len > nodes[ max_i[nodes[i].parent] ].len)
973 max_depth[nodes[i].parent] = nodes[i].depth;
974 max_i[nodes[i].parent] = i;
975 if(nodes[i].seq[0] == dnaChar[levels[nodes[i].level +1 - nodes[i].len].base])
976 max_same[nodes[i].parent] = 1;
981 j=0;
982 for(i=p; i< num; i++)
984 if(i > p && type[i] == 1 && (type[nodes[i].parent] == 0 || max_i[nodes[i].parent] != i))
985 type[i] = 0;
987 if(type[i] == 0 )
988 continue;
989 if(i>p && type_count[i] > 0 && type_count[i] == nodes_count[i])
991 //fprintf(stdout, "i %d: type count %d, node count %d\n", i, type_count[i], nodes_count[i]);
992 break;
994 //fprintf(stdout, ">%d,%d", i, nodes[i].len);
995 for(k=0; k < nodes[i].len; k++, j++)
996 subseq[j] = nodes[i].seq[k];
997 if(nodes[i].conf == 1)
998 break;
1000 //fprintf(stderr, "\n");
1001 subseq[j]='\0';
1005 int get_first_branch(Node* nodes, int num, int now_branch_node, int current_node, char* s1)
1007 int flag[num], type[num];
1008 int i;
1009 flag[current_node]=1;
1010 for(i=0; i< num; i++)
1012 flag[i]=0; type[i]=0;
1014 int node=current_node;
1015 while(!(node == 0 && nodes[node].parent ==0) )
1017 node = nodes[node].parent;
1018 flag[node] = 1;
1021 node = now_branch_node;
1022 while(flag[node] == 0)
1024 type[node]=1;
1025 if(flag[nodes[node].parent] == 1)
1026 break;
1027 node = nodes[node].parent;
1029 int j=0, k=0;
1030 for(i=node; i< num; i++)
1032 if(type[i] == 0 )
1033 continue;
1034 for(k=0; k < nodes[i].len; k++, j++)
1036 s1[j] = nodes[i].seq[k];
1039 s1[j]='\0';
1040 return node;
1043 int get_branch_node(FILE* logs, Node* nodes, Lev* levs, int p, int b, int num)
1045 int p_node = nodes[levs[p].node].parent;
1048 if(p_node != levs[p-1].node)
1049 return 0;
1051 int i;
1052 char base = dnaChar[b];
1053 for(i=p_node+1; i< num; i++)
1055 if(nodes[i].level < p-1 || nodes[i].parent != p_node)
1056 continue;
1057 if(nodes[i].level - nodes[i].len + 1 == p && nodes[i].seq[0] == base)
1059 return i;
1062 return 0;
1066 int get_sub_node_id2(FILE* logs, Node* nodes, Lev* levs, int p, int b, int* s, char* s1)
1068 int p_node = nodes[levs[p].node].parent;
1069 int i, max_i=-1, max=0, high_num=0;
1070 char base = dnaChar[b];//dnaChar[levs[p].base];
1072 for(i=1; i< s[0]; i++)
1074 //fprintf(stderr, "%d %d %d %d %d\n", i, p, nodes[i].level, nodes[i].parent, nodes[nodes[i].parent].level);
1075 if(nodes[i].level < p || i == levs[p].node || nodes[nodes[i].parent].level > p - 1)
1076 continue;
1077 //fprintf(stderr, "%d %d %d %c \n", i, p, nodes[nodes[i].parent].level, nodes[i].seq[p - nodes[nodes[i].parent].level -1]);
1078 if(nodes[i].seq[p - nodes[nodes[i].parent].level -1] == base)
1080 if(nodes[i].depth > Error_depth)
1081 high_num++;
1082 if(nodes[i].depth > max)
1084 max = nodes[i].depth;
1085 max_i = i;
1089 int n_node = max_i; //now branch node.
1090 if(n_node < 0)
1092 fflush(logs);
1093 fprintf(stderr, "Subnode: %d, %d, %c\n", p_node, p, base);
1094 exit(1);
1097 int c_node = levs[p].node; //current node.
1098 int fb_node = get_first_branch(nodes, s[0], n_node, c_node, s1); //first branch node.
1100 //fprintf(logs, "nobranch %d, nonode %d, firstbranch %d\n", n_node, c_node, fb_node);
1101 return fb_node;
1104 int get_sub_node_id(Node* nodes, Lev* levs, int p, int* s)
1106 int p_node = levs[p-1].node;
1107 if(nodes[p_node].level != p-1)
1108 return 0;
1110 int node = get_sub_node(nodes, p, p_node, dnaChar[levs[p].base], s[0]);
1111 if(node < 0)
1112 return 0;
1113 return node;
1116 int get_pair_seq(FILE* logs, Node* nodes, Lev* levs, int p, int* s, char* s1, char* s2, int sub_node)
1119 if(strlen(s1) == 0)
1120 get_sub_seq(nodes, levs, sub_node, s[0], s1);
1122 int i, j, dif=0, node_id, k;
1124 // fprintf(logs, "s1 %s\n", s1);
1125 for(i=p, j=0; i< s[1] && j<strlen(s1); i++, j++)
1127 if( levs[i].base == 4)
1128 break;
1129 if(levs[i].node != levs[i-1].node && (nodes[levs[i].node].parent != levs[i-1].node || (nodes[levs[i-1].node].conf == 1 && i > p)))
1131 //dif++;
1133 if(i <= nodes[levs[i-1].node].level)
1135 node_id = levs[i-1].node;
1137 for(k = i; k<= nodes[node_id].level && j < strlen(s1); k++, i++, j++)
1139 s2[i-p] = nodes[node_id].seq[nodes[node_id].len - (nodes[node_id].level -k)-1];
1140 if(s2[i-p] != s1[j])
1141 dif++;
1144 break;
1146 s2[i-p] = dnaChar[ levs[i].base ];
1147 if(s2[i-p] != s1[j])
1148 dif++;
1150 s2[i-p]='\0';
1151 s1[i-p]='\0';
1152 //fprintf(logs, "%s, %s, %d, %d\n", s1, s2, p, sub_node);
1153 return dif;
1158 int has_branches2(int level, Node* nodes, int nnum, int node)
1160 int i=0;
1161 for(i=0; i< nnum; i++)
1163 if(i == node)
1164 continue;
1165 if(nodes[i].level < level)
1166 continue;
1167 if(nodes[i].level - nodes[i].len >= level)
1168 break;
1169 if(nodes[i].level >= level && nodes[i].level - nodes[i].len < level && nodes[i].depth > 1)
1170 return nodes[i].level - nodes[i].len;
1172 return 0;
1175 int quick_decrease(Node* nodes, Lev* levs, int p, int seed_len)
1177 double base_depth = double(nodes[0].depth) / double(Read_length - seed_len + 1 ) * Read_length;
1178 double lev_dec = base_depth / double(Read_length);
1179 double node_dec = double(nodes[levs[p].node].len - (nodes[levs[p].node].level - p)) * lev_dec;
1180 int real_lev_dec = levs[p-1].depth - levs[p].depth;
1181 int real_node_dec = nodes[levs[p].node].depth - levs[p].depth;
1183 //fprintf(logs, " %.2f:[%.2f,%d;%.2f,%d]\n", base_depth, lev_dec, real_lev_dec, node_dec, real_node_dec);
1184 //decreasing too fast.
1185 if(real_lev_dec - lev_dec > Error_depth*2 )
1186 return 1;
1187 if(real_node_dec - node_dec > Error_depth*2 )// || (levs[p].depth <= 2* Error_depth && real_node_dec - node_dec > real_node_dec/2))
1188 return 1;
1190 //extending single node too long.
1191 if(p + seed_len >= depth_9_length && levs[p].depth < int(nodes[levs[p].node].depth/3))
1192 return 1;
1193 if(p + seed_len >= depth_6_length && levs[p].depth <= 2* Error_depth && levs[p].depth < int(nodes[levs[p].node].depth/2))
1194 return 1;
1196 return 0;
1199 int is_short_cons(int len, int seed_len)
1201 return (len < seed_len/2 && len < MINOVERLAP) ? 1 : 0;
1204 int get_consist_seq(int ii, int kk, char* s1, char* s2, char* seq, int seed_len)
1206 int j;
1207 int i = ii, k = kk;
1208 for(j=0; i>=1 && k>=1; i--, j++, k--)
1210 if(s1[i] != s2[k])
1212 if(is_short_cons(j, seed_len) == 1)
1214 j=-1;
1215 continue;
1217 break;
1219 seq[j] = s1[i];
1221 seq[j]='\0';
1222 return j;
1225 int is_hete_bubble(char* s1, char* s2, int now_base_depth, int diff, int seed_len, int threadId)
1227 int nlen = strlen(s1);
1228 char seq[nlen + 1];
1230 int i=nlen-1, j, k=i;
1231 j = get_consist_seq(i, k, s1, s2, seq, seed_len);
1233 if(is_short_cons(j, seed_len) == 1)
1235 if(s1[i] == s2[k-1])
1236 { k--;
1237 j = get_consist_seq(i, k, s1, s2, seq, seed_len);
1238 k++;
1241 if(is_short_cons(j, seed_len) == 1 && s1[i-1] == s2[k])
1243 i--;
1244 j = get_consist_seq(i, k, s1, s2, seq, seed_len);
1246 }else if(diff > 4)
1247 return 0;
1249 if(is_short_cons(j, seed_len) == 1)
1250 return 0;
1251 return dual_bubble_check(seq, now_base_depth, threadId);
1254 int solve_by_seed(FILE* logs, char* cseq, char* st, char* s1, char* s2, int p, double d1, int d2, double diff, int seed_len, int max_level, int threadId)
1256 int nlen = p -1 ;
1257 int tlen = Read_length - 1 - nlen;//strlen(s1);
1259 int i,j, slen = strlen(s1);
1260 for(i=1; i< slen; i++)
1262 if(s1[i] != s2[i])
1263 break;
1264 if(g_levs[threadId][p+i-1].c[ charMap[s1[i]] ] < Error_depth)
1266 i = slen;
1267 break;
1271 if(i == slen || i+p-1 >= max_level)
1272 slen = 1;
1273 else
1274 slen = i + 1;
1276 // fprintf(logs, "i %d, p %d, max_level %d, slen %d\n", i, p, max_level, slen);
1277 tlen = Read_length - nlen - slen;
1278 if(strlen(cseq) < tlen)
1279 tlen = strlen(cseq);
1281 char seq1[tlen + nlen + slen+1];
1282 char seq2[tlen + nlen + slen+1];
1284 for(i=0, j=strlen(cseq)-tlen; i<tlen; i++, j++)
1286 seq1[i] = complementMap[ cseq[j] ];
1287 seq2[i] = complementMap[ cseq[j] ];
1290 //st starting from 1.
1291 for(j=1; j< p; j++, i++)
1293 seq1[i] = complementMap[ st[j] ];
1294 seq2[i] = complementMap[ st[j] ];
1297 for(j=0; j< slen; j++, i++)
1299 seq1[i] = complementMap[ s1[j] ];
1300 seq2[i] = complementMap[ s2[j] ];
1302 seq1[i]='\0'; seq2[i]='\0';
1303 //fprintf(logs, "seq1: %s seq2: %s\n", seq1, seq2);
1304 int type = dual_seq_check(logs, seq1, seq2, nlen + seed_len + slen, d1, strlen(s1), diff, threadId);
1305 return type;
1309 int is_long_repeat_seed(int depth, int len);
1311 int high_confident_err(int seed_len, int depth0, int p, int prev_depth, int p_depth, int c_depth, int diff)
1313 if(seed_len < depth_half_length && seed_len + p < depth_9_length
1314 && depth0 < Expect_depth*(Read_length-seed_len+1)/Read_length && c_depth == 1
1315 && diff == 1 && prev_depth - p_depth < Error_depth)
1316 return 1;
1317 return 0;
1320 int set_reads_cons(int *s, int p, int flag, int threadId)
1322 int i;
1323 for(i=0; i< s[2]; i++)
1325 if(g_reads[threadId][i].level == p && g_reads[threadId][i].node == g_levs[threadId][p-1].node && g_reads[threadId][i].cons != flag)
1327 //fprintf(stdout, "set cons %d\n", g_reads[threadId][i].id);
1328 g_reads[threadId][i].cons = flag;
1333 int is_risk_tag(char* seed_seq, int flag)
1335 int i, j;
1336 if(flag == 1)
1338 if(seed_seq[0] == 'C' && seed_seq[1] == 'G' && seed_seq[2] == 'G')
1339 return 1;
1340 for(i=0; i< 10; i++)
1342 if(seed_seq[i] == 'C')
1344 for(j=i+1; j<12; j++)
1346 if(seed_seq[j] != 'G')
1347 break;
1349 if(j-i > 2)
1350 return 1;
1353 return 0;
1355 }else{
1356 if(seed_seq[0] == 'G' && seed_seq[1] == 'C' && seed_seq[2] == 'C')
1357 return 1;
1358 for(i=0; i< 10; i++)
1360 if(seed_seq[i] == 'G')
1362 for(j=i+1; j<12; j++)
1364 if(seed_seq[j] != 'C')
1365 break;
1367 if(j-i > 2)
1368 return 1;
1371 return 0;
1373 return 0;
1376 int is_strand_bias(ULL plus, ULL minus)
1378 if((plus == 0 && minus > 3* Consensus_depth)|| (plus > 0 && plus < Error_depth/2 && plus * 3* Consensus_depth < minus))
1379 return 1;
1381 if((minus == 0 && plus > 3 * Consensus_depth)|| (minus > 0 && minus< Error_depth/2 && minus *3* Consensus_depth < plus))
1382 return 2;
1384 return 0;
1386 int is_illumina_error(FILE* logs, char* seed_seq, Node &n, Node &nc, int now_level_depth, int branch_depth)
1388 int l = strlen(seed_seq);
1389 int i, j;
1391 int risk_plus = is_risk_tag(seed_seq, 1);
1392 int risk_minus = is_risk_tag(seed_seq, 0);
1394 if(risk_plus == 0 && risk_minus == 0)
1395 return 0;
1397 ULL plus_depth = get_strand_depth(n.b, 1);
1398 ULL minus_depth = get_strand_depth(n.b, 0);
1400 ULL cons_plus_depth = get_strand_depth(nc.b, 1);
1401 ULL cons_minus_depth = get_strand_depth(nc.b, 0);
1403 int strand_bias = is_strand_bias(plus_depth, minus_depth);
1404 int cons_strand_bias = is_strand_bias(cons_plus_depth, cons_minus_depth);
1406 // fprintf(logs, "risk plus %d risk minus %d; strand bias %d cons strand bias %d, level depth %d branch depth %d; nc depth %d nc len %d.\n", risk_plus, risk_minus, strand_bias, cons_strand_bias, now_level_depth, branch_depth, nc.depth, nc.len);
1408 if(cons_strand_bias == strand_bias)
1409 return 0;
1411 int risk_conf=0, risk_cons=0;
1412 if((risk_plus == 1 && strand_bias == 2) || (risk_minus == 1 && strand_bias == 1))
1413 risk_conf = 1;
1415 if((risk_plus == 1 && cons_strand_bias == 2) || (risk_minus == 1 && cons_strand_bias == 1))
1416 risk_cons = 1;
1418 if(risk_conf == 1 && risk_cons == 1)
1420 if((plus_depth + minus_depth) == 1 && branch_depth > Error_depth && now_level_depth - (cons_plus_depth + cons_minus_depth) < Error_depth/2)
1421 return 2;
1422 return 1;
1424 if(risk_conf == 1)
1425 return 1;
1427 if(risk_cons == 1 && now_level_depth - (cons_plus_depth + cons_minus_depth) < Error_depth/2 && (now_level_depth > Error_depth || nc.len == 1))
1428 return 2;
1430 return 0;
1433 int find_last_lev(FILE* logs, int* s, char* ctg_seq, char* seed_seq, int seed_len, int ctg_len, int initial_depth, int* is_unique, int* has_repeat_break, int threadId)
1435 int i, j, k, is_depth = 0, has_other =0, has_other_depth=0;
1436 int solved[s[1]];
1438 int back_len = 0, sub_node=0, sub_node_level=0;
1439 char s1[Read_length], s2[Read_length], st[Read_length];
1440 int is_tip, is_solved;
1441 int is_low_depth = initial_depth < (double(Read_length-seed_len+1)*Expect_depth)/double(Read_length)/2.5 ? 1 : 0;
1442 int is_high_depth = initial_depth > (double(Read_length-seed_len+1)*Expect_depth)/double(Read_length) ? 1 : 0;
1443 double current_base_depth = double(initial_depth * Read_length)/double(Read_length-seed_len+1);
1444 if(current_base_depth > Expect_depth)
1445 current_base_depth = Expect_depth;
1447 for(i=1; i<s[1]; i++)
1449 solved[i] = 1;
1450 double curr_depth = double(current_base_depth*(Read_length-(seed_len+i)+1))/double(Read_length);
1451 //checking no-branch issue.
1452 if(g_levs[threadId][i].depth < Error_depth //low depth.
1453 || (g_levs[threadId][i].nnum <= 3 && 3 * g_levs[threadId][i].depth <= initial_depth) //too many branches with high depth.
1454 || ( i>1
1455 && (
1456 (g_levs[threadId][i].node != g_levs[threadId][i-1].node && g_nodes[threadId][ g_levs[threadId][i].node ].parent != g_levs[threadId][i-1].node) //path changed.
1457 || (g_levs[threadId][i].nnum == 1 && curr_depth < 2 * Expect_depth && g_levs[threadId][i].depth < 2 * Error_depth) //low depth region, base by base extension.
1458 || (g_levs[threadId][i].nnum <= 2 && //has confliction here.
1459 ( quick_decrease(g_nodes[threadId], g_levs[threadId], i, seed_len) == 1 //quick decrease of depth.
1460 || (g_levs[threadId][i].depth <= 3 * Error_depth && curr_depth <= 3*Error_depth ) //depth too low to solve this branch easily.
1461 || (g_levs[threadId][i].depth <= 3 * Error_depth && 2 * g_levs[threadId][i].depth <= initial_depth ) //clear branch and low depth.
1462 || (is_low_depth == 1 && 2 * g_levs[threadId][i].depth <= initial_depth)
1470 is_depth = 1;
1471 break;
1473 int changed_consensus = 0;
1474 //checking branching issue.
1475 for(j=0; j<4; j++)
1477 if(g_levs[threadId][i].c[j] == 0 || j == g_levs[threadId][i].base)
1478 continue;
1479 //having one branch.
1480 solved[i] = 0;
1481 if(DEBUG) fprintf(logs, "seed_len %d, i %d, total len %d, depth %d, conf_depth %d, tobase %d, cur depth %.1f\n", seed_len, i, seed_len + i, g_levs[threadId][i].depth, g_levs[threadId][i].c[j], j, curr_depth);
1483 //long extension break, reduce mismatches.
1484 if( i> 1 && (g_levs[threadId][i].depth + g_levs[threadId][i].c[j]) * 2 < initial_depth && g_levs[threadId][i].depth <= 3* Error_depth)
1486 if(DEBUG) fprintf(logs, "Decrease to low.\n");
1487 break;
1490 //careful solving cases, increase length.
1491 if( is_low_depth == 1 && i > 1 && initial_depth - g_levs[threadId][i].depth > g_levs[threadId][i].depth/2 && initial_depth - g_levs[threadId][i].depth > 2*Error_depth && g_levs[threadId][i].c[j] >= Error_depth)
1493 if(DEBUG) fprintf(logs, "Branches in low coverage regions.\n");
1494 is_depth = 1;
1495 break;
1498 //having been solved.
1499 if(g_nodes[threadId][ g_levs[threadId][i].node ].level - g_nodes[threadId][ g_levs[threadId][i].node ].len + 1 != i)
1501 if(DEBUG) fprintf(logs, "Innode.\n");
1503 if(curr_depth <= 2* Error_depth || g_levs[threadId][i].depth <= 2*Error_depth)
1504 break;
1506 solved[i] = 1;
1507 continue;
1510 has_other = 0; has_other_depth=0;
1511 for(k=j+1; k< 4; k++)
1513 if(k == g_levs[threadId][i].base)
1514 continue;
1515 if(g_levs[threadId][i].c[k] >= Error_depth)
1517 has_other = 1; has_other_depth += g_levs[threadId][i].c[k];
1520 if(has_other == 1 && has_other_depth + g_levs[threadId][i].c[j] > g_levs[threadId][i].depth/2 && curr_depth < Expect_depth/5 && curr_depth < current_base_depth/3)
1522 if(DEBUG) fprintf(logs, "Too much confliction and too long extension.\n");
1523 break;
1525 //get current branch caused sub-node.
1526 s1[0]='\0';
1527 sub_node = get_branch_node(logs, g_nodes[threadId], g_levs[threadId], i, j, s[0]);
1530 //no new sub-node from the main branch.
1531 if(sub_node == 0)
1533 s1[0]=dnaChar[j]; s1[1]='\0';
1534 s2[0]=dnaChar[g_levs[threadId][i].base]; s2[1]='\0';
1535 is_solved = solve_by_seed(logs, ctg_seq, st, s1, s2, i, current_base_depth, g_levs[threadId][i].depth, 0, seed_len, g_nodes[threadId][g_levs[threadId][i].node].level, threadId);
1536 if(DEBUG) fprintf(logs, "No sub node.\n");
1538 if(is_solved == -1 && has_other == 1)
1540 changed_consensus = 1;
1541 if(DEBUG) fprintf(logs, "ChangeCons3 at %d with num %d\n", i, g_levs[threadId][i].nnum);
1542 set_reads_cons(s, i, 0, threadId);
1543 g_levs[threadId][i].base = j;
1544 g_levs[threadId][i].depth = g_levs[threadId][i].c[j];
1545 g_levs[threadId][i].node = sub_node;
1546 continue;
1549 if(is_solved != 0 && is_solved < 3)
1551 if(is_solved != 2)
1553 *is_unique = 0;
1554 *has_repeat_break = 1;
1555 if(is_solved == 1 && i > 1 && g_levs[threadId][i].c[j] == 1 && g_levs[threadId][i].depth <= 3 * Consensus_depth)
1556 break;
1557 if(is_solved == -1)
1559 if(DEBUG) fprintf(logs, "ChangeCons1 at %d with num %d\n", i, g_levs[threadId][i].nnum);
1560 set_reads_cons(s, i, 0, threadId);
1561 g_levs[threadId][i].base = j;
1562 g_levs[threadId][i].depth = g_levs[threadId][i].c[j];
1563 g_levs[threadId][i].node = sub_node;
1564 //set_reads_cons(s, i, 1, threadId);
1565 return i;
1569 solved[i] = 1;
1570 continue;
1572 break;
1575 if(i == 1 && Expect_depth > Error_depth * 10 && g_levs[threadId][i].depth + g_levs[threadId][i].c[j] < curr_depth * Upper_bound)
1577 //if(is_high_depth_sequencing_error(logs, g_nodes[threadId], g_levs[threadId], i, s, sub_node) == 1)
1578 //print_base(logs, g_nodes[threadId][sub_node].b);
1579 //print_base(logs, g_nodes[threadId][g_levs[threadId][i].node].b);
1580 int i_err = is_illumina_error(logs, seed_seq, g_nodes[threadId][sub_node], g_nodes[threadId][g_levs[threadId][i].node], g_levs[threadId][i].depth, g_levs[threadId][i].c[j]);
1581 if(i_err == 1)
1583 if(DEBUG) fprintf(logs, "Illumina sequencing error: %s.\n", seed_seq);
1584 solved[i] = 1;
1585 continue;
1587 if(i_err == 2 && is_low_depth == 0)
1589 if(DEBUG) fprintf(logs, "ChangeCons3 at %d with num %d seed %s.\n", i, g_levs[threadId][i].nnum, seed_seq);
1590 set_reads_cons(s, i, 0, threadId);
1591 g_levs[threadId][i].base = j;
1592 g_levs[threadId][i].depth = g_levs[threadId][i].c[j];
1593 g_levs[threadId][i].node = sub_node;
1594 return i;
1598 //get the sequences of two branches, one is main, the other is the new branch.
1599 int diff = get_pair_seq(logs, g_nodes[threadId], g_levs[threadId], i, s, s1, s2, sub_node);
1600 if(strlen(s1) == 0)
1602 if(DEBUG) fprintf(logs, "treat as sequencing error.\n");
1603 solved[i] = 1;
1604 continue;
1606 double dif_ratio = double(diff)/double(strlen(s1));
1607 if(DEBUG) fprintf(logs, "s1 %s, s2 %s diff %d len %d\n", s1, s2, diff, strlen(s1));
1610 if(dif_ratio < 0.2 && high_confident_err(seed_len, initial_depth, i, g_levs[threadId][i-1].depth, g_levs[threadId][i].depth, g_levs[threadId][i].c[j], diff) == 1)
1612 solved[i] = 1;
1613 continue;
1615 if(DEBUG) fprintf(logs, "lowconf\n");
1617 //double curr_depth = double(Expect_depth*(Read_length-(seed_len+i)+1))/double(Read_length);
1618 if(g_levs[threadId][i].c[j] <= Error_depth && g_levs[threadId][i].depth > 2*Error_depth && curr_depth > 2 * Error_depth
1619 && g_levs[threadId][i].c[j] < g_levs[threadId][i].depth * 0.1
1620 && !((seed_len > depth_half_length && (curr_depth * 2 < g_levs[threadId][i].depth || (dif_ratio > 0.05))))// || (dif_ratio > 0.1 )))
1621 && !(diff >= 5)
1624 solved[i] = 1;
1625 continue;
1627 if(DEBUG) fprintf(logs, "NotLargeDif&LowBranch\n");
1629 //checking whether this is a tip or solved by increasing seed length.
1630 is_solved = solve_by_seed(logs, ctg_seq, st, s1, s2, i, current_base_depth , g_levs[threadId][i].depth, dif_ratio, seed_len, g_nodes[threadId][g_levs[threadId][i].node].level, threadId);
1632 if(is_solved == -1 && has_other == 1)
1634 changed_consensus = 1;
1635 if(DEBUG) fprintf(logs, "ChangeCons4 at %d with nnum %d \n", i, g_levs[threadId][i].nnum);
1636 set_reads_cons(s, i, 0, threadId);
1637 g_levs[threadId][i].base = j;
1638 g_levs[threadId][i].depth = g_levs[threadId][i].c[j];
1639 g_levs[threadId][i].node = sub_node ;
1640 continue;
1642 if(is_solved != 0 && is_solved < 3 && !(is_solved == -1 && has_other == 1))
1644 if(is_solved != 2)
1646 *is_unique = 0;
1647 //break;
1648 *has_repeat_break = 1;
1649 if(is_solved == 1 && i > 1 && g_levs[threadId][i].c[j] == 1 && g_levs[threadId][i].depth <= 3 * Consensus_depth)
1650 break;
1651 if(is_solved == -1)
1653 if(DEBUG) fprintf(logs, "ChangeCons2 at %d with nnum %d \n", i, g_levs[threadId][i].nnum);
1654 set_reads_cons(s, i, 0, threadId);
1655 g_levs[threadId][i].base = j;
1656 g_levs[threadId][i].depth = g_levs[threadId][i].c[j];
1657 g_levs[threadId][i].node = sub_node ;
1658 //set_reads_cons(s, i, 1, threadId);
1659 return i;
1662 solved[i] = 1;
1663 continue;
1666 if(is_solved == 5 && has_other == 1)
1668 solved[i] = 1;
1669 continue;
1671 if(DEBUG) fprintf(logs, "Unsolved.\n");
1673 if(Solve_Hete == 1 && curr_depth > 2 * Error_depth //&& is_low_depth == 0
1674 && (g_levs[threadId][i].depth + g_levs[threadId][i].c[j] <= Upper_bound * curr_depth || strlen(s1) > seed_len * 2)
1675 && is_short_cons(strlen(s1), seed_len) == 0
1678 if(is_hete_bubble(s1, s2, current_base_depth, diff, seed_len, threadId) == 1)
1680 solved[i] = 1;
1681 continue;
1682 }else if(diff == 1 && i+ seed_len <= MINOVERLAP + 1 && strlen(s1) >= MINOVERLAP && g_levs[threadId][i].depth + g_levs[threadId][i].c[j] <= Upper_bound * curr_depth)
1684 solved[i] = 1;
1685 continue;
1689 if(DEBUG) fprintf(logs, "NotHete\n");
1690 //masked at Sep 18.2014 for YH assembly.
1691 if(Solve_Hete == 1
1692 && (is_solved >= 3 && is_solved <5)
1693 && ((diff < 3 && dif_ratio < 0.2) || (diff< 5 && dif_ratio <= 0.05))
1694 && g_levs[threadId][i].depth + g_levs[threadId][i].c[j] <= curr_depth
1697 if(is_solved == 4 && (g_levs[threadId][i].c[j] == g_levs[threadId][i].depth || g_levs[threadId][i].c[j] > Error_depth))
1699 if(DEBUG) fprintf(logs, "ChangeCons3 at %d with nnum %d \n", i, g_levs[threadId][i].nnum);
1700 set_reads_cons(s, i, 0, threadId);
1701 g_levs[threadId][i].base = j;
1702 g_levs[threadId][i].depth = g_levs[threadId][i].c[j];
1703 g_levs[threadId][i].node = sub_node ;
1704 return i;
1707 solved[i] = 1;
1708 continue;
1710 if(is_solved != 3 && is_solved != 4)
1711 *has_repeat_break = 1;
1713 if(DEBUG) fprintf(logs, "NotHete2\n");
1714 //solving repeat by Pair end information.
1715 if(Solve_Conf == 1 && i == 1 )//&& !(g_levs[threadId][i].c[j] < Error_depth && g_levs[threadId][i].depth > 2 * Error_depth && g_levs[threadId][i].depth + g_levs[threadId][i].c[j] < 4 * Error_depth && diff >= 5))
1717 is_solved = solve_conf(logs, g_nodes[threadId], g_levs[threadId], g_reads[threadId], s, seed_len, ctg_len, threadId, diff);
1718 if(is_solved == 1)
1720 if(DEBUG) fprintf(logs, "conf_solved.\n");
1721 solved[i] = 1;
1722 return i;
1723 //return find_last_lev(logs, s, ctg_seq, seed_seq, seed_len, ctg_len, initial_depth, is_unique, threadId);
1727 if(DEBUG) fprintf(logs, "CONF fail.\n");
1728 if(i == 1)
1730 //fprintf(logs, " %d,%d,%d",seed_len, g_levs[threadId][i].depth, g_levs[threadId][i].c[j]);
1731 i=-1;
1732 return i;
1734 break;
1737 //checking have branch or not.
1738 if(solved[i] == 0)// || seed_len + i >= depth_6_length || levs[i].depth <= 2 * Error_depth)
1740 break;
1742 st[i] = dnaChar[g_levs[threadId][i].base];
1744 i--;
1745 return i;
1748 int iter_consensus(FILE* logs, int* s, int *len, char* ctg_seq, char* seed_seq, int seed_len, int ctg_len, int keep_used, int initial_depth, int iter, int* is_unique, int threadId)
1750 int i, j=0, k, keep_used_extension=0, has_used = 0, has_repeat_break = 0;
1752 i = find_last_lev(logs, s, ctg_seq, seed_seq, seed_len, ctg_len, initial_depth, is_unique, &has_repeat_break, threadId);
1753 if( i <= 0) //lowdepth2.
1755 s[0]=0; s[1]=0; s[2]=0; *len = 0;
1756 if(i<0)
1757 return 5;
1758 return 3;
1760 //s is updated here. Used reads.
1761 i = mask_last_reads(logs, i, s, ctg_seq, keep_used, &keep_used_extension, &has_used, ctg_len, iter, *is_unique, threadId, has_repeat_break, seed_len);
1763 if(i <=0 )
1765 *len = 0;
1766 if(i == -100)
1767 return 10;
1769 if(i == 0 && keep_used_extension == 0)
1770 return 4;
1771 return 2;
1774 i = g_levs[threadId][i].node;
1775 j = 0;
1777 while(1)
1779 for(k=g_nodes[threadId][i].len-1; k>=0; k--, j++)
1780 g_seq[threadId][j] = g_nodes[threadId][i].seq[k];
1782 if(i==0 && i == g_nodes[threadId][i].parent)
1783 break;
1784 i = g_nodes[threadId][i].parent;
1786 *len = j;
1788 if(j==0)
1790 fprintf(stderr, "[%d,%d]",i, g_nodes[threadId][i].len);
1793 g_seq[threadId][j]='\0';
1795 if(keep_used_extension == 1)
1796 return 7;
1797 if(has_used == 1)
1798 return 4;
1800 return 1;
1804 Update reads used in this contigs.
1807 ULL get_pair_id(ULL id)
1809 if(id%2 == 0)
1810 return id+1;
1811 return id-1;
1815 int is_unique(int depth, int len)
1817 return depth > double(Read_length-len+1)/double(Read_length)*double(Expect_depth)*Upper_bound ? 0 : 1;
1820 int is_repeat(int depth, int len)
1822 return depth > (3- Upper_bound)*double(Read_length-len+1)/double(Read_length)*double(Expect_depth) ? 1 : 0;
1825 int is_long_repeat_seed(int depth, int len)
1827 return (len >= depth_9_length && depth >= 2 * double(Read_length-len+1)/double(Read_length)*double(Expect_depth));
1830 void free_thread_tmpRead(int threadId, int flag)
1832 int i;
1833 if(flag == 1)
1835 for(i=0; i< tmpCount[threadId]; i++)
1837 if(tmpRead[threadId][i].keep == 0)
1838 continue;
1839 clean_use_bwt_sparse(tmpRead[threadId][i].id, tmpRead[threadId][i].size);
1840 endInf[threadId].ur--;
1841 tmpRead[threadId][i].keep = 0;
1843 }else{
1844 for(i=0; i< tmpCount[threadId]; i++)
1846 if(tmpRead[threadId][i].keep == 0)
1847 continue;
1848 int flag = clean_current_bwt_sparse(tmpRead[threadId][i].id, tmpRead[threadId][i].size);
1851 tmpCount[threadId] = 0;
1855 Update contig and get the new iteration seed using the extended sequence.
1858 void str_copy(char* seq, char* from, int start, int len)
1860 int i, j, l=0;
1861 for(i=start,j=0,l=0; i< strlen(from) && l<len; i++,j++,l++)
1863 seq[j]=from[i];
1865 seq[j]='\0';
1868 void reverse_com(char* from, char* to, int len)
1870 int i,j;
1871 for(i=0; i<len ; i++)
1873 to[i] = complementMap[ from[len-i-1]];
1875 to[i]='\0';
1878 void clean_trimed_reads(int ctg_len, int read_num, int threadId)
1880 int i=0, j=0;
1881 for(i = tmpCount[threadId] -1; i>=0 && j < read_num; i--, j++)
1883 if(tmpRead[threadId][i].pos < ctg_len)
1884 break;
1885 if(tmpRead[threadId][i].keep == 0)
1886 continue;
1888 clean_use_bwt_sparse(tmpRead[threadId][i].id, tmpRead[threadId][i].size);
1889 endInf[threadId].ur--;
1890 tmpRead[threadId][i].keep = 0;
1895 int update_ctg(int len, int cur_iter, int read_num, int threadId)
1897 //update the sequence of ctg.
1899 if(len + ctgs[threadId][ctg_num[threadId]].len >= MaxCtgLen)
1901 fprintf(stderr, "Too long contigs with length %d, %d\n", ctgs[threadId][ctg_num[threadId]].len, len);
1902 return len;
1904 int i=ctgs[threadId][ctg_num[threadId]].len, j=0;
1905 int oldctglen = i;
1906 for(; j < len ; i++, j++)
1908 ctgs[threadId][ctg_num[threadId]].seq[i] = g_seq[threadId][len - 1 - j];
1910 ctgs[threadId][ctg_num[threadId]].seq[i]='\0';
1911 ctgs[threadId][ctg_num[threadId]].len = i;
1913 //get the new seed.
1914 if(cur_iter < 1)
1916 fprintf(stderr, "Please check: %d\n", cur_iter);
1917 exit(2);
1919 char tmpseq[len + g_iters[threadId][cur_iter-1].slen];
1920 char rctmpseq[len + g_iters[threadId][cur_iter-1].slen];
1922 i=0;
1923 for(j=0; j<len; i++, j++)
1924 tmpseq[i] = g_seq[threadId][j];
1926 for(j=0; j< g_iters[threadId][cur_iter-1].slen; i++, j++)
1927 tmpseq[i] = g_iters[threadId][cur_iter-1].seq[j];
1929 tmpseq[i]='\0';
1930 int tmplen = i;
1931 reverse_com(tmpseq, rctmpseq, tmplen);
1932 int end = tmplen - 1;
1933 for(i=0; i< bwtCount; i++)
1935 tmpbase[threadId][i].saL = 0; tmpbase[threadId][i].saR = 0;
1936 tmpbase[threadId][i].plus = 0; tmpbase[threadId][i].minus = 0;
1937 tmpbase[threadId][i].saLL = 0; tmpbase[threadId][i].saRR = 0;
1939 unsigned long long depth=0, tdepth;
1941 int prev_repeat = is_repeat(g_iters[threadId][cur_iter-1].depth, g_iters[threadId][cur_iter-1].slen);
1942 int trim_len = 0, rep_start = -1;
1944 if(DEBUG) fprintf(stdout, "[iter] %d\n", cur_iter-1);
1945 int start = extend_backwards_rep(rctmpseq, 0, end, tmpbase[threadId], &(depth), &rep_start);
1946 if(DEBUG) fprintf(stdout, "[iter] %d,d=%d,l=%d, r=%d, start=%d, depth=%d\n", cur_iter-1, g_iters[threadId][cur_iter-1].depth, g_iters[threadId][cur_iter-1].slen, prev_repeat, start, depth);
1948 if((start < 0 && depth > Error_depth) || prev_repeat ==1)
1951 if(rep_start != -1)
1953 end = tmplen - rep_start -1;
1954 }else
1955 end = strlen(tmpseq) - 1;
1957 if(end >= strlen(tmpseq))
1958 fprintf(stdout, "T1: %s, %d; new seq %s; iter seq %s; newlen %d, seedlen %d, end %d; iter %d\n", tmpseq, strlen(tmpseq), g_seq[threadId], g_iters[threadId][cur_iter-1].seq, len, g_iters[threadId][cur_iter-1].slen, end, cur_iter-1);
1959 set_SAranges(tmpseq, 0, end, tmpbase[threadId], &(depth));
1960 }else{
1962 if(prev_repeat == 1)
1964 int break_start;
1965 fprintf(stdout, "Iter %d PrevRepeat: %d, %d; ", cur_iter, g_iters[threadId][cur_iter-1].depth, g_iters[threadId][cur_iter-1].slen);
1966 while(start >= 0 && end > g_iters[threadId][cur_iter-1].slen)
1968 end --;
1969 start = extend_backwards(rctmpseq, 0, end, tmpbase[threadId], &(depth), &break_start);
1971 //fprintf(stdout, "Findstart %d ", start);
1973 if(start <0)
1975 trim_len = tmplen - 1 - end;
1976 end = MINOVERLAP - 1 + trim_len;
1977 set_SAranges(tmpseq, trim_len, end, tmpbase[threadId], &(depth));
1978 while(is_repeat(depth, MINOVERLAP) == 0 && end < tmplen -1 && trim_len < len)
1980 trim_len ++;
1981 end ++;
1982 set_SAranges(tmpseq, trim_len, end, tmpbase[threadId], &(depth));
1984 if(end == tmplen -1 || trim_len == len)
1986 fprintf(stdout, "all trimed0.\n");
1987 ctgs[threadId][ctg_num[threadId]].len = oldctglen;
1988 ctgs[threadId][ctg_num[threadId]].seq[ ctgs[threadId][ctg_num[threadId]].len ] = '\0';
1989 return len;
1991 fprintf(stdout, "FinalTrim%d and %d\n", trim_len, end);
1992 ctgs[threadId][ctg_num[threadId]].len -= trim_len;
1993 ctgs[threadId][ctg_num[threadId]].seq[ ctgs[threadId][ctg_num[threadId]].len ] = '\0';
1994 }else{
1995 fprintf(stdout, "all trimed1.\n");
1996 ctgs[threadId][ctg_num[threadId]].len = oldctglen;
1997 ctgs[threadId][ctg_num[threadId]].seq[ ctgs[threadId][ctg_num[threadId]].len ] = '\0';
1998 return len;
2000 }else{
2001 //find a unique seed for the new iteration.
2002 if(depth <= Error_depth && g_iters[threadId][cur_iter-1].depth > Error_depth + 1)
2004 trim_len = 1; int start_trim=0, now_rep_start = -1;
2005 if(DEBUG) fprintf(stdout, "Iter %d PrevRepeat: %d, %d; ", cur_iter, g_iters[threadId][cur_iter-1].depth, g_iters[threadId][cur_iter-1].slen);
2006 while(depth <= Error_depth && trim_len < len )
2008 now_rep_start = -1;
2009 if(DEBUG) fprintf(stdout, "Ending at %d\n", end-trim_len);
2010 start_trim = extend_backwards_rep(rctmpseq, 0, end-trim_len, tmpbase[threadId], &(depth), &now_rep_start);
2011 if(DEBUG) fprintf(stdout, "End got depth %d, start %d\n", depth, start_trim);
2012 trim_len++;
2015 if(trim_len <= len && depth > Error_depth)
2017 trim_len--;
2018 if(DEBUG) fprintf(stdout, "LowTrim%d \n", trim_len);
2019 ctgs[threadId][ctg_num[threadId]].len -= trim_len;
2020 ctgs[threadId][ctg_num[threadId]].seq[ ctgs[threadId][ctg_num[threadId]].len ] = '\0';
2021 if(start_trim >= 0)
2022 end = tmplen -1 - start_trim;
2023 else{
2024 if(now_rep_start != -1)
2025 end = tmplen -1 - now_rep_start;
2026 else
2027 end = strlen(tmpseq) -1;
2029 clean_trimed_reads(ctgs[threadId][ctg_num[threadId]].len, read_num, threadId);
2030 }else{
2031 if(DEBUG) fprintf(stdout, "Lowtrim fail with trim_len %d and start_trim %d, final depth %lld\n", trim_len, start_trim, depth);
2033 trim_len = 0; end = tmplen - 1 - start;
2034 if(start < 0)
2036 if(rep_start != -1)
2037 end = tmplen -1 - rep_start;
2038 else
2039 end = strlen(tmpseq) -1;
2042 }else{
2043 trim_len = 0; end = tmplen - 1 - start;
2044 if(start < 0)
2046 if(rep_start != -1)
2048 end = tmplen - rep_start -1;
2049 // fprintf(stdout, "New seed length2: %d for iter %d\n", end+1, cur_iter);
2050 }else
2051 end = strlen(tmpseq) -1;
2054 if(end >= strlen(tmpseq))
2055 fprintf(stdout, "T2: %s, %d %d; iter %d\n", tmpseq, strlen(tmpseq), end, cur_iter-1);
2056 set_SAranges(tmpseq, trim_len, end, tmpbase[threadId], &(depth));
2060 set_base(g_iters[threadId][cur_iter].b, tmpbase[threadId]);
2062 if(depth > MAXDEPTH)
2063 g_iters[threadId][cur_iter].depth = MAXDEPTH;
2064 else
2065 g_iters[threadId][cur_iter].depth = depth;
2067 g_iters[threadId][cur_iter].slen = end - trim_len + 1;
2068 str_copy(g_iters[threadId][cur_iter].seq, tmpseq, trim_len, g_iters[threadId][cur_iter].slen);
2070 g_iters[threadId][cur_iter].rnum = 0;
2071 g_iters[threadId][cur_iter].end = 0;
2072 g_iters[threadId][cur_iter].pe = 0;
2073 g_iters[threadId][cur_iter].len = 0;
2075 return trim_len;
2079 Using PE information to improve iteration.
2080 iter_num >=0, read_num >=0.
2082 Output:
2083 Fill in ctg, tmpRead, iters.
2085 void print_inf(FILE* fp, Node* nodes, Lev* levs, ReadId* reads, int* s, int cur_len)
2087 fprintf(fp, "current contig length: %d\n", cur_len);
2088 fprintf(fp, "Node:\n");
2089 int i;
2090 for(i=0; i< s[0]; i++)
2091 fprintf(fp, "%d\t%s,%d,%d,%d,%d,%d\n",i, nodes[i].seq, nodes[i].len, nodes[i].level, nodes[i].depth, nodes[i].parent, nodes[i].is_end);
2093 fprintf(fp, "Read:\n");
2094 for(i=0; i< s[2]; i++)
2095 fprintf(fp, "%d\t%d,%d,%d,%d,%d\n",i, reads[i].id, reads[i].node, reads[i].level, reads[i].cons, reads[i].strand);
2097 fprintf(fp, "Lev:\n");
2098 for(i=1; i< s[1]; i++)
2099 fprintf(fp, "%d\t%d,%d,%d,%d\n",i, levs[i].base, levs[i].depth, levs[i].node, levs[i].nnum);
2103 void mask_branch_reads( Node* nodes, Lev* levs, ReadId* reads, int* s)
2105 int i;
2106 int node_type[s[0]];
2107 for(i=0; i< s[0]; i++)
2108 node_type[i]=0;
2110 for(i=1; i< s[1]; i++)
2112 node_type[ levs[i].node ]=1;
2115 for(i=0; i< s[2]; i++)
2117 if(node_type[ reads[i].node ] == 0)
2119 reads[i].node=0; reads[i].level=0;
2124 int backward_extending_bwtpe(FILE* logs, int is_first, int threadId)
2126 int cur_iter = iterCount[threadId] + 1;
2127 //fprintf(stderr, "thread%d and cur %d\n", threadId, *iter_num);
2128 if(cur_iter > MaxIterNum-2)
2130 if(DEBUG) fprintf(logs, " TooManyIteration");
2131 g_iters[threadId][cur_iter-1].end = 4; iterCount[threadId] = cur_iter;
2132 return 0;
2135 if(g_iters[threadId][cur_iter-1].depth < Error_depth)
2137 if(DEBUG) fprintf(logs, " LowDepth1");
2138 g_iters[threadId][cur_iter-1].end = 0; iterCount[threadId] = cur_iter;
2139 return 0;
2142 if(g_iters[threadId][cur_iter-1].depth >= MAXDEPTH)
2144 if(DEBUG) fprintf(logs, " HighDepth%d,%d", g_iters[threadId][cur_iter-1].depth, g_iters[threadId][cur_iter-1].slen);
2145 g_iters[threadId][cur_iter-1].end = 2; iterCount[threadId] = cur_iter;
2146 return 0;
2149 if((cur_iter > 1 && strcmp(g_iters[threadId][cur_iter-1].seq, g_iters[threadId][cur_iter-2].seq) == 0)
2150 || (cur_iter >2 && strcmp(g_iters[threadId][cur_iter-1].seq, g_iters[threadId][cur_iter-3].seq) == 0)
2151 || (cur_iter >3 && strcmp(g_iters[threadId][cur_iter-1].seq, g_iters[threadId][cur_iter-4].seq) == 0)
2154 if(DEBUG) fprintf(logs, " SameSeed: now seed %s, old1 %s\n", g_iters[threadId][cur_iter-1].seq, g_iters[threadId][cur_iter-2].seq);
2155 g_iters[threadId][cur_iter-1].end = 5; iterCount[threadId] = cur_iter;
2156 return 0;
2159 if(cur_iter > 1)
2161 double depth1 = double(Read_length*g_iters[threadId][cur_iter-1].depth)/double(Read_length-g_iters[threadId][cur_iter-1].slen+1);
2162 double depth2 = double(Read_length*g_iters[threadId][cur_iter-2].depth)/double(Read_length-g_iters[threadId][cur_iter-2].slen+1);
2163 if(depth1 * 6 < depth2 && depth1 < Expect_depth/3)
2165 if(DEBUG) fprintf(logs, " LargeDecrease%.1f,%.1f",depth1, depth2);
2166 g_iters[threadId][cur_iter-1].end = 2; iterCount[threadId] = cur_iter;
2167 return 0;
2170 //define and initialize extending structures.
2171 int len = 0;
2172 int max_node_num = g_iters[threadId][ iterCount[threadId] ].depth * Read_length;
2174 int s[]={0,0,0};
2175 int i,j;
2176 clock_t start_time;
2178 for(i=0; i< max_node_num; i++)
2180 g_nodes[threadId][i].seq[0] = '\0';
2181 g_nodes[threadId][i].len = 0;
2182 g_nodes[threadId][i].level = 0;
2183 g_nodes[threadId][i].parent = 0;
2184 g_nodes[threadId][i].is_end = 0;
2185 g_nodes[threadId][i].conf = 0;
2186 set_blank_base( g_nodes[threadId][i].b);
2188 set_base(g_nodes[threadId][0].b, g_iters[threadId][ cur_iter - 1].b);
2190 g_nodes[threadId][0].depth = g_iters[threadId][ cur_iter - 1 ].depth;
2191 g_seq[threadId][0]='\0';
2192 s[0]++;
2194 for(i=0; i< Read_length; i++)
2196 g_levs[threadId][i].nnum = 0;
2197 for(j=0; j<5; j++)
2198 g_levs[threadId][i].c[j]=0;
2200 g_levs[threadId][0].node = 0;
2201 s[1]++;
2203 //fprintf(stderr, "Iter %d is fisrt %d with given depth %d from depth %d and calculated %d\n", cur_iter - 1, is_first, get_real_depth(g_nodes[threadId][0].b), get_real_depth(g_iters[threadId][cur_iter - 1].b), g_iters[threadId][cur_iter - 1].depth);
2204 //starting iterate.
2205 iterCount[threadId] = cur_iter;
2206 int is_continue = 1;
2207 start_time = clock();
2208 while(is_continue == 1 && s[1] + g_iters[threadId][cur_iter-1].slen <= Read_length + 1)
2210 is_continue = backward_search(logs, s, threadId);
2213 if(DEBUG)
2215 fprintf(logs, "thread=%d;%d,%d,%d,%d,%d,%d\n",threadId, cur_iter-1, g_iters[threadId][cur_iter-1].slen, g_iters[threadId][cur_iter-1].depth, s[0], s[1], s[2]);
2217 if(cur_iter < 7000)
2219 fprintf(logs, "\nIter %d, depth %.1f\n", cur_iter-1, double(Read_length*g_iters[threadId][cur_iter-1].depth)/double(Read_length-g_iters[threadId][cur_iter-1].slen+1));
2220 if(s[1] > 0)
2221 print_inf(logs, g_nodes[threadId], g_levs[threadId], g_reads[threadId], s, ctgs[threadId][ctg_num[threadId]].len);
2224 //fflush(stdout);
2226 //local consensus.
2227 int extension_type =1;
2228 int unique = is_unique(g_iters[threadId][cur_iter-1].depth, g_iters[threadId][cur_iter-1].slen);
2229 int repeat = is_repeat(g_iters[threadId][cur_iter-1].depth, g_iters[threadId][cur_iter-1].slen);
2230 int initial_read_num=s[2], is_now_unique = unique;
2232 extension_type = iter_consensus(logs, s, &len, ctgs[threadId][ctg_num[threadId]].seq, g_iters[threadId][cur_iter-1].seq, g_iters[threadId][cur_iter-1].slen, ctgs[threadId][ctg_num[threadId]].len, 0, g_iters[threadId][cur_iter-1].depth, cur_iter-1, &is_now_unique, threadId );
2234 if(extension_type == 10)
2236 //meet thread conflition.
2237 return 1;
2240 if(DEBUG) fprintf(logs, "thread%d Seq: %s, len %d, %d, %d, %d\n", threadId, g_seq[threadId], len, s[0], s[1], s[2]);
2242 int trim_len = 0;
2243 if(ctgs[threadId][ctg_num[threadId]].len > Read_length && extension_type == 4 && (is_now_unique == 1 || repeat == 0))// && ctgs[threadId][ctg_num[threadId]].len >= 2* Read_length)
2245 ctgs[threadId][ctg_num[threadId]].len = ctgs[threadId][ctg_num[threadId]].len - Read_length;
2246 ctgs[threadId][ctg_num[threadId]].seq[ctgs[threadId][ctg_num[threadId]].len] = '\0';
2247 clean_trimed_reads(ctgs[threadId][ctg_num[threadId]].len, tmpCount[threadId], threadId);
2248 trim_len = len;
2249 extension_type = 2;
2250 }else if(len > 0)
2252 trim_len = update_ctg(len, cur_iter, s[2], threadId);
2253 if(len == trim_len)
2255 s[2]=0;
2256 len = 0;
2257 trim_len = 0;
2258 extension_type = 6;
2260 }else if(extension_type == 1)
2261 extension_type = 0;
2263 //update ctg and iters.
2264 g_iters[threadId][cur_iter-1].len = len - trim_len;
2265 g_iters[threadId][cur_iter-1].rnum = s[2];
2266 g_iters[threadId][cur_iter-1].pe = s[0]>0;
2267 g_iters[threadId][cur_iter-1].end = 0;
2269 //further extending.
2270 switch (extension_type)
2272 case 0:
2273 //fprintf(logs, " NOEXTEND");
2274 g_iters[threadId][cur_iter-1].end = 4;
2275 break;
2276 case 1:
2277 //fprintf(stderr, "SIN");
2278 return backward_extending_bwtpe(logs, 0, threadId);
2279 break;
2280 case 2:
2281 if(DEBUG) fprintf(logs, "UsedRead");
2282 g_iters[threadId][cur_iter-1].end = 3;
2283 //backward_extending_bwtpe(ctg, tmpRead, iters, iter_num, read_num);
2284 break;
2285 case 3:
2286 if(DEBUG) fprintf(logs, "LowDepth2");
2287 g_iters[threadId][cur_iter-1].end = 1;
2288 break;
2289 case 4:
2290 if(DEBUG) fprintf(logs, "UsedRead2");
2291 g_iters[threadId][cur_iter-1].end = 6;
2292 break;
2293 case 5:
2294 if(DEBUG) fprintf(logs, "CB");
2295 g_iters[threadId][cur_iter-1].end = 7;
2296 break;
2297 case 6:
2298 if(DEBUG) fprintf(logs, " R2U");
2299 g_iters[threadId][cur_iter-1].end = 8;
2300 break;
2301 case 7:
2302 if(DEBUG) fprintf(logs, "CurrentUse");
2303 g_iters[threadId][cur_iter-1].end = 9;
2304 break;
2305 case 8:
2306 if(DEBUG) fprintf(logs, "Cut");
2307 g_iters[threadId][cur_iter-1].end = 10;
2308 break;
2309 case 9:
2310 if(DEBUG) fprintf(logs, "CurrentUse0");
2311 g_iters[threadId][cur_iter-1].end = 11;
2312 break;
2313 default:
2314 break;
2316 //fflush(logs);
2317 return 0;