modified: myjupyterlab.sh
[GalaxyCodeBases.git] / BGI / SOAPsnp / call_genotype.cc
bloba1cd3873058dc613d3db1d8eda1a973375f08935
1 #include "soap_snp.h"
2 #include <cassert>
4 int Call_win::initialize(ubit64_t start) {
5 std::string::size_type i;
6 for(i=0; i != read_len + win_size ; i++) {
7 sites[i].pos = i+start;
9 return 1;
11 //update 2010-12-20 by guyue
12 int Call_win::deep_init(ubit64_t start) {
13 memset(sites, 0, pos_size * (read_len + win_size));
14 int i;
15 for(i=0; i != read_len + win_size ; i++) {
16 sites[i].pos = i+start;
17 sites[i].ori = 0xFF;
18 //sites[i].depth = 0;
19 //sites[i].repeat_time = 0;
20 //sites[i].dep_uni = 0;
21 //memset(sites[i].count_uni,0,sizeof(int)*4);
22 //memset(sites[i].q_sum,0,sizeof(int)*4);
23 //// add by Bill
24 //memset(sites[i].count_sfs,0,sizeof(int)*4);
25 //memset(sites[i].base_info,0,sizeof(small_int)*4*2*64*256);
26 //memset(sites[i].count_all,0,sizeof(int)*4);
28 return 1;
30 //update 2010-12-20 by guyue
31 int Call_win::recycle()
33 std::string::size_type i;
34 //for(i=0; i != read_len ; i++)
35 //{
36 // sites[i].pos = sites[i+win_size].pos;
37 // sites[i].ori = sites[i+win_size].ori;
38 // sites[i].depth = sites[i+win_size].depth;
39 // sites[i].repeat_time = sites[i+win_size].repeat_time;
40 // sites[i].dep_uni = sites[i+win_size].dep_uni;
41 // memcpy(sites[i].base_info, sites[i+win_size].base_info, sizeof(small_int)*4*2*64*256); // 4 types of bases, 2 strands, max quality score is 64, and max read length 256
42 // memcpy(sites[i].count_uni, sites[i+win_size].count_uni, sizeof(int)*4);
43 // memcpy(sites[i].q_sum, sites[i+win_size].q_sum, sizeof(int)*4);
44 // memcpy(sites[i].count_all, sites[i+win_size].count_all, sizeof(int)*4);
45 // // add by Bill
46 // memcpy(sites[i].count_sfs, sites[i+win_size].count_sfs, sizeof(int)*4);
47 //}
49 memcpy(&sites[0], &sites[win_size], pos_size*read_len);
50 memset(&sites[read_len], 0, pos_size * (win_size));
51 for(i=read_len; i != read_len+win_size; i++)
53 //sites[i].ori = 0xFF;
54 //sites[i].pos = sites[i-1].pos+1;
55 //sites[i].depth = 0;
56 //sites[i].repeat_time = 0;
57 //sites[i].dep_uni = 0;
58 //memset(sites[i].count_uni,0,sizeof(int)*4);
59 //memset(sites[i].q_sum,0,sizeof(int)*4);
60 //memset(sites[i].base_info,0,sizeof(small_int)*4*2*64*256);
61 //memset(sites[i].count_all,0,sizeof(int)*4);
62 //// add by Bill
63 //memset(sites[i].count_sfs,0,sizeof(int)*4);
64 sites[i].ori = 0xFF;
65 sites[i].pos = sites[i-1].pos+1;
67 return 1;
71 int Call_win::call_cns(Chr_name call_name, Chr_info* call_chr, ubit64_t call_length, Prob_matrix * mat, Parameter * para, gzoutstream * consensus, my_ofstream & baseinfo, SfsMethod &sfsMethod, const int id)
73 std::string::size_type coord;
74 small_int k;
75 ubit64_t o_base, strand;
76 char allele1, allele2, genotype, type, type1/*best genotype*/, type2/*suboptimal genotype*/, base1, base2, base3;
77 int i, q_score, q_adjusted, qual1, qual2, qual3, q_cns, all_count1, all_count2, all_count3;
78 //int *pcr_dep_count;
79 //pcr_dep_count = new int [para->read_length*2];
80 double rank_sum_test_value, binomial_test_value;
81 bool is_out;
82 // double * real_p_prior = new double [16];
83 // double * likelihoods = new double [10];
84 // memset(likelihoods, 0, sizeof(double)*10);
85 //int n_pcr_dep_count;//update by zhukai on 2010-12-27
87 //std::cerr<<"Call length="<<call_length<<endl;
88 for(std::string::size_type j=0; j != call_length; ++j) //modify the j++ to ++j update by zhukai on 2010-12-21
90 //cerr<<sites[j].pos<<endl;
91 if(para->region_only)
93 if (NULL== call_chr->get_region() )
95 break;
97 else if (! call_chr->is_in_region(sites[j].pos))
99 continue;
101 else
106 sites[j].ori = (call_chr->get_bin_base(sites[j].pos))&0xF;
107 if( ((sites[j].ori&4) !=0)/*an N*/ && sites[j].depth == 0)
108 { // CNS text format:
109 // ChrID\tPos\tRef\tCns\tQual\tBase1\tAvgQ1\tCountUni1\tCountAll1\tBase2\tAvgQ2\tCountUni2\tCountAll2\tDepth\tRank_sum\tCopyNum\tSNPstauts\n"
110 if(!para->glf_format && ! para->is_snp_only)
112 // the output file was not opened,
113 if (!consensus->is_open())
115 continue;
117 (*consensus)<<call_name<<'\t'<<(sites[j].pos+1)<<"\tN\tN\t0\tN\t0\t0\t0\tN\t0\t0\t0\t0\t1.00000\t255.00000\t0"<<"\n";
119 else if (para->glf_format)
121 memset(likelihoods, 0, sizeof(double)*10);
122 consensus->write(reinterpret_cast<char*>(likelihoods), sizeof(double)*10);
123 consensus->flush();
124 if(!consensus->good())
126 cerr<<"Broken ofstream after writting Position "<<(sites[j].pos+1)<<" at "<<call_name<<endl;
127 exit(255);
129 baseinfo<<call_name<<'\t'<<(sites[j].pos+1)<<"\tN\tN\t0\tN\t0\t0\t0\tN\t0\t0\t0\t0\t1.00000\t255.00000\t0"<<"\n";
131 else
135 continue;
137 base1 = 0, base2 = 0, base3 = 0;
138 qual1 = -1, qual2= -2, qual3 = -3;
139 all_count1 = 0, all_count2 =0, all_count3 =0;
140 if(sites[j].dep_uni)
142 // This position is uniquely covered by at least one nucleotide
143 for(i=0;i!=4; ++i)
145 // i is four kind of alleles
146 if(sites[j].q_sum[i] >= qual1)
148 base3 = base2;
149 qual3 = qual2;
150 base2 = base1;
151 qual2 = qual1;
152 base1 = i;
153 qual1 = sites[j].q_sum[i];
155 else if (sites[j].q_sum[i]>=qual2)
157 base3 = base2;
158 qual3 = qual2;
159 base2 = i;
160 qual2 = sites[j].q_sum[i];
162 else if (sites[j].q_sum[i]>=qual3)
164 base3 = i;
165 qual3 = sites[j].q_sum[i];
167 else
172 if(qual1 == 0)
174 // Adjust the best base so that things won't look ugly if the pos is not covered
175 base1 = (sites[j].ori&7);
177 else if(qual2 ==0 && base1 != (sites[j].ori&7))
179 base2 = (sites[j].ori&7);
181 else
186 else
188 // This position is covered by all repeats
189 for(i=0;i!=4; ++i)
191 if(sites[j].count_all[i] >= all_count1)
193 base3 = base2;
194 all_count3 = all_count2;
195 base2 = base1;
196 all_count2 = all_count1;
197 base1 = i;
198 all_count1 = sites[j].count_all[i];
200 else if (sites[j].count_all[i]>=all_count2)
202 base3 = base2;
203 all_count3 = all_count2;
204 base2 = i;
205 all_count2 = sites[j].count_all[i];
207 else if (sites[j].count_all[i]>=all_count3)
209 base3 = i;
210 all_count3 = sites[j].count_all[i];
212 else
217 if(all_count1 ==0)
219 base1 = (sites[j].ori&7);
221 else if(all_count2 ==0 && base1 != (sites[j].ori&7))
223 base2 = (sites[j].ori&7);
225 else
230 // Calculate likelihood
231 for(genotype=0;genotype!=16; ++genotype)
233 mat->type_likely[genotype] = 0.0;
235 for(o_base=0;o_base!=4; ++o_base)
237 //cerr<<o_base<<endl;
238 if(sites[j].count_uni[o_base]==0)
240 continue;
242 global_dep_count = -1;
243 memset(pcr_dep_count, 0, sizeof(int)*2*para->read_length);
245 for(q_score=para->q_max-para->q_min; q_score != -1; --q_score)
247 for(coord=0; coord != para->read_length; ++coord)
249 for(strand=0; strand!=2; ++strand)
251 base_i = sites[j].base_info[o_base<<15|strand<<14|q_score<<8|coord]; //update by zhukai on 2010-12-28
252 for(k=0; k != base_i; ++k)
254 if(pcr_dep_count[strand*para->read_length+coord]==0)
256 global_dep_count += 1;
258 pcr_dep_count[strand*para->read_length+coord] += 1;
259 q_adjusted = int( pow(10, (log10(q_score) +(pcr_dep_count[strand*para->read_length+coord]-1)*para->pcr_dependency +global_dep_count*para->global_dependency)) +0.5);
260 if(q_adjusted < 1)
262 q_adjusted = 1;
264 for(allele1 = 0;allele1 != 4; ++allele1)
266 for(allele2 = allele1; allele2 != 4; ++allele2)
268 mat->type_likely[allele1<<2|allele2] += log10(0.5*mat->p_matrix[ ((ubit64_t)q_adjusted<<12) | (coord <<4) | (allele1<<2) | o_base] +0.5*mat->p_matrix[((ubit64_t)q_adjusted<<12) | (coord <<4) | (allele2<<2) | o_base]);
277 /* it is used to sfs, add by Bill*/
278 if (para->sfs != NULL)
280 sfsMethod.getMapData(sites[j], mat, call_name, id);
283 // the output file was not opened,
284 if (!consensus->is_open())
286 continue;
289 if(para->glf_format)
291 for(int i=0; i!=10; ++i)
293 consensus->write(reinterpret_cast<char*>(&mat->type_likely[glf_type_code[i]]), sizeof(double));
295 consensus->flush();
296 if(!consensus->good())
298 cerr<<"Broken ofstream after writting Position "<<(sites[j].pos+1)<<" at "<<call_name<<endl;
299 exit(255);
301 //continue;
303 // Calculate Posterior Probability
304 memcpy(real_p_prior, &mat->p_prior[((ubit64_t)sites[j].ori&0x7)<<4], sizeof(double)*16);
305 if ( (sites[j].ori & 0x8) && para->refine_mode)
307 // a dbSNP
308 snp_p_prior_gen(real_p_prior, call_chr->find_snp(sites[j].pos), para, sites[j].ori);
310 memset(mat->type_prob,0,sizeof(rate_t)*17);
311 type2=type1=16;
312 for (allele1=0; allele1!=4; ++allele1)
314 for (allele2=allele1; allele2!=4; ++allele2)
316 genotype = allele1<<2|allele2;
317 if (para->is_monoploid && allele1 != allele2)
319 continue;
321 mat->type_prob[genotype] = mat->type_likely[genotype] + log10(real_p_prior[genotype]) ;
323 if (mat->type_prob[genotype] >= mat->type_prob[type1] || type1 == 16)
325 // find the genotype which type_prob is biggest.
326 type2 = type1;
327 type1 = genotype;
329 else if (mat->type_prob[genotype] >= mat->type_prob[type2] || type2 ==16)
331 // find the genotype which type_prob is second biggest.
332 type2 = genotype;
334 else
340 is_out = true; // Check if the position needs to be output, useful in snp-only mode
342 if (para->rank_sum_mode)
344 rank_sum_test_value = rank_test(sites[j], type1, mat->p_rank, para);
346 else
348 rank_sum_test_value = 1.0;
350 if(rank_sum_test_value==0.0)
352 // avoid double genotype overflow
353 q_cns = 0;
355 else
357 q_cns = (int)(10*(mat->type_prob[type1]-mat->type_prob[type2])+10*log10(rank_sum_test_value));
360 if ( (type1&3) == ((type1>>2)&3))
361 { // Called Homozygous
362 if (qual1>0 && base1 != (type1&3))
364 //Wired: best base is not the consensus!
365 q_cns = 0;
367 else if (/*qual2>0 &&*/ q_cns > qual1-qual2)
369 // Should not bigger than this
370 q_cns = qual1-qual2;
372 else
377 else
378 { // Called Heterozygous
379 if(sites[j].q_sum[base1]>0 && sites[j].q_sum[base2]>0 && type1 == (base1<base2 ? (base1<<2|base2):(base2<<2|base1)))
381 // The best bases are in the heterozygote
382 if (q_cns > qual2-qual3)
384 q_cns = qual2-qual3;
387 else
388 { // Ok, wired things happened
389 q_cns = 0;
392 if(q_cns>99)
394 q_cns = 99;
396 if (q_cns<0)
398 q_cns = 0;
400 if(! para->glf_format)
402 // ChrID\tPos\tRef\tCns\tQual\tBase1\tAvgQ1\tCountUni1\tCountAll1\tBase2\tAvgQ2\tCountUni2\tCountAll2\tDepth\tRank_sum\tCopyNum\tSNPstauts\n"
403 if(! para->is_snp_only || (abbv[type1] != "ACTGNNNN"[(sites[j].ori&0x7)] && sites[j].depth > 0))
405 if(base1<4 && base2<4)
407 (*consensus)<<call_name<<'\t'<<(sites[j].pos+1)<<'\t'<<("ACTGNNNN"[(sites[j].ori&0x7)])<<'\t'<<abbv[type1]<<'\t'<<q_cns<<'\t'
408 <<("ACTGNNNN"[base1])<<'\t'<<(sites[j].q_sum[base1]==0?0:sites[j].q_sum[base1]/sites[j].count_uni[base1])<<'\t'<<sites[j].count_uni[base1]<<'\t'<<sites[j].count_all[base1]<<'\t'
409 <<("ACTGNNNN"[base2])<<'\t'<<(sites[j].q_sum[base2]==0?0:sites[j].q_sum[base2]/sites[j].count_uni[base2])<<'\t'<<sites[j].count_uni[base2]<<'\t'<<sites[j].count_all[base2]<<'\t'
410 <<sites[j].depth<<'\t'<<rank_sum_test_value<<'\t'<<(sites[j].depth==0?255:(double)(sites[j].repeat_time)/sites[j].depth)<<'\t'<<((sites[j].ori&8)?1:0)<<"\n";
412 else if(base1<4)
414 (*consensus)<<call_name<<'\t'<<(sites[j].pos+1)<<'\t'<<("ACTGNNNN"[(sites[j].ori&0x7)])<<'\t'<<abbv[type1]<<'\t'<<q_cns<<'\t'
415 <<("ACTGNNNN"[base1])<<'\t'<<(sites[j].q_sum[base1]==0?0:sites[j].q_sum[base1]/sites[j].count_uni[base1])<<'\t'<<sites[j].count_uni[base1]<<'\t'<<sites[j].count_all[base1]<<'\t'
416 <<"N\t0\t0\t0\t"
417 <<sites[j].depth<<'\t'<<rank_sum_test_value<<'\t'<<(sites[j].depth==0?255:(double)(sites[j].repeat_time)/sites[j].depth)<<'\t'<<((sites[j].ori&8)?1:0)<<"\n";
419 else
421 (*consensus)<<call_name<<'\t'<<(sites[j].pos+1)<<"\tN\tN\t0\tN\t0\t0\t0\tN\t0\t0\t0\t0\t1.000\t255.000\t0"<<"\n";
425 else
427 if(base1<4 && base2<4)
429 baseinfo<<call_name<<'\t'<<(sites[j].pos+1)<<'\t'<<("ACTGNNNN"[(sites[j].ori&0x7)])<<'\t'<<abbv[type1]<<'\t'<<q_cns<<'\t'
430 <<("ACTGNNNN"[base1])<<'\t'<<(sites[j].q_sum[base1]==0?0:sites[j].q_sum[base1]/sites[j].count_uni[base1])<<'\t'<<sites[j].count_uni[base1]<<'\t'<<sites[j].count_all[base1]<<'\t'
431 <<("ACTGNNNN"[base2])<<'\t'<<(sites[j].q_sum[base2]==0?0:sites[j].q_sum[base2]/sites[j].count_uni[base2])<<'\t'<<sites[j].count_uni[base2]<<'\t'<<sites[j].count_all[base2]<<'\t'
432 <<sites[j].depth<<'\t'<<showpoint<<rank_sum_test_value<<'\t'<<(sites[j].depth==0?255:(double)(sites[j].repeat_time)/sites[j].depth)<<'\t'<<((sites[j].ori&8)?1:0)<<"\n";
434 else if(base1<4)
436 baseinfo<<call_name<<'\t'<<(sites[j].pos+1)<<'\t'<<("ACTGNNNN"[(sites[j].ori&0x7)])<<'\t'<<abbv[type1]<<'\t'<<q_cns<<'\t'
437 <<("ACTGNNNN"[base1])<<'\t'<<(sites[j].q_sum[base1]==0?0:sites[j].q_sum[base1]/sites[j].count_uni[base1])<<'\t'<<sites[j].count_uni[base1]<<'\t'<<sites[j].count_all[base1]<<'\t'
438 <<"N\t0\t0\t0\t"
439 <<sites[j].depth<<'\t'<<showpoint<<rank_sum_test_value<<'\t'<<(sites[j].depth==0?255:(double)(sites[j].repeat_time)/sites[j].depth)<<'\t'<<((sites[j].ori&8)?1:0)<<"\n";
441 else
443 baseinfo<<call_name<<'\t'<<(sites[j].pos+1)<<"\tN\tN\t0\tN\t0\t0\t0\tN\t0\t0\t0\t0\t1.000\t255.000\t0"<<"\n";
446 //if(sites[j].pos == 250949) {
447 // exit(1);
450 // delete [] real_p_prior;
451 // delete [] pcr_dep_count;
452 // delete [] likelihoods;
453 return 1;
456 //update 2010-11-08 guyue
458 * DATE:
459 * FUNCTION: call the consensus from soap file .
460 * PARAMETER:
461 * RETURN :
463 int Call_win::soap2cns(igzstream & alignment, gzoutstream * consensus, my_ofstream & baseinfo, Genome * genome, Prob_matrix * mat, Parameter * para, SfsMethod &sfsMethod, const int id)
465 Soap_format soap;
466 current_chr = genome->chromosomes.end();
467 for(std::string line; getline(alignment, line);)
469 std::istringstream ss(line);
470 if( ss >> soap )
471 deal_read(soap, consensus, baseinfo, genome, mat, para, sfsMethod, id);
474 deal_tail(consensus, baseinfo, genome, mat, para, sfsMethod, id);
476 alignment.close();
477 consensus->close();
478 baseinfo.close();
479 return 1;
482 /* program by Bill */
483 int Call_win::soap2cns(SamCtrl &alignment, gzoutstream * consensus, my_ofstream & baseinfo, Genome * genome, Prob_matrix * mat, Parameter * para, SfsMethod &sfsMethod, const int id) {
484 Soap_format soap;
485 current_chr = genome->chromosomes.end();
486 int r;
487 std::string line;
488 while((r = alignment.readline(line)) >=0) {
489 line = alignment_format(line); //from sam/bam to soap file
491 if (line == NOUSE_ALIGNMENT)
492 continue;
493 std::istringstream ss(line);
494 if( ss >> soap )
495 deal_read(soap, consensus, baseinfo, genome, mat, para, sfsMethod, id);
498 deal_tail(consensus, baseinfo, genome, mat, para, sfsMethod, id);
500 consensus->close();
501 baseinfo.close();
502 return 1;
506 * DATE: 2010-8-5
507 * FUNCTION: deal with the win.
508 * PARAMETER: current_chr: is the chr to be processed. last_start: the last position that be processed
509 * consensus: output file handle. genome: is the point to the Genome.
510 * mat: the matrix. para: the parameter.
511 * sfsMethod: a SfsMethod object, use to sfs.
512 * id: the individual's id, use to sfs
513 * RETURN: void
515 void Call_win::pro_win(gzoutstream * consensus, my_ofstream & baseinfo, Genome * genome, Prob_matrix * mat, Parameter *para, SfsMethod &sfsMethod, const int id)
517 //done_pro_win = true; // 2011-1-20
519 if (current_chr == genome->chromosomes.end())
521 return;
524 if (!para->region_only || current_chr->second->is_in_region_win(last_start))
526 //cerr<<"at"<<soap.get_pos()<<"Called"<<last_start<<endl;
527 if(last_start > sites[win_size-1].pos)
529 initialize((last_start/win_size)*win_size);
531 call_cns(current_chr->first, current_chr->second, win_size, mat, para, consensus, baseinfo, sfsMethod, id);
532 last_start = sites[win_size-1].pos; //last start is the end position of the process window
533 recycled = false;
536 if (!para->region_only)
538 //cerr<<"recycled"<<last_start<<endl;
539 recycle();
540 recycled = true;
541 last_start = sites[win_size-1].pos;
543 else if (!recycled)
545 //cerr<<"recycled"<<" "<<last_start<<endl;
546 /* if (last_start>(sites[win_size-1].pos)) {
547 cerr<<"Unexpected "<<last_start<<">"<<(sites[win_size-1].pos)<<endl;
548 exit(1);
549 if ((last_start+1)%win_size!=0) {
550 cerr<<"Assertion Error!"<<last_start<<">"<<sites[win_size-1].pos<<endl;
551 exit(1);
552 //cerr << last_satrt <<" "<<sites[win_size-1].pos<<endl;
554 initialize(last_start-win_size+1);
556 else {*/
557 if (current_chr->second->is_in_region_win(sites[win_size].pos))
559 recycle();
561 else
563 deep_init(sites[win_size].pos);
566 recycled = true;
567 last_start = sites[win_size-1].pos;
569 else
571 assert((last_start+1)%win_size==0);
572 last_start += win_size;
577 * DATE: 2010-8-5
578 * FUNCTION: deal with one read.
579 * PARAMETER: recycled: the flag.
580 * soap: the soapformat to be processed. soap: SOAP format read from the file.
581 * consensus: output file handle. genome: is the point to the Genome.
582 * mat: the matrix. para: the parameter.
583 * sfsMethod: a SfsMethod object, use to sfs.
584 * id: the individual's id, use to sfs
585 * RETURN: void
587 void Call_win::deal_read(Soap_format &soap, gzoutstream * consensus, my_ofstream & baseinfo, Genome * genome, Prob_matrix * mat, Parameter * para, SfsMethod &sfsMethod, const int id)
589 int coord, sub;
590 //cerr<<"A"<<soap.get_pos()<<endl;
591 //exit(1);
592 if(soap.get_pos() < 0)
594 return;
596 // first time deal_read or come to a new chromose
597 if (current_chr == genome->chromosomes.end() || current_chr->first != soap.get_chr_name())
599 //come to a new chromose
600 if(current_chr != genome->chromosomes.end())
602 recycled = false;
603 while(current_chr->second->length() > sites[win_size-1].pos && current_chr->second->length() > last_start && (current_chr->second->get_region() != NULL || current_chr->second->m_region_len > last_start))
605 pro_win(consensus, baseinfo, genome, mat, para, sfsMethod, id);
608 // The last window
609 if(last_start > sites[win_size-1].pos)
611 initialize((last_start/win_size)*win_size);
613 call_cns(current_chr->first, current_chr->second, current_chr->second->length()%win_size, mat, para, consensus, baseinfo, sfsMethod, id);
617 current_chr = genome->chromosomes.find(soap.get_chr_name());
618 //initialize(0);
619 /*if (last_start < current_chr->second->getStartPos())
620 last_start = current_chr->second->getStartPos();*/
621 last_start = current_chr->second->getStartPos();
622 //initialize((last_start/win_size)*win_size);
623 // use deep_init to make sites to be pure. 2010-12-16 Bill.
624 deep_init((last_start/win_size)*win_size);
625 cerr<<"Processing "<<current_chr->first<<endl;
627 else {
631 //the end of this read is out of the chromorese
632 if (soap.get_pos()+soap.get_read_len() >= current_chr->second->length()) {
633 return;
636 //is out of targ region
637 if (para->region_only && (current_chr->second->get_region() == NULL || !current_chr->second->is_in_region(soap.get_pos()))) {
638 return;
641 if(soap.get_pos() < last_start)
643 //cerr << soap.get_pos() << "<" <<last_start<<endl;
644 return;
645 /*cerr<<"Errors in sorting:"<<soap.get_pos()<<"<"<<last_start<<endl;
646 exit(255);*/
649 recycled = false;
650 //some window before the start of this soap isn't process
651 while (soap.get_pos()/win_size > last_start/win_size )
653 //if ( soap.get_pos()/win_size - last_start/win_size >1)
654 //cerr << soap.get_pos()/win_size <<" "<<last_start/win_size<<endl;
655 pro_win(consensus, baseinfo, genome, mat, para, sfsMethod, id); //deal the pre-window
657 // 2011-1-20
658 done_pro_win = true;
659 //cerr<<"die"<<endl;
660 //exit(1);
661 last_start=soap.get_pos();
662 static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
664 for(coord=0; coord<soap.get_read_len(); coord++){
665 if( (soap.get_pos()+coord)/win_size == soap.get_pos()/win_size ) {
666 // In the same sliding window
667 sub = (soap.get_pos()+coord) % win_size;
669 else {
670 sub = (soap.get_pos()+coord) % win_size + win_size; // Use the tail to store the info so that it won't intervene the uncalled bases
673 if (sub >= (win_size + para->read_length))
675 // update at 2010-10-12
676 cerr << "\tWARNING: the parameter '-L' was wrong, you should set the longest read's length!!!"
677 << " At position : " << soap.get_pos()
678 << endl;
679 break; // we don't need to exit the program. juet beak;
680 // exit(0);
683 sites[sub].depth += 1;
684 sites[sub].repeat_time += soap.get_hit();
685 if((soap.is_N(coord)) || soap.get_qual(coord)<para->q_min || sites[sub].dep_uni >= 0xFF) {
686 // An N, low quality or meaningless huge depth
687 continue;
689 //just deal with the position best hit equal to 1
690 if(soap.get_hit() == 1) {
691 //exit((fprintf(stderr, "Wo Cao!\n")));
692 sites[sub].dep_uni += 1;
693 // Update the covering info: 4x2x64x64 matrix, base x strand x q_score x read_pos, 2-1-6-6 bits for each
694 if(soap.is_fwd()) {
695 // Binary strand: 0 for plus and 1 for minus
696 sites[sub].base_info[(((ubit64_t)(soap.get_base(coord)&0x6)|0))<<14 | ((ubit64_t)(soap.get_qual(coord)-para->q_min))<<8 | coord ] += 1;
698 else {
699 sites[sub].base_info[(((ubit64_t)(soap.get_base(coord)&0x6)|1))<<14 | ((ubit64_t)(soap.get_qual(coord)-para->q_min))<<8 | (soap.get_read_len()-1-coord) ] += 1;
702 sites[sub].count_uni[(soap.get_base(coord)>>1)&3] += 1;
703 sites[sub].q_sum[(soap.get_base(coord)>>1)&3] += (soap.get_qual(coord)-para->q_min);
705 // add by Bill at 2010-10-15
706 if (para->sfs != NULL && soap.get_qual(coord) > para->sfs->qs)
708 sites[sub].count_sfs[(soap.get_base(coord)>>1)&3] += 1;
711 else {
712 ;// Repeats
714 sites[sub].count_all[(soap.get_base(coord)>>1)&3] += 1;
719 // processed tail of chromosome
721 * DATE: 2010-8-18
722 * FUNCTION: processed tail of chromosome.
723 * PARAMETER: recycled: the flag. consensus: output file handle. baseinfo: output file handle.
724 * genome: is the point to the Genome.
725 * mat: the matrix. para: the parameter.
726 * sfsMethod: a SfsMethod object, use to sfs.
727 * id: the individual's id, use to sfs
728 * RETURN: void
730 void Call_win::deal_tail(gzoutstream * consensus, my_ofstream& baseinfo, Genome* genome, Prob_matrix* mat, Parameter* para, SfsMethod &sfsMethod, const int id)
732 //The unprocessed tail of chromosome
733 recycled = false;
734 while(current_chr->second->length() > sites[win_size-1].pos && current_chr->second->length() > last_start && (current_chr->second->get_region() != NULL || current_chr->second->m_region_len > last_start)) {
736 pro_win(consensus, baseinfo, genome, mat, para, sfsMethod, id);
739 // The last window
740 if(last_start > sites[win_size-1].pos) {
741 initialize((last_start/win_size)*win_size);
743 call_cns(current_chr->first, current_chr->second, current_chr->second->length()%win_size, mat, para, consensus, baseinfo, sfsMethod, id);