limit fstBC to 30bp in Python3 ver.
[GalaxyCodeBases.git] / BGI / SOAPsnp / SfsMethod.cpp
bloba6df59d62640042b680be6ead99affadd1710cd9
1 /*
2 ******************************************************************************
3 *Copyright 2010
4 * BGI-SHENZHEN
5 *All Rights Reserved
6 *ATHUOR : Bill Tang
7 *CREATE DATE : 2010-9-13
8 *CLASS NAME: SfsMethod
9 *FUNCTION : a class that contain method about sfs.
10 *FILE NAME : SfsMethod.cpp
11 *UPDATE DATE : 2010-9-13
12 *UPDATE BY : Bill Tang
13 *UPDATE DATE : 2010-9-27
14 *UPDATE BY : Bill Tang
15 *******************************************************************************
18 #include "SfsMethod.h"
19 #include "soap_snp.h"
21 #define BE_PROCESSED -1
23 bool operator <= (const loci& first,const loci& second) {
24 if(first.chromo==second.chromo)
25 return first.position<=second.position;
26 else
27 return first.chromo<=second.chromo;
31 bool operator > (const loci& first,const loci& second) {
32 return (!(first <= second));
35 bool operator == (const loci& first,const loci& second) {
36 if(first.chromo==second.chromo)
37 return first.position==second.position;
38 else
39 return first.chromo==second.chromo;
42 SfsMethod::SfsMethod()
44 binomLookup = NULL;
45 likes = NULL;
46 bayes = NULL;
47 asso[0].clear();
48 asso[1].clear();
49 asso[2].clear();
50 //end of update
51 m_map_idx = 0;
52 m_map_idx_process = BE_PROCESSED;
53 sem_init(&sem_call_sfs, 0, 0);
54 sem_init(&sem_call_sfs_return, 0, 1);
55 //sem_init(&sem_map_number, 0, 2);
56 file_end_flag = 0;
59 pthread_mutex_t SfsMethod::m_pthreadMutex = PTHREAD_MUTEX_INITIALIZER;
61 SfsMethod::SfsMethod(int numInds)
64 depending on major/minor we will only use 3 genotypes,
65 so we will need a small lookup table to decide which columns to extract from the glf file
66 A C G T
67 A 0 1 2 3
68 C 4 5 6
69 G 7 8
70 T 9
73 int n = 0;
74 for(int i = 0; i < 4; i++)
76 for(int j = i; j < 4; j++)
78 glfLookup[i][j] = n;
79 glfLookup[j][i] = n;
80 n++;
84 generate_likeLookup();
85 if (numInds < 0)
87 generate_binom(0);
88 likes = allocDoubleArray(1);
89 bayes = allocDoubleArray(1);
90 m_numInds = numInds;
92 else
94 generate_binom(numInds);
95 likes = allocDoubleArray(2 * numInds + 1);
96 bayes = allocDoubleArray(2 * numInds + 1);
97 m_numInds = numInds;
102 * DATE: 2010-9-16
103 * FUNCTION: initialization :likes , bays, binomLookup matrix
104 * PARAMETER: numInds: the individual number.
105 * RETURN: 0
107 int SfsMethod::init(const int numInds)
110 depending on major/minor we will only use 3 genotypes,
111 so we will need a small lookup table to decide which columns to extract from the glf file
112 A C G T
113 A 0 1 2 3
114 C 4 5 6
115 G 7 8
119 int n = 0;
120 for(int i = 0; i < 4; i++)
122 for(int j = i; j < 4; j++)
124 glfLookup[i][j] = n;
125 glfLookup[j][i] = n;
126 n++;
130 generate_likeLookup();
131 if (numInds < 0)
133 generate_binom(0);
134 likes = allocDoubleArray(1);
135 bayes = allocDoubleArray(1);
136 m_numInds = numInds;
138 else
140 generate_binom(numInds);
141 likes = allocDoubleArray(2 * numInds + 1);
142 bayes = allocDoubleArray(2 * numInds + 1);
143 m_numInds = numInds;
145 //allocVec();
146 return 0;
149 SfsMethod::~SfsMethod(void)
151 if (binomLookup != NULL)
153 delete [] binomLookup;
154 binomLookup = NULL;
156 if (likes != NULL)
158 delete [] likes;
159 likes = NULL;
161 if (bayes != NULL)
163 delete [] bayes;
164 bayes = NULL;
166 //delVector(asso);
167 cleanUpMap(asso[0]);
168 cleanUpMap(asso[1]);
169 cleanUpMap(asso[2]);
170 //clear sem
171 sem_destroy(&sem_call_sfs);
172 sem_destroy(&sem_call_sfs_return);
175 void outMap(aMap::iterator &itr, int numInds)
177 cerr << "chr : " << itr->first.chromo << " pos : " << itr->first.position;
178 datum p = itr->second;
179 cerr << " ref :" << p.ref << " major : " << p.major << " minor: " << p.minor << endl;
180 int *lk = p.lk;
181 cerr << "lk :" << endl;
182 for (int j = 0; j < numInds; j++)
184 for (int i = 0; i < 10; i++)
185 cerr << lk[4 * j + i] << "\t";
186 cerr << endl;
188 cerr << endl;
189 int *locus = p.locus;
190 cerr << "locus: " << endl;
191 for (int j = 0; j <= numInds; j++)
193 for (int i = 0; i < 4; i++)
194 cerr << locus[4 * j + i] << "\t";
195 cerr << endl;
197 cerr << "phat : " << p.phat;
198 cerr << endl << endl;
202 * DATE: 2010-9-13
203 * FUNCTION: the function for algo
204 * PARAMETER: asso : the maps of the datums
205 * numInds : the number of individuals
206 * eps : the errorate
207 * pvar : the probability of being variable
208 * B :the bias correction coefficient
209 * sfsfile : sfs file pionter
210 * normalize :
211 * alternative :
212 * singleMinor : we should loop over 3 possible minor or one.
213 * pThres : select the out put threshold of snp £¬default is 0.01
214 * allowPhatZero
215 * RETURN: void
216 * UPDATE:2010-10-21, add a set phat condition.
217 * UPDATE:2010-11-29, control the sfs file column
219 void SfsMethod::algo(aMap & asso, int numInds, double eps, double pvar, double B, FILE * sfsfile,
220 int normalize, int alternative, int singleMinor, double pThres, int allowPhatZero, int firstlast)
222 normalize=0;
224 //algorithm goes on by a site on site basis
225 double *pis = new double[numInds];
226 double *wis = new double[numInds];
227 double *cf = new double[numInds];//the bias coefiecients
228 double *sumMinors = allocDoubleArray(2*numInds+1);//hte sum of the 3 different minors
229 double *hj = allocDoubleArray(2*numInds+1);
231 for(aMap::iterator it=asso.begin(); it!=asso.end(); it++) {
233 //outMap(it, numInds);
234 datum p = it->second;
235 //part one
236 if (p.major > 3 || p.minor==p.major){
237 //printf("never here\n");
238 continue;
241 //set the resultarray to zeros
242 for(int sm=0 ; sm<(2*numInds+1) ; sm++ )
243 sumMinors[sm] = 0;
245 //loop through the 3 different minors
246 for(int minor_offset=0;minor_offset<4;minor_offset++) {
248 if(minor_offset == p.major)
249 continue;
251 //hook for only calculating one minor
252 if(singleMinor)
253 minor_offset = p.minor;
255 int major_offset = p.major;
256 int Aa_offset = glfLookup[minor_offset][major_offset];//0-9
257 int AA_offset = glfLookup[minor_offset][minor_offset];//0-9
258 int aa_offset = glfLookup[major_offset][major_offset];//0-9
261 for(int i=0;i<numInds;i++) {
262 int ni = p.locus[i*4+minor_offset];
263 int nt = p.locus[i*4+minor_offset] + p.locus[i*4+major_offset];
265 if(B!=0)//bias version
266 cf[i] = pow((0.5-B),ni)*pow((0.5+B),nt-ni)*pow(0.5,-nt);
267 else
268 cf[i] =1;
270 //FIX if we have a very large number ( paralogue sites), the cf[i] will overflow, and it should be zero
271 if(isinf_local(cf[i])||isnan_local(cf[i])){
272 // fprintf(stderr,"POSSIBLE ERROR: Problems calculating bias correction coefficient, site might be paralogue?\t%d\t%d\n",it->first.chromo,it->first.position);
273 cerr << "POSSIBLE ERROR: Problems calculating bias correction coefficient, site might be paralogue?\t" << it->first.chromo << "\t" << it->first.position << endl;
274 cf[i] = 1;
276 if(nt==0){//if we dont have any reads for individual 'i'
277 pis[i] = 0;
278 wis[i] = 0;
279 continue;
281 pis[i] = (ni-eps*nt)/(nt*(1-2*eps));
282 wis[i] = 2.0*nt/(nt+1.0);
285 double tmp=0;
286 for(int i=0;i<numInds;i++)
287 tmp+= (pis[i]*wis[i]);
289 double phat = std::max(0.0,tmp/sum(wis,numInds));//original
291 // phat = std::max(pvar/(4*numInds),tmp/sum(wis,numInds)); //ramus old
293 if(allowPhatZero==0){
294 int depth = 0;
295 for (int pl=0;pl<4;pl++) depth += p.locus[4*numInds+pl];
296 phat = std::max(pvar/std::min(10,depth),phat); //thorfinn new
299 //add bias stuff
300 double newc = (0.5-B)/(0.5+B);
301 phat = std::min(1.0,phat/(newc+phat*(1.0-newc)));
302 //end bias
304 // change at 2010-10-21
305 if (minor_offset == p.minor)
307 it->second.phat = phat;
310 //part two
311 // change on 2010-9-18
312 memset(hj, 0, sizeof(double) * (2*numInds+1));
313 /*for(int i = 0; i < 2*numInds+1; ++i)
315 hj[i] = 0.0;
318 if(0)
319 for(int index=0;index<(2*numInds+1);index++)
320 hj[index]=log(hj[index]);
321 double PAA,PAa,Paa;
322 // phat = 0.224518;
324 double totmax = 0.0;
325 double gaa_prod = 1.0; //product of the major genotyp likelihood
326 for(int i=0 ; i<numInds ;i++) {
327 double GAA,GAa,Gaa;
328 #ifndef RASMUS_INPUT
329 GAA = likeLookup[p.lk[i*10+AA_offset]];
330 GAa = likeLookup[p.lk[i*10+Aa_offset]];
331 Gaa = likeLookup[p.lk[i*10+aa_offset]];
332 #else
333 GAA = exp(p.lk[i*10+AA_offset]);
334 GAa = exp(p.lk[i*10+Aa_offset]);
335 Gaa = exp(p.lk[i*10+aa_offset]);
337 // printf("GAA=%.15f\tGAa=%.15f\tGaa=%.15f\n",GAA,GAa,Gaa);
338 //printf("sums g,=%.15f\n",GAa+GAA+Gaa);
339 #endif
340 if(1){
341 double sums= GAA+GAa+Gaa;//this is essential for floating point precision
342 GAA = GAA/(sums);
343 GAa = GAa/(sums);
344 Gaa = Gaa/(sums);
348 gaa_prod *= Gaa;
349 double MAA,MAa,Maa;
350 if(phat==0||1){
351 MAA = GAA * phat*phat;
352 MAa = GAa * 2*(1-phat)*phat*cf[i]; //DRAGON remember to add cf[i]
353 Maa = Gaa * (1-phat)*(1-phat);
354 }else{
355 // fprintf(stderr,"hereee\n");
356 MAA = GAA ;
357 MAa = GAa * 2;
358 Maa = Gaa ;
360 //printf("mAA=%.15f\tmAa=%.15f\tMaa=%.15f\n",MAA,MAa,Maa);
362 if(1 &&(isnan_local(Maa) || isnan_local(MAa) || isnan_local(MAA))){
363 //fprintf(stderr,"Possible underflow at: \t%d\t%d\n",it->first.chromo,it->first.position);
364 cerr << "Possible underflow at: \t" << it->first.chromo << "\t" << it->first.position << endl;
365 goto next_allele;// change 2010.9.18
366 //exit(0);
368 if(0){
369 PAA =log(MAA);///(MAA+MAa+Maa);
370 PAa =log(MAa);///(MAA+MAa+Maa);
371 Paa =log(Maa);///(MAA+MAa+Maa);
372 // printf("sum af P's=%f\n",PAA+PAa+Paa);
373 }else{
374 PAA =(MAA);///(MAA+MAa+Maa);
375 PAa =(MAa);///(MAA+MAa+Maa);
376 Paa =(Maa);///(MAA+MAa+Maa);
379 if(normalize){
380 PAA =log(MAA/(MAA+MAa+Maa));
381 PAa =log(MAa/(MAA+MAa+Maa));
382 Paa =log(Maa/(MAA+MAa+Maa));
385 //check for underflow error, this should only occur once in a blue moon
386 if(isnan_local(Paa)||isnan_local(PAa)||isnan_local(PAA)){
387 printf("fixing nan error at : \t%s\t%d\n",it->first.chromo.c_str(),it->first.position);
388 goto next_allele;// change 2010.9.18
389 //exit(0);
391 if(isnan_local(Paa)||isnan_local(PAa)||isnan_local(PAA)){
392 //fprintf(stderr,"Possible underflow at: \t%d\t%d\n",it->first.chromo,it->first.position);
393 cerr << "Possible underflow at: \t" << it->first.chromo << "\t" << it->first.position << endl;
394 printf("PAA=%f\tPAa=%f\tPaa=%f\tMAA=%f\tMAa=%f\tMaa=%f\n",PAA,PAa,Paa,MAA,MAa,Maa);
398 double mymax;
399 if (Paa > PAa && Paa > PAA) mymax = Paa;
400 else if (PAa > PAA) mymax = PAa;
401 else mymax = PAA;
404 Paa=Paa/mymax;
405 PAa=PAa/mymax;
406 PAA=PAA/mymax;
407 totmax = totmax + log(mymax);
409 if(i==0){
410 hj[0] =Paa;
411 hj[1] =PAa;
412 hj[2] =PAA;
413 }else{
415 for(int j=2*(i+1); j>1;j--){
416 double tmp;
417 if(0)
418 tmp =log(exp(PAA+hj[j-2])+exp(PAa+hj[j-1])+exp(Paa+hj[j]));
419 else
420 tmp = PAA*hj[j-2]+PAa*hj[j-1]+Paa*hj[j];
422 if(isnan_local(cf[i]))
423 //fprintf(stderr,"cfi is inf\n");
424 cerr << "cfi is inf" <<endl;
425 if(isnan_local(tmp)){
426 //fprintf(stderr,"jis nan in algorithm:%d\n",j );
427 cerr << "jis nan in algorithm: " << j <<endl;
428 //fprintf(stderr,"%f\t%f\t%f\t%f\t%f\n",PAA,PAa,Paa,mymax,cf[i]);
429 cerr << PAA << "\t" << PAa << "\t" << Paa << "\t" << mymax << "\t" << cf[i] <<endl;
430 //fprintf(stderr,"at : \t%d\t%d\n",it->first.chromo,it->first.position);
431 cerr << "at : " << it->first.chromo << "\t" << it->first.position << endl;
432 hj[j] = 0;
433 goto next_allele;// change 2010.9.18
434 //exit(0);
435 }else
436 hj[j] =tmp;
438 if(0){
439 hj[1] = log(exp(Paa+hj[1])+exp(PAa+hj[0]));
440 hj[0] = Paa+hj[0];
442 else{
443 hj[1] = Paa*hj[1] + PAa*hj[0];
444 hj[0] = Paa*hj[0];
448 //recursion done, now do the normalization, and extrastuff
450 if(alternative){
451 //populate the log(M) vector
452 double *m = allocDoubleArray(2*numInds+1);
453 for(int mi=0;mi<(2*numInds+1);mi++)
454 m[mi] = binomLookup[mi]*pow(phat,mi)*pow(1-phat,2*numInds-mi);
456 //now add the mi to the hi,
457 for(int i=0;i<(2*numInds+1) ;i++)
458 hj[i] = hj[i] * m[i];
460 // change on 2010-9-25
461 delete [] m;
464 if(0) // this is a artifact from the no-underflow version
465 for (int i=0;i<(2*numInds+1);i++)
466 hj[i] = exp(hj[i]);
468 for (int i=0;i<(2*numInds+1);i++)
469 hj[i] = exp(log(hj[i])+totmax);
471 double bottom = pvar*sum(hj,2*numInds+1)+(1-pvar)*gaa_prod;
474 if(bottom==0||isnan_local(bottom)||isinf_local(bottom)){
475 //fprintf(stderr,"bottom equal zero should not happen: %f\n",bottom);
476 cerr << "bottom equal zero should not happen: " << bottom <<endl;
477 //fprintf(stderr,"at : \t%d\t%d\n",it->first.chromo,it->first.position);
478 cerr << "at : " << "\t" <<it->first.chromo << "\t" << it->first.position <<endl;
479 //fprintf(skipped_sites_fp,"%s\t%d\n",it->first.chromo.c_str(),it->first.position);
480 //sfsfile << it->second.chromo << "\t" << it->second.position << "\t-1" << endl;
481 fprintf(sfsfile,"%s\t%d\t-1\n",it->first.chromo.c_str(),it->first.position);
483 minor_offset=4;
484 goto next_allele; // change on 2010.9.18
485 break;
487 // exit(0);
491 //now input the normalized hj array
492 hj[0] = ((pvar*hj[0])+(1-pvar)*gaa_prod)/bottom;//prod(p.lk,numInds,2))/bottom;
493 for(int i=1;i<(2*numInds+1) ; i++)
494 hj[i] = (hj[i]*pvar/bottom);
496 //validate that everything looks fine
497 if(sum(hj,2*numInds+1)>1.0000001){
498 printf("This shouldnt occur, it means that the sfs for a loci has an improper prob space\n");
499 printf("sums is:%f\n",sum(hj,2*numInds+1));
500 printf("at : \t%s\t%d\n",it->first.chromo.c_str(),it->first.position);
501 goto next_allele; // change 2010.9.18
502 //exit(0);
504 //update the sumSFS of the site
505 for(int ss=0;ss<(2*numInds+1);ss++)
506 sumMinors[ss] += hj[ss];
507 likes[getMaxId(hj,2*numInds+1)]++;
509 if(singleMinor)
510 break;
514 for(int i=0;i<(2*numInds+1);i++)
515 if(singleMinor==0)
516 sumMinors[i] = sumMinors[i]/3.0;
518 //update the global bayes/like estimates
519 for(int i=0;i<(2*numInds+1);i++)
520 bayes[i] += sumMinors[i];
522 //printit
523 if((1-sumMinors[0]-sumMinors[2*numInds])>pThres){
524 //sfsfile << it->second.chromo << "\t" << it->second.position << "\t";
525 fprintf(sfsfile,"%s\t%d\t",it->first.chromo.c_str(),it->first.position);
526 //sfsfile << getChar(it->second.major) << "\t" << getChar(it->second.minor) << "\t";
527 fprintf(sfsfile,"%c\t%c\t",getChar(it->second.major),getChar(it->second.minor));
528 //update 11-29 for control the sfs file column
529 //fprintf(sfsfile,"%f\t",sumMinors[i]);
530 for(int i=0;i<(2*numInds);i++)
532 if ( firstlast == 1 ) //only output the first and last column to the sfs file
534 fprintf(sfsfile,"%f\t",sumMinors[i]);
535 break;
537 else
539 fprintf(sfsfile,"%f\t",sumMinors[i]);
542 //sfsfile << sumMinors[i] << "\t";
543 //sfsfile << sumMinors[2*numInds] << endl;
544 fprintf(sfsfile,"%f\n",sumMinors[2*numInds]);
547 next_allele:
550 //cleanup
554 delete [] hj;
555 delete [] pis;
556 delete [] wis;
557 delete [] sumMinors;
558 delete [] cf;
563 * DATE: 2010-8-9
564 * FUNCTION: do algo joint function
565 * PARAMETER: asso : the maps of the datums
566 * numInds : the number of individuals
567 * eps : the errorate
568 * pvar : the probability of being variable
569 * B : the bias correction coefficient
570 * sfsfile : sfs file pionter
571 * underFlowProtect:
572 * alternative :
573 * singleMinor : should we loop over 3 possible minor. MAYBE
574 * fold :
575 * RETURN: void
577 void SfsMethod::algoJoint(aMap & asso, int numInds, double eps, double pvar, double B, FILE * sfsfile,
578 int underFlowProtect, int alternative, int singleMinor, int fold)
581 //algorithm goes on by a site on site basis //pretty much the same as 'algo' without the prior phat
583 double sumMinors[2*numInds+1]; //the sum of the 3 different minors
585 for(aMap::iterator it=asso.begin(); it!=asso.end(); it++) {//loop over sites
586 datum p = it->second;
588 //part one
590 if (p.major > 3 || p.minor==p.major){
591 // printf("never runs\n");
592 continue;
595 //set the resultarray to zeros
596 for(int sm=0 ; sm<(2*numInds+1) ; sm++ )
597 sumMinors[sm] = 0;
599 //loop through the 3 different minors
600 for(int minor_offset=0;minor_offset<4;minor_offset++) {
602 if(minor_offset == p.major)
603 continue;
605 //hook for only calculating one minor
606 if(singleMinor)
607 minor_offset = p.minor;
609 int major_offset = p.major;
610 int Aa_offset = glfLookup[minor_offset][major_offset];//0-9
611 int AA_offset = glfLookup[minor_offset][minor_offset];//0-9
612 int aa_offset = glfLookup[major_offset][major_offset];//0-9
614 //part two
615 double hj[2*numInds+1];
616 for(int index=0;index<(2*numInds+1);index++)
617 if(underFlowProtect==0)
618 hj[index]=0;
619 else
620 hj[index]=log(0);
621 double PAA,PAa,Paa;
623 for(int i=0 ; i<numInds ;i++) {
624 //printf("AA=%d\tAa=%d\taa=%d\n",p.lk[i*3+AA],p.lk[i*3+Aa],p.lk[i*3+aa]);
625 double GAA,GAa,Gaa;
626 #ifndef RASMUS_INPUT //abi input
627 GAA = log(likeLookup[p.lk[i*10+AA_offset]]);
628 if(p.lk[i*10+AA_offset]==0 && p.lk[i*10+Aa_offset]==0 && p.lk[i*10+aa_offset]==0)
629 GAa = log(likeLookup[p.lk[i*10+Aa_offset]]);
630 else
631 GAa = log(2.0*likeLookup[p.lk[i*10+Aa_offset]]);
632 Gaa = log(likeLookup[p.lk[i*10+aa_offset]]);
633 #else
634 GAA = p.lk[i*10+AA_offset];
635 GAa = log(2.0)+p.lk[i*10+Aa_offset];
636 Gaa = p.lk[i*10+aa_offset];
638 #endif
639 if(underFlowProtect==0){
640 GAA=exp(GAA);
641 GAa=exp(GAa);
642 Gaa=exp(Gaa);
646 PAA =(GAA);///(MAA+MAa+Maa);
647 PAa =(GAa);///(MAA+MAa+Maa);
648 Paa =(Gaa);///(MAA+MAa+Maa);
651 //check for underflow error, this should only occur once in a blue moon
652 if(isnan_local(Paa)||isnan_local(PAa)||isnan_local(Paa)){
653 //fprintf(stderr,"Possible underflow at: \t%d\t%d\n",it->first.chromo,it->first.position);
654 cerr << "Possible underflow at: \t" << it->first.chromo << "\t" << it->first.position << endl;
655 printf("PAA=%f\tPAa=%f\tPaa=%f\n",PAA,PAa,Paa);
658 if(i==0){
659 hj[0] =Paa;
660 hj[1] =PAa;
661 hj[2] =PAA;
662 }else{
664 for(int j=2*(i+1); j>1;j--){
665 //print_array(hj,2*numInds+1);
666 double tmp;
667 if(underFlowProtect==1)
668 tmp =addProtect3(PAA+hj[j-2],PAa+hj[j-1],Paa+hj[j]);
669 else
670 tmp = PAA*hj[j-2]+PAa*hj[j-1]+Paa*hj[j];
672 if(isnan_local(tmp)){
673 printf("jis nan:%d\n",j );
674 print_array(hj,2*numInds+2);
675 hj[j] = 0;
676 // exit(0);
677 break;
678 }else
679 hj[j] =tmp;
681 if(underFlowProtect==1){
682 hj[1] = addProtect2(Paa+hj[1],PAa+hj[0]);
683 hj[0] = Paa+hj[0];
685 else{
686 hj[1] = Paa*hj[1] + PAa*hj[0];
687 hj[0] = Paa*hj[0];
693 for(int i=0;i<(2*numInds+1);i++)
694 if(underFlowProtect==0)
695 sumMinors[i] += exp(log(hj[i])-log(bico(2*numInds,i)));
696 else
697 sumMinors[i] += exp(hj[i]-log(bico(2*numInds,i)));
698 if(singleMinor)
699 break;
703 //printit by rasmus 0.98 modified by thorfinn 0.981
705 for(int i=0;i<(2*numInds+1);i++)
706 if(singleMinor)
707 sumMinors[i] = log(sumMinors[i]);
708 else
709 sumMinors[i] = log(sumMinors[i]/3);
711 if(fold){
712 int newDim = numInds+1;
713 for(int i=0;i<newDim-1;i++)// we shouldn't touch the last element
714 sumMinors[i] = log(exp(sumMinors[i]-log(2.0)) + exp(sumMinors[2*numInds-i]-log(2.0)));//THORFINN NEW
715 //sfsfile << sumMinors;
716 fwrite(sumMinors,sizeof(double),newDim,sfsfile);
717 }else
718 //sfsfile << sumMinors;
719 fwrite(sumMinors,sizeof(double),2*numInds+1,sfsfile);
726 * DATE: 2010-9-13
727 * FUNCTION: write frequnce file
728 * PARAMETER:
729 * pFile : a file pointer
730 * numInds : the number of individuals
731 * asso : the maps of the datums
733 * RETURN: void
735 void SfsMethod::writeFreq(FILE * pFile, int numInds, aMap & asso){
736 FILE *db =NULL;
737 int printCounts=0;
738 if(printCounts)
739 db=fopen("indcounts.txt","w");
740 for(aMap::iterator it=asso.begin(); it!=asso.end(); it++){
741 //loci l = it->first;
742 //datum d = it->second;
743 //outMap(it, numInds);
745 datum d = it->second;
746 if (d.major > 3 || d.minor > 3 || it->first.position == 0)
748 continue;
751 //pFile << d.chromo << "\t" << d.position << "\t";
752 fprintf(pFile,"%s\t%d\t",it->first.chromo.c_str(),it->first.position) ;
753 //pFile << getChar(d.major) << "\t" << getChar(d.minor) << "\t";
754 fprintf(pFile,"%c\t%c\t",getChar(d.major),getChar(d.minor)) ;
755 int *bases = d.locus;
756 for(int i=0;i<4;i++)
757 fprintf(pFile,"%d\t",bases[4*numInds+i]);
758 //pFile << bases[4*numInds+i] << "\t";
759 if(d.major!=d.minor)
760 fprintf(pFile,"%d\t",bases[4*numInds+d.major]+bases[4*numInds+d.minor]);
761 //pFile << bases[4*numInds+d.major]+bases[4*numInds+d.minor] << "\t";
762 else
763 fprintf(pFile,"%d\t",bases[4*numInds+d.major]);
764 //pFile << bases[4*numInds+d.major] << "\t";
765 //pFile << d.phat << endl;
766 fprintf(pFile,"%f\n",d.phat);
768 if(printCounts){
769 for(int i=0;i<4*numInds-1;i++)
770 fprintf(db,"%d ",bases[i]);
771 fprintf(db,"%d\n",bases[4*numInds-1]);
774 if(printCounts)
775 fclose(db);
776 fflush(pFile);
780 * DATE: 2010-9-13
781 * FUNCTION: allocation functions, this shouldn't look magical for anyone
782 * PARAMETER: len is the requested length of double array.
783 * RETURN: the pointer to the new double array.
785 double* SfsMethod::allocDoubleArray(int len)
787 double *ret= new double[len];
788 memset(ret, 0, sizeof(double) * len);
789 /*for(int i=0;i<len;i++)
790 ret[i]=0;*/
791 return ret;
795 * DATE: 2010-9-13
796 * FUNCTION: test for infinity
797 * PARAMETER: x is the double to be tested.
798 * RETURN:
800 int SfsMethod::isinf_local(double x)
802 #ifdef __INTEL_COMPILER
803 return isinf(x);
804 #else
805 return std::isinf(x);
806 #endif
811 * DATE: 2010-9-13
812 * FUNCTION: test for a NaN
813 * PARAMETER: x is the double to be tested.
814 * RETURN:
816 int SfsMethod::isnan_local(double x)
818 #ifdef __INTEL_COMPILER
819 return isnan(x);
820 #else
821 return std::isnan(x);
822 #endif
826 * DATE: 2010-9-13
827 * FUNCTION: add up all the value form the ary
828 * PARAMETER: ary is the double array pointer, len is the array's length.
829 * RETURN: the count.
831 double SfsMethod::sum(const double* ary, int len)
833 double s =0;
834 for(int i=0;i<len ; i++)
835 s+=ary[i];
836 return s;
840 * DATE: 2010-9-13
841 * FUNCTION: generate like look up table.
842 * PARAMETER: void
843 * RETURN: void
845 void SfsMethod::generate_likeLookup()
847 for (short int i=0;i<=255;i++)
848 likeLookup[i] = calcLikeRatio(i); //(pow(10.0,-i/10.0));
853 * DATE: 2010-9-13
854 * FUNCTION: generate binom. input should be number of individuals
855 * PARAMETER: k is the individuals' number.
856 * RETURN: void
858 void SfsMethod::generate_binom(int k)
860 binomLookup = new double[k*2+1];
861 for(int i=0;i<(k*2+1);i++)
862 binomLookup[i] = bico(k*2,i);
866 * DATE: 2010-9-13
867 * FUNCTION : To calculate the nCk ( n combination k)
868 * PARAMETER : n is the number of individulals.
869 * k is the individuals' number.
870 * RETURN: double
872 double SfsMethod::bico(int n, int k)
874 return floor(0.5+exp(factln(n)-factln(k)-factln(n-k)));
878 * DATE: 2010-9-13
879 * FUNCTION: generate factln
880 * PARAMETER: n : number of individuals
881 * RETURN: double
883 double SfsMethod::factln(int n)
885 static double a[101]; //A static array is automatically initialized to zero.
886 if (n < 0) printf("Negative factorial in routine factln: %d \n", n);
887 if (n <= 1) return 0.0;
888 if (n <= 100) return a[n] ? a[n] : (a[n]=gammln(n+1.0));
889 else return gammln(n+1.0);
893 * DATE: 2010-9-13
894 * FUNCTION: generate gammln
895 * PARAMETER: xx
896 * RETURN: double
898 double SfsMethod::gammln(double xx)
900 double x,y,tmp,ser;
901 static double cof[6]={76.18009172947146,-86.50532032941677,
902 24.01409824083091,-1.231739572450155,
903 0.1208650973866179e-2,-0.5395239384953e-5};
904 int j;
905 y=x=xx;
906 tmp=x+5.5;
907 tmp -= (x+0.5)*log(tmp);
908 ser=1.000000000190015;
909 for (j=0;j<=5;j++) ser += cof[j]/++y;
910 return -tmp+log(2.5066282746310005*ser/x);
914 * DATE: 2010-9-13
915 * FUNCTION: get max id
916 * PARAMETER: d : the double array pointer
917 * len £º the array's length.
918 * RETURN: int.
920 int SfsMethod::getMaxId(double *d,int len){
921 int m=0;
922 for(int i=1; i<len; i++)
923 if(d[i]>d[m])
924 m=i;
925 return m;
929 * DATE: 2010-9-13
930 * FUNCTION: clean map
931 * PARAMETER: asso : the maps of the datums
932 * RETURN: void.
934 void SfsMethod::cleanUpMap(aMap& asso)
936 for(aMap::iterator it=asso.begin();it!=asso.end();it++)
938 delete [] it->second.lk;
939 delete [] it->second.locus;
944 * DATE: 2010-9-13
945 * FUNCTION: get char from int
946 * PARAMETER: i : genetype number
947 * RETURN: genetype.
949 char SfsMethod::getChar(int i){
950 if(i==0)
951 return 'a';
952 else if(i==1)
953 return 'c';
954 else if(i==2)
955 return 'g';
956 else if(i==3)
957 return 't';
958 else
959 puts("Error getting char value of base");
960 return 0;
964 * DATE: 2010-9-13
965 * FUNCTION: add protect
966 * PARAMETER: ary : the double array pointer
967 * len £º the array's length.
968 * RETURN: protect add result .
970 double SfsMethod::addProtect(double *ary,int len){
971 double maxVal = ary[0];
972 for(int i=1;i<len;i++)
973 if(ary[i]>maxVal)
974 maxVal=ary[i];
976 double sumVal = 0;
977 for(int i=0;i<len;i++)
978 sumVal += exp(ary[i]-maxVal);
979 return log(sumVal) + maxVal;
984 * DATE: 2010-9-13
985 * FUNCTION: add three protect
986 * PARAMETER: a,b,c : double
988 * RETURN: three protect add result .
990 double SfsMethod::addProtect3(double a,double b, double c){
991 double maxVal;// = std::max(a,std::max(b,c));
992 if(a>b&&a>c)
993 maxVal=a;
994 else if(b>c)
995 maxVal=b;
996 else
997 maxVal=c;
998 double sumVal = exp(a-maxVal)+exp(b-maxVal)+exp(c-maxVal);
999 return log(sumVal) + maxVal;
1003 * DATE: 2010-9-13
1004 * FUNCTION: add two protect
1005 * PARAMETER: a,b : double
1007 * RETURN: two protect add result .
1009 double SfsMethod::addProtect2(double a,double b){
1010 double maxVal = std::max(a,b);
1012 double sumVal = exp(a-maxVal)+exp(b-maxVal);
1013 return log(sumVal) + maxVal;
1017 * DATE: 2010-9-13
1018 * FUNCTION: print array
1019 * PARAMETER: ary : the double array pointer
1020 * len £º the array's length.
1021 * RETURN: void.
1023 void SfsMethod::print_array(double *ary,int len){
1024 for (int i=0;i<len;i++)
1025 printf("%f\t",(ary[i]));
1026 printf("\n");
1030 allocates the arrays needed for one site for all individuals
1033 * DATE: 2010-9-14
1034 * FUNCTION: allocates the arrays needed for one site for all individuals
1035 * PARAMETER: numInds : the individuals' number.
1036 * RETURN: datum structure.
1038 datum SfsMethod::allocDatum(int numInds){
1039 datum data;
1040 //4 times the number of individuals +4 (the last 4elems will be the base sum of all inds)
1041 data.locus=allocIntArray(4*numInds+4);
1042 data.lk=allocIntArray(10*numInds);//allocUnsignedCharArray(3*numInds);
1043 data.phat =0.0;
1044 // data.position = 0;
1045 // data.chromo = "";
1046 // data.is_be_record = false;
1047 return data;
1051 * DATE: 2010-9-14
1052 * FUNCTION: allocates the arrays needed for one site for all individuals
1053 * PARAMETER: len : array length.
1054 * RETURN: array pointer.
1056 int* SfsMethod::allocIntArray(int len){
1057 // printf("allocing a int array:%d\n",len);
1058 int *ret = NULL;
1059 ret = new int[len];
1060 if (ret == NULL)
1062 cerr << "\tallocIntArray: new operation failed!" << endl;
1063 exit(0);
1065 /*for(int i=0;i<len;i++)
1066 ret[i]=0;*/
1067 memset(ret, 0, sizeof(int) * len);
1068 return ret;
1072 * DATE: 2010-9-14
1073 * FUNCTION: will calculate the sums and input the values in the last 4 slots,
1074 * and set the minor, major accordingly
1075 * PARAMETER: asso : the maps of the datums
1076 * numInds : the number of individuals
1077 * RETURN: void
1079 void SfsMethod::calcSumBias(aMap & asso,int numInds){
1080 // for(aMap::iterator it = asso.begin(); it != asso.end(); ++it){
1081 for(aMap::iterator it=asso.begin(); it!=asso.end(); it++){
1082 int *bases = it->second.locus; //4length array of basesums
1083 for(int i=0;i<numInds;i++){
1084 bases[4*numInds+A] += bases[i*4+A];
1085 bases[4*numInds+C] += bases[i*4+C];
1086 bases[4*numInds+G] += bases[i*4+G];
1087 bases[4*numInds+T] += bases[i*4+T];
1089 //now get the major/minor
1091 // int maj = it->second.major;
1093 int maj = 0;
1094 maj = it->second.ref;
1096 int temp=0;
1098 int min = maj;
1100 for(int i=0;i<4;i++)
1102 if(maj==i) //we should check the against the major allele
1103 continue;
1104 else if (bases[numInds*4+i] > temp){
1105 min = i;
1106 temp = bases[numInds*4+i];
1110 if (maj==min){
1111 min=0;
1112 while (maj==min)
1113 min++;
1115 it->second.minor = min;
1116 it->second.major = maj;
1122 * DATE: 2010-9-15
1123 * FUNCTION: soaplikelihood to sfslikelihood
1124 * PARAMETER: type_likely:sopa likelihood. likelihood : sfs likelihood
1125 * RETURN: void
1127 void SfsMethod::soaplk_sfslk(int * likelihood, double * type_likely, int id)
1129 assert(type_likely != NULL && likelihood != NULL);
1131 char allele1;
1132 char allele2;
1133 char genotype;
1134 char type1 = 0;
1135 for (allele1 = 0; allele1 != 4; allele1++)
1137 // Find the largest likelihood
1138 for (allele2 = allele1; allele2 != 4; allele2++)
1140 genotype = allele1<<2|allele2;
1141 if (type_likely[genotype] > type_likely[type1])
1143 type1 = genotype;
1148 for(int i = 0; i != 10; i++)
1150 if(type_likely[type1] - type_likely[sfs_type[i]] > 25.5)
1152 likelihood[10 * id + i] = 255;
1154 else
1156 likelihood[10 * id + i] = 10 * (type_likely[type1] - type_likely[sfs_type[i]]);// sfs_type[10]={0,1,3,2,5,7,6,15,11,10}; AA,AC,AG,AT,CC,CG,CT,GG,GT,TT
1162 * DATE: 2010-9-15
1163 * FUNCTION: copy source's coverage to the dest
1164 * PARAMETER: dest: the object array. source : the source array. id the individual's index.
1165 * RETURN: void
1167 void SfsMethod::cov_Cpy(int *dest, int const *source, int id)
1169 assert(source != NULL && dest != NULL);
1170 dest[4 * id + 0] = source[0];
1171 dest[4 * id + 1] = source[1];
1172 dest[4 * id + 2] = source[3];
1173 dest[4 * id + 3] = source[2];
1178 * DATE: 2010-9-15
1179 * FUNCTION: alloc vector memery
1180 * PARAMETER: void
1181 * RETURN: void
1183 void SfsMethod::allocVec(void)
1185 /* //asso = new aMap();
1186 asso = NULL;
1187 asso = new aVector[global_win_size];
1188 if (asso == NULL)
1190 cerr << "\tallocVec : new opreation failed!" << endl;
1191 exit(0);
1193 for (int i = 0; i < global_win_size; ++i)
1194 asso[i] = allocDatum(m_numInds);
1200 * DATE: 2010-9-15
1201 * FUNCTION: free the map memery
1202 * PARAMETER: amap the pointer to be deleted.
1203 * RETURN: void
1205 void SfsMethod::delMap(aMap &amap)
1207 // clean up the map's node
1208 cleanUpMap(amap);
1209 amap.clear();
1214 * DATE: 2010-9-15
1215 * FUNCTION: do SFS
1216 * PARAMETER: para: the parameters files: Files pointer.
1217 * RETURN: void
1219 int SfsMethod::call_SFS(Parameter* para, Files* files)
1221 if (files == NULL)
1223 return POINTER_NULL;
1225 //update 11-`6
1226 sem_t * sem_call_sfs_p = &(sem_call_sfs);
1227 sem_t * sem_call_sfs_return_p = &(sem_call_sfs_return);
1228 ///////////////////////time //////////////////
1229 //static time_t sumtime = 0;
1230 //time_t start = 0, end = 0;
1232 //////////////////////////////////////////////
1234 //sem_t * sem_map_number_p = &(sem_map_number);
1235 while (1)
1237 sem_wait(sem_call_sfs_p);
1238 /*time(&start);*/
1239 int tmp = getidxProcess();
1240 mapChange();
1241 if (file_end_flag == 0)
1243 sem_post(sem_call_sfs_return_p);
1245 SFS_PARA *sfs = para->sfs;
1246 //int tmp = getidxProcess();
1247 calcSumBias(asso[tmp], m_numInds);//get major and minor
1248 if (sfs->doBay)
1250 // do bay.
1251 algo(asso[tmp], m_numInds, sfs->eps, sfs->pvar, sfs->bias, files->sfsfile, sfs->under_FP, sfs->alternative, sfs->sigle_MB, sfs->pThres, sfs->allow_PathZ, sfs->sfs_first_last);
1253 if (sfs->doJoint)
1255 // do joint.
1256 algoJoint(asso[tmp], m_numInds, sfs->eps, sfs->pvar, sfs->bias, files->jointSfsfile, sfs->under_FP, sfs->alternative, sfs->sigle_MJ, sfs->jointFold);
1258 if (sfs->writeFr)
1260 // output frequence.
1261 writeFreq(files->freqfile, m_numInds, asso[tmp]);
1263 // free the map.
1264 delMap(asso[tmp]);
1265 //////////////////////////////////////////
1266 //time(&end);
1267 //sumtime += end - start;
1268 //clog << "sfs tmp sumtime : "<< sumtime <<endl;
1269 //////////////////////////////////////////
1270 //sem_post(sem_map_number_p);
1271 if (file_end_flag == 1)
1272 return 0;
1277 // get the data what sfs want
1279 * DATE: 2010-9-15
1280 * FUNCTION: get the data what sfs want, include site's pos, chr name, depth, and the 10 likelihoods.
1281 * PARAMETER: site: the allete information.
1282 * mat: Prob_matrix point, include likelihoods information.
1283 * chr: chromosome name.
1284 * id: the individual's index.
1285 * RETURN: void
1286 * UPDATE: 2010-10-14 Add a pthread_mutex_lock.
1288 int SfsMethod::getMapData(const Pos_info& site, const Prob_matrix* mat, const std::string & chr, const int id)
1290 assert(id < m_numInds);
1291 loci tmp = {chr, site.pos + 1};
1292 // lock.
1293 pthread_mutex_lock(&m_pthreadMutex);
1294 aMap::iterator itr = asso[m_map_idx].find(tmp);
1295 int *the_line = NULL;
1296 int *lk = NULL;
1298 if (itr == asso[m_map_idx].end())
1300 // new site.
1301 datum dats = allocDatum(m_numInds);
1302 // because soapsnp not the same with realsfs in base sort, snp is ACTG, sfs is ACGT.
1303 if (site.ori == 2)
1304 dats.ref = 3;
1305 else if (site.ori == 3)
1306 dats.ref = 2;
1307 else
1308 dats.ref = site.ori;
1309 the_line = dats.locus;
1310 lk = dats.lk;
1311 asso[m_map_idx].insert(std::make_pair(tmp, dats));
1312 // asso[tmp] = dats;
1314 else
1316 the_line = itr->second.locus;
1317 lk = itr->second.lk;
1318 assert(itr->first.position == tmp.position);
1320 // unlock
1321 pthread_mutex_unlock(&m_pthreadMutex);
1323 // get coverage.
1324 cov_Cpy(the_line, site.count_sfs, id);
1325 // get likelihoods.
1326 soaplk_sfslk(lk, mat->type_likely, id);
1328 return 0;
1335 // This builds a map from: char* -> int
1337 * DATE: 2010-9-15
1338 * FUNCTION: This builds a map from: char* -> int
1339 * such that each chromosome is represented as an integer
1340 * input is a file looking like
1341 * chr1
1342 * chr2
1343 * or a .fai generated file from samtools
1344 * PARAMETER: fname : file name
1345 * RETURN: void
1347 void SfsMethod::buildMap(const char* fname)
1349 my_ifstream pfile;
1350 pfile.open(fname,std::ios::in);
1351 char buffer[1024];
1352 int id=0;
1353 //fprintf(stderr,"\t-> Building chromosome index\n");
1354 cerr << "\t-> Building chromosome index" <<endl;
1355 while(pfile.getline(buffer,1024) ) {
1356 char* chr = strdup(strtok(buffer,"\t"));
1357 m_faiIndex.insert(std::make_pair(chr,id++));
1358 //fprintf(stderr,"\t-> %s->%d\n",chr,id-1);
1359 cerr << "\t->" << chr << "->" << id-1 <<endl;
1361 //fprintf(stderr,"\t-> Done building chromosome index\n");
1362 cerr << "\t-> Done building chromosome index" << endl;
1366 // delete the vector's data
1368 * DATE: 2010-9-26
1369 * FUNCTION: free the vector's data memery
1370 * PARAMETER: avec the aVector to be deleted.
1371 * RETURN: void
1373 void SfsMethod::delVector(aVector * avec)
1375 for (int i = 0; i < global_win_size; i++)
1377 if (avec[i].lk != NULL)
1378 delete [] avec[i].lk;
1379 if (avec[i].locus != NULL)
1380 delete [] avec[i].locus;
1382 delete [] avec;
1387 * DATE: 2010-9-26
1388 * FUNCTION: get the data what sfs want, include site's pos, chr name, depth, and the 10 likelihoods.
1389 * PARAMETER: site: the allete information.
1390 * mat: Prob_matrix point, include likelihoods information.
1391 * chr: chromosome name.
1392 * id: the individual's index.
1393 * RETURN: void
1395 int SfsMethod::getSFSData(const Pos_info & site, const Prob_matrix * mat, const std::string & chr, const int id)
1397 /* assert(id < m_numInds);
1398 int index = site.pos % global_win_size;
1399 assert(index < global_win_size);
1401 if (!asso[index].is_be_record)
1403 asso[index].chromo = chr;
1404 asso[index].is_be_record = true;
1405 if (site.ori == 2)
1406 asso[index].ref = 3;
1407 else if (site.ori == 3)
1408 asso[index].ref = 2;
1409 else
1410 asso[index].ref = site.ori;
1411 asso[index].position = site.pos + 1;
1414 assert(asso[index].position == (site.pos + 1));
1415 // get coverage.
1416 cov_Cpy(asso[index].locus, site.count_all, id);
1417 // get likelihoods.
1418 soaplk_sfslk(asso[index].lk, mat->type_likely, id);
1420 return 0;
1424 // clean up the vector's member.
1426 * DATE: 2010-9-30
1427 * FUNCTION: clean up the vector's member.
1428 * PARAMETER: void
1429 * RETURN: void
1431 void SfsMethod::cleanVec(void)
1433 /* for (int i = 0; i < global_win_size; ++i)
1435 asso[i].chromo = "";
1436 asso[i].is_be_record = false;
1437 asso[i].major = 0;
1438 asso[i].minor = 0;
1439 asso[i].ref = 0;
1440 asso[i].phat = 0;
1441 asso[i].position = 0;
1442 memset(asso[i].lk, 0, sizeof(int) * (10 * m_numInds));
1443 memset(asso[i].locus, 0, sizeof(int) * (4 * m_numInds + 4));
1447 * DATE: 2010-11-17
1448 * FUNCTION: change to the next map
1449 * PARAMETER: void
1450 * RETURN: void
1452 void SfsMethod::mapChange(void)
1454 // move behind another map
1455 m_map_idx = 1 - m_map_idx;
1459 //get the map can be processed
1461 * DATE: 2010-11-16
1462 * FUNCTION: get the index can be process
1463 * PARAMETER: void
1464 * RETURN: the index of map can be process
1466 int SfsMethod::getidxProcess(void)
1468 while(m_map_idx_process == BE_PROCESSED) //wait the get index .
1470 usleep(1);
1472 int tmp;
1473 (m_map_idx_process == BE_PROCESSED) ? (tmp = 2) : (tmp = m_map_idx_process);
1474 m_map_idx_process = BE_PROCESSED;
1475 return tmp;
1478 //set the map index
1480 * DATE: 2010-11-16
1481 * FUNCTION: get the index can be process
1482 * PARAMETER: void
1483 * RETURN: the index of map can be process
1485 void SfsMethod::setidxProcess(void)
1487 while(m_map_idx_process != BE_PROCESSED) //wait the get index .
1489 usleep(1);
1491 m_map_idx_process = m_map_idx ;
1495 * DATE: 2010-11-16
1496 * FUNCTION: a thread function used to run SfsMethod::call_sfs() function.
1497 * PARAMETER: __Args the parameter structure.
1498 * RETURN:
1500 void *_sfsMethod_callsfs(void * __Args)
1502 // get args.
1503 BIG_CALL_SFS_ARGS * args = (BIG_CALL_SFS_ARGS*)__Args;
1504 // run the function.
1505 args->sfsMethod->call_SFS(args->para, args->files);
1506 return NULL;