2 ******************************************************************************
7 *CREATE DATE : 2010-9-13
9 *FUNCTION : a class that contain method about sfs.
10 *FILE NAME : SfsMethod.cpp
11 *UPDATE DATE : 2010-9-13
12 *UPDATE BY : Bill Tang
13 *UPDATE DATE : 2010-9-27
14 *UPDATE BY : Bill Tang
15 *******************************************************************************
18 #include "SfsMethod.h"
21 #define BE_PROCESSED -1
23 bool operator <= (const loci
& first
,const loci
& second
) {
24 if(first
.chromo
==second
.chromo
)
25 return first
.position
<=second
.position
;
27 return first
.chromo
<=second
.chromo
;
31 bool operator > (const loci
& first
,const loci
& second
) {
32 return (!(first
<= second
));
35 bool operator == (const loci
& first
,const loci
& second
) {
36 if(first
.chromo
==second
.chromo
)
37 return first
.position
==second
.position
;
39 return first
.chromo
==second
.chromo
;
42 SfsMethod::SfsMethod()
52 m_map_idx_process
= BE_PROCESSED
;
53 sem_init(&sem_call_sfs
, 0, 0);
54 sem_init(&sem_call_sfs_return
, 0, 1);
55 //sem_init(&sem_map_number, 0, 2);
59 pthread_mutex_t
SfsMethod::m_pthreadMutex
= PTHREAD_MUTEX_INITIALIZER
;
61 SfsMethod::SfsMethod(int numInds
)
64 depending on major/minor we will only use 3 genotypes,
65 so we will need a small lookup table to decide which columns to extract from the glf file
74 for(int i
= 0; i
< 4; i
++)
76 for(int j
= i
; j
< 4; j
++)
84 generate_likeLookup();
88 likes
= allocDoubleArray(1);
89 bayes
= allocDoubleArray(1);
94 generate_binom(numInds
);
95 likes
= allocDoubleArray(2 * numInds
+ 1);
96 bayes
= allocDoubleArray(2 * numInds
+ 1);
103 * FUNCTION: initialization :likes , bays, binomLookup matrix
104 * PARAMETER: numInds: the individual number.
107 int SfsMethod::init(const int numInds
)
110 depending on major/minor we will only use 3 genotypes,
111 so we will need a small lookup table to decide which columns to extract from the glf file
120 for(int i
= 0; i
< 4; i
++)
122 for(int j
= i
; j
< 4; j
++)
130 generate_likeLookup();
134 likes
= allocDoubleArray(1);
135 bayes
= allocDoubleArray(1);
140 generate_binom(numInds
);
141 likes
= allocDoubleArray(2 * numInds
+ 1);
142 bayes
= allocDoubleArray(2 * numInds
+ 1);
149 SfsMethod::~SfsMethod(void)
151 if (binomLookup
!= NULL
)
153 delete [] binomLookup
;
171 sem_destroy(&sem_call_sfs
);
172 sem_destroy(&sem_call_sfs_return
);
175 void outMap(aMap::iterator
&itr
, int numInds
)
177 cerr
<< "chr : " << itr
->first
.chromo
<< " pos : " << itr
->first
.position
;
178 datum p
= itr
->second
;
179 cerr
<< " ref :" << p
.ref
<< " major : " << p
.major
<< " minor: " << p
.minor
<< endl
;
181 cerr
<< "lk :" << endl
;
182 for (int j
= 0; j
< numInds
; j
++)
184 for (int i
= 0; i
< 10; i
++)
185 cerr
<< lk
[4 * j
+ i
] << "\t";
189 int *locus
= p
.locus
;
190 cerr
<< "locus: " << endl
;
191 for (int j
= 0; j
<= numInds
; j
++)
193 for (int i
= 0; i
< 4; i
++)
194 cerr
<< locus
[4 * j
+ i
] << "\t";
197 cerr
<< "phat : " << p
.phat
;
198 cerr
<< endl
<< endl
;
203 * FUNCTION: the function for algo
204 * PARAMETER: asso : the maps of the datums
205 * numInds : the number of individuals
207 * pvar : the probability of being variable
208 * B :the bias correction coefficient
209 * sfsfile : sfs file pionter
212 * singleMinor : we should loop over 3 possible minor or one.
213 * pThres : select the out put threshold of snp £¬default is 0.01
216 * UPDATE:2010-10-21, add a set phat condition.
217 * UPDATE:2010-11-29, control the sfs file column
219 void SfsMethod::algo(aMap
& asso
, int numInds
, double eps
, double pvar
, double B
, FILE * sfsfile
,
220 int normalize
, int alternative
, int singleMinor
, double pThres
, int allowPhatZero
, int firstlast
)
224 //algorithm goes on by a site on site basis
225 double *pis
= new double[numInds
];
226 double *wis
= new double[numInds
];
227 double *cf
= new double[numInds
];//the bias coefiecients
228 double *sumMinors
= allocDoubleArray(2*numInds
+1);//hte sum of the 3 different minors
229 double *hj
= allocDoubleArray(2*numInds
+1);
231 for(aMap::iterator it
=asso
.begin(); it
!=asso
.end(); it
++) {
233 //outMap(it, numInds);
234 datum p
= it
->second
;
236 if (p
.major
> 3 || p
.minor
==p
.major
){
237 //printf("never here\n");
241 //set the resultarray to zeros
242 for(int sm
=0 ; sm
<(2*numInds
+1) ; sm
++ )
245 //loop through the 3 different minors
246 for(int minor_offset
=0;minor_offset
<4;minor_offset
++) {
248 if(minor_offset
== p
.major
)
251 //hook for only calculating one minor
253 minor_offset
= p
.minor
;
255 int major_offset
= p
.major
;
256 int Aa_offset
= glfLookup
[minor_offset
][major_offset
];//0-9
257 int AA_offset
= glfLookup
[minor_offset
][minor_offset
];//0-9
258 int aa_offset
= glfLookup
[major_offset
][major_offset
];//0-9
261 for(int i
=0;i
<numInds
;i
++) {
262 int ni
= p
.locus
[i
*4+minor_offset
];
263 int nt
= p
.locus
[i
*4+minor_offset
] + p
.locus
[i
*4+major_offset
];
265 if(B
!=0)//bias version
266 cf
[i
] = pow((0.5-B
),ni
)*pow((0.5+B
),nt
-ni
)*pow(0.5,-nt
);
270 //FIX if we have a very large number ( paralogue sites), the cf[i] will overflow, and it should be zero
271 if(isinf_local(cf
[i
])||isnan_local(cf
[i
])){
272 // fprintf(stderr,"POSSIBLE ERROR: Problems calculating bias correction coefficient, site might be paralogue?\t%d\t%d\n",it->first.chromo,it->first.position);
273 cerr
<< "POSSIBLE ERROR: Problems calculating bias correction coefficient, site might be paralogue?\t" << it
->first
.chromo
<< "\t" << it
->first
.position
<< endl
;
276 if(nt
==0){//if we dont have any reads for individual 'i'
281 pis
[i
] = (ni
-eps
*nt
)/(nt
*(1-2*eps
));
282 wis
[i
] = 2.0*nt
/(nt
+1.0);
286 for(int i
=0;i
<numInds
;i
++)
287 tmp
+= (pis
[i
]*wis
[i
]);
289 double phat
= std::max(0.0,tmp
/sum(wis
,numInds
));//original
291 // phat = std::max(pvar/(4*numInds),tmp/sum(wis,numInds)); //ramus old
293 if(allowPhatZero
==0){
295 for (int pl
=0;pl
<4;pl
++) depth
+= p
.locus
[4*numInds
+pl
];
296 phat
= std::max(pvar
/std::min(10,depth
),phat
); //thorfinn new
300 double newc
= (0.5-B
)/(0.5+B
);
301 phat
= std::min(1.0,phat
/(newc
+phat
*(1.0-newc
)));
304 // change at 2010-10-21
305 if (minor_offset
== p
.minor
)
307 it
->second
.phat
= phat
;
311 // change on 2010-9-18
312 memset(hj
, 0, sizeof(double) * (2*numInds
+1));
313 /*for(int i = 0; i < 2*numInds+1; ++i)
319 for(int index
=0;index
<(2*numInds
+1);index
++)
320 hj
[index
]=log(hj
[index
]);
325 double gaa_prod
= 1.0; //product of the major genotyp likelihood
326 for(int i
=0 ; i
<numInds
;i
++) {
329 GAA
= likeLookup
[p
.lk
[i
*10+AA_offset
]];
330 GAa
= likeLookup
[p
.lk
[i
*10+Aa_offset
]];
331 Gaa
= likeLookup
[p
.lk
[i
*10+aa_offset
]];
333 GAA
= exp(p
.lk
[i
*10+AA_offset
]);
334 GAa
= exp(p
.lk
[i
*10+Aa_offset
]);
335 Gaa
= exp(p
.lk
[i
*10+aa_offset
]);
337 // printf("GAA=%.15f\tGAa=%.15f\tGaa=%.15f\n",GAA,GAa,Gaa);
338 //printf("sums g,=%.15f\n",GAa+GAA+Gaa);
341 double sums
= GAA
+GAa
+Gaa
;//this is essential for floating point precision
351 MAA
= GAA
* phat
*phat
;
352 MAa
= GAa
* 2*(1-phat
)*phat
*cf
[i
]; //DRAGON remember to add cf[i]
353 Maa
= Gaa
* (1-phat
)*(1-phat
);
355 // fprintf(stderr,"hereee\n");
360 //printf("mAA=%.15f\tmAa=%.15f\tMaa=%.15f\n",MAA,MAa,Maa);
362 if(1 &&(isnan_local(Maa
) || isnan_local(MAa
) || isnan_local(MAA
))){
363 //fprintf(stderr,"Possible underflow at: \t%d\t%d\n",it->first.chromo,it->first.position);
364 cerr
<< "Possible underflow at: \t" << it
->first
.chromo
<< "\t" << it
->first
.position
<< endl
;
365 goto next_allele
;// change 2010.9.18
369 PAA
=log(MAA
);///(MAA+MAa+Maa);
370 PAa
=log(MAa
);///(MAA+MAa+Maa);
371 Paa
=log(Maa
);///(MAA+MAa+Maa);
372 // printf("sum af P's=%f\n",PAA+PAa+Paa);
374 PAA
=(MAA
);///(MAA+MAa+Maa);
375 PAa
=(MAa
);///(MAA+MAa+Maa);
376 Paa
=(Maa
);///(MAA+MAa+Maa);
380 PAA
=log(MAA
/(MAA
+MAa
+Maa
));
381 PAa
=log(MAa
/(MAA
+MAa
+Maa
));
382 Paa
=log(Maa
/(MAA
+MAa
+Maa
));
385 //check for underflow error, this should only occur once in a blue moon
386 if(isnan_local(Paa
)||isnan_local(PAa
)||isnan_local(PAA
)){
387 printf("fixing nan error at : \t%s\t%d\n",it
->first
.chromo
.c_str(),it
->first
.position
);
388 goto next_allele
;// change 2010.9.18
391 if(isnan_local(Paa
)||isnan_local(PAa
)||isnan_local(PAA
)){
392 //fprintf(stderr,"Possible underflow at: \t%d\t%d\n",it->first.chromo,it->first.position);
393 cerr
<< "Possible underflow at: \t" << it
->first
.chromo
<< "\t" << it
->first
.position
<< endl
;
394 printf("PAA=%f\tPAa=%f\tPaa=%f\tMAA=%f\tMAa=%f\tMaa=%f\n",PAA
,PAa
,Paa
,MAA
,MAa
,Maa
);
399 if (Paa
> PAa
&& Paa
> PAA
) mymax
= Paa
;
400 else if (PAa
> PAA
) mymax
= PAa
;
407 totmax
= totmax
+ log(mymax
);
415 for(int j
=2*(i
+1); j
>1;j
--){
418 tmp
=log(exp(PAA
+hj
[j
-2])+exp(PAa
+hj
[j
-1])+exp(Paa
+hj
[j
]));
420 tmp
= PAA
*hj
[j
-2]+PAa
*hj
[j
-1]+Paa
*hj
[j
];
422 if(isnan_local(cf
[i
]))
423 //fprintf(stderr,"cfi is inf\n");
424 cerr
<< "cfi is inf" <<endl
;
425 if(isnan_local(tmp
)){
426 //fprintf(stderr,"jis nan in algorithm:%d\n",j );
427 cerr
<< "jis nan in algorithm: " << j
<<endl
;
428 //fprintf(stderr,"%f\t%f\t%f\t%f\t%f\n",PAA,PAa,Paa,mymax,cf[i]);
429 cerr
<< PAA
<< "\t" << PAa
<< "\t" << Paa
<< "\t" << mymax
<< "\t" << cf
[i
] <<endl
;
430 //fprintf(stderr,"at : \t%d\t%d\n",it->first.chromo,it->first.position);
431 cerr
<< "at : " << it
->first
.chromo
<< "\t" << it
->first
.position
<< endl
;
433 goto next_allele
;// change 2010.9.18
439 hj
[1] = log(exp(Paa
+hj
[1])+exp(PAa
+hj
[0]));
443 hj
[1] = Paa
*hj
[1] + PAa
*hj
[0];
448 //recursion done, now do the normalization, and extrastuff
451 //populate the log(M) vector
452 double *m
= allocDoubleArray(2*numInds
+1);
453 for(int mi
=0;mi
<(2*numInds
+1);mi
++)
454 m
[mi
] = binomLookup
[mi
]*pow(phat
,mi
)*pow(1-phat
,2*numInds
-mi
);
456 //now add the mi to the hi,
457 for(int i
=0;i
<(2*numInds
+1) ;i
++)
458 hj
[i
] = hj
[i
] * m
[i
];
460 // change on 2010-9-25
464 if(0) // this is a artifact from the no-underflow version
465 for (int i
=0;i
<(2*numInds
+1);i
++)
468 for (int i
=0;i
<(2*numInds
+1);i
++)
469 hj
[i
] = exp(log(hj
[i
])+totmax
);
471 double bottom
= pvar
*sum(hj
,2*numInds
+1)+(1-pvar
)*gaa_prod
;
474 if(bottom
==0||isnan_local(bottom
)||isinf_local(bottom
)){
475 //fprintf(stderr,"bottom equal zero should not happen: %f\n",bottom);
476 cerr
<< "bottom equal zero should not happen: " << bottom
<<endl
;
477 //fprintf(stderr,"at : \t%d\t%d\n",it->first.chromo,it->first.position);
478 cerr
<< "at : " << "\t" <<it
->first
.chromo
<< "\t" << it
->first
.position
<<endl
;
479 //fprintf(skipped_sites_fp,"%s\t%d\n",it->first.chromo.c_str(),it->first.position);
480 //sfsfile << it->second.chromo << "\t" << it->second.position << "\t-1" << endl;
481 fprintf(sfsfile
,"%s\t%d\t-1\n",it
->first
.chromo
.c_str(),it
->first
.position
);
484 goto next_allele
; // change on 2010.9.18
491 //now input the normalized hj array
492 hj
[0] = ((pvar
*hj
[0])+(1-pvar
)*gaa_prod
)/bottom
;//prod(p.lk,numInds,2))/bottom;
493 for(int i
=1;i
<(2*numInds
+1) ; i
++)
494 hj
[i
] = (hj
[i
]*pvar
/bottom
);
496 //validate that everything looks fine
497 if(sum(hj
,2*numInds
+1)>1.0000001){
498 printf("This shouldnt occur, it means that the sfs for a loci has an improper prob space\n");
499 printf("sums is:%f\n",sum(hj
,2*numInds
+1));
500 printf("at : \t%s\t%d\n",it
->first
.chromo
.c_str(),it
->first
.position
);
501 goto next_allele
; // change 2010.9.18
504 //update the sumSFS of the site
505 for(int ss
=0;ss
<(2*numInds
+1);ss
++)
506 sumMinors
[ss
] += hj
[ss
];
507 likes
[getMaxId(hj
,2*numInds
+1)]++;
514 for(int i
=0;i
<(2*numInds
+1);i
++)
516 sumMinors
[i
] = sumMinors
[i
]/3.0;
518 //update the global bayes/like estimates
519 for(int i
=0;i
<(2*numInds
+1);i
++)
520 bayes
[i
] += sumMinors
[i
];
523 if((1-sumMinors
[0]-sumMinors
[2*numInds
])>pThres
){
524 //sfsfile << it->second.chromo << "\t" << it->second.position << "\t";
525 fprintf(sfsfile
,"%s\t%d\t",it
->first
.chromo
.c_str(),it
->first
.position
);
526 //sfsfile << getChar(it->second.major) << "\t" << getChar(it->second.minor) << "\t";
527 fprintf(sfsfile
,"%c\t%c\t",getChar(it
->second
.major
),getChar(it
->second
.minor
));
528 //update 11-29 for control the sfs file column
529 //fprintf(sfsfile,"%f\t",sumMinors[i]);
530 for(int i
=0;i
<(2*numInds
);i
++)
532 if ( firstlast
== 1 ) //only output the first and last column to the sfs file
534 fprintf(sfsfile
,"%f\t",sumMinors
[i
]);
539 fprintf(sfsfile
,"%f\t",sumMinors
[i
]);
542 //sfsfile << sumMinors[i] << "\t";
543 //sfsfile << sumMinors[2*numInds] << endl;
544 fprintf(sfsfile
,"%f\n",sumMinors
[2*numInds
]);
564 * FUNCTION: do algo joint function
565 * PARAMETER: asso : the maps of the datums
566 * numInds : the number of individuals
568 * pvar : the probability of being variable
569 * B : the bias correction coefficient
570 * sfsfile : sfs file pionter
573 * singleMinor : should we loop over 3 possible minor. MAYBE
577 void SfsMethod::algoJoint(aMap
& asso
, int numInds
, double eps
, double pvar
, double B
, FILE * sfsfile
,
578 int underFlowProtect
, int alternative
, int singleMinor
, int fold
)
581 //algorithm goes on by a site on site basis //pretty much the same as 'algo' without the prior phat
583 double sumMinors
[2*numInds
+1]; //the sum of the 3 different minors
585 for(aMap::iterator it
=asso
.begin(); it
!=asso
.end(); it
++) {//loop over sites
586 datum p
= it
->second
;
590 if (p
.major
> 3 || p
.minor
==p
.major
){
591 // printf("never runs\n");
595 //set the resultarray to zeros
596 for(int sm
=0 ; sm
<(2*numInds
+1) ; sm
++ )
599 //loop through the 3 different minors
600 for(int minor_offset
=0;minor_offset
<4;minor_offset
++) {
602 if(minor_offset
== p
.major
)
605 //hook for only calculating one minor
607 minor_offset
= p
.minor
;
609 int major_offset
= p
.major
;
610 int Aa_offset
= glfLookup
[minor_offset
][major_offset
];//0-9
611 int AA_offset
= glfLookup
[minor_offset
][minor_offset
];//0-9
612 int aa_offset
= glfLookup
[major_offset
][major_offset
];//0-9
615 double hj
[2*numInds
+1];
616 for(int index
=0;index
<(2*numInds
+1);index
++)
617 if(underFlowProtect
==0)
623 for(int i
=0 ; i
<numInds
;i
++) {
624 //printf("AA=%d\tAa=%d\taa=%d\n",p.lk[i*3+AA],p.lk[i*3+Aa],p.lk[i*3+aa]);
626 #ifndef RASMUS_INPUT //abi input
627 GAA
= log(likeLookup
[p
.lk
[i
*10+AA_offset
]]);
628 if(p
.lk
[i
*10+AA_offset
]==0 && p
.lk
[i
*10+Aa_offset
]==0 && p
.lk
[i
*10+aa_offset
]==0)
629 GAa
= log(likeLookup
[p
.lk
[i
*10+Aa_offset
]]);
631 GAa
= log(2.0*likeLookup
[p
.lk
[i
*10+Aa_offset
]]);
632 Gaa
= log(likeLookup
[p
.lk
[i
*10+aa_offset
]]);
634 GAA
= p
.lk
[i
*10+AA_offset
];
635 GAa
= log(2.0)+p
.lk
[i
*10+Aa_offset
];
636 Gaa
= p
.lk
[i
*10+aa_offset
];
639 if(underFlowProtect
==0){
646 PAA
=(GAA
);///(MAA+MAa+Maa);
647 PAa
=(GAa
);///(MAA+MAa+Maa);
648 Paa
=(Gaa
);///(MAA+MAa+Maa);
651 //check for underflow error, this should only occur once in a blue moon
652 if(isnan_local(Paa
)||isnan_local(PAa
)||isnan_local(Paa
)){
653 //fprintf(stderr,"Possible underflow at: \t%d\t%d\n",it->first.chromo,it->first.position);
654 cerr
<< "Possible underflow at: \t" << it
->first
.chromo
<< "\t" << it
->first
.position
<< endl
;
655 printf("PAA=%f\tPAa=%f\tPaa=%f\n",PAA
,PAa
,Paa
);
664 for(int j
=2*(i
+1); j
>1;j
--){
665 //print_array(hj,2*numInds+1);
667 if(underFlowProtect
==1)
668 tmp
=addProtect3(PAA
+hj
[j
-2],PAa
+hj
[j
-1],Paa
+hj
[j
]);
670 tmp
= PAA
*hj
[j
-2]+PAa
*hj
[j
-1]+Paa
*hj
[j
];
672 if(isnan_local(tmp
)){
673 printf("jis nan:%d\n",j
);
674 print_array(hj
,2*numInds
+2);
681 if(underFlowProtect
==1){
682 hj
[1] = addProtect2(Paa
+hj
[1],PAa
+hj
[0]);
686 hj
[1] = Paa
*hj
[1] + PAa
*hj
[0];
693 for(int i
=0;i
<(2*numInds
+1);i
++)
694 if(underFlowProtect
==0)
695 sumMinors
[i
] += exp(log(hj
[i
])-log(bico(2*numInds
,i
)));
697 sumMinors
[i
] += exp(hj
[i
]-log(bico(2*numInds
,i
)));
703 //printit by rasmus 0.98 modified by thorfinn 0.981
705 for(int i
=0;i
<(2*numInds
+1);i
++)
707 sumMinors
[i
] = log(sumMinors
[i
]);
709 sumMinors
[i
] = log(sumMinors
[i
]/3);
712 int newDim
= numInds
+1;
713 for(int i
=0;i
<newDim
-1;i
++)// we shouldn't touch the last element
714 sumMinors
[i
] = log(exp(sumMinors
[i
]-log(2.0)) + exp(sumMinors
[2*numInds
-i
]-log(2.0)));//THORFINN NEW
715 //sfsfile << sumMinors;
716 fwrite(sumMinors
,sizeof(double),newDim
,sfsfile
);
718 //sfsfile << sumMinors;
719 fwrite(sumMinors
,sizeof(double),2*numInds
+1,sfsfile
);
727 * FUNCTION: write frequnce file
729 * pFile : a file pointer
730 * numInds : the number of individuals
731 * asso : the maps of the datums
735 void SfsMethod::writeFreq(FILE * pFile
, int numInds
, aMap
& asso
){
739 db
=fopen("indcounts.txt","w");
740 for(aMap::iterator it
=asso
.begin(); it
!=asso
.end(); it
++){
741 //loci l = it->first;
742 //datum d = it->second;
743 //outMap(it, numInds);
745 datum d
= it
->second
;
746 if (d
.major
> 3 || d
.minor
> 3 || it
->first
.position
== 0)
751 //pFile << d.chromo << "\t" << d.position << "\t";
752 fprintf(pFile
,"%s\t%d\t",it
->first
.chromo
.c_str(),it
->first
.position
) ;
753 //pFile << getChar(d.major) << "\t" << getChar(d.minor) << "\t";
754 fprintf(pFile
,"%c\t%c\t",getChar(d
.major
),getChar(d
.minor
)) ;
755 int *bases
= d
.locus
;
757 fprintf(pFile
,"%d\t",bases
[4*numInds
+i
]);
758 //pFile << bases[4*numInds+i] << "\t";
760 fprintf(pFile
,"%d\t",bases
[4*numInds
+d
.major
]+bases
[4*numInds
+d
.minor
]);
761 //pFile << bases[4*numInds+d.major]+bases[4*numInds+d.minor] << "\t";
763 fprintf(pFile
,"%d\t",bases
[4*numInds
+d
.major
]);
764 //pFile << bases[4*numInds+d.major] << "\t";
765 //pFile << d.phat << endl;
766 fprintf(pFile
,"%f\n",d
.phat
);
769 for(int i
=0;i
<4*numInds
-1;i
++)
770 fprintf(db
,"%d ",bases
[i
]);
771 fprintf(db
,"%d\n",bases
[4*numInds
-1]);
781 * FUNCTION: allocation functions, this shouldn't look magical for anyone
782 * PARAMETER: len is the requested length of double array.
783 * RETURN: the pointer to the new double array.
785 double* SfsMethod::allocDoubleArray(int len
)
787 double *ret
= new double[len
];
788 memset(ret
, 0, sizeof(double) * len
);
789 /*for(int i=0;i<len;i++)
796 * FUNCTION: test for infinity
797 * PARAMETER: x is the double to be tested.
800 int SfsMethod::isinf_local(double x
)
802 #ifdef __INTEL_COMPILER
805 return std::isinf(x
);
812 * FUNCTION: test for a NaN
813 * PARAMETER: x is the double to be tested.
816 int SfsMethod::isnan_local(double x
)
818 #ifdef __INTEL_COMPILER
821 return std::isnan(x
);
827 * FUNCTION: add up all the value form the ary
828 * PARAMETER: ary is the double array pointer, len is the array's length.
831 double SfsMethod::sum(const double* ary
, int len
)
834 for(int i
=0;i
<len
; i
++)
841 * FUNCTION: generate like look up table.
845 void SfsMethod::generate_likeLookup()
847 for (short int i
=0;i
<=255;i
++)
848 likeLookup
[i
] = calcLikeRatio(i
); //(pow(10.0,-i/10.0));
854 * FUNCTION: generate binom. input should be number of individuals
855 * PARAMETER: k is the individuals' number.
858 void SfsMethod::generate_binom(int k
)
860 binomLookup
= new double[k
*2+1];
861 for(int i
=0;i
<(k
*2+1);i
++)
862 binomLookup
[i
] = bico(k
*2,i
);
867 * FUNCTION : To calculate the nCk ( n combination k)
868 * PARAMETER : n is the number of individulals.
869 * k is the individuals' number.
872 double SfsMethod::bico(int n
, int k
)
874 return floor(0.5+exp(factln(n
)-factln(k
)-factln(n
-k
)));
879 * FUNCTION: generate factln
880 * PARAMETER: n : number of individuals
883 double SfsMethod::factln(int n
)
885 static double a
[101]; //A static array is automatically initialized to zero.
886 if (n
< 0) printf("Negative factorial in routine factln: %d \n", n
);
887 if (n
<= 1) return 0.0;
888 if (n
<= 100) return a
[n
] ? a
[n
] : (a
[n
]=gammln(n
+1.0));
889 else return gammln(n
+1.0);
894 * FUNCTION: generate gammln
898 double SfsMethod::gammln(double xx
)
901 static double cof
[6]={76.18009172947146,-86.50532032941677,
902 24.01409824083091,-1.231739572450155,
903 0.1208650973866179e-2,-0.5395239384953e-5};
907 tmp
-= (x
+0.5)*log(tmp
);
908 ser
=1.000000000190015;
909 for (j
=0;j
<=5;j
++) ser
+= cof
[j
]/++y
;
910 return -tmp
+log(2.5066282746310005*ser
/x
);
915 * FUNCTION: get max id
916 * PARAMETER: d : the double array pointer
917 * len £º the array's length.
920 int SfsMethod::getMaxId(double *d
,int len
){
922 for(int i
=1; i
<len
; i
++)
930 * FUNCTION: clean map
931 * PARAMETER: asso : the maps of the datums
934 void SfsMethod::cleanUpMap(aMap
& asso
)
936 for(aMap::iterator it
=asso
.begin();it
!=asso
.end();it
++)
938 delete [] it
->second
.lk
;
939 delete [] it
->second
.locus
;
945 * FUNCTION: get char from int
946 * PARAMETER: i : genetype number
949 char SfsMethod::getChar(int i
){
959 puts("Error getting char value of base");
965 * FUNCTION: add protect
966 * PARAMETER: ary : the double array pointer
967 * len £º the array's length.
968 * RETURN: protect add result .
970 double SfsMethod::addProtect(double *ary
,int len
){
971 double maxVal
= ary
[0];
972 for(int i
=1;i
<len
;i
++)
977 for(int i
=0;i
<len
;i
++)
978 sumVal
+= exp(ary
[i
]-maxVal
);
979 return log(sumVal
) + maxVal
;
985 * FUNCTION: add three protect
986 * PARAMETER: a,b,c : double
988 * RETURN: three protect add result .
990 double SfsMethod::addProtect3(double a
,double b
, double c
){
991 double maxVal
;// = std::max(a,std::max(b,c));
998 double sumVal
= exp(a
-maxVal
)+exp(b
-maxVal
)+exp(c
-maxVal
);
999 return log(sumVal
) + maxVal
;
1004 * FUNCTION: add two protect
1005 * PARAMETER: a,b : double
1007 * RETURN: two protect add result .
1009 double SfsMethod::addProtect2(double a
,double b
){
1010 double maxVal
= std::max(a
,b
);
1012 double sumVal
= exp(a
-maxVal
)+exp(b
-maxVal
);
1013 return log(sumVal
) + maxVal
;
1018 * FUNCTION: print array
1019 * PARAMETER: ary : the double array pointer
1020 * len £º the array's length.
1023 void SfsMethod::print_array(double *ary
,int len
){
1024 for (int i
=0;i
<len
;i
++)
1025 printf("%f\t",(ary
[i
]));
1030 allocates the arrays needed for one site for all individuals
1034 * FUNCTION: allocates the arrays needed for one site for all individuals
1035 * PARAMETER: numInds : the individuals' number.
1036 * RETURN: datum structure.
1038 datum
SfsMethod::allocDatum(int numInds
){
1040 //4 times the number of individuals +4 (the last 4elems will be the base sum of all inds)
1041 data
.locus
=allocIntArray(4*numInds
+4);
1042 data
.lk
=allocIntArray(10*numInds
);//allocUnsignedCharArray(3*numInds);
1044 // data.position = 0;
1045 // data.chromo = "";
1046 // data.is_be_record = false;
1052 * FUNCTION: allocates the arrays needed for one site for all individuals
1053 * PARAMETER: len : array length.
1054 * RETURN: array pointer.
1056 int* SfsMethod::allocIntArray(int len
){
1057 // printf("allocing a int array:%d\n",len);
1062 cerr
<< "\tallocIntArray: new operation failed!" << endl
;
1065 /*for(int i=0;i<len;i++)
1067 memset(ret
, 0, sizeof(int) * len
);
1073 * FUNCTION: will calculate the sums and input the values in the last 4 slots,
1074 * and set the minor, major accordingly
1075 * PARAMETER: asso : the maps of the datums
1076 * numInds : the number of individuals
1079 void SfsMethod::calcSumBias(aMap
& asso
,int numInds
){
1080 // for(aMap::iterator it = asso.begin(); it != asso.end(); ++it){
1081 for(aMap::iterator it
=asso
.begin(); it
!=asso
.end(); it
++){
1082 int *bases
= it
->second
.locus
; //4length array of basesums
1083 for(int i
=0;i
<numInds
;i
++){
1084 bases
[4*numInds
+A
] += bases
[i
*4+A
];
1085 bases
[4*numInds
+C
] += bases
[i
*4+C
];
1086 bases
[4*numInds
+G
] += bases
[i
*4+G
];
1087 bases
[4*numInds
+T
] += bases
[i
*4+T
];
1089 //now get the major/minor
1091 // int maj = it->second.major;
1094 maj
= it
->second
.ref
;
1100 for(int i
=0;i
<4;i
++)
1102 if(maj
==i
) //we should check the against the major allele
1104 else if (bases
[numInds
*4+i
] > temp
){
1106 temp
= bases
[numInds
*4+i
];
1115 it
->second
.minor
= min
;
1116 it
->second
.major
= maj
;
1123 * FUNCTION: soaplikelihood to sfslikelihood
1124 * PARAMETER: type_likely:sopa likelihood. likelihood : sfs likelihood
1127 void SfsMethod::soaplk_sfslk(int * likelihood
, double * type_likely
, int id
)
1129 assert(type_likely
!= NULL
&& likelihood
!= NULL
);
1135 for (allele1
= 0; allele1
!= 4; allele1
++)
1137 // Find the largest likelihood
1138 for (allele2
= allele1
; allele2
!= 4; allele2
++)
1140 genotype
= allele1
<<2|allele2
;
1141 if (type_likely
[genotype
] > type_likely
[type1
])
1148 for(int i
= 0; i
!= 10; i
++)
1150 if(type_likely
[type1
] - type_likely
[sfs_type
[i
]] > 25.5)
1152 likelihood
[10 * id
+ i
] = 255;
1156 likelihood
[10 * id
+ i
] = 10 * (type_likely
[type1
] - type_likely
[sfs_type
[i
]]);// sfs_type[10]={0,1,3,2,5,7,6,15,11,10}; AA,AC,AG,AT,CC,CG,CT,GG,GT,TT
1163 * FUNCTION: copy source's coverage to the dest
1164 * PARAMETER: dest: the object array. source : the source array. id the individual's index.
1167 void SfsMethod::cov_Cpy(int *dest
, int const *source
, int id
)
1169 assert(source
!= NULL
&& dest
!= NULL
);
1170 dest
[4 * id
+ 0] = source
[0];
1171 dest
[4 * id
+ 1] = source
[1];
1172 dest
[4 * id
+ 2] = source
[3];
1173 dest
[4 * id
+ 3] = source
[2];
1179 * FUNCTION: alloc vector memery
1183 void SfsMethod::allocVec(void)
1185 /* //asso = new aMap();
1187 asso = new aVector[global_win_size];
1190 cerr << "\tallocVec : new opreation failed!" << endl;
1193 for (int i = 0; i < global_win_size; ++i)
1194 asso[i] = allocDatum(m_numInds);
1201 * FUNCTION: free the map memery
1202 * PARAMETER: amap the pointer to be deleted.
1205 void SfsMethod::delMap(aMap
&amap
)
1207 // clean up the map's node
1216 * PARAMETER: para: the parameters files: Files pointer.
1219 int SfsMethod::call_SFS(Parameter
* para
, Files
* files
)
1223 return POINTER_NULL
;
1226 sem_t
* sem_call_sfs_p
= &(sem_call_sfs
);
1227 sem_t
* sem_call_sfs_return_p
= &(sem_call_sfs_return
);
1228 ///////////////////////time //////////////////
1229 //static time_t sumtime = 0;
1230 //time_t start = 0, end = 0;
1232 //////////////////////////////////////////////
1234 //sem_t * sem_map_number_p = &(sem_map_number);
1237 sem_wait(sem_call_sfs_p
);
1239 int tmp
= getidxProcess();
1241 if (file_end_flag
== 0)
1243 sem_post(sem_call_sfs_return_p
);
1245 SFS_PARA
*sfs
= para
->sfs
;
1246 //int tmp = getidxProcess();
1247 calcSumBias(asso
[tmp
], m_numInds
);//get major and minor
1251 algo(asso
[tmp
], m_numInds
, sfs
->eps
, sfs
->pvar
, sfs
->bias
, files
->sfsfile
, sfs
->under_FP
, sfs
->alternative
, sfs
->sigle_MB
, sfs
->pThres
, sfs
->allow_PathZ
, sfs
->sfs_first_last
);
1256 algoJoint(asso
[tmp
], m_numInds
, sfs
->eps
, sfs
->pvar
, sfs
->bias
, files
->jointSfsfile
, sfs
->under_FP
, sfs
->alternative
, sfs
->sigle_MJ
, sfs
->jointFold
);
1260 // output frequence.
1261 writeFreq(files
->freqfile
, m_numInds
, asso
[tmp
]);
1265 //////////////////////////////////////////
1267 //sumtime += end - start;
1268 //clog << "sfs tmp sumtime : "<< sumtime <<endl;
1269 //////////////////////////////////////////
1270 //sem_post(sem_map_number_p);
1271 if (file_end_flag
== 1)
1277 // get the data what sfs want
1280 * FUNCTION: get the data what sfs want, include site's pos, chr name, depth, and the 10 likelihoods.
1281 * PARAMETER: site: the allete information.
1282 * mat: Prob_matrix point, include likelihoods information.
1283 * chr: chromosome name.
1284 * id: the individual's index.
1286 * UPDATE: 2010-10-14 Add a pthread_mutex_lock.
1288 int SfsMethod::getMapData(const Pos_info
& site
, const Prob_matrix
* mat
, const std::string
& chr
, const int id
)
1290 assert(id
< m_numInds
);
1291 loci tmp
= {chr
, site
.pos
+ 1};
1293 pthread_mutex_lock(&m_pthreadMutex
);
1294 aMap::iterator itr
= asso
[m_map_idx
].find(tmp
);
1295 int *the_line
= NULL
;
1298 if (itr
== asso
[m_map_idx
].end())
1301 datum dats
= allocDatum(m_numInds
);
1302 // because soapsnp not the same with realsfs in base sort, snp is ACTG, sfs is ACGT.
1305 else if (site
.ori
== 3)
1308 dats
.ref
= site
.ori
;
1309 the_line
= dats
.locus
;
1311 asso
[m_map_idx
].insert(std::make_pair(tmp
, dats
));
1312 // asso[tmp] = dats;
1316 the_line
= itr
->second
.locus
;
1317 lk
= itr
->second
.lk
;
1318 assert(itr
->first
.position
== tmp
.position
);
1321 pthread_mutex_unlock(&m_pthreadMutex
);
1324 cov_Cpy(the_line
, site
.count_sfs
, id
);
1326 soaplk_sfslk(lk
, mat
->type_likely
, id
);
1335 // This builds a map from: char* -> int
1338 * FUNCTION: This builds a map from: char* -> int
1339 * such that each chromosome is represented as an integer
1340 * input is a file looking like
1343 * or a .fai generated file from samtools
1344 * PARAMETER: fname : file name
1347 void SfsMethod::buildMap(const char* fname
)
1350 pfile
.open(fname
,std::ios::in
);
1353 //fprintf(stderr,"\t-> Building chromosome index\n");
1354 cerr
<< "\t-> Building chromosome index" <<endl
;
1355 while(pfile
.getline(buffer
,1024) ) {
1356 char* chr
= strdup(strtok(buffer
,"\t"));
1357 m_faiIndex
.insert(std::make_pair(chr
,id
++));
1358 //fprintf(stderr,"\t-> %s->%d\n",chr,id-1);
1359 cerr
<< "\t->" << chr
<< "->" << id
-1 <<endl
;
1361 //fprintf(stderr,"\t-> Done building chromosome index\n");
1362 cerr
<< "\t-> Done building chromosome index" << endl
;
1366 // delete the vector's data
1369 * FUNCTION: free the vector's data memery
1370 * PARAMETER: avec the aVector to be deleted.
1373 void SfsMethod::delVector(aVector
* avec
)
1375 for (int i
= 0; i
< global_win_size
; i
++)
1377 if (avec
[i
].lk
!= NULL
)
1378 delete [] avec
[i
].lk
;
1379 if (avec
[i
].locus
!= NULL
)
1380 delete [] avec
[i
].locus
;
1388 * FUNCTION: get the data what sfs want, include site's pos, chr name, depth, and the 10 likelihoods.
1389 * PARAMETER: site: the allete information.
1390 * mat: Prob_matrix point, include likelihoods information.
1391 * chr: chromosome name.
1392 * id: the individual's index.
1395 int SfsMethod::getSFSData(const Pos_info
& site
, const Prob_matrix
* mat
, const std::string
& chr
, const int id
)
1397 /* assert(id < m_numInds);
1398 int index = site.pos % global_win_size;
1399 assert(index < global_win_size);
1401 if (!asso[index].is_be_record)
1403 asso[index].chromo = chr;
1404 asso[index].is_be_record = true;
1406 asso[index].ref = 3;
1407 else if (site.ori == 3)
1408 asso[index].ref = 2;
1410 asso[index].ref = site.ori;
1411 asso[index].position = site.pos + 1;
1414 assert(asso[index].position == (site.pos + 1));
1416 cov_Cpy(asso[index].locus, site.count_all, id);
1418 soaplk_sfslk(asso[index].lk, mat->type_likely, id);
1424 // clean up the vector's member.
1427 * FUNCTION: clean up the vector's member.
1431 void SfsMethod::cleanVec(void)
1433 /* for (int i = 0; i < global_win_size; ++i)
1435 asso[i].chromo = "";
1436 asso[i].is_be_record = false;
1441 asso[i].position = 0;
1442 memset(asso[i].lk, 0, sizeof(int) * (10 * m_numInds));
1443 memset(asso[i].locus, 0, sizeof(int) * (4 * m_numInds + 4));
1448 * FUNCTION: change to the next map
1452 void SfsMethod::mapChange(void)
1454 // move behind another map
1455 m_map_idx
= 1 - m_map_idx
;
1459 //get the map can be processed
1462 * FUNCTION: get the index can be process
1464 * RETURN: the index of map can be process
1466 int SfsMethod::getidxProcess(void)
1468 while(m_map_idx_process
== BE_PROCESSED
) //wait the get index .
1473 (m_map_idx_process
== BE_PROCESSED
) ? (tmp
= 2) : (tmp
= m_map_idx_process
);
1474 m_map_idx_process
= BE_PROCESSED
;
1481 * FUNCTION: get the index can be process
1483 * RETURN: the index of map can be process
1485 void SfsMethod::setidxProcess(void)
1487 while(m_map_idx_process
!= BE_PROCESSED
) //wait the get index .
1491 m_map_idx_process
= m_map_idx
;
1496 * FUNCTION: a thread function used to run SfsMethod::call_sfs() function.
1497 * PARAMETER: __Args the parameter structure.
1500 void *_sfsMethod_callsfs(void * __Args
)
1503 BIG_CALL_SFS_ARGS
* args
= (BIG_CALL_SFS_ARGS
*)__Args
;
1504 // run the function.
1505 args
->sfsMethod
->call_SFS(args
->para
, args
->files
);