new file: pixi.toml
[GalaxyCodeBases.git] / projects / recombineMap / sperm_by_fw / snp_calling / call_sperm_snp.cpp
blob8dd3c06edba03d6d209c319cc101118684dfa456
1 //本程序根据参考序列上mapped的reads来计算测序样本的genotype, 此处仅适用于精子数据(二倍体杂合SNP已知)。
2 //依据bayes theory,以二倍体杂合SNP和测序错误分值模型为先验概率,计算观测数据下每一种genotype的后验概率。
3 //二倍体杂合SNP,只有两种genotype,且出现的先验概率各为0.5。
4 //测序错误分值模型,与模拟reads所用的模型相同,由分析重测序数据而得。它是一个四维矩阵:ref_base x cycle x seq_base x quality。
5 //解释:我们实际想得到的是P(D|genotype), 此处ref_base即为geotype, 而D包含三个属性(cycle,seq_base,quality)。
6 //我们想得到是P(cycle,seq_base,quality|ref_base), 而它又等价于P(cycle|ref_base) * P(seq_base,quality|cycle,ref_base)。
7 //因为P(cycle|ref_base)可以认为是一个常数,在整个bayes公式的计算中不起作用,因此我们把它忽略。
8 //这样可以将P(seq_base,quality|cycle,ref_base)直接用于bayes公式,也就形成了ref_base*cycle作为列,seq_base*quality作为行的二维matrix文件。
9 //本程序采用long double (128bit浮点数),来高精度的表示概率。
10 //coverage depth最大允许值为400,以防计算过程中数值指数部分超过long double界限。
11 //选取概率最大的genotype作为最终结果,并将其错误率转换为phred quality, 设上限为100。
13 //此版读取soapsnp格式的error matrix;
15 //Author: Fan Wei, fanweisz09@gmail.com
16 //Date: 2012/5/16
19 #include <stdio.h>
20 #include <iostream>
21 #include <fstream>
22 #include <string>
23 #include <vector>
24 #include <algorithm>
25 #include <map>
26 #include <set>
27 #include <cmath>
28 #include <inttypes.h>
29 #include<zlib.h>
30 #include <boost/lexical_cast.hpp>
31 #include <boost/algorithm/string.hpp>
32 #include "gzstream.h"
34 using namespace std;
36 typedef long double rate_t;
37 map<long int,vector<rate_t> > PTi; //store the prior probability for each substitutions
38 map<long int, string > DiGeno;
39 rate_t *PdTi; //store the distribution of error and quality
40 int Ref_Base_num;
41 int Cycle_num;
42 int Seq_Base_num;
43 int Quality_num;
44 int Qual_add_val;
45 int Cycle_add_val;
46 int Ref_add_val;
47 int Ref_Cyc_base_qual;
48 string Error_profile_file;
49 string Out_prefix = "output";
50 int Phred_score_cutoff = 5;
51 int Ascii_Qual_Start = 33;
53 //由0123 4到ACGT N
54 char Bases[5] ={
55 'A', 'C', 'G', 'T', 'N'
58 void usage()
59 { cout << "calculate_sperm_genotype <diploid_snp_file> <sperm_pileup_file>" << endl;
60 cout << " version 1.0" << endl;
61 cout << " -m <file> the matrix file of error and quality distributions, default=exe_path/XieSunney.blood.matrix.soapsnp" << endl;
62 cout << " -q <int> ASCII chracter standing for quality==0, default=" << Ascii_Qual_Start << endl;
63 cout << " -s <file> the cutoff of score in phred scale, default=" << Phred_score_cutoff << endl;
64 cout << " -h get help information" << endl;
65 exit(0);
69 //由ACGT N得到0123 4
70 int get_base (char refBase)
72 int refBaseId;
73 switch (refBase)
75 case 'A': refBaseId = 0; break;
76 case 'C': refBaseId = 1; break;
77 case 'G': refBaseId = 2; break;
78 case 'T': refBaseId = 3; break;
79 case 'a': refBaseId = 0; break;
80 case 'c': refBaseId = 1; break;
81 case 'g': refBaseId = 2; break;
82 case 't': refBaseId = 3; break;
83 default: refBaseId = 4;
85 return refBaseId;
89 //限定freq_rate在1到1e-9之间,统计样本量级1G
90 void load_soapsnp_error_profile (string &matrix_file)
92 igzstream infile;
93 infile.open(matrix_file.c_str());
94 if ( ! infile )
95 { cerr << "fail to open input file" << matrix_file << endl;
96 exit(0);
99 int pi = 0; //PdTi的循环变量
100 int line_field_number = 18;
101 vector<string> textlines;
103 string lineStr;
104 while (getline( infile, lineStr, '\n' ))
105 { if (lineStr[0] == '#')
106 { continue;
109 vector<string> lineVec;
110 boost::split(lineVec,lineStr, boost::is_any_of(" \t\n"), boost::token_compress_on);
111 if (lineVec.size() != line_field_number)
112 { continue;
115 Quality_num = boost::lexical_cast<int>(lineVec[0]) + 1;
116 Cycle_num = boost::lexical_cast<int>(lineVec[1]) + 1;
118 textlines.push_back(lineStr);
121 Ref_Base_num = 4;
122 Seq_Base_num = 4;
123 Ref_Cyc_base_qual = Ref_Base_num * Seq_Base_num * Quality_num * Cycle_num;
124 Qual_add_val = Ref_Base_num * Seq_Base_num * Cycle_num;
125 Cycle_add_val = Ref_Base_num * Seq_Base_num;
126 Ref_add_val = Seq_Base_num;
128 PdTi = new rate_t[Ref_Cyc_base_qual];
129 for (int j=0; j<textlines.size(); j++)
131 vector<string> lineVec;
132 boost::split(lineVec,textlines[j], boost::is_any_of(" \t\n"), boost::token_compress_on);
133 int this_qual = boost::lexical_cast<int>(lineVec[0]);
134 int this_cycl = boost::lexical_cast<int>(lineVec[1]);
136 for (int i=2; i<line_field_number; i++)
137 { int k = i - 2;
138 int this_ref = k / Seq_Base_num;
139 int this_base = k % Seq_Base_num;
140 pi = this_qual*Qual_add_val + this_cycl*Cycle_add_val + this_ref*Ref_add_val + this_base;
141 rate_t freq_rate = boost::lexical_cast<rate_t>(lineVec[i]);
143 if (pi < Ref_Cyc_base_qual)
144 { PdTi[pi] = (freq_rate > 1e-9L) ? freq_rate : 1e-9L;
145 }else{
146 cerr << "pi exceeds its range " << Ref_Cyc_base_qual << endl;
147 exit(0);
154 //从snp文件中读取对于特定位置某一种genotype出现的先验概率
155 void assign_PTi_rates (string &diploid_snp_file)
157 igzstream infile;
158 infile.open(diploid_snp_file.c_str());
159 if ( ! infile )
160 { cerr << "fail to open input file" << diploid_snp_file << endl;
163 string lineStr;
164 while (getline( infile, lineStr, '\n' ))
165 { vector<string> lineVec;
166 boost::split(lineVec,lineStr, boost::is_any_of("\t"), boost::token_compress_on);
168 long int refpos = boost::lexical_cast<long int>(lineVec[1]);
169 vector<rate_t> snp_prob(4, 0.0); //每种genotype的概率初始化为0
170 string hetero;
171 hetero.push_back(lineVec[5][0]);
172 hetero.push_back(lineVec[9][0]);
173 int snp1_id = get_base(lineVec[5][0]);
174 int snp2_id = get_base(lineVec[9][0]);
175 snp_prob[snp1_id] = 0.5;
176 snp_prob[snp2_id] = 0.5;
178 PTi[refpos] = snp_prob;
179 DiGeno[refpos] = hetero;
184 //从pileup格式的base_str中得到干净的bases信息
185 void get_clean_bases(int refBase, string &raw_base_str,vector<int> &clean_base_vec)
187 int pos = 0;
188 int len = raw_base_str.size();
189 while (pos < len)
191 if (raw_base_str[pos] == '^')
192 { pos += 2;
194 else if (raw_base_str[pos] == '+' || raw_base_str[pos] == '-')
195 { pos += 1;
196 string indel_num;
197 while (pos < len)
198 { if (raw_base_str[pos] >= 48 && raw_base_str[pos] <= 57 ) //ASCII: 0-9
199 { indel_num.push_back(raw_base_str[pos]);
200 pos ++;
202 else
203 { break;
206 pos += atoi(indel_num.c_str());
208 else
210 switch (raw_base_str[pos])
212 case '.': clean_base_vec.push_back(refBase); break;
213 case ',': clean_base_vec.push_back(refBase); break;
214 case 'A': clean_base_vec.push_back(0); break;
215 case 'a': clean_base_vec.push_back(0); break;
216 case 'C': clean_base_vec.push_back(1); break;
217 case 'c': clean_base_vec.push_back(1); break;
218 case 'G': clean_base_vec.push_back(2); break;
219 case 'g': clean_base_vec.push_back(2); break;
220 case 'T': clean_base_vec.push_back(3); break;
221 case 't': clean_base_vec.push_back(3); break;
222 case 'N': clean_base_vec.push_back(4); break;
223 case 'n': clean_base_vec.push_back(4); break;
224 case '*': clean_base_vec.push_back(4); break;
225 case '>': clean_base_vec.push_back(4); break;
226 case '<': clean_base_vec.push_back(4); break;
227 //自动省略$符号的判断
229 pos += 1;
237 //从pileup格式的quality_str中得到quality信息,并转换成phred-scale, 33开始
238 void get_quality(vector<int> &qual_vec, string &qual_str)
241 for (int i=0; i<qual_str.size(); i++)
242 { int qual_val = qual_str[i] - Ascii_Qual_Start;
243 qual_val = (qual_val < Quality_num) ? qual_val : Quality_num - 1;
244 qual_vec.push_back(qual_val);
248 //从pileup格式的cycle_str中得到cycle信息
249 void get_cycle (vector<int> &cycl_vec, string &cycl_str)
251 vector<string> temp_vec;
252 boost::split(temp_vec,cycl_str, boost::is_any_of(","), boost::token_compress_on);
253 for (int i=0; i<temp_vec.size(); i++)
254 { //int cycle_val = boost::lexical_cast<int>(temp_vec[i]); //runs much slow
255 int cycle_val = atoi(temp_vec[i].c_str()); //runs fast
256 if (cycle_val > Cycle_num)
257 { cycle_val = Cycle_num;
259 cycl_vec.push_back(cycle_val - 1); //cycle 0-99代表100个cycles
265 //根据指定的参考序列碱基,计算每一种genotype的后验概率P(Ti|D)
266 //coverage depth最大允许值为400,以防计算过程中数值指数部分超过long double界限
267 void calculte_genotype_probability(int refPose, vector<rate_t> &pTiDvec, vector<int> &baseVec, vector<int> &qualVec, vector<int> &cyclVec)
269 rate_t pD = 0.0;
270 int len = (baseVec.size() < 400) ? baseVec.size() : 400;
271 for (int genotype=0; genotype<4; genotype++)
272 { rate_t pTi = PTi[refPose][genotype];
273 rate_t pDTi = 1.0;
274 if (pTi != 0)
276 for (int j=0; j<len; j++)
278 int num = qualVec[j]*Qual_add_val + cyclVec[j]*Cycle_add_val + genotype*Ref_add_val + + baseVec[j];
279 if (num >= Ref_Cyc_base_qual)
280 { cerr << "num exceeds its range " << num << endl;
281 exit(0);
283 pDTi *= PdTi[num];
286 pTiDvec.push_back( pTi*pDTi );
287 pD += pTi*pDTi;
290 for (int genotype=0; genotype<4; genotype++)
291 { pTiDvec[genotype] = pTiDvec[genotype] / pD;
296 //find the genotype with maximum posterior likelihood and calcualte the phred-scale score (up to 100)
297 void find_genotype_score(vector<rate_t> &pTiDvec, char &sample_base, int &phred_score)
299 int max_id = 0;
300 rate_t max_likelihood = 0.0;
301 for (int i=0; i<pTiDvec.size(); i++)
302 { if (pTiDvec[i] > max_likelihood)
303 { max_id = i;
304 max_likelihood = pTiDvec[i];
308 sample_base = Bases[max_id];
310 rate_t error_rate = 1 - pTiDvec[max_id];
311 if (error_rate < 1e-10L)
312 { error_rate = 1e-10L;
314 phred_score = int(-10 * log10(error_rate));
318 int main(int argc, char *argv[])
320 //get options from command line
321 int c;
322 while((c=getopt(argc, argv, "m:q:s:h")) !=-1) {
323 switch(c) {
324 case 'm': Error_profile_file=optarg; break;
325 case 'q': Ascii_Qual_Start=atoi(optarg); break;
326 case 's': Phred_score_cutoff=atoi(optarg); break;
327 case 'h': usage(); break;
328 default: usage();
331 if (argc < 3) usage();
333 if (! Error_profile_file.size())
334 { string exepath = argv[0];
335 int numh = exepath.rfind("/");
336 Error_profile_file = exepath.substr(0,numh+1) + "XieSunney.blood.matrix.soapsnp";
338 cerr << "\nUsed error profile matrix file: " << Error_profile_file << endl;
341 string diploid_snp_file = argv[optind++]; //optind, argv[optind++]顺序指向非option的参数
342 string sperm_pileup_file = argv[optind++]; //optind, argv[optind++]顺序指向非option的参数
344 clock_t time_start, time_end;
345 time_start = clock();
347 //caluate the prior probability of each genotype
348 assign_PTi_rates(diploid_snp_file);
349 cerr << "Load the snp file done:\n\n";
351 //load the error and quality profile matrix into memory
352 load_soapsnp_error_profile(Error_profile_file);
353 cerr << "Dimensions of error profile:\n";
354 cerr << "\n Ref_Base_num " << Ref_Base_num << endl;
355 cerr << " Cycle_num " << Cycle_num << endl;
356 cerr << " Seq_Base_num " << Seq_Base_num << endl;
357 cerr << " Quality_num " << Quality_num << endl;
358 cerr << " PdTi array size: " << Ref_Cyc_base_qual << endl;
359 cerr << " Qual_add_val " << Qual_add_val << endl;
360 cerr << " Cycle_add_val " << Cycle_add_val << endl;
361 cerr << " Ref_add_val " << Ref_add_val << endl;
363 time_end = clock();
364 cerr << "\nLoad the error profile done\n";
365 cerr << "Run time: " << double(time_end - time_start) / CLOCKS_PER_SEC << endl;
367 //parse the pileup file, and calculate the likelihood of each genotypes
368 igzstream infile;
369 infile.open(sperm_pileup_file.c_str());
370 if ( ! infile )
371 { cerr << "fail to open input file" << sperm_pileup_file << endl;
374 cout << "#Chr\tPos\tRef\tDiGt\tSpGt\tScore\tDepth\tBases\tQuals\tCycles\t[all mapped bases]\n";
375 string lineStr;
376 while (getline( infile, lineStr, '\n' ))
377 { vector<string> lineVec;
378 boost::split(lineVec,lineStr, boost::is_any_of("\t"), boost::token_compress_on);
380 long int refPos = boost::lexical_cast<long int>(lineVec[1]);
382 if ( ! PTi.count( refPos ) )
383 { continue;
386 int refBase = get_base(lineVec[2][0]);
388 vector<int> baseVec;
389 get_clean_bases(refBase,lineVec[4],baseVec);
391 vector<int> qualVec;
392 get_quality(qualVec, lineVec[5]);
394 vector<int> cyclVec;
395 get_cycle(cyclVec, lineVec[7]);
397 //check the result of parsing pileup file
398 if (baseVec.size() != qualVec.size() || baseVec.size() != cyclVec.size() || baseVec.size() == 0)
399 { cerr << "Error happens at : " << lineVec[4] << "\t" << lineVec[5] << "\t" << lineVec[7] << endl << endl;
400 exit(0);
403 vector<int> FbaseVec;
404 vector<int> FqualVec;
405 vector<int> FcyclVec;
407 //filter the observed data, base not N, and qual > 5;
408 for (int i=0; i<baseVec.size(); i++)
409 { if (baseVec[i] != 4 && qualVec[i] >= 5) //当质量值小于5的时候,P(D|G)几乎是相等的,因此P(G|D) = P(G), 数据没有发挥作用,结果将只反映先验概率,造成错误。
410 { FbaseVec.push_back(baseVec[i]);
411 FqualVec.push_back(qualVec[i]);
412 FcyclVec.push_back(cyclVec[i]);
415 if (FbaseVec.size() == 0)
416 { continue;
419 vector<rate_t> pTiDvec;
420 calculte_genotype_probability(refPos,pTiDvec, FbaseVec, FqualVec, FcyclVec);
422 char sample_base;
423 int phred_score;
424 find_genotype_score(pTiDvec,sample_base,phred_score);
426 if (phred_score >= Phred_score_cutoff)
428 cout << lineVec[0] << "\t" << lineVec[1] << "\t" << Bases[refBase] << "\t" << DiGeno[refPos] << "\t" << sample_base << "\t" << phred_score << "\t" << FbaseVec.size() << "\t";
429 for (int i=0; i<FbaseVec.size(); i++)
430 { cout << Bases[FbaseVec[i]];
432 cout << "\t";
433 for (int i=0; i<FbaseVec.size(); i++)
434 { cout << FqualVec[i] << ",";
436 cout << "\t";
437 for (int i=0; i<FbaseVec.size(); i++)
438 { cout << FcyclVec[i] << ",";
440 cout << "\t" << baseVec.size() << "\t"<< lineVec[4] << "\t" << lineVec[5] << "\t" << lineVec[7] << endl;
444 time_end = clock();
445 cerr << "\nParse the pileup file and genotype calculation done\n";
446 cerr << "Run time: " << double(time_end - time_start) / CLOCKS_PER_SEC << endl;