1 //本程序根据参考序列上mapped的reads来计算测序样本的genotype, 此处仅适用于精子数据(二倍体杂合SNP已知)。
2 //依据bayes theory,以二倍体杂合SNP和测序错误分值模型为先验概率,计算观测数据下每一种genotype的后验概率。
3 //二倍体杂合SNP,只有两种genotype,且出现的先验概率各为0.5。
4 //测序错误分值模型,与模拟reads所用的模型相同,由分析重测序数据而得。它是一个四维矩阵:ref_base x cycle x seq_base x quality。
5 //解释:我们实际想得到的是P(D|genotype), 此处ref_base即为geotype, 而D包含三个属性(cycle,seq_base,quality)。
6 //我们想得到是P(cycle,seq_base,quality|ref_base), 而它又等价于P(cycle|ref_base) * P(seq_base,quality|cycle,ref_base)。
7 //因为P(cycle|ref_base)可以认为是一个常数,在整个bayes公式的计算中不起作用,因此我们把它忽略。
8 //这样可以将P(seq_base,quality|cycle,ref_base)直接用于bayes公式,也就形成了ref_base*cycle作为列,seq_base*quality作为行的二维matrix文件。
9 //本程序采用long double (128bit浮点数),来高精度的表示概率。
10 //coverage depth最大允许值为400,以防计算过程中数值指数部分超过long double界限。
11 //选取概率最大的genotype作为最终结果,并将其错误率转换为phred quality, 设上限为100。
13 //此版读取soapsnp格式的error matrix;
15 //Author: Fan Wei, fanweisz09@gmail.com
30 #include <boost/lexical_cast.hpp>
31 #include <boost/algorithm/string.hpp>
36 typedef long double rate_t
;
37 map
<long int,vector
<rate_t
> > PTi
; //store the prior probability for each substitutions
38 map
<long int, string
> DiGeno
;
39 rate_t
*PdTi
; //store the distribution of error and quality
47 int Ref_Cyc_base_qual
;
48 string Error_profile_file
;
49 string Out_prefix
= "output";
50 int Phred_score_cutoff
= 5;
51 int Ascii_Qual_Start
= 33;
55 'A', 'C', 'G', 'T', 'N'
59 { cout
<< "calculate_sperm_genotype <diploid_snp_file> <sperm_pileup_file>" << endl
;
60 cout
<< " version 1.0" << endl
;
61 cout
<< " -m <file> the matrix file of error and quality distributions, default=exe_path/XieSunney.blood.matrix.soapsnp" << endl
;
62 cout
<< " -q <int> ASCII chracter standing for quality==0, default=" << Ascii_Qual_Start
<< endl
;
63 cout
<< " -s <file> the cutoff of score in phred scale, default=" << Phred_score_cutoff
<< endl
;
64 cout
<< " -h get help information" << endl
;
70 int get_base (char refBase
)
75 case 'A': refBaseId
= 0; break;
76 case 'C': refBaseId
= 1; break;
77 case 'G': refBaseId
= 2; break;
78 case 'T': refBaseId
= 3; break;
79 case 'a': refBaseId
= 0; break;
80 case 'c': refBaseId
= 1; break;
81 case 'g': refBaseId
= 2; break;
82 case 't': refBaseId
= 3; break;
83 default: refBaseId
= 4;
89 //限定freq_rate在1到1e-9之间,统计样本量级1G
90 void load_soapsnp_error_profile (string
&matrix_file
)
93 infile
.open(matrix_file
.c_str());
95 { cerr
<< "fail to open input file" << matrix_file
<< endl
;
99 int pi
= 0; //PdTi的循环变量
100 int line_field_number
= 18;
101 vector
<string
> textlines
;
104 while (getline( infile
, lineStr
, '\n' ))
105 { if (lineStr
[0] == '#')
109 vector
<string
> lineVec
;
110 boost::split(lineVec
,lineStr
, boost::is_any_of(" \t\n"), boost::token_compress_on
);
111 if (lineVec
.size() != line_field_number
)
115 Quality_num
= boost::lexical_cast
<int>(lineVec
[0]) + 1;
116 Cycle_num
= boost::lexical_cast
<int>(lineVec
[1]) + 1;
118 textlines
.push_back(lineStr
);
123 Ref_Cyc_base_qual
= Ref_Base_num
* Seq_Base_num
* Quality_num
* Cycle_num
;
124 Qual_add_val
= Ref_Base_num
* Seq_Base_num
* Cycle_num
;
125 Cycle_add_val
= Ref_Base_num
* Seq_Base_num
;
126 Ref_add_val
= Seq_Base_num
;
128 PdTi
= new rate_t
[Ref_Cyc_base_qual
];
129 for (int j
=0; j
<textlines
.size(); j
++)
131 vector
<string
> lineVec
;
132 boost::split(lineVec
,textlines
[j
], boost::is_any_of(" \t\n"), boost::token_compress_on
);
133 int this_qual
= boost::lexical_cast
<int>(lineVec
[0]);
134 int this_cycl
= boost::lexical_cast
<int>(lineVec
[1]);
136 for (int i
=2; i
<line_field_number
; i
++)
138 int this_ref
= k
/ Seq_Base_num
;
139 int this_base
= k
% Seq_Base_num
;
140 pi
= this_qual
*Qual_add_val
+ this_cycl
*Cycle_add_val
+ this_ref
*Ref_add_val
+ this_base
;
141 rate_t freq_rate
= boost::lexical_cast
<rate_t
>(lineVec
[i
]);
143 if (pi
< Ref_Cyc_base_qual
)
144 { PdTi
[pi
] = (freq_rate
> 1e-9L) ? freq_rate
: 1e-9L;
146 cerr
<< "pi exceeds its range " << Ref_Cyc_base_qual
<< endl
;
154 //从snp文件中读取对于特定位置某一种genotype出现的先验概率
155 void assign_PTi_rates (string
&diploid_snp_file
)
158 infile
.open(diploid_snp_file
.c_str());
160 { cerr
<< "fail to open input file" << diploid_snp_file
<< endl
;
164 while (getline( infile
, lineStr
, '\n' ))
165 { vector
<string
> lineVec
;
166 boost::split(lineVec
,lineStr
, boost::is_any_of("\t"), boost::token_compress_on
);
168 long int refpos
= boost::lexical_cast
<long int>(lineVec
[1]);
169 vector
<rate_t
> snp_prob(4, 0.0); //每种genotype的概率初始化为0
171 hetero
.push_back(lineVec
[5][0]);
172 hetero
.push_back(lineVec
[9][0]);
173 int snp1_id
= get_base(lineVec
[5][0]);
174 int snp2_id
= get_base(lineVec
[9][0]);
175 snp_prob
[snp1_id
] = 0.5;
176 snp_prob
[snp2_id
] = 0.5;
178 PTi
[refpos
] = snp_prob
;
179 DiGeno
[refpos
] = hetero
;
184 //从pileup格式的base_str中得到干净的bases信息
185 void get_clean_bases(int refBase
, string
&raw_base_str
,vector
<int> &clean_base_vec
)
188 int len
= raw_base_str
.size();
191 if (raw_base_str
[pos
] == '^')
194 else if (raw_base_str
[pos
] == '+' || raw_base_str
[pos
] == '-')
198 { if (raw_base_str
[pos
] >= 48 && raw_base_str
[pos
] <= 57 ) //ASCII: 0-9
199 { indel_num
.push_back(raw_base_str
[pos
]);
206 pos
+= atoi(indel_num
.c_str());
210 switch (raw_base_str
[pos
])
212 case '.': clean_base_vec
.push_back(refBase
); break;
213 case ',': clean_base_vec
.push_back(refBase
); break;
214 case 'A': clean_base_vec
.push_back(0); break;
215 case 'a': clean_base_vec
.push_back(0); break;
216 case 'C': clean_base_vec
.push_back(1); break;
217 case 'c': clean_base_vec
.push_back(1); break;
218 case 'G': clean_base_vec
.push_back(2); break;
219 case 'g': clean_base_vec
.push_back(2); break;
220 case 'T': clean_base_vec
.push_back(3); break;
221 case 't': clean_base_vec
.push_back(3); break;
222 case 'N': clean_base_vec
.push_back(4); break;
223 case 'n': clean_base_vec
.push_back(4); break;
224 case '*': clean_base_vec
.push_back(4); break;
225 case '>': clean_base_vec
.push_back(4); break;
226 case '<': clean_base_vec
.push_back(4); break;
237 //从pileup格式的quality_str中得到quality信息,并转换成phred-scale, 33开始
238 void get_quality(vector
<int> &qual_vec
, string
&qual_str
)
241 for (int i
=0; i
<qual_str
.size(); i
++)
242 { int qual_val
= qual_str
[i
] - Ascii_Qual_Start
;
243 qual_val
= (qual_val
< Quality_num
) ? qual_val
: Quality_num
- 1;
244 qual_vec
.push_back(qual_val
);
248 //从pileup格式的cycle_str中得到cycle信息
249 void get_cycle (vector
<int> &cycl_vec
, string
&cycl_str
)
251 vector
<string
> temp_vec
;
252 boost::split(temp_vec
,cycl_str
, boost::is_any_of(","), boost::token_compress_on
);
253 for (int i
=0; i
<temp_vec
.size(); i
++)
254 { //int cycle_val = boost::lexical_cast<int>(temp_vec[i]); //runs much slow
255 int cycle_val
= atoi(temp_vec
[i
].c_str()); //runs fast
256 if (cycle_val
> Cycle_num
)
257 { cycle_val
= Cycle_num
;
259 cycl_vec
.push_back(cycle_val
- 1); //cycle 0-99代表100个cycles
265 //根据指定的参考序列碱基,计算每一种genotype的后验概率P(Ti|D)
266 //coverage depth最大允许值为400,以防计算过程中数值指数部分超过long double界限
267 void calculte_genotype_probability(int refPose
, vector
<rate_t
> &pTiDvec
, vector
<int> &baseVec
, vector
<int> &qualVec
, vector
<int> &cyclVec
)
270 int len
= (baseVec
.size() < 400) ? baseVec
.size() : 400;
271 for (int genotype
=0; genotype
<4; genotype
++)
272 { rate_t pTi
= PTi
[refPose
][genotype
];
276 for (int j
=0; j
<len
; j
++)
278 int num
= qualVec
[j
]*Qual_add_val
+ cyclVec
[j
]*Cycle_add_val
+ genotype
*Ref_add_val
+ + baseVec
[j
];
279 if (num
>= Ref_Cyc_base_qual
)
280 { cerr
<< "num exceeds its range " << num
<< endl
;
286 pTiDvec
.push_back( pTi
*pDTi
);
290 for (int genotype
=0; genotype
<4; genotype
++)
291 { pTiDvec
[genotype
] = pTiDvec
[genotype
] / pD
;
296 //find the genotype with maximum posterior likelihood and calcualte the phred-scale score (up to 100)
297 void find_genotype_score(vector
<rate_t
> &pTiDvec
, char &sample_base
, int &phred_score
)
300 rate_t max_likelihood
= 0.0;
301 for (int i
=0; i
<pTiDvec
.size(); i
++)
302 { if (pTiDvec
[i
] > max_likelihood
)
304 max_likelihood
= pTiDvec
[i
];
308 sample_base
= Bases
[max_id
];
310 rate_t error_rate
= 1 - pTiDvec
[max_id
];
311 if (error_rate
< 1e-10L)
312 { error_rate
= 1e-10L;
314 phred_score
= int(-10 * log10(error_rate
));
318 int main(int argc
, char *argv
[])
320 //get options from command line
322 while((c
=getopt(argc
, argv
, "m:q:s:h")) !=-1) {
324 case 'm': Error_profile_file
=optarg
; break;
325 case 'q': Ascii_Qual_Start
=atoi(optarg
); break;
326 case 's': Phred_score_cutoff
=atoi(optarg
); break;
327 case 'h': usage(); break;
331 if (argc
< 3) usage();
333 if (! Error_profile_file
.size())
334 { string exepath
= argv
[0];
335 int numh
= exepath
.rfind("/");
336 Error_profile_file
= exepath
.substr(0,numh
+1) + "XieSunney.blood.matrix.soapsnp";
338 cerr
<< "\nUsed error profile matrix file: " << Error_profile_file
<< endl
;
341 string diploid_snp_file
= argv
[optind
++]; //optind, argv[optind++]顺序指向非option的参数
342 string sperm_pileup_file
= argv
[optind
++]; //optind, argv[optind++]顺序指向非option的参数
344 clock_t time_start
, time_end
;
345 time_start
= clock();
347 //caluate the prior probability of each genotype
348 assign_PTi_rates(diploid_snp_file
);
349 cerr
<< "Load the snp file done:\n\n";
351 //load the error and quality profile matrix into memory
352 load_soapsnp_error_profile(Error_profile_file
);
353 cerr
<< "Dimensions of error profile:\n";
354 cerr
<< "\n Ref_Base_num " << Ref_Base_num
<< endl
;
355 cerr
<< " Cycle_num " << Cycle_num
<< endl
;
356 cerr
<< " Seq_Base_num " << Seq_Base_num
<< endl
;
357 cerr
<< " Quality_num " << Quality_num
<< endl
;
358 cerr
<< " PdTi array size: " << Ref_Cyc_base_qual
<< endl
;
359 cerr
<< " Qual_add_val " << Qual_add_val
<< endl
;
360 cerr
<< " Cycle_add_val " << Cycle_add_val
<< endl
;
361 cerr
<< " Ref_add_val " << Ref_add_val
<< endl
;
364 cerr
<< "\nLoad the error profile done\n";
365 cerr
<< "Run time: " << double(time_end
- time_start
) / CLOCKS_PER_SEC
<< endl
;
367 //parse the pileup file, and calculate the likelihood of each genotypes
369 infile
.open(sperm_pileup_file
.c_str());
371 { cerr
<< "fail to open input file" << sperm_pileup_file
<< endl
;
374 cout
<< "#Chr\tPos\tRef\tDiGt\tSpGt\tScore\tDepth\tBases\tQuals\tCycles\t[all mapped bases]\n";
376 while (getline( infile
, lineStr
, '\n' ))
377 { vector
<string
> lineVec
;
378 boost::split(lineVec
,lineStr
, boost::is_any_of("\t"), boost::token_compress_on
);
380 long int refPos
= boost::lexical_cast
<long int>(lineVec
[1]);
382 if ( ! PTi
.count( refPos
) )
386 int refBase
= get_base(lineVec
[2][0]);
389 get_clean_bases(refBase
,lineVec
[4],baseVec
);
392 get_quality(qualVec
, lineVec
[5]);
395 get_cycle(cyclVec
, lineVec
[7]);
397 //check the result of parsing pileup file
398 if (baseVec
.size() != qualVec
.size() || baseVec
.size() != cyclVec
.size() || baseVec
.size() == 0)
399 { cerr
<< "Error happens at : " << lineVec
[4] << "\t" << lineVec
[5] << "\t" << lineVec
[7] << endl
<< endl
;
403 vector
<int> FbaseVec
;
404 vector
<int> FqualVec
;
405 vector
<int> FcyclVec
;
407 //filter the observed data, base not N, and qual > 5;
408 for (int i
=0; i
<baseVec
.size(); i
++)
409 { if (baseVec
[i
] != 4 && qualVec
[i
] >= 5) //当质量值小于5的时候,P(D|G)几乎是相等的,因此P(G|D) = P(G), 数据没有发挥作用,结果将只反映先验概率,造成错误。
410 { FbaseVec
.push_back(baseVec
[i
]);
411 FqualVec
.push_back(qualVec
[i
]);
412 FcyclVec
.push_back(cyclVec
[i
]);
415 if (FbaseVec
.size() == 0)
419 vector
<rate_t
> pTiDvec
;
420 calculte_genotype_probability(refPos
,pTiDvec
, FbaseVec
, FqualVec
, FcyclVec
);
424 find_genotype_score(pTiDvec
,sample_base
,phred_score
);
426 if (phred_score
>= Phred_score_cutoff
)
428 cout
<< lineVec
[0] << "\t" << lineVec
[1] << "\t" << Bases
[refBase
] << "\t" << DiGeno
[refPos
] << "\t" << sample_base
<< "\t" << phred_score
<< "\t" << FbaseVec
.size() << "\t";
429 for (int i
=0; i
<FbaseVec
.size(); i
++)
430 { cout
<< Bases
[FbaseVec
[i
]];
433 for (int i
=0; i
<FbaseVec
.size(); i
++)
434 { cout
<< FqualVec
[i
] << ",";
437 for (int i
=0; i
<FbaseVec
.size(); i
++)
438 { cout
<< FcyclVec
[i
] << ",";
440 cout
<< "\t" << baseVec
.size() << "\t"<< lineVec
[4] << "\t" << lineVec
[5] << "\t" << lineVec
[7] << endl
;
445 cerr
<< "\nParse the pileup file and genotype calculation done\n";
446 cerr
<< "Run time: " << double(time_end
- time_start
) / CLOCKS_PER_SEC
<< endl
;