projects/recombineMap/sperm_by_fw/snp_calling/call_diploid_snp.cpp

   1 //本程序根据参考序列上mapped的reads来计算测序样本的genotype, 读取bwa mpielup格式输入文件
   2 //依据bayes theory，以sample出现纯合和杂合SNP的概率和测序错误分值模型为先验概率，计算观测数据下每一种genotype的后验概率。
   3 //测序错误分值模型，与模拟reads所用的模型相同，由分析重测序数据而得。它是一个四维矩阵：ref_base x cycle x seq_base x quality。
   4 //解释：我们实际想得到的是P(D|genotype), 此处ref_base即为geotype, 而D包含三个属性(cycle,seq_base,quality)。
   5 //我们想得到是P(cycle,seq_base,quality|ref_base), 而它又等价于P(cycle|ref_base) * P(seq_base,quality|cycle,ref_base)。
   6 //因为P(cycle|ref_base)可以认为是一个常数，在整个bayes公式的计算中不起作用，因此我们把它忽略。
   7 //这样可以将P(seq_base,quality|cycle,ref_base)直接用于bayes公式，也就形成了ref_base*cycle作为列，seq_base*quality作为行的二维matrix文件。
   8 //本程序采用long double (128bit浮点数)，来高精度的表示概率。
   9 //coverage depth最大允许值为400,以防计算过程中数值指数部分超过long double界限。超过400的部分将被忽略，程序不报错不中止。
  10 //选取概率最大的genotype作为最终结果，并将其错误率转换为phred quality score, 并设上限为100。
  11
  12 //pirs和soapsnp的matrix是等效的，为保持一致，此程序采用soapsnp格式的matrix，  P(base|ref,qual,cycle)
  13
  14 //Author: Fan Wei, fanweisz09@gmail.com
  15 //Date: 2012/6/4
  16
  17
  18 #include <stdio.h>
  19 #include <iostream>
  20 #include <fstream>
  21 #include <string>
  22 #include <vector>
  23 #include <algorithm>
  24 #include <map>
  25 #include <set>
  26 #include <cmath>
  27 #include <inttypes.h>
  28 #include<zlib.h>
  29 #include <boost/lexical_cast.hpp>
  30 #include <boost/algorithm/string.hpp>
  31 #include "gzstream.h"
  32
  33 using namespace std;
  34
  35 typedef long double rate_t;
  36 rate_t PTi[4][10]; //store the prior probability for each substitutions
  37 rate_t *PdTi;   //store the distribution of error and quality
  38 int Ref_Base_num;
  39 int Cycle_num;
  40 int Seq_Base_num;
  41 int Quality_num;
  42 int Qual_add_val;
  43 int Cycle_add_val;
  44 int Ref_add_val;
  45 int Ref_Cyc_base_qual;
  46 string Error_profile_file;
  47 string Out_prefix = "output";
  48 float Trans_Tranv_ratio = 2.0;
  49 float Hom_SNP_rate = 0.0005;
  50 float Het_SNP_rate = 0.0005;
  51 int SNP_prior_mode = 1;
  52 int Ascii_Qual_Start = 33;
  53 int Only_output_snp = 1;
  54
  55 //由０１２３ 4到ＡＣＧＴ N
  56 char Bases[5] ={
  57                 'A', 'C', 'G', 'T', 'N'
  58 };
  59
  60 //用0，1，2，3，分别代表A,C,G,T
  61 int DipGeno[10][2] = {0,0, 1,1, 2,2, 3,3, 0,1, 0,2, 0,3, 1,2, 1,3, 2,3};
  62
  63 map<int,char> TwoBases;
  64 map<int, int> TransSubs;
  65
  66
  67 void define_TransSubs()
  68 {
  69         TransSubs[0] = 2;  TransSubs[2] = 0;
  70         TransSubs[1] = 3;  TransSubs[3] = 1;
  71 }
  72
  73
  74 void define_two_bases_markers()
  75 {
  76
  77         TwoBases[0] = 'A';      TwoBases[1] = 'C';      TwoBases[2] = 'G';      TwoBases[3] = 'T';
  78         TwoBases[4] = 'M';      TwoBases[5] = 'R';  TwoBases[6] = 'W';
  79         TwoBases[7] = 'S'; TwoBases[8] = 'Y'; TwoBases[9] = 'K';
  80 }
  81
  82
  83 void usage()
  84 {       cout << "calculate_diploid_genotype  <*.mpileup>" << endl;
  85         cout << "   version 1.0" << endl;
  86         cout << "   -r <float>  homozygous SNP prior rate, default=" << Hom_SNP_rate << endl;
  87         cout << "   -e <float>  heterozygous SNP prior rate, default=" << Het_SNP_rate << endl;
  88         cout << "   -t <float>  transition vs transversion prior ratio, default=" << Trans_Tranv_ratio << endl;
  89         cout << "   -d <int>    set the prior mode, 0: no prior; 1: with prior;  default="<< SNP_prior_mode << endl;
  90         cout << "   -m <file>   matrix file of error and quality distributions, default=exe_path/XieSunney.blood.matrix.soapsnp" << endl;
  91         cout << "   -q <int>    ASCII chracter standing for quality==0, default=" << Ascii_Qual_Start << endl;
  92         cout << "   -o <string> prefix for the output files" << endl;
  93         cout << "   -s <int>    two output mode, 1, only snp; 0, cns; default=" << Only_output_snp << endl;
  94         cout << "   -h          get help information" << endl;
  95         exit(0);
  96 }
  97
  98
  99 //由ＡＣＧＴ N得到０１２３ 4
 100 int get_base (char refBase)
 101 {
 102         int refBaseId;
 103         switch (refBase)
 104         {
 105                 case 'A': refBaseId = 0; break;
 106                 case 'C': refBaseId = 1; break;
 107                 case 'G': refBaseId = 2; break;
 108                 case 'T': refBaseId = 3; break;
 109                 case 'a': refBaseId = 0; break;
 110                 case 'c': refBaseId = 1; break;
 111                 case 'g': refBaseId = 2; break;
 112                 case 't': refBaseId = 3; break;
 113                 default:  refBaseId = 4;
 114         }
 115         return refBaseId;
 116 }
 117
 118
 119 //限定freq_rate在1到1e-9之间，统计样本量级1G
 120 void load_soapsnp_error_profile (string &matrix_file)
 121 {
 122         igzstream infile;
 123         infile.open(matrix_file.c_str());
 124         if ( ! infile )
 125         {       cerr << "fail to open input file" << matrix_file << endl;
 126                 exit(0);
 127         }
 128
 129         int pi = 0; //PdTi的循环变量
 130         int line_field_number = 18;
 131         vector<string> textlines;
 132
 133         string lineStr;
 134         while (getline( infile, lineStr, '\n' ))
 135         {       if (lineStr[0] == '#')
 136                 {       continue;
 137                 }
 138
 139                 vector<string> lineVec;
 140                 boost::split(lineVec,lineStr, boost::is_any_of(" \t\n"), boost::token_compress_on);
 141                 if (lineVec.size() != line_field_number)
 142                 {       continue;
 143                 }
 144
 145                 Quality_num = boost::lexical_cast<int>(lineVec[0]) + 1;
 146                 Cycle_num = boost::lexical_cast<int>(lineVec[1]) + 1;
 147
 148                 textlines.push_back(lineStr);
 149         }
 150
 151         Ref_Base_num = 4;
 152         Seq_Base_num = 4;
 153         Ref_Cyc_base_qual = Ref_Base_num * Seq_Base_num * Quality_num * Cycle_num;
 154         Qual_add_val = Ref_Base_num * Seq_Base_num * Cycle_num;
 155         Cycle_add_val = Ref_Base_num * Seq_Base_num;
 156         Ref_add_val = Seq_Base_num;
 157
 158         PdTi = new rate_t[Ref_Cyc_base_qual];
 159         for (int j=0; j<textlines.size(); j++)
 160         {
 161                 vector<string> lineVec;
 162                 boost::split(lineVec,textlines[j], boost::is_any_of(" \t\n"), boost::token_compress_on);
 163                 int this_qual = boost::lexical_cast<int>(lineVec[0]);
 164                 int this_cycl = boost::lexical_cast<int>(lineVec[1]);
 165
 166                 for (int i=2; i<line_field_number; i++)
 167                 {       int k = i - 2;
 168                         int this_ref = k / Seq_Base_num;
 169                         int this_base = k % Seq_Base_num;
 170                         pi = this_qual*Qual_add_val + this_cycl*Cycle_add_val + this_ref*Ref_add_val  + this_base;
 171                         rate_t freq_rate = boost::lexical_cast<rate_t>(lineVec[i]);
 172
 173                         if (pi < Ref_Cyc_base_qual)
 174                         {       PdTi[pi] = (freq_rate > 1e-9L) ? freq_rate : 1e-9L;
 175                         }else{
 176                                 cerr << "pi exceeds its range " << Ref_Cyc_base_qual <<  endl;
 177                                 exit(0);
 178                         }
 179                 }
 180         }
 181
 182 }
 183
 184
 185 //根据Trans_Tranv_ratio,Hom_SNP_rate,Het_SNP_rate,SNP_prior_mode来计算genotype的prior probability
 186 void assign_PTi_rates ()
 187 {
 188         double TsTvR = Trans_Tranv_ratio;
 189         double NonSnpRate = 1 - Hom_SNP_rate - Het_SNP_rate;
 190         double hom_trans_rate = Hom_SNP_rate * TsTvR*2 / (TsTvR*2 + 1 + 1);
 191         double hom_tranv_rate = Hom_SNP_rate * 1 / (TsTvR*2 + 1 + 1);
 192
 193         double HapSnpRate = Hom_SNP_rate + Het_SNP_rate;  //近似，非准确
 194         double HapNonSnpRate = 1 - HapSnpRate;
 195         double HapTransSnpRate = HapSnpRate * TsTvR*2 / (TsTvR*2 + 1 + 1);
 196         double HapTranvSnpRate = HapSnpRate * 1 / (TsTvR*2 + 1 + 1);
 197
 198         double hetNonTransRate;
 199         double hetNonTranvRate;
 200         double hetTransTranvRate;
 201         double hetTranvTranvRate;
 202         double relativeTotal;
 203
 204         relativeTotal = HapNonSnpRate*HapTransSnpRate * 1 + HapNonSnpRate*HapTranvSnpRate *2 + HapTransSnpRate*HapTranvSnpRate * 2 + HapTranvSnpRate*HapTranvSnpRate  * 1;
 205         hetNonTransRate = (relativeTotal > 0) ? HapNonSnpRate*HapTransSnpRate / relativeTotal * Het_SNP_rate : 0;
 206         hetNonTranvRate = (relativeTotal > 0) ? HapNonSnpRate*HapTranvSnpRate / relativeTotal * Het_SNP_rate : 0;
 207         hetTransTranvRate = (relativeTotal > 0) ? HapTransSnpRate*HapTranvSnpRate / relativeTotal * Het_SNP_rate : 0;
 208         hetTranvTranvRate = (relativeTotal > 0) ? HapTranvSnpRate*HapTranvSnpRate / relativeTotal * Het_SNP_rate : 0;
 209
 210         if (SNP_prior_mode == 0)
 211         {       NonSnpRate = hom_trans_rate = hom_tranv_rate = hetNonTransRate = hetNonTranvRate = hetTransTranvRate = hetTranvTranvRate = 0.1;
 212         }
 213
 214         for (int i=0; i<4; i++)
 215         {       for (int j=0; j<10; j++)
 216                 {       if (i == DipGeno[j][0]  && i == DipGeno[j][1])   // non-SNP
 217                         {
 218                                 PTi[i][j] = NonSnpRate;
 219                         }else if(i != DipGeno[j][0]  && DipGeno[j][0] == DipGeno[j][1])  //homo SNP
 220                         {
 221                                 PTi[i][j] = (TransSubs[i] == DipGeno[j][0]) ? hom_trans_rate : hom_tranv_rate ;
 222                         }else //het SNP
 223                         {       if (i == DipGeno[j][0]  || i == DipGeno[j][1]) //het-one
 224                                 {       PTi[i][j] = (TransSubs[i] == DipGeno[j][0] || TransSubs[i] == DipGeno[j][1]) ? hetNonTransRate : hetNonTranvRate;
 225                                 }else //het-two
 226                                 {       PTi[i][j] = (TransSubs[i] != DipGeno[j][0] && TransSubs[i] != DipGeno[j][1]) ? hetTranvTranvRate : hetTransTranvRate;
 227                                 }
 228                         }
 229                 }
 230         }
 231
 232 }
 233
 234 //从pileup格式的base_str中得到干净的bases信息
 235 void get_clean_bases(int refBase, string &raw_base_str,vector<int> &clean_base_vec)
 236 {
 237         int pos = 0;
 238         int len = raw_base_str.size();
 239         while (pos < len)
 240         {
 241                 if (raw_base_str[pos] == '^')
 242                 {       pos += 2;
 243                 }
 244                 else if (raw_base_str[pos] == '+' || raw_base_str[pos] == '-')
 245                 {       pos += 1;
 246                         string indel_num;
 247                         while (pos < len)
 248                         {       if (raw_base_str[pos] >= 48 && raw_base_str[pos] <= 57 )  //ASCII: 0-9
 249                                 {       indel_num.push_back(raw_base_str[pos]);
 250                                         pos ++;
 251                                 }
 252                                 else
 253                                 {       break;
 254                                 }
 255                         }
 256                         pos += atoi(indel_num.c_str());
 257                 }
 258                 else
 259                 {
 260                         switch (raw_base_str[pos])
 261                         {
 262                                 case '.': clean_base_vec.push_back(refBase); break;
 263                                 case ',': clean_base_vec.push_back(refBase); break;
 264                                 case 'A': clean_base_vec.push_back(0); break;
 265                                 case 'a': clean_base_vec.push_back(0); break;
 266                                 case 'C': clean_base_vec.push_back(1); break;
 267                                 case 'c': clean_base_vec.push_back(1); break;
 268                                 case 'G': clean_base_vec.push_back(2); break;
 269                                 case 'g': clean_base_vec.push_back(2); break;
 270                                 case 'T': clean_base_vec.push_back(3); break;
 271                                 case 't': clean_base_vec.push_back(3); break;
 272                                 case 'N': clean_base_vec.push_back(4); break;
 273                                 case 'n': clean_base_vec.push_back(4); break;
 274                                 case '*': clean_base_vec.push_back(4); break;
 275                                 case '>': clean_base_vec.push_back(4); break;
 276                                 case '<': clean_base_vec.push_back(4); break;
 277                                 //自动省略$符号的判断
 278                         }
 279                         pos += 1;
 280                 }
 281
 282         }
 283
 284 }
 285
 286 //从pileup格式的quality_str中得到quality信息，并转换成phred-scale, 33开始
 287 void get_quality(vector<int> &qual_vec, string &qual_str)
 288 {
 289         for (int i=0; i<qual_str.size(); i++)
 290         {       int qual_val = qual_str[i] - Ascii_Qual_Start;
 291                 qual_val = (qual_val < Quality_num) ? qual_val : Quality_num - 1;  //质量值大于40的全部当成40
 292                 qual_vec.push_back(qual_val);
 293         }
 294 }
 295
 296 //从pileup格式的cycle_str中得到cycle信息
 297 void get_cycle (vector<int> &cycl_vec, string &cycl_str)
 298 {
 299         vector<string> temp_vec;
 300         boost::split(temp_vec,cycl_str, boost::is_any_of(","), boost::token_compress_on);
 301         for (int i=0; i<temp_vec.size(); i++)
 302         {       //int cycle_val = boost::lexical_cast<int>(temp_vec[i]);  //runs much slow
 303                 int cycle_val = atoi(temp_vec[i].c_str()); //runs fast
 304                 if (cycle_val > Cycle_num)
 305                 {       cycle_val = Cycle_num;
 306                 }
 307                 cycl_vec.push_back(cycle_val - 1);  //cycle 0-99代表100个cycles
 308         }
 309 }
 310
 311
 312 //根据指定的参考序列碱基，计算每一种genotype的后验概率P(Ti|D)
 313 //coverage depth最大允许值为400,以防计算过程中数值指数部分超过long double界限
 314 void calculte_genotype_probability(int refBase, vector<rate_t> &pTiDvec, vector<int> &baseVec, vector<int> &qualVec, vector<int> &cyclVec)
 315 {
 316         //cerr << "refbase: " << refBase << "  " << Bases[refBase] << endl;
 317         rate_t pD = 0.0;
 318         int len = (baseVec.size() < 400) ? baseVec.size() : 400;
 319
 320         //vector<rate_t> pDTiVec;
 321         for (int genotype=0; genotype<10; genotype++)
 322         {
 323                 rate_t pTi = PTi[refBase][genotype];  //
 324                 rate_t pDTi = 1.0;
 325
 326                 if (pTi != 0)
 327                 {       for (int j=0; j<len; j++)
 328                         {
 329                                 int num1 = qualVec[j]*Qual_add_val + cyclVec[j]*Cycle_add_val + DipGeno[genotype][0]*Ref_add_val +  + baseVec[j];
 330                                 int num2 = qualVec[j]*Qual_add_val + cyclVec[j]*Cycle_add_val + DipGeno[genotype][1]*Ref_add_val +  + baseVec[j];
 331                                 if (num1 >= Ref_Cyc_base_qual || num2 >= Ref_Cyc_base_qual)
 332                                 {       cerr << "num1 or num2 exceeds its range " << num1 << "\t" << num2 << endl;
 333                                         exit(0);
 334                                 }
 335                                 pDTi *=    (PdTi[num1] + PdTi[num2] ) / 2;
 336                                 //cerr << "# " << DipGeno[genotype][0] << "\t" << DipGeno[genotype][1] << "\t" << cyclVec[j] << "\t" << baseVec[j] << "\t" << qualVec[j] << "\t"<< PdTi[num1] << "\t" << PdTi[num2]  << "\t" << num1 << "\t" << num2 << endl;
 337
 338                         }
 339                 }
 340                 //pDTiVec.push_back(pDTi);
 341                 pTiDvec.push_back( pTi*pDTi );
 342                 pD += pTi*pDTi;
 343                 //cerr << "calculate: " << genotype << "\t" << pTi << "\t" << pDTi << "\t" << pTi*pDTi << "\t" << pD << endl;
 344         }
 345
 346         for (int genotype=0; genotype<10; genotype++)
 347         {       pTiDvec[genotype] = pTiDvec[genotype] / pD;
 348         }
 349
 350         //output middle results
 351         /*for (int genotype=0; genotype<10; genotype++)
 352         {       rate_t pTi = PTi[refBase][genotype];
 353                 rate_t pDTi = pDTiVec[genotype];
 354                 cout << genotype << "\t" << pTi << "\t" << pDTi << "\t" << pTi*pDTi << "\t" << pD << "\t" << pTiDvec[genotype] << endl;
 355         }*/
 356 }
 357
 358 //find the genotype with maximum posterior likelihood and calcualte the phred-scale score (up to 100)
 359 void find_genotype_score(vector<rate_t> &pTiDvec, int &sample_genotype, int &phred_score)
 360 {
 361         int max_id = 0;
 362         rate_t max_likelihood = 0.0;
 363         for (int i=0; i<pTiDvec.size(); i++)
 364         {       if (pTiDvec[i] > max_likelihood)
 365                 {       max_id = i;
 366                         max_likelihood = pTiDvec[i];
 367
 368                 }
 369                 //cerr << "find max: " << i << "  " << pTiDvec[i] << endl;
 370         }
 371
 372         sample_genotype = max_id;
 373
 374         rate_t error_rate = 1 - pTiDvec[max_id];
 375         if (error_rate < 1e-10L)
 376         {       error_rate = 1e-10L;
 377         }
 378         phred_score = int(-10 * log10(error_rate));
 379
 380 }
 381
 382 void output_prior_file(string &prior_file)
 383 {
 384         ofstream PRIOR (prior_file.c_str());
 385         if (! PRIOR)
 386         {       cerr << "fail create prior file" << prior_file << endl;
 387         }
 388
 389         PRIOR << "The 10 genotypes: \n";
 390         for (int i=0; i<10; i++)
 391         {       PRIOR << i << "(" << Bases[DipGeno[i][0]] << Bases[DipGeno[i][1]] << ")  ";
 392         }
 393         PRIOR << endl << endl;
 394
 395
 396         PRIOR << "The compressed representaiton for each genotype:\n";
 397         map<int,char>::const_iterator map_it = TwoBases.begin();
 398         while (map_it != TwoBases.end())
 399         {       PRIOR << map_it->first << ":" << map_it->second << ";  ";
 400                 ++map_it;
 401         }
 402         PRIOR << "\n\n";
 403
 404         PRIOR << "The transison substitution matrix:\n";
 405         map<int, int>::const_iterator map_it2 = TransSubs.begin();
 406         while (map_it2 != TransSubs.end())
 407         {       PRIOR << Bases[map_it2->first] << ":" << Bases[map_it2->second] << "; ";
 408                 ++map_it2;
 409         }
 410         PRIOR << endl  << endl;
 411
 412
 413         PRIOR << "Hom_SNP_rate: " << Hom_SNP_rate << endl;
 414         PRIOR << "Het_SNP_rate: " << Het_SNP_rate << endl;
 415         PRIOR << "All_SNP_rate: " << Hom_SNP_rate + Het_SNP_rate << endl;
 416         PRIOR << "Trans_Tranv_ratio: " << Trans_Tranv_ratio << endl;
 417
 418         PRIOR << "\nThe prior probabilites of genotypes for each reference base:\n\n";
 419         PRIOR << "R\\G";
 420         for (int j=0; j<10; j++)
 421         {       PRIOR << "\t" << Bases[DipGeno[j][0]] << Bases[DipGeno[j][1]];
 422         }
 423         PRIOR << endl;
 424         for (int i=0; i<4; i++)
 425         {       PRIOR << Bases[i] ;
 426                 for (int j=0; j<10; j++)
 427                 {
 428                         PRIOR << "\t" << PTi[i][j];
 429                 }
 430                 PRIOR << endl;
 431
 432         }
 433
 434         PRIOR << "\n    Ref_Base_num " << Ref_Base_num << endl;
 435         PRIOR << "      Cycle_num " << Cycle_num << endl;
 436         PRIOR << "      Seq_Base_num " << Seq_Base_num << endl;
 437         PRIOR << "      Quality_num " << Quality_num << endl;
 438         PRIOR << "      PdTi array size: " << Ref_Cyc_base_qual << endl;
 439         PRIOR << "      Qual_add_val " << Qual_add_val << endl;
 440         PRIOR << "      Cycle_add_val " << Cycle_add_val << endl;
 441         PRIOR << "      Ref_add_val " << Ref_add_val << endl;
 442
 443 }
 444
 445 int main(int argc, char *argv[])
 446 {
 447         //get options from command line
 448         int c;
 449         while((c=getopt(argc, argv, "t:r:e:d:m:q:o:s:h")) !=-1) {
 450                 switch(c) {
 451                         case 't': Trans_Tranv_ratio=atof(optarg); break;
 452                         case 'r': Hom_SNP_rate=atof(optarg); break;
 453                         case 'e': Het_SNP_rate=atof(optarg); break;
 454                         case 'd': SNP_prior_mode=atoi(optarg); break;
 455                         case 'm': Error_profile_file=optarg; break;
 456                         case 'q': Ascii_Qual_Start=atoi(optarg); break;
 457                         case 'o': Out_prefix=optarg; break;
 458                         case 's': Only_output_snp=atoi(optarg); break;
 459                         case 'h': usage(); break;
 460                         default: usage();
 461                 }
 462         }
 463         if (argc < 2) usage();
 464
 465         if (! Error_profile_file.size())
 466         {       string exepath = argv[0];
 467                 int numh = exepath.rfind("/");
 468                 Error_profile_file = exepath.substr(0,numh+1) + "XieSunney.blood.matrix.soapsnp";
 469         }
 470         cerr << "\nUsed error profile matrix file: " << Error_profile_file << endl << endl;
 471
 472         string pileup_file = argv[optind++]; //optind, argv[optind++]顺序指向非option的参数
 473
 474         clock_t time_start, time_end;
 475         time_start = clock();
 476
 477         //caluate the prior probability of each genotype
 478         define_two_bases_markers();
 479         define_TransSubs();
 480         assign_PTi_rates();
 481         cerr << "Assign prior probability to genotype done:\n\n";
 482
 483         //load the error and quality profile matrix into memory
 484         load_soapsnp_error_profile(Error_profile_file);
 485         cerr << "Dimensions of error profile:\n";
 486
 487         string prior_file = Out_prefix + ".prior";
 488         output_prior_file(prior_file);
 489
 490         time_end = clock();
 491         cerr << "\nLoad the error profile done\n";
 492         cerr << "Run time: " << double(time_end - time_start) / CLOCKS_PER_SEC << endl;
 493
 494         //parse the pileup file, and calculate the likelihood of each genotypes
 495         igzstream infile;
 496         infile.open(pileup_file.c_str());
 497         if ( ! infile )
 498         {       cerr << "fail to open input file" << pileup_file << endl;
 499         }
 500
 501         string gentoytpe_file;
 502         if (Only_output_snp == 1)
 503         {       gentoytpe_file = Out_prefix + ".snp";
 504         }else
 505         {       gentoytpe_file = Out_prefix + ".cns";
 506         }
 507         ofstream outfile (gentoytpe_file.c_str());
 508         if ( ! outfile )
 509         {       cerr << "fail to open output file" << gentoytpe_file << endl;
 510         }
 511
 512         long int loop = 0;
 513         outfile << "#Chr\tPos\tRef\tGeno\tScore\tDepth\tBases\tQuals\tCycles\t[all mapped bases]\n";
 514         string lineStr;
 515         while (getline( infile, lineStr, '\n' ))
 516         {
 517                 vector<string> lineVec;
 518                 boost::split(lineVec,lineStr, boost::is_any_of("\t"), boost::token_compress_on);
 519                 int refBase = get_base(lineVec[2][0]);
 520
 521                 vector<int> baseVec;
 522                 get_clean_bases(refBase,lineVec[4],baseVec);
 523
 524                 vector<int> qualVec;
 525                 get_quality(qualVec, lineVec[5]);
 526
 527                 vector<int> cyclVec;
 528                 get_cycle(cyclVec, lineVec[7]);
 529
 530                 ////////////////debug///////////////
 531                 //cerr << "refBase: " << Bases[refBase] << endl;
 532                 //for (int i=0; i<baseVec.size(); i++)
 533                 //{     cerr << i+1 << "\t" << Bases[baseVec[i]] << "\t" << qualVec[i] << "\t" << cyclVec[i] << endl;
 534                 //}
 535
 536                 ////////////////debug///////////////
 537
 538                 //check the result of parsing pileup file
 539                 if (baseVec.size() != qualVec.size() || baseVec.size() != cyclVec.size())
 540                 {       cerr << "Error happens at : " << lineVec[4] << "\t" << lineVec[5] << "\t" << lineVec[7] << endl << endl;
 541                         exit(0);
 542                 }
 543
 544                 vector<int> FbaseVec;
 545                 vector<int> FqualVec;
 546                 vector<int> FcyclVec;
 547
 548                 //filter the observed data, base not N, and qual > 5;
 549                 for (int i=0; i<baseVec.size(); i++)
 550                 {       if (baseVec[i] != 4 && qualVec[i] >= 5) //当质量值小于5的时候，P(D|G)几乎是相等的，因此P(G|D) = P(G), 数据没有发挥作用，结果将只反映先验概率，造成错误。
 551                         {       FbaseVec.push_back(baseVec[i]);
 552                                 FqualVec.push_back(qualVec[i]);
 553                                 FcyclVec.push_back(cyclVec[i]);
 554                         }
 555                 }
 556                 if (FbaseVec.size() == 0)
 557                 {       continue;
 558                 }
 559
 560                 vector<rate_t> pTiDvec;
 561                 calculte_genotype_probability(refBase,pTiDvec, FbaseVec, FqualVec, FcyclVec);
 562                 int sample_genotype = 0;
 563                 int phred_score = 0;
 564                 find_genotype_score(pTiDvec,sample_genotype,phred_score);
 565                 if (Only_output_snp == 0 || sample_genotype != refBase)
 566                 {
 567
 568                         outfile << lineVec[0] << "\t" << lineVec[1] << "\t" << lineVec[2] << "\t";
 569                         if (sample_genotype < 4)
 570                         {       outfile << Bases[sample_genotype];
 571                         }else{
 572                                 outfile << TwoBases[sample_genotype] << "(" << Bases[DipGeno[sample_genotype][0]] << Bases[DipGeno[sample_genotype][1]] << ")";
 573                         }
 574
 575                         outfile << "\t" << phred_score << "\t" << FbaseVec.size() << "\t" ;
 576                         for (int i=0; i<FbaseVec.size(); i++)
 577                         {       outfile << Bases[FbaseVec[i]];
 578                         }
 579                         outfile << "\t";
 580                         for (int i=0; i<FbaseVec.size(); i++)
 581                         {       outfile << FqualVec[i] << ",";
 582                         }
 583                         outfile << "\t";
 584                         for (int i=0; i<FbaseVec.size(); i++)
 585                         {       outfile << FcyclVec[i] << ",";
 586                         }
 587                         outfile << "\t" << baseVec.size() << "\t"<< lineVec[4] << "\t" << lineVec[5] << "\t" << lineVec[7] << endl;
 588
 589                 }
 590
 591                 //loop ++;
 592                 //if (loop > 10000)
 593                 //{     break;
 594                 //}
 595         }
 596
 597         infile.close();
 598         outfile.close();
 599         time_end = clock();
 600         cerr << "\nParse the pileup file and genotype calculation done\n";
 601         cerr << "Run time: " << double(time_end - time_start) / CLOCKS_PER_SEC << endl;
 602
 603 }
 604