projects/recombineMap/sperm_by_fw/snp_calling/call_sperm_snp.cpp

   1 //本程序根据参考序列上mapped的reads来计算测序样本的genotype, 此处仅适用于精子数据(二倍体杂合SNP已知)。
   2 //依据bayes theory，以二倍体杂合SNP和测序错误分值模型为先验概率，计算观测数据下每一种genotype的后验概率。
   3 //二倍体杂合SNP，只有两种genotype，且出现的先验概率各为0.5。
   4 //测序错误分值模型，与模拟reads所用的模型相同，由分析重测序数据而得。它是一个四维矩阵：ref_base x cycle x seq_base x quality。
   5 //解释：我们实际想得到的是P(D|genotype), 此处ref_base即为geotype, 而D包含三个属性(cycle,seq_base,quality)。
   6 //我们想得到是P(cycle,seq_base,quality|ref_base), 而它又等价于P(cycle|ref_base) * P(seq_base,quality|cycle,ref_base)。
   7 //因为P(cycle|ref_base)可以认为是一个常数，在整个bayes公式的计算中不起作用，因此我们把它忽略。
   8 //这样可以将P(seq_base,quality|cycle,ref_base)直接用于bayes公式，也就形成了ref_base*cycle作为列，seq_base*quality作为行的二维matrix文件。
   9 //本程序采用long double (128bit浮点数)，来高精度的表示概率。
  10 //coverage depth最大允许值为400,以防计算过程中数值指数部分超过long double界限。
  11 //选取概率最大的genotype作为最终结果，并将其错误率转换为phred quality, 设上限为100。
  12
  13 //此版读取soapsnp格式的error matrix；
  14
  15 //Author: Fan Wei, fanweisz09@gmail.com
  16 //Date: 2012/5/16
  17
  18
  19 #include <stdio.h>
  20 #include <iostream>
  21 #include <fstream>
  22 #include <string>
  23 #include <vector>
  24 #include <algorithm>
  25 #include <map>
  26 #include <set>
  27 #include <cmath>
  28 #include <inttypes.h>
  29 #include<zlib.h>
  30 #include <boost/lexical_cast.hpp>
  31 #include <boost/algorithm/string.hpp>
  32 #include "gzstream.h"
  33
  34 using namespace std;
  35
  36 typedef long double rate_t;
  37 map<long int,vector<rate_t> > PTi; //store the prior probability for each substitutions
  38 map<long int, string > DiGeno;
  39 rate_t *PdTi;   //store the distribution of error and quality
  40 int Ref_Base_num;
  41 int Cycle_num;
  42 int Seq_Base_num;
  43 int Quality_num;
  44 int Qual_add_val;
  45 int Cycle_add_val;
  46 int Ref_add_val;
  47 int Ref_Cyc_base_qual;
  48 string Error_profile_file;
  49 string Out_prefix = "output";
  50 int Phred_score_cutoff = 5;
  51 int Ascii_Qual_Start = 33;
  52
  53 //由０１２３ 4到ＡＣＧＴ N
  54 char Bases[5] ={
  55                 'A', 'C', 'G', 'T', 'N'
  56 };
  57
  58 void usage()
  59 {       cout << "calculate_sperm_genotype <diploid_snp_file> <sperm_pileup_file>" << endl;
  60         cout << "   version 1.0" << endl;
  61         cout << "   -m <file>   the matrix file of error and quality distributions, default=exe_path/XieSunney.blood.matrix.soapsnp" << endl;
  62         cout << "   -q <int>    ASCII chracter standing for quality==0, default=" << Ascii_Qual_Start << endl;
  63         cout << "   -s <file>   the cutoff of score in phred scale, default=" << Phred_score_cutoff << endl;
  64         cout << "   -h          get help information" << endl;
  65         exit(0);
  66 }
  67
  68
  69 //由ＡＣＧＴ N得到０１２３ 4
  70 int get_base (char refBase)
  71 {
  72         int refBaseId;
  73         switch (refBase)
  74         {
  75                 case 'A': refBaseId = 0; break;
  76                 case 'C': refBaseId = 1; break;
  77                 case 'G': refBaseId = 2; break;
  78                 case 'T': refBaseId = 3; break;
  79                 case 'a': refBaseId = 0; break;
  80                 case 'c': refBaseId = 1; break;
  81                 case 'g': refBaseId = 2; break;
  82                 case 't': refBaseId = 3; break;
  83                 default:  refBaseId = 4;
  84         }
  85         return refBaseId;
  86 }
  87
  88
  89 //限定freq_rate在1到1e-9之间，统计样本量级1G
  90 void load_soapsnp_error_profile (string &matrix_file)
  91 {
  92         igzstream infile;
  93         infile.open(matrix_file.c_str());
  94         if ( ! infile )
  95         {       cerr << "fail to open input file" << matrix_file << endl;
  96                 exit(0);
  97         }
  98
  99         int pi = 0; //PdTi的循环变量
 100         int line_field_number = 18;
 101         vector<string> textlines;
 102
 103         string lineStr;
 104         while (getline( infile, lineStr, '\n' ))
 105         {       if (lineStr[0] == '#')
 106                 {       continue;
 107                 }
 108
 109                 vector<string> lineVec;
 110                 boost::split(lineVec,lineStr, boost::is_any_of(" \t\n"), boost::token_compress_on);
 111                 if (lineVec.size() != line_field_number)
 112                 {       continue;
 113                 }
 114
 115                 Quality_num = boost::lexical_cast<int>(lineVec[0]) + 1;
 116                 Cycle_num = boost::lexical_cast<int>(lineVec[1]) + 1;
 117
 118                 textlines.push_back(lineStr);
 119         }
 120
 121         Ref_Base_num = 4;
 122         Seq_Base_num = 4;
 123         Ref_Cyc_base_qual = Ref_Base_num * Seq_Base_num * Quality_num * Cycle_num;
 124         Qual_add_val = Ref_Base_num * Seq_Base_num * Cycle_num;
 125         Cycle_add_val = Ref_Base_num * Seq_Base_num;
 126         Ref_add_val = Seq_Base_num;
 127
 128         PdTi = new rate_t[Ref_Cyc_base_qual];
 129         for (int j=0; j<textlines.size(); j++)
 130         {
 131                 vector<string> lineVec;
 132                 boost::split(lineVec,textlines[j], boost::is_any_of(" \t\n"), boost::token_compress_on);
 133                 int this_qual = boost::lexical_cast<int>(lineVec[0]);
 134                 int this_cycl = boost::lexical_cast<int>(lineVec[1]);
 135
 136                 for (int i=2; i<line_field_number; i++)
 137                 {       int k = i - 2;
 138                         int this_ref = k / Seq_Base_num;
 139                         int this_base = k % Seq_Base_num;
 140                         pi = this_qual*Qual_add_val + this_cycl*Cycle_add_val + this_ref*Ref_add_val  + this_base;
 141                         rate_t freq_rate = boost::lexical_cast<rate_t>(lineVec[i]);
 142
 143                         if (pi < Ref_Cyc_base_qual)
 144                         {       PdTi[pi] = (freq_rate > 1e-9L) ? freq_rate : 1e-9L;
 145                         }else{
 146                                 cerr << "pi exceeds its range " << Ref_Cyc_base_qual <<  endl;
 147                                 exit(0);
 148                         }
 149                 }
 150         }
 151
 152 }
 153
 154 //从snp文件中读取对于特定位置某一种genotype出现的先验概率
 155 void assign_PTi_rates (string &diploid_snp_file)
 156 {
 157         igzstream infile;
 158         infile.open(diploid_snp_file.c_str());
 159         if ( ! infile )
 160         {       cerr << "fail to open input file" << diploid_snp_file << endl;
 161         }
 162
 163         string lineStr;
 164         while (getline( infile, lineStr, '\n' ))
 165         {       vector<string> lineVec;
 166                 boost::split(lineVec,lineStr, boost::is_any_of("\t"), boost::token_compress_on);
 167
 168                 long int refpos = boost::lexical_cast<long int>(lineVec[1]);
 169                 vector<rate_t> snp_prob(4, 0.0);  //每种genotype的概率初始化为0
 170                 string hetero;
 171                 hetero.push_back(lineVec[5][0]);
 172                 hetero.push_back(lineVec[9][0]);
 173                 int snp1_id = get_base(lineVec[5][0]);
 174                 int snp2_id = get_base(lineVec[9][0]);
 175                 snp_prob[snp1_id] = 0.5;
 176                 snp_prob[snp2_id] = 0.5;
 177
 178                 PTi[refpos] = snp_prob;
 179                 DiGeno[refpos] = hetero;
 180         }
 181
 182 }
 183
 184 //从pileup格式的base_str中得到干净的bases信息
 185 void get_clean_bases(int refBase, string &raw_base_str,vector<int> &clean_base_vec)
 186 {
 187         int pos = 0;
 188         int len = raw_base_str.size();
 189         while (pos < len)
 190         {
 191                 if (raw_base_str[pos] == '^')
 192                 {       pos += 2;
 193                 }
 194                 else if (raw_base_str[pos] == '+' || raw_base_str[pos] == '-')
 195                 {       pos += 1;
 196                         string indel_num;
 197                         while (pos < len)
 198                         {       if (raw_base_str[pos] >= 48 && raw_base_str[pos] <= 57 )  //ASCII: 0-9
 199                                 {       indel_num.push_back(raw_base_str[pos]);
 200                                         pos ++;
 201                                 }
 202                                 else
 203                                 {       break;
 204                                 }
 205                         }
 206                         pos += atoi(indel_num.c_str());
 207                 }
 208                 else
 209                 {
 210                         switch (raw_base_str[pos])
 211                         {
 212                                 case '.': clean_base_vec.push_back(refBase); break;
 213                                 case ',': clean_base_vec.push_back(refBase); break;
 214                                 case 'A': clean_base_vec.push_back(0); break;
 215                                 case 'a': clean_base_vec.push_back(0); break;
 216                                 case 'C': clean_base_vec.push_back(1); break;
 217                                 case 'c': clean_base_vec.push_back(1); break;
 218                                 case 'G': clean_base_vec.push_back(2); break;
 219                                 case 'g': clean_base_vec.push_back(2); break;
 220                                 case 'T': clean_base_vec.push_back(3); break;
 221                                 case 't': clean_base_vec.push_back(3); break;
 222                                 case 'N': clean_base_vec.push_back(4); break;
 223                                 case 'n': clean_base_vec.push_back(4); break;
 224                                 case '*': clean_base_vec.push_back(4); break;
 225                                 case '>': clean_base_vec.push_back(4); break;
 226                                 case '<': clean_base_vec.push_back(4); break;
 227                                 //自动省略$符号的判断
 228                         }
 229                         pos += 1;
 230                 }
 231
 232         }
 233
 234 }
 235
 236
 237 //从pileup格式的quality_str中得到quality信息，并转换成phred-scale, 33开始
 238 void get_quality(vector<int> &qual_vec, string &qual_str)
 239 {
 240
 241         for (int i=0; i<qual_str.size(); i++)
 242         {       int qual_val = qual_str[i] - Ascii_Qual_Start;
 243                 qual_val = (qual_val < Quality_num) ? qual_val : Quality_num - 1;
 244                 qual_vec.push_back(qual_val);
 245         }
 246 }
 247
 248 //从pileup格式的cycle_str中得到cycle信息
 249 void get_cycle (vector<int> &cycl_vec, string &cycl_str)
 250 {
 251         vector<string> temp_vec;
 252         boost::split(temp_vec,cycl_str, boost::is_any_of(","), boost::token_compress_on);
 253         for (int i=0; i<temp_vec.size(); i++)
 254         {       //int cycle_val = boost::lexical_cast<int>(temp_vec[i]);  //runs much slow
 255                 int cycle_val = atoi(temp_vec[i].c_str()); //runs fast
 256                 if (cycle_val > Cycle_num)
 257                 {       cycle_val = Cycle_num;
 258                 }
 259                 cycl_vec.push_back(cycle_val - 1);  //cycle 0-99代表100个cycles
 260         }
 261 }
 262
 263
 264
 265 //根据指定的参考序列碱基，计算每一种genotype的后验概率P(Ti|D)
 266 //coverage depth最大允许值为400,以防计算过程中数值指数部分超过long double界限
 267 void calculte_genotype_probability(int refPose, vector<rate_t> &pTiDvec, vector<int> &baseVec, vector<int> &qualVec, vector<int> &cyclVec)
 268 {
 269         rate_t pD = 0.0;
 270         int len = (baseVec.size() < 400) ? baseVec.size() : 400;
 271         for (int genotype=0; genotype<4; genotype++)
 272         {       rate_t pTi = PTi[refPose][genotype];
 273                 rate_t pDTi = 1.0;
 274                 if (pTi != 0)
 275                 {
 276                         for (int j=0; j<len; j++)
 277                         {
 278                                 int num = qualVec[j]*Qual_add_val + cyclVec[j]*Cycle_add_val + genotype*Ref_add_val +  + baseVec[j];
 279                                 if (num >= Ref_Cyc_base_qual)
 280                                 {       cerr << "num exceeds its range " << num << endl;
 281                                         exit(0);
 282                                 }
 283                                 pDTi *= PdTi[num];
 284                         }
 285                 }
 286                 pTiDvec.push_back( pTi*pDTi );
 287                 pD += pTi*pDTi;
 288         }
 289
 290         for (int genotype=0; genotype<4; genotype++)
 291         {       pTiDvec[genotype] = pTiDvec[genotype] / pD;
 292         }
 293
 294 }
 295
 296 //find the genotype with maximum posterior likelihood and calcualte the phred-scale score (up to 100)
 297 void find_genotype_score(vector<rate_t> &pTiDvec, char &sample_base, int &phred_score)
 298 {
 299         int max_id = 0;
 300         rate_t max_likelihood = 0.0;
 301         for (int i=0; i<pTiDvec.size(); i++)
 302         {       if (pTiDvec[i] > max_likelihood)
 303                 {       max_id = i;
 304                         max_likelihood = pTiDvec[i];
 305                 }
 306         }
 307
 308         sample_base = Bases[max_id];
 309
 310         rate_t error_rate = 1 - pTiDvec[max_id];
 311         if (error_rate < 1e-10L)
 312         {       error_rate = 1e-10L;
 313         }
 314         phred_score = int(-10 * log10(error_rate));
 315 }
 316
 317
 318 int main(int argc, char *argv[])
 319 {
 320         //get options from command line
 321         int c;
 322         while((c=getopt(argc, argv, "m:q:s:h")) !=-1) {
 323                 switch(c) {
 324                         case 'm': Error_profile_file=optarg; break;
 325                         case 'q': Ascii_Qual_Start=atoi(optarg); break;
 326                         case 's': Phred_score_cutoff=atoi(optarg); break;
 327                         case 'h': usage(); break;
 328                         default: usage();
 329                 }
 330         }
 331         if (argc < 3) usage();
 332
 333         if (! Error_profile_file.size())
 334         {       string exepath = argv[0];
 335                 int numh = exepath.rfind("/");
 336                 Error_profile_file = exepath.substr(0,numh+1) + "XieSunney.blood.matrix.soapsnp";
 337         }
 338         cerr << "\nUsed error profile matrix file: " << Error_profile_file << endl;
 339
 340
 341         string diploid_snp_file = argv[optind++]; //optind, argv[optind++]顺序指向非option的参数
 342         string sperm_pileup_file = argv[optind++]; //optind, argv[optind++]顺序指向非option的参数
 343
 344         clock_t time_start, time_end;
 345         time_start = clock();
 346
 347         //caluate the prior probability of each genotype
 348         assign_PTi_rates(diploid_snp_file);
 349         cerr << "Load the snp file done:\n\n";
 350
 351         //load the error and quality profile matrix into memory
 352         load_soapsnp_error_profile(Error_profile_file);
 353         cerr << "Dimensions of error profile:\n";
 354         cerr << "\n     Ref_Base_num " << Ref_Base_num << endl;
 355         cerr << "       Cycle_num " << Cycle_num << endl;
 356         cerr << "       Seq_Base_num " << Seq_Base_num << endl;
 357         cerr << "       Quality_num " << Quality_num << endl;
 358         cerr << "       PdTi array size: " << Ref_Cyc_base_qual << endl;
 359         cerr << "       Qual_add_val " << Qual_add_val << endl;
 360         cerr << "       Cycle_add_val " << Cycle_add_val << endl;
 361         cerr << "       Ref_add_val " << Ref_add_val << endl;
 362
 363         time_end = clock();
 364         cerr << "\nLoad the error profile done\n";
 365         cerr << "Run time: " << double(time_end - time_start) / CLOCKS_PER_SEC << endl;
 366
 367         //parse the pileup file, and calculate the likelihood of each genotypes
 368         igzstream infile;
 369         infile.open(sperm_pileup_file.c_str());
 370         if ( ! infile )
 371         {       cerr << "fail to open input file" << sperm_pileup_file << endl;
 372         }
 373
 374         cout << "#Chr\tPos\tRef\tDiGt\tSpGt\tScore\tDepth\tBases\tQuals\tCycles\t[all mapped bases]\n";
 375         string lineStr;
 376         while (getline( infile, lineStr, '\n' ))
 377         {       vector<string> lineVec;
 378                 boost::split(lineVec,lineStr, boost::is_any_of("\t"), boost::token_compress_on);
 379
 380                 long int refPos = boost::lexical_cast<long int>(lineVec[1]);
 381
 382                 if ( ! PTi.count( refPos ) )
 383                 {       continue;
 384                 }
 385
 386                 int refBase = get_base(lineVec[2][0]);
 387
 388                 vector<int> baseVec;
 389                 get_clean_bases(refBase,lineVec[4],baseVec);
 390
 391                 vector<int> qualVec;
 392                 get_quality(qualVec, lineVec[5]);
 393
 394                 vector<int> cyclVec;
 395                 get_cycle(cyclVec, lineVec[7]);
 396
 397                 //check the result of parsing pileup file
 398                 if (baseVec.size() != qualVec.size() || baseVec.size() != cyclVec.size() || baseVec.size() == 0)
 399                 {       cerr << "Error happens at : " << lineVec[4] << "\t" << lineVec[5] << "\t" << lineVec[7] << endl << endl;
 400                         exit(0);
 401                 }
 402
 403                 vector<int> FbaseVec;
 404                 vector<int> FqualVec;
 405                 vector<int> FcyclVec;
 406
 407                 //filter the observed data, base not N, and qual > 5;
 408                 for (int i=0; i<baseVec.size(); i++)
 409                 {       if (baseVec[i] != 4 && qualVec[i] >= 5) //当质量值小于5的时候，P(D|G)几乎是相等的，因此P(G|D) = P(G), 数据没有发挥作用，结果将只反映先验概率，造成错误。
 410                         {       FbaseVec.push_back(baseVec[i]);
 411                                 FqualVec.push_back(qualVec[i]);
 412                                 FcyclVec.push_back(cyclVec[i]);
 413                         }
 414                 }
 415                 if (FbaseVec.size() == 0)
 416                 {       continue;
 417                 }
 418
 419                 vector<rate_t> pTiDvec;
 420                 calculte_genotype_probability(refPos,pTiDvec, FbaseVec, FqualVec, FcyclVec);
 421
 422                 char sample_base;
 423                 int phred_score;
 424                 find_genotype_score(pTiDvec,sample_base,phred_score);
 425
 426                 if (phred_score >= Phred_score_cutoff)
 427                 {
 428                         cout << lineVec[0] << "\t" << lineVec[1] << "\t" << Bases[refBase] << "\t" << DiGeno[refPos] << "\t" << sample_base << "\t" << phred_score << "\t" << FbaseVec.size() << "\t";
 429                         for (int i=0; i<FbaseVec.size(); i++)
 430                         {       cout << Bases[FbaseVec[i]];
 431                         }
 432                         cout << "\t";
 433                         for (int i=0; i<FbaseVec.size(); i++)
 434                         {       cout << FqualVec[i] << ",";
 435                         }
 436                         cout << "\t";
 437                         for (int i=0; i<FbaseVec.size(); i++)
 438                         {       cout << FcyclVec[i] << ",";
 439                         }
 440                         cout << "\t" << baseVec.size() << "\t"<< lineVec[4] << "\t" << lineVec[5] << "\t" << lineVec[7] << endl;
 441                 }
 442         }
 443
 444         time_end = clock();
 445         cerr << "\nParse the pileup file and genotype calculation done\n";
 446         cerr << "Run time: " << double(time_end - time_start) / CLOCKS_PER_SEC << endl;
 447
 448 }
 449