R/FGI/rmd4/doAll.r

   1 #!/usr/bin/env littler
   2
   3 ########################################################################################################
   4 #
   5 # Usage: Rscript gene_id_run.R -i input_samples.csv -f allele_freq.csv -d database.csv -m input_info.txt
   6 #
   7 ########################################################################################################
   8
   9 # -i files/msSNP.csv -m files/meta.txt -d files/database.csv -f files/allele_freq.csv -p out
  10
  11 # 设置数值显示位数
  12 options(scipen = 200)
  13
  14 # 读取命令行参数
  15 library(getopt)
  16 spec = matrix(
  17   c("input","i",2,"character","the input genotype csv of suspect",
  18     "allele_freq","f",2,"character","allele frequence csv of SNPs",
  19     "database","d",2,"character","the database csv stored SNPs genotype",
  20         "outpath","p",2,"character","out dir.",
  21     "input_info","m",2,"character","meta infomation of input samples"),
  22   byrow = TRUE, ncol = 5
  23 )
  24
  25 opt = getopt(spec = spec)
  26
  27 library(RCircos)
  28 data(UCSC.HG19.Human.CytoBandIdeogram)
  29
  30 # 定义一个计算两个样本相似度的核心函数
  31 calculate = function(input_geno_df, sample_geno_df, allele_freq_df){
  32   likehood = 1
  33   for (rsid in colnames(input_geno_df)){
  34     ig = input_geno_df[1,rsid]
  35     sg = sample_geno_df[1,rsid]
  36     if ((ig == 'Null') || (sg == 'Null')){pm_value = 1}
  37     else{
  38       if (ig != sg){pm_value = 0.0001}
  39       if (ig == sg){
  40         iw = c(substr(ig,1,1), substr(ig,2,2))
  41         sw = c(substr(sg,1,1), substr(sg,2,2))
  42         if (iw[1] == iw[2]){pm_value = 1 / ((allele_freq_df[rsid,iw[1]])**2)}
  43         if (iw[1] != iw[2]){
  44           if ((allele_freq_df[rsid,iw[1]]==0.01) || (allele_freq_df[rsid,iw[2]]==0.01)){pm_value = 100}
  45           else{pm_value = 1 / (2 * allele_freq_df[rsid,iw[1]] * allele_freq_df[rsid,iw[2]])}
  46         }
  47       }
  48     }
  49     sample_geno_df[1,rsid] = paste(sg, pm_value)
  50     likehood = likehood * pm_value
  51   }
  52   sample_geno_df$tpm = likehood
  53   return(sample_geno_df)
  54 }
  55
  56 # 读入文件
  57 input_geno_df = read.csv(opt$input, row.names = 1, stringsAsFactors=F)
  58 #allele_freq_df = read.csv(opt$allele_freq, row.names = 1)
  59 #database_df = read.csv(opt$database, row.names = 1)
  60 input_info_df = suppressWarnings(read.delim(opt$input_info, fileEncoding = "UTF-16", row.names = 1, colClasses = "character"))
  61 #input_geno_df = read.csv("C:\\Users\\wangjingdong\\Desktop\\gene_id_rscript\\gene_id_rscript\\data\\input_csv.csv", row.names = 1)
  62 allele_freq_df = read.csv("files/allele_freq.csv", row.names = 1)
  63 database_df = read.csv("files/database.csv", row.names = 1, stringsAsFactors=F)
  64 #input_info_df = read.delim("C:\\Users\\wangjingdong\\Desktop\\gene_id_rscript\\gene_id_rscript\\data\\input_info.txt", fileEncoding = "UTF-16", row.names = 1)
  65
  66 # 创建样本比对结果输出文件夹
  67 input_file_name = strsplit(basename(opt$input),".", fixed = T)[[1]][1]
  68 #input_file_name = strsplit(basename("C:\\Users\\wangjingdong\\Desktop\\gene_id_rscript\\gene_id_rscript\\data\\input_csv.csv"),".", fixed = T)[[1]][1]
  69 #out_dir_name = paste(input_file_name, "_out", sep = '')
  70 #out_dir_name = paste(opt$outpath)
  71 out_dir_name <- 'out'
  72 dir.create(out_dir_name,showWarnings=F)
  73 dir.create('pdf',showWarnings=F)
  74
  75 # 对数据预处理，将等位基因频率0替换成0.01，将纯合‘A’替换成’AA‘
  76 allele_freq_df[allele_freq_df==0] = 0.01
  77 input_geno_df[input_geno_df == "TRUE"] = "T"
  78 for (r in rownames(input_geno_df)){
  79   for (c in colnames(input_geno_df)){
  80     if (nchar(input_geno_df[r,c]) == 1){
  81       input_geno_df[r,c] = paste(input_geno_df[r,c], input_geno_df[r,c], sep = '')
  82     }
  83   }
  84 }
  85
  86 for (r in rownames(database_df)){
  87   for (c in colnames(database_df)){
  88     if (nchar(database_df[r,c]) == 1){
  89       database_df[r,c] = paste(database_df[r,c], database_df[r,c], sep = '')
  90     }
  91   }
  92 }
  93
  94 # 计算每个样本与数据库的比对结果，并返回比对总表、基因型报告两个表格
  95 NeedRerun = c()
  96 for (suspect in rownames(input_geno_df)){
  97   match_file = paste(suspect, "_match.csv", sep = '')
  98   report_file = paste(suspect, "_report.csv", sep = '')
  99   suspect_geno_df = input_geno_df[suspect,]
 100   cat('Processing',suspect,':')
 101   # 计算该样本与数据库中每个样本的比对结果
 102   samples = rownames(database_df)
 103   sample_geno_df_one = calculate(suspect_geno_df, database_df[samples[1],], allele_freq_df)
 104   for (sample_name in samples[c(2:length(samples))]){
 105     sample_geno_df_two = calculate(suspect_geno_df, database_df[sample_name,], allele_freq_df)
 106     sample_geno_df_one = rbind(sample_geno_df_one, sample_geno_df_two)
 107   }
 108   sorted_all_result_df = sample_geno_df_one[order(sample_geno_df_one[,"tpm"], decreasing = T), ]
 109   # 对匹配结果进行分级，做判断陈述
 110   QCstate = 'PASS'
 111   if (sorted_all_result_df[1,"tpm"] < 0.0001){statement = "无匹配对象"}else{
 112     if (sorted_all_result_df[1,"tpm"] <= 8000000000){
 113       mid_match = subset(sorted_all_result_df, (tpm <= 8000000000) && (tpm >= 0.0001))
 114       samples = paste(rownames(mid_match), collapse = ',')
 115       statement = paste("与样本", samples, "可能一致, 应加测数据")
 116       QCstate = 'FAIL'
 117       NeedRerun <- c(NeedRerun,suspect)
 118     }else{
 119       top_match = subset(sorted_all_result_df, tpm > 8000000000)
 120       samples = paste(rownames(top_match), collapse = ',')
 121       statement = paste("与样本", samples, "一致")
 122     }
 123   }
 124   sorted_all_result_df$tpm = format(sorted_all_result_df$tpm, scientific = T)
 125   # 输出单个样本比对总表，要根据系统调整文件名分隔符，‘/’或者‘\\’
 126   write.csv(sorted_all_result_df, file.path(out_dir_name, match_file), fileEncoding = "UTF-8")
 127
 128   # 输出单个样本的基因分型报告
 129   best_rate_person = rownames(sorted_all_result_df)[1]
 130   best_rate = sorted_all_result_df[best_rate_person, "tpm"]
 131
 132   # 合成初步的报告表格
 133   trans_suspect_geno_df = as.data.frame(t(suspect_geno_df))
 134   names(trans_suspect_geno_df) = "GT"
 135   report_df = cbind(trans_suspect_geno_df, allele_freq_df[,c("Number","Chr","Pos","A","C","G","T")])
 136
 137   # 计算人群基因型频率百分比
 138   for (rsid in rownames(report_df)){
 139     genotype = report_df[rsid,"GT"]
 140     if (genotype == "Null"){report_df[rsid, "GT%"] = "0%"}
 141     else{
 142       allele_1 = substr(genotype, 1, 1)
 143       allele_2 = substr(genotype, 2, 2)
 144       if (allele_1 == allele_2){report_df[rsid, "GT%"] = paste(format((report_df[rsid, allele_1]) ** 2 * 100, digits = 4),"%", sep = '')}
 145       else{report_df[rsid, "GT%"] = paste(format(2 * report_df[rsid, allele_1] * report_df[rsid, allele_2] * 100, digits = 4), "%", sep = '')}
 146     }
 147   }
 148
 149   report_df$rsID = rownames(report_df)
 150   report_df[rownames(report_df)[1],"report"] = suspect
 151   report_df[rownames(report_df)[2],"report"] = best_rate
 152   report_df[rownames(report_df)[3],"report"] = best_rate_person
 153   report_df[rownames(report_df)[4],"report"] = input_info_df[suspect, "Name"]
 154   report_df[rownames(report_df)[5],"report"] = input_info_df[suspect, "Sex"]
 155   report_df[rownames(report_df)[6],"report"] = input_info_df[suspect, "Birth"]
 156   report_df[rownames(report_df)[7],"report"] = input_info_df[suspect, "Date"]
 157   report_df[rownames(report_df)[8],"report"] = input_info_df[suspect, "Tel"]
 158   report_df[rownames(report_df)[9],"report"] = input_info_df[suspect, "Address"]
 159   report_df[rownames(report_df)[10],"report"] = statement
 160   report_df[rownames(report_df)[11],"report"] = QCstate
 161   output_report_df = report_df[,c("Number","GT","GT%","rsID","Chr","Pos","report")]
 162
 163   # 输出单个样本基因型报告表格，要根据系统调整文件名分隔符，‘/’或者‘\\’
 164   write.csv(output_report_df, file.path(out_dir_name, report_file), row.names = F, fileEncoding = "UTF-8")
 165
 166   # Plot
 167   sample_report_df <- output_report_df
 168   # 准备数据框为目标画图形式
 169   sample_report_df$End = sample_report_df$Pos
 170   for (rsid in rownames(sample_report_df)){
 171     genotype = sample_report_df[rsid,"GT"]
 172     if (genotype == "Null"){sample_report_df[rsid, "Val"] = 0}
 173     else{
 174       allele_1 = substr(genotype, 1, 1)
 175       allele_2 = substr(genotype, 2, 2)
 176       if (allele_1 == allele_2){sample_report_df[rsid, "Val"] = 1}
 177       else{sample_report_df[rsid, "Val"] = 0.5}
 178     }
 179   }
 180   # 分成4个碱基文件
 181   base_group = function(df, base){
 182     bool = c()
 183     for (i in 1:length(rownames(df))){
 184       geno = df[i,2]
 185       if (geno == "Null"){bool = append(bool,FALSE)}
 186       else{
 187         alleles = c(substr(geno, 1, 1), substr(geno, 2, 2))
 188         if (base %in% alleles){bool = append(bool,TRUE)}
 189         else{bool = append(bool,FALSE)}
 190       }
 191     }
 192     return(df[rownames(df)[bool],c("Chr","Pos","End","Val")])
 193   }
 194
 195   matrix_A = base_group(sample_report_df, "A")
 196   matrix_A$PlotColor = "#5050FF"
 197
 198   matrix_T = base_group(sample_report_df, "T")
 199   matrix_T$PlotColor = "#CC9900"
 200
 201   matrix_G = base_group(sample_report_df, "G")
 202   matrix_G$PlotColor = "#00C000"
 203
 204   matrix_C = base_group(sample_report_df, "C")
 205   matrix_C$PlotColor = "#E00000"
 206
 207   #out.file <- "nCircos.pdf"
 208   png_file <- paste(suspect, "_report.png", sep = '')
 209   png(file=file.path(out_dir_name, png_file), height=2160, width=2160,pointsize=48)
 210
 211   #RCircos.Set.Core.Components(cyto.info=UCSC.HG19.Human.CytoBandIdeogram, tracks.inside=4, tracks.outside=0, chr.exclude=c("chrX", "chrY"))
 212   suppressMessages(
 213           RCircos.Set.Core.Components(cyto.info=UCSC.HG19.Human.CytoBandIdeogram, tracks.inside=4, tracks.outside=0, chr.exclude=c("chrX", "chrY"))
 214   )
 215
 216   rcircos.params <- RCircos.Get.Plot.Parameters()
 217   rcircos.params$track.background <- 'white'
 218   rcircos.params$grid.line.color <- 'white'
 219   rcircos.params$hist.width <- 160
 220   rcircos.params$track.padding <- 0
 221   suppressMessages(
 222           RCircos.Reset.Plot.Parameters(rcircos.params)
 223   )
 224
 225   RCircos.Set.Plot.Area(margins=0)
 226   legend("center", legend=c("A", "C", "G", "T"), col=c("#5050FF", "#E00000","#00C000","#CC9900"), lwd=16)
 227
 228   RCircos.Chromosome.Ideogram.Plot()
 229
 230   RCircos.Histogram.Plot(hist.data=matrix_A, data.col=4,track.num=1, side="in",is.sorted=F)
 231   RCircos.Histogram.Plot(hist.data=matrix_C, data.col=4,track.num=2, side="in",is.sorted=F)
 232   RCircos.Histogram.Plot(hist.data=matrix_G, data.col=4,track.num=3, side="in",is.sorted=F)
 233   RCircos.Histogram.Plot(hist.data=matrix_T, data.col=4,track.num=4, side="in",is.sorted=F)
 234
 235   dev.off()
 236
 237   png_trimed <- paste(suspect, ".png", sep = '')
 238   imagemagickCLI <- paste('convert -trim +repage -bordercolor none -border 0x20',file.path(out_dir_name, png_file),file.path(out_dir_name, png_trimed))
 239   system(imagemagickCLI)
 240
 241   thePrefix <- paste(suspect, "_report", sep = '')
 242   invisible(capture.output(
 243           rmarkdown::render('20200608rep.Rmd',output_dir=out_dir_name,output_file=paste0(thePrefix,'.pdf'),quiet=TRUE,clean=TRUE)
 244   ))
 245
 246   mergeCLI <- paste('qpdf --empty --pages inc/Cover.pdf ',file.path(out_dir_name, paste0(thePrefix,'.pdf')),' -- ', file.path('pdf', paste0(thePrefix,'.pdf')) )
 247   system(mergeCLI)
 248
 249   cat(' done.\n')
 250 }
 251
 252 qcfilecon <- file(file.path('pdf', '_QC.txt'),'wt',encoding='UTF-8')
 253 if (length(NeedRerun)) {
 254         QCstate <- paste(sep="\n",'下列样品需要重测：',NeedRerun,'')
 255 } else {
 256         QCstate <- '这批样品正常。\n'
 257 }
 258 cat('\n=================\n',QCstate,'=================\n',sep='')
 259 writeLines(QCstate,con=qcfilecon)
 260 close(qcfilecon)