3 ########################################################################################################
5 # Usage: Rscript gene_id_run.R -i input_samples.csv -f allele_freq.csv -d database.csv -m input_info.txt
7 ########################################################################################################
9 # -i files/msSNP.csv -m files/meta.txt -d files/database.csv -f files/allele_freq.csv -p out
17 c("input","i",2,"character","the input genotype csv of suspect",
18 "allele_freq","f",2,"character","allele frequence csv of SNPs",
19 "database","d",2,"character","the database csv stored SNPs genotype",
20 "outpath","p",2,"character","out dir.",
21 "input_info","m",2,"character","meta infomation of input samples"),
22 byrow
= TRUE, ncol
= 5
25 opt
= getopt(spec
= spec
)
28 data(UCSC
.HG19
.Human
.CytoBandIdeogram
)
31 calculate
= function(input_geno_df
, sample_geno_df
, allele_freq_df
){
33 for (rsid
in colnames(input_geno_df
)){
34 ig
= input_geno_df
[1,rsid
]
35 sg
= sample_geno_df
[1,rsid
]
36 if ((ig
== 'Null') || (sg
== 'Null')){pm_value
= 1}
38 if (ig
!= sg
){pm_value
= 0.0001}
40 iw
= c(substr(ig
,1,1), substr(ig
,2,2))
41 sw
= c(substr(sg
,1,1), substr(sg
,2,2))
42 if (iw
[1] == iw
[2]){pm_value
= 1 / ((allele_freq_df
[rsid
,iw
[1]])**2)}
44 if ((allele_freq_df
[rsid
,iw
[1]]==0.01) || (allele_freq_df
[rsid
,iw
[2]]==0.01)){pm_value
= 100}
45 else{pm_value
= 1 / (2 * allele_freq_df
[rsid
,iw
[1]] * allele_freq_df
[rsid
,iw
[2]])}
49 sample_geno_df
[1,rsid
] = paste(sg
, pm_value
)
50 likehood
= likehood
* pm_value
52 sample_geno_df$tpm
= likehood
53 return(sample_geno_df
)
57 input_geno_df
= read
.csv(opt$input
, row
.names
= 1)
58 #allele_freq_df = read.csv(opt$allele_freq, row.names = 1)
59 #database_df = read.csv(opt$database, row.names = 1)
60 input_info_df
= suppressWarnings(read
.delim(opt$input_info
, fileEncoding
= "UTF-16", row
.names
= 1))
61 #input_geno_df = read.csv("C:\\Users\\wangjingdong\\Desktop\\gene_id_rscript\\gene_id_rscript\\data\\input_csv.csv", row.names = 1)
62 allele_freq_df
= read
.csv("files/allele_freq.csv", row
.names
= 1)
63 database_df
= read
.csv("files/database.csv", row
.names
= 1)
64 #input_info_df = read.delim("C:\\Users\\wangjingdong\\Desktop\\gene_id_rscript\\gene_id_rscript\\data\\input_info.txt", fileEncoding = "UTF-16", row.names = 1)
67 input_file_name
= strsplit(basename(opt$input
),".", fixed
= T)[[1]][1]
68 #input_file_name = strsplit(basename("C:\\Users\\wangjingdong\\Desktop\\gene_id_rscript\\gene_id_rscript\\data\\input_csv.csv"),".", fixed = T)[[1]][1]
69 #out_dir_name = paste(input_file_name, "_out", sep = '')
70 #out_dir_name = paste(opt$outpath)
72 dir
.create(out_dir_name
,showWarnings
=F)
73 dir
.create('pdf',showWarnings
=F)
75 # 对数据预处理,将等位基因频率0替换成0.01,将纯合‘A’替换成’AA‘
76 allele_freq_df
[allele_freq_df
==0] = 0.01
77 input_geno_df
[input_geno_df
== "TRUE"] = "T"
78 for (r
in rownames(input_geno_df
)){
79 for (c
in colnames(input_geno_df
)){
80 if (nchar(input_geno_df
[r
,c
]) == 1){
81 input_geno_df
[r
,c
] = paste(input_geno_df
[r
,c
], input_geno_df
[r
,c
], sep
= '')
86 for (r
in rownames(database_df
)){
87 for (c
in colnames(database_df
)){
88 if (nchar(database_df
[r
,c
]) == 1){
89 database_df
[r
,c
] = paste(database_df
[r
,c
], database_df
[r
,c
], sep
= '')
94 # 计算每个样本与数据库的比对结果,并返回比对总表、基因型报告两个表格
96 for (suspect
in rownames(input_geno_df
)){
97 match_file
= paste(suspect
, "_match.csv", sep
= '')
98 report_file
= paste(suspect
, "_report.csv", sep
= '')
99 suspect_geno_df
= input_geno_df
[suspect
,]
100 cat('Processing',suspect
,':')
101 # 计算该样本与数据库中每个样本的比对结果
102 samples
= rownames(database_df
)
103 sample_geno_df_one
= calculate(suspect_geno_df
, database_df
[samples
[1],], allele_freq_df
)
104 for (sample_name
in samples
[c(2:length(samples
))]){
105 sample_geno_df_two
= calculate(suspect_geno_df
, database_df
[sample_name
,], allele_freq_df
)
106 sample_geno_df_one
= rbind(sample_geno_df_one
, sample_geno_df_two
)
108 sorted_all_result_df
= sample_geno_df_one
[order(sample_geno_df_one
[,"tpm"], decreasing
= T), ]
111 if (sorted_all_result_df
[1,"tpm"] < 0.0001){statement
= "无匹配对象"}else{
112 if (sorted_all_result_df
[1,"tpm"] <= 8000000000){
113 mid_match
= subset(sorted_all_result_df
, (tpm
<= 8000000000) && (tpm
>= 0.0001))
114 samples
= paste(rownames(mid_match
), collapse
= ',')
115 statement
= paste("与样本", samples
, "可能一致, 应增加其它检测")
117 NeedRerun
<- c(NeedRerun
,suspect
)
119 top_match
= subset(sorted_all_result_df
, tpm
> 8000000000)
120 samples
= paste(rownames(top_match
), collapse
= ',')
121 statement
= paste("与样本", samples
, "一致")
124 sorted_all_result_df$tpm
= format(sorted_all_result_df$tpm
, scientific
= T)
125 # 输出单个样本比对总表,要根据系统调整文件名分隔符,‘/’或者‘\\’
126 write
.csv(sorted_all_result_df
, file
.path(out_dir_name
, match_file
), fileEncoding
= "UTF-8")
129 best_rate_person
= rownames(sorted_all_result_df
)[1]
130 best_rate
= sorted_all_result_df
[best_rate_person
, "tpm"]
133 trans_suspect_geno_df
= as
.data
.frame(t(suspect_geno_df
))
134 names(trans_suspect_geno_df
) = "GT"
135 report_df
= cbind(trans_suspect_geno_df
, allele_freq_df
[,c("Number","Chr","Pos","A","C","G","T")])
138 for (rsid
in rownames(report_df
)){
139 genotype
= report_df
[rsid
,"GT"]
140 if (genotype
== "Null"){report_df
[rsid
, "GT%"] = "0%"}
142 allele_1
= substr(genotype
, 1, 1)
143 allele_2
= substr(genotype
, 2, 2)
144 if (allele_1
== allele_2
){report_df
[rsid
, "GT%"] = paste(format((report_df
[rsid
, allele_1
]) ** 2 * 100, digits
= 4),"%", sep
= '')}
145 else{report_df
[rsid
, "GT%"] = paste(format(2 * report_df
[rsid
, allele_1
] * report_df
[rsid
, allele_2
] * 100, digits
= 4), "%", sep
= '')}
149 report_df$rsID
= rownames(report_df
)
150 report_df
[rownames(report_df
)[1],"report"] = suspect
151 report_df
[rownames(report_df
)[2],"report"] = best_rate
152 report_df
[rownames(report_df
)[3],"report"] = best_rate_person
153 report_df
[rownames(report_df
)[4],"report"] = input_info_df
[suspect
, "Name"]
154 report_df
[rownames(report_df
)[5],"report"] = input_info_df
[suspect
, "Sex"]
155 report_df
[rownames(report_df
)[6],"report"] = input_info_df
[suspect
, "Birth"]
156 report_df
[rownames(report_df
)[7],"report"] = input_info_df
[suspect
, "Date"]
157 report_df
[rownames(report_df
)[8],"report"] = input_info_df
[suspect
, "Tel"]
158 report_df
[rownames(report_df
)[9],"report"] = input_info_df
[suspect
, "Address"]
159 report_df
[rownames(report_df
)[10],"report"] = statement
160 report_df
[rownames(report_df
)[11],"report"] = QCstate
161 output_report_df
= report_df
[,c("Number","GT","GT%","rsID","Chr","Pos","report")]
163 # 输出单个样本基因型报告表格,要根据系统调整文件名分隔符,‘/’或者‘\\’
164 write
.csv(output_report_df
, file
.path(out_dir_name
, report_file
), row
.names
= F, fileEncoding
= "UTF-8")
167 sample_report_df
<- output_report_df
169 sample_report_df$End
= sample_report_df$Pos
170 for (rsid
in rownames(sample_report_df
)){
171 genotype
= sample_report_df
[rsid
,"GT"]
172 if (genotype
== "Null"){sample_report_df
[rsid
, "Val"] = 0}
174 allele_1
= substr(genotype
, 1, 1)
175 allele_2
= substr(genotype
, 2, 2)
176 if (allele_1
== allele_2
){sample_report_df
[rsid
, "Val"] = 1}
177 else{sample_report_df
[rsid
, "Val"] = 0.5}
181 base_group
= function(df
, base
){
183 for (i
in 1:length(rownames(df
))){
185 if (geno
== "Null"){bool
= append(bool
,FALSE)}
187 alleles
= c(substr(geno
, 1, 1), substr(geno
, 2, 2))
188 if (base
%in% alleles
){bool
= append(bool
,TRUE)}
189 else{bool
= append(bool
,FALSE)}
192 return(df
[rownames(df
)[bool
],c("Chr","Pos","End","Val")])
195 matrix_A
= base_group(sample_report_df
, "A")
196 matrix_A$PlotColor
= "#5050FF"
198 matrix_T
= base_group(sample_report_df
, "T")
199 matrix_T$PlotColor
= "#CC9900"
201 matrix_G
= base_group(sample_report_df
, "G")
202 matrix_G$PlotColor
= "#00C000"
204 matrix_C
= base_group(sample_report_df
, "C")
205 matrix_C$PlotColor
= "#E00000"
207 #out.file <- "nCircos.pdf"
208 png_file
<- paste(suspect
, "_report.png", sep
= '')
209 png(file
=file
.path(out_dir_name
, png_file
), height
=2160, width
=2160,pointsize
=48)
211 #RCircos.Set.Core.Components(cyto.info=UCSC.HG19.Human.CytoBandIdeogram, tracks.inside=4, tracks.outside=0, chr.exclude=c("chrX", "chrY"))
213 RCircos
.Set
.Core
.Components(cyto
.info
=UCSC
.HG19
.Human
.CytoBandIdeogram
, tracks
.inside
=4, tracks
.outside
=0, chr
.exclude
=c("chrX", "chrY"))
214 , classes
= "message")
216 rcircos
.params
<- RCircos
.Get
.Plot
.Parameters()
217 rcircos
.params$track
.background
<- 'white'
218 rcircos
.params$grid
.line
.color
<- 'white'
219 rcircos
.params$hist
.width
<- 160
220 rcircos
.params$track
.padding
<- 0
222 RCircos
.Reset
.Plot
.Parameters(rcircos
.params
)
223 , classes
= "message")
225 RCircos
.Set
.Plot
.Area(margins
=0)
226 legend("center", legend
=c("A", "C", "G", "T"), col
=c("#5050FF", "#E00000","#00C000","#CC9900"), lwd
=16)
228 RCircos
.Chromosome
.Ideogram
.Plot()
230 RCircos
.Histogram
.Plot(hist
.data
=matrix_A
, data
.col
=4,track
.num
=1, side
="in",is
.sorted
=F)
231 RCircos
.Histogram
.Plot(hist
.data
=matrix_C
, data
.col
=4,track
.num
=2, side
="in",is
.sorted
=F)
232 RCircos
.Histogram
.Plot(hist
.data
=matrix_G
, data
.col
=4,track
.num
=3, side
="in",is
.sorted
=F)
233 RCircos
.Histogram
.Plot(hist
.data
=matrix_T
, data
.col
=4,track
.num
=4, side
="in",is
.sorted
=F)
237 png_trimed
<- paste(suspect
, ".png", sep
= '')
238 imagemagickCLI
<- paste('convert -trim +repage -bordercolor none -border 0x20',file
.path(out_dir_name
, png_file
),file
.path(out_dir_name
, png_trimed
))
239 system(imagemagickCLI
)
241 thePrefix
<- paste(suspect
, "_report", sep
= '')
242 invisible(capture
.output(
243 rmarkdown
::render('20200608rep.Rmd',output_dir
=out_dir_name
,output_file
=paste0(thePrefix
,'.pdf'),quiet
=TRUE,clean
=TRUE)
246 mergeCLI
<- paste('qpdf --empty --pages inc/Cover.pdf ',file
.path(out_dir_name
, paste0(thePrefix
,'.pdf')),' -- ', file
.path('pdf', paste0(thePrefix
,'.pdf')) )
252 qcfilecon
<- file(file
.path(out_dir_name
, '_QC.txt'),'wt',encoding
='UTF-8')
253 if (length(NeedRerun
)) {
254 QCstate
<- paste(sep
="\n",'下列样品需要重测:',NeedRerun
,'')
256 QCstate
<- '这批样品正常。\n'
258 cat('\n=================\n',QCstate
,'=================\n',sep
='')
259 writeLines(QCstate
,con
=qcfilecon
)