2 #calculates genomic estimated breeding values (GEBVs) using rrBLUP,
6 # Isaak Y Tecle (iyt2@cornell.edu)
10 suppressWarnings(suppressPackageStartupMessages({
17 library(genoDataFilter
)
18 library(phenoAnalysis
)
26 library(genoDataFilter
)
29 allArgs
<- commandArgs()
32 inputFiles
<- scan(grep("input_files", allArgs
, value
= TRUE),
35 outputFiles
<- scan(grep("output_files", allArgs
, value
= TRUE),
38 traitsFile
<- grep("traits", inputFiles
, value
= TRUE)
39 modelInfoFile
<- grep("model_info", inputFiles
, value
= TRUE)
40 message('model_info_file ', modelInfoFile
)
42 modelInfo
<- read
.table(modelInfoFile
,
43 header
=TRUE, sep
="\t",
46 modelInfo
<- column_to_rownames(modelInfo
, var
="Name")
47 traitId
<- modelInfo
["trait_id", 1]
48 traitAbbr
<- modelInfo
["trait_abbr", 1]
49 modelId
<- modelInfo
["model_id", 1]
50 protocolId
<- modelInfo
["protocol_id", 1]
51 protocolPage
<- modelInfo
["protocol_url", 1]
53 message('trait_id ', traitId
)
54 message('trait_abbr ', traitAbbr
)
55 message('protocol_id ', protocolId
)
56 message('protocol detail page ', protocolPage
)
58 message('model_id ', modelId
)
60 datasetInfoFile
<- grep("dataset_info", inputFiles
, value
= TRUE)
63 if (length(datasetInfoFile
) != 0 ) {
64 datasetInfo
<- scan(datasetInfoFile
, what
= "character")
65 datasetInfo
<- paste(datasetInfo
, collapse
= " ")
67 datasetInfo
<- c('single_population')
70 #validationTrait <- paste("validation", trait, sep = "_")
71 validationFile
<- grep('validation', outputFiles
, value
= TRUE)
73 if (is
.null(validationFile
)) {
74 stop("Validation output file is missing.")
77 #kinshipTrait <- paste("rrblup_training_gebvs", trait, sep = "_")
78 blupFile
<- grep('rrblup_training_gebvs', outputFiles
, value
= TRUE)
80 if (is
.null(blupFile
)) {
81 stop("GEBVs file is missing.")
84 #markerTrait <- paste("marker_effects", trait, sep = "_")
85 markerFile
<- grep('marker_effects', outputFiles
, value
= TRUE)
87 #traitPhenoFile <- paste("trait_phenotype_data", traitId, sep = "_")
88 modelPhenoFile
<- grep('model_phenodata', outputFiles
, value
= TRUE)
89 message('model input trait pheno file ', modelPhenoFile
)
90 modelGenoFile
<- grep('model_genodata', outputFiles
, value
= TRUE)
91 message('model input trait geno file ', modelGenoFile
)
92 traitRawPhenoFile
<- grep('trait_raw_phenodata', outputFiles
, value
= TRUE)
93 varianceComponentsFile
<- grep("variance_components", outputFiles
, value
= TRUE)
94 analysisReportFile
<- grep("_report_", outputFiles
, value
= TRUE)
95 genoFilteringLogFile
<- grep("genotype_filtering_log", outputFiles
, value
= TRUE)
97 filteredTrainingGenoFile
<- grep("filtered_training_genotype_data", outputFiles
, value
= TRUE)
98 filteredSelGenoFile
<- grep("filtered_selection_genotype_data", outputFiles
, value
= TRUE)
99 formattedPhenoFile
<- grep("formatted_phenotype_data", inputFiles
, value
= TRUE)
101 genoFile
<- grep("genotype_data_", inputFiles
, value
= TRUE)
103 if (is
.null(genoFile
)) {
104 stop("genotype data file is missing.")
107 if (file
.info(genoFile
)$size
== 0) {
108 stop(paste0("genotype data file ", genoFile
, " is empty."))
111 readfilteredTrainingGenoData
<- c()
112 filteredTrainingGenoData
<- c()
114 formattedPhenoData
<- c()
121 logHeading
<- paste0("Genomic Prediction Analysis Log for ", traitAbbr
, ".\n")
122 logHeading
<- append(logHeading
, paste0("Date: ", format(Sys
.time(), "%d %b %Y %H:%M"), "\n\n\n"))
123 logHeading
<- format(logHeading
, width
=80, justify
="c")
124 trainingLog
<- paste0("\n\n#Preprocessing training population genotype data.\n\n")
125 trainingLog
<- append(trainingLog
, "The following data filtering will be applied to the genotype dataset:\n\n")
126 trainingLog
<- append(trainingLog
, paste0("Markers with less or equal to ", maf
* 100, "% minor allele frequency (maf) will be removed.\n"))
127 trainingLog
<- append(trainingLog
, paste0("\nMarkers with greater or equal to ", markerFilter
* 100, "% missing values will be removed.\n"))
128 trainingLog
<- append(trainingLog
, paste0("Clones with greater or equal to ", cloneFilter
* 100, "% missing values will be removed.\n") )
130 if (length(filteredTrainingGenoFile
) != 0 && file
.info(filteredTrainingGenoFile
)$size
!= 0) {
131 filteredTrainingGenoData
<- fread(filteredTrainingGenoFile
,
132 na
.strings
= c("NA", "", "--", "-"),
135 genoData
<- data
.frame(filteredTrainingGenoData
)
136 genoData
<- column_to_rownames(genoData
, 'V1')
137 readfilteredTrainingGenoData
<- 1
140 if (is
.null(filteredTrainingGenoData
)) {
141 genoData
<- fread(genoFile
,
142 na
.strings
= c("NA", "", "--", "-"),
144 genoData
<- unique(genoData
, by
='V1')
145 genoData
<- data
.frame(genoData
)
146 genoData
<- column_to_rownames(genoData
, 'V1')
147 #genoDataFilter::filterGenoData
148 genoData
<- convertToNumeric(genoData
)
150 trainingLog
<- append(trainingLog
, "#Running training population genotype data cleaning.\n\n")
151 genoFilterOut
<- filterGenoData(genoData
, maf
=maf
, markerFilter
=markerFilter
, indFilter
=cloneFilter
, logReturn
=TRUE)
153 genoData
<- genoFilterOut$data
154 genoFilteringLog
<- genoFilterOut$log
155 genoData
<- roundAlleleDosage(genoData
)
156 filteredTrainingGenoData
<- genoData
159 genoFilteringLog
<- scan(genoFilteringLogFile
, what
= "character", sep
="\n")
160 genoFilteringLog
<- paste0(genoFilteringLog
, collapse
="\n")
163 message("genofilteringlogfile: ", genoFilteringLogFile
)
164 message(genoFilteringLog
)
165 trainingLog
<- append(trainingLog
, genoFilteringLog
)
167 genoData
<- genoData
[order(row
.names(genoData
)), ]
169 if (length(formattedPhenoFile
) != 0 && file
.info(formattedPhenoFile
)$size
!= 0) {
170 formattedPhenoData
<- data
.frame(fread(formattedPhenoFile
,
172 na
.strings
= c("NA", "", "--", "-", ".")
177 if (datasetInfo
== 'combined_populations') {
179 phenoFile
<- grep("model_phenodata", inputFiles
, value
= TRUE)
182 phenoFile
<- grep("\\/phenotype_data", inputFiles
, value
= TRUE)
185 if (is
.null(phenoFile
)) {
186 stop("phenotype data file is missing.")
189 if (file
.info(phenoFile
)$size
== 0) {
190 stop(paste0("phenotype data file ", phenoFile
, " is empty."))
194 phenoData
<- data
.frame(fread(phenoFile
,
196 na
.strings
= c("NA", "", "--", "-", "."),
203 traitRawPhenoData
<- c()
204 anovaLog
<- paste0("#Preprocessing training population phenotype data.\n\n")
206 if (datasetInfo
== 'combined_populations') {
207 anovaLog
<- scan(analysisReportFile
, what
= "character", sep
="\n")
208 anovaLog
<- paste0(anovaLog
, collapse
="\n")
210 if (!is
.null(formattedPhenoData
)) {
211 phenoTrait
<- subset(formattedPhenoData
, select
= traitAbbr
)
212 phenoTrait
<- na
.omit(phenoTrait
)
216 if (any(grepl('Average', names(phenoData
)))) {
217 phenoTrait
<- phenoData
%>% select(V1
, Average
) %>% data
.frame
219 phenoTrait
<- phenoData
222 colnames(phenoTrait
) <- c('genotypes', traitAbbr
)
226 if (!is
.null(formattedPhenoData
)) {
227 phenoTrait
<- subset(formattedPhenoData
, select
= c('V1', traitAbbr
))
228 phenoTrait
<- as
.data
.frame(phenoTrait
)
229 phenoTrait
<- na
.omit(phenoTrait
)
230 colnames(phenoTrait
)[1] <- 'genotypes'
232 } else if (length(grep('list', phenoFile
)) != 0) {
233 phenoTrait
<- averageTrait(phenoData
, traitAbbr
)
236 meansResult
<- getAdjMeans(phenoData
,
237 traitName
= traitAbbr
,
243 anovaLog
<- paste0(anovaLog
, meansResult$log
)
244 phenoTrait
<- meansResult$adjMeans
247 keepMetaCols
<- c('observationUnitName', 'germplasmName', 'studyDbId', 'locationName',
248 'studyYear', 'replicate', 'blockNumber', traitAbbr
)
250 traitRawPhenoData
<- phenoData
%>%
251 select(all_of(keepMetaCols
))
256 meanType
<- names(phenoTrait
)[2]
257 names(phenoTrait
) <- c('genotypes', traitAbbr
)
259 selectionTempFile
<- grep("selection_population", inputFiles
, value
= TRUE)
262 filteredPredGenoFile
<- c()
263 selectionAllFiles
<- c()
265 if (length(selectionTempFile
) !=0 ) {
266 selectionAllFiles
<- scan(selectionTempFile
, what
= "character")
268 selectionFile
<- grep("\\/genotype_data", selectionAllFiles
, value
= TRUE)
270 #filteredPredGenoFile <- grep("filtered_genotype_data_", selectionAllFiles, value = TRUE)
273 selectionPopGEBVsFile
<- grep("rrblup_selection_gebvs", outputFiles
, value
= TRUE)
276 readFilteredPredGenoData
<- c()
277 filteredPredGenoData
<- c()
279 ## if (length(filteredPredGenoFile) != 0 && file.info(filteredPredGenoFile)$size != 0) {
280 ## selectionData <- fread(filteredPredGenoFile, na.strings = c("NA", " ", "--", "-"),)
281 ## readFilteredPredGenoData <- 1
283 ## selectionData <- data.frame(selectionData)
284 ## rownames(selectionData) <- selectionData[, 1]
285 ## selectionData[, 1] <- NULL
289 if (length(selectionFile
) != 0) {
290 selectionLog
<- append(selectionLog
, paste0("#Data preprocessing of selection population genotype data.\n\n"))
292 selectionData
<- fread(selectionFile
,
294 na
.strings
= c("NA", "", "--", "-"))
296 selectionData
<- data
.frame(selectionData
)
297 selectionData
<- unique(selectionData
, by
='V1')
298 selectionData
<- column_to_rownames(selectionData
, 'V1')
299 selectionData
<- convertToNumeric(selectionData
)
301 selectionLog
<- append(selectionLog
, paste0("Running selection population genotype data cleaning."))
303 selectionFilterOut
<- filterGenoData(selectionData
, maf
=maf
, markerFilter
=markerFilter
, indFilter
=cloneFilter
, logReturn
=TRUE)
304 selectionData
<- selectionFilterOut$data
305 selectionLog
<- append(selectionLog
, selectionFilterOut$log
)
306 selectionData
<- roundAlleleDosage(selectionData
)
309 #impute genotype values for obs with missing values,
310 genoDataMissing
<- c()
312 if (sum(is
.na(genoData
)) > 0) {
313 genoDataMissing
<- c('yes')
315 genoData
<- na
.roughfix(genoData
)
316 genoData
<- data
.frame(genoData
)
319 #create phenotype and genotype datasets with
322 #extract observation lines with both
323 #phenotype and genotype data only.
324 trainingLog
<- append(trainingLog
, paste0("\n\n#Filtering for training population genotypes with both phenotype and marker data.\n\n"))
325 trainingLog
<- append(trainingLog
, paste0("After calculating trait averages, the training population phenotype dataset has ", length(rownames(phenoTrait
)), " individuals.\n") )
326 trainingLog
<- append(trainingLog
, paste0("After cleaning up for missing values, the training population genotype dataset has ", length(rownames(genoData
)), " individuals.\n") )
328 commonObs
<- intersect(phenoTrait$genotypes
, row
.names(genoData
))
330 trainingLog
<- append(trainingLog
, paste0(length(commonObs
), " individuals are shared in both phenotype and genotype datasets.\n"))
332 #remove genotyped lines without phenotype data
333 genoDataFilteredObs
<- genoData
[(rownames(genoData
) %in% commonObs
), ]
335 trainingLog
<- append(trainingLog
, paste0("After removing individuals without phenotype data, this genotype dataset has ", length(rownames(genoDataFilteredObs
)), " individuals.\n"))
337 #remove phenotyped lines without genotype data
338 phenoTrait
<- phenoTrait
[(phenoTrait$genotypes
%in% commonObs
), ]
340 trainingLog
<- append(trainingLog
, paste0("After removing individuals without genotype data, this phenotype dataset has ", length(rownames(phenoTrait
)), " individuals.\n" ))
342 phenoTraitMarker
<- data
.frame(phenoTrait
)
343 rownames(phenoTraitMarker
) <- phenoTraitMarker
[, 1]
344 phenoTraitMarker
[, 1] <- NULL
346 #impute missing data in prediction data
348 selectionDataMissing
<- c()
349 if (length(selectionData
) != 0) {
350 #purge markers unique to both populations
351 trainingMarkers
<- names(genoDataFilteredObs
)
352 selectionMarkers
<- names(selectionData
)
354 selectionLog
<- append(selectionLog
, paste0("#Comparing markers in the training and selection populations genotype datasets.\n\n" ))
356 selectionLog
<- append(selectionLog
, paste0("The training population genotype dataset has ", length(trainingMarkers
), " markers.\n" ))
357 selectionLog
<- append(selectionLog
, paste0("The selection population genotype dataset has ", length(selectionMarkers
), " markers.\n" ))
359 commonMarkers
<- intersect(trainingMarkers
, selectionMarkers
)
360 selectionLog
<- append(selectionLog
, paste0("The training and selection populations genotype dataset have ", length(trainingMarkers
), " markers in common.\n" ))
362 genoDataFilteredObs
<- subset(genoDataFilteredObs
, select
= commonMarkers
)
363 selectionLog
<- append(selectionLog
, paste0("After filtering for shared markers, the training population genotype dataset has ", length(names(selectionData
)), " markers.\n" ))
365 selectionData
<- subset(selectionData
, select
= commonMarkers
)
366 selectionLog
<- append(selectionLog
, paste0("After filtering for shared markers, the selection population genotype dataset has ", length(names(selectionData
)), " markers.\n" ))
368 if (sum(is
.na(selectionData
)) > 0) {
369 selectionDataMissing
<- c('yes')
370 selectionData
<- na
.roughfix(selectionData
)
371 selectionData
<- data
.frame(selectionData
)
374 #change genotype coding to [-1, 0, 1], to use the A.mat ) if [0, 1, 2]
375 genoTrCode
<- grep("2", genoDataFilteredObs
[1, ], value
= TRUE)
376 if(length(genoTrCode
)) {
377 genoData
<- genoData
- 1
378 genoDataFilteredObs
<- genoDataFilteredObs
- 1
381 if (length(selectionData
) != 0 ) {
382 genoSlCode
<- grep("2", selectionData
[1, ], value
= TRUE)
383 if (length(genoSlCode
) != 0 ) {
384 selectionData
<- selectionData
- 1
388 ordered
.markerEffects
<- c()
391 combinedGebvsFile
<- c()
393 modelPhenoData
<- c()
394 relationshipMatrix
<- c()
396 #additive relationship model
397 #calculate the inner products for
398 #genotypes (realized relationship matrix)
399 relationshipMatrixFile
<- grep("relationship_matrix_table", outputFiles
, value
= TRUE)
400 relationshipMatrixJsonFile
<- grep("relationship_matrix_json", outputFiles
, value
= TRUE)
402 traitRelationshipMatrixFile
<- grep("relationship_matrix_adjusted_table", outputFiles
, value
= TRUE)
403 traitRelationshipMatrixJsonFile
<- grep("relationship_matrix_adjusted_json", outputFiles
, value
= TRUE)
405 inbreedingFile
<- grep('inbreeding_coefficients', outputFiles
, value
=TRUE)
406 aveKinshipFile
<- grep('average_kinship', outputFiles
, value
=TRUE)
411 if (length(relationshipMatrixFile
) != 0) {
412 if (file
.info(relationshipMatrixFile
)$size
> 0 ) {
413 relationshipMatrix
<- data
.frame(fread(relationshipMatrixFile
,
416 rownames(relationshipMatrix
) <- relationshipMatrix
[, 1]
417 relationshipMatrix
[, 1] <- NULL
418 colnames(relationshipMatrix
) <- rownames(relationshipMatrix
)
419 relationshipMatrix
<- data
.matrix(relationshipMatrix
)
422 relationshipMatrix
<- A
.mat(genoData
)
423 diag(relationshipMatrix
) <- diag(relationshipMatrix
) %>% replace(., . < 1, 1)
424 relationshipMatrix
<- relationshipMatrix
%>% replace(., . <= 0, 0.00001)
426 inbreeding
<- diag(relationshipMatrix
)
427 inbreeding
<- inbreeding
- 1
428 inbreeding
<- data
.frame(inbreeding
)
430 inbreeding
<- inbreeding
%>%
431 rownames_to_column('genotypes') %>%
432 rename(Inbreeding
= inbreeding
) %>%
433 arrange(Inbreeding
) %>%
434 mutate_at('Inbreeding', round
, 3) %>%
435 column_to_rownames('genotypes')
439 relationshipMatrix
<- data
.frame(relationshipMatrix
)
440 colnames(relationshipMatrix
) <- rownames(relationshipMatrix
)
442 relationshipMatrix
<- rownames_to_column(relationshipMatrix
, var
="genotypes")
443 relationshipMatrix
<- relationshipMatrix
%>% mutate_if(is
.numeric
, round
, 5)
444 relationshipMatrix
<- column_to_rownames(relationshipMatrix
, var
="genotypes")
446 traitRelationshipMatrix
<- relationshipMatrix
[(rownames(relationshipMatrix
) %in% commonObs
), ]
447 traitRelationshipMatrix
<- traitRelationshipMatrix
[, (colnames(traitRelationshipMatrix
) %in% commonObs
)]
450 if (any(eigen(traitRelationshipMatrix
)$values
< 0) ) {
451 kinshipLog
<- paste0("\n\nNote: The kinship matrix of this dataset causes 'Not positive semi-definite error' while running the Cholesky decomposition. To fix this and run the modeling, a corrected positive semi-definite matrix was computed using the 'Matrix::nearPD' function. The negative eigen values from this decomposition nudged to positive values.\n\n")
453 traitRelationshipMatrix
<- Matrix
::nearPD(as
.matrix(traitRelationshipMatrix
))$mat
456 traitRelationshipMatrix
<- data
.matrix(traitRelationshipMatrix
)
458 nCores
<- detectCores()
461 nCores
<- (nCores
%/% 2)
466 modelingLog
<- paste0("\n\n#Training a model for ", traitAbbr
, ".\n\n")
467 modelingLog
<- append(modelingLog
, paste0("The genomic prediction modeling follows a two-step approach. First trait average values, as described above, are computed for each genotype. This is followed by the model fitting on the basis of single phenotype value for each genotype entry and kinship matrix computed from their marker data.\n"))
469 if (length(kinshipLog
)) {
470 modelingLog
<- append(modelingLog
, paste0(kinshipLog
))
473 if (length(selectionData
) == 0) {
475 trModel
<- kin
.blup(data
= phenoTrait
,
478 K
= traitRelationshipMatrix
,
483 modelingLog
<- paste0(modelingLog
, "The model training is based on rrBLUP R package, version ", packageVersion('rrBLUP'), ". GEBVs are predicted using the kin.blup function and GBLUP method.\n\n")
486 trGEBVPEV
<- trModel$PEV
487 trGEBVSE
<- sqrt(trGEBVPEV
)
488 trGEBVSE
<- data
.frame(round(trGEBVSE
, 2))
490 trGEBV
<- data
.frame(round(trGEBV
, 2))
492 colnames(trGEBVSE
) <- c('SE')
493 colnames(trGEBV
) <- traitAbbr
495 trGEBVSE
<- rownames_to_column(trGEBVSE
, var
="genotypes")
496 trGEBV
<- rownames_to_column(trGEBV
, var
="genotypes")
498 trGEBVSE
<- full_join(trGEBV
, trGEBVSE
)
500 trGEBVSE
<- trGEBVSE
%>% arrange(across(traitAbbr
, desc
))
501 trGEBVSE
<- column_to_rownames(trGEBVSE
, var
="genotypes")
503 trGEBV
<- trGEBV
%>% arrange(across(traitAbbr
, desc
))
504 trGEBV
<- column_to_rownames(trGEBV
, var
="genotypes")
506 phenoTraitMarker
<- data
.matrix(phenoTraitMarker
)
507 genoDataFilteredObs
<- data
.matrix(genoDataFilteredObs
)
509 markerEffects
<- mixed
.solve(y
= phenoTraitMarker
,
510 Z
= genoDataFilteredObs
513 ordered
.markerEffects
<- data
.matrix(markerEffects$u
)
514 ordered
.markerEffects
<- data
.matrix(ordered
.markerEffects
[order (-ordered
.markerEffects
[, 1]), ])
515 ordered
.markerEffects
<- round(ordered
.markerEffects
, 5)
517 colnames(ordered
.markerEffects
) <- c("Marker Effects")
518 ordered
.markerEffects
<- data
.frame(ordered
.markerEffects
)
520 modelPhenoData
<- data
.frame(round(phenoTraitMarker
, 2))
522 heritability
<- round((trModel$Vg
/(trModel$Ve
+ trModel$Vg
)), 2)
523 additiveVar
<- round(trModel$Vg
, 2)
524 errorVar
<- round(trModel$Ve
, 2)
526 varCompData
<- c("\nAdditive genetic variance\t", additiveVar
, "\n")
527 varCompData
<- append(varCompData
, c("Error variance\t", errorVar
, "\n"))
528 varCompData
<- append(varCompData
, c("SNP heritability (h)\t", heritability
, "\n"))
530 combinedGebvsFile
<- grep('selected_traits_gebv', outputFiles
, ignore
.case
= TRUE,value
= TRUE)
532 if (length(combinedGebvsFile
) != 0) {
533 fileSize
<- file
.info(combinedGebvsFile
)$size
534 if (fileSize
!= 0 ) {
535 combinedGebvs
<- data
.frame(fread(combinedGebvsFile
,
538 rownames(combinedGebvs
) <- combinedGebvs
[,1]
539 combinedGebvs
[,1] <- NULL
541 allGebvs
<- merge(combinedGebvs
, trGEBV
,
546 rownames(allGebvs
) <- allGebvs
[,1]
553 if (is
.null(selectionFile
)) {
554 genoNum
<- nrow(phenoTrait
)
557 warning(genoNum
, " is too small number of genotypes.")
564 cvFolds
<- createMultiFolds(phenoTrait
[, 2], k
=k
, times
=reps
)
566 modelingLog
<- paste0(modelingLog
, "Model prediction accuracy is evaluated using cross-validation method. ", k
, " folds, replicated ", reps
, " times are used to predict the model accuracy.\n\n")
569 re
<- paste0('Rep', r
)
572 fo
<- ifelse(i
< 10, 'Fold0', 'Fold')
574 trFoRe
<- paste0(fo
, i
, '.', re
)
575 trG
<- cvFolds
[[trFoRe
]]
576 slG
<- as
.numeric(rownames(phenoTrait
[-trG
,]))
578 kblup
<- paste("rKblup", i
, sep
= ".")
580 result
<- kin
.blup(data
= phenoTrait
[trG
,],
583 K
= traitRelationshipMatrix
,
588 assign(kblup
, result
)
590 #calculate cross-validation accuracy
592 valBlups
<- data
.frame(valBlups
)
594 slG
<- slG
[which(slG
<= nrow(phenoTrait
))]
596 slGDf
<- phenoTrait
[(rownames(phenoTrait
) %in% slG
),]
597 rownames(slGDf
) <- slGDf
[, 1]
600 valBlups
<- rownames_to_column(valBlups
, var
="genotypes")
601 slGDf
<- rownames_to_column(slGDf
, var
="genotypes")
603 valCorData
<- inner_join(slGDf
, valBlups
, by
="genotypes")
604 valCorData$genotypes
<- NULL
606 accuracy
<- try(cor(valCorData
))
607 validation
<- paste("validation", trFoRe
, sep
= ".")
608 cvTest
<- paste("CV", trFoRe
, sep
= " ")
610 if (inherits(accuracy
, 'try-error') == FALSE)
612 accuracy
<- round(accuracy
[1,2], digits
= 3)
613 accuracy
<- data
.matrix(accuracy
)
615 colnames(accuracy
) <- c("correlation")
616 rownames(accuracy
) <- cvTest
618 assign(validation
, accuracy
)
620 if (!is
.na(accuracy
[1,1])) {
621 validationAll
<- rbind(validationAll
, accuracy
)
627 validationAll
<- data
.frame(validationAll
[order(-validationAll
[, 1]), ])
628 colnames(validationAll
) <- c('Correlation')
632 selectionPopResult
<- c()
633 selectionPopGEBVs
<- c()
634 selectionPopGEBVSE
<- c()
636 #selection pop geno data after cleaning up and removing unique markers to selection pop
637 filteredSelGenoData
<- selectionData
638 if (length(selectionData
) != 0) {
640 genoDataTrSl
<- rbind(genoDataFilteredObs
, selectionData
)
641 rTrSl
<- A
.mat(genoDataTrSl
)
643 selectionPopResult
<- kin
.blup(data
= phenoTrait
,
651 selectionPopGEBVs
<- round(data
.frame(selectionPopResult$g
), 2)
652 colnames(selectionPopGEBVs
) <- traitAbbr
653 selectionPopGEBVs
<- rownames_to_column(selectionPopGEBVs
, var
="genotypes")
655 selectionPopPEV
<- selectionPopResult$PEV
656 selectionPopSE
<- sqrt(selectionPopPEV
)
657 selectionPopSE
<- data
.frame(round(selectionPopSE
, 2))
658 colnames(selectionPopSE
) <- 'SE'
659 genotypesSl
<- rownames(selectionData
)
661 selectionPopSE
<- rownames_to_column(selectionPopSE
, var
="genotypes")
662 selectionPopSE
<- selectionPopSE
%>% filter(genotypes
%in% genotypesSl
)
664 selectionPopGEBVs
<- selectionPopGEBVs
%>% filter(genotypes
%in% genotypesSl
)
666 selectionPopGEBVSE
<- inner_join(selectionPopGEBVs
, selectionPopSE
, by
="genotypes")
668 sortVar
<- parse_expr(traitAbbr
)
669 selectionPopGEBVs
<- selectionPopGEBVs
%>% arrange(desc((!!sortVar
)))
670 selectionPopGEBVs
<- column_to_rownames(selectionPopGEBVs
, var
="genotypes")
672 selectionPopGEBVSE
<- selectionPopGEBVSE
%>% arrange(desc((!!sortVar
)))
673 selectionPopGEBVSE
<- column_to_rownames(selectionPopGEBVSE
, var
="genotypes")
676 if (!is
.null(selectionPopGEBVs
) & length(selectionPopGEBVsFile
) != 0) {
677 fwrite(selectionPopGEBVs
,
678 file
= selectionPopGEBVsFile
,
685 if(!is
.null(validationAll
)) {
686 fwrite(validationAll
,
687 file
= validationFile
,
695 if (!is
.null(ordered
.markerEffects
)) {
696 fwrite(ordered
.markerEffects
,
705 if (!is
.null(trGEBV
)) {
714 if (length(combinedGebvsFile
) != 0 ) {
715 if(file
.info(combinedGebvsFile
)$size
== 0) {
717 file
= combinedGebvsFile
,
724 file
= combinedGebvsFile
,
732 if (!is
.null(modelPhenoData
) && length(modelPhenoFile
) != 0) {
734 if (!is
.null(meanType
)) {
735 colnames(modelPhenoData
) <- meanType
738 fwrite(modelPhenoData
,
739 file
= modelPhenoFile
,
746 if (!is
.null(genoDataFilteredObs
) && length(modelGenoFile
) != 0) {
748 fwrite(genoDataFilteredObs
,
749 file
= modelGenoFile
,
756 if (!is
.null(traitRawPhenoData
) && length(traitRawPhenoFile
) != 0) {
758 fwrite(traitRawPhenoData
,
759 file
= traitRawPhenoFile
,
767 if (!is
.null(filteredTrainingGenoData
) && file
.info(filteredTrainingGenoFile
)$size
== 0) {
768 fwrite(filteredTrainingGenoData
,
769 file
= filteredTrainingGenoFile
,
775 cat(genoFilteringLog
, fill
= TRUE, file
= genoFilteringLogFile
, append
=FALSE)
778 if (length(filteredSelGenoFile
) != 0 && file
.info(filteredSelGenoFile
)$size
== 0) {
779 fwrite(filteredSelGenoData
,
780 file
= filteredSelGenoFile
,
787 ## if (!is.null(genoDataMissing)) {
788 ## write.table(genoData,
797 ## if (!is.null(predictionDataMissing)) {
798 ## write.table(predictionData,
799 ## file = predictionFile,
806 if (file
.info(relationshipMatrixFile
)$size
== 0) {
808 fwrite(relationshipMatrix
,
809 file
= relationshipMatrixFile
,
816 if (file
.info(relationshipMatrixJsonFile
)$size
== 0) {
818 relationshipMatrixJson
<- relationshipMatrix
819 relationshipMatrixJson
[upper
.tri(relationshipMatrixJson
)] <- NA
822 relationshipMatrixJson
<- data
.frame(relationshipMatrixJson
)
824 relationshipMatrixList
<- list(labels
= names(relationshipMatrixJson
),
825 values
= relationshipMatrixJson
)
827 relationshipMatrixJson
<- jsonlite
::toJSON(relationshipMatrixList
)
830 write(relationshipMatrixJson
,
831 file
= relationshipMatrixJsonFile
,
835 if (file
.info(traitRelationshipMatrixFile
)$size
== 0) {
837 inbre
<- diag(traitRelationshipMatrix
)
840 diag(traitRelationshipMatrix
) <- inbre
842 traitRelationshipMatrix
<- data
.frame(traitRelationshipMatrix
) %>% replace(., . < 0, 0)
844 fwrite(traitRelationshipMatrix
,
845 file
= traitRelationshipMatrixFile
,
851 if (file
.info(traitRelationshipMatrixJsonFile
)$size
== 0) {
853 traitRelationshipMatrixJson
<- traitRelationshipMatrix
854 traitRelationshipMatrixJson
[upper
.tri(traitRelationshipMatrixJson
)] <- NA
856 traitRelationshipMatrixJson
<- data
.frame(traitRelationshipMatrixJson
)
858 traitRelationshipMatrixList
<- list(labels
= names(traitRelationshipMatrixJson
),
859 values
= traitRelationshipMatrixJson
)
861 traitRelationshipMatrixJson
<- jsonlite
::toJSON(traitRelationshipMatrixList
)
863 write(traitRelationshipMatrixJson
,
864 file
= traitRelationshipMatrixJsonFile
,
870 if (file
.info(inbreedingFile
)$size
== 0) {
873 file
= inbreedingFile
,
880 if (file
.info(aveKinshipFile
)$size
== 0) {
882 aveKinship
<- data
.frame(apply(traitRelationshipMatrix
, 1, mean
))
884 aveKinship
<- aveKinship
%>%
885 rownames_to_column('genotypes') %>%
886 rename(Mean_kinship
= contains('traitRe')) %>%
887 arrange(Mean_kinship
) %>%
888 mutate_at('Mean_kinship', round
, 3) %>%
889 column_to_rownames('genotypes')
892 file
= aveKinshipFile
,
899 if (file
.info(formattedPhenoFile
)$size
== 0 && !is
.null(formattedPhenoData
) ) {
900 fwrite(formattedPhenoData
,
901 file
= formattedPhenoFile
,
907 if (!is
.null(varCompData
)) {
908 cat(varCompData
, file
= varianceComponentsFile
)
911 if (!is
.null(selectionLog
)) {
912 cat(logHeading
, selectionLog
, fill
= TRUE, file
= analysisReportFile
, append
=FALSE)
914 cat(logHeading
, anovaLog
, trainingLog
, modelingLog
, fill
= TRUE, file
= analysisReportFile
, append
=FALSE)
919 q(save
= "no", runLast
= FALSE)