tools/genotyping/mpr.R

   1 # MPR-internal.R
   2 `.loopMPR` <-
   3 function (genoData, allele.matrix, numStep = 1, ...)
   4 {
   5     rawGenoSum <- NumRecomEvents(genoData = genoData)
   6     for (x in 1:(nrow(allele.matrix) - numStep + 1)) {
   7         ids <- x:(x + numStep - 1)
   8         new.genoData <- genoData
   9         new.genoData[ids, ] <- abs(new.genoData[ids, ] - 1)
  10         newGenoSum <- NumRecomEvents(genoData = new.genoData)
  11         if (newGenoSum < rawGenoSum) {
  12             allele.matrix[ids, ] <- allele.matrix[ids, c(2, 1)]
  13             genoData <- new.genoData
  14             rawGenoSum <- newGenoSum
  15         }
  16     }
  17     list(rawGenoSum, allele.matrix, genoData)
  18 }
  19 `.MPR_hetero_` <-
  20 0.5
  21 `.MPR_p0_` <-
  22 0
  23 `.MPR_p1_` <-
  24 1
  25 `._rice_phy2get_factor_` <-
  26 244000
  27
  28 # NumRecomEvents.R
  29 `NumRecomEvents` <-
  30 function (baseData, allele.matrix, genoData = NULL)
  31 {
  32     if (is.null(genoData)) {
  33         genoData <- base2Geno(baseData, allele.matrix)
  34     }
  35     y <- !is.na(genoData)
  36     idsBorder <- cumsum(colSums(y))
  37     idsBorder <- idsBorder[idsBorder < sum(y)]
  38     sum(diff(as.numeric(genoData[y]))[-idsBorder] != 0, na.rm = TRUE)
  39 }
  40
  41 #R A or G
  42 #Y C or T
  43 #S G or C
  44 #W A or T
  45 #K G or T
  46 #M A or C
  47 #B C or G or T
  48 #D A or G or T
  49 #H A or C or T
  50 #V A or C or G
  51 #N any base
  52 #        if (length(x[x == 'R'])) {
  53 #            x[x == 'R']=NA;
  54 #            x=c(x,'A','G');
  55 #        }
  56
  57 # base2Allele.R
  58 .IUPAC=c('R','Y','S','W','K','M','B','D','H','V','N');
  59 .Decode=list(R=c('A','G'),Y=c('C','T'),S=c('G','C'),W=c('A','T'),K=c('G','T'),M=c('A','C'),B=c('C','G','T'),D=c('A','G','T'),H=c('A','C','T'),V=c('A','C','G'),N=NA);
  60
  61 `base2Allele` <-
  62 function (baseData = NULL)
  63 {
  64     allele.matrix <- t(apply(baseData, 1, function(x) {
  65         x <- unique(x[!is.na(x)])
  66
  67         for(i in 1:length(.IUPAC)) {
  68             if ( length(grep(.IUPAC[i],x)) ) {
  69                   x=x[-(grep(.IUPAC[i],x))]
  70                   x=c(x,.Decode[[i]])
  71             }
  72         }
  73
  74         x <- unique(x[!is.na(x)])
  75         if (length(x) == 2)
  76             return(x)
  77         else warning('SNP sit is not biallelic!')
  78         c(NA, NA)
  79     }))
  80     colnames(allele.matrix) <- c('P1', 'P2')
  81     na.exclude(allele.matrix)
  82 }
  83
  84 # base2Geno.R
  85 `base2Geno` <-
  86 function (baseData = NULL, allele.matrix = NULL)
  87 {
  88     if (nrow(baseData) == ncol(allele.matrix))
  89         allele.matrix <- t(allele.matrix)
  90     if (nrow(baseData) != nrow(allele.matrix))
  91         stop('nrow(baseData)!=nrow(allele.matrix), allele.matrix error!!!')
  92     genoData <- baseData
  93     genoData[baseData == allele.matrix[, 1]] <- 0
  94     genoData[baseData == allele.matrix[, 2]] <- 1
  95     genoData[genoData != 1 & genoData != 0] <- NA
  96     genoData <- matrix(as.numeric(genoData), ncol = ncol(genoData))
  97     dimnames(genoData) <- dimnames(baseData)
  98     genoData
  99 }
 100
 101 # calculateLODScore.R
 102 `calculateLODScore` <-
 103 function (geno.data, pheno.data, mapInfo)
 104 {
 105     geno.data.tmp <- geno.data > 0.5
 106     geno.data.tmp[geno.data.tmp == FALSE] <- NA
 107     ph1 <- matrix(t(matrix(pheno.data, nrow = length(pheno.data),
 108         ncol = nrow(geno.data)))[geno.data.tmp], ncol = ncol(geno.data))
 109     geno.data.tmp <- geno.data < 0.5
 110     geno.data.tmp[geno.data.tmp == FALSE] <- NA
 111     ph0 <- matrix(t(matrix(pheno.data, nrow = length(pheno.data),
 112         ncol = nrow(geno.data)))[geno.data.tmp], ncol = ncol(geno.data))
 113     g1 <- rowMeans(ph1, na.rm = T)
 114     n1 <- rowSums(!is.na(ph1))
 115     s1 <- rowVars(ph1, na.rm = T)
 116     g0 <- rowMeans(ph0, na.rm = T)
 117     n0 <- rowSums(!is.na(ph0))
 118     s0 <- rowVars(ph0, na.rm = T)
 119     n <- rowSums(!is.na(geno.data))
 120     ss <- (s1 * (n1 - 1) + s0 * (n0 - 1))/(n1 + n0 - 2)
 121     t <- (g1 - g0)/sqrt(ss/n1 + ss/n0)
 122     res <- data.frame(mapInfo, '-log10 P' = -log10(pt(abs(t),
 123         df = pmax(1, n1 + n0 - 2), lower.tail = F, log.p = FALSE) *
 124         2), t = t, d = g1 - g0, n0 = n0, n1 = n1, check.names = FALSE)
 125     class(res) <- c('scanone', 'data.frame')
 126     res
 127 }
 128
 129 # correctFUNHMM.R
 130 `correctFUNHMM` <-
 131 function (x, base.position = as.numeric(sub('[^0-9]*([0-9]*)[^0-9]*',
 132     '\\1', names(x))), hmmFUN = hmm.vitFUN.rils, geno.probability = c(0.495,
 133     0.495, 0.01), transitionFUN = phy2get.haldane.rils, emissionFUN = makeEmissionFUN(errorRate = 0.01),
 134     ...)
 135 {
 136     x.cr <- hmmFUN(geno = x + 1, position = base.position, geno.probability = geno.probability,
 137         transitionFUN = transitionFUN, emissionFUN = emissionFUN,
 138         ...) - 1
 139     x.cr[x.cr == 2] <- .MPR_hetero_
 140     x.cr
 141 }
 142
 143 # correctGeno.R
 144 `correctGeno` <-
 145 function (geno.data, base.position = as.numeric(sub('[^0-9]*([0-9]*)[^0-9]*',
 146     '\\1', rownames(geno.data))), correct.FUN = correctFUNHMM,
 147     minInterval = 1, verbose = TRUE, ...)
 148 {
 149     i <- 0
 150     geno.data.cr <- apply(geno.data, 2, function(x, ...) {
 151         if (verbose)
 152             cat('\r', i <<- i + 1)
 153         x.nna <- which(!is.na(x))
 154         ids <- sort(unique(x.nna[c(1, which(diff(x[x.nna]) !=
 155             0 | diff(base.position[x.nna]) >= minInterval) +
 156             1)]))
 157         x.cr <- correct.FUN(x[ids], base.position = base.position[ids],
 158             ...)
 159         x[x.nna] <- NA
 160         x[ids] <- x.cr
 161         x
 162     }, ...)
 163     if (verbose)
 164         cat('\tDone.\n')
 165     geno.data.cr
 166 }
 167
 168 # findBlockAndFilter.R
 169 `findBlockAndFilter` <-
 170 function (x, base.position = 1:length(x), size = sum(blocks[,
 171     'size'], na.rm = TRUE)/100, num = sum(blocks[, 'num'], na.rm = TRUE)/100,
 172     extendNA = TRUE, fillSmallNA = FALSE)
 173 {
 174     x <- as.numeric(x)
 175     t <- length(x)
 176     block.num <- 1
 177     block_start <- base.position[1]
 178     block_end <- base.position[t]
 179     blockNumItem <- t
 180     blockType <- x[1]
 181     x.val <- x
 182     x.val[is.na(x)] <- max(x, na.rm = TRUE) + 1
 183     x.diff <- sort(unique(c(1, which(x.val[-1] != x.val[-t]) +
 184         1)))
 185     x.diff <- x.diff[x.diff <= t]
 186     block.num <- length(x.diff)
 187     if (block.num <= 1)
 188         return(blocks <- cbind(start = block_start, end = block_end,
 189             size = block_end - block_start + 1, num = blockNumItem,
 190             type = blockType))
 191     block_start <- base.position[x.diff]
 192     block_end <- base.position[c(x.diff[-1] - 1, t)]
 193     blockNumItem <- c(diff(x.diff), t - x.diff[block.num] + 1)
 194     blockType <- x[x.diff]
 195     blocks <- cbind(start = block_start, end = block_end, size = block_end -
 196         block_start + 1, num = blockNumItem, type = blockType)
 197     if (extendNA) {
 198         block.ids <- which(is.na(blocks[, 'type']) & blocks[,
 199             'size'] > -1)
 200         block.ids.next <- block.ids[block.ids < nrow(blocks)]
 201         block.ids.pre <- block.ids[block.ids > 1]
 202         if (length(block.ids.next) > 0)
 203             blocks[block.ids.next, 'end'] <- blocks[block.ids.next +
 204                 1, 'start'] - 1
 205         if (length(block.ids.pre) > 0)
 206             blocks[block.ids.pre, 'start'] <- blocks[block.ids.pre -
 207                 1, 'end'] + 1
 208         blocks[, 'size'] <- blocks[, 'end'] - blocks[, 'start'] +
 209             1
 210     }
 211     blocks.margin <- block_start[-1] - block_end[-length(block_end)] -
 212         1
 213     blocks.extentSize <- as.numeric(blocks[, 'size']) + c(blocks.margin,
 214         0) + c(0, blocks.margin)
 215     filter.block <- (blocks.extentSize < size & as.numeric(blocks[,
 216         'num']) < num)
 217     if (sum(filter.block, na.rm = T) > 0) {
 218         blocks[filter.block, 'type'] <- NA
 219     }
 220     blocks <- mergeBlocks(blocks)
 221     if (nrow(blocks) == 1)
 222         return(blocks)
 223     if (fillSmallNA) {
 224         blocks.margin <- blocks[-1, 'start'] - blocks[-nrow(blocks),
 225             'end'] - 1
 226         blocks.extentSize <- as.numeric(blocks[, 'size']) + c(blocks.margin,
 227             0) + c(0, blocks.margin)
 228         filter.block <- blocks.extentSize < size
 229         block.na.ids <- which(is.na(blocks[, 'type']) & filter.block)
 230         block.na.num <- length(block.na.ids)
 231         if (block.na.num > 0) {
 232             blocks[block.na.ids, 'type'] <- sapply(block.na.ids,
 233                 function(i) {
 234                   if (i == 1)
 235                     return(blocks[2, 'type'])
 236                   if (i == nrow(blocks))
 237                     return(blocks[nrow(blocks) - 1, 'type'])
 238                   if (blocks[i - 1, 'type'] == blocks[i + 1,
 239                     'type'])
 240                     return(blocks[i - 1, 'type'])
 241                   blocks[i, 'type']
 242                 })
 243         }
 244     }
 245     blocks <- mergeBlocks(blocks)
 246     if (nrow(blocks) == 1)
 247         return(blocks)
 248     block.ids <- which(is.na(blocks[, 'type']))
 249     block.ids.next <- block.ids[block.ids < nrow(blocks)]
 250     block.ids.pre <- block.ids[block.ids > 1]
 251     if (length(block.ids.next) > 0)
 252         blocks[block.ids.next, 'end'] <- blocks[block.ids.next +
 253             1, 'start'] - 1
 254     if (length(block.ids.pre) > 0)
 255         blocks[block.ids.pre, 'start'] <- blocks[block.ids.pre -
 256             1, 'end'] + 1
 257     tmp.blocks <- blocks
 258     for (i in 2:nrow(blocks)) if (blocks[i, 1] - blocks[i - 1,
 259         2] > 1)
 260         tmp.blocks <- rbind(tmp.blocks, c(blocks[i - 1, 'end'] +
 261             1, blocks[i, 'start'] - 1, blocks[i, 'start'] - blocks[i -
 262             1, 'end'] - 1, 0, NA))
 263     blocks <- tmp.blocks[order(tmp.blocks[, 'end']), ]
 264     mergeBlocks(blocks)
 265 }
 266
 267 # geno2Cross.R
 268 `geno2Cross` <-
 269 function (geno.data, pheno.data, cm_unit = 250000)
 270 {
 271     myGeno.data <- geno.data
 272     geno.value <- sort(unique(na.omit(c(myGeno.data))))
 273     if (identical(as.numeric(geno.value), c(0, 1))) {
 274         myGeno.data <- myGeno.data + 1
 275         geno.value <- sort(unique(na.omit(c(myGeno.data))))
 276     }
 277     if (!identical(as.numeric(geno.value), c(1, 2)))
 278         stop('the value of geno.data should be 1, 2, or NA.')
 279     ids.RILs <- match(colnames(myGeno.data), rownames(pheno.data))
 280     myGeno.site <- data.frame(Chr = as.numeric(substr(rownames(myGeno.data),
 281         1, 2)), Position = as.numeric(substr(rownames(myGeno.data),
 282         3, 10)))
 283     myCrossData <- list()
 284     myCrossData$geno <- lapply(split(1:nrow(myGeno.data), myGeno.site$Chr),
 285         function(ids) {
 286             myMarkers <- myGeno.site$Position[ids]/cm_unit
 287             names(myMarkers) <- rownames(myGeno.data)[ids]
 288             myMap <- list(data = t(myGeno.data[ids, ]), map = myMarkers)
 289             class(myMap) <- 'A'
 290             myMap
 291         })
 292     myCrossData$pheno <- as.data.frame(pheno.data[ids.RILs, ])
 293     class(myCrossData) <- c('riself', 'cross')
 294     attr(myCrossData, 'alleles') <- c('A', 'B')
 295     myCrossData
 296 }
 297
 298 # genoToBin.R
 299 `genoToBin` <-
 300 function (genoData, base.position = 1:nrow(genoData), corrected = FALSE,
 301     correct.FUN = correctFUNHMM, size = 250000, num = 5, fillSmallNA = TRUE,
 302     minBinsize = 0, seqERR = 0.01, heterozygote = FALSE, ...)
 303 {
 304     SNPbyChr <- split(1:nrow(genoData), 1)
 305     i.chr <- 0
 306     res <- lapply(SNPbyChr, function(ids) {
 307         i.line <- 0
 308         i.chr <<- i.chr + 1
 309         blocks <- apply(genoData[ids, ], 2, function(x) {
 310             cat('chr: ', i.chr, '\tline: ', i.line <<- i.line +
 311                 1, '\t', colnames(genoData)[i.line], '\r')
 312             x.nna <- !is.na(x)
 313             if (corrected == FALSE)
 314                 x.correct <- correct.FUN(x[x.nna], base.position[ids][x.nna],
 315                   ...)
 316             else x.correct <- x[x.nna]
 317             blocks.mat <- findBlockAndFilter(x.correct, base.position = base.position[ids][x.nna],
 318                 size = size, num = num, fillSmallNA = fillSmallNA)
 319             blocks.mat[nrow(blocks.mat), 'end'] <- max(base.position[ids])
 320             blocks.mat[nrow(blocks.mat), 'size'] <- blocks.mat[nrow(blocks.mat),
 321                 'end'] - blocks.mat[nrow(blocks.mat), 'start'] +
 322                 1
 323             if (heterozygote == FALSE)
 324                 blocks.mat[blocks.mat[, 'type'] == .MPR_hetero_, 'type'] <- NA
 325             blocks.mat <- mergeBlocks(blocks.mat)
 326             t(blocks.mat)
 327         })
 328         blocks.mat <- matrix(unlist(blocks, recursive = TRUE,
 329             use.names = FALSE), ncol = 5, byrow = TRUE)
 330         bin.border <- sort(unique(as.numeric(blocks.mat[, 2])))
 331         cat('\rchr: ', i.chr, '\tTotal', length(bin.border),
 332             'borders. ')
 333         bin.border <- sort(unique(bin.border[c(1, which(diff(bin.border) >=
 334             minBinsize) + 1)]))
 335         cat(length(bin.border), 'borders after filtering out bins less than',
 336             round(minBinsize/1000, 1), 'kb.\n')
 337         geno.bin <- sapply(blocks, function(blocks.line) {
 338             blocks.end <- rbind(blocks.line)[2, ]
 339             ids <- match(bin.border, blocks.end)
 340             ids[is.na(ids)] <- findInterval(bin.border[is.na(ids)],
 341                 blocks.end, rightmost.closed = FALSE) + 1
 342             filter <- ids > ncol(blocks.line)
 343             ids[filter] <- ncol(blocks.line)
 344             rbind(blocks.line)[5, ids]
 345         })
 346         rownames(geno.bin) <- sprintf('%010d', bin.border)
 347         list(block = blocks, bin = geno.bin, border = bin.border)
 348     })
 349     geno.bin <- NULL
 350     for (i in 1:length(res)) geno.bin <- rbind(geno.bin, res[[i]][[2]])
 351     cat('Done.\n')
 352     list(block = res, bin = geno.bin, border = as.numeric(rownames(geno.bin)))
 353 }
 354
 355 # genotypeCallsBayes.R
 356 `genotypeCallsBayes` <-
 357 function (ALLELE.num, errorRate = 5e-04, eps = 1e-10, maxIterate = 100,
 358     verbose = FALSE)
 359 {
 360     P0.n <- 0.5
 361     P0.p1 <- P0.p2 <- (1 - P0.n)/2
 362     ALLELE.prob <- NULL
 363     P.n <- 1/3
 364     P.p1 <- P.p2 <- (1 - P.n)/2
 365     numIterate <- 1
 366     E <- errorRate
 367     while (abs(P0.n - P.n) > eps && numIterate <= maxIterate) {
 368         P0.p1 <- P.p1
 369         P0.p2 <- P.p2
 370         P0.n <- P.n
 371         n <- rowSums(ALLELE.num)
 372         k <- ALLELE.num[, 1]
 373         ALLELE.prob <- cbind((1 - E)^k * E^(n - k), 0.5^n, (1 -
 374             E)^(n - k) * E^k) %*% diag(c(P0.p1, P0.n, P0.p2))
 375         ALLELE.type <- rep(NA, nrow(ALLELE.prob))
 376         ALLELE.maxprob <- rowMax(ALLELE.prob)
 377         ALLELE.type[ALLELE.prob[, 1] == ALLELE.maxprob] <- 1
 378         ALLELE.type[ALLELE.prob[, 3] == ALLELE.maxprob] <- 3
 379         ALLELE.type[ALLELE.prob[, 2] == ALLELE.maxprob | ALLELE.prob[,
 380             1] == ALLELE.prob[, 3]] <- 2
 381         P.n <- mean(ALLELE.type == 2)
 382         P.p1 <- P.p2 <- (1 - P.n)/2
 383         if (verbose)
 384             cat(P.p1, P.n, P.p2, table(filter.markers <- ALLELE.type !=
 385                 2), '\n', sep = '\t')
 386         numIterate <- numIterate + 1
 387     }
 388     ALLELE.prob <- ALLELE.prob/rowSums(ALLELE.prob, na.rm = TRUE)
 389     list(prop = c(P.p1, P.n, P.p2), prob = ALLELE.prob, type = ALLELE.type)
 390 }
 391
 392 # globalMPRByMarkers.R
 393 `globalMPRByMarkers` <-
 394 function (baseData, markers = NULL, alleleA = NULL, numTry = 3,
 395     numBaseStep = 50, numBaseCandidateStep = numBaseStep * 2,
 396     numKnownStep = pmax(numBaseStep/5, 10), numKnownCandidateStep = numKnownStep *
 397         1.5, useMedianToFindKnown = TRUE, maxIterate = 150, maxNStep = 3,
 398     scoreMin = 0.8, verbose = FALSE, strSTART = '\r', strEND = '',
 399     ...)
 400 {
 401     if (is.null(alleleA))
 402         alleleA <- markers
 403     ALLELE.mat <- matrix(NA, nrow = nrow(baseData), ncol = 2)
 404     rownames(ALLELE.mat) <- rownames(baseData)
 405     alleleA.base <- alleleA[match(rownames(ALLELE.mat), names(alleleA))]
 406     alleleA.ids <- which(!is.na(alleleA.base))
 407     if (numKnownCandidateStep > length(alleleA.ids))
 408         numKnownCandidateStep <- length(alleleA.ids)
 409     if (numKnownStep > numKnownCandidateStep)
 410         numKnownStep <- numKnownCandidateStep
 411     j <- 0
 412     ids.RILrows <- which(rowSums(!is.na(cbind(baseData))) > 0)
 413     rowN <- length(ids.RILrows)
 414     ids.times <- rep(0, rowN)
 415     ids.ok <- rep(0, rowN)
 416     ids.candidate <- na.omit(which(ids.times < numTry & ids.ok ==
 417         0)[1:numBaseCandidateStep])
 418     n <- length(ids.candidate)
 419     while (n > 1) {
 420         if (length(ids.candidate) > numBaseStep) {
 421             filter.dis <- ids.candidate < (median(ids.candidate) -
 422                 numBaseCandidateStep)
 423             ids.dis <- ids.candidate[filter.dis]
 424             if (length(ids.dis) < numBaseStep)
 425                 ids.candidate <- c(ids.dis, sample(ids.candidate[!filter.dis],
 426                   numBaseStep - length(ids.dis)))
 427             else ids.candidate <- ids.dis
 428         }
 429         ids.times[ids.candidate] <- ids.times[ids.candidate] +
 430             1
 431         ids <- ids.RILrows[ids.candidate]
 432         ids.point <- ifelse(useMedianToFindKnown == TRUE, median(ids),
 433             ids[1])
 434         is.known <- !is.na(alleleA.base[ids])
 435         if (sum(is.known) < numKnownStep) {
 436             ids.known <- na.omit(alleleA.ids[order(abs(alleleA.ids -
 437                 ids.point))[sample(numKnownCandidateStep, numKnownStep)]])
 438             if (length(ids.known) > (numKnownStep - sum(is.known)))
 439                 ids.know <- ids.known[1:(numKnownStep - sum(is.known))]
 440             ids <- unique(c(ids, ids.known))
 441         }
 442         ids <- sort(ids)
 443         is.known <- !is.na(alleleA.base[ids])
 444         iResult <- localMPR(baseData[ids, ], maxIterate = maxIterate,
 445             maxNStep = maxNStep, returnNumIterate = TRUE, verbose = 0)
 446         allele.matrix <- iResult[['allele']]
 447         a <- allele.matrix[is.known, ]
 448         b <- alleleA.base[ids[is.known]]
 449         a1 <- colSums(a == b, na.rm = T)/length(b)
 450         if (sum(a1 >= scoreMin) > 0) {
 451             if (a1[1] < a1[2])
 452                 allele.matrix <- allele.matrix[, c(2, 1)]
 453             j <- j + 1
 454             a <- ALLELE.mat[ids, ]
 455             ids.na <- rowSums(is.na(a)) > 0
 456             if (sum(ids.na) > 0)
 457                 ALLELE.mat[ids[ids.na], ] <- allele.matrix[ids.na,
 458                   ]
 459             ids.ok[ids.candidate] <- 1
 460         }
 461         ids.all <- which(ids.times < numTry & ids.ok == 0)
 462         ids.candidate <- na.omit(ids.all[1:numBaseCandidateStep])
 463         n <- length(ids.candidate)
 464         if (verbose)
 465             cat(strSTART, length(ids.all), j, strEND, sep = '\t')
 466     }
 467     invisible(ALLELE.mat)
 468 }
 469
 470 # globalMPRByMarkersPlus.R
 471 `globalMPRByMarkersPlus` <-
 472 function (baseData, markers = NULL, alleleA = NULL, numGroup = 1,
 473     groupSort = FALSE, numPerm = 1, numTry = 3, numBaseStep = 50,
 474     numBaseCandidateStep = numBaseStep * 2, numKnownStep = pmax(numBaseStep/5,
 475         10), numKnownCandidateStep = numKnownStep * 2, useMedianToFindKnown = TRUE,
 476     maxIterate = 150, maxNStep = 3, scoreMin = 0.8, saveMidData = FALSE,
 477     verbose = FALSE, strSTART = '\r', strEND = '', ...)
 478 {
 479     if (is.null(alleleA))
 480         alleleA <- markers
 481     ALLELE.mat <- base2Allele(baseData)
 482     ALLELE.num <- matrix(0, nrow = nrow(ALLELE.mat), ncol = ncol(ALLELE.mat))
 483     dimnames(ALLELE.num) <- dimnames(ALLELE.mat)
 484     alleleA.base <- alleleA[match(rownames(ALLELE.mat), names(alleleA))]
 485     alleleA.ids <- which(!is.na(alleleA.base))
 486     if (numKnownCandidateStep > length(alleleA.ids))
 487         numKnownCandidateStep <- length(alleleA.ids)
 488     if (numKnownStep > numKnownCandidateStep)
 489         numKnownStep <- numKnownCandidateStep
 490     j <- 0
 491     midData <- NULL
 492     for (perm in 1:numPerm) {
 493         if (numGroup >= 2) {
 494             if (groupSort)
 495                 groupRIL <- split(order(colSums(!is.na(baseData)), decreasing = TRUE), cut(1:ncol(baseData), numGroup))
 496             else groupRIL <- split(sample(ncol(baseData)), cut(1:ncol(baseData), numGroup))
 497             names(groupRIL) <- 1:numGroup
 498         }
 499         else {
 500             groupRIL <- list('1' = 1:ncol(baseData))
 501         }
 502         groupRIL <- groupRIL[sapply(groupRIL, length) > 0]
 503         for (ids.cols in groupRIL) {
 504             ids.RILrows <- which(rowSums(!is.na(cbind(baseData[, ids.cols]))) > 0)
 505             rowN <- length(ids.RILrows)
 506             ids.times <- rep(0, rowN)
 507             ids.ok <- rep(0, rowN)
 508             ids.candidate <- na.omit(which(ids.times < numTry &
 509                 ids.ok == 0)[1:numBaseCandidateStep])
 510             n <- length(ids.candidate)
 511             while (n > 1) {
 512                 if (length(ids.candidate) > numBaseStep) {
 513                   filter.dis <- ids.candidate < (median(ids.candidate) - numBaseCandidateStep)
 514                   ids.dis <- ids.candidate[filter.dis]
 515                   if (length(ids.dis) < numBaseStep)
 516                     ids.candidate <- c(ids.dis, sample(ids.candidate[!filter.dis], numBaseStep - length(ids.dis)))
 517                   else ids.candidate <- ids.dis
 518                 }
 519                 ids.times[ids.candidate] <- ids.times[ids.candidate] + 1
 520                 ids <- ids.RILrows[ids.candidate]
 521                 ids.point <- ifelse(useMedianToFindKnown == TRUE, median(ids), ids[1])
 522                 is.known <- !is.na(alleleA.base[ids])
 523                 if (sum(is.known) < numKnownStep) {
 524                   ids.known <- na.omit(alleleA.ids[order(abs(alleleA.ids - ids.point))[sample(numKnownCandidateStep, numKnownStep)]])
 525                   if (length(ids.known) > (numKnownStep - sum(is.known)))
 526                     ids.know <- ids.known[1:(numKnownStep - sum(is.known))]
 527                   ids <- unique(c(ids, ids.known))
 528                 }
 529                 ids <- sort(ids)
 530                 is.known <- !is.na(alleleA.base[ids])
 531                 iResult <- localMPR(baseData[ids, ], allele.matrix = ALLELE.mat[ids,
 532                   ], maxIterate = maxIterate, maxNStep = maxNStep,
 533                   returnNumIterate = TRUE, verbose = 0)
 534                 allele.matrix <- iResult[['allele']]
 535                 a <- allele.matrix[is.known, ]
 536                 b <- alleleA.base[ids[is.known]]
 537                 a1 <- colSums(a == b, na.rm = T)/length(b)
 538                 if (sum(a1 >= scoreMin) > 0) {
 539                   if (a1[1] < a1[2])
 540                     allele.matrix <- allele.matrix[, c(2, 1)]
 541                   j <- j + 1
 542                   a <- ALLELE.mat[ids, ]
 543                   a1 <- rowSums(a == allele.matrix) == 2
 544                   ALLELE.num[ids, ] <- ALLELE.num[ids, ] + cbind(a1,
 545                     !a1)
 546                   ids.ok[ids.candidate] <- 1
 547                 }
 548                 ids.all <- which(ids.times < numTry & ids.ok ==
 549                   0)
 550                 ids.candidate <- na.omit(ids.all[1:numBaseCandidateStep])
 551                 n <- length(ids.candidate)
 552                 if (verbose)
 553                   cat(strSTART, 'perm', perm, 'unprocessed',
 554                     length(ids.all), j, strEND, sep = '\t')
 555             }
 556         }
 557         filter.exchange <- ALLELE.num[, 1] < ALLELE.num[, 2]
 558         tmp <- ALLELE.mat
 559         tmp[filter.exchange, ] <- tmp[filter.exchange, c(2, 1)]
 560         ALLELE.mat <- tmp
 561         tmp <- ALLELE.num
 562         tmp[filter.exchange, ] <- tmp[filter.exchange, c(2, 1)]
 563         ALLELE.num <- tmp
 564         if (saveMidData)
 565             midData[[perm]] <- list(allele = ALLELE.mat, call = ALLELE.num)
 566     }
 567     invisible(list(allele = ALLELE.mat, call = ALLELE.num, midData = midData))
 568 }
 569
 570 # globalMPRRefine.R
 571 `globalMPRRefine` <-
 572 function (baseData, markers = NULL, alleleA = NULL, numGroup = ncol(baseData),
 573     groupSort = FALSE, numPerm = 10, numTry = 3, numBaseStep = 50,
 574     numBaseCandidateStep = numBaseStep * 2, numKnownStep = numBaseStep/2,
 575     numKnownCandidateStep = numBaseStep * 2, useMedianToFindKnown = TRUE,
 576     maxIterate = 150, maxNStep = 3, scoreMin = 0.8, useOnlyKnownToType = FALSE,
 577     useBayes = FALSE, errorRate = 5e-04, saveMidData = FALSE,
 578     verbose = FALSE, strSTART = '\r', strEND = '', ...)
 579 {
 580     if (is.null(alleleA))
 581         alleleA <- markers
 582     ALLELE.mat <- base2Allele(baseData)
 583     ALLELE.num <- matrix(0, nrow = nrow(ALLELE.mat), ncol = ncol(ALLELE.mat))
 584     dimnames(ALLELE.num) <- dimnames(ALLELE.mat)
 585     alleleA.base <- alleleA[match(rownames(ALLELE.mat), names(alleleA))]
 586     alleleA.ids <- which(!is.na(alleleA.base))
 587     ALLELE.num[ALLELE.mat == alleleA.base] <- 1
 588     tmpALLELE.num <- ALLELE.num
 589     if (numKnownCandidateStep > length(alleleA.ids))
 590         numKnownCandidateStep <- length(alleleA.ids)
 591     if (numKnownStep > numKnownCandidateStep)
 592         numKnownStep <- numKnownCandidateStep
 593     j <- 0
 594     midData <- NULL
 595     for (perm in 1:numPerm) {
 596         if (numGroup >= 2) {
 597             if (groupSort)
 598                 groupRIL <- split(order(colSums(!is.na(baseData)),
 599                   decreasing = TRUE), cut(1:ncol(baseData), numGroup))
 600             else groupRIL <- split(sample(ncol(baseData)), cut(1:ncol(baseData),
 601                 numGroup))
 602             names(groupRIL) <- 1:numGroup
 603         }
 604         else {
 605             groupRIL <- list('1' = 1:ncol(baseData))
 606         }
 607         groupRIL <- groupRIL[sapply(groupRIL, length) > 0]
 608         for (g in 1:length(groupRIL)) {
 609             ids.cols <- groupRIL[[g]]
 610             ids.RILrows <- which(rowSums(!is.na(cbind(baseData[,
 611                 ids.cols]))) > 0)
 612             rowN <- length(ids.RILrows)
 613             ids.times <- rep(0, rowN)
 614             ids.ok <- rep(0, rowN)
 615             ids.candidate <- na.omit(which(ids.times < numTry &
 616                 ids.ok == 0)[1:numBaseCandidateStep])
 617             n <- length(ids.candidate)
 618             while (n > 1) {
 619                 if (length(ids.candidate) > numBaseStep) {
 620                   filter.dis <- ids.candidate < (median(ids.candidate) -
 621                     numBaseCandidateStep)
 622                   ids.dis <- ids.candidate[filter.dis]
 623                   if (length(ids.dis) < numBaseStep)
 624                     ids.candidate <- c(ids.dis, sample(ids.candidate[!filter.dis],
 625                       numBaseStep - length(ids.dis)))
 626                   else ids.candidate <- ids.dis
 627                 }
 628                 ids.times[ids.candidate] <- ids.times[ids.candidate] +
 629                   1
 630                 ids <- ids.RILrows[ids.candidate]
 631                 ids.point <- ifelse(useMedianToFindKnown == TRUE,
 632                   median(ids), ids[1])
 633                 is.known <- !is.na(alleleA.base[ids])
 634                 if (sum(is.known) < numKnownStep) {
 635                   ids.known <- na.omit(alleleA.ids[order(abs(alleleA.ids -
 636                     ids.point))[sample(numKnownCandidateStep,
 637                     numKnownStep)]])
 638                   if (length(ids.known) > (numKnownStep - sum(is.known)))
 639                     ids.know <- ids.known[1:(numKnownStep - sum(is.known))]
 640                   ids <- unique(c(ids, ids.known))
 641                 }
 642                 ids <- sort(as.numeric(ids))
 643                 is.known <- !is.na(alleleA.base[ids])
 644                 iResult <- localMPR(baseData[ids, ], allele.matrix = ALLELE.mat[ids,
 645                   ], maxIterate = maxIterate, maxNStep = maxNStep,
 646                   returnNumIterate = TRUE, verbose = 0)
 647                 allele.matrix <- iResult[['allele']]
 648                 if (useOnlyKnownToType) {
 649                   a <- allele.matrix[is.known, ]
 650                   b <- alleleA.base[ids[is.known]]
 651                   a1 <- colSums(a == b, na.rm = T)/length(b)
 652                 }
 653                 else {
 654                   a <- ALLELE.mat[ids, ]
 655                   b <- ALLELE.num[ids, ]
 656                   a.x <- a
 657                   b.f <- b[, 1] < b[, 2]
 658                   a.x[b.f, ] <- a.x[b.f, c(2, 1)]
 659                   a1 <- colMeans(allele.matrix == a.x[, 1], na.rm = T)
 660                 }
 661                 if (sum(a1 >= scoreMin) > 0) {
 662                   if (a1[1] < a1[2])
 663                     allele.matrix <- allele.matrix[, c(2, 1)]
 664                   j <- j + 1
 665                   a <- ALLELE.mat[ids, ]
 666                   a1 <- rowSums(a == allele.matrix) == 2
 667                   ALLELE.num[ids, ] <- ALLELE.num[ids, ] + cbind(a1,
 668                     !a1)
 669                   ids.ok[ids.candidate] <- 1
 670                 }
 671                 ids.all <- which(ids.times < numTry & ids.ok ==
 672                   0)
 673                 ids.candidate <- na.omit(ids.all[1:numBaseCandidateStep])
 674                 n <- length(ids.candidate)
 675                 if (verbose)
 676                   cat(strSTART, 'perm', perm, 'group', g, 'unprocessed',
 677                     length(ids.all), 'MPR', j, strEND, sep = '\t')
 678             }
 679             if (useBayes) {
 680                 geno.res <- genotypeCallsBayes(ALLELE.num, errorRate = errorRate,
 681                   eps = 1e-10, maxIterate = 100, verbose = FALSE)$type
 682                 tmp <- ALLELE.mat
 683                 tmp[geno.res == 2, ] <- NA
 684                 tmp[geno.res == 3, ] <- tmp[geno.res == 3, c(2,
 685                   1)]
 686             }
 687             else {
 688                 tmp <- ALLELE.mat
 689                 filter <- ALLELE.num[, 1] < ALLELE.num[, 2]
 690                 tmp[filter, ] <- tmp[filter, c(2, 1)]
 691             }
 692             alleleA.base <- tmp[, 1]
 693             alleleA.ids <- which(!is.na(alleleA.base))
 694         }
 695         if (saveMidData)
 696             midData[[perm]] <- list(allele = ALLELE.mat, call = ALLELE.num -
 697                 tmpALLELE.num, base = alleleA.base)
 698     }
 699     invisible(list(allele = ALLELE.mat, call = ALLELE.num - tmpALLELE.num,
 700         base = alleleA.base, midData = midData))
 701 }
 702
 703 # hmm.vitFUN.rils.R
 704 `hmm.vitFUN.rils` <-
 705 function (geno, position, geno.probability, transitionFUN = phy2get.haldane.rils,
 706     emissionFUN = makeEmissionFUN(errorRate = 0.01), ...)
 707 {
 708     n.obs <- length(geno)
 709     n.state <- length(geno.probability)
 710     psi <- delta <- matrix(0, nrow = n.state, ncol = n.obs)
 711     n.con <- geno.cr <- numeric(n.obs)
 712     geno.dis <- abs(diff(position))
 713     n.con[1] <- 1
 714     g <- geno[1]
 715     for (i in 2:n.obs) {
 716         n.con[i] <- ifelse(geno[i] == g, n.con[i - 1] + 1, 1)
 717         g <- geno[i]
 718     }
 719     for (i in 1:n.state) delta[i, 1] <- log(geno.probability[i]) +
 720         emissionFUN(i, geno[1], n.con[1])
 721     preProb <- numeric(n.state)
 722     for (t in 2:n.obs) {
 723         for (j in 1:n.state) {
 724             for (i in 1:n.state) preProb[i] <- delta[i, t - 1] +
 725                 log(transitionFUN(i, j, geno.dis[t - 1]))
 726             psi[j, t] <- which.max(preProb)
 727             delta[j, t] <- max(preProb) + emissionFUN(j, geno[t],
 728                 n.con[t])
 729         }
 730     }
 731     geno.cr[n.obs] <- which.max(delta[, n.obs])
 732     for (t in seq(n.obs - 1, 1, by = -1)) geno.cr[t] <- psi[geno.cr[t +
 733         1], t + 1]
 734     geno.cr
 735 }
 736
 737 # localMPR.R
 738 `localMPR` <-
 739 function (baseData, allele.matrix = NULL, maxIterate = 50, returnNumIterate = FALSE,
 740     maxNStep = 5, verbose = FALSE, strEND = '\n', ...)
 741 {
 742     if (is.null(allele.matrix))
 743         allele.matrix <- base2Allele(baseData)
 744     if (nrow(baseData) == ncol(allele.matrix))
 745         allele.matrix <- t(allele.matrix)
 746     if (nrow(baseData) != nrow(allele.matrix))
 747         stop('nrow(baseData)!=nrow(allele.matrix), allele.matrix error!!!')
 748     newALLELE <- ALLELE <- allele.matrix
 749     genoData <- base2Geno(baseData, allele.matrix)
 750     newRSum <- oriRSum <- length(baseData)
 751     numStepSize <- 1
 752     numIterate <- 0
 753     while (numStepSize <= maxNStep) {
 754         numIterate <- numIterate + 1
 755         if (verbose)
 756             cat('\r', numIterate, '\t', newRSum, '\t', numStepSize,
 757                 '\n')
 758         oriRSum <- newRSum
 759         loopRes <- .loopMPR(genoData, allele.matrix = ALLELE,
 760             numStep = numStepSize)
 761         newRSum <- loopRes[[1]]
 762         if (oriRSum > newRSum) {
 763             ALLELE <- loopRes[[2]]
 764             genoData <- loopRes[[3]]
 765             numStepSize <- 1
 766         }
 767         else {
 768             numStepSize <- numStepSize + 1
 769         }
 770     }
 771     if (verbose)
 772         cat('\tDone.', strEND)
 773     rownames(ALLELE) <- rownames(baseData)
 774     if (returnNumIterate)
 775         list(allele = ALLELE, num_iterate = numIterate, numR = newRSum)
 776     else ALLELE
 777 }
 778
 779 # makeEmissionFUN.R
 780 `makeEmissionFUN` <-
 781 function (errorRate = 0.01)
 782 {
 783     E <- log(errorRate)
 784     E2 <- log(1 - errorRate)
 785     E3 <- log(0.5)
 786     function(h, x, n) {
 787         if (h != 3)
 788             return(ifelse(h == x, E2, E))
 789         else return(n * E3)
 790     }
 791 }
 792
 793 # mergeBinMap.R
 794 `mergeBinMap` <-
 795 function (geno.fill)
 796 {
 797     pre <- NULL
 798     pre.nna <- NULL
 799     IsUniq <- rep(1, nrow(geno.fill))
 800     for (j in 1:nrow(geno.fill)) {
 801         cur.nna <- which(!is.na(geno.fill[j, ]))
 802         cur <- geno.fill[j, cur.nna]
 803         if (identical(pre.nna, cur.nna) && identical(pre, cur))
 804             IsUniq[j] <- 0
 805         else {
 806             pre <- cur
 807             pre.nna <- cur.nna
 808         }
 809     }
 810     geno.fill[unique(c(which(IsUniq == 1)[-1] - 1, nrow(geno.fill))),
 811         ]
 812 }
 813
 814 # mergeBlocks.R
 815 `mergeBlocks` <-
 816 function (blocks)
 817 {
 818     if (nrow(blocks) < 2)
 819         return(blocks)
 820     type.val <- blocks[, 'type']
 821     t <- length(type.val)
 822     type.val[is.na(blocks[, 'type'])] <- as.numeric(max(blocks[,
 823         'type'], na.rm = TRUE)) + 1
 824     block.start.ids <- sort(unique(c(1, which(type.val[-1] !=
 825         type.val[-t]) + 1)))
 826     block.start.ids <- block.start.ids[block.start.ids <= nrow(blocks)]
 827     block.block.num <- length(block.start.ids)
 828     blocks[-block.start.ids, 'start'] <- NA
 829     blocks[block.start.ids, c('end', 'num')] <- t(sapply(1:block.block.num,
 830         function(i) {
 831             ids <- block.start.ids[i]:ifelse(i >= block.block.num,
 832                 nrow(blocks), block.start.ids[i + 1] - 1)
 833             c(blocks[ids[length(ids)], 'end'], sum(blocks[ids,
 834                 'num'], na.rm = TRUE))
 835         }))
 836     blocks <- rbind(blocks[!is.na(blocks[, 'start']), ])
 837     blocks[, 'size'] <- blocks[, 'end'] - blocks[, 'start'] +
 838         1
 839     blocks
 840 }
 841
 842 # phy2get.haldane.rils.R
 843 `phy2get.haldane.rils` <-
 844 function (a, b, l)
 845 {
 846     d <- l/(._rice_phy2get_factor_ * 100)
 847     p <- (1 - exp(-2 * d))
 848     p <- p/(1 + p)
 849     ifelse(a == b, 1 - p, p)
 850 }
 851
 852 # Append for de-dependence rowQ from package:Biobase
 853 `rowMax` <-
 854 function (matrix)
 855 {
 856         rowmax <-apply(matrix,1,function(x){
 857                 return(max(x))})
 858         rowmax
 859 }
 860 `rowMin` <-
 861 function (matrix)
 862 {
 863         rowmin <-apply(matrix,1,function(x){
 864                 return(min(x))})
 865         rowmin
 866 }