# data-science-platform/R-code-for-RDataMining-book/ch-outlier.R

   1 ### R code from vignette source 'ch-outlier.rnw'
   2
   3 ###################################################
   4 ### code chunk number 1: ch-outlier.rnw:6-9
   5 ###################################################
   6 # free memory
   7 rm(list = ls())
   8 gc()
   9
  10
  11 ###################################################
  12 ### code chunk number 2: ch-outlier.rnw:24-30
  13 ###################################################
  14 set.seed(3147)
  15 x <- rnorm(100)
  16 summary(x)
  17 # outliers
  18 boxplot.stats(x)$out
  19 boxplot(x)
  20
  21
  22 ###################################################
  23 ### code chunk number 3: ch-outlier.rnw:38-48
  24 ###################################################
  25 y <- rnorm(100)
  26 df <- data.frame(x, y)
  27 rm(x, y)
  28 head(df)
  29 attach(df)
  30 # find the index of outliers from x
  31 (a <- which(x %in% boxplot.stats(x)$out))
  32 # find the index of outliers from y
  33 (b <- which(y %in% boxplot.stats(y)$out))
  34 detach(df)
  35
  36
  37 ###################################################
  38 ### code chunk number 4: ch-outlier.rnw:54-58
  39 ###################################################
  40 # outliers in both x and y
  41 (outlier.list1 <- intersect(a,b))
  42 plot(df)
  43 points(df[outlier.list1,], col="red", pch="+", cex=2.5)
  44
  45
  46 ###################################################
  47 ### code chunk number 5: ch-outlier.rnw:69-73
  48 ###################################################
  49 # outliers in either x or y
  50 (outlier.list2 <- union(a,b))
  51 plot(df)
  52 points(df[outlier.list2,], col="blue", pch="x", cex=2)
  53
  54
  55 ###################################################
  56 ### code chunk number 6: ch-outlier.rnw:94-99
  57 ###################################################
  58 library(DMwR)
  59 # remove "Species", which is a categorical column
  60 iris2 <- iris[,1:4]
  61 outlier.scores <- lofactor(iris2, k=5)
  62 plot(density(outlier.scores))
  63
  64
  65 ###################################################
  66 ### code chunk number 7: ch-outlier.rnw:105-110
  67 ###################################################
  68 # pick top 5 as outliers
  69 outliers <- order(outlier.scores, decreasing=T)[1:5]
  70 # who are outliers
  71 print(outliers)
  72 print(iris2[outliers,])
  73
  74
  75 ###################################################
  76 ### code chunk number 8: ch-outlier.rnw:118-122
  77 ###################################################
  78 n <- nrow(iris2)
  79 labels <- 1:n
  80 labels[-outliers] <- "."
  81 biplot(prcomp(iris2), cex=.8, xlabs=labels)
  82
  83
  84 ###################################################
  85 ### code chunk number 9: ch-outlier.rnw:135-140
  86 ###################################################
  87 pch <- rep(".", n)
  88 pch[outliers] <- "+"
  89 col <- rep("black", n)
  90 col[outliers] <- "red"
  91 pairs(iris2, pch=pch, col=col)
  92
  93
  94 ###################################################
  95 ### code chunk number 10: ch-outlier.rnw:148-152 (eval = FALSE)
  96 ###################################################
  97 ## library(Rlof)
  98 ## outlier.scores <- lof(iris2, k=5)
  99 ## # try with different number of neighbors (k = 5,6,7,8,9 and 10)
 100 ## outlier.scores <- lof(iris2, k=c(5:10))
 101
 102
 103 ###################################################
 104 ### code chunk number 11: ch-outlier.rnw:164-179
 105 ###################################################
 106 # remove species from the data to cluster
 107 iris2 <- iris[,1:4]
 108 kmeans.result <- kmeans(iris2, centers=3)
 109 # cluster centers
 110 kmeans.result$centers
 111 # cluster IDs
 112 kmeans.result$cluster
 113 # calculate distances between objects and cluster centers
 114 centers <- kmeans.result$centers[kmeans.result$cluster, ]
 115 distances <- sqrt(rowSums((iris2 - centers)^2))
 116 # pick top 5 largest distances
 117 outliers <- order(distances, decreasing=T)[1:5]
 118 # who are outliers
 119 print(outliers)
 120 print(iris2[outliers,])
 121
 122
 123 ###################################################
 124 ### code chunk number 12: ch-outlier.rnw:185-193
 125 ###################################################
 126 # plot clusters
 127 plot(iris2[,c("Sepal.Length", "Sepal.Width")], pch="o",
 128      col=kmeans.result$cluster, cex=0.3)
 129 # plot cluster centers
 130 points(kmeans.result$centers[,c("Sepal.Length", "Sepal.Width")], col=1:3,
 131        pch=8, cex=1.5)
 132 # plot outliers
 133 points(iris2[outliers, c("Sepal.Length", "Sepal.Width")], pch="+", col=4, cex=1.5)
 134
 135
 136 ###################################################
 137 ### code chunk number 13: ch-outlier.rnw:209-219
 138 ###################################################
 139 # use robust fitting
 140 f <- stl(AirPassengers, "periodic", robust=TRUE)
 141 (outliers <- which(f$weights<1e-8))
 142 # set layout
 143 op <- par(mar=c(0, 4, 0, 3), oma=c(5, 0, 4, 0), mfcol=c(4, 1))
 144 plot(f, set.pars=NULL)
 145 sts <- f$time.series
 146 # plot outliers
 147 points(time(sts)[outliers], 0.8*sts[,"remainder"][outliers], pch="x", col="red")
 148 par(op) # reset layout
 149
 150