data-science-platform/R-code-for-RDataMining-book/cs-response-prediction.R

   1 ### R code from vignette source 'cs-response-prediction.rnw'
   2 ### Encoding: ISO8859-1
   3
   4 ###################################################
   5 ### code chunk number 1: cs-response-prediction.rnw:9-12
   6 ###################################################
   7 # free memory
   8 rm(list = ls())
   9 gc()
  10
  11
  12 ###################################################
  13 ### code chunk number 2: cs-response-prediction.rnw:45-46 (eval = FALSE)
  14 ###################################################
  15 ## cup98 <- read.csv("./data/KDDCup1998/cup98LRN.txt")
  16
  17
  18 ###################################################
  19 ### code chunk number 3: cs-response-prediction.rnw:51-52
  20 ###################################################
  21 load("./data/cup98.rdata")
  22
  23
  24 ###################################################
  25 ### code chunk number 4: cs-response-prediction.rnw:55-61
  26 ###################################################
  27 dim(cup98)
  28 # have a look at the first 30 variables
  29 str(cup98[,1:30])
  30 head(cup98[,1:30])
  31 # a summary of the first 10 variables
  32 summary(cup98[,1:10])
  33
  34
  35 ###################################################
  36 ### code chunk number 5: cs-response-prediction.rnw:66-80 (eval = FALSE)
  37 ###################################################
  38 ## library(Hmisc)
  39 ## describe(cup98[,1:28]) # demographics
  40 ## describe(cup98[,29:42]) # number of times responded to other types of mail order offers
  41 ## describe(cup98[,43:55]) # overlay data
  42 ## describe(cup98[,56:74]) # donor interests
  43 ## describe(cup98[,75]) # PEP star RFA status
  44 ## describe(cup98[,76:361]) # characteristics of the donors neighborhood
  45 ## describe(cup98[,362:407]) # promotion history
  46 ## describe(cup98[,408:412]) # summary variables of promotion history
  47 ## describe(cup98[,413:456]) # giving history
  48 ## describe(cup98[,457:469]) # summary variables of giving history
  49 ## describe(cup98[,470:473]) # ID & targets
  50 ## describe(cup98[,474:479]) # RFA (Recency/Frequency/Donation Amount)
  51 ## describe(cup98[,480:481]) # CLUSTER & GEOCODE
  52
  53
  54 ###################################################
  55 ### code chunk number 6: cs-response-prediction.rnw:87-91
  56 ###################################################
  57 (response.percentage <- round(100 * prop.table(table(cup98$TARGET_B)), digits=1))
  58 mylabels <- paste("TARGET_B=", names(response.percentage), "\n",
  59                   response.percentage, "%", sep="")
  60 pie(response.percentage, labels=mylabels)
  61
  62
  63 ###################################################
  64 ### code chunk number 7: cs-response-prediction.rnw:101-106
  65 ###################################################
  66 # data with positive donations
  67 cup98pos <- cup98[cup98$TARGET_D>0, ]
  68 targetPos <- cup98pos$TARGET_D
  69 summary(targetPos)
  70 boxplot(targetPos)
  71
  72
  73 ###################################################
  74 ### code chunk number 8: cs-response-prediction.rnw:123-129
  75 ###################################################
  76 # number of positive donations
  77 length(targetPos)
  78 # number of positive donations not in whole dollars
  79 sum(!(targetPos %in% 1:200))
  80 targetPos <- round(targetPos)
  81 barplot(table(targetPos), las=2)
  82
  83
  84 ###################################################
  85 ### code chunk number 9: cs-response-prediction.rnw:138-143
  86 ###################################################
  87 cup98$TARGET_D2 <- cut(cup98$TARGET_D, right=F,
  88         breaks=c(0, 0.1, 10, 15, 20, 25, 30, 50, max(cup98$TARGET_D)))
  89 table(cup98$TARGET_D2)
  90 cup98pos$TARGET_D2 <- cut(cup98pos$TARGET_D, right=F,
  91         breaks=c(0, 0.1, 10, 15, 20, 25, 30, 50, max(cup98pos$TARGET_D)))
  92
  93
  94 ###################################################
  95 ### code chunk number 10: cs-response-prediction.rnw:151-153
  96 ###################################################
  97 table(cup98$RFA_2R)
  98 round(100 * prop.table(table(cup98$NOEXCH)), digits=3)
  99
 100
 101 ###################################################
 102 ### code chunk number 11: cs-response-prediction.rnw:210-233
 103 ###################################################
 104 varSet <- c(
 105    # demographics
 106    "ODATEDW", "OSOURCE", "STATE", "ZIP", "PVASTATE", "DOB", "RECINHSE",
 107    "MDMAUD", "DOMAIN", "CLUSTER", "AGE", "HOMEOWNR", "CHILD03", "CHILD07",
 108    "CHILD12", "CHILD18", "NUMCHLD", "INCOME", "GENDER", "WEALTH1", "HIT",
 109    # donor interests
 110    "COLLECT1", "VETERANS", "BIBLE", "CATLG", "HOMEE", "PETS", "CDPLAY",
 111    "STEREO", "PCOWNERS", "PHOTO", "CRAFTS", "FISHER", "GARDENIN", "BOATS",
 112    "WALKER", "KIDSTUFF", "CARDS", "PLATES",
 113    # PEP star RFA status
 114    "PEPSTRFL",
 115    # summary variables of promotion history
 116    "CARDPROM", "MAXADATE", "NUMPROM", "CARDPM12", "NUMPRM12",
 117    # summary variables of giving history
 118    "RAMNTALL", "NGIFTALL", "CARDGIFT", "MINRAMNT", "MAXRAMNT", "LASTGIFT",
 119    "LASTDATE", "FISTDATE", "TIMELAG", "AVGGIFT",
 120    # ID & targets
 121    "CONTROLN", "TARGET_B", "TARGET_D", "TARGET_D2", "HPHONE_D",
 122    # RFA (Recency/Frequency/Donation Amount)
 123    "RFA_2F", "RFA_2A", "MDMAUD_R", "MDMAUD_F", "MDMAUD_A",
 124    #others
 125    "CLUSTER2", "GEOCODE2")
 126 cup98 <- cup98[, varSet]
 127
 128
 129 ###################################################
 130 ### code chunk number 12: cs-response-prediction.rnw:245-247
 131 ###################################################
 132 # select numeric variables
 133 idx.num <- which(sapply(cup98, is.numeric))
 134
 135
 136 ###################################################
 137 ### code chunk number 13: cs-response-prediction.rnw:254-263
 138 ###################################################
 139 layout(matrix(c(1,2), 1, 2)) # 2 graphs per page
 140 # histograms of numeric variables
 141 myHist <- function(x) {
 142    hist(cup98[,x], main=NULL, xlab=x)
 143 }
 144 sapply(names(idx.num[4:5]), myHist)
 145 layout(matrix(1)) # change back to one graph per page
 146 # run code below to generate histograms for all numeric variables
 147 # sapply(names(idx.num), myHist)
 148
 149
 150 ###################################################
 151 ### code chunk number 14: cs-response-prediction.rnw:274-279
 152 ###################################################
 153 layout(matrix(c(1,2),1,2)) # 2 graphs per page
 154 boxplot(cup98$HIT)
 155 cup98$HIT[cup98$HIT>200]
 156 boxplot(cup98$HIT[cup98$HIT<200])
 157 layout(matrix(1)) # change back to one graph per page
 158
 159
 160 ###################################################
 161 ### code chunk number 15: cs-response-prediction.rnw:290-292
 162 ###################################################
 163 AGE2 <- cut(cup98pos$AGE, right=F, breaks=seq(0, 100, by=5))
 164 boxplot(cup98pos$TARGET_D ~ AGE2, ylim=c(0,40), las=3)
 165
 166
 167 ###################################################
 168 ### code chunk number 16: cs-response-prediction.rnw:303-313
 169 ###################################################
 170 attach(cup98pos)
 171 layout(matrix(c(1,2),1,2)) # 2 graphs per page
 172 boxplot(TARGET_D ~ GENDER, ylim=c(0,80))
 173 # density plot
 174 plot(density(TARGET_D[GENDER=="F"]), xlim=c(0,60), col=1, lty=1)
 175 lines(density(TARGET_D[GENDER=="M"]), col=2, lty=2)
 176 lines(density(TARGET_D[GENDER=="J"]), col=3, lty=3)
 177 legend("topright", c("Female", "Male", "Joint account"), col=1:3, lty=1:3)
 178 layout(matrix(1)) # change back to one graph per page
 179 detach(cup98pos)
 180
 181
 182 ###################################################
 183 ### code chunk number 17: cs-response-prediction.rnw:322-327
 184 ###################################################
 185 correlation <- cor(cup98$TARGET_D, cup98[,idx.num], use="pairwise.complete.obs")
 186 correlation <- abs(correlation)
 187 (correlation <- correlation[,order(correlation, decreasing=T)])
 188 # save to a CSV file, with important variables at the top
 189 write.csv(correlation, "absolute_correlation.csv")
 190
 191
 192 ###################################################
 193 ### code chunk number 18: cs-response-prediction.rnw:331-333 (eval = FALSE)
 194 ###################################################
 195 ## cor(cup98[,idx.num])
 196 ## pairs(cup98)
 197
 198
 199 ###################################################
 200 ### code chunk number 19: cs-response-prediction.rnw:339-345
 201 ###################################################
 202 color <- ifelse(cup98$TARGET_D>0, "blue", "black")
 203 pch <- ifelse(cup98$TARGET_D>0, "+", ".")
 204 plot(jitter(cup98$AGE), jitter(cup98$HIT), pch=pch, col=color, cex=0.7,
 205      ylim=c(0,70), xlab="AGE", ylab="HIT")
 206 legend("topleft", c("TARGET_D>0", "TARGET_D=0"), col=c("blue", "black"),
 207        pch=c("+", "."))
 208
 209
 210 ###################################################
 211 ### code chunk number 20: cs-response-prediction.rnw:359-369
 212 ###################################################
 213 myChisqTest <- function(x) {
 214    t1 <- table(cup98pos[,x], cup98pos$TARGET_D2)
 215    plot(t1, main=x, las=1)
 216    print(x)
 217    print(chisq.test(t1))
 218 }
 219 myChisqTest("GENDER")
 220 # run the code below to do chi-square test for all categorical variables
 221 # idx.cat <- which(sapply(cup98pos, is.factor))
 222 # sapply(names(idx.cat), myChisqTest)
 223
 224
 225 ###################################################
 226 ### code chunk number 21: cs-response-prediction.rnw:384-403
 227 ###################################################
 228 nRec <- dim(cup98)[1]
 229 trainSize <- round(nRec * 0.7)
 230 testSize <- nRec - trainSize
 231 # ctree parameters
 232 MinSplit <- 1000
 233 MinBucket <- 400
 234 MaxSurrogate <- 4
 235 MaxDepth <- 10
 236 (strParameters <- paste(MinSplit, MinBucket, MaxSurrogate, MaxDepth, sep="-"))
 237 LoopNum <- 9
 238 # The cost for each contact is $0.68.
 239 cost <- 0.68
 240 varSet2 <- c("AGE", "AVGGIFT", "CARDGIFT", "CARDPM12", "CARDPROM", "CLUSTER2",
 241    "DOMAIN", "GENDER", "GEOCODE2", "HIT", "HOMEOWNR", "HPHONE_D", "INCOME",
 242    "LASTGIFT", "MAXRAMNT", "MDMAUD_F", "MDMAUD_R", "MINRAMNT", "NGIFTALL",
 243    "NUMPRM12", "PCOWNERS", "PEPSTRFL", "PETS", "RAMNTALL", "RECINHSE",
 244    "RFA_2A", "RFA_2F", "STATE", "TIMELAG")
 245 cup98 <- cup98[, c("TARGET_D", varSet2)]
 246 library(party) # for ctree
 247
 248
 249 ###################################################
 250 ### code chunk number 22: cs-response-prediction.rnw:408-471 (eval = FALSE)
 251 ###################################################
 252 ## pdf(paste("evaluation-tree-", strParameters, ".pdf", sep=""),
 253 ##     width=12, height=9, paper="a4r", pointsize=6)
 254 ## cat(date(), "\n")
 255 ## cat(" trainSize=", trainSize, ", testSize=", testSize, "\n")
 256 ## cat(" MinSplit=", MinSplit, ", MinBucket=", MinBucket,
 257 ##     ", MaxSurrogate=", MaxSurrogate, ", MaxDepth=", MaxDepth, "\n\n")
 258 ##
 259 ## # run for multiple times and get the average result
 260 ## allTotalDonation <- matrix(0, nrow=testSize, ncol=LoopNum)
 261 ## allAvgDonation <- matrix(0, nrow=testSize, ncol=LoopNum)
 262 ## allDonationPercentile <- matrix(0, nrow=testSize, ncol=LoopNum)
 263 ## for (loopCnt in 1:LoopNum) {
 264 ##    cat(date(), ":  iteration = ", loopCnt, "\n")
 265 ##
 266 ##    # split into training data and testing data
 267 ##    trainIdx <- sample(1:nRec, trainSize)
 268 ##    trainData <- cup98[trainIdx,]
 269 ##    testData <- cup98[-trainIdx,]
 270 ##
 271 ##    # train a decision tree
 272 ##    myCtree <- ctree(TARGET_D ~ ., data=trainData,
 273 ##          controls=ctree_control(minsplit=MinSplit, minbucket=MinBucket,
 274 ##                                 maxsurrogate=MaxSurrogate, maxdepth=MaxDepth))
 275 ##    # size of ctree
 276 ##    print(object.size(myCtree), units="auto")
 277 ##    save(myCtree, file=paste("cup98-ctree-", strParameters, "-run-",
 278 ##                             loopCnt, ".rdata", sep=""))
 279 ##
 280 ##    figTitle <- paste("Tree", loopCnt)
 281 ##    plot(myCtree, main=figTitle, type="simple", ip_args=list(pval=FALSE),
 282 ##         ep_args=list(digits=0,abbreviate=TRUE), tp_args=list(digits=2))
 283 ##    #print(myCtree)
 284 ##
 285 ##    # test
 286 ##    pred <- predict(myCtree, newdata=testData)
 287 ##    plot(pred, testData$TARGET_D)
 288 ##    print(sum(testData$TARGET_D[pred > cost] - cost))
 289 ##    # quick sort is "unstable" for tie values, so it is used here to introduce
 290 ##    # a bit random for tie values
 291 ##    s1 <- sort(pred, decreasing=TRUE, method = "quick", index.return=TRUE)
 292 ##    totalDonation <- cumsum(testData$TARGET_D[s1$ix]) # cumulative sum
 293 ##    avgDonation <- totalDonation / (1:testSize)
 294 ##    donationPercentile <- 100 * totalDonation / sum(testData$TARGET_D)
 295 ##    allTotalDonation[,loopCnt] <- totalDonation
 296 ##    allAvgDonation[,loopCnt] <- avgDonation
 297 ##    allDonationPercentile[,loopCnt] <- donationPercentile
 298 ##    plot(totalDonation, type="l")
 299 ##    grid()
 300 ## }
 301 ## graphics.off()
 302 ## cat(date(), ":  Loop completed.\n\n\n")
 303 ##
 304 ## fnlTotalDonation <- rowMeans(allTotalDonation)
 305 ## fnlAvgDonation <- rowMeans(allAvgDonation)
 306 ## fnlDonationPercentile <- rowMeans(allDonationPercentile)
 307 ##
 308 ## rm(trainData, testData, pred)
 309 ##
 310 ## # save results into a CSV file
 311 ## results <- data.frame(cbind(allTotalDonation,fnlTotalDonation))
 312 ## names(results) <- c(paste("run",1:LoopNum), "Average")
 313 ## write.csv(results, paste("evaluation-TotalDonation-", strParameters, ".csv",
 314 ##                          sep=""))
 315
 316
 317 ###################################################
 318 ### code chunk number 23: cs-response-prediction.rnw:498-510
 319 ###################################################
 320 result <- read.csv("evaluation-TotalDonation-1000-400-4-10.csv")
 321 head(result)
 322 result[,2:12] <- result[,2:12] - cost * (1:testSize)
 323 # to reduce size of the file to save this chart
 324 idx.pos <- c(seq(1, nrow(result), by=10), nrow(result))
 325 plot(result[idx.pos,12], type="l", lty=1, col=1, ylim=c(0,4500),
 326      xlab="Number of Mails", ylab="Amount of Donations ($)")
 327 for (fCnt in 1:LoopNum) {
 328    lines(result[idx.pos,fCnt+1], pty=".", type="l", lty=1+fCnt, col=1+fCnt)
 329 }
 330 legend("bottomright", col=1:(LoopNum+1), lty=1:(LoopNum+1),
 331        legend=c("Average", paste("Run",1:LoopNum)))
 332
 333
 334 ###################################################
 335 ### code chunk number 24: cs-response-prediction.rnw:520-533
 336 ###################################################
 337 donationPercentile <- sapply(2:12, function(i)
 338       100 *  result[,i] / result[testSize,i])
 339 percentile <- 100 * (1:testSize)/testSize
 340 plot(percentile[idx.pos], donationPercentile[idx.pos,11], pty=".", type="l",
 341      lty=1, col=1, ylim=c(0,170), xlab="Contact Percentile (%)",
 342      ylab="Donation Percentile (%)")
 343 grid(col = "gray", lty = "dotted")
 344 for (fCnt in 1:LoopNum) {
 345    lines(percentile[idx.pos], donationPercentile[idx.pos,fCnt], pty=".",
 346          type="l", lty=1+fCnt, col=1+fCnt)
 347 }
 348 legend("bottomright", col=1:(LoopNum+1), lty=1:(LoopNum+1),
 349        legend=c("Average", paste("Run",1:LoopNum)))
 350
 351
 352 ###################################################
 353 ### code chunk number 25: cs-response-prediction.rnw:567-582
 354 ###################################################
 355 avgDonation <- sapply(2:12, function(i) result[,i] / (1:testSize))
 356 yTitle = c("Total Donation Amount Percentile (%)",
 357            "Average Donation Amount per Contact ($)")
 358 par(mar=c(5,4,4,5)+.1)
 359 plot(percentile[idx.pos], donationPercentile[idx.pos,7], pty=".", type="l",
 360      lty="solid", col="red", ylab=yTitle[1], xlab="Contact Percentile (%)")
 361 grid(col = "gray", lty = "dotted")
 362 par(new=TRUE)
 363 plot(percentile[idx.pos], avgDonation[idx.pos,7], type="l", lty="dashed",
 364      col="blue", xaxt="n", yaxt="n", xlab="", ylab="",
 365      ylim=c(0,max(avgDonation[,7])))
 366 axis(4)
 367 mtext(yTitle[2], side=4, line=2)
 368 legend("right", col=c("red","blue"), lty=c("solid","dashed"),
 369        legend=yTitle)
 370
 371
 372 ###################################################
 373 ### code chunk number 26: cs-response-prediction.rnw:604-625 (eval = FALSE)
 374 ###################################################
 375 ## # compare results got with different parameters
 376 ## parameters <- c("1000-400-4-5", "1000-400-4-6", "1000-400-4-8", "1000-400-4-10")
 377 ## #parameters <- c("1000-400-4-10", "700-200-4-10", "200-50-4-10")
 378 ## paraNum <- length(parameters)
 379 ## percentile <- 100 * (1:testSize)/testSize
 380 ## # 1st result
 381 ## results <- read.csv(paste("evaluation-TotalDonation-", parameters[1], ".csv",
 382 ##                           sep=""))
 383 ## avgResult <- results$Average - cost * (1:testSize)
 384 ## plot(percentile, avgResult, pty=1, type="l", lty=1, col=1, #ylim=c(0,4000),
 385 ##      ylab="Amount of Donation", xlab="Contact Percentile (%)",
 386 ##      main="Parameters: MinSplit, MinBucket, MaxSurrogate, MaxDepth")
 387 ## grid(col = "gray", lty = "dotted")
 388 ## # other results
 389 ## for (i in 2:paraNum) {
 390 ##    results <- read.csv(paste("evaluation-TotalDonation-", parameters[i],
 391 ##                              ".csv", sep=""))
 392 ##    avgResult <- results$Average - cost * (1:testSize)
 393 ##    lines(percentile, avgResult, type="l", lty=i, col=i)
 394 ## }
 395 ## legend("bottomright", col=1:paraNum, lty=1:paraNum, legend=parameters)
 396
 397
 398 ###################################################
 399 ### code chunk number 27: cs-response-prediction.rnw:659-660 (eval = FALSE)
 400 ###################################################
 401 ## cup98val <- read.csv("./data/KDDCup1998/cup98VAL.txt")
 402
 403
 404 ###################################################
 405 ### code chunk number 28: cs-response-prediction.rnw:665-666
 406 ###################################################
 407 load("./data/cup98val.rdata")
 408
 409
 410 ###################################################
 411 ### code chunk number 29: cs-response-prediction.rnw:669-692
 412 ###################################################
 413 cup98val <- cup98val[, c("CONTROLN", varSet2)]
 414 trainNames <- names(cup98)
 415 scoreNames <- names(cup98val)
 416 # check if any variables not in scoring data
 417 idx <- which(trainNames %in% scoreNames)
 418 print(trainNames[-idx])
 419
 420 # check and set levels in factors in scoring data
 421 scoreData <- cup98val
 422 vars <- intersect(trainNames, scoreNames)
 423 for (i in 1:length(vars)) {
 424    varname <- vars[i]
 425    trainLevels <- levels(cup98[,varname])
 426    scoreLevels <- levels(scoreData[,varname])
 427    if (is.factor(cup98[,varname]) & setequal(trainLevels, scoreLevels)==F) {
 428       cat("Warning: new values found in score data, and they will be changed to NA!\n")
 429       cat(varname, "\n")
 430       #cat("train: ", length(trainLevels), ", ", trainLevels, "\n")
 431       #cat("score: ", length(scoreLevels), ", ", scoreLevels, "\n\n")
 432       scoreData[,varname] <- factor(scoreData[,varname], levels=trainLevels)
 433    }
 434 }
 435 rm(cup98val)
 436
 437
 438 ###################################################
 439 ### code chunk number 30: cs-response-prediction.rnw:697-705 (eval = FALSE)
 440 ###################################################
 441 ## # loading the selected model
 442 ## load("cup98-ctree-1000-400-4-10-run-7.Rdata")
 443 ## # predicting
 444 ## pred <- predict(myCtree, newdata = scoreData)
 445 ## pred <- round(pred, digits=3)
 446 ## #table(pred, useNA="ifany")
 447 ## result <- data.frame(scoreData$CONTROLN, pred)
 448 ## names(result) <- c("CONTROLN", "pred")
 449
 450
 451 ###################################################
 452 ### code chunk number 31: cs-response-prediction.rnw:710-711
 453 ###################################################
 454 load("response-result.rdata")
 455
 456
 457 ###################################################
 458 ### code chunk number 32: cs-response-prediction.rnw:725-732
 459 ###################################################
 460 valTarget <- read.csv("./data/KDDCup1998/valtargt.txt")
 461 merged <- merge(result, valTarget, by="CONTROLN")
 462 # donation profit if mail all people
 463 sum(valTarget$TARGET_D - cost)
 464 # donation profit if mail those predicted to donate more than mail cost
 465 idx <- (merged$pred > cost)
 466 sum(merged$TARGET_D[idx] - cost)
 467
 468
 469 ###################################################
 470 ### code chunk number 33: cs-response-prediction.rnw:739-748
 471 ###################################################
 472 # ranking customers
 473 merged <- merged[order(merged$pred, decreasing=T),]
 474 x <- 100 * (1:nrow(merged)) / nrow(merged)
 475 y <- cumsum(merged$TARGET_D) - cost*(1:nrow(valTarget))
 476 # to reduce size of the file to save this chart
 477 idx.pos <- c(seq(1, length(x), by=10), length(x))
 478 plot(x[idx.pos], y[idx.pos], type="l", xlab="Contact Percentile (%)",
 479      ylab="Amount of Donation")
 480 grid()
 481
 482