1 ### R code from vignette source 'cs-response-prediction.rnw'
2 ### Encoding: ISO8859-1
4 ###################################################
5 ### code chunk number 1: cs-response-prediction.rnw:9-12
6 ###################################################
12 ###################################################
13 ### code chunk number 2: cs-response-prediction.rnw:45-46 (eval = FALSE)
14 ###################################################
15 ## cup98 <- read.csv("./data/KDDCup1998/cup98LRN.txt")
18 ###################################################
19 ### code chunk number 3: cs-response-prediction.rnw:51-52
20 ###################################################
21 load("./data/cup98.rdata")
24 ###################################################
25 ### code chunk number 4: cs-response-prediction.rnw:55-61
26 ###################################################
28 # have a look at the first 30 variables
31 # a summary of the first 10 variables
35 ###################################################
36 ### code chunk number 5: cs-response-prediction.rnw:66-80 (eval = FALSE)
37 ###################################################
39 ## describe(cup98[,1:28]) # demographics
40 ## describe(cup98[,29:42]) # number of times responded to other types of mail order offers
41 ## describe(cup98[,43:55]) # overlay data
42 ## describe(cup98[,56:74]) # donor interests
43 ## describe(cup98[,75]) # PEP star RFA status
44 ## describe(cup98[,76:361]) # characteristics of the donors neighborhood
45 ## describe(cup98[,362:407]) # promotion history
46 ## describe(cup98[,408:412]) # summary variables of promotion history
47 ## describe(cup98[,413:456]) # giving history
48 ## describe(cup98[,457:469]) # summary variables of giving history
49 ## describe(cup98[,470:473]) # ID & targets
50 ## describe(cup98[,474:479]) # RFA (Recency/Frequency/Donation Amount)
51 ## describe(cup98[,480:481]) # CLUSTER & GEOCODE
54 ###################################################
55 ### code chunk number 6: cs-response-prediction.rnw:87-91
56 ###################################################
57 (response.percentage <- round(100 * prop.table(table(cup98$TARGET_B)), digits=1))
58 mylabels <- paste("TARGET_B=", names(response.percentage), "\n",
59 response.percentage, "%", sep="")
60 pie(response.percentage, labels=mylabels)
63 ###################################################
64 ### code chunk number 7: cs-response-prediction.rnw:101-106
65 ###################################################
66 # data with positive donations
67 cup98pos <- cup98[cup98$TARGET_D>0, ]
68 targetPos <- cup98pos$TARGET_D
73 ###################################################
74 ### code chunk number 8: cs-response-prediction.rnw:123-129
75 ###################################################
76 # number of positive donations
78 # number of positive donations not in whole dollars
79 sum(!(targetPos %in% 1:200))
80 targetPos <- round(targetPos)
81 barplot(table(targetPos), las=2)
84 ###################################################
85 ### code chunk number 9: cs-response-prediction.rnw:138-143
86 ###################################################
87 cup98$TARGET_D2 <- cut(cup98$TARGET_D, right=F,
88 breaks=c(0, 0.1, 10, 15, 20, 25, 30, 50, max(cup98$TARGET_D)))
89 table(cup98$TARGET_D2)
90 cup98pos$TARGET_D2 <- cut(cup98pos$TARGET_D, right=F,
91 breaks=c(0, 0.1, 10, 15, 20, 25, 30, 50, max(cup98pos$TARGET_D)))
94 ###################################################
95 ### code chunk number 10: cs-response-prediction.rnw:151-153
96 ###################################################
98 round(100 * prop.table(table(cup98$NOEXCH)), digits=3)
101 ###################################################
102 ### code chunk number 11: cs-response-prediction.rnw:210-233
103 ###################################################
106 "ODATEDW", "OSOURCE", "STATE", "ZIP", "PVASTATE", "DOB", "RECINHSE",
107 "MDMAUD", "DOMAIN", "CLUSTER", "AGE", "HOMEOWNR", "CHILD03", "CHILD07",
108 "CHILD12", "CHILD18", "NUMCHLD", "INCOME", "GENDER", "WEALTH1", "HIT",
110 "COLLECT1", "VETERANS", "BIBLE", "CATLG", "HOMEE", "PETS", "CDPLAY",
111 "STEREO", "PCOWNERS", "PHOTO", "CRAFTS", "FISHER", "GARDENIN", "BOATS",
112 "WALKER", "KIDSTUFF", "CARDS", "PLATES",
113 # PEP star RFA status
115 # summary variables of promotion history
116 "CARDPROM", "MAXADATE", "NUMPROM", "CARDPM12", "NUMPRM12",
117 # summary variables of giving history
118 "RAMNTALL", "NGIFTALL", "CARDGIFT", "MINRAMNT", "MAXRAMNT", "LASTGIFT",
119 "LASTDATE", "FISTDATE", "TIMELAG", "AVGGIFT",
121 "CONTROLN", "TARGET_B", "TARGET_D", "TARGET_D2", "HPHONE_D",
122 # RFA (Recency/Frequency/Donation Amount)
123 "RFA_2F", "RFA_2A", "MDMAUD_R", "MDMAUD_F", "MDMAUD_A",
125 "CLUSTER2", "GEOCODE2")
126 cup98 <- cup98[, varSet]
129 ###################################################
130 ### code chunk number 12: cs-response-prediction.rnw:245-247
131 ###################################################
132 # select numeric variables
133 idx.num <- which(sapply(cup98, is.numeric))
136 ###################################################
137 ### code chunk number 13: cs-response-prediction.rnw:254-263
138 ###################################################
139 layout(matrix(c(1,2), 1, 2)) # 2 graphs per page
140 # histograms of numeric variables
141 myHist <- function(x) {
142 hist(cup98[,x], main=NULL, xlab=x)
144 sapply(names(idx.num[4:5]), myHist)
145 layout(matrix(1)) # change back to one graph per page
146 # run code below to generate histograms for all numeric variables
147 # sapply(names(idx.num), myHist)
150 ###################################################
151 ### code chunk number 14: cs-response-prediction.rnw:274-279
152 ###################################################
153 layout(matrix(c(1,2),1,2)) # 2 graphs per page
155 cup98$HIT[cup98$HIT>200]
156 boxplot(cup98$HIT[cup98$HIT<200])
157 layout(matrix(1)) # change back to one graph per page
160 ###################################################
161 ### code chunk number 15: cs-response-prediction.rnw:290-292
162 ###################################################
163 AGE2 <- cut(cup98pos$AGE, right=F, breaks=seq(0, 100, by=5))
164 boxplot(cup98pos$TARGET_D ~ AGE2, ylim=c(0,40), las=3)
167 ###################################################
168 ### code chunk number 16: cs-response-prediction.rnw:303-313
169 ###################################################
171 layout(matrix(c(1,2),1,2)) # 2 graphs per page
172 boxplot(TARGET_D ~ GENDER, ylim=c(0,80))
174 plot(density(TARGET_D[GENDER=="F"]), xlim=c(0,60), col=1, lty=1)
175 lines(density(TARGET_D[GENDER=="M"]), col=2, lty=2)
176 lines(density(TARGET_D[GENDER=="J"]), col=3, lty=3)
177 legend("topright", c("Female", "Male", "Joint account"), col=1:3, lty=1:3)
178 layout(matrix(1)) # change back to one graph per page
182 ###################################################
183 ### code chunk number 17: cs-response-prediction.rnw:322-327
184 ###################################################
185 correlation <- cor(cup98$TARGET_D, cup98[,idx.num], use="pairwise.complete.obs")
186 correlation <- abs(correlation)
187 (correlation <- correlation[,order(correlation, decreasing=T)])
188 # save to a CSV file, with important variables at the top
189 write.csv(correlation, "absolute_correlation.csv")
192 ###################################################
193 ### code chunk number 18: cs-response-prediction.rnw:331-333 (eval = FALSE)
194 ###################################################
195 ## cor(cup98[,idx.num])
199 ###################################################
200 ### code chunk number 19: cs-response-prediction.rnw:339-345
201 ###################################################
202 color <- ifelse(cup98$TARGET_D>0, "blue", "black")
203 pch <- ifelse(cup98$TARGET_D>0, "+", ".")
204 plot(jitter(cup98$AGE), jitter(cup98$HIT), pch=pch, col=color, cex=0.7,
205 ylim=c(0,70), xlab="AGE", ylab="HIT")
206 legend("topleft", c("TARGET_D>0", "TARGET_D=0"), col=c("blue", "black"),
210 ###################################################
211 ### code chunk number 20: cs-response-prediction.rnw:359-369
212 ###################################################
213 myChisqTest <- function(x) {
214 t1 <- table(cup98pos[,x], cup98pos$TARGET_D2)
215 plot(t1, main=x, las=1)
217 print(chisq.test(t1))
219 myChisqTest("GENDER")
220 # run the code below to do chi-square test for all categorical variables
221 # idx.cat <- which(sapply(cup98pos, is.factor))
222 # sapply(names(idx.cat), myChisqTest)
225 ###################################################
226 ### code chunk number 21: cs-response-prediction.rnw:384-403
227 ###################################################
228 nRec <- dim(cup98)[1]
229 trainSize <- round(nRec * 0.7)
230 testSize <- nRec - trainSize
236 (strParameters <- paste(MinSplit, MinBucket, MaxSurrogate, MaxDepth, sep="-"))
238 # The cost for each contact is $0.68.
240 varSet2 <- c("AGE", "AVGGIFT", "CARDGIFT", "CARDPM12", "CARDPROM", "CLUSTER2",
241 "DOMAIN", "GENDER", "GEOCODE2", "HIT", "HOMEOWNR", "HPHONE_D", "INCOME",
242 "LASTGIFT", "MAXRAMNT", "MDMAUD_F", "MDMAUD_R", "MINRAMNT", "NGIFTALL",
243 "NUMPRM12", "PCOWNERS", "PEPSTRFL", "PETS", "RAMNTALL", "RECINHSE",
244 "RFA_2A", "RFA_2F", "STATE", "TIMELAG")
245 cup98 <- cup98[, c("TARGET_D", varSet2)]
246 library(party) # for ctree
249 ###################################################
250 ### code chunk number 22: cs-response-prediction.rnw:408-471 (eval = FALSE)
251 ###################################################
252 ## pdf(paste("evaluation-tree-", strParameters, ".pdf", sep=""),
253 ## width=12, height=9, paper="a4r", pointsize=6)
255 ## cat(" trainSize=", trainSize, ", testSize=", testSize, "\n")
256 ## cat(" MinSplit=", MinSplit, ", MinBucket=", MinBucket,
257 ## ", MaxSurrogate=", MaxSurrogate, ", MaxDepth=", MaxDepth, "\n\n")
259 ## # run for multiple times and get the average result
260 ## allTotalDonation <- matrix(0, nrow=testSize, ncol=LoopNum)
261 ## allAvgDonation <- matrix(0, nrow=testSize, ncol=LoopNum)
262 ## allDonationPercentile <- matrix(0, nrow=testSize, ncol=LoopNum)
263 ## for (loopCnt in 1:LoopNum) {
264 ## cat(date(), ": iteration = ", loopCnt, "\n")
266 ## # split into training data and testing data
267 ## trainIdx <- sample(1:nRec, trainSize)
268 ## trainData <- cup98[trainIdx,]
269 ## testData <- cup98[-trainIdx,]
271 ## # train a decision tree
272 ## myCtree <- ctree(TARGET_D ~ ., data=trainData,
273 ## controls=ctree_control(minsplit=MinSplit, minbucket=MinBucket,
274 ## maxsurrogate=MaxSurrogate, maxdepth=MaxDepth))
276 ## print(object.size(myCtree), units="auto")
277 ## save(myCtree, file=paste("cup98-ctree-", strParameters, "-run-",
278 ## loopCnt, ".rdata", sep=""))
280 ## figTitle <- paste("Tree", loopCnt)
281 ## plot(myCtree, main=figTitle, type="simple", ip_args=list(pval=FALSE),
282 ## ep_args=list(digits=0,abbreviate=TRUE), tp_args=list(digits=2))
286 ## pred <- predict(myCtree, newdata=testData)
287 ## plot(pred, testData$TARGET_D)
288 ## print(sum(testData$TARGET_D[pred > cost] - cost))
289 ## # quick sort is "unstable" for tie values, so it is used here to introduce
290 ## # a bit random for tie values
291 ## s1 <- sort(pred, decreasing=TRUE, method = "quick", index.return=TRUE)
292 ## totalDonation <- cumsum(testData$TARGET_D[s1$ix]) # cumulative sum
293 ## avgDonation <- totalDonation / (1:testSize)
294 ## donationPercentile <- 100 * totalDonation / sum(testData$TARGET_D)
295 ## allTotalDonation[,loopCnt] <- totalDonation
296 ## allAvgDonation[,loopCnt] <- avgDonation
297 ## allDonationPercentile[,loopCnt] <- donationPercentile
298 ## plot(totalDonation, type="l")
302 ## cat(date(), ": Loop completed.\n\n\n")
304 ## fnlTotalDonation <- rowMeans(allTotalDonation)
305 ## fnlAvgDonation <- rowMeans(allAvgDonation)
306 ## fnlDonationPercentile <- rowMeans(allDonationPercentile)
308 ## rm(trainData, testData, pred)
310 ## # save results into a CSV file
311 ## results <- data.frame(cbind(allTotalDonation,fnlTotalDonation))
312 ## names(results) <- c(paste("run",1:LoopNum), "Average")
313 ## write.csv(results, paste("evaluation-TotalDonation-", strParameters, ".csv",
317 ###################################################
318 ### code chunk number 23: cs-response-prediction.rnw:498-510
319 ###################################################
320 result <- read.csv("evaluation-TotalDonation-1000-400-4-10.csv")
322 result[,2:12] <- result[,2:12] - cost * (1:testSize)
323 # to reduce size of the file to save this chart
324 idx.pos <- c(seq(1, nrow(result), by=10), nrow(result))
325 plot(result[idx.pos,12], type="l", lty=1, col=1, ylim=c(0,4500),
326 xlab="Number of Mails", ylab="Amount of Donations ($)")
327 for (fCnt in 1:LoopNum) {
328 lines(result[idx.pos,fCnt+1], pty=".", type="l", lty=1+fCnt, col=1+fCnt)
330 legend("bottomright", col=1:(LoopNum+1), lty=1:(LoopNum+1),
331 legend=c("Average", paste("Run",1:LoopNum)))
334 ###################################################
335 ### code chunk number 24: cs-response-prediction.rnw:520-533
336 ###################################################
337 donationPercentile <- sapply(2:12, function(i)
338 100 * result[,i] / result[testSize,i])
339 percentile <- 100 * (1:testSize)/testSize
340 plot(percentile[idx.pos], donationPercentile[idx.pos,11], pty=".", type="l",
341 lty=1, col=1, ylim=c(0,170), xlab="Contact Percentile (%)",
342 ylab="Donation Percentile (%)")
343 grid(col = "gray", lty = "dotted")
344 for (fCnt in 1:LoopNum) {
345 lines(percentile[idx.pos], donationPercentile[idx.pos,fCnt], pty=".",
346 type="l", lty=1+fCnt, col=1+fCnt)
348 legend("bottomright", col=1:(LoopNum+1), lty=1:(LoopNum+1),
349 legend=c("Average", paste("Run",1:LoopNum)))
352 ###################################################
353 ### code chunk number 25: cs-response-prediction.rnw:567-582
354 ###################################################
355 avgDonation <- sapply(2:12, function(i) result[,i] / (1:testSize))
356 yTitle = c("Total Donation Amount Percentile (%)",
357 "Average Donation Amount per Contact ($)")
358 par(mar=c(5,4,4,5)+.1)
359 plot(percentile[idx.pos], donationPercentile[idx.pos,7], pty=".", type="l",
360 lty="solid", col="red", ylab=yTitle[1], xlab="Contact Percentile (%)")
361 grid(col = "gray", lty = "dotted")
363 plot(percentile[idx.pos], avgDonation[idx.pos,7], type="l", lty="dashed",
364 col="blue", xaxt="n", yaxt="n", xlab="", ylab="",
365 ylim=c(0,max(avgDonation[,7])))
367 mtext(yTitle[2], side=4, line=2)
368 legend("right", col=c("red","blue"), lty=c("solid","dashed"),
372 ###################################################
373 ### code chunk number 26: cs-response-prediction.rnw:604-625 (eval = FALSE)
374 ###################################################
375 ## # compare results got with different parameters
376 ## parameters <- c("1000-400-4-5", "1000-400-4-6", "1000-400-4-8", "1000-400-4-10")
377 ## #parameters <- c("1000-400-4-10", "700-200-4-10", "200-50-4-10")
378 ## paraNum <- length(parameters)
379 ## percentile <- 100 * (1:testSize)/testSize
381 ## results <- read.csv(paste("evaluation-TotalDonation-", parameters[1], ".csv",
383 ## avgResult <- results$Average - cost * (1:testSize)
384 ## plot(percentile, avgResult, pty=1, type="l", lty=1, col=1, #ylim=c(0,4000),
385 ## ylab="Amount of Donation", xlab="Contact Percentile (%)",
386 ## main="Parameters: MinSplit, MinBucket, MaxSurrogate, MaxDepth")
387 ## grid(col = "gray", lty = "dotted")
389 ## for (i in 2:paraNum) {
390 ## results <- read.csv(paste("evaluation-TotalDonation-", parameters[i],
392 ## avgResult <- results$Average - cost * (1:testSize)
393 ## lines(percentile, avgResult, type="l", lty=i, col=i)
395 ## legend("bottomright", col=1:paraNum, lty=1:paraNum, legend=parameters)
398 ###################################################
399 ### code chunk number 27: cs-response-prediction.rnw:659-660 (eval = FALSE)
400 ###################################################
401 ## cup98val <- read.csv("./data/KDDCup1998/cup98VAL.txt")
404 ###################################################
405 ### code chunk number 28: cs-response-prediction.rnw:665-666
406 ###################################################
407 load("./data/cup98val.rdata")
410 ###################################################
411 ### code chunk number 29: cs-response-prediction.rnw:669-692
412 ###################################################
413 cup98val <- cup98val[, c("CONTROLN", varSet2)]
414 trainNames <- names(cup98)
415 scoreNames <- names(cup98val)
416 # check if any variables not in scoring data
417 idx <- which(trainNames %in% scoreNames)
418 print(trainNames[-idx])
420 # check and set levels in factors in scoring data
421 scoreData <- cup98val
422 vars <- intersect(trainNames, scoreNames)
423 for (i in 1:length(vars)) {
425 trainLevels <- levels(cup98[,varname])
426 scoreLevels <- levels(scoreData[,varname])
427 if (is.factor(cup98[,varname]) & setequal(trainLevels, scoreLevels)==F) {
428 cat("Warning: new values found in score data, and they will be changed to NA!\n")
430 #cat("train: ", length(trainLevels), ", ", trainLevels, "\n")
431 #cat("score: ", length(scoreLevels), ", ", scoreLevels, "\n\n")
432 scoreData[,varname] <- factor(scoreData[,varname], levels=trainLevels)
438 ###################################################
439 ### code chunk number 30: cs-response-prediction.rnw:697-705 (eval = FALSE)
440 ###################################################
441 ## # loading the selected model
442 ## load("cup98-ctree-1000-400-4-10-run-7.Rdata")
444 ## pred <- predict(myCtree, newdata = scoreData)
445 ## pred <- round(pred, digits=3)
446 ## #table(pred, useNA="ifany")
447 ## result <- data.frame(scoreData$CONTROLN, pred)
448 ## names(result) <- c("CONTROLN", "pred")
451 ###################################################
452 ### code chunk number 31: cs-response-prediction.rnw:710-711
453 ###################################################
454 load("response-result.rdata")
457 ###################################################
458 ### code chunk number 32: cs-response-prediction.rnw:725-732
459 ###################################################
460 valTarget <- read.csv("./data/KDDCup1998/valtargt.txt")
461 merged <- merge(result, valTarget, by="CONTROLN")
462 # donation profit if mail all people
463 sum(valTarget$TARGET_D - cost)
464 # donation profit if mail those predicted to donate more than mail cost
465 idx <- (merged$pred > cost)
466 sum(merged$TARGET_D[idx] - cost)
469 ###################################################
470 ### code chunk number 33: cs-response-prediction.rnw:739-748
471 ###################################################
473 merged <- merged[order(merged$pred, decreasing=T),]
474 x <- 100 * (1:nrow(merged)) / nrow(merged)
475 y <- cumsum(merged$TARGET_D) - cost*(1:nrow(valTarget))
476 # to reduce size of the file to save this chart
477 idx.pos <- c(seq(1, length(x), by=10), length(x))
478 plot(x[idx.pos], y[idx.pos], type="l", xlab="Contact Percentile (%)",
479 ylab="Amount of Donation")