Example code and data for "Practical Data Science with R" by Nina Zumel and John Mount, Manning 2014.

The book: "Practical Data Science with R" by Nina Zumel and John Mount, Manning 2014 (book copyright Manning Publications Co., all rights reserved)
The support site: GitHub WinVector/zmPDSwR

The code and data in this directory supports examples from:

Chapter 5: Choosing and Evaluating Models
Chapter 9: Exploring Advanced Methods

4-26-2013 Data from http://archive.ics.uci.edu/ml/datasets/Spambase Data file is: http://archive.ics.uci.edu/ml/machine-learning-databases/spambase/spambase.data

$ shasum spambase.data 
e28aa2a7d4592b4f5f71347912c1b4b759336b58  spambase.data

Data preparation steps:

spamD <- read.table('http://archive.ics.uci.edu/ml/machine-learning-databases/spambase/spambase.data',sep=',',header=F)
spamCols <- c(
   'word.freq.make', 'word.freq.address', 'word.freq.all',
   'word.freq.3d', 'word.freq.our', 'word.freq.over', 'word.freq.remove',
   'word.freq.internet', 'word.freq.order', 'word.freq.mail',
   'word.freq.receive', 'word.freq.will', 'word.freq.people',
   'word.freq.report', 'word.freq.addresses', 'word.freq.free',
   'word.freq.business', 'word.freq.email', 'word.freq.you',
   'word.freq.credit', 'word.freq.your', 'word.freq.font',
   'word.freq.000', 'word.freq.money', 'word.freq.hp', 'word.freq.hpl',
   'word.freq.george', 'word.freq.650', 'word.freq.lab',
   'word.freq.labs', 'word.freq.telnet', 'word.freq.857',
   'word.freq.data', 'word.freq.415', 'word.freq.85',
   'word.freq.technology', 'word.freq.1999', 'word.freq.parts',
   'word.freq.pm', 'word.freq.direct', 'word.freq.cs',
   'word.freq.meeting', 'word.freq.original', 'word.freq.project',
   'word.freq.re', 'word.freq.edu', 'word.freq.table',
   'word.freq.conference', 'char.freq.semi', 'char.freq.lparen',
   'char.freq.lbrack', 'char.freq.bang', 'char.freq.dollar',
   'char.freq.hash', 'capital.run.length.average',
   'capital.run.length.longest', 'capital.run.length.total',
   'spam'
)
colnames(spamD) <- spamCols
spamD$spam <- as.factor(ifelse(spamD$spam>0.5,'spam','non-spam'))
set.seed(2350290)
spamD$rgroup <- floor(100*runif(dim(spamD)[[1]]))
#write.table(spamD,file='spamD.tsv',quote=F,sep='\t',row.names=F)

Analysis steps, download https://raw.github.com/WinVector/zmPDSwR/master/Spambase/spamD.tsv

spamD <- read.table('spamD.tsv',header=T,sep='\t')
spamTrain <- subset(spamD,spamD$rgroup>=10)
spamTest <- subset(spamD,spamD$rgroup<10)
spamVars <- setdiff(colnames(spamD),list('rgroup','spam'))
spamFormula <- as.formula(paste('spam=="spam"',
   paste(spamVars,collapse=' + '),sep=' ~ '))
spamModel <- glm(spamFormula,family=binomial(link='logit'),
   data=spamTrain)
spamTrain$pred <- predict(spamModel,newdata=spamTrain,type='response')
spamTest$pred <- predict(spamModel,newdata=spamTest,type='response')

trainSpamTable <- table(truth=spamTrain$spam,
   prediction=spamTrain$pred>0.5)
testSpamTable <- table(truth=spamTest$spam,
   prediction=spamTest$pred>0.5)
# sort(sample(1:(dim(spamTest)[[1]]),size=4,replace=F))
# [1]   7  35 224 327
sample <- spamTest[c(7,35,224,327),c('spam','pred')]
cM <- table(truth=spamTest$spam,prediction=spamTest$pred>0.5)
(cM[1,1]+cM[2,2])/sum(cM)
cM[2,2]/(cM[2,2]+cM[1,2])
cM[2,2]/(cM[2,2]+cM[2,1])
cM[1,1]/(cM[1,1]+cM[1,2])
t <- as.table(matrix(data=c(288-1,17,1,13882-17),nrow=2,ncol=2))
rownames(t) <- rownames(cM)
colnames(t) <- colnames(cM)
(t[1,1]+t[2,2])/sum(t)
t[2,2]/(t[2,2]+t[1,2])
t[2,2]/(t[2,2]+t[2,1])
t[1,1]/(t[1,1]+t[1,2])
# ROC curve
library('ROCR')
eval <- prediction(spamTest$pred,spamTest$spam)
plot(performance(eval,"tpr","fpr"))
# AUC
attributes(performance(eval,'auc'))$y.values[[1]]

Saturated model / Bayes rate estimate

quantized <- subset(spamD,T,select=spamVars)
quantized <- as.data.frame(lapply(quantized,
   function(col) { ecdf(col)(col) }))
quantized$groupId <- sapply(1:dim(quantized)[[1]],
   function(row) paste(floor(5*quantized[row,spamVars]),collapse=' '))
quantized$spam <- spamD$spam
satTable <- table(quantized$groupId,quantized$spam)
quantized$satPred <- sapply(1:(dim(quantized)[[1]]),function(rowNum) {
   row <- satTable[quantized[rowNum,'groupId'],]
   row['spam']>row['non-spam']
   })
quantized$groupCount <- table(quantized$groupId)[quantized$groupId]
table(quantized$spam,quantized$satPred>0.5)
repeated <- subset(quantized,groupCount>1)
cRepeated = table(repeated$spam,repeated$satPred>0.5)
print(cRepeated)
#           FALSE TRUE
#  non-spam  1022    2
#  spam        31  848
print((cRepeated[1,1]+cRepeated[2,2])/sum(cRepeated))
# [1] 0.982659

License for additional documentation, notes, code, and example data:

This work is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License.

No guarantee, indemnification or claim of fitness is made regarding any of these items.

No claim of license on works of others or derived data.

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

README.md

README.md

Example code and data for "Practical Data Science with R" by Nina Zumel and John Mount, Manning 2014.

The code and data in this directory supports examples from:

License for additional documentation, notes, code, and example data:

Files

README.md

Latest commit

History

README.md

File metadata and controls

Example code and data for "Practical Data Science with R" by Nina Zumel and John Mount, Manning 2014.

The code and data in this directory supports examples from:

License for additional documentation, notes, code, and example data: