Skip to content

Latest commit

 

History

History
129 lines (110 loc) · 5.2 KB

README.md

File metadata and controls

129 lines (110 loc) · 5.2 KB

Example code and data for "Practical Data Science with R" by Nina Zumel and John Mount, Manning 2014.

The code and data in this directory supports examples from:

  • Chapter 5: Choosing and Evaluating Models
  • Chapter 9: Exploring Advanced Methods

4-26-2013 Data from http://archive.ics.uci.edu/ml/datasets/Spambase Data file is: http://archive.ics.uci.edu/ml/machine-learning-databases/spambase/spambase.data

$ shasum spambase.data 
e28aa2a7d4592b4f5f71347912c1b4b759336b58  spambase.data

Data preparation steps:

spamD <- read.table('http://archive.ics.uci.edu/ml/machine-learning-databases/spambase/spambase.data',sep=',',header=F)
spamCols <- c(
   'word.freq.make', 'word.freq.address', 'word.freq.all',
   'word.freq.3d', 'word.freq.our', 'word.freq.over', 'word.freq.remove',
   'word.freq.internet', 'word.freq.order', 'word.freq.mail',
   'word.freq.receive', 'word.freq.will', 'word.freq.people',
   'word.freq.report', 'word.freq.addresses', 'word.freq.free',
   'word.freq.business', 'word.freq.email', 'word.freq.you',
   'word.freq.credit', 'word.freq.your', 'word.freq.font',
   'word.freq.000', 'word.freq.money', 'word.freq.hp', 'word.freq.hpl',
   'word.freq.george', 'word.freq.650', 'word.freq.lab',
   'word.freq.labs', 'word.freq.telnet', 'word.freq.857',
   'word.freq.data', 'word.freq.415', 'word.freq.85',
   'word.freq.technology', 'word.freq.1999', 'word.freq.parts',
   'word.freq.pm', 'word.freq.direct', 'word.freq.cs',
   'word.freq.meeting', 'word.freq.original', 'word.freq.project',
   'word.freq.re', 'word.freq.edu', 'word.freq.table',
   'word.freq.conference', 'char.freq.semi', 'char.freq.lparen',
   'char.freq.lbrack', 'char.freq.bang', 'char.freq.dollar',
   'char.freq.hash', 'capital.run.length.average',
   'capital.run.length.longest', 'capital.run.length.total',
   'spam'
)
colnames(spamD) <- spamCols
spamD$spam <- as.factor(ifelse(spamD$spam>0.5,'spam','non-spam'))
set.seed(2350290)
spamD$rgroup <- floor(100*runif(dim(spamD)[[1]]))
#write.table(spamD,file='spamD.tsv',quote=F,sep='\t',row.names=F)

Analysis steps, download https://raw.github.com/WinVector/zmPDSwR/master/Spambase/spamD.tsv

spamD <- read.table('spamD.tsv',header=T,sep='\t')
spamTrain <- subset(spamD,spamD$rgroup>=10)
spamTest <- subset(spamD,spamD$rgroup<10)
spamVars <- setdiff(colnames(spamD),list('rgroup','spam'))
spamFormula <- as.formula(paste('spam=="spam"',
   paste(spamVars,collapse=' + '),sep=' ~ '))
spamModel <- glm(spamFormula,family=binomial(link='logit'),
   data=spamTrain)
spamTrain$pred <- predict(spamModel,newdata=spamTrain,type='response')
spamTest$pred <- predict(spamModel,newdata=spamTest,type='response')

trainSpamTable <- table(truth=spamTrain$spam,
   prediction=spamTrain$pred>0.5)
testSpamTable <- table(truth=spamTest$spam,
   prediction=spamTest$pred>0.5)
# sort(sample(1:(dim(spamTest)[[1]]),size=4,replace=F))
# [1]   7  35 224 327
sample <- spamTest[c(7,35,224,327),c('spam','pred')]
cM <- table(truth=spamTest$spam,prediction=spamTest$pred>0.5)
(cM[1,1]+cM[2,2])/sum(cM)
cM[2,2]/(cM[2,2]+cM[1,2])
cM[2,2]/(cM[2,2]+cM[2,1])
cM[1,1]/(cM[1,1]+cM[1,2])
t <- as.table(matrix(data=c(288-1,17,1,13882-17),nrow=2,ncol=2))
rownames(t) <- rownames(cM)
colnames(t) <- colnames(cM)
(t[1,1]+t[2,2])/sum(t)
t[2,2]/(t[2,2]+t[1,2])
t[2,2]/(t[2,2]+t[2,1])
t[1,1]/(t[1,1]+t[1,2])
# ROC curve
library('ROCR')
eval <- prediction(spamTest$pred,spamTest$spam)
plot(performance(eval,"tpr","fpr"))
# AUC
attributes(performance(eval,'auc'))$y.values[[1]]

Saturated model / Bayes rate estimate

quantized <- subset(spamD,T,select=spamVars)
quantized <- as.data.frame(lapply(quantized,
   function(col) { ecdf(col)(col) }))
quantized$groupId <- sapply(1:dim(quantized)[[1]],
   function(row) paste(floor(5*quantized[row,spamVars]),collapse=' '))
quantized$spam <- spamD$spam
satTable <- table(quantized$groupId,quantized$spam)
quantized$satPred <- sapply(1:(dim(quantized)[[1]]),function(rowNum) {
   row <- satTable[quantized[rowNum,'groupId'],]
   row['spam']>row['non-spam']
   })
quantized$groupCount <- table(quantized$groupId)[quantized$groupId]
table(quantized$spam,quantized$satPred>0.5)
repeated <- subset(quantized,groupCount>1)
cRepeated = table(repeated$spam,repeated$satPred>0.5)
print(cRepeated)
#           FALSE TRUE
#  non-spam  1022    2
#  spam        31  848
print((cRepeated[1,1]+cRepeated[2,2])/sum(cRepeated))
# [1] 0.982659

License for additional documentation, notes, code, and example data:

Creative Commons License
This work is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License.

No guarantee, indemnification or claim of fitness is made regarding any of these items.

No claim of license on works of others or derived data.