Skip to content

Latest commit

 

History

History

Spambase

Folders and files

NameName
Last commit message
Last commit date

parent directory

..
 
 
 
 
 
 

Example code and data for "Practical Data Science with R" by Nina Zumel and John Mount, Manning 2014.

The code and data in this directory supports examples from:

  • Chapter 5: Choosing and Evaluating Models
  • Chapter 9: Exploring Advanced Methods

4-26-2013 Data from http://archive.ics.uci.edu/ml/datasets/Spambase Data file is: http://archive.ics.uci.edu/ml/machine-learning-databases/spambase/spambase.data

$ shasum spambase.data 
e28aa2a7d4592b4f5f71347912c1b4b759336b58  spambase.data

Data preparation steps:

spamD <- read.table('http://archive.ics.uci.edu/ml/machine-learning-databases/spambase/spambase.data',sep=',',header=F)
spamCols <- c(
   'word.freq.make', 'word.freq.address', 'word.freq.all',
   'word.freq.3d', 'word.freq.our', 'word.freq.over', 'word.freq.remove',
   'word.freq.internet', 'word.freq.order', 'word.freq.mail',
   'word.freq.receive', 'word.freq.will', 'word.freq.people',
   'word.freq.report', 'word.freq.addresses', 'word.freq.free',
   'word.freq.business', 'word.freq.email', 'word.freq.you',
   'word.freq.credit', 'word.freq.your', 'word.freq.font',
   'word.freq.000', 'word.freq.money', 'word.freq.hp', 'word.freq.hpl',
   'word.freq.george', 'word.freq.650', 'word.freq.lab',
   'word.freq.labs', 'word.freq.telnet', 'word.freq.857',
   'word.freq.data', 'word.freq.415', 'word.freq.85',
   'word.freq.technology', 'word.freq.1999', 'word.freq.parts',
   'word.freq.pm', 'word.freq.direct', 'word.freq.cs',
   'word.freq.meeting', 'word.freq.original', 'word.freq.project',
   'word.freq.re', 'word.freq.edu', 'word.freq.table',
   'word.freq.conference', 'char.freq.semi', 'char.freq.lparen',
   'char.freq.lbrack', 'char.freq.bang', 'char.freq.dollar',
   'char.freq.hash', 'capital.run.length.average',
   'capital.run.length.longest', 'capital.run.length.total',
   'spam'
)
colnames(spamD) <- spamCols
spamD$spam <- as.factor(ifelse(spamD$spam>0.5,'spam','non-spam'))
set.seed(2350290)
spamD$rgroup <- floor(100*runif(dim(spamD)[[1]]))
#write.table(spamD,file='spamD.tsv',quote=F,sep='\t',row.names=F)

Analysis steps, download https://raw.github.com/WinVector/zmPDSwR/master/Spambase/spamD.tsv

spamD <- read.table('spamD.tsv',header=T,sep='\t')
spamTrain <- subset(spamD,spamD$rgroup>=10)
spamTest <- subset(spamD,spamD$rgroup<10)
spamVars <- setdiff(colnames(spamD),list('rgroup','spam'))
spamFormula <- as.formula(paste('spam=="spam"',
   paste(spamVars,collapse=' + '),sep=' ~ '))
spamModel <- glm(spamFormula,family=binomial(link='logit'),
   data=spamTrain)
spamTrain$pred <- predict(spamModel,newdata=spamTrain,type='response')
spamTest$pred <- predict(spamModel,newdata=spamTest,type='response')

trainSpamTable <- table(truth=spamTrain$spam,
   prediction=spamTrain$pred>0.5)
testSpamTable <- table(truth=spamTest$spam,
   prediction=spamTest$pred>0.5)
# sort(sample(1:(dim(spamTest)[[1]]),size=4,replace=F))
# [1]   7  35 224 327
sample <- spamTest[c(7,35,224,327),c('spam','pred')]
cM <- table(truth=spamTest$spam,prediction=spamTest$pred>0.5)
(cM[1,1]+cM[2,2])/sum(cM)
cM[2,2]/(cM[2,2]+cM[1,2])
cM[2,2]/(cM[2,2]+cM[2,1])
cM[1,1]/(cM[1,1]+cM[1,2])
t <- as.table(matrix(data=c(288-1,17,1,13882-17),nrow=2,ncol=2))
rownames(t) <- rownames(cM)
colnames(t) <- colnames(cM)
(t[1,1]+t[2,2])/sum(t)
t[2,2]/(t[2,2]+t[1,2])
t[2,2]/(t[2,2]+t[2,1])
t[1,1]/(t[1,1]+t[1,2])
# ROC curve
library('ROCR')
eval <- prediction(spamTest$pred,spamTest$spam)
plot(performance(eval,"tpr","fpr"))
# AUC
attributes(performance(eval,'auc'))$y.values[[1]]

Saturated model / Bayes rate estimate

quantized <- subset(spamD,T,select=spamVars)
quantized <- as.data.frame(lapply(quantized,
   function(col) { ecdf(col)(col) }))
quantized$groupId <- sapply(1:dim(quantized)[[1]],
   function(row) paste(floor(5*quantized[row,spamVars]),collapse=' '))
quantized$spam <- spamD$spam
satTable <- table(quantized$groupId,quantized$spam)
quantized$satPred <- sapply(1:(dim(quantized)[[1]]),function(rowNum) {
   row <- satTable[quantized[rowNum,'groupId'],]
   row['spam']>row['non-spam']
   })
quantized$groupCount <- table(quantized$groupId)[quantized$groupId]
table(quantized$spam,quantized$satPred>0.5)
repeated <- subset(quantized,groupCount>1)
cRepeated = table(repeated$spam,repeated$satPred>0.5)
print(cRepeated)
#           FALSE TRUE
#  non-spam  1022    2
#  spam        31  848
print((cRepeated[1,1]+cRepeated[2,2])/sum(cRepeated))
# [1] 0.982659

License for additional documentation, notes, code, and example data:

Creative Commons License
This work is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License.

No guarantee, indemnification or claim of fitness is made regarding any of these items.

No claim of license on works of others or derived data.