Snippit upload.

Chris R. Albon · Chris R. Albon · commit 4ef52910fd0a · 2013-12-08T12:20:37.000-05:00
diff --git a/.Rhistory b/.Rhistory
diff --git a/animated-map-gif.r b/animated-map-gif.r
@@ -0,0 +1,59 @@
+# Animated Map Gif
+# original source: http://rud.is/b/2013/09/19/animated-irl-pirate-attacks-in-r/
+
+# load the required packages
+library(maps)
+library(hexbin)
+library(maptools)
+library(ggplot2)
+library(sp)
+library(mapproj)
+
+# download the pirate data shape file, saving it to the working directory
+download.file("http://msi.nga.mil/MSISiteContent/StaticFiles/Files/ASAM_shp.zip", destfile="ASAM_shp.zip")
+
+# unzip the file in the working directory
+unzip("ASAM_shp.zip")
+
+# load the shape file "ASAM 05 DEC 13" as a dataframe called pirates.df (you'll need to change the file name depending on when you download the data)
+pirates.df <- as.data.frame(readShapePoints("ASAM 05 DEC 13"))
+
+# load a map of the world
+world <- map_data("world")
+
+# remove Antarctica
+world <- subset(world, region != "Antarctica")
+
+# create a vector of the end years; one png frame is produced per year
+ends <- 1979:2013
+
+# loop through the years: extract the data, build the plot, save the plot
+
+# for each year in "ends"...
+for (end in ends) {
+  # open a 500x250 png file with a white background; the plot printed below is drawn into it
+  png(filename=sprintf("arrr-%d.png",end),width=500,height=250,bg="white")
+  # keep the attacks dated after 1970-01-01 and before Dec 31 of this year (NOTE(review): assumes DateOfOcc is read in as a Date -- confirm)
+  dec.df <- pirates.df[pirates.df$DateOfOcc > "1970-01-01" & pirates.df$DateOfOcc < as.Date(sprintf("%s-12-31",end)),] 
+  # get the first and last attack dates in the subset (used in the plot title)
+  rng <- range(dec.df$DateOfOcc)
+  # create an empty ggplot
+  p <- ggplot() 
+  # draw a polygon of the world
+  p <- p + geom_polygon(data=world, aes(x=long, y=lat, group=group), fill="gray40", colour="white")
+  # bin the event data (dec.df) into a hexagon grid, summarising each hexagon with length() (the number of attacks in it)
+  p <- p + stat_summary_hex(fun="length", data=dec.df, aes(x=coords.x1, y=coords.x2, z=coords.x2), alpha=0.8)
+  # fill the hexagons on a white-to-red gradient and title the legend
+  p <- p + scale_fill_gradient(low="white", high="red", "Pirate Attacks recorded")
+  # make the plot's theme black and white, blank the axis labels, and put the date range in the title
+  p <- p + theme_bw() + labs(x="",y="", title=sprintf("Pirate Attacks From %s to %s",rng[1],rng[2]))
+  # give the panel a coloured background with a white border
+  p <- p + theme(panel.background = element_rect(fill='#A6BDDB', colour='white'))
+  # print the plot, which draws it into the open png file
+  print(p)
+  # close the png device so it is ready for the next plot
+  dev.off()
+}
+
+# run a terminal command to turn the pngs created in the working directory (matched by the arrr*g pattern) into one animated gif (requires imagemagick)
+system("convert -delay 45 -loop 0 arrr*g arrr500.gif")
diff --git a/check-if-variable-exists.r b/check-if-variable-exists.r
@@ -0,0 +1,17 @@
+# Check To See If A Variable Exists
+# original source: http://www.r-bloggers.com/check-if-a-variable-exists-in-r/
+
+# create a dataframe with four columns (x, y, z, a) of simulated values, then remove the standalone vectors
+x <- runif(1000)
+y <- runif(1000)
+z <- runif(1000)
+a <- runif(1000)
+data <- data.frame(x, y, z, a)
+rm(x, y, z, a)
+
+# does a variable called "x" exist in the object "data"? (checks the names of data)
+"x" %in% names(data)
+
+# does a column called "x" exist in the object "data"? (checks the column names of data)
+"x" %in% colnames(data)
+
diff --git a/cleaning-data-example.r b/cleaning-data-example.r
@@ -1,11 +1,17 @@
 # Cleaing Up OKF's Dirty London Transport Data
 
+# load the stringr package
+library(stringr)
+
 # load the csv file into R
 lon.df <- read.csv("http://data.london.gov.uk/datafiles/transport/tfl_passengers.csv", header=TRUE)
 
 # view the top few lines of the dataset
 head(lon.df)
 
+# view the bottom few lines of the dataset
+tail(lon.df)
+
 # look at the structure of the dataset, take careful note of the class of the columns
 str(lon.df)
 
diff --git a/crosstabs.r b/crosstabs.r
@@ -0,0 +1,13 @@
+# Crosstabs
+
+# create some simulated disaster data (event is printed to check it before it is moved into the dataframe)
+event <- c("flood","fire","flood","fire","riot","flood","riot","riot","flood"); event
+location <- c("africa", "asia", "europe","africa", "asia", "europe","africa", "asia", "europe")
+disasters <- data.frame(event, location)
+rm(event, location)
+
+# create a dataframe holding the frequency counts of the different types of disaster events, then print it
+event.counts.df <- as.data.frame(table(disasters$event));event.counts.df
+
+# create a crosstab of event types (rows) by location (columns), then print it
+disaster.crosstab <- table(disasters$event, disasters$location); disaster.crosstab
diff --git a/download-and-unzip-data.r b/download-and-unzip-data.r
@@ -0,0 +1,13 @@
+# Download And Unzip Data
+
+# download a zip file containing broadband data, save it to the working directory
+download.file("http://www2.ntia.doc.gov/files/broadband-data/AK-NBM-CSV-Dec-2012.zip", destfile="AK-NBM-CSV-Dec-2012.zip")
+
+# unzip the downloaded file into the working directory
+unzip("AK-NBM-CSV-Dec-2012.zip")
+
+# unzip the zip file that was inside the first zip file
+unzip("AK-NBM-WIRELESS-CSV-Dec-2012.zip")
+
+# read the data into R, with "|" separating values (NOTE(review): this file name's letter case differs from the zip above -- confirm it matches the extracted file)
+data <- read.delim("AK-NBM-Wireless-CSV-DEC-2012.TXT", sep = "|")
diff --git a/expand-counts-into-dataframe.r b/expand-counts-into-dataframe.r
@@ -0,0 +1,18 @@
+# Expand A Table Of Counts Into A Dataframe
+# Original source: The R Book
+
+# create a dataframe of simulated values, where count is how many times each sex/nationality combination occurs
+count <- c(2, 3, 4, 1)
+sex <- c("male", "female", "male", "female")
+nationality <- c("USA", "USA", "UK", "UK")
+data.df <- data.frame(count, sex, nationality)
+rm(count, sex, nationality)
+
+# for every column, repeat each value the number of times given in data.df$count (the result is a list of expanded columns)
+data.expand <- lapply(data.df,function(x)rep(x, data.df$count))
+
+# convert the list of expanded columns to a data frame
+data.expand.df <- as.data.frame(data.expand)
+
+# remove the no-longer-needed count column (the first column), then print the result
+data.expand.df <- data.expand.df[,-1]; data.expand.df
diff --git a/file-paths-2.r b/file-paths-2.r
diff --git a/find-and-replace.r b/find-and-replace.r
@@ -0,0 +1,14 @@
+# Find And Replace
+# Original source: http://christophergandrud.blogspot.com/2013/12/three-quick-and-simple-data-cleaning.html
+
+# load the DataCombine package
+library(DataCombine)
+
+# create a dataframe of simulated values
+data.df <- data.frame(cities = c("London, UK", "Oxford, UK", "Berlin, DE", "Hamburg, DE", "Oslo, NO"), score = c(8, 0.1, 3, 2, 1))
+
+# create a dataframe of two columns, one (short) with the strings to be replaced and the other (long) with their replacements
+replace.values <- data.frame(short = c("UK", "DE"), long = c("England", "Germany"))
+
+# find and replace the strings in the cities column, then print the result (NOTE(review): confirm whether FindReplace's `exact` default allows matching "UK" inside "London, UK")
+data.longnames.df <- FindReplace(data = data.df, Var = "cities", replaceData = replace.values, from = "short", to = "long"); data.longnames.df
diff --git a/for-loop.r b/for-loop.r
@@ -0,0 +1,18 @@
+# For Loops
+# original source: the r book
+
+# create a dataframe with simulated values
+x <- runif(1000)
+y <- runif(1000)
+z <- runif(1000)
+a <- runif(1000)
+data <- data.frame(x, y, z, a)
+rm(x, y, z, a)
+
+# preallocate a numeric vector, one slot per row of data, to place the results of the for loop in
+data.altered <- numeric(nrow(data))
+
+# for each row of data, square that row's x and put the value in the matching slot of data.altered
+for (i in seq_len(nrow(data))) {
+  data.altered[i] <- data$x[i]^2
+}
diff --git a/list-indexing.r b/list-indexing.r
@@ -0,0 +1,21 @@
+# Indexing Lists
+# Source: The R Book
+
+# create a list with three elements: a vector of simulated scores, a dataframe of state names, and a vector of letters
+score <- runif(100)
+states.df <- data.frame(state1 = state.name[1:10], state2 = state.name[11:20], state3 = state.name[21:30])
+name <- letters[1:20]
+data.ls <- list(score, states.df, name)
+rm(score, states.df, name)
+
+# view the list
+data.ls
+
+# select the 1st list element (the scores)
+data.ls[[1]]
+
+# select the 1st list element, then select its 2nd value
+data.ls[[1]][2]
+
+# select the 2nd list element (the dataframe), then select its value in the 3rd row and 1st column
+data.ls[[2]][3,1]
diff --git a/normality-test.r b/normality-test.r
@@ -0,0 +1,29 @@
+# Normality Test
+
+# One simple check for normality is a quantile-quantile plot. It plots the sample's quantiles against a set of quantiles taken from a normal distribution.
+
+# If the points follow the line drawn, they are roughly normally distributed. If the points form an S-shape or another shape, they are not normally distributed.
+
+# create simulated data that is not normal (drawn from a uniform distribution)
+y <- runif(1000)
+
+# create simulated data that is normal
+y.norm <- rnorm(1000)
+
+# create a qq-plot for the non-normal data, with a dashed reference line
+qqnorm(y)
+qqline(y,lty=2)
+
+# create a qq-plot for the normal data, with a dashed reference line
+qqnorm(y.norm)
+qqline(y.norm,lty=2)
+
+# Shapiro test
+
+# The null hypothesis is that the data are normally distributed. A large p-value means that we cannot reject the null hypothesis (that the data are normally distributed).
+
+# shapiro test on the non-normal data (we expect a small p-value, so we can reject the null that the data are normally distributed)
+shapiro.test(y)
+
+# shapiro test on the normal data (we expect a large p-value, so we cannot reject the null hypothesis that the data are normally distributed)
+shapiro.test(y.norm)
diff --git a/proportion-table.r b/proportion-table.r
@@ -0,0 +1,11 @@
+# Proportion Table
+# Original source: The R Book
+
+# create a 4-row matrix of 20 simulated counts, then print it (TRUE is spelled out because T can be reassigned)
+counts <- matrix(sample(1:100, 20, replace=TRUE), nrow=4); counts
+
+# calculate each cell's proportion of its row's total count (margin 1 = rows)
+prop.table(counts,1)
+
+# calculate each cell's proportion of its column's total count (margin 2 = columns)
+prop.table(counts,2)
diff --git a/remove-duplicate-rows.r b/remove-duplicate-rows.r
@@ -0,0 +1,16 @@
+# Remove Duplicate Rows
+# original source: the r book
+
+# create a dataframe with simulated values, where some rows repeat earlier rows
+x <- c(1,2,3,1,2,2)
+y <- c(1,6,3,1,2,2)
+z <- c(1,2,3,1,2,2)
+a <- c(1,5,6,1,2,2)
+data <- data.frame(x, y, z, a)
+rm(x, y, z, a)
+
+# find the rows that repeat an earlier row (the first occurrence of a row is not flagged), then print them
+duplicates <- data[duplicated(data),];duplicates
+
+# keep one copy of each distinct row, then print the result
+not.duplicates <- unique(data);not.duplicates
diff --git a/select-rows-by-logical-test.r b/select-rows-by-logical-test.r
@@ -13,4 +13,4 @@ rm(x, y, z, a)
 data[data$y > data$x,]
 
 # select all rows where y IS NOT greater than x
-data[!(data$y > data$x),]
+data[!(data$y > data$x),]
diff --git a/srs-of-rows.r b/srs-of-rows.r
@@ -7,6 +7,7 @@ z <- runif(1000)
 a <- runif(1000)
 df <- data.frame(x, y, z, a)
 
+# create a vector of weights
 w <- runif(1000)
 
 # sample 10 rows of the dataframe at pseudorandom, without replacement
diff --git a/sweep-over-a-df.r b/sweep-over-a-df.r
@@ -0,0 +1,13 @@
+# Sweep Over A Dataframe
+# original source: the r book
+
+# create a dataframe with four columns of simulated values
+x <- runif(1000)
+y <- runif(1000)
+z <- runif(1000)
+a <- runif(1000)
+data <- data.frame(x, y, z, a)
+rm(x, y, z, a)
+
+# Add 10 to the first column, 20 to the second column, 30 to the third column, 40 to the fourth column. The "2" is the margin, meaning the values are applied across columns
+sweep(data, 2, c(10, 20, 30, 40), "+")
diff --git a/truncate-a-string.r b/truncate-a-string.r
@@ -0,0 +1,9 @@
+# Truncate A String
+# Source: http://www.r-bloggers.com/truncate-by-delimiter-in-r/
+
+# create some simulated data, with names stored as "lastname/firstname"
+patients <- data.frame(
+  uid = 1:3,
+  fullname = c("Smith/John", "Jackson/Smith", "Joel/Billy"))
+# delete everything from the first "/" onward, keeping only the text before it, then print the result
+patients$lastname <- sub("/.*", "", patients$fullname); patients$lastname