# tutorial06_script.R
# R introduction to Twitter scraping
# Social Computing - The University of Melbourne
# Created August 2018 - Niels van Berkel
# Install required packages for scraping Twitter data
# install.packages("twitteR")
library("twitteR")
# Authenticate with Twitter. I have set up a developer account that can be used for this subject only.
# Create your own account at developer.twitter.com if you are interested in more serious applications.
# Please do not test the limits of the API, as that can result in the account being shut down.
# The same goes for scraping 'questionable' content; use your own account for that if interested.
# The lecturer will provide you with the letters to complete the secret strings.
lecture_secret_1 <- ""
lecture_secret_2 <- ""
account_key <- "g4nlVMLIYueBZAz45770FpoNh"
account_secret <- paste("teLnox4GoVXheXhtfhBBE8rsNf6VqnvesG9Lk7rYsnXhRUaLJ", lecture_secret_1, sep = "")
access_token <- "1022637363845951503-O9aXVu2eUrClQCR5Y8RSBaAZRsiD8o"
access_secret <- paste("wzoU6gdgShcESiFdcBjNRGjR10ixwkbUWQBitSMpEaUW", lecture_secret_2, sep = "")
setup_twitter_oauth(account_key, account_secret, access_token, access_secret)
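# Optional sanity check, a minimal sketch: fetching a public profile should
# succeed once the OAuth handshake has worked. "TwitterDev" is just an
# example handle.
getUser("TwitterDev")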
# Answer the caching prompt that appears in the console with either option; it does not matter at this point.
# Scraping of tweets by hashtag / topic:
tweets <- searchTwitter("#studentlife", n = 100, lang = "en", resultType = "recent")
# Convert collected tweets to a dataframe for easy access
tweets <- twListToDF(tweets)
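# Besides hashtag search, twitteR can also pull a single account's timeline.
# A minimal sketch; "TwitterDev" is just an example handle and n = 50 an
# arbitrary limit:
user_tweets <- twListToDF(userTimeline("TwitterDev", n = 50))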
# Explore the data object: what information does a tweet contain other than the text?
tweets
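# A quick way to see what else a tweet carries (timestamps, screen names,
# retweet counts, etc.) without printing every row:
names(tweets)
str(tweets)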
# Remove URL information from the tweet text
tweets$stripped_text <- gsub("http.*", "", tweets$text)
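# Note that "http.*" is greedy: it also deletes any text that follows the
# first URL in a tweet. A gentler alternative (a sketch, not the original
# approach) removes only the URL itself by stopping at the next whitespace:
# tweets$stripped_text <- gsub("http[^[:space:]]*", "", tweets$text)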
# Let's use the tidytext library to 'clean up' our tweets: it removes punctuation, capitals, etc.
# install.packages("tidytext")
library(tidytext)
library(dplyr)
library(ggplot2)
tweets_cleaned <- tweets %>%
  dplyr::select(stripped_text) %>%
  unnest_tokens(word, stripped_text)
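# unnest_tokens can also emit n-grams instead of single words. A minimal
# sketch for two-word phrases (not used further below):
tweet_bigrams <- tweets %>%
  dplyr::select(stripped_text) %>%
  unnest_tokens(bigram, stripped_text, token = "ngrams", n = 2)
head(tweet_bigrams)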
# plot the top 15 words -- notice any issues?
tweets_cleaned %>%
  count(word, sort = TRUE) %>%
  top_n(15) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n)) +
  geom_col() +
  coord_flip() +
  labs(x = "Unique words",
       y = "Count",
       title = "Count of unique words found in tweets")
data("stop_words")
head(stop_words)
# Add custom words to the stop words. For example 'rt' shows up a lot. Guess why?
str(stop_words)
df <- data.frame(word = "rt", lexicon = "SMART")
stop_words <- rbind(stop_words, df)
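# The same pattern scales to several words at once. "amp" (from the HTML
# entity &amp;) is another common Twitter artefact, and the search hashtag
# itself would otherwise dominate the counts. A sketch:
extra_stops <- data.frame(word = c("amp", "studentlife"), lexicon = "custom")
stop_words <- rbind(stop_words, extra_stops)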
nrow(tweets_cleaned)
# remove stop words from your list of words
tweets_cleaned <- tweets_cleaned %>%
  anti_join(stop_words, by = "word")
nrow(tweets_cleaned)
# plot again
tweets_cleaned %>%
  count(word, sort = TRUE) %>%
  top_n(15) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n)) +
  geom_col() +
  coord_flip() +
  labs(x = "Unique words",
       y = "Count",
       title = "Count of unique words found in tweets")
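# To keep a copy of the figure, ggplot2 can write the most recent plot to
# disk. The file name and dimensions here are just examples:
ggsave("tweet_word_counts.png", width = 6, height = 4)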
# Load dictionary with -5 to 5 sentiment score assigned to each word
afinn <- get_sentiments("afinn")
# Load dictionary with positive / negative sentiment words
bing <- get_sentiments("bing")
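# bing is loaded above but not used below; a quick sketch of what it offers,
# a rough positive/negative tally over the cleaned words:
tweets_cleaned %>%
  inner_join(bing, by = "word") %>%
  count(sentiment)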
sentiment_tweets <- tweets_cleaned %>%
  inner_join(afinn, by = "word") %>%
  count(word, sort = TRUE)
sentiment_tweets <- merge(sentiment_tweets, afinn, by = "word")
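# With the AFINN scores attached, a crude net-sentiment figure is the
# score-weighted sum of word counts. Caveat: recent tidytext versions name
# the score column "value"; older ones (around when this script was written)
# used "score". Adjust accordingly:
sum(sentiment_tweets$n * sentiment_tweets$value)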
# install.packages("wordcloud")
library(wordcloud)
tweets_cleaned %>%
  count(word) %>%
  with(wordcloud(word, n, max.words = 1000))
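# wordcloud can color words by frequency using RColorBrewer palettes (the
# wordcloud package depends on RColorBrewer). A minimal sketch:
library(RColorBrewer)
tweets_cleaned %>%
  count(word) %>%
  with(wordcloud(word, n, max.words = 100, colors = brewer.pal(8, "Dark2")))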