-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrss_collector.R
48 lines (42 loc) · 1.28 KB
/
rss_collector.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
# RSS to SQLite feed downloader
# Start from a clean workspace: remove every object, including hidden
# dot-prefixed ones, from the current environment.
# NOTE: when this file is source()d into an interactive session this also
# deletes the user's own objects; prefer running it with Rscript.
rm(list = ls(all.names = TRUE))
library("tidyRSS")
library("RSQLite")
library("digest")
library("dplyr")
library("lgr")
# Startup banner.
lgr$info("RSS2SQLite R language edition")

# Path of the SQLite database file.
dbfile <- "feeds.db"

# Every line of feed_urls.txt becomes one list element (one feed per line).
feeds <- as.list(readLines("feed_urls.txt"))

# Metadata columns that are dropped from a parsed feed.
remove_cols <- c(
  "feed_category", "item_enclosure", "item_category", "entry_category"
)
#' Download one feed and append its entries to the SQLite database.
#'
#' The feed is parsed with `tidyfeed()`, the columns named in the global
#' `remove_cols` are dropped, and the remaining rows are appended to a table
#' whose name is the MD5 hash of the feed URL. The write goes through the
#' global DBI connection `conn`, which must be open when this is called.
#'
#' @param feed_url Feed URL; coerced to a single string with `toString()`.
#' @return The value returned by `dbWriteTable()`.
run_feed <- function(feed_url) {
  url <- toString(feed_url)
  parsed_feed <- tidyfeed(
    url,
    clean_tags = TRUE,
    parse_dates = TRUE,
    list = FALSE
  )
  # Drop the unwanted metadata columns; plain indexing avoids evaluating the
  # selection expression among the feed's own column names.
  keep <- !(names(parsed_feed) %in% remove_cols)
  parsed_feed <- parsed_feed[, keep, drop = FALSE]
  # Hash this function's own argument (not a caller-side global) so the table
  # name always corresponds to the feed that was actually downloaded.
  table_hash <- digest(url, "md5", serialize = FALSE)
  # Create the table on first use, append on later runs.
  dbWriteTable(
    conn,
    table_hash,
    parsed_feed,
    append = TRUE,
    row.names = TRUE
  )
}
# Entry point: open the database, process every feed, then disconnect.
conn <- dbConnect(RSQLite::SQLite(), dbfile)
tryCatch(
  {
    # NOTE: the loop variable must stay named `feed` unless run_feed() is
    # confirmed not to read a global of that name.
    for (feed in feeds) {
      # Skip blank entries instead of attempting a download that must fail.
      if (!nzchar(trimws(feed))) {
        next
      }
      lgr$info(feed)
      # A failing feed is logged and skipped so the remaining feeds still run.
      tryCatch(
        run_feed(feed),
        error = function(e) {
          lgr$error(
            paste0("Failed to process feed ", feed, ": ", conditionMessage(e))
          )
        }
      )
    }
  },
  # Close the connection even if the loop is interrupted or fails.
  finally = dbDisconnect(conn)
)
lgr$info("DONE")