Skip to content

Commit 9d878d2

Browse files
committed
Adding Testing capabilities, README, and fixed authentication code problem
2 parents 26b79c8 + 1bd5b4c commit 9d878d2

9 files changed

+171
-0
lines changed

.gitignore

+1
Original file line numberDiff line numberDiff line change
@@ -2,3 +2,4 @@
22
*tweets*
33
*.pyc
44
*.swp
5+
*.json

README.md

+26
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
# tweet_scraper
2+
This project can be used to scrape tweets and append them to previously scraped tweets from the same user.
3+
I built this so I could accumulate Twitter data and train an RNN on it, so that the RNN can post to Twitter itself.
4+
5+
The RNN I am using is also open source and can be found here https://github.com/jcjohnson/torch-rnn
6+
7+
Usage
8+
9+
```
10+
python getTweets.py screen_name amount [filename]
11+
```
12+
13+
The filename is an optional argument to specify which file you want to write the tweets to
14+
The filename can be used to append to files
15+
16+
The ID of the last tweet scraped is kept in a file called lastTweetScraped.txt, which will be created if it does not exist in the current directory.
17+
18+
TODO's:
19+
20+
Add specificity for arguments
21+
-f file
22+
-af append to file
23+
-stop Scrape only new tweets, i.e. all tweets up to the last tweet scraped, as specified in the lastTweetScraped.txt file
24+
25+
Maybe scrape just a range
26+
Better File I/O functionality

testingFiles/TestsssTom.txt

+20
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
Test9
2+
Test8
3+
Test7
4+
Test6
5+
Test5
6+
Test4
7+
Test3
8+
Test2
9+
Test1
10+
Test0
11+
Test19
12+
Test18
13+
Test17
14+
Test16
15+
Test15
16+
Test14
17+
Test13
18+
Test12
19+
Test11
20+
Test10

testingFiles/TestsssTomVerify1.txt

+10
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
Test9
2+
Test8
3+
Test7
4+
Test6
5+
Test5
6+
Test4
7+
Test3
8+
Test2
9+
Test1
10+
Test0

testingFiles/TestsssTomVerify2.txt

+20
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
Test9
2+
Test8
3+
Test7
4+
Test6
5+
Test5
6+
Test4
7+
Test3
8+
Test2
9+
Test1
10+
Test0
11+
Test19
12+
Test18
13+
Test17
14+
Test16
15+
Test15
16+
Test14
17+
Test13
18+
Test12
19+
Test11
20+
Test10

testingFiles/testing1.txt

+3
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
Test1
2+
Test2
3+
Test3

testingFiles/testing2.txt

+4
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
Test1
2+
Test2
3+
Test3
4+
Test4

testingFiles/testing3.txt

+3
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
Test1
2+
Test2
3+
Test3

tests.py

+84
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
import tweepy #https://github.com/tweepy/tweepy
2+
import os
3+
import filecmp
4+
import tweet_dumper
5+
import json
6+
7+
# For twitter account @TestsssTom
8+
# Password to account is "saymanyougottajoint"
9+
# key management
10+
# https://apps.twitter.com/app/13524605/keys
11+
12+
# looks for json file with the proper credentials for the user
13+
# {
14+
# "consumer_key" : "value",
15+
# "consumer_secret" : "value",
16+
# "access_token" : "value",
17+
# "access_token_secret" : "value"
18+
# }
19+
def get_credentials(filename):
    """Load Twitter API credentials from a JSON file.

    The file is expected to hold one JSON object with the keys
    ``consumer_key``, ``consumer_secret``, ``access_token`` and
    ``access_token_secret`` (the keys that ``get_api`` reads).

    Args:
        filename: Path of the JSON credentials file.

    Returns:
        The parsed JSON content; a dict for a well-formed credentials file.

    Raises:
        IOError / OSError: If the file cannot be opened.
        ValueError: If the file does not contain valid JSON.
    """
    # ``file`` is a builtin in Python 2, so bind the handle to a name that
    # does not shadow it.
    with open(filename) as cred_file:
        return json.load(cred_file)
23+
24+
def get_api(cfg):
    """Build a tweepy API client from a credentials mapping.

    Args:
        cfg: Mapping that provides ``consumer_key``, ``consumer_secret``,
            ``access_token`` and ``access_token_secret``.

    Returns:
        A ``tweepy.API`` object wrapping the configured OAuth handler.

    Raises:
        KeyError: If one of the four credential keys is missing from ``cfg``.
    """
    oauth = tweepy.OAuthHandler(
        cfg['consumer_key'],
        cfg['consumer_secret'],
    )
    oauth.set_access_token(
        cfg['access_token'],
        cfg['access_token_secret'],
    )
    client = tweepy.API(oauth)
    return client
28+
29+
30+
def batch_delete(api):
    """Delete every status yielded by ``api.user_timeline``.

    Deletion is best-effort: a status that cannot be destroyed is reported
    on stdout together with the error, and the loop moves on to the next one.

    Args:
        api: A tweepy API client, e.g. the object returned by ``get_api``.
            Presumably authenticated as the account whose tweets should be
            removed -- confirm against the tweepy ``user_timeline`` docs.
    """
    for status in tweepy.Cursor(api.user_timeline).items():
        try:
            api.destroy_status(status.id)
        except Exception as exc:
            # A bare ``except:`` would also trap KeyboardInterrupt and
            # SystemExit, making the run impossible to abort; catch only
            # ordinary errors and report why the deletion failed.
            print("Failed to delete: %s (%s)" % (status.id, exc))
36+
37+
def batch_tweet(api, startid, count=10):
    """Post a run of numbered test tweets: "Test<startid>", "Test<startid+1>", ...

    Args:
        api: Object exposing ``update_status(text)`` (a tweepy API client).
        startid: Number appended to the first tweet of the batch.
        count: How many consecutive tweets to post. Defaults to 10, the
            batch size that was previously hard-coded.
    """
    prefix = "Test"
    for number in range(startid, startid + count):
        api.update_status(prefix + str(number))
41+
42+
def batch_tweet_from_file(api, filename):
    """Post each line of a text file as a status, last line first.

    Args:
        api: Object exposing ``update_status(text)`` (a tweepy API client).
        filename: Path of a text file holding one tweet per line.

    Raises:
        IOError / OSError: If the file cannot be opened.
    """
    # Read everything up front inside ``with`` so the handle is closed before
    # any network call is made (the original never closed the file).
    with open(filename) as source:
        tweets = [line.rstrip("\r\n") for line in source]

    for text in reversed(tweets):
        # Blank lines carry no tweet text; skip them instead of posting an
        # empty status.
        if not text.strip():
            continue
        api.update_status(text)
45+
46+
47+
def compare(file1, file2):
    """Return True when the two files have identical contents, else False.

    Args:
        file1: Path of the first file.
        file2: Path of the second file.

    Returns:
        True if the files are byte-for-byte equal, False otherwise.

    Raises:
        OSError: If either file cannot be read.
    """
    # shallow=False forces a real content comparison; the default shallow
    # mode treats files with matching os.stat() signatures (type, size,
    # mtime) as equal without reading them.
    return filecmp.cmp(file1, file2, shallow=False)
50+
51+
52+
if __name__ == "__main__":
    # End-to-end check against the live @TestsssTom account: wipe the
    # timeline, post two batches of ten numbered tweets, scrape after each
    # batch and compare the scraped file with the expected fixture.
    username = "TestsssTom"

    # The fixture directory is committed as "testingFiles"; the previous
    # "TestingFiles" spelling only resolves on case-insensitive filesystems.
    testing_dir = "testingFiles"
    scraped_file = os.path.join(testing_dir, username + ".txt")
    verify_first = os.path.join(testing_dir, username + "Verify1.txt")
    verify_second = os.path.join(testing_dir, username + "Verify2.txt")
    last_tweet_file = "lastTweetScraped.txt"
    # Passed straight through to tweet_dumper; presumably the maximum number
    # of tweets to fetch -- confirm against tweet_dumper.get_all_tweets5args.
    amount = 3240

    cfg = get_credentials(username + ".json")
    api = get_api(cfg)

    # Destroy all tweets on the account and drop any previous scrape output.
    batch_delete(api)
    # The output file is absent on a fresh checkout or after a failed run, so
    # only remove it when it is actually there.
    if os.path.exists(scraped_file):
        os.remove(scraped_file)

    print("Deleted all Tweets")
    batch_tweet(api, 0)
    print("First set of tweets printed")

    # Scrape the tweets and match them against the expected first batch.
    tweet_dumper.get_all_tweets5args(username, api, amount, scraped_file, last_tweet_file)
    firstPull = compare(scraped_file, verify_first)
    if not firstPull:
        print("scraped tweets do not match as expected")
    else:
        print("scraped tweets match desired output")

    # Batch tweet 10 more, scrape again, and check the file holds both batches.
    batch_tweet(api, 10)
    tweet_dumper.get_all_tweets5args(username, api, amount, scraped_file, last_tweet_file)
    fileappend = compare(scraped_file, verify_second)
    if not fileappend:
        print("appending new tweets not working")
    else:
        print("append tweets working")

    # Delete all the tweets again so the account is left empty.
    batch_delete(api)
83+
84+

0 commit comments

Comments
 (0)