33import os
44import re
55
6+
7+ # update the id of the last tweet stored in the given filename
8+ # if the file doesn't exist, one is created
69def updateLastTweet (filename , screen_name , newLastTweet ):
710 tmpLine = ""
811 newData = ""
@@ -22,6 +25,8 @@ def updateLastTweet(filename, screen_name, newLastTweet):
2225 os .remove (filename )
2326 os .rename (filename + ".swp" , filename )
2427
28+ # get the id of the most recent tweet recorded from the file
29+ # specified by filename
2530def getMostRecentTweet (filename , screen_name ):
2631 if not os .path .isfile (filename ):
2732 newFile = open (filename , 'w' )
@@ -59,11 +64,12 @@ def writeTweets(filename, mode, tweets):
5964 f .close ()
6065
6166def get_all_tweets5args (screen_name , api , amount , filename , lastTweetScrapedFile ):
62- #Twitter only allows access to a users most recent 3240 tweets with this method
6367 #initialize a list to hold all the tweepy Tweets
6468 alltweets = []
6569
66- # make request for most recent tweets (200 is the maximum allowed count)
70+ # make request for most recent tweets
71+ # you can only pull 200 tweets from twitter at a time,
72+ # hence count=200
6773 new_tweets = api .user_timeline (screen_name = screen_name ,count = 200 , include_rts = True )
6874 numTweets = len (new_tweets )
6975 MostRecentTweetPulled = getMostRecentTweet (lastTweetScrapedFile , screen_name )
@@ -72,13 +78,16 @@ def get_all_tweets5args(screen_name, api, amount, filename, lastTweetScrapedFile
7278 # record most recent tweet id.
7379 # The id of the last tweet the user tweeted.
7480 if len (new_tweets ) > 0 :
81+ # most recent tweet will replace MostRecentTweetPulled
82+ # after all new tweets are scraped
7583 most_recent_tweet = new_tweets [0 ].id
7684 else :
7785 most_recent_tweet = 0
7886 # loop over first request and keep requesting until amount
7987 # is reached or the 3240 threshold is reached
8088 while len (new_tweets ) > 0 :
8189 for tweet in new_tweets :
90+ # break if you see the id of a tweet you have scraped
8291 if int (MostRecentTweetPulled ) >= int (tweet .id ):
8392 print "Most Recent Tweet Pulled =" , MostRecentTweetPulled
8493 print "tweet.id = " , tweet .id
@@ -95,7 +104,7 @@ def get_all_tweets5args(screen_name, api, amount, filename, lastTweetScrapedFile
95104 strippedTweet += character
96105 tweet = strippedTweet
97106 # strip URLS, Ampersands, retweets, and newlines
98- tweet = re .sub (r'(?:www|https?)[^\s]+' , '' , tweet , flags = re .MULTILINE )
107+ tweet = re .sub (r'(?:www|https?)[^\s]+\s ' , '' , tweet , flags = re .MULTILINE )
99108 tweet = re .sub (r'&' , '&' , tweet , flags = re .MULTILINE )
100109 tweet = re .sub (r'^RT.*:+ ' , '' , tweet , flags = re .MULTILINE )
101110 tweet = tweet .replace ('\n ' , ' ' )
0 commit comments