diff --git a/CHANGES.txt b/CHANGES.txt new file mode 100644 index 0000000..6a6a1cf --- /dev/null +++ b/CHANGES.txt @@ -0,0 +1,9 @@ +v0.1.0, 01 Jan 2013 -- Initial release. + +v0.1.1, 08 Jan 2013 -- Added cache for Google geocoder + +v0.1.2, 20 Jan 2013 -- Improved exception handling for less fragile connections to twitter.com. + Streaming 'location' option now uses the API's 'locations' parameter. + Dynamic throttling of Google geocode requests to not exceed rate limit. + +v1.0.0, 30 Jan 2013 -- Uploaded to github. \ No newline at end of file diff --git a/MANIFEST b/MANIFEST new file mode 100644 index 0000000..55d347b --- /dev/null +++ b/MANIFEST @@ -0,0 +1,11 @@ +# file GENERATED by distutils, do NOT edit +CHANGES.txt +README.txt +setup.py +twittergeo/Geocoder.py +twittergeo/SearchGeo.py +twittergeo/SearchPics.py +twittergeo/StreamGeo.py +twittergeo/StreamPics.py +twittergeo/__init__.py +twittergeo/credentials.txt diff --git a/README.md b/README.md index bb43524..71deaeb 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,73 @@ -TwitterGeo -========== +### UNDER DEVELOPMENT ### -Scripts for geocoding tweets and for downloading images embedded in tweets. \ No newline at end of file +_Expected release date is sometime in early February 2013_ + +--- + +### TwitterGeo ### + +_Scripts for geocoding tweets and for downloading images embedded in tweets._ + +TwitterGeo contains command line scripts for geocoding tweets and extracting embedded images from tweets from twitter.com. The scripts take one or more search words as command line arguments. The scripts download old tweets using Twitter's REST API and download new tweets using Twitter's Streaming API. + +About 1% or 2% of tweets contain latitude and longitude. Of those tweets that do not contain coordinate data, about 60% have the user's profile location, a descriptive text field that may or may not be accurate.
Using Google's Maps API, we can geocode these tweets, which locates about half of all tweets, a portion of which are suspect. + +Use the -location option to restrict searches to a geographic location. Twitter returns tweets that either contain a geocode in the specified region or come from users whose profile location is in that region. + +Google does not require authentication, but it does enforce a limit of about 2,500 requests per day and about 10 requests per second. + +The Twitter APIs require OAuth credentials, which you can get by creating an application on dev.twitter.com. Once you have your OAuth secrets and keys, copy them into twittergeo/credentials.txt. Alternatively, specify the credentials file on the command line. + +Twitter restricts searching old tweets to within roughly the past week. Twitter also places a bandwidth limit on searching current tweets, but you will notice this only when you are searching a popular word. When this limit is reached, the total number of skipped tweets is printed and the connection is maintained. + +# Features # + +*The following modules run as command line scripts and write tweets to the console.* + +### SearchGeo ### + +Prints old tweets and their location information and coordinates when possible. + +### StreamGeo ### + +Prints new tweets and their location information and coordinates when possible. + +### SearchPics ### + +Prints old tweets, their coordinates and URLs of any embedded photos. To download the photos use the -photo_dir option. To get tweets only from a specific geographic region use the -location option. + +### StreamPics ### + +Prints new tweets, their coordinates and URLs of any embedded photos. To download the photos use the -photo_dir option. To get tweets only from a specific geographic region use the -location option. + +*These are utility modules.* + +### Geocoder ### + +A wrapper for pygeocoder that provides a few Twitter helper methods.
It adds throttling to respect Google's daily quota and rate limit. It also provides a caching mechanism for storing geocode lookups to a text file. The caching is only partially effective because users can enter their location in any format. + +# Installation # + + +1. On a command line, type: + +`pip install twittergeo` + +2. Either copy your OAuth consumer secret and key and your access token secret and key into twittergeo/credentials.txt, or copy them into another file which you will specify on the command line. See credentials.txt for the expected file format. + +3. Then run a script with the '-m' option, for example: + +`python -m twittergeo.StreamGeo -words zzz` +`python -m twittergeo.StreamGeo -words zzz -oauth ./my_credentials.txt` + +# External Dependencies # + +This package uses the following external packages. + +* twitterapi - for downloading tweets +* pygeocoder - for geo-referencing using Google's Maps service +* fridge - for caching latitudes and longitudes in a persistent dict + +# Contributors # + +Jonas Geduldig \ No newline at end of file diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..09beea8 --- /dev/null +++ b/setup.py @@ -0,0 +1,17 @@ +from distutils.core import setup + +setup( + name='TwitterGeo', + version='1.0.0', + author='Jonas Geduldig', + author_email='boxnumber03@gmail.com', + packages=['twittergeo'], + package_data={'': ['credentials.txt']}, + url='https://github.com/geduldig/twittergeo', + download_url = 'https://github.com/geduldig/twittergeo/tarball/1.0.0', + license='MIT', + keywords='twitter', + description='Command line scripts for geocoding old and new tweets from twitter.com and for downloading embedded photos.', + long_description=open('README.txt').read(), + install_requires = ['twitterapi', 'pygeocoder', 'Fridge'] +) \ No newline at end of file diff --git a/twittergeo/Geocoder.py b/twittergeo/Geocoder.py new file mode 100755 index 0000000..4b6b9f9 --- /dev/null +++ b/twittergeo/Geocoder.py @@ -0,0 +1,329 @@
+__author__ = "Jonas Geduldig" +__date__ = "December 20, 2012" +__license__ = "MIT" + +import datetime +import fridge +import math +import os +import pygeocoder +import socket +import time + +SOCKET_TIMEOUT = 3 # seconds -- need to set a timeout or connection can hang indefinitely +THROTTLE_INCR = .1 # seconds -- the time by which to dynamically increase between successive requests +DEFAULT_CACHE_FILE = 'geocode.cache' + +class Geocoder: + """Wrapper for pygeocoder with Twitter helper methods and Google Maps throttling and caching. + + Google has two geocoding limits: + 1) About 2,500 requests per day + 2) About 10 requests per second + Geocode request results are cached to a local file. + """ + + def __init__(self, cache_file=None): + """Zero counters and open cache file. + + Parameters + ---------- + cache_file : str + File path for cache file. File will get opened for append or created if not found. + If cache_file is not supplied, the default file will be used. + + """ + self.count_request = 0 # total number of geocode requests + self.count_request_ok = 0 # total number of successful geocode requests + self.count_nowhere = 0 # total number of tweets without geocode and without location + self.count_has_geocode = 0 # total number of tweets with embedded lat,lng + self.count_has_location = 0 # total number of tweets with geocode-able location in user profile + + self.quota_exceeded = False # true when Google's geocode request quota is exceeded (2500 per day) + self.quota_exceeded_at = None # date and time when Google's geocode request was first exceeded + + self.retry_count = 0 # retry once to check if request rate should be throttled + self.throttle = THROTTLE_INCR # the throttle in seconds to wait between requests + self.last_exec = None # time updated at each geocode request + + if cache_file == None: + path = os.path.dirname(__file__) + cache_file = os.path.join(path, DEFAULT_CACHE_FILE) + + # cache is a persistent dict with place address as key and lat/lng 
and count as value + self.cache = fridge.Fridge(cache_file) + + def _throttle(self): + """Wait an interval to not exceed rate limit. Called before each geocode request. + + """ + if self.retry_count == 1: + # increase the throttle to respect rate limit + self.retry_count = 2 + self.throttle += THROTTLE_INCR + elif self.retry_count == 2: + # increased throttle was sufficient + self.retry_count = 0 + now = datetime.datetime.now() + if self.last_exec: + # throttle for rate limit + delta = self.throttle - (now - self.last_exec).total_seconds() + if delta > 0: + time.sleep(delta) + self.last_exec = datetime.datetime.now() # stamp after any sleep so the next interval is measured from the actual request time + + def _should_retry(self): + """Handle an OVER QUERY LIMIT exception. Called when GeocoderError is thrown. + + Return + ------ + retry : boolean + True means wait 2 seconds, increase the throttle, and retry the request. + False means stop making geocode requests because daily limit was exceeded. + + """ + if not self.quota_exceeded: + if self.retry_count == 0: + # wait and retry once to see if we exceeded the rate limit + self.retry_count = 1 + time.sleep(2) + return True + else: + # if the second attempt failed we exceeded the 24-hour quota + self.retry_count = 0 + self.quota_exceeded = True + self.quota_exceeded_at = datetime.datetime.now() + return False + else: + return False + + def geocode(self, place): + """Returns Google's geocode data for a place. + + Parameters + ---------- + place : str + An address or partial address in any format. + + Return + ------ + geocode data : dict + Keys and values are from Google's JSON data. + + Raises + ------ + pygeocoder.GeocoderError + Quota exceeded, indecipherable address, etc. + Exception + Socket errors.
+ + """ + self._throttle() + try: + self.count_request += 1 + socket.setdefaulttimeout(SOCKET_TIMEOUT) + data = pygeocoder.Geocoder.geocode(place) + self.count_request_ok += 1 + return data + except pygeocoder.GeocoderError, e: + if e.status == pygeocoder.GeocoderError.G_GEO_OVER_QUERY_LIMIT and self._should_retry(): + return self.geocode(place) + else: + raise + + def latlng_to_address(self, lat, lng): + self._throttle() + try: + self.count_request += 1 + socket.setdefaulttimeout(SOCKET_TIMEOUT) + place = pygeocoder.Geocoder.latlng_to_address(lat, lng) + self.count_request_ok += 1 + return place + except pygeocoder.GeocoderError, e: + if e.status == pygeocoder.GeocoderError.G_GEO_OVER_QUERY_LIMIT and self._should_retry(): + return self.latlng_to_address(lat, lng) + else: + raise + + def address_to_latlng(self, place): + self._throttle() + try: + self.count_request += 1 + socket.setdefaulttimeout(SOCKET_TIMEOUT) + lat, lng = pygeocoder.Geocoder.address_to_latlng(place) + self.count_request_ok += 1 + return lat, lng + except pygeocoder.GeocoderError, e: + if e.status == pygeocoder.GeocoderError.G_GEO_OVER_QUERY_LIMIT and self._should_retry(): + return self.address_to_latlng(place) + else: + raise + + def geocode_tweet(self, status): + """Returns an address and coordinates associated with a tweet. + + Parameters + ---------- + status : dict + Keys and values of a tweet (i.e. a Twitter status). + + Return + ------ + place : str + An address or part of an address from either the tweeter's Twitter profile + or from reverse geocoding coordinates associated with the tweet. + latitude, longitude : float + Coordinates either associated with the tweet or from geocoding the + location in the tweeter's Twitter profile. + + Raises + ------ + See Geocoder.geocode() documentation.
+ + """ + # start off with the location in the user's profile (it may be empty) + place = status['user']['location'] + if status['coordinates'] != None: + # the status is geocoded (swapped lat/lng), so use the coordinates to get the address + lng, lat = status['coordinates']['coordinates'] + place = self.latlng_to_address(float(lat), float(lng)) + self.count_has_geocode += 1 + elif place and ':' in place: + # users may put their coordinates in their profile + # the format is either "iPhone: lat,lng" or "UT: lat,lng" + (tmp, coord) = place.split(':', 1) + coord, lat, lng = coord.strip(), None, None # default to no coordinates when the text after ':' has no separator + if ',' in coord: + lat, lng = coord.strip().split(',', 1) + elif ' ' in coord: + lat, lng = coord.strip().split(' ', 1) + if lat != None and lng != None: + try: + lat, lng = float(lat.strip()), float(lng.strip()) + place = self.latlng_to_address(lat, lng) + self.count_has_location += 1 + except (ValueError, TypeError): + lat, lng = None, None # text after ':' was not numeric, so report no coordinates + elif place != None and place != '': + # there is a location in the user profile, so see if it is usable + # cache key is the place stripped of all punctuation and lower case + key = ' '.join(''.join(e for e in place if e.isalnum() or e == ' ').split()).lower() + cached_data = None + if self.cache != None and key in self.cache: + # see if the place name is in our cache + cached_data = self.cache[key] + lat, lng = cached_data[0], cached_data[1] + cached_data[2] += 1 + if not cached_data: + # see if Google can interpret the location + lat, lng = self.address_to_latlng(place) + cached_data = [ lat, lng, 1 ] # a list, not a tuple, because the hit count is incremented in place on later lookups + if self.cache != None: + self.cache[key] = cached_data + self.count_has_location += 1 + else: + lat, lng = None, None + self.count_nowhere += 1 + return place, lat, lng + + def get_region_box(self, place): + """Get the coordinates of a place and its bounding box. + The size of bounding box that Google returns depends on whether the place is + an address, a town or a country. + + Parameters + ---------- + place : str + An address or partial address in any format.
Google will try anything. + + Return + ------ + latitude, longitude : float + The place's coordinates. + latitude, longitude : float + The place's SW coordinates. + latitude, longitude : float + The place's NE coordinates. + + Raises + ------ + See Geocoder.geocode() documentation. + + """ + geometry = self.geocode(place).raw[0]['geometry'] + bounds = geometry.get('bounds') or geometry['viewport'] # Google omits 'bounds' for some results, so fall back to the viewport + latC, lngC = geometry['location']['lat'], geometry['location']['lng'] + latSW, lngSW = bounds['southwest']['lat'], bounds['southwest']['lng'] + latNE, lngNE = bounds['northeast']['lat'], bounds['northeast']['lng'] + return latC, lngC, latSW, lngSW, latNE, lngNE + + def get_region_circle(self, place): + """Get the coordinates of a place and its bounding circle. + The circle's radius is calculated from Google's bounding box and the + Haversine formula that takes into account the curvature of the earth. + The motivation for this method is Twitter's Search API's 'geocode' + parameter. + + Parameters + ---------- + place : str + An address or partial address in any format. + + Return + ------ + latitude, longitude : float + The place's coordinates. + radius : float + Half the distance spanning the corners of the place's bounding box in kilometers. + + Raises + ------ + See Geocoder.geocode() documentation.
+ + """ + latC, lngC, latSW, lngSW, latNE, lngNE = self.get_region_box(place) + D = self.distance(latSW, lngSW, latNE, lngNE) + return latC, lngC, D/2 + + @classmethod + def distance(cls, lat1, lng1, lat2, lng2): + """Calculates the distance between two points on a sphere + + """ + # Haversine distance formula + lat1, lng1 = math.radians(lat1), math.radians(lng1) + lat2, lng2 = math.radians(lat2), math.radians(lng2) + s = math.sin((lat1-lat2)/2) + t = math.sin((lng1-lng2)/2) + a = s*s + math.cos(lat2)*math.cos(lat1)*t*t + c = 2*math.atan2(math.sqrt(a), math.sqrt(1-a)) + earth_radius = 6371 # kilometers + return earth_radius*c + + def print_stats(self): + print '\n--STATS--' + print 'geo requests: ', self.count_request + print 'geo requets ok: ', self.count_request_ok + print 'geo quota exceeded:', self.quota_exceeded_at + print 'geo throttle: ', self.throttle + print 'has none: ', self.count_nowhere + print 'has geocode: ', self.count_has_geocode + print 'has location: ', self.count_has_location + + if self.cache: + counts = [ 0, 0, 0 ] + max_place = ( None, 0 ) + for item in self.cache: + count = self.cache[item][2] + if count <= 5: + counts[0] += 1 + elif count <= 10: + counts[1] += 1 + else: + counts[2] += 1 + if count > max_place[1]: + max_place = ( item, count ) + print '\n--CACHE--' + print 'size: ', len(self.cache) + print 'counts: ', counts + print 'max place: ', max_place \ No newline at end of file diff --git a/twittergeo/SearchGeo.py b/twittergeo/SearchGeo.py new file mode 100644 index 0000000..db7c2d2 --- /dev/null +++ b/twittergeo/SearchGeo.py @@ -0,0 +1,100 @@ +""" + REQUIRED: PASTE YOUR TWITTER OAUTH CREDENTIALS INTO twittergeo/credentials.txt + OR USE -oauth OPTION TO USE A DIFFERENT FILE CONTAINING THE CREDENTIALS. + + Downloads old tweets from the newest to the oldest that contain any of the words + that are passed as arguments on the command line. 
Prints the tweet text and + location information, including latitude and longitude from Google's Map service. + + Use the -location option to get tweets from a geographical region. If you want to + override the default radius (in km) use the -radius option. Location is + determined from either the user's profile or geocode. + + The script calls Twitter's REST API which permits about a week's worth of old + tweets to be downloaded before breaking the connection. Twitter may also + disconnect if you exceed 180 downloads per 15 minutes. For this reason sleep is + called after each request. The default is 5 seconds. Override with the '-wait' + option. +""" + +__author__ = "Jonas Geduldig" +__date__ = "December 20, 2012" +__license__ = "MIT" + +# unicode printing for Windows +import sys, codecs +sys.stdout = codecs.getwriter('utf8')(sys.stdout) + +import argparse +import Geocoder +import os +import twitterapi +import urllib + +OAUTH = None +GEO = Geocoder.Geocoder() + +def parse_tweet(status): + """Print tweet, location and geocode + + """ + try: + geocode = GEO.geocode_tweet(status) + print '\n%s: %s' % (status['user']['screen_name'], status['text']) + print 'LOCATION:', status['user']['location'] + print 'GEOCODE:', geocode + except Exception, e: + if GEO.quota_exceeded: + raise + +def search_tweets(list, wait, region): + """Get tweets containing any words in 'list' and that have location or coordinates in 'region' + + """ + words = ' OR '.join(list) + params = { 'q': words } + if region: + params['geocode'] = '%f,%f,%fkm' % region # lat,lng,radius + search = twitterapi.TwSearch(OAUTH, params) + while True: + for item in search.past_results(wait): + if 'text' in item: + parse_tweet(item) + elif 'message' in item: + if item['code'] == 131: + continue # ignore internal server error + elif item['code'] == 88: + print>>sys.stderr, 'Suspend search until %s' % search.get_quota()['reset'] + raise Exception('Message from twiter: %s' % item['message']) +if __name__ == 
'__main__': + parser = argparse.ArgumentParser(description='Search tweet history.') + parser.add_argument('-oauth', metavar='FILENAME', type=str, help='read OAuth credentials from file') + parser.add_argument('-wait', type=int, default=5, help='seconds to wait between searches') + parser.add_argument('-location', type=str, help='limit tweets to a place') + parser.add_argument('-radius', type=float, help='distance from "location" in km') + parser.add_argument('words', metavar='W', type=str, nargs='+', help='word(s) to search') + args = parser.parse_args() + + if args.oauth: + OAUTH = twitterapi.TwCredentials.read_file(args.oauth) + else: + path = os.path.dirname(__file__) + path = os.path.join(path, 'credentials.txt') + OAUTH = twitterapi.TwCredentials.read_file(path) + + try: + if args.location: + lat, lng, radius = GEO.get_region_circle(args.location) + print 'Google found region at %f,%f with a radius of %s km' % (lat, lng, radius) + if args.radius: + radius = args.radius + region = (lat, lng, radius) + else: + region = None + search_tweets(args.words, args.wait, region) + except KeyboardInterrupt: + print>>sys.stderr, '\nTerminated by user' + except Exception, e: + print>>sys.stderr, '*** STOPPED', e + + GEO.print_stats() \ No newline at end of file diff --git a/twittergeo/SearchPics.py b/twittergeo/SearchPics.py new file mode 100644 index 0000000..a8c6926 --- /dev/null +++ b/twittergeo/SearchPics.py @@ -0,0 +1,121 @@ +""" + REQUIRED: PASTE YOUR TWITTER OAUTH CREDENTIALS INTO twittergeo/credentials.txt + OR USE -oauth OPTION TO USE A DIFFERENT FILE CONTAINING THE CREDENTIALS. + + Downloads old tweets from the newest to the oldest that contain any of the words + that are passed as arguments on the command line. Prints the tweet text and URLs + of any embedded photos. + + Use the -photo_dir option to save photos to a directory. + + Use the -stalk flag to print latitude and longitude from Google's Map service. 
+ Location is determined from either the user's profile or geocode. + + Use the -location to get tweets from a geographical region. If you want to + override the default radius (in km) use the -radius option. + + The script calls Twitter's REST API which permits about a week's worth of old + tweets to be downloaded before breaking the connection. Twitter may also + disconnect if you exceed 180 downloads per 15 minutes. For this reason sleep is + called after each request. The default is 5 seconds. Override with the '-wait' + option. +""" + +__author__ = "Jonas Geduldig" +__date__ = "December 20, 2012" +__license__ = "MIT" + +# unicode printing for Windows +import sys, codecs +sys.stdout = codecs.getwriter('utf8')(sys.stdout) + +import argparse +import Geocoder +import os +import twitterapi +import urllib + +OAUTH = None +GEO = Geocoder.Geocoder() + +def parse_tweet(status, photo_dir, stalk): + """If tweet contains photo, print tweet. + If stalking, print location and geocode. + If photo_dir, print photo id and save photo to file. + + """ + if 'media' in status['entities']: + photo_count = 0 + for media in status['entities'].get('media'): + if media['type'] == 'photo': + photo_count += 1 + if photo_count == 1: + print '\n%s: %s' % (status['user']['screen_name'], status['text']) + if stalk and not GEO.quota_exceeded: + try: + geocode = GEO.geocode_tweet(status) + print 'LOCATION:', status['user']['location'] + print 'GEOCODE:', geocode + except Exception, e: + if GEO.quota_exceeded: + print>>sys.stderr, 'GEOCODER QUOTA EXCEEDED:', GEO.count_request + photo_url = media['media_url_https'] + if photo_dir: + print media['id_str'] + file_name = os.path.join(photo_dir, media['id_str']) + '.' 
+ photo_url.split('.')[-1] + urllib.urlretrieve(photo_url, file_name) + +def search_tweets(list, wait, photo_dir, region, stalk): + """Get tweets containing any words in 'list' and that have location or coordinates in 'region' + + """ + words = ' OR '.join(list) + params = { 'q': words } + if region: + params['geocode'] = '%f,%f,%fkm' % region # lat,lng,radius + search = twitterapi.TwSearch(OAUTH, params) + while True: + for item in search.past_results(wait): + if 'text' in item: + parse_tweet(item, photo_dir, stalk) + elif 'message' in item: + if item['code'] == 131: + continue # ignore internal server error + elif item['code'] == 88: + print>>sys.stderr, 'Suspend search until %s' % search.get_quota()['reset'] + raise Exception('Message from twiter: %s' % item['message']) + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Search tweet history.') + parser.add_argument('-oauth', metavar='FILENAME', type=str, help='read OAuth credentials from file') + parser.add_argument('-wait', type=int, default=5, help='seconds to wait between searches') + parser.add_argument('-photo_dir', metavar='DIRECTORYNAME', type=str, help='download photos to this directory') + parser.add_argument('-location', type=str, help='limit tweets to a place') + parser.add_argument('-radius', type=float, help='distance from "location" in km') + parser.add_argument('-stalk', action='store_true', help='print tweet location') + parser.add_argument('words', metavar='W', type=str, nargs='+', help='word(s) to search') + args = parser.parse_args() + + if args.oauth: + OAUTH = twitterapi.TwCredentials.read_file(args.oauth) + else: + path = os.path.dirname(__file__) + path = os.path.join(path, 'credentials.txt') + OAUTH = twitterapi.TwCredentials.read_file(path) + + try: + if args.location: + lat, lng, radius = GEO.get_region_circle(args.location) + print 'Google found region at %f,%f with a radius of %s km' % (lat, lng, radius) + if args.radius: + radius = args.radius + region =
(lat, lng, radius) + else: + region = None + search_tweets(args.words, args.wait, args.photo_dir, region, args.stalk) + except KeyboardInterrupt: + print>>sys.stderr, '\nTerminated by user' + except Exception, e: + print>>sys.stderr, '*** STOPPED', e + + GEO.print_stats() \ No newline at end of file diff --git a/twittergeo/StreamGeo.py b/twittergeo/StreamGeo.py new file mode 100644 index 0000000..7dd281d --- /dev/null +++ b/twittergeo/StreamGeo.py @@ -0,0 +1,109 @@ +""" + REQUIRED: PASTE YOUR TWITTER OAUTH CREDENTIALS INTO twittergeo/credentials.txt + OR USE -oauth OPTION TO USE A DIFFERENT FILE CONTAINING THE CREDENTIALS. + + Downloads real-time tweets. You must supply either one or both of the -words and + -location options. Prints the tweet text and location information, including + latitude and longitude from Google's Map service. + + Use the -words option to get tweets that contain any of the words that are passed + as arguments on the command line. + + Use the -location option to get tweets from a geographical region. Location is + determined only from geocode in the tweet. Use -location ALL to get all geocoded + tweets from any location. + + The script calls Twitter's Streaming API which is bandwidth limitted. If you + exceed the rate limit, Twitter sends a message with the total number of tweets + skipped during the current connection. This number is printed, and the connection + remains open. 
+""" + +__author__ = "Jonas Geduldig" +__date__ = "December 20, 2012" +__license__ = "MIT" + +# unicode printing for Windows +import sys, codecs +sys.stdout = codecs.getwriter('utf8')(sys.stdout) + +import argparse +import Geocoder +import os +import twitterapi +import urllib + +OAUTH = None +GEO = Geocoder.Geocoder() + +def parse_tweet(status, region): + """Print tweet, location and geocode + + """ + try: + geocode = GEO.geocode_tweet(status) + print '\n%s: %s' % (status['user']['screen_name'], status['text']) + print 'LOCATION:', status['user']['location'] + print 'GEOCODE:', geocode + except Exception, e: + if GEO.quota_exceeded: + print>>sys.stderr, '*** GEOCODER QUOTA EXCEEDED:', GEO.count_request + raise + +def stream_tweets(list, region): + """Get tweets containing any words in 'list' or that have location or coordinates in 'region' + + """ + params = {} + if list != None: + words = ','.join(list) + params['track'] = words + if region != None: + params['locations'] = '%f,%f,%f,%f' % region + print 'REGION', region + while True: + try: + stream = twitterapi.TwStream(OAUTH, params) + while True: + for item in stream.results(): + if 'text' in item: + parse_tweet(item, region) + elif 'disconnect' in item: + raise Exception('Disconnect: %s' % item['disconnect'].get('reason')) + except Exception, e: + # reconnect on 401 errors and socket timeouts + print>>sys.stderr, '*** MUST RECONNECT', e + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Get real-time tweet stream.') + parser.add_argument('-oauth', metavar='FILENAME', type=str, help='read OAuth credentials from file') + parser.add_argument('-location', type=str, help='limit tweets to a place; use ALL to get all geocoded tweets') + parser.add_argument('-words', metavar='W', type=str, nargs='+', help='word(s) to track') + args = parser.parse_args() + + if args.words == None and args.location == None: + sys.exit('You must use either -words or -location or both.') + + if args.oauth: +
OAUTH = twitterapi.TwCredentials.read_file(args.oauth) + else: + path = os.path.dirname(__file__) + path = os.path.join(path, 'credentials.txt') + OAUTH = twitterapi.TwCredentials.read_file(path) + + if args.location: + if args.location.lower() == 'all': + region = (-180, -90, 180, 90) + else: + latC, lngC, latSW, lngSW, latNE, lngNE = GEO.get_region_box(args.location) + region = (lngSW, latSW, lngNE, latNE) + print 'Google found region at %f,%f and %f,%f' % region + else: + region = None + + try: + stream_tweets(args.words, region) + except KeyboardInterrupt: + print>>sys.stderr, '\nTerminated by user' + + GEO.print_stats() \ No newline at end of file diff --git a/twittergeo/StreamPics.py b/twittergeo/StreamPics.py new file mode 100644 index 0000000..968dfbc --- /dev/null +++ b/twittergeo/StreamPics.py @@ -0,0 +1,129 @@ +""" + REQUIRED: PASTE YOUR TWITTER OAUTH CREDENTIALS INTO twittergeo/credentials.txt + OR USE -oauth OPTION TO USE A DIFFERENT FILE CONTAINING THE CREDENTIALS. + + Downloads real-time tweets that contain embedded photo URLs. You must supply + either one or both of the -words and -location options. Prints the tweet text, + location information, including latitude and longitude from Google's Map service, + and all photo URLs. + + Use the -words option to get tweets that contain any of the words that are passed + as arguments on the command line. + + Use the -location option to get tweets from a geographical region. Location is + determined only from geocode in the tweet. Use -location ALL to get all geocoded + tweets from any location. + + Use the -photo_dir option to save photos to a directory. + + Use the -stalk flag to print latitude and longitude from Google's Map service. + + The script calls Twitter's Streaming API which is bandwidth limitted. If you + exceed the rate limit, Twitter sends a message with the total number of tweets + skipped during the current connection. This number is printed, and the connection + remains open. 
+""" + +__author__ = "Jonas Geduldig" +__date__ = "December 20, 2012" +__license__ = "MIT" + +# unicode printing for Windows +import sys, codecs +sys.stdout = codecs.getwriter('utf8')(sys.stdout) + +import argparse +import Geocoder +import os +import twitterapi +import urllib + +OAUTH = None +GEO = Geocoder.Geocoder() + +def parse_tweet(status, photo_dir, stalk): + """If tweet contains photo, print tweet. + If stalking, print location and geocode. + If photo_dir, print photo id and save photo to file. + + """ + if 'media' in status['entities']: + photo_count = 0 + for media in status['entities'].get('media'): + if media['type'] == 'photo': + photo_count += 1 + if photo_count == 1: + print '\n%s: %s' % (status['user']['screen_name'], status['text']) + if stalk and not GEO.quota_exceeded: + try: + geocode = GEO.geocode_tweet(status) + print 'LOCATION:', status['user']['location'] + print 'GEOCODE:', geocode + except Exception, e: + if GEO.quota_exceeded: + print>>sys.stderr, '*** GEOCODER QUOTA EXCEEDED:', GEO.count_request + photo_url = media['media_url_https'] + if photo_dir: + print media['id_str'] + file_name = os.path.join(photo_dir, media['id_str']) + '.' 
+ photo_url.split('.')[-1] + urllib.urlretrieve(photo_url, file_name) + +def stream_tweets(list, photo_dir, region, stalk): + """Get tweets containing any words in 'list' or that have location or coordinates in 'region' + + """ + params = {} + if list != None: + words = ','.join(list) + params['track'] = words + if region != None: + params['locations'] = '%f,%f,%f,%f' % region + print 'REGION', region + while True: + try: + stream = twitterapi.TwStream(OAUTH, params) + while True: + for item in stream.results(): + if 'text' in item: + parse_tweet(item, photo_dir, stalk) + elif 'disconnect' in item: + raise Exception('Disconnect: %s' % item['disconnect'].get('reason')) + except Exception, e: + # reconnect on 401 errors and socket timeouts + print>>sys.stderr, '*** MUST RECONNECT', e + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Get real-time tweet stream.') + parser.add_argument('-oauth', metavar='FILENAME', type=str, help='read OAuth credentials from file') + parser.add_argument('-location', type=str, help='limit tweets to a place; use ALL to get all geocoded tweets') + parser.add_argument('-photo_dir', metavar='DIRECTORYNAME', type=str, help='download photos to this directory') + parser.add_argument('-stalk', action='store_true', help='print tweet location') + parser.add_argument('-words', metavar='W', type=str, nargs='+', help='word(s) to track') + args = parser.parse_args() + + if args.words == None and args.location == None: + sys.exit('You must use either -words or -location or both.') + + if args.oauth: + OAUTH = twitterapi.TwCredentials.read_file(args.oauth) + else: + path = os.path.dirname(__file__) + path = os.path.join(path, 'credentials.txt') + OAUTH = twitterapi.TwCredentials.read_file(path) + + if args.location: + if args.location.lower() == 'all': + region = (-180, -90, 180, 90) + else: + latC, lngC, latSW, lngSW, latNE, lngNE = GEO.get_region_box(args.location) + region = (lngSW, latSW, lngNE, latNE) + print 'Google
found region at %f,%f and %f,%f' % region + else: + region = None + + try: + stream_tweets(args.words, args.photo_dir, region, args.stalk) + except KeyboardInterrupt: + print>>sys.stderr, '\nTerminated by user' + + GEO.print_stats() \ No newline at end of file diff --git a/twittergeo/__init__.py b/twittergeo/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/twittergeo/credentials.txt b/twittergeo/credentials.txt new file mode 100644 index 0000000..e50b2c6 --- /dev/null +++ b/twittergeo/credentials.txt @@ -0,0 +1,4 @@ +consumer_key= +consumer_secret= +access_token_key= +access_token_secret=