From ba8cb54d85a2f43110b4dfb75b589eb968766d56 Mon Sep 17 00:00:00 2001 From: unknown Date: Tue, 24 Oct 2017 06:41:55 -0500 Subject: [PATCH] Tumblr backup updated for Python 3 --- tumblr_backup_python3.py | 1091 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 1091 insertions(+) create mode 100644 tumblr_backup_python3.py diff --git a/tumblr_backup_python3.py b/tumblr_backup_python3.py new file mode 100644 index 0000000..5299be9 --- /dev/null +++ b/tumblr_backup_python3.py @@ -0,0 +1,1091 @@ +#!/usr/bin/env python +# encoding: utf-8 +# converted for Python3 by possibilityleft + +# standard Python library imports + +import codecs +from collections import defaultdict +from datetime import datetime +import errno +from glob import glob +from http.client import HTTPException +import imghdr +try: + import json +except ImportError: + import simplejson as json +import locale +import os +from os.path import join, split, splitext +import re +import ssl +import sys +import queue as queue +import threading +import time +import urllib.request, urllib.parse, urllib.error +# import urllib2 +import urllib.parse +from xml.sax.saxutils import escape + +try: + from settings import DEFAULT_BLOGS +except ImportError: + DEFAULT_BLOGS = [] + +# extra optional packages +try: + import pyexiv2 +except ImportError: + pyexiv2 = None +try: + import youtube_dl + from youtube_dl.utils import sanitize_filename +except ImportError: + youtube_dl = None + +# Format of displayed tags +TAG_FMT = '#%s' + +# Format of tag link URLs; set to None to suppress the links. 
def test_jpg(h, f):
    """Recognize JPEG data by its signature, as an additional ``imghdr`` test.

    h -- the first bytes of the file (a bytes object on Python 3)
    f -- the open file object or None; unused

    Returns 'jpg' if the header matches one of the known JPEG signatures,
    otherwise None.
    """
    # see http://www.garykessler.net/library/file_sigs.html
    # The header is bytes, so it must be compared with bytes literals; the
    # former str literals never compared equal on Python 3.  Slicing (h[3:4])
    # yields bytes and tolerates headers shorter than four bytes.
    if h[:3] == b'\xFF\xD8\xFF' and h[3:4] in (b'\xDB', b'\xE0', b'\xE1', b'\xE2', b'\xE3'):
        return 'jpg'
    return None


# The name starts with "test" only because this is an imghdr hook; keep
# pytest/nose from collecting it as a test case when the module is imported.
test_jpg.__test__ = False
def path_to(*parts):
    """Return the path made of *parts* below the current save folder."""
    return join(save_folder, *parts)


def open_file(open_fn, parts):
    """Open ``path_to(*parts)`` with *open_fn*, creating its directory first.

    open_fn -- callable taking the full path and returning a file object
    parts   -- path components relative to the save folder; the last one
               is the file name
    """
    if len(parts) > 1:
        # more than one directory level requires a recursive mkdir
        mkdir(path_to(*parts[:-1]), (len(parts) > 2))
    return open_fn(path_to(*parts))


def open_text(*parts):
    """Open a text file for writing in the module encoding.

    Characters that cannot be encoded are written as XML character
    references.
    """
    return open_file(
        lambda f: codecs.open(f, 'w', encoding, 'xmlcharrefreplace'), parts
    )


def open_media(*parts):
    """Open a binary file for writing."""
    return open_file(lambda f: open(f, 'wb'), parts)


def strftime(format, t=None):
    """Format the time tuple *t* (default: the local time now) as text.

    time.strftime() already returns str on Python 3; the former
    ``.decode(time_encoding)`` call raised AttributeError.
    """
    if t is None:
        t = time.localtime()
    return time.strftime(format, t)


def get_api_url(account):
    """construct the tumblr API URL

    Also stores the full blog host name in the global ``blog_name``;
    '.tumblr.com' is appended when *account* contains no dot.
    """
    global blog_name
    blog_name = account
    if '.' not in account:
        blog_name += '.tumblr.com'
    return 'https://api.tumblr.com/v2/blog/' + blog_name + '/posts'


def set_period():
    """Prepare the period start and end timestamps

    Reads ``options.period`` (YYYY, YYYYMM or YYYYMMDD) and stores the
    resulting range in ``options.p_start`` and ``options.p_stop``.
    """
    i = 0
    tm = [int(options.period[:4]), 1, 1, 0, 0, 0, 0, 0, -1]
    if len(options.period) >= 6:
        i = 1
        tm[1] = int(options.period[4:6])
    if len(options.period) == 8:
        i = 2
        tm[2] = int(options.period[6:8])
    # time.mktime() only accepts a tuple or struct_time on Python 3
    options.p_start = time.mktime(tuple(tm))
    # the period ends where the next year/month/day begins; mktime()
    # normalizes the overflowing field
    tm[i] += 1
    options.p_stop = time.mktime(tuple(tm))


def apiparse(base, count, start=0):
    """Fetch up to *count* posts from the API URL *base*.

    base  -- the blog's API URL as returned by get_api_url()
    count -- maximum number of posts to request
    start -- offset of the first post to request

    The request is attempted up to ten times.  Returns the decoded JSON
    document if the API reports status 200, otherwise None.
    """
    params = {'api_key': API_KEY, 'limit': count, 'reblog_info': 'true'}
    if start > 0:
        params['offset'] = start
    url = base + '?' + urllib.parse.urlencode(params)
    for _ in range(10):
        try:
            # use the module's urlopen() wrapper so that the HTTP timeout and
            # the SSL context (option -S) apply to API calls as well
            with urlopen(url) as resp:
                data = resp.read()
        except (EnvironmentError, HTTPException) as e:
            sys.stderr.write("%s getting %s\n" % (e, url))
            continue
        break  # got a response; no need to retry
    else:
        return None  # every attempt failed
    # The Content-Type is not checked up front: a response that is not JSON
    # makes json.loads() fail, which is reported below.
    try:
        doc = json.loads(data)
    except ValueError as e:
        sys.stderr.write('%s: %s\n%d %s %s\n%r\n' % (
            e.__class__.__name__, e, resp.getcode(), resp.msg,
            resp.headers.get_content_type(), data
        ))
        return None
    return doc if doc.get('meta', {}).get('status', 0) == 200 else None


def add_exif(image_name, tags):
    """Update the IPTC keywords of the image file *image_name*.

    image_name -- path of the image file
    tags       -- set of the post's tags

    With '-' in ``options.exif`` all keywords are removed; otherwise the
    existing keywords, *tags* and ``options.exif`` are merged, stripped and
    lower-cased.  Read and write errors are reported on stderr.
    """
    try:
        metadata = pyexiv2.ImageMetadata(image_name)
        metadata.read()
    except EnvironmentError:
        sys.stderr.write("Error reading metadata for image %s\n" % image_name)
        return
    KW_KEY = 'Iptc.Application2.Keywords'
    if '-' in options.exif:  # remove all tags
        if KW_KEY in metadata.iptc_keys:
            del metadata[KW_KEY]
    else:  # add tags
        if KW_KEY in metadata.iptc_keys:
            # build a new set instead of mutating the caller's argument
            tags = set(tags) | set(metadata[KW_KEY].value)
        tags = [tag.strip().lower() for tag in tags | options.exif if tag]
        metadata[KW_KEY] = pyexiv2.IptcTag(KW_KEY, tags)
    try:
        metadata.write()
    except EnvironmentError:
        sys.stderr.write("Writing metadata failed for tags: %s in: %s\n" % (tags, image_name))


def save_style():
    """Write the built-in stylesheet to the backup CSS file."""
    with open_text(backup_css) as css:
        css.write('''\
@import url("override.css");

body { width: 720px; margin: 0 auto; }
body > footer { padding: 1em 0; }
header > img { float: right; }
img { max-width: 720px; }
blockquote { margin-left: 0; border-left: 8px #999 solid; padding: 0 24px; }
.archive h1, .subtitle, article { padding-bottom: 0.75em; border-bottom: 1px #ccc dotted; }
.post a.llink { display: none; }
header a, footer a { text-decoration: none; }
footer, article footer a { font-size: small; color: #999; }
''')
+ imghdr.what(None, avatar_data[:32]) + with open_media(theme_dir, avatar_file) as f: + f.write(avatar_data) + + +def get_style(): + """Get the blog's CSS by brute-forcing it from the home page. + The v2 API has no method for getting the style directly. + See https://groups.google.com/d/msg/tumblr-api/f-rRH6gOb6w/sAXZIeYx5AUJ""" + try: + resp = urlopen('http://%s/' % blog_name) + page_data = resp.read() + except (EnvironmentError, HTTPException): + return + for match in re.findall(r'(?s)', page_data): + css = match.strip().decode(encoding, 'replace') + if not '\n' in css: + continue + css = css.replace('\r', '').replace('\n ', '\n') + with open_text(theme_dir, 'style.css') as f: + f.write(css + '\n') + return + + +class TumblrBackup: + + def __init__(self): + self.errors = False + self.total_count = 0 + + def exit_code(self): + if self.errors: + return EXIT_ERRORS + if self.total_count == 0: + return EXIT_NOPOSTS + return EXIT_SUCCESS + + def build_index(self): + filter = join('*', dir_index) if options.dirs else '*' + post_ext + for f in glob(path_to(post_dir, filter)): + post = LocalPost(f) + self.index[post.tm.tm_year][post.tm.tm_mon].append(post) + self.archives = sorted(((y, m) for y in self.index for m in self.index[y]), + reverse=options.reverse_month + ) + + def save_index(self): + f = glob(path_to(theme_dir, avatar_base + '.*')) + avatar = split(f[0])[1] if f else None + with open_text(dir_index) as idx: + idx.write(self.header(self.title, body_class='index', + subtitle=self.subtitle, avatar=avatar + )) + for year in sorted(list(self.index.keys()), reverse=options.reverse_index): + self.save_year(idx, year) + dtt = datetime.now() + + idx.write('\n' % time.strftime('%x %X', datetime.timetuple(dtt)) + ) + + def save_year(self, idx, year): + idx.write('

%s

\n\n\n') + + def save_month(self, year, month, tm): + posts = sorted(self.index[year][month], key=lambda x: x.date, reverse=options.reverse_month) + posts_month = len(posts) + posts_page = options.posts_per_page if options.posts_per_page >= 1 else posts_month + + def pages_per_month(y, m): + posts = len(self.index[y][m]) + return posts / posts_page + bool(posts % posts_page) + + def next_month(inc): + i = self.archives.index((year, month)) + i += inc + if i < 0 or i >= len(self.archives): + return 0, 0 + return self.archives[i] + + FILE_FMT = '%d-%02d-p%s' + pages_month = pages_per_month(year, month) + for page, start in enumerate(list(range(0, posts_month, posts_page)), start=1): + archive = [self.header(time.strftime('%B %Y', tm), body_class='archive')] + archive.extend(p.get_post() for p in posts[start:start + posts_page]) + + file_name = FILE_FMT % (year, month, page) + if options.dirs: + base = save_dir + archive_dir + '/' + suffix = '/' + arch = open_text(archive_dir, file_name, dir_index) + file_name += suffix + else: + base = '' + suffix = post_ext + file_name += suffix + arch = open_text(archive_dir, file_name) + + if page > 1: + pp = FILE_FMT % (year, month, page - 1) + else: + py, pm = next_month(-1) + pp = FILE_FMT % (py, pm, pages_per_month(py, pm)) if py else '' + first_file = file_name + + if page < pages_month: + np = FILE_FMT % (year, month, page + 1) + else: + ny, nm = next_month(+1) + np = FILE_FMT % (ny, nm, 1) if ny else '' + + archive.append(self.footer(base, pp, np, suffix)) + + arch.write('\n'.join(archive)) + + return first_file + + def header(self, title='', body_class='', subtitle='', avatar=''): + root_rel = '' if body_class == 'index' else save_dir + css_rel = root_rel + (custom_css if have_custom_css else backup_css) + if body_class: + body_class = ' class=' + body_class + h = ''' + + +%s + + + + +
+''' % (encoding, self.title, css_rel, body_class) + if avatar: + h += 'Avatar\n' % (root_rel, theme_dir, avatar) + if title: + h += '

%s

\n' % title + if subtitle: + h += '

%s

\n' % subtitle + h += '
\n' + return h + + def footer(self, base, previous_page, next_page, suffix): + f = '
    def backup(self, account):
        """makes single files and an index for every post on a public Tumblr blog account

        account -- the blog name as given on the command line

        Sets ``self.errors`` when an API call fails and adds the number of
        saved posts to ``self.total_count``.  Also rebinds several
        module-level path settings (see the ``global`` statement below).
        """

        # posts grouped as index[year][month] -> list of posts
        self.index = defaultdict(lambda: defaultdict(list))
        self.archives = []

        base = get_api_url(account)

        # make sure there are folders to save in
        global save_folder, media_folder, post_ext, post_dir, save_dir, have_custom_css
        if options.blosxom:
            # Blosxom posts are saved as .txt files directly in the root folder
            save_folder = root_folder
            post_ext = '.txt'
            post_dir = os.curdir
            post_class = BlosxomPost
        else:
            save_folder = join(root_folder, options.outdir or account)
            media_folder = path_to(media_dir)
            if options.dirs:
                # one folder per post: no file extension, and links back to
                # the root go up one more level
                post_ext = ''
                save_dir = '../../'
                mkdir(path_to(post_dir), True)
            else:
                mkdir(save_folder, True)
            post_class = TumblrPost
            have_custom_css = os.access(path_to(custom_css), os.R_OK)

        self.post_count = 0

        # get the highest post id already saved
        ident_max = None
        if options.incremental:
            try:
                # the saved posts are named after their numeric post id
                ident_max = max(
                    int(splitext(split(f)[1])[0])
                    for f in glob(path_to(post_dir, '*' + post_ext))
                )
                log(account, "Backing up posts after %d\r" % ident_max)
            except ValueError:  # max() arg is an empty sequence
                pass
        else:
            log(account, "Getting basic information\r")

        # start by calling the API with just a single post
        soup = apiparse(base, 1)
        if not soup:
            self.errors = True
            return

        # collect all the meta information
        resp = soup['response']
        blog = resp['blog']
        try:
            self.title = escape(blog['title'])
        except KeyError:
            self.title = account
        self.subtitle = blog['description']

        # use the meta information to create a HTML header
        TumblrPost.post_header = self.header(body_class='post')

        # find the post number limit to back up
        last_post = blog['posts']
        if options.count:
            last_post = min(last_post, options.count + options.skip)

        def _backup(posts):
            # Queue one batch of API posts for saving, newest id first.
            # Returns False when paging should stop (an already saved post or
            # a post older than the requested period was reached), else True.
            for p in sorted(posts, key=lambda x: x['id'], reverse=True):
                post = post_class(p)
                if ident_max and int(post.ident) <= ident_max:
                    return False
                if options.period:
                    if post.date >= options.p_stop:
                        continue
                    if post.date < options.p_start:
                        return False
                if options.request:
                    if post.typ not in options.request:
                        continue
                    tags = options.request[post.typ]
                    if not (TAG_ANY in tags or tags & post.tags_lower):
                        continue
                if options.no_reblog:
                    # skip posts whose last trail entry is not marked as the
                    # current item, and reblogs with an empty trail
                    if 'reblogged_from_name' in p or 'reblogged_root_name' in p:
                        if 'trail' in p and not p['trail']:
                            continue
                        elif 'trail' in p and 'is_current_item' not in p['trail'][-1]:
                            continue
                    elif 'trail' in p and p['trail'] and 'is_current_item' not in p['trail'][-1]:
                        continue
                backup_pool.add_work(post.save_content)
                self.post_count += 1
            return True

        # start the thread pool
        backup_pool = ThreadPool()
        try:
            # Get the JSON entries from the API, which we can only do for max 50 posts at once.
            # Posts "arrive" in reverse chronological order. Post #0 is the most recent one.
            last_batch = MAX_POSTS
            i = options.skip
            while i < last_post:
                # find the upper bound
                j = min(i + MAX_POSTS, last_post)
                log(account, "Getting posts %d to %d of %d\r" % (i, j - 1, last_post))

                soup = apiparse(base, j - i, i)
                if soup is None:
                    i += last_batch  # try the next batch
                    self.errors = True
                    continue

                posts = soup['response']['posts']
                # posts can be empty if we don't backup reblogged posts
                if not posts or not _backup(posts):
                    break

                last_batch = len(posts)
                i += last_batch
        except:
            # ensure proper thread pool termination
            backup_pool.cancel()
            raise

        # wait until all posts have been saved
        backup_pool.wait()

        # postprocessing
        if not options.blosxom and self.post_count:
            get_avatar()
            # NOTE: the style download below is disabled; it is only a string
            # literal, not executed code
            '''get_style()
            if not have_custom_css:
                save_style()'''
            self.build_index()
            self.save_index()

        log(account, "%d posts backed up\n" % self.post_count)
        self.total_count += self.post_count
self.json_content = json.dumps(post, sort_keys=True, indent=4, separators=(',', ': ')) + self.ident = str(post['id']) + self.url = post['post_url'] + self.shorturl = post['short_url'] + self.typ = post['type'] + self.date = post['timestamp'] + self.isodate = datetime.utcfromtimestamp(self.date).isoformat() + 'Z' + self.tm = time.localtime(self.date) + self.title = '' + self.tags = post['tags'] + self.note_count = post.get('note_count', 0) + self.source_title = post.get('source_title', '') + self.source_url = post.get('source_url', '') + if options.request: + self.tags_lower = set(t.lower() for t in self.tags) + self.file_name = join(self.ident, dir_index) if options.dirs else self.ident + post_ext + self.llink = self.ident if options.dirs else self.file_name + + def save_content(self): + """generates the content for this post""" + post = self.post + content = [] + + def append(s, fmt='%s'): + content.append(fmt % s) + + def get_try(elt): + return post.get(elt) or '' + + def append_try(elt, fmt='%s'): + elt = get_try(elt) + if elt: + if options.save_images: + elt = re.sub(r'''(?i)(]*\bsrc\s*=\s*["'])(.*?)(["'][^>]*>)''', + self.get_inline_image, elt + ) + append(elt, fmt) + + self.media_dir = join(post_dir, self.ident) if options.dirs else media_dir + self.media_url = save_dir + self.media_dir + self.media_folder = path_to(self.media_dir) + + if self.typ == 'text': + self.title = get_try('title') + append_try('body') + + elif self.typ == 'photo': + url = get_try('link_url') + is_photoset = len(post['photos']) > 1 + for offset, p in enumerate(post['photos'], start=1): + o = p['original_size'] + src = o['url'] + if options.save_images: + src = self.get_image_url(src, offset if is_photoset else 0) + append(escape(src), '') + if url: + content[-1] = '%s' % (escape(url), content[-1]) + content[-1] = '

' + content[-1] + '

' + if p['caption']: + append(p['caption'], '

%s

') + append_try('caption') + + elif self.typ == 'link': + url = post['url'] + self.title = '%s' % (escape(url), post['title'] or url) + append_try('description') + + elif self.typ == 'quote': + append(post['text'], '

%s

') + append_try('source', '

%s

') + + elif self.typ == 'video': + src = '' + if options.save_video: + if post['video_type'] == 'tumblr': + src = self.get_media_url(post['video_url'], '.mp4') + elif youtube_dl: + src = self.get_youtube_url(self.url) + if not src: + sys.stdout.write('Unable to download video in post #%s%-50s\n' % + (self.ident, ' ') + ) + if src: + append('

' % ( + src, "Your browser does not support the video element.", src, "Video file" + )) + else: + append(post['player'][-1]['embed_code']) + append_try('caption') + + elif self.typ == 'audio': + src = '' + if options.save_audio: + if post['audio_type'] == 'tumblr': + audio_url = post['audio_url'] + if audio_url.startswith('http://a.tumblr.com/'): + src = self.get_media_url(audio_url, '.mp3') + elif audio_url.startswith('https://www.tumblr.com/audio_file/'): + audio_url = 'http://a.tumblr.com/%so1.mp3' % audio_url.split('/')[-1] + src = self.get_media_url(audio_url, '.mp3') + elif post['audio_type'] == 'soundcloud': + src = self.get_media_url(post['audio_url'], '.mp3') + if src: + append('

' % ( + src, "Your browser does not support the audio element.", src, "Audio file" + )) + else: + append(post['player']) + append_try('caption') + + elif self.typ == 'answer': + self.title = post['question'] + append_try('answer') + + elif self.typ == 'chat': + self.title = get_try('title') + append( + '
\n'.join('%(label)s %(phrase)s' % d for d in post['dialogue']), + '

%s

' + ) + + else: + sys.stderr.write( + "Unknown post type '%s' in post #%s%-50s\n" % (self.typ, self.ident, ' ') + ) + append(escape(self.json_content), '
%s
') + + self.content = '\n'.join(content) + + # fix wrongly nested HTML elements + for p in ('

(<(%s)>)', '()

'): + self.content = re.sub(p % 'p|ol|iframe[^>]*', r'\1', self.content) + + self.save_post() + + def get_youtube_url(self, youtube_url): + # determine the media file name + filetmpl = '%(id)s_%(uploader_id)s_%(title)s.%(ext)s' + ydl = youtube_dl.YoutubeDL({ + 'outtmpl': join(self.media_folder, filetmpl), + 'quiet': True, + 'restrictfilenames': True, + 'noplaylist': True, + 'continuedl': True, + 'nooverwrites': True, + 'retries': 3000, + 'fragment_retries': 3000, + 'ignoreerrors': True + }) + ydl.add_default_info_extractors() + try: + result = ydl.extract_info(youtube_url, download=False) + media_filename = sanitize_filename(filetmpl % result['entries'][0], restricted=True) + except: + return '' + + # check if a file with this name already exists + if not os.path.isfile(media_filename): + try: + ydl.extract_info(youtube_url, download=True) + except: + return '' + return '%s/%s' % (self.media_url, split(media_filename)[1]) + + def get_media_url(self, media_url, extension): + media_filename = self.get_filename(media_url) + media_filename = os.path.splitext(media_filename)[0] + extension + saved_name = self.download_media(media_url, media_filename) + if saved_name is not None: + media_filename = '%s/%s' % (self.media_url, saved_name) + return media_filename + + def get_image_url(self, image_url, offset): + """Saves an image if not saved yet. 
Returns the new URL or + the original URL in case of download errors.""" + + def _addexif(fn): + if options.exif and fn.endswith('.jpg'): + add_exif(fn, set(self.tags)) + + image_filename = self.get_filename(image_url, '_o%s' % offset if offset else '') + saved_name = self.download_media(image_url, image_filename) + if saved_name is not None: + _addexif(join(self.media_folder, saved_name)) + image_url = '%s/%s' % (self.media_url, saved_name) + return image_url + + @staticmethod + def maxsize_image_url(image_url): + if ".tumblr.com/" not in image_url or image_url.endswith('.gif'): + return image_url + # change the image resolution to 1280 + return re.sub(r'_\d{2,4}(\.\w+)$', r'_1280\1', image_url) + + def get_inline_image(self, match): + """Saves an inline image if not saved yet. Returns the new tag or + the original one in case of download errors.""" + + image_url = match.group(2) + if image_url.startswith('//'): + image_url = 'http:' + image_url + image_url = self.maxsize_image_url(image_url) + path = urllib.parse.urlparse(image_url).path + image_filename = path.split('/')[-1] + if not image_filename or not image_url.startswith('http'): + return match.group(0) + + saved_name = self.download_media(image_url, image_filename) + if saved_name is None: + return match.group(0) + return '%s%s/%s%s' % (match.group(1), self.media_url, + saved_name, match.group(3) + ) + + def get_filename(self, url, offset=''): + """Determine the image file name depending on options.image_names""" + if options.image_names == 'i': + return self.ident + offset + elif options.image_names == 'bi': + return account + '_' + self.ident + offset + else: + return url.split('/')[-1] + + def download_media(self, url, filename): + # check if a file with this name already exists + known_extension = '.' 
in filename[-5:] + image_glob = glob(path_to(self.media_dir, + filename + ('' if known_extension else '.*') + )) + if image_glob: + return split(image_glob[0])[1] + # download the media data + try: + resp = urlopen(url) + with open_media(self.media_dir, filename) as dest: + data = resp.read(HTTP_CHUNK_SIZE) + hdr = data[:32] # save the first few bytes + while data: + dest.write(data) + data = resp.read(HTTP_CHUNK_SIZE) + except (EnvironmentError, ValueError, HTTPException) as e: + sys.stderr.write('%s downloading %s\n' % (e, url)) + try: + os.unlink(path_to(self.media_dir, filename)) + except OSError as e: + if e.errno != errno.ENOENT: + raise + return None + # determine the file type if it's unknown + if not known_extension: + image_type = imghdr.what(None, hdr) + if image_type: + oldname = path_to(self.media_dir, filename) + filename += '.' + image_type.replace('jpeg', 'jpg') + os.rename(oldname, path_to(self.media_dir, filename)) + return filename + + def get_post(self): + """returns this post in HTML""" + post = self.post_header + '
\n' % (self.typ, self.ident) + post += '
\n

\n' % (self.isodate, time.strftime('%x %X', self.tm)) + post += '\n' % (save_dir, post_dir, self.llink) + post += '

\n' % self.shorturl + if self.title: + post += '

%s

\n' % self.title + post += self.content + foot = [] + if self.tags: + foot.append(''.join(self.tag_link(t) for t in self.tags)) + if self.note_count: + foot.append('%d note%s' % (self.note_count, 's'[self.note_count == 1:])) + if self.source_title and self.source_url: + foot.append('%s' % + (self.source_url, self.source_title) + ) + if foot: + post += '\n
%s
' % ' — '.join(foot) + post += '\n
class BlosxomPost(TumblrPost):
    """A post that is rendered as a Blosxom entry instead of HTML."""

    def get_image_url(self, image_url, offset):
        """Return *image_url* unchanged; *offset* is ignored."""
        return image_url

    def get_post(self):
        """returns this post as a Blosxom post"""
        # title line first, then the meta headers, one per line
        head = [self.title, 'meta-id: p-' + self.ident, 'meta-url: ' + self.url]
        if self.tags:
            # spaces inside a tag become '+' so that tags stay space-separated
            head.append('meta-tags: ' + ' '.join(t.replace(' ', '+') for t in self.tags))
        return '\n'.join(head) + '\n\n' + self.content
class ThreadPool:
    """A fixed set of worker threads that run queued callables."""

    def __init__(self, thread_count=20, max_queue=1000):
        """Create the work queue and start *thread_count* worker threads.

        thread_count -- number of worker threads
        max_queue    -- maximum number of queued work items
        """
        self.queue = queue.Queue(max_queue)
        # set by wait(): workers exit once the queue is empty
        self.quit = threading.Event()
        # set by cancel(): workers exit after their current work item
        self.abort = threading.Event()
        self.threads = [threading.Thread(target=self.handler) for _ in range(thread_count)]
        for t in self.threads:
            t.start()

    def add_work(self, work):
        """Queue the callable *work* for execution by a worker thread."""
        self.queue.put(work)

    def wait(self):
        """Signal that no more work will be added and wait for the queue to drain."""
        self.quit.set()
        self.queue.join()

    def cancel(self):
        """Tell the workers to stop and join each thread, logging the progress."""
        self.abort.set()
        for i, t in enumerate(self.threads, start=1):
            log('', "\rStopping threads %s%s\r" %
                (' ' * i, '.' * (len(self.threads) - i))
            )
            t.join()

    def handler(self):
        """Worker loop: fetch work items from the queue and call them."""
        while not self.abort.is_set():
            try:
                # poll with a short timeout so that abort/quit are noticed
                work = self.queue.get(True, 0.1)
            except queue.Empty:
                if self.quit.is_set():
                    break
            else:
                # once wait() was called, report the remaining queue size
                # whenever it is a multiple of MAX_POSTS;
                # `account` is read from the module globals (it is the loop
                # variable of the script's main block)
                if self.quit.is_set() and self.queue.qsize() % MAX_POSTS == 0:
                    log(account, "%d remaining posts to save\r" % self.queue.qsize())
                try:
                    work()
                finally:
                    # always mark the item done so that queue.join() can return
                    self.queue.task_done()
(typ,): + if parts: + request[typ] = request.get(typ, set()).union(parts) + else: + request[typ] = set([TAG_ANY]) + parser.values.request = request + + parser = optparse.OptionParser("Usage: %prog [options] blog-name ...", + description="Makes a local backup of Tumblr blogs." + ) + parser.add_option('-O', '--outdir', help="set the output directory" + " (default: blog-name)" + ) + parser.add_option('-D', '--dirs', action='store_true', + help="save each post in its own folder" + ) + parser.add_option('-q', '--quiet', action='store_true', + help="suppress progress messages" + ) + parser.add_option('-i', '--incremental', action='store_true', + help="incremental backup mode" + ) + parser.add_option('-k', '--skip-images', action='store_false', default=True, + dest='save_images', help="do not save images; link to Tumblr instead" + ) + parser.add_option('--save-video', action='store_true', help="save video files") + parser.add_option('--save-audio', action='store_true', help="save audio files") + parser.add_option('-j', '--json', action='store_true', + help="save the original JSON source" + ) + parser.add_option('-b', '--blosxom', action='store_true', + help="save the posts in blosxom format" + ) + parser.add_option('-r', '--reverse-month', action='store_false', default=True, + help="reverse the post order in the monthly archives" + ) + parser.add_option('-R', '--reverse-index', action='store_false', default=True, + help="reverse the index file order" + ) + parser.add_option('-a', '--auto', type='int', metavar="HOUR", + help="do a full backup at HOUR hours, otherwise do an incremental backup" + " (useful for cron jobs)" + ) + parser.add_option('-n', '--count', type='int', default=0, + help="save only COUNT posts" + ) + parser.add_option('-s', '--skip', type='int', default=0, + help="skip the first SKIP posts" + ) + parser.add_option('-p', '--period', help="limit the backup to PERIOD" + " ('y', 'm', 'd' or YYYY[MM[DD]])" + ) + parser.add_option('-N', '--posts-per-page', 
type='int', default=50, + metavar='COUNT', help="set the number of posts per monthly page, " + "0 for unlimited" + ) + parser.add_option('-Q', '--request', type='string', action='callback', + callback=request_callback, help="save posts matching the request" + " TYPE:TAG:TAG:…,TYPE:TAG:…,…. TYPE can be %s or %s; TAGs can be" + " omitted or a colon-separated list. Example: -Q %s:personal,quote" + ",photo:me:self" % (', '.join(POST_TYPES), TYPE_ANY, TYPE_ANY) + ) + parser.add_option('-t', '--tags', type='string', action='callback', + callback=tags_callback, help="save only posts tagged TAGS (comma-separated values;" + " case-insensitive)" + ) + parser.add_option('-T', '--type', type='string', action='callback', + callback=request_callback, help="save only posts of type TYPE" + " (comma-separated values from %s)" % ', '.join(POST_TYPES) + ) + parser.add_option('--no-reblog', action='store_true', help="don't save reblogged posts") + parser.add_option('-I', '--image-names', type='choice', choices=('o', 'i', 'bi'), + default='o', metavar='FMT', + help="image filename format ('o'=original, 'i'=, 'bi'=_)" + ) + parser.add_option('-e', '--exif', type='string', action='callback', + callback=csv_callback, default=set(), metavar='KW', + help="add EXIF keyword tags to each picture (comma-separated values;" + " '-' to remove all tags, '' to add no extra tags)" + ) + parser.add_option('-S', '--no-ssl-verify', action='store_true', + help="ignore SSL verification errors" + ) + options, args = parser.parse_args() + + if options.auto is not None and options.auto != time.localtime().tm_hour: + options.incremental = True + if options.period: + try: + pformat = {'y': '%Y', 'm': '%Y%m', 'd': '%Y%m%d'}[options.period] + options.period = time.strftime(pformat) + except KeyError: + options.period = options.period.replace('-', '') + if not re.match(r'^\d{4}(\d\d)?(\d\d)?$', options.period): + parser.error("Period must be 'y', 'm', 'd' or YYYY[MM[DD]]") + set_period() + if have_ssl_ctx and 
options.no_ssl_verify: + ssl_ctx = ssl._create_unverified_context() + # Otherwise, it's an old Python version without SSL verification, + # so this is the default. + + args = args or DEFAULT_BLOGS + if not args: + parser.error("Missing blog-name") + if options.outdir and len(args) > 1: + parser.error("-O can only be used for a single blog-name") + if options.exif and not pyexiv2: + parser.error("--exif: module 'pyexif2' is not installed") + if (options.save_video or options.save_audio) and not youtube_dl: + parser.error("--save-video/-audio: module 'youtube_dl' is not installed") + + tb = TumblrBackup() + try: + for account in args: + tb.backup(account) + except KeyboardInterrupt: + sys.exit(EXIT_INTERRUPT) + + sys.exit(tb.exit_code())