diff --git a/src/chapter_list.py b/src/chapter_list.py
index 1a104d9..ac8e889 100644
--- a/src/chapter_list.py
+++ b/src/chapter_list.py
@@ -10,6 +10,7 @@
 from bs4 import BeautifulSoup
 
 import effulgence_pb2 as eproto
+import common
 import re
 import urllib
 
@@ -44,11 +45,15 @@ def chapters_from_toc(toc_soup):
             "ascii", "xmlcharrefreplace")
         chapter.main_threaded_url = link["href"]
-        chapter.by_user = re.search(r'http://([a-z-]*)\.',
-                                    chapter.main_threaded_url).groups()[0]
+        parsed_url = common.parse_dreamwidth_url(chapter.main_threaded_url)
+
+        chapter.by_user = parsed_url["by_user"]
         chapter.first_flat_page_url = set_param_in_url(
             chapter.main_threaded_url, "view", "flat")
+
+        chapter.full_chapter_file_name = common.dreamwidth_url_to_internal(
+            chapter.main_threaded_url).replace("xhtml", "pbtxt")
 
     return chapters
diff --git a/src/common.py b/src/common.py
index c44b5cd..e8203ef 100644
--- a/src/common.py
+++ b/src/common.py
@@ -1,10 +1,12 @@
 import sys
+from bs4 import BeautifulSoup
 
 import effulgence_pb2 as eproto
 import google.protobuf
 import re
 import os
+import urlparse
 
 
 def get_chapters_from_stdin():
     chapters = eproto.Chapters()
@@ -45,8 +47,16 @@ def parse_dreamwidth_url(url):
         result["comment_id"] = int(thread_result.groups()[0])
     return result
-
-
+
+def dreamwidth_url_to_internal(url):
+    """Given a dreamwidth URL, find the internal URL name"""
+    parsed = parse_dreamwidth_url(url)
+    if parsed.get("comment_id"):
+        parsed["fragment"] = "#cmt%d" % parsed["comment_id"]
+    else:
+        parsed["fragment"] = ""
+    return "{by_user}_{html_numeric_id}.xhtml{fragment}".format(**parsed)
+
 def full_chapter_from_introonly(introonly_chapter):
     """Given a chapter proto (without the comments), we load the full chapter."""
     chapter = eproto.Chapter()
@@ -68,3 +78,10 @@ def img_url_to_internal(url):
     """Will generate comment.icon_image_name."""
     r = re.match(r"http://www.dreamwidth.org/userpic/([0-9]*)/([0-9]*)", url)
     return "img_%s_%s.jpg" % r.groups()
+
+def replace_links_with_internal(soup):
+    for link in soup.find_all("a"):
+        if link.get("href"):  # Sometimes comments have <a> tags with no href.
+            dest = urlparse.urlsplit(link["href"])
+            if dest.netloc.split(".", 1)[1] == "dreamwidth.org":
+                link["href"] = dreamwidth_url_to_internal(link["href"])
diff --git a/src/extract_all.py b/src/extract_all.py
index 9c80909..1cf7a61 100644
--- a/src/extract_all.py
+++ b/src/extract_all.py
@@ -39,12 +39,17 @@ def extract_comment(c_div):
     # Apparently, not all comments have images.
     if img_tag:
         c.icon_url = img_tag["src"]
+        images.add(c.icon_url)
         c.icon_text = img_tag["alt"]
         c.icon_image_name = common.img_url_to_internal(c.icon_url)
 
     c.timestamp = c_div.find("span", class_="datetime").text.strip()
     c.cmt_id = int(re.match(r"comment-cmt([0-9]+)", c_div["id"]).groups()[0])
-    c.text = c_div.find("div", class_="comment-content").decode_contents(formatter="html")
+
+    content = c_div.find("div", class_="comment-content")
+    common.replace_links_with_internal(content)
+    c.text = content.decode_contents(formatter="html")
+
     return c
 
 def extract_comment_soup(soup, chapter, parent_threads):
@@ -103,6 +108,7 @@ def branch_thread(thread, from_id):
     return child
 
 chapters = common.get_chapters_from_stdin()
+images = set()
 
 # Mapping usernames to authors.
 user_to_moiety_dict = common.load_profile_data()
diff --git a/src/extract_firstflat_info.py b/src/extract_firstflat_info.py
index 7a9ccc1..13a8973 100644
--- a/src/extract_firstflat_info.py
+++ b/src/extract_firstflat_info.py
@@ -35,12 +35,6 @@ def extract_intro_part(soup):
 for chapter in chapters.chapter:
     the_file_name = chapter.first_flat_page_url.replace("http://", "web_cache/")
 
-    # Infer the eventual full chapter file name. (Could be done in the first
-    # stage, too.)
-    html_numeric_id = re.search(r"([0-9]+).html", the_file_name).groups()[0]
-    chapter.full_chapter_file_name = "%s_%s.pbtxt" % (chapter.by_user,
-                                                      html_numeric_id)
-
     with open(the_file_name) as first_flat_file:
         print chapter.title
         soup = BeautifulSoup(first_flat_file)
diff --git a/src/new_toc.py b/src/new_toc.py
index b32e59f..b72cbc1 100644
--- a/src/new_toc.py
+++ b/src/new_toc.py
@@ -30,14 +30,7 @@
 # As for the others, parse them & replace them with the appropriate internal
 # links.
 
-for link in the_toc_html.find_all("a"):
-    url_components = common.parse_dreamwidth_url(link["href"])
-    new_url = "%s_%s.xhtml" % (url_components["by_user"],
-                               url_components["html_numeric_id"])
-    if "comment_id" in url_components:
-        new_url += "#cmt%d" % url_components["comment_id"]
-    link["href"] = new_url
-
+common.replace_links_with_internal(the_toc_html)
 
 toc_string = the_toc_html.decode_contents(formatter="html")
 toc_html_string = toc_template.substitute(toc_entries=toc_string)
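
A minimal sketch (not part of the patch) of how the new common.py helpers compose. The fragment/format logic in dreamwidth_url_to_internal is taken verbatim from the diff above; the body of parse_dreamwidth_url and the example URL are assumptions, inferred only from how its result dict is consumed.

    import re

    def parse_dreamwidth_url(url):
        # Assumed shape: the diff only shows that this returns a dict with
        # "by_user", "html_numeric_id", and an optional "comment_id".
        result = {"by_user": re.search(r"http://([a-z-]*)\.", url).groups()[0],
                  "html_numeric_id": re.search(r"/([0-9]+)\.html", url).groups()[0]}
        thread_result = re.search(r"thread=([0-9]+)", url)
        if thread_result:
            result["comment_id"] = int(thread_result.groups()[0])
        return result

    def dreamwidth_url_to_internal(url):
        # Verbatim logic from the patch: map a dreamwidth URL to the internal
        # xhtml file name, keeping a #cmtNNN fragment for thread links.
        parsed = parse_dreamwidth_url(url)
        if parsed.get("comment_id"):
            parsed["fragment"] = "#cmt%d" % parsed["comment_id"]
        else:
            parsed["fragment"] = ""
        return "{by_user}_{html_numeric_id}.xhtml{fragment}".format(**parsed)

    # Hypothetical URLs, for illustration only.
    url = "http://examplejournal.dreamwidth.org/1234.html?thread=567890"
    assert dreamwidth_url_to_internal(url) == "examplejournal_1234.xhtml#cmt567890"

    # chapter_list.py now derives the chapter's pbtxt name from the same helper:
    assert (dreamwidth_url_to_internal("http://examplejournal.dreamwidth.org/1234.html")
            .replace("xhtml", "pbtxt") == "examplejournal_1234.pbtxt")

Note that replace_links_with_internal keys off the netloc suffix, so any *.dreamwidth.org subdomain is rewritten; a dotless netloc would make the split(".", 1)[1] raise IndexError.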