
Commit 2db8dc1
Move some link stuff into common, and make dreamwidth links internal.
See ssafar#5, but breaks bellbook entries
dhouck committed Jan 29, 2015
1 parent 287669f commit 2db8dc1
Showing 5 changed files with 34 additions and 19 deletions.
9 changes: 7 additions & 2 deletions src/chapter_list.py
@@ -10,6 +10,7 @@

from bs4 import BeautifulSoup
import effulgence_pb2 as eproto
import common

import re
import urllib
@@ -44,11 +45,15 @@ def chapters_from_toc(toc_soup):
"ascii", "xmlcharrefreplace")

chapter.main_threaded_url = link["href"]
chapter.by_user = re.search(r'http://([a-z-]*)\.',
chapter.main_threaded_url).groups()[0]
parsed_url = common.parse_dreamwidth_url(chapter.main_threaded_url)

chapter.by_user = parsed_url["by_user"]

chapter.first_flat_page_url = set_param_in_url(
chapter.main_threaded_url, "view", "flat")

chapter.full_chapter_file_name = common.dreamwidth_url_to_internal(
chapter.main_threaded_url).replace("xhtml", "pbtxt")

return chapters

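The `chapters_from_toc` hunk above replaces an ad-hoc regex with `common.parse_dreamwidth_url`. Its full body is not shown in this diff, but judging from the keys consumed here and in `dreamwidth_url_to_internal` below, it presumably returns a dict along these lines (the URL and values are invented for illustration):

import common

# Illustrative only: the URL and IDs are made up, but the keys match
# those used by chapters_from_toc and dreamwidth_url_to_internal.
parsed = common.parse_dreamwidth_url(
    "http://alicornutopia.dreamwidth.org/1640.html?thread=593480")
# parsed["by_user"]         == "alicornutopia"
# parsed["html_numeric_id"] == "1640"   # a string, judging by the %s formatting elsewhere
# parsed["comment_id"]      == 593480   # only present for ?thread=... links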
21 changes: 19 additions & 2 deletions src/common.py
@@ -1,10 +1,12 @@
import sys

from bs4 import BeautifulSoup
import effulgence_pb2 as eproto

import google.protobuf
import re
import os
import urlparse

def get_chapters_from_stdin():
chapters = eproto.Chapters()
@@ -45,8 +47,16 @@ def parse_dreamwidth_url(url):
result["comment_id"] = int(thread_result.groups()[0])

return result



def dreamwidth_url_to_internal(url):
"""Given a dreamwidth URL, find the internal URL name"""
parsed = parse_dreamwidth_url(url)
if parsed.get("comment_id"):
parsed["fragment"] = "#cmt%d" % parsed["comment_id"]
else:
parsed["fragment"] = ""
return "{by_user}_{html_numeric_id}.xhtml{fragment}".format(**parsed)

def full_chapter_from_introonly(introonly_chapter):
"""Given a chapter proto (without the comments), we load the full chapter."""
chapter = eproto.Chapter()
@@ -68,3 +78,10 @@ def img_url_to_internal(url):
"""Will generate comment.icon_image_name."""
r = re.match(r"http://www.dreamwidth.org/userpic/([0-9]*)/([0-9]*)", url)
return "img_%s_%s.jpg" % r.groups()

def replace_links_with_internal(soup):
for link in soup.find_all("a"):
if link.get("href"): # Sometimes comments have <a></a>.
dest = urlparse.urlsplit(link["href"])
if dest.netloc.split(".", 1)[1] == "dreamwidth.org":
link["href"] = dreamwidth_url_to_internal(link["href"])
8 changes: 7 additions & 1 deletion src/extract_all.py
@@ -39,12 +39,17 @@ def extract_comment(c_div):
# Apparently, not all comments have images.
if img_tag:
c.icon_url = img_tag["src"]
images.add(c.icon_url)
c.icon_text = img_tag["alt"]
c.icon_image_name = common.img_url_to_internal(c.icon_url)

c.timestamp = c_div.find("span", class_="datetime").text.strip()
c.cmt_id = int(re.match(r"comment-cmt([0-9]+)", c_div["id"]).groups()[0])
c.text = c_div.find("div", class_="comment-content").decode_contents(formatter="html")

content = c_div.find("div", class_="comment-content")
common.replace_links_with_internal(content)
c.text = content.decode_contents(formatter="html")

return c

def extract_comment_soup(soup, chapter, parent_threads):
@@ -103,6 +108,7 @@ def branch_thread(thread, from_id):
return child

chapters = common.get_chapters_from_stdin()
images = set()

# Mapping usernames to authors.
user_to_moiety_dict = common.load_profile_data()
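The new module-level `images` set accumulates every icon URL seen while extracting comments, presumably so a later step can fetch them all in one pass. A hypothetical consumer (not part of this commit) might look like:

import urllib
import common

# Hypothetical follow-up, not in this commit: fetch each collected icon
# once, storing it under its internal name.
for icon_url in images:
    urllib.urlretrieve(icon_url,
                       "web_cache/" + common.img_url_to_internal(icon_url))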
6 changes: 0 additions & 6 deletions src/extract_firstflat_info.py
@@ -35,12 +35,6 @@ def extract_intro_part(soup):
for chapter in chapters.chapter:
the_file_name = chapter.first_flat_page_url.replace("http://", "web_cache/")

# Infer the eventual full chapter file name. (Could be done in the first
# stage, too.)
html_numeric_id = re.search(r"([0-9]+).html", the_file_name).groups()[0]
chapter.full_chapter_file_name = "%s_%s.pbtxt" % (chapter.by_user,
html_numeric_id)

with open(the_file_name) as first_flat_file:
print chapter.title
soup = BeautifulSoup(first_flat_file)
9 changes: 1 addition & 8 deletions src/new_toc.py
@@ -30,14 +30,7 @@
# As for the others, parse them & replace them with the appropriate internal
# links.

for link in the_toc_html.find_all("a"):
url_components = common.parse_dreamwidth_url(link["href"])
new_url = "%s_%s.xhtml" % (url_components["by_user"],
url_components["html_numeric_id"])
if "comment_id" in url_components:
new_url += "#cmt%d" % url_components["comment_id"]
link["href"] = new_url

common.replace_links_with_internal(the_toc_html)

toc_string = the_toc_html.decode_contents(formatter="html")
toc_html_string = toc_template.substitute(toc_entries=toc_string)
