
Commit 2db8dc1
Move some link stuff into common, and make dreamwidth links internal.
See ssafar#5, but breaks bellbook entries
dhouck committed Jan 29, 2015
1 parent 287669f commit 2db8dc1
Showing 5 changed files with 34 additions and 19 deletions.
9 changes: 7 additions & 2 deletions src/chapter_list.py
@@ -10,6 +10,7 @@

from bs4 import BeautifulSoup
import effulgence_pb2 as eproto
import common

import re
import urllib
@@ -44,11 +45,15 @@ def chapters_from_toc(toc_soup):
"ascii", "xmlcharrefreplace")

chapter.main_threaded_url = link["href"]
chapter.by_user = re.search(r'http://([a-z-]*)\.',
chapter.main_threaded_url).groups()[0]
parsed_url = common.parse_dreamwidth_url(chapter.main_threaded_url)

chapter.by_user = parsed_url["by_user"]

chapter.first_flat_page_url = set_param_in_url(
chapter.main_threaded_url, "view", "flat")

chapter.full_chapter_file_name = common.dreamwidth_url_to_internal(
chapter.main_threaded_url).replace("xhtml", "pbtxt")

return chapters

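The `chapters_from_toc` hunk above replaces an ad-hoc regex with `common.parse_dreamwidth_url`. Its full body is not shown in this diff, but judging from the keys consumed here and in `dreamwidth_url_to_internal` below, it presumably returns a dict along these lines (the URL and values are invented for illustration):

import common

# Illustrative only: the URL and IDs are made up, but the keys match
# those used by chapters_from_toc and dreamwidth_url_to_internal.
parsed = common.parse_dreamwidth_url(
    "http://alicornutopia.dreamwidth.org/1640.html?thread=593480")
# parsed["by_user"]         == "alicornutopia"
# parsed["html_numeric_id"] == "1640"   # a string, judging by the %s formatting elsewhere
# parsed["comment_id"]      == 593480   # only present for ?thread=... links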
21 changes: 19 additions & 2 deletions src/common.py
@@ -1,10 +1,12 @@
import sys

from bs4 import BeautifulSoup
import effulgence_pb2 as eproto

import google.protobuf
import re
import os
import urlparse

def get_chapters_from_stdin():
chapters = eproto.Chapters()
@@ -45,8 +47,16 @@ def parse_dreamwidth_url(url):
result["comment_id"] = int(thread_result.groups()[0])

return result



def dreamwidth_url_to_internal(url):
"""Given a dreamwidth URL, find the internal URL name"""
parsed = parse_dreamwidth_url(url)
if parsed.get("comment_id"):
parsed["fragment"] = "#cmt%d" % parsed["comment_id"]
else:
parsed["fragment"] = ""
return "{by_user}_{html_numeric_id}.xhtml{fragment}".format(**parsed)

def full_chapter_from_introonly(introonly_chapter):
"""Given a chapter proto (without the comments), we load the full chapter."""
chapter = eproto.Chapter()
@@ -68,3 +78,10 @@ def img_url_to_internal(url):
"""Will generate comment.icon_image_name."""
r = re.match(r"http://www.dreamwidth.org/userpic/([0-9]*)/([0-9]*)", url)
return "img_%s_%s.jpg" % r.groups()

def replace_links_with_internal(soup):
for link in soup.find_all("a"):
if link.get("href"): # Sometimes comments have <a></a>.
dest = urlparse.urlsplit(link["href"])
if dest.netloc.split(".", 1)[1] == "dreamwidth.org":
link["href"] = dreamwidth_url_to_internal(link["href"])
8 changes: 7 additions & 1 deletion src/extract_all.py
@@ -39,12 +39,17 @@ def extract_comment(c_div):
# Apparently, not all comments have images.
if img_tag:
c.icon_url = img_tag["src"]
images.add(c.icon_url)
c.icon_text = img_tag["alt"]
c.icon_image_name = common.img_url_to_internal(c.icon_url)

c.timestamp = c_div.find("span", class_="datetime").text.strip()
c.cmt_id = int(re.match(r"comment-cmt([0-9]+)", c_div["id"]).groups()[0])
c.text = c_div.find("div", class_="comment-content").decode_contents(formatter="html")

content = c_div.find("div", class_="comment-content")
common.replace_links_with_internal(content)
c.text = content.decode_contents(formatter="html")

return c

def extract_comment_soup(soup, chapter, parent_threads):
@@ -103,6 +108,7 @@ def branch_thread(thread, from_id):
return child

chapters = common.get_chapters_from_stdin()
images = set()

# Mapping usernames to authors.
user_to_moiety_dict = common.load_profile_data()
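The new module-level `images` set accumulates every icon URL seen while extracting comments, presumably so a later step can fetch them all in one pass. A hypothetical consumer (not part of this commit) might look like:

import urllib
import common

# Hypothetical follow-up, not in this commit: fetch each collected icon
# once, storing it under its internal name.
for icon_url in images:
    urllib.urlretrieve(icon_url,
                       "web_cache/" + common.img_url_to_internal(icon_url))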
6 changes: 0 additions & 6 deletions src/extract_firstflat_info.py
@@ -35,12 +35,6 @@ def extract_intro_part(soup):
for chapter in chapters.chapter:
the_file_name = chapter.first_flat_page_url.replace("http://", "web_cache/")

# Infer the eventual full chapter file name. (Could be done in the first
# stage, too.)
html_numeric_id = re.search(r"([0-9]+).html", the_file_name).groups()[0]
chapter.full_chapter_file_name = "%s_%s.pbtxt" % (chapter.by_user,
html_numeric_id)

with open(the_file_name) as first_flat_file:
print chapter.title
soup = BeautifulSoup(first_flat_file)
9 changes: 1 addition & 8 deletions src/new_toc.py
@@ -30,14 +30,7 @@
# As for the others, parse them & replace them with the appropriate internal
# links.

for link in the_toc_html.find_all("a"):
url_components = common.parse_dreamwidth_url(link["href"])
new_url = "%s_%s.xhtml" % (url_components["by_user"],
url_components["html_numeric_id"])
if "comment_id" in url_components:
new_url += "#cmt%d" % url_components["comment_id"]
link["href"] = new_url

common.replace_links_with_internal(the_toc_html)

toc_string = the_toc_html.decode_contents(formatter="html")
toc_html_string = toc_template.substitute(toc_entries=toc_string)
