Skip to content

Commit 0c33558

Browse files
committed
Update The Score Group scraper for new tag selector
1 parent e06a770 commit 0c33558

File tree

1 file changed

+46
-24
lines changed

1 file changed

+46
-24
lines changed

scrapers/TheScoreGroup/TheScoreGroup.py

Lines changed: 46 additions & 24 deletions
Original file line number | Diff line number | Diff line change
@@ -10,7 +10,7 @@
1010
from py_common.deps import ensure_requirements
1111
import py_common.log as log
1212
from py_common.types import ScrapedPerformer, ScrapedScene
13-
from py_common.util import is_valid_url, scraper_args
13+
from py_common.util import dig, is_valid_url, scraper_args
1414

1515
ensure_requirements("lxml", "requests")
1616

@@ -132,7 +132,7 @@
132132
# </div>
133133
# </div>
134134
# </div>
135-
def map_performer(el) -> ScrapedPerformer:
135+
def map_performer(el) -> ScrapedPerformer | None:
136136
"Converts performer search result into scraped performer"
137137
url = el.xpath(".//a/@href")[0]
138138
if "no-model" in url:
@@ -149,7 +149,7 @@ def map_performer(el) -> ScrapedPerformer:
149149

150150
return {
151151
"name": name,
152-
"url": fixed_url,
152+
"urls": [fixed_url],
153153
"image": image,
154154
}
155155

@@ -181,7 +181,7 @@ def best_quality_scene_image(code: str) -> str | None:
181181
"Finds the highest resolution scene image for a scene ID"
182182
no_qual_path = (
183183
"https://cdn77.scoreuniverse.com/modeldir/data/posting/"
184-
f"{code[0:len(code)-3]}/{code[-3:]}/posting_{code}"
184+
f"{code[0 : len(code) - 3]}/{code[-3:]}/posting_{code}"
185185
)
186186
for quality in ["_1920", "_1600", "_1280", "_800", "_xl", "_lg", "_med", ""]:
187187
image_url = f"{no_qual_path}{quality}.jpg"
@@ -194,47 +194,69 @@ def scene_from_url(url: str) -> ScrapedScene:
194194
"Scrape scene URL from HTML"
195195
# url
196196
clean_url = urlunparse(urlparse(url)._replace(query=""))
197-
scene: ScrapedScene = { "url": clean_url }
197+
scene: ScrapedScene = {"url": clean_url}
198198

199199
result = client.get(url)
200200
tree = html.fromstring(result.content)
201201

202-
video_page = '//section[@id="videos_page-page" or @id="mixed_page-page"]'
202+
if not (
203+
video_page := tree.xpath(
204+
'//section[@id="videos_page-page" or @id="mixed_page-page"]'
205+
)
206+
):
207+
log.error("Page layout has changed, scraper needs updating")
208+
return scene
209+
210+
video_page = video_page[0]
203211

204212
# title
205-
if title := tree.xpath(
206-
'normalize-space(' # trim leading/trailing whitespace
207-
f'{video_page}//h1/span/following-sibling::text()[1] | ' # if h1 contains a span, ignore the span and take the remaining text
208-
f'{video_page}//h1[not(span)]/text()' # if h1 has no span, just take the text
209-
')'
213+
if (
214+
title := video_page.xpath(
215+
"normalize-space(" # trim leading/trailing whitespace
216+
"//h1/span/following-sibling::text()[1] | " # if h1 contains a span, ignore the span and take the remaining text
217+
"//h1[not(span)]/text()" # if h1 has no span, just take the text
218+
")"
219+
)
210220
):
211221
scene["title"] = title
212222

213223
# studio
214224
# Original studio is determinable by looking at the CDN links (<source src="//cdn77.scoreuniverse.com/naughtymag/scenes...)
215225
# this helps set studio for PornMegaLoad URLs as nothing is released directly by the network
216-
if video_src := tree.xpath(f'{video_page}//video/source/@src'):
217-
studio_ref = re.sub(r".*\.com/(.+?)\/(video|scene).*", r"\1", next(iter(video_src)))
218-
scene["studio"] = { "name": STUDIO_MAP.get(studio_ref, studio_ref) }
226+
if video_src := video_page.xpath("//video/source/@src"):
227+
studio_ref = re.sub(
228+
r".*\.com/(.+?)\/(video|scene).*", r"\1", next(iter(video_src))
229+
)
230+
scene["studio"] = {"name": STUDIO_MAP.get(studio_ref, studio_ref)}
219231

220232
# date
221-
if raw_date := tree.xpath(f'{video_page}//div[contains(concat(" ",normalize-space(@class)," ")," mb-3 ")]//span[contains(.,"Date:")]/following-sibling::span'):
233+
if raw_date := video_page.xpath(
234+
'//div[contains(concat(" ",normalize-space(@class)," ")," mb-3 ")]//span[contains(.,"Date:")]/following-sibling::span'
235+
):
222236
scene["date"] = datetime.strptime(
223-
re.sub(r"(\d+)[a-z]{2}", r"\1", next(iter(raw_date)).text).replace("..,", ""),
224-
"%B %d, %Y"
225-
).strftime("%Y-%m-%d")
237+
re.sub(r"(\d+)[a-z]{2}", r"\1", next(iter(raw_date)).text).replace(
238+
"..,", ""
239+
),
240+
"%B %d, %Y",
241+
).isoformat()
226242

227243
# details
228-
if description := tree.xpath(f'{video_page}//div[@class="p-desc p-3" or contains(@class, "desc")]/text()'):
229-
scene["details"] = "\n\n".join([p.strip() for p in description if len(p.strip())])
244+
if description := video_page.xpath(
245+
'//div[@class="p-desc p-3" or contains(@class, "desc")]/text()'
246+
):
247+
scene["details"] = "\n\n".join(
248+
[p.strip() for p in description if len(p.strip())]
249+
)
230250

231251
# tags
232-
if tags := tree.xpath(f'{video_page}//a[contains(@href, "videos-tag") or contains(@href, "scenes-tag")]'):
233-
scene["tags"] = [ { "name": tag.text } for tag in iter(tags) ]
252+
if tags := video_page.xpath('//a[contains(@href, "-tag")]'):
253+
scene["tags"] = [{"name": tag.text} for tag in iter(tags)]
234254

235255
# performers
236-
if performers := tree.xpath(f'{video_page}//span[contains(.,"Featuring:")]/following-sibling::span/a'):
237-
scene["performers"] = [ { "name": p.text } for p in iter(performers) ]
256+
if performers := video_page.xpath(
257+
'//span[contains(.,"Featuring:")]/following-sibling::span/a'
258+
):
259+
scene["performers"] = [{"name": p.text} for p in iter(performers)]
238260

239261
# code
240262
scene_id = re.sub(r".*\/(\d+)\/?$", r"\1", clean_url)

0 commit comments

Comments
 (0)