Skip to content

Commit 4767d20

Browse files
committed
Fix PMVHaven for videos that do not have a creator
Closes #2154
1 parent d01f5c8 commit 4767d20

File tree

1 file changed

+45
-43
lines changed

1 file changed

+45
-43
lines changed

scrapers/PMVHaven/PMVHaven.py

+45 -43
Original file line number | Diff line number | Diff line change
@@ -1,10 +1,14 @@
11
import json
22
import sys
33
from typing import Never
4-
import requests
54
import re
65

76
import py_common.log as log
7+
from py_common.util import dig
8+
from py_common.deps import ensure_requirements
9+
10+
ensure_requirements("requests")
11+
import requests # noqa: E402
812

913

1014
def fail(message: str) -> Never:
@@ -51,60 +55,59 @@ def getIMG(video):
5155
return item
5256
return ""
5357

54-
def getMusic(video):
55-
if len(video["music"]) > 0:
56-
return "Music:\n" + "\n".join(video["music"])
57-
return ""
5858

5959
def getVideoById(sceneId):
6060
data = getData(sceneId)
6161

62-
if "video" not in data or len(data["video"]) < 1:
62+
if not (video := dig(data, "video", 0)):
6363
fail(f"Video data not found in API response: {data}")
6464

65-
video = data["video"][0]
66-
tags = video["tags"] + video["categories"]
6765
urlTitle = video["title"].replace(" ", "-")
6866

69-
details = ""
70-
if video["description"] != None:
71-
details += video["description"]
72-
music = getMusic(video)
73-
if music:
74-
if len(details) > 0:
75-
details += "\n"
76-
details+=music
77-
78-
return {
67+
scraped = {
7968
"title": video["title"],
8069
"url": f"https://pmvhaven.com/video/{urlTitle}_{video['_id']}",
8170
"image": getIMG(video),
8271
"date": video["isoDate"].split("T")[0],
83-
"details": details,
84-
"studio": {"Name": video["creator"]},
85-
"tags": [{"name": x.strip()} for x in tags],
86-
"performers": [{"name": x.strip()} for x in video["stars"]],
72+
"performers": [{"name": x.strip()} for x in dig(video, "stars", default=[])],
8773
}
8874

75+
if description := dig(video, "description"):
76+
scraped["description"] = description
77+
78+
if songs := dig(video, "music"):
79+
music = "Music:\n" + "\n".join(songs)
80+
if "description" in scraped:
81+
scraped["description"] += "\n" + music
82+
else:
83+
scraped["description"] = music
84+
85+
if creator := dig(video, "creator"):
86+
scraped["studio"] = {"name": creator}
87+
88+
tags = dig(video, "tags", default=[]) + dig(video, "categories", default=[])
89+
# remove duplicates and sort
90+
scraped["tags"] = sorted(
91+
{tag.strip().lower(): tag.strip() for tag in tags}.values()
92+
)
93+
94+
return scraped
95+
8996

90-
"""
97+
def sceneByFragment(params):
98+
"""
9199
Assumes the video ID or the download hash is in the title of the Stash scene.
92100
The default file name when downloading from PMVHaven includes the download hash,
93101
so this will first assume the parameter is the download hash. If no results are
94102
returned then it will assume the parameter is the video ID and attempt data fetch.
95-
"""
96-
97-
98-
def sceneByFragment(params):
99-
if not params["title"]:
103+
"""
104+
if not (title := dig(params, "title")):
100105
fail("JSON blob did not contain title property")
101106

102-
regex = re.search(r"([a-z0-9]{24})", params["title"])
107+
if not (match := re.search(r"([a-z0-9]{24})", title)):
108+
fail(f"Did not find ID from video title '{title}'")
103109

104-
if not regex:
105-
fail(f"Did not find ID from video title {params['title']}")
106-
107-
inputParam = regex.group(1)
110+
inputParam = match.group(1)
108111
videoId = getVideoIdFromDownloadHash(inputParam)
109112

110113
if videoId is None:
@@ -113,24 +116,23 @@ def sceneByFragment(params):
113116
return getVideoById(videoId)
114117

115118

116-
"""
119+
def sceneByURL(params):
120+
"""
117121
This assumes a URL of https://pmvhaven.com/video/{title}_{alphanumericVideoId}
118122
As of 2024-01-01, this is the only valid video URL format. If this changes in
119123
the future (i.e. more than one valid URL type, or ID not present in URL) and
120-
requires falling back to the old cloudscraper method, an xpath of
121-
//meta[@property="video-id"]/@content
124+
requires falling back to the old cloudscraper method, an xpath of
125+
//meta[@property="video-id"]/@content
122126
can be used to pass into the PMVHaven API
123-
"""
127+
"""
124128

125-
126-
def sceneByURL(params):
127-
if not params["url"]:
129+
if not (url := dig(params, "url")):
128130
fail("No URL entered")
129131

130-
sceneId = params["url"].split("_")[-1]
132+
sceneId = url.split("_")[-1]
131133

132-
if not sceneId or not sceneId.isalnum():
133-
fail(f"Did not find scene ID from PMVStash video URL {params['url']}")
134+
if not (sceneId and sceneId.isalnum()):
135+
fail(f"Did not find scene ID from PMVStash video URL {url}")
134136

135137
data = getVideoById(sceneId)
136138
return data

0 commit comments

Comments
 (0)