1010from py_common .deps import ensure_requirements
1111import py_common .log as log
1212from py_common .types import ScrapedPerformer , ScrapedScene
13- from py_common .util import is_valid_url , scraper_args
13+ from py_common .util import dig , is_valid_url , scraper_args
1414
1515ensure_requirements ("lxml" , "requests" )
1616
132132# </div>
133133# </div>
134134# </div>
135- def map_performer (el ) -> ScrapedPerformer :
135+ def map_performer (el ) -> ScrapedPerformer | None :
136136 "Converts performer search result into scraped performer"
137137 url = el .xpath (".//a/@href" )[0 ]
138138 if "no-model" in url :
@@ -149,7 +149,7 @@ def map_performer(el) -> ScrapedPerformer:
149149
150150 return {
151151 "name" : name ,
152- "url " : fixed_url ,
152+ "urls " : [ fixed_url ] ,
153153 "image" : image ,
154154 }
155155
@@ -181,7 +181,7 @@ def best_quality_scene_image(code: str) -> str | None:
181181 "Finds the highest resolution scene image for a scene ID"
182182 no_qual_path = (
183183 "https://cdn77.scoreuniverse.com/modeldir/data/posting/"
184- f"{ code [0 : len (code )- 3 ]} /{ code [- 3 :]} /posting_{ code } "
184+ f"{ code [0 : len (code ) - 3 ]} /{ code [- 3 :]} /posting_{ code } "
185185 )
186186 for quality in ["_1920" , "_1600" , "_1280" , "_800" , "_xl" , "_lg" , "_med" , "" ]:
187187 image_url = f"{ no_qual_path } { quality } .jpg"
@@ -194,47 +194,69 @@ def scene_from_url(url: str) -> ScrapedScene:
194194 "Scrape scene URL from HTML"
195195 # url
196196 clean_url = urlunparse (urlparse (url )._replace (query = "" ))
197- scene : ScrapedScene = { "url" : clean_url }
197+ scene : ScrapedScene = {"url" : clean_url }
198198
199199 result = client .get (url )
200200 tree = html .fromstring (result .content )
201201
202- video_page = '//section[@id="videos_page-page" or @id="mixed_page-page"]'
202+ if not (
203+ video_page := tree .xpath (
204+ '//section[@id="videos_page-page" or @id="mixed_page-page"]'
205+ )
206+ ):
207+ log .error ("Page layout has changed, scraper needs updating" )
208+ return scene
209+
210+ video_page = video_page [0 ]
203211
204212 # title
205- if title := tree .xpath (
206- 'normalize-space(' # trim leading/trailing whitespace
207- f'{ video_page } //h1/span/following-sibling::text()[1] | ' # if h1 contains a span, ignore the span and take the remaining text
208- f'{ video_page } //h1[not(span)]/text()' # if h1 has no span, just take the text
209- ')'
213+ if (
214+ title := video_page .xpath (
215+ "normalize-space(" # trim leading/trailing whitespace
216+ "//h1/span/following-sibling::text()[1] | " # if h1 contains a span, ignore the span and take the remaining text
217+ "//h1[not(span)]/text()" # if h1 has no span, just take the text
218+ ")"
219+ )
210220 ):
211221 scene ["title" ] = title
212222
213223 # studio
214224 # Original studio is determinable by looking at the CDN links (<source src="//cdn77.scoreuniverse.com/naughtymag/scenes...)
215225 # this helps set studio for PornMegaLoad URLs as nothing is released directly by the network
216- if video_src := tree .xpath (f'{ video_page } //video/source/@src' ):
217- studio_ref = re .sub (r".*\.com/(.+?)\/(video|scene).*" , r"\1" , next (iter (video_src )))
218- scene ["studio" ] = { "name" : STUDIO_MAP .get (studio_ref , studio_ref ) }
226+ if video_src := video_page .xpath ("//video/source/@src" ):
227+ studio_ref = re .sub (
228+ r".*\.com/(.+?)\/(video|scene).*" , r"\1" , next (iter (video_src ))
229+ )
230+ scene ["studio" ] = {"name" : STUDIO_MAP .get (studio_ref , studio_ref )}
219231
220232 # date
221- if raw_date := tree .xpath (f'{ video_page } //div[contains(concat(" ",normalize-space(@class)," ")," mb-3 ")]//span[contains(.,"Date:")]/following-sibling::span' ):
233+ if raw_date := video_page .xpath (
234+ '//div[contains(concat(" ",normalize-space(@class)," ")," mb-3 ")]//span[contains(.,"Date:")]/following-sibling::span'
235+ ):
222236 scene ["date" ] = datetime .strptime (
223- re .sub (r"(\d+)[a-z]{2}" , r"\1" , next (iter (raw_date )).text ).replace ("..," , "" ),
224- "%B %d, %Y"
225- ).strftime ("%Y-%m-%d" )
237+ re .sub (r"(\d+)[a-z]{2}" , r"\1" , next (iter (raw_date )).text ).replace (
238+ "..," , ""
239+ ),
240+ "%B %d, %Y" ,
241+ ).isoformat ()
226242
227243 # details
228- if description := tree .xpath (f'{ video_page } //div[@class="p-desc p-3" or contains(@class, "desc")]/text()' ):
229- scene ["details" ] = "\n \n " .join ([p .strip () for p in description if len (p .strip ())])
244+ if description := video_page .xpath (
245+ '//div[@class="p-desc p-3" or contains(@class, "desc")]/text()'
246+ ):
247+ scene ["details" ] = "\n \n " .join (
248+ [p .strip () for p in description if len (p .strip ())]
249+ )
230250
231251 # tags
232- if tags := tree .xpath (f' { video_page } //a[contains(@href, "videos-tag") or contains(@href, "scenes -tag")]' ):
233- scene ["tags" ] = [ { "name" : tag .text } for tag in iter (tags ) ]
252+ if tags := video_page .xpath (' //a[contains(@href, "-tag")]' ):
253+ scene ["tags" ] = [{ "name" : tag .text } for tag in iter (tags )]
234254
235255 # performers
236- if performers := tree .xpath (f'{ video_page } //span[contains(.,"Featuring:")]/following-sibling::span/a' ):
237- scene ["performers" ] = [ { "name" : p .text } for p in iter (performers ) ]
256+ if performers := video_page .xpath (
257+ '//span[contains(.,"Featuring:")]/following-sibling::span/a'
258+ ):
259+ scene ["performers" ] = [{"name" : p .text } for p in iter (performers )]
238260
239261 # code
240262 scene_id = re .sub (r".*\/(\d+)\/?$" , r"\1" , clean_url )
0 commit comments