From db61251271b667892794f5a3164b2f16b18d6035 Mon Sep 17 00:00:00 2001
From: "Dee.H.Y"
Date: Tue, 1 Oct 2024 15:24:32 +0800
Subject: [PATCH] Update the Freshmen.yml scraper configuration, adding more
 data extraction fields

- Updated the URL pattern from freshmen.net/content/ to
  club.freshmen.net/secure/
- Changed the Title selector from //h1/span/text() to //h1 and added a
  replace post-process that rewrites "NAME (Issue #N ..." as "Issue N: NAME"
- Changed the Details selector from the contentTab block to the paragraphs
  under content_detail__first_col__player__more__description
- Added extraction for the Date field, using an XPath selector to locate the
  date and parseDate (layout 01/02/2006) to format it
- Removed the old definitions for the Performers and Image fields (both are
  redefined with new selectors, see below)
- Changed the XPath selector for the Image field from
  //*[@id="videoPlayer"]/@poster to
  //div[@class="player"]//img/@src | //div[@class="player"]//video/@poster
- Added the Tags field to extract tag names from the "wrapper tag_list" links
- Updated the location of the Performers field, now extracting actor names
  from actors_list__actor
- Updated the last updated date of the configuration file to October 1, 2024
---
 scrapers/Freshmen.yml | 28 +++++++++++++++++++---------
 1 file changed, 19 insertions(+), 9 deletions(-)

diff --git a/scrapers/Freshmen.yml b/scrapers/Freshmen.yml
index fe8acbb89..d5f50382d 100644
--- a/scrapers/Freshmen.yml
+++ b/scrapers/Freshmen.yml
@@ -2,22 +2,32 @@ name: "Freshmen"
 sceneByURL:
   - action: scrapeXPath
     url:
-      - freshmen.net/content/
+      - club.freshmen.net/secure/
     scraper: sceneScraper
 xPathScrapers:
   sceneScraper:
     scene:
       Title:
-        selector: //h1/span/text()
-        concat: " "
+        selector: //h1
+        postProcess:
+          - replace:
+              - regex: ^(.+)\s\(Issue\s#(\d+).+$
+                with: "Issue $2: $1"
       Details:
-        selector: //div[@class='contentTab']/div[@class='top']//p
+        selector: //div[@class='content_detail__first_col__player__more__description']//div/p
         concat: "\n\n"
-      Performers:
-        Name: //div[@class='actor']/div[@class='name']
-      Image:
-        selector: //*[@id="videoPlayer"]/@poster
+      Date:
+        selector: //div[@class='content_date']/text()
+        postProcess:
+          - parseDate: 01/02/2006
+      Image: //div[@class="player"]//img/@src | //div[@class="player"]//video/@poster
       Studio:
         Name:
           fixed: Freshmen
-# Last Updated June 26, 2022
+      Tags:
+        Name:
+          selector: //div[@class="wrapper tag_list"]/a/text()
+      Performers:
+        Name: //div[@class='actors_list__actor']//h3/text()
+
+# Last Updated October 01, 2024