Skip to content

Commit 3960a6c

Browse files
committed
Scrape ModelMediaAsia from the API instead of using XPath
Fixes #2163
1 parent a0105ed commit 3960a6c

File tree

1 file changed

+98
-59
lines changed

1 file changed

+98
-59
lines changed

scrapers/ModelMediaAsia.yml

+98-59
Original file line number | Diff line number | Diff line change
@@ -1,14 +1,27 @@
11
name: "ModelMediaAsia"
22
sceneByURL:
3-
- action: scrapeXPath
3+
- action: scrapeJson
44
url:
55
- modelmediaasia.com/en-US/videos/
6+
queryURL: https://api.modelmediaasia.com/api/v2/videos/{url}
7+
queryURLReplace:
8+
url:
9+
- regex: .*\/videos\/([^?]*).*
10+
with: $1
11+
scraper: apiScraper_en
12+
- action: scrapeJson
13+
url:
614
- modelmediaasia.com/zh-CN/videos/
7-
scraper: sceneScraper
15+
queryURL: https://api.modelmediaasia.com/api/v2/videos/{url}
16+
queryURLReplace:
17+
url:
18+
- regex: .*\/videos\/([^?]*).*
19+
with: $1
20+
scraper: apiScraper_cn
821

922
sceneByFragment:
1023
action: scrapeJson # apiScraper_en is declared under jsonScrapers and queryURL targets the JSON API
11-
queryURL: https://modelmediaasia.com/videos/{filename}
24+
queryURL: https://api.modelmediaasia.com/api/v2/videos/{filename}
1225
queryURLReplace:
1326
# Assume beginning part contains the code
1427
filename:
@@ -17,74 +30,100 @@ sceneByFragment:
1730
with: $1
1831
- regex: .*\.[^\.]+$ # if no id is found in the filename
1932
with: # clear the filename so that it doesn't leak
20-
scraper: sceneScraper
33+
scraper: apiScraper_en
2134

2235
performerByURL:
2336
- action: scrapeJson # apiScraper_en is declared under jsonScrapers, so the JSON action is required
2437
url:
2538
- modelmediaasia.com/en-US/models/
39+
queryURL: https://api.modelmediaasia.com/api/v2/models/{url}
40+
queryURLReplace:
41+
url:
42+
- regex: .*\/models\/([^?]*).*
43+
with: $1
44+
scraper: apiScraper_en
45+
- action: scrapeJson # apiScraper_cn is declared under jsonScrapers, so the JSON action is required
46+
url:
2647
- modelmediaasia.com/zh-CN/models/
27-
scraper: performerScraper
48+
queryURL: https://api.modelmediaasia.com/api/v2/models/{url}
49+
queryURLReplace:
50+
url:
51+
- regex: .*\/models\/([^?]*).*
52+
with: $1
53+
scraper: apiScraper_cn
54+
55+
jsonScrapers:
56+
apiScraper_en:
57+
performer:
58+
Name: data.name
59+
Aliases: data.name_cn
60+
Gender: data.gender
61+
Ethnicity:
62+
fixed: Asian
63+
Image: data.avatar
2864

29-
xPathScrapers:
30-
sceneScraper:
31-
common:
32-
$detailspart: //div[@class="details-part"]
3365
scene:
34-
Title: $detailspart//h2
35-
Date:
36-
# Some text may be in front of the date
37-
selector: $detailspart//span[contains(@class, "trending-year")]
66+
Title: data.title
67+
Code: &code data.serial_number
68+
Date: &date
69+
selector: data.published_at
3870
postProcess:
3971
- replace:
40-
- regex: '.*(\d{4}/\d{1,2}/\d{1,2})'
41-
with: $1
42-
- parseDate: 2006/01/02
43-
Details:
44-
selector: //div[@id="description-01"]/div/p/text()
45-
Performers:
46-
Name: $detailspart//div[@class="content-details trending-info"][2]//h6/text()
47-
# workaround for including URL
48-
# URL arrays don't work https://github.com/stashapp/stash/issues/5294
49-
# Details:
50-
# selector: //*[@id="__nuxt"]/main/div/div[2]/div/div/div/div[3]/div/ul/li/a/@href
51-
# postProcess:
52-
# - replace:
53-
# - regex: ^
54-
# with: https://modelmediaasia.com
72+
# Remove milliseconds
73+
- regex: (\d{10}).*
74+
with: $1
75+
- parseDate: unix
76+
Details: data.description
77+
Image: data.cover
78+
Tags: &tags
79+
Name: data.tags.#.name
5580
Studio:
5681
Name:
57-
fixed: "Model Media"
58-
Tags:
59-
Name: //ul[contains(@class, "iq-blogtag")]/li/a/text()
60-
Image: //div[@class="iq-main-slider site-video"]//div[contains(@class,"object-cover")]/img/@src
61-
Code: //div[@class="details-part"]//a[contains(@class,"text-capitalize")]/text()
62-
performerScraper:
63-
common:
64-
$infobox: //div[@id="__nuxt"]/main//div[@class="flex flex-col gap-3 my-5"]
82+
fixed: Model Media Asia
83+
Performers:
84+
Name: data.models.#.name
85+
Aliases: data.models.#.name_cn
86+
Image: data.models.#.avatar
87+
Gender: data.models.#.gender
88+
Ethnicity:
89+
fixed: Asian
90+
URL:
91+
selector: data.models.#.id
92+
postProcess:
93+
- replace:
94+
- regex: ^
95+
with: https://modelmediaasia.com/en-US/models/
96+
97+
apiScraper_cn:
6598
performer:
66-
Name: //div[@id="__nuxt"]/main//div[contains(@class,"text-white")]/div/h4[1]/text()
67-
Gender:
68-
fixed: Female
99+
Name: data.name_cn
100+
Aliases: data.name
101+
Gender: data.gender
69102
Ethnicity:
70103
fixed: Asian
71-
Height:
72-
selector: $infobox/p[1]/text()
73-
postProcess:
74-
- replace:
75-
- regex: '(\d+)\D+cm'
76-
with: $1
77-
Weight:
78-
selector: $infobox/p[2]/text()
79-
postProcess:
80-
- replace:
81-
- regex: '(\d+)\D+kg'
82-
with: $1
83-
Measurements:
84-
selector: $infobox/p[3]/text()
85-
postProcess:
86-
- replace:
87-
- regex: '\s*(\d+\w*)\D+(\d+)\D+(\d+)'
88-
with: $1-$2-$3
89-
Image: //img[@class="w-full aspect-[3/4]"]/@src
90-
# Last Updated October 26, 2024
104+
Image: data.avatar
105+
106+
scene:
107+
Title: data.title_cn
108+
Code: *code
109+
Date: *date
110+
Details: data.description_cn
111+
Image: data.cover
112+
Tags: *tags
113+
Studio:
114+
Name:
115+
fixed: 麻豆傳媒映畫
116+
Performers:
117+
Name: data.models.#.name_cn
118+
Aliases: data.models.#.name
119+
Image: data.models.#.avatar
120+
Gender: data.models.#.gender
121+
Ethnicity:
122+
fixed: Asian
123+
URL:
124+
selector: data.models.#.id
125+
postProcess:
126+
- replace:
127+
- regex: ^
128+
with: https://modelmediaasia.com/zh-CN/models/
129+
# Last Updated January 16, 2025

0 commit comments

Comments
 (0)