Skip to content

Commit 50c2897

Browse files
authored
Merge pull request #2539 from spaceyuck/fandom-performer-scraper
[fandom.com] Fandom performer scraper
2 parents 6963885 + 8ec0448 commit 50c2897

File tree

1 file changed

+134
-0
lines changed

1 file changed

+134
-0
lines changed

scrapers/fandom-wiki.yml

Lines changed: 134 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,134 @@
1+
# Scraper definition named "fandom-wiki".
name: fandom-wiki
2+
# Performer-by-URL entry: URLs matched by the "fandom.com/" pattern are handled
# with the scrapeXPath action. The page URL is queried unchanged ("{url}") and
# the result is parsed by the "performerScraper" XPath definition.
performerByURL:
3+
- action: scrapeXPath
4+
url:
5+
- fandom.com/
6+
queryURL: "{url}"
7+
scraper: performerScraper
8+
9+
# XPath scraper definitions; "performerScraper" is the one referenced by performerByURL.
xPathScrapers:
10+
performerScraper:
11+
common:
12+
# Shared XPath prefix used by the field selectors as $infobox:
# the <aside> element whose class attribute contains "portable-infobox".
$infobox: //aside[contains(@class, 'portable-infobox')]
13+
# Field-by-field mapping for the scraped performer.
performer:
14+
# Performer name: text found inside the infobox's <h2> heading.
Name:
15+
selector: $infobox/h2//text()[1]
16+
# Aliases: text directly inside the value <div> of infobox rows whose data-source
# is one of the listed keys (full_name, full, also_known_as, known, aka) or
# contains "Name", "name", "alias" or "Alias".
# Note: unlike the other fields this uses /div/text() (direct children only),
# not /div//text().
Aliases:
17+
selector: $infobox//div[@data-source="full_name" or @data-source="full" or @data-source="also_known_as" or @data-source="known" or @data-source="aka" or contains(@data-source,"Name") or contains(@data-source,"name") or contains(@data-source,"alias") or contains(@data-source,"Alias")]/div/text()[1]
18+
# Gender: text of the infobox row whose data-source is "gender", "Gender" or "sex".
Gender:
19+
selector: $infobox//div[@data-source="gender" or @data-source="Gender" or @data-source="sex"]/div//text()[1]
20+
# Birth date: text of infobox rows whose data-source contains "Birth"/"birth"
# or equals "born", "dob" or "DOB".
# The post-processing chain is anchored as &postprocessDate so that DeathDate
# can reuse it through the *postprocessDate alias.
Birthdate:
21+
selector: $infobox//div[contains(@data-source,"Birth") or contains(@data-source,"birth") or @data-source="born" or @data-source="dob" or @data-source="DOB"]/div//text()[1]
22+
postProcess: &postprocessDate
23+
# there are many formats used, many only give partial info - some examples
24+
# September 26, 1981
25+
# June, 2077
26+
# January 7, 11942
27+
# Between 1 August 1950 and 31 July 1951
28+
# 2 May 1998 (aged 46/47)
29+
# 1997 [3] some more text
# NOTE(review): a month+year-only value such as "June, 2077" becomes "June 2077"
# and matches none of the rules below - confirm whether that case should be handled.
30+
- replace:
31+
# remove expressions in parentheses (likely only giving context)
32+
- regex: \([^)]*\)
33+
with: ""
34+
# remove commas ("September 26, 1981" -> "September 26 1981")
35+
- regex: "[,]"
36+
with: ""
37+
# remove estimation prefixes like "ca." (might be in the middle of date string, like for an unknown exact day)
38+
- regex: ^(.*\s+)?(?:c\.|ca\.)\s*(\d+.*)$
39+
with: "$1 $2"
40+
# remove ordinal suffixes from the day ("1st" -> "1", "2nd" -> "2", "3rd" -> "3", "4th" -> "4")
41+
- regex: "([0-9])(?:st|nd|rd|th)"
42+
with: "$1"
43+
# remove extra content around the first part with format "2 May 1998" -> "May 2 1998"
44+
- regex: ^(?:.*[^\d])?(\d{1,2})(?:\s+)(\w{3,})(?:\s+)(\d{4})(?:.*)?$
45+
with: $2 $1 $3
46+
# remove extra content around the first part with format "May 2 1998" -> "May 2 1998"
47+
- regex: ^(?:.*[^\w])?(\w{3,})(?:\s+)(\d{1,2})(?:\s+)(\d{4})(?:.*)?$
48+
with: $1 $2 $3
49+
# starts with a year -> expand to January first of that year
50+
- regex: ^(\d{4})(?:[^\d].*)?$
51+
with: January 1 $1
52+
# parse the normalized "Month D YYYY" string using the layout "January 2 2006"
- parseDate: January 2 2006
53+
# Death date: text of infobox rows whose data-source contains "Death"/"death"
# or equals "died", "dod" or "DOD"; normalized with the same chain as Birthdate
# via the *postprocessDate alias.
DeathDate:
54+
selector: $infobox//div[contains(@data-source,"Death") or contains(@data-source,"death") or @data-source="died" or @data-source="dod" or @data-source="DOD"]/div//text()[1]
55+
postProcess: *postprocessDate
56+
# Ethnicity: filled from the "species" row or any row whose data-source
# contains "Race" or "race".
Ethnicity:
57+
selector: $infobox//div[@data-source="species" or contains(@data-source,"Race") or contains(@data-source,"race")]/div//text()[1]
58+
# Country: text of the infobox row whose data-source is exactly "nationality".
Country:
59+
selector: $infobox//div[@data-source="nationality"]/div//text()[1]
60+
# Hair color: text of infobox rows whose data-source contains "hair" or "Hair".
HairColor:
61+
selector: $infobox//div[contains(@data-source,"hair") or contains(@data-source,"Hair")]/div//text()[1]
62+
# Eye color: text of infobox rows whose data-source contains "eye" or "Eye".
EyeColor:
63+
selector: $infobox//div[contains(@data-source,"eye") or contains(@data-source,"Eye")]/div//text()[1]
64+
# Height: all text nodes of the infobox row with data-source "height" are joined
# with spaces, then reduced by the replace rules below to either a bare
# centimeter number ("163") or a feet'inches string ("5'4") before feetToCm runs.
Height:
65+
selector: $infobox//div[@data-source="height"]/div//text()
66+
concat: " "
67+
postProcess:
68+
# there are many formats used, a lot of them use multi-unit formats, or even multiple values - some examples
69+
# 163 cm (5'4")[3]
70+
# 5'3 (1.60m)
71+
# 171 cm (5'7½")
72+
# 171 cm (5'7.3"), according to source abc
73+
# 5 feet 3 inches (160cm)[3]
74+
# 163cm[1] (5'4")[note 1]
75+
# 168 cm (with heels)
76+
- replace:
77+
# remove citation markers like "[3]"
78+
- regex: "\\[[ ]*[0-9]+[ ]*\\]"
79+
with: ""
80+
# height in meters, pulls out as centimeters number ("1.63 m (5'4")" -> "163")
# (fixed: a stray "]" after the two-decimal group made this rule require a
# literal "]" in the text, so it never matched values like "1.60m")
81+
- regex: ^(?:.*[^0-9.])?([0-9]+)\.([0-9]{2})[ ]?(?:m|M)(?:.*)?$
82+
with: "$1$2"
83+
# height in centimeters, pulls out just the number ("163 cm (5'4")" -> "163")
84+
- regex: ^(?:.*[^0-9])?([0-9]{3})[ ]?(?:cm|CM)(?:.*)?$
85+
with: "$1"
86+
# pull out any three digit number from string (to catch those in cm without units)
87+
- regex: ^(?:.*[^0-9])?([0-9]{3})(?:.*)?$
88+
with: "$1"
89+
# reduce any variant of imperial to predictable format ("5' 4"" -> "5'4")
# (fixed: inches may have two digits, so 5'10" and 5'11" are no longer
# truncated to 5'1)
90+
- regex: ^(?:.*[^0-9])?([0-9])\s*(?:'|ft|ft.|feet|foot)\s*([0-9]{1,2})(?:.*)?$
91+
with: "$1'$2"
92+
# NOTE(review): the metric rules above already yield a centimeter number;
# confirm that feetToCm leaves such values unchanged and only converts
# the feet'inches form.
- feetToCm: true
93+
# Weight: text of the infobox row with data-source "weight", reduced to a bare
# kilogram number ("58") or a pound value with unit ("127lb") before lbToKg runs.
Weight:
94+
selector: $infobox//div[@data-source="weight"]/div//text()[1]
95+
postProcess:
96+
# there are many formats used, a lot of them use multi-unit formats, or even multiple values - some examples
97+
# 127 lb (58kg)
98+
# 127 lb
99+
# 58 kg (127lb)
100+
# 58.2 kg (127.3 lb)
101+
# 58 kg
102+
- replace:
103+
# in kg, pull out just the number
104+
- regex: ^(?:.*[^0-9.])?([0-9]{2,3})(?:\.[0-9]+)?[ ]?(?:kg|Kg|KG)(?:.*)?$
105+
with: "$1"
106+
# in lb, reduce to number and unit
# (fixed: a stray "}" after the digit group made this rule require a
# literal "}" in the text, so it never matched)
107+
- regex: ^(?:.*[^0-9.])?([0-9]{2,3})(?:\.[0-9]+)?[ ]?(?:lb|Lb|LB)(?:.*)?$
108+
# "${1}lb" rather than "$1lb": in Go regexp replacement templates "$1lb" is
# read as the (non-existent) group named "1lb" and expands to an empty string
with: "${1}lb"
109+
# convert pounds to kilograms, if imperial
# NOTE(review): confirm what input lbToKg expects - whether it accepts the
# "lb" suffix produced above and leaves bare kilogram numbers unchanged.
110+
- lbToKg: true
111+
# Measurements: text of the "measurements", "bust", "waist" and "hips" infobox
# rows, joined with spaces and reduced to "B-W-H" numbers by the rules below.
Measurements:
112+
# looks for one field with all values, or split fields (get combined into single string)
113+
# split fields really only work correctly if the field order is still B-W-H
114+
selector: $infobox//div[@data-source="measurements" or @data-source="bust" or @data-source="waist" or @data-source="hips"]/div//text()[1]
115+
concat: " "
116+
postProcess:
117+
# there are probably many formats used - examples of seen formats
118+
# B93 W59 H88 cm (B37" W23" H35")
119+
# separate fields with <number> cm
120+
- replace:
121+
# remove citation markers like "[3]"
122+
- regex: "\\[[ ]*[0-9]+[ ]*\\]"
123+
with: ""
124+
# remove expressions in parentheses (to remove noise from potential split fields)
125+
- regex: \([^)]*\)
126+
with: ""
127+
# reduce to B-W-H removing prefixes and units ("B93 W59 H88 cm" -> "93-59-88")
128+
- regex: ^(?:.*B|.*[^0-9])?([0-9]{2,})\s*(?:cm|")?[\s\-]+W?([0-9]{2,})\s*(?:cm|")?[\s\-]+H?([0-9]{2,})\s*(?:cm|")?.*$
129+
with: "$1-$2-$3"
130+
# Tags: tag names are read from the data-name attribute of <li> items whose
# class contains "category" inside the <nav id="articleCategories"> element.
Tags:
131+
Name:
132+
selector: //nav[@id="articleCategories"]//li[contains(@class, "category")]/@data-name
133+
# Image: src attribute of <img> elements found inside <figure> elements of the infobox.
Image:
134+
selector: $infobox//figure//img/@src

0 commit comments

Comments
 (0)