Skip to content

Commit 8ec0448

Browse files
authored
[Fandom][
Use [1] instead of regex-ing out citation markers, use built-in functions for imperial -> metric conversion
1 parent cbf92ba commit 8ec0448

File tree

1 file changed

+13
-82
lines changed

1 file changed

+13
-82
lines changed

scrapers/fandom-wiki.yml

Lines changed: 13 additions & 82 deletions
Original file line numberDiff line numberDiff line change
@@ -12,33 +12,13 @@ xPathScrapers:
1212
$infobox: //aside[contains(@class, 'portable-infobox')]
1313
performer:
1414
Name:
15-
selector: $infobox/h2//text()
16-
postProcess:
17-
- replace:
18-
# remove citation markers like "[3]"
19-
- regex: "\\[[ ]*[0-9]+[ ]*\\]"
20-
with: ""
15+
selector: $infobox/h2//text()[1]
2116
Aliases:
22-
selector: $infobox//div[@data-source="full_name" or @data-source="full" or @data-source="also_known_as" or @data-source="known" or @data-source="aka" or contains(@data-source,"Name") or contains(@data-source,"name") or contains(@data-source,"alias") or contains(@data-source,"Alias")]/div/text()
23-
postProcess:
24-
- replace:
25-
# remove citation markers like "[3]"
26-
- regex: "\\[[ ]*[0-9]+[ ]*\\]"
27-
with: ""
17+
selector: $infobox//div[@data-source="full_name" or @data-source="full" or @data-source="also_known_as" or @data-source="known" or @data-source="aka" or contains(@data-source,"Name") or contains(@data-source,"name") or contains(@data-source,"alias") or contains(@data-source,"Alias")]/div/text()[1]
2818
Gender:
29-
selector: $infobox//div[@data-source="gender" or @data-source="Gender" or @data-source="sex"]/div//text()
30-
concat: " "
31-
postProcess:
32-
- replace:
33-
# remove citation markers like "[3]"
34-
- regex: "\\[[ ]*[0-9]+[ ]*\\]"
35-
with: ""
36-
# reduce to single word
37-
- regex: ^([^\s]+).*
38-
with: "$1"
19+
selector: $infobox//div[@data-source="gender" or @data-source="Gender" or @data-source="sex"]/div//text()[1]
3920
Birthdate:
40-
selector: $infobox//div[contains(@data-source,"Birth") or contains(@data-source,"birth") or @data-source="born" or @data-source="dob" or @data-source="DOB"]/div//text()
41-
concat: " "
21+
selector: $infobox//div[contains(@data-source,"Birth") or contains(@data-source,"birth") or @data-source="born" or @data-source="dob" or @data-source="DOB"]/div//text()[1]
4222
postProcess: &postprocessDate
4323
# there are many formats used, many only give partial info - some examples
4424
# September 26, 1981
@@ -48,9 +28,6 @@ xPathScrapers:
4828
# 2 May 1998 (aged 46/47)
4929
# 1997 [3] some more text
5030
- replace:
51-
# remove citation markers like "[3]"
52-
- regex: "\\[[ ]*[0-9]+[ ]*\\]"
53-
with: ""
5431
# remove expressions in parentheses (likely only giving context)
5532
- regex: \([^)]*\)
5633
with: ""
@@ -74,37 +51,16 @@ xPathScrapers:
7451
with: January 1 $1
7552
- parseDate: January 2 2006
7653
DeathDate:
77-
selector: $infobox//div[contains(@data-source,"Death") or contains(@data-source,"death") or @data-source="died" or @data-source="dod" or @data-source="DOD"]/div//text()
78-
concat: " "
54+
selector: $infobox//div[contains(@data-source,"Death") or contains(@data-source,"death") or @data-source="died" or @data-source="dod" or @data-source="DOD"]/div//text()[1]
7955
postProcess: *postprocessDate
8056
Ethnicity:
81-
selector: $infobox//div[@data-source="species" or contains(@data-source,"Race") or contains(@data-source,"race")]/div//text()
82-
postProcess:
83-
- replace:
84-
# remove citation markers like "[3]"
85-
- regex: "\\[[ ]*[0-9]+[ ]*\\]"
86-
with: ""
57+
selector: $infobox//div[@data-source="species" or contains(@data-source,"Race") or contains(@data-source,"race")]/div//text()[1]
8758
Country:
88-
selector: $infobox//div[@data-source="nationality"]/div//text()
89-
postProcess:
90-
- replace:
91-
# remove citation markers like "[3]"
92-
- regex: "\\[[ ]*[0-9]+[ ]*\\]"
93-
with: ""
59+
selector: $infobox//div[@data-source="nationality"]/div//text()[1]
9460
HairColor:
95-
selector: $infobox//div[contains(@data-source,"hair") or contains(@data-source,"Hair")]/div//text()
96-
postProcess:
97-
- replace:
98-
# remove citation markers like "[3]"
99-
- regex: "\\[[ ]*[0-9]+[ ]*\\]"
100-
with: ""
61+
selector: $infobox//div[contains(@data-source,"hair") or contains(@data-source,"Hair")]/div//text()[1]
10162
EyeColor:
102-
selector: $infobox//div[contains(@data-source,"eye") or contains(@data-source,"Eye")]/div//text()
103-
postProcess:
104-
- replace:
105-
# remove citation markers like "[3]"
106-
- regex: "\\[[ ]*[0-9]+[ ]*\\]"
107-
with: ""
63+
selector: $infobox//div[contains(@data-source,"eye") or contains(@data-source,"Eye")]/div//text()[1]
10864
Height:
10965
selector: $infobox//div[@data-source="height"]/div//text()
11066
concat: " "
@@ -133,21 +89,9 @@ xPathScrapers:
13389
# reduce any variant of imperial to predictable format ("5' 4"" -> "5'4")
13490
- regex: ^(?:.*[^0-9])?([0-9])\s*(?:'|ft|ft.|feet|foot)\s*([0-9])(?:.*)?$
13591
with: "$1'$2"
136-
# use JS to convert imperial to metric, if imperial
137-
- javascript: |
138-
if (value && value.length) {
139-
const match = /^(\d)'(\d)$/.exec(value);
140-
if (match) {
141-
const feet = parseInt(match[1]);
142-
const inches = parseInt(match[2]);
143-
const imperialHeight = feet + inches / 12.0;
144-
return (imperialHeight * 30.48).toFixed(0);
145-
}
146-
}
147-
return value;
92+
- feetToCm: true
14893
Weight:
149-
selector: $infobox//div[@data-source="weight"]/div//text()
150-
concat: " "
94+
selector: $infobox//div[@data-source="weight"]/div//text()[1]
15195
postProcess:
15296
# there are many formats used, a lot of them use multi-unit formats, or even multiple values - some examples
15397
# 127 lb (58kg)
@@ -156,31 +100,18 @@ xPathScrapers:
156100
# 58.2 kg (127.3 lb)
157101
# 58 kg
158102
- replace:
159-
# remove citation markers like "[3]"
160-
- regex: "\\[[ ]*[0-9]+[ ]*\\]"
161-
with: ""
162103
# in kg, pull out just the number
163104
- regex: ^(?:.*[^0-9.])?([0-9]{2,3})(?:\.[0-9]+)?[ ]?(?:kg|Kg|KG)(?:.*)?$
164105
with: "$1"
165106
# in lb, reduce to number and unit
166107
- regex: ^(?:.*[^0-9.])?([0-9]{2,3})(?:\.[0-9]+)?[ ]?(?:lb|Lb|LB)(?:.*)?$
167108
with: "$1lb"
168109
# convert imperial to metric, if imperial
169-
- javascript: |
170-
if (value && value.length) {
171-
console.log('weight has value');
172-
const match = /^(\d+)lb$/.exec(value);
173-
if (match) {
174-
console.log('wight lbs regexp match');
175-
const imperialWight = parseInt(match[1]);
176-
return (imperialWight * 2.20462).toFixed(0);
177-
}
178-
}
179-
return value;
110+
- lbToKg: true
180111
Measurements:
181112
# looks for one field with all values, or split fields (get combined into single string)
182113
# split fields really only work corretly if the field order is still B-W-H
183-
selector: $infobox//div[@data-source="measurements" or @data-source="bust" or @data-source="waist" or @data-source="hips"]/div//text()
114+
selector: $infobox//div[@data-source="measurements" or @data-source="bust" or @data-source="waist" or @data-source="hips"]/div//text()[1]
184115
concat: " "
185116
postProcess:
186117
# there are probably many formats used - examples of seen formats

0 commit comments

Comments
 (0)