|
| 1 | +name: fandom-wiki |
| 2 | +performerByURL: |
| 3 | + - action: scrapeXPath |
| 4 | + url: |
| 5 | + - fandom.com/ |
| 6 | + queryURL: "{url}" |
| 7 | + scraper: performerScraper |
| 8 | + |
| 9 | +xPathScrapers: |
| 10 | + performerScraper: |
| 11 | + common: |
| 12 | + $infobox: //aside[contains(@class, 'portable-infobox')] |
| 13 | + performer: |
| 14 | + Name: |
| 15 | + selector: $infobox/h2//text()[1] |
| 16 | + Aliases: |
| 17 | + selector: $infobox//div[@data-source="full_name" or @data-source="full" or @data-source="also_known_as" or @data-source="known" or @data-source="aka" or contains(@data-source,"Name") or contains(@data-source,"name") or contains(@data-source,"alias") or contains(@data-source,"Alias")]/div/text()[1] |
| 18 | + Gender: |
| 19 | + selector: $infobox//div[@data-source="gender" or @data-source="Gender" or @data-source="sex"]/div//text()[1] |
| 20 | + Birthdate: |
| 21 | + selector: $infobox//div[contains(@data-source,"Birth") or contains(@data-source,"birth") or @data-source="born" or @data-source="dob" or @data-source="DOB"]/div//text()[1] |
| 22 | + postProcess: &postprocessDate |
| 23 | + # there are many formats used, many only give partial info - some examples |
| 24 | + # September 26, 1981 |
| 25 | + # June, 2077 |
| 26 | + # January 7, 11942 |
| 27 | + # Between 1 August 1950 and 31 July 1951 |
| 28 | + # 2 May 1998 (aged 46/47) |
| 29 | + # 1997 [3] some more text |
| 30 | + - replace: |
| 31 | + # remove expressions in parentheses (likely only giving context) |
| 32 | + - regex: \([^)]*\) |
| 33 | + with: "" |
| 34 | + # remove commas ("September 26, 1981" -> "September 26 1981") |
| 35 | + - regex: "[,]" |
| 36 | + with: "" |
| 37 | + # remove estimation prefixes like "ca." (might be in the middle of date string, like for an unknown exact day) |
| 38 | + - regex: ^(.*\s+)?(?:c\.|ca\.)\s*(\d+.*)$ |
| 39 | + with: "$1 $2" |
| 40 | + # remove ordinal suffixes from day ("1st" -> "1", "2nd" -> "2", "3rd" -> "3", "4th" -> "4") |
| 41 | + - regex: "([0-9])(?:st|nd|rd|th)" |
| 42 | + with: "$1" |
| 43 | + # remove extra content around first part with format "2 May 1998" -> "May 2 1998" |
| 44 | + - regex: ^(?:.*[^\d])?(\d{1,2})(?:\s+)(\w{3,})(?:\s+)(\d{4})(?:.*)?$ |
| 45 | + with: $2 $1 $3 |
| 46 | + # remove extra content around first part with format "May 2 1998" -> "May 2 1998" |
| 47 | + - regex: ^(?:.*[^\w])?(\w{3,})(?:\s+)(\d{1,2})(?:\s+)(\d{4})(?:.*)?$ |
| 48 | + with: $1 $2 $3 |
| 49 | + # starts with a year -> expand to January 1st of that year |
| 50 | + - regex: ^(\d{4})(?:[^\d].*)?$ |
| 51 | + with: January 1 $1 |
| 52 | + - parseDate: January 2 2006 |
| 53 | + DeathDate: |
| 54 | + selector: $infobox//div[contains(@data-source,"Death") or contains(@data-source,"death") or @data-source="died" or @data-source="dod" or @data-source="DOD"]/div//text()[1] |
| 55 | + postProcess: *postprocessDate |
| 56 | + Ethnicity: |
| 57 | + selector: $infobox//div[@data-source="species" or contains(@data-source,"Race") or contains(@data-source,"race")]/div//text()[1] |
| 58 | + Country: |
| 59 | + selector: $infobox//div[@data-source="nationality"]/div//text()[1] |
| 60 | + HairColor: |
| 61 | + selector: $infobox//div[contains(@data-source,"hair") or contains(@data-source,"Hair")]/div//text()[1] |
| 62 | + EyeColor: |
| 63 | + selector: $infobox//div[contains(@data-source,"eye") or contains(@data-source,"Eye")]/div//text()[1] |
| 64 | + Height: |
| 65 | + selector: $infobox//div[@data-source="height"]/div//text() |
| 66 | + concat: " " |
| 67 | + postProcess: |
| 68 | + # there are many formats used, a lot of them use multi-unit formats, or even multiple values - some examples |
| 69 | + # 163 cm (5'4")[3] |
| 70 | + # 5'3 (1.60m) |
| 71 | + # 171 cm (5'7½") |
| 72 | + # 171 cm (5'7.3"), according to source abc |
| 73 | + # 5 feet 3 inches (160cm)[3] |
| 74 | + # 163cm[1] (5'4")[note 1] |
| 75 | + # 168 cm (with heels) |
| 76 | + - replace: |
| 77 | + # remove citation markers like "[3]" |
| 78 | + - regex: "\\[[ ]*[0-9]+[ ]*\\]" |
| 79 | + with: "" |
| 80 | + # height in meters, pulls out as centimeters number ("1.63 m (5'4")" -> "163") |
| 81 | + - regex: ^(?:.*[^0-9.])?([0-9]+)\.([0-9]{2})[ ]?(?:m|M)(?:.*)?$ |
| 82 | + with: "$1$2" |
| 83 | + # height in centimeters, pulls out just the number ("163 cm (5'4")" -> "163") |
| 84 | + - regex: ^(?:.*[^0-9])?([0-9]{3})[ ]?(?:cm|CM)(?:.*)?$ |
| 85 | + with: "$1" |
| 86 | + # pull out any three digit number from string (to catch those in cm without units) |
| 87 | + - regex: ^(?:.*[^0-9])?([0-9]{3})(?:.*)?$ |
| 88 | + with: "$1" |
| 89 | + # reduce any variant of imperial to predictable format ("5' 4"" -> "5'4") |
| 90 | + - regex: ^(?:.*[^0-9])?([0-9])\s*(?:'|ft\.?|feet|foot)\s*([0-9])(?:.*)?$ |
| 91 | + with: "$1'$2" |
| 92 | + - feetToCm: true |
| 93 | + Weight: |
| 94 | + selector: $infobox//div[@data-source="weight"]/div//text()[1] |
| 95 | + postProcess: |
| 96 | + # there are many formats used, a lot of them use multi-unit formats, or even multiple values - some examples |
| 97 | + # 127 lb (58kg) |
| 98 | + # 127 lb |
| 99 | + # 58 kg (127lb) |
| 100 | + # 58.2 kg (127.3 lb) |
| 101 | + # 58 kg |
| 102 | + - replace: |
| 103 | + # in kg, pull out just the number |
| 104 | + - regex: ^(?:.*[^0-9.])?([0-9]{2,3})(?:\.[0-9]+)?[ ]?(?:kg|Kg|KG)(?:.*)?$ |
| 105 | + with: "$1" |
| 106 | + # in lb, reduce to number and unit |
| 107 | + - regex: ^(?:.*[^0-9.])?([0-9]{2,3})(?:\.[0-9]+)?[ ]?(?:lb|Lb|LB)(?:.*)?$ |
| 108 | + with: "$1lb" |
| 109 | + # use JS to convert imperial to metric, if imperial |
| 110 | + - lbToKg: true |
| 111 | + Measurements: |
| 112 | + # looks for one field with all values, or split fields (get combined into single string) |
| 113 | + # split fields really only work correctly if the field order is still B-W-H |
| 114 | + selector: $infobox//div[@data-source="measurements" or @data-source="bust" or @data-source="waist" or @data-source="hips"]/div//text()[1] |
| 115 | + concat: " " |
| 116 | + postProcess: |
| 117 | + # there are probably many formats used - examples of seen formats |
| 118 | + # B93 W59 H88 cm (B37" W23" H35") |
| 119 | + # separate fields with <number> cm |
| 120 | + - replace: |
| 121 | + # remove citation markers like "[3]" |
| 122 | + - regex: "\\[[ ]*[0-9]+[ ]*\\]" |
| 123 | + with: "" |
| 124 | + # remove expressions in parentheses (to remove noise from potential split fields) |
| 125 | + - regex: \([^)]*\) |
| 126 | + with: "" |
| 127 | + # reduce to B-W-H removing prefixes and units ("B93 W59 H88 cm" -> "93-59-88") |
| 128 | + - regex: ^(?:.*B|.*[^0-9])?([0-9]{2,})\s*(?:cm|")?[\s\-]+W?([0-9]{2,})\s*(?:cm|")?[\s\-]+H?([0-9]{2,})\s*(?:cm|")?.*$ |
| 129 | + with: "$1-$2-$3" |
| 130 | + Tags: |
| 131 | + Name: |
| 132 | + selector: //nav[@id="articleCategories"]//li[contains(@class, "category")]/@data-name |
| 133 | + Image: |
| 134 | + selector: $infobox//figure//img/@src |
0 commit comments