-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathretrieval_patterns.json
68 lines (68 loc) · 2.02 KB
/
retrieval_patterns.json
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
{
"general": {
"css_selectors": {
"a[href*=dataset]": "dataset",
"a[href*=data]": "dataset",
"a[href*=supplementary]": "supplementary",
"a[href*=pdf]": "supplementary"
},
"xpaths": {
},
"bad_patterns": [
],
"skip_llm_classification_patterns": {
"scholar.google.com": "Related Works"
}
},
"nature": {
"css_selectors": {
".additional-information a": "supplementary",
"#data-availability-content a": "supplementary",
"#additional-information-section a": "supplementary",
"[data-title~=Supplementary] a": "supplementary",
"[data-title~=Data] a": "dataset"
},
"xpaths": {
"//h2[contains(text(), 'Source Data')]": "dataset"
}
},
"sciencedirect": {
"css_selectors": {
".Appendices a": "dataset",
"#da0010 a": "dataset"
},
"xpaths": {
"//h2[contains(text(), 'Data and code availability')]": "dataset",
"//h4[contains(text(), 'Data and code availability')]": "dataset"
},
"bad_patterns": [
"/supporthub/"
]
},
"thelancet": {
"css_selectors": {
".article__sections .section-paragraph a[href*='dataset']": "dataset",
".article__sections .refs .ref__title a[href*='dataset']": "dataset",
".supplemental-information__item a": "supplementary"
}
},
"PMC": {
"css_selectors": {
"CHANGE HERE!!": "data availability"
},
"xpaths": {
"data_availability": {
"//section[h2[contains(text(), 'Data Availability')]]": "dataset",
"//section[h3[contains(text(), 'Data Availability')]]": "dataset",
"following-sibling::p[1]": "dataset",
"//section[h2[contains(text(), 'Footnotes')]]": "dataset"
},
"supplementary_data": {
"//section[h2[contains(text(), 'Supplementary Material')]]": "supplementary",
"//section[h2[contains(text(), 'Extended Data')]]": "supplementary",
"//section[h3[contains(text(), 'Clinical Data')]]": "supplementary"
}
}
}
}