Commit 41198c6

Merge pull request #3 from dataforgoodfr/feat/multi_step_scrapping
Feat/multi step scrapping
2 parents 6ccdf3c + dd2fb2a commit 41198c6

File tree

9 files changed: +1665 −132 lines changed

.gitignore

Lines changed: 2 additions & 2 deletions
@@ -4,7 +4,7 @@
 # https://www.atlassian.com/git/tutorials/saving-changes/gitignore

 .env
-data/*
+data/.ipynb_checkpoints/


 # Node artifact files
@@ -53,7 +53,7 @@ Thumbs.db
 *.wmv

 *.pyc
-notebooks/.ipynb_checkpoints
+notebooks/**/.ipynb_checkpoints/
 .env
 .env
 .venv

climateguard/gdelt_scrapper.py

Lines changed: 76 additions & 0 deletions
@@ -0,0 +1,76 @@
from urllib.request import urlopen
import pandas as pd
import gdeltdoc as gdelt
import functools
import itertools
from pathlib import Path


class GDELTScrapper:
    THEMES_URL = "http://data.gdeltproject.org/api/v2/guides/LOOKUP-GKGTHEMES.TXT"

    @functools.cached_property
    def themes_df(self) -> pd.DataFrame:
        # Fetch the GKG theme lookup table using urllib
        with urlopen(self.THEMES_URL) as response:
            data = response.read().decode()

        # Split the data into lines
        lines = data.strip().split("\n")

        # Split each tab-separated line into key-value pairs
        rows = [line.split("\t") for line in lines]

        # Create a DataFrame from the rows
        df = pd.DataFrame(rows, columns=["theme", "count"])
        df["count"] = df["count"].astype(int)

        return df

    def find_themes_related_to_keyword(self, keyword: str) -> list[str]:
        return self.themes_df[
            self.themes_df["theme"].str.contains(keyword, case=False)
        ]["theme"].to_list()

    def find_articles(self, themes: list[str], years: list[int]) -> pd.DataFrame:
        partial_articles_dfs = []

        gd = gdelt.GdeltDoc()
        for theme, year in itertools.product(themes, years):
            f = gdelt.Filters(
                # keyword="climate change",
                start_date=f"{year}-01-01",
                end_date=f"{year}-12-31",
                theme=theme,
                country="LG",  # Latvia
            )

            partial_articles_df = gd.article_search(f)
            print(f"{len(partial_articles_df)} articles found for theme {theme}, in {year}")
            partial_articles_dfs.append(partial_articles_df)

        articles_df = pd.concat(partial_articles_dfs)

        # Keep only Latvian-language articles and parse the seen date
        articles_df = articles_df[articles_df["language"] == "Latvian"]
        articles_df["seendate"] = pd.to_datetime(articles_df["seendate"])

        print(f"Deleting {articles_df['url'].duplicated().sum()} duplicates")
        articles_df = articles_df.drop_duplicates("url")
        print(f"{len(articles_df)} unique articles found")
        return articles_df


# Usage example:
if __name__ == "__main__":
    scraper = GDELTScrapper()

    # Find themes related to climate
    themes = scraper.find_themes_related_to_keyword("CLIMATE")
    print(f"Themes related to climate: {themes}")

    # Find articles for these themes and year range
    articles_df = scraper.find_articles(themes=themes, years=[2022, 2023, 2024])

    # This can be used as input for NewsScraper
    article_urls = articles_df["url"].to_list()

    # Save dataframe to a csv file
    file_path = Path(__file__).parent.parent / "data/latvian_article_links.csv"
    articles_df.to_csv(file_path)
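
The "input for NewsScraper" comment points at a downstream step that is not part of this diff. A minimal sketch of that hand-off, assuming the CSV written above and a hypothetical NewsScraper class, could look like this:

# Sketch only: reading back the links produced by gdelt_scrapper.py.
# The NewsScraper import and its scrape(url) method are assumptions for
# illustration; they are not introduced by this commit.
import pandas as pd
from pathlib import Path

links_path = Path(__file__).parent.parent / "data/latvian_article_links.csv"
article_urls = pd.read_csv(links_path)["url"].to_list()

# A downstream scraper (hypothetical) could then fetch each article page:
# from climateguard.news_scraper import NewsScraper
# news_scraper = NewsScraper()
# for url in article_urls:
#     news_scraper.scrape(url)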
