1+ from urllib .request import urlopen
2+ import pandas as pd
3+ import gdeltdoc as gdelt
4+ import functools
5+ import itertools
6+ from pathlib import Path
7+
class GDELTScrapper:
    """Fetch GDELT GKG theme metadata and search GDELT for matching articles.

    Uses the public GKG theme lookup table to discover theme codes, then
    queries the GDELT DOC API (via the ``gdeltdoc`` package) for articles.
    """

    # Official GDELT lookup table: one "<THEME>\t<record count>" pair per line.
    THEMES_URL = "http://data.gdeltproject.org/api/v2/guides/LOOKUP-GKGTHEMES.TXT"

    @functools.cached_property
    def themes_df(self) -> pd.DataFrame:
        """Download and parse the GKG theme lookup table.

        Returns:
            DataFrame with columns ``theme`` (str code) and ``count`` (int).
            Cached on the instance after the first (network) fetch.
        """
        # Fetch the content using urllib.
        with urlopen(self.THEMES_URL) as response:
            data = response.read().decode()

        # Each non-empty line is a tab-separated "theme<TAB>count" pair.
        rows = [line.split("\t") for line in data.strip().split("\n")]

        df = pd.DataFrame(rows, columns=["theme", "count"])
        df["count"] = df["count"].astype(int)
        return df

    def find_themes_related_to_keyword(self, keyword: str) -> list[str]:
        """Return all theme codes whose name contains *keyword* (case-insensitive)."""
        mask = self.themes_df["theme"].str.contains(keyword, case=False)
        return self.themes_df[mask]["theme"].to_list()

    def find_articles(
        self, themes: list[str], years: list[int], country: str = "LG"
    ) -> pd.DataFrame:
        """Search GDELT for articles for every (theme, year) combination.

        Args:
            themes: GKG theme codes to query, e.g. from
                :meth:`find_themes_related_to_keyword`.
            years: Calendar years; each is queried as Jan 1 – Dec 31.
            country: GDELT country code filter (default ``"LG"`` = Latvia).

        Returns:
            Concatenated results filtered to Latvian-language articles,
            with ``seendate`` parsed to datetime and duplicate URLs dropped.
        """
        partial_articles_dfs = []

        gd = gdelt.GdeltDoc()
        for theme, year in itertools.product(themes, years):
            f = gdelt.Filters(
                start_date=f"{year}-01-01",
                end_date=f"{year}-12-31",
                theme=theme,
                country=country,
            )

            partial_articles_df = gd.article_search(f)
            print(f"{len(partial_articles_df)} articles found for theme {theme}, in {year}")
            partial_articles_dfs.append(partial_articles_df)

        # NOTE: raises ValueError if themes/years is empty (nothing to concat).
        articles_df = pd.concat(partial_articles_dfs)

        articles_df = articles_df[articles_df["language"] == "Latvian"]
        articles_df["seendate"] = pd.to_datetime(articles_df["seendate"])

        # Single quotes inside the f-string keep this valid on Python < 3.12
        # (nested same-type quotes in f-strings require 3.12+).
        print(f"Deleting {articles_df['url'].duplicated().sum()} duplicates")
        articles_df = articles_df.drop_duplicates("url")
        print(f"{len(articles_df)} unique articles found")
        return articles_df
58+
59+
# Usage example:
if __name__ == "__main__":
    scraper = GDELTScrapper()

    # Find themes related to climate
    themes = scraper.find_themes_related_to_keyword("CLIMATE")
    print(f"Themes related to climate: {themes}")

    # Find articles for these themes and year range
    articles_df = scraper.find_articles(themes=themes, years=[2022, 2023, 2024])

    # This can be used as input for NewsScraper
    article_urls = articles_df["url"].to_list()

    # Save dataframe to a csv file; create the data/ directory if it
    # does not exist yet (to_csv would otherwise raise FileNotFoundError).
    file_path = Path(__file__).parent.parent / "data" / "latvian_article_links.csv"
    file_path.parent.mkdir(parents=True, exist_ok=True)
    articles_df.to_csv(file_path)