
Commit 907bb73

feat: chain scrapping & detection pipeline
1 parent 41198c6 commit 907bb73

File tree: 6 files changed, +133 -145 lines

climateguard/pipeline.py

Lines changed: 36 additions & 87 deletions
@@ -1,97 +1,46 @@
-from gdelt_scrapper import GDELTScrapper
-from news_scrapper import NewsScraper
+import asyncio
 from pathlib import Path
 import json
-import multiprocessing
-from functools import partial
+from typing import Literal, List, Dict
+
+from climateguard.scrapping.pipeline import ScrapFromGDelt
+from climateguard.detect_claims import detect_claims
+from climateguard.models import Article, Claims

 class Pipeline:
     def __init__(self):
-        self.gdelt_scraper = GDELTScrapper()
-        self.news_scraper = NewsScraper()
-
-    def run(self, keyword: str, years: list[int], output_dir: Path):
-        # Step 1: Find themes related to the keyword
-        themes = self.gdelt_scraper.find_themes_related_to_keyword(keyword)
-        print(f"Themes related to {keyword}: {themes}")
-
-        # Step 2: Find articles for these themes and years
-        articles_df = self.gdelt_scraper.find_articles(themes=themes, years=years)
-
-        # Step 3: Extract URLs from the DataFrame
-        urls = articles_df["url"].tolist()
-
-        # Save the list of URLs to a separate file
-        self._save_urls(urls, output_dir)
-
-        # Step 4: Scrape each URL using multiprocessing
-        scraped_articles, failed_urls = self._scrape_urls_parallel(urls)
-
-        # Step 5: Save results
-        self._save_results(scraped_articles, failed_urls, output_dir)
-
-    def _save_urls(self, urls: list, output_dir: Path):
-        output_dir.mkdir(parents=True, exist_ok=True)
-        urls_file = output_dir / 'all_urls.json'
-        with open(urls_file, 'w', encoding='utf-8') as f:
-            json.dump(urls, f, ensure_ascii=False, indent=4)
-        print(f"All URLs saved to {urls_file}")
-
-    def _scrape_urls_parallel(self, urls):
-        # Create a partial function with self.news_scraper
-        scrape_func = partial(self._scrape_single_url, news_scraper=self.news_scraper)
+        self.scrap_from_gdelt = ScrapFromGDelt()
+        self.data_dir = Path(__file__).parent / "data"

-        # Use all available cores
-        num_cores = multiprocessing.cpu_count()
-
-        # Create a multiprocessing pool
-        with multiprocessing.Pool(num_cores) as pool:
-            results = pool.map(scrape_func, urls)
+    def run(self, keyword: str, years: list[int], language: Literal["French", "English", "Latvian"]) -> List[Dict]:
+        scraped_articles_file = self.data_dir / 'scraped_articles.json'

-        # Process results
-        scraped_articles = []
-        failed_urls = []
-        for result in results:
-            if result['success']:
-                article = result['article']
-                scraped_articles.append(article)
-                print(f"Scraped: {article.title}")
-                print(f"Content length: {len(article.content)}")
-                print(f"Date: {article.date}")
-                print("---")
-            else:
-                failed_urls.append(result['url'])
-                print(f"Failed to scrape: {result['url']}")
-                print("---")
-
-        return scraped_articles, failed_urls
-
-    @staticmethod
-    def _scrape_single_url(url, news_scraper):
-        article = news_scraper.scrape_article(url)
-        if article:
-            return {'success': True, 'article': article}
+        if not scraped_articles_file.exists():
+            print("Scraped articles not found. Starting scraping process...")
+            articles_data = self.scrap_from_gdelt.run(keyword, years, self.data_dir)
         else:
-            return {'success': False, 'url': url}
-
-    def _save_results(self, scraped_articles, failed_urls, output_dir):
-        output_dir.mkdir(parents=True, exist_ok=True)
-
-        # Save successfully scraped articles to JSON
-        output_file = output_dir / 'scraped_articles.json'
-        with open(output_file, 'w', encoding='utf-8') as f:
-            json.dump([article.dict() for article in scraped_articles], f, ensure_ascii=False, indent=4)
-
-        print(f"\nSuccessfully scraped articles saved to {output_file}")
-
-        # Save failed URLs to a separate file
-        failed_file = output_dir / 'failed_urls.json'
-        with open(failed_file, 'w', encoding='utf-8') as f:
-            json.dump(failed_urls, f, ensure_ascii=False, indent=4)
-
-        print(f"Failed URLs saved to {failed_file}")
+            print("Scraped articles found. Loading from file...")
+            with open(scraped_articles_file, 'r', encoding='utf-8') as f:
+                articles_data = json.load(f)
+
+        # Process articles and detect claims
+        results = []
+        for article_data in articles_data:
+            article = Article(**article_data)
+            claims, n_tokens = detect_claims(article, language)
+            results.append({
+                "article": article.dict(),
+                "claims": claims.dict(),
+                "n_tokens": n_tokens
+            })
+
+        print(f"Processed {len(results)} articles with claims")
+        return results
+
+def main():
+    pipeline = Pipeline()
+    processed_articles = pipeline.run(keyword="CLIMATE", years=[2022, 2023, 2024], language="English")
+

 if __name__ == "__main__":
-    pipeline = Pipeline()
-    output_dir = Path(__file__).parent.parent / "data"
-    pipeline.run(keyword="CLIMATE", years=[2022, 2023, 2024], output_dir=output_dir)
+    main()
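
Note on the new dependencies: Article and Claims are imported from climateguard.models, and detect_claims from climateguard.detect_claims, neither of which appears in this diff. The sketch below only illustrates the shape the new run() loop seems to assume (Pydantic-style models with .dict(), an Article carrying the title/content/date fields the scraper prints, and detect_claims returning a (Claims, n_tokens) pair); the actual field names in climateguard.models may differ.

# Illustrative sketch only — climateguard.models is not part of this commit,
# so every field below is an assumption inferred from how the diff uses it.
from typing import List, Optional
from pydantic import BaseModel  # the .dict() calls suggest Pydantic v1-style models

class Article(BaseModel):
    title: str
    content: str
    date: Optional[str] = None
    url: Optional[str] = None  # assumed; URLs come from the GDELT article list

class Claims(BaseModel):
    claims: List[str] = []  # hypothetical payload for the detected claims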

climateguard/scrapping/pipeline.py

Lines changed: 97 additions & 0 deletions
@@ -0,0 +1,97 @@
+from climateguard.scrapping.gdelt_scrapper import GDELTScrapper
+from climateguard.scrapping.news_scrapper import NewsScraper
+from pathlib import Path
+import json
+import multiprocessing
+from functools import partial
+
+class ScrapFromGDelt:
+    def __init__(self):
+        self.gdelt_scraper = GDELTScrapper()
+        self.news_scraper = NewsScraper()
+
+    def run(self, keyword: str, years: list[int], output_dir: Path):
+        # Step 1: Find themes related to the keyword
+        themes = self.gdelt_scraper.find_themes_related_to_keyword(keyword)
+        print(f"Themes related to {keyword}: {themes}")
+
+        # Step 2: Find articles for these themes and years
+        articles_df = self.gdelt_scraper.find_articles(themes=themes, years=years)
+
+        # Step 3: Extract URLs from the DataFrame
+        urls = articles_df["url"].tolist()
+
+        # Save the list of URLs to a separate file
+        self._save_urls(urls, output_dir)
+
+        # Step 4: Scrape each URL using multiprocessing
+        scraped_articles, failed_urls = self._scrape_urls_parallel(urls)
+
+        # Step 5: Save results
+        self._save_results(scraped_articles, failed_urls, output_dir)
+
+    def _save_urls(self, urls: list, output_dir: Path):
+        output_dir.mkdir(parents=True, exist_ok=True)
+        urls_file = output_dir / 'all_urls.json'
+        with open(urls_file, 'w', encoding='utf-8') as f:
+            json.dump(urls, f, ensure_ascii=False, indent=4)
+        print(f"All URLs saved to {urls_file}")
+
+    def _scrape_urls_parallel(self, urls):
+        # Create a partial function with self.news_scraper
+        scrape_func = partial(self._scrape_single_url, news_scraper=self.news_scraper)
+
+        # Use all available cores
+        num_cores = multiprocessing.cpu_count()
+
+        # Create a multiprocessing pool
+        with multiprocessing.Pool(num_cores) as pool:
+            results = pool.map(scrape_func, urls)
+
+        # Process results
+        scraped_articles = []
+        failed_urls = []
+        for result in results:
+            if result['success']:
+                article = result['article']
+                scraped_articles.append(article)
+                print(f"Scraped: {article.title}")
+                print(f"Content length: {len(article.content)}")
+                print(f"Date: {article.date}")
+                print("---")
+            else:
+                failed_urls.append(result['url'])
+                print(f"Failed to scrape: {result['url']}")
+                print("---")
+
+        return scraped_articles, failed_urls
+
+    @staticmethod
+    def _scrape_single_url(url, news_scraper):
+        article = news_scraper.scrape_article(url)
+        if article:
+            return {'success': True, 'article': article}
+        else:
+            return {'success': False, 'url': url}
+
+    def _save_results(self, scraped_articles, failed_urls, output_dir):
+        output_dir.mkdir(parents=True, exist_ok=True)
+
+        # Save successfully scraped articles to JSON
+        output_file = output_dir / 'scraped_articles.json'
+        with open(output_file, 'w', encoding='utf-8') as f:
+            json.dump([article.dict() for article in scraped_articles], f, ensure_ascii=False, indent=4)
+
+        print(f"\nSuccessfully scraped articles saved to {output_file}")
+
+        # Save failed URLs to a separate file
+        failed_file = output_dir / 'failed_urls.json'
+        with open(failed_file, 'w', encoding='utf-8') as f:
+            json.dump(failed_urls, f, ensure_ascii=False, indent=4)
+
+        print(f"Failed URLs saved to {failed_file}")
+
+if __name__ == "__main__":
+    pipeline = ScrapFromGDelt()
+    output_dir = Path(__file__).parent.parent / "data"
+    pipeline.run(keyword="CLIMATE", years=[2022, 2023, 2024], output_dir=output_dir)
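
Taken together, the two files form the chain named in the commit message: ScrapFromGDelt writes all_urls.json, scraped_articles.json and failed_urls.json into the output directory, while Pipeline.run() treats scraped_articles.json under climateguard/data/ as a cache, scraping only when it is missing and then running claim detection. A minimal usage sketch, assuming the climateguard package is importable and the GDELT/news scrapers have network access:

from climateguard.pipeline import Pipeline

if __name__ == "__main__":
    pipeline = Pipeline()
    # First run scrapes via ScrapFromGDelt and writes scraped_articles.json;
    # subsequent runs load the cached file and go straight to claim detection.
    results = pipeline.run(keyword="CLIMATE", years=[2022, 2023, 2024], language="English")
    for item in results:
        print(item["article"]["title"], item["n_tokens"])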

climateguard/test.py

Lines changed: 0 additions & 58 deletions
This file was deleted.
