-from gdelt_scrapper import GDELTScrapper
-from news_scrapper import NewsScraper
+import asyncio
 from pathlib import Path
 import json
-import multiprocessing
-from functools import partial
+from typing import Literal, List, Dict
+
+from climateguard.scrapping.pipeline import ScrapFromGDelt
+from climateguard.detect_claims import detect_claims
+from climateguard.models import Article, Claims
 
 class Pipeline:
     def __init__(self):
-        self.gdelt_scraper = GDELTScrapper()
-        self.news_scraper = NewsScraper()
-
-    def run(self, keyword: str, years: list[int], output_dir: Path):
-        # Step 1: Find themes related to the keyword
-        themes = self.gdelt_scraper.find_themes_related_to_keyword(keyword)
-        print(f"Themes related to {keyword}: {themes}")
-
-        # Step 2: Find articles for these themes and years
-        articles_df = self.gdelt_scraper.find_articles(themes=themes, years=years)
-
-        # Step 3: Extract URLs from the DataFrame
-        urls = articles_df["url"].tolist()
-
-        # Save the list of URLs to a separate file
-        self._save_urls(urls, output_dir)
-
-        # Step 4: Scrape each URL using multiprocessing
-        scraped_articles, failed_urls = self._scrape_urls_parallel(urls)
-
-        # Step 5: Save results
-        self._save_results(scraped_articles, failed_urls, output_dir)
-
-    def _save_urls(self, urls: list, output_dir: Path):
-        output_dir.mkdir(parents=True, exist_ok=True)
-        urls_file = output_dir / 'all_urls.json'
-        with open(urls_file, 'w', encoding='utf-8') as f:
-            json.dump(urls, f, ensure_ascii=False, indent=4)
-        print(f"All URLs saved to {urls_file}")
-
-    def _scrape_urls_parallel(self, urls):
-        # Create a partial function with self.news_scraper
-        scrape_func = partial(self._scrape_single_url, news_scraper=self.news_scraper)
+        self.scrap_from_gdelt = ScrapFromGDelt()
+        self.data_dir = Path(__file__).parent / "data"
 
-        # Use all available cores
-        num_cores = multiprocessing.cpu_count()
-
-        # Create a multiprocessing pool
-        with multiprocessing.Pool(num_cores) as pool:
-            results = pool.map(scrape_func, urls)
+    def run(self, keyword: str, years: list[int], language: Literal["French", "English", "Latvian"]) -> List[Dict]:
+        scraped_articles_file = self.data_dir / 'scraped_articles.json'
 
-        # Process results
-        scraped_articles = []
-        failed_urls = []
-        for result in results:
-            if result['success']:
-                article = result['article']
-                scraped_articles.append(article)
-                print(f"Scraped: {article.title}")
-                print(f"Content length: {len(article.content)}")
-                print(f"Date: {article.date}")
-                print("---")
-            else:
-                failed_urls.append(result['url'])
-                print(f"Failed to scrape: {result['url']}")
-                print("---")
-
-        return scraped_articles, failed_urls
-
-    @staticmethod
-    def _scrape_single_url(url, news_scraper):
-        article = news_scraper.scrape_article(url)
-        if article:
-            return {'success': True, 'article': article}
+        if not scraped_articles_file.exists():
+            print("Scraped articles not found. Starting scraping process...")
+            articles_data = self.scrap_from_gdelt.run(keyword, years, self.data_dir)
         else:
-            return {'success': False, 'url': url}
-
-    def _save_results(self, scraped_articles, failed_urls, output_dir):
-        output_dir.mkdir(parents=True, exist_ok=True)
-
-        # Save successfully scraped articles to JSON
-        output_file = output_dir / 'scraped_articles.json'
-        with open(output_file, 'w', encoding='utf-8') as f:
-            json.dump([article.dict() for article in scraped_articles], f, ensure_ascii=False, indent=4)
-
-        print(f"\nSuccessfully scraped articles saved to {output_file}")
-
-        # Save failed URLs to a separate file
-        failed_file = output_dir / 'failed_urls.json'
-        with open(failed_file, 'w', encoding='utf-8') as f:
-            json.dump(failed_urls, f, ensure_ascii=False, indent=4)
-
-        print(f"Failed URLs saved to {failed_file}")
+            print("Scraped articles found. Loading from file...")
+            with open(scraped_articles_file, 'r', encoding='utf-8') as f:
+                articles_data = json.load(f)
+
+        # Process articles and detect claims
+        results = []
+        for article_data in articles_data:
+            article = Article(**article_data)
+            claims, n_tokens = detect_claims(article, language)
+            results.append({
+                "article": article.dict(),
+                "claims": claims.dict(),
+                "n_tokens": n_tokens
+            })
+
+        print(f"Processed {len(results)} articles with claims")
+        return results
+
+def main():
+    pipeline = Pipeline()
+    processed_articles = pipeline.run(keyword="CLIMATE", years=[2022, 2023, 2024], language="English")
+
 
 if __name__ == "__main__":
-    pipeline = Pipeline()
-    output_dir = Path(__file__).parent.parent / "data"
-    pipeline.run(keyword="CLIMATE", years=[2022, 2023, 2024], output_dir=output_dir)
+    main()
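
For review context: the new `run` method depends on three `climateguard` interfaces that are not shown in this diff, the `Article` and `Claims` models and the `detect_claims` function. The sketch below is only an assumption reconstructed from the call sites above (`Article(**article_data)`, `article.dict()`, `claims.dict()`, and `claims, n_tokens = detect_claims(article, language)`); the real definitions in `climateguard.models` and `climateguard.detect_claims` may differ.

```python
# Hypothetical sketch of the interfaces assumed by Pipeline.run (not the
# actual climateguard code). Field names beyond title/content/date/url,
# which the removed scraper code printed or stored, are illustrative.
from typing import List, Tuple
from pydantic import BaseModel


class Article(BaseModel):
    title: str
    content: str
    date: str
    url: str


class Claim(BaseModel):
    text: str  # illustrative: a single extracted claim


class Claims(BaseModel):
    claims: List[Claim] = []  # wrapper, so claims.dict() serializes cleanly


def detect_claims(article: Article, language: str) -> Tuple[Claims, int]:
    """Assumed contract: returns the detected claims and a token count."""
    raise NotImplementedError
```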
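One behavioral difference worth flagging: the previous pipeline persisted its output (`scraped_articles.json`, `failed_urls.json`), while the new `main()` discards the list returned by `Pipeline.run`. If the claim-detection results should also be written to disk, a minimal sketch of such a `main()` is shown below; it assumes it lives in the same module as `Pipeline`, and the `detected_claims.json` filename is a hypothetical choice, not something this PR defines.

```python
# Minimal sketch, assuming the results returned by Pipeline.run are
# JSON-serializable dicts (article.dict() / claims.dict() as in the diff).
import json
from pathlib import Path


def main():
    pipeline = Pipeline()
    results = pipeline.run(keyword="CLIMATE", years=[2022, 2023, 2024], language="English")

    output_file = Path(__file__).parent / "data" / "detected_claims.json"  # hypothetical output path
    output_file.parent.mkdir(parents=True, exist_ok=True)
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=4)
    print(f"Detected claims saved to {output_file}")
```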