-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy patharticle_cache.py
94 lines (78 loc) · 3.46 KB
/
article_cache.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
import os
from datetime import datetime, timedelta
import json
import logging
from pathlib import Path
logger = logging.getLogger(__name__)


class ArticleCache:
    """Track processed article URLs in a JSON cache with a rolling archive.

    The active cache maps URL -> ISO-8601 timestamp of when the article was
    processed. Entries older than a cutoff can be migrated to a separate
    archive file (``archive_old_entries``) so the active file stays small,
    while ``is_processed``/``is_cached`` still cover both files.
    """

    def __init__(self,
                 cache_file='articles/processed/processed_articles.json',
                 archive_file='articles/processed/archived_processed_articles.json'):
        """Load (or create) the cache and archive files.

        If the ``STORAGE_PATH`` environment variable is set, both files are
        placed under ``<STORAGE_PATH>/processed/`` with the conventional file
        names (deployment layout). Otherwise the given paths are used as-is.

        Fix: the previous version accepted ``cache_file``/``archive_file``
        but unconditionally overwrote them, making the parameters dead.
        """
        base_dir = os.environ.get('STORAGE_PATH')
        if base_dir:
            # Deployment layout: everything under the mounted storage path.
            self.cache_file = Path(base_dir) / 'processed' / 'processed_articles.json'
            self.archive_file = Path(base_dir) / 'processed' / 'archived_processed_articles.json'
        else:
            # No env override: honor the constructor arguments.
            self.cache_file = Path(cache_file)
            self.archive_file = Path(archive_file)

        # Ensure the target directory exists before any save.
        self.cache_file.parent.mkdir(parents=True, exist_ok=True)

        self.cache = self._load_cache(self.cache_file)
        self.archive = self._load_cache(self.archive_file)
        # Fast O(1) membership lookup across active and archived entries.
        self.cached_articles = set(self.cache) | set(self.archive)

        logger.info(f"Loaded {len(self.cached_articles)} cached articles from {self.cache_file}")
        logger.info(f"First few cached URLs: {list(self.cached_articles)[:3]}")

    def _load_cache(self, filename):
        """Return the JSON dict stored in *filename*, or ``{}`` if the file
        is missing or corrupt.

        EAFP: open directly and handle the specific failures rather than
        racing an ``os.path.exists`` check against the open. A corrupt file
        is logged and treated as empty rather than aborting startup.
        """
        try:
            with open(filename, 'r') as f:
                return json.load(f)
        except FileNotFoundError:
            return {}
        except json.JSONDecodeError:
            logger.warning(f"Corrupt cache file {filename}; starting with empty cache")
            return {}

    def _save_cache(self, data, filename):
        """Write *data* to *filename* as pretty-printed, key-sorted JSON."""
        with open(filename, 'w') as f:
            json.dump(
                data,
                f,
                indent=2,
                sort_keys=True
            )

    def add_article(self, article_link):
        """Record *article_link* as processed now and persist the cache."""
        self.cache[article_link] = datetime.now().isoformat()
        self.cached_articles.add(article_link)
        self._save_cache(self.cache, self.cache_file)
        logger.info(f"Added new article to cache: {article_link}")

    def is_processed(self, url):
        """Return True if *url* was processed before (active cache or archive)."""
        return url in self.cache or url in self.archive

    def archive_old_entries(self, days=30):
        """Move cache entries processed more than *days* days ago into the
        archive, persist both files, and return the number of entries moved.

        The in-memory ``cached_articles`` set is unaffected: moved URLs stay
        members via the archive, so lookups keep working.
        """
        cutoff = datetime.now() - timedelta(days=days)

        stale = {
            url: processed
            for url, processed in self.cache.items()
            if datetime.fromisoformat(processed) < cutoff
        }
        if stale:
            self.archive.update(stale)
            self.cache = {
                url: processed
                for url, processed in self.cache.items()
                if url not in stale
            }

        # Persist both files so on-disk state matches memory.
        self._save_cache(self.cache, self.cache_file)
        self._save_cache(self.archive, self.archive_file)

        return len(stale)

    def is_cached(self, article_link):
        """Return True if *article_link* has been seen before; log either way."""
        if article_link in self.cached_articles:
            logger.info(f"Found cached article: {article_link}")
            return True
        logger.debug(f"New article found: {article_link}")
        return False