Skip to content

Commit ceb27da

Browse files
committed
Tests for openbis_scraper
1 parent 58d1287 commit ceb27da

File tree

2 files changed

+73
-76
lines changed

2 files changed

+73
-76
lines changed

src/desi/scraper/openbis_scraper.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ class OpenbisScraper:
1616
to Markdown, and saves it to .md files.
1717
"""
1818

19-
def __init__(self, base_url, output_dir, initial_urls=None):
19+
def __init__(self, base_url, output_dir, initial_urls=None, max_pages=None):
2020
"""
2121
Initializes the scraper with the target URL and output directory.
2222
@@ -25,11 +25,13 @@ def __init__(self, base_url, output_dir, initial_urls=None):
2525
output_dir (str): The directory where Markdown files will be saved.
2626
initial_urls (set, optional): A set of initial URLs to crawl.
2727
Defaults to the base_url.
28+
max_pages (int, optional): A safety limit on the number of pages to scrape. Defaults to None (no limit).
2829
"""
2930
self.base_url = base_url
3031
self.output_dir = output_dir
3132
self.to_visit = initial_urls if initial_urls is not None else {base_url}
3233
self.visited = set()
34+
self.max_pages = max_pages
3335

3436
if not os.path.exists(self.output_dir):
3537
os.makedirs(self.output_dir)
@@ -39,6 +41,12 @@ def scrape(self):
3941
Starts the crawling and scraping process.
4042
"""
4143
while self.to_visit:
44+
if self.max_pages is not None and len(self.visited) >= self.max_pages:
45+
logger.info(
46+
f"Reached max_pages limit of {self.max_pages}. Stopping scrape."
47+
)
48+
break
49+
4250
current_url = self.to_visit.pop()
4351
if current_url in self.visited:
4452
continue

tests/test_openbis_scraper.py

Lines changed: 64 additions & 75 deletions
Original file line numberDiff line numberDiff line change
@@ -1,64 +1,43 @@
11
# Ensure the source directory is in the Python path for imports
2-
# This allows the test to find the scraper module
32
import sys
43
from pathlib import Path
5-
from unittest.mock import MagicMock, patch
4+
from unittest.mock import call, patch
65

76
import pytest
87
import requests
98

10-
# Adjust the path if your 'src' directory is located elsewhere relative to the project root
11-
sys.path.insert(0, str(Path(__file__).resolve().parents[2] / "src"))
9+
# Add the src directory to the path to ensure imports work from the root
10+
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
1211

1312
from desi.scraper.openbis_scraper import OpenbisScraper
1413

1514
# --- Test Data and Mocks ---
1615

17-
# 1. Create fake HTML content for our mock server responses
1816
FAKE_HTML_HOME = """
19-
<html>
20-
<body>
17+
<html><body>
2118
<div role="main">
22-
<h1>Home Page</h1>
23-
<p>This is the main content.</p>
24-
<a href="/en/20.10.0-11/details.html">Details Page Link</a>
25-
<a href="https://example.com">External Link</a>
26-
<a href="/en/20.10.0-11/">Link to Self</a>
19+
<h1>Home Page</h1><a href="details.html">Details</a>
20+
<a href="https://example.com">External</a>
2721
</div>
28-
<div id="sidebar">
29-
<p>Other content we should ignore.</p>
30-
</div>
31-
</body>
32-
</html>
22+
</body></html>
3323
"""
3424

3525
FAKE_HTML_DETAILS = """
36-
<html>
37-
<body>
38-
<div role="main">
39-
<h2>Details Page</h2>
40-
<p>This is the details page content.</p>
41-
</div>
42-
</body>
43-
</html>
26+
<html><body>
27+
<div role="main"><h2>Details Page</h2><a href="/">Home</a></div>
28+
</body></html>
4429
"""
4530

46-
FAKE_HTML_NO_MAIN = """
47-
<html>
48-
<body>
49-
<div>
50-
<p>This page has no main content div.</p>
51-
</div>
52-
</body>
53-
</html>
54-
"""
31+
FAKE_HTML_NO_MAIN = "<html><body><div>No main content</div></body></html>"
5532

5633

57-
# 2. Create a mock for the requests.get function
5834
class MockResponse:
35+
"""A mock for the requests.Response object."""
36+
5937
def __init__(self, content, status_code=200):
6038
self.content = content.encode("utf-8")
6139
self.status_code = status_code
40+
self.url = ""
6241

6342
def raise_for_status(self):
6443
if self.status_code >= 400:
@@ -68,8 +47,6 @@ def raise_for_status(self):
6847
@pytest.fixture
6948
def mock_requests_get(mocker):
7049
"""A pytest fixture to mock requests.get with predefined responses."""
71-
72-
# A map of URLs to their fake HTML content
7350
url_map = {
7451
"https://openbis.readthedocs.io/en/20.10.0-11/": MockResponse(FAKE_HTML_HOME),
7552
"https://openbis.readthedocs.io/en/20.10.0-11/details.html": MockResponse(
@@ -83,8 +60,7 @@ def mock_requests_get(mocker):
8360
),
8461
}
8562

86-
def mock_get(url):
87-
# Return the response from our map, or a 404 if not found
63+
def mock_get(url, timeout=None):
8864
return url_map.get(url, MockResponse("Not Found", 404))
8965

9066
# Replace the real requests.get with our mock function
@@ -94,96 +70,109 @@ def mock_get(url):
9470
# --- Unit Tests ---
9571

9672

97-
def test_scrape_single_page_success(tmp_path):
73+
# By adding `mock_requests_get` to the test signature, we activate the mock.
74+
def test_scrape_single_page_success(mock_requests_get, tmp_path):
9875
"""
9976
Tests that the scraper can download and correctly save a single page.
77+
This test will NOT hit the live internet because the mock is active.
10078
"""
10179
base_url = "https://openbis.readthedocs.io/en/20.10.0-11/"
80+
scraper = OpenbisScraper(base_url=base_url, output_dir=str(tmp_path), max_pages=1)
10281

103-
# Run the scraper on the temporary directory provided by pytest
104-
with patch("time.sleep", return_value=None): # Mock sleep to speed up test
105-
scraper = OpenbisScraper(base_url=base_url, output_dir=str(tmp_path))
82+
with patch("time.sleep", return_value=None):
10683
scraper.scrape()
107-
# Assertions
84+
10885
expected_file = tmp_path / "en_20.10.0-11.md"
10986
assert expected_file.exists()
11087

11188
content = expected_file.read_text(encoding="utf-8")
11289
assert "# Home Page" in content
113-
assert "This is the main content." in content
114-
assert "Other content we should ignore" not in content # Should not scrape sidebar
90+
# Exactly one file should exist, and its name is derived from the URL path, not the full URL
91+
assert len(list(tmp_path.iterdir())) == 1
11592

11693

117-
def test_scrape_crawls_to_second_page(tmp_path):
94+
def test_scrape_crawls_to_second_page(mock_requests_get, tmp_path):
11895
"""
11996
Tests that the scraper follows an internal link found on the first page.
12097
"""
12198
base_url = "https://openbis.readthedocs.io/en/20.10.0-11/"
99+
scraper = OpenbisScraper(base_url=base_url, output_dir=str(tmp_path))
122100

123101
with patch("time.sleep", return_value=None):
124-
scraper = OpenbisScraper(base_url=base_url, output_dir=str(tmp_path))
125102
scraper.scrape()
126103

127-
# Assert that both the home page and the details page were scraped and saved
128104
assert (tmp_path / "en_20.10.0-11.md").exists()
129105
details_file = tmp_path / "en_20.10.0-11_details.html.md"
130106
assert details_file.exists()
131107

132108
details_content = details_file.read_text(encoding="utf-8")
133109
assert "## Details Page" in details_content
134-
assert "This is the details page content." in details_content
110+
# It should have called the mock for the base URL and the details URL
111+
assert mock_requests_get.call_count == 2
135112

136113

137-
def test_ignores_external_and_visited_links(mock_requests_get, tmp_path):
114+
def test_scrape_stops_at_max_pages(mock_requests_get, tmp_path):
138115
"""
139-
Tests that the scraper does not follow external links or revisit pages.
116+
Tests that the new max_pages safety feature correctly limits the crawl.
140117
"""
141118
base_url = "https://openbis.readthedocs.io/en/20.10.0-11/"
119+
# Set max_pages to 1, even though the first page has a link to a second
120+
scraper = OpenbisScraper(base_url=base_url, output_dir=str(tmp_path), max_pages=1)
142121

143122
with patch("time.sleep", return_value=None):
144-
scraper = OpenbisScraper(base_url=base_url, output_dir=str(tmp_path))
145123
scraper.scrape()
146124

147-
# The mock is configured to only know about the 'home' and 'details' URLs.
148-
# If it tries to access example.com, the mock would fail.
149-
# We can check the call count to ensure it only visited the valid internal links.
150-
# It should have been called twice: once for home, once for details.
151-
assert mock_requests_get.call_count == 2
125+
# The mock should only have been called ONCE
126+
assert mock_requests_get.call_count == 1
127+
# Only one file should have been created
128+
assert len(list(tmp_path.iterdir())) == 1
129+
assert (tmp_path / "en_20.10.0-11.md").exists()
130+
assert not (tmp_path / "en_20.10.0-11_details.html.md").exists()
152131

153132

154-
def test_handles_request_exception_gracefully(tmp_path):
133+
def test_handles_request_exception_gracefully(mock_requests_get, tmp_path):
155134
"""
156135
Tests that a network error on one page does not stop the entire process.
157136
"""
158137
base_url = "https://openbis.readthedocs.io/en/20.10.0-11/"
159-
error_urls = {base_url, "https://openbis.readthedocs.io/en/20.10.0-11/error.html"}
160-
# Manually add the error URL to the list of pages to visit
138+
error_url = "https://openbis.readthedocs.io/en/20.10.0-11/error.html"
139+
initial_urls = {base_url, error_url}
140+
141+
scraper = OpenbisScraper(
142+
base_url=base_url, output_dir=str(tmp_path), initial_urls=initial_urls
143+
)
144+
161145
with patch("time.sleep", return_value=None):
162-
scraper = OpenbisScraper(
163-
base_url=base_url, output_dir=str(tmp_path), initial_urls=error_urls
164-
)
165146
scraper.scrape()
166147

167-
# The scraper should log an error but continue.
168-
# The successful page should exist, but the error page should not.
148+
# Assert that the successful page was created and the error page was not
169149
assert (tmp_path / "en_20.10.0-11.md").exists()
170150
assert not (tmp_path / "en_20.10.0-11_error.html.md").exists()
171151

152+
# --- FIX: Use assert_has_calls to be more specific and robust ---
153+
# We verify that it *attempted* to call our initial URLs, regardless of
154+
# what other URLs it discovered and called later.
155+
expected_calls = [
156+
call(base_url, timeout=30),
157+
call(error_url, timeout=30),
158+
]
159+
mock_requests_get.assert_has_calls(expected_calls, any_order=True)
172160

173-
def test_handles_page_without_main_content(tmp_path):
161+
162+
def test_handles_page_without_main_content(mock_requests_get, tmp_path):
174163
"""
175164
Tests that no file is created for a page that lacks the main content div.
176165
"""
177-
base_url = "https://openbis.readthedocs.io/en/20.10.0-11/"
178-
url_no_main = "https://openbis.readthedocs.io/en/20.10.0-11/no-main.html"
179-
# FIX: Pass the URL directly to the function.
180-
no_main_urls = {url_no_main}
166+
no_main_url = "https://openbis.readthedocs.io/en/20.10.0-11/no-main.html"
167+
scraper = OpenbisScraper(
168+
base_url="https://openbis.readthedocs.io/en/20.10.0-11/",
169+
output_dir=str(tmp_path),
170+
initial_urls={no_main_url},
171+
)
181172

182173
with patch("time.sleep", return_value=None):
183-
scraper = OpenbisScraper(
184-
base_url=base_url, output_dir=str(tmp_path), initial_urls=no_main_urls
185-
)
186174
scraper.scrape()
187175

188-
# Assertions are unchanged
176+
# The mock was called, but no file should have been written.
177+
assert mock_requests_get.call_count == 1
189178
assert len(list(tmp_path.iterdir())) == 0

0 commit comments

Comments
 (0)