# Ensure the source directory is in the Python path for imports
import sys
from pathlib import Path
from unittest.mock import call, patch

import pytest
import requests

# Add the src directory to the path to ensure imports work from the root
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))

from desi.scraper.openbis_scraper import OpenbisScraper
1413
1514# --- Test Data and Mocks ---
1615
17- # 1. Create fake HTML content for our mock server responses
# Fake HTML for the site root: one internal link (details.html) that the
# scraper should follow and one external link that it should ignore.
FAKE_HTML_HOME = """
<html><body>
<div role="main">
<h1>Home Page</h1><a href="details.html">Details</a>
<a href="https://example.com">External</a>
</div>
</body></html>
"""
3424
# Fake HTML for the details page; links back to the site root.
FAKE_HTML_DETAILS = """
<html><body>
<div role="main"><h2>Details Page</h2><a href="/">Home</a></div>
</body></html>
"""
4530
# Fake HTML with no `role="main"` div: the scraper should write nothing for it.
FAKE_HTML_NO_MAIN = "<html><body><div>No main content</div></body></html>"
5532
5633
57- # 2. Create a mock for the requests.get function
class MockResponse:
    """A mock for the requests.Response object."""

    def __init__(self, content, status_code=200):
        # Real responses carry bytes, so encode the fake HTML up front.
        self.content = content.encode("utf-8")
        self.status_code = status_code
        self.url = ""

    def raise_for_status(self):
        # Mirror requests.Response.raise_for_status: raise on 4xx/5xx.
        # NOTE(review): the exact exception raised here was elided in the
        # diff view; HTTPError matches requests' own behavior — confirm
        # against the original file.
        if self.status_code >= 400:
            raise requests.exceptions.HTTPError(f"{self.status_code} Error")
@pytest.fixture
def mock_requests_get(mocker):
    """A pytest fixture to mock requests.get with predefined responses."""
    # NOTE(review): the middle url_map entries and the mocker.patch call were
    # elided by hunk headers in the diff view — confirm the patch target and
    # the full URL map against the original file.
    url_map = {
        "https://openbis.readthedocs.io/en/20.10.0-11/": MockResponse(FAKE_HTML_HOME),
        "https://openbis.readthedocs.io/en/20.10.0-11/details.html": MockResponse(
            FAKE_HTML_DETAILS
        ),
        "https://openbis.readthedocs.io/en/20.10.0-11/no-main.html": MockResponse(
            FAKE_HTML_NO_MAIN
        ),
    }

    def mock_get(url, timeout=None):
        # Unknown URLs (e.g. error.html) get a 404; its raise_for_status
        # then raises and exercises the scraper's error handling.
        return url_map.get(url, MockResponse("Not Found", 404))

    # Replace the real requests.get with our mock function
    return mocker.patch("requests.get", side_effect=mock_get)
9470# --- Unit Tests ---
9571
9672
# By adding `mock_requests_get` to the test signature, we activate the mock.
def test_scrape_single_page_success(mock_requests_get, tmp_path):
    """
    Tests that the scraper can download and correctly save a single page.
    This test will NOT hit the live internet because the mock is active.
    """
    base_url = "https://openbis.readthedocs.io/en/20.10.0-11/"
    scraper = OpenbisScraper(base_url=base_url, output_dir=str(tmp_path), max_pages=1)

    # Mock time.sleep so the politeness delay does not slow the test down.
    with patch("time.sleep", return_value=None):
        scraper.scrape()

    expected_file = tmp_path / "en_20.10.0-11.md"
    assert expected_file.exists()

    content = expected_file.read_text(encoding="utf-8")
    assert "# Home Page" in content
    # The filename should be derived from the path, not the full URL
    assert len(list(tmp_path.iterdir())) == 1
11693
def test_scrape_crawls_to_second_page(mock_requests_get, tmp_path):
    """
    Tests that the scraper follows an internal link found on the first page.
    """
    base_url = "https://openbis.readthedocs.io/en/20.10.0-11/"
    scraper = OpenbisScraper(base_url=base_url, output_dir=str(tmp_path))

    with patch("time.sleep", return_value=None):
        scraper.scrape()

    assert (tmp_path / "en_20.10.0-11.md").exists()
    details_file = tmp_path / "en_20.10.0-11_details.html.md"
    assert details_file.exists()

    details_content = details_file.read_text(encoding="utf-8")
    assert "## Details Page" in details_content
    # It should have called the mock for the base URL and the details URL
    assert mock_requests_get.call_count == 2
135112
136113
def test_scrape_stops_at_max_pages(mock_requests_get, tmp_path):
    """
    Tests that the new max_pages safety feature correctly limits the crawl.
    """
    base_url = "https://openbis.readthedocs.io/en/20.10.0-11/"
    # Set max_pages to 1, even though the first page has a link to a second
    scraper = OpenbisScraper(base_url=base_url, output_dir=str(tmp_path), max_pages=1)

    with patch("time.sleep", return_value=None):
        scraper.scrape()

    # The mock should only have been called ONCE
    assert mock_requests_get.call_count == 1
    # Only one file should have been created
    assert len(list(tmp_path.iterdir())) == 1
    assert (tmp_path / "en_20.10.0-11.md").exists()
    assert not (tmp_path / "en_20.10.0-11_details.html.md").exists()
152131
153132
def test_handles_request_exception_gracefully(mock_requests_get, tmp_path):
    """
    Tests that a network error on one page does not stop the entire process.
    """
    base_url = "https://openbis.readthedocs.io/en/20.10.0-11/"
    error_url = "https://openbis.readthedocs.io/en/20.10.0-11/error.html"
    initial_urls = {base_url, error_url}

    scraper = OpenbisScraper(
        base_url=base_url, output_dir=str(tmp_path), initial_urls=initial_urls
    )

    with patch("time.sleep", return_value=None):
        scraper.scrape()

    # Assert that the successful page was created and the error page was not
    assert (tmp_path / "en_20.10.0-11.md").exists()
    assert not (tmp_path / "en_20.10.0-11_error.html.md").exists()

    # Use assert_has_calls to be specific and robust: verify that the scraper
    # *attempted* both initial URLs, regardless of what other URLs it
    # discovered and called later.
    expected_calls = [
        call(base_url, timeout=30),
        call(error_url, timeout=30),
    ]
    mock_requests_get.assert_has_calls(expected_calls, any_order=True)
172160
def test_handles_page_without_main_content(mock_requests_get, tmp_path):
    """
    Tests that no file is created for a page that lacks the main content div.
    """
    no_main_url = "https://openbis.readthedocs.io/en/20.10.0-11/no-main.html"
    scraper = OpenbisScraper(
        base_url="https://openbis.readthedocs.io/en/20.10.0-11/",
        output_dir=str(tmp_path),
        initial_urls={no_main_url},
    )

    with patch("time.sleep", return_value=None):
        scraper.scrape()

    # The mock was called, but no file should have been written.
    assert mock_requests_get.call_count == 1
    assert len(list(tmp_path.iterdir())) == 0