Skip to content

Commit bcd72bc

Browse files
authored
Merge pull request #19 from JosePizarro3/cleanup-and-fixes
Cleanup and fixes
2 parents e0a69ed + a3f24f9 commit bcd72bc

File tree

7 files changed

+201
-21
lines changed

7 files changed

+201
-21
lines changed

.github/release.yml

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
# Configuration for GitHub's automatically generated release notes:
# merged PRs are grouped into sections by their labels, and PRs authored
# by bots are excluded from the changelog entirely.
changelog:
  exclude:
    # Skip automation accounts so bot PRs don't clutter the release notes.
    authors: [github-actions, pre-commit-ci]
  categories:
    - title: 💥 Breaking Changes
      labels: [breaking]
    - title: 🎉 New Features
      labels: [feature]
    - title: 🛠 Enhancements
      labels: [enhancement, DX, UX]
    - title: 🐛 Bug Fixes
      labels: [fix]
    - title: 📖 Documentation
      labels: [docs]
    - title: 🚀 Performance
      labels: [performance]
    - title: 🚧 CI
      labels: [ci]
    - title: 💡 Refactoring
      labels: [refactor]
    - title: 🧪 Tests
      labels: [tests]
    - title: 🔒 Security Fixes
      labels: [security]
    - title: 🏥 Package Health
      labels: [pkg]
    - title: 📦 Dependencies
      labels: [dependencies, outdated]
    # Catch-all bucket for any PR whose labels match none of the above.
    - title: 🤷‍♂️ Other Changes
      labels: ['*']

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ pyrxiv search_and_download --help
3232

3333
For example:
3434
```bash
35-
pyrxiv search_and_download --category cond-mat.str-el --regex-pattern "DMFT|Hubbard" --n-papers 5
35+
pyrxiv search_and_download --category cond-mat.str-el --regex-pattern "DMFT|Hubbard" --n-papers 5 --download-pdfs True
3636
```
3737

3838
---

pyrxiv/cli/cli.py

Lines changed: 72 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,12 @@
11
import re
22
import time
33
from pathlib import Path
4-
from typing import TYPE_CHECKING
54

65
import click
76
import h5py
87
import numpy as np
98

10-
if TYPE_CHECKING:
11-
from pyrxiv.datamodel import ArxivPaper
12-
13-
9+
from pyrxiv.datamodel import ArxivPaper
1410
from pyrxiv.download import ArxivDownloader
1511
from pyrxiv.extract import TextExtractor
1612
from pyrxiv.fetch import ArxivFetcher
@@ -46,19 +42,21 @@ def run_search_and_download(
4642
start_from_filepath: bool = False,
4743
loader: str = "pdfminer",
4844
clean_text: bool = True,
45+
download_pdfs: bool = False,
4946
) -> tuple[list[Path], list["ArxivPaper"]]:
5047
"""
5148
Searches for a specific number of papers `n_papers` in arXiv for a specified `category` and downloads
52-
them in a `download_path`.
49+
their metadata in an HDF5 file in `download_path`.
5350
5451
If `regex_pattern` is specified, only the papers that contain the pattern will be downloaded.
5552
If `start_id` is specified, the search will start from that ID.
5653
If `start_from_filepath` is True, the search will start from the last downloaded paper's ID.
5754
If `loader` is specified, the text will be extracted using the corresponding loader.
58-
55+
If `clean_text` is True, the extracted text will be cleaned by removing references and unnecessary whitespaces.
56+
If `download_pdfs` is True, the PDFs will be downloaded and saved in `download_path`.
5957
6058
Args:
61-
download_path (Path, optional): The path for downloading the arXiv PDFs. Defaults to Path("data").
59+
download_path (Path, optional): The path for downloading the arXiv metadata. Defaults to Path("data").
6260
category (str, optional): The arXiv category on which the papers will be searched. Defaults to "cond-mat.str-el".
6361
n_papers (int, optional): The number of arXiv papers to be fetched and downloaded.
6462
If `regex_pattern` is not specified, this would correspond to the n_papers starting from the newest in the `category`. Defaults to 5.
@@ -70,9 +68,11 @@ def run_search_and_download(
7068
loader (str, optional): PDF loader to use for extracting text from the downloaded PDFs.
7169
Defaults to "pdfminer". Available loaders: "pdfminer", "pypdf".
7270
clean_text (bool, optional): If True, the extracted text will be cleaned by removing references and unnecessary whitespaces.
71+
Defaults to True.
72+
download_pdfs (bool, optional): If True, the PDFs will be downloaded and saved in `download_path`. Defaults to False.
7373
7474
Returns:
75-
tuple[list[Path], list[ArxivPaper]]: A tuple containing a list of Paths to the downloaded PDFs and a list of ArxivPaper objects
75+
tuple[list[Path], list[ArxivPaper]]: A tuple containing a list of Paths to the arXiv papers and a list of ArxivPaper objects
7676
with the extracted text.
7777
"""
7878
if loader not in ["pdfminer", "pypdf"]:
@@ -134,7 +134,8 @@ def run_search_and_download(
134134
save_paper_to_hdf5(paper=paper, pdf_path=pdf_path, hdf_path=hdf_path)
135135

136136
# Deleting the PDF file after storing it in HDF5
137-
pdf_path.unlink()
137+
if not download_pdfs:
138+
pdf_path.unlink()
138139

139140
# Appending the HDF5 file and paper to the lists
140141
pattern_files.append(hdf_path)
@@ -162,7 +163,7 @@ def cli():
162163
default="data",
163164
required=False,
164165
help="""
165-
(Optional) The path for downloading the arXiv PDFs. Defaults to "data".
166+
(Optional) The path for downloading the arXiv metadata in HDF5 files and, optionally (if set with download-pdfs), the PDFs. Defaults to "data".
166167
""",
167168
)
168169
@click.option(
@@ -239,6 +240,16 @@ def cli():
239240
Defaults to True.
240241
""",
241242
)
243+
@click.option(
244+
"--download-pdfs",
245+
"-dp",
246+
is_flag=True,
247+
default=False,
248+
required=False,
249+
help="""
250+
(Optional) If True, the PDFs will be downloaded and saved in `download_path`. Defaults to False.
251+
""",
252+
)
242253
def search_and_download(
243254
download_path,
244255
category,
@@ -248,6 +259,7 @@ def search_and_download(
248259
start_from_filepath,
249260
loader,
250261
clean_text,
262+
download_pdfs,
251263
):
252264
start_time = time.time()
253265

@@ -260,7 +272,56 @@ def search_and_download(
260272
start_from_filepath=start_from_filepath,
261273
loader=loader,
262274
clean_text=clean_text,
275+
download_pdfs=download_pdfs,
263276
)
264277

265278
elapsed_time = time.time() - start_time
266279
click.echo(f"Downloaded arXiv papers in {elapsed_time:.2f} seconds\n\n")
280+
281+
282+
@cli.command(
    name="download_pdfs",
    help="Downloads the PDFs of the arXiv papers stored in HDF5 files in a specified path.",
)
@click.option(
    "--data-path",
    "-path",
    type=str,
    default="data",
    required=False,
    help="""
    (Optional) The path where the HDF5 files with the arXiv papers metadata exist. The downloaded PDFs will be stored in there as well. Defaults to "data".
    """,
)
def download_pdfs(data_path):
    """
    Downloads the PDFs for all arXiv papers whose metadata is stored as HDF5
    files in `data_path`, saving the PDFs in the same directory.

    Files whose metadata cannot be read or whose PDF cannot be fetched are
    collected and reported at the end instead of aborting the whole run.

    Args:
        data_path (str): Directory containing the `*.hdf5` metadata files.

    Raises:
        click.ClickException: If `data_path` does not exist.
    """
    start_time = time.time()

    # check if `data_path` exists, and if not, returns an error
    data_path = Path(data_path)
    if not data_path.exists():
        raise click.ClickException(f"The specified path {data_path} does not exist.")
    downloader = ArxivDownloader(download_path=data_path, logger=logger)

    # Loops over all HDF5 files in the `data_path` and downloads the corresponding PDFs
    hdf5_files = list(data_path.glob("*.hdf5"))

    failed_downloads = []
    with click.progressbar(
        length=len(hdf5_files), label="Downloading papers PDFs"
    ) as bar:
        for file in hdf5_files:
            try:
                # Load the metadata inside the try so a corrupt/unreadable
                # HDF5 file is recorded as a failure (like a failed download)
                # instead of crashing the whole command mid-run.
                paper = ArxivPaper.from_hdf5(file=file)
                _ = downloader.download_pdf(arxiv_paper=paper)
            except Exception as e:
                failed_downloads.append(str(file))
                logger.error(f"Failed to download PDF for {file}: {e}")
            # Advance the bar whether this file succeeded or failed.
            bar.update(1)

    elapsed_time = time.time() - start_time
    click.echo(f"Downloaded arXiv papers in {elapsed_time:.2f} seconds\n\n")

    if failed_downloads:
        click.echo("\nFailed to download PDFs for the following files:")
        for failed_file in failed_downloads:
            click.echo(f" - {failed_file}")

pyrxiv/datamodel.py

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import datetime
2+
from pathlib import Path
23

34
import h5py
45
import numpy as np
@@ -108,3 +109,47 @@ def to_hdf5(self, hdf_file: h5py.File) -> h5py.Group:
108109
# all other attributes
109110
sub_group.attrs[key] = value
110111
return group
112+
113+
@classmethod
def from_hdf5(cls, file: Path) -> "ArxivPaper":
    """
    Loads the ArxivPaper metadata and text dataset from an HDF5 file and returns an instance of ArxivPaper
    filled with the data.

    The expected layout mirrors `to_hdf5`: a top-level group named after the
    paper id containing an `arxiv_paper` group, with scalar fields stored as
    group attributes and list/text fields stored as datasets.

    Args:
        file (Path): The path to the HDF5 file.

    Returns:
        ArxivPaper: An instance of ArxivPaper filled with the data from the HDF5 file.
    """
    with h5py.File(file, "r") as h5f:
        # The paper id is taken from the filename stem, not stored in the file.
        paper_id = file.stem
        group = h5f[paper_id]["arxiv_paper"]
        data = {}
        # Walk every pydantic model field and pull it from the matching
        # HDF5 location, dispatching on the field name.
        for key in cls.model_fields:
            if key == "id":
                data[key] = paper_id
                continue
            if key in ["updated", "published"]:
                # Timestamps are stored as ISO-8601 strings in the attrs;
                # a missing attribute maps to None.
                value = group.attrs.get(key, None)
                data[key] = (
                    datetime.datetime.fromisoformat(value) if value else None
                )
                continue
            if key == "authors":
                # Author names are stored as a dataset of UTF-8 encoded bytes.
                authors = group.get("authors", [])
                data[key] = [
                    Author(name=author.decode("utf-8")) for author in authors
                ]
                continue
            if key == "text":
                # The extracted paper text is a single scalar bytes dataset;
                # missing text yields an empty string.
                text_data = group.get("text", b"")
                data[key] = text_data[()].decode("utf-8") if text_data else ""
                continue
            if key in group and isinstance(group[key], h5py.Dataset):
                # Remaining datasets are assumed to be arrays of UTF-8 bytes
                # (e.g. categories) — TODO confirm against `to_hdf5`.
                # NOTE(review): a dataset whose value is NOT an ndarray is
                # silently skipped here (field keeps its model default).
                value = group[key][()]
                if isinstance(value, np.ndarray):
                    data[key] = [item.decode("utf-8") for item in value]
                continue
            # Everything else is a plain scalar stored in the group attrs.
            data[key] = group.attrs.get(key, None)
    return cls(**data)

tests/conftest.py

Lines changed: 12 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44

55
import pytest
66

7-
from pyrxiv.datamodel import ArxivPaper
7+
from pyrxiv.datamodel import ArxivPaper, Author
88
from pyrxiv.logger import log_storage
99

1010
if os.getenv("_PYTEST_RAISE", "0") != "0":
@@ -30,11 +30,15 @@ def generate_arxiv_paper(id: str = "1234.5678v1"):
3030
id=id,
3131
url=f"http://arxiv.org/abs/{id}",
3232
pdf_url=f"http://arxiv.org/pdf/{id}",
33-
title="Test Title",
34-
summary="A summary or abstract.",
35-
authors=[],
36-
comment="",
37-
categories=[],
38-
updated=datetime.datetime(2024, 4, 25, 0, 0, tzinfo=datetime.timezone.utc),
39-
published=datetime.datetime(2024, 4, 25, 0, 0, tzinfo=datetime.timezone.utc),
33+
updated=datetime.datetime(2025, 2, 21, 10, 0, 0),
34+
published=datetime.datetime(2025, 2, 20, 12, 0, 0),
35+
title="A test paper",
36+
summary="This is a test summary",
37+
authors=[Author(name="Alice"), Author(name="Bob")],
38+
comment="Some comment",
39+
n_pages=12,
40+
n_figures=3,
41+
categories=["cond-mat.str-el", "cond-mat.mtrl-sci"],
42+
pdf_loader="pypdf",
43+
text="This is the body of the paper.",
4044
)

tests/test_datamodel.py

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
import datetime
2+
import io
3+
4+
import h5py
5+
import pytest
6+
7+
from pyrxiv.datamodel import ArxivPaper
8+
from tests.conftest import generate_arxiv_paper
9+
10+
11+
def test_to_from_hdf5_roundtrip(tmp_path):
    """Round-trip a generated paper through to_hdf5/from_hdf5 and compare fields."""
    original = generate_arxiv_paper()

    # Serialize into an in-memory buffer to avoid unnecessary disk writes.
    mem = io.BytesIO()
    with h5py.File(mem, "w") as h5f:
        original.to_hdf5(h5f)

    # important: reset buffer cursor for reading
    mem.seek(0)

    # write buffer to tmp_path just to satisfy Path input of from_hdf5
    # -> you can patch from_hdf5 later to also accept file-like objects
    hdf5_file = tmp_path / f"{original.id}.h5"
    hdf5_file.write_bytes(mem.getvalue())

    # now load back
    restored = ArxivPaper.from_hdf5(hdf5_file)

    # Field-by-field comparison of the round-tripped paper.
    assert restored.id == original.id
    assert restored.title == original.title
    assert restored.summary == original.summary
    assert [a.name for a in restored.authors] == [a.name for a in original.authors]
    assert restored.text == original.text
    assert restored.categories == original.categories
    assert restored.n_pages == original.n_pages
    assert isinstance(restored.updated, datetime.datetime)

tests/test_download.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ class TestArxivDownloader:
1616
1,
1717
{
1818
"level": "error",
19-
"event": "Failed to download PDF: 404 Client Error: Not Found for url: http://arxiv.org/pdf/1234.5678v1",
19+
"event": "Failed to download PDF: 404 Client Error: Not Found for url: https://arxiv.org/pdf/1234.5678v1",
2020
},
2121
),
2222
(

0 commit comments

Comments
 (0)