Skip to content

Commit bcd72bc

Browse files
authored
Merge pull request #19 from JosePizarro3/cleanup-and-fixes
Cleanup and fixes
2 parents e0a69ed + a3f24f9 commit bcd72bc

File tree

7 files changed

+201
-21
lines changed

7 files changed

+201
-21
lines changed

.github/release.yml

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
# Configuration for GitHub's automatically generated release notes:
# merged PRs are grouped into sections by their labels, and PRs authored
# by bots are excluded from the changelog entirely.
changelog:
  exclude:
    # Skip automation accounts so bot PRs don't clutter the release notes.
    authors: [github-actions, pre-commit-ci]
  categories:
    - title: 💥 Breaking Changes
      labels: [breaking]
    - title: 🎉 New Features
      labels: [feature]
    - title: 🛠 Enhancements
      labels: [enhancement, DX, UX]
    - title: 🐛 Bug Fixes
      labels: [fix]
    - title: 📖 Documentation
      labels: [docs]
    - title: 🚀 Performance
      labels: [performance]
    - title: 🚧 CI
      labels: [ci]
    - title: 💡 Refactoring
      labels: [refactor]
    - title: 🧪 Tests
      labels: [tests]
    - title: 🔒 Security Fixes
      labels: [security]
    - title: 🏥 Package Health
      labels: [pkg]
    - title: 📦 Dependencies
      labels: [dependencies, outdated]
    # Catch-all bucket for any PR whose labels match none of the above.
    - title: 🤷‍♂️ Other Changes
      labels: ['*']

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ pyrxiv search_and_download --help
3232

3333
For example:
3434
```bash
35-
pyrxiv search_and_download --category cond-mat.str-el --regex-pattern "DMFT|Hubbard" --n-papers 5
35+
pyrxiv search_and_download --category cond-mat.str-el --regex-pattern "DMFT|Hubbard" --n-papers 5 --download-pdfs True
3636
```
3737

3838
---

pyrxiv/cli/cli.py

Lines changed: 72 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,12 @@
11
import re
22
import time
33
from pathlib import Path
4-
from typing import TYPE_CHECKING
54

65
import click
76
import h5py
87
import numpy as np
98

10-
if TYPE_CHECKING:
11-
from pyrxiv.datamodel import ArxivPaper
12-
13-
9+
from pyrxiv.datamodel import ArxivPaper
1410
from pyrxiv.download import ArxivDownloader
1511
from pyrxiv.extract import TextExtractor
1612
from pyrxiv.fetch import ArxivFetcher
@@ -46,19 +42,21 @@ def run_search_and_download(
4642
start_from_filepath: bool = False,
4743
loader: str = "pdfminer",
4844
clean_text: bool = True,
45+
download_pdfs: bool = False,
4946
) -> tuple[list[Path], list["ArxivPaper"]]:
5047
"""
5148
Searches for a specific number of papers `n_papers` in arXiv for a specified `category` and downloads
52-
them in a `download_path`.
49+
their metadata in an HDF5 file in `download_path`.
5350
5451
If `regex_pattern` is specified, only the papers that contain the pattern will be downloaded.
5552
If `start_id` is specified, the search will start from that ID.
5653
If `start_from_filepath` is True, the search will start from the last downloaded paper's ID.
5754
If `loader` is specified, the text will be extracted using the corresponding loader.
58-
55+
If `clean_text` is True, the extracted text will be cleaned by removing references and unnecessary whitespaces.
56+
If `download_pdfs` is True, the PDFs will be downloaded and saved in `download_path`.
5957
6058
Args:
61-
download_path (Path, optional): The path for downloading the arXiv PDFs. Defaults to Path("data").
59+
download_path (Path, optional): The path for downloading the arXiv metadata. Defaults to Path("data").
6260
category (str, optional): The arXiv category on which the papers will be searched. Defaults to "cond-mat.str-el".
6361
n_papers (int, optional): The number of arXiv papers to be fetched and downloaded.
6462
If `regex_pattern` is not specified, this would correspond to the n_papers starting from the newest in the `category`. Defaults to 5.
@@ -70,9 +68,11 @@ def run_search_and_download(
7068
loader (str, optional): PDF loader to use for extracting text from the downloaded PDFs.
7169
Defaults to "pdfminer". Available loaders: "pdfminer", "pypdf".
7270
clean_text (bool, optional): If True, the extracted text will be cleaned by removing references and unnecessary whitespaces.
71+
Defaults to True.
72+
download_pdfs (bool, optional): If True, the PDFs will be downloaded and saved in `download_path`. Defaults to False.
7373
7474
Returns:
75-
tuple[list[Path], list[ArxivPaper]]: A tuple containing a list of Paths to the downloaded PDFs and a list of ArxivPaper objects
75+
tuple[list[Path], list[ArxivPaper]]: A tuple containing a list of Paths to the arXiv papers and a list of ArxivPaper objects
7676
with the extracted text.
7777
"""
7878
if loader not in ["pdfminer", "pypdf"]:
@@ -134,7 +134,8 @@ def run_search_and_download(
134134
save_paper_to_hdf5(paper=paper, pdf_path=pdf_path, hdf_path=hdf_path)
135135

136136
# Deleting the PDF file after storing it in HDF5
137-
pdf_path.unlink()
137+
if not download_pdfs:
138+
pdf_path.unlink()
138139

139140
# Appending the HDF5 file and paper to the lists
140141
pattern_files.append(hdf_path)
@@ -162,7 +163,7 @@ def cli():
162163
default="data",
163164
required=False,
164165
help="""
165-
(Optional) The path for downloading the arXiv PDFs. Defaults to "data".
166+
(Optional) The path for downloading the arXiv metadata in HDF5 files and, optionally (if set with download-pdfs), the PDFs. Defaults to "data".
166167
""",
167168
)
168169
@click.option(
@@ -239,6 +240,16 @@ def cli():
239240
Defaults to True.
240241
""",
241242
)
243+
@click.option(
244+
"--download-pdfs",
245+
"-dp",
246+
is_flag=True,
247+
default=False,
248+
required=False,
249+
help="""
250+
(Optional) If True, the PDFs will be downloaded and saved in `download_path`. Defaults to False.
251+
""",
252+
)
242253
def search_and_download(
243254
download_path,
244255
category,
@@ -248,6 +259,7 @@ def search_and_download(
248259
start_from_filepath,
249260
loader,
250261
clean_text,
262+
download_pdfs,
251263
):
252264
start_time = time.time()
253265

@@ -260,7 +272,56 @@ def search_and_download(
260272
start_from_filepath=start_from_filepath,
261273
loader=loader,
262274
clean_text=clean_text,
275+
download_pdfs=download_pdfs,
263276
)
264277

265278
elapsed_time = time.time() - start_time
266279
click.echo(f"Downloaded arXiv papers in {elapsed_time:.2f} seconds\n\n")
280+
281+
282+
@cli.command(
    name="download_pdfs",
    help="Downloads the PDFs of the arXiv papers stored in HDF5 files in a specified path.",
)
@click.option(
    "--data-path",
    "-path",
    type=str,
    default="data",
    required=False,
    help="""
    (Optional) The path where the HDF5 files with the arXiv papers metadata exist. The downloaded PDFs will be stored in there as well. Defaults to "data".
    """,
)
def download_pdfs(data_path):
    """
    Downloads the PDFs for all arXiv papers whose metadata is stored as HDF5
    files in `data_path`, saving the PDFs in the same directory.

    Files whose metadata cannot be read or whose PDF cannot be fetched are
    collected and reported at the end instead of aborting the whole run.

    Args:
        data_path (str): Directory containing the `*.hdf5` metadata files.

    Raises:
        click.ClickException: If `data_path` does not exist.
    """
    start_time = time.time()

    # check if `data_path` exists, and if not, returns an error
    data_path = Path(data_path)
    if not data_path.exists():
        raise click.ClickException(f"The specified path {data_path} does not exist.")
    downloader = ArxivDownloader(download_path=data_path, logger=logger)

    # Loops over all HDF5 files in the `data_path` and downloads the corresponding PDFs
    hdf5_files = list(data_path.glob("*.hdf5"))

    failed_downloads = []
    with click.progressbar(
        length=len(hdf5_files), label="Downloading papers PDFs"
    ) as bar:
        for file in hdf5_files:
            try:
                # Load the metadata inside the try so a corrupt/unreadable
                # HDF5 file is recorded as a failure (like a failed download)
                # instead of crashing the whole command mid-run.
                paper = ArxivPaper.from_hdf5(file=file)
                _ = downloader.download_pdf(arxiv_paper=paper)
            except Exception as e:
                failed_downloads.append(str(file))
                logger.error(f"Failed to download PDF for {file}: {e}")
            # Advance the bar whether this file succeeded or failed.
            bar.update(1)

    elapsed_time = time.time() - start_time
    click.echo(f"Downloaded arXiv papers in {elapsed_time:.2f} seconds\n\n")

    if failed_downloads:
        click.echo("\nFailed to download PDFs for the following files:")
        for failed_file in failed_downloads:
            click.echo(f" - {failed_file}")

pyrxiv/datamodel.py

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import datetime
2+
from pathlib import Path
23

34
import h5py
45
import numpy as np
@@ -108,3 +109,47 @@ def to_hdf5(self, hdf_file: h5py.File) -> h5py.Group:
108109
# all other attributes
109110
sub_group.attrs[key] = value
110111
return group
112+
113+
@classmethod
def from_hdf5(cls, file: Path) -> "ArxivPaper":
    """
    Loads the ArxivPaper metadata and text dataset from an HDF5 file and returns an instance of ArxivPaper
    filled with the data.

    The expected layout mirrors `to_hdf5`: a top-level group named after the
    paper id containing an `arxiv_paper` group, with scalar fields stored as
    group attributes and list/text fields stored as datasets.

    Args:
        file (Path): The path to the HDF5 file.

    Returns:
        ArxivPaper: An instance of ArxivPaper filled with the data from the HDF5 file.
    """
    with h5py.File(file, "r") as h5f:
        # The paper id is taken from the filename stem, not stored in the file.
        paper_id = file.stem
        group = h5f[paper_id]["arxiv_paper"]
        data = {}
        # Walk every pydantic model field and pull it from the matching
        # HDF5 location, dispatching on the field name.
        for key in cls.model_fields:
            if key == "id":
                data[key] = paper_id
                continue
            if key in ["updated", "published"]:
                # Timestamps are stored as ISO-8601 strings in the attrs;
                # a missing attribute maps to None.
                value = group.attrs.get(key, None)
                data[key] = (
                    datetime.datetime.fromisoformat(value) if value else None
                )
                continue
            if key == "authors":
                # Author names are stored as a dataset of UTF-8 encoded bytes.
                authors = group.get("authors", [])
                data[key] = [
                    Author(name=author.decode("utf-8")) for author in authors
                ]
                continue
            if key == "text":
                # The extracted paper text is a single scalar bytes dataset;
                # missing text yields an empty string.
                text_data = group.get("text", b"")
                data[key] = text_data[()].decode("utf-8") if text_data else ""
                continue
            if key in group and isinstance(group[key], h5py.Dataset):
                # Remaining datasets are assumed to be arrays of UTF-8 bytes
                # (e.g. categories) — TODO confirm against `to_hdf5`.
                # NOTE(review): a dataset whose value is NOT an ndarray is
                # silently skipped here (field keeps its model default).
                value = group[key][()]
                if isinstance(value, np.ndarray):
                    data[key] = [item.decode("utf-8") for item in value]
                continue
            # Everything else is a plain scalar stored in the group attrs.
            data[key] = group.attrs.get(key, None)
    return cls(**data)

tests/conftest.py

Lines changed: 12 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44

55
import pytest
66

7-
from pyrxiv.datamodel import ArxivPaper
7+
from pyrxiv.datamodel import ArxivPaper, Author
88
from pyrxiv.logger import log_storage
99

1010
if os.getenv("_PYTEST_RAISE", "0") != "0":
@@ -30,11 +30,15 @@ def generate_arxiv_paper(id: str = "1234.5678v1"):
3030
id=id,
3131
url=f"http://arxiv.org/abs/{id}",
3232
pdf_url=f"http://arxiv.org/pdf/{id}",
33-
title="Test Title",
34-
summary="A summary or abstract.",
35-
authors=[],
36-
comment="",
37-
categories=[],
38-
updated=datetime.datetime(2024, 4, 25, 0, 0, tzinfo=datetime.timezone.utc),
39-
published=datetime.datetime(2024, 4, 25, 0, 0, tzinfo=datetime.timezone.utc),
33+
updated=datetime.datetime(2025, 2, 21, 10, 0, 0),
34+
published=datetime.datetime(2025, 2, 20, 12, 0, 0),
35+
title="A test paper",
36+
summary="This is a test summary",
37+
authors=[Author(name="Alice"), Author(name="Bob")],
38+
comment="Some comment",
39+
n_pages=12,
40+
n_figures=3,
41+
categories=["cond-mat.str-el", "cond-mat.mtrl-sci"],
42+
pdf_loader="pypdf",
43+
text="This is the body of the paper.",
4044
)

tests/test_datamodel.py

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
import datetime
2+
import io
3+
4+
import h5py
5+
import pytest
6+
7+
from pyrxiv.datamodel import ArxivPaper
8+
from tests.conftest import generate_arxiv_paper
9+
10+
11+
def test_to_from_hdf5_roundtrip(tmp_path):
    """Round-trip a generated paper through to_hdf5/from_hdf5 and compare fields."""
    original = generate_arxiv_paper()

    # Serialize into an in-memory buffer to avoid unnecessary disk writes.
    mem = io.BytesIO()
    with h5py.File(mem, "w") as h5f:
        original.to_hdf5(h5f)

    # important: reset buffer cursor for reading
    mem.seek(0)

    # write buffer to tmp_path just to satisfy Path input of from_hdf5
    # -> you can patch from_hdf5 later to also accept file-like objects
    hdf5_file = tmp_path / f"{original.id}.h5"
    hdf5_file.write_bytes(mem.getvalue())

    # now load back
    restored = ArxivPaper.from_hdf5(hdf5_file)

    # Field-by-field comparison of the round-tripped paper.
    assert restored.id == original.id
    assert restored.title == original.title
    assert restored.summary == original.summary
    assert [a.name for a in restored.authors] == [a.name for a in original.authors]
    assert restored.text == original.text
    assert restored.categories == original.categories
    assert restored.n_pages == original.n_pages
    assert isinstance(restored.updated, datetime.datetime)

tests/test_download.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ class TestArxivDownloader:
1616
1,
1717
{
1818
"level": "error",
19-
"event": "Failed to download PDF: 404 Client Error: Not Found for url: http://arxiv.org/pdf/1234.5678v1",
19+
"event": "Failed to download PDF: 404 Client Error: Not Found for url: https://arxiv.org/pdf/1234.5678v1",
2020
},
2121
),
2222
(

0 commit comments

Comments
 (0)