More pylint and mypy findings
delucchi-cmu committed Oct 25, 2023
1 parent d9153a8 commit ffb69a1
Showing 4 changed files with 18 additions and 17 deletions.
24 changes: 12 additions & 12 deletions src/lsdb/loaders/dataframe/dataframe_catalog_loader.py
@@ -28,7 +28,7 @@ class DataframeCatalogLoader

     def __init__(
         self,
-        df: pd.DataFrame,
+        dataframe: pd.DataFrame,
         lowest_order: int = 0,
         highest_order: int = 5,
         partition_size: float | None = None,
@@ -38,14 +38,14 @@ def __init__(
         """Initializes a DataframeCatalogLoader

         Args:
-            df (pd.Dataframe): Catalog Pandas Dataframe
+            dataframe (pd.Dataframe): Catalog Pandas Dataframe
             lowest_order (int): The lowest partition order
             highest_order (int): The highest partition order
             partition_size (float): The desired partition size, in megabytes
             threshold (int): The maximum number of data points per pixel
             **kwargs: Arguments to pass to the creation of the catalog info
         """
-        self.df = df
+        self.dataframe = dataframe
        self.lowest_order = lowest_order
        self.highest_order = highest_order
        self.threshold = self._calculate_threshold(partition_size, threshold)
@@ -66,11 +66,11 @@ def _calculate_threshold(self, partition_size: float | None = None, threshold: i
             raise ValueError("Specify only one: threshold or partition_size")
         if threshold is None:
             if partition_size is not None:
-                df_size_bytes = self.df.memory_usage().sum()
+                df_size_bytes = self.dataframe.memory_usage().sum()
                 # Round the number of partitions to the next integer, otherwise the
                 # number of pixels per partition may exceed the threshold
                 num_partitions = math.ceil(df_size_bytes / (partition_size * (1 << 20)))
-                threshold = len(self.df.index) // num_partitions
+                threshold = len(self.dataframe.index) // num_partitions
             else:
                 threshold = DataframeCatalogLoader.DEFAULT_THRESHOLD
         return threshold
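
Note: a quick sketch of the partition-size arithmetic above, with hypothetical numbers (a ~100 MiB dataframe of 1,000,000 rows is an assumption, not taken from this commit):

    import math

    df_size_bytes = 100 * (1 << 20)  # assume a 100 MiB dataframe
    partition_size = 25              # requested partition size, in MiB
    num_rows = 1_000_000             # assumed row count

    # Same formula as _calculate_threshold: round the partition count up
    # so no partition exceeds the requested size.
    num_partitions = math.ceil(df_size_bytes / (partition_size * (1 << 20)))  # -> 4
    threshold = num_rows // num_partitions                                    # -> 250_000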
@@ -107,11 +107,11 @@ def load_catalog(self) -> Catalog:
     def _set_hipscat_index(self):
         """Generates the hipscat indices for each data point and assigns
         the hipscat index column as the Dataframe index."""
-        self.df[HIPSCAT_ID_COLUMN] = compute_hipscat_id(
-            ra_values=self.df[self.catalog_info.ra_column],
-            dec_values=self.df[self.catalog_info.dec_column],
+        self.dataframe[HIPSCAT_ID_COLUMN] = compute_hipscat_id(
+            ra_values=self.dataframe[self.catalog_info.ra_column],
+            dec_values=self.dataframe[self.catalog_info.dec_column],
         )
-        self.df.set_index(HIPSCAT_ID_COLUMN, inplace=True)
+        self.dataframe.set_index(HIPSCAT_ID_COLUMN, inplace=True)

     def _compute_pixel_map(self) -> Dict[HealpixPixel, HealpixInfo]:
         """Compute object histogram and generate the mapping between
@@ -123,7 +123,7 @@ def _compute_pixel_map(self) -> Dict[HealpixPixel, HealpixInfo]:
         of objects in the HEALPix pixel, the second is the list of pixels
         """
         raw_histogram = generate_histogram(
-            self.df,
+            self.dataframe,
             highest_order=self.highest_order,
             ra_column=self.catalog_info.ra_column,
             dec_column=self.catalog_info.dec_column,
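
Note: conceptually, generate_histogram counts rows per HEALPix pixel at highest_order. A minimal sketch of that idea using healpy directly (healpy and the nested ordering here are assumptions; lsdb's generate_histogram has its own implementation):

    import healpy as hp
    import numpy as np

    nside = 2**5  # highest_order = 5
    ra = np.array([10.0, 10.1, 250.3])  # toy coordinates, in degrees
    dec = np.array([-5.0, -5.2, 41.7])

    # Map each (ra, dec) to a nested HEALPix pixel, then count per pixel.
    pixels = hp.ang2pix(nside, ra, dec, nest=True, lonlat=True)
    histogram = np.bincount(pixels, minlength=hp.nside2npix(nside))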
@@ -161,7 +161,7 @@ def _generate_dask_df_and_map(
             ddf_pixel_map[hp_pixel] = hp_pixel_index

         # Generate Dask Dataframe with original schema
-        schema = pd.DataFrame(columns=self.df.columns).astype(self.df.dtypes)
+        schema = pd.DataFrame(columns=self.dataframe.columns).astype(self.dataframe.dtypes)
         ddf = self._generate_dask_dataframe(pixel_dfs, schema)

         return ddf, ddf_pixel_map
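
Note: the schema line uses a common pandas idiom, a zero-row frame that keeps the original column names and dtypes, which serves as metadata for the Dask dataframe. A small sketch (the toy columns are illustrative):

    import pandas as pd

    source = pd.DataFrame({"ra": [1.0], "dec": [2.0], "id": [7]})

    # Empty frame with identical columns and dtypes.
    schema = pd.DataFrame(columns=source.columns).astype(source.dtypes)
    assert len(schema) == 0
    assert (schema.dtypes == source.dtypes).all()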
@@ -199,4 +199,4 @@ def _get_dataframe_for_healpix(self, pixels: List[int]) -> pd.DataFrame:
         """
         left_bound = healpix_to_hipscat_id(self.highest_order, pixels[0])
         right_bound = healpix_to_hipscat_id(self.highest_order, pixels[-1] + 1)
-        return self.df.loc[(self.df.index >= left_bound) & (self.df.index < right_bound)]
+        return self.dataframe.loc[(self.dataframe.index >= left_bound) & (self.dataframe.index < right_bound)]
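
Note: the half-open range selection works because the frame was re-indexed on the monotonically increasing hipscat ID. A toy sketch with made-up integer IDs (real bounds come from healpix_to_hipscat_id):

    import pandas as pd

    frame = pd.DataFrame({"mag": [20.1, 19.5, 21.3, 18.9]}, index=[10, 25, 40, 55])
    left_bound, right_bound = 20, 50

    # Same predicate as _get_dataframe_for_healpix: inclusive left, exclusive right.
    subset = frame.loc[(frame.index >= left_bound) & (frame.index < right_bound)]
    # -> the rows indexed 25 and 40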
8 changes: 5 additions & 3 deletions src/lsdb/loaders/dataframe/from_dataframe.py
@@ -7,7 +7,7 @@


 def from_dataframe(
-    df: pd.DataFrame,
+    dataframe: pd.DataFrame,
     lowest_order: int = 0,
     highest_order: int = 5,
     partition_size: float | None = None,
@@ -17,7 +17,7 @@ def from_dataframe(
     """Load a catalog from a Pandas Dataframe in CSV format.

     Args:
-        df (pd.Dataframe): The catalog Pandas Dataframe
+        dataframe (pd.Dataframe): The catalog Pandas Dataframe
         lowest_order (int): The lowest partition order
         highest_order (int): The highest partition order
         partition_size (float): The desired partition size, in megabytes
@@ -27,5 +27,7 @@ def from_dataframe(
     Returns:
         Catalog object loaded from the given parameters
     """
-    loader = DataframeCatalogLoader(df, lowest_order, highest_order, partition_size, threshold, **kwargs)
+    loader = DataframeCatalogLoader(
+        dataframe, lowest_order, highest_order, partition_size, threshold, **kwargs
+    )
     return loader.load_catalog()
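
Note: a minimal usage sketch of the renamed entry point. The catalog_name/catalog_type keywords are assumptions about what **kwargs forwards to the catalog info; the toy table is illustrative:

    import pandas as pd
    import lsdb

    dataframe = pd.DataFrame(
        {"ra": [10.0, 10.1], "dec": [-5.0, -5.2], "mag": [18.2, 19.7]}
    )
    catalog = lsdb.from_dataframe(
        dataframe,
        lowest_order=0,
        highest_order=5,
        catalog_name="demo",       # assumed catalog-info kwarg
        catalog_type="object",     # assumed catalog-info kwarg
    )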
2 changes: 1 addition & 1 deletion src/lsdb/loaders/hipscat/read_hipscat.py
@@ -54,7 +54,7 @@ def read_hipscat(


 def _get_dataset_class_from_catalog_info(
-    base_catalog_path: str, storage_options: dict = None
+    base_catalog_path: str, storage_options: Union[Dict[Any, Any], None] = None
 ) -> Type[Dataset]:
     base_catalog_dir = hc.io.get_file_pointer_from_path(base_catalog_path)
     catalog_info_path = hc.io.paths.get_catalog_info_pointer(base_catalog_dir)
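Note: the storage_options annotation change matches the commit's mypy cleanup. `dict = None` is an implicit Optional, which mypy rejects by default (since 0.990); spelling out the union fixes it. A sketch of the pattern (the function name is illustrative):

    from typing import Any, Dict, Union

    def load(storage_options: Union[Dict[Any, Any], None] = None) -> None:
        # Treat a missing mapping as empty.
        options = storage_options or {}
        print(sorted(options))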
1 change: 0 additions & 1 deletion tests/lsdb/loaders/dataframe/test_from_dataframe.py
@@ -2,7 +2,6 @@
 import pandas as pd
 import pytest
 from hipscat.catalog import CatalogType
-from hipscat.pixel_math import HealpixPixel
 from hipscat.pixel_tree.pixel_node_type import PixelNodeType

 import lsdb
