Skip to content

Commit f16acbd

Browse files
authored
Merge pull request #12 from astronomy-commons/sean/add-catalog-loading
Add Catalog class and Catalog Loading
2 parents 7f72881 + ca73e96 commit f16acbd

32 files changed

+486
-39
lines changed

.pre-commit-config.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -100,9 +100,9 @@ repos:
100100
files: ^(src|tests)/
101101
args:
102102
[
103-
103+
104104
"--ignore-missing-imports", # Ignore imports without type hints
105-
105+
106106
]
107107

108108
# Make sure Sphinx can build the documentation without issues.

pyproject.toml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,9 @@ classifiers = [
1515
]
1616
dynamic = ["version"]
1717
dependencies = [
18+
"dask",
19+
"hipscat",
20+
"pyarrow",
1821
"deprecated",
1922
"ipykernel", # Support for Jupyter notebooks
2023
]

src/lsdb/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1,2 @@
1-
from .example_module import *
1+
from .catalog import Catalog
2+
from .loaders import read_hipscat

src/lsdb/catalog/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
from .catalog import Catalog

src/lsdb/catalog/catalog.py

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
from typing import Dict
2+
3+
import dask.dataframe as dd
4+
import hipscat as hc
5+
6+
from lsdb.core.healpix.healpix_pixel import HealpixPixel
7+
8+
DaskDFPixelMap = Dict[HealpixPixel, int]
9+
10+
11+
# pylint: disable=R0903, W0212
class Catalog:
    """LSDB Catalog DataFrame to perform analysis of sky catalogs and efficient
    spatial operations.

    Attributes:
        name: Name of the catalog
        hc_structure: `hipscat.Catalog` object representing the structure
            and metadata of the HiPSCat catalog
    """

    def __init__(
        self,
        ddf: dd.DataFrame,
        ddf_pixel_map: DaskDFPixelMap,
        hc_structure: hc.catalog.Catalog,
    ):
        """Initialise a Catalog object.

        Not to be used to load a catalog directly, use one of the `lsdb.from_...` or
        `lsdb.load_...` methods

        Args:
            ddf: Dask DataFrame with the source data of the catalog
            ddf_pixel_map: Dictionary mapping HEALPix order and pixel to partition index of ddf
            hc_structure: `hipscat.Catalog` object with hipscat metadata of the catalog
        """
        self._ddf = ddf
        self._ddf_pixel_map = ddf_pixel_map
        self.hc_structure = hc_structure

    def __repr__(self):
        # Delegate to the underlying Dask DataFrame's representation.
        return self._ddf.__repr__()

    def _repr_html_(self):
        # Jupyter-notebook rich display, delegated to the Dask DataFrame.
        return self._ddf._repr_html_()

    def compute(self):
        """Compute dask distributed dataframe to pandas dataframe"""
        return self._ddf.compute()

    def get_partition(self, order: int, pixel: int) -> dd.DataFrame:
        """Get the dask partition for a given HEALPix pixel

        Args:
            order: Order of HEALPix pixel
            pixel: HEALPix pixel number in NESTED ordering scheme
        Returns:
            Dask Dataframe with a single partition with data at that pixel
        Raises:
            ValueError: if no data exists for the specified pixel
        """
        hp_pixel = HealpixPixel(order, pixel)
        # EAFP: a single dict lookup both tests membership and fetches the
        # partition index (the original used `in` followed by `[]`, i.e. two
        # lookups, and the non-idiomatic `not x in y` form).
        try:
            partition_index = self._ddf_pixel_map[hp_pixel]
        except KeyError as err:
            raise ValueError(
                f"Pixel at order {order} pixel {pixel} not in Catalog"
            ) from err
        return self._ddf.partitions[partition_index]
File renamed without changes.
File renamed without changes.
Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
from __future__ import annotations
2+
3+
import hipscat.pixel_math.hipscat_id
4+
5+
MAXIMUM_ORDER = hipscat.pixel_math.hipscat_id.HIPSCAT_ID_HEALPIX_ORDER
6+
7+
8+
class HealpixPixel:
    """A HEALPix pixel, represented by an order and pixel number in NESTED ordering scheme

    see https://lambda.gsfc.nasa.gov/toolbox/pixelcoords.html for more information
    """

    def __init__(self, order: int, pixel: int) -> None:
        """Initialize a HEALPix pixel

        Args:
            order: HEALPix order
            pixel: HEALPix pixel number in NESTED ordering scheme

        Raises:
            ValueError: if `order` exceeds the maximum order supported by
                the hipscat id encoding.
        """
        # Guard clause: reject orders the hipscat id scheme cannot encode.
        if order > MAXIMUM_ORDER:
            raise ValueError(f"HEALPix order cannot be greater than {MAXIMUM_ORDER}")
        self.order = order
        self.pixel = pixel

    def _key(self) -> tuple[int, int]:
        """The (order, pixel) pair used for hashing and equality checks."""
        return self.order, self.pixel

    def __eq__(self, other: object) -> bool:
        """Two pixels are equal when both order and pixel number match."""
        return isinstance(other, HealpixPixel) and self._key() == other._key()

    def __hash__(self) -> int:
        """Hash on (order, pixel) so equal pixel objects are looked up the
        same in hashable data structures."""
        return hash(self._key())

    def __str__(self) -> str:
        return f"Order: {self.order}, Pixel: {self.pixel}"

    def __repr__(self):
        return str(self)

src/lsdb/example_module.py

Lines changed: 0 additions & 23 deletions
This file was deleted.

src/lsdb/io/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
from .parquet_io import *

src/lsdb/io/parquet_io.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
import pandas as pd
2+
from hipscat.io import FilePointer
3+
from pyarrow import Schema
4+
import pyarrow.parquet as pq
5+
6+
7+
def read_parquet_schema(file_pointer: FilePointer) -> Schema:
    """Read the schema of a parquet file without loading its data.

    Args:
        file_pointer: location of the parquet file to inspect

    Returns:
        pyarrow `Schema` describing the columns of the file
    """
    return pq.read_schema(file_pointer)
9+
10+
11+
def read_parquet_file_to_pandas(file_pointer: FilePointer, **kwargs) -> pd.DataFrame:
    """Load a single parquet file into a pandas DataFrame.

    Args:
        file_pointer: location of the parquet file to load
        **kwargs: additional keyword arguments forwarded to `pd.read_parquet`

    Returns:
        pandas DataFrame with the contents of the file
    """
    return pd.read_parquet(file_pointer, **kwargs)

src/lsdb/loaders/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
from .hipscat import read_hipscat

src/lsdb/loaders/hipscat/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
from .read_hipscat import read_hipscat
Lines changed: 95 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,95 @@
1+
import dask.dataframe as dd
2+
import hipscat as hc
3+
import pyarrow
4+
5+
from lsdb import io
6+
from lsdb.catalog.catalog import Catalog, DaskDFPixelMap
7+
from lsdb.core.healpix.healpix_pixel import MAXIMUM_ORDER, HealpixPixel
8+
from lsdb.loaders.hipscat.hipscat_loading_config import HipscatLoadingConfig
9+
10+
11+
# pylint: disable=R0903
class HipscatCatalogLoader:
    """Loads a HiPSCat formatted Catalog"""

    def __init__(self, path: str, config: HipscatLoadingConfig) -> None:
        """Initializes a HipscatCatalogLoader

        Args:
            path: path to the root of the HiPSCat catalog
            config: options to configure how the catalog is loaded
        """
        self.path = path
        self.base_catalog_dir = hc.io.get_file_pointer_from_path(self.path)
        self.config = config

    def load_catalog(self) -> Catalog:
        """Load a catalog from the configuration specified when the loader was created

        Returns:
            Catalog object with data from the source given at loader initialization
        """
        hc_catalog = self.load_hipscat_catalog()
        dask_df, dask_df_pixel_map = self._load_dask_df_and_map(hc_catalog)
        return Catalog(dask_df, dask_df_pixel_map, hc_catalog)

    def load_hipscat_catalog(self) -> hc.catalog.Catalog:
        """Load `hipscat` library catalog object with catalog metadata and partition data"""
        return hc.catalog.Catalog(catalog_path=self.path)

    def _load_dask_df_and_map(
        self, catalog: hc.catalog.Catalog
    ) -> tuple[dd.DataFrame, DaskDFPixelMap]:
        """Load Dask DF from parquet files and make dict of HEALPix pixel to partition index"""
        ordered_pixels = self._get_ordered_pixel_list(catalog)
        ordered_paths = self._get_paths_from_pixels(catalog, ordered_pixels)
        # Partition i of the Dask DataFrame holds the data of ordered_pixels[i].
        pixel_to_index_map = {hp: i for i, hp in enumerate(ordered_pixels)}
        return self._load_df_from_paths(catalog, ordered_paths), pixel_to_index_map

    def _get_ordered_pixel_list(
        self, catalog: hc.catalog.Catalog
    ) -> list[HealpixPixel]:
        """Build the catalog's pixel list, sorted by pixel number at highest order."""
        order_column = hc.catalog.PartitionInfo.METADATA_ORDER_COLUMN_NAME
        pixel_column = hc.catalog.PartitionInfo.METADATA_PIXEL_COLUMN_NAME
        pixels = [
            HealpixPixel(row[order_column], row[pixel_column])
            for _, row in catalog.get_pixels().iterrows()
        ]
        # Scaling each pixel number up to MAXIMUM_ORDER (factor 4 per order)
        # orders pixels of mixed orders by sky position.
        return sorted(
            pixels,
            key=lambda hp: (4 ** (MAXIMUM_ORDER - hp.order)) * hp.pixel,
        )

    def _get_paths_from_pixels(
        self, catalog: hc.catalog.Catalog, ordered_pixels: list[HealpixPixel]
    ) -> list[hc.io.FilePointer]:
        """Map each pixel to the file pointer of its parquet partition file."""
        return [
            hc.io.paths.pixel_catalog_file(
                catalog_base_dir=catalog.catalog_base_dir,
                pixel_order=hp.order,
                pixel_number=hp.pixel,
            )
            for hp in ordered_pixels
        ]

    def _load_df_from_paths(
        self, catalog: hc.catalog.Catalog, paths: list[hc.io.FilePointer]
    ) -> dd.DataFrame:
        """Create a Dask DataFrame with one partition per parquet file path."""
        metadata_schema = self._load_parquet_metadata_schema(catalog, paths)
        # An empty pandas frame with the right columns/dtypes serves as Dask's meta.
        dask_meta_schema = metadata_schema.empty_table().to_pandas()
        return dd.from_map(io.read_parquet_file_to_pandas, paths, meta=dask_meta_schema)

    def _load_parquet_metadata_schema(
        self, catalog: hc.catalog.Catalog, paths: list[hc.io.FilePointer]
    ) -> pyarrow.Schema:
        """Read the parquet schema from the catalog-level metadata file if it
        exists, otherwise from the first partition file."""
        metadata_pointer = hc.io.paths.get_parquet_metadata_pointer(
            catalog.catalog_base_dir
        )
        if hc.io.file_io.does_file_or_directory_exist(metadata_pointer):
            return io.read_parquet_schema(metadata_pointer)
        return io.read_parquet_schema(paths[0])
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
from dataclasses import dataclass
2+
3+
4+
@dataclass
class HipscatLoadingConfig:
    """Configuration for loading a HiPSCat catalog in lsdb.

    Holds no options yet; it is kept as a dataclass so that future loading
    parameters have a single place to live without changing call sites.
    """
    # NOTE: no `pass` needed — the docstring alone is a valid class body.
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
import dataclasses
2+
3+
from lsdb import Catalog
4+
from lsdb.loaders.hipscat.hipscat_catalog_loader import HipscatCatalogLoader
5+
from lsdb.loaders.hipscat.hipscat_loading_config import HipscatLoadingConfig
6+
7+
8+
def read_hipscat(
    path: str,
) -> Catalog:
    """Load a catalog from a HiPSCat formatted catalog.

    Args:
        path: The path that locates the root of the HiPSCat catalog

    Returns:
        Catalog object representing the HiPSCat catalog at `path`
    """
    # Collect this function's keyword arguments into a loading-config object.
    # There are currently no arguments beyond `path`, so the config is empty,
    # but the machinery is kept so future loading options only need to be
    # added to this signature and to HipscatLoadingConfig.
    kwd_args = locals().copy()
    config_args = {
        field.name: kwd_args[field.name]
        for field in dataclasses.fields(HipscatLoadingConfig)
    }
    config = HipscatLoadingConfig(**config_args)

    loader = HipscatCatalogLoader(path, config)

    return loader.load_catalog()

tests/.pylintrc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -425,6 +425,7 @@ disable=raw-checker-failed,
425425
redefined-outer-name,
426426
protected-access,
427427
missing-module-docstring,
428+
unnecessary-pass,
428429

429430
# Enable the message, report, category or checker with the given id(s). You can
430431
# either give multiple identifier separated by comma (,) or put this option

tests/conftest.py

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
import os
2+
3+
import hipscat as hc
4+
import pytest
5+
6+
import lsdb
7+
8+
DATA_DIR_NAME = "data"
SMALL_SKY_DIR_NAME = "small_sky"
SMALL_SKY_ORDER1_DIR_NAME = "small_sky_order1"
TEST_DIR = os.path.dirname(__file__)


@pytest.fixture
def test_data_dir():
    """Absolute path of the directory holding the test catalogs."""
    return os.path.join(TEST_DIR, DATA_DIR_NAME)


@pytest.fixture
def small_sky_dir(test_data_dir):
    """Path of the small_sky test catalog directory."""
    return os.path.join(test_data_dir, SMALL_SKY_DIR_NAME)


@pytest.fixture
def small_sky_order1_dir(test_data_dir):
    """Path of the small_sky_order1 test catalog directory."""
    return os.path.join(test_data_dir, SMALL_SKY_ORDER1_DIR_NAME)


@pytest.fixture
def small_sky_hipscat_catalog(small_sky_dir):
    """`hipscat.Catalog` object for the small_sky catalog."""
    return hc.catalog.Catalog(small_sky_dir)


@pytest.fixture
def small_sky_catalog(small_sky_dir):
    """lsdb Catalog loaded from the small_sky directory."""
    return lsdb.read_hipscat(small_sky_dir)


@pytest.fixture
def small_sky_order1_hipscat_catalog(small_sky_order1_dir):
    """`hipscat.Catalog` object for the small_sky_order1 catalog."""
    return hc.catalog.Catalog(small_sky_order1_dir)


@pytest.fixture
def small_sky_order1_catalog(small_sky_order1_dir):
    """lsdb Catalog loaded from the small_sky_order1 directory."""
    return lsdb.read_hipscat(small_sky_order1_dir)
Binary file not shown.
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
{
2+
"catalog_name": "small_sky",
3+
"version": "0.0.1",
4+
"generation_date": "2022.12.20",
5+
"epoch": "J2000",
6+
"ra_kw": "ra",
7+
"dec_kw": "dec",
8+
"id_kw": "id",
9+
"total_objects": 131,
10+
"pixel_threshold": 1000000
11+
}
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
Norder,Dir,Npix,num_rows
2+
0,0,11,131
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.

0 commit comments

Comments
 (0)