From 82557fd25d6b1a7443cde0889f10520e7b8601fb Mon Sep 17 00:00:00 2001
From: Daniel Smilkov
Date: Tue, 25 Jul 2023 13:39:19 -0400
Subject: [PATCH 1/2] save

---
 lilac/__init__.py           |  26 ++++++-
 lilac/data_loader.py        |  14 ++++
 lilac/router_data_loader.py |   3 -
 lilac/router_dataset.py     |   7 +-
 lilac/server.py             |   2 +-
 notebooks/API.ipynb         | 136 ++++++++++++++++++++++++------------
 6 files changed, 134 insertions(+), 54 deletions(-)

diff --git a/lilac/__init__.py b/lilac/__init__.py
index cb9fc46dc..bcc9ac85b 100644
--- a/lilac/__init__.py
+++ b/lilac/__init__.py
@@ -1,3 +1,27 @@
+from .data.dataset_duckdb import DatasetDuckDB
+from .data.sources.csv_source import CSVDataset
+from .data.sources.default_sources import register_default_sources
+from .data.sources.gmail_source import Gmail
+from .data.sources.huggingface_source import HuggingFaceDataset
+from .data.sources.json_source import JSONDataset
+from .data_loader import load_from_config
+from .db_manager import get_dataset, set_default_dataset_cls
 from .server import start_server, stop_server
+from .signals.default_signals import register_default_signals
 
-__all__ = ['start_server', 'stop_server']
+register_default_sources()
+register_default_signals()
+set_default_dataset_cls(DatasetDuckDB)
+
+__all__ = [
+  'start_server',
+  'stop_server',
+  'load_from_config',
+  'get_dataset',
+
+  # Source configurations.
+  'HuggingFaceDataset',
+  'CSVDataset',
+  'JSONDataset',
+  'Gmail',
+]
diff --git a/lilac/data_loader.py b/lilac/data_loader.py
index bc21aa7c2..1e00d6d53 100644
--- a/lilac/data_loader.py
+++ b/lilac/data_loader.py
@@ -18,6 +18,10 @@
 import pandas as pd
 from distributed import Client
 
+from lilac.config import data_path
+from lilac.data.dataset import Dataset
+from lilac.db_manager import get_dataset
+
 from .data.dataset_utils import write_items_to_parquet
 from .data.sources.default_sources import register_default_sources
 from .data.sources.source import Source
@@ -37,6 +41,16 @@
 from .utils import get_dataset_output_dir, log, open_file
 
 
+def load_from_config(
+  namespace: str,
+  dataset_name: str,
+  source_config: Source,
+) -> Dataset:
+  """Load a dataset from a source configuration."""
+  process_source(data_path(), namespace, dataset_name, source_config)
+  return get_dataset(namespace, dataset_name)
+
+
 def process_source(base_dir: Union[str, pathlib.Path],
                    namespace: str,
                    dataset_name: str,
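The new `load_from_config` helper wraps the two-step flow above (`process_source` followed by `get_dataset`) into a single call. A minimal sketch of the intended usage, mirroring the notebook later in this patch (the `'local'` namespace and the GLUE source are illustrative values):

```python
import lilac as ll

# HuggingFaceDataset is one of the source configs exported from lilac/__init__.py.
source_config = ll.HuggingFaceDataset(dataset_name='glue', config_name='ax')

# Blocks while the source is processed, then returns the materialized Dataset.
dataset = ll.load_from_config('local', 'glue', source_config)
```

Patch 2/2 below renames this entry point to `create_dataset`.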
diff --git a/lilac/router_data_loader.py b/lilac/router_data_loader.py
index 5828c6794..50613f64e 100644
--- a/lilac/router_data_loader.py
+++ b/lilac/router_data_loader.py
@@ -14,7 +14,6 @@
 
 from .auth import get_user_access
 from .config import data_path
-from .data.sources.default_sources import register_default_sources
 from .data.sources.source_registry import get_source_cls, registered_sources
 from .data_loader import process_source
 from .router_utils import RouteErrorHandler
@@ -22,8 +21,6 @@
 
 REQUEST_TIMEOUT_SEC = 30 * 60  # 30 mins.
 
-register_default_sources()
-
 router = APIRouter(route_class=RouteErrorHandler)
 
 
diff --git a/lilac/router_dataset.py b/lilac/router_dataset.py
index 72ac0df68..b3c9e5726 100644
--- a/lilac/router_dataset.py
+++ b/lilac/router_dataset.py
@@ -23,13 +23,11 @@
   StatsResult,
   UnaryOp,
 )
-from .data.dataset_duckdb import DatasetDuckDB
-from .db_manager import get_dataset, remove_dataset_from_cache, set_default_dataset_cls
+from .db_manager import get_dataset, remove_dataset_from_cache
 from .router_utils import RouteErrorHandler
 from .schema import Bin, Path, normalize_path
 from .signals.concept_labels import ConceptLabelsSignal
 from .signals.concept_scorer import ConceptScoreSignal
-from .signals.default_signals import register_default_signals
 from .signals.semantic_similarity import SemanticSimilaritySignal
 from .signals.signal import (
   Signal,
@@ -44,9 +42,6 @@
 
 router = APIRouter(route_class=RouteErrorHandler)
 
-register_default_signals()
-set_default_dataset_cls(DatasetDuckDB)
-
 
 @router.get('/', response_model_exclude_none=True)
 def get_datasets() -> list[DatasetInfo]:
diff --git a/lilac/server.py b/lilac/server.py
index 87ebfee31..e0668622f 100644
--- a/lilac/server.py
+++ b/lilac/server.py
@@ -163,7 +163,7 @@ def start_server(host: str = '0.0.0.0', port: int = 5432) -> None:
   if SERVER:
     raise ValueError('Server is already running')
 
-  config = uvicorn.Config(app, host='0.0.0.0', port=5432)
+  config = uvicorn.Config(app, host='0.0.0.0', port=5432, access_log=False)
   SERVER = uvicorn.Server(config)
   try:
     loop = asyncio.get_running_loop()
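With registration hoisted out of the routers and into `lilac/__init__.py`, a bare import now wires up the default sources, the default signals, and the DuckDB dataset backend; the routers above simply stop re-registering them. A sketch of the resulting startup flow (port 5432 is the value hard-coded in `server.py`, and `access_log=False` silences uvicorn's per-request log lines):

```python
import lilac as ll  # the import alone registers default sources/signals and DatasetDuckDB

ll.start_server()  # serves on http://0.0.0.0:5432 without per-request access logs
```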
diff --git a/notebooks/API.ipynb b/notebooks/API.ipynb
index 51ce1c511..756740ab6 100644
--- a/notebooks/API.ipynb
+++ b/notebooks/API.ipynb
@@ -4,111 +4,174 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## Starting the web server\n"
+    "## Importing a dataset\n"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import lilac as ll"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### From HuggingFace\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
    "metadata": {},
    "outputs": [
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "/Users/dsmilkov/code/lilac/.venv/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
-      "  from .autonotebook import tqdm as notebook_tqdm\n"
+      "Found cached dataset glue (/Users/dsmilkov/.cache/huggingface/datasets/glue/ax/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)\n",
+      "100%|██████████| 1/1 [00:00<00:00, 399.38it/s]\n"
      ]
     },
     {
-     "name": "stderr",
+     "name": "stdout",
      "output_type": "stream",
      "text": [
-      "INFO:     Started server process [72797]\n",
-      "INFO:     Waiting for application startup.\n",
-      "INFO:     Application startup complete.\n",
-      "INFO:     Uvicorn running on http://0.0.0.0:5432 (Press CTRL+C to quit)\n"
+      "Manifest for dataset \"glue\" written to ./data/datasets/local/glue\n"
      ]
     }
    ],
    "source": [
-    "import lilac as ll\n",
-    "\n",
-    "ll.start_server()"
+    "source_config = ll.HuggingFaceDataset(dataset_name='glue', config_name='ax')\n",
+    "dataset = ll.load_from_config('local', 'glue', source_config)"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "### Stopping the server\n"
+    "### From CSV\n"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [
     {
-     "name": "stderr",
+     "name": "stdout",
      "output_type": "stream",
      "text": [
-      "INFO:     Shutting down\n",
-      "INFO:     Waiting for application shutdown.\n",
-      "INFO:     Application shutdown complete.\n"
+      "Downloading from url https://storage.googleapis.com/lilac-data-us-east1/datasets/csv_datasets/the_movies_dataset/the_movies_dataset.csv to /tmp/./data/local_cache/37a9be0240c140da95c664d3bc092a04\n",
+      "Manifest for dataset \"the_movies_dataset\" written to ./data/datasets/local/the_movies_dataset\n"
      ]
     }
    ],
    "source": [
-    "await ll.stop_server()"
+    "source_config = ll.CSVDataset(filepaths=[\n",
+    "  'https://storage.googleapis.com/lilac-data-us-east1/datasets/csv_datasets/the_movies_dataset/the_movies_dataset.csv'\n",
+    "])\n",
+    "dataset = ll.load_from_config('local', 'the_movies_dataset', source_config)"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## Importing a dataset\n"
+    "### From JSON\n"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 5,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Downloading from url https://raw.githubusercontent.com/explosion/prodigy-recipes/master/example-datasets/news_headlines.jsonl to /tmp/./data/local_cache/7f919bed71f0491cae57f3b9cd4f0aa4\n",
+      "Manifest for dataset \"news_headlines\" written to ./data/datasets/local/news_headlines\n"
+     ]
+    }
+   ],
    "source": [
-    "dataset = lilac.load(namespace='local', name=..., config: SourceConfig) # Blocking."
+    "source_config = ll.JSONDataset(filepaths=[\n",
+    "  'https://raw.githubusercontent.com/explosion/prodigy-recipes/master/example-datasets/news_headlines.jsonl'\n",
+    "])\n",
+    "dataset = ll.load_from_config('local', 'news_headlines', source_config)"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## Query a dataset\n"
+    "## Visualize the data\n",
+    "\n",
+    "Now that we have imported a few datasets, let's visualize them to see what they look like.\n"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 6,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "INFO:     Started server process [78275]\n",
+      "INFO:     Waiting for application startup.\n",
+      "INFO:     Application startup complete.\n",
+      "INFO:     Uvicorn running on http://0.0.0.0:5432 (Press CTRL+C to quit)\n"
+     ]
+    }
+   ],
    "source": [
-    "dataset.select_rows() # ......\n"
+    "ll.start_server()"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## Index the dataset\n"
+    "### Stopping the server\n"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "INFO:     Shutting down\n",
+      "INFO:     Waiting for application shutdown.\n",
+      "INFO:     Application shutdown complete.\n"
+     ]
+    }
+   ],
    "source": [
-    "dataset.compute_embedding(...)"
+    "await ll.stop_server()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Query a dataset\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Index the dataset\n"
    ]
   },
   {
@@ -147,13 +210,7 @@
    }
   ],
   "source": [
-    "from lilac.db_manager import get_dataset, set_default_dataset_cls\n",
-    "from lilac.data.dataset_duckdb import DatasetDuckDB\n",
     "from lilac.signals.concept_scorer import ConceptScoreSignal\n",
-    "from lilac.signals.default_signals import register_default_signals\n",
-    "\n",
-    "register_default_signals()\n",
-    "set_default_dataset_cls(DatasetDuckDB)\n",
-    "\n",
     "\n",
     "dataset = get_dataset('local', 'legal-clauses')\n",
@@ -201,13 +258,6 @@
    "3. I want to download it\n"
   ]
  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
   {
    "cell_type": "markdown",
    "metadata": {},
+ "source_config = ll.JSONDataset(filepaths=[\n", + " 'https://raw.githubusercontent.com/explosion/prodigy-recipes/master/example-datasets/news_headlines.jsonl'\n", + "])\n", + "dataset = ll.load_from_config('local', 'news_headlines', source_config)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Query a dataset\n" + "## Visualize the data\n", + "\n", + "Now that we have imported a few datasets, let's visualize them to see what they look like.\n" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO: Started server process [78275]\n", + "INFO: Waiting for application startup.\n", + "INFO: Application startup complete.\n", + "INFO: Uvicorn running on http://0.0.0.0:5432 (Press CTRL+C to quit)\n" + ] + } + ], "source": [ - "dataset.select_rows() # ......\n" + "ll.start_server()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Index the dataset\n" + "### Stopping the server\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO: Shutting down\n", + "INFO: Waiting for application shutdown.\n", + "INFO: Application shutdown complete.\n" + ] + } + ], "source": [ - "dataset.compute_embedding(...)" + "await ll.stop_server()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Query a dataset\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Index the dataset\n" ] }, { @@ -147,13 +210,7 @@ } ], "source": [ - "from lilac.db_manager import get_dataset, set_default_dataset_cls\n", - "from lilac.data.dataset_duckdb import DatasetDuckDB\n", "from lilac.signals.concept_scorer import ConceptScoreSignal\n", - "from lilac.signals.default_signals import register_default_signals\n", - "\n", - "register_default_signals()\n", - "set_default_dataset_cls(DatasetDuckDB)\n", "\n", "dataset = get_dataset('local', 'legal-clauses')\n", "\n", @@ -201,13 +258,6 @@ "3. I want to download it\n" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, { "cell_type": "markdown", "metadata": {}, From 94c0a47f7090660338e804bcad851675dacf8a96 Mon Sep 17 00:00:00 2001 From: Daniel Smilkov Date: Tue, 25 Jul 2023 14:11:19 -0400 Subject: [PATCH 2/2] save --- lilac/__init__.py | 4 ++-- lilac/data_loader.py | 11 +++++------ notebooks/API.ipynb | 24 ++++++++++++++++++++---- 3 files changed, 27 insertions(+), 12 deletions(-) diff --git a/lilac/__init__.py b/lilac/__init__.py index bcc9ac85b..4912b7ee1 100644 --- a/lilac/__init__.py +++ b/lilac/__init__.py @@ -4,7 +4,7 @@ from .data.sources.gmail_source import Gmail from .data.sources.huggingface_source import HuggingFaceDataset from .data.sources.json_source import JSONDataset -from .data_loader import load_from_config +from .data_loader import create_dataset from .db_manager import get_dataset, set_default_dataset_cls from .server import start_server, stop_server from .signals.default_signals import register_default_signals @@ -16,7 +16,7 @@ __all__ = [ 'start_server', 'stop_server', - 'load_from_config', + 'create_dataset', 'get_dataset', # Source configurations. 
diff --git a/lilac/data_loader.py b/lilac/data_loader.py
index 1e00d6d53..068c05c87 100644
--- a/lilac/data_loader.py
+++ b/lilac/data_loader.py
@@ -18,14 +18,13 @@
 import pandas as pd
 from distributed import Client
 
-from lilac.config import data_path
-from lilac.data.dataset import Dataset
-from lilac.db_manager import get_dataset
-
+from .config import data_path
+from .data.dataset import Dataset
 from .data.dataset_utils import write_items_to_parquet
 from .data.sources.default_sources import register_default_sources
 from .data.sources.source import Source
 from .data.sources.source_registry import resolve_source
+from .db_manager import get_dataset
 from .schema import (
   MANIFEST_FILENAME,
   PARQUET_FILENAME_PREFIX,
@@ -41,12 +40,12 @@
 from .utils import get_dataset_output_dir, log, open_file
 
 
-def load_from_config(
+def create_dataset(
   namespace: str,
   dataset_name: str,
   source_config: Source,
 ) -> Dataset:
-  """Load a dataset from a source configuration."""
+  """Load a dataset from a given source configuration."""
   process_source(data_path(), namespace, dataset_name, source_config)
   return get_dataset(namespace, dataset_name)
 
diff --git a/notebooks/API.ipynb b/notebooks/API.ipynb
index 756740ab6..fc2fbf331 100644
--- a/notebooks/API.ipynb
+++ b/notebooks/API.ipynb
@@ -4,7 +4,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## Importing a dataset\n"
+    "## Creating a Lilac dataset\n"
    ]
   },
   {
@@ -46,7 +46,7 @@
    ],
    "source": [
     "source_config = ll.HuggingFaceDataset(dataset_name='glue', config_name='ax')\n",
-    "dataset = ll.load_from_config('local', 'glue', source_config)"
+    "dataset = ll.create_dataset('local', 'glue', source_config)"
    ]
   },
   {
@@ -74,7 +74,7 @@
    "source_config = ll.CSVDataset(filepaths=[\n",
    "  'https://storage.googleapis.com/lilac-data-us-east1/datasets/csv_datasets/the_movies_dataset/the_movies_dataset.csv'\n",
    "])\n",
-   "dataset = ll.load_from_config('local', 'the_movies_dataset', source_config)"
+   "dataset = ll.create_dataset('local', 'the_movies_dataset', source_config)"
    ]
   },
   {
@@ -102,7 +102,23 @@
    "source_config = ll.JSONDataset(filepaths=[\n",
    "  'https://raw.githubusercontent.com/explosion/prodigy-recipes/master/example-datasets/news_headlines.jsonl'\n",
    "])\n",
-   "dataset = ll.load_from_config('local', 'news_headlines', source_config)"
+   "dataset = ll.create_dataset('local', 'news_headlines', source_config)"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Get an existing dataset\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dataset = ll.get_dataset('local', 'the_movies_dataset')"
+   ]
+  },
+  {
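Taken together, the two patches land a create-once, fetch-later API. A minimal end-to-end sketch of the final surface (the namespace and dataset names are illustrative; `create_dataset` only needs to run once per dataset):

```python
import lilac as ll

# First run: materialize the dataset from its source configuration.
source_config = ll.JSONDataset(filepaths=[
  'https://raw.githubusercontent.com/explosion/prodigy-recipes/master/example-datasets/news_headlines.jsonl'
])
dataset = ll.create_dataset('local', 'news_headlines', source_config)

# Later runs: fetch the already-materialized dataset by namespace and name.
dataset = ll.get_dataset('local', 'news_headlines')
```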