feat(datasets): Improved Dependency Management for Spark-based Datasets (#911)

* added the skeleton for the utils sub pkg
* moved the utility funcs from spark_dataset to relevant modules in _utils
* updated the use of utility funcs in spark_dataset
* fixed import in databricks_utils
* renamed _strip_dbfs_prefix to strip_dbfs_prefix
* updated the other modules that import from spark_dataset to use _utils
* updated the use of strip_dbfs_prefix in spark_dataset
* fixed lint issues
* removed the base deps for spark, pandas and delta from databricks datasets
* moved the file based utility funcs to databricks_utils
* fixed the imports of the file based utility funcs
* fixed lint issues
* fixed the use of _get_spark() in tests
* fixed uses of databricks utils in tests
* fixed more tests
* fixed more lint issues
* fixed more tests
* fixed more tests
* improved type hints for spark & databricks utility funcs
* fixed more lint issues
* further improved type hints for utility funcs
* fixed a couple of incorrect type hints
* fixed several incorrect type hints
* updated the release notes
* Reorder release notes

---------

Signed-off-by: Minura Punchihewa <[email protected]>
Signed-off-by: Merel Theisen <[email protected]>
Co-authored-by: Nok Lam Chan <[email protected]>
Co-authored-by: Merel Theisen <[email protected]>
1 parent: 57f6279
commit: 07aef5a
Showing 15 changed files with 220 additions and 200 deletions.
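In practice, the refactor moves shared helpers out of spark_dataset and into a private _utils sub-package, and the Spark/Databricks datasets now import them from there. A minimal sketch of the new import style, assuming the module layout shown in the diffs below (the helper names come from this commit; the data paths are purely illustrative):

# Hypothetical usage sketch: the module path and helper names are taken from
# the diffs in this commit; the file paths below are made up for illustration.
from kedro_datasets._utils.databricks_utils import (
    deployed_on_databricks,
    split_filepath,
    strip_dbfs_prefix,  # renamed from _strip_dbfs_prefix in this commit
)

protocol, path = split_filepath("s3://bucket/data/01_raw/file.parquet")
if deployed_on_databricks():
    path = strip_dbfs_prefix("/dbfs/data/01_raw/file.parquet")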
kedro-datasets/kedro_datasets/_utils/__init__.py
Empty file.
105 changes: 105 additions & 0 deletions
kedro-datasets/kedro_datasets/_utils/databricks_utils.py
@@ -0,0 +1,105 @@
from __future__ import annotations  # lets `str | os.PathLike` hints run on Python < 3.10

import os
from fnmatch import fnmatch
from pathlib import PurePosixPath
from typing import TYPE_CHECKING, Optional, Union

from pyspark.sql import SparkSession

if TYPE_CHECKING:
    from databricks.connect import DatabricksSession
    from pyspark.dbutils import DBUtils


def parse_glob_pattern(pattern: str) -> str:
    """Return the fixed directory prefix of a glob pattern, i.e. everything
    before the first path segment that contains a wildcard."""
    special = ("*", "?", "[")
    clean = []
    for part in pattern.split("/"):
        if any(char in part for char in special):
            break
        clean.append(part)
    return "/".join(clean)


def split_filepath(filepath: str | os.PathLike) -> tuple[str, str]:
    """Split a filepath into a ``(protocol, path)`` pair; protocol is '' if absent."""
    split_ = str(filepath).split("://", 1)
    if len(split_) == 2:  # noqa: PLR2004
        return split_[0] + "://", split_[1]
    return "", split_[0]


def strip_dbfs_prefix(path: str, prefix: str = "/dbfs") -> str:
    """Remove a leading DBFS prefix (default ``/dbfs``) from the path, if present."""
    return path[len(prefix) :] if path.startswith(prefix) else path


def dbfs_glob(pattern: str, dbutils: "DBUtils") -> list[str]:
    """Perform a custom glob search in DBFS using the provided pattern.
    It is assumed that version paths are managed by Kedro only.

    Args:
        pattern: Glob pattern to search for.
        dbutils: dbutils instance to operate with DBFS.

    Returns:
        List of DBFS paths prefixed with '/dbfs' that satisfy the glob pattern.
    """
    pattern = strip_dbfs_prefix(pattern)
    prefix = parse_glob_pattern(pattern)
    matched = set()
    filename = pattern.split("/")[-1]

    for file_info in dbutils.fs.ls(prefix):
        if file_info.isDir():
            path = str(
                PurePosixPath(strip_dbfs_prefix(file_info.path, "dbfs:")) / filename
            )
            if fnmatch(path, pattern):
                path = "/dbfs" + path
                matched.add(path)
    return sorted(matched)


def get_dbutils(spark: Union[SparkSession, "DatabricksSession"]) -> Optional["DBUtils"]:
    """Get the 'dbutils' instance, or None if one could not be found."""
    dbutils = globals().get("dbutils")
    if dbutils:
        return dbutils

    try:
        from pyspark.dbutils import DBUtils

        dbutils = DBUtils(spark)
    except ImportError:
        try:
            import IPython
        except ImportError:
            pass
        else:
            ipython = IPython.get_ipython()
            dbutils = ipython.user_ns.get("dbutils") if ipython else None

    return dbutils


def dbfs_exists(pattern: str, dbutils: "DBUtils") -> bool:
    """Perform an `ls` list operation in DBFS using the provided pattern.
    It is assumed that version paths are managed by Kedro.
    Broad `Exception` is present due to `dbutils.fs.ExecutionError` that
    cannot be imported directly.

    Args:
        pattern: Filepath to search for.
        dbutils: dbutils instance to operate with DBFS.

    Returns:
        True if the filepath exists, False otherwise.
    """
    pattern = strip_dbfs_prefix(pattern)
    file = parse_glob_pattern(pattern)
    try:
        dbutils.fs.ls(file)
        return True
    except Exception:
        return False


def deployed_on_databricks() -> bool:
    """Check if running on Databricks."""
    return "DATABRICKS_RUNTIME_VERSION" in os.environ
29 changes: 29 additions & 0 deletions
kedro-datasets/kedro_datasets/_utils/spark_utils.py
@@ -0,0 +1,29 @@
from typing import TYPE_CHECKING, Union

from pyspark.sql import SparkSession

if TYPE_CHECKING:
    from databricks.connect import DatabricksSession


def get_spark() -> Union[SparkSession, "DatabricksSession"]:
    """
    Returns the SparkSession. In case databricks-connect is available we use it for
    extended configuration mechanisms and notebook compatibility,
    otherwise we use classic pyspark.
    """
    try:
        # When using databricks-connect >= 13.0.0 (a.k.a databricks-connect-v2)
        # the remote session is instantiated using the databricks module
        # If the databricks-connect module is installed, we use a remote session
        from databricks.connect import DatabricksSession

        # We can't test this as there's no Databricks test env available
        spark = DatabricksSession.builder.getOrCreate()  # pragma: no cover

    except ImportError:
        # For "normal" spark sessions that don't use databricks-connect
        # we get spark normally
        spark = SparkSession.builder.getOrCreate()

    return spark
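Callers never branch on the environment themselves: they ask for a session, and the import probe above decides whether a remote DatabricksSession or a classic SparkSession comes back. A short usage sketch (the DataFrame is illustrative):

from kedro_datasets._utils.spark_utils import get_spark

# Returns a DatabricksSession when databricks-connect is installed,
# otherwise a plain local/cluster SparkSession.
spark = get_spark()
df = spark.createDataFrame([(1, "a"), (2, "b")], ["id", "value"])
df.show()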