Skip to content

Commit a881c7e

Browse files
SNOW-1447365: Add support for to_csv (#1832)
1 parent 6cd31f1 commit a881c7e

File tree

14 files changed

+792
-11
lines changed

14 files changed

+792
-11
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,7 @@
9494
- Added support for `DataFrame.assign`.
9595
- Added support for `DataFrame.stack`.
9696
- Added support for `DataFrame.pivot` and `pd.pivot`.
97+
- Added support for `DataFrame.to_csv` and `Series.to_csv`.
9798

9899
#### Bug Fixes
99100

docs/source/modin/dataframe.rst

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -216,3 +216,10 @@ DataFrame
216216
DataFrame.first_valid_index
217217
DataFrame.last_valid_index
218218
DataFrame.resample
219+
220+
.. rubric:: Serialization / IO / conversion
221+
222+
.. autosummary::
223+
:toctree: pandas_api/
224+
225+
DataFrame.to_csv

docs/source/modin/series.rst

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -287,3 +287,10 @@ Series
287287
Series.str.strip
288288
Series.str.translate
289289
Series.str.upper
290+
291+
.. rubric:: Serialization / IO / conversion
292+
293+
.. autosummary::
294+
:toctree: pandas_api/
295+
296+
Series.to_csv

docs/source/modin/supported/dataframe_supported.rst

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -412,7 +412,14 @@ Methods
412412
+-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
413413
| ``to_clipboard`` | N | | |
414414
+-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
415-
| ``to_csv`` | N | | |
415+
| ``to_csv`` | P | | Supports writing to both local and snowflake stage.|
416+
| | | | Filepath starting with ``@`` is treated as |
417+
| | | | snowflake stage location. |
418+
| | | | Writing to local file supports all parameters. |
419+
| | | | Writing to snowflake stage does not support |
420+
| | | | ``float_format``, ``mode``, ``encoding``, |
421+
| | | | ``quoting``, ``quotechar``, ``lineterminator``, |
422+
| | | | ``doublequote`` and ``decimal`` parameters. |
416423
+-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
417424
| ``to_dict`` | Y | | |
418425
+-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+

docs/source/modin/supported/series_supported.rst

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -398,7 +398,14 @@ Methods
398398
+-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
399399
| ``to_clipboard`` | N | | |
400400
+-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
401-
| ``to_csv`` | N | | |
401+
| ``to_csv`` | P | | Supports writing to both local and snowflake stage.|
402+
| | | | Filepath starting with ``@`` is treated as |
403+
| | | | snowflake stage location. |
404+
| | | | Writing to local file supports all parameters. |
405+
| | | | Writing to snowflake stage does not support |
406+
| | | | ``float_format``, ``mode``, ``encoding``, |
407+
| | | | ``quoting``, ``quotechar``, ``lineterminator``, |
408+
| | | | ``doublequote`` and ``decimal`` parameters. |
402409
+-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
403410
| ``to_dict`` | Y | | |
404411
+-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+

src/snowflake/snowpark/modin/pandas/base.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3323,7 +3323,6 @@ def to_clipboard(
33233323
# TODO: SNOW-1119855: Modin upgrade - modin.pandas.base.BasePandasDataset
33243324
return self._default_to_pandas("to_clipboard", excel=excel, sep=sep, **kwargs)
33253325

3326-
@base_not_implemented()
33273326
def to_csv(
33283327
self,
33293328
path_or_buf=None,
@@ -3348,7 +3347,7 @@ def to_csv(
33483347
errors: str = "strict",
33493348
storage_options: StorageOptions = None,
33503349
): # pragma: no cover
3351-
from snowflake.snowpark.modin.pandas.core.execution.dispatching.factories.dispatcher import (
3350+
from snowflake.snowpark.modin.core.execution.dispatching.factories.dispatcher import (
33523351
FactoryDispatcher,
33533352
)
33543353

src/snowflake/snowpark/modin/plugin/_internal/io_utils.py

Lines changed: 110 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,16 +5,113 @@
55
import glob
66
import os
77
from collections.abc import Hashable
8-
from typing import Any, Callable, Union
8+
from typing import Any, Callable, Optional, Union
99

1010
import numpy as np
1111
import pandas as native_pd
12+
from pandas._typing import FilePath
1213

1314
import snowflake.snowpark.modin.pandas as pd
1415
from snowflake.snowpark.session import Session
1516

1617
PANDAS_KWARGS = {"names", "index_col", "usecols", "dtype"}
1718

19+
# Default values for the Series.to_csv / DataFrame.to_csv keyword arguments.
# Keep this in sync with modin.pandas.base.py:to_csv.
TO_CSV_DEFAULTS = {
    "path_or_buf": None,
    "sep": ",",
    "na_rep": "",
    "float_format": None,
    "columns": None,
    "header": True,
    "index": True,
    "index_label": None,
    "mode": "w",
    "encoding": None,
    "compression": "infer",
    "quoting": None,
    "quotechar": '"',
    "lineterminator": None,
    "chunksize": None,
    "date_format": None,
    "doublequote": True,
    "escapechar": None,
    "decimal": ".",
    "errors": "strict",
    "storage_options": None,
}

# Compression algorithms Snowflake's COPY INTO <location> accepts for CSV.
# Reference https://docs.snowflake.com/en/sql-reference/sql/copy-into-location#type-csv
SUPPORTED_COMPRESSION_IN_SNOWFLAKE = [
    "auto",
    "brotli",
    "bz2",
    "deflate",
    "gzip",
    "raw_deflate",
    "zstd",
]
55+
56+
57+
def infer_compression_algorithm(filepath: str) -> Optional[str]:
    """
    Try to infer compression algorithm from extension of given filepath.
    Return None, if we fail to map extension to any known compression algorithm.

    Args:
        filepath: path to file.

    Returns:
        Corresponding compression algorithm on success, None otherwise.
    """
    _, ext = os.path.splitext(filepath)
    if not ext:
        return None
    # Remove leading dot and convert to lower case so matching is
    # case-insensitive.
    ext = ext[1:].lower()
    # Map from file extension to compression algorithm. Entries that
    # Snowflake cannot produce (e.g. "tar", "xz", "zip") are still inferred
    # here so the caller can raise a clear unsupported-algorithm error.
    ext_to_algo = {
        "br": "brotli",
        # Fixed: previously "br2": "br2" — ".br2" is not a real extension,
        # so bzip2 (".bz2") files were never inferred even though "bz2" is
        # a Snowflake-supported algorithm.
        "bz2": "bz2",
        "gz": "gzip",
        "tar": "tar",
        "xz": "xz",
        "zip": "zip",
        "zst": "zstd",
        "zz": "deflate",
    }
    return ext_to_algo.get(ext)
84+
85+
86+
def get_compression_algorithm_for_csv(
    compression: Union[str, dict, None], filepath: str
) -> Optional[str]:
    """
    Resolve the compression algorithm to use for the output csv file.

    Args:
        compression: value of the ``compression`` parameter (algorithm name,
            a dict with a "method" key, "infer", or None).
        filepath: path the csv file will be written to.

    Returns:
        Name of the compression algorithm, or None for no compression.

    Raises:
        ValueError: if the resolved algorithm is not supported by Snowflake.
    """
    if isinstance(compression, dict):
        algorithm = compression.get("method")
    elif compression == "infer":
        # Mirror native pandas: infer the algorithm from the file extension.
        algorithm = infer_compression_algorithm(filepath)
    else:
        algorithm = compression

    if algorithm is None:
        return None

    # Reject anything Snowflake's COPY INTO cannot produce.
    if algorithm.lower() not in SUPPORTED_COMPRESSION_IN_SNOWFLAKE:
        raise ValueError(
            f"Unrecognized compression type: {algorithm}\nValid "
            f"compression types are {SUPPORTED_COMPRESSION_IN_SNOWFLAKE}"
        )
    return algorithm
18115

19116
def upload_local_path_to_snowflake_stage(
20117
session: Session, path: str, sf_stage: str
@@ -75,6 +172,18 @@ def is_local_filepath(filepath: str) -> bool:
75172
return not filepath.startswith("@") or filepath.startswith(r"\@")
76173

77174

175+
def is_snowflake_stage_path(filepath: FilePath) -> bool:
    """
    Returns whether a filepath refers to snowflake stage location.

    A path is treated as a stage location when it is a string starting
    with "@".

    Args:
        filepath: File path to file. May be None or a non-string buffer.

    Returns:
        True if ``filepath`` is a string starting with "@", False otherwise.
    """
    # isinstance(None, str) is already False, so the explicit None check in
    # the original expression was redundant.
    return isinstance(filepath, str) and filepath.startswith("@")
185+
186+
78187
def get_non_pandas_kwargs(kwargs: Any) -> Any:
79188
"""
80189
Returns a new dict without pandas keyword

src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py

Lines changed: 68 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -216,7 +216,9 @@
216216
set_frame_2d_positional,
217217
)
218218
from snowflake.snowpark.modin.plugin._internal.io_utils import (
219+
TO_CSV_DEFAULTS,
219220
get_columns_to_keep_for_usecols,
221+
get_compression_algorithm_for_csv,
220222
get_non_pandas_kwargs,
221223
is_local_filepath,
222224
upload_local_path_to_snowflake_stage,
@@ -1087,10 +1089,12 @@ def _to_snowpark_dataframe_from_snowpark_pandas_dataframe(
10871089
self,
10881090
index: bool = True,
10891091
index_label: Optional[IndexLabel] = None,
1092+
data_column_labels: Optional[List[Hashable]] = None,
10901093
) -> SnowparkDataFrame:
10911094
"""
10921095
Convert the Snowpark pandas Dataframe to Snowpark Dataframe. The Snowpark Dataframe is created by selecting
1093-
all index columns of the Snowpark pandas Dataframe if index=True, and also all data columns.
1096+
all index columns of the Snowpark pandas Dataframe if index=True, and also all data columns
1097+
if data_column_labels is None.
10941098
For example:
10951099
With a Snowpark pandas Dataframe (df) has index=[`A`, `B`], columns = [`C`, `D`],
10961100
the result Snowpark Dataframe after calling _to_snowpark_dataframe_from_snowpark_pandas_dataframe(index=True),
@@ -1108,6 +1112,8 @@ def _to_snowpark_dataframe_from_snowpark_pandas_dataframe(
11081112
index_label: Optional[IndexLabel], default None
11091113
the new label used for the index columns, the length must be the same as the number of index column
11101114
of the current dataframe. If None, the original index name is used.
1115+
data_column_labels: Optional[Hashable], default None
1116+
Data columns to include. If none include all data columns.
11111117

11121118
Returns:
11131119
SnowparkDataFrame
@@ -1132,7 +1138,8 @@ def _to_snowpark_dataframe_from_snowpark_pandas_dataframe(
11321138
else:
11331139
index_column_labels = self._modin_frame.index_column_pandas_labels
11341140

1135-
data_column_labels = self._modin_frame.data_column_pandas_labels
1141+
if data_column_labels is None:
1142+
data_column_labels = self._modin_frame.data_column_pandas_labels
11361143
if self._modin_frame.is_unnamed_series():
11371144
# this is an unnamed Snowpark pandas series, there is no customer visible pandas
11381145
# label for the data column, set the label to be None
@@ -1172,7 +1179,12 @@ def _to_snowpark_dataframe_from_snowpark_pandas_dataframe(
11721179
self._modin_frame.index_column_snowflake_quoted_identifiers
11731180
)
11741181
identifiers_to_retain.extend(
1175-
self._modin_frame.data_column_snowflake_quoted_identifiers
1182+
[
1183+
t[0]
1184+
for t in self._modin_frame.get_snowflake_quoted_identifiers_group_by_pandas_labels(
1185+
data_column_labels, include_index=False
1186+
)
1187+
]
11761188
)
11771189
for pandas_label, snowflake_identifier in zip(
11781190
index_column_labels + data_column_labels,
@@ -1191,6 +1203,59 @@ def _to_snowpark_dataframe_from_snowpark_pandas_dataframe(
11911203
col_mapper=rename_mapper
11921204
)
11931205

1206+
def to_csv_with_snowflake(self, **kwargs: Any) -> None:
    """
    Write data to a csv file in snowflake stage.

    Args:
        **kwargs: to_csv arguments.
    """

    def lookup(name: str) -> Any:
        # Fall back to the shared default when the caller omitted the kwarg.
        return kwargs.get(name, TO_CSV_DEFAULTS[name])

    # Parameters that cannot be honored when writing to a stage: raise when
    # the caller supplied a non-default value. Identity comparison ("is not")
    # matches how defaults are populated upstream.
    for name in (
        "float_format",
        "mode",
        "encoding",
        "quoting",
        "quotechar",
        "lineterminator",
        "doublequote",
        "decimal",
    ):
        if kwargs.get(name) is not TO_CSV_DEFAULTS[name]:
            ErrorMessage.parameter_not_implemented_error(name, "to_csv")

    # Parameters that are silently ignored: warn when set to a non-default.
    for name in ("chunksize", "errors", "storage_options"):
        if kwargs.get(name) is not TO_CSV_DEFAULTS[name]:
            WarningMessage.ignored_argument("to_csv", name, "")

    path = lookup("path_or_buf")
    compression = get_compression_algorithm_for_csv(lookup("compression"), path)

    snowpark_df = self._to_snowpark_dataframe_from_snowpark_pandas_dataframe(
        lookup("index"), lookup("index_label"), lookup("columns")
    )
    # Renamed from the original's misleading "na_sep": this is the NA
    # representation, not a separator.
    na_rep = lookup("na_rep")
    snowpark_df.write.csv(
        location=path,
        format_type_options={
            "COMPRESSION": compression if compression else "NONE",
            "FIELD_DELIMITER": lookup("sep"),
            "NULL_IF": na_rep if na_rep else (),
            "ESCAPE": lookup("escapechar"),
            "DATE_FORMAT": lookup("date_format"),
        },
        header=lookup("header"),
        single=True,
    )
1258+
11941259
def to_snowflake(
11951260
self,
11961261
name: Union[str, Iterable[str]],

0 commit comments

Comments
 (0)