Skip to content

Commit b19cbc6

Browse files
author
Cyprien Ricque
committed
automatic file detection
1 parent e785835 commit b19cbc6

File tree

10 files changed

+176
-32
lines changed

10 files changed

+176
-32
lines changed

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ py-modules = ["stdflow"]
77

88
[project]
99
name = "stdflow"
10-
version = "0.0.20"
10+
version = "0.0.21"
1111
description = "[alpha] A package that transform your notebooks and python files into pipeline steps by standardizing the data input / output."
1212
readme = "README.md"
1313
authors = [

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
setup(
44
name="stdflow",
5-
version="0.0.20",
5+
version="0.0.21",
66
description="[alpha] A package that transform your notebooks and python files into pipeline steps by standardizing the data input / output.",
77
long_description=open("README.md").read(),
88
long_description_content_type="text/markdown",

stdflow/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010

1111
import pandas as pd
1212

13-
__version__ = "0.0.20"
13+
__version__ = "0.0.21"
1414

1515
import logging
1616
import sys

stdflow/stdflow_path/data_path.py

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import os
55

66
from stdflow.stdflow_path import Path
7+
from stdflow_utils.listing import list_csv_files
78

89
try:
910
from typing import Literal, Optional
@@ -26,7 +27,7 @@ def __init__(
2627
attrs: list | None | str = None,
2728
step_name: str | None = None,
2829
version: str | Literal[":last", ":first"] = ":last",
29-
file_name: str = None,
30+
file_name: str | Literal[":auto"] = None,
3031
):
3132
"""
3233
At this stage all information are present except the version which is to be detected if not specified
@@ -36,8 +37,9 @@ def __init__(
3637
:param version: last part of the full_path. one of [":last", ":first", "<version_name>", None]
3738
:param file_name: file name (optional)
3839
"""
39-
# if step is str and contains step_, remove it
4040
super().__init__(root, file_name)
41+
42+
# if step is str and contains step_, remove it
4143
if isinstance(step_name, str) and step_name.startswith(STEP_PREFIX):
4244
step_name = step_name[len(STEP_PREFIX) :]
4345
# if version is str and contains v_, remove it
@@ -55,6 +57,20 @@ def __init__(
5557
elif version is not None:
5658
self.version = version
5759

60+
if file_name == ":auto":
61+
self.file_name = self.detect_file_name()
62+
63+
def detect_file_name(self):
    """Pick the file to use when ``file_name`` was given as ``":auto"``.

    Lists the csv files located directly inside ``self.dir_path``; detection
    only succeeds when exactly one is present.

    :return: the single csv file name found, or ``None`` when the directory
        does not exist, contains no csv file, or contains several of them
    """
    if not os.path.isdir(self.dir_path):
        logger.error(f"Path {self.dir_path} does not exist")
        # Stop here: listing a missing directory yields an empty result that
        # would otherwise be reported as "Multiple files found".
        return None

    files = list_csv_files(self.dir_path)
    if len(files) == 1:
        logger.debug(f"Using file {files[0]}")
        return files[0]

    # Ambiguous or empty: report the two situations separately so the
    # message matches what is actually on disk.
    if not files:
        logger.warning(f"No csv file found in {self.dir_path}")
    else:
        logger.warning(f"Multiple files found in {self.dir_path}: {files}")
    return None
73+
5874
def detect_version(self, path, version_type):
5975
if version_type not in [":last", ":first"]:
6076
logger.warning(f"Unknown version type: {version_type}")

stdflow/stdflow_utils/__init__.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -144,6 +144,9 @@ def string_to_uuid(input_string):
144144
# )
145145

146146

147+
148+
149+
147150
if __name__ == "__main__":
148151
to_html(
149152
dest="./",

stdflow/stdflow_utils/listing.py

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
import os
2+
import re
3+
4+
5+
def list_files_regex_all_depth(directory, pattern):
    """Recursively list the files under ``directory`` whose name matches ``pattern``.

    :param directory: root directory to walk; every sub-directory is visited
    :param pattern: regular expression (string or compiled pattern) applied
        with ``search`` to each file *name*, not to its path
    :return: sorted list of the matching file paths, relative to ``directory``
    """
    # Resolve the pattern once rather than once per visited file.
    regex = re.compile(pattern)

    matched_files = [
        os.path.relpath(os.path.join(root, file), directory)
        for root, _dirs, files in os.walk(directory)
        for file in files
        if regex.search(file)
    ]
    # os.walk yields entries in a filesystem-dependent order; sort so callers
    # get a reproducible result.
    matched_files.sort()
    return matched_files
18+
19+
20+
def list_files_regex(directory, pattern):
    """List the files directly inside ``directory`` whose name matches ``pattern``.

    Sub-directories are neither returned nor descended into.

    :param directory: directory to inspect (not recursive)
    :param pattern: regular expression (string or compiled pattern) applied
        with ``search`` to each file name
    :return: sorted list of the matching file names
    """
    # Resolve the pattern once rather than once per directory entry.
    regex = re.compile(pattern)

    # scandir exposes the entry type without an extra stat per file, and the
    # entry name already is the path relative to ``directory``.
    with os.scandir(directory) as entries:
        return sorted(
            entry.name
            for entry in entries
            if entry.is_file() and regex.search(entry.name)
        )
35+
36+
37+
# using glob
38+
def list_files_ext(directory, pattern):
    """List the entries directly inside ``directory`` matching a glob ``pattern``.

    :param directory: directory to search (not recursive); taken literally,
        even when its name contains glob metacharacters such as ``[`` or ``*``
    :param pattern: glob pattern for the entry name, e.g. ``"*.csv"``
    :return: sorted list of the matching paths, relative to ``directory``
    """
    import glob

    # Only ``pattern`` is meant to be a glob: escape the directory part so a
    # folder such as "data[v1]" is not parsed as a character class.
    search = os.path.join(glob.escape(os.fspath(directory)), pattern)
    return [os.path.relpath(path, directory) for path in sorted(glob.glob(search))]
43+
44+
45+
def list_excel_files(directory):
    """Return the Excel files found directly inside ``directory``.

    ``.xlsx`` matches are listed first, followed by ``.xls`` matches; paths
    are relative to ``directory``.
    """
    excel_files = []
    for extension_glob in ("*.xlsx", "*.xls"):
        excel_files.extend(list_files_ext(directory, extension_glob))
    return excel_files
47+
48+
49+
def list_csv_files(directory):
    """Return the ``.csv`` files found directly inside ``directory``.

    Paths are relative to ``directory``; sub-directories are not searched.
    """
    csv_glob = "*.csv"
    return list_files_ext(directory, csv_glob)
51+
52+
53+
if __name__ == "__main__":
    # Manual smoke run of the three listing helpers.
    # Raw strings: "\." in a normal literal is an invalid escape sequence that
    # newer Python versions warn about; the pattern text itself is unchanged.
    print(list_files_regex_all_depth(directory="./src", pattern=r".*\.py$"))
    print(list_files_regex(directory="./src/data_analysis", pattern=r".*\.py"))
    print(list_files_ext(directory="./src/data_analysis", pattern="*.py"))

stdflow/step.py

Lines changed: 50 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -73,31 +73,57 @@ def __new__(cls, *args, **kwargs):
7373

7474

7575
class Step(ModuleType):
76-
def __init__(self, name: str = "Step"):
77-
super().__init__(name)
76+
def __init__(
77+
self,
78+
step_in: str | None = None,
79+
version_in: str | None = ":last",
80+
attrs_in: str | list[str] | None = ":default",
81+
file_name_in: str | None = ":default",
82+
method_in: str | object | None = ":auto",
83+
root_in: str | None = ":default",
84+
85+
step_out: str | None = None,
86+
version_out: str | None = DEFAULT_DATE_VERSION_FORMAT,
87+
attrs_out: str | list[str] | None = ":default",
88+
file_name_out: str | None = ":default",
89+
method_out: str | object | None = ":auto",
90+
root_out: str | None = ":default",
91+
92+
root: str | None = "./data",
93+
attrs: str | list[str] | None = None,
94+
file_name: str | None = ":auto",
95+
96+
data_l: list[MetaData] = None,
97+
data_l_in: list[MetaData] = None,
98+
99+
):
100+
super().__init__("Step")
101+
78102
# === Exported === #
79-
self.data_l: list[MetaData] = []
80-
self.data_l_in: list[MetaData] = [] # direct input to this step file
103+
# all inputs to this step
104+
self.data_l = data_l if data_l is not None else []
105+
# direct input to this step
106+
self.data_l_in = data_l_in if data_l_in is not None else []
81107
# ================ #
82108

83109
# Default values of load and save functions
84-
self._step_in: str | None = None
85-
self._version_in: str | None = ":last"
86-
self._attrs_in: str | list[str] | None = ":default"
87-
self._file_name_in: str | None = ":default" # TODO
88-
self._method_in: str | object | None = ":auto" # TODO
89-
self._root_in: str | None = ":default"
90-
91-
self._step_out: str | None = None
92-
self._version_out: str | None = DEFAULT_DATE_VERSION_FORMAT
93-
self._attrs_out: str | list[str] | None = ":default"
94-
self._file_name_out: str | None = ":default" # TODO
95-
self._method_out: str | object | None = ":auto"
96-
self._root_out: str | None = ":default"
110+
self._step_in = step_in
111+
self._version_in = version_in
112+
self._attrs_in = attrs_in
113+
self._file_name_in = file_name_in
114+
self._method_in = method_in
115+
self._root_in = root_in
116+
117+
self._step_out = step_out
118+
self._version_out = version_out
119+
self._attrs_out = attrs_out
120+
self._file_name_out = file_name_out
121+
self._method_out = method_out
122+
self._root_out = root_out
97123

98-
self._root: str | None = "./data"
99-
self._file_name: str | None = ":auto" # TODO
100-
self._attrs: str | list[str] | None = None
124+
self._root = root
125+
self._file_name = file_name
126+
self._attrs = attrs
101127

102128
def load(
103129
self,
@@ -125,6 +151,10 @@ def load(
125151
:param kwargs: kwargs to send to the method
126152
:return:
127153
"""
154+
if verbose:
155+
logger.setLevel(logging.INFO)
156+
else:
157+
logger.setLevel(logging.WARNING)
128158
caller_file_name, caller_function, caller_package = get_caller_metadata()
129159
if "ipykernel" in caller_file_name:
130160
notebook_path, notebook_name = get_notebook_path()

tests/old/test_version.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33

44
def test_exact_version():
5-
assert st.__version__ == "0.0.20"
5+
assert st.__version__ == "0.0.21"
66

77

88
def test_version():

tests/test_file_name_auto.py

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
import os
2+
import re
3+
4+
import stdflow as sf
5+
6+
import pandas as pd
7+
import os
8+
import shutil
9+
import pytest
10+
11+
12+
import stdflow as sf
13+
14+
15+
def test_sf_load():
    """Loading with automatic file detection returns the only csv in the folder."""
    # Define a test directory name
    test_dir = "test_dir"

    # Create the test directory
    os.makedirs(test_dir, exist_ok=True)
    try:
        # Create a test dataframe
        df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})

        # Save the test dataframe to csv in the test directory
        df.to_csv(os.path.join(test_dir, "test.csv"), index=False)

        # Load the dataframe using sf.load; no file name is given, so the
        # single csv in the directory has to be detected
        loaded_df = sf.load(root='./', attrs=test_dir)

        # Check if the loaded dataframe is equal to the original one
        pd.testing.assert_frame_equal(df, loaded_df)
    finally:
        # Always remove the test directory, even when the load or the
        # assertion fails, so a failed run does not pollute the next one
        shutil.rmtree(test_dir)
36+
37+
38+
# Run the test only when this file is executed as a script: an unguarded
# module-level call would also fire while pytest imports the file for
# collection, and the placeholder file name did not point at this module.
if __name__ == "__main__":
    pytest.main(["-v", __file__])

tests/test_multiple_steps.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66

77
def load_and_process_digi_sentiments_indonesia(catalog, brand_dict=None):
88
print("load_and_process_digi_sentiments_indonesia")
9-
conf = catalog["digimind_sentiments"]
9+
conf = catalog["twitter_sentiments"]
1010

1111
step = sf.Step()
1212
step.reset()
@@ -36,7 +36,7 @@ def load_digi_mentions_indonesia(step):
3636

3737
def load_and_process_digi_mentions_indonesia(catalog, brand_dict=None):
3838
print("load_and_process_digi_mentions_indonesia")
39-
conf = catalog["digimind_mentions"]
39+
conf = catalog["twitter_mentions"]
4040

4141
step = sf.Step()
4242
step.reset()
@@ -54,7 +54,7 @@ def load_and_process_digi_mentions_indonesia(catalog, brand_dict=None):
5454

5555
def load_and_process_digi_indonesia(catalog, brand_dict=None):
5656
print("load_and_process_digi_indonesia")
57-
conf = catalog["digimind"]
57+
conf = catalog["twitter"]
5858

5959
step = sf.Step()
6060
step.reset()
@@ -69,8 +69,8 @@ def load_and_process_digi_indonesia(catalog, brand_dict=None):
6969
def test_ms():
7070
load_and_process_digi_indonesia(
7171
{
72-
"digimind": {"attrs_out": "oui", "step_out": "non"},
73-
"digimind_mentions": {"attrs": "oui", "step_out": "non", "step_in": "non"},
74-
"digimind_sentiments": {"attrs": "oui", "step_out": "non", "step_in": "non"},
72+
"twitter": {"attrs_out": "oui", "step_out": "non"},
73+
"twitter_mentions": {"attrs": "oui", "step_out": "non", "step_in": "non"},
74+
"twitter_sentiments": {"attrs": "oui", "step_out": "non", "step_in": "non"},
7575
}
7676
)

0 commit comments

Comments
 (0)