
Commit 9142c84

data prep
1 parent bc79eae commit 9142c84

Some content is hidden: large commits have some file details collapsed by default, so several files below appear without names.

42 files changed: +592, -234 lines

mlops/metadata.yaml

+18-1
@@ -1 +1,18 @@
-project_uuid: 22bfea7de412433badba758788c0fcc7
+project_type: standalone
+variables_dir: ~/.mage_data
+variables_retention_period: "90d"
+features:
+  add_new_block_v2: true
+  command_center: true
+  compute_management: false
+  custom_design: true
+  data_integration_in_batch_pipeline: true
+  dbt_v2: true
+  global_hooks: true
+  interactions: true
+  display_local_timezone: true
+  notebook_block_output_split_view: true
+  operation_history: true
+  polars: true
+project_uuid: 36404d0ffc214b8a89f598f3522c1a20
+help_improve_mage: true

mlops/settings.yaml

+2-1
@@ -1,5 +1,6 @@
 projects:
-  unit_0: {}
+  unit_0:
+    active: true
   unit_1: {}
   unit_2: {}
   unit_3: {}
(file name hidden)

@@ -0,0 +1,61 @@
import statistics
from mage_ai.data_cleaner.column_types.column_type_detector import infer_column_types
from mage_ai.data_preparation.models.constants import DATAFRAME_ANALYSIS_MAX_COLUMNS
from mage_ai.shared.parsers import convert_matrix_to_dataframe


df_1 = convert_matrix_to_dataframe(df_1)
df_1 = df_1.iloc[:, :DATAFRAME_ANALYSIS_MAX_COLUMNS]
columns_and_types = infer_column_types(df_1).items()
columns = [t[0] for t in columns_and_types]
stats = ['Type', 'Missing values', 'Unique values', 'Min', 'Max', 'Mean', 'Median', 'Mode']
rows = [[] for _ in stats]

for col, col_type in columns_and_types:
    series = df_1[col]

    min_value = None
    max_value = None
    mean = None
    median = None

    not_null = series[series.notnull()]

    if len(not_null) == 0:
        continue

    if col_type.value in ['number', 'number_with_decimals']:
        if str(series.dtype) == 'object':
            if col_type.value == 'number_with_decimals':
                series = series.astype('float64')
                not_null = not_null.astype('float64')
            else:
                series = series.astype('int64')
                not_null = not_null.astype('int64')

        count = len(not_null.index)
        if count >= 1:
            mean = round(not_null.sum() / count, 2)
            median = sorted(not_null)[int(count / 2)]
            min_value = round(series.min(), 2)
            max_value = round(series.max(), 2)
    else:
        min_value = not_null.astype(str).min()
        max_value = not_null.astype(str).max()

    _, mode = sorted(
        [(v, k) for k, v in not_null.value_counts().items()],
        reverse=True,
    )[0]

    for idx, value in enumerate([
        col_type.value,
        len(series[series.isna()].index),
        len(series.unique()),
        min_value,
        max_value,
        mean,
        median,
        mode,
    ]):
        rows[idx].append(value)
(file name hidden)

@@ -0,0 +1,8 @@
number_of_rows = len(df_1.index)
columns_with_mising_values = []
percentage_of_missing_values = []
for col in df_1.columns:
    missing = df_1[col].isna().sum()
    if missing > 0:
        columns_with_mising_values.append(col)
        percentage_of_missing_values.append(100 * missing / number_of_rows)
(file name hidden)

@@ -0,0 +1,21 @@
from mage_ai.data_preparation.models.constants import DATAFRAME_ANALYSIS_MAX_COLUMNS
from mage_ai.shared.parsers import convert_matrix_to_dataframe


df_1 = convert_matrix_to_dataframe(df_1)
columns = ['mode value', 'frequency', '% of values']
column_index = []
rows = []
for col in df_1.columns[:DATAFRAME_ANALYSIS_MAX_COLUMNS]:
    value_counts = df_1[col].value_counts()
    if len(value_counts.index) == 0:
        continue
    column_value = value_counts.index[0]
    value = value_counts[column_value]
    number_of_rows = df_1[col].count()
    column_index.append(col)
    rows.append([
        column_value,
        f'{round(100 * value / number_of_rows, 2)}%',
        value,
    ])
(file name hidden)

@@ -0,0 +1,16 @@
import pandas as pd

from mage_ai.shared.parsers import convert_matrix_to_dataframe


if isinstance(df_1, list) and len(df_1) >= 1:
    item = df_1[0]
    if isinstance(item, pd.Series):
        item = item.to_frame()
    elif not isinstance(item, pd.DataFrame):
        item = convert_matrix_to_dataframe(item)
    df_1 = item

columns = df_1.columns
col = 'trip_distance'
x = df_1[df_1[col] <= 20][col]
(file name hidden)

@@ -0,0 +1,17 @@
from mage_ai.data_cleaner.column_types.column_type_detector import infer_column_types


headers = ['value']
stats = ['Columns', 'Rows']
rows = [[len(df_1.columns)], [len(df_1.index)]]

col_counts = {}
for col, col_type in infer_column_types(df_1).items():
    col_type_name = col_type.value
    if not col_counts.get(col_type_name):
        col_counts[col_type_name] = 0
    col_counts[col_type_name] += 1

for col_type, count in sorted(col_counts.items()):
    stats.append(f'# of {col_type}')
    rows.append([count])
(file name hidden)

@@ -0,0 +1,2 @@
columns = df_1.columns
number_of_unique_values = [df_1[col].nunique() for col in columns]

mlops/unit_1/custom/download.py

+21
@@ -0,0 +1,21 @@
from typing import List

if 'custom' not in globals():
    from mage_ai.data_preparation.decorators import custom

BASE_URL = 'https://github.com/mage-ai/datasets/raw/master/taxi/green'


@custom
def fan_out(*args, **kwargs) -> List[str]:
    urls = []

    years = [(2024, (1, 3))]

    for year, months in years:
        for i in range(*months):
            urls.append(f'{BASE_URL}/{year}/{i:02d}.parquet')

    return [
        urls,
    ]
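
For reference (not part of the commit): months == (1, 3) unpacks into range(1, 3), so the loop above generates URLs for months 01 and 02 of 2024 only. A quick sketch of the expansion:

BASE_URL = 'https://github.com/mage-ai/datasets/raw/master/taxi/green'

# range(1, 3) yields 1 and 2, so two files per year tuple.
for year, months in [(2024, (1, 3))]:
    for i in range(*months):
        print(f'{BASE_URL}/{year}/{i:02d}.parquet')
# https://github.com/mage-ai/datasets/raw/master/taxi/green/2024/01.parquet
# https://github.com/mage-ai/datasets/raw/master/taxi/green/2024/02.parquet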

mlops/unit_1/data_exporters/build.py

+112
@@ -0,0 +1,112 @@
from typing import List, Tuple

from pandas import DataFrame, Series
from scipy.sparse._csr import csr_matrix
from sklearn.base import BaseEstimator

from mlops.utils.data_preparation.encoders import vectorize_features
from mlops.utils.data_preparation.feature_selector import select_features

if 'data_exporter' not in globals():
    from mage_ai.data_preparation.decorators import data_exporter
if 'test' not in globals():
    from mage_ai.data_preparation.decorators import test


@data_exporter
def export(
    data: Tuple[DataFrame, DataFrame, DataFrame], *args, **kwargs
) -> Tuple[
    csr_matrix,
    csr_matrix,
    csr_matrix,
    Series,
    Series,
    Series,
    BaseEstimator,
]:
    df, df_train, df_val = data
    target = kwargs.get('target', 'duration')

    X, _, _ = vectorize_features(select_features(df))
    y: Series = df[target]

    X_train, X_val, dv = vectorize_features(
        select_features(df_train),
        select_features(df_val),
    )
    y_train = df_train[target]
    y_val = df_val[target]

    return (
        X,
        X_train,
        X_val,
        y,
        y_train,
        y_val,
        dv,
    )


@test
def test_dataset(
    X: csr_matrix,
    X_train: csr_matrix,
    X_val: csr_matrix,
    y: Series,
    y_train: Series,
    y_val: Series,
    *args,
) -> None:
    assert (
        X.shape[0] == 105870
    ), f'Entire dataset should have 105870 examples, but has {X.shape[0]}'
    assert (
        X.shape[1] == 7027
    ), f'Entire dataset should have 7027 features, but has {X.shape[1]}'
    assert (
        len(y.index) == X.shape[0]
    ), f'Entire dataset should have {X.shape[0]} examples, but has {len(y.index)}'


@test
def test_training_set(
    X: csr_matrix,
    X_train: csr_matrix,
    X_val: csr_matrix,
    y: Series,
    y_train: Series,
    y_val: Series,
    *args,
) -> None:
    assert (
        X_train.shape[0] == 54378
    ), f'Training set for training model should have 54378 examples, but has {X_train.shape[0]}'
    assert (
        X_train.shape[1] == 5094
    ), f'Training set for training model should have 5094 features, but has {X_train.shape[1]}'
    assert (
        len(y_train.index) == X_train.shape[0]
    ), f'Training set for training model should have {X_train.shape[0]} examples, but has {len(y_train.index)}'


@test
def test_validation_set(
    X: csr_matrix,
    X_train: csr_matrix,
    X_val: csr_matrix,
    y: Series,
    y_train: Series,
    y_val: Series,
    *args,
) -> None:
    assert (
        X_val.shape[0] == 51492
    ), f'Training set for validation should have 51492 examples, but has {X_val.shape[0]}'
    assert (
        X_val.shape[1] == 5094
    ), f'Training set for validation should have 5094 features, but has {X_val.shape[1]}'
    assert (
        len(y_val.index) == X_val.shape[0]
    ), f'Training set for training model should have {X_val.shape[0]} examples, but has {len(y_val.index)}'
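
vectorize_features and select_features are imported from mlops.utils.data_preparation and are not included in this commit. As a rough sketch of what the call pattern in export implies (one or two frames in, sparse matrices plus a fitted encoder out), a DictVectorizer-based version could look like the following; the selected columns and the str cast are illustrative assumptions, not the project's actual feature list:

from typing import Optional, Tuple

from pandas import DataFrame
from scipy.sparse._csr import csr_matrix
from sklearn.feature_extraction import DictVectorizer


def select_features(df: DataFrame) -> DataFrame:
    # Hypothetical column subset; the real list lives in
    # mlops/utils/data_preparation/feature_selector.py (not in this diff).
    df = df[['PULocationID', 'DOLocationID', 'trip_distance']].copy()
    # Cast categorical IDs to str so DictVectorizer one-hot encodes them.
    df[['PULocationID', 'DOLocationID']] = df[['PULocationID', 'DOLocationID']].astype(str)
    return df


def vectorize_features(
    df_train: DataFrame,
    df_val: Optional[DataFrame] = None,
) -> Tuple[csr_matrix, Optional[csr_matrix], DictVectorizer]:
    # Fit on the first frame, reuse the fitted vectorizer on the second.
    dv = DictVectorizer()
    X_train = dv.fit_transform(df_train.to_dict(orient='records'))
    X_val = dv.transform(df_val.to_dict(orient='records')) if df_val is not None else None
    return X_train, X_val, dv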
File renamed without changes.

mlops/unit_1/data_loaders/ingest.py

+30
@@ -0,0 +1,30 @@
from io import BytesIO
from typing import List

import pandas as pd

from mlops.utils.requests.adapters import open_session

if 'data_loader' not in globals():
    from mage_ai.data_preparation.decorators import data_loader


BASE_URL = 'https://github.com/mage-ai/datasets/raw/master/taxi/green'


@data_loader
def ingest_files(**kwargs) -> pd.DataFrame:
    dfs: List[pd.DataFrame] = []

    for year, months in [(2024, (1, 3))]:
        for i in range(*months):
            with open_session() as session:
                response = session.get(f'{BASE_URL}/{year}/{i:02d}.parquet')

            if response.status_code != 200:
                raise Exception(response.text)

            df = pd.read_parquet(BytesIO(response.content))
            dfs.append(df)

    return pd.concat(dfs)
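
open_session is imported from mlops.utils.requests.adapters, which is not part of this diff. A minimal sketch, assuming it is a context manager that yields a requests.Session mounted with retry-enabled HTTP adapters (the retry policy below is illustrative, not the project's):

from contextlib import contextmanager

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry


@contextmanager
def open_session():
    # Illustrative retry policy; the real settings live in
    # mlops/utils/requests/adapters.py (not in this diff).
    retry = Retry(total=3, backoff_factor=0.5, status_forcelist=[502, 503, 504])
    adapter = HTTPAdapter(max_retries=retry)

    session = requests.Session()
    session.mount('https://', adapter)
    session.mount('http://', adapter)
    try:
        yield session
    finally:
        session.close()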
(file name hidden)

+8

@@ -0,0 +1,8 @@
training_set:
  object_type: pipeline
  object_uuid: data_preparation
  outdated_after:
    seconds: 3599
  settings:
    build:
      partitions: 1
File renamed without changes.
