Skip to content

Commit 0538efb

Browse files
committed
Add data cleaning functions.
1 parent e315786 commit 0538efb

File tree

3 files changed

+103
-1
lines changed

3 files changed

+103
-1
lines changed

src/final_project_btb/analysis/task_predictor_pdf.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ def task_save_image_with_columns_predictions(
2727

2828

2929
@pytask.mark.skip()
30-
def task_generate_excel(
30+
def task_generate_dataframe(
3131
data=BLD / "page_for_detection.png", produces=BLD / "table_no_cleaning.cvs"
3232
):
3333
df = extracting_data(data)
Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
import numpy as np
2+
import pandas as pd
3+
4+
from final_project_btb.analysis.data_extraction import extracting_data
5+
6+
7+
def clean_data_extracted(image_path):
    """Extract the table from an image and return a semi-clean DataFrame.

    The extracted table is passed through the private helpers of this module
    in a fixed order: continuation rows are merged, empty rows are removed,
    the generic ``Col_N`` headers are renamed, the numeric columns are
    converted to integers and rows without any numeric value are dropped.

    Args:
        image_path (str): Path to the input image on which object detection
            predictions will be performed.

    Returns:
        df (DataFrame): Semi-clean DataFrame ready to graph some results.

    """
    # Mapping from the generic extracted headers to descriptive names.
    column_names = {
        "Col_1": "town_and_county",
        "Col_2": "name_of_bank",
        "Col_3": "president",
        "Col_4": "vice_president",
        "Col_5": "cashier",
        "Col_6": "asst_cashier",
        "Col_7": "paid_up_capital",
        "Col_8": "surp_and_prof",
        "Col_9": "deposits",
        "Col_10": "loans_and_discounts_stocks_and_securities",
        "Col_11": "cash_and_exchanges",
        "Col_12": "principal_correspondence",
    }

    # List of numeric columns (named after the rename above).
    money_columns = [
        "paid_up_capital",
        "surp_and_prof",
        "deposits",
        "loans_and_discounts_stocks_and_securities",
        "cash_and_exchanges",
    ]

    raw_table = extracting_data(image_path)
    merged_rows = _concatenate_extra_rows(raw_table)
    compact_table = _delete_empty_rows(merged_rows)

    renamed = compact_table.copy().rename(columns=column_names)
    with_numbers = _clean_number_columns(renamed, money_columns)

    return _delete_columns_with_na(with_numbers, money_columns)
51+
52+
53+
def _concatenate_extra_rows(base_de_datos):
54+
"""Concatenates rows with empty cell in first column."""
55+
df_array = base_de_datos.to_numpy(copy=True)
56+
57+
for i in range(1, len(df_array)):
58+
if df_array[i, 0] == "":
59+
df_array[i - 1] = np.where(
60+
df_array[i] != "", df_array[i - 1] + " " + df_array[i], df_array[i - 1]
61+
)
62+
df_array[i] = ""
63+
64+
df_final = pd.DataFrame(df_array, columns=base_de_datos.columns)
65+
return df_final
66+
67+
68+
def _delete_empty_rows(base_de_datos):
69+
"""Eliminates every row with all empty cells."""
70+
df_without_empty_rows = base_de_datos.replace("", np.nan).dropna(how="all")
71+
df_without_empty_column1 = df_without_empty_rows.dropna(
72+
subset=[df_without_empty_rows.columns[0]]
73+
)
74+
75+
return df_without_empty_column1.reset_index(drop=True)
76+
77+
78+
def _clean_number_columns(df, name_columns):
79+
"""Eliminates every character that is not a number in number columns."""
80+
df_clean = df.copy()
81+
for col in name_columns:
82+
df_clean[col] = df_clean[col].astype(str).replace(r"\D", "", regex=True)
83+
df_clean[col] = pd.to_numeric(df_clean[col]).astype(pd.UInt32Dtype())
84+
return df_clean
85+
86+
87+
def _delete_columns_with_na(df, name_columns):
88+
"""Eliminates every row with all na."""
89+
df_clean = df.copy()
90+
df_clean = df_clean.dropna(subset=name_columns, how="all")
91+
return df_clean
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
from final_project_btb.config import (
2+
BLD,
3+
)
4+
from final_project_btb.data_management.cleaning_data import clean_data_extracted
5+
6+
7+
def task_generate_cvs(
    data=BLD / "page_for_detection.png", produces=BLD / "clean_table.cvs"
):
    """Clean the table extracted from ``data`` and write it to ``produces``.

    Args:
        data (Path): Image passed to ``clean_data_extracted``.
        produces (Path): Destination of the file written with
            ``DataFrame.to_csv``.

    """
    # NOTE(review): the ".cvs" suffix looks like a typo for ".csv"; it is kept
    # because other tasks may refer to this exact path - confirm before
    # renaming.
    clean_data_extracted(data).to_csv(produces)

0 commit comments

Comments
 (0)