Commit 195fa92

Automatic code formatting using black
1 parent 6d7a881 commit 195fa92

17 files changed: +426 -324 lines changed

bdikit/__init__.py (+2 -2)

@@ -1,3 +1,3 @@
-__version__ = '0.2.0.dev0'
+__version__ = "0.2.0.dev0"
 # To shortcut the import path
-from bdikit.api import APIManager
+from bdikit.api import APIManager

bdikit/api.py (+44 -22)

@@ -2,19 +2,24 @@
 from bdikit.mapping_recommendation.scope_reducing_manager import ScopeReducingManager
 from bdikit.mapping_recommendation.value_mapping_manager import ValueMappingManager
 from bdikit.mapping_recommendation.column_mapping_manager import ColumnMappingManager
-from bdikit.visualization.mappings import plot_reduce_scope, plot_column_mappings, plot_value_mappings
+from bdikit.visualization.mappings import (
+    plot_reduce_scope,
+    plot_column_mappings,
+    plot_value_mappings,
+)
 from bdikit.utils import get_gdc_data
 from os.path import join, dirname
 import os
 
-os.environ["TOKENIZERS_PARALLELISM"] = "false" # Disable huggingface messages
+os.environ["TOKENIZERS_PARALLELISM"] = "false"  # Disable huggingface messages
 
-GDC_DATA_PATH = join(dirname(__file__), './resource/gdc_table.csv')
+GDC_DATA_PATH = join(dirname(__file__), "./resource/gdc_table.csv")
 
 
-class APIManager():
-
-    def __init__(self,):
+class APIManager:
+    def __init__(
+        self,
+    ):
         # TODO: move into database object (in data_ingestion folder)
         self.dataset = None
         # TODO: move into database object (in data_ingestion folder)
@@ -23,8 +28,8 @@ def __init__(self,):
         self.reduced_scope = None
         self.column_manager = None
         self.value_manager = None
-        self.column_mappings = None # TODO move this to a property in column_manager
-        self.value_mappings = None # TODO move this to a property in value_manager
+        self.column_mappings = None  # TODO move this to a property in column_manager
+        self.value_mappings = None  # TODO move this to a property in value_manager
 
     def load_global_table(self, global_table_path=None):
         if global_table_path is None:
@@ -45,41 +50,58 @@ def reduce_scope(self):
         self.reduced_scope = self.scope_manager.reduce()
         return plot_reduce_scope(self.reduced_scope, self.dataset)
 
-    def map_columns(self, algorithm='SimFloodAlgorithm'):
-        self.column_manager = ColumnMappingManager(self.dataset, self.global_table, algorithm)
+    def map_columns(self, algorithm="SimFloodAlgorithm"):
+        self.column_manager = ColumnMappingManager(
+            self.dataset, self.global_table, algorithm
+        )
         self.column_manager.reduced_scope = self.reduced_scope
         self.column_mappings = self.column_manager.map()
         plot_column_mappings(self.column_mappings)
 
         return self.column_mappings
 
-    def map_values(self, algorithm='EditAlgorithm'):
+    def map_values(self, algorithm="EditAlgorithm"):
         self.global_table_all = get_gdc_data(self.column_mappings.values())
-        self.value_manager = ValueMappingManager(self.dataset, self.column_mappings, self.global_table_all, algorithm)
+        self.value_manager = ValueMappingManager(
+            self.dataset, self.column_mappings, self.global_table_all, algorithm
+        )
         self.value_mappings = self.value_manager.map()
         plot_value_mappings(self.value_mappings)
 
         return self.value_mappings
 
-    def update_reduced_scope(self, original_column, new_candidate_name, new_candidate_sim=1.0):
+    def update_reduced_scope(
+        self, original_column, new_candidate_name, new_candidate_sim=1.0
+    ):
         for index in range(len(self.reduced_scope)):
-            if self.reduced_scope[index]['Candidate column'] == original_column:
-                self.reduced_scope[index]['Top k columns'].append((new_candidate_name, new_candidate_sim))
-                print('Reduced scope updated!')
+            if self.reduced_scope[index]["Candidate column"] == original_column:
+                self.reduced_scope[index]["Top k columns"].append(
+                    (new_candidate_name, new_candidate_sim)
+                )
+                print("Reduced scope updated!")
                 plot_reduce_scope(self.reduced_scope)
                 break
 
     def update_column_mappings(self, new_mappings):
         for original_column, new_target_column in new_mappings:
             self.column_mappings[original_column] = new_target_column
 
-        print('Column mapping updated!')
+        print("Column mapping updated!")
         plot_column_mappings(self.column_mappings)
 
-    def update_value_mappings(self, original_column, original_value, new_target_value, new_similarity=1.0):
-        for index in range(len(self.value_mappings[original_column]['matches'])):
-            if self.value_mappings[original_column]['matches'][index][0] == original_value:
-                self.value_mappings[original_column]['matches'][index] = (original_value, new_target_value, new_similarity)
-                print('Value mapping updated!')
+    def update_value_mappings(
+        self, original_column, original_value, new_target_value, new_similarity=1.0
+    ):
+        for index in range(len(self.value_mappings[original_column]["matches"])):
+            if (
+                self.value_mappings[original_column]["matches"][index][0]
+                == original_value
+            ):
+                self.value_mappings[original_column]["matches"][index] = (
+                    original_value,
+                    new_target_value,
+                    new_similarity,
+                )
+                print("Value mapping updated!")
                 plot_value_mappings(self.value_mappings)
                 break
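
A minimal usage sketch of the APIManager workflow reformatted above, for orientation only. It chains the methods visible in this diff; the column and value names are hypothetical placeholders, and how api.dataset gets populated is not shown in this commit.

    from bdikit.api import APIManager

    api = APIManager()
    api.load_global_table()  # with no path, presumably falls back to the bundled GDC table (GDC_DATA_PATH)
    # NOTE: loading of api.dataset happens elsewhere and is not part of this diff.
    api.reduce_scope()
    column_mappings = api.map_columns(algorithm="SimFloodAlgorithm")
    value_mappings = api.map_values(algorithm="EditAlgorithm")

    # Manual corrections go through the update_* helpers ("source_col"/"target_col"
    # and the values below are placeholder names, not real GDC fields).
    api.update_column_mappings([("source_col", "target_col")])
    api.update_value_mappings("source_col", "old value", "new value", new_similarity=1.0)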

bdikit/data_ingestion/column.py (+18 -16)

@@ -1,41 +1,43 @@
 from enum import Enum
 
+
 class ColumnType(Enum):
-    STRING = 'string'
-    FLOAT = 'float'
-    INTEGER = 'integer'
-    # TODO semantic types?
+    STRING = "string"
+    FLOAT = "float"
+    INTEGER = "integer"
+    # TODO semantic types?
+
 
 class Column:
-    def __init__(self, df_name, column_name, column_type=ColumnType.STRING, domain_values=None, null_values_representations=None):
+    def __init__(
+        self,
+        df_name,
+        column_name,
+        column_type=ColumnType.STRING,
+        domain_values=None,
+        null_values_representations=None,
+    ):
         self.df_name = df_name
         self.column_name = column_name
         self.column_type = column_type
-
+
         if domain_values is None:
             self.domain_values = set()
         else:
             self.domain_values = set(domain_values)
-
+
         if null_values_representations is None:
             self.null_values_representations = set()
        else:
             self.null_values_representations = set(null_values_representations)
-
-
 
     def __str__(self):
         return f"Column(df_name={self.df_name}, column_name={self.column_name}, column_type={self.column_type}, domain_values={self.domain_values}, null_values_representations={self.null_values_representations})"
-
+
     def __eq__(self, value):
         if not isinstance(value, Column):
             return False
         return self.df_name == value.df_name and self.column_name == value.column_name
-
+
     def __hash__(self):
         return hash((self.df_name, self.column_name))
-
-
-
-
-
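
To illustrate the Column semantics preserved by this reformatting: equality and hashing only consider df_name and column_name, so two Column objects for the same dataframe column compare equal even if their types or domain values differ. A small sketch, using the import path of this repository:

    from bdikit.data_ingestion.column import Column, ColumnType

    a = Column("df1", "col1", ColumnType.STRING, domain_values=["a", "b"])
    b = Column("df1", "col1", ColumnType.INTEGER)

    assert a == b            # __eq__ compares df_name and column_name only
    assert len({a, b}) == 1  # __hash__ uses the same (df_name, column_name) tuple
    print(a)                 # __str__ reports type, domain and null-value sets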

bdikit/data_ingestion/database.py (+6 -6)

@@ -3,6 +3,7 @@
 
 from .column import Column, ColumnType
 
+
 class Database:
     """
     A class representing a database that stores dataframes.
@@ -14,7 +15,7 @@ class Database:
     load_data(df_name, file_path): Load data from a CSV file into a dataframe and store it in the database.
     load_data_from_folder(folder_path): Load data from all CSV files in a folder.
     get_dataframe(df_name): Retrieve a dataframe by its name.
-    get_dataframe_names(): Get the names of all dataframes stored in the database.
+    get_dataframe_names(): Get the names of all dataframes stored in the database.
     describe_database(): Print out the names, shape, columns, and head of all dataframes stored in the database.
     """
 
@@ -32,7 +33,8 @@ def load_data(self, df_name, file_path):
         """
         if df_name in self.dataframes:
             raise ValueError(
-                f"Dataframe associated with file name '{df_name}' already exists in the database.")
+                f"Dataframe associated with file name '{df_name}' already exists in the database."
+            )
 
         df = pd.read_csv(file_path)
         self.dataframes[df_name] = df
@@ -42,7 +44,6 @@ def load_data(self, df_name, file_path):
             column = Column(df_name, c, ColumnType.STRING)
             self.columns.add(column)
 
-
     def load_data_from_folder(self, folder_path):
         """
         Function to load data from all CSV files in a folder using the Database class.
@@ -76,7 +77,7 @@ def get_dataframe_names(self):
            list: A list of dataframe names.
         """
         return list(self.dataframes.keys())
-
+
     def get_columns(self):
         """
         Get the names of all columns stored in the database.
@@ -99,7 +100,6 @@ def describe_database(self):
             # print(f"\t\t- Head: \n{self.dataframes[df_name].head()}")
 
 
-
 # def main():
 #     col1 = Column('df1', 'col1', ColumnType.STRING, ['a', 'b', 'c'], ['n/a', 'na'])
 #     col2 = Column('df1', 'col2', ColumnType.INTEGER, [1, 2, 3], ['n/a', 'na'])
@@ -112,4 +112,4 @@ def describe_database(self):
 #     print(col3 == col4)
 
 # if __name__ == "__main__":
-#     main()
+#     main()
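
To make the class docstring above concrete, a minimal sketch of the Database workflow. The no-argument constructor is assumed from the attributes used in this file, and the dataframe name and CSV path are placeholders:

    from bdikit.data_ingestion.database import Database

    db = Database()
    db.load_data("patients", "data/patients.csv")  # hypothetical name and path
    print(db.get_dataframe_names())                # -> ['patients']
    df = db.get_dataframe("patients")
    db.describe_database()                         # prints name, shape and columns per dataframe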

bdikit/data_ingestion/dataset_loader.py (+1 -1)

@@ -4,4 +4,4 @@
 def load_dataframe(dataset_path):
     dataset = pd.read_csv(dataset_path)
 
-    return dataset
+    return dataset
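
For completeness, the helper reformatted above is a thin wrapper around pandas.read_csv; a hypothetical call:

    from bdikit.data_ingestion.dataset_loader import load_dataframe

    df = load_dataframe("data/dataset.csv")  # placeholder path; returns a pandas DataFrame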

bdikit/download.py (+1 -1)

@@ -60,7 +60,7 @@ def get_cached_model_or_download(model_name: str):
     if len(sys.argv) < 2:
         print("Please provide a model_id as a command line argument.")
         sys.exit(1)
-
+
     model_id = sys.argv[1]
     model_path = get_cached_model_or_download(model_id)
     print(f"Downloaded model: {model_path}")
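
Programmatically, the function whose tail is reformatted above would be called as sketched below; the model id is a placeholder, and the caching behaviour is inferred from the function name rather than shown in this diff:

    from bdikit.download import get_cached_model_or_download

    model_path = get_cached_model_or_download("example-model-id")  # placeholder id
    print(f"Downloaded model: {model_path}")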
