-
Notifications
You must be signed in to change notification settings - Fork 76
Matching Dependencies Validation #537
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Conversation
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
clang-tidy made some suggestions
}; | ||
|
||
private: | ||
std::vector<HighlightRecord> highlights; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
warning: invalid case style for private member 'highlights' [readability-identifier-naming]
std::vector<HighlightRecord> highlights; | |
std::vector<HighlightRecord> highlights_; |
src/core/algorithms/md/md_verifier/highlights/highlights.h:31:
- return highlights;
+ return highlights_;
|
||
std::shared_ptr<model::ColumnLayoutTypedRelationData> relation_; | ||
|
||
MDHighlights highlights; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
warning: invalid case style for private member 'highlights' [readability-identifier-naming]
MDHighlights highlights; | |
MDHighlights highlights_; |
src/core/algorithms/md/md_verifier/md_verifier.h:63:
- return highlights.GetHighlightsAsStrings();
+ return highlights_.GetHighlightsAsStrings();
src/core/algorithms/md/md_verifier/md_verifier.h:67:
- return highlights.GetHighlights();
+ return highlights_.GetHighlights();
auto ptr_ = std::dynamic_pointer_cast<NumericSimilarityMeasure>(ptr); | ||
if (!ptr_.get()) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
warning: invalid case style for variable 'ptr_' [readability-identifier-naming]
auto ptr_ = std::dynamic_pointer_cast<NumericSimilarityMeasure>(ptr); | |
if (!ptr_.get()) { | |
auto ptr = std::dynamic_pointer_cast<NumericSimilarityMeasure>(ptr); | |
if (!ptr.get()) { |
src/core/algorithms/md/md_verifier/similarities/similarities.cpp:13:
- return ptr_;
+ return ptr;
auto ptr_ = std::dynamic_pointer_cast<StringSimilarityMeasure>(ptr); | ||
if (!ptr_.get()) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
warning: invalid case style for variable 'ptr_' [readability-identifier-naming]
auto ptr_ = std::dynamic_pointer_cast<StringSimilarityMeasure>(ptr); | |
if (!ptr_.get()) { | |
auto ptr = std::dynamic_pointer_cast<StringSimilarityMeasure>(ptr); | |
if (!ptr.get()) { |
src/core/algorithms/md/md_verifier/similarities/similarities.cpp:22:
- return ptr_;
+ return ptr;
src/tests/test_md_verifier.cpp
Outdated
ASSERT_EQ(GetParam().highlights, verifier->GetHighlightsAsStrings()); | ||
} | ||
|
||
auto const eps = std::numeric_limits<DecisionBoundary>::epsilon(); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
warning: invalid case style for global constant 'eps' [readability-identifier-naming]
auto const eps = std::numeric_limits<DecisionBoundary>::epsilon(); | |
auto const kEps = std::numeric_limits<DecisionBoundary>::epsilon(); |
src/tests/test_md_verifier.cpp:123:
- MDVerifierParams(kAnimalsBeverages, {2}, {3}, {0.75 + eps}, {0.75},
+ MDVerifierParams(kAnimalsBeverages, {2}, {3}, {0.75 + kEps}, {0.75},
src/tests/test_md_verifier.cpp:127:
- MDVerifierParams(kAnimalsBeverages, {2}, {3}, {0.75}, {0.75 + eps},
+ MDVerifierParams(kAnimalsBeverages, {2}, {3}, {0.75}, {0.75 + kEps},
src/tests/test_md_verifier.cpp:139:
- {0.75 - eps, 0.75 - eps},
+ {0.75 - kEps, 0.75 - kEps},
src/tests/test_md_verifier.cpp:144:
- true, {0.75 - eps, 0.75 - eps}),
+ true, {0.75 - kEps, 0.75 - kEps}),
src/tests/test_md_verifier.cpp:167:
- kAnimalsBeverages, {2}, {3}, {0.75}, {0.75 + eps},
+ kAnimalsBeverages, {2}, {3}, {0.75}, {0.75 + kEps},
src/core/algorithms/algorithms.h
Outdated
@@ -10,6 +10,7 @@ | |||
#include "algorithms/gfd/verification_algorithms.h" | |||
#include "algorithms/ind/ind_verifier/ind_verifier.h" | |||
#include "algorithms/ind/mining_algorithms.h" | |||
#include "algorithms/md/md_verifier/md_verifier.h" |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Add `src/core/algorithms/md/verification_algorithms.h` and use it here for consistency.
void MDHighlights::Reset() { | ||
highlights_.clear(); | ||
} | ||
} // namespace algos::md |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Add newline
} | ||
} | ||
|
||
} // namespace algos::md |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Add newline, same for the other files
#include "model/types/string_type.h" | ||
|
||
namespace algos::md { | ||
using DecisionBoundary = model::md::DecisionBoundary; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
|
||
auto get_schema_columns = [this]() { return relation_->GetSchema()->GetNumColumns(); }; | ||
|
||
RegisterOption(config::kTableOpt(&input_table_)); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
MDs are defined on two tables.
examples/basic/verifying_md.py
Outdated
return | ||
|
||
highlights = verifier.get_highlights() | ||
print(RED_CODE, "MFD does not hold due to the following items:", DEFAULT_COLOR_CODE) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
print(RED_CODE, "MFD does not hold due to the following items:", DEFAULT_COLOR_CODE) | |
print(RED_CODE, "MD does not hold due to the following items:", DEFAULT_COLOR_CODE) |
examples/basic/verifying_md.py
Outdated
|
||
|
||
def check_md(table_path: str, params: MDParams): | ||
algo = desbordante.md_verifier.algorithms.Default() |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
algo = desbordante.md_verifier.algorithms.Default() | |
algo = desbordante.md_verification.algorithms.Default() |
examples/basic/verifying_md.py
Outdated
print_results(algo) | ||
|
||
|
||
def drunk_animals_example(): |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I just chose a name that's kind of funny in the original PR, but use the current one
examples/basic/verifying_md.py
Outdated
|
||
print( | ||
"\nLet's try to check if MD {animal -> diet} with decision boundaries {1.0} and {1.0} and Levenshtein similarity measure holds.\n" | ||
"Matching Dependancy with all decision boundaries equal to 1.0 is the same as Functional Dependancy.\n" |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This can only be true if equal values result in 1.0 output from the measure, which is expected, but is not strictly required by the definition, and the mining algorithm does not rely on that.
src/python_bindings/bindings.cpp
Outdated
@@ -67,6 +68,7 @@ PYBIND11_MODULE(desbordante, module, pybind11::mod_gil_not_used()) { | |||
BindNdVerification, | |||
BindSFD, | |||
BindMd, | |||
BindMDVerifier, |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
BindMDVerifier, | |
BindMDVerification, |
for (size_t first_row = 0; first_row < num_cols; ++first_row) { | ||
for (size_t second_row = first_row + 1; second_row < num_cols; ++second_row) { | ||
if (!CheckRows(first_row, second_row)) { | ||
md_holds_ = false; | ||
} | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is extremely simple; it would be preferable to use something like what HyMD does, with some potential for code reuse.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
clang-tidy made some suggestions
MDVerifierColumnSimilarityClassifier rhs_; | ||
|
||
bool md_holds_ = false; | ||
model::md::DecisionBoundary true_rhs_decision_boundary; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
warning: invalid case style for private member 'true_rhs_decision_boundary' [readability-identifier-naming]
model::md::DecisionBoundary true_rhs_decision_boundary; | |
model::md::DecisionBoundary true_rhs_decision_boundary_; |
src/core/algorithms/md/md_verifier/md_verifier.h:69:
- return true_rhs_decision_boundary;
+ return true_rhs_decision_boundary_;
: ColumnMatch(left_col_index, right_col_index, measure->GetName()), measure(measure) {} | ||
|
||
std::string ToString() const { | ||
std::stringstream ss; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
warning: implicit instantiation of undefined template 'std::basic_stringstream' [clang-diagnostic-error]
std::stringstream ss;
^
Additional context
/usr/include/c++/13/iosfwd:109: template is declared here
class basic_stringstream;
^
std::vector<model::md::ColumnSimilarityClassifier> lhs_column_similarity_classifiers_; | ||
model::md::ColumnSimilarityClassifier rhs_column_similarity_classifier_; | ||
|
||
model::md::DecisionBoundary true_rhs_decision_boundary; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
warning: invalid case style for private member 'true_rhs_decision_boundary' [readability-identifier-naming]
model::md::DecisionBoundary true_rhs_decision_boundary; | |
model::md::DecisionBoundary true_rhs_decision_boundary_; |
src/core/algorithms/md/md_verifier/validation/validation.h:33:
- true_rhs_decision_boundary = new_decision_boundary;
+ true_rhs_decision_boundary_ = new_decision_boundary;
src/core/algorithms/md/md_verifier/validation/validation.h:49:
- return true_rhs_decision_boundary;
+ return true_rhs_decision_boundary_;
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
clang-tidy made some suggestions
return duration_cast<milliseconds>(system_clock::now() - start_time).count(); | ||
} | ||
|
||
model::MD MDVerifier::BuildMD(std::vector<MDVerifierColumnSimilarityClassifier> const& lhs, |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
warning: unused parameter 'lhs' [clang-diagnostic-unused-parameter]
model::MD MDVerifier::BuildMD(std::vector<MDVerifierColumnSimilarityClassifier> const& lhs,
^
d5bedf5
to
e303843
Compare
highlights_.push_back( | ||
{left_table_row, right_table_row, column_match, similarity, decision_boundary}); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
highlights_.push_back( | |
{left_table_row, right_table_row, column_match, similarity, decision_boundary}); | |
highlights_.emplace_back(left_table_row, right_table_row, column_match, similarity, | |
decision_boundary); |
void MDVerifier::LoadDataInternal() { | ||
left_schema_ = std::make_shared<RelationalSchema>(left_table_->GetRelationName()); | ||
std::size_t const left_table_cols = left_table_->GetNumberOfColumns(); | ||
for (model::Index i : std::views::iota(model::Index(0), left_table_cols)) { | ||
left_schema_->AppendColumn(left_table_->GetColumnName(i)); | ||
} | ||
|
||
if (right_table_ == nullptr) { | ||
right_schema_ = left_schema_; | ||
|
||
} else { | ||
right_schema_ = std::make_unique<RelationalSchema>(right_table_->GetRelationName()); | ||
std::size_t const right_table_cols = right_table_->GetNumberOfColumns(); | ||
for (model::Index i : std::views::iota(model::Index(0), right_table_cols)) { | ||
right_schema_->AppendColumn(right_table_->GetColumnName(i)); | ||
} | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
`right_schema_` is defined as `std::shared_ptr`, so why do we initialize it with `std::make_unique`? If we replace `make_unique` with `make_shared`, all of a sudden the two branches become nearly identical code, so the function can be simplified as:
void MDVerifier::LoadDataInternal() { | |
left_schema_ = std::make_shared<RelationalSchema>(left_table_->GetRelationName()); | |
std::size_t const left_table_cols = left_table_->GetNumberOfColumns(); | |
for (model::Index i : std::views::iota(model::Index(0), left_table_cols)) { | |
left_schema_->AppendColumn(left_table_->GetColumnName(i)); | |
} | |
if (right_table_ == nullptr) { | |
right_schema_ = left_schema_; | |
} else { | |
right_schema_ = std::make_unique<RelationalSchema>(right_table_->GetRelationName()); | |
std::size_t const right_table_cols = right_table_->GetNumberOfColumns(); | |
for (model::Index i : std::views::iota(model::Index(0), right_table_cols)) { | |
right_schema_->AppendColumn(right_table_->GetColumnName(i)); | |
} | |
} | |
} | |
auto CreateSchema(config::InputTable const& table) { | |
auto schema = std::make_shared<RelationalSchema>(table->GetRelationName()); | |
std::size_t const cols = table->GetNumberOfColumns(); | |
for (model::Index i : std::views::iota(model::Index(0), cols)) { | |
schema->AppendColumn(table->GetColumnName(i)); | |
} | |
return schema; | |
} | |
void MDVerifier::LoadDataInternal() { | |
left_schema_ = CreateSchema(left_table_); | |
right_schema_ = right_table_ ? CreateSchema(right_table_) : left_schema_; | |
} |
unsigned long long MDVerifier::ExecuteInternal() { | ||
using namespace std::chrono; | ||
|
||
auto start_time = system_clock::now(); | ||
|
||
VerifyMD(); | ||
|
||
return duration_cast<milliseconds>(system_clock::now() - start_time).count(); | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
unsigned long long MDVerifier::ExecuteInternal() { | |
using namespace std::chrono; | |
auto start_time = system_clock::now(); | |
VerifyMD(); | |
return duration_cast<milliseconds>(system_clock::now() - start_time).count(); | |
} | |
unsigned long long MDVerifier::ExecuteInternal() { | |
return util::TimedInvoke(&MDVerifier::VerifyMD, this); | |
} |
std::shared_ptr<std::vector<model::md::ColumnMatch>> column_matches = | ||
std::make_shared<std::vector<model::md::ColumnMatch>>(); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: we can use `auto` here, since it's obvious what type `std::make_shared` returns.
std::shared_ptr<std::vector<model::md::ColumnMatch>> column_matches = | |
std::make_shared<std::vector<model::md::ColumnMatch>>(); | |
auto column_matches = std::make_shared<std::vector<model::md::ColumnMatch>>(); |
99101b6
to
8b2fbe2
Compare
|
||
def print_results(verifier): | ||
if verifier.md_holds(): | ||
print(GREEN_CODE, "MD holds", DEFAULT_COLOR_CODE, "\n") |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
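This outputs ` MD holds ` with extra spaces around the text, because `print` inserts a separator space between its arguments.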
Perhaps you want
print(GREEN_CODE, "MD holds", DEFAULT_COLOR_CODE, "\n") | |
print(f"{GREEN_CODE}MD holds{DEFAULT_COLOR_CODE}\n") |
to get `MD holds` without the extra surrounding spaces.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Next time, please mark resolved conversations on GitHub as "Resolved". It makes the review more convenient.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: Add newlines to ends of files
nit: Consider changing `row` to `record` in the names in your code
nit: Don't overuse auto
`src/python_bindings/py_util/get_py_type.cpp`: entries for `algos::md::ColumnSimilarityClassifier` and `std::vector<algos::md::ColumnSimilarityClassifier>` are missing, add them
(changes marked `nit:` (nitpick) can be skipped if you are short on time)
ColumnSimilarityClassifier() = default; | ||
|
||
ColumnSimilarityClassifier(CMPtr column_match, model::md::DecisionBoundary decision_boundary) | ||
: column_match_(column_match), decision_boundary_(decision_boundary) {} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit:
: column_match_(column_match), decision_boundary_(decision_boundary) {} | |
: column_match_(std::move(column_match)), decision_boundary_(decision_boundary) {} |
ColumnSimilarityClassifier(CMPtr column_match, model::md::DecisionBoundary decision_boundary) | ||
: column_match_(column_match), decision_boundary_(decision_boundary) {} | ||
|
||
CMPtr GetColumnMatch() const { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit:
CMPtr GetColumnMatch() const { | |
CMPtr const& GetColumnMatch() const { |
for (auto [left_row, right_rows_set] : rows_pairs) { | ||
for (auto right_row : right_rows_set) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit:
You are copying the whole set here.
Also see the Google C++ Style Guide section on type deduction.
for (auto [left_row, right_rows_set] : rows_pairs) { | |
for (auto right_row : right_rows_set) { | |
for (auto const& [left_row, right_rows_set] : rows_pairs) { | |
for (model::Index right_row_index : right_rows_set) { |
return decision_boundary_; | ||
} | ||
}; | ||
} // namespace algos::md |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: Add newline
} | ||
} | ||
|
||
return highlights; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit:
Add a constructor taking `std::vector<Highlight>`: NRVO is not guaranteed.
return highlights; | |
return {std::move(highlights)}; |
Also, use std::vector<...>::reserve
|
||
print( | ||
"Now let's take a look that happen if we increase decision boundary of left-hand side and right-hand side\n" | ||
"For example, we will take 0.76 instead of 0.75" |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
"For example, we will take 0.76 instead of 0.75" | |
"For example, we will set it to 0.76 instead of 0.75" |
if (similarity >= decision_boundary) { | ||
UpdateNewRowsPairsSet(left_clusters[left_cluster_index], | ||
right_clusters[right_cluster_index], new_rows_pairs); | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: similarity indexes can be used to retrieve record pairs that match the LHS. The validation process from `HyMD` itself can be reused for `MDValidator`, except that it shouldn't stop when the RHS decision boundary drops to 0.
} | ||
} | ||
|
||
void MDValidationCalculator::UpdateNewRowsPairsSet(hymd::indexes::PliCluster const& left_cluster, |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: What is "update"? Something like `AddIntersectionWithPreviouslyMatched`, maybe?
for (hymd::RecordIdentifier right_row : right_cluster) { | ||
auto it = rows_pairs_.find(left_row); | ||
if (it != rows_pairs_.end() && it->second.find(right_row) != it->second.end()) { | ||
new_rows_pairs[left_row].emplace(right_row); | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit:
for (hymd::RecordIdentifier right_row : right_cluster) { | |
auto it = rows_pairs_.find(left_row); | |
if (it != rows_pairs_.end() && it->second.find(right_row) != it->second.end()) { | |
new_rows_pairs[left_row].emplace(right_row); | |
} | |
} | |
auto it = rows_pairs_.find(left_row); | |
if (it == rows_pairs_.end()) continue; | |
???::??_set const& matched_before = it->second; | |
???::??_set matched_intersection; | |
for (hymd::RecordIdentifier right_row : right_cluster) { | |
if (matched_before.contains(right_row)) { | |
matched_intersection.insert(right_row); | |
} | |
} | |
assert(!new_row_pairs.contains(left_row)); | |
if (!matched_intersection.empty()) new_row_pairs.emplace(left_row, std::move(matched_intersection)); |
return true_rhs_decision_boundary_; | ||
} | ||
|
||
RowsPairSet const& GetRowsPairs() const { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: By the time this is called only the violating record pairs should be left in, right?
RowsPairSet const& GetRowsPairs() const { | |
RowsPairSet const& GetViolatingPairs() const { |
check_md(table, params) | ||
|
||
print( | ||
"As we can see, nothing changed. Now let's increase right-hand side decision boundary:\n" |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
"As we can see, nothing changed. Now let's increase right-hand side decision boundary:\n" | |
"As we can see, nothing changed. Now let's increase the right-hand side decision boundary:\n" |
check_md(table, params) | ||
|
||
print( | ||
'Values "meat" and "mead" have Levenshtein similarity measure equal to 0.75, but we accept similarity measure at least 0.76, so MD doesn\'t holds.\n' |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
'Values "meat" and "mead" have Levenshtein similarity measure equal to 0.75, but we accept similarity measure at least 0.76, so MD doesn\'t holds.\n' | |
'The values "meat" and "mead" have a similarity of 0.75 according to the Levenshtein similarity measure, but we require the similarity to be at least 0.76, so the MD doesn\'t hold.\n' |
'Values "meat" and "mead" have Levenshtein similarity measure equal to 0.75, but we accept similarity measure at least 0.76, so MD doesn\'t holds.\n' | ||
) | ||
|
||
print("Let's check that changes if we corrent typos in dataset") |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
print("Let's check that changes if we corrent typos in dataset") | |
print("Let's check if that changes if we correct typos in the dataset") |
table["diet"] = table["diet"].replace({"mead": "meat"}) | ||
|
||
print(f"Corrected dataset:\n\n{table}\n") | ||
print("Now let's check MD with 1.0 decision boundaries again") |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
print("Now let's check MD with 1.0 decision boundaries again") | |
print("Now let's check the MD with 1.0 decision boundaries again") |
|
||
def theatre_example(): | ||
print( | ||
"Let's look at the example with numeric columns.\nWe will use theatre.csv dataset for such purpose:\n" |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
"Let's look at the example with numeric columns.\nWe will use theatre.csv dataset for such purpose:\n" | |
"Let's look at the example with numeric columns.\nWe will use theatre.csv dataset for this purpose:\n" |
) | ||
|
||
print( | ||
"To verify Matching Dependancy, firstly we must define Column Similarity Classifiers for tables.\n" |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
"To verify Matching Dependancy, firstly we must define Column Similarity Classifiers for tables.\n" | |
"To verify a Matching Dependency, we must first define Column Similarity Classifiers for the data.\n" |
|
||
print( | ||
"To verify Matching Dependancy, firstly we must define Column Similarity Classifiers for tables.\n" | ||
"Column Similarity Classifier consists of Column Match and decision boundary. " |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
"Column Similarity Classifier consists of Column Match and decision boundary. " | |
"A Column Similarity Classifier consists of a Column Match and a decision boundary. " |
print( | ||
"To verify Matching Dependancy, firstly we must define Column Similarity Classifiers for tables.\n" | ||
"Column Similarity Classifier consists of Column Match and decision boundary. " | ||
"Column Match consists of two indices: columns in left and right table and similarity measure (Levevnshtein Similarity, for example).\n" |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
"Column Match consists of two indices: columns in left and right table and similarity measure (Levevnshtein Similarity, for example).\n" | |
"A Column Match consists of two indices—the columns in the left and right tables—and a similarity measure (Levenshtein Similarity, for example).\n" |
"To verify Matching Dependancy, firstly we must define Column Similarity Classifiers for tables.\n" | ||
"Column Similarity Classifier consists of Column Match and decision boundary. " | ||
"Column Match consists of two indices: columns in left and right table and similarity measure (Levevnshtein Similarity, for example).\n" | ||
"We will use notation [measure(i, j) >= lambda] for Column Similarity Classifier with i'th column of left table, j'th column of right table, similarity measure 'measure' and decision boundary 'lambda'. " |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
"We will use notation [measure(i, j) >= lambda] for Column Similarity Classifier with i'th column of left table, j'th column of right table, similarity measure 'measure' and decision boundary 'lambda'. " | |
"We will use the notation [measure(i, j) >= lambda] for a Column Similarity Classifier that specifies the i'th column of the left table, the j'th column of the right table, the similarity measure 'measure' and the decision boundary 'lambda'. " |
"Column Similarity Classifier consists of Column Match and decision boundary. " | ||
"Column Match consists of two indices: columns in left and right table and similarity measure (Levevnshtein Similarity, for example).\n" | ||
"We will use notation [measure(i, j) >= lambda] for Column Similarity Classifier with i'th column of left table, j'th column of right table, similarity measure 'measure' and decision boundary 'lambda'. " | ||
'Also, notation like [measure("left_col_name", "right_col_name") >= lambda] is valid for ColumnMatch between column with "left_col_name" and "left_col_name" of left and right tables respectively.\n' |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
'Also, notation like [measure("left_col_name", "right_col_name") >= lambda] is valid for ColumnMatch between column with "left_col_name" and "left_col_name" of left and right tables respectively.\n' | |
'Also, notation like [measure("left_col_name", "right_col_name") >= lambda] is also valid for a ColumnMatch specifying the columns "left_col_name" and "right_col_name" of the left and right tables respectively.\n' |
Features: