Matching Dependencies Validation #537

maybenotilya · 2025-03-05T15:26:15Z

Features:

Matching Dependencies Validator
Similarity Measures for MD Validation
Python Bindings
Python example

github-actions

clang-tidy made some suggestions

github-actions · 2025-03-05T15:40:50Z

src/core/algorithms/md/md_verifier/highlights/highlights.h

+    };
+
+private:
+    std::vector<HighlightRecord> highlights;


warning: invalid case style for private member 'highlights' [readability-identifier-naming]

Suggested change

std::vector<HighlightRecord> highlights;

std::vector<HighlightRecord> highlights_;

src/core/algorithms/md/md_verifier/highlights/highlights.h:31:

- return highlights; + return highlights_;

github-actions · 2025-03-05T15:40:50Z

src/core/algorithms/md/md_verifier/md_verifier.h

+
+    std::shared_ptr<model::ColumnLayoutTypedRelationData> relation_;
+
+    MDHighlights highlights;


warning: invalid case style for private member 'highlights' [readability-identifier-naming]

Suggested change

MDHighlights highlights;

MDHighlights highlights_;

src/core/algorithms/md/md_verifier/md_verifier.h:63:

- return highlights.GetHighlightsAsStrings(); + return highlights_.GetHighlightsAsStrings();

src/core/algorithms/md/md_verifier/md_verifier.h:67:

- return highlights.GetHighlights(); + return highlights_.GetHighlights();

github-actions · 2025-03-05T15:40:51Z

src/core/algorithms/md/md_verifier/similarities/similarities.cpp

+    auto ptr_ = std::dynamic_pointer_cast<NumericSimilarityMeasure>(ptr);
+    if (!ptr_.get()) {


warning: invalid case style for variable 'ptr_' [readability-identifier-naming]

Suggested change

auto ptr_ = std::dynamic_pointer_cast<NumericSimilarityMeasure>(ptr);

if (!ptr_.get()) {

auto ptr = std::dynamic_pointer_cast<NumericSimilarityMeasure>(ptr);

if (!ptr.get()) {

src/core/algorithms/md/md_verifier/similarities/similarities.cpp:13:

- return ptr_; + return ptr;

github-actions · 2025-03-05T15:40:51Z

src/core/algorithms/md/md_verifier/similarities/similarities.cpp

+    auto ptr_ = std::dynamic_pointer_cast<StringSimilarityMeasure>(ptr);
+    if (!ptr_.get()) {


warning: invalid case style for variable 'ptr_' [readability-identifier-naming]

Suggested change

auto ptr_ = std::dynamic_pointer_cast<StringSimilarityMeasure>(ptr);

if (!ptr_.get()) {

auto ptr = std::dynamic_pointer_cast<StringSimilarityMeasure>(ptr);

if (!ptr.get()) {

src/core/algorithms/md/md_verifier/similarities/similarities.cpp:22:

- return ptr_; + return ptr;

github-actions · 2025-03-05T15:40:51Z

src/tests/test_md_verifier.cpp

+    ASSERT_EQ(GetParam().highlights, verifier->GetHighlightsAsStrings());
+}
+
+auto const eps = std::numeric_limits<DecisionBoundary>::epsilon();


warning: invalid case style for global constant 'eps' [readability-identifier-naming]

Suggested change

auto const eps = std::numeric_limits<DecisionBoundary>::epsilon();

auto const kEps = std::numeric_limits<DecisionBoundary>::epsilon();

src/tests/test_md_verifier.cpp:123:

- MDVerifierParams(kAnimalsBeverages, {2}, {3}, {0.75 + eps}, {0.75}, + MDVerifierParams(kAnimalsBeverages, {2}, {3}, {0.75 + kEps}, {0.75},

src/tests/test_md_verifier.cpp:127:

- MDVerifierParams(kAnimalsBeverages, {2}, {3}, {0.75}, {0.75 + eps}, + MDVerifierParams(kAnimalsBeverages, {2}, {3}, {0.75}, {0.75 + kEps},

src/tests/test_md_verifier.cpp:139:

- {0.75 - eps, 0.75 - eps}, + {0.75 - kEps, 0.75 - kEps},

src/tests/test_md_verifier.cpp:144:

- true, {0.75 - eps, 0.75 - eps}), + true, {0.75 - kEps, 0.75 - kEps}),

src/tests/test_md_verifier.cpp:167:

- kAnimalsBeverages, {2}, {3}, {0.75}, {0.75 + eps}, + kAnimalsBeverages, {2}, {3}, {0.75}, {0.75 + kEps},

BUYT-1 · 2025-03-09T10:43:35Z

src/core/algorithms/algorithms.h

@@ -10,6 +10,7 @@
 #include "algorithms/gfd/verification_algorithms.h"
 #include "algorithms/ind/ind_verifier/ind_verifier.h"
 #include "algorithms/ind/mining_algorithms.h"
+#include "algorithms/md/md_verifier/md_verifier.h"


Add src/core/algorithms/md/verification_algorithms.h and use it here for consistency

BUYT-1 · 2025-03-09T10:43:50Z

src/core/algorithms/md/md_verifier/highlights/highlights.cpp

+void MDHighlights::Reset() {
+    highlights_.clear();
+}
+}  // namespace algos::md


Add newline

BUYT-1 · 2025-03-09T10:45:35Z

src/core/algorithms/md/md_verifier/md_verifier.cpp

+    }
+}
+
+}  // namespace algos::md


Add newline, same for the other files

BUYT-1 · 2025-03-09T10:46:50Z

src/core/algorithms/md/md_verifier/highlights/highlights.h

+#include "model/types/string_type.h"
+
+namespace algos::md {
+using DecisionBoundary = model::md::DecisionBoundary;


Don't declare aliases in headers unless they are meant to be part of the public API.

BUYT-1 · 2025-03-09T10:50:57Z

src/core/algorithms/md/md_verifier/md_verifier.cpp

+
+    auto get_schema_columns = [this]() { return relation_->GetSchema()->GetNumColumns(); };
+
+    RegisterOption(config::kTableOpt(&input_table_));


MDs are defined on two tables.

BUYT-1 · 2025-03-09T12:00:17Z

examples/basic/verifying_md.py

+        return
+
+    highlights = verifier.get_highlights()
+    print(RED_CODE, "MFD does not hold due to the following items:", DEFAULT_COLOR_CODE)


Suggested change

print(RED_CODE, "MFD does not hold due to the following items:", DEFAULT_COLOR_CODE)

print(RED_CODE, "MD does not hold due to the following items:", DEFAULT_COLOR_CODE)

BUYT-1 · 2025-03-09T12:01:11Z

examples/basic/verifying_md.py

+
+
+def check_md(table_path: str, params: MDParams):
+    algo = desbordante.md_verifier.algorithms.Default()


Suggested change

algo = desbordante.md_verifier.algorithms.Default()

algo = desbordante.md_verification.algorithms.Default()

BUYT-1 · 2025-03-09T12:02:03Z

examples/basic/verifying_md.py

+    print_results(algo)
+
+
+def drunk_animals_example():


I just chose a name that's kind of funny in the original PR, but use the current one

BUYT-1 · 2025-03-09T12:03:24Z

examples/basic/verifying_md.py

+
+    print(
+        "\nLet's try to check if MD {animal -> diet} with decision boundaries {1.0} and {1.0} and Levenshtein similarity measure holds.\n"
+        "Matching Dependancy with all decision boundaries equal to 1.0 is the same as Functional Dependancy.\n"


This can only be true if equal values result in 1.0 output from the measure, which is expected, but is not strictly required by the definition, and the mining algorithm does not rely on that.

BUYT-1 · 2025-03-09T12:10:38Z

src/python_bindings/bindings.cpp

@@ -67,6 +68,7 @@ PYBIND11_MODULE(desbordante, module, pybind11::mod_gil_not_used()) {
                           BindNdVerification,
                           BindSFD,
                           BindMd,
+                           BindMDVerifier,


Suggested change

BindMDVerifier,

BindMDVerification,

BUYT-1 · 2025-03-09T12:25:43Z

src/core/algorithms/md/md_verifier/md_verifier.cpp

+    for (size_t first_row = 0; first_row < num_cols; ++first_row) {
+        for (size_t second_row = first_row + 1; second_row < num_cols; ++second_row) {
+            if (!CheckRows(first_row, second_row)) {
+                md_holds_ = false;
+            }
+        }
+    }


This is extremely simple; it would be preferable to use something like what HyMD does, with some potential for code reuse.

github-actions

clang-tidy made some suggestions

github-actions · 2025-03-21T14:38:43Z

src/core/algorithms/md/md_verifier/md_verifier.h

+    MDVerifierColumnSimilarityClassifier rhs_;
+
+    bool md_holds_ = false;
+    model::md::DecisionBoundary true_rhs_decision_boundary;


warning: invalid case style for private member 'true_rhs_decision_boundary' [readability-identifier-naming]

Suggested change

model::md::DecisionBoundary true_rhs_decision_boundary;

model::md::DecisionBoundary true_rhs_decision_boundary_;

src/core/algorithms/md/md_verifier/md_verifier.h:69:

- return true_rhs_decision_boundary; + return true_rhs_decision_boundary_;

github-actions · 2025-03-21T14:38:43Z

src/core/algorithms/md/md_verifier/md_verifier_column_match.h

+        : ColumnMatch(left_col_index, right_col_index, measure->GetName()), measure(measure) {}
+
+    std::string ToString() const {
+        std::stringstream ss;


warning: implicit instantiation of undefined template 'std::basic_stringstream' [clang-diagnostic-error]

std::stringstream ss; ^

Additional context

/usr/include/c++/13/iosfwd:109: template is declared here

class basic_stringstream; ^

github-actions · 2025-03-21T14:38:43Z

src/core/algorithms/md/md_verifier/validation/validation.h

+    std::vector<model::md::ColumnSimilarityClassifier> lhs_column_similarity_classifiers_;
+    model::md::ColumnSimilarityClassifier rhs_column_similarity_classifier_;
+
+    model::md::DecisionBoundary true_rhs_decision_boundary;


warning: invalid case style for private member 'true_rhs_decision_boundary' [readability-identifier-naming]

Suggested change

model::md::DecisionBoundary true_rhs_decision_boundary;

model::md::DecisionBoundary true_rhs_decision_boundary_;

src/core/algorithms/md/md_verifier/validation/validation.h:33:

- true_rhs_decision_boundary = new_decision_boundary; + true_rhs_decision_boundary_ = new_decision_boundary;

src/core/algorithms/md/md_verifier/validation/validation.h:49:

- return true_rhs_decision_boundary; + return true_rhs_decision_boundary_;

github-actions

clang-tidy made some suggestions

github-actions · 2025-04-05T21:21:54Z

src/core/algorithms/md/md_verifier/md_verifier.cpp

+    return duration_cast<milliseconds>(system_clock::now() - start_time).count();
+}
+
+model::MD MDVerifier::BuildMD(std::vector<MDVerifierColumnSimilarityClassifier> const& lhs,


warning: unused parameter 'lhs' [clang-diagnostic-unused-parameter]

model::MD MDVerifier::BuildMD(std::vector<MDVerifierColumnSimilarityClassifier> const& lhs, ^

ol-imorozko · 2025-04-07T12:49:07Z

src/core/algorithms/md/md_verifier/highlights/highlights.cpp

+    highlights_.push_back(
+            {left_table_row, right_table_row, column_match, similarity, decision_boundary});


Suggested change

highlights_.push_back(

{left_table_row, right_table_row, column_match, similarity, decision_boundary});

highlights_.emplace_back(left_table_row, right_table_row, column_match, similarity,

decision_boundary);

ol-imorozko · 2025-04-07T13:02:26Z

src/core/algorithms/md/md_verifier/md_verifier.cpp

+void MDVerifier::LoadDataInternal() {
+    left_schema_ = std::make_shared<RelationalSchema>(left_table_->GetRelationName());
+    std::size_t const left_table_cols = left_table_->GetNumberOfColumns();
+    for (model::Index i : std::views::iota(model::Index(0), left_table_cols)) {
+        left_schema_->AppendColumn(left_table_->GetColumnName(i));
+    }
+
+    if (right_table_ == nullptr) {
+        right_schema_ = left_schema_;
+
+    } else {
+        right_schema_ = std::make_unique<RelationalSchema>(right_table_->GetRelationName());
+        std::size_t const right_table_cols = right_table_->GetNumberOfColumns();
+        for (model::Index i : std::views::iota(model::Index(0), right_table_cols)) {
+            right_schema_->AppendColumn(right_table_->GetColumnName(i));
+        }
+    }
+}


right_schema_ is defined as std::shared_ptr so why do we initialize it with std::make_unique?

If we replace make_unique with make_shared, the two branches become nearly identical, so the schema construction can be extracted into a helper and the function simplified as:

Suggested change

void MDVerifier::LoadDataInternal() {

left_schema_ = std::make_shared<RelationalSchema>(left_table_->GetRelationName());

std::size_t const left_table_cols = left_table_->GetNumberOfColumns();

for (model::Index i : std::views::iota(model::Index(0), left_table_cols)) {

left_schema_->AppendColumn(left_table_->GetColumnName(i));

}

if (right_table_ == nullptr) {

right_schema_ = left_schema_;

} else {

right_schema_ = std::make_unique<RelationalSchema>(right_table_->GetRelationName());

std::size_t const right_table_cols = right_table_->GetNumberOfColumns();

for (model::Index i : std::views::iota(model::Index(0), right_table_cols)) {

right_schema_->AppendColumn(right_table_->GetColumnName(i));

}

}

}

auto CreateSchema(config::InputTable const& table) {

auto schema = std::make_shared<RelationalSchema>(table->GetRelationName());

std::size_t const cols = table->GetNumberOfColumns();

for (model::Index i : std::views::iota(model::Index(0), cols)) {

schema->AppendColumn(table->GetColumnName(i));

}

return schema;

}

void MDVerifier::LoadDataInternal() {

left_schema_ = CreateSchema(left_table_);

right_schema_ = right_table_ ? CreateSchema(right_table_) : left_schema_;

}

ol-imorozko · 2025-04-07T13:04:43Z

src/core/algorithms/md/md_verifier/md_verifier.cpp

+unsigned long long MDVerifier::ExecuteInternal() {
+    using namespace std::chrono;
+
+    auto start_time = system_clock::now();
+
+    VerifyMD();
+
+    return duration_cast<milliseconds>(system_clock::now() - start_time).count();
+}


Suggested change

unsigned long long MDVerifier::ExecuteInternal() {

using namespace std::chrono;

auto start_time = system_clock::now();

VerifyMD();

return duration_cast<milliseconds>(system_clock::now() - start_time).count();

}

unsigned long long MDVerifier::ExecuteInternal() {

return util::TimedInvoke(&MDVerifier::VerifyMD, this);

}

src/core/algorithms/md/md_verifier/md_verifier.cpp

ol-imorozko · 2025-04-07T13:28:42Z

src/core/algorithms/md/md_verifier/md_verifier.cpp

+    std::shared_ptr<std::vector<model::md::ColumnMatch>> column_matches =
+            std::make_shared<std::vector<model::md::ColumnMatch>>();


nit: we can use auto here, since it's obvious what type std::make_shared returns.

Suggested change

std::shared_ptr<std::vector<model::md::ColumnMatch>> column_matches =

std::make_shared<std::vector<model::md::ColumnMatch>>();

auto column_matches = std::make_shared<std::vector<model::md::ColumnMatch>>();

…numeric columns

BUYT-1 · 2025-04-30T17:22:51Z

examples/basic/verifying_md.py

+
+def print_results(verifier):
+    if verifier.md_holds():
+        print(GREEN_CODE, "MD holds", DEFAULT_COLOR_CODE, "\n")


This outputs

MD holds

Perhaps you want

Suggested change

print(GREEN_CODE, "MD holds", DEFAULT_COLOR_CODE, "\n")

print(f"{GREEN_CODE}MD holds{DEFAULT_COLOR_CODE}\n")

for

MD holds

ol-imorozko

Next time, please mark resolved conversations on GitHub as "Resolved". It makes the review easier to carry out.

BUYT-1

nit: Add newlines to ends of files

nit: Consider renaming "row" to "record" in the identifiers in your code

nit: Don't overuse auto

The entries for algos::md::ColumnSimilarityClassifier and std::vector<algos::md::ColumnSimilarityClassifier> are missing from src/python_bindings/py_util/get_py_type.cpp; add them

(changes marked nit: (nitpick) can be skipped if you are light on time)

BUYT-1 · 2025-05-19T13:40:12Z

src/core/algorithms/md/md_verifier/column_similarity_classifier.h

+    ColumnSimilarityClassifier() = default;
+
+    ColumnSimilarityClassifier(CMPtr column_match, model::md::DecisionBoundary decision_boundary)
+        : column_match_(column_match), decision_boundary_(decision_boundary) {}


nit:

Suggested change

: column_match_(column_match), decision_boundary_(decision_boundary) {}

: column_match_(std::move(column_match)), decision_boundary_(decision_boundary) {}

BUYT-1 · 2025-05-19T13:57:04Z

src/core/algorithms/md/md_verifier/column_similarity_classifier.h

+    ColumnSimilarityClassifier(CMPtr column_match, model::md::DecisionBoundary decision_boundary)
+        : column_match_(column_match), decision_boundary_(decision_boundary) {}
+
+    CMPtr GetColumnMatch() const {


nit:

Suggested change

CMPtr GetColumnMatch() const {

CMPtr const& GetColumnMatch() const {

BUYT-1 · 2025-05-19T13:59:54Z

src/core/algorithms/md/md_verifier/highlights/highlights.cpp

+    for (auto [left_row, right_rows_set] : rows_pairs) {
+        for (auto right_row : right_rows_set) {


nit:
You are copying the whole set here
Also see Google C++ style guide on type deduction

Suggested change

for (auto [left_row, right_rows_set] : rows_pairs) {

for (auto right_row : right_rows_set) {

for (auto const& [left_row, right_rows_set] : rows_pairs) {

for (model::Index right_row_index : right_rows_set) {

BUYT-1 · 2025-05-19T14:02:09Z

src/core/algorithms/md/md_verifier/column_similarity_classifier.h

+        return decision_boundary_;
+    }
+};
+}  // namespace algos::md


nit: Add newline

BUYT-1 · 2025-05-19T14:12:37Z

src/core/algorithms/md/md_verifier/highlights/highlights.cpp

+        }
+    }
+
+    return highlights;


nit:
Add a constructor taking std::vector<Highlight>: NRVO is not guaranteed.

Suggested change

return highlights;

return {std::move(highlights)};

Also, use std::vector<...>::reserve

BUYT-1 · 2025-05-19T17:26:25Z

examples/basic/verifying_md.py

+
+    print(
+        "Now let's take a look that happen if we increase decision boundary of left-hand side and right-hand side\n"
+        "For example, we will take 0.76 instead of 0.75"


Suggested change

"For example, we will take 0.76 instead of 0.75"

"For example, we will set it to 0.76 instead of 0.75"

BUYT-1 · 2025-05-19T17:30:27Z

src/core/algorithms/md/md_verifier/validation/validation.cpp

+            if (similarity >= decision_boundary) {
+                UpdateNewRowsPairsSet(left_clusters[left_cluster_index],
+                                      right_clusters[right_cluster_index], new_rows_pairs);
+            }


nit: similarity indexes can be used to retrieve record pairs that match the LHS. The validation process from HyMD itself can be reused for MDValidator, except that it shouldn't stop when the RHS decision boundary drops to 0.

BUYT-1 · 2025-05-19T17:45:05Z

src/core/algorithms/md/md_verifier/validation/validation.cpp

+    }
+}
+
+void MDValidationCalculator::UpdateNewRowsPairsSet(hymd::indexes::PliCluster const& left_cluster,


nit: What is "update"? Something like AddIntersectionWithPreviouslyMatched maybe?

BUYT-1 · 2025-05-19T18:20:49Z

src/core/algorithms/md/md_verifier/validation/validation.cpp

+        for (hymd::RecordIdentifier right_row : right_cluster) {
+            auto it = rows_pairs_.find(left_row);
+            if (it != rows_pairs_.end() && it->second.find(right_row) != it->second.end()) {
+                new_rows_pairs[left_row].emplace(right_row);
+            }
+        }


nit:

Suggested change

for (hymd::RecordIdentifier right_row : right_cluster) {

auto it = rows_pairs_.find(left_row);

if (it != rows_pairs_.end() && it->second.find(right_row) != it->second.end()) {

new_rows_pairs[left_row].emplace(right_row);

}

}

auto it = rows_pairs_.find(left_row);

if (it == rows_pairs_.end()) continue;

???::??_set const& matched_before = it->second;

???::??_set matched_intersection;

for (hymd::RecordIdentifier right_row : right_cluster) {

if (matched_before.contains(right_row)) {

matched_intersection.insert(right_row);

}

}

assert(!new_row_pairs.contains(left_row));

if (!matched_intersection.empty()) new_row_pairs.emplace(left_row, std::move(matched_intersection));

BUYT-1 · 2025-05-19T18:35:10Z

src/core/algorithms/md/md_verifier/validation/validation.h

+        return true_rhs_decision_boundary_;
+    }
+
+    RowsPairSet const& GetRowsPairs() const {


nit: By the time this is called only the violating record pairs should be left in, right?

Suggested change

RowsPairSet const& GetRowsPairs() const {

RowsPairSet const& GetViolatingPairs() const {

BUYT-1 · 2025-05-20T14:19:19Z

examples/basic/verifying_md.py

+    check_md(table, params)
+
+    print(
+        "As we can see, nothing changed. Now let's increase right-hand side decision boundary:\n"


Suggested change

"As we can see, nothing changed. Now let's increase right-hand side decision boundary:\n"

"As we can see, nothing changed. Now let's increase the right-hand side decision boundary:\n"

BUYT-1 · 2025-05-20T14:20:53Z

examples/basic/verifying_md.py

+    check_md(table, params)
+
+    print(
+        'Values "meat" and "mead" have Levenshtein similarity measure equal to 0.75, but we accept similarity measure at least 0.76, so MD doesn\'t holds.\n'


Suggested change

'Values "meat" and "mead" have Levenshtein similarity measure equal to 0.75, but we accept similarity measure at least 0.76, so MD doesn\'t holds.\n'

'The values "meat" and "mead" have a similarity of 0.75 according to the Levenshtein similarity measure, but we require the similarity to be at least 0.76, so the MD doesn\'t hold.\n'

BUYT-1 · 2025-05-20T14:21:15Z

examples/basic/verifying_md.py

+        'Values "meat" and "mead" have Levenshtein similarity measure equal to 0.75, but we accept similarity measure at least 0.76, so MD doesn\'t holds.\n'
+    )
+
+    print("Let's check that changes if we corrent typos in dataset")


Suggested change

print("Let's check that changes if we corrent typos in dataset")

print("Let's check if that changes if we correct typos in the dataset")

BUYT-1 · 2025-05-20T14:21:29Z

examples/basic/verifying_md.py

+    table["diet"] = table["diet"].replace({"mead": "meat"})
+
+    print(f"Corrected dataset:\n\n{table}\n")
+    print("Now let's check MD with 1.0 decision boundaries again")


Suggested change

print("Now let's check MD with 1.0 decision boundaries again")

print("Now let's check the MD with 1.0 decision boundaries again")

BUYT-1 · 2025-05-20T14:21:48Z

examples/basic/verifying_md.py

+
+def theatre_example():
+    print(
+        "Let's look at the example with numeric columns.\nWe will use theatre.csv dataset for such purpose:\n"


Suggested change

"Let's look at the example with numeric columns.\nWe will use theatre.csv dataset for such purpose:\n"

"Let's look at the example with numeric columns.\nWe will use theatre.csv dataset for this purpose:\n"

BUYT-1 · 2025-05-20T14:27:24Z

examples/basic/verifying_md.py

+    )
+
+    print(
+        "To verify Matching Dependancy, firstly we must define Column Similarity Classifiers for tables.\n"


Suggested change

"To verify Matching Dependancy, firstly we must define Column Similarity Classifiers for tables.\n"

"To verify a Matching Dependency, we must first define Column Similarity Classifiers for the data.\n"

BUYT-1 · 2025-05-20T14:27:37Z

examples/basic/verifying_md.py

+
+    print(
+        "To verify Matching Dependancy, firstly we must define Column Similarity Classifiers for tables.\n"
+        "Column Similarity Classifier consists of Column Match and decision boundary. "


Suggested change

"Column Similarity Classifier consists of Column Match and decision boundary. "

"A Column Similarity Classifier consists of a Column Match and a decision boundary. "

BUYT-1 · 2025-05-20T14:28:25Z

examples/basic/verifying_md.py

+    print(
+        "To verify Matching Dependancy, firstly we must define Column Similarity Classifiers for tables.\n"
+        "Column Similarity Classifier consists of Column Match and decision boundary. "
+        "Column Match consists of two indices: columns in left and right table and similarity measure (Levevnshtein Similarity, for example).\n"


Suggested change

"Column Match consists of two indices: columns in left and right table and similarity measure (Levevnshtein Similarity, for example).\n"

"A Column Match consists of two indices—the columns in the left and right tables—and a similarity measure (Levenshtein Similarity, for example).\n"

BUYT-1 · 2025-05-20T14:29:47Z

examples/basic/verifying_md.py

+        "To verify Matching Dependancy, firstly we must define Column Similarity Classifiers for tables.\n"
+        "Column Similarity Classifier consists of Column Match and decision boundary. "
+        "Column Match consists of two indices: columns in left and right table and similarity measure (Levevnshtein Similarity, for example).\n"
+        "We will use notation [measure(i, j) >= lambda] for Column Similarity Classifier with i'th column of left table, j'th column of right table, similarity measure 'measure' and decision boundary 'lambda'. "


Suggested change

"We will use notation [measure(i, j) >= lambda] for Column Similarity Classifier with i'th column of left table, j'th column of right table, similarity measure 'measure' and decision boundary 'lambda'. "

"We will use the notation [measure(i, j) >= lambda] for a Column Similarity Classifier that specifies the i'th column of the left table, the j'th column of the right table, the similarity measure 'measure' and the decision boundary 'lambda'. "

BUYT-1 · 2025-05-20T14:31:09Z

examples/basic/verifying_md.py

+        "Column Similarity Classifier consists of Column Match and decision boundary. "
+        "Column Match consists of two indices: columns in left and right table and similarity measure (Levevnshtein Similarity, for example).\n"
+        "We will use notation [measure(i, j) >= lambda] for Column Similarity Classifier with i'th column of left table, j'th column of right table, similarity measure 'measure' and decision boundary 'lambda'. "
+        'Also, notation like [measure("left_col_name", "right_col_name") >= lambda] is valid for ColumnMatch between column with "left_col_name" and "left_col_name" of left and right tables respectively.\n'


Suggested change

'Also, notation like [measure("left_col_name", "right_col_name") >= lambda] is valid for ColumnMatch between column with "left_col_name" and "left_col_name" of left and right tables respectively.\n'

'Also, notation like [measure("left_col_name", "right_col_name") >= lambda] is also valid for a ColumnMatch specifying the columns "left_col_name" and "right_col_name" of the left and right tables respectively.\n'

github-actions bot reviewed Mar 5, 2025

View reviewed changes

BUYT-1 requested changes Mar 9, 2025

View reviewed changes

BUYT-1 reviewed Mar 9, 2025

View reviewed changes

BUYT-1 changed the title ~~Matching Dependancies Validation~~ Matching Dependencies Validation Mar 9, 2025

github-actions bot reviewed Mar 21, 2025

View reviewed changes

chernishev requested a review from BUYT-1 March 31, 2025 18:31

github-actions bot reviewed Apr 5, 2025

View reviewed changes

maybenotilya force-pushed the md-validator branch 2 times, most recently from d5bedf5 to e303843 Compare April 6, 2025 01:35

ol-imorozko requested changes Apr 7, 2025

View reviewed changes

maybenotilya added 20 commits April 19, 2025 04:29

feat(MD): add MD algorithm initialization

465fce5

refactor(MD): rename metrics to similarity measures

0712054

fead(MD): add similarity measures

0171d2e

refactor: change similarity measures and naming

280820a

feat(MD): add algorithm and tests

f619936

feat(MD): add highlights

82b2ae2

tests(MD): add highlights tests

b9d38ed

feat(MD): add python bindings

deea753

feat(MD): rework highlights and make highlights bindings

c3b0819

feat(MD): add complete example on MD

b6f7054

add more examples on MD

f3971c9

feat(MD): add more links on MD

a8c7bc7

fix(MD): fix typos and add similarity measure types

81228f5

fix after cherry-pick

cdc5ed7

fix(MD): fix more issues after cherry-pick

d3cdab8

fix(MD): fix clang format

b919593

fix(MD): fix naming in md_validator

fb764a5

fix(MD): build for macosx x86_64

f20085e

fix(MD): remove unused highlights methods

9d913ca

feat(MD): add possibility to calculate string similarity measures of …

93f97c3

…numeric columns

maybenotilya added 12 commits April 19, 2025 04:30

feat(MD): add monge-elkan and jaccard similarities

2c30c40

fix(MD): fix segfault in CalculateStringSimilarity

1a5f82c

refactor(MD): adjust architecture to match MD definition from paper

7bed72a

fix(MD): fix bindings and MD verification example

8a0cfb3

fix(MD): move column similarity classifier to separate header

51b7177

fix(MD): clang-format and clang-tidy

fa6f788

fix(MD): rename Highlight::ToString() in bindings

3c2021c

feat(MD): add MD instances to suggest in validation

6cd7b5d

fix(MD): fix typo in BuildMD

78f9d34

fix(MD): add const qualifiers where needed

3a85552

refactor(MD): match HyMD defined column matches

29fb188

refactor(MD): code reorganize

8b2fbe2

maybenotilya force-pushed the md-validator branch from 99101b6 to 8b2fbe2 Compare April 19, 2025 01:40

maybenotilya requested a review from ol-imorozko April 19, 2025 17:29

BUYT-1 reviewed Apr 30, 2025

View reviewed changes

ol-imorozko approved these changes May 3, 2025

View reviewed changes

feat(MD): add more examples on MD verifier

8252f22

maybenotilya requested a review from BUYT-1 May 19, 2025 13:08

BUYT-1 requested changes May 19, 2025

View reviewed changes

BUYT-1 requested changes May 20, 2025

View reviewed changes

	std::vector<HighlightRecord> highlights;
	std::vector<HighlightRecord> highlights_;


		std::shared_ptr<model::ColumnLayoutTypedRelationData> relation_;

		MDHighlights highlights;

		auto ptr_ = std::dynamic_pointer_cast<NumericSimilarityMeasure>(ptr);
		if (!ptr_.get()) {

		auto ptr_ = std::dynamic_pointer_cast<StringSimilarityMeasure>(ptr);
		if (!ptr_.get()) {

	auto const eps = std::numeric_limits<DecisionBoundary>::epsilon();
	auto const kEps = std::numeric_limits<DecisionBoundary>::epsilon();


		auto get_schema_columns = [this]() { return relation_->GetSchema()->GetNumColumns(); };

		RegisterOption(config::kTableOpt(&input_table_));

	print(RED_CODE, "MFD does not hold due to the following items:", DEFAULT_COLOR_CODE)
	print(RED_CODE, "MD does not hold due to the following items:", DEFAULT_COLOR_CODE)



		def check_md(table_path: str, params: MDParams):
		algo = desbordante.md_verifier.algorithms.Default()

	algo = desbordante.md_verifier.algorithms.Default()
	algo = desbordante.md_verification.algorithms.Default()

	model::md::DecisionBoundary true_rhs_decision_boundary;
	model::md::DecisionBoundary true_rhs_decision_boundary_;

		highlights_.push_back(
		{left_table_row, right_table_row, column_match, similarity, decision_boundary});

		std::shared_ptr<std::vector<model::md::ColumnMatch>> column_matches =
		std::make_shared<std::vector<model::md::ColumnMatch>>();

	std::shared_ptr<std::vector<model::md::ColumnMatch>> column_matches =
	std::make_shared<std::vector<model::md::ColumnMatch>>();
	auto column_matches = std::make_shared<std::vector<model::md::ColumnMatch>>();

	print(GREEN_CODE, "MD holds", DEFAULT_COLOR_CODE, "\n")
	print(f"{GREEN_CODE}MD holds{DEFAULT_COLOR_CODE}\n")

	: column_match_(column_match), decision_boundary_(decision_boundary) {}
	: column_match_(std::move(column_match)), decision_boundary_(decision_boundary) {}

	CMPtr GetColumnMatch() const {
	CMPtr const& GetColumnMatch() const {

		for (auto [left_row, right_rows_set] : rows_pairs) {
		for (auto right_row : right_rows_set) {

	"For example, we will take 0.76 instead of 0.75"
	"For example, we will set it to 0.76 instead of 0.75"

-        for (hymd::RecordIdentifier right_row : right_cluster) {
-            auto it = rows_pairs_.find(left_row);
-            if (it != rows_pairs_.end() && it->second.find(right_row) != it->second.end()) {
-                new_rows_pairs[left_row].emplace(right_row);
-            }
-        }
+        auto it = rows_pairs_.find(left_row);
+        if (it == rows_pairs_.end()) continue;
+        ???::??_set const& matched_before = it->second;
+        ???::??_set matched_intersection;
+        for (hymd::RecordIdentifier right_row : right_cluster) {
+            if (matched_before.contains(right_row)) {
+                matched_intersection.insert(right_row);
+            }
+        }
+        assert(!new_row_pairs.contains(left_row));
+        if (!matched_intersection.empty()) new_row_pairs.emplace(left_row, std::move(matched_intersection));

	RowsPairSet const& GetRowsPairs() const {
	RowsPairSet const& GetViolatingPairs() const {

Matching Dependencies Validation #537

Are you sure you want to change the base?

Matching Dependencies Validation #537

Uh oh!

Conversation

maybenotilya commented Mar 5, 2025

Uh oh!

github-actions bot left a comment

Choose a reason for hiding this comment

Uh oh!

github-actions bot Mar 5, 2025

Choose a reason for hiding this comment

Uh oh!

github-actions bot Mar 5, 2025

Choose a reason for hiding this comment

Uh oh!

github-actions bot Mar 5, 2025

Choose a reason for hiding this comment

Uh oh!

github-actions bot Mar 5, 2025

Choose a reason for hiding this comment

Uh oh!

github-actions bot Mar 5, 2025

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

github-actions bot left a comment

Choose a reason for hiding this comment

Uh oh!

github-actions bot Mar 21, 2025

Choose a reason for hiding this comment

Uh oh!

github-actions bot Mar 21, 2025

Choose a reason for hiding this comment

Uh oh!

github-actions bot Mar 21, 2025

Choose a reason for hiding this comment

Uh oh!

github-actions bot left a comment

Choose a reason for hiding this comment

Uh oh!

github-actions bot Apr 5, 2025

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

ol-imorozko Apr 7, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Uh oh!

ol-imorozko Apr 7, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Choose a reason for hiding this comment

Uh oh!

BUYT-1 Apr 30, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

ol-imorozko Apr 7, 2025 •

edited

Loading

ol-imorozko Apr 7, 2025 •

edited

Loading

BUYT-1 Apr 30, 2025 •

edited

Loading

BUYT-1 May 19, 2025 •

edited

Loading

	"As we can see, nothing changed. Now let's increase right-hand side decision boundary:\n"
	"As we can see, nothing changed. Now let's increase the right-hand side decision boundary:\n"

	'Values "meat" and "mead" have Levenshtein similarity measure equal to 0.75, but we accept similarity measure at least 0.76, so MD doesn\'t holds.\n'
	'The values "meat" and "mead" have a similarity of 0.75 according to the Levenshtein similarity measure, but we require the similarity to be at least 0.76, so the MD doesn\'t hold.\n'

	print("Let's check that changes if we corrent typos in dataset")
	print("Let's check whether that changes if we correct the typos in the dataset")

	print("Now let's check MD with 1.0 decision boundaries again")
	print("Now let's check the MD with 1.0 decision boundaries again")

	"Let's look at the example with numeric columns.\nWe will use theatre.csv dataset for such purpose:\n"
	"Let's look at the example with numeric columns.\nWe will use the theatre.csv dataset for this purpose:\n"

	"To verify Matching Dependancy, firstly we must define Column Similarity Classifiers for tables.\n"
	"To verify a Matching Dependency, we must first define Column Similarity Classifiers for the data.\n"

	"Column Similarity Classifier consists of Column Match and decision boundary. "
	"A Column Similarity Classifier consists of a Column Match and a decision boundary. "

	"Column Match consists of two indices: columns in left and right table and similarity measure (Levevnshtein Similarity, for example).\n"
	"A Column Match consists of two indices—the columns in the left and right tables—and a similarity measure (Levenshtein Similarity, for example).\n"

	"We will use notation [measure(i, j) >= lambda] for Column Similarity Classifier with i'th column of left table, j'th column of right table, similarity measure 'measure' and decision boundary 'lambda'. "
	"We will use the notation [measure(i, j) >= lambda] for a Column Similarity Classifier that specifies the i'th column of the left table, the j'th column of the right table, the similarity measure 'measure' and the decision boundary 'lambda'. "

	'Also, notation like [measure("left_col_name", "right_col_name") >= lambda] is valid for ColumnMatch between column with "left_col_name" and "left_col_name" of left and right tables respectively.\n'
	'Also, notation like [measure("left_col_name", "right_col_name") >= lambda] is valid for a ColumnMatch specifying the columns "left_col_name" and "right_col_name" of the left and right tables respectively.\n'