-
Notifications
You must be signed in to change notification settings - Fork 76
Matching Dependencies Validation #537
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
465fce5
0712054
0171d2e
280820a
f619936
82b2ae2
b9d38ed
deea753
c3b0819
b6f7054
f3971c9
a8c7bc7
81228f5
cdc5ed7
d3cdab8
b919593
fb764a5
f20085e
9d913ca
93f97c3
2c30c40
1a5f82c
7bed72a
8a0cfb3
51b7177
fa6f788
3c2021c
6cd7b5d
78f9d34
3a85552
29fb188
8b2fbe2
8252f22
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
@@ -0,0 +1,233 @@ | ||||||||||||||||||
import desbordante | ||||||||||||||||||
import pandas as pd | ||||||||||||||||||
|
||||||||||||||||||
from typing import TypedDict | ||||||||||||||||||
from desbordante.md import ColumnSimilarityClassifier | ||||||||||||||||||
from desbordante.md.column_matches import Levenshtein, Custom | ||||||||||||||||||
|
||||||||||||||||||
# ANSI SGR escape sequences used to colour the console output of this example.
# In each sequence "1" selects bold and the second number selects a background colour.
GREEN_CODE = "\033[1;42m"  # green background
RED_CODE = "\033[1;41m"  # red background
# NOTE(review): SGR 46 is the cyan background (blue would be 44) — confirm the intended colour.
BLUE_CODE = "\033[1;46m"
DEFAULT_COLOR_CODE = "\033[1;49m"  # default background (bold stays enabled)
|
||||||||||||||||||
|
||||||||||||||||||
class MDParams(TypedDict):
    """Keyword arguments that ``check_md`` unpacks into the verifier's ``execute`` call."""

    # Column similarity classifiers forming the left-hand side of the MD.
    lhs: list[ColumnSimilarityClassifier]
    # Column similarity classifier forming the right-hand side of the MD.
    rhs: ColumnSimilarityClassifier
|
||||||||||||||||||
|
||||||||||||||||||
def print_results(verifier):
    """Print a human-readable report for an already executed MD verifier.

    When the MD holds, a single highlighted confirmation line is printed.
    Otherwise the report lists every highlight, the right-hand side decision
    boundary reported by the verifier, the MD that was supplied and the MDs
    the verifier suggests instead.

    Args:
        verifier: object exposing ``md_holds``, ``get_highlights``,
            ``get_true_rhs_decision_boundary``, ``get_input_md`` and
            ``get_md_suggestions``.
    """
    md_is_valid = verifier.md_holds()
    if not md_is_valid:
        violations = verifier.get_highlights()
        print(RED_CODE, "MD does not hold due to the following items:", DEFAULT_COLOR_CODE)
        for violation in violations:
            print(violation.to_string())

        true_boundary = verifier.get_true_rhs_decision_boundary()
        print(
            f"Desbordante suggests to use following right-hand side decision boundary: {true_boundary}\n"
        )

        provided_md = verifier.get_input_md()
        print(f"Following MD was provided:\n {provided_md}")

        print("Following MDs are suggested:")
        for suggestion in verifier.get_md_suggestions():
            print(f" {suggestion.to_string_active()}")
        print()
    else:
        print(GREEN_CODE, "MD holds", DEFAULT_COLOR_CODE, "\n")
|
||||||||||||||||||
|
||||||||||||||||||
def check_md(table: pd.DataFrame, params: MDParams) -> None:
    """Verify one matching dependency on ``table`` and print the report.

    Args:
        table: the data set to verify; every caller in this file passes the
            ``pandas.DataFrame`` returned by ``pd.read_csv`` (the previous
            ``str`` annotation was wrong).
        params: ``lhs``/``rhs`` column similarity classifiers, forwarded to
            the verifier's ``execute`` as keyword arguments.
    """
    algo = desbordante.md_verification.algorithms.Default()
    # Only ``left_table`` is supplied to ``load_data``.
    algo.load_data(left_table=table)

    algo.execute(**params)
    print_results(algo)
|
||||||||||||||||||
|
||||||||||||||||||
def _animals_md_params(lhs_boundary: float, rhs_boundary: float) -> MDParams:
    """Build the ``animal -> diet`` Levenshtein MD with the given decision boundaries."""
    # Column indices 2 and 3 are the ones the messages below refer to as
    # "animal" and "diet" respectively.
    animal_column, diet_column = 2, 3
    return {
        "lhs": [
            ColumnSimilarityClassifier(
                Levenshtein(animal_column, animal_column, 0.0), lhs_boundary
            )
        ],
        "rhs": ColumnSimilarityClassifier(
            Levenshtein(diet_column, diet_column, 0.0), rhs_boundary
        ),
    }


def animals_beverages_example() -> None:
    """Walk through MD verification on ``animals_beverages.csv``.

    The same ``animal -> diet`` MD is verified with several decision
    boundaries (1.0/1.0, 0.75/0.75, 0.76/0.75, 0.75/0.76) and then once more
    with 1.0/1.0 after the typos in the table have been corrected.
    """
    print("As a first example, let's look at the dataset animals_beverages.csv:")

    table_path = "examples/datasets/animals_beverages.csv"
    table = pd.read_csv(table_path)
    print(table)

    print(
        "\nLet's check whether the MD {Levenshtein(animal, animal) -> Levenshtein(diet, diet)} "
        "holds with decision boundaries 1.0 and 1.0 respectively.\n"
        "Levenshtein similarity with a decision boundary equal to 1.0 means that values must be equal.\n"
    )

    check_md(table, _animals_md_params(1.0, 1.0))

    print(
        "As we can see, this MD doesn't hold because of typos in the table.\n"
        "Let's relax our constraints and say that values are similar enough "
        "if their similarity measure is at least 0.75:\n"
    )

    check_md(table, _animals_md_params(0.75, 0.75))

    print(
        "As a result, the MD {[Levenshtein(animal, animal) >= 0.75] -> [Levenshtein(diet, diet) >= 0.75]} holds.\n"
        'With relaxed decision boundaries the MD tolerates small typos such as "beer" instead of "bear" '
        'or "mead" instead of "meat".\n\n'
    )

    print(
        "Now let's take a look at what happens if we increase the decision boundaries "
        "of the left-hand side and the right-hand side.\n"
        "For example, we will use 0.76 instead of 0.75."
    )
    print("Left-hand side:\n")

    check_md(table, _animals_md_params(0.76, 0.75))

    print(
        "As we can see, nothing changed. Now let's increase the right-hand side decision boundary:\n"
    )

    check_md(table, _animals_md_params(0.75, 0.76))

    print(
        'Values "meat" and "mead" have a Levenshtein similarity measure equal to 0.75, '
        "but we only accept a similarity measure of at least 0.76, so the MD doesn't hold.\n"
    )

    print("Let's check what changes if we correct the typos in the dataset.")

    table["animal"] = table["animal"].replace({"beer": "bear"})
    table["diet"] = table["diet"].replace({"mead": "meat"})

    print(f"Corrected dataset:\n\n{table}\n")
    print("Now let's check the MD with 1.0 decision boundaries again:")

    check_md(table, _animals_md_params(1.0, 1.0))
|
||||||||||||||||||
|
||||||||||||||||||
def _theatre_md_params(max_duration, lhs_boundary: float, rhs_boundary: float) -> MDParams:
    """Build the ``Title -> Duration`` MD with the given decision boundaries.

    The right-hand side uses a custom measure that scales the absolute
    difference of two durations by ``max_duration`` and subtracts it from 1.
    """

    def normalized_distance(d1, d2):
        return 1 - abs(int(d1) - int(d2)) / max_duration

    return {
        "lhs": [
            ColumnSimilarityClassifier(Levenshtein("Title", "Title", 0.0), lhs_boundary)
        ],
        "rhs": ColumnSimilarityClassifier(
            Custom(
                normalized_distance,
                "Duration",
                "Duration",
                symmetrical=True,
                equality_is_max=True,
                measure_name="normalized_distance",
                min_sim=0.0,
            ),
            rhs_boundary,
        ),
    }


def theatre_example() -> None:
    """Walk through MD verification with a numeric column on ``theatres_typos.csv``.

    The ``Title -> Duration`` MD is verified with boundaries 1.0/1.0, then
    0.75/1.0, and finally 0.75/0.96.
    """
    print(
        "Let's look at an example with numeric columns.\n"
        "We will use the theatres_typos.csv dataset for this purpose:\n"
    )

    table_path = "examples/datasets/theatres_typos.csv"
    table = pd.read_csv(table_path)
    print(table)

    print(
        "\nAs we can see, there are some typos in this dataset.\n"
        "We will try to verify the MD {Levenshtein(Title, Title) -> normalized_distance(Duration, Duration)}.\n"
        "First, let's check whether the MD with all decision boundaries equal to 1.0 holds:\n"
    )

    # The largest duration is the normalisation constant of the custom measure.
    max_duration = table["Duration"].max()

    check_md(table, _theatre_md_params(max_duration, 1.0, 1.0))

    print(
        "To make the MD tolerant of typos in titles, "
        "let's lower the left-hand side decision boundary to 0.75:\n"
    )

    check_md(table, _theatre_md_params(max_duration, 0.75, 1.0))

    print("More pairs violating the MD appeared.\n")

    print(
        "Desbordante suggests using a right-hand side decision boundary lower than 0.(96).\n"
        "Let's try to verify the MD "
        "{[Levenshtein(Title, Title) >= 0.75] -> [normalized_distance(Duration, Duration) >= 0.96]}\n"
    )

    check_md(table, _theatre_md_params(max_duration, 0.75, 0.96))

    print("As we can see, the MD now holds.")
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||||||||||||||
|
||||||||||||||||||
|
||||||||||||||||||
def main() -> None:
    """Print the introduction to MD verification and run both examples."""
    print(DEFAULT_COLOR_CODE)
    print(
        "This example demonstrates how to validate Matching Dependencies (MD) from "
        "'Efficient Discovery of Matching Dependencies' "
        "by Schirmer et al. using the Desbordante library. "
        "You can read about Matching Dependencies and their formal definition in the article below:\n"
        "https://hpi.de/fileadmin/user_upload/fachgebiete/naumann/publications/PDFs/2020_schirmer_efficient.pdf\n"
    )
    print(
        "The Matching Dependency verification algorithm accepts a left-hand side and a right-hand side "
        "and reports whether such a dependency holds. "
        "If the dependency doesn't hold, the algorithm also returns a list of highlights "
        "and suggests how to adjust the dependency."
    )
    print(
        "You can also read about mining Matching Dependencies in examples/basic/mining_md.py"
    )

    print(
        "To verify a Matching Dependency, we must first define Column Similarity Classifiers for the tables.\n"
        "A Column Similarity Classifier consists of a Column Match and a decision boundary. "
        "A Column Match consists of two column indices (one in the left table, one in the right table) "
        "and a similarity measure (Levenshtein similarity, for example).\n"
        "We will use the notation [measure(i, j) >= lambda] for a Column Similarity Classifier "
        "with the i-th column of the left table, the j-th column of the right table, "
        "similarity measure 'measure' and decision boundary 'lambda'. "
        'Also, notation like [measure("left_col_name", "right_col_name") >= lambda] is valid '
        'for a Column Match between the columns named "left_col_name" and "right_col_name" '
        "of the left and right tables respectively.\n"
    )
    animals_beverages_example()
    print("-" * 50)
    theatre_example()


if __name__ == "__main__":
    main()
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
Title,Theatre,Duration | ||
Don Quixote,Sydney Opera House,139 | ||
Don Quixte,Teatro alla Scala,135 | ||
Don Quixote,Grand Opera House,140 | ||
Cinderela,Teatro alla Scala,110 | ||
Cinderella,Grand Opera House,112 | ||
Romeoand Juliet,Sydney Opera House,160 | ||
Romeo and Juliet,Teatro alla Scala,163 | ||
Romeo and Juliet,Grand Opera House,165 |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
#pragma once

#include <memory>

#include "algorithms/md/hymd/preprocessing/column_matches/column_match.h"

namespace algos::md {
// Shared-ownership handle to a HyMD column match, used by the MD verifier.
using CMPtr = std::shared_ptr<hymd::preprocessing::column_matches::ColumnMatch>;
}  // namespace algos::md
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
@@ -0,0 +1,28 @@ | ||||||
#pragma once | ||||||
|
||||||
#include <sstream>
#include <utility>

#include "algorithms/md/hymd/preprocessing/column_matches/column_match.h"
#include "algorithms/md/md_verifier/cmptr.h"
|
||||||
namespace algos::md { | ||||||
class ColumnSimilarityClassifier { | ||||||
private: | ||||||
CMPtr column_match_; | ||||||
model::md::DecisionBoundary decision_boundary_; | ||||||
|
||||||
public: | ||||||
ColumnSimilarityClassifier() = default; | ||||||
|
||||||
ColumnSimilarityClassifier(CMPtr column_match, model::md::DecisionBoundary decision_boundary) | ||||||
: column_match_(column_match), decision_boundary_(decision_boundary) {} | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit:
Suggested change
|
||||||
|
||||||
CMPtr GetColumnMatch() const { | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit:
Suggested change
|
||||||
return column_match_; | ||||||
} | ||||||
|
||||||
model::md::DecisionBoundary GetDecisionBoundary() const { | ||||||
return decision_boundary_; | ||||||
} | ||||||
}; | ||||||
} // namespace algos::md | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: Add newline |
Original file line number | Diff line number | Diff line change | ||||||||
---|---|---|---|---|---|---|---|---|---|---|
@@ -0,0 +1,17 @@ | ||||||||||
#include "algorithms/md/md_verifier/highlights/highlights.h" | ||||||||||
|
||||||||||
namespace algos::md { | ||||||||||
// Builds one highlight per (left row, right row) pair found in `rows_pairs`,
// looking up the similarity of each pair in `rows_to_similarity`.
// `at` throws if a pair from `rows_pairs` has no entry in `rows_to_similarity`.
MDHighlights MDHighlights::CreateFrom(model::RhsSimilarityClassifierDesctription rhs_desc,
                                      RowsPairSet const& rows_pairs,
                                      RowsToSimilarityMap const& rows_to_similarity) {
    MDHighlights highlights;
    // Bind by const reference: a by-value structured binding would copy the
    // whole container of right rows on every iteration.
    for (auto const& [left_row, right_rows_set] : rows_pairs) {
        for (auto const& right_row : right_rows_set) {
            highlights.highlights_.emplace_back(left_row, right_row, rhs_desc,
                                                rows_to_similarity.at({left_row, right_row}));
        }
    }

    return highlights;
}
} // namespace algos::md |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,63 @@ | ||
#pragma once | ||
|
||
#include <sstream> | ||
|
||
#include "algorithms/md/md.h" | ||
#include "algorithms/md/md_verifier/validation/rows_pairs.h" | ||
#include "algorithms/md/similarity.h" | ||
#include "config/indices/type.h" | ||
#include "model/index.h" | ||
|
||
namespace algos::md { | ||
class MDHighlights { | ||
public: | ||
struct Highlight { | ||
model::Index left_table_row; | ||
model::Index right_table_row; | ||
model::md::Similarity similarity; | ||
model::RhsSimilarityClassifierDesctription rhs_decs; | ||
|
||
Highlight(model::Index left_table_row, model::Index right_table_row, | ||
model::RhsSimilarityClassifierDesctription rhs_decs, | ||
model::md::Similarity similarity) | ||
: left_table_row(left_table_row), | ||
right_table_row(right_table_row), | ||
similarity(similarity), | ||
rhs_decs(rhs_decs) {} | ||
|
||
std::string ToString() const { | ||
std::stringstream ss; | ||
ss << rhs_decs.column_match_description.column_match_name << '(' | ||
<< rhs_decs.column_match_description.left_column_description.column_name << ", " | ||
<< rhs_decs.column_match_description.right_column_description.column_name | ||
<< ") violates MD in " << left_table_row << " row of left table and " | ||
<< right_table_row << " row of right table with similarity " << similarity | ||
<< " and decision boundary " << rhs_decs.decision_boundary; | ||
return ss.str(); | ||
}; | ||
|
||
std::string ToStringIndexes() const { | ||
std::stringstream ss; | ||
ss << rhs_decs.column_match_description.column_match_name << '(' | ||
<< rhs_decs.column_match_description.left_column_description.column_index << ", " | ||
<< rhs_decs.column_match_description.right_column_description.column_index | ||
<< ") violates MD in " << left_table_row << " row of left table and " | ||
<< right_table_row << " row of right table with similarity " << similarity | ||
<< " and decision boundary " << rhs_decs.decision_boundary; | ||
return ss.str(); | ||
}; | ||
}; | ||
|
||
private: | ||
std::vector<Highlight> highlights_; | ||
|
||
public: | ||
std::vector<Highlight> const& GetHighlights() const { | ||
return highlights_; | ||
} | ||
|
||
static MDHighlights CreateFrom(model::RhsSimilarityClassifierDesctription rhs_desc, | ||
RowsPairSet const& rows_pairs, | ||
RowsToSimilarityMap const& rows_to_similarity); | ||
}; | ||
} // namespace algos::md |
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This outputs
Perhaps you want
for