Desbordante · maybenotilya · Nov 6, 2024 · Nov 6, 2024 · Nov 9, 2024 · Nov 22, 2024
diff --git a/examples/basic/verifying_md.py b/examples/basic/verifying_md.py
@@ -0,0 +1,233 @@
+import desbordante
+import pandas as pd
+
+from typing import TypedDict
+from desbordante.md import ColumnSimilarityClassifier
+from desbordante.md.column_matches import Levenshtein, Custom
+
+GREEN_CODE = "\033[1;42m"
+RED_CODE = "\033[1;41m"
+BLUE_CODE = "\033[1;46m"
+DEFAULT_COLOR_CODE = "\033[1;49m"
+
+
+class MDParams(TypedDict):
+    """Keyword arguments that check_md() unpacks into the verifier's execute() call."""
+
+    # Column similarity classifiers that make up the left-hand side of the MD.
+    lhs: list[ColumnSimilarityClassifier]
+    # Column similarity classifier that makes up the right-hand side of the MD.
+    rhs: ColumnSimilarityClassifier
+
+
+def print_results(verifier):
+    if verifier.md_holds():
+        print(GREEN_CODE, "MD holds", DEFAULT_COLOR_CODE, "\n")
-        print(GREEN_CODE, "MD holds", DEFAULT_COLOR_CODE, "\n")
+        print(f"{GREEN_CODE}MD holds{DEFAULT_COLOR_CODE}\n")
-        print(GREEN_CODE, "MD holds", DEFAULT_COLOR_CODE, "\n")
+        print(f"{GREEN_CODE}MD holds{DEFAULT_COLOR_CODE}\n")
+        return
+
+    highlights = verifier.get_highlights()
+    print(RED_CODE, "MD does not hold due to the following items:", DEFAULT_COLOR_CODE)
+    for highlight in highlights:
+        print(highlight.to_string())
+    print(
+        f"Desbordante suggests to use following right-hand side decision boundary: {verifier.get_true_rhs_decision_boundary()}\n"
-        f"Desbordante suggests to use following right-hand side decision boundary: {verifier.get_true_rhs_decision_boundary()}\n"
+        f"Desbordante suggests to use the following right-hand side decision boundary: {verifier.get_true_rhs_decision_boundary()}\n"
-        f"Desbordante suggests to use following right-hand side decision boundary: {verifier.get_true_rhs_decision_boundary()}\n"
+        f"Desbordante suggests to use the following right-hand side decision boundary: {verifier.get_true_rhs_decision_boundary()}\n"
+    )
+    print(f"Following MD was provided:\n  {verifier.get_input_md()}")
+    print("Following MDs are suggested:")
-    print(f"Following MD was provided:\n  {verifier.get_input_md()}")
-    print("Following MDs are suggested:")
+    print(f"The following MD was provided:\n  {verifier.get_input_md()}")
+    print("The following MDs are suggested:")
-    print(f"Following MD was provided:\n  {verifier.get_input_md()}")
-    print("Following MDs are suggested:")
+    print(f"The following MD was provided:\n  {verifier.get_input_md()}")
+    print("The following MD is suggested:")
-    print(f"Following MD was provided:\n  {verifier.get_input_md()}")
-    print("Following MDs are suggested:")
+    print(f"The following MD was provided:\n  {verifier.get_input_md()}")
+    print("The following MDs are suggested:")
-    print(f"Following MD was provided:\n  {verifier.get_input_md()}")
-    print("Following MDs are suggested:")
+    print(f"The following MD was provided:\n  {verifier.get_input_md()}")
+    print("The following MD is suggested:")
+    for md in verifier.get_md_suggestions():
+        print(f"  {md.to_string_active()}")
+    print()
+
+
+def check_md(table: str, params: MDParams):
+    algo = desbordante.md_verification.algorithms.Default()
+    algo.load_data(left_table=table)
+
+    algo.execute(**params)
+    print_results(algo)
+
+
+def animals_beverages_example():
+    print("As first example, let's look at the dataset animals_beverages.csv")
-    print("As first example, let's look at the dataset animals_beverages.csv")
+    print("As the first example, let's look at the animals_beverages.csv dataset")
-    print("As first example, let's look at the dataset animals_beverages.csv")
+    print("As the first example, let's look at the animals_beverages.csv dataset")
+
+    table_path = "examples/datasets/animals_beverages.csv"
+    table = pd.read_csv(table_path)
+    print(table)
+
+    print(
+        "\nLet's try to check if MD {Levenshtein(animal, animal) -> Levenshtein(diet, diet)} with decision boundaries {1.0} and {1.0} respectively.\n"
-        "\nLet's try to check if MD {Levenshtein(animal, animal) -> Levenshtein(diet, diet)} with decision boundaries {1.0} and {1.0} respectively.\n"
+        "\nLet's try to check if the MD {Levenshtein(animal, animal) -> Levenshtein(diet, diet)} holds with decision boundaries {1.0} and {1.0} respectively.\n"
-        "\nLet's try to check if MD {Levenshtein(animal, animal) -> Levenshtein(diet, diet)} with decision boundaries {1.0} and {1.0} respectively.\n"
+        "\nLet's try to check if the MD {Levenshtein(animal, animal) -> Levenshtein(diet, diet)} holds with decision boundaries {1.0} and {1.0} respectively.\n"
+        "Levenshtein similarity with decision boundary equal to 1.0 means that values must be equal.\n"
+    )
+
+    params = {
+        "lhs": [ColumnSimilarityClassifier(Levenshtein(2, 2, 0.0), 1)],
+        "rhs": ColumnSimilarityClassifier(Levenshtein(3, 3, 0.0), 1),
+    }
+
+    check_md(table, params)
+
+    print(
+        "As we see, such MD doesn't holds due to some sort of typo in table.\n"
-        "As we see, such MD doesn't holds due to some sort of typo in table.\n"
+        "As we see, this MD doesn't hold due to some sort of typo in the table.\n"
-        "As we see, such MD doesn't holds due to some sort of typo in table.\n"
+        "As we see, this MD doesn't hold due to some sort of typo in the table.\n"
+        "Let's ease our constraints and say that if column's similarity measure is at least 0.75, they are similar enough:\n"
-        "Let's ease our constraints and say that if column's similarity measure is at least 0.75, they are similar enough:\n"
+        "Let's relax our constraints and say that if the similarity between the values in the column is at least 0.75, they are similar enough:\n"
-        "Let's ease our constraints and say that if column's similarity measure is at least 0.75, they are similar enough:\n"
+        "Let's relax our constraints and say that if the similarity between the values in the column is at least 0.75, they are similar enough:\n"
+    )
+
+    params = {
+        "lhs": [ColumnSimilarityClassifier(Levenshtein(2, 2, 0.0), 0.75)],
+        "rhs": ColumnSimilarityClassifier(Levenshtein(3, 3, 0.0), 0.75),
+    }
+
+    check_md(table, params)
+
+    print(
+        "As a result, MD {[Levenshtein(animal, animal) >= 0.75] -> [Levenshtein(diet, diet) >= 0.75]} holds.\n"
-        "As a result, MD {[Levenshtein(animal, animal) >= 0.75] -> [Levenshtein(diet, diet) >= 0.75]} holds.\n"
+        "Now, MD {[Levenshtein(animal, animal) >= 0.75] -> [Levenshtein(diet, diet) >= 0.75]} holds.\n"
-        "As a result, MD {[Levenshtein(animal, animal) >= 0.75] -> [Levenshtein(diet, diet) >= 0.75]} holds.\n"
+        "Now, MD {[Levenshtein(animal, animal) >= 0.75] -> [Levenshtein(diet, diet) >= 0.75]} holds.\n"
+        "This is how MD can be helpful in avoiding typos in table and searching them\n\n"
+    )
+
+    print(
+        "Now let's take a look that happen if we increase decision boundary of left-hand side and right-hand side\n"
-        "Now let's take a look that happen if we increase decision boundary of left-hand side and right-hand side\n"
+        "Now let's take a look at what happens if we increase the decision boundary in the left-hand side and the right-hand side\n"
-        "Now let's take a look that happen if we increase decision boundary of left-hand side and right-hand side\n"
+        "Now let's take a look at what happens if we increase the decision boundary in the left-hand side and the right-hand side\n"
+        "For example, we will take 0.76 instead of 0.75"
-        "For example, we will take 0.76 instead of 0.75"
+        "For example, we will set it to 0.76 instead of 0.75"
-        "For example, we will take 0.76 instead of 0.75"
+        "For example, we will set it to 0.76 instead of 0.75"
+    )
+    print("Left-hand side:\n")
+
+    params = {
+        "lhs": [ColumnSimilarityClassifier(Levenshtein(2, 2, 0.0), 0.76)],
+        "rhs": ColumnSimilarityClassifier(Levenshtein(3, 3, 0.0), 0.75),
+    }
+
+    check_md(table, params)
+
+    print(
+        "As we can see, nothing changed. Now let's increase right-hand side decision boundary:\n"
-        "As we can see, nothing changed. Now let's increase right-hand side decision boundary:\n"
+        "As we can see, nothing changed. Now let's increase the right-hand side decision boundary:\n"
-        "As we can see, nothing changed. Now let's increase right-hand side decision boundary:\n"
+        "As we can see, nothing changed. Now let's increase the right-hand side decision boundary:\n"
+    )
+
+    params = {
+        "lhs": [ColumnSimilarityClassifier(Levenshtein(2, 2, 0.0), 0.75)],
+        "rhs": ColumnSimilarityClassifier(Levenshtein(3, 3, 0.0), 0.76),
+    }
+
+    check_md(table, params)
+
+    print(
+        'Values "meat" and "mead" have Levenshtein similarity measure equal to 0.75, but we accept similarity measure at least 0.76, so MD doesn\'t holds.\n'
-        'Values "meat" and "mead" have Levenshtein similarity measure equal to 0.75, but we accept similarity measure at least 0.76, so MD doesn\'t holds.\n'
+        'The values "meat" and "mead" have a similarity of 0.75 according to the Levenshtein similarity measure, but we require the similarity to be at least 0.76, so the MD doesn\'t hold.\n'
-        'Values "meat" and "mead" have Levenshtein similarity measure equal to 0.75, but we accept similarity measure at least 0.76, so MD doesn\'t holds.\n'
+        'The values "meat" and "mead" have a similarity of 0.75 according to the Levenshtein similarity measure, but we require the similarity to be at least 0.76, so the MD doesn\'t hold.\n'
+    )
+
+    print("Let's check that changes if we corrent typos in dataset")
-    print("Let's check that changes if we corrent typos in dataset")
+    print("Let's check if that changes if we correct typos in the dataset")
-    print("Let's check that changes if we corrent typos in dataset")
+    print("Let's check if that changes if we correct typos in the dataset")
+
+    table["animal"] = table["animal"].replace({"beer": "bear"})
+    table["diet"] = table["diet"].replace({"mead": "meat"})
+
+    print(f"Corrected dataset:\n\n{table}\n")
+    print("Now let's check MD with 1.0 decision boundaries again")
-    print("Now let's check MD with 1.0 decision boundaries again")
+    print("Now let's check the MD with 1.0 decision boundaries again")
-    print("Now let's check MD with 1.0 decision boundaries again")
+    print("Now let's check the MD with 1.0 decision boundaries again")
+
+    params = {
+        "lhs": [ColumnSimilarityClassifier(Levenshtein(2, 2, 0.0), 1.0)],
+        "rhs": ColumnSimilarityClassifier(Levenshtein(3, 3, 0.0), 1.0),
+    }
+
+    check_md(table, params)
+
+
+def theatre_example():
+    print(
+        "Let's look at the example with numeric columns.\nWe will use theatre.csv dataset for such purpose:\n"
-        "Let's look at the example with numeric columns.\nWe will use theatre.csv dataset for such purpose:\n"
+        "Let's look at the example with numeric columns.\nWe will use theatre.csv dataset for this purpose:\n"
-        "Let's look at the example with numeric columns.\nWe will use theatre.csv dataset for such purpose:\n"
+        "Let's look at the example with numeric columns.\nWe will use theatre.csv dataset for this purpose:\n"
-        "Let's look at the example with numeric columns.\nWe will use theatre.csv dataset for such purpose:\n"
+        "Let's look at the example with numeric columns.\nWe will use theatre.csv dataset for this purpose:\n"
-        "Let's look at the example with numeric columns.\nWe will use theatre.csv dataset for such purpose:\n"
+        "Let's look at the example with numeric columns.\nWe will use theatre.csv dataset for this purpose:\n"
+    )
+
+    table_path = "examples/datasets/theatres_typos.csv"
+    table = pd.read_csv(table_path)
+    print(table)
+
+    print(
+        "\nAs we see, there are some typos in this dataset.\n"
+        "We will try to discover MD {Levenshtein(Title, Title) -> NormalizedDistance(Duration, Duration)}\nFirstly, let's check if MD with all decision boundaries equal to 1.0 holds:\n"
-        "We will try to discover MD {Levenshtein(Title, Title) -> NormalizedDistance(Duration, Duration)}\nFirstly, let's check if MD with all decision boundaries equal to 1.0 holds:\n"
+        "We will try to discover the MD {Levenshtein(Title, Title) -> NormalizedDistance(Duration, Duration)}\nFirstly, let's check if the MD with all decision boundaries equal to 1.0 holds:\n"
-        "We will try to discover MD {Levenshtein(Title, Title) -> NormalizedDistance(Duration, Duration)}\nFirstly, let's check if MD with all decision boundaries equal to 1.0 holds:\n"
+        "We will try to discover the MD {Levenshtein(Title, Title) -> NormalizedDistance(Duration, Duration)}\nFirstly, let's check if the MD with all decision boundaries equal to 1.0 holds:\n"
+    )
+
+    max_duration = max(table["Duration"])
+
+    params = {
+        "lhs": [ColumnSimilarityClassifier(Levenshtein("Title", "Title", 0.0), 1)],
+        "rhs": ColumnSimilarityClassifier(
+            Custom(
+                lambda d1, d2: 1 - abs(int(d1) - int(d2)) / max_duration,
+                "Duration",
+                "Duration",
+                symmetrical=True,
+                equality_is_max=True,
+                measure_name="normalized_distance",
+                min_sim=0.0,
+            ),
+            1,
+        ),
+    }
+
+    check_md(table, params)
+
+    print("To avoid typos, let's set left-hand side decision boundary to 0.75:\n")
-    print("To avoid typos, let's set left-hand side decision boundary to 0.75:\n")
+    print("To ??? typos, let's set the decision boundary in the left-hand side to 0.75:\n")
-    print("To avoid typos, let's set left-hand side decision boundary to 0.75:\n")
+    print("To ??? typos, let's set the decision boundary in the left-hand side to 0.75:\n")
+
+    params = {
+        "lhs": [ColumnSimilarityClassifier(Levenshtein("Title", "Title", 0.0), 0.75)],
+        "rhs": ColumnSimilarityClassifier(
+            Custom(
+                lambda d1, d2: 1 - abs(int(d1) - int(d2)) / max_duration,
+                "Duration",
+                "Duration",
+                symmetrical=True,
+                equality_is_max=True,
+                measure_name="normalized_distance",
+                min_sim=0.0,
+            ),
+            1,
+        ),
+    }
+
+    check_md(table, params)
+
+    print("More pairs violationg MD appeared.\n")
-    print("More pairs violationg MD appeared.\n")
+    print("More pairs violating the MD appeared.\n")
-    print("More pairs violationg MD appeared.\n")
+    print("More pairs violating the MD appeared.\n")
+
+    print(
+        "Desbordante suggest to use right-hand side decision boundary lower than 0.(96).\n"
-        "Desbordante suggest to use right-hand side decision boundary lower than 0.(96).\n"
+        "Desbordante suggests to use a right-hand side decision boundary lower than 0.(96).\n"
-        "Desbordante suggest to use right-hand side decision boundary lower than 0.(96).\n"
+        "Desbordante suggests to use a right-hand side decision boundary lower than 0.(96).\n"
+        "Let's try to verify MD {[Levenshtein(Title, Title) >= 0.75] -> [Levenshtein(Duration, Duration) >= 0.96]}\n"
-        "Let's try to verify MD {[Levenshtein(Title, Title) >= 0.75] -> [Levenshtein(Duration, Duration) >= 0.96]}\n"
+        "Let's try to verify the MD {[Levenshtein(Title, Title) >= 0.75] -> [Levenshtein(Duration, Duration) >= 0.96]}\n"
-        "Let's try to verify MD {[Levenshtein(Title, Title) >= 0.75] -> [Levenshtein(Duration, Duration) >= 0.96]}\n"
+        "Let's try to verify the MD {[Levenshtein(Title, Title) >= 0.75] -> [Levenshtein(Duration, Duration) >= 0.96]}\n"
+    )
+
+    params = {
+        "lhs": [ColumnSimilarityClassifier(Levenshtein("Title", "Title", 0.0), 0.75)],
+        "rhs": ColumnSimilarityClassifier(
+            Custom(
+                lambda d1, d2: 1 - abs(int(d1) - int(d2)) / max_duration,
+                "Duration",
+                "Duration",
+                symmetrical=True,
+                equality_is_max=True,
+                measure_name="normalized_distance",
+                min_sim=0.0,
+            ),
+            0.96,
+        ),
+    }
+
+    check_md(table, params)
+
+    print("As we see, now everything holds.")
-    print("As we see, now everything holds.")
+    print("As we can see, the MD holds now.")
-    print("As we see, now everything holds.")
+    print("As we can see, the MD holds now.")
+
+
+if __name__ == "__main__":
+    print(DEFAULT_COLOR_CODE)
+    print(
+        "This example demonstrates how to validate Matching Dependancies (MD) from 'Efficient Discovery of Matching Dependencies' "
-        "This example demonstrates how to validate Matching Dependancies (MD) from 'Efficient Discovery of Matching Dependencies' "
+        "This example demonstrates how to validate Matching Dependancies (MD) as defined in 'Efficient Discovery of Matching Dependencies' "
-        "This example demonstrates how to validate Matching Dependancies (MD) from 'Efficient Discovery of Matching Dependencies' "
+        "This example demonstrates how to validate Matching Dependancies (MD) as defined in 'Efficient Discovery of Matching Dependencies' "
+        "by Schirmer et al. using the Desbordante library. "
+        "You can read about Matching Dependancies and their formal definition in article below\n"
-        "You can read about Matching Dependancies and their formal definition in article below\n"
+        "You can read about Matching Dependancies and their formal definition in the article below\n"
-        "You can read about Matching Dependancies and their formal definition in article below\n"
+        "You can read about Matching Dependancies and their formal definition in the article below\n"
+        "https://hpi.de/fileadmin/user_upload/fachgebiete/naumann/publications/PDFs/2020_schirmer_efficient.pdf\n"
+    )
+    print(
+        "Matching dependancies verification algorithm accepts left-hand side and right-hand side and returns if such dependancy holds. "
-        "Matching dependancies verification algorithm accepts left-hand side and right-hand side and returns if such dependancy holds. "
+        "The matching dependency verification algorithm accepts the left-hand side and right-hand side and determines if the specified dependency holds. "
-        "Matching dependancies verification algorithm accepts left-hand side and right-hand side and returns if such dependancy holds. "
+        "The matching dependency verification algorithm accepts the left-hand side and right-hand side and determines if the specified dependency holds. "
+        "Also in case if dependancy doesn't hold, algorithm also returns list of highlights and suggests how to adjust dependancy."
-        "Also in case if dependancy doesn't hold, algorithm also returns list of highlights and suggests how to adjust dependancy."
+        "Also, in case if dependency doesn't hold, the algorithm returns a list of highlights and suggests how to adjust the dependency."
-        "Also in case if dependancy doesn't hold, algorithm also returns list of highlights and suggests how to adjust dependancy."
+        "Also, in case if dependency doesn't hold, the algorithm returns a list of highlights and suggests how to adjust the dependency."
+    )
+    print(
+        "You can also read about mining Matching Dependancies in examples/basic/mining_md.py"
+    )
+
+    print(
+        "To verify Matching Dependancy, firstly we must define Column Similarity Classifiers for tables.\n"
-        "To verify Matching Dependancy, firstly we must define Column Similarity Classifiers for tables.\n"
+        "To verify a Matching Dependency, we must first define Column Similarity Classifiers for the data.\n"
-        "To verify Matching Dependancy, firstly we must define Column Similarity Classifiers for tables.\n"
+        "To verify a Matching Dependency, we must first define Column Similarity Classifiers for the data.\n"
+        "Column Similarity Classifier consists of Column Match and decision boundary. "
-        "Column Similarity Classifier consists of Column Match and decision boundary. "
+        "A Column Similarity Classifier consists of a Column Match and a decision boundary. "
-        "Column Similarity Classifier consists of Column Match and decision boundary. "
+        "A Column Similarity Classifier consists of a Column Match and a decision boundary. "
+        "Column Match consists of two indices: columns in left and right table and similarity measure (Levevnshtein Similarity, for example).\n"
-        "Column Match consists of two indices: columns in left and right table and similarity measure (Levevnshtein Similarity, for example).\n"
+        "A Column Match consists of two indices—the columns in the left and right tables—and a similarity measure (Levenshtein Similarity, for example).\n"
-        "Column Match consists of two indices: columns in left and right table and similarity measure (Levevnshtein Similarity, for example).\n"
+        "A Column Match consists of two indices—the columns in the left and right tables—and a similarity measure (Levenshtein Similarity, for example).\n"
+        "We will use notation [measure(i, j) >= lambda] for Column Similarity Classifier with i'th column of left table, j'th column of right table, similarity measure 'measure' and decision boundary 'lambda'. "
-        "We will use notation [measure(i, j) >= lambda] for Column Similarity Classifier with i'th column of left table, j'th column of right table, similarity measure 'measure' and decision boundary 'lambda'. "
+        "We will use the notation [measure(i, j) >= lambda] for a Column Similarity Classifier that specifies the i'th column of the left table, the j'th column of the right table, the similarity measure 'measure' and the decision boundary 'lambda'. "
-        "We will use notation [measure(i, j) >= lambda] for Column Similarity Classifier with i'th column of left table, j'th column of right table, similarity measure 'measure' and decision boundary 'lambda'. "
+        "We will use the notation [measure(i, j) >= lambda] for a Column Similarity Classifier that specifies the i'th column of the left table, the j'th column of the right table, the similarity measure 'measure' and the decision boundary 'lambda'. "
+        'Also, notation like [measure("left_col_name", "right_col_name") >= lambda] is valid for ColumnMatch between column with "left_col_name" and "left_col_name" of left and right tables respectively.\n'
-        'Also, notation like [measure("left_col_name", "right_col_name") >= lambda] is valid for ColumnMatch between column with "left_col_name" and "left_col_name" of left and right tables respectively.\n'
+        'Also, notation like [measure("left_col_name", "right_col_name") >= lambda] is also valid for a ColumnMatch specifying the columns "left_col_name" and "right_col_name" of the left and right tables respectively.\n'
-        'Also, notation like [measure("left_col_name", "right_col_name") >= lambda] is valid for ColumnMatch between column with "left_col_name" and "left_col_name" of left and right tables respectively.\n'
+        'Also, notation like [measure("left_col_name", "right_col_name") >= lambda] is also valid for a ColumnMatch specifying the columns "left_col_name" and "right_col_name" of the left and right tables respectively.\n'
+    )
+    animals_beverages_example()
+    print("-" * 50)
+    theatre_example()
diff --git a/examples/datasets/theatres_typos.csv b/examples/datasets/theatres_typos.csv
@@ -0,0 +1,9 @@
+Title,Theatre,Duration
+Don Quixote,Sydney Opera House,139
+Don Quixte,Teatro alla Scala,135
+Don Quixote,Grand Opera House,140
+Cinderela,Teatro alla Scala,110
+Cinderella,Grand Opera House,112
+Romeoand Juliet,Sydney Opera House,160
+Romeo and Juliet,Teatro alla Scala,163
+Romeo and Juliet,Grand Opera House,165
diff --git a/src/core/algorithms/algorithms.h b/src/core/algorithms/algorithms.h
@@ -11,6 +11,7 @@
 #include "algorithms/ind/ind_verifier/ind_verifier.h"
 #include "algorithms/ind/mining_algorithms.h"
 #include "algorithms/md/mining_algorithms.h"
+#include "algorithms/md/verification_algorithms.h"
 #include "algorithms/metric/verification_algorithms.h"
 #include "algorithms/nar/mining_algorithms.h"
 #include "algorithms/od/mining_algorithms.h"

diff --git a/src/core/algorithms/md/md_verifier/cmptr.h b/src/core/algorithms/md/md_verifier/cmptr.h
@@ -0,0 +1,10 @@
+#pragma once
+
+#include <memory>
+
+#include "algorithms/md/hymd/preprocessing/column_matches/column_match.h"
+
+namespace algos::md {
+// Shared-ownership handle to a HyMD column match; stored by ColumnSimilarityClassifier.
+using CMPtr = std::shared_ptr<hymd::preprocessing::column_matches::ColumnMatch>;
+}  // namespace algos::md
diff --git a/src/core/algorithms/md/md_verifier/column_similarity_classifier.h b/src/core/algorithms/md/md_verifier/column_similarity_classifier.h
@@ -0,0 +1,28 @@
+#pragma once
+
+#include <sstream>
+#include <utility>
+
+#include "algorithms/md/hymd/preprocessing/column_matches/column_match.h"
+#include "algorithms/md/md_verifier/cmptr.h"
+
+namespace algos::md {
+// Pairs a column match with the decision boundary its similarity is compared against.
+class ColumnSimilarityClassifier {
+private:
+    CMPtr column_match_;
+    // Value-initialized so that a default-constructed classifier never exposes an
+    // indeterminate boundary through GetDecisionBoundary().
+    model::md::DecisionBoundary decision_boundary_{};
+
+public:
+    // Creates a classifier with a null column match and a value-initialized boundary.
+    ColumnSimilarityClassifier() = default;
+
+    // `column_match` is taken by value and moved into the member, so passing an rvalue
+    // costs no extra reference-count update.
+    ColumnSimilarityClassifier(CMPtr column_match, model::md::DecisionBoundary decision_boundary)
+        : column_match_(std::move(column_match)), decision_boundary_(decision_boundary) {}
+
+    // Returns a reference to the stored pointer; it stays valid only while this
+    // classifier is alive. Copy the result to share ownership.
+    CMPtr const& GetColumnMatch() const {
+        return column_match_;
+    }
+
+    model::md::DecisionBoundary GetDecisionBoundary() const {
+        return decision_boundary_;
+    }
+};
+}  // namespace algos::md
diff --git a/src/core/algorithms/md/md_verifier/highlights/highlights.cpp b/src/core/algorithms/md/md_verifier/highlights/highlights.cpp
@@ -0,0 +1,17 @@
+#include "algorithms/md/md_verifier/highlights/highlights.h"
+
+namespace algos::md {
+// Builds one Highlight per (left row, right row) pair listed in `rows_pairs`, attaching
+// `rhs_desc` and the similarity looked up for that pair in `rows_to_similarity`.
+MDHighlights MDHighlights::CreateFrom(model::RhsSimilarityClassifierDesctription rhs_desc,
+                                      RowsPairSet const& rows_pairs,
+                                      RowsToSimilarityMap const& rows_to_similarity) {
+    MDHighlights highlights;
+    // Bind by const reference so the set of right rows is not copied on every iteration.
+    for (auto const& [left_row, right_rows_set] : rows_pairs) {
+        for (model::Index right_row : right_rows_set) {
+            // NOTE(review): assumes every pair in rows_pairs has an entry in
+            // rows_to_similarity; confirm what at() does for a missing key.
+            highlights.highlights_.emplace_back(left_row, right_row, rhs_desc,
+                                                rows_to_similarity.at({left_row, right_row}));
+        }
+    }
+
+    // Return the local by name: this permits copy elision and otherwise moves implicitly.
+    return highlights;
+}
+}  // namespace algos::md
diff --git a/src/core/algorithms/md/md_verifier/highlights/highlights.h b/src/core/algorithms/md/md_verifier/highlights/highlights.h
@@ -0,0 +1,63 @@
+#pragma once
+
+#include <sstream>
+
+#include "algorithms/md/md.h"
+#include "algorithms/md/md_verifier/validation/rows_pairs.h"
+#include "algorithms/md/similarity.h"
+#include "config/indices/type.h"
+#include "model/index.h"
+
+namespace algos::md {
+// Holds the row pairs that violate a matching dependency, together with the
+// right-hand side classifier description and the similarity of each pair.
+class MDHighlights {
+public:
+    // A single violation: one row of the left table paired with one row of the right table.
+    struct Highlight {
+        model::Index left_table_row;
+        model::Index right_table_row;
+        model::md::Similarity similarity;
+        model::RhsSimilarityClassifierDesctription rhs_decs;
+
+        Highlight(model::Index left_table_row, model::Index right_table_row,
+                  model::RhsSimilarityClassifierDesctription rhs_decs,
+                  model::md::Similarity similarity)
+            : left_table_row(left_table_row),
+              right_table_row(right_table_row),
+              similarity(similarity),
+              rhs_decs(rhs_decs) {}
+
+        // Describes the violation, identifying the compared columns by name.
+        std::string ToString() const {
+            auto const& column_match = rhs_decs.column_match_description;
+            return Describe(column_match.left_column_description.column_name,
+                            column_match.right_column_description.column_name);
+        }
+
+        // Describes the violation, identifying the compared columns by index.
+        std::string ToStringIndexes() const {
+            auto const& column_match = rhs_decs.column_match_description;
+            return Describe(column_match.left_column_description.column_index,
+                            column_match.right_column_description.column_index);
+        }
+
+    private:
+        // Shared message builder; the two arguments are streamed as the column labels.
+        template <typename LeftColumn, typename RightColumn>
+        std::string Describe(LeftColumn const& left_column,
+                             RightColumn const& right_column) const {
+            std::stringstream message;
+            message << rhs_decs.column_match_description.column_match_name << '(';
+            message << left_column << ", " << right_column;
+            message << ") violates MD in " << left_table_row << " row of left table and ";
+            message << right_table_row << " row of right table with similarity " << similarity;
+            message << " and decision boundary " << rhs_decs.decision_boundary;
+            return message.str();
+        }
+    };
+
+private:
+    std::vector<Highlight> highlights_;
+
+public:
+    // Read-only access to the collected highlights.
+    std::vector<Highlight> const& GetHighlights() const {
+        return highlights_;
+    }
+
+    // Defined in highlights.cpp.
+    static MDHighlights CreateFrom(model::RhsSimilarityClassifierDesctription rhs_desc,
+                                   RowsPairSet const& rows_pairs,
+                                   RowsToSimilarityMap const& rows_to_similarity);
+};
+}  // namespace algos::md