Merge pull request #1642 from moj-analytical-services/update_sqlglot

ThomasHepworth · web-flow · commit 26d71481bbdb · 2023-10-11T14:40:48.000+01:00
Update sqlglot to >=13.0.0
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -14,8 +14,7 @@ jsonschema = ">=3.2,<5.0"
 # 1.3.5 is the last version supporting py 3.7.1
 pandas = ">1.3.0"
 duckdb = ">=0.8.0"
-# normalize issue in sqlglot - temporarily exclude updates
-sqlglot = ">=7.0.0,<11.4.2"
+sqlglot = ">=13.0.0, <19.0.0"
 altair = "^5.0.1"
 Jinja2 = ">=3.0.3"
 phonetics = "^1.0.5"
diff --git a/splink/comparison_level.py b/splink/comparison_level.py
@@ -10,6 +10,7 @@
 import sqlglot
 from sqlglot.expressions import Identifier
 from sqlglot.optimizer.normalize import normalize
+from sqlglot.optimizer.simplify import simplify
 
 from .constants import LEVEL_NOT_OBSERVED_TEXT
 from .default_from_jsonschema import default_value_from_schema
@@ -495,7 +496,7 @@ def _is_exact_match(self):
         sql_syntax_tree = sqlglot.parse_one(
             self.sql_condition.lower(), read=self.sql_dialect
         )
-        sql_cnf = normalize(sql_syntax_tree)
+        sql_cnf = simplify(normalize(sql_syntax_tree))
 
         exprs = _get_and_subclauses(sql_cnf)
         for expr in exprs:
@@ -508,7 +509,7 @@ def _exact_match_colnames(self):
         sql_syntax_tree = sqlglot.parse_one(
             self.sql_condition.lower(), read=self.sql_dialect
         )
-        sql_cnf = normalize(sql_syntax_tree)
+        sql_cnf = simplify(normalize(sql_syntax_tree))
 
         exprs = _get_and_subclauses(sql_cnf)
         for expr in exprs:
diff --git a/splink/input_column.py b/splink/input_column.py
@@ -227,14 +227,11 @@ def _get_dialect_quotes(dialect):
 
 
 def _get_sqlglot_dialect_quotes(dialect: sqlglot.Dialect):
-    # TODO: once we drop support for sqlglot < 6.0.0, we can simplify this
     try:
-        # For sqlglot < 6.0.0
-        quotes = dialect.identifiers
-        quote = '"' if '"' in quotes else quotes[0]
-        start = end = quote
+        # For sqlglot >= 16.0.0
+        start = dialect.IDENTIFIER_START
+        end = dialect.IDENTIFIER_END
     except AttributeError:
-        # For sqlglot >= 6.0.0
         start = dialect.identifier_start
         end = dialect.identifier_end
     return start, end
diff --git a/tests/test_comparison_level.py b/tests/test_comparison_level.py
@@ -0,0 +1,85 @@
+from pytest import mark, raises
+
+from splink.comparison_level import ComparisonLevel
+
+from .decorator import mark_with_dialects_excluding
+
+
+def make_comparison_level(sql_condition, dialect):
+    return ComparisonLevel(
+        {
+            "sql_condition": sql_condition,
+            "label_for_charts": "nice_informative_label",
+        },
+        sql_dialect=dialect,
+    )
+
+
+# SQL conditions that are of 'exact match' type
+exact_matchy_sql_conditions_and_columns = [
+    ("col_l = col_r", {"col"}),
+    ("col_l = col_r AND another_col_l = another_col_r", {"col", "another_col"}),
+    (
+        "col_l = col_r AND another_col_l = another_col_r AND third_l = third_r",
+        {"col", "another_col", "third"},
+    ),
+    (
+        "(col_l = col_r AND another_col_l = another_col_r) AND third_l = third_r",
+        {"col", "another_col", "third"},
+    ),
+    (
+        "col_l = col_r AND (another_col_l = another_col_r AND third_l = third_r)",
+        {"col", "another_col", "third"},
+    ),
+]
+
+
+@mark.parametrize(
+    "sql_condition, exact_match_cols", exact_matchy_sql_conditions_and_columns
+)
+@mark_with_dialects_excluding()
+def test_is_exact_match_for_exact_matchy_levels(
+    sql_condition, exact_match_cols, dialect
+):
+    lev = make_comparison_level(sql_condition, dialect)
+    assert lev._is_exact_match
+
+
+@mark.parametrize(
+    "sql_condition, exact_match_cols", exact_matchy_sql_conditions_and_columns
+)
+@mark_with_dialects_excluding()
+def test_exact_match_colnames_for_exact_matchy_levels(
+    sql_condition, exact_match_cols, dialect
+):
+    lev = make_comparison_level(sql_condition, dialect)
+    assert set(lev._exact_match_colnames) == exact_match_cols
+
+
+# SQL conditions that are NOT of 'exact match' type
+non_exact_matchy_sql_conditions = [
+    "levenshtein(col_l, col_r) < 3",
+    "col_l < col_r",
+    "col_l = col_r OR another_col_l = another_col_r",
+    "col_l = a_different_col_r",
+    "col_l = col_r AND (col_2_l = col_2_r OR col_3_l = col_3_r)",
+    "col_l = col_r AND (col_2_l < col_2_r)",
+    "substr(col_l, 2) = substr(col_r, 2)",
+]
+
+
+@mark.parametrize("sql_condition", non_exact_matchy_sql_conditions)
+@mark_with_dialects_excluding()
+def test_is_exact_match_for_non_exact_matchy_levels(sql_condition, dialect):
+    lev = make_comparison_level(sql_condition, dialect)
+    assert not lev._is_exact_match
+
+
+@mark.parametrize("sql_condition", non_exact_matchy_sql_conditions)
+@mark_with_dialects_excluding()
+def test_exact_match_colnames_for_non_exact_matchy_levels(sql_condition, dialect):
+    lev = make_comparison_level(sql_condition, dialect)
+    # _exact_match_colnames should have an error if it is
+    # not actually an exact match level
+    with raises(ValueError):
+        lev._exact_match_colnames
diff --git a/tests/test_compound_comparison_levels.py b/tests/test_compound_comparison_levels.py
@@ -1,6 +1,4 @@
 import pandas as pd
-from sqlglot import parse_one
-from sqlglot.optimizer.normalize import normalize
 
 import splink.duckdb.comparison_level_library as cll
 import splink.duckdb.comparison_library as cl
@@ -216,52 +214,3 @@ def test_complex_compound_comparison_level():
     linker = DuckDBLinker(df, settings)
 
     linker.estimate_parameters_using_expectation_maximisation("1=1")
-
-
-def test_normalise():
-    # check that the sqlglot normaliser is doing what we think
-    # try to not impose specific form too strongly, so we aren't too tightly
-    # coupled to the implementationß
-    sql_syntax_tree = parse_one("a or (b and c)")
-    sql_cnf = normalize(sql_syntax_tree).sql().lower()
-
-    subclauses_expected = [
-        ["a or c", "c or a"],
-        ["a or b", "b or a"],
-    ]
-
-    # get subclauses and remove outer parens
-    subclauses_found = map(lambda s: s.strip("()"), sql_cnf.split(" and "))
-
-    # loop through subclauses, make sure that we have exactly one of each
-    for found in subclauses_found:
-        term_found = False
-        for i, expected in enumerate(subclauses_expected):
-            if found in expected:
-                del subclauses_expected[i]
-                term_found = True
-                break
-        assert term_found, f"CNF contains unexpected clause '{found}'"
-    assert not subclauses_expected
-
-    # and a slightly more complex statement
-    sql_syntax_tree = parse_one("(a and b) or (a and c) or (c and d) or (d and b)")
-    sql_cnf = normalize(sql_syntax_tree).sql().lower()
-
-    subclauses_expected = [
-        ["b or c", "c or b"],
-        ["a or d", "d or a"],
-    ]
-
-    subclauses_found = map(lambda s: s.strip("()"), sql_cnf.split(" and "))
-
-    # loop through subclauses, make sure that we have exactly one of each
-    for found in subclauses_found:
-        term_found = False
-        for i, expected in enumerate(subclauses_expected):
-            if found in expected:
-                del subclauses_expected[i]
-                term_found = True
-                break
-        assert term_found, f"CNF contains unexpected clause '{found}'"
-    assert not subclauses_expected
diff --git a/tests/test_sql_transform.py b/tests/test_sql_transform.py
@@ -36,12 +36,16 @@ def test_move_l_r_table_prefix_to_column_suffix():
     move_l_r_test(br, expected)
 
     br = "len(list_filter(l.name_list, x -> list_contains(r.name_list, x))) >= 1"
-    expected = "len(list_filter(name_list_l, x -> list_contains(name_list_r, x))) >= 1"
+    expected = (
+        "length(list_filter(name_list_l, x -> list_contains(name_list_r, x))) >= 1"
+    )
     move_l_r_test(br, expected)
 
     br = "len(list_filter(l.name_list, x -> list_contains(r.name_list, x))) >= 1"
     res = move_l_r_table_prefix_to_column_suffix(br)
-    expected = "len(list_filter(name_list_l, x -> list_contains(name_list_r, x))) >= 1"
+    expected = (
+        "length(list_filter(name_list_l, x -> list_contains(name_list_r, x))) >= 1"
+    )
     assert res.lower() == expected.lower()