[BUGFIX] FIPS Compliance #11066


Open · wants to merge 2 commits into base: develop
2 changes: 1 addition & 1 deletion contrib/capitalone_dataprofiler_expectations/README.md
@@ -290,7 +290,7 @@ The format for an unstructured profile is below:
* CREDIT_CARD
* EMAIL_ADDRESS
* UUID
* HASH_OR_KEY (md5, sha1, sha256, random hash, etc.)
Contributor

I would leave this as is and revert this change.
The Data Profiler is a legacy part of our codebase. Users can choose whether to use it and which of its features they want. A FIPS user would not use md5, but other users might find it useful for their datasets.

* HASH_OR_KEY (sha1, sha256, random hash, etc.)
* IPV4
* IPV6
* MAC_ADDRESS
@@ -150,7 +150,7 @@
)
from .expect_column_values_to_be_valid_mac import ExpectColumnValuesToBeValidMac
from .expect_column_values_to_be_valid_mbti import ExpectColumnValuesToBeValidMbti
from .expect_column_values_to_be_valid_md5 import ExpectColumnValuesToBeValidMd5
from .expect_column_values_to_be_valid_sha256 import ExpectColumnValuesToBeValidSha256
Contributor


It doesn't look like you've renamed this file. Per the discussion comment I just left, we should have both expectations live side-by-side so we don't break our current users.

Contributor


This expectation should still be available to people who want to use it. It would not impact FIPS users since they wouldn't use this expectation. If you want to author an equivalent sha256 expectation I would do that in a separate PR.

For this PR I would revert this change.

from .expect_column_values_to_be_valid_meid import ExpectColumnValuesToBeValidMeid
from .expect_column_values_to_be_valid_mic import ExpectColumnValuesToBeValidMic
from .expect_column_values_to_be_valid_mic_match_country_code import (
@@ -7,22 +7,22 @@
column_condition_partial,
)

MD5_REGEX = r"^([a-fA-F\d]{32})$"
SHA256_REGEX = r"^([a-fA-F\d]{64})$"
Contributor

I would revert changes to this file. It would not impact FIPS users.



# This class defines a Metric to support your Expectation.
# For most ColumnMapExpectations, the main business logic for calculation will live in this class.
class ColumnValuesToBeValidMd5(ColumnMapMetricProvider):
class ColumnValuesToBeValidSha256(ColumnMapMetricProvider):
# This is the id string that will be used to reference your metric.
condition_metric_name = "column_values.valid_md5"
condition_metric_name = "column_values.valid_sha256"

# This method implements the core logic for the PandasExecutionEngine
@column_condition_partial(engine=PandasExecutionEngine)
def _pandas(cls, column, **kwargs):
def matches_md5_regex(x):
return bool(re.match(MD5_REGEX, str(x)))
def matches_sha256_regex(x):
return bool(re.match(SHA256_REGEX, str(x)))

return column.apply(lambda x: matches_md5_regex(x) if x else False)
return column.apply(lambda x: matches_sha256_regex(x) if x else False)

# This method defines the business logic for evaluating your metric when using a SqlAlchemyExecutionEngine
# @column_condition_partial(engine=SqlAlchemyExecutionEngine)
@@ -36,42 +36,42 @@ def matches_md5_regex(x):


# This class defines the Expectation itself
class ExpectColumnValuesToBeValidMd5(ColumnMapExpectation):
"""Expect column values to be valid MD5 hashes."""
class ExpectColumnValuesToBeValidSha256(ColumnMapExpectation):
"""Expect column values to be valid SHA256 hashes."""

# These examples will be shown in the public gallery.
# They will also be executed as unit tests for your Expectation.
examples = [
{
"data": {
"well_formed_md5": [
"00000000000000000000000000000000",
"71c2709c8e93e30c40bb441d7423ccfd", # md5 hash of "great_expectations"
"71C2709C8E93E30C40BB441D7423CCFD",
"71C2709c8e93e30c40bb441d7423ccfD",
"ffffffffffffffffffffffffffffffff",
"well_formed_sha256": [
"0000000000000000000000000000000000000000000000000000000000000000",
"e93ac4c39c921632d9a237cd86452a3ba417e7e27b68fb6f0acef66f0e18f2ab", # sha256 hash of "great_expectations" UTF-8
"E93AC4C39C921632D9A237CD86452A3BA417E7E27B68FB6F0ACEF66F0E18F2AB",
"e93ac4c39C921632d9a237cD86452a3Ba417e7e27B68fb6f0acef66f0e18F2aB",
"ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff",
],
"malformed_md5": [
"malformed_sha256": [
"",
"ab12",
"71c2709c8e93e30c40bb441d7423ccfdffff",
"71c2709c8e93e30c40bb441d7423xxxx",
"This is not valid md5",
"e93ac4c39c921632d9a237cd86452a3ba417e7e27b68fb6f0acef66f0e18f2abffff",
"e93ac4c39c921632d9a237cd86452a3ba417e7e27b68fb6f0acef66f0e18f2abxxxx",
"This is not valid sha256",
],
},
"tests": [
{
"title": "basic_positive_test",
"exact_match_out": False,
"include_in_gallery": True,
"in": {"column": "well_formed_md5"},
"in": {"column": "well_formed_sha256"},
"out": {"success": True},
},
{
"title": "basic_negative_test",
"exact_match_out": False,
"include_in_gallery": True,
"in": {"column": "malformed_md5"},
"in": {"column": "malformed_sha256"},
"out": {"success": False},
},
],
@@ -80,7 +80,7 @@ class ExpectColumnValuesToBeValidMd5(ColumnMapExpectation):

# This is the id string of the Metric used by this Expectation.
# For most Expectations, it will be the same as the `condition_metric_name` defined in your Metric class above.
map_metric = "column_values.valid_md5"
map_metric = "column_values.valid_sha256"

# This is a list of parameter names that can affect whether the Expectation evaluates to True or False
success_keys = ("mostly",)
@@ -99,4 +99,4 @@ class ExpectColumnValuesToBeValidMd5(ColumnMapExpectation):


if __name__ == "__main__":
ExpectColumnValuesToBeValidMd5().print_diagnostic_checklist()
ExpectColumnValuesToBeValidSha256().print_diagnostic_checklist()
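The swap from `MD5_REGEX` to `SHA256_REGEX` only changes the accepted digest length: both hashes render as fixed-length hex strings, 32 characters for MD5 and 64 for SHA-256. A standalone sketch of the validation logic (the md5 call assumes a non-FIPS Python build):

```python
import hashlib
import re

MD5_REGEX = r"^([a-fA-F\d]{32})$"
SHA256_REGEX = r"^([a-fA-F\d]{64})$"

# MD5 and SHA-256 digests are fixed-length hex strings, so the two
# validators differ only in the expected length (32 vs 64 characters).
md5_digest = hashlib.md5(b"great_expectations").hexdigest()
sha256_digest = hashlib.sha256(b"great_expectations").hexdigest()

print(bool(re.match(SHA256_REGEX, sha256_digest)))  # True
print(bool(re.match(SHA256_REGEX, md5_digest)))     # False: wrong length
```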
@@ -7,10 +7,10 @@

#### Available methods for `sampling_method` values and their configuration parameters:

| `sampling_method` | `sampling_kwargs` | Returned Batch Data |
|-----------------------|----------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------|
| `sample_using_limit` | `n=num_rows` | First up to n (specific limit parameter) rows of batch |
| `sample_using_random` | `p=fraction` | Rows selected at random, whose number amounts to selected fraction of total number of rows in batch |
| `sample_using_mod` | `column_name='col', mod=<int>` | Take the mod of named column, and only keep rows that match the given value |
| `sample_using_a_list` | `column_name='col', value_list=<list[val]>` | Match the values in the named column against value_list, and only keep the matches |
| `sample_using_hash` | `column_name='col', hash_digits=<int>, hash_value=<str>` | Hash the values in the named column (using "md5" hash function), and only keep rows that match the given hash_value |
| `sampling_method` | `sampling_kwargs` | Returned Batch Data |
Contributor


This is documentation for the 0.18 branch so should not be updated.

|-----------------------|----------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------|
| `sample_using_limit` | `n=num_rows` | First up to n (specific limit parameter) rows of batch |
| `sample_using_random` | `p=fraction` | Rows selected at random, whose number amounts to selected fraction of total number of rows in batch |
| `sample_using_mod` | `column_name='col', mod=<int>` | Take the mod of named column, and only keep rows that match the given value |
| `sample_using_a_list` | `column_name='col', value_list=<list[val]>` | Match the values in the named column against value_list, and only keep the matches |
| `sample_using_hash` | `column_name='col', hash_digits=<int>, hash_value=<str>` | Hash the values in the named column (using "sha256" hash function), and only keep rows that match the given hash_value |
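A minimal pandas sketch of what a `sample_using_hash`-style sampler does — hash each value in the named column and keep rows whose digest matches. The function body here is an illustration, not the library's implementation; whether the real sampler compares leading or trailing digest characters is an internal detail. The point is that `hash_function_name` is the only knob a FIPS deployment needs to change:

```python
import hashlib

import pandas as pd

def sample_using_hash(df, column_name, hash_digits, hash_value,
                      hash_function_name="sha256"):
    # Keep only rows whose hashed column value, truncated to the last
    # hash_digits characters, equals hash_value.
    hash_func = getattr(hashlib, hash_function_name)
    matches = df[column_name].map(
        lambda v: hash_func(str(v).encode("utf-8")).hexdigest()[-hash_digits:]
        == hash_value
    )
    return df[matches]

df = pd.DataFrame({"id": ["a", "b", "c"]})
wanted = hashlib.sha256(b"b").hexdigest()
print(sample_using_hash(df, "id", 64, wanted))  # only the "b" row survives
```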
@@ -7,17 +7,17 @@

#### Available methods for `splitter_method` values and their configuration parameters:

| `splitter_method` | `splitter_kwargs` | Returned Batch Data |
|-----------------------------------|--------------------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| `split_on_whole_table` | N/A | Identical to original |
| `split_on_column_value` | `column_name='col'` | Rows where value of column_name are same |
| `split_on_year` | `column_name='col'` | Rows where the year of a datetime column are the same |
| `split_on_year_and_month` | `column_name='col'` | Rows where the year and month of a datetime column are the same |
| `split_on_year_and_month_and_day` | `column_name='col'` | Rows where the year, month and day of a datetime column are the same |
| `split_on_date_parts` | `column_name='col', date_parts='<list[DatePart]>'` | Rows where the date parts of a datetime column are the same. Date parts can be specified as DatePart objects or as their string equivalent e.g. "year", "month", "week", "day", "hour", "minute", or "second" |
| `split_on_divided_integer` | `column_name='col', divisor=<int>` | Rows where value of column_name divided (using integer division) by the given divisor are same |
| `split_on_mod_integer` | `column_name='col', mod=<int>` | Rows where value of column_name divided (using modular division) by the given mod are same |
| `split_on_multi_column_values` | `column_names='<list[col]>'` | Rows where values of column_names are same |
| `split_on_converted_datetime` | `column_name='col', date_format_string=<'%Y-%m-%d'>` | Rows where value of column_name converted to datetime using the given date_format_string are same |
| `split_on_hashed_column` | `column_name='col', hash_digits=<int>` | Rows where value of column_name hashed (using "md5" hash function) and retaining the stated number of hash_digits are same (experimental) |
| `splitter_method` | `splitter_kwargs` | Returned Batch Data |
Contributor


This is also 0.18 documentation.

|-----------------------------------|--------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| `split_on_whole_table` | N/A | Identical to original |
| `split_on_column_value` | `column_name='col'` | Rows where value of column_name are same |
| `split_on_year` | `column_name='col'` | Rows where the year of a datetime column are the same |
| `split_on_year_and_month` | `column_name='col'` | Rows where the year and month of a datetime column are the same |
| `split_on_year_and_month_and_day` | `column_name='col'` | Rows where the year, month and day of a datetime column are the same |
| `split_on_date_parts` | `column_name='col', date_parts='<list[DatePart]>'` | Rows where the date parts of a datetime column are the same. Date parts can be specified as DatePart objects or as their string equivalent e.g. "year", "month", "week", "day", "hour", "minute", or "second" |
| `split_on_divided_integer` | `column_name='col', divisor=<int>` | Rows where value of column_name divided (using integer division) by the given divisor are same |
| `split_on_mod_integer` | `column_name='col', mod=<int>` | Rows where value of column_name divided (using modular division) by the given mod are same |
| `split_on_multi_column_values` | `column_names='<list[col]>'` | Rows where values of column_names are same |
| `split_on_converted_datetime` | `column_name='col', date_format_string=<'%Y-%m-%d'>` | Rows where value of column_name converted to datetime using the given date_format_string are same |
| `split_on_hashed_column` | `column_name='col', hash_digits=<int>` | Rows where value of column_name hashed (using "sha256" hash function) and retaining the stated number of hash_digits are same (experimental) |

@@ -49,7 +49,7 @@ An Expectation can also expose Metrics, such as the observed value of a useful s

Validation Results can expose Metrics that are defined by specific Expectations that have been validated, called "Expectation Defined Metrics." To access those values, you address the Metric as a dot-delimited string that identifies the value, such as `expect_column_values_to_be_unique.success`or `expect_column_values_to_be_between.result.unexpected_percent`. These Metrics may be stored in a Metrics Store.

A `metric_kwargs_id` is a string representation of the Metric Kwargs that can be used as a database key. For simple cases, it could be easily readable, such as `column=Age`, but when there are multiple keys and values or complex values, it will most likely be a md5 hash of key/value pairs. It can also be `None` in the case that there are no kwargs required to identify the Metric.
A `metric_kwargs_id` is a string representation of the Metric Kwargs that can be used as a database key. For simple cases, it could be easily readable, such as `column=Age`, but when there are multiple keys and values or complex values, it will most likely be a sha256 hash of key/value pairs. It can also be `None` in the case that there are no kwargs required to identify the Metric.
Contributor

Also 0.18 documentation.


The following examples demonstrate how Metrics are defined:
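The `metric_kwargs_id` rule described above can be sketched as follows. This helper is hypothetical (the real logic lives in `great_expectations/core/id_dict.py`); it only illustrates the three cases: no kwargs, one simple readable kwarg, and complex kwargs that collapse to a stable digest:

```python
import hashlib
import json
from typing import Optional

def metric_kwargs_id(metric_kwargs: Optional[dict]) -> Optional[str]:
    # Hypothetical sketch: no kwargs -> None, a single simple kwarg stays
    # human-readable, anything else becomes a stable hash of the pairs.
    if not metric_kwargs:
        return None
    if len(metric_kwargs) == 1:
        key, value = next(iter(metric_kwargs.items()))
        if isinstance(value, (str, int, float)):
            return f"{key}={value}"
    payload = json.dumps(metric_kwargs, sort_keys=True).encode("utf-8")
    return hashlib.sha256(payload).hexdigest()

print(metric_kwargs_id({"column": "Age"}))  # column=Age
print(metric_kwargs_id(None))               # None
```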

4 changes: 2 additions & 2 deletions great_expectations/analytics/anonymizer.py
@@ -1,7 +1,7 @@
from __future__ import annotations

from hashlib import md5
from hashlib import sha256


def anonymize(string: str) -> str:
return md5(string.encode("utf-8")).hexdigest()
return sha256(string.encode("utf-8")).hexdigest()
Contributor

This is used for sending analytic events and we would care if the output of this method changed. I would either:

  1. add a setting so this could be parameterized and default to md5.
  2. try md5 first and if it fails fallback to sha256.
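The second option could look like this sketch. It assumes that on FIPS-enabled Python builds `hashlib.md5` raises `ValueError` at call time, which is how FIPS-patched interpreters typically disable the algorithm:

```python
import hashlib

def anonymize(string: str) -> str:
    # Option 2 from the review: keep md5 so existing anonymized ids stay
    # stable, and fall back to sha256 on FIPS builds where md5 is disabled
    # (hashlib raises ValueError there).
    try:
        return hashlib.md5(string.encode("utf-8")).hexdigest()
    except ValueError:
        return hashlib.sha256(string.encode("utf-8")).hexdigest()
```

Python 3.9+ also exposes `usedforsecurity=False` on hashlib constructors, which some FIPS-patched builds accept for non-security uses like anonymization; that could be a third option.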

2 changes: 1 addition & 1 deletion great_expectations/core/id_dict.py
@@ -29,7 +29,7 @@ def to_id(self, id_keys=None, id_ignore_keys=None) -> IDDictID:
return f"{key}={self[key]!s}"

_id_dict = convert_to_json_serializable(data={k: self[k] for k in id_keys})
return hashlib.md5(json.dumps(_id_dict, sort_keys=True).encode("utf-8")).hexdigest()
return hashlib.sha256(json.dumps(_id_dict, sort_keys=True).encode("utf-8")).hexdigest()
Contributor

This is likely fine though I'd want to review how this is used more. We could parameterize this by making a setting. We could share the setting with anonymize above.
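Sharing a setting between this call site and `anonymize` could look like the sketch below. The `HASH_ALGORITHM` setting is hypothetical; the key property preserved from the original is that `sort_keys=True` makes the id independent of dict insertion order:

```python
import hashlib
import json

# Hypothetical shared setting: default to "md5" for backwards-compatible
# ids, or "sha256" in FIPS environments.
HASH_ALGORITHM = "sha256"

def dict_to_id(id_dict: dict) -> str:
    # sort_keys makes the digest stable regardless of key insertion order.
    payload = json.dumps(id_dict, sort_keys=True).encode("utf-8")
    return hashlib.new(HASH_ALGORITHM, payload).hexdigest()
```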


@override
def __hash__(self) -> int: # type: ignore[override] # FIXME CoP
@@ -641,4 +641,4 @@ def hash_pandas_dataframe(df):
# In case of facing unhashable objects (like dict), use pickle
obj = pickle.dumps(df, pickle.HIGHEST_PROTOCOL)

return hashlib.md5(obj).hexdigest()
return hashlib.sha256(obj).hexdigest()
Contributor

Similar comment to above about making a setting and/or tracking down how this is used.
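The dataframe hash above is a cache key rather than a security boundary, so the same setting idea applies. A generic sketch (the `hash_name` parameter is an assumption, mirroring the reviewer's suggestion):

```python
import hashlib
import pickle

def hash_object(obj, hash_name: str = "sha256") -> str:
    # pickle handles unhashable containers (dicts, DataFrames); the digest
    # is only used as a cache key, never for security decisions.
    data = pickle.dumps(obj, pickle.HIGHEST_PROTOCOL)
    return hashlib.new(hash_name, data).hexdigest()
```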

@@ -225,7 +225,7 @@ def partition_on_hashed_column(
column_name: str,
hash_digits: int,
batch_identifiers: dict,
hash_function_name: str = "md5",
hash_function_name: str = "sha256",
Contributor

It doesn't look like this is called internally anywhere and it is already parameterized. I think leaving the default as is to preserve backwards compatibility would be desired. So I would revert this change.

) -> pd.DataFrame:
"""Partition on the hashed value of the named column"""
try:
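Because `hash_function_name` is already a parameter, existing callers can keep md5 while FIPS users pass `"sha256"` explicitly — which is why reverting the default preserves backwards compatibility at no cost. A simplified sketch of such a partitioner (the real signature also takes `batch_identifiers`; the matching logic here is illustrative):

```python
import hashlib

import pandas as pd

def partition_on_hashed_column(df, column_name, hash_digits, matching_hash,
                               hash_function_name="md5"):
    # Simplified sketch: keep rows whose hashed column value, truncated to
    # hash_digits characters, equals matching_hash. The default stays "md5"
    # for backwards compatibility; FIPS users pass "sha256".
    hash_func = getattr(hashlib, hash_function_name)
    mask = df[column_name].map(
        lambda v: hash_func(str(v).encode("utf-8")).hexdigest()[:hash_digits]
        == matching_hash
    )
    return df[mask]
```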