# [BUGFIX] FIPS Compliance #11066
base: develop
```diff
@@ -150,7 +150,7 @@
 )
 from .expect_column_values_to_be_valid_mac import ExpectColumnValuesToBeValidMac
 from .expect_column_values_to_be_valid_mbti import ExpectColumnValuesToBeValidMbti
-from .expect_column_values_to_be_valid_md5 import ExpectColumnValuesToBeValidMd5
+from .expect_column_values_to_be_valid_sha256 import ExpectColumnValuesToBeValidSha256
 from .expect_column_values_to_be_valid_meid import ExpectColumnValuesToBeValidMeid
 from .expect_column_values_to_be_valid_mic import ExpectColumnValuesToBeValidMic
 from .expect_column_values_to_be_valid_mic_match_country_code import (
```

**Reviewer comment:** It doesn't look like you've renamed this file. Per the discussion comment I just left, we should have both expectations live side by side so we don't break our current users.

**Reviewer comment:** This expectation should still be available to people who want to use it. It would not impact FIPS users, since they wouldn't use this expectation. If you want to author an equivalent sha256 expectation, I would do that in a separate PR. For this PR I would revert this change.
```diff
@@ -7,22 +7,22 @@
     column_condition_partial,
 )
 
-MD5_REGEX = r"^([a-fA-F\d]{32})$"
+SHA256_REGEX = r"^([a-fA-F\d]{64})$"
 
 
 # This class defines a Metric to support your Expectation.
 # For most ColumnMapExpectations, the main business logic for calculation will live in this class.
-class ColumnValuesToBeValidMd5(ColumnMapMetricProvider):
+class ColumnValuesToBeValidSha256(ColumnMapMetricProvider):
     # This is the id string that will be used to reference your metric.
-    condition_metric_name = "column_values.valid_md5"
+    condition_metric_name = "column_values.valid_sha256"
 
     # This method implements the core logic for the PandasExecutionEngine
     @column_condition_partial(engine=PandasExecutionEngine)
     def _pandas(cls, column, **kwargs):
-        def matches_md5_regex(x):
-            return bool(re.match(MD5_REGEX, str(x)))
+        def matches_sha256_regex(x):
+            return bool(re.match(SHA256_REGEX, str(x)))
 
-        return column.apply(lambda x: matches_md5_regex(x) if x else False)
+        return column.apply(lambda x: matches_sha256_regex(x) if x else False)
 
     # This method defines the business logic for evaluating your metric when using a SqlAlchemyExecutionEngine
     # @column_condition_partial(engine=SqlAlchemyExecutionEngine)
@@ -36,42 +36,42 @@ def matches_md5_regex(x):
 
 
 # This class defines the Expectation itself
-class ExpectColumnValuesToBeValidMd5(ColumnMapExpectation):
-    """Expect column values to be valid MD5 hashes."""
+class ExpectColumnValuesToBeValidSha256(ColumnMapExpectation):
+    """Expect column values to be valid SHA256 hashes."""
 
     # These examples will be shown in the public gallery.
     # They will also be executed as unit tests for your Expectation.
     examples = [
         {
             "data": {
-                "well_formed_md5": [
-                    "00000000000000000000000000000000",
-                    "71c2709c8e93e30c40bb441d7423ccfd",  # md5 hash of "great_expectations"
-                    "71C2709C8E93E30C40BB441D7423CCFD",
-                    "71C2709c8e93e30c40bb441d7423ccfD",
-                    "ffffffffffffffffffffffffffffffff",
+                "well_formed_sha256": [
+                    "0000000000000000000000000000000000000000000000000000000000000000",
+                    "e93ac4c39c921632d9a237cd86452a3ba417e7e27b68fb6f0acef66f0e18f2ab",  # sha256 hash of "great_expectations" UTF-8
+                    "E93AC4C39C921632D9A237CD86452A3BA417E7E27B68FB6F0ACEF66F0E18F2AB",
+                    "e93ac4c39C921632d9a237cD86452a3Ba417e7e27B68fb6f0acef66f0e18F2aB",
+                    "ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff",
                 ],
-                "malformed_md5": [
+                "malformed_sha256": [
                     "",
                     "ab12",
-                    "71c2709c8e93e30c40bb441d7423ccfdffff",
-                    "71c2709c8e93e30c40bb441d7423xxxx",
-                    "This is not valid md5",
+                    "e93ac4c39c921632d9a237cd86452a3ba417e7e27b68fb6f0acef66f0e18f2abffff",
+                    "e93ac4c39c921632d9a237cd86452a3ba417e7e27b68fb6f0acef66f0e18f2abxxxx",
+                    "This is not valid sha256",
                 ],
             },
             "tests": [
                 {
                     "title": "basic_positive_test",
                     "exact_match_out": False,
                     "include_in_gallery": True,
-                    "in": {"column": "well_formed_md5"},
+                    "in": {"column": "well_formed_sha256"},
                     "out": {"success": True},
                 },
                 {
                     "title": "basic_negative_test",
                     "exact_match_out": False,
                     "include_in_gallery": True,
-                    "in": {"column": "malformed_md5"},
+                    "in": {"column": "malformed_sha256"},
                     "out": {"success": False},
                 },
             ],
@@ -80,7 +80,7 @@ class ExpectColumnValuesToBeValidMd5(ColumnMapExpectation):
 
     # This is the id string of the Metric used by this Expectation.
     # For most Expectations, it will be the same as the `condition_metric_name` defined in your Metric class above.
-    map_metric = "column_values.valid_md5"
+    map_metric = "column_values.valid_sha256"
 
     # This is a list of parameter names that can affect whether the Expectation evaluates to True or False
     success_keys = ("mostly",)
@@ -99,4 +99,4 @@ class ExpectColumnValuesToBeValidMd5(ColumnMapExpectation):
 
 
 if __name__ == "__main__":
-    ExpectColumnValuesToBeValidMd5().print_diagnostic_checklist()
+    ExpectColumnValuesToBeValidSha256().print_diagnostic_checklist()
```

**Reviewer comment:** I would revert changes to this file. It would not impact FIPS users.
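For reference, the check both the MD5 and the proposed SHA-256 expectation perform is a plain regex match. Below is a minimal standalone sketch of that logic; the regexes and sample values are copied from the diff above, and the helper name is invented for illustration:

```python
import re

# Regexes from the diff: 32 hex characters for MD5, 64 for SHA-256.
MD5_REGEX = r"^([a-fA-F\d]{32})$"
SHA256_REGEX = r"^([a-fA-F\d]{64})$"


def looks_like_sha256(value) -> bool:
    """Return True when the value is a 64-character hexadecimal string."""
    return bool(re.match(SHA256_REGEX, str(value)))


# Sample values taken from the expectation's test data above.
print(looks_like_sha256("e93ac4c39c921632d9a237cd86452a3ba417e7e27b68fb6f0acef66f0e18f2ab"))  # True
print(looks_like_sha256("This is not valid sha256"))  # False
```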
```diff
@@ -7,10 +7,10 @@
 
 #### Available methods for `sampling_method` values and their configuration parameters:
 
-| `sampling_method` | `sampling_kwargs` | Returned Batch Data |
-|-----------------------|----------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------|
-| `sample_using_limit` | `n=num_rows` | First up to to n (specific limit parameter) rows of batch |
-| `sample_using_random` | `p=fraction` | Rows selected at random, whose number amounts to selected fraction of total number of rows in batch |
-| `sample_using_mod` | `column_name='col', mod=<int>` | Take the mod of named column, and only keep rows that match the given value |
-| `sample_using_a_list` | `column_name='col', value_list=<list[val]>` | Match the values in the named column against value_list, and only keep the matches |
-| `sample_using_hash` | `column_name='col', hash_digits=<int>, hash_value=<str>` | Hash the values in the named column (using "md5" hash function), and only keep rows that match the given hash_value |
+| `sampling_method` | `sampling_kwargs` | Returned Batch Data |
+|-----------------------|----------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------|
+| `sample_using_limit` | `n=num_rows` | First up to to n (specific limit parameter) rows of batch |
+| `sample_using_random` | `p=fraction` | Rows selected at random, whose number amounts to selected fraction of total number of rows in batch |
+| `sample_using_mod` | `column_name='col', mod=<int>` | Take the mod of named column, and only keep rows that match the given value |
+| `sample_using_a_list` | `column_name='col', value_list=<list[val]>` | Match the values in the named column against value_list, and only keep the matches |
+| `sample_using_hash` | `column_name='col', hash_digits=<int>, hash_value=<str>` | Hash the values in the named column (using "sha256" hash function), and only keep rows that match the given hash_value |
```

**Reviewer comment:** This is documentation for the 0.18 branch so should not be updated.
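To make the FIPS angle concrete: `sample_using_hash` is the only method in this table that touches a hash function. A rough re-implementation of the described behaviour in pandas (illustrative only; the function name and signature are assumptions, not the actual GX sampler):

```python
import hashlib

import pandas as pd


def sample_using_hash(
    df: pd.DataFrame,
    column_name: str,
    hash_digits: int,
    hash_value: str,
    hash_function_name: str = "md5",  # "sha256" avoids MD5 on FIPS-enforcing hosts
) -> pd.DataFrame:
    """Keep only rows whose hashed column value ends with hash_value."""
    hash_func = getattr(hashlib, hash_function_name)
    matches = df[column_name].map(
        lambda v: hash_func(str(v).encode("utf-8")).hexdigest()[-hash_digits:] == hash_value
    )
    return df[matches]


# Keep roughly 1/16 of rows: those whose hash ends in "f".
frame = pd.DataFrame({"id": range(1000)})
print(len(sample_using_hash(frame, column_name="id", hash_digits=1, hash_value="f")))
```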
```diff
@@ -7,17 +7,17 @@
 
 #### Available methods for `splitter_method` values and their configuration parameters:
 
-| `splitter_method` | `splitter_kwargs` | Returned Batch Data |
-|-----------------------------------|--------------------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
-| `split_on_whole_table` | N/A | Identical to original |
-| `split_on_column_value` | `column_name='col'` | Rows where value of column_name are same |
-| `split_on_year` | `column_name='col'` | Rows where the year of a datetime column are the same |
-| `split_on_year_and_month` | `column_name='col'` | Rows where the year and month of a datetime column are the same |
-| `split_on_year_and_month_and_day` | `column_name='col'` | Rows where the year, month and day of a datetime column are the same |
-| `split_on_date_parts` | `column_name='col', date_parts='<list[DatePart]>'` | Rows where the date parts of a datetime column are the same. Date parts can be specified as DatePart objects or as their string equivalent e.g. "year", "month", "week", "day", "hour", "minute", or "second" |
-| `split_on_divided_integer` | `column_name='col', divisor=<int>` | Rows where value of column_name divided (using integer division) by the given divisor are same |
-| `split_on_mod_integer` | `column_name='col', mod=<int>` | Rows where value of column_name divided (using modular division) by the given mod are same |
-| `split_on_multi_column_values` | `column_names='<list[col]>'` | Rows where values of column_names are same |
-| `split_on_converted_datetime` | `column_name='col', date_format_string=<'%Y-%m-%d'>` | Rows where value of column_name converted to datetime using the given date_format_string are same |
-| `split_on_hashed_column` | `column_name='col', hash_digits=<int>` | Rows where value of column_name hashed (using "md5" hash function) and retaining the stated number of hash_digits are same (experimental) |
+| `splitter_method` | `splitter_kwargs` | Returned Batch Data |
+|-----------------------------------|--------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| `split_on_whole_table` | N/A | Identical to original |
+| `split_on_column_value` | `column_name='col'` | Rows where value of column_name are same |
+| `split_on_year` | `column_name='col'` | Rows where the year of a datetime column are the same |
+| `split_on_year_and_month` | `column_name='col'` | Rows where the year and month of a datetime column are the same |
+| `split_on_year_and_month_and_day` | `column_name='col'` | Rows where the year, month and day of a datetime column are the same |
+| `split_on_date_parts` | `column_name='col', date_parts='<list[DatePart]>'` | Rows where the date parts of a datetime column are the same. Date parts can be specified as DatePart objects or as their string equivalent e.g. "year", "month", "week", "day", "hour", "minute", or "second" |
+| `split_on_divided_integer` | `column_name='col', divisor=<int>` | Rows where value of column_name divided (using integer division) by the given divisor are same |
+| `split_on_mod_integer` | `column_name='col', mod=<int>` | Rows where value of column_name divided (using modular division) by the given mod are same |
+| `split_on_multi_column_values` | `column_names='<list[col]>'` | Rows where values of column_names are same |
+| `split_on_converted_datetime` | `column_name='col', date_format_string=<'%Y-%m-%d'>` | Rows where value of column_name converted to datetime using the given date_format_string are same |
+| `split_on_hashed_column` | `column_name='col', hash_digits=<int>` | Rows where value of column_name hashed (using "sha256" hash function) and retaining the stated number of hash_digits are same (experimental) |
```

**Reviewer comment:** This is also 0.18 documentation.
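Similarly, `split_on_hashed_column` is the only splitter here whose output depends on a hash function. A conceptual sketch of how such a partition key could be derived per row (again illustrative, not the GX implementation):

```python
import hashlib

import pandas as pd


def hashed_partition_key(value, hash_digits: int, hash_function_name: str = "md5") -> str:
    """Hash the value and keep the first hash_digits hex characters as the partition key."""
    hash_func = getattr(hashlib, hash_function_name)
    return hash_func(str(value).encode("utf-8")).hexdigest()[:hash_digits]


frame = pd.DataFrame({"user_id": ["alice", "bob", "carol", "dave"]})
# Rows that share the same truncated hash end up in the same batch.
frame["partition_key"] = frame["user_id"].map(lambda v: hashed_partition_key(v, hash_digits=2))
print(frame.groupby("partition_key").size())
```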
```diff
@@ -49,7 +49,7 @@ An Expectation can also expose Metrics, such as the observed value of a useful s
 
 Validation Results can expose Metrics that are defined by specific Expectations that have been validated, called "Expectation Defined Metrics." To access those values, you address the Metric as a dot-delimited string that identifies the value, such as `expect_column_values_to_be_unique.success`or `expect_column_values_to_be_between.result.unexpected_percent`. These Metrics may be stored in a Metrics Store.
 
-A `metric_kwargs_id` is a string representation of the Metric Kwargs that can be used as a database key. For simple cases, it could be easily readable, such as `column=Age`, but when there are multiple keys and values or complex values, it will most likely be a md5 hash of key/value pairs. It can also be `None` in the case that there are no kwargs required to identify the Metric.
+A `metric_kwargs_id` is a string representation of the Metric Kwargs that can be used as a database key. For simple cases, it could be easily readable, such as `column=Age`, but when there are multiple keys and values or complex values, it will most likely be a sha256 hash of key/value pairs. It can also be `None` in the case that there are no kwargs required to identify the Metric.
 
 The following examples demonstrate how Metrics are defined:
```

**Reviewer comment:** Also 0.18 documentation.
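To illustrate the `metric_kwargs_id` passage quoted above: simple kwargs stay readable, while multiple or complex kwargs are serialized and hashed. A conceptual example (not the exact GX implementation):

```python
import hashlib
import json

# Simple case: a single readable key/value pair.
simple_id = "column=Age"

# Complex case: several kwargs are serialized deterministically, then hashed.
metric_kwargs = {"column": "Age", "min_value": 0, "max_value": 120}
hashed_id = hashlib.sha256(
    json.dumps(metric_kwargs, sort_keys=True).encode("utf-8")
).hexdigest()

print(simple_id)
print(hashed_id)
```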
```diff
@@ -1,7 +1,7 @@
 from __future__ import annotations
 
-from hashlib import md5
+from hashlib import sha256
 
 
 def anonymize(string: str) -> str:
-    return md5(string.encode("utf-8")).hexdigest()
+    return sha256(string.encode("utf-8")).hexdigest()
```

**Reviewer comment:** This is used for sending analytic events and we would care if the output of this method changed. I would either:
```diff
@@ -29,7 +29,7 @@ def to_id(self, id_keys=None, id_ignore_keys=None) -> IDDictID:
             return f"{key}={self[key]!s}"
 
         _id_dict = convert_to_json_serializable(data={k: self[k] for k in id_keys})
-        return hashlib.md5(json.dumps(_id_dict, sort_keys=True).encode("utf-8")).hexdigest()
+        return hashlib.sha256(json.dumps(_id_dict, sort_keys=True).encode("utf-8")).hexdigest()
 
     @override
     def __hash__(self) -> int:  # type: ignore[override]  # FIXME CoP
```

**Reviewer comment:** This is likely fine though I'd want to review how this is used more. We could parameterize this by making a setting. We could share the setting with anonymize above.
```diff
@@ -641,4 +641,4 @@ def hash_pandas_dataframe(df):
         # In case of facing unhashable objects (like dict), use pickle
         obj = pickle.dumps(df, pickle.HIGHEST_PROTOCOL)
 
-    return hashlib.md5(obj).hexdigest()
+    return hashlib.sha256(obj).hexdigest()
```

**Reviewer comment:** Similar comment to above about making a setting and/or tracking down how this is used.
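For anyone tracking down how this is used: the hunk only shows the tail of `hash_pandas_dataframe`. Based on the visible pickle fallback, the overall shape is likely along these lines (a hedged reconstruction, not the verbatim GX code):

```python
import hashlib
import pickle

import pandas as pd


def hash_pandas_dataframe(df: pd.DataFrame) -> str:
    try:
        # Fast path: a stable per-row hash for hashable column values.
        obj = pd.util.hash_pandas_object(df, index=True).values.tobytes()
    except TypeError:
        # In case of facing unhashable objects (like dict), use pickle
        obj = pickle.dumps(df, pickle.HIGHEST_PROTOCOL)

    return hashlib.sha256(obj).hexdigest()
```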
```diff
@@ -225,7 +225,7 @@ def partition_on_hashed_column(
     column_name: str,
     hash_digits: int,
     batch_identifiers: dict,
-    hash_function_name: str = "md5",
+    hash_function_name: str = "sha256",
 ) -> pd.DataFrame:
     """Partition on the hashed value of the named column"""
     try:
```

**Reviewer comment:** It doesn't look like this is called internally anywhere and it is already parameterized. I think leaving the default as is to preserve backwards compatibility would be desired. So I would revert this change.
**Reviewer comment:** I would leave this as is and revert this change. Data Profilers are a legacy part of our codebase. Users can elect to use this or not and can elect which features they want. A FIPS user would not use the md5, but other users might find it useful for their datasets.