diff --git a/fbpcs/private_computation/service/secret_scrubber.py b/fbpcs/private_computation/service/secret_scrubber.py new file mode 100644 index 000000000..e8e8c6fdd --- /dev/null +++ b/fbpcs/private_computation/service/secret_scrubber.py @@ -0,0 +1,69 @@ +#!/usr/bin/env python3 +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +# pyre-strict + +import logging +import re +from dataclasses import dataclass +from typing import Dict, List + + +@dataclass +class Secret: + name: str + regex_pattern_str: str + + +@dataclass +class ScrubSummary: + scrubbed_output: str + total_substitutions: int + name_to_num_subs: Dict[str, int] + + def get_report(self) -> str: + report = "Scrub Summary\n" + for name, num_subs in self.name_to_num_subs.items(): + report += f"\n* {name}: {num_subs} substitutions" + report += f"\n* Total substitutions: {self.total_substitutions}" + return report + + +class SecretScrubber: + SECRETS: List[Secret] = [ + # https://aws.amazon.com/blogs/security/a-safer-way-to-distribute-aws-credentials-to-ec2/ + Secret("AWS access key id", "(? None: + self.patterns: Dict[str, re.Pattern] = { + secret.name: re.compile(secret.regex_pattern_str) for secret in self.SECRETS + } + + def scrub(self, string: str) -> ScrubSummary: + total_substitutions = 0 + name_to_num_subs = {} + for name, pattern in self.patterns.items(): + string, num_substitutes = pattern.subn(self.REPLACEMENT_STR, string) + name_to_num_subs[name] = num_substitutes + total_substitutions += num_substitutes + return ScrubSummary(string, total_substitutions, name_to_num_subs) + + +class LoggingSecretScrubber(logging.Formatter): + SECRET_SCRUBBER: SecretScrubber = SecretScrubber() + + def format(self, record: logging.LogRecord) -> str: + original = logging.Formatter.format(self, record) + return self.SECRET_SCRUBBER.scrub(original).scrubbed_output diff --git a/fbpcs/private_computation/test/service/test_secret_scrubber.py b/fbpcs/private_computation/test/service/test_secret_scrubber.py new file mode 100644 index 000000000..b8c3fd143 --- /dev/null +++ b/fbpcs/private_computation/test/service/test_secret_scrubber.py @@ -0,0 +1,56 @@ +#!/usr/bin/env python3 +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +from unittest import TestCase + +from fbpcs.private_computation.service.secret_scrubber import SecretScrubber + + +class TestSecretScrubber(TestCase): + def setUp(self) -> None: + self.scrubber = SecretScrubber() + # these are not real. I randomly generated them + # and separated them into two string to avoid any + # scanners picking them up + # used exrex: https://github.com/asciimoo/exrex + self.aws_access_key_id = "L194ZYK14" + "K8XMRS9RIEE" + self.aws_secret_access_key = "NJuU/e4plYJ" + "gc6ykqaykh6yXoQxYFmdO+RNJtYHV" + self.graph_api_token = ( + "EAA3XZGPxS4159hjkreOqw0cUmSLx6K6fzZvemMGnzSQrjLadM" + "uhMS1ZeobDYIv4kBzGC5hU066Zj5ud0xNqqeNv3OhVJvYzqXUKit6Lj" + ) + + def test_scrub(self) -> None: + test_message = f""" + "CloudCredentialService": {{ + "class": "fbpcs.pid.service.credential_service.simple_cloud_credential_service.SimpleCloudCredentialService", + "constructor": {{ + "access_key_id": "{self.aws_access_key_id}", + "access_key_data": "{self.aws_secret_access_key}" + }} + }} + + access_token: {self.graph_api_token} + """ + + expected_output = f""" + "CloudCredentialService": {{ + "class": "fbpcs.pid.service.credential_service.simple_cloud_credential_service.SimpleCloudCredentialService", + "constructor": {{ + "access_key_id": "{self.scrubber.REPLACEMENT_STR}", + "access_key_data": "{self.scrubber.REPLACEMENT_STR}" + }} + }} + + access_token:{self.scrubber.REPLACEMENT_STR} + """ + + scrub_summary = self.scrubber.scrub(test_message) + + self.assertEqual(scrub_summary.scrubbed_output, expected_output) + self.assertEqual(scrub_summary.total_substitutions, 3) + for c in scrub_summary.name_to_num_subs.values(): + self.assertEqual(c, 1) diff --git a/fbpcs/private_computation_cli/private_computation_cli.py b/fbpcs/private_computation_cli/private_computation_cli.py index fe7871e6f..db2de7603 100644 --- a/fbpcs/private_computation_cli/private_computation_cli.py +++ b/fbpcs/private_computation_cli/private_computation_cli.py @@ -29,6 +29,7 @@ pc-cli run_attribution --config= --dataset_id= --input_path= --timestamp= --attribution_rule= --aggregation_type= --concurrency= --num_files_per_mpc_container= --k_anonymity_threshold= [options] pc-cli pre_validate --config= [--dataset_id=] --input_path= [--timestamp= --attribution_rule= --aggregation_type= --concurrency= --num_files_per_mpc_container= --k_anonymity_threshold=] [options] pc-cli bolt_e2e --bolt_config= + pc-cli secret_scrubber Options: @@ -70,6 +71,10 @@ ) from fbpcs.private_computation.service.constants import FBPCS_BUNDLE_ID from fbpcs.private_computation.service.pre_validate_service import PreValidateService +from fbpcs.private_computation.service.secret_scrubber import ( + LoggingSecretScrubber, + SecretScrubber, +) from fbpcs.private_computation.service.utils import transform_file_path from fbpcs.private_computation.stage_flows.private_computation_base_stage_flow import ( PrivateComputationBaseStageFlow, @@ -191,9 +196,14 @@ def main(argv: Optional[List[str]] = None) -> None: "print_log_urls": bool, "get_attribution_dataset_info": bool, "bolt_e2e": bool, + "secret_scrubber": bool, "": schema.Or(None, str), "": schema.Or(None, schema.Use(lambda arg: arg.split(","))), "": schema.Or(None, str), + "": schema.Or( + None, schema.And(schema.Use(PurePath), os.path.exists) + ), + "": schema.Or(None, str), "--config": schema.Or( None, schema.And(schema.Use(PurePath), os.path.exists) ), @@ -268,9 +278,9 @@ def main(argv: Optional[List[str]] = None) -> None: config = {} if arguments["--config"]: config = ConfigYamlDict.from_file(arguments["--config"]) - # if no --config given and endpoint isn't bolt_e2e, raise exception - # bolt_e2e endpoint needs --bolt_config argument - elif not arguments["bolt_e2e"]: + # if no --config given and endpoint isn't bolt_e2e or secret_scrubber, raise + # exception. All other endpoints need --config. + elif not arguments["bolt_e2e"] and not arguments["secret_scrubber"]: raise ValueError("--config is a required argument") log_path = arguments["--log_path"] @@ -284,8 +294,13 @@ def main(argv: Optional[List[str]] = None) -> None: # contain PII. level=logging.INFO, handlers=[log_handler], - format="%(asctime)sZ %(levelname)s t:%(threadName)s n:%(name)s ! %(message)s", ) + + log_format = "%(asctime)sZ %(levelname)s t:%(threadName)s n:%(name)s ! %(message)s" + log_scrubber = LoggingSecretScrubber(log_format) + for handler in logging.root.handlers: + handler.setFormatter(log_scrubber) + logger = logging.getLogger(__name__) log_level = logging.DEBUG if arguments["--verbose"] else logging.INFO logger.setLevel(log_level) @@ -481,6 +496,16 @@ def main(argv: Optional[List[str]] = None) -> None: raise RuntimeError(f"Jobs failed: {failed_job_names}") else: print("Jobs succeeded") + elif arguments["secret_scrubber"]: + with open(arguments[""]) as f: + file_content = f.read() + + secret_scrubber = SecretScrubber() + scrub_summary = secret_scrubber.scrub(file_content) + scrubbed_output_path = arguments[""] + with open(scrubbed_output_path, "w") as f: + f.write(scrub_summary.scrubbed_output) + print(scrub_summary.get_report()) if __name__ == "__main__": diff --git a/fbpcs/scripts/run_fbpcs.sh b/fbpcs/scripts/run_fbpcs.sh index 2d7ac17d4..9e45cc5d4 100755 --- a/fbpcs/scripts/run_fbpcs.sh +++ b/fbpcs/scripts/run_fbpcs.sh @@ -113,9 +113,8 @@ function parse_args() { done if [ -z ${real_config_path+x} ]; then - echo >&2 "--config= not specified in coordinator command." - usage - exit 1 + # hack to support PC-CLI endpoints that do not require --config + real_config_path=$(mktemp) fi # run_fbpcs.sh specific arguments