Skip to content

Commit

Permalink
implement a log scrubber (#1335)
Browse files Browse the repository at this point in the history
Summary:
Pull Request resolved: #1335

## What

* Create `SecretScrubber` to scrub secrets (defined by regex patterns) out of text
* Create logging formatter so that we can scrub each line before we log it
* Create PC-CLI endpoint for ad-hoc scrubbing (e.g. instance files, config.yml files)

## Why

* logging secrets is bad

Reviewed By: gorel

Differential Revision: D38074790

fbshipit-source-id: 2cbb5957b50bc4f0b86d5ef3a6fb9dc7eae2469a
  • Loading branch information
jrodal98 authored and facebook-github-bot committed Jul 22, 2022
1 parent be82624 commit 6f68f81
Show file tree
Hide file tree
Showing 4 changed files with 156 additions and 7 deletions.
69 changes: 69 additions & 0 deletions fbpcs/private_computation/service/secret_scrubber.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
#!/usr/bin/env python3
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

# pyre-strict

import logging
import re
from dataclasses import dataclass
from typing import Dict, List


@dataclass
class Secret:
name: str
regex_pattern_str: str


@dataclass
class ScrubSummary:
scrubbed_output: str
total_substitutions: int
name_to_num_subs: Dict[str, int]

def get_report(self) -> str:
report = "Scrub Summary\n"
for name, num_subs in self.name_to_num_subs.items():
report += f"\n* {name}: {num_subs} substitutions"
report += f"\n* Total substitutions: {self.total_substitutions}"
return report


class SecretScrubber:
SECRETS: List[Secret] = [
# https://aws.amazon.com/blogs/security/a-safer-way-to-distribute-aws-credentials-to-ec2/
Secret("AWS access key id", "(?<![A-Z0-9])[A-Z0-9]{20}(?![A-Z0-9])"),
# https://aws.amazon.com/blogs/security/a-safer-way-to-distribute-aws-credentials-to-ec2/
Secret(
"AWS secret access key",
"(?<![A-Za-z0-9/+=])[A-Za-z0-9/+=]{40}(?![A-Za-z0-9/+=])",
),
# https://fburl.com/code/ogf6f0v4
Secret("Meta Graph API token", "([^a-zA-Z0-9]|^)EAA[a-zA-Z0-9]{90,400}"),
]
REPLACEMENT_STR: str = "********"

def __init__(self) -> None:
self.patterns: Dict[str, re.Pattern] = {
secret.name: re.compile(secret.regex_pattern_str) for secret in self.SECRETS
}

def scrub(self, string: str) -> ScrubSummary:
total_substitutions = 0
name_to_num_subs = {}
for name, pattern in self.patterns.items():
string, num_substitutes = pattern.subn(self.REPLACEMENT_STR, string)
name_to_num_subs[name] = num_substitutes
total_substitutions += num_substitutes
return ScrubSummary(string, total_substitutions, name_to_num_subs)


class LoggingSecretScrubber(logging.Formatter):
SECRET_SCRUBBER: SecretScrubber = SecretScrubber()

def format(self, record: logging.LogRecord) -> str:
original = logging.Formatter.format(self, record)
return self.SECRET_SCRUBBER.scrub(original).scrubbed_output
56 changes: 56 additions & 0 deletions fbpcs/private_computation/test/service/test_secret_scrubber.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
#!/usr/bin/env python3
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

from unittest import TestCase

from fbpcs.private_computation.service.secret_scrubber import SecretScrubber


class TestSecretScrubber(TestCase):
def setUp(self) -> None:
self.scrubber = SecretScrubber()
# these are not real. I randomly generated them
# and separated them into two string to avoid any
# scanners picking them up
# used exrex: https://github.com/asciimoo/exrex
self.aws_access_key_id = "L194ZYK14" + "K8XMRS9RIEE"
self.aws_secret_access_key = "NJuU/e4plYJ" + "gc6ykqaykh6yXoQxYFmdO+RNJtYHV"
self.graph_api_token = (
"EAA3XZGPxS4159hjkreOqw0cUmSLx6K6fzZvemMGnzSQrjLadM"
"uhMS1ZeobDYIv4kBzGC5hU066Zj5ud0xNqqeNv3OhVJvYzqXUKit6Lj"
)

def test_scrub(self) -> None:
test_message = f"""
"CloudCredentialService": {{
"class": "fbpcs.pid.service.credential_service.simple_cloud_credential_service.SimpleCloudCredentialService",
"constructor": {{
"access_key_id": "{self.aws_access_key_id}",
"access_key_data": "{self.aws_secret_access_key}"
}}
}}
access_token: {self.graph_api_token}
"""

expected_output = f"""
"CloudCredentialService": {{
"class": "fbpcs.pid.service.credential_service.simple_cloud_credential_service.SimpleCloudCredentialService",
"constructor": {{
"access_key_id": "{self.scrubber.REPLACEMENT_STR}",
"access_key_data": "{self.scrubber.REPLACEMENT_STR}"
}}
}}
access_token:{self.scrubber.REPLACEMENT_STR}
"""

scrub_summary = self.scrubber.scrub(test_message)

self.assertEqual(scrub_summary.scrubbed_output, expected_output)
self.assertEqual(scrub_summary.total_substitutions, 3)
for c in scrub_summary.name_to_num_subs.values():
self.assertEqual(c, 1)
33 changes: 29 additions & 4 deletions fbpcs/private_computation_cli/private_computation_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
pc-cli run_attribution --config=<config_file> --dataset_id=<dataset_id> --input_path=<input_path> --timestamp=<timestamp> --attribution_rule=<attribution_rule> --aggregation_type=<aggregation_type> --concurrency=<concurrency> --num_files_per_mpc_container=<num_files_per_mpc_container> --k_anonymity_threshold=<k_anonymity_threshold> [options]
pc-cli pre_validate --config=<config_file> [--dataset_id=<dataset_id>] --input_path=<input_path> [--timestamp=<timestamp> --attribution_rule=<attribution_rule> --aggregation_type=<aggregation_type> --concurrency=<concurrency> --num_files_per_mpc_container=<num_files_per_mpc_container> --k_anonymity_threshold=<k_anonymity_threshold>] [options]
pc-cli bolt_e2e --bolt_config=<bolt_config_file>
pc-cli secret_scrubber <secret_input_path> <scrubbed_output_path>
Options:
Expand Down Expand Up @@ -70,6 +71,10 @@
)
from fbpcs.private_computation.service.constants import FBPCS_BUNDLE_ID
from fbpcs.private_computation.service.pre_validate_service import PreValidateService
from fbpcs.private_computation.service.secret_scrubber import (
LoggingSecretScrubber,
SecretScrubber,
)
from fbpcs.private_computation.service.utils import transform_file_path
from fbpcs.private_computation.stage_flows.private_computation_base_stage_flow import (
PrivateComputationBaseStageFlow,
Expand Down Expand Up @@ -191,9 +196,14 @@ def main(argv: Optional[List[str]] = None) -> None:
"print_log_urls": bool,
"get_attribution_dataset_info": bool,
"bolt_e2e": bool,
"secret_scrubber": bool,
"<instance_id>": schema.Or(None, str),
"<instance_ids>": schema.Or(None, schema.Use(lambda arg: arg.split(","))),
"<study_id>": schema.Or(None, str),
"<secret_input_path>": schema.Or(
None, schema.And(schema.Use(PurePath), os.path.exists)
),
"<scrubbed_output_path>": schema.Or(None, str),
"--config": schema.Or(
None, schema.And(schema.Use(PurePath), os.path.exists)
),
Expand Down Expand Up @@ -268,9 +278,9 @@ def main(argv: Optional[List[str]] = None) -> None:
config = {}
if arguments["--config"]:
config = ConfigYamlDict.from_file(arguments["--config"])
# if no --config given and endpoint isn't bolt_e2e, raise exception
# bolt_e2e endpoint needs --bolt_config argument
elif not arguments["bolt_e2e"]:
# if no --config given and endpoint isn't bolt_e2e or secret_scrubber, raise
# exception. All other endpoints need --config.
elif not arguments["bolt_e2e"] and not arguments["secret_scrubber"]:
raise ValueError("--config is a required argument")

log_path = arguments["--log_path"]
Expand All @@ -284,8 +294,13 @@ def main(argv: Optional[List[str]] = None) -> None:
# contain PII.
level=logging.INFO,
handlers=[log_handler],
format="%(asctime)sZ %(levelname)s t:%(threadName)s n:%(name)s ! %(message)s",
)

log_format = "%(asctime)sZ %(levelname)s t:%(threadName)s n:%(name)s ! %(message)s"
log_scrubber = LoggingSecretScrubber(log_format)
for handler in logging.root.handlers:
handler.setFormatter(log_scrubber)

logger = logging.getLogger(__name__)
log_level = logging.DEBUG if arguments["--verbose"] else logging.INFO
logger.setLevel(log_level)
Expand Down Expand Up @@ -481,6 +496,16 @@ def main(argv: Optional[List[str]] = None) -> None:
raise RuntimeError(f"Jobs failed: {failed_job_names}")
else:
print("Jobs succeeded")
elif arguments["secret_scrubber"]:
with open(arguments["<secret_input_path>"]) as f:
file_content = f.read()

secret_scrubber = SecretScrubber()
scrub_summary = secret_scrubber.scrub(file_content)
scrubbed_output_path = arguments["<scrubbed_output_path>"]
with open(scrubbed_output_path, "w") as f:
f.write(scrub_summary.scrubbed_output)
print(scrub_summary.get_report())


if __name__ == "__main__":
Expand Down
5 changes: 2 additions & 3 deletions fbpcs/scripts/run_fbpcs.sh
Original file line number Diff line number Diff line change
Expand Up @@ -113,9 +113,8 @@ function parse_args() {
done

if [ -z ${real_config_path+x} ]; then
echo >&2 "--config=<path to config> not specified in coordinator command."
usage
exit 1
# hack to support PC-CLI endpoints that do not require --config
real_config_path=$(mktemp)
fi

# run_fbpcs.sh specific arguments
Expand Down

0 comments on commit 6f68f81

Please sign in to comment.