Skip to content

Commit e128deb

Browse files
authored
New commands: azdev scan/mask for secrets scanning and redaction (#460)
* add scan/mask command * code implementation * refine code and add tests * pylint * flake8 * address comments
1 parent 4f5a0b6 commit e128deb

File tree

12 files changed

+515
-2
lines changed

12 files changed

+515
-2
lines changed

.flake8

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,9 @@ ignore =
99
C901,
1010
W503,
1111
W504
12+
per-file-ignores =
13+
azdev/help.py:W605
14+
azdev/operations/tests/test_scan_and_mask.py:W605
1215

1316
exclude =
1417
mod_templates

HISTORY.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,10 @@
22
33
Release History
44
===============
5+
0.1.74
6+
++++++
7+
* `azdev scan/mask`: New commands for scanning and masking secrets for files or string
8+
59
0.1.73
610
++++++
711
* `azdev command-change meta-export`: Add `has_completer` to denote whether completer is configured in arg

azdev/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,4 +4,4 @@
44
# license information.
55
# -----------------------------------------------------------------------------
66

7-
__VERSION__ = '0.1.73'
7+
__VERSION__ = '0.1.74'

azdev/commands.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
from .transformers import performance_benchmark_data_transformer
1010

1111

12+
# pylint: disable=too-many-statements
1213
def load_command_table(self, _):
1314

1415
def operation_group(name):
@@ -27,6 +28,10 @@ def operation_group(name):
2728
with CommandGroup(self, '', operation_group('linter')) as g:
2829
g.command('linter', 'run_linter')
2930

31+
with CommandGroup(self, '', operation_group('secret')) as g:
32+
g.command('scan', 'scan_secrets')
33+
g.command('mask', 'mask_secrets')
34+
3035
with CommandGroup(self, 'statistics', operation_group('statistics')) as g:
3136
g.command('list-command-table', 'list_command_table')
3237
g.command('diff-command-tables', 'diff_command_tables')

azdev/help.py

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
# -----------------------------------------------------------------------------
66

77
from knack.help_files import helps
8+
# pylint: disable=line-too-long, anomalous-backslash-in-string
89

910

1011
helps[''] = """
@@ -159,6 +160,51 @@
159160
text: azdev linter --repo azure-cli --tgt upstream/master --src upstream/dev
160161
"""
161162

163+
helps['scan'] = """
164+
short-summary: Scan secrets for files or string
165+
long-summary: Check built-in scanning rules at https://github.com/microsoft/security-utilities/blob/main/GeneratedRegexPatterns/PreciselyClassifiedSecurityKeys.json
166+
examples:
167+
- name: Scan secrets for a single file with custom patterns
168+
text: |
169+
azdev scan --file-path my_file.yaml --custom-pattern my_pattern.json
170+
("my_pattern.json" contains the following content)
171+
{
172+
"Include": [
173+
{
174+
"Pattern": "(?<refine>[\w.%#+-]+)(%40|@)([a-z0-9.-]*.[a-z]{2,})",
175+
"Name": "EmailAddress",
176+
"Signatures": ["%40", "@"]
177+
},
178+
{
179+
"Pattern": "(?<refine>[0-9a-f]{8}-?[0-9a-f]{4}-?[0-9a-f]{4}-?[0-9a-f]{4}-?[0-9a-f]{12})",
180+
"Name": "GUID"
181+
}
182+
],
183+
"Exclude": [
184+
{
185+
"Id": "SEC101/156",
186+
"Name": "AadClientAppIdentifiableCredentials",
187+
}
188+
]
189+
}
190+
- name: Scan secrets for raw string and save results to file
191+
text: |
192+
azdev scan --data "my string waiting to be scanned" --save-scan-result True
193+
- name: Recursively scan secrets for a directory and save results to specific file
194+
text: |
195+
azdev scan --directory-path /path/to/my/folder --recursive --scan-result-path /path/to/scan_result.json
196+
"""
197+
198+
helps['mask'] = """
199+
short-summary: Mask secrets for files or string
200+
long-summary: |
201+
Redaction type 'FIXED_VALUE' will mask all secrets with '***'.
202+
Redaction type 'FIXED_LENGTH' will mask secrets with several '*'s which will keep the original secret length.
203+
Redaction type 'SECRET_NAME' will mask secrets with their secret name (type).
204+
Redaction type 'CUSTOM' will mask secrets with 'redaction_token' value you specify through saved scan result file.
205+
Check built-in scanning rules at https://github.com/microsoft/security-utilities/blob/main/GeneratedRegexPatterns/PreciselyClassifiedSecurityKeys.json
206+
"""
207+
162208
helps['statistics'] = """
163209
short-summary: Commands for CLI modules statistics.
164210
"""

azdev/operations/secret.py

Lines changed: 247 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,247 @@
1+
# -----------------------------------------------------------------------------
2+
# Copyright (c) Microsoft Corporation. All rights reserved.
3+
# Licensed under the MIT License. See License.txt in the project root for
4+
# license information.
5+
# -----------------------------------------------------------------------------
6+
7+
import os
8+
import json
9+
from json.decoder import JSONDecodeError
10+
from knack.log import get_logger
11+
from microsoft_security_utilities_secret_masker import (load_regex_patterns_from_json_file,
12+
load_regex_pattern_from_json,
13+
SecretMasker)
14+
logger = get_logger(__name__)
15+
16+
17+
def _validate_data_path(file_path=None, directory_path=None, data=None):
    """Check that exactly one scan target is supplied and that any path exists.

    :param file_path: Path of a single file, or a falsy value.
    :param directory_path: Path of a directory, or a falsy value.
    :param data: Raw string, or a falsy value.
    :raises ValueError: If two targets are combined, if none is given, or if
        the given file/directory path does not exist on disk.
    """
    # Pairwise conflicts, checked in the same order as the messages are listed.
    conflicts = (
        (file_path and directory_path,
         'Can not specify file path and directory path at the same time'),
        (file_path and data,
         'Can not specify file path and raw string at the same time'),
        (directory_path and data,
         'Can not specify directory path and raw string at the same time'),
    )
    for clash, message in conflicts:
        if clash:
            raise ValueError(message)

    if not (file_path or directory_path or data):
        raise ValueError('No file path or directory path or raw string provided')

    # At this point at most one of the two paths is set.
    if directory_path and not os.path.isdir(directory_path):
        raise ValueError(f'invalid directory path:{directory_path}')
    if file_path and not os.path.isfile(file_path):
        raise ValueError(f'invalid file path:{file_path}')
32+
33+
def _load_built_in_regex_patterns():
34+
return load_regex_patterns_from_json_file('PreciselyClassifiedSecurityKeys.json')
35+
36+
37+
def _load_regex_patterns(custom_pattern=None):
38+
built_in_regex_patterns = _load_built_in_regex_patterns()
39+
40+
if not custom_pattern:
41+
return built_in_regex_patterns
42+
43+
try:
44+
if os.path.isfile(custom_pattern):
45+
custom_pattern = json.load(custom_pattern)
46+
else:
47+
custom_pattern = json.loads(custom_pattern)
48+
except JSONDecodeError as err:
49+
raise ValueError(f'Custom pattern should be in valid json format, err:{err.msg}')
50+
51+
regex_patterns = []
52+
if 'Include' in custom_pattern:
53+
for pattern in custom_pattern['Include']:
54+
if not pattern.get('Pattern', None):
55+
raise ValueError(f'Invalid Custom Pattern: {pattern}, '
56+
f'"Pattern" property is required for Include patterns')
57+
regex_patterns.append(load_regex_pattern_from_json(pattern))
58+
if "Exclude" in custom_pattern:
59+
exclude_pattern_ids = []
60+
for pattern in custom_pattern['Exclude']:
61+
if not pattern.get('Id', None):
62+
raise ValueError(f'Invalid Custom Pattern: {pattern}, "Id" property is required for Exclude patterns')
63+
exclude_pattern_ids.append(pattern['Id'])
64+
for pattern in built_in_regex_patterns:
65+
if pattern.id in exclude_pattern_ids:
66+
continue
67+
regex_patterns.append(pattern)
68+
else:
69+
regex_patterns.extend(built_in_regex_patterns)
70+
return regex_patterns
71+
72+
73+
def _scan_secrets_for_string(data, custom_pattern=None):
    """Detect secrets in a string and describe each hit as a plain dict.

    :param data: Text to scan; a falsy value short-circuits to ``None``.
    :param custom_pattern: Forwarded unchanged to ``_load_regex_patterns``.
    :return: ``None`` for falsy ``data``; otherwise a list with one dict per
        detection holding ``secret_name``, ``secret_value`` (the matched slice
        of ``data``), ``secret_index`` (``[start, end]``) and ``redaction_token``.
    """
    if not data:
        return None

    masker = SecretMasker(_load_regex_patterns(custom_pattern))
    return [
        {
            'secret_name': detection.name,
            'secret_value': data[detection.start:detection.end],
            'secret_index': [detection.start, detection.end],
            'redaction_token': detection.redaction_token,
        }
        for detection in masker.detect_secrets(data)
    ]
89+
90+
91+
def scan_secrets(file_path=None, directory_path=None, recursive=False, data=None,
                 save_scan_result=None, scan_result_path=None, custom_pattern=None):
    """Scan a file, the files of a directory, or a raw string for secrets.

    Exactly one of ``file_path``, ``directory_path`` and ``data`` must be given.

    :param file_path: Path of a single file to scan.
    :param directory_path: Path of a directory whose files are scanned.
    :param recursive: When scanning a directory, also walk its sub-directories.
    :param data: Raw string to scan.
    :param save_scan_result: Write the results to a JSON file instead of returning them.
    :param scan_result_path: File the results are written to; implies ``save_scan_result``.
    :param custom_pattern: Custom pattern JSON (string or file path), forwarded to the scanner.
    :return: ``{'secrets_detected', 'scan_results'}`` when results are not saved,
        otherwise ``{'secrets_detected', 'scan_result_path'}`` where the path is
        ``None`` if nothing was detected. Results are keyed by absolute file
        path, or by ``'raw_data'`` for a raw string.
    :raises ValueError: If the target arguments are missing, conflicting or invalid.
    """
    _validate_data_path(file_path=file_path, directory_path=directory_path, data=data)
    target_files = []
    scan_results = {}
    if directory_path:
        directory_path = os.path.abspath(directory_path)
        if recursive:
            for root, _, files in os.walk(directory_path):
                target_files.extend(os.path.join(root, file) for file in files)
        else:
            for file in os.listdir(directory_path):
                file = os.path.join(directory_path, file)
                if os.path.isfile(file):
                    target_files.append(file)
    if file_path:
        file_path = os.path.abspath(file_path)
        target_files.append(file_path)

    if data:
        secrets = _scan_secrets_for_string(data, custom_pattern)
        if secrets:
            scan_results['raw_data'] = secrets
    else:
        for target_file in target_files:
            logger.debug('start scanning secrets for %s', target_file)
            try:
                with open(target_file) as f:
                    content = f.read()
            except UnicodeDecodeError:
                # A binary / non-text file must not abort the scan of the remaining files.
                logger.warning('skip %s, it can not be decoded as text', target_file)
                continue
            if not content:
                continue
            secrets = _scan_secrets_for_string(content, custom_pattern)
            logger.debug('%d secrets found for %s', len(secrets), target_file)
            if secrets:
                scan_results[target_file] = secrets

    if scan_result_path:
        save_scan_result = True
    if not save_scan_result:
        return {
            'secrets_detected': bool(scan_results),
            'scan_results': scan_results
        }

    if not scan_results:
        return {'secrets_detected': False, 'scan_result_path': None}

    if not scan_result_path:
        from azdev.utilities.config import get_azdev_config_dir
        from datetime import datetime
        file_folder = os.path.join(get_azdev_config_dir(), 'scan_results')
        # makedirs also creates a missing parent and tolerates an existing folder
        os.makedirs(file_folder, mode=0o755, exist_ok=True)
        # file_path / directory_path are absolute here; path separators (and ':' on
        # Windows) must not leak into the file name, so keep only safe characters.
        source_name = file_path or directory_path or ''
        safe_name = ''.join(ch if ch.isalnum() or ch in '-_' else '_' for ch in source_name).strip('_')
        # keep the tail of very long paths so the name stays within file system limits
        safe_name = safe_name[-200:] or datetime.now().strftime('%Y%m%d%H%M%S')
        result_file_name = 'scan_result_' + safe_name + '.json'
        scan_result_path = os.path.join(file_folder, result_file_name)

    with open(scan_result_path, 'w') as f:
        json.dump(scan_results, f)
    logger.debug('store scanning results in %s', scan_result_path)
    return {'secrets_detected': True, 'scan_result_path': os.path.abspath(scan_result_path)}
151+
152+
153+
def _get_scan_results_from_saved_file(saved_scan_result_path,
                                      file_path=None, directory_path=None, recursive=False, data=None):
    """Load a saved scan result file and keep the entries for the requested target.

    :param saved_scan_result_path: Path of a JSON file holding saved scan results.
    :param file_path: Keep only the entry of this file.
    :param directory_path: Keep the entries of files located in this directory.
    :param recursive: With ``directory_path``, also match files in sub-directories.
    :param data: When set (raw string target), keep the ``'raw_data'`` entry.
    :return: Dict with the matching subset of the saved results (possibly empty).
    :raises ValueError: If the saved result path is not a file, or the target
        arguments are missing, conflicting or invalid.
    """
    scan_results = {}
    if not os.path.isfile(saved_scan_result_path):
        raise ValueError(f'invalid saved scan result path:{saved_scan_result_path}')
    with open(saved_scan_result_path) as f:
        saved_scan_results = json.load(f)
    # filter saved scan results to keep those related with specified file(s)
    _validate_data_path(file_path=file_path, directory_path=directory_path, data=data)
    if file_path:
        file_path = os.path.abspath(file_path)
        if file_path in saved_scan_results:
            scan_results[file_path] = saved_scan_results[file_path]
    elif directory_path:
        # File entries looked up here are matched by absolute path (same as the
        # file_path branch above), so a relative directory must be normalized first.
        directory_path = os.path.abspath(directory_path)
        if recursive:
            candidates = (os.path.join(root, file)
                          for root, _, files in os.walk(directory_path)
                          for file in files)
        else:
            candidates = (os.path.join(directory_path, file)
                          for file in os.listdir(directory_path))
        for file_full in candidates:
            if file_full in saved_scan_results:
                scan_results[file_full] = saved_scan_results[file_full]
    elif 'raw_data' in saved_scan_results:
        # a saved file without raw string results simply yields no matches
        scan_results['raw_data'] = saved_scan_results['raw_data']

    return scan_results
182+
183+
184+
def _mask_secret_for_string(data, secret, redaction_type=None):
    """Replace every occurrence of one secret in ``data`` with its mask.

    :param data: Text to mask.
    :param secret: Dict with ``secret_value`` and, depending on the redaction
        type, ``secret_name`` or ``redaction_token``.
    :param redaction_type: ``'FIXED_VALUE'`` masks with ``'***'``,
        ``'FIXED_LENGTH'`` with as many ``'*'`` as the secret is long,
        ``'SECRET_NAME'`` with the secret's name; any other value masks with
        the secret's ``redaction_token``.
    :return: The masked text.
    """
    secret_value = secret['secret_value']
    if not secret_value:
        # str.replace('', x) would insert x between every character of data
        return data

    if redaction_type == 'FIXED_VALUE':
        replacement = '***'
    elif redaction_type == 'FIXED_LENGTH':
        replacement = '*' * len(secret_value)
    elif redaction_type == 'SECRET_NAME':
        replacement = secret['secret_name']
    else:
        replacement = secret['redaction_token']
    return data.replace(secret_value, replacement)
194+
195+
196+
def mask_secrets(file_path=None, directory_path=None, recursive=False, data=None,
                 save_scan_result=None, scan_result_path=None, custom_pattern=None,
                 saved_scan_result_path=None, redaction_type='FIXED_VALUE', yes=None):
    """Mask secrets in a file, the files of a directory, or a raw string.

    Secrets come either from a previously saved scan result file or from a
    fresh scan of the target. Files are rewritten in place; a raw string is
    returned masked in the result.

    :param file_path: Path of a single file to mask.
    :param directory_path: Path of a directory whose files are masked.
    :param recursive: With ``directory_path``, also handle sub-directories.
    :param data: Raw string to mask.
    :param save_scan_result: Forwarded to ``scan_secrets``.
    :param scan_result_path: Forwarded to ``scan_secrets``.
    :param custom_pattern: Forwarded to ``scan_secrets``.
    :param saved_scan_result_path: Saved scan result file to use instead of scanning.
    :param redaction_type: How each secret is replaced, see ``_mask_secret_for_string``.
    :param yes: Skip the confirmation prompt.
    :return: Dict with ``mask`` (whether anything was masked), ``data`` (the
        masked raw string, if any) and the ``file_path``, ``directory_path``
        and ``recursive`` arguments.
    """
    scan_results = {}
    if saved_scan_result_path:
        scan_results = _get_scan_results_from_saved_file(saved_scan_result_path, file_path=file_path,
                                                         directory_path=directory_path, recursive=recursive, data=data)
    else:
        scan_response = scan_secrets(file_path=file_path, directory_path=directory_path, recursive=recursive, data=data,
                                     save_scan_result=save_scan_result, scan_result_path=scan_result_path,
                                     custom_pattern=custom_pattern)
        # scan_secrets returns either inline results or the path they were saved to.
        # Decide by the response shape, not by save_scan_result: passing only
        # scan_result_path also makes scan_secrets save the results.
        if 'scan_results' in scan_response:
            scan_results = scan_response['scan_results']
        elif scan_response.get('scan_result_path'):
            with open(scan_response['scan_result_path']) as f:
                scan_results = json.load(f)

    mask_result = {
        'mask': False,
        'data': data,
        'file_path': file_path,
        'directory_path': directory_path,
        'recursive': recursive
    }
    if not scan_results:
        logger.warning('No secrets detected, finish directly.')
        return mask_result
    for scan_file_path, secrets in scan_results.items():
        logger.warning('Will mask %d secrets for %s', len(secrets), scan_file_path)
    if not yes:
        from knack.prompting import prompt_y_n
        if not prompt_y_n(f'Do you want to continue with redaction type {redaction_type}?'):
            return mask_result

    if 'raw_data' in scan_results:
        # raw string target: mask in memory and hand the result back
        for secret in scan_results['raw_data']:
            data = _mask_secret_for_string(data, secret, redaction_type)
        mask_result['mask'] = True
        mask_result['data'] = data
        return mask_result

    # file targets: rewrite every file that has detected secrets
    for scan_file_path, secrets in scan_results.items():
        with open(scan_file_path, 'r') as f:
            content = f.read()
        if not content:
            continue
        for secret in secrets:
            content = _mask_secret_for_string(content, secret, redaction_type)
        with open(scan_file_path, 'w') as f:
            f.write(content)
    mask_result['mask'] = True
    return mask_result
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
This is a test string with email fooabc@gmail.com.
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
This is a test string without any secrets.
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
{
2+
"id": "e1fe6dd8-ba31-4d61-89e7-88639da4683d",
3+
"sas": "sv=2022-11-02&sr=c&sig=a9Y5mpQgKUiiPzHFNdDm53Na6UndTrNMCsRZd6b2oV4%3D",
4+
"detail": {
5+
"email": "fooabc@gmail.com"
6+
}
7+
}

0 commit comments

Comments
 (0)