Skip to content

Commit e37a8dd

Browse files
committed
Sanitize content hashes in archive methods
Our content hashes are always hex digests, so we should reject any value that contains non-hex characters.
1 parent fc56a38 commit e37a8dd

File tree

8 files changed

+106
-12
lines changed

8 files changed

+106
-12
lines changed

servicelayer/archive/file.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
from normality import safe_filename
66

77
from servicelayer.archive.archive import Archive
8-
from servicelayer.archive.util import ensure_path, checksum, BUF_SIZE
8+
from servicelayer.archive.util import ensure_path, checksum, sanitize_checksum, BUF_SIZE
99
from servicelayer.archive.util import path_prefix, path_content_hash
1010

1111
log = logging.getLogger(__name__)
@@ -33,6 +33,8 @@ def archive_file(self, file_path, content_hash=None, mime_type=None):
3333
"""Import the given file into the archive."""
3434
if content_hash is None:
3535
content_hash = checksum(file_path)
36+
else:
37+
content_hash = sanitize_checksum(content_hash)
3638

3739
if content_hash is None:
3840
return
@@ -51,6 +53,7 @@ def archive_file(self, file_path, content_hash=None, mime_type=None):
5153
return content_hash
5254

5355
def load_file(self, content_hash, file_name=None, temp_path=None):
56+
content_hash = sanitize_checksum(content_hash)
5457
return self._locate_key(content_hash)
5558

5659
def list_files(self, prefix=None):
@@ -67,6 +70,7 @@ def list_files(self, prefix=None):
6770
yield path_content_hash(file_path)
6871

6972
def delete_file(self, content_hash):
73+
content_hash = sanitize_checksum(content_hash)
7074
prefix = path_prefix(content_hash)
7175
if prefix is None:
7276
return

servicelayer/archive/gs.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
from google.resumable_media.common import DataCorruption, InvalidResponse
1010

1111
from servicelayer.archive.virtual import VirtualArchive
12-
from servicelayer.archive.util import checksum, ensure_path
12+
from servicelayer.archive.util import checksum, sanitize_checksum, ensure_path
1313
from servicelayer.archive.util import path_prefix, ensure_posix_path
1414
from servicelayer.archive.util import path_content_hash, HASH_LENGTH
1515
from servicelayer.util import service_retries, backoff
@@ -89,6 +89,8 @@ def archive_file(self, file_path, content_hash=None, mime_type=None):
8989
file_path = ensure_path(file_path)
9090
if content_hash is None:
9191
content_hash = checksum(file_path)
92+
else:
93+
content_hash = sanitize_checksum(content_hash)
9294

9395
if content_hash is None:
9496
return
@@ -111,6 +113,7 @@ def archive_file(self, file_path, content_hash=None, mime_type=None):
111113
def load_file(self, content_hash, file_name=None, temp_path=None):
112114
"""Retrieve a file from Google storage and put it onto the local file
113115
system for further processing."""
116+
content_hash = sanitize_checksum(content_hash)
114117
for attempt in service_retries():
115118
try:
116119
blob = self._locate_contenthash(content_hash)
@@ -147,6 +150,7 @@ def delete_file(self, content_hash):
147150
"""Check if a file with the given hash exists on S3."""
148151
if content_hash is None or len(content_hash) < HASH_LENGTH:
149152
return
153+
content_hash = sanitize_checksum(content_hash)
150154
prefix = path_prefix(content_hash)
151155
if prefix is None:
152156
return

servicelayer/archive/s3.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55

66
from servicelayer import settings
77
from servicelayer.archive.virtual import VirtualArchive
8-
from servicelayer.archive.util import checksum, ensure_path
8+
from servicelayer.archive.util import checksum, sanitize_checksum, ensure_path
99
from servicelayer.archive.util import path_prefix, path_content_hash
1010

1111
log = logging.getLogger(__name__)
@@ -86,6 +86,8 @@ def archive_file(self, file_path, content_hash=None, mime_type=None):
8686
file_path = ensure_path(file_path)
8787
if content_hash is None:
8888
content_hash = checksum(file_path)
89+
else:
90+
content_hash = sanitize_checksum(content_hash)
8991

9092
# if content_hash is None:
9193
# return
@@ -105,6 +107,7 @@ def archive_file(self, file_path, content_hash=None, mime_type=None):
105107
def load_file(self, content_hash, file_name=None, temp_path=None):
106108
"""Retrieve a file from S3 storage and put it onto the local file
107109
system for further processing."""
110+
content_hash = sanitize_checksum(content_hash)
108111
key = self._locate_key(content_hash)
109112
if key is not None:
110113
path = self._local_path(content_hash, file_name, temp_path)
@@ -114,6 +117,7 @@ def load_file(self, content_hash, file_name=None, temp_path=None):
114117
def delete_file(self, content_hash):
115118
if content_hash is None:
116119
return
120+
content_hash = sanitize_checksum(content_hash)
117121
prefix = path_prefix(content_hash)
118122
if prefix is None:
119123
return

servicelayer/archive/util.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import os
2+
import string
23
from hashlib import sha1
34
from pathlib import Path
45

@@ -32,6 +33,18 @@ def checksum(file_name):
3233
return str(digest.hexdigest())
3334

3435

36+
def sanitize_checksum(checksum):
37+
"""Validate the checksum and return it unchanged. Raises a ValueError if the given checksum is empty or contains non-hex characters."""
38+
if not checksum:
39+
raise ValueError("Checksum is empty")
40+
41+
for char in checksum:
42+
if char not in string.hexdigits:
43+
raise ValueError(f'Checksum contains invalid character "{char}"')
44+
45+
return checksum
46+
47+
3548
def path_prefix(content_hash):
3649
"""Get a prefix for a content hashed folder structure."""
3750
if content_hash is None:

servicelayer/archive/virtual.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
from normality import safe_filename
66

77
from servicelayer.archive.archive import Archive
8-
from servicelayer.archive.util import ensure_path
8+
from servicelayer.archive.util import ensure_path, sanitize_checksum
99

1010
log = logging.getLogger(__name__)
1111

@@ -33,6 +33,7 @@ def cleanup_file(self, content_hash, temp_path=None):
3333
"""Delete the local cached version of the file."""
3434
if content_hash is None:
3535
return
36+
content_hash = sanitize_checksum(content_hash)
3637
path = self._get_local_prefix(content_hash, temp_path=temp_path)
3738
try:
3839
shutil.rmtree(path, ignore_errors=True)

tests/archive/test_file.py

Lines changed: 24 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import pytest
12
import shutil
23
import tempfile
34
from unittest import TestCase
@@ -26,9 +27,11 @@ def test_basic_archive(self):
2627
assert out == out2, (out, out2)
2728

2829
def test_basic_archive_with_checksum(self):
29-
checksum_ = "banana"
30-
out = self.archive.archive_file(self.file, checksum_)
31-
assert checksum_ == out, (checksum_, out)
30+
with pytest.raises(ValueError):
31+
self.archive.archive_file(self.file, content_hash="banana")
32+
33+
out = self.archive.archive_file(self.file, content_hash="01234567890abcdef")
34+
assert out == "01234567890abcdef"
3235

3336
def test_generate_url(self):
3437
out = self.archive.archive_file(self.file)
@@ -39,6 +42,15 @@ def test_publish(self):
3942
assert not self.archive.can_publish
4043

4144
def test_load_file(self):
45+
# Invalid content hash
46+
with pytest.raises(ValueError):
47+
self.archive.load_file("banana")
48+
49+
# Valid content hash, but file does not exist
50+
path = self.archive.load_file("01234567890abcdef")
51+
assert path is None
52+
53+
# Valid content hash, file exists
4254
out = self.archive.archive_file(self.file)
4355
path = self.archive.load_file(out)
4456
assert path is not None, path
@@ -64,10 +76,17 @@ def test_list_files(self):
6476
assert len(keys) == 0, keys
6577

6678
def test_delete_file(self):
79+
# Invalid content hash
80+
with pytest.raises(ValueError):
81+
self.archive.delete_file("banana")
82+
83+
# File does not exist
84+
assert self.archive.delete_file("01234567890abcdef") is None
85+
86+
# Valid content hash, file exists
6787
out = self.archive.archive_file(self.file)
6888
path = self.archive.load_file(out)
6989
assert path is not None, path
70-
self.archive.cleanup_file(out)
71-
self.archive.delete_file(out)
90+
assert self.archive.delete_file(out) is None
7291
path = self.archive.load_file(out)
7392
assert path is None, path

tests/archive/test_s3.py

Lines changed: 31 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import pytest
12
from unittest import TestCase
23
from urllib.parse import urlparse, parse_qs
34

@@ -26,9 +27,11 @@ def test_basic_archive(self):
2627
assert out == out2, (out, out2)
2728

2829
def test_basic_archive_with_checksum(self):
29-
checksum_ = "banana"
30-
out = self.archive.archive_file(self.file, checksum_)
31-
assert checksum_ == out, (checksum_, out)
30+
with pytest.raises(ValueError):
31+
self.archive.archive_file(self.file, content_hash="banana")
32+
33+
out = self.archive.archive_file(self.file, content_hash="01234567890abcdef")
34+
assert out == "01234567890abcdef"
3235

3336
def test_generate_url(self):
3437
content_hash = self.archive.archive_file(self.file)
@@ -60,12 +63,29 @@ def test_publish_file(self):
6063
assert "https://foo.s3.amazonaws.com/self.py" in url, url
6164

6265
def test_load_file(self):
66+
# Invalid content hash
67+
with pytest.raises(ValueError):
68+
self.archive.load_file("banana")
69+
70+
# Valid content hash, but file does not exist
71+
path = self.archive.load_file("01234567890abcdef")
72+
assert path is None
73+
74+
# Valid content hash, file exists
6375
out = self.archive.archive_file(self.file)
6476
path = self.archive.load_file(out)
6577
assert path is not None, path
6678
assert path.is_file(), path
6779

6880
def test_cleanup_file(self):
81+
# Invalid content hash
82+
with pytest.raises(ValueError):
83+
self.archive.cleanup_file("banana")
84+
85+
# File does not exist
86+
assert self.archive.cleanup_file("01234567890abcdef") is None
87+
88+
# Valid content hash, file exists
6989
out = self.archive.archive_file(self.file)
7090
self.archive.cleanup_file(out)
7191
path = self.archive.load_file(out)
@@ -86,6 +106,14 @@ def test_list_files(self):
86106
assert len(keys) == 0, keys
87107

88108
def test_delete_file(self):
109+
# Invalid content hash
110+
with pytest.raises(ValueError):
111+
self.archive.delete_file("banana")
112+
113+
# File does not exist
114+
assert self.archive.delete_file("01234567890abcdef") is None
115+
116+
# Valid content hash, file exists
89117
out = self.archive.archive_file(self.file)
90118
path = self.archive.load_file(out)
91119
assert path is not None, path

tests/archive/test_util.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
import pytest
2+
from unittest import TestCase
3+
4+
from servicelayer.archive.util import sanitize_checksum
5+
6+
7+
class UtilTest(TestCase):
8+
def test_sanitize_checksum(self):
9+
assert sanitize_checksum("0123456789abcdef") == "0123456789abcdef"
10+
11+
with pytest.raises(ValueError, match="Checksum is empty"):
12+
sanitize_checksum(None)
13+
14+
with pytest.raises(ValueError, match="Checksum is empty"):
15+
sanitize_checksum("")
16+
17+
with pytest.raises(ValueError, match='Checksum contains invalid character "n"'):
18+
sanitize_checksum("banana")
19+
20+
with pytest.raises(ValueError, match='Checksum contains invalid character "/"'):
21+
sanitize_checksum("/")

0 commit comments

Comments
 (0)