Skip to content

Commit db1ce4e

Browse files
committed
Ingester storage conditional put updates with tests.
1 parent f806d07 commit db1ce4e

File tree

3 files changed

+123
-84
lines changed

3 files changed

+123
-84
lines changed

ingester/datalake_ingester/storage.py

Lines changed: 56 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -12,26 +12,26 @@
1212
# License for the specific language governing permissions and limitations under
1313
# the License.
1414

15-
import boto3
16-
from boto3.dynamodb.conditions import Attr
1715

1816
from memoized_property import memoized_property
1917
import boto.dynamodb2
2018
from boto.dynamodb2.table import Table
2119
from boto.dynamodb2.exceptions import ConditionalCheckFailedException
2220
import os
2321
from datalake.common.errors import InsufficientConfiguration
22+
import logging
2423

2524

2625
class DynamoDBStorage(object):
2726
'''store datalake records in a dynamoDB table'''
2827

29-
def __init__(self, table_name, connection=None):
28+
def __init__(self, table_name=None, latest_table=None, connection=None):
3029
self.table_name = table_name
3130
self.latest_table_name = os.environ.get("DATALAKE_DNAMODB_LATEST_TABLE",
32-
f"{self.table_name}-latest")
31+
f"{latest_table}")
3332
self.use_latest = os.environ.get("DATALAKE_LATEST_FLAG", False)
3433
self._prepare_connection(connection)
34+
self.logger = logging.getLogger('storage')
3535

3636
@classmethod
3737
def from_config(cls):
@@ -56,8 +56,7 @@ def _table(self):
5656

5757
@memoized_property
5858
def _latest_table(self):
59-
dynamodb = boto3.resource('dynamodb')
60-
return dynamodb.Table(self.latest_table_name, connection=self._connection)
59+
return Table(self.latest_table_name, connection=self._connection)
6160

6261
def store(self, record):
6362
try:
@@ -70,12 +69,57 @@ def update(self, record):
7069
self._table.put_item(data=record, overwrite=True)
7170

7271
def store_latest(self, record):
73-
item_attrs = {'time_index_key': record['time_index_key'],
74-
'range_key': record['range_key']}
75-
condition = (Attr('metadata.start').lt(record['metadata']['start']))
72+
"""
73+
note: Record must utilize AttributeValue syntax
74+
for the conditional put.
75+
"""
76+
record = {
77+
'time_index_key': {"S": record['time_index_key']},
78+
'range_key': {"S": record['range_key']},
79+
'metadata': {
80+
'M': {
81+
'start': {
82+
'N': str(record['metadata']['start'])
83+
},
84+
'end': {
85+
'N': str(record['metadata']['end'])
86+
},
87+
'id': {
88+
'S': str(record['metadata']['id'])
89+
},
90+
'path': {
91+
'S': str(record['metadata']['path'])
92+
},
93+
'hash': {
94+
'S': str(record['metadata']['hash'])
95+
},
96+
'version': {
97+
'N': str(record['metadata']['version'])
98+
},
99+
'what': {
100+
'S': str(record['metadata']['what'])
101+
},
102+
'where': {
103+
'S': str(record['metadata']['where'])
104+
},
105+
'work_id': {
106+
'S': str(record['metadata']['work_id'])
107+
}
108+
}
109+
},
110+
'url': {"S": record['url']},
111+
'create_time': {'N': str(record['create_time'])}
112+
}
76113
try:
77-
self._latest_table.put_item(item_attrs,
78-
condition)
114+
self._connection.put_item(
115+
table_name=self.latest_table_name,
116+
item=record,
117+
condition_expression=\
118+
f"attribute_not_exists(metadata.M.start.N) OR metadata.M.start.N < {record['metadata']['M']['start']['N']}",
119+
)
120+
self.logger.info("Record stored successfully.")
79121
except ConditionalCheckFailedException:
80-
pass
122+
self.logger.error("Condition not met, no operation was performed.")
123+
except Exception as e:
124+
self.logger.error(f"Error occurred: {str(e)}")
81125

ingester/tests/conftest.py

Lines changed: 3 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -14,31 +14,17 @@
1414
import boto.sns
1515
import boto.sqs
1616

17-
18-
import boto3
19-
from boto3 import client
20-
from botocore.exceptions import ClientError
21-
2217
from datalake.tests import * # noqa
2318

2419
from datalake_ingester import SQSQueue
2520

26-
import logging
27-
logging.basicConfig(level=logging.DEBUG)
2821

2922
@pytest.fixture
3023
def dynamodb_connection(aws_connector):
3124
return aws_connector(mock_dynamodb2,
3225
lambda: boto.dynamodb2.connect_to_region('us-west-1'))
3326

3427

35-
@pytest.fixture
36-
def dynamodb_latest_connection(aws_connector):
37-
return aws_connector(mock_dynamodb2,
38-
lambda: boto3.resource('dynamodb',
39-
region_name='us-west-1'))
40-
41-
4228
def _delete_table_if_exists(conn, name):
4329
try:
4430
table = Table(name, connection=conn)
@@ -47,16 +33,6 @@ def _delete_table_if_exists(conn, name):
4733
if e.status == 400 and e.error_code == 'ResourceNotFoundException':
4834
return
4935
raise e
50-
51-
def _delete_latest_if_exists(dynamodb, name):
52-
try:
53-
table = dynamodb.Table(name)
54-
table.delete()
55-
table.wait_until_not_exists()
56-
except ClientError as e:
57-
if e.response['Error']['Code'] == 'ResourceNotFoundException':
58-
return
59-
raise e
6036

6137

6238
@pytest.fixture
@@ -79,31 +55,6 @@ def tear_down():
7955
return table_maker
8056

8157

82-
@pytest.fixture
83-
def dynamodb_latest_table_maker(request, dynamodb_latest_connection):
84-
85-
def table_maker(name, key_schema, attributes):
86-
_delete_latest_if_exists(dynamodb_latest_connection, name)
87-
table = dynamodb_latest_connection.create_table(
88-
TableName=name,
89-
KeySchema=key_schema,
90-
AttributeDefinitions=attributes,
91-
ProvisionedThroughput={
92-
'ReadCapacityUnits': 5,
93-
'WriteCapacityUnits': 5
94-
}
95-
)
96-
table.wait_until_exists()
97-
98-
def tear_down():
99-
table.delete()
100-
table.wait_until_not_exists()
101-
102-
request.addfinalizer(tear_down)
103-
return table
104-
105-
return table_maker
106-
10758

10859
@pytest.fixture
10960
def dynamodb_users_table(dynamodb_table_maker):
@@ -118,16 +69,9 @@ def dynamodb_records_table(dynamodb_table_maker):
11869

11970

12071
@pytest.fixture
121-
def dynamodb_latest_table(dynamodb_latest_table_maker):
122-
schema = [
123-
{'AttributeName': 'time_index_key', 'KeyType': 'HASH'},
124-
{'AttributeName': 'range_key', 'KeyType': 'RANGE'}
125-
]
126-
attributes = [
127-
{'AttributeName': 'time_index_key', 'AttributeType': 'S'},
128-
{'AttributeName': 'range_key', 'AttributeType': 'S'}
129-
]
130-
return dynamodb_latest_table_maker('latest', schema, attributes)
72+
def dynamodb_latest_table(dynamodb_table_maker):
73+
schema = [HashKey('time_index_key'), RangeKey('range_key')]
74+
return dynamodb_table_maker('latest', schema)
13175

13276

13377
@pytest.fixture

ingester/tests/test_storage.py

Lines changed: 64 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,47 @@
11
from datalake_ingester import DynamoDBStorage
22
from decimal import Decimal
3-
import boto3
3+
4+
5+
def test_dynamodb_store(dynamodb_users_table, dynamodb_connection):
6+
storage = DynamoDBStorage('users', connection=dynamodb_connection)
7+
expected_user = {'name': 'John', 'last_name': 'Muir'}
8+
storage.store(expected_user)
9+
user = dict(dynamodb_users_table.get_item(name='John', last_name='Muir'))
10+
assert dict(user) == expected_user
11+
12+
def test_store_duplicate(dynamodb_users_table, dynamodb_connection):
13+
storage = DynamoDBStorage('users', connection=dynamodb_connection)
14+
expected_user = {'name': 'Vanilla', 'last_name': 'Ice'}
15+
storage.store(expected_user)
16+
storage.store(expected_user)
17+
user = dict(dynamodb_users_table.get_item(name='Vanilla', last_name='Ice'))
18+
assert dict(user) == expected_user
419

520
def test_insert_new_record(dynamodb_latest_table, dynamodb_connection):
6-
boto3.setup_default_session(fake_credentials=True)
7-
storage = DynamoDBStorage('latest', connection=dynamodb_connection)
21+
storage = DynamoDBStorage(latest_table='latest', connection=dynamodb_connection)
22+
823
new_record = {
924
'time_index_key': '15225:newlog',
1025
'range_key': 'new_server:12345abcde',
1126
'metadata': {
12-
'start': 1500000000000
27+
'version': 1,
28+
'start': 1500000000000,
29+
'end': 1500000000010,
30+
'path': '/var/log/syslog.2',
31+
'work_id': None,
32+
'where': 'ground_server2',
33+
'what': 'syslog',
34+
'id': '34fb2d1ec54245c7a57e29ed5a6ea9b2',
35+
'hash': 'b4f2d8de24af342643d5b78a8f2b9b88'
1336
},
1437
'url': 's3://newfile/url',
1538
'create_time': 1500000000000
1639
}
1740

18-
storage.store_latest(new_record)
41+
try:
42+
storage.store_latest(new_record)
43+
except Exception as e:
44+
print(f"Failed to store record: {str(e)}")
1945

2046
stored_record = dynamodb_latest_table.get_item(
2147
time_index_key='15225:newlog',
@@ -25,13 +51,21 @@ def test_insert_new_record(dynamodb_latest_table, dynamodb_connection):
2551

2652

2753
def test_store_conditional_put_latest_multiple_files(dynamodb_latest_table, dynamodb_connection):
28-
storage = DynamoDBStorage('latest', connection=dynamodb_connection)
54+
storage = DynamoDBStorage(latest_table='latest', connection=dynamodb_connection)
2955

3056
file1 = {
3157
'time_index_key': '15219:zlcdzvawsp',
3258
'range_key': 'lawvuunyws:447a4a801cabc6089f04922abdfa8aad099824e9',
3359
'metadata': {
34-
'start': 1314877177402
60+
'version': 1,
61+
'start': 1314877177402,
62+
'end': 1314877177412, # ends ten milliseconds later
63+
'path': '/var/log/syslog.2',
64+
'work_id': 'abc-123',
65+
'where': 'ground_server2',
66+
'what': 'syslog',
67+
'id': '34fb2d1ec54245c7a57e29ed5a6ea9b2',
68+
'hash': 'b4f2d8de24af342643d5b78a8f2b9b88'
3569
},
3670
'url': 's3://existingfile/url',
3771
'create_time': 1314877177402
@@ -41,7 +75,16 @@ def test_store_conditional_put_latest_multiple_files(dynamodb_latest_table, dyna
4175
'time_index_key': '15219:zlcdzvawsp',
4276
'range_key': 'lawvuunyws:447a4a801cabc6089f04922abdfa8aad099824e9',
4377
'metadata': {
44-
'start': 1314877177403 # One millisecond later
78+
'version': 1,
79+
'start': 1314877177413, # starts one millisecond after file1 ends
80+
'end': 1314877177423, # ends ten milliseconds later
81+
'path': '/var/log/syslog.2',
82+
'work_id': 'abc-123',
83+
'where': 'ground_server2',
84+
'what': 'syslog',
85+
'id': '45gb2d1ec54245c7a57e29ed5a6ea9b2',
86+
'hash': 'c5g3d8de24af342643d5b78a8f2b9b88'
87+
4588
},
4689
'url': 's3://existingfile/url',
4790
'create_time': 1314877177403
@@ -55,7 +98,7 @@ def test_store_conditional_put_latest_multiple_files(dynamodb_latest_table, dyna
5598
'start': 1414877177402,
5699
'end': 1415128740728,
57100
'path': '/var/log/syslog.2',
58-
'work_id': None,
101+
'work_id': 'foo-bizz',
59102
'where': 'ground_server2',
60103
'what': 'syslog',
61104
'id': '34fb2d1ec54245c7a57e29ed5a6ea9b2',
@@ -74,24 +117,32 @@ def test_store_conditional_put_latest_multiple_files(dynamodb_latest_table, dyna
74117

75118
res = dict(dynamodb_latest_table.get_item(time_index_key='15219:zlcdzvawsp',
76119
range_key='lawvuunyws:447a4a801cabc6089f04922abdfa8aad099824e9'))
77-
assert res['metadata']['start'] == Decimal('1314877177403')
120+
assert res['metadata']['start'] == Decimal('1314877177413')
78121
assert len(records) == 2
79122
assert file2 == res
80123

81124

82125
def test_concurrent_updates(dynamodb_latest_table, dynamodb_connection):
83-
storage = DynamoDBStorage('latest', connection=dynamodb_connection)
126+
storage = DynamoDBStorage(latest_table='latest', connection=dynamodb_connection)
84127

85128
base_record = {
86129
'time_index_key': '15219:zlcdzvawsp',
87130
'range_key': 'lawvuunyws:447a4a801cabc6089f04922abdfa8aad099824e9',
88131
'metadata': {
89-
'start': 1314877177402
132+
'version': 1,
133+
'start': 1314877177402,
134+
'end': 1314877177412, # ends ten milliseconds later
135+
'path': '/var/log/syslog.2',
136+
'work_id': 'abc-123',
137+
'where': 'ground_server2',
138+
'what': 'syslog',
139+
'id': '34fb2d1ec54245c7a57e29ed5a6ea9b2',
140+
'hash': 'b4f2d8de24af342643d5b78a8f2b9b88'
90141
},
91142
'url': 's3://existingfile/url',
92143
'create_time': 1314877177402
93144
}
94-
storage.store(base_record)
145+
storage.store_latest(base_record)
95146

96147

97148
updated_record1 = base_record.copy()

0 commit comments

Comments
 (0)