Skip to content

Commit 5368ebf

Browse files
committed
WIP: adding query_latest_table method to ArchiveQuerier.
1 parent 8eb2e08 commit 5368ebf

File tree

3 files changed

+80
-13
lines changed

3 files changed

+80
-13
lines changed

api/datalake_api/querier.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414

1515
from memoized_property import memoized_property
1616
from datalake.common import DatalakeRecord
17+
from boto3.dynamodb.conditions import Key
1718
import base64
1819
import json
1920
import time
@@ -365,3 +366,13 @@ def _get_all_records_in_bucket(self, bucket, **kwargs):
365366
break
366367
kwargs['ExclusiveStartKey'] = response['LastEvaluatedKey']
367368
return records
369+
370+
def query_latest_table(self, what, where):
    """Look up the newest file record for (what, where) in the latest table.

    The latest table is keyed by a single composite hash key,
    ``what_where_key``, formatted as ``"<what>:<where>"``.

    Returns a dict with ``url`` and ``metadata`` keys, or ``None`` when
    no record exists for the given pair.
    """
    key_condition = Key('what_where_key').eq(f'{what}:{where}')
    response = self._table.query(KeyConditionExpression=key_condition)

    items = response.get('Items', [])
    if not items:
        return None

    # Hash-key-only schema: at most one item is expected per key, so the
    # first item returned is the latest record.
    first = items[0]
    return dict(url=first['url'], metadata=first['metadata'])

api/tests/conftest.py

Lines changed: 50 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,20 @@ def tear_down():
107107
}
108108
]
109109

110+
# Schema for the "latest" table: one string hash key combining a record's
# ``what`` and ``where`` as ``"<what>:<where>"``.
latest_attribute_definitions = [
    {
        'AttributeName': 'what_where_key',
        'AttributeType': 'S',
    },
]

latest_key_schema = [
    {
        'AttributeName': 'what_where_key',
        'KeyType': 'HASH',
    },
]
123+
110124
global_secondary = [{
111125
'IndexName': 'work-id-index',
112126
'KeySchema': [
@@ -140,55 +154,80 @@ def _delete_table(table):
140154
raise e
141155

142156

143-
def _create_table(dynamodb,
                  table_name,
                  attribute_definitions,
                  key_schema,
                  global_secondary=None):
    """(Re)create a DynamoDB table and return the fresh Table resource.

    Any pre-existing table of the same name is deleted first so every
    test starts from a clean slate.  ``global_secondary`` is optional so
    that simple tables (such as the hash-key-only latest table) can be
    created without any GSIs.
    """
    _delete_table(dynamodb.Table(table_name))

    create_kwargs = dict(
        TableName=table_name,
        AttributeDefinitions=attribute_definitions,
        KeySchema=key_schema,
        ProvisionedThroughput={
            'ReadCapacityUnits': 5,
            'WriteCapacityUnits': 5
        },
    )
    # Only include the GSI argument when one was supplied; DynamoDB
    # rejects an empty/None GlobalSecondaryIndexes value.
    if global_secondary:
        create_kwargs['GlobalSecondaryIndexes'] = global_secondary

    dynamodb.create_table(**create_kwargs)
    return dynamodb.Table(table_name)
158177

159178

160179
def _populate_table(table, records):
180+
print(f'attempting to populate {table}')
161181
with table.batch_writer() as batch:
162182
for r in records:
163183
batch.put_item(Item=r)
164184

165-
185+
# Adding latest table logic so latest table will be created and records will populate it
186+
# Once that's possible, we will simply query the latest_table for what:where, no bucket logic
166187
@pytest.fixture
def table_maker(request, dynamodb):
    """Factory fixture that creates the main test table and, optionally,
    the "latest" table, tearing both down when the test finishes.

    The maker returns a ``(main_table, latest_table)`` tuple;
    ``latest_table`` is ``None`` unless ``include_latest_key=True``.
    """
    def maker(records, include_latest_key=False):
        main_table = _create_table(dynamodb, 'test',
                                   attribute_definitions, key_schema,
                                   global_secondary)
        _populate_table(main_table, records)

        latest_table = None
        if include_latest_key:
            # NOTE(review): records written here must already carry the
            # 'what_where_key' attribute (see
            # record_maker(include_latest_key=True)) because it is the
            # latest table's hash key.
            latest_table = _create_table(dynamodb, 'test_latest',
                                         latest_attribute_definitions,
                                         latest_key_schema)
            _populate_table(latest_table, records)

        def tear_down():
            _delete_table(main_table)
            # Guard on the table object itself, not the flag, so teardown
            # stays correct even if the creation logic above changes.
            if latest_table is not None:
                _delete_table(latest_table)

        request.addfinalizer(tear_down)
        return main_table, latest_table

    return maker
181212

182213

183214
@pytest.fixture
def record_maker(s3_file_from_metadata):
    """Factory fixture that writes a random file to s3 and returns its
    DatalakeRecords.

    With ``include_latest_key=True`` each record also gets the
    ``what_where_key`` hash key (``"<what>:<where>"``) required by the
    latest table's schema; ``what`` and ``where`` must then be supplied.
    """
    def maker(include_latest_key=False, **kwargs):
        m = generate_random_metadata()
        m.update(**kwargs)
        key = '/'.join(str(v) for v in kwargs.values())
        url = 's3://datalake-test/' + key
        s3_file_from_metadata(url, m)
        records = DatalakeRecord.list_from_metadata(url, m)

        if include_latest_key:
            what = kwargs.get('what')
            where = kwargs.get('where')
            # Fail loudly rather than silently writing a "None:None" key
            # that no query would ever match.
            if what is None or where is None:
                raise ValueError(
                    "include_latest_key=True requires 'what' and 'where'")
            for record in records:
                record['what_where_key'] = f"{what}:{where}"

        return records

    return maker

api/tests/test_archive_querier.py

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -125,8 +125,10 @@ def query_latest(self, what, where):
125125

126126
@pytest.fixture(params=[ArchiveQuerier, HttpQuerier],
                ids=['archive_querier', 'http'])
def querier(request):
    """Yield a factory that builds the parametrized querier class against
    a caller-supplied dynamodb resource and table name."""
    def create_querier(dynamodb, table_name):
        return request.param(table_name, dynamodb=dynamodb)

    return create_querier
130132

131133

132134
def in_url(result, part):
@@ -528,3 +530,18 @@ def test_2x_max_results_in_one_bucket(table_maker, querier, record_maker):
528530
pages = get_all_pages(querier.query_by_time, [start, end, 'boo'])
529531
results = consolidate_pages(pages)
530532
assert len(results) == MAX_RESULTS * 2
533+
534+
"""
535+
Will have to go through all of the tests associated with
536+
latest and correctly query from
537+
the latest table that was created.
538+
"""
539+
540+
def test_latest_table_query(table_maker, querier, record_maker, dynamodb):
    """query_latest_table returns the single record stored for (what, where)."""
    # include_latest_key=True on BOTH calls: the records need the
    # 'what_where_key' attribute, and table_maker only creates/populates
    # the latest table when asked to (otherwise it returns (table, None)).
    records = record_maker(include_latest_key=True, what='foo', where='boo')
    _, latest_table = table_maker(records, include_latest_key=True)

    # Inject the dynamodb fixture directly: boto3 Table resources have no
    # `.dynamodb` attribute to recover the resource from.
    # NOTE(review): query_latest_table is only implemented on
    # ArchiveQuerier so far -- confirm the http param is expected to pass.
    querier_instance = querier(dynamodb=dynamodb,
                               table_name=latest_table.name)
    result = querier_instance.query_latest_table('foo', 'boo')

    # record_maker merged what/where into the metadata, so the returned
    # record must carry them back.
    assert result is not None
    assert result['metadata']['what'] == 'foo'
    assert result['metadata']['where'] == 'boo'

0 commit comments

Comments
 (0)