Skip to content

Commit 483febf

Browse files
committed
ArchiveQuerier utilizing use_latest flag.
1 parent 5368ebf commit 483febf

File tree

3 files changed

+61
-56
lines changed

3 files changed

+61
-56
lines changed

api/datalake_api/querier.py

Lines changed: 30 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -173,9 +173,10 @@ def _unpack(self, result):
173173

174174
class ArchiveQuerier(object):
175175

176-
def __init__(self, table_name, dynamodb=None):
176+
def __init__(self, table_name, dynamodb=None, use_latest=None):
    """Store the querier's configuration on the instance.

    Args:
        table_name: name passed to ``dynamodb.Table`` to reach the
            archive table.
        dynamodb: resource object used to look tables up; defaults to
            ``None``.
        use_latest: flag consulted by ``query_latest`` to decide whether
            the latest table is queried first; defaults to ``None``.
    """
    self.use_latest = use_latest
    self.dynamodb = dynamodb
    self.table_name = table_name
179180

180181
def query_by_work_id(self, work_id, what, where=None, cursor=None):
181182
kwargs = self._prepare_work_id_kwargs(work_id, what)
@@ -331,18 +332,25 @@ def _cursor_for_time_query(self, response, results, current_bucket):
331332
@memoized_property
def _table(self):
    """Return the table handle for ``self.table_name``."""
    archive_table = self.dynamodb.Table(self.table_name)
    return archive_table
335+
336+
@memoized_property
def _latest_table(self):
    """Return the table handle used for latest-record lookups."""
    # NOTE(review): the table name is hard-coded and looks like a test
    # fixture name; presumably it should be configurable like
    # ``table_name`` — confirm before relying on this outside tests.
    latest_name = 'test_latest'
    return self.dynamodb.Table(latest_name)
334339

335-
def query_latest(self, what, where, lookback_days=DEFAULT_LOOKBACK_DAYS):
336-
current = int(time.time() * 1000)
337-
end = current - lookback_days * _ONE_DAY_MS
338-
while current >= end:
339-
bucket = current/DatalakeRecord.TIME_BUCKET_SIZE_IN_MS
340-
r = self._get_latest_record_in_bucket(bucket, what, where)
341-
if r is not None:
342-
return r
343-
current -= _ONE_DAY_MS
340+
def query_latest(self, what, where, lookback_days=DEFAULT_LOOKBACK_DAYS, use_latest=False):
    """Return the latest record for a ``what``/``where`` pair.

    When the latest-table path is enabled, the latest table is queried for
    the ``what:where`` key and the first returned item is used. If that
    query returns no items, or the path is not enabled, the lookup falls
    back to ``_default_latest``.

    Args:
        what: the ``what`` component of the lookup key.
        where: the ``where`` component of the lookup key.
        lookback_days: forwarded to ``_default_latest`` when the fallback
            is used.
        use_latest: per-call opt-in to the latest-table path. The path is
            taken when either this argument or ``self.use_latest`` is
            truthy.

    Returns:
        A dict with ``url`` and ``metadata`` keys when the latest table
        has a matching item, otherwise the result of ``_default_latest``.
    """
    # Honour the per-call flag as well as the instance flag so a caller
    # passing ``use_latest=True`` is not silently ignored.
    if not (self.use_latest or use_latest):
        return self._default_latest(what, where, lookback_days)

    response = self._latest_table.query(
        KeyConditionExpression=Key('what_where_key').eq(f'{what}:{where}')
    )
    items = response.get('Items', [])
    if not items:
        # Nothing in the latest table for this key: use the bucket scan.
        return self._default_latest(what, where, lookback_days)

    # NOTE(review): assumes the first item returned is the newest one for
    # this key — confirm against the latest table's key schema.
    latest_item = items[0]
    return dict(url=latest_item['url'], metadata=latest_item['metadata'])
346354

347355
def _get_latest_record_in_bucket(self, bucket, what, where):
348356
kwargs = self._prepare_time_bucket_kwargs(bucket, what)
@@ -367,12 +375,14 @@ def _get_all_records_in_bucket(self, bucket, **kwargs):
367375
kwargs['ExclusiveStartKey'] = response['LastEvaluatedKey']
368376
return records
369377

370-
def query_latest_table(self, what, where):
371-
response = self._table.query(
372-
KeyConditionExpression=Key('what_where_key').eq(f'{what}:{where}')
373-
)
374-
items = response.get('Items', [])
375-
if not items:
376-
return None
377-
latest_item = items[0]
378-
return dict(url=latest_item['url'], metadata=latest_item['metadata'])
378+
def _default_latest(self, what, where, lookback_days=DEFAULT_LOOKBACK_DAYS):
    """Find the latest record by scanning time buckets backwards.

    Starting from the current time, look up the bucket containing that
    instant and ask ``_get_latest_record_in_bucket`` for a record. Step
    back one day at a time until a record is found or the lookback
    window is exhausted.

    Args:
        what: forwarded to ``_get_latest_record_in_bucket``.
        where: forwarded to ``_get_latest_record_in_bucket``.
        lookback_days: number of days before now at which the scan stops.

    Returns:
        The first non-``None`` result of ``_get_latest_record_in_bucket``,
        or ``None`` if no bucket in the window yields one.
    """
    current = int(time.time() * 1000)  # now, in milliseconds
    end = current - lookback_days * _ONE_DAY_MS
    while current >= end:
        # Floor division keeps the bucket number an integer under
        # Python 3, where ``/`` would produce a float.
        bucket = current // DatalakeRecord.TIME_BUCKET_SIZE_IN_MS
        r = self._get_latest_record_in_bucket(bucket, what, where)
        if r is not None:
            return r
        current -= _ONE_DAY_MS

    return None

api/tests/conftest.py

Lines changed: 15 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -177,7 +177,6 @@ def _create_table(dynamodb,
177177

178178

179179
def _populate_table(table, records):
180-
print(f'attempting to populate {table}')
181180
with table.batch_writer() as batch:
182181
for r in records:
183182
batch.put_item(Item=r)
@@ -187,46 +186,44 @@ def _populate_table(table, records):
187186
@pytest.fixture
188187
def table_maker(request, dynamodb):
189188

190-
def maker(records, include_latest_key=False):
191-
old_table_name = 'test'
189+
def maker(records, use_latest=False):
190+
table_name = 'test'
192191
latest_table_name = 'test_latest'
193192
latest_table = None
194193

195-
old_table = _create_table(dynamodb, old_table_name, attribute_definitions, key_schema, global_secondary)
196-
_populate_table(old_table, records)
194+
table = _create_table(dynamodb, table_name, attribute_definitions, key_schema, global_secondary)
197195

198-
if include_latest_key:
199-
latest_table = _create_table(dynamodb, latest_table_name, latest_attribute_definitions, latest_key_schema)
200-
_populate_table(latest_table, records)
196+
latest_table = _create_table(dynamodb, latest_table_name, latest_attribute_definitions, latest_key_schema)
197+
_populate_table(latest_table, records)
198+
199+
_populate_table(table, records)
201200

202201
def tear_down():
203-
_delete_table(old_table)
204-
if include_latest_key:
205-
_delete_table(latest_table)
202+
_delete_table(table)
203+
_delete_table(latest_table)
206204

207205
request.addfinalizer(tear_down)
208206

209-
return old_table, latest_table
207+
return (table, latest_table)
210208

211209
return maker
212210

213211

214212
@pytest.fixture
215213
def record_maker(s3_file_from_metadata):
216214

217-
def maker(include_latest_key=False, **kwargs):
215+
def maker(**kwargs):
218216
m = generate_random_metadata()
219217
m.update(**kwargs)
220218
key = '/'.join([str(v) for v in kwargs.values()])
221219
url = 's3://datalake-test/' + key
222220
s3_file_from_metadata(url, m)
223221
records = DatalakeRecord.list_from_metadata(url, m)
224222

225-
if include_latest_key:
226-
what = kwargs.get('what')
227-
where = kwargs.get('where')
228-
for record in records:
229-
record['what_where_key'] = f"{what}:{where}"
223+
what = kwargs.get('what')
224+
where = kwargs.get('where')
225+
for record in records:
226+
record['what_where_key'] = f"{what}:{where}"
230227

231228
return records
232229

api/tests/test_archive_querier.py

Lines changed: 16 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -125,11 +125,8 @@ def query_latest(self, what, where):
125125

126126
@pytest.fixture(params=[ArchiveQuerier, HttpQuerier],
127127
ids=['archive_querier', 'http'])
128-
def querier(request):
129-
def create_querier(dynamodb, table_name):
130-
return request.param(table_name, dynamodb=dynamodb)
131-
return create_querier
132-
128+
def querier(request, dynamodb):
129+
return request.param('test', dynamodb=dynamodb, use_latest=True)
133130

134131
def in_url(result, part):
135132
url = result['url']
@@ -409,6 +406,10 @@ def test_no_end(table_maker, querier, s3_file_from_metadata):
409406
url = 's3://datalake-test/' + m['id']
410407
s3_file_from_metadata(url, m)
411408
records = DatalakeRecord.list_from_metadata(url, m)
409+
for record in records:
410+
what = record.get('what')
411+
where = record.get('where')
412+
record['what_where_key'] = f'{what}:{where}'
412413
table_maker(records)
413414
results = querier.query_by_time(m['start'], m['start'] + 1, m['what'])
414415
assert len(results) == 1
@@ -421,7 +422,12 @@ def test_no_end_exclusion(table_maker, querier, s3_file_from_metadata):
421422
url = 's3://datalake-test/' + m['id']
422423
s3_file_from_metadata(url, m)
423424
records = DatalakeRecord.list_from_metadata(url, m)
425+
for record in records:
426+
what = record.get('what')
427+
where = record.get('where')
428+
record['what_where_key'] = f'{what}:{where}'
424429
table_maker(records)
430+
425431
results = querier.query_by_time(m['start'] + 1, m['start'] + 2, m['what'])
426432
assert len(results) == 0
427433

@@ -480,8 +486,7 @@ def test_latest_creation_time_breaks_tie(table_maker, querier,
480486
start = bucket * DatalakeRecord.TIME_BUCKET_SIZE_IN_MS
481487
interval = DatalakeRecord.TIME_BUCKET_SIZE_IN_MS/150
482488
end = start + interval
483-
table = table_maker([])
484-
489+
table = table_maker([])[0]
485490
for i in range(3):
486491
record = record_maker(start=start,
487492
end=end,
@@ -531,17 +536,10 @@ def test_2x_max_results_in_one_bucket(table_maker, querier, record_maker):
531536
results = consolidate_pages(pages)
532537
assert len(results) == MAX_RESULTS * 2
533538

534-
"""
535-
Will have to go through all of the tests associated with
536-
latest and correctly query from
537-
the latest table that was created.
538-
"""
539539

540540
def test_latest_table_query(table_maker, querier, record_maker):
541-
now = int(time.time() * 1000)
542-
records = record_maker(include_latest_key=True, what='foo', where='boo')
543-
_, latest_table = table_maker(records)
544-
545-
querier_instance = querier(dynamodb=latest_table.dynamodb, table_name=latest_table.table_name)
546-
result = querier_instance.query_latest_table('foo', 'boo')
541+
records = record_maker(what='foo', where='boo')
542+
table_maker(records)
543+
querier.use_latest = True
544+
result = querier.query_latest('foo', 'boo')
547545
_validate_latest_result(result, what='foo', where='boo')

0 commit comments

Comments
 (0)