Skip to content

Commit 6932fb0

Browse files
authored
Merge pull request #89 from planetlabs/proto-query-latest-table
Prototype querying latest table.
2 parents d531092 + fb7b8ec commit 6932fb0

File tree

7 files changed

+192
-25
lines changed

7 files changed

+192
-25
lines changed

api/datalake_api/app.py

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -23,6 +23,7 @@
2323
from datalake_api import settings
2424

2525

26+
logging.basicConfig(level=logging.INFO)
2627
LOGGER = logging.getLogger(__name__)
2728

2829

api/datalake_api/querier.py

Lines changed: 44 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -14,9 +14,14 @@
1414

1515
from memoized_property import memoized_property
1616
from datalake.common import DatalakeRecord
17+
from boto3.dynamodb.conditions import Key
1718
import base64
1819
import json
1920
import time
21+
import os
22+
23+
import logging
24+
log = logging.getLogger(__name__)
2025

2126

2227
'''the maximum number of results to return to the user
@@ -172,9 +177,15 @@ def _unpack(self, result):
172177

173178
class ArchiveQuerier(object):
174179

175-
def __init__(self, table_name, dynamodb=None):
180+
def __init__(self, table_name,
181+
latest_table_name=None,
182+
use_latest_table=None,
183+
dynamodb=None):
176184
self.table_name = table_name
185+
self.latest_table_name = latest_table_name
186+
self.use_latest_table = use_latest_table
177187
self.dynamodb = dynamodb
188+
178189

179190
def query_by_work_id(self, work_id, what, where=None, cursor=None):
180191
kwargs = self._prepare_work_id_kwargs(work_id, what)
@@ -330,18 +341,28 @@ def _cursor_for_time_query(self, response, results, current_bucket):
330341
@memoized_property
331342
def _table(self):
332343
return self.dynamodb.Table(self.table_name)
344+
345+
@memoized_property
346+
def _latest_table(self):
347+
return self.dynamodb.Table(self.latest_table_name)
333348

334349
def query_latest(self, what, where, lookback_days=DEFAULT_LOOKBACK_DAYS):
335-
current = int(time.time() * 1000)
336-
end = current - lookback_days * _ONE_DAY_MS
337-
while current >= end:
338-
bucket = current/DatalakeRecord.TIME_BUCKET_SIZE_IN_MS
339-
r = self._get_latest_record_in_bucket(bucket, what, where)
340-
if r is not None:
341-
return r
342-
current -= _ONE_DAY_MS
350+
if self.use_latest_table:
351+
log.info('inside use_latest_table=TRUE')
352+
response = self._latest_table.query(
353+
KeyConditionExpression=Key('what_where_key').eq(f'{what}:{where}')
354+
)
355+
items = response.get('Items', [])
356+
357+
if not items:
358+
log.info('Falling back to default latest query')
359+
return self._default_latest(what, where, lookback_days)
343360

344-
return None
361+
latest_item = items[0]
362+
return dict(url=latest_item['url'], metadata=latest_item['metadata'])
363+
364+
else:
365+
return self._default_latest(what, where, lookback_days)
345366

346367
def _get_latest_record_in_bucket(self, bucket, what, where):
347368
kwargs = self._prepare_time_bucket_kwargs(bucket, what)
@@ -365,3 +386,16 @@ def _get_all_records_in_bucket(self, bucket, **kwargs):
365386
break
366387
kwargs['ExclusiveStartKey'] = response['LastEvaluatedKey']
367388
return records
389+
390+
def _default_latest(self, what, where, lookback_days=DEFAULT_LOOKBACK_DAYS):
391+
log.info("Using default latest behavior")
392+
current = int(time.time() * 1000)
393+
end = current - lookback_days * _ONE_DAY_MS
394+
while current >= end:
395+
bucket = current/DatalakeRecord.TIME_BUCKET_SIZE_IN_MS
396+
r = self._get_latest_record_in_bucket(bucket, what, where)
397+
if r is not None:
398+
return r
399+
current -= _ONE_DAY_MS
400+
401+
return None

api/datalake_api/settings.py

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -11,9 +11,14 @@
1111
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
1212
# License for the specific language governing permissions and limitations under
1313
# the License.
14+
import os
1415

1516
# default settings
17+
1618
DYNAMODB_TABLE = 'test'
19+
DYNAMODB_LATEST_TABLE = 'test_latest'
20+
DATALAKE_USE_LATEST_TABLE = False
21+
1722
AWS_REGION = 'us-west-2'
1823
AWS_ACCESS_KEY_ID = None
1924
AWS_SECRET_ACCESS_KEY = None

api/datalake_api/v0.py

Lines changed: 21 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -27,6 +27,7 @@
2727

2828
v0 = flask.Blueprint('v0', __name__, url_prefix='/v0')
2929

30+
_archive_querier = None
3031

3132
def _get_aws_kwargs():
3233
kwargs = dict(
@@ -48,11 +49,28 @@ def get_dynamodb():
4849

4950

5051
def get_archive_querier():
51-
if not hasattr(app, 'archive_querier'):
52+
"""
53+
We use a module-level global here, together with reset_archive_querier(),
54+
so that the test fixtures can differentiate between the
55+
ArchiveQuerier and HttpQuerier fixtures.
56+
"""
57+
global _archive_querier
58+
59+
if not _archive_querier:
5260
table_name = app.config.get('DYNAMODB_TABLE')
53-
app.archive_querier = ArchiveQuerier(table_name,
61+
latest_table_name = app.config.get('DYNAMODB_LATEST_TABLE')
62+
use_latest_table = app.config.get('DATALAKE_USE_LATEST_TABLE')
63+
_archive_querier = ArchiveQuerier(table_name,
64+
latest_table_name,
65+
use_latest_table,
5466
dynamodb=get_dynamodb())
55-
return app.archive_querier
67+
return _archive_querier
68+
69+
70+
def reset_archive_querier():
71+
"""FOR TESTING PURPOSES ONLY"""
72+
global _archive_querier
73+
_archive_querier = None
5674

5775

5876
@v0.route('/archive/')

api/tests/conftest.py

Lines changed: 41 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -42,6 +42,9 @@
4242

4343

4444
def get_client():
45+
from datalake_api import settings
46+
datalake_api.app.config.from_object(settings)
47+
4548
datalake_api.app.config['TESTING'] = True
4649
datalake_api.app.config['AWS_ACCESS_KEY_ID'] = 'abc'
4750
datalake_api.app.config['AWS_SECRET_ACCESS_KEY'] = '123'
@@ -107,6 +110,20 @@ def tear_down():
107110
}
108111
]
109112

113+
latest_attribute_definitions = [
114+
{
115+
'AttributeName': 'what_where_key',
116+
'AttributeType': 'S'
117+
}
118+
]
119+
120+
latest_key_schema = [
121+
{
122+
'AttributeName': 'what_where_key',
123+
'KeyType': 'HASH'
124+
}
125+
]
126+
110127
global_secondary = [{
111128
'IndexName': 'work-id-index',
112129
'KeySchema': [
@@ -140,19 +157,24 @@ def _delete_table(table):
140157
raise e
141158

142159

143-
def _create_table(dynamodb, table_name):
160+
def _create_table(dynamodb,
161+
table_name,
162+
attribute_definitions,
163+
key_schema,
164+
global_secondary=None):
144165
table = dynamodb.Table(table_name)
145166
_delete_table(table)
146167
kwargs = dict(
147168
TableName=table_name,
148169
AttributeDefinitions=attribute_definitions,
149170
KeySchema=key_schema,
150-
GlobalSecondaryIndexes=global_secondary,
151171
ProvisionedThroughput={
152172
'ReadCapacityUnits': 5,
153173
'WriteCapacityUnits': 5
154174
}
155175
)
176+
if global_secondary:
177+
kwargs['GlobalSecondaryIndexes'] = global_secondary
156178
dynamodb.create_table(**kwargs)
157179
return dynamodb.Table(table_name)
158180

@@ -168,14 +190,20 @@ def table_maker(request, dynamodb):
168190

169191
def maker(records):
170192
table_name = 'test'
171-
table = _create_table(dynamodb, table_name)
193+
latest_table_name = 'test_latest'
194+
195+
table = _create_table(dynamodb, table_name, attribute_definitions, key_schema, global_secondary)
196+
latest_table = _create_table(dynamodb, latest_table_name, latest_attribute_definitions, latest_key_schema)
197+
198+
_populate_table(latest_table, records)
172199
_populate_table(table, records)
173200

174201
def tear_down():
175202
_delete_table(table)
176-
request.addfinalizer(tear_down)
203+
_delete_table(latest_table)
177204

178-
return table
205+
request.addfinalizer(tear_down)
206+
return (table, latest_table)
179207

180208
return maker
181209

@@ -189,6 +217,13 @@ def maker(**kwargs):
189217
key = '/'.join([str(v) for v in kwargs.values()])
190218
url = 's3://datalake-test/' + key
191219
s3_file_from_metadata(url, m)
192-
return DatalakeRecord.list_from_metadata(url, m)
220+
records = DatalakeRecord.list_from_metadata(url, m)
221+
222+
what = kwargs.get('what')
223+
where = kwargs.get('where')
224+
for record in records:
225+
record['what_where_key'] = f"{what}:{where}"
226+
227+
return records
193228

194229
return maker

api/tests/test_archive_querier.py

Lines changed: 76 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -11,8 +11,10 @@
1111
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
1212
# License for the specific language governing permissions and limitations under
1313
# the License.
14-
14+
import os
1515
import pytest
16+
from datalake_api.v0 import reset_archive_querier
17+
from datalake_api import settings
1618
from datalake.common import DatalakeRecord
1719
from datalake.tests import generate_random_metadata
1820
import simplejson as json
@@ -123,11 +125,36 @@ def query_latest(self, what, where):
123125
return HttpRecord(**record)
124126

125127

126-
@pytest.fixture(params=[ArchiveQuerier, HttpQuerier],
127-
ids=['archive_querier', 'http'])
128+
129+
@pytest.fixture(params=[
130+
('archive', 'use_latest'),
131+
('archive', 'use_default'),
132+
('http', 'use_latest'),
133+
('http', 'use_default')
134+
], ids=['archive-latest',
135+
'archive-default',
136+
'http-latest',
137+
'http-default'
138+
])
128139
def querier(request, dynamodb):
129-
return request.param('test', dynamodb=dynamodb)
130140

141+
reset_archive_querier()
142+
querier_type, table_usage = request.param
143+
144+
if table_usage == 'use_latest':
145+
settings.DATALAKE_USE_LATEST_TABLE = True
146+
else:
147+
settings.DATALAKE_USE_LATEST_TABLE= False
148+
149+
if querier_type == 'http':
150+
return HttpQuerier('test',
151+
'test_latest',
152+
dynamodb=dynamodb)
153+
else:
154+
return ArchiveQuerier('test',
155+
'test_latest',
156+
use_latest_table=True if table_usage == 'use_latest' else False,
157+
dynamodb=dynamodb)
131158

132159
def in_url(result, part):
133160
url = result['url']
@@ -407,6 +434,10 @@ def test_no_end(table_maker, querier, s3_file_from_metadata):
407434
url = 's3://datalake-test/' + m['id']
408435
s3_file_from_metadata(url, m)
409436
records = DatalakeRecord.list_from_metadata(url, m)
437+
for record in records:
438+
what = record.get('what')
439+
where = record.get('where')
440+
record['what_where_key'] = f'{what}:{where}'
410441
table_maker(records)
411442
results = querier.query_by_time(m['start'], m['start'] + 1, m['what'])
412443
assert len(results) == 1
@@ -419,7 +450,12 @@ def test_no_end_exclusion(table_maker, querier, s3_file_from_metadata):
419450
url = 's3://datalake-test/' + m['id']
420451
s3_file_from_metadata(url, m)
421452
records = DatalakeRecord.list_from_metadata(url, m)
453+
for record in records:
454+
what = record.get('what')
455+
where = record.get('where')
456+
record['what_where_key'] = f'{what}:{where}'
422457
table_maker(records)
458+
423459
results = querier.query_by_time(m['start'] + 1, m['start'] + 2, m['what'])
424460
assert len(results) == 0
425461

@@ -478,8 +514,7 @@ def test_latest_creation_time_breaks_tie(table_maker, querier,
478514
start = bucket * DatalakeRecord.TIME_BUCKET_SIZE_IN_MS
479515
interval = DatalakeRecord.TIME_BUCKET_SIZE_IN_MS/150
480516
end = start + interval
481-
table = table_maker([])
482-
517+
table = table_maker([])[0]
483518
for i in range(3):
484519
record = record_maker(start=start,
485520
end=end,
@@ -528,3 +563,38 @@ def test_2x_max_results_in_one_bucket(table_maker, querier, record_maker):
528563
pages = get_all_pages(querier.query_by_time, [start, end, 'boo'])
529564
results = consolidate_pages(pages)
530565
assert len(results) == MAX_RESULTS * 2
566+
567+
568+
def test_latest_table_query(table_maker, querier, record_maker):
569+
now = int(time.time() * 1000)
570+
records = []
571+
bucket = int(now/DatalakeRecord.TIME_BUCKET_SIZE_IN_MS)
572+
start = bucket * DatalakeRecord.TIME_BUCKET_SIZE_IN_MS
573+
end = start
574+
for i in range(MAX_RESULTS):
575+
records += record_maker(start=start,
576+
end=end,
577+
what='boo',
578+
where='hoo{}'.format(i))
579+
table_maker(records)
580+
result = querier.query_latest('boo', 'hoo0')
581+
_validate_latest_result(result, what='boo', where='hoo0')
582+
583+
584+
def test_query_latest_just_latest_table(table_maker, querier, record_maker):
585+
use_latest_from_env = settings.DATALAKE_USE_LATEST_TABLE
586+
table = table_maker([])[1]
587+
for i in range(3):
588+
record = record_maker(what='meow',
589+
where=f'tree',
590+
path='/{}'.format(i))
591+
592+
# only inserting into latest table
593+
table.put_item(Item=record[0])
594+
time.sleep(1.01)
595+
596+
result = querier.query_latest('meow', 'tree')
597+
if use_latest_from_env:
598+
_validate_latest_result(result, what='meow', where='tree')
599+
else:
600+
assert result is None

api/tests/test_file.py

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -190,6 +190,10 @@ def maker(content, metadata):
190190
s3_file_maker('datalake-test', path, content, metadata)
191191
url = 's3://datalake-test/' + path
192192
records = DatalakeRecord.list_from_metadata(url, metadata)
193+
for record in records:
194+
what = record.get('what')
195+
where = record.get('where')
196+
record['what_where_key'] = f"{what}:{where}"
193197
table_maker(records)
194198

195199
return maker

0 commit comments

Comments
 (0)