Skip to content

Commit 697e9ad

Browse files
authored
Merge pull request #975 from hubmapconsortium/karlburke/LargeCollectionTimeouts
Karlburke/large Collection timeouts
2 parents d2462c4 + ab8b63a commit 697e9ad

File tree

3 files changed

+102
-45
lines changed

3 files changed

+102
-45
lines changed

src/schema/schema_constants.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,3 +54,13 @@ class TriggerTypeEnum(Enum):
5454
BEFORE_UPDATE = 'before_update_trigger'
5555
AFTER_CREATE = 'after_create_trigger'
5656
AFTER_UPDATE = 'after_update_trigger'
57+
58+
# Define an enumeration of accepted Neo4j relationship types.
59+
class Neo4jRelationshipEnum(Enum):
60+
ACTIVITY_INPUT = 'ACTIVITY_INPUT'
61+
ACTIVITY_OUTPUT = 'ACTIVITY_INPUT'
62+
IN_COLLECTION = 'IN_COLLECTION'
63+
IN_UPLOAD = 'IN_UPLOAD'
64+
REVISION_OF = 'REVISION_OF'
65+
USES_DATA = 'USES_DATA'
66+

src/schema/schema_neo4j_queries.py

Lines changed: 76 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,14 @@
1+
import neo4j
12
from neo4j.exceptions import TransactionError
2-
from schema.schema_constants import SchemaConstants
3+
from neo4j import Session as Neo4jSession
4+
from schema.schema_constants import SchemaConstants, Neo4jRelationshipEnum
35
import logging
46

57
logger = logging.getLogger(__name__)
68

79
# The filed name of the single result record
810
record_field_name = 'result'
911

10-
1112
####################################################################################################
1213
## Functions can be called by app.py, schema_manager.py, and schema_triggers.py
1314
####################################################################################################
@@ -109,7 +110,38 @@ def get_entity(neo4j_driver, uuid):
109110

110111
return result
111112

113+
"""
114+
Given a list of UUIDs, return a dict mapping uuid -> entity_node
115+
Only UUIDs present in Neo4j will be returned.
112116
117+
Parameters
118+
----------
119+
neo4j_driver : neo4j.Driver object
120+
The neo4j database connection pool
121+
uuid_list : list of str
122+
The uuids of target entities to retrieve from Neo4j
123+
124+
Returns
125+
-------
126+
dict
127+
A dictionary of entity details returned from the Cypher query, keyed by
128+
the uuid provided in uuid_list.
129+
"""
130+
def identify_existing_dataset_entities(neo4j_driver, dataset_uuid_list:list):
131+
132+
if not dataset_uuid_list:
133+
return {}
134+
135+
query = """
136+
MATCH (e:Entity)
137+
WHERE e.uuid IN $param_uuids
138+
AND e.entity_type='Dataset'
139+
RETURN e.uuid AS uuid
140+
"""
141+
142+
with neo4j_driver.session() as session:
143+
results = session.run(query, param_uuids=dataset_uuid_list)
144+
return [record["uuid"] for record in results]
113145

114146
"""
115147
Get the uuids for each entity in a list that doesn't belong to a certain entity type. Uuids are ordered by type
@@ -889,13 +921,11 @@ def link_collection_to_datasets(neo4j_driver, collection_uuid, dataset_uuid_list
889921
_delete_collection_linkages_tx(tx=tx
890922
, uuid=collection_uuid)
891923

892-
# Create relationship from each member Dataset node to this Collection node
893-
for dataset_uuid in dataset_uuid_list:
894-
create_relationship_tx(tx=tx
895-
, source_node_uuid=dataset_uuid
896-
, direction='->'
897-
, target_node_uuid=collection_uuid
898-
, relationship='IN_COLLECTION')
924+
_create_relationships_unwind_tx(tx=tx
925+
, source_uuid_list=dataset_uuid_list
926+
, target_uuid=collection_uuid
927+
, relationship=Neo4jRelationshipEnum.IN_COLLECTION
928+
, direction='->')
899929

900930
tx.commit()
901931
except TransactionError as te:
@@ -1980,6 +2010,43 @@ def create_relationship_tx(tx, source_node_uuid, target_node_uuid, relationship,
19802010

19812011
result = tx.run(query)
19822012

2013+
"""
2014+
Create multiple relationships between a target node and each node in
2015+
a list of source nodes in neo4j
2016+
2017+
Parameters
2018+
----------
2019+
tx : neo4j.Session object
2020+
The neo4j.Session object instance
2021+
source_uuid_list : list[str]
2022+
A list of UUIDs for nodes which will have a relationship to the node with target_uuid
2023+
target_uuid : str
2024+
The UUID of target node
2025+
relationship : Neo4jRelationshipEnum
2026+
The string for the Neo4j relationship type between each source node and the target node.
2027+
direction: str
2028+
The relationship direction of each source node to the target node: outgoing `->` or incoming `<-`
2029+
Neo4j CQL CREATE command supports only directional relationships
2030+
"""
2031+
def _create_relationships_unwind_tx(tx:Neo4jSession, source_uuid_list:list, target_uuid:str
2032+
, relationship:Neo4jRelationshipEnum, direction:str)->None:
2033+
logger.info("====== enter _create_relationships_unwind_tx() ======")
2034+
incoming = direction if direction == "<-" else "-"
2035+
outgoing = direction if direction == "->" else "-"
2036+
2037+
query = (
2038+
f"MATCH (t {{uuid: $target_uuid}}) "
2039+
f"UNWIND $source_uuid_list AS src_uuid "
2040+
f"MATCH (s {{uuid: src_uuid}}) "
2041+
f"CREATE (s){incoming}[r:{relationship.value}]{outgoing}(t) "
2042+
f"RETURN src_uuid AS linked_uuid"
2043+
)
2044+
2045+
result = tx.run( query=query
2046+
, target_uuid=target_uuid
2047+
, source_uuid_list=source_uuid_list)
2048+
logger.info("====== returning from _create_relationships_unwind_tx() ======")
2049+
19832050
"""
19842051
Execute one query to create all outgoing relationships from each node whose
19852052
identifier is in the source node list to the target Activity node in neo4j

src/schema/schema_validators.py

Lines changed: 16 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -298,7 +298,8 @@ def verify_DOI_pair(property_key, normalized_entity_type, request, existing_data
298298
f" the prefix {SchemaConstants.DOI_BASE_URL}.")
299299

300300
"""
301-
Validate every entity in a list is of entity_type accepted
301+
Validate every entity in a list is of entity_type that can be in a
302+
Collection and already exists in Neo4j.
302303
303304
Parameters
304305
----------
@@ -318,41 +319,20 @@ def collection_entities_are_existing_datasets(property_key, normalized_entity_ty
318319
# Verify each UUID specified exists in the uuid-api, exists in Neo4j, and is for a Dataset before
319320
# proceeding with creation of Collection.
320321
bad_dataset_uuids = []
321-
for dataset_uuid in new_data_dict['dataset_uuids']:
322-
try:
323-
## The following code duplicates some functionality existing in app.py, in
324-
## query_target_entity(), which also deals with caching. In the future, the
325-
## validation logic shared by this file and app.py should become a utility
326-
## module, shared by validators as well as app.py. But for now, the code
327-
## is repeated for the following.
328-
329-
# Get cached ids if exist otherwise retrieve from UUID-API. Expect an
330-
# Exception to be raised if not found.
331-
dataset_uuid_entity = schema_manager.get_hubmap_ids(id=dataset_uuid)
332-
333-
# If the uuid exists per the uuid-api, make sure it also exists as a Neo4j entity.
334-
uuid = dataset_uuid_entity['uuid']
335-
entity_dict = schema_neo4j_queries.get_entity(schema_manager.get_neo4j_driver_instance(), dataset_uuid)
336-
337-
# If dataset_uuid is not found in Neo4j or is not for a Dataset, fail the validation.
338-
if not entity_dict:
339-
logger.info(f"Request for {dataset_uuid} inclusion in Collection,"
340-
f" but not found in Neo4j.")
341-
bad_dataset_uuids.append(dataset_uuid)
342-
elif entity_dict['entity_type'] != 'Dataset':
343-
logger.info(f"Request for {dataset_uuid} inclusion in Collection,"
344-
f" but entity_type={entity_dict['entity_type']}, not Dataset.")
345-
bad_dataset_uuids.append(dataset_uuid)
346-
except Exception as nfe:
347-
# If the dataset_uuid is not found, fail the validation.
348-
logger.info(f"Request for {dataset_uuid} inclusion in Collection"
349-
f" failed uuid-api retrieval.")
350-
bad_dataset_uuids.append(dataset_uuid)
351-
# If any uuids in the request dataset_uuids are not for an existing Dataset entity which
352-
# exists in uuid-api and Neo4j, raise an Exception so the validation fails and the
353-
# operation can be rejected.
354-
if bad_dataset_uuids:
355-
raise ValueError(f"Unable to find Datasets for {bad_dataset_uuids}.")
322+
dataset_uuid_list = new_data_dict['dataset_uuids']
323+
if not dataset_uuid_list:
324+
return
325+
326+
existing_datasets_list = schema_neo4j_queries.identify_existing_dataset_entities( neo4j_driver=schema_manager.get_neo4j_driver_instance()
327+
, dataset_uuid_list=dataset_uuid_list)
328+
329+
# If any UUIDs which were passed in do not exist in Neo4j or are not Datasets, identify them
330+
missing_uuid_set = set(dataset_uuid_list) - set(existing_datasets_list)
331+
if missing_uuid_set:
332+
logger.info(f"Only existing Datasets may be included in a Collection:"
333+
f" {sorted(missing_uuid_set)}")
334+
raise ValueError( f"Only existing Datasets may be included in a Collection, not these: "
335+
f" {sorted(missing_uuid_set)}")
356336

357337
"""
358338
Validate the provided value of Dataset.status on update via PUT

0 commit comments

Comments
 (0)