Skip to content

Commit 1a91efb

Browse files
committed
Add new composable chemistry queries. Deprecate existing queries.
Signed-off-by: DKL <[email protected]>
1 parent 5ddfdb7 commit 1a91efb

File tree

4 files changed

+184
-1
lines changed

4 files changed

+184
-1
lines changed

deepsearch/chemistry/models.py

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
from typing import Optional
2+
3+
from pydantic import BaseModel
4+
5+
6+
class ChemistryModel(BaseModel, extra="allow"):
    """Base model shared by the chemistry result records.

    The model is declared with ``extra="allow"``, so keys that are not
    declared as fields are accepted instead of being rejected.

    Attributes:
        id: Transient identifier for short term use.
        persistent_id: Identifier for long term (storage) use.
    """

    id: int
    persistent_id: str
12+
13+
14+
class ChemistryDocument(ChemistryModel):
    """Chemistry record describing a (patent) document.

    Attributes:
        application_id: Identifier under which a patent application has
            been filed. May be ``None``.
        publication_id: Identifier under which a patent has been
            published. May be ``None``.
        title: (Readable) title of the document.
    """

    application_id: Optional[str]
    publication_id: Optional[str]
    title: str
23+
24+
25+
class ChemistryCompound(ChemistryModel):
    """Chemistry record describing a single compound.

    Attributes:
        smiles: SMILES representation of compound structure.
        display_name: User friendly representation of compound.
        inchi: InChI representation of compound structure.
        inchikey: Hashed form of InChI.
        sum_formula: Sum formula of compound. For example 'C6 O2 H5'.
    """

    smiles: str
    display_name: str
    inchi: str
    inchikey: str
    sum_formula: str
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
from .queries import *

deepsearch/chemistry/queries/molecules.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,9 @@ def MoleculeQuery(
4343
num_items: int = 10,
4444
) -> Query:
4545
"""
46-
Use the vector database in Deep Search for querying molecules
46+
DEPRECATED: Migrate to compounds queries.
47+
48+
Use the knowledge database in Deep Search for querying molecules
4749
by substructure or similarity.
4850
The result is contained in the `molecules` output of the response.
4951
"""
@@ -97,6 +99,8 @@ def MoleculesInPatentsQuery(
9799
partial_lookup: bool = False,
98100
) -> Query:
99101
"""
102+
DEPRECATED: Migrate to compounds queries.
103+
100104
List all molecules contained in a list of patents.
101105
The result is contained in the `molecules` output of the response.
102106
"""
@@ -134,6 +138,8 @@ def PatentsWithMoleculesQuery(
134138
num_items: int = 10,
135139
) -> Query:
136140
"""
141+
DEPRECATED: Migrate to documents queries.
142+
137143
List all patents containing any of the input molecules.
138144
The result is contained in the `patents` output of the response.
139145
"""
Lines changed: 137 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,137 @@
1+
from abc import ABC
2+
from typing import Any, Type, overload
3+
4+
from pydantic import BaseModel
5+
6+
from deepsearch.chemistry.models import ChemistryCompound, ChemistryDocument
7+
from deepsearch.chemistry.resources import KnowledgeDbResource
8+
from deepsearch.cps.client import api
9+
from deepsearch.cps.client.queries.query import Query
10+
11+
12+
class ChemistryQuery(BaseModel, ABC):
    """Common base of all composable chemistry queries.

    Subclasses set ``_result_type`` to the model class that
    ``query_chemistry`` uses to validate each returned item.
    """

    _result_type: Type
14+
15+
16+
class CompoundsQuery(ChemistryQuery):
    """Base of every query whose results are ``ChemistryCompound`` items."""

    _result_type = ChemistryCompound
18+
19+
20+
class DocumentsQuery(ChemistryQuery):
    """Base of every query whose results are ``ChemistryDocument`` items."""

    _result_type = ChemistryDocument
22+
23+
24+
class CompoundsByIds(CompoundsQuery):
    """Select the compounds carrying any of the listed identifiers.

    Attributes:
        persistent_ids: Persistent identifiers to look up; empty by default.
    """

    persistent_ids: list[str] = []
28+
29+
30+
class CompoundsBySmiles(CompoundsQuery):
    """Select the compounds that (exactly) match a SMILES code.

    Attributes:
        structure: The SMILES code to match.
    """

    structure: str
34+
35+
36+
class CompoundsBySmarts(CompoundsQuery):
    """Select the compounds that (exactly) match a SMARTS code.

    Attributes:
        structure: The SMARTS code to match.
    """

    structure: str
40+
41+
42+
class CompoundsBySimilarity(CompoundsQuery):
    """Select the compounds that are similar to a SMILES code.

    Attributes:
        structure: The SMILES code to compare against.
        threshold: Similarity threshold forwarded with the query;
            defaults to 0.9.
    """

    structure: str
    threshold: float = 0.9
47+
48+
49+
class CompoundsBySubstructure(CompoundsQuery):
    """Select the compounds containing a substructure given as SMILES code.

    Attributes:
        structure: The SMILES code of the substructure.
    """

    structure: str
53+
54+
55+
class CompoundsIn(CompoundsQuery):
    """Select the compounds that occur in a set of documents.

    Attributes:
        documents: Documents query that selects the documents to search.
    """

    documents: DocumentsQuery
59+
60+
61+
class DocumentsByIds(DocumentsQuery):
    """Select the documents carrying any of the listed identifiers.

    Attributes:
        publication_ids: Publication identifiers to look up; empty by default.
        application_ids: Application identifiers to look up; empty by default.
        persistent_ids: Persistent identifiers to look up; empty by default.
    """

    publication_ids: list[str] = []
    application_ids: list[str] = []
    persistent_ids: list[str] = []
67+
68+
69+
class DocumentsHaving(DocumentsQuery):
    """Select the documents that contain compounds matched by a query.

    Attributes:
        compounds: Compounds query that selects the compounds to look for.
    """

    compounds: CompoundsQuery
73+
74+
75+
@overload
def query_chemistry(
    api: api.CpsApi, query: CompoundsQuery, offset: int = 0, limit: int = 10
) -> list[ChemistryCompound]: ...


@overload
def query_chemistry(
    api: api.CpsApi, query: DocumentsQuery, offset: int = 0, limit: int = 10
) -> list[ChemistryDocument]: ...


def query_chemistry(
    api: api.CpsApi, query: ChemistryQuery, offset: int = 0, limit: int = 10
) -> list[Any]:
    """Perform a chemistry query on the knowledge base.

    The query is translated into a single ``KnowledgeLookup`` task. Composite
    queries (``CompoundsIn`` / ``DocumentsHaving``) are unwrapped into a chain
    of lookup function names, outermost first, and the fields of the innermost
    query are sent as the lookup arguments.

    Args:
        api: Client whose ``queries.run`` executes the composed task.
        query: The (possibly nested) query to run.
        offset: Forwarded as the ``offset`` parameter of the lookup.
        limit: Forwarded as the ``limit`` parameter of the lookup.

    Returns:
        The items of the ``result`` output, each validated with the result
        model of the outermost query.

    Raises:
        KeyError: If any query in the chain is not one of the query classes
            known to the function-name table.
    """

    # Lookup function name for each concrete query class. Keyed on the exact
    # class, so subclasses of these queries are deliberately not resolved.
    function_names = {
        CompoundsByIds: "compounds",
        CompoundsBySmiles: "compounds_by_smiles",
        CompoundsBySmarts: "compounds_by_smarts",
        CompoundsBySimilarity: "compounds_by_similarity",
        CompoundsBySubstructure: "compounds_by_substructure",
        CompoundsIn: "compounds_in_documents",
        DocumentsByIds: "documents",
        DocumentsHaving: "documents_having_compounds",
    }

    # Unwrap composite queries down to the innermost plain query. Stopping
    # after a single level would dump a nested composite query as the lookup
    # "arguments" while naming only two functions.
    # NOTE(review): assumes the lookup accepts function chains of any length
    # -- confirm against the knowledge lookup backend.
    query_parts: list[ChemistryQuery] = [query]
    while True:
        innermost = query_parts[-1]
        if type(innermost) is CompoundsIn:
            query_parts.append(innermost.documents)
        elif type(innermost) is DocumentsHaving:
            query_parts.append(innermost.compounds)
        else:
            break

    function_parts = [function_names[type(part)] for part in query_parts]
    arguments = query_parts[-1].model_dump()

    # Compose query task.
    query_tasks = Query()

    lookup = query_tasks.add(
        "KnowledgeLookup",
        task_id="lookup",
        parameters={
            "schema": "patcid",
            "function": function_parts,
            "arguments": arguments,
            "offset": offset,
            "limit": limit,
        },
        coordinates=KnowledgeDbResource(),
    )
    lookup.output("result").output_as("result")

    # Run task.
    response = api.queries.run(query_tasks)

    # The outermost query decides which model the items are validated as.
    result_type = query_parts[0]._result_type
    return [result_type.model_validate(item) for item in response.outputs["result"]]

0 commit comments

Comments
 (0)