Skip to content

Commit 659d72b

Browse files
SuaYoo and tw4l authored
feat: Add dedupe API filters (#3056)
Enables filtering crawls, crawlconfigs, and collections by dedupe property. --------- Co-authored-by: Tessa Walsh <[email protected]>
1 parent c817844 commit 659d72b

File tree

3 files changed

+21
-0
lines changed

3 files changed

+21
-0
lines changed

backend/btrixcloud/basecrawls.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -674,6 +674,7 @@ async def list_all_base_crawls(
674674
tags: list[str] | None = None,
675675
tag_match: ListFilterType | None = None,
676676
collection_id: Optional[UUID] = None,
677+
dedupe_coll_id: Optional[UUID] = None,
677678
states: Optional[List[str]] = None,
678679
first_seed: Optional[str] = None,
679680
type_: Optional[str] = None,
@@ -790,6 +791,9 @@ async def list_all_base_crawls(
790791
if description:
791792
aggregate.extend([{"$match": {"description": description}}])
792793

794+
if dedupe_coll_id:
795+
aggregate.extend([{"$match": {"dedupeCollId": dedupe_coll_id}}])
796+
793797
if collection_id:
794798
aggregate.extend([{"$match": {"collectionIds": {"$in": [collection_id]}}}])
795799

@@ -1078,6 +1082,7 @@ async def list_all_base_crawls(
10781082
),
10791083
] = ListFilterType.AND,
10801084
collectionId: Optional[UUID] = None,
1085+
dedupeCollId: Optional[UUID] = None,
10811086
crawlType: Optional[str] = None,
10821087
cid: Optional[UUID] = None,
10831088
sortBy: Optional[str] = "finished",
@@ -1110,6 +1115,7 @@ async def list_all_base_crawls(
11101115
tags=tags,
11111116
tag_match=tag_match,
11121117
collection_id=collectionId,
1118+
dedupe_coll_id=dedupeCollId,
11131119
states=states,
11141120
first_seed=firstSeed,
11151121
type_=crawlType,

backend/btrixcloud/colls.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -539,6 +539,7 @@ async def list_collections(
539539
sort_direction: int = 1,
540540
name: Optional[str] = None,
541541
name_prefix: Optional[str] = None,
542+
has_dedupe_index: Optional[bool] = None,
542543
access: Optional[str] = None,
543544
headers: Optional[dict] = None,
544545
):
@@ -556,6 +557,9 @@ async def list_collections(
556557
regex_pattern = f"^{name_prefix}"
557558
match_query["name"] = {"$regex": regex_pattern, "$options": "i"}
558559

560+
if has_dedupe_index is not None:
561+
match_query["hasDedupeIndex"] = has_dedupe_index
562+
559563
if public_colls_out:
560564
match_query["access"] = CollAccessType.PUBLIC
561565
elif access:
@@ -1123,6 +1127,7 @@ async def list_collection_all(
11231127
sortDirection: int = 1,
11241128
name: Optional[str] = None,
11251129
namePrefix: Optional[str] = None,
1130+
hasDedupeIndex: Optional[bool] = None,
11261131
access: Optional[str] = None,
11271132
):
11281133
# pylint: disable=duplicate-code
@@ -1134,6 +1139,7 @@ async def list_collection_all(
11341139
sort_direction=sortDirection,
11351140
name=name,
11361141
name_prefix=namePrefix,
1142+
has_dedupe_index=hasDedupeIndex,
11371143
access=access,
11381144
headers=dict(request.headers),
11391145
)

backend/btrixcloud/crawlconfigs.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -814,6 +814,7 @@ async def get_crawl_configs(
814814
description: Optional[str] = None,
815815
tags: Optional[List[str]] = None,
816816
tag_match: Optional[ListFilterType] = ListFilterType.AND,
817+
dedupe_coll_id: Optional[UUID] = None,
817818
last_crawl_state: list[TYPE_ALL_CRAWL_STATES] | None = None,
818819
schedule: Optional[bool] = None,
819820
is_crawl_running: Optional[bool] = None,
@@ -868,6 +869,9 @@ async def get_crawl_configs(
868869
if first_seed:
869870
aggregate.extend([{"$match": {"firstSeed": first_seed}}])
870871

872+
if dedupe_coll_id:
873+
# Filter on the requested dedupe collection id itself, not the first-seed value.
aggregate.extend([{"$match": {"dedupeCollId": dedupe_coll_id}}])
874+
871875
if sort_by:
872876
if sort_by not in ALLOWED_SORT_KEYS:
873877
raise HTTPException(status_code=400, detail="invalid_sort_by")
@@ -1728,6 +1732,10 @@ async def get_crawl_configs(
17281732
description='Defaults to `"and"` if omitted',
17291733
),
17301734
] = ListFilterType.AND,
1735+
dedupe_coll_id: Annotated[
1736+
Optional[UUID],
1737+
Query(alias="dedupeCollId", title="Deduplication Source Collection"),
1738+
] = None,
17311739
last_crawl_state: Annotated[
17321740
list[TYPE_ALL_CRAWL_STATES] | None,
17331741
Query(alias="lastCrawlState", title="Last Crawl State"),
@@ -1765,6 +1773,7 @@ async def get_crawl_configs(
17651773
description=description,
17661774
tags=tags,
17671775
tag_match=tag_match,
1776+
dedupe_coll_id=dedupe_coll_id,
17681777
last_crawl_state=last_crawl_state,
17691778
schedule=schedule,
17701779
is_crawl_running=is_crawl_running,

0 commit comments

Comments
 (0)