Skip to content

Commit 49ed58b

Browse files
ruoliu2claude
andcommitted
refactor(es-compat): use best-effort metadata filtering for index_filter
Remove heavy split-level query execution for field_caps index_filter. The implementation now aligns with ES's "best-effort" approach that uses metadata-level filtering only (time range, tags) instead of opening splits and executing queries. Changes: - Remove split_matches_query function (no longer opens splits) - Remove query_ast and doc_mapper from LeafListFieldsRequest proto - Keep metadata-level filtering in root_list_fields: - Time range extraction from query AST - Tag-based split pruning - Simplify leaf_list_fields to just return fields from all splits This matches ES semantics: "filtering is done on a best-effort basis... this API may return an index even if the provided filter matches no document." Co-Authored-By: Claude Opus 4.5 <[email protected]> Signed-off-by: ruo <[email protected]>
1 parent 84989b2 commit 49ed58b

File tree

5 files changed

+14
-147
lines changed

5 files changed

+14
-147
lines changed

quickwit/quickwit-proto/protos/quickwit/search.proto

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -145,14 +145,6 @@ message LeafListFieldsRequest {
145145
// Optional limit query to a list of fields
146146
// Wildcard expressions are supported.
147147
repeated string fields = 4;
148-
149-
// JSON-serialized QueryAst for index_filter support.
150-
// When provided, only splits containing documents matching this query are included.
151-
optional string query_ast = 5;
152-
153-
// JSON-serialized DocMapper for query execution.
154-
// Required when query_ast is provided to build and execute the query.
155-
optional string doc_mapper = 6;
156148
}
157149

158150
message ListFieldsResponse {

quickwit/quickwit-proto/src/codegen/quickwit/quickwit.search.rs

Lines changed: 0 additions & 8 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

quickwit/quickwit-search/src/list_fields.rs

Lines changed: 13 additions & 129 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,6 @@ use quickwit_common::rate_limited_warn;
2525
use quickwit_common::shared_consts::{FIELD_PRESENCE_FIELD_NAME, SPLIT_FIELDS_FILE_NAME};
2626
use quickwit_common::uri::Uri;
2727
use quickwit_config::build_doc_mapper;
28-
use quickwit_doc_mapper::DocMapper;
2928
use quickwit_doc_mapper::tag_pruning::extract_tags_from_query;
3029
use quickwit_metastore::SplitMetadata;
3130
use quickwit_proto::metastore::MetastoreServiceClient;
@@ -35,10 +34,9 @@ use quickwit_proto::search::{
3534
};
3635
use quickwit_proto::types::{IndexId, IndexUid};
3736
use quickwit_query::query_ast::QueryAst;
38-
use quickwit_storage::{ByteRangeCache, Storage};
39-
use tantivy::ReloadPolicy;
37+
use quickwit_storage::Storage;
4038

41-
use crate::leaf::{open_index_with_caches, open_split_bundle, warmup};
39+
use crate::leaf::open_split_bundle;
4240
use crate::search_job_placer::group_jobs_by_index_id;
4341
use crate::service::SearcherContext;
4442
use crate::{
@@ -314,75 +312,15 @@ impl FieldPattern {
314312
}
315313
}
316314

317-
/// Checks if any documents in the split match the query.
318-
/// Returns true if at least one document matches, false otherwise.
319-
///
320-
/// This is a lightweight query execution that only counts matches without
321-
/// materializing documents, used for split-level filtering in field capabilities.
322-
async fn split_matches_query(
323-
searcher_context: &SearcherContext,
324-
index_storage: Arc<dyn Storage>,
325-
split: &SplitIdAndFooterOffsets,
326-
doc_mapper: &DocMapper,
327-
query_ast: &QueryAst,
328-
) -> crate::Result<bool> {
329-
let byte_range_cache =
330-
ByteRangeCache::with_infinite_capacity(&quickwit_storage::STORAGE_METRICS.shortlived_cache);
331-
// Open split with caches
332-
let (index, _hot_directory) = open_index_with_caches(
333-
searcher_context,
334-
index_storage,
335-
split,
336-
Some(doc_mapper.tokenizer_manager()),
337-
Some(byte_range_cache),
338-
)
339-
.await?;
340-
341-
// Create searcher with manual reload policy
342-
let reader = index
343-
.reader_builder()
344-
.reload_policy(ReloadPolicy::Manual)
345-
.try_into()
346-
.map_err(|err| SearchError::Internal(format!("failed to create index reader: {err}")))?;
347-
let searcher = reader.searcher();
348-
349-
// Build query from QueryAst
350-
let (query, mut warmup_info) = doc_mapper
351-
.query(searcher.schema().clone(), query_ast.clone(), false, None)
352-
.map_err(|err| SearchError::InvalidQuery(format!("failed to build query: {err}")))?;
353-
354-
// Warmup to ensure all bytes are fetched asynchronously before sync search
355-
warmup_info.simplify();
356-
warmup(&searcher, &warmup_info)
357-
.await
358-
.map_err(|err| SearchError::Internal(format!("failed to warmup query: {err}")))?;
359-
360-
// Check if any docs match (lightweight count)
361-
let count = search_thread_pool()
362-
.run_cpu_intensive(move || {
363-
query
364-
.count(&searcher)
365-
.map_err(|err| SearchError::Internal(format!("failed to count matches: {err}")))
366-
})
367-
.await
368-
.map_err(|_| SearchError::Internal("split matches query panicked".to_string()))??;
369-
370-
Ok(count > 0)
371-
}
372-
373315
/// `leaf` step of list fields.
374316
///
375-
/// Returns field metadata from the assigned splits. When `query_ast` and `doc_mapper_str`
376-
/// are provided, splits are filtered to only include those containing at least one
377-
/// matching document (lightweight query execution for split-level filtering).
317+
/// Returns field metadata from the assigned splits.
378318
pub async fn leaf_list_fields(
379319
index_id: IndexId,
380320
index_storage: Arc<dyn Storage>,
381321
searcher_context: &SearcherContext,
382322
split_ids: &[SplitIdAndFooterOffsets],
383323
field_patterns_str: &[String],
384-
query_ast_str: Option<&str>,
385-
doc_mapper_str: Option<&str>,
386324
) -> crate::Result<ListFieldsResponse> {
387325
let field_patterns: Vec<FieldPattern> = field_patterns_str
388326
.iter()
@@ -394,47 +332,8 @@ pub async fn leaf_list_fields(
394332
return Ok(ListFieldsResponse { fields: Vec::new() });
395333
}
396334

397-
// Filter splits based on query if both query_ast and doc_mapper are provided
398-
let matching_splits: Vec<&SplitIdAndFooterOffsets> = match (query_ast_str, doc_mapper_str) {
399-
(Some(ast_json), Some(mapper_json)) => {
400-
let query_ast: QueryAst = serde_json::from_str(ast_json)
401-
.map_err(|err| SearchError::InvalidQuery(err.to_string()))?;
402-
let doc_mapper = crate::service::deserialize_doc_mapper(mapper_json)?;
403-
404-
let split_match_tasks: Vec<_> = split_ids
405-
.iter()
406-
.map(|split| {
407-
let index_storage = index_storage.clone();
408-
async {
409-
split_matches_query(
410-
searcher_context,
411-
index_storage,
412-
split,
413-
&doc_mapper,
414-
&query_ast,
415-
)
416-
.await
417-
}
418-
})
419-
.collect();
420-
421-
let matches_vec = try_join_all(split_match_tasks).await?;
422-
split_ids
423-
.iter()
424-
.zip(matches_vec)
425-
.filter_map(|(split, matches)| matches.then_some(split))
426-
.collect()
427-
}
428-
_ => split_ids.iter().collect(),
429-
};
430-
431-
// If no splits match, return empty response
432-
if matching_splits.is_empty() {
433-
return Ok(ListFieldsResponse { fields: Vec::new() });
434-
}
435-
436-
// Get fields from matching splits
437-
let single_split_list_fields_futures: Vec<_> = matching_splits
335+
// Get fields from all splits
336+
let single_split_list_fields_futures: Vec<_> = split_ids
438337
.iter()
439338
.map(|split_id| {
440339
get_fields_from_split(
@@ -494,8 +393,6 @@ pub struct IndexMetasForLeafSearch {
494393
pub index_id: IndexId,
495394
/// Index URI.
496395
pub index_uri: Uri,
497-
/// Serialized DocMapper for query execution (only set when query_ast is provided).
498-
pub doc_mapper_str: Option<String>,
499396
}
500397

501398
/// Performs a distributed list fields request.
@@ -514,37 +411,26 @@ pub async fn root_list_fields(
514411
return Ok(ListFieldsResponse { fields: Vec::new() });
515412
}
516413

517-
// Build index metadata map, including doc_mapper if query_ast is provided
518-
let has_query_ast = list_fields_req.query_ast.is_some();
414+
// Build index metadata map and extract timestamp field for time range refinement
519415
let mut index_uid_to_index_meta: HashMap<IndexUid, IndexMetasForLeafSearch> = HashMap::new();
520416
let mut index_uids: Vec<IndexUid> = Vec::new();
521417
let mut timestamp_field_opt: Option<String> = None;
522418

523419
for index_metadata in indexes_metadata {
524-
// Only build doc_mapper when query_ast is provided (needed for split-level filtering)
525-
let doc_mapper_str = if has_query_ast {
526-
let doc_mapper = build_doc_mapper(
420+
// Extract timestamp field for time range refinement (use first index's field)
421+
if timestamp_field_opt.is_none()
422+
&& list_fields_req.query_ast.is_some()
423+
&& let Ok(doc_mapper) = build_doc_mapper(
527424
&index_metadata.index_config.doc_mapping,
528425
&index_metadata.index_config.search_settings,
529426
)
530-
.map_err(|err| SearchError::Internal(format!("failed to build doc mapper: {err}")))?;
531-
532-
// Capture timestamp field for time range extraction (use first index's field)
533-
if timestamp_field_opt.is_none() {
534-
timestamp_field_opt = doc_mapper.timestamp_field_name().map(|s| s.to_string());
535-
}
536-
537-
Some(serde_json::to_string(&doc_mapper).map_err(|err| {
538-
SearchError::Internal(format!("failed to serialize doc mapper: {err}"))
539-
})?)
540-
} else {
541-
None
542-
};
427+
{
428+
timestamp_field_opt = doc_mapper.timestamp_field_name().map(|s| s.to_string());
429+
}
543430

544431
let index_metadata_for_leaf_search = IndexMetasForLeafSearch {
545432
index_uri: index_metadata.index_uri().clone(),
546433
index_id: index_metadata.index_config.index_id.to_string(),
547-
doc_mapper_str,
548434
};
549435

550436
index_uids.push(index_metadata.index_uid.clone());
@@ -637,8 +523,6 @@ pub fn jobs_to_leaf_requests(
637523
index_uri: index_meta.index_uri.to_string(),
638524
fields: search_request_for_leaf.fields.clone(),
639525
split_offsets: job_group.into_iter().map(|job| job.offsets).collect(),
640-
query_ast: search_request_for_leaf.query_ast.clone(),
641-
doc_mapper: index_meta.doc_mapper_str.clone(),
642526
};
643527
leaf_search_requests.push(leaf_search_request);
644528
Ok(())

quickwit/quickwit-search/src/service.rs

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -307,8 +307,6 @@ impl SearchService for SearchServiceImpl {
307307
&self.searcher_context,
308308
&split_ids[..],
309309
&list_fields_req.fields,
310-
list_fields_req.query_ast.as_deref(),
311-
list_fields_req.doc_mapper.as_deref(),
312310
)
313311
.await
314312
}

quickwit/quickwit-serve/src/elasticsearch_api/model/field_capability.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -181,6 +181,7 @@ pub fn convert_to_es_field_capabilities_response(
181181
/// Returns `Ok(None)` if the index_filter is null or empty.
182182
/// Returns `Ok(Some(QueryAst))` if the index_filter is valid.
183183
/// Returns `Err` if the index_filter is invalid or cannot be converted.
184+
#[allow(clippy::result_large_err)]
184185
pub fn parse_index_filter_to_query_ast(
185186
index_filter: serde_json::Value,
186187
) -> Result<Option<QueryAst>, ElasticsearchError> {

0 commit comments

Comments
 (0)