Skip to content

Commit 06ac446

Browse files
revisit parsing of configuration (#3691)
* initial work on serde_multikey * parse DocMapping with serde_multikey * parse field config with serde_multikey * add more log level to kafka and always parse timezone from transform * use serde_multikey in DefaultDocMapper
1 parent b60943d commit 06ac446

File tree

24 files changed

+919
-215
lines changed

24 files changed

+919
-215
lines changed

docs/configuration/index-config.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -127,6 +127,7 @@ fast:
127127
| ------------- | ------------- | ------------- |
128128
| `description` | Optional description for the field. | `None` |
129129
| `stored` | Whether value is stored in the document store | `true` |
130+
| `indexed` | Whether value should be indexed so it can be searhced | `true` |
130131
| `tokenizer` | Name of the `Tokenizer`. ([See tokenizers](#description-of-available-tokenizers)) for a list of available tokenizers. | `default` |
131132
| `record` | Describes the amount of information indexed, choices between `basic`, `freq` and `position` | `basic` |
132133
| `fieldnorms` | Whether to store fieldnorms for the field. Fieldnorms are required to calculate the BM25 Score of the document. | `false` |

quickwit/Cargo.lock

Lines changed: 4 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

quickwit/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -149,7 +149,7 @@ sqlx = { version = "0.7", features = [
149149
"migrate",
150150
"time",
151151
] }
152-
syn = "2.0.11"
152+
syn = { version = "2.0.11", features = [ "extra-traits", "full", "parsing" ]}
153153
sync_wrapper = "0.1.2"
154154
tabled = { version = "0.8", features = ["color"] }
155155
tempfile = "3"

quickwit/quickwit-config/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ vrl-stdlib = { workspace = true, optional=true }
3434

3535
quickwit-common = { workspace = true }
3636
quickwit-doc-mapper = { workspace = true }
37+
quickwit-macros = { workspace = true }
3738

3839
[dev-dependencies]
3940
tokio = { workspace = true }

quickwit/quickwit-config/src/index_config/mod.rs

Lines changed: 21 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ use cron::Schedule;
3232
use humantime::parse_duration;
3333
use quickwit_common::uri::Uri;
3434
use quickwit_doc_mapper::{
35-
DefaultDocMapper, DefaultDocMapperBuilder, DocMapper, FieldMappingEntry, ModeType,
35+
DefaultDocMapper, DefaultDocMapperBuilder, DocMapper, FieldMappingEntry, Mode, ModeType,
3636
QuickwitJsonOptions, TokenizerEntry,
3737
};
3838
use serde::{Deserialize, Serialize};
@@ -44,9 +44,10 @@ use crate::TestableForRegression;
4444

4545
// Note(fmassot): `DocMapping` is a struct only used for
4646
// serialization/deserialization of `DocMapper` parameters.
47-
// This is partly a duplicate of the `DocMapper` and can
48-
// be viewed as a temporary hack for 0.2 release before
47+
// This is partly a duplicate of the `DefaultDocMapper` and
48+
// can be viewed as a temporary hack for 0.2 release before
4949
// refactoring.
50+
#[quickwit_macros::serde_multikey]
5051
#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, utoipa::ToSchema)]
5152
#[serde(deny_unknown_fields)]
5253
pub struct DocMapping {
@@ -66,10 +67,17 @@ pub struct DocMapping {
6667
pub store_source: bool,
6768
#[serde(default)]
6869
pub timestamp_field: Option<String>,
69-
#[serde(default)]
70-
pub mode: ModeType,
71-
#[serde(skip_serializing_if = "Option::is_none")]
72-
pub dynamic_mapping: Option<QuickwitJsonOptions>,
70+
#[serde_multikey(
71+
deserializer = Mode::from_parts,
72+
serializer = Mode::into_parts,
73+
fields = (
74+
#[serde(default)]
75+
mode: ModeType,
76+
#[serde(skip_serializing_if = "Option::is_none")]
77+
dynamic_mapping: Option<QuickwitJsonOptions>
78+
),
79+
)]
80+
pub mode: Mode,
7381
#[serde(default)]
7482
#[serde(skip_serializing_if = "Option::is_none")]
7583
pub partition_key: Option<String>,
@@ -436,8 +444,7 @@ impl TestableForRegression for IndexConfig {
436444
.map(|tag_field| tag_field.to_string())
437445
.collect::<BTreeSet<String>>(),
438446
store_source: true,
439-
mode: ModeType::Dynamic,
440-
dynamic_mapping: None,
447+
mode: Mode::default(),
441448
partition_key: Some("tenant_id".to_string()),
442449
max_num_partitions: NonZeroU32::new(100).unwrap(),
443450
timestamp_field: Some("timestamp".to_string()),
@@ -514,8 +521,7 @@ pub fn build_doc_mapper(
514521
timestamp_field: doc_mapping.timestamp_field.clone(),
515522
field_mappings: doc_mapping.field_mappings.clone(),
516523
tag_fields: doc_mapping.tag_fields.iter().cloned().collect(),
517-
mode: doc_mapping.mode,
518-
dynamic_mapping: doc_mapping.dynamic_mapping.clone(),
524+
mode: doc_mapping.mode.clone(),
519525
partition_key: doc_mapping.partition_key.clone(),
520526
max_num_partitions: doc_mapping.max_num_partitions,
521527
tokenizers: doc_mapping.tokenizers.clone(),
@@ -713,7 +719,10 @@ mod tests {
713719
&Uri::from_well_formed("s3://my-index"),
714720
)
715721
.unwrap();
716-
assert_eq!(minimal_config.doc_mapping.mode, ModeType::Dynamic);
722+
assert_eq!(
723+
minimal_config.doc_mapping.mode.mode_type(),
724+
ModeType::Dynamic
725+
);
717726
}
718727

719728
#[test]

quickwit/quickwit-config/src/source_config/mod.rs

Lines changed: 24 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -165,7 +165,7 @@ impl TestableForRegression for SourceConfig {
165165
}),
166166
transform_config: Some(TransformConfig {
167167
vrl_script: ".message = downcase(string!(.message))".to_string(),
168-
timezone_opt: None,
168+
timezone: default_timezone(),
169169
}),
170170
input_format: SourceInputFormat::Json,
171171
}
@@ -413,9 +413,12 @@ pub struct TransformConfig {
413413

414414
/// Timezone used in the VRL [`Program`](vrl::compiler::Program) for date and time
415415
/// manipulations. Defaults to `UTC` if not timezone is specified.
416-
#[serde(skip_serializing_if = "Option::is_none")]
417-
#[serde(rename = "timezone")]
418-
timezone_opt: Option<String>,
416+
#[serde(default = "default_timezone")]
417+
timezone: String,
418+
}
419+
420+
fn default_timezone() -> String {
421+
"UTC".to_string()
419422
}
420423

421424
impl TransformConfig {
@@ -424,7 +427,7 @@ impl TransformConfig {
424427
pub fn new(vrl_script: String, timezone_opt: Option<String>) -> Self {
425428
Self {
426429
vrl_script,
427-
timezone_opt,
430+
timezone: timezone_opt.unwrap_or_else(default_timezone),
428431
}
429432
}
430433

@@ -450,11 +453,11 @@ impl TransformConfig {
450453
&self,
451454
) -> anyhow::Result<(vrl::compiler::Program, vrl::compiler::TimeZone)> {
452455
use anyhow::Context;
453-
let timezone_str = self.timezone_opt.as_deref().unwrap_or("UTC");
454-
let timezone = vrl::compiler::TimeZone::parse(timezone_str).with_context(|| {
456+
let timezone = vrl::compiler::TimeZone::parse(&self.timezone).with_context(|| {
455457
format!(
456-
"Failed to parse timezone: `{timezone_str}`. Timezone must be a valid name \
457-
in the TZ database: https://en.wikipedia.org/wiki/List_of_tz_database_time_zones"
458+
"Failed to parse timezone: `{}`. Timezone must be a valid name \
459+
in the TZ database: https://en.wikipedia.org/wiki/List_of_tz_database_time_zones",
460+
self.timezone,
458461
)
459462
})?;
460463
// Append "\n." to the script to return the entire document and not only the modified
@@ -487,7 +490,7 @@ impl TransformConfig {
487490
pub fn for_test(vrl_script: &str) -> Self {
488491
Self {
489492
vrl_script: vrl_script.to_string(),
490-
timezone_opt: None,
493+
timezone: default_timezone(),
491494
}
492495
}
493496
}
@@ -532,7 +535,7 @@ mod tests {
532535
}),
533536
transform_config: Some(TransformConfig {
534537
vrl_script: ".message = downcase(string!(.message))".to_string(),
535-
timezone_opt: Some("local".to_string()),
538+
timezone: "local".to_string(),
536539
}),
537540
input_format: SourceInputFormat::Json,
538541
};
@@ -628,7 +631,7 @@ mod tests {
628631
}),
629632
transform_config: Some(TransformConfig {
630633
vrl_script: ".message = downcase(string!(.message))".to_string(),
631-
timezone_opt: Some("local".to_string()),
634+
timezone: "local".to_string(),
632635
}),
633636
input_format: SourceInputFormat::Json,
634637
};
@@ -1032,7 +1035,7 @@ mod tests {
10321035
source_params: SourceParams::IngestApi,
10331036
transform_config: Some(TransformConfig {
10341037
vrl_script: ".message = downcase(string!(.message))".to_string(),
1035-
timezone_opt: None,
1038+
timezone: default_timezone(),
10361039
}),
10371040
input_format: SourceInputFormat::Json,
10381041
};
@@ -1045,7 +1048,7 @@ mod tests {
10451048
{
10461049
let transform_config = TransformConfig {
10471050
vrl_script: ".message = downcase(string!(.message))".to_string(),
1048-
timezone_opt: Some("local".to_string()),
1051+
timezone: "local".to_string(),
10491052
};
10501053
let transform_config_yaml = serde_yaml::to_string(&transform_config).unwrap();
10511054
assert_eq!(
@@ -1056,7 +1059,7 @@ mod tests {
10561059
{
10571060
let transform_config = TransformConfig {
10581061
vrl_script: ".message = downcase(string!(.message))".to_string(),
1059-
timezone_opt: None,
1062+
timezone: default_timezone(),
10601063
};
10611064
let transform_config_yaml = serde_yaml::to_string(&transform_config).unwrap();
10621065
assert_eq!(
@@ -1077,7 +1080,7 @@ mod tests {
10771080

10781081
let expected_transform_config = TransformConfig {
10791082
vrl_script: ".message = downcase(string!(.message))".to_string(),
1080-
timezone_opt: None,
1083+
timezone: default_timezone(),
10811084
};
10821085
assert_eq!(transform_config, expected_transform_config);
10831086
}
@@ -1091,7 +1094,7 @@ mod tests {
10911094

10921095
let expected_transform_config = TransformConfig {
10931096
vrl_script: ".message = downcase(string!(.message))".to_string(),
1094-
timezone_opt: Some("Turkey".to_string()),
1097+
timezone: "Turkey".to_string(),
10951098
};
10961099
assert_eq!(transform_config, expected_transform_config);
10971100
}
@@ -1103,7 +1106,7 @@ mod tests {
11031106
{
11041107
let transform_config = TransformConfig {
11051108
vrl_script: ".message = downcase(string!(.message))".to_string(),
1106-
timezone_opt: Some("Turkey".to_string()),
1109+
timezone: "Turkey".to_string(),
11071110
};
11081111
transform_config.compile_vrl_script().unwrap();
11091112
}
@@ -1116,22 +1119,22 @@ mod tests {
11161119
.message = downcase(string!(.message))
11171120
"#
11181121
.to_string(),
1119-
timezone_opt: None,
1122+
timezone: default_timezone(),
11201123
};
11211124
transform_config.compile_vrl_script().unwrap();
11221125
}
11231126
{
11241127
let transform_config = TransformConfig {
11251128
vrl_script: ".message = downcase(string!(.message))".to_string(),
1126-
timezone_opt: Some("foo".to_string()),
1129+
timezone: "foo".to_string(),
11271130
};
11281131
let error = transform_config.compile_vrl_script().unwrap_err();
11291132
assert!(error.to_string().starts_with("Failed to parse timezone"));
11301133
}
11311134
{
11321135
let transform_config = TransformConfig {
11331136
vrl_script: "foo".to_string(),
1134-
timezone_opt: Some("Turkey".to_string()),
1137+
timezone: "Turkey".to_string(),
11351138
};
11361139
let error = transform_config.compile_vrl_script().unwrap_err();
11371140
assert!(error.to_string().starts_with("Failed to compile"));

quickwit/quickwit-doc-mapper/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ typetag = { workspace = true }
3131
utoipa = { workspace = true }
3232

3333
quickwit-datetime = { workspace = true }
34+
quickwit-macros = { workspace = true }
3435
quickwit-query = { workspace = true }
3536

3637
[dev-dependencies]

quickwit/quickwit-doc-mapper/src/default_doc_mapper/default_mapper.rs

Lines changed: 7 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -39,29 +39,10 @@ use crate::doc_mapper::{JsonObject, Partition};
3939
use crate::query_builder::build_query;
4040
use crate::routing_expression::RoutingExpr;
4141
use crate::{
42-
Cardinality, DocMapper, DocParsingError, ModeType, QueryParserError, TokenizerEntry,
43-
WarmupInfo, DYNAMIC_FIELD_NAME, SOURCE_FIELD_NAME,
42+
Cardinality, DocMapper, DocParsingError, Mode, QueryParserError, TokenizerEntry, WarmupInfo,
43+
DYNAMIC_FIELD_NAME, SOURCE_FIELD_NAME,
4444
};
4545

46-
/// Defines how an unmapped field should be handled.
47-
#[derive(Clone, Debug, Serialize, Deserialize, Default)]
48-
pub(crate) enum Mode {
49-
#[default]
50-
Lenient,
51-
Strict,
52-
Dynamic(QuickwitJsonOptions),
53-
}
54-
55-
impl Mode {
56-
pub fn mode_type(&self) -> ModeType {
57-
match self {
58-
Mode::Lenient => ModeType::Lenient,
59-
Mode::Strict => ModeType::Strict,
60-
Mode::Dynamic(_) => ModeType::Dynamic,
61-
}
62-
}
63-
}
64-
6546
/// Default [`DocMapper`] implementation
6647
/// which defines a set of rules to map json fields
6748
/// to tantivy index fields.
@@ -149,7 +130,6 @@ impl TryFrom<DefaultDocMapperBuilder> for DefaultDocMapper {
149130
type Error = anyhow::Error;
150131

151132
fn try_from(builder: DefaultDocMapperBuilder) -> anyhow::Result<DefaultDocMapper> {
152-
let mode = builder.mode()?;
153133
let mut schema_builder = Schema::builder();
154134
let field_mappings = build_mapping_tree(&builder.field_mappings, &mut schema_builder)?;
155135
let source_field = if builder.store_source {
@@ -162,7 +142,7 @@ impl TryFrom<DefaultDocMapperBuilder> for DefaultDocMapper {
162142
validate_timestamp_field(timestamp_field_path, &field_mappings)?;
163143
};
164144

165-
let dynamic_field = if let Mode::Dynamic(json_options) = &mode {
145+
let dynamic_field = if let Mode::Dynamic(json_options) = &builder.mode {
166146
Some(schema_builder.add_json_field(DYNAMIC_FIELD_NAME, json_options.clone()))
167147
} else {
168148
None
@@ -255,7 +235,7 @@ impl TryFrom<DefaultDocMapperBuilder> for DefaultDocMapper {
255235
required_fields,
256236
partition_key,
257237
max_num_partitions: builder.max_num_partitions,
258-
mode,
238+
mode: builder.mode,
259239
tokenizer_entries: builder.tokenizers,
260240
tokenizer_manager,
261241
})
@@ -338,11 +318,6 @@ fn validate_fields_tokenizers(
338318

339319
impl From<DefaultDocMapper> for DefaultDocMapperBuilder {
340320
fn from(default_doc_mapper: DefaultDocMapper) -> Self {
341-
let mode = default_doc_mapper.mode.mode_type();
342-
let dynamic_mapping: Option<QuickwitJsonOptions> = match &default_doc_mapper.mode {
343-
Mode::Dynamic(mapping_options) => Some(mapping_options.clone()),
344-
_ => None,
345-
};
346321
let partition_key_str = default_doc_mapper.partition_key.to_string();
347322
let partition_key_opt: Option<String> = if partition_key_str.is_empty() {
348323
None
@@ -357,8 +332,7 @@ impl From<DefaultDocMapper> for DefaultDocMapperBuilder {
357332
field_mappings: default_doc_mapper.field_mappings.into(),
358333
tag_fields: default_doc_mapper.tag_field_names.into_iter().collect(),
359334
default_search_fields: default_doc_mapper.default_search_field_names,
360-
mode,
361-
dynamic_mapping,
335+
mode: default_doc_mapper.mode,
362336
partition_key: partition_key_opt,
363337
max_num_partitions: default_doc_mapper.max_num_partitions,
364338
tokenizers: default_doc_mapper.tokenizer_entries,
@@ -1545,8 +1519,8 @@ mod tests {
15451519
.unwrap();
15461520
match &field_mapping_type {
15471521
super::FieldMappingType::Text(options, _) => {
1548-
assert!(options.tokenizer.is_some());
1549-
let tokenizer = options.tokenizer.as_ref().unwrap();
1522+
assert!(options.indexing_options.is_some());
1523+
let tokenizer = &options.indexing_options.as_ref().unwrap().tokenizer;
15501524
assert_eq!(tokenizer.name(), "my_tokenizer");
15511525
}
15521526
_ => panic!("Expected a text field"),

0 commit comments

Comments
 (0)