From cb2c9a02aa7b2c43678dc92ac9423214e8f51216 Mon Sep 17 00:00:00 2001 From: Quentin Pradet Date: Mon, 9 Sep 2024 12:48:39 +0400 Subject: [PATCH] Add missing tokenizers (#2877) --- output/openapi/elasticsearch-openapi.json | 231 +++++++--- .../elasticsearch-serverless-openapi.json | 231 +++++++--- output/schema/schema-serverless.json | 399 ++++++++++++------ output/schema/schema.json | 207 +++++++-- output/typescript/types.ts | 25 +- specification/_types/analysis/analyzers.ts | 2 +- specification/_types/analysis/nori-plugin.ts | 34 ++ specification/_types/analysis/tokenizers.ts | 83 ++-- 8 files changed, 906 insertions(+), 306 deletions(-) create mode 100644 specification/_types/analysis/nori-plugin.ts diff --git a/output/openapi/elasticsearch-openapi.json b/output/openapi/elasticsearch-openapi.json index 846b23d746..4d9b7c4d0e 100644 --- a/output/openapi/elasticsearch-openapi.json +++ b/output/openapi/elasticsearch-openapi.json @@ -74744,6 +74744,9 @@ { "$ref": "#/components/schemas/_types.analysis:CharGroupTokenizer" }, + { + "$ref": "#/components/schemas/_types.analysis:ClassicTokenizer" + }, { "$ref": "#/components/schemas/_types.analysis:EdgeNGramTokenizer" }, @@ -74760,14 +74763,23 @@ "$ref": "#/components/schemas/_types.analysis:NGramTokenizer" }, { - "$ref": "#/components/schemas/_types.analysis:NoriTokenizer" + "$ref": "#/components/schemas/_types.analysis:PathHierarchyTokenizer" }, { - "$ref": "#/components/schemas/_types.analysis:PathHierarchyTokenizer" + "$ref": "#/components/schemas/_types.analysis:PatternTokenizer" + }, + { + "$ref": "#/components/schemas/_types.analysis:SimplePatternTokenizer" + }, + { + "$ref": "#/components/schemas/_types.analysis:SimplePatternSplitTokenizer" }, { "$ref": "#/components/schemas/_types.analysis:StandardTokenizer" }, + { + "$ref": "#/components/schemas/_types.analysis:ThaiTokenizer" + }, { "$ref": "#/components/schemas/_types.analysis:UaxEmailUrlTokenizer" }, @@ -74775,13 +74787,13 @@ "$ref": 
"#/components/schemas/_types.analysis:WhitespaceTokenizer" }, { - "$ref": "#/components/schemas/_types.analysis:KuromojiTokenizer" + "$ref": "#/components/schemas/_types.analysis:IcuTokenizer" }, { - "$ref": "#/components/schemas/_types.analysis:PatternTokenizer" + "$ref": "#/components/schemas/_types.analysis:KuromojiTokenizer" }, { - "$ref": "#/components/schemas/_types.analysis:IcuTokenizer" + "$ref": "#/components/schemas/_types.analysis:NoriTokenizer" } ] }, @@ -74824,6 +74836,30 @@ } } }, + "_types.analysis:ClassicTokenizer": { + "allOf": [ + { + "$ref": "#/components/schemas/_types.analysis:TokenizerBase" + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": [ + "classic" + ] + }, + "max_token_length": { + "type": "number" + } + }, + "required": [ + "type" + ] + } + ] + }, "_types.analysis:EdgeNGramTokenizer": { "allOf": [ { @@ -74857,8 +74893,7 @@ "required": [ "type", "max_gram", - "min_gram", - "token_chars" + "min_gram" ] } ] @@ -74974,13 +75009,12 @@ "required": [ "type", "max_gram", - "min_gram", - "token_chars" + "min_gram" ] } ] }, - "_types.analysis:NoriTokenizer": { + "_types.analysis:PathHierarchyTokenizer": { "allOf": [ { "$ref": "#/components/schemas/_types.analysis:TokenizerBase" @@ -74991,23 +75025,23 @@ "type": { "type": "string", "enum": [ - "nori_tokenizer" + "path_hierarchy" ] }, - "decompound_mode": { - "$ref": "#/components/schemas/_types.analysis:NoriDecompoundMode" + "buffer_size": { + "$ref": "#/components/schemas/_spec_utils:Stringifiedinteger" }, - "discard_punctuation": { - "type": "boolean" + "delimiter": { + "type": "string" }, - "user_dictionary": { + "replacement": { "type": "string" }, - "user_dictionary_rules": { - "type": "array", - "items": { - "type": "string" - } + "reverse": { + "$ref": "#/components/schemas/_spec_utils:Stringifiedboolean" + }, + "skip": { + "$ref": "#/components/schemas/_spec_utils:Stringifiedinteger" } }, "required": [ @@ -75016,7 +75050,7 @@ } ] }, - 
"_types.analysis:PathHierarchyTokenizer": { + "_types.analysis:PatternTokenizer": { "allOf": [ { "$ref": "#/components/schemas/_types.analysis:TokenizerBase" @@ -75027,23 +75061,65 @@ "type": { "type": "string", "enum": [ - "path_hierarchy" + "pattern" ] }, - "buffer_size": { - "$ref": "#/components/schemas/_spec_utils:Stringifiedinteger" - }, - "delimiter": { + "flags": { "type": "string" }, - "replacement": { + "group": { + "type": "number" + }, + "pattern": { "type": "string" + } + }, + "required": [ + "type" + ] + } + ] + }, + "_types.analysis:SimplePatternTokenizer": { + "allOf": [ + { + "$ref": "#/components/schemas/_types.analysis:TokenizerBase" + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": [ + "simple_pattern" + ] }, - "reverse": { - "$ref": "#/components/schemas/_spec_utils:Stringifiedboolean" + "pattern": { + "type": "string" + } + }, + "required": [ + "type" + ] + } + ] + }, + "_types.analysis:SimplePatternSplitTokenizer": { + "allOf": [ + { + "$ref": "#/components/schemas/_types.analysis:TokenizerBase" + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": [ + "simple_pattern_split" + ] }, - "skip": { - "$ref": "#/components/schemas/_spec_utils:Stringifiedinteger" + "pattern": { + "type": "string" } }, "required": [ @@ -75076,6 +75152,27 @@ } ] }, + "_types.analysis:ThaiTokenizer": { + "allOf": [ + { + "$ref": "#/components/schemas/_types.analysis:TokenizerBase" + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": [ + "thai" + ] + } + }, + "required": [ + "type" + ] + } + ] + }, "_types.analysis:UaxEmailUrlTokenizer": { "allOf": [ { @@ -75124,6 +75221,31 @@ } ] }, + "_types.analysis:IcuTokenizer": { + "allOf": [ + { + "$ref": "#/components/schemas/_types.analysis:TokenizerBase" + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": [ + "icu_tokenizer" + ] + }, + "rule_files": { + "type": "string" + } + }, + 
"required": [ + "type", + "rule_files" + ] + } + ] + }, "_types.analysis:KuromojiTokenizer": { "allOf": [ { @@ -75170,7 +75292,7 @@ } ] }, - "_types.analysis:PatternTokenizer": { + "_types.analysis:NoriTokenizer": { "allOf": [ { "$ref": "#/components/schemas/_types.analysis:TokenizerBase" @@ -75181,46 +75303,27 @@ "type": { "type": "string", "enum": [ - "pattern" + "nori_tokenizer" ] }, - "flags": { - "type": "string" + "decompound_mode": { + "$ref": "#/components/schemas/_types.analysis:NoriDecompoundMode" }, - "group": { - "type": "number" + "discard_punctuation": { + "type": "boolean" }, - "pattern": { + "user_dictionary": { "type": "string" - } - }, - "required": [ - "type" - ] - } - ] - }, - "_types.analysis:IcuTokenizer": { - "allOf": [ - { - "$ref": "#/components/schemas/_types.analysis:TokenizerBase" - }, - { - "type": "object", - "properties": { - "type": { - "type": "string", - "enum": [ - "icu_tokenizer" - ] }, - "rule_files": { - "type": "string" + "user_dictionary_rules": { + "type": "array", + "items": { + "type": "string" + } } }, "required": [ - "type", - "rule_files" + "type" ] } ] diff --git a/output/openapi/elasticsearch-serverless-openapi.json b/output/openapi/elasticsearch-serverless-openapi.json index c8d5f984e2..fe2784eaa4 100644 --- a/output/openapi/elasticsearch-serverless-openapi.json +++ b/output/openapi/elasticsearch-serverless-openapi.json @@ -48666,6 +48666,9 @@ { "$ref": "#/components/schemas/_types.analysis:CharGroupTokenizer" }, + { + "$ref": "#/components/schemas/_types.analysis:ClassicTokenizer" + }, { "$ref": "#/components/schemas/_types.analysis:EdgeNGramTokenizer" }, @@ -48682,14 +48685,23 @@ "$ref": "#/components/schemas/_types.analysis:NGramTokenizer" }, { - "$ref": "#/components/schemas/_types.analysis:NoriTokenizer" + "$ref": "#/components/schemas/_types.analysis:PathHierarchyTokenizer" }, { - "$ref": "#/components/schemas/_types.analysis:PathHierarchyTokenizer" + "$ref": 
"#/components/schemas/_types.analysis:PatternTokenizer" + }, + { + "$ref": "#/components/schemas/_types.analysis:SimplePatternTokenizer" + }, + { + "$ref": "#/components/schemas/_types.analysis:SimplePatternSplitTokenizer" }, { "$ref": "#/components/schemas/_types.analysis:StandardTokenizer" }, + { + "$ref": "#/components/schemas/_types.analysis:ThaiTokenizer" + }, { "$ref": "#/components/schemas/_types.analysis:UaxEmailUrlTokenizer" }, @@ -48697,13 +48709,13 @@ "$ref": "#/components/schemas/_types.analysis:WhitespaceTokenizer" }, { - "$ref": "#/components/schemas/_types.analysis:KuromojiTokenizer" + "$ref": "#/components/schemas/_types.analysis:IcuTokenizer" }, { - "$ref": "#/components/schemas/_types.analysis:PatternTokenizer" + "$ref": "#/components/schemas/_types.analysis:KuromojiTokenizer" }, { - "$ref": "#/components/schemas/_types.analysis:IcuTokenizer" + "$ref": "#/components/schemas/_types.analysis:NoriTokenizer" } ] }, @@ -48746,6 +48758,30 @@ } } }, + "_types.analysis:ClassicTokenizer": { + "allOf": [ + { + "$ref": "#/components/schemas/_types.analysis:TokenizerBase" + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": [ + "classic" + ] + }, + "max_token_length": { + "type": "number" + } + }, + "required": [ + "type" + ] + } + ] + }, "_types.analysis:EdgeNGramTokenizer": { "allOf": [ { @@ -48779,8 +48815,7 @@ "required": [ "type", "max_gram", - "min_gram", - "token_chars" + "min_gram" ] } ] @@ -48896,13 +48931,12 @@ "required": [ "type", "max_gram", - "min_gram", - "token_chars" + "min_gram" ] } ] }, - "_types.analysis:NoriTokenizer": { + "_types.analysis:PathHierarchyTokenizer": { "allOf": [ { "$ref": "#/components/schemas/_types.analysis:TokenizerBase" @@ -48913,23 +48947,23 @@ "type": { "type": "string", "enum": [ - "nori_tokenizer" + "path_hierarchy" ] }, - "decompound_mode": { - "$ref": "#/components/schemas/_types.analysis:NoriDecompoundMode" + "buffer_size": { + "$ref": 
"#/components/schemas/_spec_utils:Stringifiedinteger" }, - "discard_punctuation": { - "type": "boolean" + "delimiter": { + "type": "string" }, - "user_dictionary": { + "replacement": { "type": "string" }, - "user_dictionary_rules": { - "type": "array", - "items": { - "type": "string" - } + "reverse": { + "$ref": "#/components/schemas/_spec_utils:Stringifiedboolean" + }, + "skip": { + "$ref": "#/components/schemas/_spec_utils:Stringifiedinteger" } }, "required": [ @@ -48938,7 +48972,7 @@ } ] }, - "_types.analysis:PathHierarchyTokenizer": { + "_types.analysis:PatternTokenizer": { "allOf": [ { "$ref": "#/components/schemas/_types.analysis:TokenizerBase" @@ -48949,23 +48983,65 @@ "type": { "type": "string", "enum": [ - "path_hierarchy" + "pattern" ] }, - "buffer_size": { - "$ref": "#/components/schemas/_spec_utils:Stringifiedinteger" - }, - "delimiter": { + "flags": { "type": "string" }, - "replacement": { + "group": { + "type": "number" + }, + "pattern": { "type": "string" + } + }, + "required": [ + "type" + ] + } + ] + }, + "_types.analysis:SimplePatternTokenizer": { + "allOf": [ + { + "$ref": "#/components/schemas/_types.analysis:TokenizerBase" + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": [ + "simple_pattern" + ] }, - "reverse": { - "$ref": "#/components/schemas/_spec_utils:Stringifiedboolean" + "pattern": { + "type": "string" + } + }, + "required": [ + "type" + ] + } + ] + }, + "_types.analysis:SimplePatternSplitTokenizer": { + "allOf": [ + { + "$ref": "#/components/schemas/_types.analysis:TokenizerBase" + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": [ + "simple_pattern_split" + ] }, - "skip": { - "$ref": "#/components/schemas/_spec_utils:Stringifiedinteger" + "pattern": { + "type": "string" } }, "required": [ @@ -48998,6 +49074,27 @@ } ] }, + "_types.analysis:ThaiTokenizer": { + "allOf": [ + { + "$ref": "#/components/schemas/_types.analysis:TokenizerBase" + }, + { + "type": 
"object", + "properties": { + "type": { + "type": "string", + "enum": [ + "thai" + ] + } + }, + "required": [ + "type" + ] + } + ] + }, "_types.analysis:UaxEmailUrlTokenizer": { "allOf": [ { @@ -49046,6 +49143,31 @@ } ] }, + "_types.analysis:IcuTokenizer": { + "allOf": [ + { + "$ref": "#/components/schemas/_types.analysis:TokenizerBase" + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": [ + "icu_tokenizer" + ] + }, + "rule_files": { + "type": "string" + } + }, + "required": [ + "type", + "rule_files" + ] + } + ] + }, "_types.analysis:KuromojiTokenizer": { "allOf": [ { @@ -49092,7 +49214,7 @@ } ] }, - "_types.analysis:PatternTokenizer": { + "_types.analysis:NoriTokenizer": { "allOf": [ { "$ref": "#/components/schemas/_types.analysis:TokenizerBase" @@ -49103,46 +49225,27 @@ "type": { "type": "string", "enum": [ - "pattern" + "nori_tokenizer" ] }, - "flags": { - "type": "string" + "decompound_mode": { + "$ref": "#/components/schemas/_types.analysis:NoriDecompoundMode" }, - "group": { - "type": "number" + "discard_punctuation": { + "type": "boolean" }, - "pattern": { + "user_dictionary": { "type": "string" - } - }, - "required": [ - "type" - ] - } - ] - }, - "_types.analysis:IcuTokenizer": { - "allOf": [ - { - "$ref": "#/components/schemas/_types.analysis:TokenizerBase" - }, - { - "type": "object", - "properties": { - "type": { - "type": "string", - "enum": [ - "icu_tokenizer" - ] }, - "rule_files": { - "type": "string" + "user_dictionary_rules": { + "type": "array", + "items": { + "type": "string" + } } }, "required": [ - "type", - "rule_files" + "type" ] } ] diff --git a/output/schema/schema-serverless.json b/output/schema/schema-serverless.json index d717dca62c..f1df4f49f7 100644 --- a/output/schema/schema-serverless.json +++ b/output/schema/schema-serverless.json @@ -77343,7 +77343,7 @@ "name": "NoriDecompoundMode", "namespace": "_types.analysis" }, - "specLocation": "_types/analysis/tokenizers.ts#L74-L78" + "specLocation": 
"_types/analysis/nori-plugin.ts#L22-L26" }, { "kind": "interface", @@ -80462,7 +80462,7 @@ "name": "TokenChar", "namespace": "_types.analysis" }, - "specLocation": "_types/analysis/tokenizers.ts#L46-L53" + "specLocation": "_types/analysis/tokenizers.ts#L59-L66" }, { "codegenNames": [ @@ -83745,7 +83745,7 @@ "name": "Tokenizer", "namespace": "_types.analysis" }, - "specLocation": "_types/analysis/tokenizers.ts#L119-L121", + "specLocation": "_types/analysis/tokenizers.ts#L137-L139", "type": { "items": [ { @@ -83772,7 +83772,7 @@ "name": "TokenizerDefinition", "namespace": "_types.analysis" }, - "specLocation": "_types/analysis/tokenizers.ts#L123-L141", + "specLocation": "_types/analysis/tokenizers.ts#L141-L164", "type": { "items": [ { @@ -83782,6 +83782,13 @@ "namespace": "_types.analysis" } }, + { + "kind": "instance_of", + "type": { + "name": "ClassicTokenizer", + "namespace": "_types.analysis" + } + }, { "kind": "instance_of", "type": { @@ -83820,14 +83827,28 @@ { "kind": "instance_of", "type": { - "name": "NoriTokenizer", + "name": "PathHierarchyTokenizer", "namespace": "_types.analysis" } }, { "kind": "instance_of", "type": { - "name": "PathHierarchyTokenizer", + "name": "PatternTokenizer", + "namespace": "_types.analysis" + } + }, + { + "kind": "instance_of", + "type": { + "name": "SimplePatternTokenizer", + "namespace": "_types.analysis" + } + }, + { + "kind": "instance_of", + "type": { + "name": "SimplePatternSplitTokenizer", "namespace": "_types.analysis" } }, @@ -83838,6 +83859,13 @@ "namespace": "_types.analysis" } }, + { + "kind": "instance_of", + "type": { + "name": "ThaiTokenizer", + "namespace": "_types.analysis" + } + }, { "kind": "instance_of", "type": { @@ -83855,21 +83883,21 @@ { "kind": "instance_of", "type": { - "name": "KuromojiTokenizer", + "name": "IcuTokenizer", "namespace": "_types.analysis" } }, { "kind": "instance_of", "type": { - "name": "PatternTokenizer", + "name": "KuromojiTokenizer", "namespace": "_types.analysis" } }, { "kind": 
"instance_of", "type": { - "name": "IcuTokenizer", + "name": "NoriTokenizer", "namespace": "_types.analysis" } } @@ -83929,7 +83957,7 @@ } } ], - "specLocation": "_types/analysis/tokenizers.ts#L55-L59" + "specLocation": "_types/analysis/tokenizers.ts#L31-L38" }, { "kind": "interface", @@ -83950,7 +83978,42 @@ } } ], - "specLocation": "_types/analysis/tokenizers.ts#L26-L28" + "specLocation": "_types/analysis/tokenizers.ts#L27-L29" + }, + { + "inherits": { + "type": { + "name": "TokenizerBase", + "namespace": "_types.analysis" + } + }, + "kind": "interface", + "name": { + "name": "ClassicTokenizer", + "namespace": "_types.analysis" + }, + "properties": [ + { + "name": "type", + "required": true, + "type": { + "kind": "literal_value", + "value": "classic" + } + }, + { + "name": "max_token_length", + "required": false, + "type": { + "kind": "instance_of", + "type": { + "name": "integer", + "namespace": "_types" + } + } + } + ], + "specLocation": "_types/analysis/tokenizers.ts#L40-L46" }, { "inherits": { @@ -84008,7 +84071,8 @@ }, { "name": "token_chars", - "required": true, + "required": false, + "serverDefault": [], "type": { "kind": "array_of", "value": { @@ -84021,7 +84085,7 @@ } } ], - "specLocation": "_types/analysis/tokenizers.ts#L30-L36" + "specLocation": "_types/analysis/tokenizers.ts#L48-L57" }, { "inherits": { @@ -84056,7 +84120,7 @@ } } ], - "specLocation": "_types/analysis/tokenizers.ts#L61-L64" + "specLocation": "_types/analysis/tokenizers.ts#L68-L71" }, { "inherits": { @@ -84080,7 +84144,7 @@ } } ], - "specLocation": "_types/analysis/tokenizers.ts#L66-L68" + "specLocation": "_types/analysis/tokenizers.ts#L73-L75" }, { "inherits": { @@ -84104,7 +84168,7 @@ } } ], - "specLocation": "_types/analysis/tokenizers.ts#L70-L72" + "specLocation": "_types/analysis/tokenizers.ts#L77-L79" }, { "inherits": { @@ -84162,7 +84226,8 @@ }, { "name": "token_chars", - "required": true, + "required": false, + "serverDefault": [], "type": { "kind": "array_of", "value": { @@ 
-84175,7 +84240,7 @@ } } ], - "specLocation": "_types/analysis/tokenizers.ts#L38-L44" + "specLocation": "_types/analysis/tokenizers.ts#L81-L90" }, { "inherits": { @@ -84186,7 +84251,7 @@ }, "kind": "interface", "name": { - "name": "NoriTokenizer", + "name": "PathHierarchyTokenizer", "namespace": "_types.analysis" }, "properties": [ @@ -84195,33 +84260,42 @@ "required": true, "type": { "kind": "literal_value", - "value": "nori_tokenizer" + "value": "path_hierarchy" } }, { - "name": "decompound_mode", + "name": "buffer_size", "required": false, "type": { + "generics": [ + { + "kind": "instance_of", + "type": { + "name": "integer", + "namespace": "_types" + } + } + ], "kind": "instance_of", "type": { - "name": "NoriDecompoundMode", - "namespace": "_types.analysis" + "name": "Stringified", + "namespace": "_spec_utils" } } }, { - "name": "discard_punctuation", + "name": "delimiter", "required": false, "type": { "kind": "instance_of", "type": { - "name": "boolean", + "name": "string", "namespace": "_builtins" } } }, { - "name": "user_dictionary", + "name": "replacement", "required": false, "type": { "kind": "instance_of", @@ -84232,21 +84306,47 @@ } }, { - "name": "user_dictionary_rules", + "name": "reverse", "required": false, "type": { - "kind": "array_of", - "value": { - "kind": "instance_of", - "type": { - "name": "string", - "namespace": "_builtins" + "generics": [ + { + "kind": "instance_of", + "type": { + "name": "boolean", + "namespace": "_builtins" + } } + ], + "kind": "instance_of", + "type": { + "name": "Stringified", + "namespace": "_spec_utils" + } + } + }, + { + "name": "skip", + "required": false, + "type": { + "generics": [ + { + "kind": "instance_of", + "type": { + "name": "integer", + "namespace": "_types" + } + } + ], + "kind": "instance_of", + "type": { + "name": "Stringified", + "namespace": "_spec_utils" } } } ], - "specLocation": "_types/analysis/tokenizers.ts#L80-L86" + "specLocation": "_types/analysis/tokenizers.ts#L92-L99" }, { "inherits": { @@ 
-84257,7 +84357,7 @@ }, "kind": "interface", "name": { - "name": "PathHierarchyTokenizer", + "name": "PatternTokenizer", "namespace": "_types.analysis" }, "properties": [ @@ -84266,42 +84366,33 @@ "required": true, "type": { "kind": "literal_value", - "value": "path_hierarchy" + "value": "pattern" } }, { - "name": "buffer_size", + "name": "flags", "required": false, "type": { - "generics": [ - { - "kind": "instance_of", - "type": { - "name": "integer", - "namespace": "_types" - } - } - ], "kind": "instance_of", "type": { - "name": "Stringified", - "namespace": "_spec_utils" + "name": "string", + "namespace": "_builtins" } } }, { - "name": "delimiter", + "name": "group", "required": false, "type": { "kind": "instance_of", "type": { - "name": "string", - "namespace": "_builtins" + "name": "integer", + "namespace": "_types" } } }, { - "name": "replacement", + "name": "pattern", "required": false, "type": { "kind": "instance_of", @@ -84310,49 +84401,79 @@ "namespace": "_builtins" } } + } + ], + "specLocation": "_types/analysis/tokenizers.ts#L101-L106" + }, + { + "inherits": { + "type": { + "name": "TokenizerBase", + "namespace": "_types.analysis" + } + }, + "kind": "interface", + "name": { + "name": "SimplePatternTokenizer", + "namespace": "_types.analysis" + }, + "properties": [ + { + "name": "type", + "required": true, + "type": { + "kind": "literal_value", + "value": "simple_pattern" + } }, { - "name": "reverse", + "name": "pattern", "required": false, "type": { - "generics": [ - { - "kind": "instance_of", - "type": { - "name": "boolean", - "namespace": "_builtins" - } - } - ], "kind": "instance_of", "type": { - "name": "Stringified", - "namespace": "_spec_utils" + "name": "string", + "namespace": "_builtins" } } + } + ], + "specLocation": "_types/analysis/tokenizers.ts#L108-L111" + }, + { + "inherits": { + "type": { + "name": "TokenizerBase", + "namespace": "_types.analysis" + } + }, + "kind": "interface", + "name": { + "name": "SimplePatternSplitTokenizer", + 
"namespace": "_types.analysis" + }, + "properties": [ + { + "name": "type", + "required": true, + "type": { + "kind": "literal_value", + "value": "simple_pattern_split" + } }, { - "name": "skip", + "name": "pattern", "required": false, "type": { - "generics": [ - { - "kind": "instance_of", - "type": { - "name": "integer", - "namespace": "_types" - } - } - ], "kind": "instance_of", "type": { - "name": "Stringified", - "namespace": "_spec_utils" + "name": "string", + "namespace": "_builtins" } } } ], - "specLocation": "_types/analysis/tokenizers.ts#L88-L95" + "specLocation": "_types/analysis/tokenizers.ts#L113-L116" }, { "inherits": { @@ -84387,7 +84508,31 @@ } } ], - "specLocation": "_types/analysis/tokenizers.ts#L104-L107" + "specLocation": "_types/analysis/tokenizers.ts#L118-L121" + }, + { + "inherits": { + "type": { + "name": "TokenizerBase", + "namespace": "_types.analysis" + } + }, + "kind": "interface", + "name": { + "name": "ThaiTokenizer", + "namespace": "_types.analysis" + }, + "properties": [ + { + "name": "type", + "required": true, + "type": { + "kind": "literal_value", + "value": "thai" + } + } + ], + "specLocation": "_types/analysis/tokenizers.ts#L123-L125" }, { "inherits": { @@ -84422,7 +84567,7 @@ } } ], - "specLocation": "_types/analysis/tokenizers.ts#L109-L112" + "specLocation": "_types/analysis/tokenizers.ts#L127-L130" }, { "inherits": { @@ -84457,7 +84602,42 @@ } } ], - "specLocation": "_types/analysis/tokenizers.ts#L114-L117" + "specLocation": "_types/analysis/tokenizers.ts#L132-L135" + }, + { + "inherits": { + "type": { + "name": "TokenizerBase", + "namespace": "_types.analysis" + } + }, + "kind": "interface", + "name": { + "name": "IcuTokenizer", + "namespace": "_types.analysis" + }, + "properties": [ + { + "name": "type", + "required": true, + "type": { + "kind": "literal_value", + "value": "icu_tokenizer" + } + }, + { + "name": "rule_files", + "required": true, + "type": { + "kind": "instance_of", + "type": { + "name": "string", + 
"namespace": "_builtins" + } + } + } + ], + "specLocation": "_types/analysis/icu-plugin.ts#L30-L33" }, { "inherits": { @@ -84572,7 +84752,7 @@ }, "kind": "interface", "name": { - "name": "PatternTokenizer", + "name": "NoriTokenizer", "namespace": "_types.analysis" }, "properties": [ @@ -84581,33 +84761,33 @@ "required": true, "type": { "kind": "literal_value", - "value": "pattern" + "value": "nori_tokenizer" } }, { - "name": "flags", + "name": "decompound_mode", "required": false, "type": { "kind": "instance_of", "type": { - "name": "string", - "namespace": "_builtins" + "name": "NoriDecompoundMode", + "namespace": "_types.analysis" } } }, { - "name": "group", + "name": "discard_punctuation", "required": false, "type": { "kind": "instance_of", "type": { - "name": "integer", - "namespace": "_types" + "name": "boolean", + "namespace": "_builtins" } } }, { - "name": "pattern", + "name": "user_dictionary", "required": false, "type": { "kind": "instance_of", @@ -84616,44 +84796,23 @@ "namespace": "_builtins" } } - } - ], - "specLocation": "_types/analysis/tokenizers.ts#L97-L102" - }, - { - "inherits": { - "type": { - "name": "TokenizerBase", - "namespace": "_types.analysis" - } - }, - "kind": "interface", - "name": { - "name": "IcuTokenizer", - "namespace": "_types.analysis" - }, - "properties": [ - { - "name": "type", - "required": true, - "type": { - "kind": "literal_value", - "value": "icu_tokenizer" - } }, { - "name": "rule_files", - "required": true, + "name": "user_dictionary_rules", + "required": false, "type": { - "kind": "instance_of", - "type": { - "name": "string", - "namespace": "_builtins" + "kind": "array_of", + "value": { + "kind": "instance_of", + "type": { + "name": "string", + "namespace": "_builtins" + } } } } ], - "specLocation": "_types/analysis/icu-plugin.ts#L30-L33" + "specLocation": "_types/analysis/nori-plugin.ts#L28-L34" }, { "esQuirk": "This is a boolean that evolved into an enum. 
Boolean values should be accepted on reading, and\ntrue and false must be serialized as JSON booleans, or it may break Kibana (see elasticsearch-java#139)", diff --git a/output/schema/schema.json b/output/schema/schema.json index 778bc50378..ed3b4f523c 100644 --- a/output/schema/schema.json +++ b/output/schema/schema.json @@ -63623,7 +63623,7 @@ } } ], - "specLocation": "_types/analysis/tokenizers.ts#L55-L59" + "specLocation": "_types/analysis/tokenizers.ts#L31-L38" }, { "kind": "interface", @@ -63705,6 +63705,41 @@ ], "specLocation": "_types/analysis/analyzers.ts#L115-L119" }, + { + "kind": "interface", + "inherits": { + "type": { + "name": "TokenizerBase", + "namespace": "_types.analysis" + } + }, + "name": { + "name": "ClassicTokenizer", + "namespace": "_types.analysis" + }, + "properties": [ + { + "name": "type", + "required": true, + "type": { + "kind": "literal_value", + "value": "classic" + } + }, + { + "name": "max_token_length", + "required": false, + "type": { + "kind": "instance_of", + "type": { + "name": "integer", + "namespace": "_types" + } + } + } + ], + "specLocation": "_types/analysis/tokenizers.ts#L40-L46" + }, { "kind": "interface", "inherits": { @@ -64456,7 +64491,8 @@ }, { "name": "token_chars", - "required": true, + "required": false, + "serverDefault": [], "type": { "kind": "array_of", "value": { @@ -64469,7 +64505,7 @@ } } ], - "specLocation": "_types/analysis/tokenizers.ts#L30-L36" + "specLocation": "_types/analysis/tokenizers.ts#L48-L57" }, { "kind": "interface", @@ -66219,7 +66255,7 @@ } } ], - "specLocation": "_types/analysis/tokenizers.ts#L61-L64" + "specLocation": "_types/analysis/tokenizers.ts#L68-L71" }, { "kind": "interface", @@ -66848,7 +66884,7 @@ } } ], - "specLocation": "_types/analysis/tokenizers.ts#L66-L68" + "specLocation": "_types/analysis/tokenizers.ts#L73-L75" }, { "kind": "interface", @@ -67034,7 +67070,7 @@ } } ], - "specLocation": "_types/analysis/tokenizers.ts#L70-L72" + "specLocation": 
"_types/analysis/tokenizers.ts#L77-L79" }, { "kind": "interface", @@ -67265,7 +67301,8 @@ }, { "name": "token_chars", - "required": true, + "required": false, + "serverDefault": [], "type": { "kind": "array_of", "value": { @@ -67278,7 +67315,7 @@ } } ], - "specLocation": "_types/analysis/tokenizers.ts#L38-L44" + "specLocation": "_types/analysis/tokenizers.ts#L81-L90" }, { "kind": "interface", @@ -67362,7 +67399,7 @@ "name": "NoriDecompoundMode", "namespace": "_types.analysis" }, - "specLocation": "_types/analysis/tokenizers.ts#L74-L78" + "specLocation": "_types/analysis/nori-plugin.ts#L22-L26" }, { "kind": "interface", @@ -67471,7 +67508,7 @@ } } ], - "specLocation": "_types/analysis/tokenizers.ts#L80-L86" + "specLocation": "_types/analysis/nori-plugin.ts#L28-L34" }, { "kind": "type_alias", @@ -67665,7 +67702,7 @@ } } ], - "specLocation": "_types/analysis/tokenizers.ts#L88-L95" + "specLocation": "_types/analysis/tokenizers.ts#L92-L99" }, { "kind": "interface", @@ -67978,7 +68015,7 @@ } } ], - "specLocation": "_types/analysis/tokenizers.ts#L97-L102" + "specLocation": "_types/analysis/tokenizers.ts#L101-L106" }, { "kind": "interface", @@ -68718,6 +68755,76 @@ ], "specLocation": "_types/analysis/analyzers.ts#L329-L332" }, + { + "kind": "interface", + "inherits": { + "type": { + "name": "TokenizerBase", + "namespace": "_types.analysis" + } + }, + "name": { + "name": "SimplePatternSplitTokenizer", + "namespace": "_types.analysis" + }, + "properties": [ + { + "name": "type", + "required": true, + "type": { + "kind": "literal_value", + "value": "simple_pattern_split" + } + }, + { + "name": "pattern", + "required": false, + "type": { + "kind": "instance_of", + "type": { + "name": "string", + "namespace": "_builtins" + } + } + } + ], + "specLocation": "_types/analysis/tokenizers.ts#L113-L116" + }, + { + "kind": "interface", + "inherits": { + "type": { + "name": "TokenizerBase", + "namespace": "_types.analysis" + } + }, + "name": { + "name": "SimplePatternTokenizer", + 
"namespace": "_types.analysis" + }, + "properties": [ + { + "name": "type", + "required": true, + "type": { + "kind": "literal_value", + "value": "simple_pattern" + } + }, + { + "name": "pattern", + "required": false, + "type": { + "kind": "instance_of", + "type": { + "name": "string", + "namespace": "_builtins" + } + } + } + ], + "specLocation": "_types/analysis/tokenizers.ts#L108-L111" + }, { "kind": "interface", "name": { @@ -69061,7 +69168,7 @@ } } ], - "specLocation": "_types/analysis/tokenizers.ts#L104-L107" + "specLocation": "_types/analysis/tokenizers.ts#L118-L121" }, { "kind": "interface", @@ -69640,6 +69747,30 @@ ], "specLocation": "_types/analysis/analyzers.ts#L306-L310" }, + { + "kind": "interface", + "inherits": { + "type": { + "name": "TokenizerBase", + "namespace": "_types.analysis" + } + }, + "name": { + "name": "ThaiTokenizer", + "namespace": "_types.analysis" + }, + "properties": [ + { + "name": "type", + "required": true, + "type": { + "kind": "literal_value", + "value": "thai" + } + } + ], + "specLocation": "_types/analysis/tokenizers.ts#L123-L125" + }, { "kind": "enum", "members": [ @@ -69666,7 +69797,7 @@ "name": "TokenChar", "namespace": "_types.analysis" }, - "specLocation": "_types/analysis/tokenizers.ts#L46-L53" + "specLocation": "_types/analysis/tokenizers.ts#L59-L66" }, { "kind": "type_alias", @@ -70077,7 +70208,7 @@ "name": "Tokenizer", "namespace": "_types.analysis" }, - "specLocation": "_types/analysis/tokenizers.ts#L119-L121", + "specLocation": "_types/analysis/tokenizers.ts#L137-L139", "type": { "kind": "union_of", "items": [ @@ -70117,7 +70248,7 @@ } } ], - "specLocation": "_types/analysis/tokenizers.ts#L26-L28" + "specLocation": "_types/analysis/tokenizers.ts#L27-L29" }, { "kind": "type_alias", @@ -70125,7 +70256,7 @@ "name": "TokenizerDefinition", "namespace": "_types.analysis" }, - "specLocation": "_types/analysis/tokenizers.ts#L123-L141", + "specLocation": "_types/analysis/tokenizers.ts#L141-L164", "type": { "kind": "union_of", 
"items": [ @@ -70136,6 +70267,13 @@ "namespace": "_types.analysis" } }, + { + "kind": "instance_of", + "type": { + "name": "ClassicTokenizer", + "namespace": "_types.analysis" + } + }, { "kind": "instance_of", "type": { @@ -70174,14 +70312,28 @@ { "kind": "instance_of", "type": { - "name": "NoriTokenizer", + "name": "PathHierarchyTokenizer", "namespace": "_types.analysis" } }, { "kind": "instance_of", "type": { - "name": "PathHierarchyTokenizer", + "name": "PatternTokenizer", + "namespace": "_types.analysis" + } + }, + { + "kind": "instance_of", + "type": { + "name": "SimplePatternTokenizer", + "namespace": "_types.analysis" + } + }, + { + "kind": "instance_of", + "type": { + "name": "SimplePatternSplitTokenizer", "namespace": "_types.analysis" } }, @@ -70192,6 +70344,13 @@ "namespace": "_types.analysis" } }, + { + "kind": "instance_of", + "type": { + "name": "ThaiTokenizer", + "namespace": "_types.analysis" + } + }, { "kind": "instance_of", "type": { @@ -70209,21 +70368,21 @@ { "kind": "instance_of", "type": { - "name": "KuromojiTokenizer", + "name": "IcuTokenizer", "namespace": "_types.analysis" } }, { "kind": "instance_of", "type": { - "name": "PatternTokenizer", + "name": "KuromojiTokenizer", "namespace": "_types.analysis" } }, { "kind": "instance_of", "type": { - "name": "IcuTokenizer", + "name": "NoriTokenizer", "namespace": "_types.analysis" } } @@ -70381,7 +70540,7 @@ } } ], - "specLocation": "_types/analysis/tokenizers.ts#L109-L112" + "specLocation": "_types/analysis/tokenizers.ts#L127-L130" }, { "kind": "interface", @@ -70504,7 +70663,7 @@ } } ], - "specLocation": "_types/analysis/tokenizers.ts#L114-L117" + "specLocation": "_types/analysis/tokenizers.ts#L132-L135" }, { "kind": "interface", diff --git a/output/typescript/types.ts b/output/typescript/types.ts index 8a152dcd17..02aac32cdf 100644 --- a/output/typescript/types.ts +++ b/output/typescript/types.ts @@ -4409,6 +4409,11 @@ export interface AnalysisCjkAnalyzer { stopwords_path?: string } +export 
interface AnalysisClassicTokenizer extends AnalysisTokenizerBase { + type: 'classic' + max_token_length?: integer +} + export interface AnalysisCommonGramsTokenFilter extends AnalysisTokenFilterBase { type: 'common_grams' common_words?: string[] @@ -4495,7 +4500,7 @@ export interface AnalysisEdgeNGramTokenizer extends AnalysisTokenizerBase { custom_token_chars?: string max_gram: integer min_gram: integer - token_chars: AnalysisTokenChar[] + token_chars?: AnalysisTokenChar[] } export interface AnalysisElisionTokenFilter extends AnalysisTokenFilterBase { @@ -4838,7 +4843,7 @@ export interface AnalysisNGramTokenizer extends AnalysisTokenizerBase { custom_token_chars?: string max_gram: integer min_gram: integer - token_chars: AnalysisTokenChar[] + token_chars?: AnalysisTokenChar[] } export interface AnalysisNoriAnalyzer { @@ -5003,6 +5008,16 @@ export interface AnalysisSimpleAnalyzer { version?: VersionString } +export interface AnalysisSimplePatternSplitTokenizer extends AnalysisTokenizerBase { + type: 'simple_pattern_split' + pattern?: string +} + +export interface AnalysisSimplePatternTokenizer extends AnalysisTokenizerBase { + type: 'simple_pattern' + pattern?: string +} + export interface AnalysisSnowballAnalyzer { type: 'snowball' version?: VersionString @@ -5110,6 +5125,10 @@ export interface AnalysisThaiAnalyzer { stopwords_path?: string } +export interface AnalysisThaiTokenizer extends AnalysisTokenizerBase { + type: 'thai' +} + export type AnalysisTokenChar = 'letter' | 'digit' | 'whitespace' | 'punctuation' | 'symbol' | 'custom' export type AnalysisTokenFilter = string | AnalysisTokenFilterDefinition @@ -5126,7 +5145,7 @@ export interface AnalysisTokenizerBase { version?: VersionString } -export type AnalysisTokenizerDefinition = AnalysisCharGroupTokenizer | AnalysisEdgeNGramTokenizer | AnalysisKeywordTokenizer | AnalysisLetterTokenizer | AnalysisLowercaseTokenizer | AnalysisNGramTokenizer | AnalysisNoriTokenizer | AnalysisPathHierarchyTokenizer | 
AnalysisStandardTokenizer | AnalysisUaxEmailUrlTokenizer | AnalysisWhitespaceTokenizer | AnalysisKuromojiTokenizer | AnalysisPatternTokenizer | AnalysisIcuTokenizer +export type AnalysisTokenizerDefinition = AnalysisCharGroupTokenizer | AnalysisClassicTokenizer | AnalysisEdgeNGramTokenizer | AnalysisKeywordTokenizer | AnalysisLetterTokenizer | AnalysisLowercaseTokenizer | AnalysisNGramTokenizer | AnalysisPathHierarchyTokenizer | AnalysisPatternTokenizer | AnalysisSimplePatternTokenizer | AnalysisSimplePatternSplitTokenizer | AnalysisStandardTokenizer | AnalysisThaiTokenizer | AnalysisUaxEmailUrlTokenizer | AnalysisWhitespaceTokenizer | AnalysisIcuTokenizer | AnalysisKuromojiTokenizer | AnalysisNoriTokenizer export interface AnalysisTrimTokenFilter extends AnalysisTokenFilterBase { type: 'trim' diff --git a/specification/_types/analysis/analyzers.ts b/specification/_types/analysis/analyzers.ts index 38a51256ec..47da0e68fd 100644 --- a/specification/_types/analysis/analyzers.ts +++ b/specification/_types/analysis/analyzers.ts @@ -22,8 +22,8 @@ import { integer } from '@_types/Numeric' import { IcuAnalyzer } from './icu-plugin' import { KuromojiAnalyzer } from './kuromoji-plugin' import { Language, SnowballLanguage } from './languages' +import { NoriDecompoundMode } from './nori-plugin' import { StopWords } from './StopWords' -import { NoriDecompoundMode } from './tokenizers' export class CustomAnalyzer { type: 'custom' diff --git a/specification/_types/analysis/nori-plugin.ts b/specification/_types/analysis/nori-plugin.ts new file mode 100644 index 0000000000..b245996e72 --- /dev/null +++ b/specification/_types/analysis/nori-plugin.ts @@ -0,0 +1,34 @@ +/* + * Licensed to Elasticsearch B.V. under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch B.V. 
licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+import { TokenizerBase } from './tokenizers'
+
+export enum NoriDecompoundMode {
+  discard,
+  none,
+  mixed
+}
+
+export class NoriTokenizer extends TokenizerBase {
+  type: 'nori_tokenizer'
+  decompound_mode?: NoriDecompoundMode
+  discard_punctuation?: boolean
+  user_dictionary?: string
+  user_dictionary_rules?: string[]
+}
diff --git a/specification/_types/analysis/tokenizers.ts b/specification/_types/analysis/tokenizers.ts
index 322a0098a1..38308bcbee 100644
--- a/specification/_types/analysis/tokenizers.ts
+++ b/specification/_types/analysis/tokenizers.ts
@@ -22,25 +22,38 @@ import { VersionString } from '@_types/common'
 import { integer } from '@_types/Numeric'
 import { IcuTokenizer } from './icu-plugin'
 import { KuromojiTokenizer } from './kuromoji-plugin'
+import { NoriTokenizer } from './nori-plugin'
 
 export class TokenizerBase {
   version?: VersionString
 }
 
-export class EdgeNGramTokenizer extends TokenizerBase {
-  type: 'edge_ngram'
-  custom_token_chars?: string
-  max_gram: integer
-  min_gram: integer
-  token_chars: TokenChar[]
+export class CharGroupTokenizer extends TokenizerBase {
+  type: 'char_group'
+  tokenize_on_chars: string[]
+  /**
+   * @server_default 255
+   */
+  max_token_length?: integer
 }
 
-export class NGramTokenizer extends TokenizerBase {
-  type: 'ngram'
+export class ClassicTokenizer extends TokenizerBase {
+  type: 'classic'
+  /**
+   * @server_default 255
+   */
+  
max_token_length?: integer +} + +export class EdgeNGramTokenizer extends TokenizerBase { + type: 'edge_ngram' custom_token_chars?: string max_gram: integer min_gram: integer - token_chars: TokenChar[] + /** + * @server_default [] + */ + token_chars?: TokenChar[] } export enum TokenChar { @@ -52,12 +65,6 @@ export enum TokenChar { custom } -export class CharGroupTokenizer extends TokenizerBase { - type: 'char_group' - tokenize_on_chars: string[] - max_token_length?: integer -} - export class KeywordTokenizer extends TokenizerBase { type: 'keyword' buffer_size: integer @@ -71,18 +78,15 @@ export class LowercaseTokenizer extends TokenizerBase { type: 'lowercase' } -export enum NoriDecompoundMode { - discard, - none, - mixed -} - -export class NoriTokenizer extends TokenizerBase { - type: 'nori_tokenizer' - decompound_mode?: NoriDecompoundMode - discard_punctuation?: boolean - user_dictionary?: string - user_dictionary_rules?: string[] +export class NGramTokenizer extends TokenizerBase { + type: 'ngram' + custom_token_chars?: string + max_gram: integer + min_gram: integer + /** + * @server_default [] + */ + token_chars?: TokenChar[] } export class PathHierarchyTokenizer extends TokenizerBase { @@ -101,11 +105,25 @@ export class PatternTokenizer extends TokenizerBase { pattern?: string } +export class SimplePatternTokenizer extends TokenizerBase { + type: 'simple_pattern' + pattern?: string +} + +export class SimplePatternSplitTokenizer extends TokenizerBase { + type: 'simple_pattern_split' + pattern?: string +} + export class StandardTokenizer extends TokenizerBase { type: 'standard' max_token_length?: integer } +export class ThaiTokenizer extends TokenizerBase { + type: 'thai' +} + export class UaxEmailUrlTokenizer extends TokenizerBase { type: 'uax_url_email' max_token_length?: integer @@ -126,16 +144,21 @@ export type Tokenizer = string | TokenizerDefinition */ export type TokenizerDefinition = | CharGroupTokenizer + | ClassicTokenizer | EdgeNGramTokenizer | 
KeywordTokenizer | LetterTokenizer | LowercaseTokenizer | NGramTokenizer - | NoriTokenizer | PathHierarchyTokenizer + | PatternTokenizer + | SimplePatternTokenizer + | SimplePatternSplitTokenizer | StandardTokenizer + | ThaiTokenizer | UaxEmailUrlTokenizer | WhitespaceTokenizer - | KuromojiTokenizer - | PatternTokenizer + // plugins | IcuTokenizer + | KuromojiTokenizer + | NoriTokenizer