Skip to content

Add semantic field mapper. #1225

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Apr 17, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
## [Unreleased 3.x](https://github.com/opensearch-project/neural-search/compare/main...HEAD)

### Features
- [Semantic Field] Add semantic field mapper. ([#1225](https://github.com/opensearch-project/neural-search/pull/1225)).

### Enhancements

Expand Down
1 change: 1 addition & 0 deletions build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -251,6 +251,7 @@ def knnJarDirectory = "$buildDir/dependencies/opensearch-knn"

dependencies {
api "org.opensearch:opensearch:${opensearch_version}"
implementation group: 'org.opensearch.plugin', name:'mapper-extras-client', version: "${opensearch_version}"
zipArchive group: 'org.opensearch.plugin', name:'opensearch-job-scheduler', version: "${opensearch_build}"
zipArchive group: 'org.opensearch.plugin', name:'opensearch-knn', version: "${opensearch_build}"
zipArchive group: 'org.opensearch.plugin', name:'opensearch-ml-plugin', version: "${opensearch_build}"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,13 @@ private void validateTestIndexOnUpgrade(
) throws Exception {
int docCount = getDocCount(getIndexNameForTest());
assertEquals(numberOfDocs, docCount);
loadModel(modelId);
// In rolling upgrade tests we will not clean up the resources created in old and mix
// so check if the model is already deployed then no need to deploy it again.
if (!isModelAlreadyDeployed(modelId)) {
loadModel(modelId);
}
// Try to ensure all nodes are green before we do the search.
waitForClusterHealthGreen(NODES_BWC_CLUSTER);
Map<String, Object> searchResponseAsMap = search(
getIndexNameForTest(),
hybridQueryBuilder,
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
/*
* Copyright OpenSearch Contributors
* SPDX-License-Identifier: Apache-2.0
*/
package org.opensearch.neuralsearch.constants;

/**
 * Constants related to the index mapping.
 *
 * <p>This class only exposes static constants; it is not meant to be instantiated or extended.
 */
public final class MappingConstants {
    /**
     * Name for the field type. In index mapping we use this key to define the field type.
     */
    public static final String TYPE = "type";

    /** Prevents instantiation: this class is a holder of static constants only. */
    private MappingConstants() {}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
/*
* Copyright OpenSearch Contributors
* SPDX-License-Identifier: Apache-2.0
*/
package org.opensearch.neuralsearch.constants;

/**
* Constants for semantic field
*/
public class SemanticFieldConstants {
/**
* Name of the model id parameter. We use this key to define the id of the ML model that we will use for the
* semantic field.
*/
public static final String MODEL_ID = "model_id";

/**
* Name of the search model id parameter. We use this key to define the id of the ML model that we will use to
* run inference on the query text during the search. If this parameter is not defined we will use the model_id instead.
*/
public static final String SEARCH_MODEL_ID = "search_model_id";
Comment on lines +15 to +21
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why we need 2 model_id fields? Can we only keep 1 model_id field? Why we need separate model_ids for inferencing for index and search?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's needed to support the sparse model use case.


/**
* Name of the raw field type parameter. We use this key to define the field type for the raw data. It will control
* how to store and query the raw data.
*/
public static final String RAW_FIELD_TYPE = "raw_field_type";

/**
* Name of the semantic info field name parameter. We use this key to define a custom field name for the semantic info.
*/
public static final String SEMANTIC_INFO_FIELD_NAME = "semantic_info_field_name";
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,276 @@
/*
* Copyright OpenSearch Contributors
* SPDX-License-Identifier: Apache-2.0
*/
package org.opensearch.neuralsearch.mapper;

import lombok.Getter;
import lombok.NonNull;
import lombok.Setter;
import org.opensearch.core.xcontent.XContentBuilder;
import org.opensearch.index.mapper.BinaryFieldMapper;
import org.opensearch.index.mapper.FilterFieldType;
import org.opensearch.index.mapper.KeywordFieldMapper;
import org.opensearch.index.mapper.MappedFieldType;
import org.opensearch.index.mapper.Mapper;
import org.opensearch.index.mapper.MapperParsingException;
import org.opensearch.index.mapper.MatchOnlyTextFieldMapper;
import org.opensearch.index.mapper.ParametrizedFieldMapper;
import org.opensearch.index.mapper.ParseContext;
import org.opensearch.index.mapper.TextFieldMapper;
import org.opensearch.index.mapper.TokenCountFieldMapper;
import org.opensearch.index.mapper.WildcardFieldMapper;
import org.opensearch.neuralsearch.constants.MappingConstants;
import org.opensearch.neuralsearch.mapper.dto.SemanticParameters;

import java.io.IOException;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;

import static org.opensearch.neuralsearch.constants.SemanticFieldConstants.MODEL_ID;
import static org.opensearch.neuralsearch.constants.SemanticFieldConstants.RAW_FIELD_TYPE;
import static org.opensearch.neuralsearch.constants.SemanticFieldConstants.SEARCH_MODEL_ID;
import static org.opensearch.neuralsearch.constants.SemanticFieldConstants.SEMANTIC_INFO_FIELD_NAME;

/**
* FieldMapper for the semantic field. It will hold a delegate field mapper to delegate the data parsing and query work
* based on the raw_field_type.
*/
public class SemanticFieldMapper extends ParametrizedFieldMapper {
public static final String CONTENT_TYPE = "semantic";
private final SemanticParameters semanticParameters;

@Setter
@Getter
private ParametrizedFieldMapper delegateFieldMapper;

/**
 * Creates a semantic field mapper that wraps a delegate mapper.
 *
 * @param simpleName          simple name of the field, forwarded to the parent constructor
 * @param mappedFieldType     field type of this mapper, forwarded to the parent constructor
 * @param multiFields         multi fields config, forwarded to the parent constructor
 * @param copyTo              copy_to config, forwarded to the parent constructor
 * @param delegateFieldMapper mapper that {@link #parseCreateField} hands the document parsing to
 * @param semanticParameters  semantic parameters (model ids, raw field type, semantic info field name) of this field
 */
protected SemanticFieldMapper(
    String simpleName,
    MappedFieldType mappedFieldType,
    MultiFields multiFields,
    CopyTo copyTo,
    ParametrizedFieldMapper delegateFieldMapper,
    SemanticParameters semanticParameters
) {
    super(simpleName, mappedFieldType, multiFields, copyTo);
    this.semanticParameters = semanticParameters;
    this.delegateFieldMapper = delegateFieldMapper;
}

/**
 * Creates a builder for this field that is initialized from this mapper and that carries the merge builder
 * of the delegate field mapper as its delegate builder.
 *
 * @return a semantic field mapper builder pre-populated from this mapper
 */
@Override
public Builder getMergeBuilder() {
    final Builder mergeBuilder = (Builder) new Builder(simpleName()).init(this);
    mergeBuilder.setDelegateBuilder(delegateFieldMapper.getMergeBuilder());
    return mergeBuilder;
}

/**
 * Merges this mapper with another one. When the other mapper is also a semantic field mapper, the delegate
 * mappers are merged first and the result replaces the delegate of this mapper; afterwards the merge is
 * completed by the parent implementation.
 *
 * @param mergeWith the mapper to merge into this one
 * @return the mapper returned by the parent merge
 * @throws IllegalArgumentException if the delegate mappers cannot be merged; the original exception is kept
 *                                  as the cause
 */
@Override
public final ParametrizedFieldMapper merge(Mapper mergeWith) {
    if (!(mergeWith instanceof SemanticFieldMapper)) {
        return super.merge(mergeWith);
    }

    final SemanticFieldMapper other = (SemanticFieldMapper) mergeWith;
    try {
        // NOTE(review): the delegate is replaced before super.merge runs, so it stays replaced even when
        // super.merge throws afterwards - confirm this is intended.
        delegateFieldMapper = delegateFieldMapper.merge(other.delegateFieldMapper);
    } catch (IllegalArgumentException e) {
        // Re-throw with the name of the semantic field and its raw field type so the failure is easier to locate.
        throw new IllegalArgumentException(
            String.format(
                Locale.ROOT,
                "Failed to update the mapper %s because failed to update the delegate mapper for the raw_field_type %s due to %s",
                this.name(),
                this.semanticParameters.getRawFieldType(),
                e.getMessage()
            ),
            e
        );
    }
    return super.merge(mergeWith);
}

/**
 * Parses the field value by handing the parse context over to the delegate field mapper.
 *
 * @param context parse context of the document being indexed
 * @throws IOException if the delegate field mapper fails with an I/O error
 */
@Override
protected void parseCreateField(ParseContext context) throws IOException {
    this.delegateFieldMapper.parse(context);
}

/**
 * @return the content type of this mapper, which is always {@link #CONTENT_TYPE}
 */
@Override
protected String contentType() {
    return SemanticFieldMapper.CONTENT_TYPE;
}

/**
 * Builder of the {@link SemanticFieldMapper}. It declares the parameters owned by the semantic field and holds
 * the builder of the delegate field mapper, which must be set before {@link #build} is invoked.
 *
 * NOTE(review): the boolean passed to Parameter.stringParam is presumably the "updateable" flag of the
 * parameter - confirm against the OpenSearch ParametrizedFieldMapper API.
 */
public static class Builder extends ParametrizedFieldMapper.Builder {
    /** Parameter for the {@code model_id} key; no default value. */
    @Getter
    protected final Parameter<String> modelId = Parameter.stringParam(
        MODEL_ID,
        true,
        mapper -> ((SemanticFieldMapper) mapper).semanticParameters.getModelId(),
        null
    );

    /** Parameter for the {@code search_model_id} key; no default value. */
    @Getter
    protected final Parameter<String> searchModelId = Parameter.stringParam(
        SEARCH_MODEL_ID,
        true,
        mapper -> ((SemanticFieldMapper) mapper).semanticParameters.getSearchModelId(),
        null
    );

    /** Parameter for the {@code raw_field_type} key; defaults to the text field type. */
    @Getter
    protected final Parameter<String> rawFieldType = Parameter.stringParam(
        RAW_FIELD_TYPE,
        false,
        mapper -> ((SemanticFieldMapper) mapper).semanticParameters.getRawFieldType(),
        TextFieldMapper.CONTENT_TYPE
    );

    /** Parameter for the {@code semantic_info_field_name} key; no default value. */
    @Getter
    protected final Parameter<String> semanticInfoFieldName = Parameter.stringParam(
        SEMANTIC_INFO_FIELD_NAME,
        false,
        mapper -> ((SemanticFieldMapper) mapper).semanticParameters.getSemanticInfoFieldName(),
        null
    );

    /** Builder of the mapper that handles the raw data; used by {@link #build}. */
    @Setter
    protected ParametrizedFieldMapper.Builder delegateBuilder;

    /**
     * @param name name of the semantic field
     */
    protected Builder(String name) {
        super(name);
    }

    /**
     * @return the parameters owned by the semantic field, in declaration order
     */
    @Override
    protected List<Parameter<?>> getParameters() {
        return List.of(modelId, searchModelId, rawFieldType, semanticInfoFieldName);
    }

    /**
     * Builds the delegate mapper first and then wraps it, together with its field type, into a semantic
     * field mapper.
     *
     * @param context builder context
     * @return the semantic field mapper
     */
    @Override
    public SemanticFieldMapper build(BuilderContext context) {
        final ParametrizedFieldMapper rawFieldMapper = delegateBuilder.build(context);
        final SemanticParameters parameters = getSemanticParameters();

        return new SemanticFieldMapper(
            name,
            new SemanticFieldType(rawFieldMapper.fieldType(), parameters),
            multiFieldsBuilder.build(this, context),
            copyTo.build(),
            rawFieldMapper,
            parameters
        );
    }

    /**
     * @return a snapshot of the current values of the semantic parameters
     */
    public SemanticParameters getSemanticParameters() {
        final String modelIdValue = modelId.getValue();
        final String searchModelIdValue = searchModelId.getValue();
        final String rawFieldTypeValue = rawFieldType.getValue();
        final String semanticInfoFieldNameValue = semanticInfoFieldName.getValue();
        return new SemanticParameters(modelIdValue, searchModelIdValue, rawFieldTypeValue, semanticInfoFieldNameValue);
    }
}

/**
 * Type parser of the semantic field. It splits the field config into the parameters owned by the semantic
 * field and the remaining parameters, which are parsed by the type parser of the raw field type.
 */
public static class TypeParser implements Mapper.TypeParser {

    /** Field types that can be used as raw_field_type. */
    private static final Set<String> SUPPORTED_RAW_FIELD_TYPE = Set.of(
        TextFieldMapper.CONTENT_TYPE,
        KeywordFieldMapper.CONTENT_TYPE,
        MatchOnlyTextFieldMapper.CONTENT_TYPE,
        WildcardFieldMapper.CONTENT_TYPE,
        TokenCountFieldMapper.CONTENT_TYPE,
        BinaryFieldMapper.CONTENT_TYPE
    );

    /**
     * Parses the config of a semantic field into a semantic field mapper builder.
     *
     * @param name          name of the field
     * @param node          field config; the semantic parameters are removed from it and its type is replaced
     *                      by the raw field type before it is handed to the delegate type parser
     * @param parserContext parser context used to look up the type parser of the raw field type
     * @return the semantic field mapper builder with the delegate builder set
     * @throws MapperParsingException   if raw_field_type is not a string or no usable type parser is
     *                                  registered for it
     * @throws IllegalArgumentException if raw_field_type is not one of the supported types
     */
    @Override
    public Builder parse(String name, Map<String, Object> node, ParserContext parserContext) throws MapperParsingException {
        final String rawFieldType = readRawFieldType(name, node);

        validateRawFieldType(rawFieldType);

        // The lookup result is checked before it is cast so that a missing or incompatible type parser is
        // reported as a mapping error rather than a NullPointerException or ClassCastException.
        final Mapper.TypeParser rawTypeParser = parserContext.typeParser(rawFieldType);
        if (!(rawTypeParser instanceof ParametrizedFieldMapper.TypeParser)) {
            throw new MapperParsingException(
                String.format(
                    Locale.ROOT,
                    "Failed to parse the field %s because no parametrized type parser is registered for the raw_field_type %s",
                    name,
                    rawFieldType
                )
            );
        }
        final ParametrizedFieldMapper.TypeParser typeParser = (ParametrizedFieldMapper.TypeParser) rawTypeParser;
        final Builder semanticFieldMapperBuilder = new Builder(name);

        // semantic field mapper builder parse semantic fields
        Map<String, Object> semanticConfig = extractSemanticConfig(node, semanticFieldMapperBuilder.getParameters(), rawFieldType);
        semanticFieldMapperBuilder.parse(name, parserContext, semanticConfig);

        // delegate field mapper builder parse remaining fields
        ParametrizedFieldMapper.Builder delegateBuilder = typeParser.parse(name, node, parserContext);
        semanticFieldMapperBuilder.setDelegateBuilder(delegateBuilder);

        return semanticFieldMapperBuilder;
    }

    /**
     * Reads raw_field_type from the field config, falling back to the text field type when the key is absent.
     *
     * @param name name of the field, used in the error message
     * @param node field config
     * @return the configured raw field type; null when the key is explicitly mapped to null
     * @throws MapperParsingException if the configured value is not a string
     */
    private String readRawFieldType(final String name, final Map<String, Object> node) {
        final Object configured = node.getOrDefault(RAW_FIELD_TYPE, TextFieldMapper.CONTENT_TYPE);
        if (configured != null && !(configured instanceof String)) {
            throw new MapperParsingException(
                String.format(
                    Locale.ROOT,
                    "%s of the field %s must be a string but found %s",
                    RAW_FIELD_TYPE,
                    name,
                    configured.getClass().getSimpleName()
                )
            );
        }
        return (String) configured;
    }

    /**
     * Ensures the raw field type is one of the supported types.
     *
     * @param rawFieldType raw field type to validate, may be null
     * @throws IllegalArgumentException if the raw field type is null or not supported
     */
    private void validateRawFieldType(final String rawFieldType) {
        if (rawFieldType == null || !SUPPORTED_RAW_FIELD_TYPE.contains(rawFieldType)) {
            // Set.of has no defined iteration order, so sort the types to keep the message deterministic.
            final String supportedTypes = String.join(", ", SUPPORTED_RAW_FIELD_TYPE.stream().sorted().toArray(String[]::new));
            final String err = String.format(
                Locale.ROOT,
                "raw_field_type %s is not supported. It should be one of [%s]",
                rawFieldType,
                supportedTypes
            );
            throw new IllegalArgumentException(err);
        }
    }

    /**
     * In this function we will extract all the parameters defined in the semantic field mapper builder and parse it
     * later. The remaining parameters will be processed by the type parser of the raw field type. Here we cannot
     * pass the parameters defined by semantic field to the delegate type parser of the raw field type because it
     * cannot recognize them.
     *
     * <p>Side effects on {@code node}: every semantic parameter with a non-null value is removed and the type
     * entry is overwritten with the raw field type.
     *
     * @param node field config
     * @param parameters parameters for semantic field
     * @param rawFieldType field type of the raw data
     * @return semantic field config
     */
    private Map<String, Object> extractSemanticConfig(Map<String, Object> node, List<Parameter<?>> parameters, String rawFieldType) {
        final Map<String, Object> semanticConfig = new HashMap<>();
        for (Parameter<?> parameter : parameters) {
            Object config = node.get(parameter.name);
            if (config != null) {
                semanticConfig.put(parameter.name, config);
                node.remove(parameter.name);
            }
        }
        semanticConfig.put(MappingConstants.TYPE, SemanticFieldMapper.CONTENT_TYPE);
        node.put(MappingConstants.TYPE, rawFieldType);
        return semanticConfig;
    }
}

/**
 * Field type of the semantic field. It wraps the field type of the delegate mapper and additionally exposes
 * the semantic parameters of the field.
 */
public static class SemanticFieldType extends FilterFieldType {
    /** Semantic parameters of the field; assigned once at construction. */
    @Getter
    private final SemanticParameters semanticParameters;

    /**
     * @param delegate           field type of the delegate mapper, must not be null
     * @param semanticParameters semantic parameters of the field, must not be null
     */
    public SemanticFieldType(@NonNull final MappedFieldType delegate, @NonNull final SemanticParameters semanticParameters) {
        super(delegate);
        this.semanticParameters = semanticParameters;
    }

    /**
     * @return the type name of the semantic field rather than the one of the wrapped field type
     */
    @Override
    public String typeName() {
        return SemanticFieldMapper.CONTENT_TYPE;
    }
}

/**
 * Writes the mapping of this field: the semantic type, the semantic parameters, and then the multi fields,
 * copy_to and parameters of the delegate field mapper.
 *
 * @param builder         builder to write to
 * @param includeDefaults whether parameters that still have their default value should be written
 * @param params          serialization params, forwarded to the multi fields and copy_to of the delegate
 * @throws IOException if writing to the builder fails
 */
@Override
protected void doXContentBody(XContentBuilder builder, boolean includeDefaults, Params params) throws IOException {
    builder.field(MappingConstants.TYPE, contentType());

    // semantic parameters
    for (Parameter<?> semanticParameter : getMergeBuilder().getParameters()) {
        // Defaults are normally omitted, but raw_field_type tells users how the raw data is handled, so it is
        // always written even when it still has its default value.
        final boolean writeDefault = includeDefaults || RAW_FIELD_TYPE.equals(semanticParameter.name);
        semanticParameter.toXContent(builder, writeDefault);
    }

    // non-semantic parameters
    // Multi fields and copy_to are handled by the delegate field mapper, not by the semantic field mapper itself.
    final ParametrizedFieldMapper rawFieldMapper = delegateFieldMapper;
    rawFieldMapper.multiFields().toXContent(builder, params);
    rawFieldMapper.copyTo().toXContent(builder, params);
    rawFieldMapper.getMergeBuilder().toXContent(builder, includeDefaults);
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
/*
* Copyright OpenSearch Contributors
* SPDX-License-Identifier: Apache-2.0
*/
package org.opensearch.neuralsearch.mapper.dto;

import lombok.Getter;

/**
 * Immutable holder of the parameters of a semantic field. Getters for all fields are generated by Lombok.
 */
@Getter
public class SemanticParameters {
    /** Value of the model_id parameter. */
    private final String modelId;
    /** Value of the search_model_id parameter. */
    private final String searchModelId;
    /** Value of the raw_field_type parameter. */
    private final String rawFieldType;
    /** Value of the semantic_info_field_name parameter. */
    private final String semanticInfoFieldName;

    /**
     * @param modelId               value of the model_id parameter
     * @param searchModelId         value of the search_model_id parameter
     * @param rawFieldType          value of the raw_field_type parameter
     * @param semanticInfoFieldName value of the semantic_info_field_name parameter
     */
    public SemanticParameters(String modelId, String searchModelId, String rawFieldType, String semanticInfoFieldName) {
        this.modelId = modelId;
        this.searchModelId = searchModelId;
        this.rawFieldType = rawFieldType;
        this.semanticInfoFieldName = semanticInfoFieldName;
    }
}
Loading
Loading