Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,8 @@

import datawave.core.common.logging.ThreadConfigurableLogger;
import datawave.data.normalizer.IpAddressNormalizer;
import datawave.data.type.GeoType;
import datawave.data.type.IpAddressType;
import datawave.data.type.NumberType;
import datawave.data.type.OneToManyNormalizerType;
import datawave.data.type.Type;
import datawave.query.Constants;
Expand Down Expand Up @@ -331,11 +331,15 @@ protected JexlNode expandNodeForNormalizers(JexlNode node, Object data) {
List<JexlNode> normalizedNodes = Lists.newArrayList();
boolean failedNormalization = false;
boolean regexNode = (node instanceof ASTNRNode || node instanceof ASTERNode);
boolean containsLossyRegex = false;
JexlNode lossyRegexNode = null;
// Build up a set of normalized terms using each normalizer
for (Type<?> normalizer : dataTypes) {
try {
if (normalizer instanceof OneToManyNormalizerType && ((OneToManyNormalizerType<?>) normalizer).expandAtQueryTime()) {
if (regexNode) {

// todo: add other one-to-many types for which we should not allow regex
if (regexNode && normalizer instanceof GeoType) {
throw new IllegalArgumentException(
"OneToManyNormalizers to not handle regex normalization: " + fieldName + " -> " + normalizer.getClass());
}
Expand Down Expand Up @@ -363,15 +367,14 @@ protected JexlNode expandNodeForNormalizers(JexlNode node, Object data) {
}
normalizedTerms.add(normTerm);
JexlNode normalizedNode = JexlNodeFactory.buildUntypedNode(node, fieldName, normTerm);
if (regexNode && normalizer.normalizedRegexIsLossy(term)) {
JexlNode evalOnly = QueryPropertyMarker.create(JexlNodeFactory.buildUntypedNode(node, fieldName, term),
EVALUATION_ONLY);
// now we need to combine these two nodes so that both are required
JexlNode combined = JexlNodeFactory.createAndNode(Arrays.asList(new JexlNode[] {normalizedNode, evalOnly}));
normalizedNodes.add(combined);
} else {
normalizedNodes.add(normalizedNode);
}
normalizedNodes.add(normalizedNode);
}
// if the normalized term is identical to the original term, it cannot be lossy
if (regexNode && !term.equals(normTerm) && normalizer.normalizedRegexIsLossy(term)) {
containsLossyRegex = true;
lossyRegexNode = QueryPropertyMarker.create(JexlNodeFactory.buildUntypedNode(nodeToReturn, fieldName, term),
EVALUATION_ONLY);

}
}
} catch (IpAddressNormalizer.Exception ipex) {
Expand Down Expand Up @@ -464,6 +467,13 @@ else if (lenient) {
}
}

// todo: this merits further scrutiny
if (containsLossyRegex) {
// now we need to combine these two nodes so that both are required
JexlNode combined = JexlNodeFactory.createAndNode(Arrays.asList(new JexlNode[] {nodeToReturn, lossyRegexNode}));
nodeToReturn = combined;
}

// wrap the node if required
if (evaluationOnly) {
nodeToReturn = QueryPropertyMarker.create(nodeToReturn, EVALUATION_ONLY);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1121,6 +1121,10 @@ protected ASTJexlScript processTree(final ASTJexlScript originalQueryTree, Shard
config.setQueryTree(timedExpandAnyFieldRegexNodes(timers, config.getQueryTree(), config, metadataHelper, scannerFactory, settings.getQuery()));
}

// Enforce unique terms within an AND or OR expression. For some reason we get duplicate expanded terms from the ExpandAnyFieldRegex. Dedupe them here
// until we fix that.
config.setQueryTree(timedEnforceUniqueTermsWithinExpressions(timers, config.getQueryTree()));

if (reduceQuery) {
config.setQueryTree(timedReduce(timers, "Reduce Query After ANYFIELD Expansions", config.getQueryTree()));
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -336,7 +336,15 @@ public void testGeowaveExpansion() throws Exception {
finalQuery);

ASTJexlScript expectedQuery = JexlASTHelper.parseJexlQuery(
"((((_Bounded_ = true) && (NUMBER >= '0' && NUMBER <= '1000')) && geowave:intersects(GEO, 'POLYGON((-180 -90, 180 -90, 180 90, -180 90, -180 -90))') && (GEO == '00' || GEO == '0202' || GEO == '020b' || GEO == '1f202a02a02a02a02a' || GEO == '1f2088888888888888' || GEO == '1f200a80a80a80a80a') && (GEO == '00' || GEO == '0202' || GEO == '020b' || GEO == '1f202a02a02a02a02a' || GEO == '1f2088888888888888' || GEO == '1f200a80a80a80a80a')) || (((_Bounded_ = true) && (NUMBER >= '0' && NUMBER <= '1000')) && geowave:intersects(GEO, 'POLYGON((-180 -90, 180 -90, 180 90, -180 90, -180 -90))') && (GEO == '00' || GEO == '0202' || GEO == '020b' || GEO == '1f202a02a02a02a02a' || GEO == '1f2088888888888888' || GEO == '1f200a80a80a80a80a') && (GEO == '00' || GEO == '0202' || GEO == '020b' || GEO == '1f202a02a02a02a02a' || GEO == '1f2088888888888888' || GEO == '1f200a80a80a80a80a'))) && GENDER == 'male' && (NOME == 'this' || NOME == 'that') && !filter:includeRegex(ETA, 'blah') && (LOCATION == 'chicago' || LOCATION == 'newyork' || LOCATION == 'newjersey')");
"GENDER == 'male' && (NOME == 'that' || NOME == 'this') && (LOCATION == 'chicago' || LOCATION == 'newjersey' || LOCATION == 'newyork') && (GEO == '00' || GEO == '0202' || GEO == '020b' || GEO == '1f200a80a80a80a80a' || GEO == '1f202a02a02a02a02a' || GEO == '1f2088888888888888') && geowave:intersects(GEO, 'POLYGON((-180 -90, 180 -90, 180 90, -180 90, -180 -90))') && !filter:includeRegex(ETA, 'blah') && ((_Bounded_ = true) && (NUMBER >= '0' && NUMBER <= '1000'))");
// "((((_Bounded_ = true) && (NUMBER >= '0' && NUMBER <= '1000')) && geowave:intersects(GEO, 'POLYGON((-180 -90, 180 -90, 180 90, -180 90, -180 -90))')
// && (GEO == '00' || GEO == '0202' || GEO == '020b' || GEO == '1f202a02a02a02a02a' || GEO == '1f2088888888888888' || GEO == '1f200a80a80a80a80a') &&
// (GEO == '00' || GEO == '0202' || GEO == '020b' || GEO == '1f202a02a02a02a02a' || GEO == '1f2088888888888888' || GEO == '1f200a80a80a80a80a')) ||
// (((_Bounded_ = true) && (NUMBER >= '0' && NUMBER <= '1000')) && geowave:intersects(GEO, 'POLYGON((-180 -90, 180 -90, 180 90, -180 90, -180 -90))') &&
// (GEO == '00' || GEO == '0202' || GEO == '020b' || GEO == '1f202a02a02a02a02a' || GEO == '1f2088888888888888' || GEO == '1f200a80a80a80a80a') && (GEO
// == '00' || GEO == '0202' || GEO == '020b' || GEO == '1f202a02a02a02a02a' || GEO == '1f2088888888888888' || GEO == '1f200a80a80a80a80a'))) && GENDER
// == 'male' && (NOME == 'this' || NOME == 'that') && !filter:includeRegex(ETA, 'blah') && (LOCATION == 'chicago' || LOCATION == 'newyork' || LOCATION
// == 'newjersey')");
Assert.assertTrue(TreeEqualityVisitor.isEqual(expectedQuery, logic.getConfig().getQueryTree()));
}

Expand Down Expand Up @@ -392,7 +400,54 @@ public void testNumericExpansion() throws Exception {
runTestQuery(expectedLists[i], queryStrings[i], format.parse("20091231"), format.parse("20150101"), extraParameters);
}

String expectedQueryStr = "(BAIL == '+eE1.2345' || BAIL == '+fE1.23401' || BAIL == '+gE1.234987') && ((_Eval_ = true) && (BAIL =~ '12340.*?'))";
String expectedQueryStr = "(BAIL == '+eE1.2345' || BAIL == '+fE1.23401') && ((_Eval_ = true) && (BAIL =~ '12340.*?'))";
String plan = JexlFormattedStringBuildingVisitor.buildQuery(logic.getConfig().getQueryTree());
Assert.assertTrue("Expected equality: " + expectedQueryStr + " vs " + plan,
TreeEqualityVisitor.isEqual(JexlASTHelper.parseJexlQuery(expectedQueryStr), logic.getConfig().getQueryTree()));
}

/**
 * Runs an {@code _ANYFIELD_} regex query over a numeric pattern and checks both the returned events and the final query plan.
 * <p>
 * The query {@code _ANYFIELD_ =~ '12340.*?'} is expected to return only SOPRANO. After the query runs, the planned query tree is compared against an
 * expected plan consisting of the two expanded {@code BAIL} terms ANDed with the original {@code _ANYFIELD_} regex under an {@code _Eval_} marker.
 *
 * @throws Exception
 *             if running the query or parsing the expected plan fails
 */
@Test
public void testAnyfieldNumericExpansion() throws Exception {
    Map<String,String> extraParameters = new HashMap<>();
    extraParameters.put("include.grouping.context", "true");
    extraParameters.put("hit.list", "true");
    // extraParameters.put("query.syntax", "LUCENE");

    if (log.isDebugEnabled()) {
        // log this test's own name so debug output identifies the test that is actually running
        log.debug("testAnyfieldNumericExpansion");
    }
    String[] queryStrings = {"_ANYFIELD_ =~'12340.*?'"};
    // SOPRANO is the only one with a 0 after the 1234
    @SuppressWarnings("unchecked")
    List<String>[] expectedLists = new List[] {Arrays.asList("SOPRANO")};
    for (int i = 0; i < queryStrings.length; i++) {
        runTestQuery(expectedLists[i], queryStrings[i], format.parse("20091231"), format.parse("20150101"), extraParameters);
    }

    // the plan inspected below is the one left in the logic's config by the last query run above
    String expectedQueryStr = "(BAIL == '+eE1.2345' || BAIL == '+fE1.23401') && ((_Eval_ = true) && (_ANYFIELD_ =~ '12340.*?'))";
    String plan = JexlFormattedStringBuildingVisitor.buildQuery(logic.getConfig().getQueryTree());
    Assert.assertTrue("Expected equality: " + expectedQueryStr + " vs " + plan,
                    TreeEqualityVisitor.isEqual(JexlASTHelper.parseJexlQuery(expectedQueryStr), logic.getConfig().getQueryTree()));
}

@Test
public void testLeadingNumericExpansion() throws Exception {
Map<String,String> extraParameters = new HashMap<>();
extraParameters.put("include.grouping.context", "true");
extraParameters.put("hit.list", "true");
// extraParameters.put("query.syntax", "LUCENE");

if (log.isDebugEnabled()) {
log.debug("testMatchesAtLeastCountOf");
}
String[] queryStrings = {"(UUID == 'capone' || UUID == 'soprano') && BAIL=~'.*?05'"};
@SuppressWarnings("unchecked")
List<String>[] expectedLists = new List[] {Arrays.asList("CAPONE")};
for (int i = 0; i < queryStrings.length; i++) {
runTestQuery(expectedLists[i], queryStrings[i], format.parse("20091231"), format.parse("20150101"), extraParameters);
}

String expectedQueryStr = "(UUID == 'capone' || UUID == 'soprano') && ((_Eval_ = true) && BAIL =~ '\\+[a-zA-Z]E.*?0?\\.?5|![A-Za-z]E.*?9?\\.?5' && BAIL =~ '.*?05')";
String plan = JexlFormattedStringBuildingVisitor.buildQuery(logic.getConfig().getQueryTree());
Assert.assertTrue("Expected equality: " + expectedQueryStr + " vs " + plan,
TreeEqualityVisitor.isEqual(JexlASTHelper.parseJexlQuery(expectedQueryStr), logic.getConfig().getQueryTree()));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -173,7 +173,7 @@ public static void writeItAll(AccumuloClient client, WhatKindaRange range) throw
timeStamp + sopranoTimeStampDelta, emptyValue);
mutation.put(datatype + "\u0000" + sopranoUID, "GEO" + "\u0000" + "POINT(20 20)", columnVisibility, timeStamp + sopranoTimeStampDelta, emptyValue);

mutation.put(datatype + "\u0000" + caponeUID, "BAIL.0" + "\u0000" + "1234987", columnVisibility, timeStamp + caponeTimeStampDelta, emptyValue);
mutation.put(datatype + "\u0000" + caponeUID, "BAIL.0" + "\u0000" + "0.05", columnVisibility, timeStamp + caponeTimeStampDelta, emptyValue);
mutation.put(datatype + "\u0000" + caponeUID, "NAME.0" + "\u0000" + "ALPHONSE", columnVisibility, timeStamp + caponeTimeStampDelta, emptyValue);
mutation.put(datatype + "\u0000" + caponeUID, "NAME.1" + "\u0000" + "FRANK", columnVisibility, timeStamp + caponeTimeStampDelta, emptyValue);
mutation.put(datatype + "\u0000" + caponeUID, "NAME.2" + "\u0000" + "RALPH", columnVisibility, timeStamp + caponeTimeStampDelta, emptyValue);
Expand Down Expand Up @@ -311,7 +311,7 @@ public static void writeItAll(AccumuloClient client, WhatKindaRange range) throw
mutation.put("BAIL".toUpperCase(), shard + "\u0000" + datatype, columnVisibility, timeStamp,
range == WhatKindaRange.SHARD ? getValueForNuthinAndYourHitsForFree() : getValueForBuilderFor(sopranoUID));
bw.addMutation(mutation);
mutation = new Mutation(numberType.normalize("1234987"));
mutation = new Mutation(numberType.normalize("0.05"));
mutation.put("BAIL".toUpperCase(), shard + "\u0000" + datatype, columnVisibility, timeStamp,
range == WhatKindaRange.SHARD ? getValueForNuthinAndYourHitsForFree() : getValueForBuilderFor(caponeUID));
bw.addMutation(mutation);
Expand Down Expand Up @@ -653,6 +653,19 @@ public static void writeItAll(AccumuloClient client, WhatKindaRange range) throw
range == WhatKindaRange.SHARD ? getValueForNuthinAndYourHitsForFree() : getValueForBuilderFor(caponeUID));
bw.addMutation(mutation);

mutation = new Mutation(new StringBuilder(numberType.normalize("12345")).reverse());
mutation.put("BAIL".toUpperCase(), shard + "\u0000" + datatype, columnVisibility, timeStamp,
range == WhatKindaRange.SHARD ? getValueForNuthinAndYourHitsForFree() : getValueForBuilderFor(corleoneUID));
bw.addMutation(mutation);
mutation = new Mutation(new StringBuilder(numberType.normalize("123401")).reverse());
mutation.put("BAIL".toUpperCase(), shard + "\u0000" + datatype, columnVisibility, timeStamp,
range == WhatKindaRange.SHARD ? getValueForNuthinAndYourHitsForFree() : getValueForBuilderFor(sopranoUID));
bw.addMutation(mutation);
mutation = new Mutation(new StringBuilder(numberType.normalize("0.05")).reverse());
mutation.put("BAIL".toUpperCase(), shard + "\u0000" + datatype, columnVisibility, timeStamp,
range == WhatKindaRange.SHARD ? getValueForNuthinAndYourHitsForFree() : getValueForBuilderFor(caponeUID));
bw.addMutation(mutation);

// add some index-only fields
mutation = new Mutation(new StringBuilder("chicago").reverse());
mutation.put("LOCATION", shard + "\u0000" + datatype, columnVisibility, timeStamp,
Expand Down Expand Up @@ -726,7 +739,7 @@ public static void writeItAll(AccumuloClient client, WhatKindaRange range) throw
emptyValue);
mutation.put("fi\u0000" + "BAIL", numberType.normalize("123401") + "\u0000" + datatype + "\u0000" + sopranoUID, columnVisibility, timeStamp,
emptyValue);
mutation.put("fi\u0000" + "BAIL", numberType.normalize("1234987") + "\u0000" + datatype + "\u0000" + caponeUID, columnVisibility, timeStamp,
mutation.put("fi\u0000" + "BAIL", numberType.normalize("0.05") + "\u0000" + datatype + "\u0000" + caponeUID, columnVisibility, timeStamp,
emptyValue);

// geo
Expand Down Expand Up @@ -912,6 +925,8 @@ public static void writeItAll(AccumuloClient client, WhatKindaRange range) throw
mutation.put(ColumnFamilyConstants.COLF_E, new Text(datatype), emptyValue);
mutation.put(ColumnFamilyConstants.COLF_F, new Text(datatype + "\u0000" + date), new Value(SummingCombiner.VAR_LEN_ENCODER.encode(12L)));
mutation.put(ColumnFamilyConstants.COLF_I, new Text(datatype), emptyValue);
mutation.put(ColumnFamilyConstants.COLF_RI, new Text(datatype), emptyValue);

mutation.put(ColumnFamilyConstants.COLF_T, new Text(datatype + "\u0000" + normalizerForColumn("BAIL")), emptyValue);
bw.addMutation(mutation);

Expand Down
Loading