diff --git a/warehouse/ops-tools/index-validation/pom.xml b/warehouse/ops-tools/index-validation/pom.xml
index a28da7a1a8b..a85f0b9d20e 100644
--- a/warehouse/ops-tools/index-validation/pom.xml
+++ b/warehouse/ops-tools/index-validation/pom.xml
@@ -4,7 +4,7 @@
gov.nsa.datawave
datawave-ops-tools-parent
- 7.8.0-SNAPSHOT
+ 7.10.0-SNAPSHOT
datawave-ops-tools-index-validation
jar
diff --git a/warehouse/ops-tools/pom.xml b/warehouse/ops-tools/pom.xml
index 577613d6ea9..c82c4bf2dc9 100644
--- a/warehouse/ops-tools/pom.xml
+++ b/warehouse/ops-tools/pom.xml
@@ -4,7 +4,7 @@
gov.nsa.datawave
datawave-warehouse-parent
- 7.8.0-SNAPSHOT
+ 7.10.0-SNAPSHOT
datawave-ops-tools-parent
pom
diff --git a/warehouse/pom.xml b/warehouse/pom.xml
index 028b013d820..06fd84ea4ea 100644
--- a/warehouse/pom.xml
+++ b/warehouse/pom.xml
@@ -4,7 +4,7 @@
gov.nsa.datawave
datawave-parent
- 7.8.0-SNAPSHOT
+ 7.10.0-SNAPSHOT
datawave-warehouse-parent
pom
diff --git a/warehouse/query-core/pom.xml b/warehouse/query-core/pom.xml
index 04f5f2ff603..2c0912419c1 100644
--- a/warehouse/query-core/pom.xml
+++ b/warehouse/query-core/pom.xml
@@ -4,7 +4,7 @@
gov.nsa.datawave
datawave-warehouse-parent
- 7.8.0-SNAPSHOT
+ 7.10.0-SNAPSHOT
datawave-query-core
jar
diff --git a/warehouse/query-core/src/main/java/datawave/core/iterators/BoundedRangeExpansionIterator.java b/warehouse/query-core/src/main/java/datawave/core/iterators/BoundedRangeExpansionIterator.java
new file mode 100644
index 00000000000..f863a4d0675
--- /dev/null
+++ b/warehouse/query-core/src/main/java/datawave/core/iterators/BoundedRangeExpansionIterator.java
@@ -0,0 +1,171 @@
+package datawave.core.iterators;
+
+import java.io.IOException;
+import java.util.Collection;
+import java.util.Map;
+import java.util.TreeSet;
+
+import org.apache.accumulo.core.data.ByteSequence;
+import org.apache.accumulo.core.data.Key;
+import org.apache.accumulo.core.data.Range;
+import org.apache.accumulo.core.data.Value;
+import org.apache.accumulo.core.iterators.IteratorEnvironment;
+import org.apache.accumulo.core.iterators.OptionDescriber;
+import org.apache.accumulo.core.iterators.SortedKeyValueIterator;
+import org.apache.accumulo.core.iterators.user.SeekingFilter;
+import org.apache.commons.lang3.StringUtils;
+import org.apache.hadoop.io.Text;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.google.common.base.Splitter;
+
+import datawave.query.Constants;
+import datawave.query.jexl.LiteralRange;
+
+/**
+ * A {@link SeekingFilter} that attempts to expand bounded ranges using the global index
+ *
+ * The caller is responsible for fetching the appropriate column families. The range is constructed from a {@link LiteralRange}.
+ *
+ * The only thing this iterator does is advance through datatypes if a filter is supplied, advance to the start date, and advance to the next row within the
+ * range.
+ */
+public class BoundedRangeExpansionIterator extends SeekingFilter implements OptionDescriber {
+
+ private static final Logger log = LoggerFactory.getLogger(BoundedRangeExpansionIterator.class);
+
+ public static final String START_DATE = "start.date";
+ public static final String END_DATE = "end.date";
+ public static final String DATATYPES_OPT = "dts";
+
+ private TreeSet<String> datatypes;
+ private String startDate;
+ private String endDate;
+
+ private Text prevRow;
+
+ @Override
+ public void init(SortedKeyValueIterator<Key,Value> source, Map<String,String> options, IteratorEnvironment env) throws IOException {
+ if (!validateOptions(options)) {
+ throw new IllegalArgumentException("BoundedRangeExpansionIterator not configured with correct options");
+ }
+
+ String opt = options.get(DATATYPES_OPT);
+ if (StringUtils.isBlank(opt)) {
+ datatypes = new TreeSet<>();
+ } else {
+ datatypes = new TreeSet<>(Splitter.on(',').splitToList(opt));
+ }
+
+ startDate = options.get(START_DATE);
+ endDate = options.get(END_DATE) + Constants.MAX_UNICODE_STRING;
+
+ super.init(source, options, env);
+ }
+
+ @Override
+ public IteratorOptions describeOptions() {
+ IteratorOptions opts = new IteratorOptions(getClass().getName(), "Expands bounded ranges using the global index", null, null);
+ opts.addNamedOption(START_DATE, "The start date");
+ opts.addNamedOption(END_DATE, "The end date");
+ opts.addNamedOption(DATATYPES_OPT, "The set of datatypes used to filter keys (optional)");
+ return opts;
+ }
+
+ @Override
+ public boolean validateOptions(Map<String,String> options) {
+ return options.containsKey(START_DATE) && options.containsKey(END_DATE);
+ }
+
+ @Override
+ public FilterResult filter(Key k, Value v) {
+ log.trace("filter key: {}", k.toStringNoTime());
+
+ // shard + null + datatype
+ String cq = k.getColumnQualifier().toString();
+ int index = cq.indexOf('\u0000');
+ String date = cq.substring(0, index);
+
+ if (date.compareTo(startDate) < 0) {
+ log.trace("{} is before the start date {}, advancing to start date", date, startDate);
+ return new FilterResult(false, AdvanceResult.USE_HINT);
+ }
+
+ if (date.compareTo(endDate) > 0) {
+ log.trace("{} is past the end date {}, advancing to next row", date, endDate);
+ return new FilterResult(false, AdvanceResult.NEXT_ROW);
+ }
+
+ String datatype = cq.substring(index + 1);
+ if (!datatypes.isEmpty() && !datatypes.contains(datatype)) {
+ log.trace("datatype {} was filtered out, advancing to next key", datatype);
+ return new FilterResult(false, AdvanceResult.NEXT);
+ }
+
+ if (prevRow != null && prevRow.equals(k.getRow())) {
+ // this iterator should only return a single key per unique row, thus the previous row should never match the current row.
+ log.warn("should never see a duplicate row -- skip to next row");
+ return new FilterResult(false, AdvanceResult.NEXT_ROW);
+ }
+
+ prevRow = k.getRow();
+ return new FilterResult(true, AdvanceResult.NEXT_ROW);
+ }
+
+ /**
+ * Hint is only used to seek to the start date
+ *
+ * @param k
+ * a key
+ * @param v
+ * a value
+ * @return the key used to seek
+ */
+ @Override
+ public Key getNextKeyHint(Key k, Value v) {
+ log.trace("get next key hint: {}", k.toStringNoTime());
+
+ // shard + null + datatype
+ String cq = k.getColumnQualifier().toString();
+ int index = cq.indexOf('\u0000');
+ String date = cq.substring(0, index);
+
+ if (date.compareTo(startDate) < 0) {
+ Text columnQualifier;
+
+ if (datatypes.isEmpty()) {
+ log.trace("seek to start date");
+ columnQualifier = new Text(startDate + '\u0000');
+ } else {
+ log.trace("seek to start date and datatype");
+ columnQualifier = new Text(startDate + '\u0000' + datatypes.first());
+ }
+
+ return new Key(k.getRow(), k.getColumnFamily(), columnQualifier);
+ }
+
+ log.trace("next hint key was called in a bad state, reverting to no-op");
+ return k;
+ }
+
+ @Override
+ public void seek(Range range, Collection<ByteSequence> columnFamilies, boolean inclusive) throws IOException {
+ if (!range.isStartKeyInclusive()) {
+ // need to skip to next row
+ Key skip = new Key(range.getStartKey().getRow().toString() + '\u0000');
+ if (skip.compareTo(range.getEndKey()) > 0) {
+ // handles case of bounded range against single value
+ // filter key: +cE1 NUM:20150808_0%00;generic [NA]
+ // the skip key would be +cE1, but then the start key would be greater than the end key, so we collapse the range to the end key instead.
+ Range skipRange = new Range(range.getEndKey(), true, range.getEndKey(), range.isEndKeyInclusive());
+ super.seek(skipRange, columnFamilies, inclusive);
+ } else {
+ Range skipRange = new Range(skip, true, range.getEndKey(), range.isEndKeyInclusive());
+ super.seek(skipRange, columnFamilies, inclusive);
+ }
+ } else {
+ super.seek(range, columnFamilies, inclusive);
+ }
+ }
+}
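A minimal usage sketch (not part of the change): attaching the new iterator to a scan over the global index. The table name, priority, dates, and datatype list below are illustrative assumptions; the option keys are the constants defined in the class above, and the caller fetches the column families as the javadoc requires.

```java
import java.util.Map;

import org.apache.accumulo.core.client.AccumuloClient;
import org.apache.accumulo.core.client.IteratorSetting;
import org.apache.accumulo.core.client.Scanner;
import org.apache.accumulo.core.data.Key;
import org.apache.accumulo.core.data.Range;
import org.apache.accumulo.core.data.Value;
import org.apache.accumulo.core.security.Authorizations;
import org.apache.hadoop.io.Text;

import datawave.core.iterators.BoundedRangeExpansionIterator;

class BoundedRangeExpansionExample {
    // boundedRange is assumed to be built from the LiteralRange bounds.
    static void scan(AccumuloClient client, Authorizations auths, Range boundedRange) throws Exception {
        IteratorSetting setting = new IteratorSetting(100, "boundedRangeExpansion", BoundedRangeExpansionIterator.class);
        setting.addOption(BoundedRangeExpansionIterator.START_DATE, "20240101");
        setting.addOption(BoundedRangeExpansionIterator.END_DATE, "20240131");
        setting.addOption(BoundedRangeExpansionIterator.DATATYPES_OPT, "csv,json"); // optional datatype filter

        try (Scanner scanner = client.createScanner("shardIndex", auths)) { // assumed index table name
            scanner.addScanIterator(setting);
            scanner.fetchColumnFamily(new Text("FIELD_NAME")); // caller is responsible for column families
            scanner.setRange(boundedRange);
            for (Map.Entry<Key,Value> entry : scanner) {
                // at most one key is returned per unique row (term) within the bounded range
                System.out.println(entry.getKey().getRow());
            }
        }
    }
}
```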
diff --git a/warehouse/query-core/src/main/java/datawave/query/config/ShardQueryConfiguration.java b/warehouse/query-core/src/main/java/datawave/query/config/ShardQueryConfiguration.java
index 1dcbf7583cd..b88cef085f2 100644
--- a/warehouse/query-core/src/main/java/datawave/query/config/ShardQueryConfiguration.java
+++ b/warehouse/query-core/src/main/java/datawave/query/config/ShardQueryConfiguration.java
@@ -84,7 +84,7 @@ public class ShardQueryConfiguration extends GenericQueryConfiguration implement
public static final String QUERY_LOGIC_NAME_SOURCE = "queryLogic";
@SuppressWarnings("unused")
- private static final long serialVersionUID = -4354990715046146110L;
+ private static final long serialVersionUID = 2321985989282659247L;
private static final Logger log = Logger.getLogger(ShardQueryConfiguration.class);
// is this a tld query, explicitly default to false
@@ -482,24 +482,26 @@ public class ShardQueryConfiguration extends GenericQueryConfiguration implement
private boolean pruneQueryOptions = false;
/**
- * Flag to control gathering field counts from the global index and persisting those to the query iterator. Negated terms and branches are not considered.
+ * Flag that sorts the query prior to the global index lookup using inferred costs. This step may reduce time spent in the global index depending on
+ * individual term selectivity.
*/
- private boolean useFieldCounts = false;
+ private boolean sortQueryPreIndexWithImpliedCounts = false;
+
/**
- * Flag to control gathering term counts from the global index and persisting those to the query iterator. Negated terms and branches are not considered.
+ * Flag that sorts the query prior to the global index lookup using field counts from the {@link TableName#METADATA} table. This option opens a scanner and
+ * thus is more expensive than sorting by implied counts, but is potentially more accurate.
*/
- private boolean useTermCounts = false;
+ private boolean sortQueryPreIndexWithFieldCounts = false;
+
/**
- * Flag to control sorting a query by inferred default costs prior to the global index lookup. This step may reduce time performing a secondary sort as when
- * {@link #sortQueryByCounts} is enabled.
+ * Flag that sorts the query using field counts gathered as part of the global index lookup. Negated terms and branches are not considered.
*/
- private boolean sortQueryBeforeGlobalIndex = false;
+ private boolean sortQueryPostIndexWithFieldCounts = false;
/**
- * Flag to control if a query is sorted by either field or term counts. Either {@link #useFieldCounts} or {@link #useTermCounts} must be set for this option
- * to take effect.
+ * Flag that sorts the query using term counts gathered as part of the global index lookup. Negated terms and branches are not considered.
*/
- private boolean sortQueryByCounts = false;
+ private boolean sortQueryPostIndexWithTermCounts = false;
/**
* Insert rules for processing the QueryTree to automatically apply hints to queries. Hints will be passed to the ScannerFactory
@@ -746,10 +748,10 @@ public void copyFrom(ShardQueryConfiguration other) {
this.setTfAggregationThresholdMs(other.getTfAggregationThresholdMs());
this.setGroupFields(GroupFields.copyOf(other.getGroupFields()));
this.setPruneQueryOptions(other.getPruneQueryOptions());
- this.setUseFieldCounts(other.getUseFieldCounts());
- this.setUseTermCounts(other.getUseTermCounts());
- this.setSortQueryBeforeGlobalIndex(other.isSortQueryBeforeGlobalIndex());
- this.setSortQueryByCounts(other.isSortQueryByCounts());
+ this.setSortQueryPreIndexWithImpliedCounts(other.isSortQueryPreIndexWithImpliedCounts());
+ this.setSortQueryPreIndexWithFieldCounts(other.isSortQueryPreIndexWithFieldCounts());
+ this.setSortQueryPostIndexWithTermCounts(other.isSortQueryPostIndexWithTermCounts());
+ this.setSortQueryPostIndexWithFieldCounts(other.isSortQueryPostIndexWithFieldCounts());
this.setUseQueryTreeScanHintRules(other.isUseQueryTreeScanHintRules());
this.setQueryTreeScanHintRules(other.getQueryTreeScanHintRules());
this.setFieldIndexHoleMinThreshold(other.getFieldIndexHoleMinThreshold());
@@ -2765,36 +2767,36 @@ public void setReduceIngestTypesPerShard(boolean reduceIngestTypesPerShard) {
this.reduceIngestTypesPerShard = reduceIngestTypesPerShard;
}
- public boolean getUseTermCounts() {
- return useTermCounts;
+ public boolean isSortQueryPreIndexWithImpliedCounts() {
+ return sortQueryPreIndexWithImpliedCounts;
}
- public void setUseTermCounts(boolean useTermCounts) {
- this.useTermCounts = useTermCounts;
+ public void setSortQueryPreIndexWithImpliedCounts(boolean sortQueryPreIndexWithImpliedCounts) {
+ this.sortQueryPreIndexWithImpliedCounts = sortQueryPreIndexWithImpliedCounts;
}
- public boolean getUseFieldCounts() {
- return useFieldCounts;
+ public boolean isSortQueryPreIndexWithFieldCounts() {
+ return sortQueryPreIndexWithFieldCounts;
}
- public void setUseFieldCounts(boolean useFieldCounts) {
- this.useFieldCounts = useFieldCounts;
+ public void setSortQueryPreIndexWithFieldCounts(boolean sortQueryPreIndexWithFieldCounts) {
+ this.sortQueryPreIndexWithFieldCounts = sortQueryPreIndexWithFieldCounts;
}
- public boolean isSortQueryBeforeGlobalIndex() {
- return sortQueryBeforeGlobalIndex;
+ public boolean isSortQueryPostIndexWithFieldCounts() {
+ return sortQueryPostIndexWithFieldCounts;
}
- public void setSortQueryBeforeGlobalIndex(boolean sortQueryBeforeGlobalIndex) {
- this.sortQueryBeforeGlobalIndex = sortQueryBeforeGlobalIndex;
+ public void setSortQueryPostIndexWithFieldCounts(boolean sortQueryPostIndexWithFieldCounts) {
+ this.sortQueryPostIndexWithFieldCounts = sortQueryPostIndexWithFieldCounts;
}
- public boolean isSortQueryByCounts() {
- return sortQueryByCounts;
+ public boolean isSortQueryPostIndexWithTermCounts() {
+ return sortQueryPostIndexWithTermCounts;
}
- public void setSortQueryByCounts(boolean sortQueryByCounts) {
- this.sortQueryByCounts = sortQueryByCounts;
+ public void setSortQueryPostIndexWithTermCounts(boolean sortQueryPostIndexWithTermCounts) {
+ this.sortQueryPostIndexWithTermCounts = sortQueryPostIndexWithTermCounts;
}
@Override
@@ -3001,10 +3003,10 @@ public boolean equals(Object o) {
getDocAggregationThresholdMs() == that.getDocAggregationThresholdMs() &&
getTfAggregationThresholdMs() == that.getTfAggregationThresholdMs() &&
getPruneQueryOptions() == that.getPruneQueryOptions() &&
- getUseFieldCounts() == that.getUseFieldCounts() &&
- getUseTermCounts() == that.getUseTermCounts() &&
- isSortQueryBeforeGlobalIndex() == that.isSortQueryBeforeGlobalIndex() &&
- isSortQueryByCounts() == that.isSortQueryByCounts();
+ isSortQueryPreIndexWithImpliedCounts() == that.isSortQueryPreIndexWithImpliedCounts() &&
+ isSortQueryPreIndexWithFieldCounts() == that.isSortQueryPreIndexWithFieldCounts() &&
+ isSortQueryPostIndexWithTermCounts() == that.isSortQueryPostIndexWithTermCounts() &&
+ isSortQueryPostIndexWithFieldCounts() == that.isSortQueryPostIndexWithFieldCounts();
// @formatter:on
}
@@ -3206,10 +3208,11 @@ public int hashCode() {
getDocAggregationThresholdMs(),
getTfAggregationThresholdMs(),
getPruneQueryOptions(),
- getUseFieldCounts(),
- getUseTermCounts(),
- isSortQueryBeforeGlobalIndex(),
- isSortQueryByCounts());
+ isSortQueryPreIndexWithImpliedCounts(),
+ isSortQueryPreIndexWithFieldCounts(),
+ isSortQueryPostIndexWithTermCounts(),
+ isSortQueryPostIndexWithFieldCounts()
+ );
// @formatter:on
}
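To make the renames concrete, a short sketch of how the four flags line up with the pre- and post-index sort phases described in the javadoc above; it assumes the existing create() factory, and the particular combination shown is illustrative, not a recommendation.

```java
import datawave.query.config.ShardQueryConfiguration;

class SortFlagExample {
    static ShardQueryConfiguration configure() {
        ShardQueryConfiguration config = ShardQueryConfiguration.create();

        // Pre-index: sort terms by inferred (implied) costs; no extra scanner is opened.
        config.setSortQueryPreIndexWithImpliedCounts(true);
        // Pre-index: sort terms by field counts from the metadata table; opens a scanner,
        // so it is more expensive but potentially more accurate.
        config.setSortQueryPreIndexWithFieldCounts(false);

        // Post-index: sort using counts gathered during the global index lookup.
        config.setSortQueryPostIndexWithFieldCounts(true);
        config.setSortQueryPostIndexWithTermCounts(false);

        // copyFrom() carries all four flags (see the updated method above).
        ShardQueryConfiguration copy = ShardQueryConfiguration.create();
        copy.copyFrom(config);
        assert copy.isSortQueryPostIndexWithFieldCounts();
        return copy;
    }
}
```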
diff --git a/warehouse/query-core/src/main/java/datawave/query/discovery/DiscoveredThing.java b/warehouse/query-core/src/main/java/datawave/query/discovery/DiscoveredThing.java
index ec0987fdb88..3e2c3e21e95 100644
--- a/warehouse/query-core/src/main/java/datawave/query/discovery/DiscoveredThing.java
+++ b/warehouse/query-core/src/main/java/datawave/query/discovery/DiscoveredThing.java
@@ -3,14 +3,14 @@
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
+import java.util.Objects;
+import java.util.StringJoiner;
import org.apache.commons.lang.builder.CompareToBuilder;
import org.apache.hadoop.io.MapWritable;
import org.apache.hadoop.io.VLongWritable;
import org.apache.hadoop.io.WritableComparable;
-import com.google.common.base.Objects;
-
import datawave.core.query.configuration.ResultContext;
public class DiscoveredThing implements WritableComparable<DiscoveredThing> {
@@ -86,6 +86,7 @@ public void readFields(DataInput in) throws IOException {
@Override
public int compareTo(DiscoveredThing o) {
+
CompareToBuilder cmp = new CompareToBuilder();
if (o == null) {
return 1;
@@ -96,28 +97,34 @@ public int compareTo(DiscoveredThing o) {
cmp.append(getDate(), o.getDate());
cmp.append(getColumnVisibility(), o.getColumnVisibility());
cmp.append(getCount(), o.getCount());
+ cmp.append(getCountsByColumnVisibility(), o.getCountsByColumnVisibility());
return cmp.toComparison();
}
}
@Override
public boolean equals(Object o) {
- if (!(o instanceof DiscoveredThing))
+ if (this == o) {
+ return true;
+ }
+ if (o == null || getClass() != o.getClass()) {
return false;
- DiscoveredThing other = (DiscoveredThing) o;
- return Objects.equal(getTerm(), other.getTerm()) && Objects.equal(getField(), other.getField()) && Objects.equal(getType(), other.getType())
- && Objects.equal(getDate(), other.getDate()) && Objects.equal(getColumnVisibility(), other.getColumnVisibility())
- && Objects.equal(getCount(), other.getCount());
+ }
+ DiscoveredThing that = (DiscoveredThing) o;
+ return Objects.equals(term, that.term) && Objects.equals(field, that.field) && Objects.equals(type, that.type) && Objects.equals(date, that.date)
+ && Objects.equals(columnVisibility, that.columnVisibility) && Objects.equals(count, that.count)
+ && Objects.equals(countsByColumnVisibility, that.countsByColumnVisibility);
}
@Override
public int hashCode() {
- return Objects.hashCode(getTerm(), getField(), getType(), getDate(), getColumnVisibility(), getCount());
+ return Objects.hash(term, field, type, date, columnVisibility, count, countsByColumnVisibility);
}
@Override
public String toString() {
- return "DiscoveredThing [term=" + term + ", field=" + field + ", type=" + type + ", date=" + date + ", columnVisibility=" + columnVisibility
- + ", count=" + count + "]";
+ return new StringJoiner(", ", DiscoveredThing.class.getSimpleName() + "[", "]").add("term='" + term + "'").add("field='" + field + "'")
+ .add("type='" + type + "'").add("date='" + date + "'").add("columnVisibility='" + columnVisibility + "'").add("count=" + count)
+ .add("countsByColumnVisibility=" + countsByColumnVisibility).toString();
}
}
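A sketch of the behavioral change to equality; the constructor shape is taken from its use in DiscoveryIterator below, and the literal values are assumptions. Per-visibility counts now participate in equals(), hashCode(), and compareTo(), so two things that agree on every other field but differ in countsByColumnVisibility no longer compare equal.

```java
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.MapWritable;
import org.apache.hadoop.io.Text;

import datawave.query.discovery.DiscoveredThing;

class DiscoveredThingEqualityExample {
    public static void main(String[] args) {
        MapWritable countsA = new MapWritable();
        countsA.put(new Text("PUBLIC"), new LongWritable(3L));

        MapWritable countsB = new MapWritable();
        countsB.put(new Text("PUBLIC"), new LongWritable(2L));
        countsB.put(new Text("PRIVATE"), new LongWritable(1L));

        // Identical except for the per-visibility breakdown of the same total count.
        DiscoveredThing a = new DiscoveredThing("value", "FIELD", "datatype", "20240101", "PUBLIC", 3L, countsA);
        DiscoveredThing b = new DiscoveredThing("value", "FIELD", "datatype", "20240101", "PUBLIC", 3L, countsB);

        // Previously true (countsByColumnVisibility was ignored); now false.
        System.out.println(a.equals(b));
    }
}
```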
diff --git a/warehouse/query-core/src/main/java/datawave/query/discovery/DiscoveryIterator.java b/warehouse/query-core/src/main/java/datawave/query/discovery/DiscoveryIterator.java
index 1400308f3c2..404d9c29dda 100644
--- a/warehouse/query-core/src/main/java/datawave/query/discovery/DiscoveryIterator.java
+++ b/warehouse/query-core/src/main/java/datawave/query/discovery/DiscoveryIterator.java
@@ -1,14 +1,15 @@
package datawave.query.discovery;
-import static com.google.common.collect.Collections2.filter;
-import static com.google.common.collect.Collections2.transform;
-import static com.google.common.collect.Lists.newArrayList;
-
import java.io.IOException;
-import java.util.ArrayList;
import java.util.Collection;
+import java.util.HashMap;
+import java.util.HashSet;
import java.util.List;
import java.util.Map;
+import java.util.Objects;
+import java.util.Set;
+import java.util.function.BiFunction;
+import java.util.stream.Collectors;
import org.apache.accumulo.core.data.ByteSequence;
import org.apache.accumulo.core.data.Key;
@@ -17,141 +18,312 @@
import org.apache.accumulo.core.data.Value;
import org.apache.accumulo.core.iterators.IteratorEnvironment;
import org.apache.accumulo.core.iterators.SortedKeyValueIterator;
-import org.apache.accumulo.core.util.Pair;
+import org.apache.accumulo.core.security.ColumnVisibility;
import org.apache.hadoop.io.ArrayWritable;
-import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.MapWritable;
+import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableUtils;
import org.apache.log4j.Logger;
-import com.google.common.base.Predicates;
import com.google.common.collect.ArrayListMultimap;
import com.google.common.collect.Multimap;
+import com.google.protobuf.InvalidProtocolBufferException;
+
+import datawave.ingest.protobuf.Uid;
+import datawave.marking.MarkingFunctions;
+import datawave.query.Constants;
public class DiscoveryIterator implements SortedKeyValueIterator<Key,Value> {
+
private static final Logger log = Logger.getLogger(DiscoveryIterator.class);
+ private static final MarkingFunctions markingFunctions = MarkingFunctions.Factory.createMarkingFunctions();
- private Key tk;
- private Value tv;
- private SortedKeyValueIterator<Key,Value> itr;
+ private Key key;
+ private Value value;
+ private SortedKeyValueIterator<Key,Value> iterator;
private boolean separateCountsByColVis = false;
private boolean showReferenceCount = false;
private boolean reverseIndex = false;
+ private boolean sumCounts = false;
@Override
public DiscoveryIterator deepCopy(IteratorEnvironment env) {
- DiscoveryIterator i = new DiscoveryIterator();
- i.itr = itr.deepCopy(env);
- return i;
+ DiscoveryIterator copy = new DiscoveryIterator();
+ copy.iterator = iterator.deepCopy(env);
+ return copy;
}
@Override
public void next() throws IOException {
- tk = null;
- tv = null;
+ this.key = null;
+ this.value = null;
- while (itr.hasTop() && tk == null) {
- Multimap<String,TermInfo> terms = aggregateDate();
+ while (iterator.hasTop() && key == null) {
+ // Get the entries to aggregate.
+ Multimap<String,TermEntry> terms = getTermsByDatatype();
if (terms.isEmpty()) {
- if (log.isTraceEnabled())
- log.trace("Couldn't aggregate index info; moving onto next date/field/term if data is available.");
- continue;
+ log.trace("Couldn't aggregate index info; moving onto next date/field/term if data is available.");
} else {
- if (log.isTraceEnabled())
- log.trace("Received term info multimap of size [" + terms.size() + "]");
- ArrayList<DiscoveredThing> things = newArrayList(
- filter(transform(terms.asMap().values(), new TermInfoAggregation(separateCountsByColVis, showReferenceCount, reverseIndex)),
- Predicates.notNull()));
- if (log.isTraceEnabled())
- log.trace("After conversion to discovery objects, there are [" + things.size() + "] term info objects.");
- if (things.isEmpty()) {
- continue;
- } else {
- Pair<Key,Value> top = makeTop(things);
- tk = top.getFirst();
- tv = top.getSecond();
+ // Aggregate the entries.
+ List<DiscoveredThing> things = terms.asMap().values().stream().map(this::aggregate).filter(Objects::nonNull).collect(Collectors.toList());
+ // Establish the next top of this iterator.
+ if (!things.isEmpty()) {
+ setTop(things);
return;
}
}
}
- if (log.isTraceEnabled())
- log.trace("No data found.");
+ log.trace("No data found.");
}
- private Multimap<String,TermInfo> aggregateDate() throws IOException {
- Multimap<String,TermInfo> terms = ArrayListMultimap.create();
- Key start = new Key(itr.getTopKey()), key = null;
- while (itr.hasTop() && start.equals((key = itr.getTopKey()), PartialKey.ROW_COLFAM) && datesMatch(start, key)) {
- TermInfo ti = new TermInfo(key, itr.getTopValue());
- if (ti.valid)
- terms.put(ti.datatype, ti);
+ /**
+ * Return a multimap containing mappings of datatypes to term entries that should be aggregated.
+ */
+ private Multimap<String,TermEntry> getTermsByDatatype() throws IOException {
+ Multimap<String,TermEntry> terms = ArrayListMultimap.create();
+ Key start = new Key(iterator.getTopKey());
+ Key key;
+ // If we should sum up counts, we want to collect the term entries for each date seen for the current field and term of start. Otherwise, we only want
+ // to collect the term entries for the current field, term, and date of start.
+ BiFunction<Key,Key,Boolean> dateMatchingFunction = sumCounts ? (first, second) -> true : this::datesMatch;
+ // Find all matching entries and parse term entries from them.
+ while (iterator.hasTop() && start.equals((key = iterator.getTopKey()), PartialKey.ROW_COLFAM) && dateMatchingFunction.apply(start, key)) {
+ TermEntry termEntry = new TermEntry(key, iterator.getTopValue());
+ if (termEntry.isValid())
+ terms.put(termEntry.getDatatype(), termEntry);
else {
- if (log.isTraceEnabled())
- log.trace("Received invalid term info from key: " + key);
+ if (log.isTraceEnabled()) {
+ log.trace("Received invalid term entry from key: " + key);
+ }
}
- itr.next();
+ iterator.next();
}
return terms;
}
- private static boolean datesMatch(Key reference, Key test) {
- ByteSequence a = reference.getColumnQualifierData(), b = test.getColumnQualifierData();
+ /**
+ * Return true if the dates for the two keys match, or false otherwise.
+ */
+ private boolean datesMatch(Key left, Key right) {
+ ByteSequence leftBytes = left.getColumnQualifierData();
+ ByteSequence rightBytes = right.getColumnQualifierData();
for (int i = 0; i < 8; i++) {
- if (a.byteAt(i) != b.byteAt(i)) {
+ if (leftBytes.byteAt(i) != rightBytes.byteAt(i)) {
return false;
}
}
return true;
}
- private Pair<Key,Value> makeTop(List<DiscoveredThing> things) {
- Writable[] returnedThings = new Writable[things.size()];
- for (int i = 0; i < returnedThings.length; ++i)
- returnedThings[i] = things.get(i);
- ArrayWritable aw = new ArrayWritable(DiscoveredThing.class);
- aw.set(returnedThings);
+ /**
+ * Return the given term entries aggregated into a single {@link DiscoveredThing} if possible, or return null if any issues occurred.
+ */
+ private DiscoveredThing aggregate(Collection<TermEntry> termEntries) {
+ if (termEntries.isEmpty()) {
+ return null;
+ } else {
+ TermEntry first = termEntries.iterator().next();
+ String term = reverseIndex ? new StringBuilder(first.getTerm()).reverse().toString() : first.getTerm();
+ String date = sumCounts ? "" : first.getDate();
+
+ Set<ColumnVisibility> visibilities = new HashSet<>();
+ Map<String,Long> visibilityToCounts = new HashMap<>();
+ long count = 0L;
+
+ // Aggregate the counts and visibilities from each entry.
+ for (TermEntry termEntry : termEntries) {
+ // Fetch the count to aggregate based on whether we should show the term count or the reference count.
+ long currentCount = this.showReferenceCount ? termEntry.getUidListSize() : termEntry.getUidCount();
+ try {
+ // Track the distinct visibilities seen.
+ visibilities.add(termEntry.getVisibility());
+ // If counts by visibility should be tracked, do so.
+ if (this.separateCountsByColVis) {
+ String visibility = new String(termEntry.getVisibility().flatten());
+ visibilityToCounts.compute(visibility, (k, v) -> v == null ? currentCount : v + currentCount);
+ }
+ } catch (Exception e) {
+ // If an exception occurs, skip to the next entry.
+ log.trace(e);
+ continue;
+ }
+ // Increment the overall count.
+ count += currentCount;
+ }
+
+ // If we do not have a count greater than 0, return null.
+ if (count <= 0) {
+ if (log.isTraceEnabled()) {
+ log.trace("Did not aggregate any counts for [" + first.getTerm() + "][" + first.getField() + "][" + first.getDatatype() + "]["
+ + first.getDate() + "]. Returning null");
+ }
+ return null;
+ } else {
+ // Otherwise, combine the visibilities, and return the aggregated result.
+ try {
+ ColumnVisibility visibility = markingFunctions.combine(visibilities);
+ MapWritable countsByVis = new MapWritable();
+ visibilityToCounts.forEach((key, value) -> countsByVis.put(new Text(key), new LongWritable(value)));
+ return new DiscoveredThing(term, first.getField(), first.getDatatype(), date, new String(visibility.flatten()), count, countsByVis);
+ } catch (Exception e) {
+ if (log.isTraceEnabled()) {
+ log.warn("Invalid column visibilities after combining " + visibilities);
+ }
+ return null;
+ }
+ }
+ }
+ }
+ /**
+ * Set the top {@link Key} and {@link Value} of this iterator, created from the given list of {@link DiscoveredThing} instances.
+ */
+ private void setTop(List<DiscoveredThing> things) {
+ // We want the key to be the last possible key for this date. Return the key as it is in the index (reversed if necessary) to ensure the keys are
+ // consistent with the initial seek range.
DiscoveredThing thing = things.get(0);
- // we want the key to be the last possible key for this date. Return the key as it is in the index (reversed if necessary) to
- // ensure the keys are consistent with the initial seek range.
- String row = (reverseIndex ? new StringBuilder().append(thing.getTerm()).reverse().toString() : thing.getTerm());
- return new Pair<>(new Key(row, thing.getField(), thing.getDate() + '\uffff'), new Value(WritableUtils.toByteArray(aw)));
+ String row = (this.reverseIndex ? new StringBuilder().append(thing.getTerm()).reverse().toString() : thing.getTerm());
+ Key newKey = new Key(row, thing.getField(), thing.getDate() + "\uffff");
+
+ // Create a value from the list of things.
+ ArrayWritable thingArray = new ArrayWritable(DiscoveredThing.class, things.toArray(new DiscoveredThing[0]));
+ Value newValue = new Value(WritableUtils.toByteArray(thingArray));
+
+ this.key = newKey;
+ this.value = newValue;
}
@Override
public void seek(Range range, Collection<ByteSequence> columnFamilies, boolean inclusive) throws IOException {
-
- itr.seek(range, columnFamilies, inclusive);
- if (log.isTraceEnabled())
- log.trace("My source " + (itr.hasTop() ? "does" : "does not") + " have a top.");
+ this.iterator.seek(range, columnFamilies, inclusive);
+ if (log.isTraceEnabled()) {
+ log.trace("My source " + (this.iterator.hasTop() ? "does" : "does not") + " have a top.");
+ }
next();
}
@Override
public void init(SortedKeyValueIterator<Key,Value> source, Map<String,String> options, IteratorEnvironment env) throws IOException {
- itr = source;
- separateCountsByColVis = Boolean.parseBoolean(options.get(DiscoveryLogic.SEPARATE_COUNTS_BY_COLVIS));
- showReferenceCount = Boolean.parseBoolean(options.get(DiscoveryLogic.SHOW_REFERENCE_COUNT));
- reverseIndex = Boolean.parseBoolean(options.get(DiscoveryLogic.REVERSE_INDEX));
+ this.iterator = source;
+ this.separateCountsByColVis = Boolean.parseBoolean(options.get(DiscoveryLogic.SEPARATE_COUNTS_BY_COLVIS));
+ this.showReferenceCount = Boolean.parseBoolean(options.get(DiscoveryLogic.SHOW_REFERENCE_COUNT));
+ this.reverseIndex = Boolean.parseBoolean(options.get(DiscoveryLogic.REVERSE_INDEX));
+ this.sumCounts = Boolean.parseBoolean(options.get(DiscoveryLogic.SUM_COUNTS));
if (log.isTraceEnabled()) {
- log.trace("My source is a " + source.getClass().getName());
- log.trace("Separate counts by column visibility = " + separateCountsByColVis);
- log.trace("Show reference count only = " + showReferenceCount);
+ log.trace("Source: " + source.getClass().getName());
+ log.trace("Separate counts by column visibility: " + this.separateCountsByColVis);
+ log.trace("Show reference counts only: " + this.showReferenceCount);
+ log.trace("Reverse index: " + this.reverseIndex);
+ log.trace("Sum counts: " + this.sumCounts);
}
}
@Override
public boolean hasTop() {
- return tk != null;
+ return key != null;
}
@Override
public Key getTopKey() {
- return tk;
+ return key;
}
@Override
public Value getTopValue() {
- return tv;
+ return value;
+ }
+
+ /**
+ * Represents term information parsed from a {@link Key}, {@link Value} pair.
+ */
+ private static class TermEntry {
+
+ private final String term;
+ private final String field;
+ private String date;
+ private String datatype;
+ private ColumnVisibility visibility;
+ private long uidCount;
+ private long uidListSize;
+ private boolean valid;
+
+ public TermEntry(Key key, Value value) {
+ term = key.getRow().toString();
+ field = key.getColumnFamily().toString();
+
+ String colq = key.getColumnQualifier().toString();
+ int firstSeparatorPos = colq.indexOf(Constants.NULL_BYTE_STRING);
+ if (firstSeparatorPos != -1) {
+ int lastSeparatorPos = colq.lastIndexOf(Constants.NULL_BYTE_STRING);
+ // If multiple separators are present, this is a task datatype entry.
+ if (firstSeparatorPos != lastSeparatorPos) {
+ // Ensure that we at least have yyyyMMdd.
+ if ((lastSeparatorPos - firstSeparatorPos) < 9) {
+ return;
+ }
+ // The form is datatype\0date\0task status (old knowledge entry).
+ date = colq.substring(firstSeparatorPos + 1, firstSeparatorPos + 9);
+ datatype = colq.substring(0, firstSeparatorPos);
+ } else {
+ // Ensure that we at least have yyyyMMdd.
+ if (firstSeparatorPos < 8) {
+ return;
+ }
+ // The form is shardId\0datatype.
+ date = colq.substring(0, 8);
+ datatype = colq.substring(firstSeparatorPos + 1);
+ }
+
+ // Parse the UID.List object from the value.
+ try {
+ Uid.List uidList = Uid.List.parseFrom(value.get());
+ if (uidList != null) {
+ uidCount = uidList.getCOUNT();
+ uidListSize = uidList.getUIDList().size();
+ }
+ } catch (InvalidProtocolBufferException e) {
+ // Don't add UID information. At least we know what shard it's located in.
+ }
+
+ visibility = new ColumnVisibility(key.getColumnVisibility());
+
+ // This is now considered a valid entry for aggregation.
+ valid = true;
+ }
+ }
+
+ public String getTerm() {
+ return term;
+ }
+
+ public String getField() {
+ return field;
+ }
+
+ public String getDate() {
+ return date;
+ }
+
+ public String getDatatype() {
+ return datatype;
+ }
+
+ public ColumnVisibility getVisibility() {
+ return visibility;
+ }
+
+ public long getUidCount() {
+ return uidCount;
+ }
+
+ public long getUidListSize() {
+ return uidListSize;
+ }
+
+ public boolean isValid() {
+ return valid;
+ }
}
}
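For reference, a sketch of wiring the new sum.counts option into a scan alongside the existing options; the priority and iterator name are illustrative assumptions, and the option keys are the DiscoveryLogic constants documented below (configureDiscoveryIterator in DiscoveryLogic performs this setup for real queries).

```java
import org.apache.accumulo.core.client.IteratorSetting;
import org.apache.accumulo.core.client.ScannerBase;

import datawave.query.discovery.DiscoveryIterator;
import datawave.query.discovery.DiscoveryLogic;

class DiscoveryIteratorSetupExample {
    static void configure(ScannerBase scanner) {
        IteratorSetting setting = new IteratorSetting(50, "discovery", DiscoveryIterator.class);
        setting.addOption(DiscoveryLogic.REVERSE_INDEX, "false");
        setting.addOption(DiscoveryLogic.SEPARATE_COUNTS_BY_COLVIS, "true");
        setting.addOption(DiscoveryLogic.SHOW_REFERENCE_COUNT, "false");
        // New: collapse all dates for a term into one summed count; the emitted
        // DiscoveredThing then carries an empty date (see aggregate() above).
        setting.addOption(DiscoveryLogic.SUM_COUNTS, "true");
        scanner.addScanIterator(setting);
    }
}
```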
diff --git a/warehouse/query-core/src/main/java/datawave/query/discovery/DiscoveryLogic.java b/warehouse/query-core/src/main/java/datawave/query/discovery/DiscoveryLogic.java
index 91424b1afb1..6dd595b8792 100644
--- a/warehouse/query-core/src/main/java/datawave/query/discovery/DiscoveryLogic.java
+++ b/warehouse/query-core/src/main/java/datawave/query/discovery/DiscoveryLogic.java
@@ -51,6 +51,7 @@
import datawave.core.query.configuration.QueryData;
import datawave.data.type.Type;
import datawave.microservice.query.Query;
+import datawave.microservice.query.QueryImpl;
import datawave.query.Constants;
import datawave.query.QueryParameters;
import datawave.query.discovery.FindLiteralsAndPatternsVisitor.QueryValues;
@@ -72,18 +73,45 @@ public class DiscoveryLogic extends ShardIndexQueryTable {
private static final Logger log = Logger.getLogger(DiscoveryLogic.class);
+ /**
+ * Used to specify if counts should be separated by column visibility.
+ */
public static final String SEPARATE_COUNTS_BY_COLVIS = "separate.counts.by.colvis";
+
+ /**
+ * Used to specify if reference counts should be shown instead of term counts.
+ */
public static final String SHOW_REFERENCE_COUNT = "show.reference.count";
+
+ /**
+ * Used to specify whether to sum up the counts instead of returning counts per date.
+ */
+ public static final String SUM_COUNTS = "sum.counts";
+
+ /**
+ * Used to specify whether to search against the reversed index.
+ */
public static final String REVERSE_INDEX = "reverse.index";
+
private DiscoveryQueryConfiguration config;
private MetadataHelper metadataHelper;
+ /**
+ * Basic constructor.
+ */
public DiscoveryLogic() {
super();
}
+ /**
+ * Copy constructor.
+ *
+ * @param other
+ * the other logic to copy
+ */
public DiscoveryLogic(DiscoveryLogic other) {
super(other);
+ this.config = new DiscoveryQueryConfiguration(other.config);
this.metadataHelper = other.metadataHelper;
}
@@ -92,7 +120,6 @@ public DiscoveryQueryConfiguration getConfig() {
if (this.config == null) {
this.config = DiscoveryQueryConfiguration.create();
}
-
return this.config;
}
@@ -111,56 +138,48 @@ public GenericQueryConfiguration initialize(AccumuloClient client, Query setting
log.debug("Query parameters set to " + settings.getParameters());
}
- // Check if the default modelName and modelTableNames have been overriden by custom parameters.
- if (null != settings.findParameter(QueryParameters.PARAMETER_MODEL_NAME)
- && !settings.findParameter(QueryParameters.PARAMETER_MODEL_NAME).getParameterValue().trim().isEmpty()) {
- setModelName(settings.findParameter(QueryParameters.PARAMETER_MODEL_NAME).getParameterValue().trim());
- }
- if (null != settings.findParameter(QueryParameters.PARAMETER_MODEL_TABLE_NAME)
- && !settings.findParameter(QueryParameters.PARAMETER_MODEL_TABLE_NAME).getParameterValue().trim().isEmpty()) {
- setModelTableName(settings.findParameter(QueryParameters.PARAMETER_MODEL_TABLE_NAME).getParameterValue().trim());
- }
+ // Check if the default model name and model table name have been overridden.
+ setModelName(getOrDefault(settings, QueryParameters.PARAMETER_MODEL_NAME, getConfig().getModelName()));
+ setModelTableName(getOrDefault(settings, QueryParameters.PARAMETER_MODEL_TABLE_NAME, getConfig().getModelTableName()));
- // Check if user would like counts separated by column visibility
- if (null != settings.findParameter(SEPARATE_COUNTS_BY_COLVIS)
- && !settings.findParameter(SEPARATE_COUNTS_BY_COLVIS).getParameterValue().trim().isEmpty()) {
- boolean separateCountsByColVis = Boolean.valueOf(settings.findParameter(SEPARATE_COUNTS_BY_COLVIS).getParameterValue().trim());
- getConfig().setSeparateCountsByColVis(separateCountsByColVis);
- }
+ // Check if counts should be separated by column visibility.
+ setSeparateCountsByColVis(getOrDefaultBoolean(settings, SEPARATE_COUNTS_BY_COLVIS, getSeparateCountsByColVis()));
- // Check if user would like to show reference counts instead of term counts
- if (null != settings.findParameter(SHOW_REFERENCE_COUNT) && !settings.findParameter(SHOW_REFERENCE_COUNT).getParameterValue().trim().isEmpty()) {
- boolean showReferenceCount = Boolean.valueOf(settings.findParameter(SHOW_REFERENCE_COUNT).getParameterValue().trim());
- getConfig().setShowReferenceCount(showReferenceCount);
- }
+ // Check if reference counts should be shown.
+ setShowReferenceCount(getOrDefaultBoolean(settings, SHOW_REFERENCE_COUNT, getShowReferenceCount()));
+
+ // Check if counts should be summed.
+ setSumCounts(getOrDefaultBoolean(settings, SUM_COUNTS, getSumCounts()));
+
+ // Check if any datatype filters were specified.
+ getConfig().setDatatypeFilter(getOrDefaultSet(settings, QueryParameters.DATATYPE_FILTER_SET, getConfig().getDatatypeFilter()));
+
+ // Update the query model.
setQueryModel(metadataHelper.getQueryModel(getModelTableName(), getModelName(), null));
- // get the data type filter set if any
- if (null != settings.findParameter(QueryParameters.DATATYPE_FILTER_SET)
- && !settings.findParameter(QueryParameters.DATATYPE_FILTER_SET).getParameterValue().trim().isEmpty()) {
- Set<String> dataTypeFilter = new HashSet<>(Arrays.asList(StringUtils
- .split(settings.findParameter(QueryParameters.DATATYPE_FILTER_SET).getParameterValue().trim(), Constants.PARAM_VALUE_SEP)));
- getConfig().setDatatypeFilter(dataTypeFilter);
- if (log.isDebugEnabled()) {
- log.debug("Data type filter set to " + dataTypeFilter);
- }
- }
- // Set the connector
+ // Set the connector.
getConfig().setClient(client);
- // Set the auths
+
+ // Set the auths.
getConfig().setAuthorizations(auths);
- // Get the ranges
+ // Get the ranges.
getConfig().setBeginDate(settings.getBeginDate());
getConfig().setEndDate(settings.getEndDate());
- if (null == getConfig().getBeginDate() || null == getConfig().getEndDate()) {
- getConfig().setBeginDate(new Date(0));
+ // If a begin date was not specified, default to the earliest date.
+ if (getConfig().getBeginDate() == null) {
+ getConfig().setBeginDate(new Date(0L));
+ log.warn("Begin date not specified, using earliest begin date.");
+ }
+
+ // If an end date was not specified, default to the latest date.
+ if (getConfig().getEndDate() == null) {
getConfig().setEndDate(new Date(Long.MAX_VALUE));
- log.warn("Dates not specified, using entire date range");
+ log.warn("End date not specified, using latest end date.");
}
- // start with a trimmed version of the query, converted to JEXL
+ // Start with a trimmed version of the query, converted to JEXL
LuceneToJexlQueryParser parser = new LuceneToJexlQueryParser();
parser.setAllowLeadingWildCard(isAllowLeadingWildcard());
QueryNode node = parser.parse(settings.getQuery().trim());
@@ -173,9 +192,9 @@ public GenericQueryConfiguration initialize(AccumuloClient client, Query setting
// Parse & flatten the query
ASTJexlScript script = JexlASTHelper.parseAndFlattenJexlQuery(getConfig().getQueryString());
+ CaseSensitivityVisitor.upperCaseIdentifiers(getConfig(), metadataHelper, script);
- script = CaseSensitivityVisitor.upperCaseIdentifiers(getConfig(), metadataHelper, script);
-
+ // Apply the query model.
Set<String> dataTypes = getConfig().getDatatypeFilter();
Set<String> allFields;
allFields = metadataHelper.getAllFields(dataTypes);
@@ -183,14 +202,13 @@ public GenericQueryConfiguration initialize(AccumuloClient client, Query setting
QueryValues literalsAndPatterns = FindLiteralsAndPatternsVisitor.find(script);
Stopwatch timer = Stopwatch.createStarted();
- // no caching for getAllNormalizers, so try some magic with getFields...
+ // No caching for getAllNormalizers, so try some magic with getFields...
Multimap<String,Type<?>> dataTypeMap = ArrayListMultimap.create(metadataHelper.getFieldsToDatatypes(getConfig().getDatatypeFilter()));
- /*
- * we have a mapping of FIELD->DataType, but not a mapping of ANYFIELD->DataType which should be all dataTypes
- */
- dataTypeMap.putAll(Constants.ANY_FIELD, uniqueByType(dataTypeMap.values()));
+ // We have a mapping of FIELD->DataType, but not a mapping of ANYFIELD->DataType which should be all datatypes.
+ dataTypeMap.putAll(Constants.ANY_FIELD, getUniqueTypes(dataTypeMap.values()));
timer.stop();
log.debug("Took " + timer.elapsed(TimeUnit.MILLISECONDS) + "ms to get all the dataTypes.");
+
getConfig().setLiterals(normalize(new LiteralNormalization(), literalsAndPatterns.getLiterals(), dataTypeMap));
getConfig().setPatterns(normalize(new PatternNormalization(), literalsAndPatterns.getPatterns(), dataTypeMap));
getConfig().setRanges(normalizeRanges(new LiteralNormalization(), literalsAndPatterns.getRanges(), dataTypeMap));
@@ -199,44 +217,143 @@ public GenericQueryConfiguration initialize(AccumuloClient client, Query setting
log.debug("Normalized Patterns = " + getConfig().getPatterns());
}
+ // Set the planned queries to execute.
getConfig().setQueries(createQueries(getConfig()));
return getConfig();
}
- public List<QueryData> createQueries(DiscoveryQueryConfiguration config) throws QueryException, TableNotFoundException, IOException, ExecutionException {
- final List<QueryData> queries = Lists.newLinkedList();
+ /**
+ * If present, return the value of the given parameter from the given settings, or return the default value otherwise.
+ */
+ private String getOrDefault(Query settings, String parameterName, String defaultValue) {
+ String value = getTrimmedParameter(settings, parameterName);
+ return StringUtils.isBlank(value) ? defaultValue : value;
+ }
- Set<String> familiesToSeek = Sets.newHashSet();
- Pair<Set<Range>,Set<Range>> seekRanges = makeRanges(getConfig(), familiesToSeek, metadataHelper);
- Collection<Range> forward = seekRanges.getValue0();
+ /**
+ * If present, return the value of the given parameter from the given settings as a boolean, or return the default value otherwise.
+ */
+ private boolean getOrDefaultBoolean(Query settings, String parameterName, boolean defaultValue) {
+ String value = getTrimmedParameter(settings, parameterName);
+ log.debug("Trimmed value for " + parameterName + ": " + value);
+ return StringUtils.isBlank(value) ? defaultValue : Boolean.parseBoolean(value);
+ }
- if (!forward.isEmpty()) {
- List<IteratorSetting> settings = getIteratorSettingsForDiscovery(getConfig(), getConfig().getLiterals(), getConfig().getPatterns(),
- getConfig().getRanges(), false);
- if (isCheckpointable()) {
- // if checkpointable, then only one range per query data so that the whole checkpointing thing works correctly
- for (Range range : forward) {
- queries.add(new QueryData(config.getIndexTableName(), null, Collections.singleton(range), familiesToSeek, settings));
+ /**
+ * If present, return the value of the given parameter from the given settings as a set, or return the default value otherwise.
+ */
+ private Set<String> getOrDefaultSet(Query settings, String parameterName, Set<String> defaultValue) {
+ String value = getTrimmedParameter(settings, parameterName);
+ return StringUtils.isBlank(value) ? defaultValue : new HashSet<>(Arrays.asList(StringUtils.split(value, Constants.PARAM_VALUE_SEP)));
+ }
+
+ /**
+ * Return the trimmed value of the given parameter from the given settings, or null if a value is not present.
+ */
+ private String getTrimmedParameter(Query settings, String parameterName) {
+ QueryImpl.Parameter parameter = settings.findParameter(parameterName);
+ return parameter != null ? parameter.getParameterValue().trim() : null;
+ }
+
+ /**
+ * Given a sequence of objects of type T, this method will return a single object for every unique type passed in. This is used to dedupe normalizer
+ * instances by their type, so that we only get 1 instance per type of normalizer.
+ */
+ private Collection<Type<?>> getUniqueTypes(Iterable<Type<?>> things) {
+ Map<Class<?>,Type<?>> map = Maps.newHashMap();
+ for (Type<?> t : things) {
+ map.put(t.getClass(), t);
+ }
+ return map.values();
+ }
+
+ /**
+ * This attempts to normalize all of the {@code <value, field>} tuples with the corresponding {@code <field, dataType>} tuple. The Normalization object
+ * will determine whether a regex or literal is being normalized.
+ *
+ * See the {@link PatternNormalization} and {@link LiteralNormalization} implementations.
+ *
+ * @param normalization
+ * the normalizer object
+ * @param valuesToFields
+ * mapping of values to fields
+ * @param dataTypeMap
+ * the data type map
+ * @return a mapping of the normalized tuples
+ */
+ private Multimap<String,String> normalize(Normalization normalization, Multimap<String,String> valuesToFields, Multimap<String,Type<?>> dataTypeMap) {
+ Multimap<String,String> normalizedValuesToFields = HashMultimap.create();
+ for (Entry<String,String> valueAndField : valuesToFields.entries()) {
+ String value = valueAndField.getKey(), field = valueAndField.getValue();
+ for (Type<?> dataType : dataTypeMap.get(field)) {
+ try {
+ log.debug("Attempting to normalize [" + value + "] with [" + dataType.getClass() + "]");
+ String normalized = normalization.normalize(dataType, field, value);
+ normalizedValuesToFields.put(normalized, field);
+ log.debug("Normalization succeeded!");
+ } catch (Exception exception) {
+ log.debug("Normalization failed.");
}
- } else {
- queries.add(new QueryData(config.getIndexTableName(), null, forward, familiesToSeek, settings));
}
}
+ return normalizedValuesToFields;
+ }
- Collection<Range> reverse = seekRanges.getValue1();
- if (!reverse.isEmpty()) {
- List<IteratorSetting> settings = getIteratorSettingsForDiscovery(getConfig(), getConfig().getLiterals(), getConfig().getPatterns(),
- getConfig().getRanges(), true);
- if (isCheckpointable()) {
- // if checkpointable, then only one range per query data so that the whole checkpointing thing works correctly
- for (Range range : reverse) {
- queries.add(new QueryData(config.getReverseIndexTableName(), null, Collections.singleton(range), familiesToSeek, settings));
+ /**
+ * This attempts to normalize all of the {@code <value, field>} tuples with the corresponding {@code <field, dataType>} tuple. The Normalization object
+ * will determine whether a regex or literal is being normalized.
+ *
+ * See the {@link PatternNormalization} and {@link LiteralNormalization} implementations.
+ *
+ * @param normalization
+ * the normalizer object
+ * @param valuesToFields
+ * mapping of values to fields
+ * @param dataTypeMap
+ * the data type map
+ * @return a mapping of the normalized ranges
+ */
+ private Multimap<String,LiteralRange<String>> normalizeRanges(Normalization normalization, Multimap<String,LiteralRange<String>> valuesToFields,
+ Multimap<String,Type<?>> dataTypeMap) {
+ Multimap<String,LiteralRange<String>> normalizedValuesToFields = HashMultimap.create();
+ for (Entry<String,LiteralRange<String>> valueAndField : valuesToFields.entries()) {
+ String field = valueAndField.getKey();
+ LiteralRange<String> value = valueAndField.getValue();
+ for (Type<?> dataType : dataTypeMap.get(field)) {
+ try {
+ log.debug("Attempting to normalize [" + value + "] with [" + dataType.getClass() + "]");
+ String normalizedLower = normalization.normalize(dataType, field, value.getLower().toString());
+ String normalizedUpper = normalization.normalize(dataType, field, value.getUpper().toString());
+ normalizedValuesToFields.put(field, new LiteralRange<>(normalizedLower, value.isLowerInclusive(), normalizedUpper, value.isUpperInclusive(),
+ value.getFieldName(), value.getNodeOperand()));
+ log.debug("Normalization succeeded!");
+ } catch (Exception exception) {
+ log.debug("Normalization failed.");
}
- } else {
- queries.add(new QueryData(config.getReverseIndexTableName(), null, reverse, familiesToSeek, settings));
}
}
+ return normalizedValuesToFields;
+ }
+
+ /**
+ * Create and return a list of planned queries.
+ *
+ * @param config
+ * the config
+ * @return the list of query data
+ */
+ private List<QueryData> createQueries(DiscoveryQueryConfiguration config) throws TableNotFoundException, ExecutionException {
+ final List<QueryData> queries = Lists.newLinkedList();
+
+ Set<String> familiesToSeek = Sets.newHashSet(); // This will be populated by createRanges().
+ Pair<Set<Range>,Set<Range>> seekRanges = createRanges(config, familiesToSeek, metadataHelper);
+
+ // Create the forward queries.
+ queries.addAll(createQueriesFromRanges(config, seekRanges.getValue0(), familiesToSeek, false));
+
+ // Create the reverse queries.
+ queries.addAll(createQueriesFromRanges(config, seekRanges.getValue1(), familiesToSeek, true));
if (log.isDebugEnabled()) {
log.debug("Created ranges: " + queries);
@@ -245,67 +362,161 @@ public List createQueries(DiscoveryQueryConfiguration config) throws
return queries;
}
- @Override
- public void setupQuery(GenericQueryConfiguration genericConfig) throws QueryException, TableNotFoundException, IOException, ExecutionException {
- if (!genericConfig.getClass().getName().equals(DiscoveryQueryConfiguration.class.getName())) {
- throw new QueryException("Did not receive a DiscoveryQueryConfiguration instance!!");
+ /**
+ * Create planned queries for the given ranges.
+ *
+ * @param config
+ * the config
+ * @param ranges
+ * the ranges
+ * @param familiesToSeek
+ * the families to seek
+ * @param reversed
+ * whether the ranges are for the reversed index
+ * @return the queries
+ */
+ private List<QueryData> createQueriesFromRanges(DiscoveryQueryConfiguration config, Set<Range> ranges, Set<String> familiesToSeek, boolean reversed) {
+ List<QueryData> queries = new ArrayList<>();
+ if (!ranges.isEmpty()) {
+ List<IteratorSetting> settings = getIteratorSettings(config, reversed);
+ String tableName = reversed ? config.getReverseIndexTableName() : config.getIndexTableName();
+ if (isCheckpointable()) {
+ for (Range range : ranges) {
+ queries.add(new QueryData(tableName, null, Collections.singleton(range), familiesToSeek, settings));
+ }
+ } else {
+ queries.add(new QueryData(tableName, null, ranges, familiesToSeek, settings));
+ }
}
- this.config = (DiscoveryQueryConfiguration) genericConfig;
- final List<Iterator<DiscoveredThing>> iterators = Lists.newArrayList();
+ return queries;
+ }
- for (QueryData qd : config.getQueries()) {
- if (log.isDebugEnabled()) {
- log.debug("Creating scanner for " + qd);
+ /**
+ * Creates two collections of ranges: one for the forward index (value0) and one for the reverse index (value1). If a literal has a field name, then the
+ * Range for that term will include the column family. If there are multiple fields, then multiple ranges are created.
+ *
+ * @param config
+ * the discovery config
+ * @param familiesToSeek
+ * the families to seek
+ * @param metadataHelper
+ * a metadata helper
+ * @return a pair of ranges
+ * @throws TableNotFoundException
+ * if the table is not found
+ * @throws ExecutionException
+ * for execution exceptions
+ */
+ private Pair<Set<Range>,Set<Range>> createRanges(DiscoveryQueryConfiguration config, Set<String> familiesToSeek, MetadataHelper metadataHelper)
+ throws TableNotFoundException, ExecutionException {
+ Set<Range> forwardRanges = new HashSet<>();
+ Set<Range> reverseRanges = new HashSet<>();
+
+ // Evaluate the literals.
+ for (Entry<String,String> literalAndField : config.getLiterals().entries()) {
+ String literal = literalAndField.getKey(), field = literalAndField.getValue();
+ // If the field is _ANYFIELD_, use null when making the range.
+ field = Constants.ANY_FIELD.equals(field) ? null : field;
+ // Mark the field as a family to seek if not null.
+ if (field != null) {
+ familiesToSeek.add(field);
}
- // scan the table
- BatchScanner bs = scannerFactory.newScanner(qd.getTableName(), config.getAuthorizations(), config.getNumQueryThreads(), config.getQuery());
+ forwardRanges.add(ShardIndexQueryTableStaticMethods.getLiteralRange(field, literal));
+ }
- bs.setRanges(qd.getRanges());
- for (IteratorSetting setting : qd.getSettings()) {
- bs.addScanIterator(setting);
+ // Evaluate the ranges.
+ for (Entry<String,LiteralRange<String>> rangeEntry : config.getRanges().entries()) {
+ LiteralRange<String> range = rangeEntry.getValue();
+ String field = rangeEntry.getKey();
+ // If the field is _ANYFIELD_, use null when making the range.
+ field = Constants.ANY_FIELD.equals(field) ? null : field;
+ // Mark the field as a family to seek if not null.
+ if (field != null) {
+ familiesToSeek.add(field);
}
- for (String cf : qd.getColumnFamilies()) {
- bs.fetchColumnFamily(new Text(cf));
+ try {
+ forwardRanges.add(ShardIndexQueryTableStaticMethods.getBoundedRangeRange(range));
+ } catch (IllegalRangeArgumentException e) {
+ log.error("Error using range [" + range + "]", e);
}
+ }
- iterators.add(transformScanner(bs, qd));
+ // Evaluate the patterns.
+ for (Entry<String,String> patternAndField : config.getPatterns().entries()) {
+ String pattern = patternAndField.getKey(), field = patternAndField.getValue();
+ // If the field is _ANYFIELD_, use null when making the range.
+ field = Constants.ANY_FIELD.equals(field) ? null : field;
+ // Mark the field as a family to seek if not null.
+ if (field != null) {
+ familiesToSeek.add(field);
+ }
+ ShardIndexQueryTableStaticMethods.RefactoredRangeDescription description;
+ try {
+ description = ShardIndexQueryTableStaticMethods.getRegexRange(field, pattern, false, metadataHelper, config);
+ } catch (JavaRegexParseException e) {
+ log.error("Error parsing pattern [" + pattern + "]", e);
+ continue;
+ }
+ if (description.isForReverseIndex) {
+ reverseRanges.add(description.range);
+ } else {
+ forwardRanges.add(description.range);
+ }
}
- this.iterator = concat(iterators.iterator());
- }
- public static List<IteratorSetting> getIteratorSettingsForDiscovery(DiscoveryQueryConfiguration config, Multimap<String,String> literals,
- Multimap<String,String> patterns, Multimap<String,LiteralRange<String>> ranges, boolean reverseIndex) {
+ return Pair.with(forwardRanges, reverseRanges);
+ }
+ /**
+ * Return the set of iterator settings that should be applied to queries for the given configuration.
+ *
+ * @param config
+ * the config
+ * @param reverseIndex
+ * whether the iterator settings should be configured for a reversed index
+ * @return the iterator settings
+ */
+ private List<IteratorSetting> getIteratorSettings(DiscoveryQueryConfiguration config, boolean reverseIndex) {
List<IteratorSetting> settings = Lists.newLinkedList();
- // The begin date from the query may be down to the second, for doing lookups in the index we want to use the day because
- // the times in the index table have been truncated to the day.
+
+ // Add a date range filter.
+ // The begin date from the query may be down to the second, for doing look-ups in the index we want to use the day because the times in the index table
+ // have been truncated to the day.
Date begin = DateUtils.truncate(config.getBeginDate(), Calendar.DAY_OF_MONTH);
- // we don't need to bump up the end date any more because it's not apart of the range set on the scanner
+ // we don't need to bump up the end date any more because it's not a part of the range set on the scanner.
Date end = config.getEndDate();
-
LongRange dateRange = new LongRange(begin.getTime(), end.getTime());
-
settings.add(ShardIndexQueryTableStaticMethods.configureGlobalIndexDateRangeFilter(config, dateRange));
+
+ // Add a datatype filter.
settings.add(ShardIndexQueryTableStaticMethods.configureGlobalIndexDataTypeFilter(config, config.getDatatypeFilter()));
- IteratorSetting matchingIterator = configureIndexMatchingIterator(config, literals, patterns, ranges, reverseIndex);
+ // Add an iterator to match literals, patterns, and ranges against the index.
+ IteratorSetting matchingIterator = configureIndexMatchingIterator(config, reverseIndex);
if (matchingIterator != null) {
settings.add(matchingIterator);
}
- IteratorSetting discoveryIteratorSetting = new IteratorSetting(config.getBaseIteratorPriority() + 50, DiscoveryIterator.class);
- discoveryIteratorSetting.addOption(REVERSE_INDEX, Boolean.toString(reverseIndex));
- discoveryIteratorSetting.addOption(SEPARATE_COUNTS_BY_COLVIS, config.getSeparateCountsByColVis().toString());
- if (config.getShowReferenceCount()) {
- discoveryIteratorSetting.addOption(SHOW_REFERENCE_COUNT, config.getShowReferenceCount().toString());
- }
- settings.add(discoveryIteratorSetting);
+ // Add an iterator to create the actual DiscoveryThings.
+ settings.add(configureDiscoveryIterator(config, reverseIndex));
return settings;
}
- public static final IteratorSetting configureIndexMatchingIterator(DiscoveryQueryConfiguration config, Multimap literals,
- Multimap patterns, Multimap> ranges, boolean reverseIndex) {
+ /**
+ * Return an {@link IteratorSetting} for an {@link IndexMatchingIterator}.
+ *
+ * @param config
+ * the config
+ * @param reverseIndex
+ * whether searching against the reversed index.
+ * @return the iterator setting
+ */
+ private IteratorSetting configureIndexMatchingIterator(DiscoveryQueryConfiguration config, boolean reverseIndex) {
+ Multimap<String,String> literals = config.getLiterals();
+ Multimap<String,String> patterns = config.getPatterns();
+ Multimap<String,LiteralRange<String>> ranges = config.getRanges();
+
if ((literals == null || literals.isEmpty()) && (patterns == null || patterns.isEmpty()) && (ranges == null || ranges.isEmpty())) {
return null;
}
@@ -314,6 +525,7 @@ public static final IteratorSetting configureIndexMatchingIterator(DiscoveryQuer
IteratorSetting cfg = new IteratorSetting(config.getBaseIteratorPriority() + 23, "termMatcher", IndexMatchingIterator.class);
IndexMatchingIterator.Configuration conf = new IndexMatchingIterator.Configuration();
+ // Add literals.
if (literals != null) {
for (Entry<String,String> literal : literals.entries()) {
if (Constants.ANY_FIELD.equals(literal.getValue())) {
@@ -323,6 +535,7 @@ public static final IteratorSetting configureIndexMatchingIterator(DiscoveryQuer
}
}
}
+ // Add patterns.
if (patterns != null) {
for (Entry<String,String> pattern : patterns.entries()) {
if (Constants.ANY_FIELD.equals(pattern.getValue())) {
@@ -332,6 +545,7 @@ public static final IteratorSetting configureIndexMatchingIterator(DiscoveryQuer
}
}
}
+ // Add ranges.
if (ranges != null) {
for (Entry<String,LiteralRange<String>> range : ranges.entries()) {
if (Constants.ANY_FIELD.equals(range.getKey())) {
@@ -343,12 +557,57 @@ public static final IteratorSetting configureIndexMatchingIterator(DiscoveryQuer
}
cfg.addOption(IndexMatchingIterator.CONF, IndexMatchingIterator.gson().toJson(conf));
-
cfg.addOption(IndexMatchingIterator.REVERSE_INDEX, Boolean.toString(reverseIndex));
return cfg;
}
+ /**
+ * Return an {@link IteratorSetting} for a {@link DiscoveryIterator}.
+ *
+ * @param config
+ * the config
+ * @param reverseIndex
+ * whether the query is searching against the reverse index
+ * @return the iterator setting
+ */
+ private IteratorSetting configureDiscoveryIterator(DiscoveryQueryConfiguration config, boolean reverseIndex) {
+ IteratorSetting setting = new IteratorSetting(config.getBaseIteratorPriority() + 50, DiscoveryIterator.class);
+ setting.addOption(REVERSE_INDEX, Boolean.toString(reverseIndex));
+ setting.addOption(SEPARATE_COUNTS_BY_COLVIS, Boolean.toString(config.getSeparateCountsByColVis()));
+ setting.addOption(SHOW_REFERENCE_COUNT, Boolean.toString(config.getShowReferenceCount()));
+ setting.addOption(SUM_COUNTS, Boolean.toString(config.getSumCounts()));
+ return setting;
+ }
+
+ @Override
+ public void setupQuery(GenericQueryConfiguration genericConfig) throws QueryException, TableNotFoundException, IOException, ExecutionException {
+ if (!genericConfig.getClass().getName().equals(DiscoveryQueryConfiguration.class.getName())) {
+ throw new QueryException("Did not receive a DiscoveryQueryConfiguration instance!!");
+ }
+ this.config = (DiscoveryQueryConfiguration) genericConfig;
+ final List> iterators = Lists.newArrayList();
+
+ for (QueryData qd : config.getQueries()) {
+ if (log.isDebugEnabled()) {
+ log.debug("Creating scanner for " + qd);
+ }
+ // scan the table
+ BatchScanner bs = scannerFactory.newScanner(qd.getTableName(), config.getAuthorizations(), config.getNumQueryThreads(), config.getQuery());
+
+ bs.setRanges(qd.getRanges());
+ for (IteratorSetting setting : qd.getSettings()) {
+ bs.addScanIterator(setting);
+ }
+ for (String cf : qd.getColumnFamilies()) {
+ bs.fetchColumnFamily(new Text(cf));
+ }
+
+ iterators.add(transformScanner(bs, qd));
+ }
+ this.iterator = concat(iterators.iterator());
+ }
+
@Override
public ShardIndexQueryTable clone() {
return new DiscoveryLogic(this);
@@ -361,7 +620,7 @@ public ShardIndexQueryTable clone() {
* a batch scanner
* @return iterator for DiscoveredThings
*/
- public static Iterator<DiscoveredThing> transformScanner(final BatchScanner scanner, final QueryData queryData) {
+ private Iterator<DiscoveredThing> transformScanner(final BatchScanner scanner, final QueryData queryData) {
return concat(transform(scanner.iterator(), new Function<Entry<Key,Value>,Iterator<DiscoveredThing>>() {
DataInputBuffer in = new DataInputBuffer();
@@ -386,183 +645,35 @@ public Iterator apply(Entry from) {
}));
}
- /**
- * Makes two collections of ranges: one for the forward index (value0) and one for the reverse index (value1).
- *
- * If a literal has a field name, then the Range for that term will include the column family. If there are multiple fields, then multiple ranges are
- * created.
- *
- * @param config
- * the discovery config
- * @param familiesToSeek
- * the families to seek
- * @param metadataHelper
- * a metadata helper
- * @return a pair of ranges
- * @throws TableNotFoundException
- * if the table is not found
- * @throws ExecutionException
- * for execution exceptions
- */
- @SuppressWarnings("unchecked")
- public static Pair<Set<Range>,Set<Range>> makeRanges(DiscoveryQueryConfiguration config, Set<String> familiesToSeek, MetadataHelper metadataHelper)
- throws TableNotFoundException, ExecutionException {
- Set forwardRanges = new HashSet<>();
- for (Entry literalAndField : config.getLiterals().entries()) {
- String literal = literalAndField.getKey(), field = literalAndField.getValue();
- // if we're _ANYFIELD_, then use null when making the literal range
- field = Constants.ANY_FIELD.equals(field) ? null : field;
- if (field != null) {
- familiesToSeek.add(field);
- }
- forwardRanges.add(ShardIndexQueryTableStaticMethods.getLiteralRange(field, literal));
- }
- for (Entry<String,LiteralRange<String>> rangeEntry : config.getRanges().entries()) {
- LiteralRange<String> range = rangeEntry.getValue();
- String field = rangeEntry.getKey();
- // if we're _ANYFIELD_, then use null when making the literal range
- field = Constants.ANY_FIELD.equals(field) ? null : field;
- if (field != null) {
- familiesToSeek.add(field);
- }
- try {
- forwardRanges.add(ShardIndexQueryTableStaticMethods.getBoundedRangeRange(range));
- } catch (IllegalRangeArgumentException e) {
- log.error("Error using range [" + range + "]", e);
- continue;
- }
- }
- Set reverseRanges = new HashSet<>();
- for (Entry patternAndField : config.getPatterns().entries()) {
- String pattern = patternAndField.getKey(), field = patternAndField.getValue();
- // if we're _ANYFIELD_, then use null when making the literal range
- field = Constants.ANY_FIELD.equals(field) ? null : field;
- ShardIndexQueryTableStaticMethods.RefactoredRangeDescription description;
- try {
- if (field != null) {
- familiesToSeek.add(field);
- }
- description = ShardIndexQueryTableStaticMethods.getRegexRange(field, pattern, false, metadataHelper, config);
- } catch (JavaRegexParseException e) {
- log.error("Error parsing pattern [" + pattern + "]", e);
- continue;
- }
- if (description.isForReverseIndex) {
- reverseRanges.add(description.range);
- } else {
- forwardRanges.add(description.range);
- }
- }
- return Pair.with(forwardRanges, reverseRanges);
- }
-
- /**
- * This attempts to normalize all of the {@code <value, field>} tuples with the corresponding {@code <field, dataType>} tuple. The Normalization object
- * will determine whether or not a regex or literal is being normalized.
- *
- * See the {@link PatternNormalization} and {@link LiteralNormalization} implementations.
- *
- * @param normalization
- * the normalizer object
- * @param valuesToFields
- * mapping of values to fields
- * @param dataTypeMap
- * the data type map
- * @return a mapping of the normalized tuples
- */
- public static Multimap<String,String> normalize(Normalization normalization, Multimap<String,String> valuesToFields, Multimap<String,Type<?>> dataTypeMap) {
- Multimap<String,String> normalizedValuesToFields = HashMultimap.create();
- for (Entry<String,String> valueAndField : valuesToFields.entries()) {
- String value = valueAndField.getKey(), field = valueAndField.getValue();
- for (Type<?> dataType : dataTypeMap.get(field)) {
- try {
- log.debug("Attempting to normalize [" + value + "] with [" + dataType.getClass() + "]");
- String normalized = normalization.normalize(dataType, field, value);
- normalizedValuesToFields.put(normalized, field);
- log.debug("Normalization succeeded!");
- } catch (Exception exception) {
- log.debug("Normalization failed.");
- }
- }
- }
- return normalizedValuesToFields;
- }
-
- /**
- * This attempts to normalize all of the {@code <value, field>} tuples with the corresponding {@code <field, dataType>} tuple. The Normalization object
- * will determine whether or not a regex or literal is being normalized.
- *
- * See the {@link PatternNormalization} and {@link LiteralNormalization} implementations.
- *
- * @param normalization
- * the normalizer object
- * @param valuesToFields
- * mapping of values to fields
- * @param dataTypeMap
- * the data type map
- * @return a mapping of the normalized ranges
- */
- public static Multimap<String,LiteralRange<String>> normalizeRanges(Normalization normalization, Multimap<String,LiteralRange<String>> valuesToFields,
- Multimap<String,Type<?>> dataTypeMap) {
- Multimap<String,LiteralRange<String>> normalizedValuesToFields = HashMultimap.create();
- for (Entry<String,LiteralRange<String>> valueAndField : valuesToFields.entries()) {
- String field = valueAndField.getKey();
- LiteralRange<String> value = valueAndField.getValue();
- for (Type<?> dataType : dataTypeMap.get(field)) {
- try {
- log.debug("Attempting to normalize [" + value + "] with [" + dataType.getClass() + "]");
- String normalizedLower = normalization.normalize(dataType, field, value.getLower().toString());
- String normalizedUpper = normalization.normalize(dataType, field, value.getUpper().toString());
- normalizedValuesToFields.put(field, new LiteralRange<>(normalizedLower, value.isLowerInclusive(), normalizedUpper, value.isUpperInclusive(),
- value.getFieldName(), value.getNodeOperand()));
- log.debug("Normalization succeeded!");
- } catch (Exception exception) {
- log.debug("Normalization failed.");
- }
- }
- }
- return normalizedValuesToFields;
- }
-
- /**
- * Given a sequence of objects of type T, this method will return a single object for every unique type passed in. This is used to dedupe normalizer
- * instances by their type, so that we only get 1 instance per type of normalizer.
- *
- * @param things
- * iterable list of objects
- * @param <T>
- * type of the objects
- * @return an object for each type passed in
- */
- public static <T> Collection<T> uniqueByType(Iterable<T> things) {
- Map<Class<?>,T> map = Maps.newHashMap();
- for (T t : things) {
- map.put(t.getClass(), t);
- }
- return map.values();
- }
-
@Override
public Set getOptionalQueryParameters() {
Set params = super.getOptionalQueryParameters();
params.add(SEPARATE_COUNTS_BY_COLVIS);
+ params.add(SUM_COUNTS);
return params;
}
- public Boolean getSeparateCountsByColVis() {
+ public boolean getSeparateCountsByColVis() {
return getConfig().getSeparateCountsByColVis();
}
- public void setSeparateCountsByColVis(Boolean separateCountsByColVis) {
+ public void setSeparateCountsByColVis(boolean separateCountsByColVis) {
getConfig().setSeparateCountsByColVis(separateCountsByColVis);
}
- public Boolean getShowReferenceCount() {
+ public boolean getShowReferenceCount() {
return getConfig().getShowReferenceCount();
}
- public void setShowReferenceCount(Boolean showReferenceCount) {
+ public void setShowReferenceCount(boolean showReferenceCount) {
getConfig().setShowReferenceCount(showReferenceCount);
}
+ public boolean getSumCounts() {
+ return getConfig().getSumCounts();
+ }
+
+ public void setSumCounts(boolean sumCounts) {
+ getConfig().setSumCounts(sumCounts);
+ }
}
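Note for reviewers: a minimal sketch of how a caller exercises the new option; the three setters are the ones added above, while obtaining the logic through the no-arg constructor (rather than the usual Spring wiring) is an assumption for illustration only.

    // hedged sketch; a real deployment would configure DiscoveryLogic via injection
    DiscoveryLogic logic = new DiscoveryLogic();
    logic.setSeparateCountsByColVis(false);
    logic.setShowReferenceCount(false);
    logic.setSumCounts(true); // new in this change: aggregate counts rather than reporting per-entry counts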
diff --git a/warehouse/query-core/src/main/java/datawave/query/discovery/DiscoveryQueryConfiguration.java b/warehouse/query-core/src/main/java/datawave/query/discovery/DiscoveryQueryConfiguration.java
index 13c8fa25d75..59d09666450 100644
--- a/warehouse/query-core/src/main/java/datawave/query/discovery/DiscoveryQueryConfiguration.java
+++ b/warehouse/query-core/src/main/java/datawave/query/discovery/DiscoveryQueryConfiguration.java
@@ -3,6 +3,7 @@
import java.io.Serializable;
import java.util.Collection;
import java.util.Objects;
+import java.util.StringJoiner;
import com.google.common.collect.Multimap;
@@ -17,8 +18,9 @@
public class DiscoveryQueryConfiguration extends ShardIndexQueryConfiguration implements Serializable {
private Multimap<String,String> literals, patterns;
private Multimap<String,LiteralRange<String>> ranges;
- private Boolean separateCountsByColVis = false;
- private Boolean showReferenceCount = false;
+ private boolean separateCountsByColVis = false;
+ private boolean showReferenceCount = false;
+ private boolean sumCounts = false;
public DiscoveryQueryConfiguration() {}
@@ -116,23 +118,31 @@ public void setPatterns(Multimap patterns) {
this.patterns = patterns;
}
- public Boolean getSeparateCountsByColVis() {
+ public boolean getSeparateCountsByColVis() {
return separateCountsByColVis;
}
- public Boolean getShowReferenceCount() {
+ public boolean getShowReferenceCount() {
return showReferenceCount;
}
+ public boolean getSumCounts() {
+ return sumCounts;
+ }
+
public void setSeparateCountsByColVis(boolean separateCountsByColVis) {
this.separateCountsByColVis = separateCountsByColVis;
}
- public void setShowReferenceCount(Boolean showReferenceCount) {
+ public void setShowReferenceCount(boolean showReferenceCount) {
this.showReferenceCount = showReferenceCount;
}
+ public void setSumCounts(boolean sumCounts) {
+ this.sumCounts = sumCounts;
+ }
+
@Override
public DiscoveryQueryConfiguration checkpoint() {
// Create a new config that only contains what is needed to execute the specified ranges
@@ -156,4 +166,11 @@ public boolean equals(Object o) {
public int hashCode() {
- return Objects.hash(super.hashCode(), literals, patterns, ranges, separateCountsByColVis, showReferenceCount);
+ return Objects.hash(super.hashCode(), literals, patterns, ranges, separateCountsByColVis, showReferenceCount, sumCounts);
}
+
+ @Override
+ public String toString() {
+ return new StringJoiner(", ", DiscoveryQueryConfiguration.class.getSimpleName() + "[", "]").add("literals=" + literals).add("patterns=" + patterns)
+ .add("ranges=" + ranges).add("separateCountsByColVis=" + separateCountsByColVis).add("showReferenceCount=" + showReferenceCount)
+ .add("sumCounts=" + sumCounts).toString();
+ }
}
diff --git a/warehouse/query-core/src/main/java/datawave/query/index/lookup/IndexInfo.java b/warehouse/query-core/src/main/java/datawave/query/index/lookup/IndexInfo.java
index 19bec2cb83b..bc2a0dc8781 100644
--- a/warehouse/query-core/src/main/java/datawave/query/index/lookup/IndexInfo.java
+++ b/warehouse/query-core/src/main/java/datawave/query/index/lookup/IndexInfo.java
@@ -328,11 +328,17 @@ public IndexInfo union(IndexInfo o, List delayedNodes) {
merged.count = merged.uids.size();
}
- merged.setFieldCounts(this.getFieldCounts());
- merged.mergeFieldCounts(o.getFieldCounts());
- merged.setTermCounts(this.getTermCounts());
- merged.mergeTermCounts(o.getTermCounts());
+ if (this == o) {
+ // handle idiosyncrasy of the peeking iterator where the first term is merged with itself
+ merged.setFieldCounts(o.getFieldCounts());
+ merged.setTermCounts(o.getTermCounts());
+ } else {
+ merged.setFieldCounts(getFieldCounts());
+ merged.setTermCounts(getTermCounts());
+ merged.mergeFieldCounts(o.getFieldCounts());
+ merged.mergeTermCounts(o.getTermCounts());
+ }
/*
* If there are multiple levels within a union we could have an ASTOrNode. We cannot prune OrNodes as we would with an intersection, so propagate the
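Note for reviewers: the this == o guard matters because merging a count map with itself doubles every value. A small, self-contained illustration of the failure mode the guard prevents (a plain HashMap stands in for the field/term count maps, which is an assumption; the real merge lives in IndexInfo):

    import java.util.HashMap;
    import java.util.Map;

    public class SelfMergeDemo {
        public static void main(String[] args) {
            Map<String,Long> counts = new HashMap<>();
            counts.put("FIELD_A", 10L);

            // what set-then-merge effectively does: sum values on matching keys
            Map<String,Long> merged = new HashMap<>(counts);
            counts.forEach((k, v) -> merged.merge(k, v, Long::sum));

            // merging the first term with itself doubles the count: prints 20,
            // which is why union() now special-cases this == o
            System.out.println(merged.get("FIELD_A"));
        }
    }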
diff --git a/warehouse/query-core/src/main/java/datawave/query/index/lookup/RangeStream.java b/warehouse/query-core/src/main/java/datawave/query/index/lookup/RangeStream.java
index 0eb3fe6b144..9d1f2951563 100644
--- a/warehouse/query-core/src/main/java/datawave/query/index/lookup/RangeStream.java
+++ b/warehouse/query-core/src/main/java/datawave/query/index/lookup/RangeStream.java
@@ -157,8 +157,8 @@ public RangeStream(ShardQueryConfiguration config, ScannerFactory scanners, Meta
streamExecutor = new ThreadPoolExecutor(executeLookupMin, maxLookup, 100, TimeUnit.MILLISECONDS, runnables);
fieldDataTypes = config.getQueryFieldsDatatypes();
collapseUids = config.getCollapseUids();
- fieldCounts = config.getUseFieldCounts();
- termCounts = config.getUseTermCounts();
+ fieldCounts = config.isSortQueryPostIndexWithFieldCounts();
+ termCounts = config.isSortQueryPostIndexWithTermCounts();
try {
Set ioFields = metadataHelper.getIndexOnlyFields(null);
if (null != ioFields) {
@@ -264,8 +264,8 @@ public Iterator iterator() {
this.itr = filter(concat(transform(queryStream, new TupleToRange(config.getShardTableName(), queryStream.currentNode(), config))),
getEmptyPlanPruner());
- if (config.isSortQueryByCounts() && (config.getUseFieldCounts() || config.getUseTermCounts())) {
- this.itr = transform(itr, new OrderingTransform(config.getUseFieldCounts(), config.getUseTermCounts()));
+ if (config.isSortQueryPostIndexWithFieldCounts() || config.isSortQueryPostIndexWithTermCounts()) {
+ this.itr = transform(itr, new OrderingTransform(config.isSortQueryPostIndexWithFieldCounts(), config.isSortQueryPostIndexWithTermCounts()));
}
}
} finally {
@@ -362,7 +362,7 @@ public QueryPlan apply(QueryPlan plan) {
Map counts = plan.getTermCounts().getCounts();
OrderByCostVisitor.orderByTermCount(plan.getQueryTree(), counts);
} else if (useFieldCounts) {
- Map counts = plan.getTermCounts().getCounts();
+ Map counts = plan.getFieldCounts().getCounts();
OrderByCostVisitor.orderByFieldCount(plan.getQueryTree(), counts);
}
return plan;
@@ -602,6 +602,10 @@ public ScannerStream visit(ASTEQNode node, Object data) {
String queryString = fieldName + "=='" + literal + "'";
options.addScanIterator(QueryScannerHelper.getQueryInfoIterator(config.getQuery(), false, queryString));
+ // easier to apply hints to the new options than to copy existing hints between option instances
+ options.applyExecutionHints(config.getIndexTableName(), config.getTableHints());
+ options.applyConsistencyLevel(config.getIndexTableName(), config.getTableConsistencyLevels());
+
scannerSession.setOptions(options);
scannerSession.setMaxResults(config.getMaxIndexBatchSize());
scannerSession.setExecutor(streamExecutor);
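Note for reviewers: to make the renamed flags concrete, a sketch of ordering a query by field cardinality. OrderByCostVisitor.orderByFieldCount is the call used in the hunk above; the visitor's package and the literal counts are assumptions.

    import java.util.Map;

    import org.apache.commons.jexl3.parser.ASTJexlScript;

    import datawave.query.jexl.JexlASTHelper;
    import datawave.query.jexl.visitors.OrderByCostVisitor; // package assumed

    public class OrderByFieldCountDemo {
        public static void main(String[] args) throws Exception {
            ASTJexlScript script = JexlASTHelper.parseJexlQuery("FIELD_B == 'b' && FIELD_A == 'a'");
            // counts as the RangeStream would collect them from the global index
            Map<String,Long> fieldCounts = Map.of("FIELD_A", 10L, "FIELD_B", 1_000L);
            // the cheaper FIELD_A term should move to the front of the intersection
            OrderByCostVisitor.orderByFieldCount(script, fieldCounts);
        }
    }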
diff --git a/warehouse/query-core/src/main/java/datawave/query/index/lookup/ShardLimitingIterator.java b/warehouse/query-core/src/main/java/datawave/query/index/lookup/ShardLimitingIterator.java
index 6681b9ebd0b..4972d173f5f 100644
--- a/warehouse/query-core/src/main/java/datawave/query/index/lookup/ShardLimitingIterator.java
+++ b/warehouse/query-core/src/main/java/datawave/query/index/lookup/ShardLimitingIterator.java
@@ -9,7 +9,7 @@
import org.apache.accumulo.core.data.Key;
import org.apache.accumulo.core.data.Value;
-import org.apache.accumulo.core.util.PeekingIterator;
+import org.apache.commons.collections4.iterators.PeekingIterator;
import org.apache.hadoop.io.Text;
import org.apache.log4j.Logger;
diff --git a/warehouse/query-core/src/main/java/datawave/query/index/lookup/ShardRangeStream.java b/warehouse/query-core/src/main/java/datawave/query/index/lookup/ShardRangeStream.java
index 2b437ea61c5..1d763edb37a 100644
--- a/warehouse/query-core/src/main/java/datawave/query/index/lookup/ShardRangeStream.java
+++ b/warehouse/query-core/src/main/java/datawave/query/index/lookup/ShardRangeStream.java
@@ -12,7 +12,7 @@
import org.apache.accumulo.core.data.PartialKey;
import org.apache.accumulo.core.data.Range;
import org.apache.accumulo.core.data.Value;
-import org.apache.accumulo.core.util.PeekingIterator;
+import org.apache.commons.collections4.iterators.PeekingIterator;
import org.apache.commons.jexl3.parser.JexlNode;
import com.google.common.base.Function;
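Note for reviewers: both files swap Accumulo's PeekingIterator for the commons-collections4 one. A minimal sketch of the replacement's contract; that the two behave as drop-in equivalents for these call sites is an assumption worth covering in tests.

    import java.util.List;

    import org.apache.commons.collections4.iterators.PeekingIterator;

    public class PeekDemo {
        public static void main(String[] args) {
            PeekingIterator<String> it = new PeekingIterator<>(List.of("a", "b").iterator());
            System.out.println(it.peek()); // "a": inspects without consuming
            System.out.println(it.next()); // "a"
            System.out.println(it.next()); // "b"
            System.out.println(it.peek()); // null: an exhausted iterator peeks as null
        }
    }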
diff --git a/warehouse/query-core/src/main/java/datawave/query/jexl/functions/ContentFunctionsDescriptor.java b/warehouse/query-core/src/main/java/datawave/query/jexl/functions/ContentFunctionsDescriptor.java
index 82d9e9b24fa..96452c57741 100644
--- a/warehouse/query-core/src/main/java/datawave/query/jexl/functions/ContentFunctionsDescriptor.java
+++ b/warehouse/query-core/src/main/java/datawave/query/jexl/functions/ContentFunctionsDescriptor.java
@@ -15,6 +15,7 @@
import java.util.List;
import java.util.Map;
import java.util.Set;
+import java.util.stream.Collectors;
import org.apache.accumulo.core.client.TableNotFoundException;
import org.apache.commons.jexl3.parser.ASTAndNode;
@@ -37,6 +38,7 @@
import com.google.common.collect.Lists;
import com.google.common.collect.PeekingIterator;
import com.google.common.collect.Sets;
+import com.google.common.collect.Streams;
import datawave.query.attributes.AttributeFactory;
import datawave.query.config.ShardQueryConfiguration;
@@ -104,19 +106,18 @@ public JexlNode getIndexQuery(Set termFrequencyFields, Set index
// get the cartesian product of all the fields and terms
MutableBoolean oredFields = new MutableBoolean();
- Set<String>[] fieldsAndTerms = fieldsAndTerms(termFrequencyFields, indexedFields, contentFields, oredFields, true);
- if (!fieldsAndTerms[0].isEmpty()) {
+ FieldTerms fieldsAndTerms = fieldsAndTerms(termFrequencyFields, indexedFields, contentFields, oredFields, true);
+ Set fields = fieldsAndTerms.getFields();
+ if (!fields.isEmpty()) {
final JexlNode eq = new ASTEQNode(ParserTreeConstants.JJTEQNODE);
-
- for (String field : fieldsAndTerms[0]) {
- nodes.add(JexlNodeFactory.createNodeTreeFromFieldValues(ContainerType.AND_NODE, eq, null, field, fieldsAndTerms[1]));
- }
+ Set terms = fieldsAndTerms.getTerms();
+ fields.forEach(field -> nodes.add(JexlNodeFactory.createNodeTreeFromFieldValues(ContainerType.AND_NODE, eq, null, field, terms)));
}
- if (fieldsAndTerms[0].size() == 0) {
+ if (fields.isEmpty()) {
log.warn("No fields found for content function, will not expand index query");
return new ASTTrueNode(ParserTreeConstants.JJTTRUENODE);
- } else if (fieldsAndTerms[0].size() == 1) {
+ } else if (fields.size() == 1) {
// A single field needs no wrapper node.
return nodes.iterator().next();
} else if (oredFields.booleanValue()) {
@@ -194,7 +195,7 @@ public Set fieldsForNormalization(MetadataHelper helper, Set dat
public Set fields(MetadataHelper helper, Set datatypeFilter) {
try {
return fieldsAndTerms(helper.getTermFrequencyFields(datatypeFilter), helper.getIndexedFields(datatypeFilter),
- helper.getContentFields(datatypeFilter), null)[0];
+ helper.getContentFields(datatypeFilter), null).getFields();
} catch (TableNotFoundException e) {
QueryException qe = new QueryException(DatawaveErrorCode.METADATA_TABLE_FETCH_ERROR, e);
throw new DatawaveFatalQueryException(qe);
@@ -206,15 +207,15 @@ public Set fields(MetadataHelper helper, Set datatypeFilter) {
public Set<Set<String>> fieldSets(MetadataHelper helper, Set<String> datatypeFilter) {
try {
MutableBoolean oredFields = new MutableBoolean();
- Set<String>[] fieldsAndTerms = fieldsAndTerms(helper.getTermFrequencyFields(datatypeFilter), helper.getIndexedFields(datatypeFilter),
+ FieldTerms fieldsAndTerms = fieldsAndTerms(helper.getTermFrequencyFields(datatypeFilter), helper.getIndexedFields(datatypeFilter),
helper.getContentFields(datatypeFilter), oredFields);
Set<Set<String>> fieldSets = new HashSet<>();
if (oredFields.booleanValue()) {
- for (String field : fieldsAndTerms[0]) {
+ for (String field : fieldsAndTerms.getFields()) {
fieldSets.add(Collections.singleton(field));
}
} else {
- fieldSets.add(fieldsAndTerms[0]);
+ fieldSets.add(fieldsAndTerms.getFields());
}
return fieldSets;
} catch (TableNotFoundException e) {
@@ -224,174 +225,200 @@ public Set> fieldSets(MetadataHelper helper, Set datatypeFil
}
- public Set<String>[] fieldsAndTerms(Set<String> termFrequencyFields, Set<String> indexedFields, Set<String> contentFields, MutableBoolean oredFields) {
+ public FieldTerms fieldsAndTerms(Set<String> termFrequencyFields, Set<String> indexedFields, Set<String> contentFields, MutableBoolean oredFields) {
return fieldsAndTerms(termFrequencyFields, indexedFields, contentFields, oredFields, false);
}
@SuppressWarnings("unchecked")
- public Set<String>[] fieldsAndTerms(Set<String> termFrequencyFields, Set<String> indexedFields, Set<String> contentFields, MutableBoolean oredFields,
+ public FieldTerms fieldsAndTerms(Set<String> termFrequencyFields, Set<String> indexedFields, Set<String> contentFields, MutableBoolean oredFields,
boolean validateFields) {
+ if (this.args.isEmpty()) {
+ NotFoundQueryException qe = new NotFoundQueryException(DatawaveErrorCode.JEXL_NODES_MISSING,
+ MessageFormat.format("Class: {0}, Namespace: {1}, Function: {2}", this.getClass().getSimpleName(), this.namespace, this.name));
+ throw new IllegalArgumentException(qe);
+ }
- final String funcName = name;
-
- PeekingIterator args = Iterators.peekingIterator(this.args.iterator());
-
- Set termFreqFields = Sets.newHashSet(termFrequencyFields);
- Set fields = Sets.newHashSetWithExpectedSize(termFreqFields.size());
- Set terms = Sets.newHashSetWithExpectedSize(this.args.size() - 1);
- Iterator itr = termFreqFields.iterator();
// Can any one of the fields satisfy the query? Always true unless the zone is specified in an AND clause.
if (oredFields != null) {
oredFields.setValue(true);
}
- while (itr.hasNext()) {
- String field = itr.next();
- if (indexedFields.contains(field) && (contentFields.isEmpty() || contentFields.contains(field))) {
- fields.add(field);
- }
- }
-
- if (args.hasNext()) {
- JexlNode termOffsetMap = null;
- if (CONTENT_ADJACENT_FUNCTION_NAME.equals(funcName)) {
- JexlNode firstArg = args.next();
+ PeekingIterator argsIterator = Iterators.peekingIterator(this.args.iterator());
+ FieldTerms fieldTerms = new FieldTerms();
+ JexlNode termOffsetMap;
- // we override the zones if the first argument is a string
- if (firstArg instanceof ASTStringLiteral) {
- fields = Collections.singleton(JexlNodes.getIdentifierOrLiteralAsString(firstArg));
- termOffsetMap = args.next();
- } else {
- JexlNode nextArg = args.peek();
-
- // The zones may (more likely) be specified as an idenfifier
- if (!JexlASTHelper.getIdentifiers(firstArg).isEmpty() && !JexlASTHelper.getIdentifiers(nextArg).isEmpty()) {
- if (oredFields != null && firstArg instanceof ASTAndNode) {
- oredFields.setValue(false);
- }
-
- fields = JexlASTHelper.getIdentifierNames(firstArg);
- termOffsetMap = args.next();
- } else {
- termOffsetMap = firstArg;
- }
- }
- } else if (CONTENT_PHRASE_FUNCTION_NAME.equals(funcName)) {
- JexlNode firstArg = args.next();
+ switch (this.name) {
+ case CONTENT_ADJACENT_FUNCTION_NAME:
+ termOffsetMap = examineContentAdjacentFunction(argsIterator, fieldTerms, oredFields);
+ break;
+ case CONTENT_PHRASE_FUNCTION_NAME:
+ termOffsetMap = examineContentPhraseFunction(argsIterator, fieldTerms, oredFields);
+ break;
+ case CONTENT_SCORED_PHRASE_FUNCTION_NAME:
+ termOffsetMap = examineContentScoredPhraseFunction(argsIterator, fieldTerms, oredFields);
+ break;
+ case CONTENT_WITHIN_FUNCTION_NAME:
+ termOffsetMap = examineContentWithinFunction(argsIterator, fieldTerms, oredFields);
+ break;
+ default:
+ BadRequestQueryException qe = new BadRequestQueryException(DatawaveErrorCode.FUNCTION_ARGUMENTS_MISSING);
+ throw new IllegalArgumentException(qe);
+ }
- // we override the zones if the first argument is a string
- if (firstArg instanceof ASTStringLiteral) {
- fields = Collections.singleton(((ASTStringLiteral) firstArg).getLiteral());
+ // Verify that a term offset map and terms were specified.
+ validateTermsOffsetMapAndTermsPresent(termOffsetMap, argsIterator);
- termOffsetMap = args.next();
- } else {
- JexlNode nextArg = args.peek();
+ // If the fields were not established above, default them to the term frequency fields that are indexed and, when any content
+ // fields were specified, also present among the content fields.
+ if (fieldTerms.fields == null) {
+ Set fields = termFrequencyFields.stream()
+ .filter(f -> indexedFields.contains(f) && (contentFields.isEmpty() || contentFields.contains(f))).collect(Collectors.toSet());
+ fieldTerms.fields = fields;
+ }
- // The zones may (more likely) be specified as an identifier
- if (!JexlASTHelper.getIdentifiers(firstArg).isEmpty() && !JexlASTHelper.getIdentifiers(nextArg).isEmpty()) {
- if (oredFields != null && firstArg instanceof ASTAndNode) {
- oredFields.setValue(false);
- }
-
- fields = JexlASTHelper.getIdentifierNames(firstArg);
- termOffsetMap = args.next();
- } else {
- termOffsetMap = firstArg;
- }
+ // Moving this validation later in the call stack, since it requires other processing (i.e. apply query model)
+ if (validateFields) {
+ for (String field : fieldTerms.fields) {
+ // Deconstruct and uppercase the field name for the check, in case the field names have not been normalized yet.
+ if (!termFrequencyFields.contains(JexlASTHelper.deconstructIdentifier(field.toUpperCase()))) {
+ PreConditionFailedQueryException qe = new PreConditionFailedQueryException(DatawaveErrorCode.FIELD_PHRASE_QUERY_NOT_INDEXED,
+ MessageFormat.format("Field: {0}", field));
+ throw new IllegalArgumentException(qe);
}
- } else if (CONTENT_SCORED_PHRASE_FUNCTION_NAME.equals(funcName)) {
- JexlNode arg = args.next();
+ }
+ }
- if (arg instanceof ASTNumberLiteral || arg instanceof ASTUnaryMinusNode) {
- // if the first argument is a number, then no field exists
- // for example, content:scoredPhrase(-1.5, termOffsetMap, 'value')
- termOffsetMap = args.next();
- } else {
- if (arg instanceof ASTIdentifier) {
- // single field case
- // for example, content:scoredPhrase(FIELD, -1.5, termOffsetMap, 'value')
- fields = Collections.singleton(String.valueOf(JexlASTHelper.getIdentifier(arg)));
- } else {
- // multi field case
- // for example, content:scoredPhrase((FIELD_A || FIELD_B), -1.5, termOffsetMap, 'value')
- Set identifiers = JexlASTHelper.getIdentifierNames(arg);
- if (!identifiers.isEmpty()) {
- fields = identifiers;
-
- if (oredFields != null && arg instanceof ASTAndNode) {
- oredFields.setValue(false);
- }
- }
- }
+ // Now take the remaining string literals in the arguments as terms.
+ Set terms = Sets.newHashSetWithExpectedSize(this.args.size() - 1);
+ // @formatter:off
+ Streams.stream(argsIterator)
+ .filter(ASTStringLiteral.class::isInstance)
+ .map(JexlNodes::getIdentifierOrLiteralAsString)
+ .forEach(terms::add);
+ // @formatter:on
+ fieldTerms.terms = terms;
+
+ return fieldTerms;
+ }
- // skip score because it is not needed when gathering just the fields and values from a function
- args.next();
+ // Finds and sets the fields for a content:adjacent function, and returns the anticipated term offset map node.
+ private JexlNode examineContentAdjacentFunction(PeekingIterator argsIterator, FieldTerms fieldTerms, MutableBoolean oredFields) {
+ JexlNode firstArg = argsIterator.next();
+ if (firstArg instanceof ASTStringLiteral) {
+ fieldTerms.fields = Collections.singleton(JexlNodes.getIdentifierOrLiteralAsString(firstArg));
+ return argsIterator.next();
+ } else {
+ JexlNode nextArg = argsIterator.peek();
+ // The zones may (more likely) be specified as an identifier
+ if (!JexlASTHelper.getIdentifiers(firstArg).isEmpty() && !JexlASTHelper.getIdentifiers(nextArg).isEmpty()) {
+ if (oredFields != null && firstArg instanceof ASTAndNode) {
+ oredFields.setValue(false);
+ }
+ fieldTerms.fields = JexlASTHelper.getIdentifierNames(firstArg);
+ return argsIterator.next();
+ } else {
+ return firstArg;
+ }
+ }
+ }
- termOffsetMap = args.next();
+ // Finds and sets the fields for a content:phrase function, and returns the anticipated term offset map node.
+ private JexlNode examineContentPhraseFunction(PeekingIterator argsIterator, FieldTerms fieldTerms, MutableBoolean oredFields) {
+ JexlNode firstArg = argsIterator.next();
+ // we override the zones if the first argument is a string
+ if (firstArg instanceof ASTStringLiteral) {
+ fieldTerms.fields = Collections.singleton(((ASTStringLiteral) firstArg).getLiteral());
+ return argsIterator.next();
+ } else {
+ JexlNode nextArg = argsIterator.peek();
+ // The zones may (more likely) be specified as an identifier
+ if (!JexlASTHelper.getIdentifiers(firstArg).isEmpty() && !JexlASTHelper.getIdentifiers(nextArg).isEmpty()) {
+ if (oredFields != null && firstArg instanceof ASTAndNode) {
+ oredFields.setValue(false);
}
- } else if (CONTENT_WITHIN_FUNCTION_NAME.equals(funcName)) {
- JexlNode arg = args.next();
+ fieldTerms.fields = JexlASTHelper.getIdentifierNames(firstArg);
+ return argsIterator.next();
+ } else {
+ return firstArg;
+ }
+ }
+ }
- // we override the zones if the first argument is a string or identifier
- if (arg instanceof ASTStringLiteral) {
- fields = Collections.singleton(JexlNodes.getIdentifierOrLiteralAsString(arg));
- arg = args.next();
- } else if (!JexlASTHelper.getIdentifiers(arg).isEmpty()) {
- if (oredFields != null && arg instanceof ASTAndNode) {
+ // Finds and sets the fields for a content:scoredPhrase function, and returns the anticipated term offset map node.
+ private JexlNode examineContentScoredPhraseFunction(PeekingIterator argsIterator, FieldTerms fieldTerms, MutableBoolean oredFields) {
+ JexlNode firstArg = argsIterator.next();
+ if (firstArg instanceof ASTNumberLiteral || firstArg instanceof ASTUnaryMinusNode) {
+ // if the first argument is a number, then no field exists
+ // for example, content:scoredPhrase(-1.5, termOffsetMap, 'value')
+ return argsIterator.next();
+ } else {
+ if (firstArg instanceof ASTIdentifier) {
+ // single field case
+ // for example, content:scoredPhrase(FIELD, -1.5, termOffsetMap, 'value')
+ fieldTerms.fields = Collections.singleton(String.valueOf(JexlASTHelper.getIdentifier(firstArg)));
+ } else {
+ // multi field case
+ // for example, content:scoredPhrase((FIELD_A || FIELD_B), -1.5, termOffsetMap, 'value')
+ Set identifiers = JexlASTHelper.getIdentifierNames(firstArg);
+ if (!identifiers.isEmpty()) {
+ fieldTerms.fields = identifiers;
+ if (oredFields != null && firstArg instanceof ASTAndNode) {
oredFields.setValue(false);
}
-
- fields = JexlASTHelper.getIdentifierNames(arg);
- arg = args.next();
}
+ }
- // we can trash the distance
- if (!(arg instanceof ASTNumberLiteral || arg instanceof ASTUnaryMinusNode)) {
- BadRequestQueryException qe = new BadRequestQueryException(DatawaveErrorCode.NUMERIC_DISTANCE_ARGUMENT_MISSING);
- throw new IllegalArgumentException(qe);
- }
+ // skip score because it is not needed when gathering just the fields and values from a function
+ argsIterator.next();
+ return argsIterator.next();
+ }
+ }
- termOffsetMap = args.next();
- } else {
- BadRequestQueryException qe = new BadRequestQueryException(DatawaveErrorCode.FUNCTION_ARGUMENTS_MISSING);
- throw new IllegalArgumentException(qe);
+ // Finds and sets the fields for a content:within function, and returns the anticipated term offset map node.
+ private JexlNode examineContentWithinFunction(PeekingIterator argsIterator, FieldTerms fieldTerms, MutableBoolean oredFields) {
+ JexlNode arg = argsIterator.next();
+ // we override the zones if the first argument is a string or identifier
+ if (arg instanceof ASTStringLiteral) {
+ fieldTerms.fields = Collections.singleton(JexlNodes.getIdentifierOrLiteralAsString(arg));
+ arg = argsIterator.next();
+ } else if (!JexlASTHelper.getIdentifiers(arg).isEmpty()) {
+ if (oredFields != null && arg instanceof ASTAndNode) {
+ oredFields.setValue(false);
}
- if (null == termOffsetMap || !(termOffsetMap instanceof ASTIdentifier)) {
- BadRequestQueryException qe = new BadRequestQueryException(DatawaveErrorCode.TERMOFFSETMAP_AND_TERMS_MISSING);
- throw new IllegalArgumentException(qe);
- }
+ fieldTerms.fields = JexlASTHelper.getIdentifierNames(arg);
+ arg = argsIterator.next();
+ }
- if (!args.hasNext()) {
- BadRequestQueryException qe = new BadRequestQueryException(DatawaveErrorCode.TERMS_MISSING);
- throw new IllegalArgumentException(qe);
- }
+ // validate and then discard the distance argument; it is not needed when gathering fields and terms
+ if (!(arg instanceof ASTNumberLiteral || arg instanceof ASTUnaryMinusNode)) {
+ BadRequestQueryException qe = new BadRequestQueryException(DatawaveErrorCode.NUMERIC_DISTANCE_ARGUMENT_MISSING);
+ throw new IllegalArgumentException(qe);
+ }
- // moving this validation later in the call stack, since it requires other processing (i.e. apply query model)
- if (validateFields) {
- for (String field : fields) {
- // deconstruct & upcase the fieldname for testing in case we have not normalized the field names yet. Return the unnormalized fieldname.
- if (!termFreqFields.contains(JexlASTHelper.deconstructIdentifier(field.toUpperCase()))) {
- PreConditionFailedQueryException qe = new PreConditionFailedQueryException(DatawaveErrorCode.FIELD_PHRASE_QUERY_NOT_INDEXED,
- MessageFormat.format("Field: {0}", field));
- throw new IllegalArgumentException(qe);
- }
- }
- }
+ return argsIterator.next();
+ }
- // now take the remaining string literals as terms
- Iterator termsItr = Iterators.transform(Iterators.filter(args, new StringLiteralsOnly()), new GetImage());
- while (termsItr.hasNext()) {
- terms.add(termsItr.next());
- }
+ /**
+ * Throws an {@link IllegalArgumentException} wrapping a {@link BadRequestQueryException} if termOffsetMap is not an instance of {@link ASTIdentifier} or
+ * if there are no more nodes in the iterator.
+ *
+ * @param termOffsetMap
+ * the terms offset map node
+ * @param argsIterator
+ * the iterator of arguments
+ */
+ private void validateTermsOffsetMapAndTermsPresent(JexlNode termOffsetMap, PeekingIterator argsIterator) {
+ if (!(termOffsetMap instanceof ASTIdentifier)) {
+ BadRequestQueryException qe = new BadRequestQueryException(DatawaveErrorCode.TERMOFFSETMAP_AND_TERMS_MISSING);
+ throw new IllegalArgumentException(qe);
+ }
- } else {
- NotFoundQueryException qe = new NotFoundQueryException(DatawaveErrorCode.JEXL_NODES_MISSING,
- MessageFormat.format("Class: {0}, Namespace: {1}, Function: {2}", this.getClass().getSimpleName(), namespace, funcName));
+ if (!argsIterator.hasNext()) {
+ BadRequestQueryException qe = new BadRequestQueryException(DatawaveErrorCode.TERMS_MISSING);
throw new IllegalArgumentException(qe);
}
- return new Set[] {fields, terms};
}
/**
@@ -616,6 +643,29 @@ public boolean allowIvaratorFiltering() {
}
}
+ public static class FieldTerms {
+
+ private Set fields;
+ private Set terms;
+
+ public FieldTerms() {
+ fields = null;
+ terms = null;
+ }
+
+ public Set getFields() {
+ return fields;
+ }
+
+ public int totalFields() {
+ return fields.size();
+ }
+
+ public Set getTerms() {
+ return terms;
+ }
+ }
+
@Override
public ContentJexlArgumentDescriptor getArgumentDescriptor(ASTFunctionNode node) {
FunctionJexlNodeVisitor fvis = new FunctionJexlNodeVisitor();
@@ -636,5 +686,4 @@ public ContentJexlArgumentDescriptor getArgumentDescriptor(ASTFunctionNode node)
return new ContentJexlArgumentDescriptor(node, fvis.namespace(), fvis.name(), fvis.args());
}
-
}
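Note for reviewers: call sites consume the new FieldTerms holder in place of the old two-element Set array. A hedged fragment (descriptor, standing in for a ContentJexlArgumentDescriptor, and the three input sets are assumed):

    MutableBoolean oredFields = new MutableBoolean();
    FieldTerms fieldTerms = descriptor.fieldsAndTerms(termFrequencyFields, indexedFields, contentFields, oredFields);
    Set<String> fields = fieldTerms.getFields(); // was fieldsAndTerms[0]
    Set<String> terms = fieldTerms.getTerms();   // was fieldsAndTerms[1]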
diff --git a/warehouse/query-core/src/main/java/datawave/query/jexl/lookups/BoundedRangeIndexLookup.java b/warehouse/query-core/src/main/java/datawave/query/jexl/lookups/BoundedRangeIndexLookup.java
index e3c207b39d5..9301f22f490 100644
--- a/warehouse/query-core/src/main/java/datawave/query/jexl/lookups/BoundedRangeIndexLookup.java
+++ b/warehouse/query-core/src/main/java/datawave/query/jexl/lookups/BoundedRangeIndexLookup.java
@@ -1,11 +1,11 @@
package datawave.query.jexl.lookups;
-import java.io.IOException;
+import static datawave.query.jexl.lookups.ShardIndexQueryTableStaticMethods.EXPANSION_HINT_KEY;
+
import java.text.MessageFormat;
import java.util.Collections;
import java.util.Iterator;
import java.util.Map.Entry;
-import java.util.SortedMap;
import java.util.concurrent.Callable;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.ExecutorService;
@@ -18,15 +18,15 @@
import org.apache.accumulo.core.data.Key;
import org.apache.accumulo.core.data.Range;
import org.apache.accumulo.core.data.Value;
-import org.apache.accumulo.core.iterators.user.WholeRowIterator;
import org.apache.hadoop.io.Text;
import org.apache.log4j.Logger;
import org.springframework.util.StringUtils;
+import com.google.common.base.Joiner;
import com.google.common.base.Preconditions;
import datawave.core.common.logging.ThreadConfigurableLogger;
-import datawave.core.iterators.ColumnQualifierRangeIterator;
+import datawave.core.iterators.BoundedRangeExpansionIterator;
import datawave.core.iterators.CompositeSeekingIterator;
import datawave.core.iterators.TimeoutExceptionIterator;
import datawave.core.iterators.TimeoutIterator;
@@ -126,25 +126,25 @@ public synchronized void submit() {
log.debug("Range: " + range);
bs = null;
try {
- bs = scannerFactory.newScanner(config.getIndexTableName(), config.getAuthorizations(), config.getNumQueryThreads(), config.getQuery());
+ // the 'newScanner' method in the ScannerFactory has no knowledge of the 'expansion' hint, so determine the hint key here
+ String hintKey = config.getTableHints().containsKey(EXPANSION_HINT_KEY) ? EXPANSION_HINT_KEY : config.getIndexTableName();
+
+ bs = scannerFactory.newScanner(config.getIndexTableName(), config.getAuthorizations(), config.getNumQueryThreads(), config.getQuery(), hintKey);
bs.setRanges(Collections.singleton(range));
bs.fetchColumnFamily(new Text(literalRange.getFieldName()));
- // set up the GlobalIndexRangeSamplingIterator
-
- IteratorSetting cfg = new IteratorSetting(config.getBaseIteratorPriority() + 50, "WholeRowIterator", WholeRowIterator.class);
- bs.addScanIterator(cfg);
-
- cfg = new IteratorSetting(config.getBaseIteratorPriority() + 48, "DateFilter", ColumnQualifierRangeIterator.class);
- // search from 20YYddMM to 20ZZddMM\uffff to ensure we encompass all of the current day
- String end = endDay + Constants.MAX_UNICODE_STRING;
- cfg.addOption(ColumnQualifierRangeIterator.RANGE_NAME, ColumnQualifierRangeIterator.encodeRange(new Range(startDay, end)));
-
- bs.addScanIterator(cfg);
+ IteratorSetting setting = new IteratorSetting(config.getBaseIteratorPriority() + 20, "BoundedRangeExpansionIterator",
+ BoundedRangeExpansionIterator.class);
+ setting.addOption(BoundedRangeExpansionIterator.START_DATE, startDay);
+ setting.addOption(BoundedRangeExpansionIterator.END_DATE, endDay);
+ if (!config.getDatatypeFilter().isEmpty()) {
+ setting.addOption(BoundedRangeExpansionIterator.DATATYPES_OPT, Joiner.on(',').join(config.getDatatypeFilter()));
+ }
+ bs.addScanIterator(setting);
// If this is a composite field, with multiple terms, we need to setup our query to filter based on each component of the composite range
- if (config.getCompositeToFieldMap().get(literalRange.getFieldName()) != null) {
+ if (!config.getCompositeToFieldMap().get(literalRange.getFieldName()).isEmpty()) {
String compositeSeparator = null;
if (config.getCompositeFieldSeparators() != null)
@@ -169,8 +169,8 @@ public synchronized void submit() {
}
if (null != fairnessIterator) {
- cfg = new IteratorSetting(config.getBaseIteratorPriority() + 100, TimeoutExceptionIterator.class);
- bs.addScanIterator(cfg);
+ IteratorSetting timeoutSetting = new IteratorSetting(config.getBaseIteratorPriority() + 100, TimeoutExceptionIterator.class);
+ bs.addScanIterator(timeoutSetting);
}
timedScanFuture = execService.submit(createTimedCallable(bs.iterator()));
@@ -180,13 +180,6 @@ public synchronized void submit() {
log.error(qe);
throw new DatawaveFatalQueryException(qe);
- } catch (IOException e) {
- QueryException qe = new QueryException(DatawaveErrorCode.RANGE_CREATE_ERROR, e, MessageFormat.format("{0}", this.literalRange));
- log.debug(qe);
- if (bs != null) {
- scannerFactory.close(bs);
- }
- throw new IllegalRangeArgumentException(qe);
}
}
}
@@ -233,6 +226,7 @@ protected Callable createTimedCallable(final Iterator>
Key k = entry.getKey();
+ log.info("tk: " + k.toStringNoTime());
if (log.isTraceEnabled()) {
log.trace("Forward Index entry: " + entry.getKey());
}
@@ -240,61 +234,22 @@ protected Callable createTimedCallable(final Iterator>
k.getRow(holder);
String uniqueTerm = holder.toString();
- SortedMap keymap = WholeRowIterator.decodeRow(entry.getKey(), entry.getValue());
-
- String field = null;
-
- boolean foundDataType = false;
-
- for (Key topKey : keymap.keySet()) {
- if (null == field) {
- topKey.getColumnFamily(holder);
- field = holder.toString();
- }
- // Get the column qualifier from the key. It
- // contains the datatype and normalizer class
-
- if (null != topKey.getColumnQualifier()) {
- if (null != config.getDatatypeFilter() && !config.getDatatypeFilter().isEmpty()) {
-
- String colq = topKey.getColumnQualifier().toString();
- int idx = colq.indexOf(Constants.NULL);
-
- if (idx != -1) {
- String type = colq.substring(idx + 1);
-
- // If types are specified and this type
- // is not in the list, skip it.
- if (config.getDatatypeFilter().contains(type)) {
- if (log.isTraceEnabled()) {
- log.trace(config.getDatatypeFilter() + " contains " + type);
- }
-
- foundDataType = true;
- break;
- }
- }
- } else {
- foundDataType = true;
- }
- }
- }
- if (foundDataType) {
+ k.getColumnFamily(holder);
+ String field = holder.toString();
- // obtaining the size of a map can be expensive,
- // instead
- // track the count of each unique item added.
- indexLookupMap.put(field, uniqueTerm);
+ // safety check...
+ Preconditions.checkState(field.equals(literalRange.getFieldName()),
+ "Got an unexpected field name when expanding range: " + field + " vs " + literalRange.getFieldName());
- // safety check...
- Preconditions.checkState(field.equals(literalRange.getFieldName()),
- "Got an unexpected field name when expanding range" + field + " " + literalRange.getFieldName());
+ // obtaining the size of a map can be expensive; instead, track the count of each unique item added
+ indexLookupMap.put(field, uniqueTerm);
- // If this range expands into to many values, we can
- // stop
- if (indexLookupMap.get(field).isThresholdExceeded()) {
- return true;
- }
+ // If this range expands into too many values, we can stop
+ if (indexLookupMap.get(field).isThresholdExceeded()) {
+ return true;
}
}
} catch (Exception e) {
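Note for reviewers: the replacement iterator is configured entirely through scan-time options. A sketch of a standalone configuration; the priority, name, and option keys mirror the code above, while the scanner variable and date literals are assumptions.

    IteratorSetting setting = new IteratorSetting(20, "BoundedRangeExpansionIterator",
                    BoundedRangeExpansionIterator.class);
    setting.addOption(BoundedRangeExpansionIterator.START_DATE, "20240101");
    setting.addOption(BoundedRangeExpansionIterator.END_DATE, "20240131");
    // the datatype filter is optional; omit it to accept all datatypes
    setting.addOption(BoundedRangeExpansionIterator.DATATYPES_OPT, "csv,json");
    scanner.addScanIterator(setting);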
diff --git a/warehouse/query-core/src/main/java/datawave/query/jexl/lookups/FieldNameIndexLookup.java b/warehouse/query-core/src/main/java/datawave/query/jexl/lookups/FieldNameIndexLookup.java
index c1c068e5bc1..b40001d5fd2 100644
--- a/warehouse/query-core/src/main/java/datawave/query/jexl/lookups/FieldNameIndexLookup.java
+++ b/warehouse/query-core/src/main/java/datawave/query/jexl/lookups/FieldNameIndexLookup.java
@@ -14,7 +14,6 @@
import java.util.concurrent.Future;
import java.util.concurrent.atomic.AtomicLong;
-import org.apache.accumulo.core.client.TableNotFoundException;
import org.apache.accumulo.core.data.Key;
import org.apache.accumulo.core.data.Range;
import org.apache.accumulo.core.data.Value;
diff --git a/warehouse/query-core/src/main/java/datawave/query/jexl/lookups/ShardIndexQueryTableStaticMethods.java b/warehouse/query-core/src/main/java/datawave/query/jexl/lookups/ShardIndexQueryTableStaticMethods.java
index 74671654e43..6c82c7f01f1 100644
--- a/warehouse/query-core/src/main/java/datawave/query/jexl/lookups/ShardIndexQueryTableStaticMethods.java
+++ b/warehouse/query-core/src/main/java/datawave/query/jexl/lookups/ShardIndexQueryTableStaticMethods.java
@@ -68,6 +68,9 @@ public class ShardIndexQueryTableStaticMethods {
private static FastDateFormat formatter = FastDateFormat.getInstance("yyyyMMdd");
+ // name reserved for executor pools
+ public static final String EXPANSION_HINT_KEY = "expansion";
+
/**
* Create an IndexLookup task to find field names give a JexlNode and a set of Types for that node
*
@@ -440,9 +443,13 @@ public static Range getLiteralRange(String fieldName, String normalizedQueryTerm
* check for limiting unique terms
* @return the scanner session
* @throws InvocationTargetException
+ * if the reflected constructor throws an exception
* @throws NoSuchMethodException
+ * if the expected constructor does not exist
* @throws InstantiationException
+ * if the scanner cannot be instantiated
* @throws IllegalAccessException
+ * if the constructor is not accessible
* @throws IOException
* dates can't be formatted
*/
@@ -455,7 +462,9 @@ public static ScannerSession configureTermMatchOnly(ShardQueryConfiguration conf
return null;
}
- ScannerSession bs = scannerFactory.newLimitedScanner(AnyFieldScanner.class, tableName, config.getAuthorizations(), config.getQuery());
+ String hintKey = config.getTableHints().containsKey(EXPANSION_HINT_KEY) ? EXPANSION_HINT_KEY : config.getIndexTableName();
+
+ ScannerSession bs = scannerFactory.newLimitedScanner(AnyFieldScanner.class, tableName, config.getAuthorizations(), config.getQuery(), hintKey);
bs.setRanges(ranges);
@@ -483,7 +492,9 @@ public static ScannerSession configureLimitedDiscovery(ShardQueryConfiguration c
return null;
}
- ScannerSession bs = scannerFactory.newLimitedScanner(AnyFieldScanner.class, tableName, config.getAuthorizations(), config.getQuery());
+ String hintKey = config.getTableHints().containsKey(EXPANSION_HINT_KEY) ? EXPANSION_HINT_KEY : tableName;
+
+ ScannerSession bs = scannerFactory.newLimitedScanner(AnyFieldScanner.class, tableName, config.getAuthorizations(), config.getQuery(), hintKey);
bs.setRanges(ranges);
@@ -511,6 +522,13 @@ public static final void configureGlobalIndexDateRangeFilter(ShardQueryConfigura
}
IteratorSetting cfg = configureGlobalIndexDateRangeFilter(config, dateRange);
bs.addScanIterator(cfg);
+
+ // unused method, but we'll still configure execution hints if possible
+ String executionHintKey = config.getTableHints().containsKey(EXPANSION_HINT_KEY) ? EXPANSION_HINT_KEY : config.getIndexTableName();
+
+ if (config.getTableHints().containsKey(executionHintKey)) {
+ bs.setExecutionHints(config.getTableHints().get(executionHintKey));
+ }
}
public static final IteratorSetting configureGlobalIndexDateRangeFilter(ShardQueryConfiguration config, LongRange dateRange) {
@@ -580,6 +598,16 @@ public static final void configureGlobalIndexTermMatchingIterator(ShardQueryConf
bs.addScanIterator(cfg);
+ // unused method, but we'll still configure execution hints if possible
+ if (!reverseIndex) {
+ // only apply hints to the global index
+ String hintKey = config.getTableHints().containsKey(EXPANSION_HINT_KEY) ? EXPANSION_HINT_KEY : config.getIndexTableName();
+
+ if (config.getTableHints().containsKey(hintKey)) {
+ bs.setExecutionHints(config.getTableHints().get(hintKey));
+ }
+ }
+
setExpansionFields(config, bs, reverseIndex, expansionFields);
}
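Note for reviewers: the same fallback idiom now appears in each expansion path. Distilled, with config and bs as in the surrounding methods:

    // prefer the reserved "expansion" hint pool when one is configured,
    // otherwise fall back to the hints keyed by the index table's name
    String hintKey = config.getTableHints().containsKey(EXPANSION_HINT_KEY)
                    ? EXPANSION_HINT_KEY
                    : config.getIndexTableName();
    if (config.getTableHints().containsKey(hintKey)) {
        bs.setExecutionHints(config.getTableHints().get(hintKey));
    }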
diff --git a/warehouse/query-core/src/main/java/datawave/query/jexl/nodes/AbstractNodeCostComparator.java b/warehouse/query-core/src/main/java/datawave/query/jexl/nodes/AbstractNodeCostComparator.java
new file mode 100644
index 00000000000..9c2cc475401
--- /dev/null
+++ b/warehouse/query-core/src/main/java/datawave/query/jexl/nodes/AbstractNodeCostComparator.java
@@ -0,0 +1,108 @@
+package datawave.query.jexl.nodes;
+
+import java.util.Map;
+
+import org.apache.commons.jexl3.parser.ASTAndNode;
+import org.apache.commons.jexl3.parser.ASTOrNode;
+import org.apache.commons.jexl3.parser.ASTReference;
+import org.apache.commons.jexl3.parser.ASTReferenceExpression;
+import org.apache.commons.jexl3.parser.JexlNode;
+import org.apache.commons.jexl3.parser.JexlNodes;
+import org.apache.commons.jexl3.parser.ParserTreeConstants;
+
+import datawave.query.jexl.JexlASTHelper;
+import datawave.query.util.count.CountMap;
+
+/**
+ * Class that contains core logic for field and term comparators
+ */
+public abstract class AbstractNodeCostComparator extends JexlNodeComparator {
+ private static final long NODE_ID_MULTIPLIER = 5000L;
+ private static final int SEGMENT = Integer.MAX_VALUE / 48;
+
+ private final DefaultJexlNodeComparator comparator = new DefaultJexlNodeComparator();
+
+ private final Map counts;
+
+ /**
+ * Constructor that accepts a {@link CountMap}
+ *
+ * @param counts
+ * the count map
+ */
+ protected AbstractNodeCostComparator(CountMap counts) {
+ this(counts.getCounts());
+ }
+
+ /**
+ * Constructor that accepts a {@link Map} of counts
+ *
+ * @param counts
+ * the count map
+ */
+ protected AbstractNodeCostComparator(Map counts) {
+ this.counts = counts;
+ }
+
+ @Override
+ public int compare(JexlNode left, JexlNode right) {
+ left = JexlASTHelper.dereference(left);
+ right = JexlASTHelper.dereference(right);
+
+ int leftCost = getCostIndex(left);
+ int rightCost = getCostIndex(right);
+
+ int result = Integer.compare(leftCost, rightCost);
+
+ if (result == 0) {
+ result = comparator.compare(left, right);
+ }
+
+ return result;
+ }
+
+ @Override
+ public int getCostIndex(JexlNode node) {
+ if ((node instanceof ASTReference || node instanceof ASTReferenceExpression) && node.jjtGetNumChildren() == 1) {
+ return getCostIndex(node.jjtGetChild(0));
+ } else if (node instanceof ASTOrNode) {
+ return getCostForUnion(node);
+ } else if (QueryPropertyMarker.findInstance(node).isAnyType()) {
+ return Integer.MAX_VALUE;
+ } else if (node instanceof ASTAndNode) {
+ return getCostForIntersection(node);
+ } else {
+ String key = getNodeKey(node);
+ long score = counts.getOrDefault(key, getDefaultScore(node));
+ if (score > Integer.MAX_VALUE) {
+ score = Integer.MAX_VALUE;
+ }
+ return (int) score;
+ }
+ }
+
+ /**
+ * This method is the only difference between calculating cost by field and calculating cost by term
+ *
+ * @param node
+ * a JexlNode
+ * @return the node key
+ */
+ abstract String getNodeKey(JexlNode node);
+
+ private long getDefaultScore(JexlNode node) {
+ int id = JexlNodes.id(node);
+ switch (id) {
+ case ParserTreeConstants.JJTFUNCTIONNODE:
+ return SEGMENT - 4L;
+ case ParserTreeConstants.JJTNENODE:
+ return SEGMENT - 3L;
+ case ParserTreeConstants.JJTNRNODE:
+ return SEGMENT - 2L;
+ case ParserTreeConstants.JJTNOTNODE:
+ return SEGMENT - 1L;
+ default:
+ return id * NODE_ID_MULTIPLIER;
+ }
+ }
+}
diff --git a/warehouse/query-core/src/main/java/datawave/query/jexl/nodes/DefaultJexlNodeComparator.java b/warehouse/query-core/src/main/java/datawave/query/jexl/nodes/DefaultJexlNodeComparator.java
new file mode 100644
index 00000000000..af8a2be45fe
--- /dev/null
+++ b/warehouse/query-core/src/main/java/datawave/query/jexl/nodes/DefaultJexlNodeComparator.java
@@ -0,0 +1,87 @@
+package datawave.query.jexl.nodes;
+
+import org.apache.commons.jexl3.parser.ASTAndNode;
+import org.apache.commons.jexl3.parser.ASTOrNode;
+import org.apache.commons.jexl3.parser.ASTReference;
+import org.apache.commons.jexl3.parser.ASTReferenceExpression;
+import org.apache.commons.jexl3.parser.JexlNode;
+import org.apache.commons.jexl3.parser.JexlNodes;
+import org.apache.commons.jexl3.parser.ParserTreeConstants;
+
+import datawave.query.jexl.JexlASTHelper;
+
+/**
+ * Comparator that enforces default ordering according to implied cost
+ *
+ * Nodes are sorted by node type, then junction, then lexicographically
+ */
+public class DefaultJexlNodeComparator extends JexlNodeComparator {
+
+ private static final int SEGMENT = Integer.MAX_VALUE / 48;
+
+ private final JunctionComparator junctionComparator = new JunctionComparator();
+ private final LexicographicalNodeComparator lexiComparator = new LexicographicalNodeComparator();
+
+ @Override
+ public int compare(JexlNode left, JexlNode right) {
+ left = JexlASTHelper.dereference(left);
+ right = JexlASTHelper.dereference(right);
+
+ int result = Integer.compare(getCostIndex(left), getCostIndex(right));
+
+ // costs can tie, e.g. a single EQ vs. (EQ AND EQ), so fall back to junction type
+ if (result == 0) {
+ result = junctionComparator.compare(left, right);
+ }
+
+ if (result == 0) {
+ result = lexiComparator.compare(left, right);
+ }
+
+ return result;
+ }
+
+ /**
+ *
+ * @param node
+ * an arbitrary JexlNode
+ * @return the node cost
+ */
+ @Override
+ protected int getCostIndex(JexlNode node) {
+ if ((node instanceof ASTReference || node instanceof ASTReferenceExpression) && node.jjtGetNumChildren() == 1) {
+ return getCostIndex(node.jjtGetChild(0));
+ } else if (node instanceof ASTOrNode) {
+ return getCostForUnion(node);
+ } else if (QueryPropertyMarker.findInstance(node).isAnyType()) {
+ return Integer.MAX_VALUE;
+ } else if (node instanceof ASTAndNode) {
+ return getCostForIntersection(node);
+ } else {
+ return getNodeScore(node);
+ }
+ }
+
+ /**
+ * Wrapper around {@link JexlNodes#id(JexlNode)} so that we can boost the score of negated terms
+ *
+ * @param node
+ * any JexlNode
+ * @return a score for the node
+ */
+ private int getNodeScore(JexlNode node) {
+ int id = JexlNodes.id(node);
+ switch (id) {
+ case ParserTreeConstants.JJTFUNCTIONNODE:
+ return SEGMENT - 4;
+ case ParserTreeConstants.JJTNENODE:
+ return SEGMENT - 3;
+ case ParserTreeConstants.JJTNRNODE:
+ return SEGMENT - 2;
+ case ParserTreeConstants.JJTNOTNODE:
+ return SEGMENT - 1;
+ default:
+ return id;
+ }
+ }
+}
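Note for reviewers: a compact sketch of the resulting order. JexlASTHelper.parseJexlQuery is the parser used throughout query-core; that the script's first child is the union node is an assumption of the sketch.

    import java.util.ArrayList;
    import java.util.List;

    import org.apache.commons.jexl3.parser.ASTJexlScript;
    import org.apache.commons.jexl3.parser.JexlNode;

    import datawave.query.jexl.JexlASTHelper;
    import datawave.query.jexl.nodes.DefaultJexlNodeComparator;

    public class DefaultOrderDemo {
        public static void main(String[] args) throws Exception {
            ASTJexlScript script = JexlASTHelper.parseJexlQuery("FIELD !~ 'ba.*' || FIELD == 'a'");
            JexlNode union = script.jjtGetChild(0);
            List<JexlNode> children = new ArrayList<>();
            for (int i = 0; i < union.jjtGetNumChildren(); i++) {
                children.add(union.jjtGetChild(i));
            }
            children.sort(new DefaultJexlNodeComparator());
            // the EQ node now precedes the negated regex (NR) node, because
            // negated types score near the top of the SEGMENT range
        }
    }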
diff --git a/warehouse/query-core/src/main/java/datawave/query/jexl/nodes/DefaultNodeCostComparator.java b/warehouse/query-core/src/main/java/datawave/query/jexl/nodes/DefaultNodeCostComparator.java
deleted file mode 100644
index fa5edcc8db7..00000000000
--- a/warehouse/query-core/src/main/java/datawave/query/jexl/nodes/DefaultNodeCostComparator.java
+++ /dev/null
@@ -1,69 +0,0 @@
-package datawave.query.jexl.nodes;
-
-import org.apache.commons.jexl3.parser.ASTAndNode;
-import org.apache.commons.jexl3.parser.ASTOrNode;
-import org.apache.commons.jexl3.parser.ASTReference;
-import org.apache.commons.jexl3.parser.ASTReferenceExpression;
-import org.apache.commons.jexl3.parser.JexlNode;
-import org.apache.commons.jexl3.parser.JexlNodes;
-import org.apache.commons.jexl3.parser.ParserTreeConstants;
-
-/**
- * Provides default node cost calculations based on the Jexl node id
- */
-public class DefaultNodeCostComparator extends NodeCostComparator {
-
- /**
- *
- * @param node
- * an arbitrary JexlNode
- * @return the node cost
- */
- @Override
- protected int getCostIndex(JexlNode node) {
- if (node.jjtGetNumChildren() == 1 && (node instanceof ASTReference || node instanceof ASTReferenceExpression)) {
- QueryPropertyMarker.Instance instance = QueryPropertyMarker.findInstance(node);
- if (instance.isAnyType()) {
- return Integer.MAX_VALUE - 4;
- }
- return getCostIndex(node.jjtGetChild(0));
- } else if (node instanceof ASTOrNode) {
- int sum = 0;
- for (int i = 0; i < node.jjtGetNumChildren(); i++) {
- sum += getCostIndex(node.jjtGetChild(i));
- }
- return sum;
- } else if (node instanceof ASTAndNode) {
- int lowest = Integer.MAX_VALUE;
- for (int i = 0; i < node.jjtGetNumChildren(); i++) {
- int cost = getCostIndex(node.jjtGetChild(i));
- if (cost < lowest)
- lowest = cost;
- }
- return lowest;
- } else {
- return getNodeScore(node);
- }
- }
-
- /**
- * Wrapper around {@link JexlNodes#id(JexlNode)} so that we can boost the score of negated terms
- *
- * @param node
- * any JexlNode
- * @return a score for the node
- */
- private int getNodeScore(JexlNode node) {
- int id = JexlNodes.id(node);
- switch (id) {
- case ParserTreeConstants.JJTNENODE:
- return Integer.MAX_VALUE - 3;
- case ParserTreeConstants.JJTNRNODE:
- return Integer.MAX_VALUE - 2;
- case ParserTreeConstants.JJTNOTNODE:
- return Integer.MAX_VALUE - 1;
- default:
- return id;
- }
- }
-}
diff --git a/warehouse/query-core/src/main/java/datawave/query/jexl/nodes/FieldCostComparator.java b/warehouse/query-core/src/main/java/datawave/query/jexl/nodes/FieldCostComparator.java
new file mode 100644
index 00000000000..eb3d1e2956c
--- /dev/null
+++ b/warehouse/query-core/src/main/java/datawave/query/jexl/nodes/FieldCostComparator.java
@@ -0,0 +1,55 @@
+package datawave.query.jexl.nodes;
+
+import java.util.Map;
+
+import org.apache.commons.jexl3.parser.ASTFunctionNode;
+import org.apache.commons.jexl3.parser.ASTNENode;
+import org.apache.commons.jexl3.parser.ASTNRNode;
+import org.apache.commons.jexl3.parser.ASTNotNode;
+import org.apache.commons.jexl3.parser.JexlNode;
+
+import datawave.query.jexl.JexlASTHelper;
+import datawave.query.util.count.CountMap;
+
+/**
+ * Comparator that operates on field cardinality
+ */
+public class FieldCostComparator extends AbstractNodeCostComparator {
+
+ /**
+ * Constructor that accepts a {@link CountMap}
+ *
+ * @param counts
+ * the count map
+ */
+ public FieldCostComparator(CountMap counts) {
+ this(counts.getCounts());
+ }
+
+ /**
+ * Constructor that accepts a {@link Map} of counts
+ *
+ * @param counts
+ * the count map
+ */
+ public FieldCostComparator(Map<String,Long> counts) {
+ super(counts);
+ }
+
+ /**
+ * The {@link FieldCostComparator} uses a node's identifier to calculate cost
+ *
+ * @param node
+ * a JexlNode
+ * @return the node key
+ */
+ @Override
+ public String getNodeKey(JexlNode node) {
+ if (node instanceof ASTNotNode || node instanceof ASTNENode || node instanceof ASTNRNode || node instanceof ASTFunctionNode) {
+ // negations and functions never resolve a count key; they fall back to the default node score
+ return null;
+ }
+ return JexlASTHelper.getIdentifier(node);
+ }
+
+}
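As a sketch, the counts are just field cardinalities keyed by field name (values hypothetical; the Map<String,Long> shape is assumed from the deleted FieldOrTermNodeCostComparator):

    import java.util.HashMap;
    import java.util.Map;

    import datawave.query.jexl.nodes.FieldCostComparator;

    public class FieldCostSketch {
        public static void main(String[] args) {
            // BAR is far rarer than FOO, so a BAR term should sort before a FOO term
            Map<String,Long> counts = new HashMap<>();
            counts.put("FOO", 1_000_000L);
            counts.put("BAR", 150L);

            // use like any other JexlNodeComparator, e.g. Arrays.sort(children, comparator)
            FieldCostComparator comparator = new FieldCostComparator(counts);
        }
    }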
diff --git a/warehouse/query-core/src/main/java/datawave/query/jexl/nodes/FieldOrTermNodeCostComparator.java b/warehouse/query-core/src/main/java/datawave/query/jexl/nodes/FieldOrTermNodeCostComparator.java
deleted file mode 100644
index 4e66d8e9599..00000000000
--- a/warehouse/query-core/src/main/java/datawave/query/jexl/nodes/FieldOrTermNodeCostComparator.java
+++ /dev/null
@@ -1,116 +0,0 @@
-package datawave.query.jexl.nodes;
-
-import java.util.Map;
-
-import org.apache.commons.jexl3.parser.ASTAndNode;
-import org.apache.commons.jexl3.parser.ASTFunctionNode;
-import org.apache.commons.jexl3.parser.ASTNENode;
-import org.apache.commons.jexl3.parser.ASTNRNode;
-import org.apache.commons.jexl3.parser.ASTNotNode;
-import org.apache.commons.jexl3.parser.ASTOrNode;
-import org.apache.commons.jexl3.parser.ASTReference;
-import org.apache.commons.jexl3.parser.ASTReferenceExpression;
-import org.apache.commons.jexl3.parser.JexlNode;
-import org.apache.commons.jexl3.parser.JexlNodes;
-import org.apache.commons.jexl3.parser.ParserTreeConstants;
-
-import datawave.query.jexl.JexlASTHelper;
-import datawave.query.jexl.visitors.JexlStringBuildingVisitor;
-
-/**
- * Orders nodes based on field or term counts
- */
-public class FieldOrTermNodeCostComparator extends NodeCostComparator {
-
- private final boolean isFieldCount;
- private static final long NODE_ID_MULTIPLIER = 5000;
- private final Map counts;
-
- public FieldOrTermNodeCostComparator(Map counts, boolean isFieldCount) {
- this.counts = counts;
- this.isFieldCount = isFieldCount;
- }
-
- @Override
- int getCostIndex(JexlNode node) {
- if (node.jjtGetNumChildren() == 1 && (node instanceof ASTReference || node instanceof ASTReferenceExpression)) {
- return getCostIndex(node.jjtGetChild(0));
- } else if (node instanceof ASTOrNode) {
- int sum = 0;
- for (int i = 0; i < node.jjtGetNumChildren(); i++) {
- sum += getCostIndex(node.jjtGetChild(i));
- }
- return sum;
- } else if (QueryPropertyMarker.findInstance(node).isAnyType()) {
- return Integer.MAX_VALUE;
- } else if (node instanceof ASTAndNode) {
- int lowest = Integer.MAX_VALUE;
- for (int i = 0; i < node.jjtGetNumChildren(); i++) {
- int cost = getCostIndex(node.jjtGetChild(i));
- if (cost < lowest) {
- lowest = cost;
- }
- }
- return lowest;
- } else {
- return getCostForLeaf(node);
- }
- }
-
- /**
- * Get the cost for a leaf according to the count map.
- *
- * The extra code to handle integer overflows is due to term counts in the global index being a Long but Java's {@link Comparable#compareTo(Object)} returns
- * an integer.
- *
- * @param node
- * a JexlNode
- * @return an integer used to compare nodes
- */
- private int getCostForLeaf(JexlNode node) {
- String key = getNodeKey(node);
- long value = counts.getOrDefault(key, getNodeScore(node));
- if (value > Integer.MAX_VALUE) {
- value = Integer.MAX_VALUE;
- }
- return (int) value;
- }
-
- /**
- * Generate a key for the count map. It's either the field, or the whole node.
- *
- * @param node
- * a JexlNode
- * @return a node key
- */
- private String getNodeKey(JexlNode node) {
- if (node instanceof ASTNotNode || node instanceof ASTNENode || node instanceof ASTNRNode || node instanceof ASTFunctionNode) {
- return "NO_KEY";
- } else if (isFieldCount) {
- return JexlASTHelper.getIdentifier(node);
- } else {
- return JexlStringBuildingVisitor.buildQueryWithoutParse(node);
- }
- }
-
- /**
- * Wrapper around {@link JexlNodes#id(JexlNode)} so that we can boost the score of negated terms
- *
- * @param node
- * any JexlNode
- * @return a score for the node
- */
- private long getNodeScore(JexlNode node) {
- int id = JexlNodes.id(node);
- switch (id) {
- case ParserTreeConstants.JJTNENODE:
- return Integer.MAX_VALUE - 3L;
- case ParserTreeConstants.JJTNRNODE:
- return Integer.MAX_VALUE - 2L;
- case ParserTreeConstants.JJTNOTNODE:
- return Integer.MAX_VALUE - 1L;
- default:
- return id * NODE_ID_MULTIPLIER;
- }
- }
-}
diff --git a/warehouse/query-core/src/main/java/datawave/query/jexl/nodes/JexlNodeComparator.java b/warehouse/query-core/src/main/java/datawave/query/jexl/nodes/JexlNodeComparator.java
new file mode 100644
index 00000000000..4796e20b5a6
--- /dev/null
+++ b/warehouse/query-core/src/main/java/datawave/query/jexl/nodes/JexlNodeComparator.java
@@ -0,0 +1,72 @@
+package datawave.query.jexl.nodes;
+
+import java.util.Comparator;
+
+import org.apache.commons.jexl3.parser.JexlNode;
+
+import datawave.query.jexl.JexlASTHelper;
+
+/**
+ * Comparator for JexlNodes.
+ *
+ * Implementing classes may prioritize different features when sorting: for example, leaves before junctions, EQ nodes before ER nodes, or lexicographic
+ * order by field and value.
+ *
+ * EQ < ER < Functions
+ */
+public abstract class JexlNodeComparator implements Comparator<JexlNode> {
+
+ @Override
+ public int compare(JexlNode left, JexlNode right) {
+ int leftCost = getCostIndex(JexlASTHelper.dereference(left));
+ int rightCost = getCostIndex(JexlASTHelper.dereference(right));
+
+ return Integer.compare(leftCost, rightCost);
+ }
+
+ /**
+ * Calculates a cost for the provided node
+ *
+ * @param node
+ * an arbitrary JexlNode
+ * @return the integer cost
+ */
+ abstract int getCostIndex(JexlNode node);
+
+ /**
+ * Get the cost for a union by summing the cost of each child
+ *
+ * @param node
+ * the union
+ * @return the cost
+ */
+ protected int getCostForUnion(JexlNode node) {
+ int cost = 0;
+ for (int i = 0; i < node.jjtGetNumChildren(); i++) {
+ cost += getCostIndex(node.jjtGetChild(i));
+ // check for overflows
+ if (cost == Integer.MAX_VALUE || cost < 0) {
+ return Integer.MAX_VALUE;
+ }
+ }
+ return cost;
+ }
+
+ /**
+ * Get the cost for an intersection by taking the lowest cost of all children
+ *
+ * @param node
+ * the intersection
+ * @return the cost
+ */
+ protected int getCostForIntersection(JexlNode node) {
+ int cost = Integer.MAX_VALUE;
+ for (int i = 0; i < node.jjtGetNumChildren(); i++) {
+ int childCost = getCostIndex(node.jjtGetChild(i));
+ if (childCost < cost) {
+ cost = childCost;
+ }
+ }
+ return cost;
+ }
+}
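To make the union and intersection rules concrete, a toy subclass (placed in the same package, since getCostIndex is package-private) that scores every leaf as 1 will cost a union by its width and an intersection by its cheapest child:

    package datawave.query.jexl.nodes;

    import org.apache.commons.jexl3.parser.ASTAndNode;
    import org.apache.commons.jexl3.parser.ASTOrNode;
    import org.apache.commons.jexl3.parser.JexlNode;

    /**
     * Toy comparator: (A || B) costs 2 while (A && B) costs 1, so intersections sort first.
     */
    public class UnitCostComparator extends JexlNodeComparator {

        @Override
        int getCostIndex(JexlNode node) {
            if (node instanceof ASTOrNode) {
                return getCostForUnion(node); // sum of child costs
            } else if (node instanceof ASTAndNode) {
                return getCostForIntersection(node); // min of child costs
            }
            return 1; // every leaf costs the same in this toy model
        }
    }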
diff --git a/warehouse/query-core/src/main/java/datawave/query/jexl/nodes/JunctionComparator.java b/warehouse/query-core/src/main/java/datawave/query/jexl/nodes/JunctionComparator.java
new file mode 100644
index 00000000000..859d117700c
--- /dev/null
+++ b/warehouse/query-core/src/main/java/datawave/query/jexl/nodes/JunctionComparator.java
@@ -0,0 +1,24 @@
+package datawave.query.jexl.nodes;
+
+import org.apache.commons.jexl3.parser.ASTAndNode;
+import org.apache.commons.jexl3.parser.ASTOrNode;
+import org.apache.commons.jexl3.parser.JexlNode;
+
+/**
+ * Comparator that pushes single leaf nodes to the left and junctions to the right
+ *
+ * Note: should only be used to break ties in other comparators.
+ */
+public class JunctionComparator extends JexlNodeComparator {
+
+ @Override
+ public int getCostIndex(JexlNode node) {
+ if (node instanceof ASTAndNode && !QueryPropertyMarker.findInstance(node).isAnyType()) {
+ return 3;
+ } else if (node instanceof ASTOrNode) {
+ return 2;
+ } else {
+ return 1;
+ }
+ }
+}
diff --git a/warehouse/query-core/src/main/java/datawave/query/jexl/nodes/LexicographicalNodeComparator.java b/warehouse/query-core/src/main/java/datawave/query/jexl/nodes/LexicographicalNodeComparator.java
new file mode 100644
index 00000000000..37e183c46bf
--- /dev/null
+++ b/warehouse/query-core/src/main/java/datawave/query/jexl/nodes/LexicographicalNodeComparator.java
@@ -0,0 +1,25 @@
+package datawave.query.jexl.nodes;
+
+import org.apache.commons.jexl3.parser.JexlNode;
+
+import datawave.query.jexl.visitors.JexlStringBuildingVisitor;
+
+/**
+ * Sorts nodes according to the node string.
+ *
+ * Note: this comparator is intended to break ties between nodes of similar type or cost. Running this comparator in isolation will produce unexpected results.
+ */
+public class LexicographicalNodeComparator extends JexlNodeComparator {
+
+ @Override
+ public int compare(JexlNode left, JexlNode right) {
+ String leftQuery = JexlStringBuildingVisitor.buildQuery(left);
+ String rightQuery = JexlStringBuildingVisitor.buildQuery(right);
+ return leftQuery.compareTo(rightQuery);
+ }
+
+ @Override
+ public int getCostIndex(JexlNode node) {
+ throw new IllegalStateException("Not implemented");
+ }
+}
diff --git a/warehouse/query-core/src/main/java/datawave/query/jexl/nodes/NodeCostComparator.java b/warehouse/query-core/src/main/java/datawave/query/jexl/nodes/NodeCostComparator.java
deleted file mode 100644
index a238e5c6007..00000000000
--- a/warehouse/query-core/src/main/java/datawave/query/jexl/nodes/NodeCostComparator.java
+++ /dev/null
@@ -1,40 +0,0 @@
-package datawave.query.jexl.nodes;
-
-import java.util.Comparator;
-
-import org.apache.commons.jexl3.parser.JexlNode;
-
-import datawave.query.jexl.visitors.JexlStringBuildingVisitor;
-
-/**
- * Compare nodes based on arbitrary cost.
- *
- * EQ < ER < Functions
- */
-public abstract class NodeCostComparator implements Comparator {
-
- @Override
- public int compare(JexlNode left, JexlNode right) {
- int leftCost = getCostIndex(left);
- int rightCost = getCostIndex(right);
-
- int result = Integer.compare(leftCost, rightCost);
- if (result == 0) {
- // if comparing by field cost (same field) provide an opportunity to sort alphabetically
- result = JexlStringBuildingVisitor.buildQuery(left).compareTo(JexlStringBuildingVisitor.buildQuery(right));
- }
-
- return result;
- }
-
- // Evaluate OR nodes last, then And nodes, then nodes by node id
-
- /**
- * Calculates a cost for the provided node
- *
- * @param node
- * an arbitrary JexlNode
- * @return the integer cost
- */
- abstract int getCostIndex(JexlNode node);
-}
diff --git a/warehouse/query-core/src/main/java/datawave/query/jexl/nodes/TermCostComparator.java b/warehouse/query-core/src/main/java/datawave/query/jexl/nodes/TermCostComparator.java
new file mode 100644
index 00000000000..ae3b62c2273
--- /dev/null
+++ b/warehouse/query-core/src/main/java/datawave/query/jexl/nodes/TermCostComparator.java
@@ -0,0 +1,47 @@
+package datawave.query.jexl.nodes;
+
+import java.util.Map;
+
+import org.apache.commons.jexl3.parser.JexlNode;
+
+import datawave.query.jexl.visitors.JexlStringBuildingVisitor;
+import datawave.query.util.count.CountMap;
+
+/**
+ * Comparator that operates on term cardinality
+ */
+public class TermCostComparator extends AbstractNodeCostComparator {
+
+ /**
+ * Constructor that accepts a {@link CountMap}
+ *
+ * @param counts
+ * the count map
+ */
+ public TermCostComparator(CountMap counts) {
+ this(counts.getCounts());
+ }
+
+ /**
+ * Constructor that accepts a {@link Map} of counts
+ *
+ * @param counts
+ * the count map
+ */
+ public TermCostComparator(Map<String,Long> counts) {
+ super(counts);
+ }
+
+ /**
+ * The {@link TermCostComparator} uses the whole node string to calculate cost
+ *
+ * @param node
+ * a JexlNode
+ * @return the node key
+ */
+ @Override
+ public String getNodeKey(JexlNode node) {
+ return JexlStringBuildingVisitor.buildQuery(node);
+ }
+
+}
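A companion sketch for term counts, where the keys are whole node strings as produced by JexlStringBuildingVisitor.buildQuery (counts hypothetical):

    import java.util.HashMap;
    import java.util.Map;

    import datawave.query.jexl.nodes.TermCostComparator;

    public class TermCostSketch {
        public static void main(String[] args) {
            Map<String,Long> counts = new HashMap<>();
            counts.put("FOO == 'bar'", 25_000L);
            counts.put("FOO == 'baz'", 12L);

            // FOO == 'baz' sorts before FOO == 'bar' because it matches far fewer terms
            TermCostComparator comparator = new TermCostComparator(counts);
        }
    }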
diff --git a/warehouse/query-core/src/main/java/datawave/query/jexl/visitors/IngestTypeVisitor.java b/warehouse/query-core/src/main/java/datawave/query/jexl/visitors/IngestTypeVisitor.java
index 6dd428cb411..8b576303a01 100644
--- a/warehouse/query-core/src/main/java/datawave/query/jexl/visitors/IngestTypeVisitor.java
+++ b/warehouse/query-core/src/main/java/datawave/query/jexl/visitors/IngestTypeVisitor.java
@@ -391,7 +391,7 @@ private Set<String> getFieldsForFunctionNode(ASTFunctionNode node) {
if (visitor.namespace().equals(CONTENT_FUNCTION_NAMESPACE)) {
// all content function fields are added
ContentFunctionsDescriptor.ContentJexlArgumentDescriptor contentDescriptor = new ContentFunctionsDescriptor().getArgumentDescriptor(node);
- return contentDescriptor.fieldsAndTerms(Collections.emptySet(), Collections.emptySet(), Collections.emptySet(), null)[0];
+ return contentDescriptor.fieldsAndTerms(Collections.emptySet(), Collections.emptySet(), Collections.emptySet(), null).getFields();
} else {
JexlArgumentDescriptor descriptor = JexlFunctionArgumentDescriptorFactory.F.getArgumentDescriptor(node);
if (descriptor == null) {
diff --git a/warehouse/query-core/src/main/java/datawave/query/jexl/visitors/IsNotNullPruningVisitor.java b/warehouse/query-core/src/main/java/datawave/query/jexl/visitors/IsNotNullPruningVisitor.java
index 4f6a8c6fb2c..49dbba0a2c9 100644
--- a/warehouse/query-core/src/main/java/datawave/query/jexl/visitors/IsNotNullPruningVisitor.java
+++ b/warehouse/query-core/src/main/java/datawave/query/jexl/visitors/IsNotNullPruningVisitor.java
@@ -217,17 +217,25 @@ private JexlNode pruneNode(JexlNode node, Set<String> fields) {
* @return the original node, or null if it is pruned
*/
private JexlNode pruneUnion(JexlNode node, Set<String> fields) {
+ // if the union contains an isNotNull function whose field is anchored by an equality node elsewhere in the query,
+ // then the union is guaranteed to evaluate to true and can be pruned in its entirety.
+ boolean willPrune = false;
+
for (int i = 0; i < node.jjtGetNumChildren(); i++) {
JexlNode deref = JexlASTHelper.dereference(node.jjtGetChild(i));
- if (!isIsNotNullFunction(deref)) {
- return node;
+ if (isIsNotNullFunction(deref) && !willPrune) {
+ String field = fieldForNode(deref);
+ if (fields.contains(field)) {
+ willPrune = true;
+ }
}
- String field = fieldForNode(deref);
- if (!fields.contains(field)) {
- return node;
- }
}
+
+ if (!willPrune) {
+ return node;
+ }
+
return null;
}
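A hypothetical query illustrating the new behavior: one anchored isNotNull term is now enough to drop the whole union, even when the union mixes in other fields:

    // intersection with an anchoring equality term:
    //   FOO == 'bar' && (filter:isNotNull(FOO) || BAR == 'baz')
    // FOO == 'bar' can only be true when FOO is non-null, so the union is always
    // satisfied and the query reduces to:
    //   FOO == 'bar'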
diff --git a/warehouse/query-core/src/main/java/datawave/query/jexl/visitors/order/OrderByCostVisitor.java b/warehouse/query-core/src/main/java/datawave/query/jexl/visitors/order/OrderByCostVisitor.java
index b2821874110..1b4f0cddff3 100644
--- a/warehouse/query-core/src/main/java/datawave/query/jexl/visitors/order/OrderByCostVisitor.java
+++ b/warehouse/query-core/src/main/java/datawave/query/jexl/visitors/order/OrderByCostVisitor.java
@@ -10,30 +10,34 @@
import org.apache.commons.jexl3.parser.JexlNode;
import org.apache.commons.jexl3.parser.JexlNodes;
import org.apache.commons.jexl3.parser.ParseException;
+import org.apache.commons.jexl3.parser.ParserTreeConstants;
import org.apache.log4j.Logger;
import datawave.query.jexl.JexlASTHelper;
-import datawave.query.jexl.nodes.DefaultNodeCostComparator;
-import datawave.query.jexl.nodes.FieldOrTermNodeCostComparator;
-import datawave.query.jexl.nodes.NodeCostComparator;
+import datawave.query.jexl.nodes.DefaultJexlNodeComparator;
+import datawave.query.jexl.nodes.FieldCostComparator;
+import datawave.query.jexl.nodes.JexlNodeComparator;
import datawave.query.jexl.nodes.QueryPropertyMarker;
+import datawave.query.jexl.nodes.TermCostComparator;
import datawave.query.jexl.visitors.BaseVisitor;
import datawave.query.jexl.visitors.JexlStringBuildingVisitor;
/**
- * Orders query nodes by cost.
+ * Orders query nodes by cost using one or more {@link JexlNodeComparator}s.
*
- * Cost is calculated based on field counts, term counts, or a default cost based on the node id {@link org.apache.commons.jexl3.parser.ParserTreeConstants}.
+ * The {@link DefaultJexlNodeComparator} orders a query by the cost implied by the node id; see {@link ParserTreeConstants}. In general an EQ node is
+ * faster to resolve than an ER node or a marker node.
*
- * In general an EQ node is faster to resolve than an ER node.
+ * The {@link FieldCostComparator} orders a query based on field cardinality. This cardinality can be gathered from the metadata table across the entire
+ * date range of the query, or the cardinality can be gathered from the global index and applied on a per-shard basis.
*
- * In general an ER node is faster to resolve than a function node.
+ * The {@link TermCostComparator} orders a query based on the term cardinality. This is gathered from the global index and applied on a per-shard basis.
*/
public class OrderByCostVisitor extends BaseVisitor {
private static final Logger log = Logger.getLogger(OrderByCostVisitor.class);
- private NodeCostComparator costComparator;
+ private JexlNodeComparator comparator;
private final boolean isFieldMap;
private final Map<String,Long> countMap;
@@ -50,8 +54,7 @@ public static String order(String query) {
script = order(script);
return JexlStringBuildingVisitor.buildQueryWithoutParse(script);
} catch (ParseException e) {
- log.error("Could not order query by cost: " + query);
- e.printStackTrace();
+ log.error("Could not order query by cost: " + query, e);
}
return null;
}
@@ -182,7 +185,7 @@ private Object visitJunction(JexlNode node, Object data) {
QueryPropertyMarker.Instance instance = QueryPropertyMarker.findInstance(node);
if (!instance.isAnyType()) {
JexlNode[] children = JexlNodes.getChildren(node);
- Arrays.sort(children, getCostComparator());
+ Arrays.sort(children, getComparator());
JexlNodes.setChildren(node, children);
node.childrenAccept(this, data);
@@ -190,15 +193,19 @@ private Object visitJunction(JexlNode node, Object data) {
return data;
}
- private NodeCostComparator getCostComparator() {
- if (costComparator == null) {
+ private JexlNodeComparator getComparator() {
+ if (comparator == null) {
if (countMap != null) {
- costComparator = new FieldOrTermNodeCostComparator(countMap, isFieldMap);
+ if (isFieldMap) {
+ comparator = new FieldCostComparator(countMap);
+ } else {
+ comparator = new TermCostComparator(countMap);
+ }
} else {
- costComparator = new DefaultNodeCostComparator();
+ comparator = new DefaultJexlNodeComparator();
}
}
- return costComparator;
+ return comparator;
}
}
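A minimal sketch of the implied-cost entry point shown above (query string hypothetical):

    import datawave.query.jexl.visitors.order.OrderByCostVisitor;

    public class OrderSketch {
        public static void main(String[] args) {
            // the filter function is implied to be costlier than the equality term,
            // so the EQ node should move to the front of the intersection
            String ordered = OrderByCostVisitor.order("filter:includeRegex(FOO, 'ba.*') && FOO == 'bar'");
            System.out.println(ordered);
            // expected: FOO == 'bar' && filter:includeRegex(FOO, 'ba.*')
        }
    }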
diff --git a/warehouse/query-core/src/main/java/datawave/query/language/functions/jexl/NoExpansion.java b/warehouse/query-core/src/main/java/datawave/query/language/functions/jexl/NoExpansion.java
index d601d49b8c9..3a06f4ac411 100644
--- a/warehouse/query-core/src/main/java/datawave/query/language/functions/jexl/NoExpansion.java
+++ b/warehouse/query-core/src/main/java/datawave/query/language/functions/jexl/NoExpansion.java
@@ -4,26 +4,27 @@
import java.util.ArrayList;
import java.util.List;
+import datawave.query.jexl.functions.QueryFunctions;
import datawave.query.language.functions.QueryFunction;
import datawave.webservice.query.exception.BadRequestQueryException;
import datawave.webservice.query.exception.DatawaveErrorCode;
/**
* This function accepts a comma separated list of fields to be excluded from QueryModel expansion. The purpose is to provide users with an easy way to avoid
- * undesirable query model expansions.
- *
- * Note: The exclude is only applied to the fields in the original query. An original field can be expanded into an excluded field.
+ * undesirable query model expansions.
+ * Note: The exclusion is only applied to the fields in the original query. An original field can be expanded into an excluded field.
*/
public class NoExpansion extends JexlQueryFunction {
public NoExpansion() {
- super("noExpansion", new ArrayList<>());
+ super(QueryFunctions.NO_EXPANSION, new ArrayList<>());
}
@Override
public void validate() throws IllegalArgumentException {
- if (this.parameterList.size() != 1) {
- BadRequestQueryException qe = new BadRequestQueryException(DatawaveErrorCode.INVALID_FUNCTION_ARGUMENTS, MessageFormat.format("{0}", this.name));
+ if (this.parameterList.isEmpty()) {
+ BadRequestQueryException qe = new BadRequestQueryException(DatawaveErrorCode.INVALID_FUNCTION_ARGUMENTS,
+ MessageFormat.format("{0} requires at least one argument", this.name));
throw new IllegalArgumentException(qe);
}
}
@@ -35,7 +36,19 @@ public QueryFunction duplicate() {
@Override
public String toString() {
- List params = getParameterList();
- return "f:noExpansion(" + String.join("", params) + ")";
+ StringBuilder sb = new StringBuilder();
+
+ sb.append(QueryFunctions.QUERY_FUNCTION_NAMESPACE).append(':').append(QueryFunctions.NO_EXPANSION);
+ if (parameterList.isEmpty()) {
+ sb.append("()");
+ } else {
+ char separator = '(';
+ for (String param : parameterList) {
+ sb.append(separator).append(escapeString(param));
+ separator = ',';
+ }
+ sb.append(')');
+ }
+ return sb.toString();
}
}
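For reference, a sketch of the strings the rebuilt toString should produce (field names hypothetical, modulo any quoting applied by escapeString):

    // one argument:  f:noExpansion(FIELD_A)
    // two arguments: f:noExpansion(FIELD_A,FIELD_B)
    // the old builder joined parameters with an empty separator, which ran
    // multiple fields together: f:noExpansion(FIELD_AFIELD_B)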
diff --git a/warehouse/query-core/src/main/java/datawave/query/planner/DefaultQueryPlanner.java b/warehouse/query-core/src/main/java/datawave/query/planner/DefaultQueryPlanner.java
index 6df09d7646c..72d8a852e3d 100644
--- a/warehouse/query-core/src/main/java/datawave/query/planner/DefaultQueryPlanner.java
+++ b/warehouse/query-core/src/main/java/datawave/query/planner/DefaultQueryPlanner.java
@@ -2767,8 +2767,10 @@ public Tuple2<CloseableIterable<QueryData>,Boolean> getQueryRanges(ScannerFactor
}
}
- if (config.isSortQueryBeforeGlobalIndex()) {
+ if (config.isSortQueryPreIndexWithFieldCounts()) {
config.setQueryTree(timedSortQueryBeforeGlobalIndex(config, getMetadataHelper()));
+ } else if (config.isSortQueryPreIndexWithImpliedCounts()) {
+ config.setQueryTree(timedSortQueryBeforeGlobalIndex(config));
}
// if a simple examination of the query has not forced a full table
@@ -2863,12 +2865,22 @@ protected ASTJexlScript timedSortQueryBeforeGlobalIndex(ShardQueryConfiguration
Map<String,Long> counts = metadataHelper.getCountsForFieldsInDateRange(fields, datatypes, config.getBeginDate(), config.getEndDate());
if (!counts.isEmpty()) {
return OrderByCostVisitor.orderByFieldCount(config.getQueryTree(), counts);
+ } else {
+ // fall back to sorting by implied cardinality
+ return OrderByCostVisitor.order(config.getQueryTree());
}
}
return config.getQueryTree();
});
}
+ protected ASTJexlScript timedSortQueryBeforeGlobalIndex(ShardQueryConfiguration config) throws DatawaveQueryException {
+ return visitorManager.timedVisit(config.getTimers(), "SortQueryBeforeGlobalIndex", () -> {
+ // sort by implied cardinality
+ return OrderByCostVisitor.order(config.getQueryTree());
+ });
+ }
+
private TypeMetadata getTypeMetadata() {
try {
return metadataHelper.getTypeMetadata();
diff --git a/warehouse/query-core/src/main/java/datawave/query/postprocessing/tf/DocumentKeysFunction.java b/warehouse/query-core/src/main/java/datawave/query/postprocessing/tf/DocumentKeysFunction.java
index 18bfd3bc275..ab5150b526c 100644
--- a/warehouse/query-core/src/main/java/datawave/query/postprocessing/tf/DocumentKeysFunction.java
+++ b/warehouse/query-core/src/main/java/datawave/query/postprocessing/tf/DocumentKeysFunction.java
@@ -48,7 +48,7 @@ protected void populateContentFunctions(JexlNode node) {
ContentFunctionsDescriptor descriptor = new ContentFunctionsDescriptor();
ContentJexlArgumentDescriptor argsDescriptor;
- Set[] fieldsAndTerms;
+ ContentFunctionsDescriptor.FieldTerms fieldsAndTerms;
JexlNode parent;
String field;
@@ -67,12 +67,12 @@ protected void populateContentFunctions(JexlNode node) {
// content, tf, and indexed fields are not actually needed to extract fields from the function node
fieldsAndTerms = argsDescriptor.fieldsAndTerms(Collections.emptySet(), Collections.emptySet(), Collections.emptySet(), null);
- if (fieldsAndTerms[0].size() != 1) {
+ if (fieldsAndTerms.totalFields() != 1) {
throw new IllegalStateException("content function had more than one field");
}
- field = JexlASTHelper.deconstructIdentifier(fieldsAndTerms[0].iterator().next());
- ContentFunction contentFunction = new ContentFunction(field, fieldsAndTerms[1]);
+ field = JexlASTHelper.deconstructIdentifier(fieldsAndTerms.getFields().iterator().next());
+ ContentFunction contentFunction = new ContentFunction(field, fieldsAndTerms.getTerms());
contentFunctions.put(contentFunction.getField(), contentFunction);
if (isFunctionNegated(f)) {
diff --git a/warehouse/query-core/src/main/java/datawave/query/scheduler/PushdownFunction.java b/warehouse/query-core/src/main/java/datawave/query/scheduler/PushdownFunction.java
index 9f168e63e32..ed032b43bdb 100644
--- a/warehouse/query-core/src/main/java/datawave/query/scheduler/PushdownFunction.java
+++ b/warehouse/query-core/src/main/java/datawave/query/scheduler/PushdownFunction.java
@@ -120,6 +120,10 @@ public List<ScannerChunk> apply(QueryData qd) {
options.setQueryConfig(this.config);
+ String tableName = tableId.canonical();
+ options.applyExecutionHints(tableName, config.getTableHints());
+ options.applyConsistencyLevel(tableName, config.getTableConsistencyLevels());
+
chunks.add(new ScannerChunk(options, plan.getRanges(), qd, server));
} catch (Exception e) {
log.error(e);
diff --git a/warehouse/query-core/src/main/java/datawave/query/tables/RangeStreamScanner.java b/warehouse/query-core/src/main/java/datawave/query/tables/RangeStreamScanner.java
index 94a332e9772..1f3623a3ae1 100644
--- a/warehouse/query-core/src/main/java/datawave/query/tables/RangeStreamScanner.java
+++ b/warehouse/query-core/src/main/java/datawave/query/tables/RangeStreamScanner.java
@@ -28,7 +28,7 @@
import org.apache.accumulo.core.data.Range;
import org.apache.accumulo.core.data.Value;
import org.apache.accumulo.core.security.Authorizations;
-import org.apache.accumulo.core.util.PeekingIterator;
+import org.apache.commons.collections4.iterators.PeekingIterator;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.io.Text;
import org.apache.log4j.Logger;
diff --git a/warehouse/query-core/src/main/java/datawave/query/tables/ScannerFactory.java b/warehouse/query-core/src/main/java/datawave/query/tables/ScannerFactory.java
index 5366418ec48..415d6548f73 100644
--- a/warehouse/query-core/src/main/java/datawave/query/tables/ScannerFactory.java
+++ b/warehouse/query-core/src/main/java/datawave/query/tables/ScannerFactory.java
@@ -50,8 +50,11 @@ public class ScannerFactory {
protected ResourceQueue scanQueue = null;
protected ShardQueryConfiguration config = null;
- protected Map consistencyByTable = new HashMap<>();
- protected Map> hintsByTable = new HashMap<>();
+ // consistency and execution hints can be mapped to table names or functional names
+ // for example, 'shardIndex' might map to a default executor pool for the shard index table
+ // while 'expansion' might map to a separate executor pool on the shard index
+ protected Map<String,ScannerBase.ConsistencyLevel> consistencyLevelMap = new HashMap<>();
+ protected Map<String,Map<String,String>> executionHintMap = new HashMap<>();
private static final Logger log = Logger.getLogger(ScannerFactory.class);
@@ -94,7 +97,7 @@ public ScannerFactory(AccumuloClient client, int queueSize) {
}
/**
- * Method that allows a ScannerFactory to be updated by a config after initialization
+ * Method that allows a ScannerFactory to use scan execution and consistency hints from the provided {@link GenericQueryConfiguration}.
*
* @param genericConfig
* a {@link GenericQueryConfiguration}
@@ -105,12 +108,12 @@ public void updateConfigs(GenericQueryConfiguration genericConfig) {
Map<String,ScannerBase.ConsistencyLevel> consistencyLevels = genericConfig.getTableConsistencyLevels();
if (consistencyLevels != null && !consistencyLevels.isEmpty()) {
- this.consistencyByTable = genericConfig.getTableConsistencyLevels();
+ this.consistencyLevelMap = genericConfig.getTableConsistencyLevels();
}
Map<String,Map<String,String>> hints = genericConfig.getTableHints();
if (hints != null && !hints.isEmpty()) {
- this.hintsByTable = genericConfig.getTableHints();
+ this.executionHintMap = genericConfig.getTableHints();
}
int numThreads = DEFAULT_MAX_THREADS;
@@ -139,6 +142,7 @@ public void updateConfigs(GenericQueryConfiguration genericConfig) {
public Scanner newSingleScanner(String tableName, Set<Authorizations> auths, Query query) throws TableNotFoundException {
if (open.get()) {
Scanner bs = QueryScannerHelper.createScannerWithoutInfo(client, tableName, auths, query);
+
applyConfigs(bs, tableName);
log.debug("Created scanner " + System.identityHashCode(bs));
@@ -160,10 +164,47 @@ public Scanner newSingleScanner(String tableName, Set<Authorizations> auths, Que
}
}
+ /**
+ * Create a new {@link BatchScanner} using the table name as the execution hint key
+ *
+ * @param tableName
+ * the table name
+ * @param auths
+ * the set of authorizations
+ * @param threads
+ * the number of threads
+ * @param query
+ * the Query
+ * @return a BatchScanner
+ * @throws TableNotFoundException
+ * if no table exists
+ */
public BatchScanner newScanner(String tableName, Set<Authorizations> auths, int threads, Query query) throws TableNotFoundException {
+ return newScanner(tableName, auths, threads, query, tableName);
+ }
+
+ /**
+ * Creates a new {@link BatchScanner} with execution hints
+ *
+ * @param tableName
+ * the table name
+ * @param auths
+ * the set of authorizations
+ * @param threads
+ * the number of threads to use
+ * @param query
+ * the Query
+ * @param hintKey
+ * the key used to select an execution hint
+ * @return a BatchScanner
+ * @throws TableNotFoundException
+ * if no table exists
+ */
+ public BatchScanner newScanner(String tableName, Set<Authorizations> auths, int threads, Query query, String hintKey) throws TableNotFoundException {
if (open.get()) {
BatchScanner bs = QueryScannerHelper.createBatchScanner(client, tableName, auths, threads, query);
- applyConfigs(bs, tableName);
+
+ applyConfigs(bs, hintKey, tableName);
log.debug("Created scanner " + System.identityHashCode(bs));
if (log.isTraceEnabled()) {
@@ -186,6 +227,7 @@ public BatchScanner newScanner(String tableName, Set<Authorizations> auths, int
public BatchScanner newScanner(String tableName, Set<Authorizations> auths, int threads, Query query, boolean reportErrors) throws TableNotFoundException {
if (open.get()) {
BatchScanner bs = QueryScannerHelper.createBatchScanner(client, tableName, auths, threads, query, reportErrors);
+
applyConfigs(bs, tableName);
log.debug("Created scanner " + System.identityHashCode(bs));
@@ -229,7 +271,28 @@ public BatchScanner newScanner(String tableName, Query query) throws TableNotFou
* if there are issues
*/
public BatchScannerSession newQueryScanner(final String tableName, final Set<Authorizations> auths, Query settings) throws Exception {
- return newLimitedScanner(BatchScannerSession.class, tableName, auths, settings).setThreads(scanQueue.getCapacity());
+ return newQueryScanner(tableName, auths, settings, tableName);
+ }
+
+ /**
+ * Builds a new scanner session for a finalized table name and set of authorizations, using the previously defined queue. Note that the number of entries
+ * is hardcoded to 1000 below, but can be changed.
+ *
+ * @param tableName
+ * the table string
+ * @param auths
+ * a set of auths
+ * @param settings
+ * query settings
+ * @param executionHintKey
+ * a key used to select a scan execution hint
+ * @return a new scanner session
+ * @throws Exception
+ * if there are issues
+ */
+ public BatchScannerSession newQueryScanner(final String tableName, final Set<Authorizations> auths, Query settings, String executionHintKey)
+ throws Exception {
+ return newLimitedScanner(BatchScannerSession.class, tableName, auths, settings, executionHintKey).setThreads(scanQueue.getCapacity());
}
/**
@@ -248,13 +311,49 @@ public BatchScannerSession newQueryScanner(final String tableName, final Set
public <T extends ScannerSession> T newLimitedScanner(Class<T> wrapper, final String tableName, final Set<Authorizations> auths, final Query settings)
throws NoSuchMethodException, InvocationTargetException, InstantiationException, IllegalAccessException {
+ return newLimitedScanner(wrapper, tableName, auths, settings, tableName);
+ }
+
+ /**
+ * Builds a new scanner session for a finalized table name and set of authorizations, using the previously defined queue. Note that the number of entries
+ * is hardcoded to 1000 below, but can be changed.
+ *
+ * @param tableName
+ * the table string
+ * @param auths
+ * a set of auths
+ * @param settings
+ * query settings
+ * @param hintKey
+ * the key used to select an execution hint
+ * @param <T>
+ * type of the wrapper
+ * @param wrapper
+ * a wrapper class
+ * @return a new scanner session
+ * @throws NoSuchMethodException
+ * in the case of no such method
+ * @throws InvocationTargetException
+ * in the case of no invocation target
+ * @throws InstantiationException
+ * in the case something fails to instantiate
+ * @throws IllegalAccessException
+ * in the case of an illegal access
+ *
+ */
+ public