diff --git a/warehouse/query-core/src/main/java/datawave/query/attributes/ExcerptFields.java b/warehouse/query-core/src/main/java/datawave/query/attributes/ExcerptFields.java index a85e70d4253..a2f763fb222 100644 --- a/warehouse/query-core/src/main/java/datawave/query/attributes/ExcerptFields.java +++ b/warehouse/query-core/src/main/java/datawave/query/attributes/ExcerptFields.java @@ -1,7 +1,6 @@ package datawave.query.attributes; import java.io.Serializable; -import java.util.Collection; import java.util.Iterator; import java.util.Map; import java.util.Objects; @@ -9,14 +8,14 @@ import java.util.SortedMap; import java.util.TreeMap; -import org.apache.commons.lang3.StringUtils; - import com.fasterxml.jackson.annotation.JsonCreator; import com.fasterxml.jackson.annotation.JsonValue; import com.google.common.collect.Multimap; import datawave.query.Constants; import datawave.query.jexl.JexlASTHelper; +import datawave.query.postprocessing.tf.PhraseIndexes; +import datawave.util.StringUtils; /** * Represents a set of fields that have been specified within an #EXCERPT_FIELDS function, as well as their corresponding target offsets that should be used to @@ -51,7 +50,7 @@ public static ExcerptFields from(String string) { return null; } // Strip whitespaces. - string = StringUtils.deleteWhitespace(string); + string = PhraseIndexes.whitespacePattern.matcher(string).replaceAll(""); if (string.isEmpty()) { return new ExcerptFields(); @@ -202,9 +201,9 @@ public void deconstructFields() { */ public void expandFields(Multimap model) { SortedMap> expandedMap = new TreeMap<>(); - for (String field : fieldMap.keySet()) { - SortedMap offset = fieldMap.get(field); - field = field.toUpperCase(); + for (Map.Entry> entry : fieldMap.entrySet()) { + String field = entry.getKey().toUpperCase(); + SortedMap offset = entry.getValue(); // Add the expanded fields. if (model.containsKey(field)) { for (String expandedField : model.get(field)) { diff --git a/warehouse/query-core/src/main/java/datawave/query/iterator/logic/TermFrequencyExcerptIterator.java b/warehouse/query-core/src/main/java/datawave/query/iterator/logic/TermFrequencyExcerptIterator.java index a882013c008..f3b098493f4 100644 --- a/warehouse/query-core/src/main/java/datawave/query/iterator/logic/TermFrequencyExcerptIterator.java +++ b/warehouse/query-core/src/main/java/datawave/query/iterator/logic/TermFrequencyExcerptIterator.java @@ -5,10 +5,10 @@ import java.util.ArrayList; import java.util.Collection; import java.util.Collections; -import java.util.Comparator; import java.util.List; import java.util.Map; import java.util.SortedSet; +import java.util.StringJoiner; import java.util.TreeSet; import java.util.stream.Collectors; @@ -21,9 +21,9 @@ import org.apache.accumulo.core.iterators.OptionDescriber; import org.apache.accumulo.core.iterators.SortedKeyValueIterator; import org.apache.hadoop.io.Text; -import org.apache.log4j.Logger; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; -import com.google.common.base.Joiner; import com.google.protobuf.InvalidProtocolBufferException; import datawave.ingest.protobuf.TermWeight; @@ -34,81 +34,181 @@ * for each document scanned. 
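+ * <p>
+ * A minimal configuration sketch (option values are illustrative; the option names are defined on the nested Configuration class introduced in this change):
+ *
+ * <pre>{@code
+ * IteratorSetting setting = new IteratorSetting(100, TermFrequencyExcerptIterator.class);
+ * setting.addOption(Configuration.FIELD_NAME, "BODY"); // field.name (required)
+ * setting.addOption(Configuration.START_OFFSET, "10"); // start.offset, inclusive (required)
+ * setting.addOption(Configuration.END_OFFSET, "30"); // end.offset, exclusive (required)
+ * }</pre>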
*/ public class TermFrequencyExcerptIterator implements SortedKeyValueIterator, OptionDescriber { - private static final Logger log = Logger.getLogger(TermFrequencyExcerptIterator.class); - private static final Joiner joiner = Joiner.on(" ").skipNulls(); - - // The field name option - public static final String FIELD_NAME = "field.name"; - // The start offset option - public static final String START_OFFSET = "start.offset"; - // The end offset option - public static final String END_OFFSET = "end.offset"; - // the underlying source - protected SortedKeyValueIterator source; + private static final Logger log = LoggerFactory.getLogger(TermFrequencyExcerptIterator.class); + private static final Collection TERM_FREQUENCY_COLUMN_FAMILY_BYTE_SEQUENCE = Collections + .singleton(new ArrayByteSequence(Constants.TERM_FREQUENCY_COLUMN_FAMILY.getBytes())); + + /** + * A special term that is used to indicate we removed a candidate term because
+     * it was a member of the list of terms that should not be included in an excerpt.
+     *
+     * When returned to the ExcerptTransform, indicates that we should run this iterator a second time with a bigger range so that we can account for removed
+     * words in the excerpt.
+     */
+    public static final String WORD_SKIPPED_MARKER = "XXXWESKIPPEDAWORDXXX";
+
+    /**
+     * A special term that is used to indicate when we are using documents that are not scored.
+     *
+ * When returned to the ExcerptTransform, indicates that we should not populate the "HIT_EXCERPT_WITH_SCORES" or "HIT_EXCERPT_ONE_BEST" fields. + */ + public static final String NOT_SCORED_MARKER = "XXXNOTSCOREDXXX"; + + private static final String BLANK_EXCERPT_MESSAGE = "YOUR EXCERPT WAS BLANK! Maybe bad field or size?"; - // the field name - protected String fieldName; - // the start offset (inclusive) - protected int startOffset; - // the end offset (exclusive) - protected int endOffset; + /** + * Encapsulates the configuration of the HitExcerptIterator. The Iterator constructor guarantees one is set per instance of the iterator, method are + * provided to initialize this from an external options map or perform a deep copy from another instance. + */ + public static final class Configuration { + /** The field name option */ + public static final String FIELD_NAME = "field.name"; + /** The start offset option */ + public static final String START_OFFSET = "start.offset"; + /** The end offset option */ + public static final String END_OFFSET = "end.offset"; + + /** represents the directions used to build the excerpt in the iterator */ + public enum Direction { + /** specifies that an excerpt only returns terms prior to the last hit term found in the excerpt */ + BEFORE, + /** specifies that an excerpt only returns terms after the first hit term found in the excerpt */ + AFTER, + /** specifies that an excerpt returns terms from both before and after the median offset */ + BOTH + } - // The specified dt/uid column families + public Configuration() {} + + /** + * initialize the fields in this instance by reading from an option map + * + * @param options + * the map to read options from. + */ + public void init(Map options) { + fieldName = options.get(FIELD_NAME); + startOffset = Integer.parseInt(options.get(START_OFFSET)); + endOffset = Integer.parseInt(options.get(END_OFFSET)); + hitTermsList = new ArrayList<>(); + direction = Direction.BOTH; + origHalfSize = 0; + trimExcerpt = false; + } + + /** deep copy the configuration of another excerpt configuration into this one */ + public void deepCopy(Configuration other) { + fieldName = other.fieldName; + startOffset = other.startOffset; + endOffset = other.endOffset; + hitTermsList = new ArrayList<>(other.hitTermsList); + direction = other.direction; + origHalfSize = other.origHalfSize; + trimExcerpt = other.trimExcerpt; + } + + /** the field name */ + private String fieldName; + + /** the list of hit terms: terms from the query that resulted in the current document being returned as a result */ + private ArrayList hitTermsList; + + /** the direction for the excerpt - controls which directions we build the excerpt from an originating hit term */ + private Direction direction; + + /** + * Whether we might need to trim down the excerpt to the requested size.
+ * Is false if this is the first time running the iterator and true otherwise because the second pass always asks for more data than is needed to + * generate the excerpt to account for stop words. + */ + private boolean trimExcerpt; + + /** the start offset (inclusive) */ + private int startOffset; + + /** the end offset (exclusive) */ + private int endOffset; + + /** The size of half of the original desired excerpt length. Used during trimming. */ + private float origHalfSize; + + public List getHitTermsList() { + return hitTermsList; + } + + public String toString() { + return fieldName + ", " + startOffset + ", " + endOffset; + } + } + + /** the underlying source */ + protected SortedKeyValueIterator source; + + /** The specified dt/uid column families */ protected SortedSet columnFamilies; - // inclusive or exclusive dt/uid column families + + /** inclusive or exclusive dt/uid column families */ protected boolean inclusive; - // the underlying TF scan range + /** the underlying TF scan range */ protected Range scanRange; - // the top key + /** the top key */ protected Key tk; - // the top value + + /** the top value */ protected Value tv; + /** encapsulates the configuration for the TermFrequencyExcerptIterator */ + protected final Configuration config; + + public TermFrequencyExcerptIterator() { + this.config = new Configuration(); // excerpt config will never be null; + } + @Override public IteratorOptions describeOptions() { IteratorOptions options = new IteratorOptions(TermFrequencyExcerptIterator.class.getSimpleName(), "An iterator that returns excepts from the scanned documents", null, null); - options.addNamedOption(FIELD_NAME, "The token field name for which to get excerpts (required)"); - options.addNamedOption(START_OFFSET, "The start offset for the excerpt (inclusive) (required)"); - options.addNamedOption(END_OFFSET, "The end offset for the excerpt (exclusive) (required)"); + options.addNamedOption(Configuration.FIELD_NAME, "The token field name for which to get excerpts (required)"); + options.addNamedOption(Configuration.START_OFFSET, "The start offset for the excerpt (inclusive) (required)"); + options.addNamedOption(Configuration.END_OFFSET, "The end offset for the excerpt (exclusive) (required)"); return options; } @Override public boolean validateOptions(Map map) { - if (map.containsKey(FIELD_NAME)) { - if (map.get(FIELD_NAME).isEmpty()) { - throw new IllegalArgumentException("Empty field name property: " + FIELD_NAME); + if (map.containsKey(Configuration.FIELD_NAME)) { + if (map.get(Configuration.FIELD_NAME).isEmpty()) { + throw new IllegalArgumentException("Empty field name property: " + Configuration.FIELD_NAME); } } else { - throw new IllegalArgumentException("Missing field name property: " + FIELD_NAME); + throw new IllegalArgumentException("Missing field name property: " + Configuration.FIELD_NAME); } int startOffset; - if (map.containsKey(START_OFFSET)) { + if (map.containsKey(Configuration.START_OFFSET)) { try { - startOffset = Integer.parseInt(map.get(START_OFFSET)); - } catch (NumberFormatException e) { + startOffset = Integer.parseInt(map.get(Configuration.START_OFFSET)); + } catch (Exception e) { throw new IllegalArgumentException("Failed to parse start offset as integer", e); } } else { - throw new IllegalArgumentException("Missing start offset property: " + START_OFFSET); + throw new IllegalArgumentException("Missing start offset property: " + Configuration.START_OFFSET); } - int endOffset; - if (map.containsKey(END_OFFSET)) { + if 
(map.containsKey(Configuration.END_OFFSET)) { + int endOffset; try { - endOffset = Integer.parseInt(map.get(END_OFFSET)); - } catch (NumberFormatException e) { + endOffset = Integer.parseInt(map.get(Configuration.END_OFFSET)); + } catch (Exception e) { throw new IllegalArgumentException("Failed to parse end offset as integer", e); } if (endOffset <= startOffset) { throw new IllegalArgumentException("End offset must be greater than start offset"); } } else { - throw new IllegalArgumentException("Missing end offset property: " + END_OFFSET); + throw new IllegalArgumentException("Missing end offset property: " + Configuration.END_OFFSET); } return true; @@ -122,19 +222,15 @@ public boolean hasTop() { @Override public SortedKeyValueIterator deepCopy(IteratorEnvironment env) { TermFrequencyExcerptIterator it = new TermFrequencyExcerptIterator(); - it.startOffset = startOffset; - it.endOffset = endOffset; - it.fieldName = fieldName; it.source = source.deepCopy(env); + it.config.deepCopy(config); return it; } @Override public void init(SortedKeyValueIterator source, Map options, IteratorEnvironment env) throws IOException { this.source = source; - this.startOffset = Integer.parseInt(options.get(START_OFFSET)); - this.endOffset = Integer.parseInt(options.get(END_OFFSET)); - this.fieldName = options.get(FIELD_NAME); + config.init(options); } @Override @@ -150,15 +246,11 @@ public Value getTopValue() { @Override public void seek(Range range, Collection columnFamilies, boolean inclusive) throws IOException { if (log.isDebugEnabled()) { - log.debug(this + " seek'ing with requested range " + range); + log.debug("{} seek'ing with requested range {}", this, range); } // capture the column families and the inclusiveness - if (columnFamilies != null) { - this.columnFamilies = getSortedCFs(columnFamilies); - } else { - this.columnFamilies = Collections.emptySortedSet(); - } + this.columnFamilies = columnFamilies != null ? 
getSortedCFs(columnFamilies) : Collections.emptySortedSet(); this.inclusive = inclusive; // Determine the start key in the term frequencies @@ -182,9 +274,7 @@ public void seek(Range range, Collection columnFamilies, boolean i startKey = new Key(range.getStartKey().getRow(), Constants.TERM_FREQUENCY_COLUMN_FAMILY, new Text(dtAndUid)); } } - if (log.isDebugEnabled()) { - log.debug(this + " seek'ing to start key: " + startKey); - } + log.debug("{} calling seek to start key: {}", this, startKey); // Determine the end key in the term frequencies Key endKey = null; @@ -193,12 +283,12 @@ public void seek(Range range, Collection columnFamilies, boolean i String dtAndUid = getDtUidFromEventKey(range.getEndKey(), false, range.isEndKeyInclusive()); // if no end document if (dtAndUid == null) { - // if we do not have column families specified or they are not inclusive + // if we do not have column families specified, or they are not inclusive if (this.columnFamilies.isEmpty() || !this.inclusive) { // then go to the end of the TFs endKey = new Key(range.getEndKey().getRow(), Constants.TERM_FREQUENCY_COLUMN_FAMILY, new Text(Constants.MAX_UNICODE_STRING)); } else { - // othersize end at the last document specified + // otherwise end at the last document specified endKey = new Key(range.getEndKey().getRow(), Constants.TERM_FREQUENCY_COLUMN_FAMILY, new Text(this.columnFamilies.last() + Constants.NULL + Constants.MAX_UNICODE_STRING)); } @@ -207,9 +297,7 @@ public void seek(Range range, Collection columnFamilies, boolean i endKey = new Key(range.getStartKey().getRow(), Constants.TERM_FREQUENCY_COLUMN_FAMILY, new Text(dtAndUid)); } } - if (log.isDebugEnabled()) { - log.debug(this + " seek'ing to end key: " + endKey); - } + log.debug("{} seek'ing to end key: {}", this, endKey); // if we have actually exhausted our range, then return with no next key if (endKey != null && startKey != null && endKey.compareTo(startKey) <= 0) { @@ -223,11 +311,11 @@ public void seek(Range range, Collection columnFamilies, boolean i this.scanRange = new Range(startKey, false, endKey, false); if (log.isDebugEnabled()) { - log.debug(this + " seek'ing to: " + this.scanRange + " from requested range " + range); + log.debug("{} seek'ing to: {} from requested range {}", this, this.scanRange, range); } // seek the underlying source - source.seek(this.scanRange, Collections.singleton(new ArrayByteSequence(Constants.TERM_FREQUENCY_COLUMN_FAMILY.getBytes())), true); + source.seek(this.scanRange, TERM_FREQUENCY_COLUMN_FAMILY_BYTE_SEQUENCE, true); // get the next key next(); @@ -239,7 +327,7 @@ public void next() throws IOException { tv = null; if (log.isTraceEnabled()) { - log.trace(source.hasTop() + " nexting on " + scanRange); + log.trace("{} calling next on {}", source.hasTop(), scanRange); } // find a valid dt/uid (depends on initial column families set in seek call) @@ -247,7 +335,9 @@ public void next() throws IOException { while (source.hasTop() && dtUid == null) { Key top = source.getTopKey(); String thisDtUid = getDtUidFromTfKey(top); - if (isUsableDocument(thisDtUid)) { + // if this dt and uid are in the accepted column families... + if (columnFamilies.contains(thisDtUid) == inclusive) { + // we can use this document dtUid = thisDtUid; } else { seekToNextUid(top.getRow(), thisDtUid); @@ -255,16 +345,19 @@ public void next() throws IOException { } // if no more term frequencies, then we are done. 
- if (!source.hasTop()) { + if (!source.hasTop() || dtUid == null) { return; } - // get the pieces from the top key that will be returned + final int startOffset = config.startOffset; + final int endOffset = config.endOffset; + final List hitTermsList = config.hitTermsList; + final String fieldName = config.fieldName; + Key top = source.getTopKey(); - Text cv = top.getColumnVisibility(); - long ts = top.getTimestamp(); - Text row = top.getRow(); - List[] terms = new List[endOffset - startOffset]; + + // set the size of the array to the number of offsets that we will try to fill for the potential excerpt + WordsAndScores[] wordsAndScoresArr = new WordsAndScores[endOffset - startOffset]; // while we have term frequencies for the same document while (source.hasTop() && dtUid.equals(getDtUidFromTfKey(source.getTopKey()))) { @@ -276,78 +369,476 @@ public void next() throws IOException { // if this is for the field we are summarizing if (fieldName.equals(fieldAndValue[0])) { try { - // parse the offsets from the value + // get the protobuf that contains all the extra information for the TFs from the value TermWeight.Info info = TermWeight.Info.parseFrom(source.getTopValue().get()); + // check if the number of scores is equal to the number of offsets + boolean useScores = info.getScoreCount() == info.getTermOffsetCount(); + List scoreList = null; + // if the number of scores and offsets is the same, check to see if all the scores are negative or not + if (useScores) { + scoreList = info.getScoreList(); + useScores = !hasOnlyNegativeScores(scoreList, info, startOffset, endOffset); + } + // for each offset, gather all the terms in our range - for (int i = 0; i < info.getTermOffsetCount(); i++) { + for (int i = 0, termOffsetCount = info.getTermOffsetCount(); i < termOffsetCount; i++) { int offset = info.getTermOffset(i); // if the offset is within our range if (offset >= startOffset && offset < endOffset) { // calculate the index in our value list int index = offset - startOffset; - // if the value is larger than the value for this offset thus far - if (terms[index] == null) { - terms[index] = new ArrayList<>(); + // if the current index has no words/scores yet, initialize an object at the index + if (wordsAndScoresArr[index] == null) { + wordsAndScoresArr[index] = new WordsAndScores(); + } + boolean stopFound; + // if we are using scores, add the word and score to the object, if not then only add the word + if (useScores) { + stopFound = wordsAndScoresArr[index].addTerm(fieldAndValue[1], scoreList.get(i), hitTermsList); + } else { + stopFound = wordsAndScoresArr[index].addTerm(fieldAndValue[1], hitTermsList); + } + // if we encounter a stop word, and we're not in trim mode, fail-fast and create an entry + // to return the special marker token. When seeing this, the transform will run this again + // in trim mode with an expanded offset range. 
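+                            // the marker is repeated for each of the three excerpt sections (with-scores, plain, one-best)
+                            // so that the ExcerptTransform sees it no matter which section it inspects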
+ if (stopFound && !config.trimExcerpt) { + tk = new Key(top.getRow(), new Text(dtUid), new Text(fieldName + Constants.NULL + WORD_SKIPPED_MARKER + Constants.NULL + + WORD_SKIPPED_MARKER + Constants.NULL + WORD_SKIPPED_MARKER), top.getColumnVisibility(), top.getTimestamp()); + tv = new Value(); + return; } - // use this value - terms[index].add(fieldAndValue[1]); } } } catch (InvalidProtocolBufferException e) { - log.error("Value found in tf column was not of type TermWeight.Info, skipping", e); + log.warn("Value found in tf column was not a valid TermWeight.Info, skipping", e); } } - // get the next term frequency source.next(); } - - // generate the return key and value - tk = new Key(row, new Text(dtUid), new Text(fieldName + Constants.NULL + generatePhrase(terms)), cv, ts); + // Now that the words and scores array is populated with all the tf data, it's time to generate an excerpt and + // create a key that contains all of our excerpts to be read by the ExcerptTransform + tk = new Key(top.getRow(), new Text(dtUid), new Text(fieldName + Constants.NULL + generateExcerpt(wordsAndScoresArr)), top.getColumnVisibility(), + top.getTimestamp()); tv = new Value(); } + /** Checks whether the passed in list has only negative scores or not within the iterator range. */ + private static boolean hasOnlyNegativeScores(List scoreList, TermWeight.Info info, int startOffset, int endOffset) { + // check each score and if it is positive and within the offsets we are looking at, return false + for (int i = 0, scoreListSize = scoreList.size(); i < scoreListSize; i++) { + int offset = info.getTermOffset(i); + if (scoreList.get(i) >= 0 && offset >= startOffset && offset < endOffset) { + return false; + } + } + // we have not found a positive number within our offsets, return true + return true; + } + /** - * Generate a phrase from the given list of terms + * Generates multiple variations of excerpts and creates the top key and value containing them for return to the transform. In this method, we'll generate + * the following excerpts: + *
+     * <ul>
+     * <li>phraseWithScoresExcerpt</li>
+     * <li>phraseWithoutScoresExcerpt</li>
+     * <li>oneBestExcerpt</li>
+     * </ul>
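+     * For example (values purely illustrative), the plain variant might read {@code the quick [brown fox] jumped}, while the scored variant
+     * annotates each chosen word with its user-readable 0-100 score, e.g. {@code the(42) quick(87) [brown(91) fox(88)] jumped(63)}.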
+ * These get packaged into the key returned to the transform. Ultimately the transform will decide which one we use. * - * @param terms - * the terms to create a phrase from + * @param wordsAndScoresArr + * a collection of document terms and their scores, organized by offset. Each offset may have multiple terms to choose from. Some offsets may be + * null if there were no tf's from that position. + */ + protected String generateExcerpt(WordsAndScores[] wordsAndScoresArr) { + + boolean usedScores = false; + String phraseWithScoresExcerpt = null; + String oneBestExcerpt = null; + + // loop through the WordsAndScores and if we find at least one that has scores, generate a phrase with scores based excerpt + for (WordsAndScores wordsAndScores : wordsAndScoresArr) { + if (wordsAndScores != null && wordsAndScores.getUseScores()) { + phraseWithScoresExcerpt = generatePhrase(wordsAndScoresArr, config); + usedScores = true; + break; + } + } + + // if we did not find any scores in the entire wordsAndScoresArr, add a marker. + if (!usedScores) { + phraseWithScoresExcerpt = NOT_SCORED_MARKER; + oneBestExcerpt = NOT_SCORED_MARKER; + } else { // if we have any scores, set all output scores flags to false. + for (WordsAndScores wordsAndScores : wordsAndScoresArr) { + if (wordsAndScores != null && wordsAndScores.getUseScores()) { + wordsAndScores.setOutputScores(false); + } + } + } + + // Generate the "phrase without scores" excerpt now that the scores flags are false. + String phraseWithoutScoresExcerpt = generatePhrase(wordsAndScoresArr, config); + + // if the regular excerpt is blank, we will return a message saying that the excerpt was blank + if (phraseWithoutScoresExcerpt.isBlank()) { + phraseWithoutScoresExcerpt = BLANK_EXCERPT_MESSAGE; + if (usedScores) { + phraseWithScoresExcerpt = BLANK_EXCERPT_MESSAGE; + oneBestExcerpt = BLANK_EXCERPT_MESSAGE; + } + } else { + if (usedScores) { + // prepare all the WordsAndScores to output for the "one best" excerpt + for (WordsAndScores wordsAndScores : wordsAndScoresArr) { + if (wordsAndScores != null) { + wordsAndScores.setOneBestExcerpt(true); + } + } + // generate the "one best" excerpt + oneBestExcerpt = generatePhrase(wordsAndScoresArr, config); + } + } + // return all the excerpt sections concatenated with nulls between them + return phraseWithScoresExcerpt + Constants.NULL + phraseWithoutScoresExcerpt + Constants.NULL + oneBestExcerpt; + } + + /** + * Generate a phrase from the given lists of WordsAndScores + * + * @param wordsAndScoresArr + * the array of WordsAndScores that contain the terms to create a phrase from + * @param config + * the configuration for the ExcerptIterator * @return the phrase */ - protected String generatePhrase(List[] terms) { - String[] largestTerms = new String[terms.length]; - for (int i = 0; i < terms.length; i++) { - largestTerms[i] = getLongestTerm(terms[i]); + private static String generatePhrase(WordsAndScores[] wordsAndScoresArr, final Configuration config) { + // put brackets around whole hit phrases instead of individual terms + checkForHitPhrase(wordsAndScoresArr, config.hitTermsList); + + // there are cases where we'll have no scores, and we will want to choose a longer term for a particular position + // instead of the hit term we matched (e.g., when we're dealing with synonyms that are a fragment of a larger + // term.) This method will set an override for that position so that we'll choose the longer term instead of + // a shorter synonym, which is typically a substring of the longer term. 
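+        // e.g. if a position holds the hit fragment "run" alongside the longer token "running" (words are illustrative),
+        // the override makes the excerpt show the bracketed longer token "[running]" rather than the fragment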
+ overrideOutputLongest(wordsAndScoresArr); + + // create an array with the same length as the one we just passed in + String[] termsToOutput = new String[wordsAndScoresArr.length]; + + // pull our items out of the care package + Configuration.Direction direction = config.direction; + boolean trimExcerpt = config.trimExcerpt; + int startOffset = config.startOffset; + int endOffset = config.endOffset; + float origHalfSize = config.origHalfSize; + + boolean bef = direction.equals(Configuration.Direction.BEFORE); + boolean aft = direction.equals(Configuration.Direction.AFTER); + + // tracks whether we've found the first hit term + boolean firstHitTermFound = false; + + // tracks index of the first hit term found in the words array + int beforeIndex = -1; + // tracks index of the last hit term found in the words array. + int afterIndex = -1; + + int debugCounter = 0; // FOR DEBUG: counter used for debug logging + + // go through the whole WordsAndScoresArr and try to get a word to output for each offset + for (int i = 0; i < wordsAndScoresArr.length; i++) { + // if there is nothing at this position, put nothing at the position in the output + if (wordsAndScoresArr[i] == null) { + termsToOutput[i] = null; + continue; + } + + termsToOutput[i] = wordsAndScoresArr[i].getWordToOutput(); + // If the WordsAndScores returned null, that means the word chosen for this position is something we do not want to output. + if (termsToOutput[i] != null) { + // if the user has requested BEFORE or AFTER and this object had a hit term... + if ((bef || aft) && (wordsAndScoresArr[i].getHasHitTerm())) { + // if this is the first hit term for the excerpt, set this offset as the "afterIndex" and set the lock so we do not write over it + if (!firstHitTermFound) { + afterIndex = i; + firstHitTermFound = true; + } + // set this offset as the "beforeIndex" (no lock on this one because we want it to keep being overwritten with the last hit term offset) + beforeIndex = i; + } + } + + // FOR DEBUG:-------------------------------------------------------------------------------------------------------------- + else { // counting how many things we don't want to output have been removed from the excerpt + if (log.isDebugEnabled() && trimExcerpt) { + debugCounter++; + } + } + // FOR DEBUG:-------------------------------------------------------------------------------------------------------------- } - return joiner.join(largestTerms); + // FOR DEBUG:---------------------------------------------------------------------------------------------------------------------- + if (log.isDebugEnabled() && trimExcerpt) { + log.debug("{} words removed from expanded range ({},{})", debugCounter, startOffset, endOffset); + debugCounter = 0; + // counting how many words we have in the excerpt (after removing stop words) before any trimming is done + for (String s : termsToOutput) { + if (s != null) { + debugCounter++; + } + } + log.debug("{} words in excerpt before trimming (we want this to be greater than or equal to \"size of excerpt requested\" from ExcerptTransform)", + debugCounter); + } + // ------------------------------------------------------------------------------------------------------------------------------- + + // if no BEFORE or AFTER AND a hit term wasn't found... + if (!firstHitTermFound) { + // join everything together with spaces while skipping null offsets (and trimming if we need to) + return !trimExcerpt ? 
joinExcerpt(termsToOutput) : joinExcerpt(bothTrim(termsToOutput, origHalfSize, (endOffset - startOffset) / 2)); + } else { + if (bef) { // if direction is "before", set everything after the last hit term to null + for (int k = beforeIndex + 1; k < wordsAndScoresArr.length; k++) { + termsToOutput[k] = null; + } + // trim the excerpt down if we need + if (trimExcerpt) { + int start = (int) (beforeIndex - (origHalfSize * 2)); + trimBeginning(termsToOutput, beforeIndex, start); + } + } else { // if direction is "after", set everything before the first hit term to null + for (int k = 0; k < afterIndex; k++) { + termsToOutput[k] = null; + } + // trim the excerpt down if we need + if (trimExcerpt) { + int start = (int) (afterIndex + (origHalfSize * 2)); + trimEnd(termsToOutput, afterIndex, start); + } + } + // join everything together with spaces while skipping null offsets + return joinExcerpt(termsToOutput); + } + } + + /** + * Joins together all the individual terms from the passed in array while skipping any null values. + * + * @param termsToOutput + * the terms to create a phrase from + * @return the finalized joined together string (the excerpt) + */ + private static String joinExcerpt(String[] termsToOutput) { + StringJoiner j = new StringJoiner(" "); + for (String s : termsToOutput) { + if (s != null) { + j.add(s); + } + } + return j.toString(); + } + + /** + * Trim down both side of the excerpt to the size that we want + * + * @param termsToOutput + * the terms to create a phrase from + * @param origHalfSize + * The size of half of the original desired excerpt length. Used during trimming. + * @param expandedMid + * Calculated by (endOffset - startOffset) / 2. Is the midpoint of the expanded range. + * @return the trimmed array + */ + private static String[] bothTrim(String[] termsToOutput, float origHalfSize, int expandedMid) { + // calculate the midpoint of the expanded start and end offsets (because this should only be triggered on a second attempt) + int start = (int) (expandedMid - origHalfSize); + // trim the beginning down to size + trimBeginning(termsToOutput, expandedMid, start); + start = (int) (expandedMid + origHalfSize); + // trim the end down to size + trimEnd(termsToOutput, expandedMid, start); + return termsToOutput; + } + + /** + * Trims off the front to get the correct size + * + * @param termsToOutput + * the terms to create a phrase from + * @param beforeIndex + * the index in the array to start at + * @param start + * the index in the array where we want to start setting values to null at + */ + private static void trimBeginning(String[] termsToOutput, int beforeIndex, int start) { + boolean startNull = false; + // start at "beforeIndex" and work our way backwards through the array + while (beforeIndex >= 0) { + // if we have not started the trimming yet, but we are at the index were we need to start... 
+ if (!startNull && beforeIndex < start) { + startNull = true; + } + // if we have passed the offset where we should be trimming, set this index to null + if (startNull) { + termsToOutput[beforeIndex] = null; + } else { + // if we have not started trimming and this index is null, decrement "start" so that we can use an extra offset to get the excerpt size we want + if (termsToOutput[beforeIndex] == null) { + start--; + } + } + beforeIndex--; + } + } + + /** + * Trims off the end to get the correct size + * + * @param termsToOutput + * the terms to create a phrase from + * @param afterIndex + * the index in the array to start at + * @param start + * the index in the array where we want to start setting values to null at + */ + private static void trimEnd(String[] termsToOutput, int afterIndex, int start) { + boolean startNull = false; + // start at "afterIndex" and work our way through the array + while (afterIndex < termsToOutput.length) { + // if we have not started the trimming yet, but we are at the index were we need to start... + if (!startNull && afterIndex > start) { + startNull = true; + } + // if we have passed the offset where we should be trimming, set this index to null + if (startNull) { + termsToOutput[afterIndex] = null; + } else { + // if we have not started trimming and this index is null, increment "start" so that we can use an extra offset to get the excerpt size we want + if (termsToOutput[afterIndex] == null) { + start++; + } + } + afterIndex++; + } + } + + /** + * Looks for hit phrases (not separate hit terms) and puts the whole phrase in brackets + * + * @param wordsAndScoresArr + * the terms to create a phrase from + * @param hitTermsList + * the list of all hit terms + */ + private static void checkForHitPhrase(WordsAndScores[] wordsAndScoresArr, List hitTermsList) { + ArrayList hitPhrases = new ArrayList<>(); + // checks for phrases (anything in the hit list with a space in it) and adds them to a new arrayList + for (String s : hitTermsList) { + if (s.contains(" ")) { + hitPhrases.add(s); + } + } + // if we don't find any, return unchanged + if (hitPhrases.isEmpty()) { + return; + } + + // for each hit phrase found... + for (String hitPhrase : hitPhrases) { + // split the phrase on the spaces into the separate terms + String[] individualHitTerms = hitPhrase.split(" "); + // if the phrase is almost the same size as the whole excerpt, skip this iteration + if ((wordsAndScoresArr.length - 2) < individualHitTerms.length) { + continue; + } + // iterate across the WordsAndScores until the end of the hit phrase reaches the last offset + int iterations = wordsAndScoresArr.length - individualHitTerms.length + 1; + for (int j = 0; j < iterations; j++) { + // if we find the hit phrase... 
+ if (isPhraseFound(individualHitTerms, wordsAndScoresArr, j)) { + // set which position in the phrase each offset is + for (int k = 0; k < individualHitTerms.length; k++) { + int overridePosition; + // beginning of phrase + if (k == 0) { + overridePosition = 1; + } else if (k == individualHitTerms.length - 1) { // end of phrase + overridePosition = 3; + } else { // middle of phrase + overridePosition = 2; + } + // set the override values for the current positions WordsAndScores to the index of the hit term in this position plus the override + wordsAndScoresArr[j + k].setOverride(wordsAndScoresArr[j + k].getWordsList().indexOf(individualHitTerms[k]), overridePosition); + } + } + } + } } /** - * Get the longest term from a list of terms; + * Check to see if the whole hit phrase is found in the offsets starting at the passed in j value + * + * @param individualHitTerms + * the array of the hit phrase split into individual terms * * @param terms - * the terms to create a phrase - * @return the longest term (null if empty or null list) + * the terms to create a phrase from + * + * @param j + * the current starting offset in the WordsAndScores array + * @return boolean isPhraseFound */ - protected String getLongestTerm(List terms) { - if (terms == null || terms.isEmpty()) { - return null; - } else { - return terms.stream().max(Comparator.comparingInt(String::length)).get(); + private static boolean isPhraseFound(String[] individualHitTerms, WordsAndScores[] terms, int j) { + // k represents what position we are in of the individual hit terms array + for (int k = 0; k < individualHitTerms.length; k++) { + // if a WordsAndScores is null, the phrase obviously wasn't found + if (terms[j + k] == null) { + return false; + } + // get the words list from the current WordsAndScores + ArrayList tempWords = (ArrayList) terms[j + k].getWordsList(); + // if the current WordsAndScores doesn't have the term for this position, the phrase obviously wasn't found + if (!tempWords.contains(individualHitTerms[k])) { + return false; + } } + // we found the whole phrase!!! + return true; } /** - * Determine if this dt and uid are in the accepted column families + * If scores are not used, checks to see if a WordsAndScores object has a hit term in it and if so, sees if the hit word is also the longest word. If not, + * then it will set an override so that the longest word is output with brackets. This is useful in the case where we are dealing with synonyms, where there + * will be a single long word in the same position as fragments of that word, and one of the fragments is the hit term * - * @param dtAndUid - * the dt and uid string - * @return true if we can use it, false if not + * @param wordsAndScoresArr + * the terms to create a phrase from */ - private boolean isUsableDocument(String dtAndUid) { - return columnFamilies.contains(dtAndUid) == inclusive; + private static void overrideOutputLongest(WordsAndScores[] wordsAndScoresArr) { + // check to see if we find scores and if so, return unchanged + for (WordsAndScores ws : wordsAndScoresArr) { + if (ws != null && ws.getUseScores()) { + return; + } + } + // we now know that scores are not being used for this excerpt + for (WordsAndScores ws : wordsAndScoresArr) { + // if a WordsAndScores is not null and has a hit term in it... 
+ if (ws != null && ws.getHasHitTerm()) { + // get the index of its longest word + int lwi = ws.getLongestWordIndex(); + // check to see if the index of its longest word is not the same as the index of the hit term (meaning that the hit term is NOT the longest + // word) + if (ws.getHitTermIndex() != lwi) { + // get the override value from the WordsAndScores + int ov = ws.getOverrideValue(); + // if this WordsAndScores has a valid override, use the same value but set the index to the index of the longest word + // if it does not have a valid override yet, set it with the index of the longest word and an override value of "4" + ws.setOverride(lwi, ov >= 0 ? ov : 4); + } + } + } } /** @@ -364,7 +855,7 @@ private void seekToNextUid(Text row, String dtAndUid) throws IOException { Key startKey = new Key(row, Constants.TERM_FREQUENCY_COLUMN_FAMILY, new Text(dtAndUid + '.')); this.scanRange = new Range(startKey, false, this.scanRange.getEndKey(), this.scanRange.isEndKeyInclusive()); if (log.isDebugEnabled()) { - log.debug(this + " seek'ing to next document: " + this.scanRange); + log.debug("{} seek'ing to next document: {}", this, this.scanRange); } source.seek(this.scanRange, Collections.singleton(new ArrayByteSequence(Constants.TERM_FREQUENCY_COLUMN_FAMILY.getBytes())), true); @@ -377,7 +868,7 @@ private void seekToNextUid(Text row, String dtAndUid) throws IOException { * the column families * @return a sorted set of column families as Strings */ - private SortedSet getSortedCFs(Collection columnFamilies) { + private static SortedSet getSortedCFs(Collection columnFamilies) { return columnFamilies.stream().map(m -> { try { return Text.decode(m.getBackingArray(), m.offset(), m.length()); @@ -394,13 +885,13 @@ private SortedSet getSortedCFs(Collection columnFamilies) * the term freq key * @return the field name */ - private String[] getFieldAndValue(Key tfKey) { + private static String[] getFieldAndValue(Key tfKey) { String cq = tfKey.getColumnQualifier().toString(); int index = cq.lastIndexOf(Constants.NULL); - String fieldname = cq.substring(index + 1); + String fieldName = cq.substring(index + 1); int index2 = cq.lastIndexOf(Constants.NULL, index - 1); - String fieldvalue = cq.substring(index2 + 1, index); - return new String[] {fieldname, fieldvalue}; + String fieldValue = cq.substring(index2 + 1, index); + return new String[] {fieldName, fieldValue}; } /** @@ -410,7 +901,7 @@ private String[] getFieldAndValue(Key tfKey) { * the term freq key * @return the dt\x00uid */ - private String getDtUidFromTfKey(Key tfKey) { + private static String getDtUidFromTfKey(Key tfKey) { return getDtUid(tfKey.getColumnQualifier().toString()); } @@ -425,8 +916,8 @@ private String getDtUidFromTfKey(Key tfKey) { * inclusive boolean flag * @return the start or end document (cq) for our tf scan range. 
Null if dt,uid does not exist in the event key */ - private String getDtUidFromEventKey(Key eventKey, boolean startKey, boolean inclusive) { - // if an infinite end range, or unspecified end document, then no cdocument to specify + private static String getDtUidFromEventKey(Key eventKey, boolean startKey, boolean inclusive) { + // if an infinite end range, or unspecified end document, then no document to specify if (eventKey == null || eventKey.getColumnFamily() == null || eventKey.getColumnFamily().getLength() == 0) { return null; } @@ -437,51 +928,44 @@ private String getDtUidFromEventKey(Key eventKey, boolean startKey, boolean incl // if calculating a start cq if (startKey) { - // if the start dt/uid is inclusive and the cf is only the dt and uid, then include this document - if (inclusive && cf.equals(dtAndUid)) { - return dtAndUid + Constants.NULL; - } + // if the start dt/uid is inclusive and the cf is only the dt and uid, then include this document, // otherwise start at the next document - else { - return dtAndUid + Constants.ONE_BYTE; - } + return inclusive && cf.equals(dtAndUid) ? dtAndUid + Constants.NULL : dtAndUid + Constants.ONE_BYTE; } // if calculating an end cq else { - // if the end dt/uid is inclusive or the cf was not only the dt and uid - if (inclusive || !cf.equals(dtAndUid)) { - // then include this document - return dtAndUid + Constants.NULL + Constants.MAX_UNICODE_STRING; - } + // if the end dt/uid is inclusive or the cf was not only the dt and uid, then include this document, // otherwise stop before this document - else { - return dtAndUid + Constants.NULL; - } + return inclusive || !cf.equals(dtAndUid) ? dtAndUid + Constants.NULL + Constants.MAX_UNICODE_STRING : dtAndUid + Constants.NULL; } } // get the dt/uid from the beginning of a given string - private String getDtUid(String str) { + private static String getDtUid(String str) { int index = str.indexOf(Constants.NULL); index = str.indexOf(Constants.NULL, index + 1); - if (index == -1) { - return str; - } else { - return str.substring(0, index); - } + return index == -1 ? 
str : str.substring(0, index); + } + + public void setHitTermsList(List hitTermsList) { + this.config.hitTermsList = (ArrayList) hitTermsList; + } + + public void setDirection(String direction) { + this.config.direction = Configuration.Direction.valueOf(direction.toUpperCase()); + } + + public void setOrigHalfSize(float origHalfSize) { + this.config.origHalfSize = origHalfSize; + } + + public void setTrimExcerpt(boolean trimExcerpt) { + this.config.trimExcerpt = trimExcerpt; } @Override public String toString() { - StringBuilder sb = new StringBuilder(); - sb.append("TermFrequencyExcerptIterator: "); - sb.append(this.fieldName); - sb.append(", "); - sb.append(this.startOffset); - sb.append(", "); - sb.append(this.endOffset); - - return sb.toString(); + return "TermFrequencyExcerptIterator: " + config; } } diff --git a/warehouse/query-core/src/main/java/datawave/query/iterator/logic/WordsAndScores.java b/warehouse/query-core/src/main/java/datawave/query/iterator/logic/WordsAndScores.java new file mode 100644 index 00000000000..280e29680ff --- /dev/null +++ b/warehouse/query-core/src/main/java/datawave/query/iterator/logic/WordsAndScores.java @@ -0,0 +1,501 @@ +package datawave.query.iterator.logic; + +import java.util.ArrayList; +import java.util.List; +import java.util.Set; + +import javax.validation.constraints.PositiveOrZero; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import datawave.ingest.protobuf.TermWeightPosition; + +/** + * An object used to save terms and their respective scores (if a score exists and is valid).
+ *
+ * Each instance represents the word/words at a single position (offset) in a document, and the addTerm method is used to record the words found in
+ * that position.
+ * There are conditions where multiple words may exist in the same position in a document, such as synonym tokens for standard text or speech-to-text where
+ * multiple possible readings for a single utterance exist.
+ * Individual words may be stop words that we do not want to emit as a component of an excerpt, but we need to track these, because in the speech-to-text case
+ * there may be poorer choices (e.g., with lower scores) that we want to avoid emitting.
+ *
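+ * A minimal usage sketch (hypothetical words and scores; internally the smallest non-negative score is the best):
+ *
+ * <pre>{@code
+ * List<String> hitTerms = List.of("their");
+ * WordsAndScores ws = new WordsAndScores();
+ * ws.addTerm("there", 120000000, hitTerms); // competing candidate for this position
+ * ws.addTerm("their", 95000000, hitTerms); // hit term with the better (smaller) score
+ * String out = ws.getWordToOutput(); // the bracketed hit term, e.g. "[their(62)]" (the rendered 0-100 score is illustrative)
+ * }</pre>
+ *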
+ * In addition to preserving state, this class implements the getWordToOutput method that chooses the best word from this position to emit as a + * part of a hit excerpt, employing a variety of rules including everything from score to word length or the word's position in a phrase. + */ +public class WordsAndScores { + private static final Logger log = LoggerFactory.getLogger(WordsAndScores.class); + + public static final int MAX_ARRAY_SIZE = 1024; + public static final int MAX_SCORE = 900_000_000; + + /** the list of words, there may be one or many words saved for a position in the document */ + private final ArrayList words; + /** + * the list of scores, each word for a single position may have a different score.
+     * the indexes in this list correspond to the indexes in the words array
+     */
+    private final ArrayList<Integer> scores;
+
+    /** the index of the word with the longest length */
+    private int longestWordIndex;
+    /** the index of the word with the smallest score (smallest non-negative is best) */
+    private int smallestScoreIndex;
+    /** the index of the hit term to output */
+    private int hitTermIndex;
+    /** the size of the words and scores lists */
+    @PositiveOrZero
+    private int arrSize;
+
+    /**
+     * the index of the override to output. Except for when creating the "one-best" excerpt, the override has the highest priority for being output.
+     * It should be used when we want to output something different from the standard decision-making in getWordToOutput().
+     * External logic can use setOverride(index, value) to set.
+     */
+    private int overrideIndex;
+    /**
+     * 1 for beginning of phrase, 2 for middle of phrase, 3 for end of phrase, 4 for brackets around the specified word, anything else for none of those.
+     *
+     * It should be used when we want to output something different from the standard decision-making in getWordToOutput().
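+     * For example (hypothetical phrase), checkForHitPhrase() marks the three words of the hit phrase "quick brown fox" with override values 1, 2
+     * and 3, so the excerpt renders the whole phrase as "[quick brown fox]" rather than bracketing each word separately.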
+ * External logic can use setOverride(index, value) to set. + */ + private int overrideValue; + + private boolean useScores; + private boolean hasHitTerm; + private boolean outputScores; + private boolean oneBestExcerpt; + + private static final String OPENP = "("; + private static final String CLOSEP = ")"; + private static final String OPENB = "["; + private static final String CLOSEB = "]"; + + /** the list of stop words (words to skip when outputting the excerpt). */ + public static final Set STOP_WORD_LIST = Set.of(""); + + public WordsAndScores() { + words = new ArrayList<>(); + scores = new ArrayList<>(); + longestWordIndex = -1; + smallestScoreIndex = -1; + hitTermIndex = -1; + arrSize = 0; + useScores = false; + hasHitTerm = false; + overrideIndex = -1; + overrideValue = -1; + outputScores = false; + oneBestExcerpt = false; + } + + /** + * A method to add a word the object and update the internal state that tracks the best longest term and the best hit term. The added term may be a stop + * word, meaning that we don't want to output it as an excerpt. In certain cases, the stop word might be better than other choices, so we need to track them + * all. + * + * @param word + * the word to add + * @return True if the added term is a stop word and false otherwise. + */ + public boolean addTerm(String word, List hitTermsList) { + return addTerm(word, -1, hitTermsList); + } + + /** + * A method to add a word and score into the object and update the internal state that tracks the best longest term, the best score and the best hit term. + * The added term may be a stop word, meaning that we don't want to output it as an excerpt. In certain cases, the stop word might be better than other + * choices, so we need to track them all. + * + * @param word + * the word to add + * @param score + * the score to add + * @return True if the added term is a stop word and false otherwise. + */ + public boolean addTerm(String word, int score, List hitTermsList) { + final int currentIndex = arrSize; + arrSize++; + + if (arrSize > MAX_ARRAY_SIZE) { + log.info("WordsAndScores encountered more than the maximum number of words in this position, ignoring word: {} with score: {}", word, score); + return false; + } + + words.add(word); + // if a score is less than 0, replace it with -1; + scores.add(score >= 0 ? score : -1); + + updateLongestWordIndex(word, currentIndex); + // we only want to update with non-negative scores + if (score >= 0) { + updateBestScoreIndex(word, score, currentIndex); + } + + if ((hitTermsList != null) && hitTermsList.contains(word)) { + updateHitTermIndex(word, score, currentIndex); + } + + return STOP_WORD_LIST.contains(word); + } + + /** + * Update the longest word index if the specified word is longer than the current longest word + * + * @param word + * the word to check. + * @param currentIndex + * the current index we're inserting into. + */ + private void updateLongestWordIndex(String word, int currentIndex) { + if (currentIndex == 0) { + longestWordIndex = 0; + return; + } + + if (word.length() > words.get(longestWordIndex).length()) { // if this word is longer than the current longest word + longestWordIndex = currentIndex; // set this index as the longest word + } + } + + /** + * Update the best score index if the specified word has a better score than the current best score (the smallest non-negative score is the best score) + * + * @param word + * the word to check. + * @param score + * the word's score. 
(the caller only invokes this for non-negative scores).
+     * @param currentIndex
+     *            the current index we're inserting into.
+     */
+    private void updateBestScoreIndex(String word, @PositiveOrZero int score, int currentIndex) {
+        if (smallestScoreIndex == -1) {
+            // if we have no valid smallestScore, choose the current index
+            smallestScoreIndex = currentIndex;
+            // we've received at least one valid score, flip these flags to true
+            useScores = true;
+            outputScores = true;
+            return;
+        }
+
+        final int smallestScore = scores.get(smallestScoreIndex);
+        if (score < smallestScore) {
+            // if the current score is smaller, choose the current index
+            smallestScoreIndex = currentIndex;
+            return;
+        }
+
+        if (score == smallestScore) {
+            // if this score is equal to the smallest score, choose the longer word
+            log.info("Two tokens have the same score: Choosing the longest one.");
+            if (word.length() > words.get(smallestScoreIndex).length()) {
+                smallestScoreIndex = currentIndex;
+            }
+        }
+    }
+
+    /**
+     * Update the best hit term index if the specified word is a better hit term than the current one.
+     *
+     * @param word
+     *            the word to check.
+     * @param score
+     *            the word's score.
+     * @param currentIndex
+     *            the current index we're inserting into.
+     */
+    private void updateHitTermIndex(String word, int score, int currentIndex) {
+        if (hitTermIndex == -1) { // if we don't have a hit term at this offset yet...
+            hitTermIndex = currentIndex; // set this index as the one with a hit term
+            hasHitTerm = true;
+            return;
+        }
+
+        final int hitTermScore = scores.get(hitTermIndex);
+
+        // whether this word is longer than the existing best hit term word.
+        boolean currentTermIsLonger = word.length() > words.get(hitTermIndex).length();
+
+        // there are a couple of cases here:
+        // - if current has a score, and it's better than the best, we take current.
+        // - if current and best have equal scores, but current is longer, we take current instead of best
+        if (score >= 0 && hitTermScore > score) {
+            // valid current score and worse valid hitTermScore
+            hitTermIndex = currentIndex;
+        } else if ((hitTermScore == score || (hitTermScore < 0 && score < 0)) && currentTermIsLonger) {
+            // equal current score and hit term score, choose the longer word.
+            hitTermIndex = currentIndex;
+        }
+    }
+
+    /**
+     * A method to return whichever word we are going to use in the excerpt for this position. In order of preference:
+     *
+     * <ol>
+     * <li>one best excerpt, if enabled</li>
+     * <li>overridden word, if this has been provided externally</li>
+     * <li>the best hit term word, if present</li>
+     * <li>the best scored word</li>
+     * <li>the longest word</li>
+     * </ol>
+ * Each of these will be checked to see if they're stopwords. IF they are, returns null instead. + * + * @return the chosen word or null if it is something we do not want to output + */ + public String getWordToOutput() { + if (smallestScoreIndex == -1 && longestWordIndex == -1) { // if we try and get the word from an object with nothing added to it (should never happen)... + log.warn("Trying to get token to output when none have been added: Will output \"REPORTMETODATAWAVE\"."); + return "REPORTMETODATAWAVE"; + } + + if (oneBestExcerpt) { + return getOneBestWordToOutput(); + } + + if (overrideIndex >= 0 && overrideIndex < arrSize) { + return getOverrideWordToOutput(); + } + + if (hasHitTerm) { // if we have a hit term... + return getHitTermWordToOutput(); + } + + if (useScores) { // if we have added at least one score, and it is a valid score... + return getScoredWordToOutput(); + } + + // default to returning the longest word if the scores don't exist/aren't valid + if (STOP_WORD_LIST.contains(words.get(longestWordIndex))) { // if the selected term is in the stop list... + return null; + } + return words.get(longestWordIndex); // return the longest word + } + + /** + * A method to return the speech-to-text 'one-best' word to output. + * + * @return the one-best word, or null if the word is a stopword + */ + private String getOneBestWordToOutput() { + if (scores.get(smallestScoreIndex) > MAX_SCORE) { + return null; + } + if (STOP_WORD_LIST.contains(words.get(smallestScoreIndex))) { // if the selected term is in the stop list... + return null; + } + return hitTermIndex == smallestScoreIndex ? OPENB + words.get(hitTermIndex) + CLOSEB : words.get(smallestScoreIndex); + } + + /** + * A method to return the overridden word to output, which is identified externally to this code. + * + * @return the overridden word, or null if the word is a stopword or exceeds MAX_SCORE. + */ + private String getOverrideWordToOutput() { + if (scores.get(overrideIndex) > MAX_SCORE) { + return null; + } + if (STOP_WORD_LIST.contains(words.get(overrideIndex))) { // if the hit term is on the stop list for some reason... + return null; + } + switch (overrideValue) { + case 1: + if (useScores && (scores.get(overrideIndex) != -1)) { + return OPENB + words.get(overrideIndex) + OPENP + userReadable(scores.get(overrideIndex)) + CLOSEP; + } else { + return OPENB + words.get(overrideIndex); + } + case 3: + if (useScores && (scores.get(overrideIndex) != -1)) { + return words.get(overrideIndex) + OPENP + userReadable(scores.get(overrideIndex)) + CLOSEP + CLOSEB; + } else { + return words.get(overrideIndex) + CLOSEB; + } + case 2: + if (useScores && (scores.get(overrideIndex) != -1)) { + return words.get(overrideIndex) + OPENP + userReadable(scores.get(overrideIndex)) + CLOSEP; + } else { + return words.get(overrideIndex); + } + case 4: + return OPENB + words.get(overrideIndex) + CLOSEB; + default: + log.warn("Invalid override value {}: Will output \"REPORTMETODATAWAVE\".", overrideValue); + return "REPORTMETODATAWAVE"; + } + } + + /** + * A method to return the hit term to output, + * + * @return the hit term, or null if the word is a stopword or exceeds MAX_SCORE. + */ + private String getHitTermWordToOutput() { + if (scores.get(hitTermIndex) > MAX_SCORE) { + return null; + } + + final String hitTerm = words.get(hitTermIndex); + + if (STOP_WORD_LIST.contains(hitTerm)) { // if the hit term is on the stop list for some reason... 
+ return null; + } + + final int hitTermScore = scores.get(hitTermIndex); + + if (useScores && (hitTermScore != -1)) { // if we have a valid score for the hit term... + return OPENB + hitTerm + OPENP + userReadable(hitTermScore) + CLOSEP + CLOSEB; + } else { + return OPENB + hitTerm + CLOSEB; + } + } + + /** + * A method to return the word with the best (lowest) score if present. + * + * @return the best scored word or null if the word is a stop word or exceeds the max score. + */ + private String getScoredWordToOutput() { + final int bestWordScore = scores.get(smallestScoreIndex); + + if (bestWordScore > MAX_SCORE) { + return null; + } + + final String bestWord = words.get(smallestScoreIndex); + + if (STOP_WORD_LIST.contains(bestWord)) { // if the selected term is in the stop list... + return null; + } + + return outputScores ? bestWord + OPENP + userReadable(bestWordScore) + CLOSEP : bestWord; + } + + /** Converts the score into a number from 0-100 (higher is better) so that it is easier for the user to understand. */ + private int userReadable(int score) { + // the original probability got put through ln(x) so we do e^x to put it back to the original probability + return (int) Math.round((Math.exp(TermWeightPosition.termWeightScoreToPositionScore(score))) * 100); + } + + /** + * Returns the list of words saved in this object. + * + * @return words + */ + public List getWordsList() { + return words; + } + + /** + * Sets the list of words to the one passed in and sets the list of scores to -1. + * + * @param words + * a list of words + */ + public void setWordsList(List words, List hitTermsList) { + reset(); + for (String word : words) { + addTerm(word, hitTermsList); + } + } + + /** + * Returns the list of scores saved in this object. + * + * @return scores + */ + public List getScoresList() { + return scores; + } + + /** + * Sets the list of words and the list of scores to the ones passed in. + * + * @param words + * a list of words + * @param scores + * a list of scores + */ + public void setWordsAndScoresList(List words, List scores, List hitTermsList) { + if (words.size() != scores.size()) { + throw new IllegalArgumentException("The words and scores lists must be the same size!"); + } else { + reset(); + for (int i = 0; i < words.size(); i++) { + addTerm(words.get(i), scores.get(i), hitTermsList); + } + } + } + + public void setOverride(int overrideIndex, int overrideValue) { + this.overrideIndex = overrideIndex; + this.overrideValue = overrideValue; + } + + public void setOutputScores(boolean outputScores) { + this.outputScores = outputScores; + } + + public void setOneBestExcerpt(boolean oneBestExcerpt) { + this.oneBestExcerpt = oneBestExcerpt; + } + + /** + * Returns a boolean that is true if there is a valid score in the scores list and false otherwise. + * + * @return useScores + */ + public boolean getUseScores() { + return useScores; + } + + /** + * Returns a boolean that is true if there is a hit term in the words list and false otherwise. 
+ * + * @return hasHitTerm + */ + public boolean getHasHitTerm() { + return hasHitTerm; + } + + public int getArrSize() { + return arrSize; + } + + public int getLongestWordIndex() { + return longestWordIndex; + } + + public int getHitTermIndex() { + return hitTermIndex; + } + + public int getOverrideIndex() { + return overrideIndex; + } + + public int getOverrideValue() { + return overrideValue; + } + + public void reset() { + words.clear(); + scores.clear(); + longestWordIndex = -1; + smallestScoreIndex = -1; + hitTermIndex = -1; + arrSize = 0; + useScores = false; + hasHitTerm = false; + overrideIndex = -1; + overrideValue = -1; + outputScores = false; + oneBestExcerpt = false; + } + + @Override + public String toString() { + return "WordsAndScores{" + "words=" + words + ", scores=" + scores + ", longestWordIndex=" + longestWordIndex + ", smallestScoreIndex=" + + smallestScoreIndex + ", arrSize=" + arrSize + '}'; + } +} diff --git a/warehouse/query-core/src/main/java/datawave/query/postprocessing/tf/PhraseIndexes.java b/warehouse/query-core/src/main/java/datawave/query/postprocessing/tf/PhraseIndexes.java index 8a93ed68d27..505e3fc172e 100644 --- a/warehouse/query-core/src/main/java/datawave/query/postprocessing/tf/PhraseIndexes.java +++ b/warehouse/query-core/src/main/java/datawave/query/postprocessing/tf/PhraseIndexes.java @@ -7,11 +7,11 @@ import java.util.List; import java.util.Objects; import java.util.Set; +import java.util.regex.Pattern; import java.util.stream.Collectors; -import org.apache.commons.lang3.StringUtils; -import org.apache.log4j.Logger; -import org.javatuples.Triplet; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import com.google.common.collect.Lists; import com.google.common.collect.Ordering; @@ -27,47 +27,52 @@ * required to retrieve excerpts when requested. */ public class PhraseIndexes { - private static final Logger log = Logger.getLogger(PhraseIndexes.class); + private static final Logger log = LoggerFactory.getLogger(PhraseIndexes.class); + + private static final Comparator endOffsetComparator = Comparator.comparingInt(PhraseOffset::getEndOffset); + + private static final Comparator reverseIntegerCompare = ((Comparator) Integer::compareTo).reversed(); + + private static final String WHITESPACE = "\\s"; + public static final Pattern whitespacePattern = Pattern.compile(WHITESPACE); /** - * A Map of fieldname to eventId,start,end phrase offsets. The eventId has the form as defined by TermFrequencyList.getEventid(key) + * A Map of field name to eventId,start,end phrase offsets. The eventId has the form as defined by TermFrequencyList.getEventId(key) */ - private final SortedSetMultimap> map = TreeMultimap.create(); + private final SortedSetMultimap map = TreeMultimap.create(); /** - * Returns a new {@link PhraseIndexes} parsed from the string. The provided string is expected to have the format returned by + * Returns a new {@link PhraseIndexes} object parsed from the string. The provided string is expected to have the format returned by * {@link PhraseIndexes#toString()}. *
     *
     * <ul>
     * <li>Given null, null will be returned.</li>
     * <li>Given an empty or blank string, an empty {@link PhraseIndexes} will be returned.</li>
-    * <li>Given {@code BODY:1,2:3,5/CONTENT:5,6:7,6}, a {@link PhraseIndexes} will be returned with offsets [1,2] and [3,5] for field {@code BODY}, and offsets
-    * [5,6] and [7,6] for field {@code CONTENT}.</li>
+    * <li>Given {@code BODY:event1,1,2:event2,3,5/CONTENT:event3,5,6:event4,7,6}, a {@link PhraseIndexes} will be returned with offsets [1,2] for event1 and
+    * [3,5] for event2 in the field {@code BODY}, and offsets [5,6] for event3 and [7,6] for event4 in the field {@code CONTENT}.</li>
     * </ul>
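     * <p>
     * For example, a hypothetical round-trip through this format (the event ids are illustrative):
     *
     * <pre>{@code
     * PhraseIndexes indexes = PhraseIndexes.from("BODY:event1,1,2:event2,3,5");
     * // the two triplets are for different events and do not overlap, so nothing is merged
     * // and the object serializes back to the same normalized form
     * String roundTrip = indexes.toString(); // "BODY:event1,1,2:event2,3,5"
     * }</pre>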
* - * @param string + * @param phraseIndexString * the string to parse * @return the parsed {@link PhraseIndexes} */ - public static PhraseIndexes from(String string) { - if (string == null) { + public static PhraseIndexes from(String phraseIndexString) { + if (phraseIndexString == null) { return null; } // Strip whitespaces. - string = StringUtils.deleteWhitespace(string); + phraseIndexString = whitespacePattern.matcher(phraseIndexString).replaceAll(""); PhraseIndexes phraseIndexes = new PhraseIndexes(); - String[] fieldParts = string.split(Constants.FORWARD_SLASH); + final String[] fieldParts = phraseIndexString.split(Constants.FORWARD_SLASH); for (String fieldPart : fieldParts) { String[] parts = fieldPart.split(Constants.COLON); String field = parts[0]; + for (int i = 1; i < parts.length; i++) { String[] indexParts = parts[i].split(Constants.COMMA); - String eventId = indexParts[0]; // if the event ID is empty, then it must have been null initially (see toString()) - if (eventId.isEmpty()) { - eventId = null; - } + String eventId = indexParts[0].isEmpty() ? null : indexParts[0]; int start = Integer.parseInt(indexParts[1]); int end = Integer.parseInt(indexParts[2]); phraseIndexes.addIndexTriplet(field, eventId, start, end); @@ -84,8 +89,8 @@ public static PhraseIndexes from(String string) { */ public void addAll(PhraseIndexes phraseIndexes) { for (String field : phraseIndexes.getFields()) { - for (Triplet indice : phraseIndexes.getIndices(field)) { - addIndexTriplet(field, indice.getValue0(), indice.getValue1(), indice.getValue2()); + for (PhraseOffset entry : phraseIndexes.getPhraseOffsets(field)) { + addIndexTriplet(field, entry.getEventId(), entry.getStartOffset(), entry.getEndOffset()); } } } @@ -112,22 +117,22 @@ public void addIndexTriplet(String field, String eventId, int start, int end) { // first remove any overlapping phrases and extend the start/end appropriately if (map.containsKey(field)) { - Iterator> indices = map.get(field).iterator(); + Iterator indices = map.get(field).iterator(); while (indices.hasNext()) { - Triplet indice = indices.next(); + PhraseOffset entry = indices.next(); // if we have gone past the end, then no more possibility of overlapping - if (indice.getValue1() > end) { + if (entry.getStartOffset() > end) { break; } // if from the same event/document, and the endpoints overlap - if (Objects.equals(eventId, indice.getValue0()) && overlaps(indice.getValue1(), indice.getValue2(), start, end)) { - start = Math.min(start, indice.getValue1()); - end = Math.max(end, indice.getValue2()); + if (Objects.equals(eventId, entry.getEventId()) && overlaps(entry.getStartOffset(), entry.getEndOffset(), start, end)) { + start = Math.min(start, entry.getStartOffset()); + end = Math.max(end, entry.getEndOffset()); indices.remove(); } } } - map.put(field, Triplet.with(eventId, start, end)); + map.put(field, PhraseOffset.with(eventId, start, end)); } /** @@ -138,18 +143,18 @@ public void addIndexTriplet(String field, String eventId, int start, int end) { * @param offset * The phrase triplet */ - public void addIndexTriplet(String field, Triplet offset) { - addIndexTriplet(field, offset.getValue0(), offset.getValue1(), offset.getValue2()); + public void addIndexTriplet(String field, PhraseOffset offset) { + addIndexTriplet(field, offset.getEventId(), offset.getStartOffset(), offset.getEndOffset()); } /** - * Get all index pairs found for matching hits for the specified field. May return null. + * Get all offsets found for matching hits for the specified field. 
May return an empty collection or null. * * @param field * the field * @return the index pairs if any, otherwise null */ - public Collection> getIndices(String field) { + public Collection getPhraseOffsets(String field) { return map.get(field); } @@ -189,57 +194,6 @@ public boolean containsField(String field) { return map.containsKey(field); } - @Override - public boolean equals(Object o) { - if (this == o) { - return true; - } - if (o == null || getClass() != o.getClass()) { - return false; - } - PhraseIndexes that = (PhraseIndexes) o; - return Objects.equals(map, that.map); - } - - @Override - public int hashCode() { - return Objects.hash(map); - } - - /** - * Returns this {@link PhraseIndexes} as a formatted string that can later be parsed back into a {@link PhraseIndexes} using - * {@link PhraseIndexes#from(String)}. The string will have the format FIELD:eventId,start,end:eventId,start,end:.../FIELD:eventId,start,end:... - * - * @return a formatted string - */ - @Override - public String toString() { - StringBuilder sb = new StringBuilder(); - Iterator fieldIterator = map.keySet().iterator(); - while (fieldIterator.hasNext()) { - // Write the field. - String field = fieldIterator.next(); - sb.append(field).append(Constants.COLON); - // Write the indexes found for the field. - Iterator> indexIterator = map.get(field).iterator(); - while (indexIterator.hasNext()) { - Triplet indexTriplet = indexIterator.next(); - if (indexTriplet.getValue0() != null) { - sb.append(indexTriplet.getValue0()); - } - sb.append(Constants.COMMA).append(indexTriplet.getValue1()); - sb.append(Constants.COMMA).append(indexTriplet.getValue2()); - if (indexIterator.hasNext()) { - sb.append(Constants.COLON); - } - } - if (fieldIterator.hasNext()) { - sb.append(Constants.FORWARD_SLASH); - } - } - return sb.toString(); - } - /** * Utility function to see if two offset ranges overlap * @@ -266,8 +220,8 @@ public static boolean overlaps(int start1, int end1, int start2, int end2) { * pos * @return true if overlapping */ - public static boolean overlaps(Triplet triplet, TermWeightPosition pos) { - return overlaps(triplet.getValue1(), triplet.getValue2(), pos.getLowOffset(), pos.getOffset()); + public static boolean overlaps(PhraseOffset triplet, TermWeightPosition pos) { + return overlaps(triplet.getStartOffset(), triplet.getEndOffset(), pos.getLowOffset(), pos.getOffset()); } /** @@ -281,18 +235,18 @@ public static boolean overlaps(Triplet triplet, TermWeig * term weight position * @return the overlapping triplet */ - public Triplet getOverlap(String fieldName, String eventId, TermWeightPosition position) { - Collection> indexes = getIndices(fieldName); - if (indexes != null) { + public PhraseOffset getOverlap(String fieldName, String eventId, TermWeightPosition position) { + Collection phraseOffsets = getPhraseOffsets(fieldName); + if (phraseOffsets != null) { int start = position.getLowOffset(); int end = position.getOffset(); - for (Triplet triplet : indexes) { + for (PhraseOffset offset : phraseOffsets) { // if the start of the triplet is past the end, then no more possibility of overlapping - if (triplet.getValue1() > end) { + if (offset.getStartOffset() > end) { break; } - if (Objects.equals(eventId, triplet.getValue0()) && overlaps(triplet.getValue1(), triplet.getValue2(), start, end)) { - return triplet; + if (Objects.equals(eventId, offset.getEventId()) && overlaps(offset.getStartOffset(), offset.getEndOffset(), start, end)) { + return offset; } } } @@ -311,12 +265,12 @@ public Triplet getOverlap(String 
fieldName, String event * @return An overlapping TermWeightPosition if any */ public TermWeightPosition getOverlappingPosition(String fieldName, String eventId, TermWeight.Info twInfo) { - // get the phases for this fieldname - Collection> triplets = getIndices(fieldName); + // get the phases for this field name + Collection triplets = getPhraseOffsets(fieldName); if (triplets != null) { // get the triplets is reverse sorted order base on the end index and filtered by event id - List> reverseEndIndexSortedList = triplets.stream().filter(t -> Objects.equals(eventId, t.getValue0())) + List reverseEndIndexSortedList = triplets.stream().filter(t -> Objects.equals(eventId, t.getEventId())) .sorted(endOffsetComparator.reversed()).collect(Collectors.toList()); if (!reverseEndIndexSortedList.isEmpty()) { @@ -344,9 +298,9 @@ public TermWeightPosition getOverlappingPosition(String fieldName, String eventI int start = twInfo.getTermOffsetCount() - 1; // iterator through the phrase triplets - for (Triplet triplet : reverseEndIndexSortedList) { + for (PhraseOffset triplet : reverseEndIndexSortedList) { // find the index of the first offset before or equal to the triplet end offset plus the max skip value - start = findPrevOffset(twInfo, start, triplet.getValue2() + maxSkip); + start = findPrevOffset(twInfo, start, triplet.getEndOffset() + maxSkip); // if we have an index, then search backwards for an overlap if (start >= 0) { for (int offsetIndex = start; offsetIndex >= 0; offsetIndex--) { @@ -354,7 +308,7 @@ public TermWeightPosition getOverlappingPosition(String fieldName, String eventI TermWeightPosition pos = position.build(); if (PhraseIndexes.overlaps(triplet, pos)) { return pos; - } else if (pos.getOffset() < triplet.getValue1()) { + } else if (pos.getOffset() < triplet.getStartOffset()) { break; } } @@ -398,18 +352,54 @@ private static int findPrevOffset(TermWeight.Info twInfo, int startIndex, int of return nextOffset; } - private static final Comparator> endOffsetComparator = new Comparator>() { - @Override - public int compare(Triplet o1, Triplet o2) { - return o1.getValue2() - o2.getValue2(); + @Override + public boolean equals(Object o) { + if (this == o) { + return true; } - }; - - private static final Comparator reverseIntegerCompare = new Comparator() { - @Override - public int compare(Integer o1, Integer o2) { - return o1.compareTo(o2); + if (o == null || getClass() != o.getClass()) { + return false; } - }.reversed(); + PhraseIndexes that = (PhraseIndexes) o; + return Objects.equals(map, that.map); + } + @Override + public int hashCode() { + return Objects.hash(map); + } + + /** + * Returns this {@link PhraseIndexes} as a formatted string that can later be parsed back into a {@link PhraseIndexes} using + * {@link PhraseIndexes#from(String)}. The string will have the format FIELD:eventId,start,end:eventId,start,end:.../FIELD:eventId,start,end:... + * + * @return a formatted string + */ + @Override + public String toString() { + StringBuilder sb = new StringBuilder(); + Iterator fieldIterator = map.keySet().iterator(); + while (fieldIterator.hasNext()) { + // Write the field. + String field = fieldIterator.next(); + sb.append(field).append(Constants.COLON); + // Write the indexes found for the field. 
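+            // illustrative: offsets (event1,1,2) and (event2,3,5) under field BODY serialize as "BODY:event1,1,2:event2,3,5"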
+ Iterator indexIterator = map.get(field).iterator(); + while (indexIterator.hasNext()) { + PhraseOffset indexTriplet = indexIterator.next(); + if (indexTriplet.getEventId() != null) { + sb.append(indexTriplet.getEventId()); + } + sb.append(Constants.COMMA).append(indexTriplet.getStartOffset()); + sb.append(Constants.COMMA).append(indexTriplet.getEndOffset()); + if (indexIterator.hasNext()) { + sb.append(Constants.COLON); + } + } + if (fieldIterator.hasNext()) { + sb.append(Constants.FORWARD_SLASH); + } + } + return sb.toString(); + } } diff --git a/warehouse/query-core/src/main/java/datawave/query/postprocessing/tf/PhraseOffset.java b/warehouse/query-core/src/main/java/datawave/query/postprocessing/tf/PhraseOffset.java new file mode 100644 index 00000000000..67cf165afb6 --- /dev/null +++ b/warehouse/query-core/src/main/java/datawave/query/postprocessing/tf/PhraseOffset.java @@ -0,0 +1,64 @@ +package datawave.query.postprocessing.tf; + +import java.util.Objects; + +/** + * This class represents a specific phrase that has been matched in a retrieved document field. It contains the id of the event it is associated with and the + * start and end offset of the phrase in the field specified. The offsets match those found for individual terms in the tf portion of the index.
+ *
+ * The eventId has the form as defined by TermFrequencyList.getEventId(key) + */ +public class PhraseOffset implements Comparable { + private final String eventId; + private final int startOffset; + private final int endOffset; + + public static PhraseOffset with(String eventId, int startOffset, int endOffset) { + return new PhraseOffset(eventId, startOffset, endOffset); + } + + private PhraseOffset(String eventId, int startOffset, int endOffset) { + this.eventId = eventId; + this.startOffset = startOffset; + this.endOffset = endOffset; + } + + @Override + public int compareTo(PhraseOffset o) { + int cmp; + if ((cmp = eventId.compareTo(o.eventId)) != 0) { + return cmp; + } + if ((cmp = Integer.compare(startOffset, o.startOffset)) != 0) { + return cmp; + } + return Integer.compare(endOffset, o.endOffset); + } + + public int getEndOffset() { + return endOffset; + } + + public int getStartOffset() { + return startOffset; + } + + public String getEventId() { + return eventId; + } + + @Override + public boolean equals(Object o) { + if (this == o) + return true; + if (!(o instanceof PhraseOffset)) + return false; + PhraseOffset that = (PhraseOffset) o; + return startOffset == that.startOffset && endOffset == that.endOffset && Objects.equals(eventId, that.eventId); + } + + @Override + public int hashCode() { + return Objects.hash(eventId, startOffset, endOffset); + } +} diff --git a/warehouse/query-core/src/main/java/datawave/query/postprocessing/tf/TermOffsetMap.java b/warehouse/query-core/src/main/java/datawave/query/postprocessing/tf/TermOffsetMap.java index 65892cf5b7c..c796a672efd 100644 --- a/warehouse/query-core/src/main/java/datawave/query/postprocessing/tf/TermOffsetMap.java +++ b/warehouse/query-core/src/main/java/datawave/query/postprocessing/tf/TermOffsetMap.java @@ -7,8 +7,6 @@ import java.util.Set; import java.util.StringJoiner; -import org.javatuples.Triplet; - import datawave.query.jexl.functions.TermFrequencyList; /** @@ -99,9 +97,9 @@ public void addPhraseIndexTriplet(String field, String eventId, int start, int e * the field * @return the phrase indexes */ - public Collection> getPhraseIndexes(String field) { + public Collection getPhraseIndexes(String field) { if (phraseIndexes != null) { - return phraseIndexes.getIndices(field); + return phraseIndexes.getPhraseOffsets(field); } return null; } @@ -109,7 +107,7 @@ public Collection> getPhraseIndexes(String field /** * Return the underlying {@link PhraseIndexes} object * - * @return a phraseindexes object + * @return a PhraseIndexes object */ public PhraseIndexes getPhraseIndexes() { return phraseIndexes; diff --git a/warehouse/query-core/src/main/java/datawave/query/transformer/ExcerptTransform.java b/warehouse/query-core/src/main/java/datawave/query/transformer/ExcerptTransform.java index 7cad133f67f..bbcf9560f34 100644 --- a/warehouse/query-core/src/main/java/datawave/query/transformer/ExcerptTransform.java +++ b/warehouse/query-core/src/main/java/datawave/query/transformer/ExcerptTransform.java @@ -1,13 +1,17 @@ package datawave.query.transformer; +import static datawave.query.iterator.logic.TermFrequencyExcerptIterator.Configuration.END_OFFSET; +import static datawave.query.iterator.logic.TermFrequencyExcerptIterator.Configuration.FIELD_NAME; +import static datawave.query.iterator.logic.TermFrequencyExcerptIterator.Configuration.START_OFFSET; + import java.io.IOException; import java.util.ArrayList; +import java.util.Arrays; import java.util.Collection; import java.util.Collections; import java.util.HashMap; import 
java.util.HashSet; import java.util.Iterator; -import java.util.List; import java.util.Map; import java.util.Map.Entry; import java.util.Objects; @@ -21,9 +25,8 @@ import org.apache.accumulo.core.data.Value; import org.apache.accumulo.core.iterators.IteratorEnvironment; import org.apache.accumulo.core.iterators.SortedKeyValueIterator; -import org.apache.commons.lang3.StringUtils; -import org.apache.log4j.Logger; -import org.javatuples.Triplet; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import com.google.common.collect.Iterators; import com.google.protobuf.InvalidProtocolBufferException; @@ -41,20 +44,19 @@ import datawave.query.function.JexlEvaluation; import datawave.query.iterator.logic.TermFrequencyExcerptIterator; import datawave.query.postprocessing.tf.PhraseIndexes; +import datawave.query.postprocessing.tf.PhraseOffset; public class ExcerptTransform extends DocumentTransform.DefaultDocumentTransform { - private static final Logger log = Logger.getLogger(ExcerptTransform.class); - + private static final Logger log = LoggerFactory.getLogger(ExcerptTransform.class); public static final String PHRASE_INDEXES_ATTRIBUTE = "PHRASE_INDEXES_ATTRIBUTE"; public static final String HIT_EXCERPT = "HIT_EXCERPT"; + public static final String HIT_EXCERPT_WITH_SCORES = "HIT_EXCERPT_WITH_SCORES"; + public static final String HIT_EXCERPT_ONE_BEST = "HIT_EXCERPT_ONE_BEST"; + public static final String EXCERPT_ERROR_MESSAGE = "SOMETHING WENT WRONG GENERATING YOUR EXCERPT!"; + private static final Excerpt ERROR_EXCERPT = new Excerpt(null, EXCERPT_ERROR_MESSAGE, EXCERPT_ERROR_MESSAGE, EXCERPT_ERROR_MESSAGE); - private static final String BEFORE = "BEFORE"; - - private static final String AFTER = "AFTER"; - - private final Map excerptIteratorOptions = new HashMap<>(); - private final SortedKeyValueIterator excerptIterator; + private final TermFrequencyExcerptIterator excerptIterator; private final ExcerptFields excerptFields; private final IteratorEnvironment env; private final SortedKeyValueIterator source; @@ -71,7 +73,7 @@ public ExcerptTransform(ExcerptFields excerptFields, IteratorEnvironment env, So this.excerptFields = excerptFields; this.env = env; this.source = source; - this.excerptIterator = excerptIterator; + this.excerptIterator = (TermFrequencyExcerptIterator) excerptIterator; } @Nullable @@ -84,13 +86,13 @@ public Entry apply(@Nullable Entry entry) { PhraseIndexes phraseIndexes = getPhraseIndexes(document); if (!phraseIndexes.isEmpty()) { if (log.isTraceEnabled()) { - log.trace("Fetching phrase excerpts " + excerptFields + " for document " + document.getMetadata()); + log.trace("Fetching phrase excerpts {} for document {}", excerptFields, document.getMetadata()); } Set excerpts = getExcerpts(phraseIndexes); addExcerptsToDocument(excerpts, document); } else { if (log.isTraceEnabled()) { - log.trace("Phrase indexes were not added to document " + document.getMetadata() + ", skipping"); + log.trace("Phrase indexes were not added to document {}, skipping", document.getMetadata()); } } } @@ -108,7 +110,7 @@ public Entry apply(@Nullable Entry entry) { private PhraseIndexes getPhraseIndexes(Document document) { PhraseIndexes phraseIndexes = null; PhraseIndexes allPhraseIndexes = new PhraseIndexes(); - // first lets find all of the phrase indexes that came from phrase functions + // first lets find all the phrase indexes that came from phrase functions if (document.containsKey(PHRASE_INDEXES_ATTRIBUTE)) { Content content = (Content) document.get(PHRASE_INDEXES_ATTRIBUTE); phraseIndexes 
= PhraseIndexes.from(content.getContent()); @@ -130,7 +132,7 @@ private PhraseIndexes getPhraseIndexes(Document document) { allPhraseIndexes.addIndexTriplet(String.valueOf(hitTuple.getFieldName()), keyToEventId(attr.getMetadata()), pos.getLowOffset(), pos.getOffset()); } - // save the hit term for later callout + // save the hit term for later call-out Collections.addAll(hitTermValues, ((String) hitTuple.getValue()).split(Constants.SPACE)); } } @@ -141,7 +143,7 @@ private PhraseIndexes getPhraseIndexes(Document document) { /** * Get the term weight position (offset) for the specified hit term. This will return an offset overlapping a phrase in the existing phrase index map first. - * Otherwise the first position will be returned. + * Otherwise, the first position will be returned. * * @param hitTuple * The hit term tuple @@ -188,28 +190,13 @@ private TermWeightPosition getOffset(ValueTuple hitTuple, PhraseIndexes phraseIn } } catch (InvalidProtocolBufferException e) { - log.error("Value passed to aggregator was not of type TermWeight.Info for " + tfKey, e); + log.error("Value passed to aggregator was not of type TermWeight.Info for {}", tfKey, e); } catch (IOException e) { - log.error("Failed to scan for term frequencies at " + tfKey, e); + log.error("Failed to scan for term frequencies at {}", tfKey, e); } return null; } - /** - * Given a hit term attribute, return a ValueTuple representation which will give us the field and value parsed out. - * - * @param source - * a hit term attribute - * @return A ValueTuple representation of the document hit-term attribute - */ - private ValueTuple attributeToHitTuple(Attribute source) { - String hitTuple = String.valueOf(source.getData()); - int index = hitTuple.indexOf(':'); - String fieldName = hitTuple.substring(0, index); - String value = hitTuple.substring(index + 1); - return new ValueTuple(fieldName, value, value, source); - } - /** * Add the excerpts to the document as part of {@value #HIT_EXCERPT}. 
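     * When at least one excerpt carries a valid score, score-annotated and one-best variants are also added under
     * {@value #HIT_EXCERPT_WITH_SCORES} and {@value #HIT_EXCERPT_ONE_BEST}.
     *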
* @@ -218,45 +205,35 @@ private ValueTuple attributeToHitTuple(Attribute source) { * @param document * the document */ - private void addExcerptsToDocument(Set excerpts, Document document) { - Attributes attributes = new Attributes(true); + private static void addExcerptsToDocument(Set excerpts, Document document) { + Attributes attributesWithoutScores = new Attributes(true); + Attributes attributesWithScores = new Attributes(true); + Attributes attributesOneBest = new Attributes(true); + + boolean hasScores = false; + for (Excerpt excerpt : excerpts) { - Content content = new Content(excerpt.getExcerpt(), excerpt.getSource(), true); - attributes.add(content); - } - document.put(HIT_EXCERPT, attributes); - } + Content contentWithoutScores = new Content(excerpt.getExcerptWithoutScores(), excerpt.getSource(), true); + attributesWithoutScores.add(contentWithoutScores); - /** - * Given an event ID, return the document Key - * - * @param eventId - * eventId string - * @return the document Key - */ - private Key eventIdToKey(String eventId) { - if (eventId != null) { - int split = eventId.indexOf('\u0000'); - if (split < 0) { - throw new IllegalStateException("Malformed eventId (expected a null separator): " + eventId); + String excerptWithScores = excerpt.getExcerptWithScores(); + if (excerptWithScores.isBlank() || excerptWithScores.equals(TermFrequencyExcerptIterator.NOT_SCORED_MARKER)) { + continue; } - return new Key(eventId.substring(0, split), eventId.substring(split + 1)); + + hasScores = true; + + Content contentWithScores = new Content(excerptWithScores, excerpt.getSource(), true); + attributesWithScores.add(contentWithScores); + Content contentOneBest = new Content(excerpt.getExcerptOneBest(), excerpt.getSource(), true); + attributesOneBest.add(contentOneBest); } - return null; - } - /** - * Given a document key, return the eventId - * - * @param docKey - * document key - * @return the event id (shard\x00dt\x00uid) - */ - private String keyToEventId(Key docKey) { - if (docKey != null) { - return docKey.getRow().toString() + '\u0000' + docKey.getColumnFamily().toString(); + document.put(HIT_EXCERPT, attributesWithoutScores); + if (hasScores) { + document.put(HIT_EXCERPT_WITH_SCORES, attributesWithScores); + document.put(HIT_EXCERPT_ONE_BEST, attributesOneBest); } - return null; } /** @@ -267,35 +244,40 @@ private String keyToEventId(Key docKey) { * @return the excerpts */ private Set getExcerpts(PhraseIndexes phraseIndexes) { - phraseIndexes = getOffsetPhraseIndexes(phraseIndexes); - if (phraseIndexes.isEmpty()) { + final PhraseIndexes offsetPhraseIndexes = getOffsetPhraseIndexes(phraseIndexes, excerptFields); + if (offsetPhraseIndexes.isEmpty()) { return Collections.emptySet(); } // Fetch the excerpts. 
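        // each eventId below has the form shard\0dt\0uid and is converted back into a document Key,
        // so each scan range covers exactly one document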
Set excerpts = new HashSet<>(); - for (String field : phraseIndexes.getFields()) { - Collection> indexes = phraseIndexes.getIndices(field); - for (Triplet indexPair : indexes) { - String eventId = indexPair.getValue0(); - int start = indexPair.getValue1(); - int end = indexPair.getValue2(); + for (String field : offsetPhraseIndexes.getFields()) { + Collection indexes = offsetPhraseIndexes.getPhraseOffsets(field); + for (PhraseOffset phraseOffset : indexes) { + String eventId = phraseOffset.getEventId(); + int start = phraseOffset.getStartOffset(); + int end = phraseOffset.getEndOffset(); if (log.isTraceEnabled()) { - log.trace("Fetching excerpt [" + start + "," + end + "] for field " + field + " for document " + eventId.replace('\u0000', '/')); + log.trace("Fetching excerpt [{},{}] for field {} for document {}", start, end, field, eventId.replace('\u0000', '/')); } // Construct the required range for this document. Key startKey = eventIdToKey(eventId); - Key endKey = startKey.followingKey(PartialKey.ROW_COLFAM); + Key endKey; + if (startKey != null) { + endKey = startKey.followingKey(PartialKey.ROW_COLFAM); + } else { + throw new IllegalStateException("eventID string was null"); + } Range range = new Range(startKey, true, endKey, false); - String excerpt = getExcerpt(field, start, end, range, hitTermValues); + Excerpt excerpt = getExcerpt(field, start, end, range, hitTermValues); // Only retain non-blank excerpts. - if (excerpt != null && !excerpt.isEmpty()) { - excerpts.add(new Excerpt(startKey, excerpt)); + if (!excerpt.isEmpty()) { + excerpts.add(excerpt); } else { if (log.isTraceEnabled()) { - log.trace("Failed to find excerpt [" + start + "," + end + "] for field " + field + "for document " + eventId.replace('\u0000', '/')); + log.trace("Failed to find excerpt [{},{}] for field {} for document {}", start, end, field, eventId.replace('\u0000', '/')); } } } @@ -318,52 +300,96 @@ private Set getExcerpts(PhraseIndexes phraseIndexes) { * the term values to match * @return the excerpt */ - private String getExcerpt(String field, int start, int end, Range range, ArrayList hitTermValues) { - excerptIteratorOptions.put(TermFrequencyExcerptIterator.FIELD_NAME, field); - excerptIteratorOptions.put(TermFrequencyExcerptIterator.START_OFFSET, String.valueOf(start)); - excerptIteratorOptions.put(TermFrequencyExcerptIterator.END_OFFSET, String.valueOf(end)); - try { - excerptIterator.init(source, excerptIteratorOptions, env); - excerptIterator.seek(range, Collections.emptyList(), false); - if (excerptIterator.hasTop()) { - Key topKey = excerptIterator.getTopKey(); - String[] parts = topKey.getColumnQualifier().toString().split(Constants.NULL); - // The column qualifier is expected to be field\0phrase. 
- if (parts.length == 2) { - return getHitPhrase(hitTermValues, parts); - } else { - log.warn(TermFrequencyExcerptIterator.class.getSimpleName() + " returned top key with incorrectly-formatted column qualifier in key: " - + topKey + " when scanning for excerpt [" + start + "," + end + "] for field " + field + " within range " + range); - return null; - } - } else { - return null; - } - } catch (IOException e) { - throw new RuntimeException("Failed to scan for excerpt [" + start + "," + end + "] for field " + field + " within range " + range, e); + private Excerpt getExcerpt(String field, int start, int end, Range range, ArrayList hitTermValues) { + // if given a beginning offset less than 0, set it to 0 + if (start < 0) { + start = 0; } - } - - private String getHitPhrase(ArrayList hitTermValues, String[] phraseParts) { - List hitPhrase = new ArrayList<>(); - for (String phrasePart : phraseParts[1].split(Constants.SPACE)) { - if (hitTermValues.contains(phrasePart)) { - hitPhrase.add("[" + phrasePart + "]"); + final float origHalfSize = (float) (end - start) / 2; // calculate "1/2 of the original requested excerpt size" + final int expandSize = 20; // how much we want to expand the start and end offsets by + + final Map excerptIteratorOptions = new HashMap<>(); + excerptIteratorOptions.put(FIELD_NAME, field); + // We will attempt to create the excerpt we want up to two times. + // Currently, the only condition that will cause a second attempt is if we detect stop words in the TFs we scan. + // The main difference in the second attempt is that it runs with an expanded range to allow us to remove the + // stop words and still have a correctly sized excerpt + for (int attempt = 0; attempt <= 1; attempt++) { + // if this is the first attempt, set the start and end offsets using the passed in values + if (attempt == 0) { + excerptIteratorOptions.put(START_OFFSET, String.valueOf(start)); + excerptIteratorOptions.put(END_OFFSET, String.valueOf(end)); } else { - hitPhrase.add(phrasePart); + // if this is the second attempt, set up the iterator with a larger range by adding/subtracting + // the start and end offsets by "expandedSize" + int expandedStart = Math.max(start - expandSize, 0); + int expandedEnd = end + expandSize; + excerptIteratorOptions.put(START_OFFSET, String.valueOf(expandedStart)); + excerptIteratorOptions.put(END_OFFSET, String.valueOf(expandedEnd)); + + if (log.isDebugEnabled()) { + log.debug("size of excerpt requested: {}", excerptFields.getOffset(field) * 2); + log.debug("original range is ({},{}) and the expanded range is ({},{})", start, end, expandedStart, expandedEnd); + } } - } - // return phrase based on direction - String result = String.join(" ", hitPhrase); // if no direction given, return everything - String direction = excerptFields.getDirection(phraseParts[0]).toUpperCase().trim(); - if (direction.equals(BEFORE)) { // remove tokens prior to hit term - result = StringUtils.substringBeforeLast(result, "]") + "]"; - } else if (direction.equals(AFTER)) { // remove tokens after hit term - result = "[" + StringUtils.substringAfter(result, "["); - } + try { + // set all of our options for the iterator + excerptIterator.init(source, excerptIteratorOptions, env); + excerptIterator.setHitTermsList(hitTermValues); + excerptIterator.setDirection(excerptFields.getDirection(field).toUpperCase().trim()); + excerptIterator.setOrigHalfSize(origHalfSize); + // if this is the second attempt, we want the iterator to trim the excerpt down to the size we want. 
+ // (remember we run the iterator with an expanded range the second time so we can potentially have a bigger excerpt than needed even after + // removing stop words) + if (attempt == 1) { + excerptIterator.setTrimExcerpt(true); + } - return result; + // run the iterator + excerptIterator.seek(range, Collections.emptyList(), false); + + // if an excerpt is returned... + if (excerptIterator.hasTop()) { + // the excerpt will be in the column qualifier of the top key + Key topKey = excerptIterator.getTopKey(); + // The column qualifier is expected to be field\0phraseWithScores\0phraseWithoutScores\0oneBestExcerpt. + // split the column qualifier on null bytes to get the different parts + // we should have 4 parts after splitting the column qualifier on the null bytes + final String[] parts = topKey.getColumnQualifier().toString().split(Constants.NULL); + // if we don't have 4 parts after splitting the column qualifier... + if (parts.length != 4) { + if (attempt == 0) { // if this is the first attempt, try again + continue; + } + + // if this is the second attempt, log an error + if (log.isErrorEnabled()) { + log.error("{} returned top key with incorrectly-formatted column qualifier in key: {} when scanning for excerpt [{},{}] for field {} within range {} : parts= {}", + TermFrequencyExcerptIterator.class.getSimpleName(), topKey, start, end, field, range, Arrays.toString(parts)); + } + break; + } + + // if we have reached the limit of times to try, or we have no stop words removed + if (!parts[1].equals(TermFrequencyExcerptIterator.WORD_SKIPPED_MARKER)) { + // return just the excerpt parts + return new Excerpt(range.getStartKey(), parts[1], parts[2], parts[3]); + } + } else { // If no excerpt was returned on the first attempt, try again. If no excerpt was returned on the second attempt, log an error. + if (attempt == 1 && log.isErrorEnabled()) { + log.error("TermFrequencyExcerptIterator returned with hasTop() false: something went wrong in the iterator (or given bad parameters to run with)"); + log.error("The iterator options were: Field \"{}\" Range= {} StartOffset= {} EndOffset= {} HitTerms= {}", field, range, + excerptIteratorOptions.get(START_OFFSET), excerptIteratorOptions.get(END_OFFSET), hitTermValues); + break; + } + } + } catch (IOException e) { + throw new RuntimeException("Failed to scan for excerpt [" + start + "," + end + "] for field " + field + " within range " + range, e); + } + } + // when working correctly, it should always return from inside the loop so if this is reached something went very wrong + return ERROR_EXCERPT; } /** @@ -372,20 +398,22 @@ private String getHitPhrase(ArrayList hitTermValues, String[] phrasePart * * @param phraseIndexes * the original phrase indexes + * @param excerptFields + * the fields that we want excerpts for * @return the filtered, offset phrase indexes */ - private PhraseIndexes getOffsetPhraseIndexes(PhraseIndexes phraseIndexes) { + private static PhraseIndexes getOffsetPhraseIndexes(PhraseIndexes phraseIndexes, ExcerptFields excerptFields) { PhraseIndexes offsetPhraseIndexes = new PhraseIndexes(); for (String field : excerptFields.getFields()) { // Filter out phrases that are not in desired fields. - Collection> indexes = phraseIndexes.getIndices(field); + Collection indexes = phraseIndexes.getPhraseOffsets(field); if (indexes != null) { int offset = excerptFields.getOffset(field); // Ensure the offset is modified to encompass the target excerpt range. 
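                // illustrative arithmetic (assumed values): a phrase at [7,9] with offset 2 becomes
                // start = 7 - 2 = 5 and end = 9 + 2 + 1 = 12, where the extra 1 compensates for the
                // non-inclusive end offset used when scanning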
- for (Triplet indexPair : indexes) { - String eventId = indexPair.getValue0(); - int start = indexPair.getValue1() <= offset ? 0 : indexPair.getValue1() - offset; - int end = indexPair.getValue2() + offset + 1; // Add 1 here to offset the non-inclusive end of the range that will be used when scanning. + for (PhraseOffset indexPair : indexes) { + String eventId = indexPair.getEventId(); + int start = indexPair.getStartOffset() <= offset ? 0 : indexPair.getStartOffset() - offset; + int end = indexPair.getEndOffset() + offset + 1; // Add 1 here to offset the non-inclusive end of the range that will be used when scanning. offsetPhraseIndexes.addIndexTriplet(field, eventId, start, end); } } @@ -393,6 +421,50 @@ private PhraseIndexes getOffsetPhraseIndexes(PhraseIndexes phraseIndexes) { return offsetPhraseIndexes; } + /** + * Given a hit term attribute, return a ValueTuple representation which will give us the field and value parsed out. + * + * @param source + * a hit term attribute + * @return A ValueTuple representation of the document hit-term attribute + */ + private static ValueTuple attributeToHitTuple(Attribute source) { + String hitTuple = String.valueOf(source.getData()); + int index = hitTuple.indexOf(':'); + String fieldName = hitTuple.substring(0, index); + String value = hitTuple.substring(index + 1); + return new ValueTuple(fieldName, value, value, source); + } + + /** + * Given an event ID, return the document Key + * + * @param eventId + * eventId string + * @return the document Key + */ + private static Key eventIdToKey(String eventId) { + if (eventId != null) { + int split = eventId.indexOf('\u0000'); + if (split < 0) { + throw new IllegalStateException("Malformed eventId (expected a null separator): " + eventId); + } + return new Key(eventId.substring(0, split), eventId.substring(split + 1)); + } + return null; + } + + /** + * Given a document key, return the eventId + * + * @param docKey + * document key + * @return the event id (shard\x00dt\x00uid) + */ + private static String keyToEventId(Key docKey) { + return docKey != null ? docKey.getRow().toString() + '\u0000' + docKey.getColumnFamily().toString() : null; + } + /** * Add phrase excerpts to the documents from the given iterator. 
* @@ -408,22 +480,38 @@ public Iterator> getIterator(final Iterator authSet = Collections.singleton(auths); + protected Set authSet = Set.of(auths); @Inject @SpringBean(name = "EventQuery") @@ -203,10 +202,10 @@ protected void runTestQuery(AccumuloClient connector, String queryString, Date s } } - Assert.assertTrue("unexpected fields returned: " + unexpectedFields.toString(), unexpectedFields.isEmpty()); - Assert.assertTrue(goodResults + " was not empty", goodResults.isEmpty()); + assertTrue("unexpected fields returned: " + unexpectedFields, unexpectedFields.isEmpty()); + assertTrue(goodResults + " was not empty", goodResults.isEmpty()); - Assert.assertFalse("No docs were returned!", docs.isEmpty()); + assertFalse("No docs were returned!", docs.isEmpty()); } @Test @@ -220,7 +219,7 @@ public void simpleTest() throws Exception { String queryString = "QUOTE:(farther) #EXCERPT_FIELDS(QUOTE/2)"; // not sure why the timestamp and delete flag are present - Set goodResults = Sets.newHashSet("HIT_EXCERPT:get much [farther] with a: : [] 9223372036854775807 false"); + Set goodResults = new HashSet<>(Set.of("HIT_EXCERPT:get much [farther] with a: : [] 9223372036854775807 false")); runTestQuery(queryString, format.parse("19000101"), format.parse("20240101"), extraParameters, goodResults); } @@ -236,7 +235,7 @@ public void simpleTestBefore() throws Exception { String queryString = "QUOTE:(farther) #EXCERPT_FIELDS(QUOTE/2/before)"; // not sure why the timestamp and delete flag are present - Set goodResults = Sets.newHashSet("HIT_EXCERPT:get much [farther]: : [] 9223372036854775807 false"); + Set goodResults = new HashSet<>(Set.of("HIT_EXCERPT:get much [farther]: : [] 9223372036854775807 false")); runTestQuery(queryString, format.parse("19000101"), format.parse("20240101"), extraParameters, goodResults); } @@ -252,7 +251,7 @@ public void simpleTestAfter() throws Exception { String queryString = "QUOTE:(farther) #EXCERPT_FIELDS(QUOTE/2/after)"; // not sure why the timestamp and delete flag are present - Set goodResults = Sets.newHashSet("HIT_EXCERPT:[farther] with a: : [] 9223372036854775807 false"); + Set goodResults = new HashSet<>(Set.of("HIT_EXCERPT:[farther] with a: : [] 9223372036854775807 false")); runTestQuery(queryString, format.parse("19000101"), format.parse("20240101"), extraParameters, goodResults); } @@ -267,7 +266,7 @@ public void lessSimpleBeforeTest() throws Exception { String queryString = "QUOTE:(he cant refuse) #EXCERPT_FIELDS(QUOTE/2/before)"; - Set goodResults = Sets.newHashSet("HIT_EXCERPT:an offer [he] [cant] [refuse]: : [] 9223372036854775807 false"); + Set goodResults = new HashSet<>(Set.of("HIT_EXCERPT:an offer [he] [cant] [refuse]: : [] 9223372036854775807 false")); runTestQuery(queryString, format.parse("19000101"), format.parse("20240101"), extraParameters, goodResults); } @@ -282,7 +281,7 @@ public void lessSimpleAfterTest() throws Exception { String queryString = "QUOTE:(he cant refuse) #EXCERPT_FIELDS(QUOTE/2/after)"; - Set goodResults = Sets.newHashSet("HIT_EXCERPT:[he] [cant] [refuse]: : [] 9223372036854775807 false"); + Set goodResults = new HashSet<>(Set.of("HIT_EXCERPT:[he] [cant] [refuse]: : [] 9223372036854775807 false")); runTestQuery(queryString, format.parse("19000101"), format.parse("20240101"), extraParameters, goodResults); } @@ -297,7 +296,7 @@ public void lessSimpleTest() throws Exception { String queryString = "QUOTE:(he cant refuse) #EXCERPT_FIELDS(QUOTE/2)"; - Set goodResults = Sets.newHashSet("HIT_EXCERPT:an offer [he] [cant] [refuse]: : [] 
9223372036854775807 false"); + Set goodResults = new HashSet<>(Set.of("HIT_EXCERPT:an offer [he] [cant] [refuse]: : [] 9223372036854775807 false")); runTestQuery(queryString, format.parse("19000101"), format.parse("20240101"), extraParameters, goodResults); } @@ -312,7 +311,7 @@ public void biggerRangeThanQuoteLength() throws Exception { String queryString = "QUOTE:(he cant refuse) #EXCERPT_FIELDS(QUOTE/20)"; - Set goodResults = Sets.newHashSet("HIT_EXCERPT:im gonna make him an offer [he] [cant] [refuse]: : [] 9223372036854775807 false"); + Set goodResults = new HashSet<>(Set.of("HIT_EXCERPT:im gonna make him an offer [he] [cant] [refuse]: : [] 9223372036854775807 false")); runTestQuery(queryString, format.parse("19000101"), format.parse("20240101"), extraParameters, goodResults); } @@ -327,7 +326,7 @@ public void biggerRangeThanQuoteLengthBeforeTest() throws Exception { String queryString = "QUOTE:(he cant refuse) #EXCERPT_FIELDS(QUOTE/20/before)"; - Set goodResults = Sets.newHashSet("HIT_EXCERPT:im gonna make him an offer [he] [cant] [refuse]: : [] 9223372036854775807 false"); + Set goodResults = new HashSet<>(Set.of("HIT_EXCERPT:im gonna make him an offer [he] [cant] [refuse]: : [] 9223372036854775807 false")); runTestQuery(queryString, format.parse("19000101"), format.parse("20240101"), extraParameters, goodResults); } @@ -342,7 +341,7 @@ public void biggerRangeThanQuoteLengthAfterTest() throws Exception { String queryString = "QUOTE:(he cant refuse) #EXCERPT_FIELDS(QUOTE/20/after)"; - Set goodResults = Sets.newHashSet("HIT_EXCERPT:[he] [cant] [refuse]: : [] 9223372036854775807 false"); + Set goodResults = new HashSet<>(Set.of("HIT_EXCERPT:[he] [cant] [refuse]: : [] 9223372036854775807 false")); runTestQuery(queryString, format.parse("19000101"), format.parse("20240101"), extraParameters, goodResults); } @@ -357,7 +356,8 @@ public void wholeQuote() throws Exception { String queryString = "QUOTE:(im gonna make him an offer he cant refuse) #EXCERPT_FIELDS(QUOTE/20)"; - Set goodResults = Sets.newHashSet("HIT_EXCERPT:[im] [gonna] [make] [him] [an] [offer] [he] [cant] [refuse]: : [] 9223372036854775807 false"); + Set goodResults = new HashSet<>( + Set.of("HIT_EXCERPT:[im] [gonna] [make] [him] [an] [offer] [he] [cant] [refuse]: : [] 9223372036854775807 false")); runTestQuery(queryString, format.parse("19000101"), format.parse("20240101"), extraParameters, goodResults); } @@ -373,7 +373,7 @@ public void anotherFirstTerm() throws Exception { // "if" is the first term for one event String queryString = "QUOTE:(if) #EXCERPT_FIELDS(QUOTE/3)"; - Set goodResults = Sets.newHashSet("UUID.0:SOPRANO", "HIT_EXCERPT:[if] you can quote: : [] 9223372036854775807 false"); + Set goodResults = new HashSet<>(Set.of("UUID.0:SOPRANO", "HIT_EXCERPT:[if] you can quote: : [] 9223372036854775807 false")); runTestQuery(queryString, format.parse("19000101"), format.parse("20240101"), extraParameters, goodResults); } @@ -389,7 +389,7 @@ public void anotherFirstTermBeforeTest() throws Exception { // "if" is the first term for one event String queryString = "QUOTE:(if) #EXCERPT_FIELDS(QUOTE/3/before)"; - Set goodResults = Sets.newHashSet("UUID.0:SOPRANO", "HIT_EXCERPT:[if]: : [] 9223372036854775807 false"); + Set goodResults = new HashSet<>(Set.of("UUID.0:SOPRANO", "HIT_EXCERPT:[if]: : [] 9223372036854775807 false")); runTestQuery(queryString, format.parse("19000101"), format.parse("20240101"), extraParameters, goodResults); } @@ -405,7 +405,7 @@ public void anotherFirstTermAfterTest() throws Exception { // "if" is the 
first term for one event String queryString = "QUOTE:(if) #EXCERPT_FIELDS(QUOTE/3/after)"; - Set goodResults = Sets.newHashSet("UUID.0:SOPRANO", "HIT_EXCERPT:[if] you can quote: : [] 9223372036854775807 false"); + Set goodResults = new HashSet<>(Set.of("UUID.0:SOPRANO", "HIT_EXCERPT:[if] you can quote: : [] 9223372036854775807 false")); runTestQuery(queryString, format.parse("19000101"), format.parse("20240101"), extraParameters, goodResults); } diff --git a/warehouse/query-core/src/test/java/datawave/query/iterator/logic/TermFrequencyExcerptIteratorTest.java b/warehouse/query-core/src/test/java/datawave/query/iterator/logic/TermFrequencyExcerptIteratorTest.java index 3615acc9d44..0cd801341f7 100644 --- a/warehouse/query-core/src/test/java/datawave/query/iterator/logic/TermFrequencyExcerptIteratorTest.java +++ b/warehouse/query-core/src/test/java/datawave/query/iterator/logic/TermFrequencyExcerptIteratorTest.java @@ -1,5 +1,8 @@ package datawave.query.iterator.logic; +import static datawave.query.iterator.logic.TermFrequencyExcerptIterator.Configuration.END_OFFSET; +import static datawave.query.iterator.logic.TermFrequencyExcerptIterator.Configuration.FIELD_NAME; +import static datawave.query.iterator.logic.TermFrequencyExcerptIterator.Configuration.START_OFFSET; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertTrue; @@ -51,11 +54,15 @@ public class TermFrequencyExcerptIteratorTest extends EasyMockSupport { private final TermFrequencyExcerptIterator iterator = new TermFrequencyExcerptIterator(); @BeforeClass - public static void beforeClass() throws Exception { + public static void beforeClass() { givenData("email", "123.456.789", "BODY", "the quick brown fox jumped over the lazy dog "); givenData("email", "123.456.789", "CONTENT", "there is no greater divide in fandoms than that between star wars and star trek fans"); givenData("scan", "987.654.321", "TITLE", "document scan 12345"); givenData("scan", "987.654.321", "CONTENT", "we've been trying to reach you about your car warranty"); + givenData("email", "111.222.333", "BODY", "the coldest tale ever told"); + givenData("email", "111.222.333", "CONTENT", "somewhere far along the street they lost their soul to a person so mean"); + givenData("email", "333.222.111", "BODY", "we like to repeat stuff do not ask questions we like to repeat stuff"); + multiWordsAtOneOffsetBuilder(); } private static void givenData(String datatype, String uid, String fieldName, String phrase) { @@ -66,7 +73,7 @@ private static void givenData(String datatype, String uid, String fieldName, Str // @formatter:off TermWeight.Info info = TermWeight.Info.newBuilder() .addAllTermOffset(termIndexes.get(term)) - .addScore(10000000) + .addScore(-1) .addPrevSkips(0) .setZeroOffsetMatch(true) .build(); @@ -78,6 +85,43 @@ private static void givenData(String datatype, String uid, String fieldName, Str } } + private static void multiWordsAtOneOffsetBuilder() { + Multimap termIndexes1 = getIndexes("one two three four five six seven eight nine ten"); + Multimap termIndexes2 = getIndexes("uno dos tres quatro cinco seis siete ocho nueve diez"); + int[] scores1 = {1560219, 1412017, 1592973, 2114938, 2124947, 2165412, 1215740, 1126708, 1273153, 149462}; + int[] scores2 = {1222082, 1748249, 153222, 1257611, 1235987, 1687421, 1243801, 213722, 1600256, 2171307}; + List scoreList = new ArrayList<>(); + scoreList.add(scores1); + scoreList.add(scores2); + ArrayList> indexesList = new ArrayList<>(); + 
indexesList.add(termIndexes1); + indexesList.add(termIndexes2); + int i; + for (int j = 0; j < indexesList.size(); j++) { + Multimap indexes = indexesList.get(j); + int[] scores = scoreList.get(j); + i = 0; + for (String term : indexes.keySet()) { + NormalizedFieldAndValue nfv = new NormalizedFieldAndValue("BODY", term); + Text colq = new Text("email" + Constants.NULL + "888.777.666" + Constants.NULL + nfv.getIndexedFieldValue() + Constants.NULL + + nfv.getIndexedFieldName()); + // @formatter:off + TermWeight.Info info = TermWeight.Info.newBuilder() + .addAllTermOffset(indexes.get(term)) + .addScore(scores[i]) + .addPrevSkips(0) + .setZeroOffsetMatch(false) + .build(); + // @formatter:on + Key key = new Key(row, colf, colq, new ColumnVisibility("ALL"), new Date().getTime()); + Value value = new Value(info.toByteArray()); + Map.Entry entry = new AbstractMap.SimpleEntry<>(key, value); + source.add(entry); + i++; + } + } + } + private static Multimap getIndexes(String phrase) { String[] terms = phrase.split(" "); Multimap map = ArrayListMultimap.create(); @@ -88,10 +132,20 @@ private static Multimap getIndexes(String phrase) { } @After - public void tearDown() throws Exception { + public void tearDown() { options.clear(); } + private void givenOptions(String field, int start, int end) { + options.put(FIELD_NAME, field); + options.put(START_OFFSET, String.valueOf(start)); + options.put(END_OFFSET, String.valueOf(end)); + } + + private void initIterator() throws IOException { + iterator.init(new SortedListKeyValueIterator(source), options, env); + } + /** * Verify that the expected phrase is found for the typical usage case of this iterator. */ @@ -103,6 +157,7 @@ public void testMatchFound() throws IOException { Key startKey = new Key(row, new Text("email" + Constants.NULL + "123.456.789")); Range range = new Range(startKey, true, startKey.followingKey(PartialKey.ROW_COLFAM), false); + iterator.setHitTermsList(new ArrayList<>(List.of(""))); iterator.seek(range, Collections.emptyList(), false); assertTrue(iterator.hasTop()); @@ -110,7 +165,8 @@ public void testMatchFound() throws IOException { Key topKey = iterator.getTopKey(); assertEquals(row, topKey.getRow()); assertEquals(new Text("email" + Constants.NULL + "123.456.789"), topKey.getColumnFamily()); - assertEquals(new Text("BODY" + Constants.NULL + "quick brown fox jumped"), topKey.getColumnQualifier()); + assertEquals(new Text("BODY" + Constants.NULL + "XXXNOTSCOREDXXX" + Constants.NULL + "quick brown fox jumped" + Constants.NULL + "XXXNOTSCOREDXXX"), + topKey.getColumnQualifier()); } /** @@ -124,6 +180,7 @@ public void testOffsetRangeOutsideBounds() throws IOException { Key startKey = new Key(row, new Text("email" + Constants.NULL + "123.456.789")); Range range = new Range(startKey, true, startKey.followingKey(PartialKey.ROW_COLFAM), false); + iterator.setHitTermsList(new ArrayList<>(List.of(""))); iterator.seek(range, Collections.emptyList(), false); assertTrue(iterator.hasTop()); @@ -131,7 +188,8 @@ public void testOffsetRangeOutsideBounds() throws IOException { Key topKey = iterator.getTopKey(); assertEquals(row, topKey.getRow()); assertEquals(new Text("email" + Constants.NULL + "123.456.789"), topKey.getColumnFamily()); - assertEquals(new Text("CONTENT" + Constants.NULL + "there is no greater divide in fandoms than that between star wars and star trek fans"), + assertEquals(new Text("CONTENT" + Constants.NULL + "XXXNOTSCOREDXXX" + Constants.NULL + + "there is no greater divide in fandoms than that between star wars and star trek fans" 
+ Constants.NULL + "XXXNOTSCOREDXXX"), topKey.getColumnQualifier()); } @@ -146,6 +204,7 @@ public void testMatchingStartAndEndOffset() throws IOException { Key startKey = new Key(row, new Text("email" + Constants.NULL + "123.456.789")); Range range = new Range(startKey, true, startKey.followingKey(PartialKey.ROW_COLFAM), false); + iterator.setHitTermsList(new ArrayList<>(List.of(""))); iterator.seek(range, Collections.emptyList(), false); assertTrue(iterator.hasTop()); @@ -153,7 +212,8 @@ public void testMatchingStartAndEndOffset() throws IOException { Key topKey = iterator.getTopKey(); assertEquals(row, topKey.getRow()); assertEquals(new Text("email" + Constants.NULL + "123.456.789"), topKey.getColumnFamily()); - assertEquals(new Text("CONTENT" + Constants.NULL), topKey.getColumnQualifier()); + assertEquals(new Text("CONTENT" + Constants.NULL + "XXXNOTSCOREDXXX" + Constants.NULL + "YOUR EXCERPT WAS BLANK! Maybe bad field or size?" + + Constants.NULL + "XXXNOTSCOREDXXX"), topKey.getColumnQualifier()); } /** @@ -167,12 +227,14 @@ public void testNoMatchFoundForField() throws IOException { Key startKey = new Key(row, new Text("email" + Constants.NULL + "123.456.789")); Range range = new Range(startKey, true, startKey.followingKey(PartialKey.ROW_COLFAM), false); + iterator.setHitTermsList(new ArrayList<>(List.of(""))); iterator.seek(range, Collections.emptyList(), false); Key topKey = iterator.getTopKey(); assertEquals(row, topKey.getRow()); assertEquals(new Text("email" + Constants.NULL + "123.456.789"), topKey.getColumnFamily()); - assertEquals(new Text("BAD_FIELD" + Constants.NULL), topKey.getColumnQualifier()); + assertEquals(new Text("BAD_FIELD" + Constants.NULL + "XXXNOTSCOREDXXX" + Constants.NULL + "YOUR EXCERPT WAS BLANK! Maybe bad field or size?" 
+ + Constants.NULL + "XXXNOTSCOREDXXX"), topKey.getColumnQualifier()); } /** @@ -186,6 +248,7 @@ public void testNoMatchFoundForDataTypeAndUid() throws IOException { Key startKey = new Key(row, new Text("other" + Constants.NULL + "111.111.111")); Range range = new Range(startKey, true, startKey.followingKey(PartialKey.ROW_COLFAM), false); + iterator.setHitTermsList(new ArrayList<>(List.of(""))); iterator.seek(range, Collections.emptyList(), false); assertFalse(iterator.hasTop()); @@ -201,14 +264,277 @@ public void testStartOffsetGreaterThanEndOffset() { Assert.assertThrows("End offset must be greater than start offset", IllegalArgumentException.class, () -> iterator.validateOptions(options)); } - private void givenOptions(String field, int start, int end) { - options.put(TermFrequencyExcerptIterator.FIELD_NAME, field); - options.put(TermFrequencyExcerptIterator.START_OFFSET, String.valueOf(start)); - options.put(TermFrequencyExcerptIterator.END_OFFSET, String.valueOf(end)); + @Test + public void testMatchFoundWithRemovedStoplistWord() throws IOException { + givenOptions("BODY", 1, 5); + initIterator(); + + Key startKey = new Key(row, new Text("email" + Constants.NULL + "111.222.333")); + Range range = new Range(startKey, true, startKey.followingKey(PartialKey.ROW_COLFAM), false); + + iterator.setHitTermsList(new ArrayList<>(List.of(""))); + iterator.setTrimExcerpt(true); + iterator.setOrigHalfSize(20); + iterator.seek(range, Collections.emptyList(), false); + + assertTrue(iterator.hasTop()); + + Key topKey = iterator.getTopKey(); + assertEquals(row, topKey.getRow()); + assertEquals(new Text("email" + Constants.NULL + "111.222.333"), topKey.getColumnFamily()); + assertEquals(new Text("BODY" + Constants.NULL + "XXXNOTSCOREDXXX" + Constants.NULL + "coldest tale ever" + Constants.NULL + "XXXNOTSCOREDXXX"), + topKey.getColumnQualifier()); } - private void initIterator() throws IOException { - // noinspection unchecked - iterator.init(new SortedListKeyValueIterator(source), options, env); + @Test + public void testMatchFoundWithStoplistWordAndOutOfBoundsRange() throws IOException { + givenOptions("CONTENT", -10, 21); + initIterator(); + + Key startKey = new Key(row, new Text("email" + Constants.NULL + "111.222.333")); + Range range = new Range(startKey, true, startKey.followingKey(PartialKey.ROW_COLFAM), false); + + iterator.setHitTermsList(new ArrayList<>(List.of(""))); + iterator.setTrimExcerpt(true); + iterator.setOrigHalfSize(20); + iterator.seek(range, Collections.emptyList(), false); + + assertTrue(iterator.hasTop()); + + Key topKey = iterator.getTopKey(); + assertEquals(row, topKey.getRow()); + assertEquals(new Text("email" + Constants.NULL + "111.222.333"), topKey.getColumnFamily()); + assertEquals(new Text("CONTENT" + Constants.NULL + "XXXNOTSCOREDXXX" + Constants.NULL + + "somewhere far along the street they lost their soul to a person so mean" + Constants.NULL + "XXXNOTSCOREDXXX"), + topKey.getColumnQualifier()); + } + + @Test + public void testBracketsAroundSingleHit() throws IOException { + givenOptions("CONTENT", -10, 21); + initIterator(); + + Key startKey = new Key(row, new Text("email" + Constants.NULL + "111.222.333")); + Range range = new Range(startKey, true, startKey.followingKey(PartialKey.ROW_COLFAM), false); + + iterator.setHitTermsList(new ArrayList<>(List.of("street"))); + iterator.setTrimExcerpt(true); + iterator.setOrigHalfSize(20); + iterator.seek(range, Collections.emptyList(), false); + + assertTrue(iterator.hasTop()); + + Key topKey = iterator.getTopKey(); + 
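+        // the column qualifier under test has the layout field\0phraseWithScores\0phraseWithoutScores\0oneBestExcerpt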
+        assertEquals(row, topKey.getRow());
+        assertEquals(new Text("email" + Constants.NULL + "111.222.333"), topKey.getColumnFamily());
+        assertEquals(new Text("CONTENT" + Constants.NULL + "XXXNOTSCOREDXXX" + Constants.NULL
+                        + "somewhere far along the [street] they lost their soul to a person so mean" + Constants.NULL + "XXXNOTSCOREDXXX"),
+                        topKey.getColumnQualifier());
+    }
+
+    @Test
+    public void testBracketsAroundMultipleHit() throws IOException {
+        givenOptions("CONTENT", -10, 21);
+        initIterator();
+
+        Key startKey = new Key(row, new Text("email" + Constants.NULL + "111.222.333"));
+        Range range = new Range(startKey, true, startKey.followingKey(PartialKey.ROW_COLFAM), false);
+
+        iterator.setHitTermsList(new ArrayList<>(List.of("street", "person", "the", "the street")));
+        iterator.setTrimExcerpt(true);
+        iterator.setOrigHalfSize(20);
+        iterator.seek(range, Collections.emptyList(), false);
+
+        assertTrue(iterator.hasTop());
+
+        Key topKey = iterator.getTopKey();
+        assertEquals(row, topKey.getRow());
+        assertEquals(new Text("email" + Constants.NULL + "111.222.333"), topKey.getColumnFamily());
+        assertEquals(new Text("CONTENT" + Constants.NULL + "XXXNOTSCOREDXXX" + Constants.NULL
+                        + "somewhere far along [the street] they lost their soul to a [person] so mean" + Constants.NULL + "XXXNOTSCOREDXXX"),
+                        topKey.getColumnQualifier());
+    }
+
+    @Test
+    public void testDirectionBefore() throws IOException {
+        givenOptions("CONTENT", -10, 21);
+        initIterator();
+
+        Key startKey = new Key(row, new Text("email" + Constants.NULL + "111.222.333"));
+        Range range = new Range(startKey, true, startKey.followingKey(PartialKey.ROW_COLFAM), false);
+
+        iterator.setHitTermsList(new ArrayList<>(List.of("street", "person", "the", "the street")));
+        iterator.setDirection("BEFORE");
+        iterator.setTrimExcerpt(true);
+        iterator.setOrigHalfSize(20);
+        iterator.seek(range, Collections.emptyList(), false);
+
+        assertTrue(iterator.hasTop());
+
+        Key topKey = iterator.getTopKey();
+        assertEquals(row, topKey.getRow());
+        assertEquals(new Text("email" + Constants.NULL + "111.222.333"), topKey.getColumnFamily());
+        assertEquals(new Text("CONTENT" + Constants.NULL + "XXXNOTSCOREDXXX" + Constants.NULL
+                        + "somewhere far along [the street] they lost their soul to a [person]" + Constants.NULL + "XXXNOTSCOREDXXX"),
+                        topKey.getColumnQualifier());
+    }
+
+    @Test
+    public void testDirectionAfter() throws IOException {
+        givenOptions("CONTENT", -10, 21);
+        initIterator();
+
+        Key startKey = new Key(row, new Text("email" + Constants.NULL + "111.222.333"));
+        Range range = new Range(startKey, true, startKey.followingKey(PartialKey.ROW_COLFAM), false);
+
+        iterator.setHitTermsList(new ArrayList<>(List.of("street", "person", "the", "the street")));
+        iterator.setDirection("AFTER");
+        iterator.setTrimExcerpt(true);
+        iterator.setOrigHalfSize(20);
+        iterator.seek(range, Collections.emptyList(), false);
+
+        assertTrue(iterator.hasTop());
+
+        Key topKey = iterator.getTopKey();
+        assertEquals(row, topKey.getRow());
+        assertEquals(new Text("email" + Constants.NULL + "111.222.333"), topKey.getColumnFamily());
+        assertEquals(new Text("CONTENT" + Constants.NULL + "XXXNOTSCOREDXXX" + Constants.NULL + "[the street] they lost their soul to a [person] so mean"
+                        + Constants.NULL + "XXXNOTSCOREDXXX"), topKey.getColumnQualifier());
+    }
+
+    @Test
+    public void testTrimBefore() throws IOException {
+        givenOptions("CONTENT", -10, 21);
+        initIterator();
+
+        Key startKey = new Key(row, new Text("email" + Constants.NULL + "111.222.333"));
+        Range range = new Range(startKey, true, startKey.followingKey(PartialKey.ROW_COLFAM), false);
+
+        iterator.setHitTermsList(new ArrayList<>(List.of("street")));
+        iterator.setDirection("BEFORE");
+        iterator.setOrigHalfSize(1);
+        iterator.setTrimExcerpt(true);
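+        // The trim expectations in these tests suggest trimming keeps roughly 2 * origHalfSize
+        // words on the requested side of the hit (BEFORE/AFTER), or origHalfSize words on each
+        // side for BOTH; this is inferred from the expected excerpts, not from the iterator.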
+        iterator.seek(range, Collections.emptyList(), false);
+
+        assertTrue(iterator.hasTop());
+
+        Key topKey = iterator.getTopKey();
+        assertEquals(row, topKey.getRow());
+        assertEquals(new Text("email" + Constants.NULL + "111.222.333"), topKey.getColumnFamily());
+        assertEquals(new Text("CONTENT" + Constants.NULL + "XXXNOTSCOREDXXX" + Constants.NULL + "along the [street]" + Constants.NULL + "XXXNOTSCOREDXXX"),
+                        topKey.getColumnQualifier());
+    }
+
+    @Test
+    public void testTrimAfter() throws IOException {
+        givenOptions("CONTENT", -10, 21);
+        initIterator();
+
+        Key startKey = new Key(row, new Text("email" + Constants.NULL + "111.222.333"));
+        Range range = new Range(startKey, true, startKey.followingKey(PartialKey.ROW_COLFAM), false);
+
+        iterator.setHitTermsList(new ArrayList<>(List.of("street")));
+        iterator.setDirection("AFTER");
+        iterator.setOrigHalfSize(1);
+        iterator.setTrimExcerpt(true);
+        iterator.seek(range, Collections.emptyList(), false);
+
+        assertTrue(iterator.hasTop());
+
+        Key topKey = iterator.getTopKey();
+        assertEquals(row, topKey.getRow());
+        assertEquals(new Text("email" + Constants.NULL + "111.222.333"), topKey.getColumnFamily());
+        assertEquals(new Text("CONTENT" + Constants.NULL + "XXXNOTSCOREDXXX" + Constants.NULL + "[street] they lost" + Constants.NULL + "XXXNOTSCOREDXXX"),
+                        topKey.getColumnQualifier());
+    }
+
+    @Test
+    public void testTrimBoth() throws IOException {
+        givenOptions("CONTENT", -10, 21);
+        initIterator();
+
+        Key startKey = new Key(row, new Text("email" + Constants.NULL + "111.222.333"));
+        Range range = new Range(startKey, true, startKey.followingKey(PartialKey.ROW_COLFAM), false);
+
+        iterator.setHitTermsList(new ArrayList<>(List.of("street")));
+        iterator.setDirection("BOTH");
+        iterator.setOrigHalfSize(1);
+        iterator.setTrimExcerpt(true);
+        iterator.seek(range, Collections.emptyList(), false);
+
+        assertTrue(iterator.hasTop());
+
+        Key topKey = iterator.getTopKey();
+        assertEquals(row, topKey.getRow());
+        assertEquals(new Text("email" + Constants.NULL + "111.222.333"), topKey.getColumnFamily());
+        assertEquals(new Text("CONTENT" + Constants.NULL + "XXXNOTSCOREDXXX" + Constants.NULL + "the [street] they" + Constants.NULL + "XXXNOTSCOREDXXX"),
+                        topKey.getColumnQualifier());
+    }
+
+    @Test
+    public void testQuickFailStopWordFound() throws IOException {
+        givenOptions("BODY", 1, 5);
+        initIterator();
+
+        Key startKey = new Key(row, new Text("email" + Constants.NULL + "111.222.333"));
+        Range range = new Range(startKey, true, startKey.followingKey(PartialKey.ROW_COLFAM), false);
+
+        iterator.setHitTermsList(new ArrayList<>(List.of("")));
+        iterator.seek(range, Collections.emptyList(), false);
+
+        assertTrue(iterator.hasTop());
+
+        Key topKey = iterator.getTopKey();
+        assertEquals(row, topKey.getRow());
+        assertEquals(new Text("email" + Constants.NULL + "111.222.333"), topKey.getColumnFamily());
+        assertEquals(new Text(
+                        "BODY" + Constants.NULL + "XXXWESKIPPEDAWORDXXX" + Constants.NULL + "XXXWESKIPPEDAWORDXXX" + Constants.NULL + "XXXWESKIPPEDAWORDXXX"),
+                        topKey.getColumnQualifier());
+        String[] parts = topKey.getColumnQualifier().toString().split(Constants.NULL);
+        assertEquals(4, parts.length);
+    }
+
+    @Test
+    public void testBracketsAroundMultipleHitMultiplePhrases() throws IOException {
+        givenOptions("BODY", -10, 37);
+        initIterator();
+
+        Key startKey = new Key(row, new Text("email" + Constants.NULL + "333.222.111"));
+        Range range = new Range(startKey, true, startKey.followingKey(PartialKey.ROW_COLFAM), false);
+
+        iterator.setHitTermsList(new ArrayList<>(List.of("like to repeat", "ask questions", "stuff")));
+        iterator.setTrimExcerpt(true);
+        iterator.setOrigHalfSize(20);
+        iterator.seek(range, Collections.emptyList(), false);
+
+        assertTrue(iterator.hasTop());
+
+        Key topKey = iterator.getTopKey();
+        assertEquals(row, topKey.getRow());
+        assertEquals(new Text("email" + Constants.NULL + "333.222.111"), topKey.getColumnFamily());
+        assertEquals(new Text("BODY" + Constants.NULL + "XXXNOTSCOREDXXX" + Constants.NULL
+                        + "we [like to repeat] [stuff] do not [ask questions] we [like to repeat] [stuff]" + Constants.NULL + "XXXNOTSCOREDXXX"),
+                        topKey.getColumnQualifier());
+    }
+
+    @Test
+    public void multiWordAtOneOffsetWithScores() throws IOException {
+        givenOptions("BODY", 0, 11);
+        initIterator();
+
+        Key startKey = new Key(row, new Text("email" + Constants.NULL + "888.777.666"));
+        Range range = new Range(startKey, true, startKey.followingKey(PartialKey.ROW_COLFAM), false);
+
+        iterator.setHitTermsList(new ArrayList<>(List.of("four", "cinco")));
+        iterator.seek(range, Collections.emptyList(), false);
+
+        assertTrue(iterator.hasTop());
+
+        Key topKey = iterator.getTopKey();
+        assertEquals(row, topKey.getRow());
+        assertEquals(new Text("email" + Constants.NULL + "888.777.666"), topKey.getColumnFamily());
+        assertEquals(new Text("BODY" + Constants.NULL + "one(85) dos(88) three(81) [four(89)] [cinco(88)] seis(88) siete(88) ocho(98) nine(86) diez(98)"
+                        + Constants.NULL + "one dos three [four(89)] [cinco(88)] seis siete ocho nine diez" + Constants.NULL
+                        + "one dos three [four] five seis siete ocho nine diez"), topKey.getColumnQualifier());
+    }
+}
diff --git a/warehouse/query-core/src/test/java/datawave/query/iterator/logic/WordsAndScoresTest.java b/warehouse/query-core/src/test/java/datawave/query/iterator/logic/WordsAndScoresTest.java
new file mode 100644
index 00000000000..51ba3db1589
--- /dev/null
+++ b/warehouse/query-core/src/test/java/datawave/query/iterator/logic/WordsAndScoresTest.java
@@ -0,0 +1,268 @@
+package datawave.query.iterator.logic;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertNotEquals;
+import static org.junit.Assert.assertNull;
+import static org.junit.Assert.assertThrows;
+import static org.junit.Assert.assertTrue;
+import static org.junit.jupiter.api.Assertions.assertDoesNotThrow;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.junit.Test;
+
+public class WordsAndScoresTest {
+
+    List<String> hitTermsList = List.of("hit");
+
+    @Test
+    public void testSingleWordScoreAdd() {
+        WordsAndScores ws = new WordsAndScores();
+        ws.addTerm("test", 100, hitTermsList);
+        assertEquals("test(100)", ws.getWordToOutput());
+        assertTrue(ws.getUseScores());
+    }
+
+    @Test
+    public void testSingleWordAdd() {
+        WordsAndScores ws = new WordsAndScores();
+        ws.addTerm("test", hitTermsList);
+        assertEquals("test", ws.getWordToOutput());
+        assertFalse(ws.getUseScores());
+    }
+
+    @Test
+    public void testReturnSmallestScore() {
+        WordsAndScores ws = new WordsAndScores();
+        ws.addTerm("test", 37654470, hitTermsList);
+        ws.addTerm("austin", 47325112, hitTermsList);
+        ws.addTerm("was", 26381694, hitTermsList);
+        ws.addTerm("here", 49883548, hitTermsList);
+        ws.addTerm("datawave", 24734968, hitTermsList);
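+        // The expected outputs in this file are consistent with raw TermWeight scores being
+        // displayed as round(100 * e^(-raw / 1e7)), e.g. 4999951 -> 61, so the lowest raw score
+        // is the best one and wins the offset (inferred from the expectations, not the implementation).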
ws.addTerm("cat", 4999951, hitTermsList); + assertEquals("cat(61)", ws.getWordToOutput()); + assertTrue(ws.getUseScores()); + } + + @Test + public void testReturnLongestWord() { + WordsAndScores ws = new WordsAndScores(); + ws.addTerm("test", hitTermsList); + ws.addTerm("austin", hitTermsList); + ws.addTerm("was", hitTermsList); + ws.addTerm("here", hitTermsList); + ws.addTerm("datawave", hitTermsList); + ws.addTerm("cat", hitTermsList); + assertEquals("datawave", ws.getWordToOutput()); + assertFalse(ws.getUseScores()); + } + + @Test + public void testReturnMixedAddScoreFirst() { + WordsAndScores ws = new WordsAndScores(); + ws.addTerm("test", 3835388, hitTermsList); + ws.addTerm("austin", hitTermsList); + ws.addTerm("was", hitTermsList); + ws.addTerm("here", 5239977, hitTermsList); + ws.addTerm("datawave", hitTermsList); + ws.addTerm("cat", 9707535, hitTermsList); + assertEquals("test(68)", ws.getWordToOutput()); + assertTrue(ws.getUseScores()); + } + + @Test + public void testReturnMixedAddNoScoreFirst() { + WordsAndScores ws = new WordsAndScores(); + ws.addTerm("test", hitTermsList); + ws.addTerm("austin", 22255500, hitTermsList); + ws.addTerm("was", hitTermsList); + ws.addTerm("here", 730921, hitTermsList); + ws.addTerm("datawave", hitTermsList); + ws.addTerm("cat", 11232252, hitTermsList); + assertEquals("here(93)", ws.getWordToOutput()); + assertTrue(ws.getUseScores()); + } + + @Test + public void testReturnAddNegativeScores() { + WordsAndScores ws = new WordsAndScores(); + ws.addTerm("test", hitTermsList); + ws.addTerm("austin", -1, hitTermsList); + ws.addTerm("was", hitTermsList); + ws.addTerm("here", -1, hitTermsList); + ws.addTerm("datawave", hitTermsList); + ws.addTerm("cat", -1, hitTermsList); + assertEquals("datawave", ws.getWordToOutput()); + assertFalse(ws.getUseScores()); + } + + @Test + public void testGetWordWithNothingAdded() { + WordsAndScores ws = new WordsAndScores(); + assertEquals("REPORTMETODATAWAVE", ws.getWordToOutput()); + } + + @Test + public void testSetWordsList() { + WordsAndScores ws = new WordsAndScores(); + ws.addTerm("add", hitTermsList); + ws.addTerm("more", hitTermsList); + ws.addTerm("things", hitTermsList); + int before = ws.getScoresList().size(); + ws.setWordsList(new ArrayList<>(List.of("stuff", "things")), hitTermsList); + assertNotEquals(before, ws.getScoresList().size()); + ws.addTerm("three", hitTermsList); + assertEquals(before, ws.getScoresList().size()); + assertFalse(ws.getUseScores()); + } + + @Test + public void testSetWordsAndScoresLists() { + WordsAndScores ws = new WordsAndScores(); + ws.addTerm("add", 86, hitTermsList); + ws.addTerm("more", 6, hitTermsList); + ws.addTerm("things", 34, hitTermsList); + assertThrows(IllegalArgumentException.class, + () -> ws.setWordsAndScoresList((new ArrayList<>(List.of("stuff", "things"))), new ArrayList<>(List.of(1, 2, 3)), hitTermsList)); + assertDoesNotThrow(() -> ws.setWordsAndScoresList((new ArrayList<>(List.of("stuff", "things"))), new ArrayList<>(List.of(1, 2)), hitTermsList)); + assertTrue(ws.getUseScores()); + assertDoesNotThrow(() -> ws.setWordsAndScoresList((new ArrayList<>(List.of("stuff", "things"))), new ArrayList<>(List.of(-1, -1)), hitTermsList)); + assertFalse(ws.getUseScores()); + } + + @Test + public void testReturnSingleHit() { + WordsAndScores ws = new WordsAndScores(); + ws.addTerm("hit", hitTermsList); + assertEquals("[hit]", ws.getWordToOutput()); + ws.addTerm("try", hitTermsList); + assertEquals("[hit]", ws.getWordToOutput()); + ws.reset(); + ws.addTerm("hello", 
hitTermsList); + assertEquals("hello", ws.getWordToOutput()); + ws.addTerm("hit", hitTermsList); + assertEquals("[hit]", ws.getWordToOutput()); + } + + @Test + public void testReturnSingleHitWithScore() { + WordsAndScores ws = new WordsAndScores(); + ws.addTerm("hit", 9120447, hitTermsList); + assertEquals("[hit(40)]", ws.getWordToOutput()); + ws.addTerm("try", 41315662, hitTermsList); + assertEquals("[hit(40)]", ws.getWordToOutput()); + ws.reset(); + ws.addTerm("hello", 31334736, hitTermsList); + assertEquals("hello(4)", ws.getWordToOutput()); + ws.addTerm("hit", 29938894, hitTermsList); + assertEquals("[hit(5)]", ws.getWordToOutput()); + } + + @Test + public void testReturnSingleHitMixedScore() { + WordsAndScores ws = new WordsAndScores(); + ws.addTerm("test", hitTermsList); + ws.addTerm("austin", 41096759, hitTermsList); + ws.addTerm("hit", hitTermsList); + ws.addTerm("here", 33072572, hitTermsList); + assertEquals("[hit]", ws.getWordToOutput()); + ws.reset(); + ws.addTerm("test", 21719522, hitTermsList); + ws.addTerm("austin", hitTermsList); + ws.addTerm("hit", hitTermsList); + ws.addTerm("here", 43027819, hitTermsList); + assertEquals("[hit]", ws.getWordToOutput()); + } + + @Test + public void testReturnScoreNotLongestMultipleHit() { + WordsAndScores ws = new WordsAndScores(); + List temp = List.of("hit", "term"); + ws.addTerm("test", 49703356, temp); + ws.addTerm("hit", 33698372, temp); + ws.addTerm("was", temp); + ws.addTerm("here", 15201307, temp); + ws.addTerm("term", temp); + ws.addTerm("cat", temp); + assertEquals("[hit(3)]", ws.getWordToOutput()); + } + + @Test + public void testReturnScoreMultipleHit() { + WordsAndScores ws = new WordsAndScores(); + List temp = List.of("hit", "term"); + ws.addTerm("test", 6788348, temp); + ws.addTerm("hit", 15257973, temp); + ws.addTerm("was", temp); + ws.addTerm("here", 17286266, temp); + ws.addTerm("term", 37536662, temp); + ws.addTerm("cat", temp); + assertEquals("[hit(22)]", ws.getWordToOutput()); + } + + @Test + public void testReturnStopWord() { + WordsAndScores ws = new WordsAndScores(); + List temp = List.of(""); + ws.addTerm("", 5, temp); + ws.addTerm("hi", 5, temp); + assertNull(ws.getWordToOutput()); + } + + @Test + public void testOverride() { + WordsAndScores ws = new WordsAndScores(); + ws.addTerm("test", hitTermsList); + ws.addTerm("austin", 41096759, hitTermsList); + ws.addTerm("hit", hitTermsList); + ws.addTerm("here", 33072572, hitTermsList); + ws.setOverride(3, 1); + assertEquals("[here(4)", ws.getWordToOutput()); + ws.reset(); + ws.addTerm("test", 21719522, hitTermsList); + ws.addTerm("austin", hitTermsList); + ws.addTerm("hit", hitTermsList); + ws.addTerm("here", 43027819, hitTermsList); + ws.setOverride(1, 2); + assertEquals("austin", ws.getWordToOutput()); + ws.reset(); + ws.addTerm("test", 21719522, hitTermsList); + ws.addTerm("austin", hitTermsList); + ws.addTerm("hit", hitTermsList); + ws.addTerm("here", 43027819, hitTermsList); + ws.setOverride(3, 3); + assertEquals("here(1)]", ws.getWordToOutput()); + } + + @Test + public void testOnebestExcerpt() { + WordsAndScores ws = new WordsAndScores(); + ws.setOneBestExcerpt(true); + ws.addTerm("test", 37654470, hitTermsList); + ws.addTerm("austin", 47325112, hitTermsList); + ws.addTerm("was", 26381694, hitTermsList); + ws.addTerm("here", 49883548, hitTermsList); + ws.addTerm("datawave", 24734968, hitTermsList); + ws.addTerm("hit", 4999951, hitTermsList); + assertEquals("[hit]", ws.getWordToOutput()); + ws.addTerm("cat", 12548, hitTermsList); + assertEquals("cat", 
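+        // In one-best mode a single candidate survives per offset: the hit term stays (bracketed)
+        // while it has the lowest raw score, and is displaced once a later term scores better,
+        // as the expectations below show.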
+        ws.addTerm("test", 37654470, hitTermsList);
+        ws.addTerm("austin", 47325112, hitTermsList);
+        ws.addTerm("was", 26381694, hitTermsList);
+        ws.addTerm("here", 49883548, hitTermsList);
+        ws.addTerm("datawave", 24734968, hitTermsList);
+        ws.addTerm("hit", 4999951, hitTermsList);
+        assertEquals("[hit]", ws.getWordToOutput());
+        ws.addTerm("cat", 12548, hitTermsList);
+        assertEquals("cat", ws.getWordToOutput());
+    }
+
+    @Test
+    public void testOutputScores() {
+        WordsAndScores ws = new WordsAndScores();
+        ws.addTerm("test", 37654470, hitTermsList);
+        ws.addTerm("austin", 47325112, hitTermsList);
+        ws.addTerm("was", 26381694, hitTermsList);
+        ws.addTerm("here", 49883548, hitTermsList);
+        ws.addTerm("datawave", 24734968, hitTermsList);
+        ws.addTerm("cat", 4999951, hitTermsList);
+        ws.setOutputScores(false);
+        assertEquals("cat", ws.getWordToOutput());
+        ws.setOutputScores(true);
+        assertEquals("cat(61)", ws.getWordToOutput());
+    }
+}
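A minimal usage sketch distilled from the tests above (only methods exercised in WordsAndScoresTest are used; the rendered output is an assumption based on the expected values in this file, e.g. a raw score of 4999951 displaying as "(61)"):

    WordsAndScores ws = new WordsAndScores();                  // accumulates the candidate terms for one token offset
    java.util.List<String> hits = java.util.List.of("street"); // terms the query actually hit
    ws.addTerm("along", hits);                                 // unscored candidate
    ws.addTerm("street", 4999951, hits);                       // scored candidate that is also a hit term
    // Mixing scored and unscored terms keeps scoring enabled, and the hit term wins the offset,
    // so this should render the hit bracketed with its score, e.g. "[street(61)]".
    String out = ws.getWordToOutput();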
diff --git a/warehouse/query-core/src/test/java/datawave/query/jexl/functions/ContentFunctionsTest.java b/warehouse/query-core/src/test/java/datawave/query/jexl/functions/ContentFunctionsTest.java
index 506e9deab9d..b1c8a10b233 100644
--- a/warehouse/query-core/src/test/java/datawave/query/jexl/functions/ContentFunctionsTest.java
+++ b/warehouse/query-core/src/test/java/datawave/query/jexl/functions/ContentFunctionsTest.java
@@ -1,13 +1,12 @@
 package datawave.query.jexl.functions;
 
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
 import static org.junit.Assert.assertTrue;
 import static org.junit.Assert.fail;
 
-import java.net.URISyntaxException;
 import java.util.ArrayList;
-import java.util.Arrays;
 import java.util.Collection;
-import java.util.Collections;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
@@ -23,15 +22,10 @@
 import org.apache.commons.jexl3.parser.ASTJexlScript;
 import org.apache.commons.jexl3.parser.JexlNode;
 import org.apache.commons.jexl3.parser.ParseException;
-import org.apache.log4j.Logger;
-import org.javatuples.Triplet;
-import org.junit.Assert;
 import org.junit.Before;
 import org.junit.BeforeClass;
 import org.junit.Test;
 
-import com.google.common.collect.Maps;
-import com.google.common.collect.Sets;
 import com.google.common.collect.TreeMultimap;
 
 import datawave.ingest.protobuf.TermWeightPosition;
@@ -43,24 +37,24 @@
 import datawave.query.jexl.functions.TermFrequencyList.Zone;
 import datawave.query.jexl.functions.arguments.JexlArgumentDescriptor;
 import datawave.query.jexl.visitors.JexlStringBuildingVisitor;
+import datawave.query.postprocessing.tf.PhraseOffset;
 import datawave.query.postprocessing.tf.TermOffsetMap;
 import datawave.query.util.MockDateIndexHelper;
 import datawave.query.util.MockMetadataHelper;
 
 public class ContentFunctionsTest {
-    static final Logger log = Logger.getLogger(ContentFunctionsTest.class);
     private static DatawaveJexlEngine engine;
     private JexlContext context;
     private TermOffsetMap termOffSetMap;
-    private String phraseFunction = ContentFunctions.CONTENT_PHRASE_FUNCTION_NAME;
-    private String scoredPhraseFunction = ContentFunctions.CONTENT_SCORED_PHRASE_FUNCTION_NAME;
+    private final String phraseFunction = ContentFunctions.CONTENT_PHRASE_FUNCTION_NAME;
+    private final String scoredPhraseFunction = ContentFunctions.CONTENT_SCORED_PHRASE_FUNCTION_NAME;
     private static final String EVENT_ID = "shard\u0000dt\u0000uid";
-    private String eventId = EVENT_ID;
+    private final String eventId = EVENT_ID;
 
     @BeforeClass
-    public static void setUp() throws URISyntaxException {
+    public static void setUp() {
         Map<String, Object> functions = new HashMap<>(ArithmeticJexlEngines.functions());
         functions.put("f", QueryFunctions.class);
         functions.put("geo", GeoFunctions.class);
@@ -164,12 +158,11 @@ private TermWeightPosition getPosition(int offset, int prevSkips, float score) {
                         .build();
     }
 
-    private void assertPhraseOffset(String field, int startOffset, int endOffset) {
-        Collection<Triplet<String,Integer,Integer>> phraseOffsets = termOffSetMap.getPhraseIndexes(field);
+    private void assertPhraseOffset(String field, final int startOffset, final int endOffset) {
+        Collection<PhraseOffset> phraseOffsets = termOffSetMap.getPhraseIndexes(field);
         boolean found = phraseOffsets.stream()
-                        .anyMatch((pair) -> pair.getValue0().equals(eventId) && pair.getValue1().equals(startOffset) && pair.getValue2().equals(endOffset));
-        Assert.assertTrue(
-                        "Expected phrase offset [" + startOffset + ", " + endOffset + "] for field " + field + " and eventId " + eventId.replace('\u0000', '/'),
+                        .anyMatch(pair -> pair.getEventId().equals(eventId) && pair.getStartOffset() == startOffset && pair.getEndOffset() == endOffset);
+        assertTrue("Expected phrase offset [" + startOffset + ", " + endOffset + "] for field " + field + " and eventId " + eventId.replace('\u0000', '/'),
                         found);
     }
 
@@ -178,7 +171,7 @@ private void assertNoPhraseOffsetsFor(String field) {
     }
 
     private void assertPhraseOffsetsEmpty() {
-        Assert.assertTrue("Expected empty phrase offset map", termOffSetMap.getPhraseIndexes() == null || termOffSetMap.getPhraseIndexes().isEmpty());
+        assertTrue("Expected empty phrase offset map", termOffSetMap.getPhraseIndexes() == null || termOffSetMap.getPhraseIndexes().isEmpty());
     }
 
     @Test
@@ -187,18 +180,18 @@ public void testEvaluation1() {
         JexlExpression expr = engine.createExpression(query);
 
         List<TermWeightPosition> list1, list2;
-        list1 = asList(Arrays.asList(1, 2, 3), Arrays.asList(0, 0, 0));
-        list2 = asList(Arrays.asList(5, 6, 7), Arrays.asList(0, 2, 0)); // match (6-2) should match (3+1)
+        list1 = asList(List.of(1, 2, 3), List.of(0, 0, 0));
+        list2 = asList(List.of(5, 6, 7), List.of(0, 2, 0)); // match (6-2) should match (3+1)
 
-        termOffSetMap.putTermFrequencyList("dog", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list1)));
-        termOffSetMap.putTermFrequencyList("cat", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list2)));
+        termOffSetMap.putTermFrequencyList("dog", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list1)));
+        termOffSetMap.putTermFrequencyList("cat", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list2)));
         termOffSetMap.setGatherPhraseOffsets(true);
-        termOffSetMap.setExcerptFields(Collections.singleton("CONTENT"));
+        termOffSetMap.setExcerptFields(Set.of("CONTENT"));
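+        // Phrase offsets are only gathered for successful matches: every test in this file that
+        // expects a failed evaluation also asserts that the phrase-offset map stayed empty.
+        // Gathering is enabled per query (setGatherPhraseOffsets) and scoped to the fields
+        // registered via setExcerptFields.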
termOffSetMap.putTermFrequencyList("c", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), t3))); + termOffSetMap.putTermFrequencyList("a", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), t1))); + termOffSetMap.putTermFrequencyList("b", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), t2))); + termOffSetMap.putTermFrequencyList("c", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), t3))); termOffSetMap.setGatherPhraseOffsets(true); - termOffSetMap.setExcerptFields(Collections.singleton("CONTENT")); + termOffSetMap.setExcerptFields(Set.of("CONTENT")); context.set(Constants.TERM_OFFSET_MAP_JEXL_VARIABLE_NAME, termOffSetMap); Object o = expr.evaluate(context); - Assert.assertTrue(expect(o, true)); + assertTrue(expect(o, true)); assertPhraseOffset("CONTENT", 252, 253); } @@ -231,20 +224,20 @@ public void forwardSharedTokenIndex() { JexlExpression expr = engine.createExpression(query); List t1, t2, t3; - t1 = asList(Arrays.asList(234, 239, 252, 257, 265, 281, 286, 340, 363, 367), Arrays.asList(0, 0, 0, 0, 0, 0, 0, 0, 0, 0)); - t2 = asList(Arrays.asList(212, 229, 252, 272), Arrays.asList(0, 0, 0, 0)); - t3 = asList(Arrays.asList(1, 101, 202, 213, 251, 312, 336), Arrays.asList(0, 0, 0, 0, 0, 0, 0)); + t1 = asList(List.of(234, 239, 252, 257, 265, 281, 286, 340, 363, 367), List.of(0, 0, 0, 0, 0, 0, 0, 0, 0, 0)); + t2 = asList(List.of(212, 229, 252, 272), List.of(0, 0, 0, 0)); + t3 = asList(List.of(1, 101, 202, 213, 251, 312, 336), List.of(0, 0, 0, 0, 0, 0, 0)); - termOffSetMap.putTermFrequencyList("a", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), t1))); - termOffSetMap.putTermFrequencyList("b", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), t2))); - termOffSetMap.putTermFrequencyList("c", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), t3))); + termOffSetMap.putTermFrequencyList("a", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), t1))); + termOffSetMap.putTermFrequencyList("b", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), t2))); + termOffSetMap.putTermFrequencyList("c", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), t3))); termOffSetMap.setGatherPhraseOffsets(true); - termOffSetMap.setExcerptFields(Collections.singleton("CONTENT")); + termOffSetMap.setExcerptFields(Set.of("CONTENT")); context.set(Constants.TERM_OFFSET_MAP_JEXL_VARIABLE_NAME, termOffSetMap); Object o = expr.evaluate(context); - Assert.assertTrue(expect(o, true)); + assertTrue(expect(o, true)); assertPhraseOffset("CONTENT", 251, 252); } @@ -254,20 +247,20 @@ public void reverseAllSharedTokenIndex() { JexlExpression expr = engine.createExpression(query); List t1, t2, t3; - t1 = asList(Arrays.asList(234, 239, 252, 257, 265, 281, 286, 340, 363, 367), Arrays.asList(0, 0, 0, 0, 0, 0, 0, 0, 0, 0)); - t2 = asList(Arrays.asList(212, 229, 252, 272), Arrays.asList(0, 0, 0, 0)); - t3 = asList(Arrays.asList(1, 101, 202, 213, 252, 312, 336), Arrays.asList(0, 0, 0, 0, 0, 0, 0)); + t1 = asList(List.of(234, 239, 252, 257, 265, 281, 286, 340, 363, 367), List.of(0, 0, 0, 0, 0, 0, 0, 0, 0, 0)); + t2 = asList(List.of(212, 229, 252, 272), List.of(0, 0, 0, 0)); + t3 = asList(List.of(1, 101, 202, 213, 252, 312, 336), List.of(0, 0, 0, 0, 0, 0, 0)); - termOffSetMap.putTermFrequencyList("a", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), t1))); - termOffSetMap.putTermFrequencyList("b", new 
TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), t2))); - termOffSetMap.putTermFrequencyList("c", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), t3))); + termOffSetMap.putTermFrequencyList("a", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), t1))); + termOffSetMap.putTermFrequencyList("b", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), t2))); + termOffSetMap.putTermFrequencyList("c", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), t3))); termOffSetMap.setGatherPhraseOffsets(true); - termOffSetMap.setExcerptFields(Collections.singleton("CONTENT")); + termOffSetMap.setExcerptFields(Set.of("CONTENT")); context.set(Constants.TERM_OFFSET_MAP_JEXL_VARIABLE_NAME, termOffSetMap); Object o = expr.evaluate(context); - Assert.assertTrue(expect(o, true)); + assertTrue(expect(o, true)); assertPhraseOffset("CONTENT", 252, 252); } @@ -277,20 +270,20 @@ public void forwardAllSharedTokenIndex() { JexlExpression expr = engine.createExpression(query); List t1, t2, t3; - t1 = asList(Arrays.asList(234, 239, 252, 257, 265, 281, 286, 340, 363, 367), Arrays.asList(0, 0, 0, 0, 0, 0, 0, 0, 0, 0)); - t2 = asList(Arrays.asList(212, 229, 252, 272), Arrays.asList(0, 0, 0, 0)); - t3 = asList(Arrays.asList(1, 101, 202, 213, 252, 312, 336), Arrays.asList(0, 0, 0, 0, 0, 0, 0)); + t1 = asList(List.of(234, 239, 252, 257, 265, 281, 286, 340, 363, 367), List.of(0, 0, 0, 0, 0, 0, 0, 0, 0, 0)); + t2 = asList(List.of(212, 229, 252, 272), List.of(0, 0, 0, 0)); + t3 = asList(List.of(1, 101, 202, 213, 252, 312, 336), List.of(0, 0, 0, 0, 0, 0, 0)); - termOffSetMap.putTermFrequencyList("a", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), t1))); - termOffSetMap.putTermFrequencyList("b", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), t2))); - termOffSetMap.putTermFrequencyList("c", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), t3))); + termOffSetMap.putTermFrequencyList("a", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), t1))); + termOffSetMap.putTermFrequencyList("b", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), t2))); + termOffSetMap.putTermFrequencyList("c", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), t3))); termOffSetMap.setGatherPhraseOffsets(true); - termOffSetMap.setExcerptFields(Collections.singleton("CONTENT")); + termOffSetMap.setExcerptFields(Set.of("CONTENT")); context.set(Constants.TERM_OFFSET_MAP_JEXL_VARIABLE_NAME, termOffSetMap); Object o = expr.evaluate(context); - Assert.assertTrue(expect(o, true)); + assertTrue(expect(o, true)); assertPhraseOffset("CONTENT", 252, 252); } @@ -303,18 +296,18 @@ public void testEvaluationNoContentFields() { JexlExpression expr = engine.createExpression(query); List list1, list2; - list1 = asList(Arrays.asList(1, 2, 3), Arrays.asList(0, 0, 0)); - list2 = asList(Arrays.asList(5, 6, 7), Arrays.asList(0, 0, 3)); // match (7-3_ should match (3+1) + list1 = asList(List.of(1, 2, 3), List.of(0, 0, 0)); + list2 = asList(List.of(5, 6, 7), List.of(0, 0, 3)); // match (7-3_ should match (3+1) - termOffSetMap.putTermFrequencyList("dog", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", false, eventId), list1))); - termOffSetMap.putTermFrequencyList("cat", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", false, eventId), list2))); + termOffSetMap.putTermFrequencyList("dog", new 
TermFrequencyList(Map.entry(new Zone("CONTENT", false, eventId), list1))); + termOffSetMap.putTermFrequencyList("cat", new TermFrequencyList(Map.entry(new Zone("CONTENT", false, eventId), list2))); termOffSetMap.setGatherPhraseOffsets(true); - termOffSetMap.setExcerptFields(Collections.singleton("CONTENT")); + termOffSetMap.setExcerptFields(Set.of("CONTENT")); context.set(Constants.TERM_OFFSET_MAP_JEXL_VARIABLE_NAME, termOffSetMap); Object o = expr.evaluate(context); - Assert.assertTrue(expect(o, false)); + assertTrue(expect(o, false)); assertPhraseOffsetsEmpty(); } @@ -327,15 +320,15 @@ public void testQuotedEvaluation() { list1 = asList(1, 2, 3); list2 = asList(3, 4, 5); - termOffSetMap.putTermFrequencyList("dog's", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list1))); - termOffSetMap.putTermFrequencyList("cat", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list2))); + termOffSetMap.putTermFrequencyList("dog's", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list1))); + termOffSetMap.putTermFrequencyList("cat", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list2))); termOffSetMap.setGatherPhraseOffsets(true); - termOffSetMap.setExcerptFields(Collections.singleton("CONTENT")); + termOffSetMap.setExcerptFields(Set.of("CONTENT")); context.set(Constants.TERM_OFFSET_MAP_JEXL_VARIABLE_NAME, termOffSetMap); Object o = expr.evaluate(context); - Assert.assertTrue(expect(o, true)); + assertTrue(expect(o, true)); assertPhraseOffset("CONTENT", 2, 3); } @@ -359,15 +352,15 @@ public void testEvaluation1_1() { list1 = asList(1); list2 = asList(2); - termOffSetMap.putTermFrequencyList("dog", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list1))); - termOffSetMap.putTermFrequencyList("cat", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list2))); + termOffSetMap.putTermFrequencyList("dog", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list1))); + termOffSetMap.putTermFrequencyList("cat", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list2))); termOffSetMap.setGatherPhraseOffsets(true); - termOffSetMap.setExcerptFields(Collections.singleton("CONTENT")); + termOffSetMap.setExcerptFields(Set.of("CONTENT")); context.set(Constants.TERM_OFFSET_MAP_JEXL_VARIABLE_NAME, termOffSetMap); Object o = expr.evaluate(context); - Assert.assertTrue(expect(o, true)); + assertTrue(expect(o, true)); assertPhraseOffset("CONTENT", 1, 2); } @@ -380,13 +373,13 @@ public void testEvaluation2() { list1 = asList(1, 2, 3); list2 = asList(5, 6, 7); - termOffSetMap.putTermFrequencyList("dog", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list1))); - termOffSetMap.putTermFrequencyList("cat", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list2))); + termOffSetMap.putTermFrequencyList("dog", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list1))); + termOffSetMap.putTermFrequencyList("cat", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list2))); context.set(Constants.TERM_OFFSET_MAP_JEXL_VARIABLE_NAME, termOffSetMap); Object o = expr.evaluate(context); - Assert.assertTrue(expect(o, false)); + assertTrue(expect(o, false)); assertPhraseOffsetsEmpty(); } @@ -399,13 +392,13 @@ public void testEvaluation3() { list1 = asList(1, 2, 3); list2 = asList(5); - termOffSetMap.putTermFrequencyList("dog", new 
TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list1))); - termOffSetMap.putTermFrequencyList("cat", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list2))); + termOffSetMap.putTermFrequencyList("dog", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list1))); + termOffSetMap.putTermFrequencyList("cat", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list2))); context.set(Constants.TERM_OFFSET_MAP_JEXL_VARIABLE_NAME, termOffSetMap); Object o = expr.evaluate(context); - Assert.assertTrue(expect(o, false)); + assertTrue(expect(o, false)); assertPhraseOffsetsEmpty(); } @@ -415,18 +408,18 @@ public void testEvaluationWithSkips() { JexlExpression expr = engine.createExpression(query); List list1, list2; - list1 = asList(Arrays.asList(4), Arrays.asList(1)); - list2 = asList(Arrays.asList(2), Arrays.asList(1)); // (10-6) = (3+1) + list1 = asList(List.of(4), List.of(1)); + list2 = asList(List.of(2), List.of(1)); // (10-6) = (3+1) - termOffSetMap.putTermFrequencyList("dog", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list1))); - termOffSetMap.putTermFrequencyList("cat", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list2))); + termOffSetMap.putTermFrequencyList("dog", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list1))); + termOffSetMap.putTermFrequencyList("cat", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list2))); termOffSetMap.setGatherPhraseOffsets(true); - termOffSetMap.setExcerptFields(Collections.singleton("CONTENT")); + termOffSetMap.setExcerptFields(Set.of("CONTENT")); context.set(Constants.TERM_OFFSET_MAP_JEXL_VARIABLE_NAME, termOffSetMap); Object o = expr.evaluate(context); - Assert.assertTrue(expect(o, true)); + assertTrue(expect(o, true)); assertPhraseOffset("CONTENT", 2, 3); } @@ -439,15 +432,15 @@ public void testEvaluationEmptyOffsetList() { list1 = asList(1, 2, 3); list2 = new ArrayList<>(); - termOffSetMap.putTermFrequencyList("dog", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list1))); - termOffSetMap.putTermFrequencyList("cat", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list2))); + termOffSetMap.putTermFrequencyList("dog", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list1))); + termOffSetMap.putTermFrequencyList("cat", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list2))); termOffSetMap.setGatherPhraseOffsets(true); - termOffSetMap.setExcerptFields(Collections.singleton("CONTENT")); + termOffSetMap.setExcerptFields(Set.of("CONTENT")); context.set(Constants.TERM_OFFSET_MAP_JEXL_VARIABLE_NAME, termOffSetMap); Object o = expr.evaluate(context); - Assert.assertTrue(expect(o, false)); + assertTrue(expect(o, false)); assertPhraseOffsetsEmpty(); } @@ -463,16 +456,16 @@ public void testEvaluationThreeTerms() { list2 = asList(3, 7, 11); list3 = asList(10, 15, 20, 25); - termOffSetMap.putTermFrequencyList("dog", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list1))); - termOffSetMap.putTermFrequencyList("cat", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list2))); - termOffSetMap.putTermFrequencyList("rat", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list3))); + termOffSetMap.putTermFrequencyList("dog", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), 
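+        // The two-list asList overload builds TermWeightPositions from (offsets, prevSkips) pairs
+        // (see getPosition above); prevSkips appears to widen a position's effective match window
+        // so adjacency/phrase matching can bridge skipped tokens (an inference from these
+        // expectations rather than from the function implementation).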
 
-        termOffSetMap.putTermFrequencyList("dog", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list1)));
-        termOffSetMap.putTermFrequencyList("cat", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list2)));
+        termOffSetMap.putTermFrequencyList("dog", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list1)));
+        termOffSetMap.putTermFrequencyList("cat", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list2)));
         termOffSetMap.setGatherPhraseOffsets(true);
-        termOffSetMap.setExcerptFields(Collections.singleton("CONTENT"));
+        termOffSetMap.setExcerptFields(Set.of("CONTENT"));
 
         context.set(Constants.TERM_OFFSET_MAP_JEXL_VARIABLE_NAME, termOffSetMap);
 
         Object o = expr.evaluate(context);
-        Assert.assertTrue(expect(o, true));
+        assertTrue(expect(o, true));
         assertPhraseOffset("CONTENT", 2, 3);
     }
 
@@ -439,15 +432,15 @@ public void testEvaluationEmptyOffsetList() {
         list1 = asList(1, 2, 3);
         list2 = new ArrayList<>();
 
-        termOffSetMap.putTermFrequencyList("dog", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list1)));
-        termOffSetMap.putTermFrequencyList("cat", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list2)));
+        termOffSetMap.putTermFrequencyList("dog", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list1)));
+        termOffSetMap.putTermFrequencyList("cat", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list2)));
         termOffSetMap.setGatherPhraseOffsets(true);
-        termOffSetMap.setExcerptFields(Collections.singleton("CONTENT"));
+        termOffSetMap.setExcerptFields(Set.of("CONTENT"));
 
         context.set(Constants.TERM_OFFSET_MAP_JEXL_VARIABLE_NAME, termOffSetMap);
 
         Object o = expr.evaluate(context);
-        Assert.assertTrue(expect(o, false));
+        assertTrue(expect(o, false));
         assertPhraseOffsetsEmpty();
     }
 
@@ -463,16 +456,16 @@ public void testEvaluationThreeTerms() {
         list2 = asList(3, 7, 11);
         list3 = asList(10, 15, 20, 25);
 
-        termOffSetMap.putTermFrequencyList("dog", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list1)));
-        termOffSetMap.putTermFrequencyList("cat", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list2)));
-        termOffSetMap.putTermFrequencyList("rat", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list3)));
+        termOffSetMap.putTermFrequencyList("dog", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list1)));
+        termOffSetMap.putTermFrequencyList("cat", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list2)));
+        termOffSetMap.putTermFrequencyList("rat", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list3)));
         termOffSetMap.setGatherPhraseOffsets(true);
-        termOffSetMap.setExcerptFields(Collections.singleton("CONTENT"));
+        termOffSetMap.setExcerptFields(Set.of("CONTENT"));
 
         context.set(Constants.TERM_OFFSET_MAP_JEXL_VARIABLE_NAME, termOffSetMap);
 
         Object o = expr.evaluate(context);
-        Assert.assertTrue(expect(o, true));
+        assertTrue(expect(o, true));
         assertPhraseOffset("CONTENT", 7, 10);
     }
 
@@ -487,16 +480,16 @@ public void testEvaluationThreeTermsTooSmallDistance() {
         list2 = asList(3, 4, 5);
         list3 = asList(10, 15, 20, 25);
 
-        termOffSetMap.putTermFrequencyList("dog", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list1)));
-        termOffSetMap.putTermFrequencyList("cat", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list2)));
-        termOffSetMap.putTermFrequencyList("rat", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list3)));
+        termOffSetMap.putTermFrequencyList("dog", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list1)));
+        termOffSetMap.putTermFrequencyList("cat", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list2)));
+        termOffSetMap.putTermFrequencyList("rat", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list3)));
         termOffSetMap.setGatherPhraseOffsets(true);
-        termOffSetMap.setExcerptFields(Collections.singleton("CONTENT"));
+        termOffSetMap.setExcerptFields(Set.of("CONTENT"));
 
         context.set(Constants.TERM_OFFSET_MAP_JEXL_VARIABLE_NAME, termOffSetMap);
 
         Object o = expr.evaluate(context);
-        Assert.assertTrue(expect(o, false));
+        assertTrue(expect(o, false));
         assertPhraseOffsetsEmpty();
     }
 
@@ -511,16 +504,16 @@ public void testEvaluationFailedThreeTerms() {
         list2 = asList(3, 4, 5);
         list3 = asList(10, 15, 20, 25);
 
-        termOffSetMap.putTermFrequencyList("dog", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list1)));
-        termOffSetMap.putTermFrequencyList("cat", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list2)));
-        termOffSetMap.putTermFrequencyList("rat", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list3)));
+        termOffSetMap.putTermFrequencyList("dog", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list1)));
+        termOffSetMap.putTermFrequencyList("cat", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list2)));
+        termOffSetMap.putTermFrequencyList("rat", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list3)));
         termOffSetMap.setGatherPhraseOffsets(true);
-        termOffSetMap.setExcerptFields(Collections.singleton("CONTENT"));
+        termOffSetMap.setExcerptFields(Set.of("CONTENT"));
 
         context.set(Constants.TERM_OFFSET_MAP_JEXL_VARIABLE_NAME, termOffSetMap);
 
         Object o = expr.evaluate(context);
-        Assert.assertTrue(expect(o, false));
+        assertTrue(expect(o, false));
         assertPhraseOffsetsEmpty();
     }
 
@@ -535,16 +528,16 @@ public void testEvaluationMiddleMatch() {
         list2 = asList(2, 4, 20);
         list3 = asList(6, 8, 15);
 
-        termOffSetMap.putTermFrequencyList("dog", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list1)));
-        termOffSetMap.putTermFrequencyList("cat", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list2)));
-        termOffSetMap.putTermFrequencyList("rat", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list3)));
+        termOffSetMap.putTermFrequencyList("dog", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list1)));
+        termOffSetMap.putTermFrequencyList("cat", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list2)));
+        termOffSetMap.putTermFrequencyList("rat", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list3)));
         termOffSetMap.setGatherPhraseOffsets(true);
-        termOffSetMap.setExcerptFields(Collections.singleton("CONTENT"));
+        termOffSetMap.setExcerptFields(Set.of("CONTENT"));
 
         context.set(Constants.TERM_OFFSET_MAP_JEXL_VARIABLE_NAME, termOffSetMap);
 
         Object o = expr.evaluate(context);
-        Assert.assertTrue(expect(o, true));
+        assertTrue(expect(o, true));
         assertPhraseOffset("CONTENT", 4, 6);
     }
 
@@ -557,15 +550,15 @@ public void testEvaluationAdjacent1() {
         list1 = asList(1);
         list2 = asList(2);
 
-        termOffSetMap.putTermFrequencyList("dog", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list1)));
-        termOffSetMap.putTermFrequencyList("cat", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list2)));
+        termOffSetMap.putTermFrequencyList("dog", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list1)));
+        termOffSetMap.putTermFrequencyList("cat", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list2)));
         termOffSetMap.setGatherPhraseOffsets(true);
-        termOffSetMap.setExcerptFields(Collections.singleton("CONTENT"));
+        termOffSetMap.setExcerptFields(Set.of("CONTENT"));
 
         context.set(Constants.TERM_OFFSET_MAP_JEXL_VARIABLE_NAME, termOffSetMap);
 
         Object o = expr.evaluate(context);
-        Assert.assertTrue(expect(o, true));
+        assertTrue(expect(o, true));
         assertPhraseOffset("CONTENT", 1, 2);
     }
 
@@ -578,14 +571,14 @@ public void testEvaluationAdjacent2() {
         list1 = asList(1, 2, 3);
         list2 = asList(5, 6, 7);
 
-        termOffSetMap.putTermFrequencyList("dog", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list1)));
-        termOffSetMap.putTermFrequencyList("cat", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list2)));
+        termOffSetMap.putTermFrequencyList("dog", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list1)));
+        termOffSetMap.putTermFrequencyList("cat", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list2)));
         termOffSetMap.setGatherPhraseOffsets(true);
 
         context.set(Constants.TERM_OFFSET_MAP_JEXL_VARIABLE_NAME, termOffSetMap);
 
         Object o = expr.evaluate(context);
-        Assert.assertTrue(expect(o, false));
+        assertTrue(expect(o, false));
         assertPhraseOffsetsEmpty();
     }
 
@@ -598,14 +591,14 @@ public void testEvaluationAdjacent3() {
         list1 = asList(1, 2, 3);
         list2 = asList(5);
 
-        termOffSetMap.putTermFrequencyList("dog", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list1)));
-        termOffSetMap.putTermFrequencyList("cat", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list2)));
+        termOffSetMap.putTermFrequencyList("dog", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list1)));
+        termOffSetMap.putTermFrequencyList("cat", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list2)));
         termOffSetMap.setGatherPhraseOffsets(true);
 
        context.set(Constants.TERM_OFFSET_MAP_JEXL_VARIABLE_NAME, termOffSetMap);
 
         Object o = expr.evaluate(context);
-        Assert.assertTrue(expect(o, false));
+        assertTrue(expect(o, false));
         assertPhraseOffsetsEmpty();
     }
 
@@ -618,13 +611,13 @@ public void testEvaluationAdjacentEmptyOffsetList() {
         list1 = asList(1, 2, 3);
         list2 = new ArrayList<>();
 
-        termOffSetMap.putTermFrequencyList("dog", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list1)));
-        termOffSetMap.putTermFrequencyList("cat", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list2)));
+        termOffSetMap.putTermFrequencyList("dog", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list1)));
+        termOffSetMap.putTermFrequencyList("cat", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list2)));
         termOffSetMap.setGatherPhraseOffsets(true);
 
         context.set(Constants.TERM_OFFSET_MAP_JEXL_VARIABLE_NAME, termOffSetMap);
 
         Object o = expr.evaluate(context);
-        Assert.assertTrue(expect(o, false));
+        assertTrue(expect(o, false));
         assertPhraseOffsetsEmpty();
     }
 
@@ -638,16 +631,16 @@ public void testEvaluationAdjacentThreeTerms() {
         list2 = asList(3, 7, 11);
         list3 = asList(10, 15, 20, 25);
 
-        termOffSetMap.putTermFrequencyList("dog", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list1)));
-        termOffSetMap.putTermFrequencyList("cat", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list2)));
-        termOffSetMap.putTermFrequencyList("rat", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list3)));
+        termOffSetMap.putTermFrequencyList("dog", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list1)));
+        termOffSetMap.putTermFrequencyList("cat", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list2)));
+        termOffSetMap.putTermFrequencyList("rat", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list3)));
         termOffSetMap.setGatherPhraseOffsets(true);
-        termOffSetMap.setExcerptFields(Collections.singleton("CONTENT"));
+        termOffSetMap.setExcerptFields(Set.of("CONTENT"));
 
         context.set(Constants.TERM_OFFSET_MAP_JEXL_VARIABLE_NAME, termOffSetMap);
 
         Object o = expr.evaluate(context);
-        Assert.assertTrue(expect(o, true));
+        assertTrue(expect(o, true));
         assertPhraseOffset("CONTENT", 9, 11);
     }
 
@@ -661,15 +654,15 @@ public void testEvaluationAdjacentFailedThreeTerms() {
         list2 = asList(3, 4, 5);
         list3 = asList(10, 15, 20, 25);
 
-        termOffSetMap.putTermFrequencyList("dog", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list1)));
-        termOffSetMap.putTermFrequencyList("cat", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list2)));
-        termOffSetMap.putTermFrequencyList("rat", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list3)));
+        termOffSetMap.putTermFrequencyList("dog", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list1)));
+        termOffSetMap.putTermFrequencyList("cat", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list2)));
+        termOffSetMap.putTermFrequencyList("rat", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list3)));
         termOffSetMap.setGatherPhraseOffsets(true);
 
         context.set(Constants.TERM_OFFSET_MAP_JEXL_VARIABLE_NAME, termOffSetMap);
 
         Object o = expr.evaluate(context);
-        Assert.assertTrue(expect(o, false));
+        assertTrue(expect(o, false));
         assertPhraseOffsetsEmpty();
     }
 
@@ -682,15 +675,15 @@ public void testEvaluationPhraseBasic() {
         list1 = asList(1, 2, 3);
         list2 = asList(3, 4, 5);
 
-        termOffSetMap.putTermFrequencyList("dog", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list1)));
-        termOffSetMap.putTermFrequencyList("cat", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list2)));
+        termOffSetMap.putTermFrequencyList("dog", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list1)));
+        termOffSetMap.putTermFrequencyList("cat", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list2)));
         termOffSetMap.setGatherPhraseOffsets(true);
-        termOffSetMap.setExcerptFields(Collections.singleton("CONTENT"));
+        termOffSetMap.setExcerptFields(Set.of("CONTENT"));
 
         context.set(Constants.TERM_OFFSET_MAP_JEXL_VARIABLE_NAME, termOffSetMap);
 
         Object o = expr.evaluate(context);
-        Assert.assertTrue(expect(o, true));
+        assertTrue(expect(o, true));
         assertPhraseOffset("CONTENT", 2, 3);
     }
 
@@ -700,18 +693,18 @@ public void testEvaluationPhraseBasicWithSkips() {
         JexlExpression expr = engine.createExpression(query);
 
         List<TermWeightPosition> list1, list2;
-        list1 = asList(Arrays.asList(1, 2, 3), Arrays.asList(0, 1, 0));
-        list2 = asList(Arrays.asList(5, 6, 7), Arrays.asList(2, 2, 2));
+        list1 = asList(List.of(1, 2, 3), List.of(0, 1, 0));
+        list2 = asList(List.of(5, 6, 7), List.of(2, 2, 2));
 
-        termOffSetMap.putTermFrequencyList("dog", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list1)));
-        termOffSetMap.putTermFrequencyList("cat", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list2)));
+        termOffSetMap.putTermFrequencyList("dog", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list1)));
+        termOffSetMap.putTermFrequencyList("cat", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list2)));
         termOffSetMap.setGatherPhraseOffsets(true);
-        termOffSetMap.setExcerptFields(Collections.singleton("CONTENT"));
+        termOffSetMap.setExcerptFields(Set.of("CONTENT"));
 
         context.set(Constants.TERM_OFFSET_MAP_JEXL_VARIABLE_NAME, termOffSetMap);
 
         Object o = expr.evaluate(context);
-        Assert.assertTrue(expect(o, true));
+        assertTrue(expect(o, true));
         assertPhraseOffset("CONTENT", 2, 5);
     }
 
@@ -724,15 +717,15 @@ public void testEvaluationPhraseBasic2() {
         list1 = asList(1, 2, 3);
         list2 = asList(4, 5, 6);
 
-        termOffSetMap.putTermFrequencyList("dog", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list1)));
-        termOffSetMap.putTermFrequencyList("cat", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list2)));
+        termOffSetMap.putTermFrequencyList("dog", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list1)));
+        termOffSetMap.putTermFrequencyList("cat", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list2)));
         termOffSetMap.setGatherPhraseOffsets(true);
-        termOffSetMap.setExcerptFields(Collections.singleton("CONTENT"));
+        termOffSetMap.setExcerptFields(Set.of("CONTENT"));
 
         context.set(Constants.TERM_OFFSET_MAP_JEXL_VARIABLE_NAME, termOffSetMap);
 
         Object o = expr.evaluate(context);
-        Assert.assertTrue(expect(o, true));
+        assertTrue(expect(o, true));
         assertPhraseOffset("CONTENT", 3, 4);
     }
 
@@ -742,18 +735,18 @@ public void testEvaluationPhraseBasic2WithSkips() {
         JexlExpression expr = engine.createExpression(query);
 
         List<TermWeightPosition> list1, list2;
-        list1 = asList(Arrays.asList(1, 2, 3), Arrays.asList(0, 1, 0));
-        list2 = asList(Arrays.asList(5, 6, 7), Arrays.asList(1, 3, 1));
+        list1 = asList(List.of(1, 2, 3), List.of(0, 1, 0));
+        list2 = asList(List.of(5, 6, 7), List.of(1, 3, 1));
 
-        termOffSetMap.putTermFrequencyList("dog", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list1)));
-        termOffSetMap.putTermFrequencyList("cat", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list2)));
+        termOffSetMap.putTermFrequencyList("dog", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list1)));
+        termOffSetMap.putTermFrequencyList("cat", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list2)));
         termOffSetMap.setGatherPhraseOffsets(true);
-        termOffSetMap.setExcerptFields(Collections.singleton("CONTENT"));
+        termOffSetMap.setExcerptFields(Set.of("CONTENT"));
 
         context.set(Constants.TERM_OFFSET_MAP_JEXL_VARIABLE_NAME, termOffSetMap);
 
         Object o = expr.evaluate(context);
-        Assert.assertTrue(expect(o, true));
+        assertTrue(expect(o, true));
         assertPhraseOffset("CONTENT", 2, 6);
     }
 
@@ -767,16 +760,16 @@ public void testEvaluationPhraseBasic3() {
         list2 = asList(2);
         list3 = asList(3);
 
-        termOffSetMap.putTermFrequencyList("dog", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list1)));
-        termOffSetMap.putTermFrequencyList("cat", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list2)));
-        termOffSetMap.putTermFrequencyList("fish", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list3)));
+        termOffSetMap.putTermFrequencyList("dog", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list1)));
+        termOffSetMap.putTermFrequencyList("cat", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list2)));
+        termOffSetMap.putTermFrequencyList("fish", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list3)));
         termOffSetMap.setGatherPhraseOffsets(true);
-        termOffSetMap.setExcerptFields(Collections.singleton("CONTENT"));
+        termOffSetMap.setExcerptFields(Set.of("CONTENT"));
 
         context.set(Constants.TERM_OFFSET_MAP_JEXL_VARIABLE_NAME, termOffSetMap);
 
         Object o = expr.evaluate(context);
-        Assert.assertTrue(expect(o, true));
+        assertTrue(expect(o, true));
         assertPhraseOffset("CONTENT", 1, 3);
     }
 
@@ -786,20 +779,20 @@ public void testEvaluationPhraseBasic3WithSkips() {
         JexlExpression expr = engine.createExpression(query);
 
         List<TermWeightPosition> list1, list2, list3;
-        list1 = asList(Arrays.asList(1), Arrays.asList(0));
-        list2 = asList(Arrays.asList(3), Arrays.asList(1)); // ~3-5
-        list3 = asList(Arrays.asList(4, 10), Arrays.asList(0, 0));
+        list1 = asList(List.of(1), List.of(0));
+        list2 = asList(List.of(3), List.of(1)); // ~3-5
+        list3 = asList(List.of(4, 10), List.of(0, 0));
 
-        termOffSetMap.putTermFrequencyList("dog", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list1)));
-        termOffSetMap.putTermFrequencyList("cat", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list2)));
-        termOffSetMap.putTermFrequencyList("fish", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list3)));
+        termOffSetMap.putTermFrequencyList("dog", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list1)));
+        termOffSetMap.putTermFrequencyList("cat", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list2)));
+        termOffSetMap.putTermFrequencyList("fish", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list3)));
         termOffSetMap.setGatherPhraseOffsets(true);
-        termOffSetMap.setExcerptFields(Collections.singleton("CONTENT"));
+        termOffSetMap.setExcerptFields(Set.of("CONTENT"));
 
         context.set(Constants.TERM_OFFSET_MAP_JEXL_VARIABLE_NAME, termOffSetMap);
 
         Object o = expr.evaluate(context);
-        Assert.assertTrue(expect(o, true));
+        assertTrue(expect(o, true));
         assertPhraseOffset("CONTENT", 1, 4);
     }
 
@@ -813,16 +806,16 @@ public void testEvaluationPhraseBasic3FavorContentOrderedFunction() {
         list2 = asList(2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40);
         list3 = asList(41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61);
 
-        termOffSetMap.putTermFrequencyList("dog", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list1)));
-        termOffSetMap.putTermFrequencyList("cat", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list2)));
-        termOffSetMap.putTermFrequencyList("fish", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list3)));
+        termOffSetMap.putTermFrequencyList("dog", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list1)));
+        termOffSetMap.putTermFrequencyList("cat", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list2)));
+        termOffSetMap.putTermFrequencyList("fish", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list3)));
         termOffSetMap.setGatherPhraseOffsets(true);
-        termOffSetMap.setExcerptFields(Collections.singleton("CONTENT"));
+        termOffSetMap.setExcerptFields(Set.of("CONTENT"));
 
         context.set(Constants.TERM_OFFSET_MAP_JEXL_VARIABLE_NAME, termOffSetMap);
 
         Object o = expr.evaluate(context);
-        Assert.assertTrue(expect(o, true));
+        assertTrue(expect(o, true));
         assertPhraseOffset("CONTENT", 39, 41);
     }
 
@@ -835,15 +828,15 @@ public void testEvaluationPhraseBasicOrderFail() {
         list1 = asList(3, 4, 5);
         list2 = asList(1, 2);
 
-        termOffSetMap.putTermFrequencyList("dog", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list1)));
-        termOffSetMap.putTermFrequencyList("cat", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list2)));
+        termOffSetMap.putTermFrequencyList("dog", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list1)));
+        termOffSetMap.putTermFrequencyList("cat", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list2)));
        termOffSetMap.setGatherPhraseOffsets(true);
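+        // content:phrase is order-sensitive: "dog" (offsets 3-5) only ever appears after
+        // "cat" (offsets 1-2), so the evaluation below must fail and leave no phrase offsets.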
termOffSetMap.setExcerptFields(Collections.singleton("CONTENT")); + termOffSetMap.setExcerptFields(Set.of("CONTENT")); context.set(Constants.TERM_OFFSET_MAP_JEXL_VARIABLE_NAME, termOffSetMap); Object o = expr.evaluate(context); - Assert.assertTrue(expect(o, false)); + assertTrue(expect(o, false)); assertPhraseOffsetsEmpty(); } @@ -878,16 +871,16 @@ public void testEvaluationPhraseBasicOrderFail2() { list2 = asList(3); list3 = asList(2); - termOffSetMap.putTermFrequencyList("dog", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list1))); - termOffSetMap.putTermFrequencyList("cat", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list2))); - termOffSetMap.putTermFrequencyList("fish", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list3))); + termOffSetMap.putTermFrequencyList("dog", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list1))); + termOffSetMap.putTermFrequencyList("cat", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list2))); + termOffSetMap.putTermFrequencyList("fish", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list3))); termOffSetMap.setGatherPhraseOffsets(true); - termOffSetMap.setExcerptFields(Collections.singleton("CONTENT")); + termOffSetMap.setExcerptFields(Set.of("CONTENT")); context.set(Constants.TERM_OFFSET_MAP_JEXL_VARIABLE_NAME, termOffSetMap); Object o = expr.evaluate(context); - Assert.assertTrue(expect(o, false)); + assertTrue(expect(o, false)); assertPhraseOffsetsEmpty(); } @@ -897,20 +890,20 @@ public void testEvaluationPhraseBasicFail2WithSkips() { JexlExpression expr = engine.createExpression(query); List list1, list2, list3; - list1 = asList(Arrays.asList(4), Arrays.asList(0)); - list2 = asList(Arrays.asList(3), Arrays.asList(1)); - list3 = asList(Arrays.asList(2), Arrays.asList(0)); + list1 = asList(List.of(4), List.of(0)); + list2 = asList(List.of(3), List.of(1)); + list3 = asList(List.of(2), List.of(0)); - termOffSetMap.putTermFrequencyList("dog", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list1))); - termOffSetMap.putTermFrequencyList("cat", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list2))); - termOffSetMap.putTermFrequencyList("fish", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list3))); + termOffSetMap.putTermFrequencyList("dog", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list1))); + termOffSetMap.putTermFrequencyList("cat", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list2))); + termOffSetMap.putTermFrequencyList("fish", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list3))); termOffSetMap.setGatherPhraseOffsets(true); - termOffSetMap.setExcerptFields(Collections.singleton("CONTENT")); + termOffSetMap.setExcerptFields(Set.of("CONTENT")); context.set(Constants.TERM_OFFSET_MAP_JEXL_VARIABLE_NAME, termOffSetMap); Object o = expr.evaluate(context); - Assert.assertTrue(expect(o, false)); + assertTrue(expect(o, false)); assertPhraseOffsetsEmpty(); } @@ -924,16 +917,16 @@ public void testEvaluationPhraseBasicOrderFail3() { list2 = asList(4); list3 = asList(3); - termOffSetMap.putTermFrequencyList("dog", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list1))); - termOffSetMap.putTermFrequencyList("cat", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list2))); - 
termOffSetMap.putTermFrequencyList("fish", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list3))); + termOffSetMap.putTermFrequencyList("dog", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list1))); + termOffSetMap.putTermFrequencyList("cat", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list2))); + termOffSetMap.putTermFrequencyList("fish", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list3))); termOffSetMap.setGatherPhraseOffsets(true); - termOffSetMap.setExcerptFields(Collections.singleton("CONTENT")); + termOffSetMap.setExcerptFields(Set.of("CONTENT")); context.set(Constants.TERM_OFFSET_MAP_JEXL_VARIABLE_NAME, termOffSetMap); Object o = expr.evaluate(context); - Assert.assertTrue(expect(o, false)); + assertTrue(expect(o, false)); assertPhraseOffsetsEmpty(); } @@ -943,20 +936,20 @@ public void testEvaluationPhraseBasicFail3WithSkips() { JexlExpression expr = engine.createExpression(query); List list1, list2, list3; - list1 = asList(Arrays.asList(2), Arrays.asList(0)); - list2 = asList(Arrays.asList(4), Arrays.asList(0)); - list3 = asList(Arrays.asList(3), Arrays.asList(0)); + list1 = asList(List.of(2), List.of(0)); + list2 = asList(List.of(4), List.of(0)); + list3 = asList(List.of(3), List.of(0)); - termOffSetMap.putTermFrequencyList("dog", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list1))); - termOffSetMap.putTermFrequencyList("cat", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list2))); - termOffSetMap.putTermFrequencyList("fish", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list3))); + termOffSetMap.putTermFrequencyList("dog", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list1))); + termOffSetMap.putTermFrequencyList("cat", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list2))); + termOffSetMap.putTermFrequencyList("fish", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list3))); termOffSetMap.setGatherPhraseOffsets(true); - termOffSetMap.setExcerptFields(Collections.singleton("CONTENT")); + termOffSetMap.setExcerptFields(Set.of("CONTENT")); context.set(Constants.TERM_OFFSET_MAP_JEXL_VARIABLE_NAME, termOffSetMap); Object o = expr.evaluate(context); - Assert.assertTrue(expect(o, false)); + assertTrue(expect(o, false)); assertPhraseOffsetsEmpty(); } @@ -969,15 +962,15 @@ public void testEvaluationPhraseBasicTermOrderFail() { list1 = asList(1, 2, 3); list2 = asList(4, 5, 6); - termOffSetMap.putTermFrequencyList("dog", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list1))); - termOffSetMap.putTermFrequencyList("cat", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list2))); + termOffSetMap.putTermFrequencyList("dog", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list1))); + termOffSetMap.putTermFrequencyList("cat", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list2))); termOffSetMap.setGatherPhraseOffsets(true); - termOffSetMap.setExcerptFields(Collections.singleton("CONTENT")); + termOffSetMap.setExcerptFields(Set.of("CONTENT")); context.set(Constants.TERM_OFFSET_MAP_JEXL_VARIABLE_NAME, termOffSetMap); Object o = expr.evaluate(context); - Assert.assertTrue(expect(o, false)); + assertTrue(expect(o, false)); assertPhraseOffsetsEmpty(); } @@ -987,18 +980,18 @@ public void testEvaluationPhraseBasicTermOrderFailWithSkips() { 
JexlExpression expr = engine.createExpression(query); List list1, list2; - list1 = asList(Arrays.asList(1, 2, 3), Arrays.asList(1, 1, 1)); - list2 = asList(Arrays.asList(4, 5, 6), Arrays.asList(0, 1, 1)); + list1 = asList(List.of(1, 2, 3), List.of(1, 1, 1)); + list2 = asList(List.of(4, 5, 6), List.of(0, 1, 1)); - termOffSetMap.putTermFrequencyList("dog", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list1))); - termOffSetMap.putTermFrequencyList("cat", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list2))); + termOffSetMap.putTermFrequencyList("dog", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list1))); + termOffSetMap.putTermFrequencyList("cat", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list2))); termOffSetMap.setGatherPhraseOffsets(true); - termOffSetMap.setExcerptFields(Collections.singleton("CONTENT")); + termOffSetMap.setExcerptFields(Set.of("CONTENT")); context.set(Constants.TERM_OFFSET_MAP_JEXL_VARIABLE_NAME, termOffSetMap); Object o = expr.evaluate(context); - Assert.assertTrue(expect(o, false)); + assertTrue(expect(o, false)); assertPhraseOffsetsEmpty(); } @@ -1010,14 +1003,14 @@ public void testEvaluationPhraseSameTermFailure() { List list1; list1 = asList(1, 3, 5); - termOffSetMap.putTermFrequencyList("cat", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list1))); + termOffSetMap.putTermFrequencyList("cat", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list1))); termOffSetMap.setGatherPhraseOffsets(true); - termOffSetMap.setExcerptFields(Collections.singleton("CONTENT")); + termOffSetMap.setExcerptFields(Set.of("CONTENT")); context.set(Constants.TERM_OFFSET_MAP_JEXL_VARIABLE_NAME, termOffSetMap); Object o = expr.evaluate(context); - Assert.assertTrue(expect(o, false)); + assertTrue(expect(o, false)); assertPhraseOffsetsEmpty(); } @@ -1029,14 +1022,14 @@ public void testEvaluationPhraseSameTermSuccessFirst() { List list1; list1 = asList(1, 2, 5); - termOffSetMap.putTermFrequencyList("cat", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list1))); + termOffSetMap.putTermFrequencyList("cat", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list1))); termOffSetMap.setGatherPhraseOffsets(true); - termOffSetMap.setExcerptFields(Collections.singleton("CONTENT")); + termOffSetMap.setExcerptFields(Set.of("CONTENT")); context.set(Constants.TERM_OFFSET_MAP_JEXL_VARIABLE_NAME, termOffSetMap); Object o = expr.evaluate(context); - Assert.assertTrue(expect(o, true)); + assertTrue(expect(o, true)); assertPhraseOffset("CONTENT", 1, 2); } @@ -1048,14 +1041,14 @@ public void testEvaluationPhraseSameTermSuccessLast() { List list1; list1 = asList(1, 4, 5); - termOffSetMap.putTermFrequencyList("cat", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list1))); + termOffSetMap.putTermFrequencyList("cat", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list1))); termOffSetMap.setGatherPhraseOffsets(true); - termOffSetMap.setExcerptFields(Collections.singleton("CONTENT")); + termOffSetMap.setExcerptFields(Set.of("CONTENT")); context.set(Constants.TERM_OFFSET_MAP_JEXL_VARIABLE_NAME, termOffSetMap); Object o = expr.evaluate(context); - Assert.assertTrue(expect(o, true)); + assertTrue(expect(o, true)); assertPhraseOffset("CONTENT", 4, 5); } @@ -1067,14 +1060,14 @@ public void testEvaluationAdjacencySameTermFailureTest() { List list1; list1 = 
asList(1, 4); - termOffSetMap.putTermFrequencyList("cat", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list1))); + termOffSetMap.putTermFrequencyList("cat", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list1))); termOffSetMap.setGatherPhraseOffsets(true); - termOffSetMap.setExcerptFields(Collections.singleton("CONTENT")); + termOffSetMap.setExcerptFields(Set.of("CONTENT")); context.set(Constants.TERM_OFFSET_MAP_JEXL_VARIABLE_NAME, termOffSetMap); Object o = expr.evaluate(context); - Assert.assertTrue(expect(o, false)); + assertTrue(expect(o, false)); assertPhraseOffsetsEmpty(); } @@ -1086,14 +1079,14 @@ public void testEvaluationAdjacencySameTermSuccessTest() { List list1; list1 = asList(1, 3); - termOffSetMap.putTermFrequencyList("cat", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list1))); + termOffSetMap.putTermFrequencyList("cat", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list1))); termOffSetMap.setGatherPhraseOffsets(true); - termOffSetMap.setExcerptFields(Collections.singleton("CONTENT")); + termOffSetMap.setExcerptFields(Set.of("CONTENT")); context.set(Constants.TERM_OFFSET_MAP_JEXL_VARIABLE_NAME, termOffSetMap); Object o = expr.evaluate(context); - Assert.assertTrue(expect(o, true)); + assertTrue(expect(o, true)); assertPhraseOffset("CONTENT", 1, 3); } @@ -1103,16 +1096,16 @@ public void testEvaluationAdjacencySameTermWithSkipsSuccessTest() { JexlExpression expr = engine.createExpression(query); List list1; - list1 = asList(Arrays.asList(1, 4), Arrays.asList(0, 1)); + list1 = asList(List.of(1, 4), List.of(0, 1)); - termOffSetMap.putTermFrequencyList("cat", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list1))); + termOffSetMap.putTermFrequencyList("cat", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list1))); termOffSetMap.setGatherPhraseOffsets(true); - termOffSetMap.setExcerptFields(Collections.singleton("CONTENT")); + termOffSetMap.setExcerptFields(Set.of("CONTENT")); context.set(Constants.TERM_OFFSET_MAP_JEXL_VARIABLE_NAME, termOffSetMap); Object o = expr.evaluate(context); - Assert.assertTrue(expect(o, true)); + assertTrue(expect(o, true)); assertPhraseOffset("CONTENT", 1, 3); } @@ -1125,15 +1118,15 @@ public void testEvaluationAdjacencySameTermMixedSuccessTest() { list1 = asList(1, 5); list2 = asList(3); - termOffSetMap.putTermFrequencyList("cat", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list1))); - termOffSetMap.putTermFrequencyList("dog", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list2))); + termOffSetMap.putTermFrequencyList("cat", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list1))); + termOffSetMap.putTermFrequencyList("dog", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list2))); termOffSetMap.setGatherPhraseOffsets(true); - termOffSetMap.setExcerptFields(Collections.singleton("CONTENT")); + termOffSetMap.setExcerptFields(Set.of("CONTENT")); context.set(Constants.TERM_OFFSET_MAP_JEXL_VARIABLE_NAME, termOffSetMap); Object o = expr.evaluate(context); - Assert.assertTrue(expect(o, true)); + assertTrue(expect(o, true)); assertPhraseOffset("CONTENT", 1, 3); } @@ -1150,15 +1143,15 @@ public void testEvaluationPhraseBasicTermOrderFalsePositive() { list1 = asList(1, 2, 3); list2 = asList(3, 4, 5); - termOffSetMap.putTermFrequencyList("dog", new TermFrequencyList(Maps.immutableEntry(new 
Zone("CONTENT", true, eventId), list1))); - termOffSetMap.putTermFrequencyList("cat", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list2))); + termOffSetMap.putTermFrequencyList("dog", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list1))); + termOffSetMap.putTermFrequencyList("cat", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list2))); termOffSetMap.setGatherPhraseOffsets(true); - termOffSetMap.setExcerptFields(Collections.singleton("CONTENT")); + termOffSetMap.setExcerptFields(Set.of("CONTENT")); context.set(Constants.TERM_OFFSET_MAP_JEXL_VARIABLE_NAME, termOffSetMap); Object o = expr.evaluate(context); - Assert.assertTrue(expect(o, true)); + assertTrue(expect(o, true)); assertPhraseOffset("CONTENT", 3, 3); } @@ -1172,16 +1165,16 @@ public void testEvaluationPhraseThreeTerm() { list2 = asList(5, 7, 9); list3 = asList(6, 8, 10); - termOffSetMap.putTermFrequencyList("dog", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list1))); - termOffSetMap.putTermFrequencyList("cat", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list2))); - termOffSetMap.putTermFrequencyList("rat", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list3))); + termOffSetMap.putTermFrequencyList("dog", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list1))); + termOffSetMap.putTermFrequencyList("cat", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list2))); + termOffSetMap.putTermFrequencyList("rat", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list3))); termOffSetMap.setGatherPhraseOffsets(true); - termOffSetMap.setExcerptFields(Collections.singleton("CONTENT")); + termOffSetMap.setExcerptFields(Set.of("CONTENT")); context.set(Constants.TERM_OFFSET_MAP_JEXL_VARIABLE_NAME, termOffSetMap); Object o = expr.evaluate(context); - Assert.assertTrue(expect(o, true)); + assertTrue(expect(o, true)); assertPhraseOffset("CONTENT", 4, 6); } @@ -1195,16 +1188,16 @@ public void testEvaluationPhraseThreeTermFail() { list2 = asList(5, 7, 9); list3 = asList(6, 8, 10); - termOffSetMap.putTermFrequencyList("dog", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list1))); - termOffSetMap.putTermFrequencyList("cat", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list2))); - termOffSetMap.putTermFrequencyList("rat", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list3))); + termOffSetMap.putTermFrequencyList("dog", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list1))); + termOffSetMap.putTermFrequencyList("cat", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list2))); + termOffSetMap.putTermFrequencyList("rat", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list3))); termOffSetMap.setGatherPhraseOffsets(true); - termOffSetMap.setExcerptFields(Collections.singleton("CONTENT")); + termOffSetMap.setExcerptFields(Set.of("CONTENT")); context.set(Constants.TERM_OFFSET_MAP_JEXL_VARIABLE_NAME, termOffSetMap); Object o = expr.evaluate(context); - Assert.assertTrue(expect(o, false)); + assertTrue(expect(o, false)); assertPhraseOffsetsEmpty(); } @@ -1218,16 +1211,16 @@ public void testEvaluationPhraseThreeTermPass() { list2 = asList(4, 7, 8, 10); // rat list3 = asList(4, 6); // dog - termOffSetMap.putTermFrequencyList("dog", new 
TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list1))); - termOffSetMap.putTermFrequencyList("cat", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list2))); - termOffSetMap.putTermFrequencyList("rat", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list3))); + termOffSetMap.putTermFrequencyList("dog", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list1))); + termOffSetMap.putTermFrequencyList("cat", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list2))); + termOffSetMap.putTermFrequencyList("rat", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list3))); termOffSetMap.setGatherPhraseOffsets(true); - termOffSetMap.setExcerptFields(Collections.singleton("CONTENT")); + termOffSetMap.setExcerptFields(Set.of("CONTENT")); context.set(Constants.TERM_OFFSET_MAP_JEXL_VARIABLE_NAME, termOffSetMap); Object o = expr.evaluate(context); - Assert.assertTrue(expect(o, true)); + assertTrue(expect(o, true)); assertPhraseOffset("CONTENT", 4, 4); } @@ -1241,16 +1234,16 @@ public void testEvaluationPhraseThreeTermFail2() { list2 = asList(5, 7, 9); // rat list3 = asList(4, 6, 10); // dog - termOffSetMap.putTermFrequencyList("dog", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list1))); - termOffSetMap.putTermFrequencyList("cat", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list2))); - termOffSetMap.putTermFrequencyList("rat", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list3))); + termOffSetMap.putTermFrequencyList("dog", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list1))); + termOffSetMap.putTermFrequencyList("cat", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list2))); + termOffSetMap.putTermFrequencyList("rat", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list3))); termOffSetMap.setGatherPhraseOffsets(true); - termOffSetMap.setExcerptFields(Collections.singleton("CONTENT")); + termOffSetMap.setExcerptFields(Set.of("CONTENT")); context.set(Constants.TERM_OFFSET_MAP_JEXL_VARIABLE_NAME, termOffSetMap); Object o = expr.evaluate(context); - Assert.assertTrue(expect(o, false)); + assertTrue(expect(o, false)); assertPhraseOffsetsEmpty(); } @@ -1264,16 +1257,16 @@ public void testEvaluationPhraseTermOverlap() { list2 = asList(1); // rat list3 = asList(1); // dog - termOffSetMap.putTermFrequencyList("dog", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list1))); - termOffSetMap.putTermFrequencyList("cat", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list2))); - termOffSetMap.putTermFrequencyList("rat", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list3))); + termOffSetMap.putTermFrequencyList("dog", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list1))); + termOffSetMap.putTermFrequencyList("cat", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list2))); + termOffSetMap.putTermFrequencyList("rat", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list3))); termOffSetMap.setGatherPhraseOffsets(true); - termOffSetMap.setExcerptFields(Collections.singleton("CONTENT")); + termOffSetMap.setExcerptFields(Set.of("CONTENT")); context.set(Constants.TERM_OFFSET_MAP_JEXL_VARIABLE_NAME, termOffSetMap); Object o = expr.evaluate(context); - 
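Reviewer note on the collection-factory swap repeated throughout these hunks: java.util.Map.entry and Set.of are not drop-in equivalents of the Guava and Collections calls they replace. Both reject nulls, and Set.of additionally rejects duplicate elements, whereas Maps.immutableEntry and Collections.singleton tolerated nulls. A minimal illustration of the behavioral edges, using only plain JDK types (no DataWave classes):

    import java.util.Collections;
    import java.util.Map;
    import java.util.Set;

    public class FactoryMethodSemantics {
        public static void main(String[] args) {
            // Guava's Maps.immutableEntry(null, "v") succeeds; Map.entry does not.
            try {
                Map.entry(null, "v");
            } catch (NullPointerException expected) {
                System.out.println("Map.entry rejects null keys and values");
            }

            // Collections.singleton(null) is legal; Set.of((String) null) throws NPE.
            Set<String> legacy = Collections.singleton(null);
            System.out.println("singleton allows null: " + legacy.contains(null));

            // Set.of also throws IllegalArgumentException on duplicate arguments.
            try {
                Set.of("CONTENT", "CONTENT");
            } catch (IllegalArgumentException expected) {
                System.out.println("Set.of rejects duplicates");
            }
        }
    }

For these tests the stricter semantics are harmless, since the zones and terms involved are non-null and distinct, but it is worth keeping in mind when reviewing similar mechanical replacements.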
@@ -1283,20 +1276,20 @@ public void testEvaluationPhraseTermOverlapWithSkips() {
         JexlExpression expr = engine.createExpression(query);
         List<TermWeightPosition> list1, list2, list3;
 
-        list1 = asList(false, Arrays.asList(135), Arrays.asList(6)); // cat
-        list2 = asList(Arrays.asList(135), Arrays.asList(6)); // rat
-        list3 = asList(Arrays.asList(1), Arrays.asList(1)); // dog
+        list1 = asList(false, List.of(135), List.of(6)); // cat
+        list2 = asList(List.of(135), List.of(6)); // rat
+        list3 = asList(List.of(1), List.of(1)); // dog
 
-        termOffSetMap.putTermFrequencyList("dog", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list1)));
-        termOffSetMap.putTermFrequencyList("cat", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list2)));
-        termOffSetMap.putTermFrequencyList("rat", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list3)));
+        termOffSetMap.putTermFrequencyList("dog", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list1)));
+        termOffSetMap.putTermFrequencyList("cat", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list2)));
+        termOffSetMap.putTermFrequencyList("rat", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list3)));
 
         termOffSetMap.setGatherPhraseOffsets(true);
-        termOffSetMap.setExcerptFields(Collections.singleton("CONTENT"));
+        termOffSetMap.setExcerptFields(Set.of("CONTENT"));
         context.set(Constants.TERM_OFFSET_MAP_JEXL_VARIABLE_NAME, termOffSetMap);
 
         Object o = expr.evaluate(context);
 
-        Assert.assertTrue(expect(o, false));
+        assertTrue(expect(o, false));
         assertPhraseOffsetsEmpty();
     }
@@ -1310,16 +1303,16 @@ public void testEvaluationPhraseTermOverlapPass2() {
         list2 = asList(1); // rat
         list3 = asList(2); // dog
 
-        termOffSetMap.putTermFrequencyList("cat", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list1)));
-        termOffSetMap.putTermFrequencyList("rat", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list2)));
-        termOffSetMap.putTermFrequencyList("dog", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list3)));
+        termOffSetMap.putTermFrequencyList("cat", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list1)));
+        termOffSetMap.putTermFrequencyList("rat", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list2)));
+        termOffSetMap.putTermFrequencyList("dog", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list3)));
 
         termOffSetMap.setGatherPhraseOffsets(true);
-        termOffSetMap.setExcerptFields(Collections.singleton("CONTENT"));
+        termOffSetMap.setExcerptFields(Set.of("CONTENT"));
         context.set(Constants.TERM_OFFSET_MAP_JEXL_VARIABLE_NAME, termOffSetMap);
 
         Object o = expr.evaluate(context);
 
-        Assert.assertTrue(expect(o, true));
+        assertTrue(expect(o, true));
         assertPhraseOffset("CONTENT", 1, 2);
     }
@@ -1332,15 +1325,15 @@ public void testEvaluationPhraseTermOverlapPass3() {
         list1 = asList(1); // cat
         list2 = asList(1, 5); // rat
 
-        termOffSetMap.putTermFrequencyList("cat", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list1)));
-        termOffSetMap.putTermFrequencyList("rat", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list2)));
+        termOffSetMap.putTermFrequencyList("cat", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list1)));
+        termOffSetMap.putTermFrequencyList("rat", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list2)));
 
         termOffSetMap.setGatherPhraseOffsets(true);
-        termOffSetMap.setExcerptFields(Collections.singleton("CONTENT"));
+        termOffSetMap.setExcerptFields(Set.of("CONTENT"));
         context.set(Constants.TERM_OFFSET_MAP_JEXL_VARIABLE_NAME, termOffSetMap);
 
         Object o = expr.evaluate(context);
 
-        Assert.assertTrue(expect(o, true));
+        assertTrue(expect(o, true));
         assertPhraseOffset("CONTENT", 1, 1);
     }
@@ -1353,15 +1346,15 @@ public void testEvaluationPhraseTermOverlapPass4() {
         list1 = asList(5); // cat
         list2 = asList(1, 5); // rat
 
-        termOffSetMap.putTermFrequencyList("cat", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list1)));
-        termOffSetMap.putTermFrequencyList("rat", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list2)));
+        termOffSetMap.putTermFrequencyList("cat", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list1)));
+        termOffSetMap.putTermFrequencyList("rat", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list2)));
 
         termOffSetMap.setGatherPhraseOffsets(true);
-        termOffSetMap.setExcerptFields(Collections.singleton("CONTENT"));
+        termOffSetMap.setExcerptFields(Set.of("CONTENT"));
         context.set(Constants.TERM_OFFSET_MAP_JEXL_VARIABLE_NAME, termOffSetMap);
 
         Object o = expr.evaluate(context);
 
-        Assert.assertTrue(expect(o, true));
+        assertTrue(expect(o, true));
         assertPhraseOffset("CONTENT", 5, 5);
     }
@@ -1375,16 +1368,16 @@ public void testEvaluationPhraseTermOverlapFail() {
         list2 = asList(2);
         list3 = asList(1);
 
-        termOffSetMap.putTermFrequencyList("cat", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list1)));
-        termOffSetMap.putTermFrequencyList("rat", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list2)));
-        termOffSetMap.putTermFrequencyList("dog", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list3)));
+        termOffSetMap.putTermFrequencyList("cat", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list1)));
+        termOffSetMap.putTermFrequencyList("rat", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list2)));
+        termOffSetMap.putTermFrequencyList("dog", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list3)));
 
         termOffSetMap.setGatherPhraseOffsets(true);
-        termOffSetMap.setExcerptFields(Collections.singleton("CONTENT"));
+        termOffSetMap.setExcerptFields(Set.of("CONTENT"));
         context.set(Constants.TERM_OFFSET_MAP_JEXL_VARIABLE_NAME, termOffSetMap);
 
         Object o = expr.evaluate(context);
 
-        Assert.assertTrue(expect(o, false));
+        assertTrue(expect(o, false));
         assertPhraseOffsetsEmpty();
     }
@@ -1394,18 +1387,18 @@ public void testEvaluationScorePass() {
         JexlExpression expr = engine.createExpression(query);
         List<TermWeightPosition> list1, list2;
 
-        list1 = asList(Arrays.asList(1, 2, 3), Arrays.asList(0, 0, 0), Arrays.asList(-0.223f, -1.4339f, -0.0001f));
-        list2 = asList(Arrays.asList(3, 4, 5), Arrays.asList(0, 0, 0), Arrays.asList(-0.001f, -1.4339f, -0.2001f));
+        list1 = asList(List.of(1, 2, 3), List.of(0, 0, 0), List.of(-0.223f, -1.4339f, -0.0001f));
+        list2 = asList(List.of(3, 4, 5), List.of(0, 0, 0), List.of(-0.001f, -1.4339f, -0.2001f));
 
-        termOffSetMap.putTermFrequencyList("dog", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list1)));
-        termOffSetMap.putTermFrequencyList("cat", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list2)));
+        termOffSetMap.putTermFrequencyList("dog", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list1)));
+        termOffSetMap.putTermFrequencyList("cat", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list2)));
 
         termOffSetMap.setGatherPhraseOffsets(true);
-        termOffSetMap.setExcerptFields(Collections.singleton("CONTENT"));
+        termOffSetMap.setExcerptFields(Set.of("CONTENT"));
         context.set(Constants.TERM_OFFSET_MAP_JEXL_VARIABLE_NAME, termOffSetMap);
 
         Object o = expr.evaluate(context);
 
-        Assert.assertTrue(expect(o, true));
+        assertTrue(expect(o, true));
         assertPhraseOffset("CONTENT", 3, 3);
     }
@@ -1415,18 +1408,18 @@ public void testEvaluationScoreNoZonePass() {
         JexlExpression expr = engine.createExpression(query);
         List<TermWeightPosition> list1, list2;
 
-        list1 = asList(Arrays.asList(1, 2, 3), Arrays.asList(0, 0, 0), Arrays.asList(-0.223f, -1.4339f, -0.0001f));
-        list2 = asList(Arrays.asList(3, 4, 5), Arrays.asList(0, 0, 0), Arrays.asList(-0.001f, -1.4339f, -0.2001f));
+        list1 = asList(List.of(1, 2, 3), List.of(0, 0, 0), List.of(-0.223f, -1.4339f, -0.0001f));
+        list2 = asList(List.of(3, 4, 5), List.of(0, 0, 0), List.of(-0.001f, -1.4339f, -0.2001f));
 
-        termOffSetMap.putTermFrequencyList("dog", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list1)));
-        termOffSetMap.putTermFrequencyList("cat", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list2)));
+        termOffSetMap.putTermFrequencyList("dog", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list1)));
+        termOffSetMap.putTermFrequencyList("cat", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list2)));
 
         termOffSetMap.setGatherPhraseOffsets(true);
-        termOffSetMap.setExcerptFields(Collections.singleton("CONTENT"));
+        termOffSetMap.setExcerptFields(Set.of("CONTENT"));
         context.set(Constants.TERM_OFFSET_MAP_JEXL_VARIABLE_NAME, termOffSetMap);
 
         Object o = expr.evaluate(context);
 
-        Assert.assertTrue(expect(o, true));
+        assertTrue(expect(o, true));
         assertPhraseOffset("CONTENT", 3, 3);
     }
@@ -1436,18 +1429,18 @@ public void testEvaluationScoreFail() {
         JexlExpression expr = engine.createExpression(query);
         List<TermWeightPosition> list1, list2;
 
-        list1 = asList(Arrays.asList(1, 2, 3), Arrays.asList(0, 0, 0), Arrays.asList(-0.223f, -1.4339f, -0.2001f));
-        list2 = asList(Arrays.asList(3, 4, 5), Arrays.asList(0, 0, 0), Arrays.asList(-0.001f, -1.4339f, -0.2001f));
+        list1 = asList(List.of(1, 2, 3), List.of(0, 0, 0), List.of(-0.223f, -1.4339f, -0.2001f));
+        list2 = asList(List.of(3, 4, 5), List.of(0, 0, 0), List.of(-0.001f, -1.4339f, -0.2001f));
 
-        termOffSetMap.putTermFrequencyList("dog", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list1)));
-        termOffSetMap.putTermFrequencyList("cat", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list2)));
+        termOffSetMap.putTermFrequencyList("dog", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list1)));
+        termOffSetMap.putTermFrequencyList("cat", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list2)));
 
         termOffSetMap.setGatherPhraseOffsets(true);
-        termOffSetMap.setExcerptFields(Collections.singleton("CONTENT"));
+        termOffSetMap.setExcerptFields(Set.of("CONTENT"));
         context.set(Constants.TERM_OFFSET_MAP_JEXL_VARIABLE_NAME, termOffSetMap);
 
         Object o = expr.evaluate(context);
 
-        Assert.assertTrue(expect(o, false));
+        assertTrue(expect(o, false));
         assertPhraseOffsetsEmpty();
     }
@@ -1464,16 +1457,16 @@ public void testEvaluationMultipleContentFunctions() {
         list2 = asList(4, 5, 6);
         list3 = asList(11, 12, 14);
 
-        termOffSetMap.putTermFrequencyList("dog", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list1)));
-        termOffSetMap.putTermFrequencyList("cat", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list2)));
-        termOffSetMap.putTermFrequencyList("bat", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list3)));
+        termOffSetMap.putTermFrequencyList("dog", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list1)));
+        termOffSetMap.putTermFrequencyList("cat", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list2)));
+        termOffSetMap.putTermFrequencyList("bat", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list3)));
 
         termOffSetMap.setGatherPhraseOffsets(true);
-        termOffSetMap.setExcerptFields(Collections.singleton("CONTENT"));
+        termOffSetMap.setExcerptFields(Set.of("CONTENT"));
         context.set(Constants.TERM_OFFSET_MAP_JEXL_VARIABLE_NAME, termOffSetMap);
 
         Object o = expr.evaluate(context);
 
-        Assert.assertTrue(expect(ArithmeticJexlEngines.isMatched(o), true));
+        assertTrue(expect(ArithmeticJexlEngines.isMatched(o), true));
         assertPhraseOffset("CONTENT", 1, 4);
     }
@@ -1491,16 +1484,16 @@ public void testEvaluationPhrasePruningEdgeCondition() {
         list2 = asList(9, 10, 11);
         list3 = asList(7, 12);
 
-        termOffSetMap.putTermFrequencyList("dog", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list1)));
-        termOffSetMap.putTermFrequencyList("cat", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list2)));
-        termOffSetMap.putTermFrequencyList("bat", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list3)));
+        termOffSetMap.putTermFrequencyList("dog", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list1)));
+        termOffSetMap.putTermFrequencyList("cat", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list2)));
+        termOffSetMap.putTermFrequencyList("bat", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list3)));
 
         termOffSetMap.setGatherPhraseOffsets(true);
-        termOffSetMap.setExcerptFields(Collections.singleton("CONTENT"));
+        termOffSetMap.setExcerptFields(Set.of("CONTENT"));
         context.set(Constants.TERM_OFFSET_MAP_JEXL_VARIABLE_NAME, termOffSetMap);
 
         Object o = expr.evaluate(context);
 
-        Assert.assertTrue(expect(o, true));
+        assertTrue(expect(o, true));
         assertPhraseOffset("CONTENT", 10, 12);
     }
@@ -1518,16 +1511,16 @@ public void testEvaluationReverseOffsetAdjustment() {
         list2 = asList(3, 9, 10, 12, 13, 20, 23, 25);
         list3 = asList(1, 12, 13, 27);
 
-        termOffSetMap.putTermFrequencyList("dog", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list1)));
-        termOffSetMap.putTermFrequencyList("cat", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list2)));
-        termOffSetMap.putTermFrequencyList("bat", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list3)));
+        termOffSetMap.putTermFrequencyList("dog", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list1)));
+        termOffSetMap.putTermFrequencyList("cat", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list2)));
+        termOffSetMap.putTermFrequencyList("bat", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list3)));
 
         termOffSetMap.setGatherPhraseOffsets(true);
-        termOffSetMap.setExcerptFields(Collections.singleton("CONTENT"));
+        termOffSetMap.setExcerptFields(Set.of("CONTENT"));
         context.set(Constants.TERM_OFFSET_MAP_JEXL_VARIABLE_NAME, termOffSetMap);
 
         Object o = expr.evaluate(context);
 
-        Assert.assertTrue(expect(o, false));
+        assertTrue(expect(o, false));
         assertPhraseOffsetsEmpty();
     }
@@ -1542,29 +1535,29 @@ public void testEvaluationMultiEvent() {
         list2 = asList(4, 5, 6);
         list3 = asList(4, 5, 6);
 
-        termOffSetMap.putTermFrequencyList("dog", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list1)));
-        termOffSetMap.putTermFrequencyList("cat", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list2)));
-        termOffSetMap.putTermFrequencyList("bat", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId + ".1"), list3)));
+        termOffSetMap.putTermFrequencyList("dog", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list1)));
+        termOffSetMap.putTermFrequencyList("cat", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list2)));
+        termOffSetMap.putTermFrequencyList("bat", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId + ".1"), list3)));
 
         termOffSetMap.setGatherPhraseOffsets(true);
-        termOffSetMap.setExcerptFields(Collections.singleton("CONTENT"));
+        termOffSetMap.setExcerptFields(Set.of("CONTENT"));
         context.set(Constants.TERM_OFFSET_MAP_JEXL_VARIABLE_NAME, termOffSetMap);
 
         JexlExpression expr = engine.createExpression(query1);
         Object o = expr.evaluate(context);
-        Assert.assertTrue(expect(o, true));
+        assertTrue(expect(o, true));
         assertPhraseOffset("CONTENT", 1, 4);
 
         termOffSetMap.getPhraseIndexes().clear();
         expr = engine.createExpression(query2);
         o = expr.evaluate(context);
-        Assert.assertTrue(expect(o, false));
+        assertTrue(expect(o, false));
         assertPhraseOffsetsEmpty();
 
         termOffSetMap.getPhraseIndexes().clear();
         expr = engine.createExpression(query);
         o = expr.evaluate(context);
-        Assert.assertTrue(expect(ArithmeticJexlEngines.isMatched(o), true));
+        assertTrue(expect(ArithmeticJexlEngines.isMatched(o), true));
         assertPhraseOffset("CONTENT", 1, 4);
     }
@@ -1632,7 +1625,7 @@ public void testJexlFunctionArgumentDescriptor8() throws ParseException {
     public void testJexlFunctionArgumentDescriptor9() throws ParseException {
         String query = "content:" + phraseFunction + "(termOffsetMap, 'hello', 'world')";
         String expected = "(BODY == 'hello' and BODY == 'world')";
-        testJexlFunctionArgumentDescriptors(query, expected, Sets.newHashSet("BODY"));
+        testJexlFunctionArgumentDescriptors(query, expected, Set.of("BODY"));
     }
 
     @Test
@@ -1649,8 +1642,8 @@ private void testJexlFunctionArgumentDescriptors(String query, String expected)
     private void testJexlFunctionArgumentDescriptors(String query, String expected, Set<String> contentFields) throws ParseException {
         MockMetadataHelper metadataHelper = new MockMetadataHelper();
-        metadataHelper.addTermFrequencyFields(Arrays.asList("BODY", "META"));
-        metadataHelper.setIndexedFields(Sets.newHashSet("BODY", "META"));
+        metadataHelper.addTermFrequencyFields(List.of("BODY", "META"));
+        metadataHelper.setIndexedFields(Set.of("BODY", "META"));
 
         if (contentFields != null) {
             metadataHelper.addContentFields(contentFields);
         }
@@ -1661,7 +1654,7 @@ private void testJexlFunctionArgumentDescriptors(String query, String expected,
         ASTJexlScript script = JexlASTHelper.parseJexlQuery(query);
         JexlNode child = script.jjtGetChild(0);
 
-        Assert.assertEquals("First child of ASTJexlScript is not an AStFunctionNode", ASTFunctionNode.class, child.getClass());
+        assertEquals("First child of ASTJexlScript is not an AStFunctionNode", ASTFunctionNode.class, child.getClass());
 
         ASTFunctionNode function = (ASTFunctionNode) child;
@@ -1671,7 +1664,7 @@ private void testJexlFunctionArgumentDescriptors(String query, String expected,
         ASTJexlScript expectedScript = JexlASTHelper.parseJexlQuery(expected);
         JexlNode scriptChild = expectedScript.jjtGetChild(0);
 
-        Assert.assertTrue("Expected " + JexlStringBuildingVisitor.buildQuery(scriptChild) + " but was " + JexlStringBuildingVisitor.buildQuery(indexQuery),
+        assertTrue("Expected " + JexlStringBuildingVisitor.buildQuery(scriptChild) + " but was " + JexlStringBuildingVisitor.buildQuery(indexQuery),
                         JexlASTHelper.equals(scriptChild, indexQuery));
     }
@@ -1684,16 +1677,16 @@ public void testDoubleWordInPhrase() {
         list1 = asList(1, 3);
         list2 = asList(2);
 
-        termOffSetMap.putTermFrequencyList("foo", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list1)));
-        termOffSetMap.putTermFrequencyList("bar", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list2)));
-        termOffSetMap.putTermFrequencyList("foo", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list1)));
+        termOffSetMap.putTermFrequencyList("foo", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list1)));
+        termOffSetMap.putTermFrequencyList("bar", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list2)));
+        termOffSetMap.putTermFrequencyList("foo", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list1)));
 
         termOffSetMap.setGatherPhraseOffsets(true);
-        termOffSetMap.setExcerptFields(Collections.singleton("CONTENT"));
+        termOffSetMap.setExcerptFields(Set.of("CONTENT"));
         context.set(Constants.TERM_OFFSET_MAP_JEXL_VARIABLE_NAME, termOffSetMap);
 
         Object o = expr.evaluate(context);
 
-        Assert.assertTrue(expect(o, true));
+        assertTrue(expect(o, true));
         assertPhraseOffset("CONTENT", 1, 3);
     }
@@ -1708,18 +1701,18 @@ public void testSomeEmptyOffsetsPhrase() {
         list3 = asList(260, 284, 304);
         list4 = asList(1165);
 
-        termOffSetMap.putTermFrequencyList("foo", new TermFrequencyList(Maps.immutableEntry(new Zone("BODY", true, eventId), list1)));
-        termOffSetMap.putTermFrequencyList("bar", new TermFrequencyList(Maps.immutableEntry(new Zone("BODY", true, eventId), list2)));
-        termOffSetMap.putTermFrequencyList("car", new TermFrequencyList(Maps.immutableEntry(new Zone("BODY", true, eventId), list3),
-                        Maps.immutableEntry(new Zone("META", true, eventId), list4)));
+        termOffSetMap.putTermFrequencyList("foo", new TermFrequencyList(Map.entry(new Zone("BODY", true, eventId), list1)));
+        termOffSetMap.putTermFrequencyList("bar", new TermFrequencyList(Map.entry(new Zone("BODY", true, eventId), list2)));
+        termOffSetMap.putTermFrequencyList("car",
+                        new TermFrequencyList(Map.entry(new Zone("BODY", true, eventId), list3), Map.entry(new Zone("META", true, eventId), list4)));
 
         termOffSetMap.setGatherPhraseOffsets(true);
-        termOffSetMap.setExcerptFields(Collections.singleton("CONTENT"));
+        termOffSetMap.setExcerptFields(Set.of("CONTENT"));
         context.set(Constants.TERM_OFFSET_MAP_JEXL_VARIABLE_NAME, termOffSetMap);
-        context.set("BODY", Arrays.asList("foo", "bar", "car"));
+        context.set("BODY", List.of("foo", "bar", "car"));
 
         Object o = expr.evaluate(context);
 
-        Assert.assertTrue(expect(o, false));
+        assertTrue(expect(o, false));
         assertPhraseOffsetsEmpty();
     }
@@ -1735,18 +1728,18 @@ public void testSomeEmptyOffsetsAdjacency() {
         list3 = asList(260, 284, 304);
         list4 = asList(1165);
 
-        termOffSetMap.putTermFrequencyList("foo", new TermFrequencyList(Maps.immutableEntry(new Zone("BODY", true, eventId), list1)));
-        termOffSetMap.putTermFrequencyList("bar", new TermFrequencyList(Maps.immutableEntry(new Zone("BODY", true, eventId), list2)));
-        termOffSetMap.putTermFrequencyList("car", new TermFrequencyList(Maps.immutableEntry(new Zone("BODY", true, eventId), list3),
-                        Maps.immutableEntry(new Zone("META", true, eventId), list4)));
+        termOffSetMap.putTermFrequencyList("foo", new TermFrequencyList(Map.entry(new Zone("BODY", true, eventId), list1)));
+        termOffSetMap.putTermFrequencyList("bar", new TermFrequencyList(Map.entry(new Zone("BODY", true, eventId), list2)));
+        termOffSetMap.putTermFrequencyList("car",
                        new TermFrequencyList(Map.entry(new Zone("BODY", true, eventId), list3), Map.entry(new Zone("META", true, eventId), list4)));
 
         termOffSetMap.setGatherPhraseOffsets(true);
-        termOffSetMap.setExcerptFields(Collections.singleton("CONTENT"));
+        termOffSetMap.setExcerptFields(Set.of("CONTENT"));
         context.set(Constants.TERM_OFFSET_MAP_JEXL_VARIABLE_NAME, termOffSetMap);
-        context.set("BODY", Arrays.asList("foo", "bar", "car"));
+        context.set("BODY", List.of("foo", "bar", "car"));
 
         Object o = expr.evaluate(context);
 
-        Assert.assertTrue(expect(o, false));
+        assertTrue(expect(o, false));
         assertPhraseOffsetsEmpty();
     }
@@ -1762,18 +1755,18 @@ public void testSomeEmptyOffsetsWithin() {
         list3 = asList(260, 284, 304);
         list4 = asList(1165);
 
-        termOffSetMap.putTermFrequencyList("foo", new TermFrequencyList(Maps.immutableEntry(new Zone("BODY", true, eventId), list1)));
-        termOffSetMap.putTermFrequencyList("bar", new TermFrequencyList(Maps.immutableEntry(new Zone("BODY", true, eventId), list2)));
-        termOffSetMap.putTermFrequencyList("car", new TermFrequencyList(Maps.immutableEntry(new Zone("BODY", true, eventId), list3),
-                        Maps.immutableEntry(new Zone("META", true, eventId), list4)));
+        termOffSetMap.putTermFrequencyList("foo", new TermFrequencyList(Map.entry(new Zone("BODY", true, eventId), list1)));
+        termOffSetMap.putTermFrequencyList("bar", new TermFrequencyList(Map.entry(new Zone("BODY", true, eventId), list2)));
+        termOffSetMap.putTermFrequencyList("car",
+                        new TermFrequencyList(Map.entry(new Zone("BODY", true, eventId), list3), Map.entry(new Zone("META", true, eventId), list4)));
 
         termOffSetMap.setGatherPhraseOffsets(true);
-        termOffSetMap.setExcerptFields(Collections.singleton("CONTENT"));
+        termOffSetMap.setExcerptFields(Set.of("CONTENT"));
         context.set(Constants.TERM_OFFSET_MAP_JEXL_VARIABLE_NAME, termOffSetMap);
-        context.set("BODY", Arrays.asList("foo", "bar", "car"));
+        context.set("BODY", List.of("foo", "bar", "car"));
 
         Object o = expr.evaluate(context);
 
-        Assert.assertTrue(expect(o, false));
+        assertTrue(expect(o, false));
         assertPhraseOffsetsEmpty();
     }
@@ -1840,25 +1833,25 @@ public void testDuplicatePhraseOffset() {
         // ///////////////////////////
         // full terms list
-        Assert.assertNotNull(termOffsetMap.getTermFrequencyList("his"));
+        assertNotNull(termOffsetMap.getTermFrequencyList("his"));
 
         String[] terms = new String[] {"go", "and", "tell", "your", "brother", "that", "dinners", "ready", "and", "come", "and", "wash", "his", "hands"};
-        Assert.assertEquals(Collections.singleton("BODY"), ContentFunctions.phrase("BODY", termOffsetMap, terms));
+        assertEquals(Set.of("BODY"), ContentFunctions.phrase("BODY", termOffsetMap, terms));
 
         // duplicate consecutive terms fail here
         terms = new String[] {"go", "and", "and", "tell", "your", "brother", "that", "dinners", "ready", "and", "come", "and", "wash", "his", "hands"};
-        Assert.assertEquals(Collections.emptySet(), ContentFunctions.phrase("BODY", termOffsetMap, terms));
+        assertEquals(Set.of(), ContentFunctions.phrase("BODY", termOffsetMap, terms));
 
         // duplicate consecutive terms fail here
         terms = new String[] {"go", "and", "and", "tell", "your", "brother", "that", "dinners", "ready", "and", "come"};
-        Assert.assertEquals(Collections.emptySet(), ContentFunctions.phrase("BODY", termOffsetMap, terms));
+        assertEquals(Set.of(), ContentFunctions.phrase("BODY", termOffsetMap, terms));
 
         // subset(1, end)
         terms = new String[] {"and", "tell", "your", "brother", "that", "dinners", "ready", "and", "come", "and", "wash", "his", "hands"};
-        Assert.assertEquals(Collections.singleton("BODY"), ContentFunctions.phrase("BODY", termOffsetMap, terms));
+        assertEquals(Set.of("BODY"), ContentFunctions.phrase("BODY", termOffsetMap, terms));
 
         // subset(1,end-5)
         terms = new String[] {"and", "tell", "your", "brother", "that", "dinners", "ready", "and"};
-        Assert.assertEquals(Collections.singleton("BODY"), ContentFunctions.phrase("BODY", termOffsetMap, terms));
+        assertEquals(Set.of("BODY"), ContentFunctions.phrase("BODY", termOffsetMap, terms));
 
         // ///////////////////////////
         // Within functions
@@ -1866,23 +1859,23 @@ public void testDuplicatePhraseOffset() {
         // full terms list
         terms = new String[] {"go", "and", "tell", "your", "brother", "that", "dinners", "ready", "and", "come", "and", "wash", "his", "hands"};
-        Assert.assertEquals(Collections.singleton("BODY"), ContentFunctions.within("BODY", 14, termOffsetMap, terms));
+        assertEquals(Set.of("BODY"), ContentFunctions.within("BODY", 14, termOffsetMap, terms));
 
         // duplicate consecutive terms fail here
         terms = new String[] {"go", "and", "and", "tell", "your", "brother", "that", "dinners", "ready", "and", "come", "and", "wash", "his", "hands"};
-        Assert.assertEquals(Collections.emptySet(), ContentFunctions.within("BODY", 15, termOffsetMap, terms));
+        assertEquals(Set.of(), ContentFunctions.within("BODY", 15, termOffsetMap, terms));
 
         // placement does not matter
         terms = new String[] {"go", "and", "and", "tell", "your", "brother", "that", "dinners", "ready", "and", "come"};
-        Assert.assertEquals(Collections.singleton("BODY"), ContentFunctions.within("BODY", 11, termOffsetMap, terms));
+        assertEquals(Set.of("BODY"), ContentFunctions.within("BODY", 11, termOffsetMap, terms));
 
         // subset(1, end)
         terms = new String[] {"and", "tell", "your", "brother", "that", "dinners", "ready", "and", "come", "and", "wash", "his", "hands"};
-        Assert.assertEquals(Collections.singleton("BODY"), ContentFunctions.within("BODY", 12, termOffsetMap, terms));
+        assertEquals(Set.of("BODY"), ContentFunctions.within("BODY", 12, termOffsetMap, terms));
 
         // subset(1,end-5)
         terms = new String[] {"and", "tell", "your", "brother", "that", "dinners", "ready", "and", "come", "and"};
-        Assert.assertEquals(Collections.singleton("BODY"), ContentFunctions.within("BODY", 10, termOffsetMap, terms));
+        assertEquals(Set.of("BODY"), ContentFunctions.within("BODY", 10, termOffsetMap, terms));
     }
 
     private Zone genTestZone() {
@@ -1928,17 +1921,16 @@ public void testIgnoreIrrelevantZones() {
         // The only match, [19, 20], is in ZONE2.
         // Thus, evaluating ZONE1 should return false here (see #1171)...
-        Assert.assertEquals(Collections.emptySet(), ContentFunctions.phrase(zone1.getZone(), termOffsetMap, terms));
+        assertEquals(Set.of(), ContentFunctions.phrase(zone1.getZone(), termOffsetMap, terms));
 
         // Ensure that we do get the hit if we evaluate the other zone
-        Assert.assertEquals(Collections.singleton(zone2.getZone()), ContentFunctions.phrase(zone2.getZone(), termOffsetMap, terms));
+        assertEquals(Set.of(zone2.getZone()), ContentFunctions.phrase(zone2.getZone(), termOffsetMap, terms));
 
         // Ensure that we get the hit if we evaluate both zones
-        Assert.assertEquals(Collections.singleton(zone2.getZone()),
-                        ContentFunctions.phrase(Arrays.asList(zone1.getZone(), zone2.getZone()), termOffsetMap, terms));
+        assertEquals(Set.of(zone2.getZone()), ContentFunctions.phrase(List.of(zone1.getZone(), zone2.getZone()), termOffsetMap, terms));
 
         // Ensure that we get the hit if we evaluate null zone
-        Assert.assertEquals(Collections.singleton(zone2.getZone()), ContentFunctions.phrase((Object) null, termOffsetMap, terms));
+        assertEquals(Set.of(zone2.getZone()), ContentFunctions.phrase((Object) null, termOffsetMap, terms));
     }
 
     /**
@@ -1953,15 +1945,15 @@ public void testGatherPhraseOffsetsIsFalse() {
         list1 = asList(1);
         list2 = asList(2);
 
-        termOffSetMap.putTermFrequencyList("dog", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list1)));
-        termOffSetMap.putTermFrequencyList("cat", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list2)));
+        termOffSetMap.putTermFrequencyList("dog", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list1)));
+        termOffSetMap.putTermFrequencyList("cat", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list2)));
 
         termOffSetMap.setGatherPhraseOffsets(false);
-        termOffSetMap.setExcerptFields(Collections.singleton("CONTENT"));
+        termOffSetMap.setExcerptFields(Set.of("CONTENT"));
         context.set(Constants.TERM_OFFSET_MAP_JEXL_VARIABLE_NAME, termOffSetMap);
 
         Object o = expr.evaluate(context);
 
-        Assert.assertTrue(expect(o, true));
+        assertTrue(expect(o, true));
         assertPhraseOffsetsEmpty();
     }
@@ -1977,15 +1969,15 @@ public void testNonMatchingExcerptFields() {
         list1 = asList(1);
         list2 = asList(2);
 
-        termOffSetMap.putTermFrequencyList("dog", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list1)));
-        termOffSetMap.putTermFrequencyList("cat", new TermFrequencyList(Maps.immutableEntry(new Zone("CONTENT", true, eventId), list2)));
+        termOffSetMap.putTermFrequencyList("dog", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list1)));
+        termOffSetMap.putTermFrequencyList("cat", new TermFrequencyList(Map.entry(new Zone("CONTENT", true, eventId), list2)));
 
         termOffSetMap.setGatherPhraseOffsets(true);
-        termOffSetMap.setExcerptFields(Collections.singleton("BODY"));
+        termOffSetMap.setExcerptFields(Set.of("BODY"));
         context.set(Constants.TERM_OFFSET_MAP_JEXL_VARIABLE_NAME, termOffSetMap);
 
         Object o = expr.evaluate(context);
 
-        Assert.assertTrue(expect(o, true));
+        assertTrue(expect(o, true));
         assertPhraseOffsetsEmpty();
     }
@@ -2014,12 +2006,12 @@ public void testNonMatchingExcerptFieldsWithMultipleFieldsPresent() {
         termOffSetMap.putTermFrequencyList("dog", dogList);
         termOffSetMap.putTermFrequencyList("cat", catList);
         termOffSetMap.setGatherPhraseOffsets(true);
-        termOffSetMap.setExcerptFields(Collections.singleton("CONTENT"));
+        termOffSetMap.setExcerptFields(Set.of("CONTENT"));
         context.set(Constants.TERM_OFFSET_MAP_JEXL_VARIABLE_NAME, termOffSetMap);
 
         Object o = expr.evaluate(context);
 
-        Assert.assertTrue(expect(o, true));
+        assertTrue(expect(o, true));
         assertPhraseOffset("CONTENT", 1, 2);
         assertNoPhraseOffsetsFor("BODY");
     }
@@ -2039,9 +2031,9 @@ public void testAdjacentHitsAcrossChildDocuments() {
         termOffsetMap.putTermFrequencyList("fish", new TermFrequencyList(multimap));
 
         // full terms list
-        Assert.assertNotNull(termOffsetMap.getTermFrequencyList("blue"));
-        Assert.assertNotNull(termOffsetMap.getTermFrequencyList("fish"));
+        assertNotNull(termOffsetMap.getTermFrequencyList("blue"));
+        assertNotNull(termOffsetMap.getTermFrequencyList("fish"));
 
         String[] terms = new String[] {"blue", "fish"};
-        Assert.assertEquals(Collections.emptySet(), ContentFunctions.phrase("BODY", termOffsetMap, terms));
+        assertEquals(Set.of(), ContentFunctions.phrase("BODY", termOffsetMap, terms));
     }
 }
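Reviewer note before the next file: ContentOrderedEvaluatorTest's givenExcerptFields(String... fields) now forwards its varargs directly to Set.of(fields). Unlike Sets.newHashSet(fields), which de-duplicated silently, Set.of throws IllegalArgumentException if the same field is ever passed twice, so the helper is stricter than it used to be. A small sketch of the difference (the helper names here are illustrative, not from the test):

    import java.util.Arrays;
    import java.util.HashSet;
    import java.util.Set;

    public class VarargsSetCaveat {
        // Old behavior (Guava-style): duplicates collapse silently.
        static Set<String> lenient(String... fields) {
            return new HashSet<>(Arrays.asList(fields));
        }

        // New behavior: duplicates and nulls are treated as programming errors.
        static Set<String> strict(String... fields) {
            return Set.of(fields); // IllegalArgumentException on duplicates, NPE on nulls
        }

        public static void main(String[] args) {
            System.out.println(lenient("BODY", "BODY")); // [BODY]
            try {
                strict("BODY", "BODY");
            } catch (IllegalArgumentException e) {
                System.out.println("strict rejects duplicates: " + e.getMessage());
            }
        }
    }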
diff --git a/warehouse/query-core/src/test/java/datawave/query/jexl/functions/ContentOrderedEvaluatorTest.java b/warehouse/query-core/src/test/java/datawave/query/jexl/functions/ContentOrderedEvaluatorTest.java
index 96d1855dc1f..37d3d011009 100644
--- a/warehouse/query-core/src/test/java/datawave/query/jexl/functions/ContentOrderedEvaluatorTest.java
+++ b/warehouse/query-core/src/test/java/datawave/query/jexl/functions/ContentOrderedEvaluatorTest.java
@@ -1,20 +1,19 @@
 package datawave.query.jexl.functions;
 
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
 import java.util.ArrayList;
 import java.util.Collection;
 import java.util.List;
 import java.util.Set;
 
-import org.javatuples.Pair;
-import org.javatuples.Triplet;
 import org.junit.After;
-import org.junit.Assert;
 import org.junit.Before;
 import org.junit.Test;
 
-import com.google.common.collect.Sets;
-
 import datawave.ingest.protobuf.TermWeightPosition;
+import datawave.query.postprocessing.tf.PhraseOffset;
 import datawave.query.postprocessing.tf.TermOffsetMap;
 
 public class ContentOrderedEvaluatorTest {
@@ -364,7 +363,7 @@ private void givenGatherPhraseOffsets(boolean gatherPhraseOffsets) {
     }
 
     private void givenExcerptFields(String... fields) {
-        termOffsetMap.setExcerptFields(Sets.newHashSet(fields));
+        termOffsetMap.setExcerptFields(Set.of(fields));
     }
 
     private void initEvaluator() {
@@ -372,20 +371,19 @@ private void initEvaluator() {
     }
 
     private void assertEvaluate(boolean expected) {
-        Assert.assertEquals("Expected evaluate() to return " + expected, expected, evaluator.evaluate(field, eventId, offsets));
+        assertEquals("Expected evaluate() to return " + expected, expected, evaluator.evaluate(field, eventId, offsets));
     }
 
     private void assertPhraseOffsetsContain(String field, int startOffset, int endOffset) {
-        Collection<Triplet<String,Integer,Integer>> phraseOffsets = termOffsetMap.getPhraseIndexes(field);
+        Collection<PhraseOffset> phraseOffsets = termOffsetMap.getPhraseIndexes(field);
         boolean found = phraseOffsets.stream()
-                        .anyMatch((pair) -> pair.getValue0().equals(eventId) && pair.getValue1().equals(startOffset) && pair.getValue2().equals(endOffset));
-        Assert.assertTrue(
-                        "Expected phrase offset [" + startOffset + ", " + endOffset + "] for field " + field + " and eventId " + eventId.replace('\u0000', '/'),
+                        .anyMatch(pair -> pair.getEventId().equals(eventId) && pair.getStartOffset() == startOffset && pair.getEndOffset() == endOffset);
+        assertTrue("Expected phrase offset [" + startOffset + ", " + endOffset + "] for field " + field + " and eventId " + eventId.replace('\u0000', '/'),
                         found);
     }
 
     private void assertPhraseOffsetsEmpty() {
-        Assert.assertTrue("Expected empty phrase offset map", termOffsetMap.getPhraseIndexes() == null || termOffsetMap.getPhraseIndexes().isEmpty());
+        assertTrue("Expected empty phrase offset map", termOffsetMap.getPhraseIndexes() == null || termOffsetMap.getPhraseIndexes().isEmpty());
     }
 
     private static class WrappedContentOrderedEvaluator extends ContentOrderedEvaluator {
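Reviewer note on the PhraseOffset type used above and in the next file: the javatuples Triplet<String,Integer,Integer> is replaced with a purpose-built value class. Its definition is not part of this diff; from the call sites alone (PhraseOffset.with(eventId, start, end), getEventId(), getStartOffset(), getEndOffset(), and value-based assertEquals comparisons), a minimal compatible sketch would look like the following. Field names, the private constructor, and the hashCode strategy are assumptions for illustration, not the actual implementation:

    import java.util.Objects;

    // Hypothetical reconstruction of datawave.query.postprocessing.tf.PhraseOffset,
    // inferred solely from how the tests in this diff use it.
    public final class PhraseOffset {
        private final String eventId;
        private final int startOffset;
        private final int endOffset;

        private PhraseOffset(String eventId, int startOffset, int endOffset) {
            this.eventId = eventId;
            this.startOffset = startOffset;
            this.endOffset = endOffset;
        }

        public static PhraseOffset with(String eventId, int startOffset, int endOffset) {
            return new PhraseOffset(eventId, startOffset, endOffset);
        }

        public String getEventId() {
            return eventId;
        }

        public int getStartOffset() {
            return startOffset;
        }

        public int getEndOffset() {
            return endOffset;
        }

        // equals/hashCode must be value-based for the assertEquals(PhraseOffset.with(...), actual)
        // assertions in PhraseIndexesTest to pass.
        @Override
        public boolean equals(Object o) {
            if (this == o) {
                return true;
            }
            if (!(o instanceof PhraseOffset)) {
                return false;
            }
            PhraseOffset that = (PhraseOffset) o;
            return startOffset == that.startOffset && endOffset == that.endOffset && Objects.equals(eventId, that.eventId);
        }

        @Override
        public int hashCode() {
            return Objects.hash(eventId, startOffset, endOffset);
        }
    }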
@@ -157,7 +156,7 @@ public void testGetOverlappingPosition() {
         // position 3,4 (4 prevSkip 1) overlaps 1,3 phrase
         assertEquals(new TermWeightPosition.Builder().setOffset(4).setPrevSkips(1).build(), actual.getOverlappingPosition("BODY", EVENT_ID_1, twInfo));
         // no overlaps with 10,11 phrase
-        assertEquals(null, actual.getOverlappingPosition("BODY", EVENT_ID_2, twInfo));
+        assertNull(actual.getOverlappingPosition("BODY", EVENT_ID_2, twInfo));
         // position 3,4 (4 prevSkip 1) overlaps 3,4 phrase
         assertEquals(new TermWeightPosition.Builder().setOffset(4).setPrevSkips(1).build(), actual.getOverlappingPosition("CONTENT", EVENT_ID_1, twInfo));
         // position 18,21 (21 prevSkip 3) overlaps 19,20 phrase
@@ -195,7 +194,7 @@ public void testGetOverlappingPositionBruteForce() {
         // position 3,4 (4 prevSkip 1) overlaps 1,3 phrase
         assertEquals(new TermWeightPosition.Builder().setOffset(4).setPrevSkips(1).build(), actual.getOverlappingPosition("BODY", EVENT_ID_1, twInfo));
         // no overlaps with 10,11 phrase
-        assertEquals(null, actual.getOverlappingPosition("BODY", EVENT_ID_2, twInfo));
+        assertNull(actual.getOverlappingPosition("BODY", EVENT_ID_2, twInfo));
         // position 3,4 (4 prevSkip 1) overlaps 3,4 phrase
         assertEquals(new TermWeightPosition.Builder().setOffset(4).setPrevSkips(1).build(), actual.getOverlappingPosition("CONTENT", EVENT_ID_1, twInfo));
         // position 18,21 (21 prevSkip 3) overlaps 19,20 phrase
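Note: the inline comments above ("position 3,4 (4 prevSkip 1) overlaps 1,3 phrase") all follow one rule: a term position with offset N and prevSkips S may occupy any index in the closed interval [N - S, N], and it overlaps a phrase [start, end] exactly when the two intervals intersect. A hedged restatement of that predicate (illustrative names, not the PhraseIndexes API):

    // A position with offset N and prevSkips S can sit anywhere in [N - S, N];
    // it overlaps the phrase [phraseStart, phraseEnd] when the closed intervals intersect.
    static boolean overlapsPhrase(int offset, int prevSkips, int phraseStart, int phraseEnd) {
        int lowestPossible = offset - prevSkips;
        return lowestPossible <= phraseEnd && offset >= phraseStart;
    }

For example, offset 4 with prevSkips 1 spans [3, 4] and intersects the [1, 3] phrase, while offset 4 with no skips spans [4, 4] and does not, which is why the earlier testOverlap expects a hit in the first case and null in the second.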
the lazy dog"); + givenMatchingPhrase("BODY", 8, 16, "the quick brown fox jumped over the lazy dog", List.of("word")); Capture capturedArg = Capture.newInstance(); document.put(eq(ExcerptTransform.HIT_EXCERPT), and(capture(capturedArg), isA(Attributes.class))); @@ -175,7 +179,7 @@ public void testExcerptOverlapped() throws IOException { givenMatchingTermFrequencies("BODY", new int[][] {{1, 2}, {2, 3}, {9, 10}, {20, 21}}, "quick brown"); // note that the start is relative to index 9 (i.e. 9-2=7) because the overlapping term starts at 9 // end offset is inclusive - givenMatchingPhrase("BODY", 7, 16, "and the quick brown fox jumped over the lazy dog"); + givenMatchingPhrase("BODY", 7, 16, "and the [quick] [brown] fox jumped over the lazy dog", List.of("quick", "brown")); Capture capturedArg = Capture.newInstance(); document.put(eq(ExcerptTransform.HIT_EXCERPT), and(capture(capturedArg), isA(Attributes.class))); @@ -211,10 +215,10 @@ public void testExcerptOverlappedAndPhraseOverlapped() throws IOException { givenMockDocumentWithHitTerm("BODY", "quick brown"); givenMatchingTermFrequencies("BODY", new int[][] {{1, 2}, {2, 3}, {9, 10}, {20, 21}}, "quick brown"); // end offset is inclusive - givenMatchingPhrase("BODY", 23, 28, "Jack and Jill jumped over the"); + givenMatchingPhrase("BODY", 23, 28, "Jack and Jill jumped over the", List.of("quick", "brown")); // note that the start is relative to overlapping term index 9 (i.e. 9-2=7) because the overlapping term starts at 9 // AND then we combined the phrase from 2 to 7 with the one from 7 to 16 - givenMatchingPhrase("BODY", 2, 16, "the brown chicken layed an egg and the quick brown fox jumped over the lazy dog"); + givenMatchingPhrase("BODY", 2, 16, "the [brown] chicken layed an egg and the [quick] [brown] fox jumped over the lazy dog", List.of("quick", "brown")); Capture capturedArg = Capture.newInstance(); document.put(eq(ExcerptTransform.HIT_EXCERPT), and(capture(capturedArg), isA(Attributes.class))); @@ -244,7 +248,7 @@ public void testOffsetGreaterThanStartIndex() throws IOException { givenMockDocument(); // end offset is inclusive - givenMatchingPhrase("CONTENT", 0, 7, "the quick brown fox jumped over the lazy dog"); + givenMatchingPhrase("CONTENT", 0, 7, "the quick brown fox jumped over the lazy dog", Collections.emptyList()); Capture capturedArg = Capture.newInstance(); document.put(eq(ExcerptTransform.HIT_EXCERPT), and(capture(capturedArg), isA(Attributes.class))); @@ -273,7 +277,7 @@ public void testEmptyPhraseIndexes() throws IOException { givenMockDocumentWithHitTerm("BODY", "word"); givenMatchingTermFrequencies("BODY", new int[][] {{24, 24}}, "word"); // end offset is inclusive - givenMatchingPhrase("BODY", 22, 26, "and the word from bird"); + givenMatchingPhrase("BODY", 22, 26, "and the [word] from bird", List.of("word")); Capture capturedArg = Capture.newInstance(); document.put(eq(ExcerptTransform.HIT_EXCERPT), and(capture(capturedArg), isA(Attributes.class))); @@ -304,8 +308,8 @@ private void givenMockDocument() { expect(document.isToKeep()).andReturn(true); expect(document.containsKey(ExcerptTransform.PHRASE_INDEXES_ATTRIBUTE)).andReturn(true); - @SuppressWarnings("rawtypes") Key metadata = new Key("Row", "cf", "cq"); + @SuppressWarnings("rawtypes") Attribute phraseIndexAttribute = new Content(phraseIndexes.toString(), metadata, false); // noinspection unchecked expect(document.get(ExcerptTransform.PHRASE_INDEXES_ATTRIBUTE)).andReturn(phraseIndexAttribute); @@ -315,12 +319,12 @@ private void givenMockDocument() { 
@@ -304,8 +308,8 @@ private void givenMockDocument() {
         expect(document.isToKeep()).andReturn(true);
         expect(document.containsKey(ExcerptTransform.PHRASE_INDEXES_ATTRIBUTE)).andReturn(true);
-        @SuppressWarnings("rawtypes")
         Key metadata = new Key("Row", "cf", "cq");
+        @SuppressWarnings("rawtypes")
         Attribute phraseIndexAttribute = new Content(phraseIndexes.toString(), metadata, false);
         // noinspection unchecked
         expect(document.get(ExcerptTransform.PHRASE_INDEXES_ATTRIBUTE)).andReturn(phraseIndexAttribute);
@@ -315,12 +319,12 @@ private void givenMockDocument() {
         givenDocument(document);
     }
 
+    @SuppressWarnings("rawtypes")
     private void givenMockDocumentWithHitTerm(String field, String value) {
         Document document = mock(Document.class);
         expect(document.isToKeep()).andReturn(true);
         expect(document.containsKey(ExcerptTransform.PHRASE_INDEXES_ATTRIBUTE)).andReturn(true);
-        @SuppressWarnings("rawtypes")
         Key metadata = new Key("shard", "dt\u0000uid");
         Attribute phraseIndexAttribute = new Content(phraseIndexes.toString(), metadata, false);
         // noinspection unchecked
@@ -351,17 +355,17 @@ private void givenPhraseIndex(String field, int start, int end) {
         phraseIndexes.addIndexTriplet(field, EVENT_ID, start, end);
     }
 
-    private void givenHitTerm(String field, String value) {
-
-    }
-
-    private void givenMatchingPhrase(String field, int start, int end, String phrase) throws IOException {
+    private void givenMatchingPhrase(String field, int start, int end, String phrase, List<String> hitTerms) throws IOException {
         Map<String,String> options = getOptions(field, start, end);
         iterator.init(source, options, env);
+        iterator.setHitTermsList(hitTerms);
+        iterator.setDirection("BOTH");
+        iterator.setOrigHalfSize((float) ((end + 1) - start) / 2);
         iterator.seek(anyObject(), anyObject(), eq(false));
         if (phrase != null) {
             expect(iterator.hasTop()).andReturn(true);
-            Key key = new Key(new Text("row"), new Text("cf"), new Text(field + Constants.NULL + phrase));
+            Key key = new Key(new Text("row"), new Text("cf"),
+                            new Text(field + Constants.NULL + "XXXNOTSCOREDXXX" + Constants.NULL + phrase + Constants.NULL + "XXXNOTSCOREDXXX"));
             expect(iterator.getTopKey()).andReturn(key);
         } else {
             expect(iterator.hasTop()).andReturn(false);
@@ -385,10 +389,10 @@ private void givenMatchingTermFrequencies(String field, int[][] offsets, String
 
     private Map<String,String> getOptions(String field, int start, int end) {
         Map<String,String> options = new HashMap<>();
-        options.put(TermFrequencyExcerptIterator.FIELD_NAME, field);
-        options.put(TermFrequencyExcerptIterator.START_OFFSET, String.valueOf(start));
+        options.put(FIELD_NAME, field);
+        options.put(START_OFFSET, String.valueOf(start));
         // for the options, the end offset is exclusive so add 1
-        options.put(TermFrequencyExcerptIterator.END_OFFSET, String.valueOf(end + 1));
+        options.put(END_OFFSET, String.valueOf(end + 1));
         return options;
     }
 }
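Note: the mocked top key in givenMatchingPhrase reflects the iterator's new column-qualifier layout, four null-separated parts in which "XXXNOTSCOREDXXX" appears to stand in for an absent score. Under that assumed layout, unpacking the excerpt looks like this (hypothetical helper, not part of the PR):

    // Assumes the four-part layout mocked above:
    //   field \0 XXXNOTSCOREDXXX \0 phrase \0 XXXNOTSCOREDXXX
    // where \0 is Constants.NULL and XXXNOTSCOREDXXX fills the score slots.
    final class ExcerptQualifier {
        private static final String NULL_BYTE = "\u0000";

        static String phraseOf(String columnQualifier) {
            String[] parts = columnQualifier.split(NULL_BYTE);
            // parts[0] = field, parts[2] = excerpt phrase, parts[1]/parts[3] = score slots
            return parts.length == 4 ? parts[2] : columnQualifier;
        }
    }

For the key built above, phraseOf("BODY\u0000XXXNOTSCOREDXXX\u0000and the [word] from bird\u0000XXXNOTSCOREDXXX") returns the excerpt "and the [word] from bird".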