diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS deleted file mode 100644 index b3f21c9a43a..00000000000 --- a/.github/CODEOWNERS +++ /dev/null @@ -1,8 +0,0 @@ -# In order to ensure the query microservices are consistent with the wildfly webservice, we need -# to ensure that changes made to QueryExecutorBean match QueryManagementService, and changes made -# to LookupUUIDUtil match LookupService in the Query Service. -QueryExecutorBean.java @jwomeara @ivakegg -LookupUUIDUtil.java @jwomeara @ivakegg -RunningQuery.java @jwomeara @ivakegg -/core/ @jwomeara @ivakegg -/warehouse/query-core/ @jwomeara @ivakegg \ No newline at end of file diff --git a/.github/workflows/build-accumulo.yml b/.github/workflows/build-accumulo.yml new file mode 100644 index 00000000000..c1e15ef3d43 --- /dev/null +++ b/.github/workflows/build-accumulo.yml @@ -0,0 +1,99 @@ +name: Build Accumulo snapshot and update DataWave to use it + +on: + workflow_dispatch: + inputs: + accumuloBranch: + required: true + default: "2.1" + description: "Branch name to build. Will be used as the image tag." + accumuloRepo: + required: true + default: "apache/accumulo" + description: "Accumulo repo to use. Expected to be on GitHub. Example: apache/accumulo" + deployAccumulo: + required: true + default: "false" + description: "Set to false if this Accumulo version has already been pushed to GitHub Packages" + +# Defines custom environment variables for the workflow, including the container registry domain and the name of the Docker image that this workflow builds. +env: + REGISTRY: ghcr.io + IMAGE_NAME: ${{ github.repository.lowercase }} + ACCUMULO_JAVA_VERSION: '17' + DATAWAVE_JAVA_VERSION: '11' + JAVA_DISTRIBUTION: 'zulu' # 'zulu' was the default distribution on v1 of the setup-java action for Java 1.8 + USER_NAME: ${{ secrets.GHCR_WRITE_USER_NAME }} + ACCESS_TOKEN: ${{ secrets.GHCR_WRITE_ACCESS_TOKEN }} + +jobs: + build-and-deploy-accumulo: + runs-on: ubuntu-latest + # Sets the permissions granted to the `GITHUB_TOKEN` for the actions in this job. 
+ permissions: + contents: read + packages: write + # + steps: + - name: Checkout DataWave + uses: actions/checkout@v4 + with: + path: datawave + + - name: Checkout Accumulo + uses: actions/checkout@v4 + id: accumuloCheckout + with: + repository: ${{ github.event.inputs.accumuloRepo }} + path: accumulo + ref: ${{ github.event.inputs.accumuloBranch }} + + - name: Set up JDK ${{env.ACCUMULO_JAVA_VERSION}} + uses: actions/setup-java@v4 + with: + distribution: ${{env.JAVA_DISTRIBUTION}} + java-version: ${{env.ACCUMULO_JAVA_VERSION}} + cache: 'maven' + - run: echo "ACCUMULO_JAVA=$JAVA_HOME" >> $GITHUB_ENV + - name: Set up JDK ${{env.DATAWAVE_JAVA_VERSION}} + uses: actions/setup-java@v4 + with: + distribution: ${{env.JAVA_DISTRIBUTION}} + java-version: ${{env.DATAWAVE_JAVA_VERSION}} + cache: 'maven' + - run: echo "DATAWAVE_JAVA=$JAVA_HOME" >> $GITHUB_ENV + + - name: Get Accumulo Version + id: get-accumulo-version + run: | + export JAVA_HOME="$ACCUMULO_JAVA" + cd "$GITHUB_WORKSPACE/accumulo" + mvn build-helper:parse-version versions:set -DgenerateBackupPoms=false -DnewVersion=\${parsedVersion.majorVersion}.\${parsedVersion.minorVersion}.\${parsedVersion.incrementalVersion}-dwv-$(git rev-parse --short HEAD) + export newVersion=$(mvn -q help:evaluate -DforceStdout -Dexpression=project.version) + echo accumuloVersion=$newVersion >> $GITHUB_OUTPUT + - name: Deploy Accumulo + if: ${{ github.event.inputs.deployAccumulo == 'true'}} + run: | + export JAVA_HOME="$ACCUMULO_JAVA" + cd "$GITHUB_WORKSPACE/accumulo" + mvn -DaltDeploymentRepository=github-datawave::https://maven.pkg.github.com/NationalSecurityAgency/datawave -V -B -e -ntp "-Dstyle.color=always" -DskipTests -T1C clean source:jar deploy -s "$GITHUB_WORKSPACE/datawave/.github/workflows/settings.xml" + - name: Log in to the Container registry + uses: docker/login-action@65b78e6e13532edd9afa3aa52ac7964289d1a9c1 + with: + registry: ${{ env.REGISTRY }} + username: ${{ env.USER_NAME }} + password: ${{ env.ACCESS_TOKEN }} + + - name: Update DataWave Dependency Version + run: | + + export JAVA_HOME="$DATAWAVE_JAVA" + cd "$GITHUB_WORKSPACE/datawave" + mvn -s "$GITHUB_WORKSPACE/datawave/.github/workflows/settings.xml" versions:set-property -Dproperty=version.accumulo -DnewVersion=${{ steps.get-accumulo-version.outputs.accumuloVersion }} -DgenerateBackupPoms=false + - name: Build Web and Ingest Docker Images (Maven) + run: | + export JAVA_HOME="$DATAWAVE_JAVA" + cd "$GITHUB_WORKSPACE/datawave" + mvn -s "$GITHUB_WORKSPACE/datawave/.github/workflows/settings.xml" clean install -Prpm,kubernetes,assemble,deploy-ws -Ddist -Pdocker -DpushImage -Ddocker-release -DskipTests -Ddocker.image.accumulo.tag=${{ steps.get-accumulo-version.outputs.accumuloVersion }} + + diff --git a/.github/workflows/microservice-build-image.yaml b/.github/workflows/microservice-build-image.yaml new file mode 100644 index 00000000000..fe9e199ce50 --- /dev/null +++ b/.github/workflows/microservice-build-image.yaml @@ -0,0 +1,41 @@ +# +name: Create and publish a Docker image + +on: + workflow_call: + secrets: + USER_NAME: + description: "User Name for maven pulls" + required: true + ACCESS_TOKEN: + description: "Access token for maven pulls" + required: true + + +jobs: + build-and-push-datawave-images: + runs-on: ubuntu-latest + steps: + - name: Log in to the Container registry + uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ secrets.USER_NAME }} + password: ${{ secrets.ACCESS_TOKEN }} + - name: Checkout Code + uses: actions/checkout@v4 + - name: Set up JDK 11 + 
uses: actions/setup-java@v4 + with: + distribution: "zulu" + java-version: 11 + cache: 'maven' + - name: Build And Push Docker Image (Maven) + env: + MAVEN_OPTS: "-Dhttps.protocols=TLSv1.2 -Dorg.slf4j.simpleLogger.log.org.apache.maven.cli.transfer.Slf4jMavenTransferListener=WARN -Djava.awt.headless=true" + USER_NAME: ${{ secrets.USER_NAME }} + ACCESS_TOKEN: ${{ secrets.ACCESS_TOKEN }} + run: | + mvn -s $GITHUB_WORKSPACE/.github/workflows/settings.xml -V -B -e clean install -Pdocker,exec -Ddocker.image.prefix=ghcr.io/nationalsecurityagency/ -DpushImage + + diff --git a/.github/workflows/microservice-maven-tests.yaml b/.github/workflows/microservice-maven-tests.yaml new file mode 100644 index 00000000000..4f20d49c3b8 --- /dev/null +++ b/.github/workflows/microservice-maven-tests.yaml @@ -0,0 +1,72 @@ +name: Tests + +on: + workflow_call: + secrets: + USER_NAME: + description: "User Name for maven pulls" + required: true + ACCESS_TOKEN: + description: "Access token for maven pulls" + required: true + +env: + MAVEN_OPTS: "-Djansi.force=true -Dhttps.protocols=TLSv1.2 -Dorg.slf4j.simpleLogger.log.org.apache.maven.cli.transfer.Slf4jMavenTransferListener=WARN -Djava.awt.headless=true -XX:ThreadStackSize=1m" + +jobs: + # Runs the pom sorter and code formatter to ensure that the code + # is formatted and poms are sorted according to project rules. This + # will fail if the formatter makes any changes. + check-code-formatting: + runs-on: ubuntu-latest + steps: + - name: Checkout Code + uses: actions/checkout@v4 + - name: Set up JDK 11 + uses: actions/setup-java@v4 + with: + java-version: 11 + distribution: 'zulu' + - uses: actions/cache@v4 + with: + path: ~/.m2/repository + key: ${{ runner.os }}-maven-format-${{ hashFiles('**/pom.xml') }} + restore-keys: | + ${{ runner.os }}-maven-format- + ${{ runner.os }}-maven- + - name: Format code + run: | + mvn -s $GITHUB_WORKSPACE/.github/workflows/settings.xml -V -B -e clean formatter:format sortpom:sort -Pautoformat + git status + git diff-index --quiet HEAD || (echo "Error! There are modified files after formatting." && false) + env: + MAVEN_OPTS: "-Dhttps.protocols=TLSv1.2 -Dorg.slf4j.simpleLogger.log.org.apache.maven.cli.transfer.Slf4jMavenTransferListener=WARN -Djava.awt.headless=true" + USER_NAME: ${{ secrets.USER_NAME }} + ACCESS_TOKEN: ${{ secrets.ACCESS_TOKEN }} + + # Build the code and run the unit/integration tests. 
+ build-and-test: + runs-on: ubuntu-latest + steps: + - name: Checkout Code + uses: actions/checkout@v4 + - name: Set up JDK 11 + uses: actions/setup-java@v4 + with: + java-version: 11 + distribution: 'zulu' + - uses: actions/cache@v4 + with: + path: ~/.m2/repository + key: ${{ runner.os }}-maven-build-${{ hashFiles('**/pom.xml') }} + restore-keys: | + ${{ runner.os }}-maven-build- + ${{ runner.os }}-maven-format- + ${{ runner.os }}-maven- + - name: Build and Run Unit Tests + run: mvn -s $GITHUB_WORKSPACE/.github/workflows/settings.xml -V -B -e -Ddist clean verify + env: + MAVEN_OPTS: "-Dhttps.protocols=TLSv1.2 -Dorg.slf4j.simpleLogger.log.org.apache.maven.cli.transfer.Slf4jMavenTransferListener=WARN -Djava.awt.headless=true" + USER_NAME: ${{ secrets.USER_NAME }} + ACCESS_TOKEN: ${{ secrets.ACCESS_TOKEN }} + diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index cc57f9c1162..e60d6c94522 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -7,7 +7,7 @@ on: - 'integration' - 'release/version*' pull_request: - paths-ignore: ['*.md', 'CODEOWNERS', 'LICENSE'] + paths-ignore: ['*.md', 'CODEOWNERS', 'LICENSE', '.github/workflows/microservice*.yaml'] workflow_dispatch: env: diff --git a/.gitmodules b/.gitmodules index 656c2d211b8..26ad0ff918c 100644 --- a/.gitmodules +++ b/.gitmodules @@ -85,3 +85,6 @@ [submodule "microservices/services/map"] path = microservices/services/map url = git@github.com:NationalSecurityAgency/datawave-map-service.git +[submodule "microservices/services/file-provider"] + path = microservices/services/file-provider + url = git@github.com:NationalSecurityAgency/datawave-file-provider-service.git diff --git a/common-test/pom.xml b/common-test/pom.xml index 45286e13c4a..f2e0976259c 100644 --- a/common-test/pom.xml +++ b/common-test/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave datawave-parent - 7.8.0-SNAPSHOT + 7.10.0-SNAPSHOT datawave-common-test ${project.artifactId} diff --git a/contrib/datawave-quickstart/bin/query.sh b/contrib/datawave-quickstart/bin/query.sh index ced895a3d66..d49aa64b4f1 100644 --- a/contrib/datawave-quickstart/bin/query.sh +++ b/contrib/datawave-quickstart/bin/query.sh @@ -131,21 +131,8 @@ function setQueryIdFromResponse() { } function prettyPrintJson() { - local PY=$( which python ) - if [ -n "${PY}" ] ; then - echo "${1}" | ${PY} -c 'from __future__ import print_function;import sys,json;data=json.loads(sys.stdin.read()); print(json.dumps(data, indent=2, sort_keys=True))' - local exitStatus=$? - echo - if [ "${exitStatus}" != "0" ] ; then - printRawResponse "${1}" - warn "Python encountered error. Printed response without formatting" - echo - fi - else - printRawResponse "${1}" - warn "Couldn't find python in your environment. Json response was printed without formatting" - echo - fi + PY_CMD='from __future__ import print_function; import sys,json; data=json.loads(sys.stdin.read()); print(json.dumps(data, indent=2, sort_keys=True))' + echo "${1}" | ( python3 -c "${PY_CMD}" 2>/dev/null || python2 -c "${PY_CMD}" 2>/dev/null ) || ( warn "Python encountered error. 
Printed response without formatting" && printRawResponse "${1}" ) } function printRawResponse() { diff --git a/contrib/datawave-quickstart/bin/services/datawave/ingest-examples/tvmaze-api-query.sh b/contrib/datawave-quickstart/bin/services/datawave/ingest-examples/tvmaze-api-query.sh index af1da19187e..69cf2d47fa5 100755 --- a/contrib/datawave-quickstart/bin/services/datawave/ingest-examples/tvmaze-api-query.sh +++ b/contrib/datawave-quickstart/bin/services/datawave/ingest-examples/tvmaze-api-query.sh @@ -38,10 +38,11 @@ TVMAZE_RESPONSE_STATUS=$( echo ${CURL_RESPONSE} | tr -d '\n' | sed -e 's/.*HTTP_ [ "${TVMAZE_RESPONSE_STATUS}" != "200" ] && error "api.tvmaze.com returned invalid response status: ${TVMAZE_RESPONSE_STATUS}" && exit 1 [ -z "${TVMAZE_RESPONSE_BODY}" ] && error "Response body is empty!" && exit 1 +PY_CMD='from __future__ import print_function; import sys,json; data=json.loads(sys.stdin.read()); print(json.dumps(data, indent=2, sort_keys=True))' if [ "${PRETTY}" == true ] ; then - echo "${TVMAZE_RESPONSE_BODY}" | python -c 'from __future__ import print_function;import sys,json;data=json.loads(sys.stdin.read()); print(json.dumps(data, indent=2, sort_keys=True))' + echo "${TVMAZE_RESPONSE_BODY}" | ( python3 -c "${PY_CMD}" 2>/dev/null || python2 -c "${PY_CMD}" 2>/dev/null ) || ( warn "Unable to pretty print, Python not detected" && echo "${TVMAZE_RESPONSE_BODY}" ) else - echo "${TVMAZE_RESPONSE_BODY}" + echo "${TVMAZE_RESPONSE_BODY}" fi exit 0 \ No newline at end of file diff --git a/contrib/datawave-quickstart/docker/Dockerfile b/contrib/datawave-quickstart/docker/Dockerfile index 3d2e2bd238f..7396b0c7527 100644 --- a/contrib/datawave-quickstart/docker/Dockerfile +++ b/contrib/datawave-quickstart/docker/Dockerfile @@ -35,7 +35,7 @@ COPY . /opt/datawave # Install dependencies, configure password-less/zero-prompt SSH... 
-RUN dnf -y install gcc-c++ openssl openssh openssh-server openssh-clients openssl-libs which bc wget git java-11-openjdk-devel iproute && \ +RUN dnf -y install gcc-c++ openssl python3 openssh openssh-server openssh-clients openssl-libs which bc wget git java-11-openjdk-devel iproute && \ dnf clean all && \ ssh-keygen -q -N "" -t rsa -f ~/.ssh/id_rsa && \ cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys && \ diff --git a/contrib/datawave-quickstart/docker/pom.xml b/contrib/datawave-quickstart/docker/pom.xml index 9086f4cad48..024cfe92b6a 100644 --- a/contrib/datawave-quickstart/docker/pom.xml +++ b/contrib/datawave-quickstart/docker/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave datawave-parent - 7.8.0-SNAPSHOT + 7.10.0-SNAPSHOT ../../../pom.xml quickstart diff --git a/core/base-rest-responses b/core/base-rest-responses index 946206bf649..bf7c012b0bb 160000 --- a/core/base-rest-responses +++ b/core/base-rest-responses @@ -1 +1 @@ -Subproject commit 946206bf649dbf6b448d13136af0151dc740b4dc +Subproject commit bf7c012b0bb2c4445f9afd7e220a6ac23fd29b48 diff --git a/core/cached-results/pom.xml b/core/cached-results/pom.xml index 9e222278737..5da8ce95ed6 100644 --- a/core/cached-results/pom.xml +++ b/core/cached-results/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave.core datawave-core-parent - 7.8.0-SNAPSHOT + 7.10.0-SNAPSHOT datawave-core-cached-results ${project.artifactId} diff --git a/core/common-util/pom.xml b/core/common-util/pom.xml index d755fbf95a8..42778a71bb4 100644 --- a/core/common-util/pom.xml +++ b/core/common-util/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave.core datawave-core-parent - 7.8.0-SNAPSHOT + 7.10.0-SNAPSHOT datawave-core-common-util ${project.artifactId} diff --git a/core/common/pom.xml b/core/common/pom.xml index 1cd1efcfd05..a173a5f7dd5 100644 --- a/core/common/pom.xml +++ b/core/common/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave.core datawave-core-parent - 7.8.0-SNAPSHOT + 7.10.0-SNAPSHOT datawave-core-common ${project.artifactId} diff --git a/core/connection-pool/pom.xml b/core/connection-pool/pom.xml index 908e4bc6f48..012c5917e52 100644 --- a/core/connection-pool/pom.xml +++ b/core/connection-pool/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave.core datawave-core-parent - 7.8.0-SNAPSHOT + 7.10.0-SNAPSHOT datawave-core-connection-pool ${project.artifactId} diff --git a/core/in-memory-accumulo b/core/in-memory-accumulo index 4ed63595496..8a9d2f46d20 160000 --- a/core/in-memory-accumulo +++ b/core/in-memory-accumulo @@ -1 +1 @@ -Subproject commit 4ed63595496d3bbdce661f951385dc173bbdbd3b +Subproject commit 8a9d2f46d2012d4493baff5e8dc9f08f45f746d5 diff --git a/core/map-reduce/pom.xml b/core/map-reduce/pom.xml index 746f7a9658c..7b434a73eff 100644 --- a/core/map-reduce/pom.xml +++ b/core/map-reduce/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave.core datawave-core-parent - 7.8.0-SNAPSHOT + 7.10.0-SNAPSHOT datawave-core-map-reduce ${project.artifactId} diff --git a/core/metrics-reporter b/core/metrics-reporter index 78b4f3e624d..992378d6294 160000 --- a/core/metrics-reporter +++ b/core/metrics-reporter @@ -1 +1 @@ -Subproject commit 78b4f3e624d1a72968564b32b985fb5a4cf578ab +Subproject commit 992378d62946730d2ee799606276adca9522e050 diff --git a/core/modification/pom.xml b/core/modification/pom.xml index 9e3b89657e9..436f684cedc 100644 --- a/core/modification/pom.xml +++ b/core/modification/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave.core datawave-core-parent - 7.8.0-SNAPSHOT + 7.10.0-SNAPSHOT datawave-core-modification ${project.artifactId} diff --git a/core/pom.xml b/core/pom.xml index 
e095c990aa8..b036b622de6 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave datawave-parent - 7.8.0-SNAPSHOT + 7.10.0-SNAPSHOT gov.nsa.datawave.core datawave-core-parent diff --git a/core/query/pom.xml b/core/query/pom.xml index 2ed9f69891f..d5faab14ec2 100644 --- a/core/query/pom.xml +++ b/core/query/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave.core datawave-core-parent - 7.8.0-SNAPSHOT + 7.10.0-SNAPSHOT datawave-core-query ${project.artifactId} diff --git a/core/query/src/main/java/datawave/core/query/configuration/GenericQueryConfiguration.java b/core/query/src/main/java/datawave/core/query/configuration/GenericQueryConfiguration.java index deadd498218..9368856f9fb 100644 --- a/core/query/src/main/java/datawave/core/query/configuration/GenericQueryConfiguration.java +++ b/core/query/src/main/java/datawave/core/query/configuration/GenericQueryConfiguration.java @@ -77,6 +77,9 @@ public class GenericQueryConfiguration implements Serializable { // either IMMEDIATE or EVENTUAL private Map tableConsistencyLevels = new HashMap<>(); + // provides default scan hints + // NOTE: accumulo reserves the execution hint name 'meta' + // NOTE: datawave reserves the execution hint name 'expansion' for index expansion private Map> tableHints = new HashMap<>(); /** diff --git a/core/query/src/main/java/datawave/core/query/logic/composite/CompositeQueryLogic.java b/core/query/src/main/java/datawave/core/query/logic/composite/CompositeQueryLogic.java index 950cddef817..e4a6c022328 100644 --- a/core/query/src/main/java/datawave/core/query/logic/composite/CompositeQueryLogic.java +++ b/core/query/src/main/java/datawave/core/query/logic/composite/CompositeQueryLogic.java @@ -271,13 +271,14 @@ public GenericQueryConfiguration initialize(AccumuloClient client, Query setting // duplicate the settings for this query Query settingsCopy = settings.duplicate(settings.getQueryName() + " -> " + logicName); + // ensure we use the same query id + settingsCopy.setId(settings.getId()); + // update the query auths and runtime query authorizations for this logic runtimeQueryAuthorizations = updateRuntimeAuthorizationsAndQueryAuths(logic, settingsCopy); config = logic.initialize(client, settingsCopy, runtimeQueryAuthorizations); - logicQueryStringBuilder.append(" && ").append("( queryId = '").append(settingsCopy.getId()).append("' )"); - // only add this query logic to the initialized logic states if it was not simply filtered out if (logic instanceof FilteredQueryLogic && ((FilteredQueryLogic) logic).isFiltered()) { log.info("Dropping " + logic.getLogicName() + " as it was filtered out"); @@ -440,7 +441,8 @@ public synchronized QueryLogicTransformer getTransformer(Query settings) { responseClass = refResponse.getClass(); } else { if (!responseClass.equals(refResponse.getClass())) { - throw new RuntimeException("All query logics must use transformers that return the same object type"); + throw new RuntimeException("All query logics must use transformers that return the same object type: " + responseClass + " vs " + + refResponse.getClass()); } } } diff --git a/core/utils/accumulo-utils b/core/utils/accumulo-utils index 11583fbfede..638b3eda970 160000 --- a/core/utils/accumulo-utils +++ b/core/utils/accumulo-utils @@ -1 +1 @@ -Subproject commit 11583fbfedebf3b4e4f765d5f8cd6225ab0e24fd +Subproject commit 638b3eda97016bb66a7d014112b215075aac212e diff --git a/core/utils/common-utils b/core/utils/common-utils index 2810ed2bdd7..b39469f438e 160000 --- a/core/utils/common-utils +++ b/core/utils/common-utils @@ 
-1 +1 @@ -Subproject commit 2810ed2bdd7733b7ec98fb4bf470cc070443f5bc +Subproject commit b39469f438eca9e1e3a30d76cfbbf43e1013e46e diff --git a/core/utils/metadata-utils b/core/utils/metadata-utils index 212507ce17b..6169ce38337 160000 --- a/core/utils/metadata-utils +++ b/core/utils/metadata-utils @@ -1 +1 @@ -Subproject commit 212507ce17b255e855ceed209e48a19c736ed5c3 +Subproject commit 6169ce38337a31e9ef0a68285b544f7459010ab7 diff --git a/core/utils/pom.xml b/core/utils/pom.xml index a9464f68c31..fc447c743e4 100644 --- a/core/utils/pom.xml +++ b/core/utils/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave.core datawave-core-parent - 7.8.0-SNAPSHOT + 7.10.0-SNAPSHOT gov.nsa.datawave.core datawave-utils-parent diff --git a/core/utils/type-utils b/core/utils/type-utils index 2c39e2a311a..55d92d5d99c 160000 --- a/core/utils/type-utils +++ b/core/utils/type-utils @@ -1 +1 @@ -Subproject commit 2c39e2a311a544a492f43278ef2ac3934bf97ff4 +Subproject commit 55d92d5d99c6e232ba1d7ad12c210ded9ec240a9 diff --git a/docker/README.md b/docker/README.md index b07df3a6e1f..14187956565 100644 --- a/docker/README.md +++ b/docker/README.md @@ -136,6 +136,17 @@ Enabled via the 'dictionary', or 'full' profile. You will need to build the docker image for this service on your local machine following the instructions in the dictionary service README. +### File Provider + +Enabled via the 'file-provider' or 'full' profile. + +This microservice is in development and can be found in this repo. + +[Datawave File Provider Service](https://github.com/NationalSecurityAgency/datawave-file-provider-service/tree/main) provides file management and access to Datawave and its services. + +You will need to build the docker image for this service on your local machine following the instructions in the file provider service README. + + ## Usage Please read through these instructions in their entirety before attempting to build or deploy Datawave. 
@@ -328,6 +339,10 @@ Start the default services, the kafka services, and the dictionary service: ```docker compose --profile quickstart --profile dictionary --profile kafka up -d``` +Start the default services, and the file provider service: + +```docker compose --profile quickstart --profile file-provider up -d``` + Start all services: ```docker compose --profile quickstart --profile full up -d``` diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml index 89e364086f5..edfeca0f379 100644 --- a/docker/docker-compose.yml +++ b/docker/docker-compose.yml @@ -413,6 +413,25 @@ services: authorization: condition: service_healthy + file-provider: + profiles: + - file-provider + - full + image: datawave/file-provider-service + command: + - --spring.output.ansi.enabled=ALWAYS + - --spring.profiles.active=consul,compose,remoteauth + - --spring.cloud.consul.host=consul + - --spring.cloud.consul.discovery.instance-id=$${spring.application.name}:$${random.value} + ports: + - "8280:8080" + - "8643:8443" + volumes: + - ${PKI_DIR:-./pki}:/etc/pki:ro + - ./logs:/logs + networks: + - demo + # If you want to test cached results, enable the cachedresults profile mysql: profiles: diff --git a/docs/pom.xml b/docs/pom.xml index 43983c0b8b3..86b2183a261 100644 --- a/docs/pom.xml +++ b/docs/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave datawave-parent - 7.8.0-SNAPSHOT + 7.10.0-SNAPSHOT datawave-docs diff --git a/microservices/microservice-parent b/microservices/microservice-parent index 94e402333c1..6207c9d6576 160000 --- a/microservices/microservice-parent +++ b/microservices/microservice-parent @@ -1 +1 @@ -Subproject commit 94e402333c16767ae91c01b60a9ff66b5aaafda1 +Subproject commit 6207c9d65768c191773099a6f39f2b935aa52acd diff --git a/microservices/microservice-service-parent b/microservices/microservice-service-parent index 65cda7b2c52..8064d20ccf5 160000 --- a/microservices/microservice-service-parent +++ b/microservices/microservice-service-parent @@ -1 +1 @@ -Subproject commit 65cda7b2c526af5a3a9791b85a88dbd2422bf690 +Subproject commit 8064d20ccf5fb48dbf1d309503ffafa8ddaafb6c diff --git a/microservices/pom.xml b/microservices/pom.xml index 6675c04d3cf..349c820b047 100644 --- a/microservices/pom.xml +++ b/microservices/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave datawave-parent - 7.8.0-SNAPSHOT + 7.10.0-SNAPSHOT gov.nsa.datawave.microservice datawave-microservice-build-parent diff --git a/microservices/services/accumulo b/microservices/services/accumulo index 9a0b8183c7f..1ed0c60ba6a 160000 --- a/microservices/services/accumulo +++ b/microservices/services/accumulo @@ -1 +1 @@ -Subproject commit 9a0b8183c7fb0b9f6411426ef407ce0b350a60cc +Subproject commit 1ed0c60ba6afae87c12ce9f1f8207fd7fad1781c diff --git a/microservices/services/audit b/microservices/services/audit index aa90b87636f..6053f146335 160000 --- a/microservices/services/audit +++ b/microservices/services/audit @@ -1 +1 @@ -Subproject commit aa90b87636fb9cfbf817cbd619baef7b0d268c4f +Subproject commit 6053f146335380e389de1573b2b5c1227b032cc3 diff --git a/microservices/services/authorization b/microservices/services/authorization index 1ebdc36380c..630c8081b4f 160000 --- a/microservices/services/authorization +++ b/microservices/services/authorization @@ -1 +1 @@ -Subproject commit 1ebdc36380c6f258f872b62ff51bc114b79487f1 +Subproject commit 630c8081b4fae83723ad3fe09acb6a1b2bc3e83f diff --git a/microservices/services/config b/microservices/services/config index fc5c26bd5b1..b2611c17502 160000 --- a/microservices/services/config +++ 
b/microservices/services/config @@ -1 +1 @@ -Subproject commit fc5c26bd5b155c88bbf297552943d5a93b4d69d7 +Subproject commit b2611c17502810cdc80fc3f59e4af580e4f38387 diff --git a/microservices/services/dictionary b/microservices/services/dictionary index 8cc847076e4..8781fcdd517 160000 --- a/microservices/services/dictionary +++ b/microservices/services/dictionary @@ -1 +1 @@ -Subproject commit 8cc847076e41c26f55797cab7dbd15487ca0dd09 +Subproject commit 8781fcdd5176e1dfe23b362226cb01d7c460a324 diff --git a/microservices/services/file-provider b/microservices/services/file-provider new file mode 160000 index 00000000000..e041a78d0f8 --- /dev/null +++ b/microservices/services/file-provider @@ -0,0 +1 @@ +Subproject commit e041a78d0f850e8886a0c3781236210e0be90a91 diff --git a/microservices/services/hazelcast b/microservices/services/hazelcast index cd332b47623..bc82fc8d1f0 160000 --- a/microservices/services/hazelcast +++ b/microservices/services/hazelcast @@ -1 +1 @@ -Subproject commit cd332b47623b86506b07a3a1612be51187115dd3 +Subproject commit bc82fc8d1f036735d2a4e9980ce90bf9e5f7238e diff --git a/microservices/services/map b/microservices/services/map index a93ecd663fb..473ec437082 160000 --- a/microservices/services/map +++ b/microservices/services/map @@ -1 +1 @@ -Subproject commit a93ecd663fb2f2c293883aecb1c475628f721f55 +Subproject commit 473ec437082e661f51132a9254877b6bb27def84 diff --git a/microservices/services/mapreduce-query b/microservices/services/mapreduce-query index 036eb1673ff..80bf47d63a6 160000 --- a/microservices/services/mapreduce-query +++ b/microservices/services/mapreduce-query @@ -1 +1 @@ -Subproject commit 036eb1673ffd7c6ad4d5731b5c6bc2d8e342b79e +Subproject commit 80bf47d63a6be68d601486dac9b70cd4972562f2 diff --git a/microservices/services/modification b/microservices/services/modification index 6cc297b4ff3..5a7a5f4b59b 160000 --- a/microservices/services/modification +++ b/microservices/services/modification @@ -1 +1 @@ -Subproject commit 6cc297b4ff30f14bf7d66b883228c40436b79dac +Subproject commit 5a7a5f4b59bd0764246edd129ec0ac8e8f1344ed diff --git a/microservices/services/pom.xml b/microservices/services/pom.xml index ed3a9a85101..836874ba321 100644 --- a/microservices/services/pom.xml +++ b/microservices/services/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave.microservice datawave-microservice-build-parent - 7.8.0-SNAPSHOT + 7.10.0-SNAPSHOT datawave-microservice-service-build-parent pom @@ -66,6 +66,17 @@ dictionary + + submodule-service-file-provider + + + file-provider/pom.xml + + + + file-provider + + submodule-service-hazelcast diff --git a/microservices/services/query b/microservices/services/query index d2a2130ab4f..0d8a703a35f 160000 --- a/microservices/services/query +++ b/microservices/services/query @@ -1 +1 @@ -Subproject commit d2a2130ab4f892c4899bcc55445eb3974ab787b2 +Subproject commit 0d8a703a35f53333e5465b0fbca40bf2856753ca diff --git a/microservices/services/query-executor b/microservices/services/query-executor index 5f4c3bc84bb..89d4abc1d1c 160000 --- a/microservices/services/query-executor +++ b/microservices/services/query-executor @@ -1 +1 @@ -Subproject commit 5f4c3bc84bb2cd51ddd5a626452d6264b402878a +Subproject commit 89d4abc1d1c85dd55b53bb80c7d8b71fa7e3870f diff --git a/microservices/services/query-metric b/microservices/services/query-metric index 8281bfc5d3a..3c272208f19 160000 --- a/microservices/services/query-metric +++ b/microservices/services/query-metric @@ -1 +1 @@ -Subproject commit 8281bfc5d3a608974e131ea304b325200fa7ff8e +Subproject 
commit 3c272208f198b4b710bba5e8c4223667dafb9b1a diff --git a/microservices/starters/audit b/microservices/starters/audit index b2bf281813f..4feace8781b 160000 --- a/microservices/starters/audit +++ b/microservices/starters/audit @@ -1 +1 @@ -Subproject commit b2bf281813fc83c15fb0aa14505ce3e65ba15f91 +Subproject commit 4feace8781b33ad7ce0e746a329f2d4cae78fe6a diff --git a/microservices/starters/cache b/microservices/starters/cache index 17c220c186b..30196007910 160000 --- a/microservices/starters/cache +++ b/microservices/starters/cache @@ -1 +1 @@ -Subproject commit 17c220c186bcaf68e42d205b4f45bedb16961634 +Subproject commit 3019600791021114e50b387cc312c97375b979ff diff --git a/microservices/starters/cached-results b/microservices/starters/cached-results index 84e901adb03..3ff06a70611 160000 --- a/microservices/starters/cached-results +++ b/microservices/starters/cached-results @@ -1 +1 @@ -Subproject commit 84e901adb03783fe87732af5578411c05f80c385 +Subproject commit 3ff06a70611fdc36d4eafba533e979181361147f diff --git a/microservices/starters/datawave b/microservices/starters/datawave index ef559bae6fc..05bb23ce9c3 160000 --- a/microservices/starters/datawave +++ b/microservices/starters/datawave @@ -1 +1 @@ -Subproject commit ef559bae6fc9e0b3f58931b546dc7a80ec41176b +Subproject commit 05bb23ce9c34f114bd8eaa11cf747a714e2a239a diff --git a/microservices/starters/metadata b/microservices/starters/metadata index 8cac03428ac..902067cf4da 160000 --- a/microservices/starters/metadata +++ b/microservices/starters/metadata @@ -1 +1 @@ -Subproject commit 8cac03428ac7c05e8a59f0295824da60d2bb552e +Subproject commit 902067cf4da5a3f2f171aa5b00871ece748d28e3 diff --git a/microservices/starters/pom.xml b/microservices/starters/pom.xml index b902e93912b..77449f1ef44 100644 --- a/microservices/starters/pom.xml +++ b/microservices/starters/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave.microservice datawave-microservice-build-parent - 7.8.0-SNAPSHOT + 7.10.0-SNAPSHOT datawave-microservice-starter-build-parent pom diff --git a/microservices/starters/query b/microservices/starters/query index f4523bb35b1..2e52295fee2 160000 --- a/microservices/starters/query +++ b/microservices/starters/query @@ -1 +1 @@ -Subproject commit f4523bb35b1bc76d6efcd9b5bd23141903968c73 +Subproject commit 2e52295fee2cc0b68759965afd20f08f6402e28c diff --git a/microservices/starters/query-metric b/microservices/starters/query-metric index 5df8fd573e1..cecb6eff3b7 160000 --- a/microservices/starters/query-metric +++ b/microservices/starters/query-metric @@ -1 +1 @@ -Subproject commit 5df8fd573e17f17d60c41a0ce6cbf4960fa332b1 +Subproject commit cecb6eff3b7b3b56eadfc7d29c7514590af82043 diff --git a/pom.xml b/pom.xml index 8ba7cba22dc..440e7f21b4f 100644 --- a/pom.xml +++ b/pom.xml @@ -3,7 +3,7 @@ 4.0.0 gov.nsa.datawave datawave-parent - 7.8.0-SNAPSHOT + 7.10.0-SNAPSHOT pom DataWave DataWave is a Java-based ingest and query framework that leverages Apache Accumulo to provide fast, secure access to your data. 
diff --git a/properties/kubernetes.properties b/properties/kubernetes.properties index ee3ffc36200..71711b88633 100644 --- a/properties/kubernetes.properties +++ b/properties/kubernetes.properties @@ -3,6 +3,8 @@ RCPT_TO=hadoop@localhost docker.image.prefix=ghcr.io/nationalsecurityagency/ +docker.image.accumulo.tag=2.1.3 + # ingest properties DATAWAVE_INGEST_HOME=/opt/datawave-ingest/current diff --git a/warehouse/accumulo-extensions/pom.xml b/warehouse/accumulo-extensions/pom.xml index bef5dea6e05..59e797fb498 100644 --- a/warehouse/accumulo-extensions/pom.xml +++ b/warehouse/accumulo-extensions/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave datawave-warehouse-parent - 7.8.0-SNAPSHOT + 7.10.0-SNAPSHOT datawave-accumulo-extensions ${project.artifactId} diff --git a/warehouse/age-off-utils/pom.xml b/warehouse/age-off-utils/pom.xml index 77f5f8e189f..7ffd0014058 100644 --- a/warehouse/age-off-utils/pom.xml +++ b/warehouse/age-off-utils/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave datawave-warehouse-parent - 7.8.0-SNAPSHOT + 7.10.0-SNAPSHOT datawave-age-off-utils ${project.artifactId} diff --git a/warehouse/age-off/pom.xml b/warehouse/age-off/pom.xml index 8f5b5bb04bd..337620a5b21 100644 --- a/warehouse/age-off/pom.xml +++ b/warehouse/age-off/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave datawave-warehouse-parent - 7.8.0-SNAPSHOT + 7.10.0-SNAPSHOT datawave-age-off ${project.artifactId} diff --git a/warehouse/assemble/datawave/pom.xml b/warehouse/assemble/datawave/pom.xml index ce4d330467e..405b7759c52 100644 --- a/warehouse/assemble/datawave/pom.xml +++ b/warehouse/assemble/datawave/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave assemble-parent - 7.8.0-SNAPSHOT + 7.10.0-SNAPSHOT assemble-datawave jar diff --git a/warehouse/assemble/datawave/src/main/docker/Dockerfile b/warehouse/assemble/datawave/src/main/docker/Dockerfile index 6bc6d4827b2..91b5a37b861 100644 --- a/warehouse/assemble/datawave/src/main/docker/Dockerfile +++ b/warehouse/assemble/datawave/src/main/docker/Dockerfile @@ -1,4 +1,4 @@ -FROM ${docker.image.prefix}datawave-stack-accumulo:2.1.3 +FROM ${docker.image.prefix}datawave-stack-accumulo:${docker.image.accumulo.tag} USER root COPY --from=${docker.image.prefix}datawave-stack-hadoop:3.3.6 /usr/local/hadoop/ /usr/local/hadoop/ diff --git a/warehouse/assemble/pom.xml b/warehouse/assemble/pom.xml index 0be85e57103..5083ccd7fd2 100644 --- a/warehouse/assemble/pom.xml +++ b/warehouse/assemble/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave datawave-warehouse-parent - 7.8.0-SNAPSHOT + 7.10.0-SNAPSHOT assemble-parent pom diff --git a/warehouse/assemble/webservice/pom.xml b/warehouse/assemble/webservice/pom.xml index be28fe28f74..fc20c697d93 100644 --- a/warehouse/assemble/webservice/pom.xml +++ b/warehouse/assemble/webservice/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave assemble-parent - 7.8.0-SNAPSHOT + 7.10.0-SNAPSHOT assemble-webservice ${project.artifactId} diff --git a/warehouse/common/pom.xml b/warehouse/common/pom.xml index 7214cb14500..5562d691140 100644 --- a/warehouse/common/pom.xml +++ b/warehouse/common/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave datawave-warehouse-parent - 7.8.0-SNAPSHOT + 7.10.0-SNAPSHOT datawave-common ${project.artifactId} diff --git a/warehouse/core/pom.xml b/warehouse/core/pom.xml index 02506eaf856..60b05e87331 100644 --- a/warehouse/core/pom.xml +++ b/warehouse/core/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave datawave-warehouse-parent - 7.8.0-SNAPSHOT + 7.10.0-SNAPSHOT datawave-core jar diff --git a/warehouse/core/src/main/java/datawave/mr/bulk/BulkInputFormat.java 
b/warehouse/core/src/main/java/datawave/mr/bulk/BulkInputFormat.java index 67fef0102d2..fd55a67bba5 100644 --- a/warehouse/core/src/main/java/datawave/mr/bulk/BulkInputFormat.java +++ b/warehouse/core/src/main/java/datawave/mr/bulk/BulkInputFormat.java @@ -56,7 +56,7 @@ import org.apache.accumulo.core.security.Authorizations; import org.apache.accumulo.core.security.ColumnVisibility; import org.apache.accumulo.core.security.TablePermission; -import org.apache.accumulo.core.singletons.SingletonReservation; +import org.apache.accumulo.core.singletons.SingletonManager; import org.apache.accumulo.core.util.Pair; import org.apache.accumulo.core.util.format.DateFormatSupplier; import org.apache.accumulo.core.util.format.DefaultFormatter; @@ -1091,7 +1091,8 @@ protected static TabletLocator getTabletLocator(Configuration conf) throws Table Properties props = Accumulo.newClientProperties().to(conf.get(INSTANCE_NAME), conf.get(ZOOKEEPERS)) .as(getUsername(conf), new PasswordToken(getPassword(conf))).build(); ClientInfo info = ClientInfo.from(props); - ClientContext context = new ClientContext(SingletonReservation.noop(), info, ClientConfConverter.toAccumuloConf(info.getProperties()), Threads.UEH); + ClientContext context = new ClientContext(SingletonManager.getClientReservation(), info, ClientConfConverter.toAccumuloConf(info.getProperties()), + Threads.UEH); return TabletLocator.getLocator(context, context.getTableId(tableName)); } @@ -1132,8 +1133,8 @@ public List getSplits(JobContext job) throws IOException { // its possible that the cache could contain complete, but old information about a tables tablets... so clear it tl.invalidateCache(); ClientInfo info = ClientInfo.from(cbHelper.newClientProperties()); - ClientContext context = new ClientContext(SingletonReservation.noop(), info, ClientConfConverter.toAccumuloConf(info.getProperties()), - Threads.UEH); + ClientContext context = new ClientContext(SingletonManager.getClientReservation(), info, + ClientConfConverter.toAccumuloConf(info.getProperties()), Threads.UEH); while (!tl.binRanges(context, ranges, binnedRanges).isEmpty()) { if (!(client instanceof InMemoryAccumuloClient)) { if (tableId == null) diff --git a/warehouse/data-dictionary-core/pom.xml b/warehouse/data-dictionary-core/pom.xml index 05771d27eb0..c13251c35c2 100644 --- a/warehouse/data-dictionary-core/pom.xml +++ b/warehouse/data-dictionary-core/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave datawave-warehouse-parent - 7.8.0-SNAPSHOT + 7.10.0-SNAPSHOT datawave-data-dictionary-core jar diff --git a/warehouse/edge-dictionary-core/pom.xml b/warehouse/edge-dictionary-core/pom.xml index 36bc95ee9b0..5d11e7893cc 100644 --- a/warehouse/edge-dictionary-core/pom.xml +++ b/warehouse/edge-dictionary-core/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave datawave-warehouse-parent - 7.8.0-SNAPSHOT + 7.10.0-SNAPSHOT datawave-edge-dictionary-core jar diff --git a/warehouse/edge-model-configuration-core/pom.xml b/warehouse/edge-model-configuration-core/pom.xml index bded6a94668..cfe4152ad38 100644 --- a/warehouse/edge-model-configuration-core/pom.xml +++ b/warehouse/edge-model-configuration-core/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave datawave-warehouse-parent - 7.8.0-SNAPSHOT + 7.10.0-SNAPSHOT datawave-edge-model-configuration-core jar diff --git a/warehouse/index-stats/pom.xml b/warehouse/index-stats/pom.xml index 6ffa2d5d126..9f997fb8e41 100644 --- a/warehouse/index-stats/pom.xml +++ b/warehouse/index-stats/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave datawave-warehouse-parent - 7.8.0-SNAPSHOT + 
7.10.0-SNAPSHOT datawave-index-stats jar diff --git a/warehouse/ingest-configuration/pom.xml b/warehouse/ingest-configuration/pom.xml index 642b280b4dc..061190fa9bd 100644 --- a/warehouse/ingest-configuration/pom.xml +++ b/warehouse/ingest-configuration/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave datawave-warehouse-parent - 7.8.0-SNAPSHOT + 7.10.0-SNAPSHOT datawave-ingest-configuration diff --git a/warehouse/ingest-core/pom.xml b/warehouse/ingest-core/pom.xml index 93b081ca0d6..fb4da561139 100644 --- a/warehouse/ingest-core/pom.xml +++ b/warehouse/ingest-core/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave datawave-warehouse-parent - 7.8.0-SNAPSHOT + 7.10.0-SNAPSHOT datawave-ingest-core jar diff --git a/warehouse/ingest-core/src/main/java/datawave/ingest/mapreduce/handler/edge/ProtobufEdgeDataTypeHandler.java b/warehouse/ingest-core/src/main/java/datawave/ingest/mapreduce/handler/edge/ProtobufEdgeDataTypeHandler.java index 3bab7240bec..58e0b0748db 100644 --- a/warehouse/ingest-core/src/main/java/datawave/ingest/mapreduce/handler/edge/ProtobufEdgeDataTypeHandler.java +++ b/warehouse/ingest-core/src/main/java/datawave/ingest/mapreduce/handler/edge/ProtobufEdgeDataTypeHandler.java @@ -617,7 +617,15 @@ public long process(KEYIN key, RawRecordContainer event, Multimap edgeHandler = new ProtobufEdgeDataTypeHandler<>(); + TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID()); + edgeHandler.setup(context); + + Set expectedKeys = new HashSet<>(); + expectedKeys.add("cabernet"); + expectedKeys.add("cabernet%00;orange"); + expectedKeys.add("orange"); + expectedKeys.add("orange%00;cabernet"); + + RawRecordContainer myEvent = getEvent(conf); + + EdgeHandlerTestUtil.processEvent(fields, edgeHandler, myEvent, 4, true, false); + Assert.assertEquals(expectedKeys, EdgeHandlerTestUtil.edgeKeyResults.keySet()); + + } + @Test public void testAwareAllNegated() { // CHEESE != 'apple' AND WINE != 'chianti' @@ -342,8 +373,8 @@ public void testAwareAllNegated() { fields.put("EVENT_DATE", new BaseNormalizedContent("EVENT_DATE", "2022-10-26T01:31:53Z")); fields.put("UUID", new BaseNormalizedContent("UUID", "0016dd72-0000-827d-dd4d-001b2163ba09")); - fields.put("CHEESE", new NormalizedFieldAndValue("FRUIT", "apple", "FOOD", "0")); - fields.put("CHEESE", new NormalizedFieldAndValue("FRUIT", "pear", "FOOD", "1")); + fields.put("FRUIT", new NormalizedFieldAndValue("FRUIT", "apple", "FOOD", "0")); + fields.put("FRUIT", new NormalizedFieldAndValue("FRUIT", "pear", "FOOD", "1")); fields.put("WINE", new NormalizedFieldAndValue("WINE", "pinot noir", "FOOD", "0")); fields.put("WINE", new NormalizedFieldAndValue("WINE", "chianti", "FOOD", "1")); @@ -388,6 +419,34 @@ public void testAwareNegation() { } + @Test + public void testAwareNR() { + // BREAD !~ 'ry.*' + + fields.put("EVENT_DATE", new BaseNormalizedContent("EVENT_DATE", "2022-10-26T01:31:53Z")); + fields.put("UUID", new BaseNormalizedContent("UUID", "0016dd72-0000-827d-dd4d-001b2163ba09")); + fields.put("BREAD", new NormalizedFieldAndValue("BREAD", "rye", "FOOD", "0")); + fields.put("BREAD", new NormalizedFieldAndValue("BREAD", "bagel", "FOOD", "1")); + fields.put("SANDWICH", new NormalizedFieldAndValue("SANDWICH", "reuben", "FOOD", "0")); + fields.put("SANDWICH", new NormalizedFieldAndValue("SANDWICH", "lox", "FOOD", "1")); + + ProtobufEdgeDataTypeHandler edgeHandler = new ProtobufEdgeDataTypeHandler<>(); + TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID()); + edgeHandler.setup(context); + + Set expectedKeys = new 
HashSet<>(); + expectedKeys.add("bagel"); + expectedKeys.add("bagel%00;lox"); + expectedKeys.add("lox"); + expectedKeys.add("lox%00;bagel"); + + RawRecordContainer myEvent = getEvent(conf); + + EdgeHandlerTestUtil.processEvent(fields, edgeHandler, myEvent, 4, true, false); + Assert.assertEquals(expectedKeys, EdgeHandlerTestUtil.edgeKeyResults.keySet()); + + } + @Test public void testAwarePreconDifferentGroup() { // CANINE == 'shepherd' @@ -441,7 +500,37 @@ public void testAwareFieldComparison() { expectedKeys.add("spruce%00;canine"); expectedKeys.add("canine"); expectedKeys.add("spruce"); - ; + + RawRecordContainer myEvent = getEvent(conf); + + EdgeHandlerTestUtil.processEvent(fields, edgeHandler, myEvent, 4, true, false); + Assert.assertEquals(expectedKeys, EdgeHandlerTestUtil.edgeKeyResults.keySet()); + + } + + @Test + public void testAwareERFieldComparison() { + // PERSON =~ METAL + + fields.put("EVENT_DATE", new BaseNormalizedContent("EVENT_DATE", "2022-10-26T01:31:53Z")); + fields.put("UUID", new BaseNormalizedContent("UUID", "0016dd72-0000-827d-dd4d-001b2163ba09")); + fields.put("PERSON", new NormalizedFieldAndValue("PERSON", "leader", "PROFESSION", "0")); + fields.put("METAL", new NormalizedFieldAndValue("METAL", "iron", "TOOL", "0")); + fields.put("IMPLEMENT", new NormalizedFieldAndValue("IMPLEMENT", "words", "TOOL", "0")); + + fields.put("PERSON", new NormalizedFieldAndValue("PERSON", "artist", "PROFESSION", "1")); + fields.put("METAL", new NormalizedFieldAndValue("METAL", "lead", "TOOL", "1")); + fields.put("IMPLEMENT", new NormalizedFieldAndValue("IMPLEMENT", "paint", "TOOL", "1")); + + ProtobufEdgeDataTypeHandler edgeHandler = new ProtobufEdgeDataTypeHandler<>(); + TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID()); + edgeHandler.setup(context); + + Set expectedKeys = new HashSet<>(); + expectedKeys.add("paint%00;leader"); + expectedKeys.add("leader%00;paint"); + expectedKeys.add("paint"); + expectedKeys.add("leader"); RawRecordContainer myEvent = getEvent(conf); diff --git a/warehouse/ingest-core/src/test/resources/config/EdgeSpringConfigPrecon.xml b/warehouse/ingest-core/src/test/resources/config/EdgeSpringConfigPrecon.xml index 5acd9ac10ec..879f7b91c23 100644 --- a/warehouse/ingest-core/src/test/resources/config/EdgeSpringConfigPrecon.xml +++ b/warehouse/ingest-core/src/test/resources/config/EdgeSpringConfigPrecon.xml @@ -25,7 +25,7 @@ http://www.springframework.org/schema/util/spring-util-4.0.xsd"> - + @@ -153,6 +153,39 @@ http://www.springframework.org/schema/util/spring-util-4.0.xsd"> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -165,7 +198,7 @@ http://www.springframework.org/schema/util/spring-util-4.0.xsd"> - + @@ -252,6 +285,39 @@ http://www.springframework.org/schema/util/spring-util-4.0.xsd"> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/warehouse/ingest-csv/pom.xml b/warehouse/ingest-csv/pom.xml index 4c67ee205ed..e8b7452c7dc 100644 --- a/warehouse/ingest-csv/pom.xml +++ b/warehouse/ingest-csv/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave datawave-warehouse-parent - 7.8.0-SNAPSHOT + 7.10.0-SNAPSHOT datawave-ingest-csv jar diff --git a/warehouse/ingest-json/pom.xml b/warehouse/ingest-json/pom.xml index bc5e2c397eb..025ca504ead 100644 --- a/warehouse/ingest-json/pom.xml +++ b/warehouse/ingest-json/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave datawave-warehouse-parent - 7.8.0-SNAPSHOT + 7.10.0-SNAPSHOT datawave-ingest-json jar diff --git a/warehouse/ingest-nyctlc/pom.xml b/warehouse/ingest-nyctlc/pom.xml 
index 7221c13fbb5..af51973dd8d 100644 --- a/warehouse/ingest-nyctlc/pom.xml +++ b/warehouse/ingest-nyctlc/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave datawave-warehouse-parent - 7.8.0-SNAPSHOT + 7.10.0-SNAPSHOT datawave-ingest-nyctlc jar diff --git a/warehouse/ingest-scripts/pom.xml b/warehouse/ingest-scripts/pom.xml index 7ee5aba0a2d..1221e920463 100644 --- a/warehouse/ingest-scripts/pom.xml +++ b/warehouse/ingest-scripts/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave datawave-warehouse-parent - 7.8.0-SNAPSHOT + 7.10.0-SNAPSHOT datawave-ingest-scripts ${project.artifactId} diff --git a/warehouse/ingest-ssdeep/pom.xml b/warehouse/ingest-ssdeep/pom.xml index a816022838f..11f18aa46d8 100644 --- a/warehouse/ingest-ssdeep/pom.xml +++ b/warehouse/ingest-ssdeep/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave datawave-warehouse-parent - 7.8.0-SNAPSHOT + 7.10.0-SNAPSHOT datawave-ingest-ssdeep diff --git a/warehouse/ingest-wikipedia/pom.xml b/warehouse/ingest-wikipedia/pom.xml index 60452fdb27b..920a7d2926a 100644 --- a/warehouse/ingest-wikipedia/pom.xml +++ b/warehouse/ingest-wikipedia/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave datawave-warehouse-parent - 7.8.0-SNAPSHOT + 7.10.0-SNAPSHOT datawave-ingest-wikipedia jar diff --git a/warehouse/metrics-core/pom.xml b/warehouse/metrics-core/pom.xml index 4a4fe7d4b74..5c73204cd7f 100644 --- a/warehouse/metrics-core/pom.xml +++ b/warehouse/metrics-core/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave datawave-warehouse-parent - 7.8.0-SNAPSHOT + 7.10.0-SNAPSHOT datawave-metrics-core jar diff --git a/warehouse/metrics-core/src/main/java/datawave/metrics/analytic/FileByteSummaryLoader.java b/warehouse/metrics-core/src/main/java/datawave/metrics/analytic/FileByteSummaryLoader.java index 0cd6e772c09..940dac61131 100644 --- a/warehouse/metrics-core/src/main/java/datawave/metrics/analytic/FileByteSummaryLoader.java +++ b/warehouse/metrics-core/src/main/java/datawave/metrics/analytic/FileByteSummaryLoader.java @@ -3,19 +3,19 @@ import java.io.IOException; import java.util.Collections; import java.util.Date; +import java.util.Properties; import java.util.concurrent.TimeUnit; import java.util.regex.Matcher; import java.util.regex.Pattern; +import org.apache.accumulo.core.client.Accumulo; import org.apache.accumulo.core.client.AccumuloException; import org.apache.accumulo.core.client.AccumuloSecurityException; -import org.apache.accumulo.core.client.ClientConfiguration; -import org.apache.accumulo.core.client.mapreduce.AccumuloInputFormat; -import org.apache.accumulo.core.client.security.tokens.PasswordToken; import org.apache.accumulo.core.data.Key; import org.apache.accumulo.core.data.Range; import org.apache.accumulo.core.data.Value; import org.apache.accumulo.core.security.Authorizations; +import org.apache.accumulo.hadoop.mapreduce.AccumuloInputFormat; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configured; import org.apache.hadoop.io.Text; @@ -105,11 +105,21 @@ public int run(String[] args) throws Exception { job.setMapOutputKeyClass(Key.class); job.setMapOutputValueClass(Value.class); job.setInputFormatClass(AccumuloInputFormat.class); - AccumuloInputFormat.setConnectorInfo(job, userName, new PasswordToken(password)); - AccumuloInputFormat.setInputTableName(job, inputTable); - AccumuloInputFormat.setScanAuthorizations(job, Authorizations.EMPTY); - AccumuloInputFormat.setZooKeeperInstance(job, ClientConfiguration.loadDefault().withInstance(instance.trim()).withZkHosts(zookeepers.trim())); - AccumuloInputFormat.setRanges(job, 
Collections.singletonList(dayRange)); + + // @formatter:off + Properties clientProperties = Accumulo.newClientProperties() + .to(instance.trim(), zookeepers.trim()) + .as(userName, password) + .build(); + + AccumuloInputFormat.configure() + .clientProperties(clientProperties) + .table(inputTable) + .auths(Authorizations.EMPTY) + .ranges(Collections.singletonList(dayRange)) + .store(job); + // @formatter:on + // Ensure all data for a day goes to the same reducer so that we aggregate it correctly before sending to Accumulo RowPartitioner.configureJob(job); diff --git a/warehouse/metrics-core/src/main/java/datawave/metrics/analytic/IngestMetricsSummaryLoader.java b/warehouse/metrics-core/src/main/java/datawave/metrics/analytic/IngestMetricsSummaryLoader.java index 27e6b5692d5..8fe578599d1 100644 --- a/warehouse/metrics-core/src/main/java/datawave/metrics/analytic/IngestMetricsSummaryLoader.java +++ b/warehouse/metrics-core/src/main/java/datawave/metrics/analytic/IngestMetricsSummaryLoader.java @@ -7,6 +7,7 @@ import java.util.Date; import java.util.HashSet; import java.util.Map; +import java.util.Properties; import java.util.Set; import java.util.concurrent.TimeUnit; import java.util.regex.Matcher; @@ -16,15 +17,13 @@ import org.apache.accumulo.core.client.AccumuloClient; import org.apache.accumulo.core.client.AccumuloException; import org.apache.accumulo.core.client.AccumuloSecurityException; -import org.apache.accumulo.core.client.ClientConfiguration; import org.apache.accumulo.core.client.Scanner; import org.apache.accumulo.core.client.TableNotFoundException; -import org.apache.accumulo.core.client.mapreduce.AccumuloInputFormat; -import org.apache.accumulo.core.client.security.tokens.PasswordToken; import org.apache.accumulo.core.data.Key; import org.apache.accumulo.core.data.Range; import org.apache.accumulo.core.data.Value; import org.apache.accumulo.core.security.Authorizations; +import org.apache.accumulo.hadoop.mapreduce.AccumuloInputFormat; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configured; import org.apache.hadoop.io.Text; @@ -277,12 +276,20 @@ public int run(String[] args) throws Exception { job.setMapOutputKeyClass(Key.class); job.setMapOutputValueClass(Value.class); job.setInputFormatClass(AccumuloInputFormat.class); - AccumuloInputFormat.setConnectorInfo(job, userName, new PasswordToken(password)); - AccumuloInputFormat.setZooKeeperInstance(job, ClientConfiguration.loadDefault().withInstance(instance).withZkHosts(zookeepers)); - AccumuloInputFormat.setInputTableName(job, inputTable); - AccumuloInputFormat.setScanAuthorizations(job, Authorizations.EMPTY); - AccumuloInputFormat.setRanges(job, Collections.singletonList(dayRange)); + // @formatter:off + Properties clientProperties = Accumulo.newClientProperties() + .to(instance, zookeepers) + .as(userName, password) + .build(); + + AccumuloInputFormat.configure() + .clientProperties(clientProperties) + .table(inputTable) + .auths(Authorizations.EMPTY) + .ranges(Collections.singletonList(dayRange)) + .store(job); + // @formatter:on // Ensure all data for a day goes to the same reducer so that we aggregate it correctly before sending to Accumulo RowPartitioner.configureJob(job); diff --git a/warehouse/metrics-core/src/main/java/datawave/metrics/analytic/QueryMetricsSummaryLoader.java b/warehouse/metrics-core/src/main/java/datawave/metrics/analytic/QueryMetricsSummaryLoader.java index 746684e07d6..2eae071041a 100644 --- 
a/warehouse/metrics-core/src/main/java/datawave/metrics/analytic/QueryMetricsSummaryLoader.java +++ b/warehouse/metrics-core/src/main/java/datawave/metrics/analytic/QueryMetricsSummaryLoader.java @@ -8,20 +8,20 @@ import java.util.Collection; import java.util.HashSet; import java.util.List; +import java.util.Properties; import java.util.concurrent.TimeUnit; +import org.apache.accumulo.core.client.Accumulo; import org.apache.accumulo.core.client.AccumuloClient; import org.apache.accumulo.core.client.AccumuloException; import org.apache.accumulo.core.client.AccumuloSecurityException; -import org.apache.accumulo.core.client.ClientConfiguration; import org.apache.accumulo.core.client.IteratorSetting; -import org.apache.accumulo.core.client.mapreduce.AccumuloInputFormat; -import org.apache.accumulo.core.client.security.tokens.PasswordToken; import org.apache.accumulo.core.data.Key; import org.apache.accumulo.core.data.Range; import org.apache.accumulo.core.data.Value; import org.apache.accumulo.core.iterators.user.RegExFilter; import org.apache.accumulo.core.security.Authorizations; +import org.apache.accumulo.hadoop.mapreduce.AccumuloInputFormat; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configured; import org.apache.hadoop.io.Text; @@ -287,17 +287,26 @@ public int run(String[] args) throws Exception { job.setMapOutputValueClass(Value.class); job.setInputFormatClass(AccumuloInputFormat.class); - AccumuloInputFormat.setConnectorInfo(job, userName, new PasswordToken(password)); - AccumuloInputFormat.setZooKeeperInstance(job, ClientConfiguration.loadDefault().withInstance(instance).withZkHosts(zookeepers)); - AccumuloInputFormat.setRanges(job, dayRanges); - AccumuloInputFormat.setAutoAdjustRanges(job, false); - AccumuloInputFormat.setInputTableName(job, inputTable); - AccumuloInputFormat.setScanAuthorizations(job, auths); + // @formatter:off + Properties clientProperties = Accumulo.newClientProperties() + .to(instance, zookeepers) + .as(userName, password) + .build(); + // @formatter:on IteratorSetting regex = new IteratorSetting(50, RegExFilter.class); regex.addOption(RegExFilter.COLF_REGEX, QUERY_METRICS_REGEX); - AccumuloInputFormat.addIterator(job, regex); + // @formatter:off + AccumuloInputFormat.configure() + .clientProperties(clientProperties) + .table(inputTable) + .auths(auths) + .ranges(dayRanges) + .autoAdjustRanges(false) + .addIterator(regex) + .store(job); + // @formatter:on // Ensure all data for a day goes to the same reducer so that we aggregate it correctly before sending to Accumulo RowPartitioner.configureJob(job); diff --git a/warehouse/ops-tools/config-compare/pom.xml b/warehouse/ops-tools/config-compare/pom.xml index 8f1839ef77e..61c9623d24e 100644 --- a/warehouse/ops-tools/config-compare/pom.xml +++ b/warehouse/ops-tools/config-compare/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave datawave-ops-tools-parent - 7.8.0-SNAPSHOT + 7.10.0-SNAPSHOT datawave-ops-tools-config-compare diff --git a/warehouse/ops-tools/index-validation/pom.xml b/warehouse/ops-tools/index-validation/pom.xml index a28da7a1a8b..a85f0b9d20e 100644 --- a/warehouse/ops-tools/index-validation/pom.xml +++ b/warehouse/ops-tools/index-validation/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave datawave-ops-tools-parent - 7.8.0-SNAPSHOT + 7.10.0-SNAPSHOT datawave-ops-tools-index-validation jar diff --git a/warehouse/ops-tools/pom.xml b/warehouse/ops-tools/pom.xml index 577613d6ea9..c82c4bf2dc9 100644 --- a/warehouse/ops-tools/pom.xml +++ b/warehouse/ops-tools/pom.xml @@ -4,7 +4,7 @@ 
gov.nsa.datawave datawave-warehouse-parent - 7.8.0-SNAPSHOT + 7.10.0-SNAPSHOT datawave-ops-tools-parent pom diff --git a/warehouse/pom.xml b/warehouse/pom.xml index 028b013d820..06fd84ea4ea 100644 --- a/warehouse/pom.xml +++ b/warehouse/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave datawave-parent - 7.8.0-SNAPSHOT + 7.10.0-SNAPSHOT datawave-warehouse-parent pom diff --git a/warehouse/query-core/pom.xml b/warehouse/query-core/pom.xml index 04f5f2ff603..2c0912419c1 100644 --- a/warehouse/query-core/pom.xml +++ b/warehouse/query-core/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave datawave-warehouse-parent - 7.8.0-SNAPSHOT + 7.10.0-SNAPSHOT datawave-query-core jar diff --git a/warehouse/query-core/src/main/java/datawave/core/iterators/BoundedRangeExpansionIterator.java b/warehouse/query-core/src/main/java/datawave/core/iterators/BoundedRangeExpansionIterator.java new file mode 100644 index 00000000000..f863a4d0675 --- /dev/null +++ b/warehouse/query-core/src/main/java/datawave/core/iterators/BoundedRangeExpansionIterator.java @@ -0,0 +1,171 @@ +package datawave.core.iterators; + +import java.io.IOException; +import java.util.Collection; +import java.util.Map; +import java.util.TreeSet; + +import org.apache.accumulo.core.data.ByteSequence; +import org.apache.accumulo.core.data.Key; +import org.apache.accumulo.core.data.Range; +import org.apache.accumulo.core.data.Value; +import org.apache.accumulo.core.iterators.IteratorEnvironment; +import org.apache.accumulo.core.iterators.OptionDescriber; +import org.apache.accumulo.core.iterators.SortedKeyValueIterator; +import org.apache.accumulo.core.iterators.user.SeekingFilter; +import org.apache.commons.lang3.StringUtils; +import org.apache.hadoop.io.Text; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.google.common.base.Splitter; + +import datawave.query.Constants; +import datawave.query.jexl.LiteralRange; + +/** + * A {@link SeekingFilter} that attempts to expand bounded ranges using the global index + *
<p>
+ * The caller is responsible for fetching the appropriate column families. The range is constructed from a {@link LiteralRange}.
+ * <p>
+ * The only thing this iterator does is advance through datatypes if a filter is supplied, advance to the start date, and advance to the next row within the + * range. + */ +public class BoundedRangeExpansionIterator extends SeekingFilter implements OptionDescriber { + + private static final Logger log = LoggerFactory.getLogger(BoundedRangeExpansionIterator.class); + + public static final String START_DATE = "start.date"; + public static final String END_DATE = "end.date"; + public static final String DATATYPES_OPT = "dts"; + + private TreeSet datatypes; + private String startDate; + private String endDate; + + private Text prevRow; + + @Override + public void init(SortedKeyValueIterator source, Map options, IteratorEnvironment env) throws IOException { + if (!validateOptions(options)) { + throw new IllegalArgumentException("BoundedRangeExpansionIterator not configured with correct options"); + } + + String opt = options.get(DATATYPES_OPT); + if (StringUtils.isBlank(opt)) { + datatypes = new TreeSet<>(); + } else { + datatypes = new TreeSet<>(Splitter.on(',').splitToList(opt)); + } + + startDate = options.get(START_DATE); + endDate = options.get(END_DATE) + Constants.MAX_UNICODE_STRING; + + super.init(source, options, env); + } + + @Override + public IteratorOptions describeOptions() { + IteratorOptions opts = new IteratorOptions(getClass().getName(), "Expands bounded ranges using the global index", null, null); + opts.addNamedOption(START_DATE, "The start date"); + opts.addNamedOption(END_DATE, "The end date"); + opts.addNamedOption(DATATYPES_OPT, "The set of datatypes used to filter keys (optional)"); + return opts; + } + + @Override + public boolean validateOptions(Map options) { + return options.containsKey(START_DATE) && options.containsKey(END_DATE); + } + + @Override + public FilterResult filter(Key k, Value v) { + log.trace("filter key: {}", k.toStringNoTime()); + + // shard + null + datatype + String cq = k.getColumnQualifier().toString(); + int index = cq.indexOf('\u0000'); + String date = cq.substring(0, index); + + if (date.compareTo(startDate) < 0) { + log.trace("{} is before the start date {}, advancing to start date", date, startDate); + return new FilterResult(false, AdvanceResult.USE_HINT); + } + + if (date.compareTo(endDate) > 0) { + log.trace("{} is past the end date {}, advancing to next row", date, endDate); + return new FilterResult(false, AdvanceResult.NEXT_ROW); + } + + String datatype = cq.substring(index + 1); + if (!datatypes.isEmpty() && !datatypes.contains(datatype)) { + log.trace("datatype {} was filtered out, advancing to next key", datatype); + return new FilterResult(false, AdvanceResult.NEXT); + } + + if (prevRow != null && prevRow.equals(k.getRow())) { + // this iterator should only return a single key per unique row, thus the previous row should never match the current row. 
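+ // Note: SeekingFilter drives source advancement from the FilterResult returned here. NEXT advances
+ // a single key, NEXT_ROW skips the remainder of the current row, and USE_HINT seeks to the key
+ // produced by getNextKeyHint() below.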
+ log.warn("should never see a duplicate row -- skip to next row"); + return new FilterResult(false, AdvanceResult.NEXT_ROW); + } + + prevRow = k.getRow(); + return new FilterResult(true, AdvanceResult.NEXT_ROW); + } + + /** + * Hint is only used to seek to the start date + * + * @param k + * a key + * @param v + * a value + * @return the key used to seek + */ + @Override + public Key getNextKeyHint(Key k, Value v) { + log.trace("get next key hint: {}", k.toStringNoTime()); + + // shard + null + datatype + String cq = k.getColumnQualifier().toString(); + int index = cq.indexOf('\u0000'); + String date = cq.substring(0, index); + + if (date.compareTo(startDate) < 0) { + Text columnQualifier; + + if (datatypes.isEmpty()) { + log.trace("seek to start date"); + columnQualifier = new Text(startDate + '\u0000'); + } else { + log.trace("seek to start date and datatype"); + columnQualifier = new Text(startDate + '\u0000' + datatypes.first()); + } + + return new Key(k.getRow(), k.getColumnFamily(), columnQualifier); + } + + log.trace("next hint key was called in a bad state, reverting to no-op"); + return k; + } + + @Override + public void seek(Range range, Collection columnFamilies, boolean inclusive) throws IOException { + if (!range.isStartKeyInclusive()) { + // need to skip to next row + Key skip = new Key(range.getStartKey().getRow().toString() + '\u0000'); + if (skip.compareTo(range.getEndKey()) > 0) { + // handles case of bounded range against single value + // filter key: +cE1 NUM:20150808_0%00;generic [NA] + // skip key would be +cE1 but then the start key is greater than the end key. so we cheat accumulo. + Range skipRange = new Range(range.getEndKey(), true, range.getEndKey(), range.isEndKeyInclusive()); + super.seek(skipRange, columnFamilies, inclusive); + } else { + Range skipRange = new Range(skip, true, range.getEndKey(), range.isEndKeyInclusive()); + super.seek(skipRange, columnFamilies, inclusive); + } + } else { + super.seek(range, columnFamilies, inclusive); + } + } +} diff --git a/warehouse/query-core/src/main/java/datawave/query/config/ShardQueryConfiguration.java b/warehouse/query-core/src/main/java/datawave/query/config/ShardQueryConfiguration.java index 1dcbf7583cd..b88cef085f2 100644 --- a/warehouse/query-core/src/main/java/datawave/query/config/ShardQueryConfiguration.java +++ b/warehouse/query-core/src/main/java/datawave/query/config/ShardQueryConfiguration.java @@ -84,7 +84,7 @@ public class ShardQueryConfiguration extends GenericQueryConfiguration implement public static final String QUERY_LOGIC_NAME_SOURCE = "queryLogic"; @SuppressWarnings("unused") - private static final long serialVersionUID = -4354990715046146110L; + private static final long serialVersionUID = 2321985989282659247L; private static final Logger log = Logger.getLogger(ShardQueryConfiguration.class); // is this a tld query, explicitly default to false @@ -482,24 +482,26 @@ public class ShardQueryConfiguration extends GenericQueryConfiguration implement private boolean pruneQueryOptions = false; /** - * Flag to control gathering field counts from the global index and persisting those to the query iterator. Negated terms and branches are not considered. + * Flag that sorts the query prior to the global index lookup using inferred costs. This step may reduce time spent in the global index depending on + * individual term selectivity. 
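+ * For example, a term that appears in only a few shards can be resolved before a term that appears in nearly every shard.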
*/ - private boolean useFieldCounts = false; + private boolean sortQueryPreIndexWithImpliedCounts = false; + /** - * Flag to control gathering term counts from the global index and persisting those to the query iterator. Negated terms and branches are not considered. + * Flag that sorts the query prior to the global index lookup using field counts from the {@link TableName#METADATA} table. This option opens a scanner and + * thus is more expensive than sorting by implied counts, but is potentially more accurate. */ - private boolean useTermCounts = false; + private boolean sortQueryPreIndexWithFieldCounts = false; + /** - * Flag to control sorting a query by inferred default costs prior to the global index lookup. This step may reduce time performing a secondary sort as when - * {@link #sortQueryByCounts} is enabled. + * Flag that sorts the query using field counts gathered as part of the global index lookup. Negated terms and branches are not considered. */ - private boolean sortQueryBeforeGlobalIndex = false; + private boolean sortQueryPostIndexWithFieldCounts = false; /** - * Flag to control if a query is sorted by either field or term counts. Either {@link #useFieldCounts} or {@link #useTermCounts} must be set for this option - * to take effect. + * Flag that sorts the query using term counts gathered as part of the global index lookup. Negated terms and branches are not considered. */ - private boolean sortQueryByCounts = false; + private boolean sortQueryPostIndexWithTermCounts = false; /** * Insert rules for processing the QueryTree to automatically apply hints to queries. Hints will be passed to the ScannerFactory @@ -746,10 +748,10 @@ public void copyFrom(ShardQueryConfiguration other) { this.setTfAggregationThresholdMs(other.getTfAggregationThresholdMs()); this.setGroupFields(GroupFields.copyOf(other.getGroupFields())); this.setPruneQueryOptions(other.getPruneQueryOptions()); - this.setUseFieldCounts(other.getUseFieldCounts()); - this.setUseTermCounts(other.getUseTermCounts()); - this.setSortQueryBeforeGlobalIndex(other.isSortQueryBeforeGlobalIndex()); - this.setSortQueryByCounts(other.isSortQueryByCounts()); + this.setSortQueryPreIndexWithImpliedCounts(other.isSortQueryPreIndexWithImpliedCounts()); + this.setSortQueryPreIndexWithFieldCounts(other.isSortQueryPreIndexWithFieldCounts()); + this.setSortQueryPostIndexWithTermCounts(other.isSortQueryPostIndexWithTermCounts()); + this.setSortQueryPostIndexWithFieldCounts(other.isSortQueryPostIndexWithFieldCounts()); this.setUseQueryTreeScanHintRules(other.isUseQueryTreeScanHintRules()); this.setQueryTreeScanHintRules(other.getQueryTreeScanHintRules()); this.setFieldIndexHoleMinThreshold(other.getFieldIndexHoleMinThreshold()); @@ -2765,36 +2767,36 @@ public void setReduceIngestTypesPerShard(boolean reduceIngestTypesPerShard) { this.reduceIngestTypesPerShard = reduceIngestTypesPerShard; } - public boolean getUseTermCounts() { - return useTermCounts; + public boolean isSortQueryPreIndexWithImpliedCounts() { + return sortQueryPreIndexWithImpliedCounts; } - public void setUseTermCounts(boolean useTermCounts) { - this.useTermCounts = useTermCounts; + public void setSortQueryPreIndexWithImpliedCounts(boolean sortQueryPreIndexWithImpliedCounts) { + this.sortQueryPreIndexWithImpliedCounts = sortQueryPreIndexWithImpliedCounts; } - public boolean getUseFieldCounts() { - return useFieldCounts; + public boolean isSortQueryPreIndexWithFieldCounts() { + return sortQueryPreIndexWithFieldCounts; } - public void setUseFieldCounts(boolean useFieldCounts) 
{ - this.useFieldCounts = useFieldCounts; + public void setSortQueryPreIndexWithFieldCounts(boolean sortQueryPreIndexWithFieldCounts) { + this.sortQueryPreIndexWithFieldCounts = sortQueryPreIndexWithFieldCounts; } - public boolean isSortQueryBeforeGlobalIndex() { - return sortQueryBeforeGlobalIndex; + public boolean isSortQueryPostIndexWithFieldCounts() { + return sortQueryPostIndexWithFieldCounts; } - public void setSortQueryBeforeGlobalIndex(boolean sortQueryBeforeGlobalIndex) { - this.sortQueryBeforeGlobalIndex = sortQueryBeforeGlobalIndex; + public void setSortQueryPostIndexWithFieldCounts(boolean sortQueryPostIndexWithFieldCounts) { + this.sortQueryPostIndexWithFieldCounts = sortQueryPostIndexWithFieldCounts; } - public boolean isSortQueryByCounts() { - return sortQueryByCounts; + public boolean isSortQueryPostIndexWithTermCounts() { + return sortQueryPostIndexWithTermCounts; } - public void setSortQueryByCounts(boolean sortQueryByCounts) { - this.sortQueryByCounts = sortQueryByCounts; + public void setSortQueryPostIndexWithTermCounts(boolean sortQueryPostIndexWithTermCounts) { + this.sortQueryPostIndexWithTermCounts = sortQueryPostIndexWithTermCounts; } @Override @@ -3001,10 +3003,10 @@ public boolean equals(Object o) { getDocAggregationThresholdMs() == that.getDocAggregationThresholdMs() && getTfAggregationThresholdMs() == that.getTfAggregationThresholdMs() && getPruneQueryOptions() == that.getPruneQueryOptions() && - getUseFieldCounts() == that.getUseFieldCounts() && - getUseTermCounts() == that.getUseTermCounts() && - isSortQueryBeforeGlobalIndex() == that.isSortQueryBeforeGlobalIndex() && - isSortQueryByCounts() == that.isSortQueryByCounts(); + isSortQueryPreIndexWithImpliedCounts() == that.isSortQueryPreIndexWithImpliedCounts() && + isSortQueryPreIndexWithFieldCounts() == that.isSortQueryPreIndexWithFieldCounts() && + isSortQueryPostIndexWithTermCounts() == that.isSortQueryPostIndexWithTermCounts() && + isSortQueryPostIndexWithFieldCounts() == that.isSortQueryPostIndexWithFieldCounts(); // @formatter:on } @@ -3206,10 +3208,10 @@ public int hashCode() { getDocAggregationThresholdMs(), getTfAggregationThresholdMs(), getPruneQueryOptions(), - getUseFieldCounts(), - getUseTermCounts(), - isSortQueryBeforeGlobalIndex(), - isSortQueryByCounts()); + isSortQueryPreIndexWithImpliedCounts(), + isSortQueryPreIndexWithFieldCounts(), + isSortQueryPostIndexWithTermCounts(), + isSortQueryPostIndexWithFieldCounts() + ); // @formatter:on } diff --git a/warehouse/query-core/src/main/java/datawave/query/discovery/DiscoveredThing.java b/warehouse/query-core/src/main/java/datawave/query/discovery/DiscoveredThing.java index ec0987fdb88..3e2c3e21e95 100644 --- a/warehouse/query-core/src/main/java/datawave/query/discovery/DiscoveredThing.java +++ b/warehouse/query-core/src/main/java/datawave/query/discovery/DiscoveredThing.java @@ -3,14 +3,14 @@ import java.io.DataInput; import java.io.DataOutput; import java.io.IOException; +import java.util.Objects; +import java.util.StringJoiner; import org.apache.commons.lang.builder.CompareToBuilder; import org.apache.hadoop.io.MapWritable; import org.apache.hadoop.io.VLongWritable; import org.apache.hadoop.io.WritableComparable; -import com.google.common.base.Objects; - import datawave.core.query.configuration.ResultContext; public class DiscoveredThing implements WritableComparable<DiscoveredThing> { @@ -86,6 +86,7 @@ public void readFields(DataInput in) throws IOException { @Override public int compareTo(DiscoveredThing o) { + CompareToBuilder cmp = new CompareToBuilder(); if (o == null) { return
1; @@ -96,28 +97,34 @@ public int compareTo(DiscoveredThing o) { cmp.append(getDate(), o.getDate()); cmp.append(getColumnVisibility(), o.getColumnVisibility()); cmp.append(getCount(), o.getCount()); + cmp.append(getCountsByColumnVisibility(), o.getCountsByColumnVisibility()); return cmp.toComparison(); } } @Override public boolean equals(Object o) { - if (!(o instanceof DiscoveredThing)) + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { return false; - DiscoveredThing other = (DiscoveredThing) o; - return Objects.equal(getTerm(), other.getTerm()) && Objects.equal(getField(), other.getField()) && Objects.equal(getType(), other.getType()) - && Objects.equal(getDate(), other.getDate()) && Objects.equal(getColumnVisibility(), other.getColumnVisibility()) - && Objects.equal(getCount(), other.getCount()); + } + DiscoveredThing that = (DiscoveredThing) o; + return Objects.equals(term, that.term) && Objects.equals(field, that.field) && Objects.equals(type, that.type) && Objects.equals(date, that.date) + && Objects.equals(columnVisibility, that.columnVisibility) && Objects.equals(count, that.count) + && Objects.equals(countsByColumnVisibility, that.countsByColumnVisibility); } @Override public int hashCode() { - return Objects.hashCode(getTerm(), getField(), getType(), getDate(), getColumnVisibility(), getCount()); + return Objects.hash(term, field, type, date, columnVisibility, count, countsByColumnVisibility); } @Override public String toString() { - return "DiscoveredThing [term=" + term + ", field=" + field + ", type=" + type + ", date=" + date + ", columnVisibility=" + columnVisibility - + ", count=" + count + "]"; + return new StringJoiner(", ", DiscoveredThing.class.getSimpleName() + "[", "]").add("term='" + term + "'").add("field='" + field + "'") + .add("type='" + type + "'").add("date='" + date + "'").add("columnVisibility='" + columnVisibility + "'").add("count=" + count) + .add("countsByColumnVisibility=" + countsByColumnVisibility).toString(); } } diff --git a/warehouse/query-core/src/main/java/datawave/query/discovery/DiscoveryIterator.java b/warehouse/query-core/src/main/java/datawave/query/discovery/DiscoveryIterator.java index 1400308f3c2..404d9c29dda 100644 --- a/warehouse/query-core/src/main/java/datawave/query/discovery/DiscoveryIterator.java +++ b/warehouse/query-core/src/main/java/datawave/query/discovery/DiscoveryIterator.java @@ -1,14 +1,15 @@ package datawave.query.discovery; -import static com.google.common.collect.Collections2.filter; -import static com.google.common.collect.Collections2.transform; -import static com.google.common.collect.Lists.newArrayList; - import java.io.IOException; -import java.util.ArrayList; import java.util.Collection; +import java.util.HashMap; +import java.util.HashSet; import java.util.List; import java.util.Map; +import java.util.Objects; +import java.util.Set; +import java.util.function.BiFunction; +import java.util.stream.Collectors; import org.apache.accumulo.core.data.ByteSequence; import org.apache.accumulo.core.data.Key; @@ -17,141 +18,312 @@ import org.apache.accumulo.core.data.Value; import org.apache.accumulo.core.iterators.IteratorEnvironment; import org.apache.accumulo.core.iterators.SortedKeyValueIterator; -import org.apache.accumulo.core.util.Pair; +import org.apache.accumulo.core.security.ColumnVisibility; import org.apache.hadoop.io.ArrayWritable; -import org.apache.hadoop.io.Writable; +import org.apache.hadoop.io.LongWritable; +import org.apache.hadoop.io.MapWritable; +import 
org.apache.hadoop.io.Text; import org.apache.hadoop.io.WritableUtils; import org.apache.log4j.Logger; -import com.google.common.base.Predicates; import com.google.common.collect.ArrayListMultimap; import com.google.common.collect.Multimap; +import com.google.protobuf.InvalidProtocolBufferException; + +import datawave.ingest.protobuf.Uid; +import datawave.marking.MarkingFunctions; +import datawave.query.Constants; public class DiscoveryIterator implements SortedKeyValueIterator { + private static final Logger log = Logger.getLogger(DiscoveryIterator.class); + private static final MarkingFunctions markingFunctions = MarkingFunctions.Factory.createMarkingFunctions(); - private Key tk; - private Value tv; - private SortedKeyValueIterator itr; + private Key key; + private Value value; + private SortedKeyValueIterator iterator; private boolean separateCountsByColVis = false; private boolean showReferenceCount = false; private boolean reverseIndex = false; + private boolean sumCounts = false; @Override public DiscoveryIterator deepCopy(IteratorEnvironment env) { - DiscoveryIterator i = new DiscoveryIterator(); - i.itr = itr.deepCopy(env); - return i; + DiscoveryIterator copy = new DiscoveryIterator(); + copy.iterator = iterator.deepCopy(env); + return copy; } @Override public void next() throws IOException { - tk = null; - tv = null; + this.key = null; + this.value = null; - while (itr.hasTop() && tk == null) { - Multimap terms = aggregateDate(); + while (iterator.hasTop() && key == null) { + // Get the entries to aggregate. + Multimap terms = getTermsByDatatype(); if (terms.isEmpty()) { - if (log.isTraceEnabled()) - log.trace("Couldn't aggregate index info; moving onto next date/field/term if data is available."); - continue; + log.trace("Couldn't aggregate index info; moving onto next date/field/term if data is available."); } else { - if (log.isTraceEnabled()) - log.trace("Received term info multimap of size [" + terms.size() + "]"); - ArrayList things = newArrayList( - filter(transform(terms.asMap().values(), new TermInfoAggregation(separateCountsByColVis, showReferenceCount, reverseIndex)), - Predicates.notNull())); - if (log.isTraceEnabled()) - log.trace("After conversion to discovery objects, there are [" + things.size() + "] term info objects."); - if (things.isEmpty()) { - continue; - } else { - Pair top = makeTop(things); - tk = top.getFirst(); - tv = top.getSecond(); + // Aggregate the entries. + List things = terms.asMap().values().stream().map(this::aggregate).filter(Objects::nonNull).collect(Collectors.toList()); + // Establish the next top of this iterator. + if (!things.isEmpty()) { + setTop(things); return; } } } - if (log.isTraceEnabled()) - log.trace("No data found."); + log.trace("No data found."); } - private Multimap aggregateDate() throws IOException { - Multimap terms = ArrayListMultimap.create(); - Key start = new Key(itr.getTopKey()), key = null; - while (itr.hasTop() && start.equals((key = itr.getTopKey()), PartialKey.ROW_COLFAM) && datesMatch(start, key)) { - TermInfo ti = new TermInfo(key, itr.getTopValue()); - if (ti.valid) - terms.put(ti.datatype, ti); + /** + * Return a multimap containing mappings of datatypes to term entries that should be aggregated. + */ + private Multimap getTermsByDatatype() throws IOException { + Multimap terms = ArrayListMultimap.create(); + Key start = new Key(iterator.getTopKey()); + Key key; + // If we should sum up counts, we want to collect the term entries for each date seen for the current field and term of start. 
Otherwise, we only want + // to collect the term entries for the current field, term, and date of start. + BiFunction dateMatchingFunction = sumCounts ? (first, second) -> true : this::datesMatch; + // Find all matching entries and parse term entries from them. + while (iterator.hasTop() && start.equals((key = iterator.getTopKey()), PartialKey.ROW_COLFAM) && dateMatchingFunction.apply(start, key)) { + TermEntry termEntry = new TermEntry(key, iterator.getTopValue()); + if (termEntry.isValid()) + terms.put(termEntry.getDatatype(), termEntry); else { - if (log.isTraceEnabled()) - log.trace("Received invalid term info from key: " + key); + if (log.isTraceEnabled()) { + log.trace("Received invalid term entry from key: " + key); + } } - itr.next(); + iterator.next(); } return terms; } - private static boolean datesMatch(Key reference, Key test) { - ByteSequence a = reference.getColumnQualifierData(), b = test.getColumnQualifierData(); + /** + * Return true if the dates for the two keys match, or false otherwise. + */ + private boolean datesMatch(Key left, Key right) { + ByteSequence leftBytes = left.getColumnQualifierData(); + ByteSequence rightBytes = right.getColumnQualifierData(); for (int i = 0; i < 8; i++) { - if (a.byteAt(i) != b.byteAt(i)) { + if (leftBytes.byteAt(i) != rightBytes.byteAt(i)) { return false; } } return true; } - private Pair makeTop(List things) { - Writable[] returnedThings = new Writable[things.size()]; - for (int i = 0; i < returnedThings.length; ++i) - returnedThings[i] = things.get(i); - ArrayWritable aw = new ArrayWritable(DiscoveredThing.class); - aw.set(returnedThings); + /** + * Return the given term entries aggregated into a single {@link DiscoveredThing} if possible, or return null if any issues occurred. + */ + private DiscoveredThing aggregate(Collection termEntries) { + if (termEntries.isEmpty()) { + return null; + } else { + TermEntry first = termEntries.iterator().next(); + String term = reverseIndex ? new StringBuilder(first.getTerm()).reverse().toString() : first.getTerm(); + String date = sumCounts ? "" : first.date; + + Set visibilities = new HashSet<>(); + Map visibilityToCounts = new HashMap<>(); + long count = 0L; + + // Aggregate the counts and visibilities from each entry. + for (TermEntry termEntry : termEntries) { + // Fetch the count to aggregate based of whether we should show the term count or the reference count. + long currentCount = this.showReferenceCount ? termEntry.getUidListSize() : termEntry.getUidCount(); + try { + // Track the distinct visibilities seen. + visibilities.add(termEntry.getVisibility()); + // If counts by visibility should be tracked, do so. + if (this.separateCountsByColVis) { + String visibility = new String(termEntry.getVisibility().flatten()); + visibilityToCounts.compute(visibility, (k, v) -> v == null ? currentCount : v + currentCount); + } + } catch (Exception e) { + // If an exception occurs, skip to the next entry. + log.trace(e); + continue; + } + // Increment the overall count. + count += currentCount; + } + + // If we do not have a count greater than 0, return null. + if (count <= 0) { + if (log.isTraceEnabled()) { + log.trace("Did not aggregate any counts for [" + first.getTerm() + "][" + first.getField() + "][" + first.getDatatype() + "][" + + first.getDate() + "]. Returning null"); + } + return null; + } else { + // Otherwise, combine the visibilities, and return the aggregated result. 
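+ // MarkingFunctions.combine() merges the distinct visibility expressions into a single combined
+ // expression; if the expressions cannot be combined, the aggregate is dropped (null is returned)
+ // rather than emitted with an invalid marking.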
+ try { + ColumnVisibility visibility = markingFunctions.combine(visibilities); + MapWritable countsByVis = new MapWritable(); + visibilityToCounts.forEach((key, value) -> countsByVis.put(new Text(key), new LongWritable(value))); + return new DiscoveredThing(term, first.getField(), first.getDatatype(), date, new String(visibility.flatten()), count, countsByVis); + } catch (Exception e) { + if (log.isTraceEnabled()) { + log.warn("Invalid column visibilities after combining " + visibilities); + } + return null; + } + } + } + } + /** + * Set the top {@link Key} and {@link Value} of this iterator, created from the given list of {@link DiscoveredThing} instances. + */ + private void setTop(List things) { + // We want the key to be the last possible key for this date. Return the key as it is in the index (reversed if necessary) to ensure the keys are + // consistent with the initial seek range. DiscoveredThing thing = things.get(0); - // we want the key to be the last possible key for this date. Return the key as it is in the index (reversed if necessary) to - // ensure the keys are consistent with the initial seek range. - String row = (reverseIndex ? new StringBuilder().append(thing.getTerm()).reverse().toString() : thing.getTerm()); - return new Pair<>(new Key(row, thing.getField(), thing.getDate() + '\uffff'), new Value(WritableUtils.toByteArray(aw))); + String row = (this.reverseIndex ? new StringBuilder().append(thing.getTerm()).reverse().toString() : thing.getTerm()); + Key newKey = new Key(row, thing.getField(), thing.getDate() + "\uffff"); + + // Create a value from the list of things. + ArrayWritable thingArray = new ArrayWritable(DiscoveredThing.class, things.toArray(new DiscoveredThing[0])); + Value newValue = new Value(WritableUtils.toByteArray(thingArray)); + + this.key = newKey; + this.value = newValue; } @Override public void seek(Range range, Collection columnFamilies, boolean inclusive) throws IOException { - - itr.seek(range, columnFamilies, inclusive); - if (log.isTraceEnabled()) - log.trace("My source " + (itr.hasTop() ? "does" : "does not") + " have a top."); + this.iterator.seek(range, columnFamilies, inclusive); + if (log.isTraceEnabled()) { + log.trace("My source " + (this.iterator.hasTop() ? 
"does" : "does not") + " have a top."); + } next(); } @Override public void init(SortedKeyValueIterator source, Map options, IteratorEnvironment env) throws IOException { - itr = source; - separateCountsByColVis = Boolean.parseBoolean(options.get(DiscoveryLogic.SEPARATE_COUNTS_BY_COLVIS)); - showReferenceCount = Boolean.parseBoolean(options.get(DiscoveryLogic.SHOW_REFERENCE_COUNT)); - reverseIndex = Boolean.parseBoolean(options.get(DiscoveryLogic.REVERSE_INDEX)); + this.iterator = source; + this.separateCountsByColVis = Boolean.parseBoolean(options.get(DiscoveryLogic.SEPARATE_COUNTS_BY_COLVIS)); + this.showReferenceCount = Boolean.parseBoolean(options.get(DiscoveryLogic.SHOW_REFERENCE_COUNT)); + this.reverseIndex = Boolean.parseBoolean(options.get(DiscoveryLogic.REVERSE_INDEX)); + this.sumCounts = Boolean.parseBoolean(options.get(DiscoveryLogic.SUM_COUNTS)); if (log.isTraceEnabled()) { - log.trace("My source is a " + source.getClass().getName()); - log.trace("Separate counts by column visibility = " + separateCountsByColVis); - log.trace("Show reference count only = " + showReferenceCount); + log.trace("Source: " + source.getClass().getName()); + log.trace("Separate counts by column visibility: " + this.separateCountsByColVis); + log.trace("Show reference counts only: " + this.showReferenceCount); + log.trace("Reverse index: " + this.reverseIndex); + log.trace("Sum counts: " + this.sumCounts); } } @Override public boolean hasTop() { - return tk != null; + return key != null; } @Override public Key getTopKey() { - return tk; + return key; } @Override public Value getTopValue() { - return tv; + return value; + } + + /** + * Represents term information parsed from a {@link Key}, {@link Value} pair. + */ + private static class TermEntry { + + private final String term; + private final String field; + private String date; + private String datatype; + private ColumnVisibility visibility; + private long uidCount; + private long uidListSize; + private boolean valid; + + public TermEntry(Key key, Value value) { + term = key.getRow().toString(); + field = key.getColumnFamily().toString(); + + String colq = key.getColumnQualifier().toString(); + int firstSeparatorPos = colq.indexOf(Constants.NULL_BYTE_STRING); + if (firstSeparatorPos != -1) { + int lastSeparatorPos = colq.lastIndexOf(Constants.NULL_BYTE_STRING); + // If multiple separators are present, this is a task datatype entry. + if (firstSeparatorPos != lastSeparatorPos) { + // Ensure that we at least have yyyyMMdd. + if ((lastSeparatorPos - firstSeparatorPos) < 9) { + return; + } + // The form is datatype\0date\0task status (old knowledge entry). + date = colq.substring(firstSeparatorPos + 1, firstSeparatorPos + 9); + datatype = colq.substring(0, firstSeparatorPos); + } else { + // Ensure that we at least have yyyyMMdd. + if (firstSeparatorPos < 8) { + return; + } + // The form is shardId\0datatype. + date = colq.substring(0, 8); + datatype = colq.substring(firstSeparatorPos + 1); + } + + // Parse the UID.List object from the value. + try { + Uid.List uidList = Uid.List.parseFrom(value.get()); + if (uidList != null) { + uidCount = uidList.getCOUNT(); + uidListSize = uidList.getUIDList().size(); + } + } catch (InvalidProtocolBufferException e) { + // Don't add UID information. At least we know what shard it's located in. + } + + visibility = new ColumnVisibility(key.getColumnVisibility()); + + // This is now considered a valid entry for aggregation. 
+ valid = true; + } + } + + public String getTerm() { + return term; + } + + public String getField() { + return field; + } + + public String getDate() { + return date; + } + + public String getDatatype() { + return datatype; + } + + public ColumnVisibility getVisibility() { + return visibility; + } + + public long getUidCount() { + return uidCount; + } + + public long getUidListSize() { + return uidListSize; + } + + public boolean isValid() { + return valid; + } } } diff --git a/warehouse/query-core/src/main/java/datawave/query/discovery/DiscoveryLogic.java b/warehouse/query-core/src/main/java/datawave/query/discovery/DiscoveryLogic.java index 91424b1afb1..6dd595b8792 100644 --- a/warehouse/query-core/src/main/java/datawave/query/discovery/DiscoveryLogic.java +++ b/warehouse/query-core/src/main/java/datawave/query/discovery/DiscoveryLogic.java @@ -51,6 +51,7 @@ import datawave.core.query.configuration.QueryData; import datawave.data.type.Type; import datawave.microservice.query.Query; +import datawave.microservice.query.QueryImpl; import datawave.query.Constants; import datawave.query.QueryParameters; import datawave.query.discovery.FindLiteralsAndPatternsVisitor.QueryValues; @@ -72,18 +73,45 @@ public class DiscoveryLogic extends ShardIndexQueryTable { private static final Logger log = Logger.getLogger(DiscoveryLogic.class); + /** + * Used to specify if counts should be separated by column visibility. + */ public static final String SEPARATE_COUNTS_BY_COLVIS = "separate.counts.by.colvis"; + + /** + * Used to specify if reference counts should be shown instead of term counts. + */ public static final String SHOW_REFERENCE_COUNT = "show.reference.count"; + + /** + * Used to specify whether to sum up the counts instead of returning counts per date. + */ + public static final String SUM_COUNTS = "sum.counts"; + + /** + * Used to specify whether to search against the reversed index. + */ public static final String REVERSE_INDEX = "reverse.index"; + private DiscoveryQueryConfiguration config; private MetadataHelper metadataHelper; + /** + * Basic constructor. + */ public DiscoveryLogic() { super(); } + /** + * Copy constructor. + * + * @param other + * the other logic to copy + */ public DiscoveryLogic(DiscoveryLogic other) { super(other); + this.config = new DiscoveryQueryConfiguration(other.config); this.metadataHelper = other.metadataHelper; } @@ -92,7 +120,6 @@ public DiscoveryQueryConfiguration getConfig() { if (this.config == null) { this.config = DiscoveryQueryConfiguration.create(); } - return this.config; } @@ -111,56 +138,48 @@ public GenericQueryConfiguration initialize(AccumuloClient client, Query setting log.debug("Query parameters set to " + settings.getParameters()); } - // Check if the default modelName and modelTableNames have been overriden by custom parameters. - if (null != settings.findParameter(QueryParameters.PARAMETER_MODEL_NAME) - && !settings.findParameter(QueryParameters.PARAMETER_MODEL_NAME).getParameterValue().trim().isEmpty()) { - setModelName(settings.findParameter(QueryParameters.PARAMETER_MODEL_NAME).getParameterValue().trim()); - } - if (null != settings.findParameter(QueryParameters.PARAMETER_MODEL_TABLE_NAME) - && !settings.findParameter(QueryParameters.PARAMETER_MODEL_TABLE_NAME).getParameterValue().trim().isEmpty()) { - setModelTableName(settings.findParameter(QueryParameters.PARAMETER_MODEL_TABLE_NAME).getParameterValue().trim()); - } + // Check if the default model name and model table name have been overridden. 
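+ // The getOrDefault* helpers below prefer a non-blank query parameter over the configured default,
+ // so explicit per-query settings always win. A caller might supply them like this (illustrative
+ // sketch only; assumes QueryImpl#addParameter):
+ //   QueryImpl q = new QueryImpl();
+ //   q.setQuery("FIELD == 'value'");
+ //   q.addParameter(DiscoveryLogic.SUM_COUNTS, "true");
+ //   q.addParameter(DiscoveryLogic.SEPARATE_COUNTS_BY_COLVIS, "false");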
+ setModelName(getOrDefault(settings, QueryParameters.PARAMETER_MODEL_NAME, getConfig().getModelName())); + setModelTableName(getOrDefault(settings, QueryParameters.PARAMETER_MODEL_TABLE_NAME, getConfig().getModelTableName())); - // Check if user would like counts separated by column visibility - if (null != settings.findParameter(SEPARATE_COUNTS_BY_COLVIS) - && !settings.findParameter(SEPARATE_COUNTS_BY_COLVIS).getParameterValue().trim().isEmpty()) { - boolean separateCountsByColVis = Boolean.valueOf(settings.findParameter(SEPARATE_COUNTS_BY_COLVIS).getParameterValue().trim()); - getConfig().setSeparateCountsByColVis(separateCountsByColVis); - } + // Check if counts should be separated by column visibility. + setSeparateCountsByColVis(getOrDefaultBoolean(settings, SEPARATE_COUNTS_BY_COLVIS, getSeparateCountsByColVis())); - // Check if user would like to show reference counts instead of term counts - if (null != settings.findParameter(SHOW_REFERENCE_COUNT) && !settings.findParameter(SHOW_REFERENCE_COUNT).getParameterValue().trim().isEmpty()) { - boolean showReferenceCount = Boolean.valueOf(settings.findParameter(SHOW_REFERENCE_COUNT).getParameterValue().trim()); - getConfig().setShowReferenceCount(showReferenceCount); - } + // Check if reference counts should be shown. + setShowReferenceCount(getOrDefaultBoolean(settings, SHOW_REFERENCE_COUNT, getShowReferenceCount())); + + // Check if counts should be summed. + setSumCounts(getOrDefaultBoolean(settings, SUM_COUNTS, getSumCounts())); + + // Check if any datatype filters were specified. + getConfig().setDatatypeFilter(getOrDefaultSet(settings, QueryParameters.DATATYPE_FILTER_SET, getConfig().getDatatypeFilter())); + + // Update the query model. setQueryModel(metadataHelper.getQueryModel(getModelTableName(), getModelName(), null)); - // get the data type filter set if any - if (null != settings.findParameter(QueryParameters.DATATYPE_FILTER_SET) - && !settings.findParameter(QueryParameters.DATATYPE_FILTER_SET).getParameterValue().trim().isEmpty()) { - Set dataTypeFilter = new HashSet<>(Arrays.asList(StringUtils - .split(settings.findParameter(QueryParameters.DATATYPE_FILTER_SET).getParameterValue().trim(), Constants.PARAM_VALUE_SEP))); - getConfig().setDatatypeFilter(dataTypeFilter); - if (log.isDebugEnabled()) { - log.debug("Data type filter set to " + dataTypeFilter); - } - } - // Set the connector + // Set the connector. getConfig().setClient(client); - // Set the auths + + // Set the auths. getConfig().setAuthorizations(auths); - // Get the ranges + // Get the ranges. getConfig().setBeginDate(settings.getBeginDate()); getConfig().setEndDate(settings.getEndDate()); - if (null == getConfig().getBeginDate() || null == getConfig().getEndDate()) { - getConfig().setBeginDate(new Date(0)); + // If a begin date was not specified, default to the earliest date. + if (getConfig().getBeginDate() == null) { + getConfig().setBeginDate(new Date(0L)); + log.warn("Begin date not specified, using earliest begin date."); + } + + // If an end date was not specified, default to the latest date. 
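+ // new Date(Long.MAX_VALUE) acts as an effectively open-ended upper bound, mirroring the
+ // new Date(0L) default used for the begin date above.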
+ if (getConfig().getEndDate() == null) { getConfig().setEndDate(new Date(Long.MAX_VALUE)); - log.warn("Dates not specified, using entire date range"); + log.warn("End date not specified, using latest end date."); } - // start with a trimmed version of the query, converted to JEXL + // Start with a trimmed version of the query, converted to JEXL LuceneToJexlQueryParser parser = new LuceneToJexlQueryParser(); parser.setAllowLeadingWildCard(isAllowLeadingWildcard()); QueryNode node = parser.parse(settings.getQuery().trim()); @@ -173,9 +192,9 @@ public GenericQueryConfiguration initialize(AccumuloClient client, Query setting // Parse & flatten the query ASTJexlScript script = JexlASTHelper.parseAndFlattenJexlQuery(getConfig().getQueryString()); + CaseSensitivityVisitor.upperCaseIdentifiers(getConfig(), metadataHelper, script); - script = CaseSensitivityVisitor.upperCaseIdentifiers(getConfig(), metadataHelper, script); - + // Apply the query model. Set dataTypes = getConfig().getDatatypeFilter(); Set allFields; allFields = metadataHelper.getAllFields(dataTypes); @@ -183,14 +202,13 @@ public GenericQueryConfiguration initialize(AccumuloClient client, Query setting QueryValues literalsAndPatterns = FindLiteralsAndPatternsVisitor.find(script); Stopwatch timer = Stopwatch.createStarted(); - // no caching for getAllNormalizers, so try some magic with getFields... + // No caching for getAllNormalizers, so try some magic with getFields... Multimap> dataTypeMap = ArrayListMultimap.create(metadataHelper.getFieldsToDatatypes(getConfig().getDatatypeFilter())); - /* - * we have a mapping of FIELD->DataType, but not a mapping of ANYFIELD->DataType which should be all dataTypes - */ - dataTypeMap.putAll(Constants.ANY_FIELD, uniqueByType(dataTypeMap.values())); + // We have a mapping of FIELD->DataType, but not a mapping of ANYFIELD->DataType which should be all datatypes. + dataTypeMap.putAll(Constants.ANY_FIELD, getUniqueTypes(dataTypeMap.values())); timer.stop(); log.debug("Took " + timer.elapsed(TimeUnit.MILLISECONDS) + "ms to get all the dataTypes."); + getConfig().setLiterals(normalize(new LiteralNormalization(), literalsAndPatterns.getLiterals(), dataTypeMap)); getConfig().setPatterns(normalize(new PatternNormalization(), literalsAndPatterns.getPatterns(), dataTypeMap)); getConfig().setRanges(normalizeRanges(new LiteralNormalization(), literalsAndPatterns.getRanges(), dataTypeMap)); @@ -199,44 +217,143 @@ public GenericQueryConfiguration initialize(AccumuloClient client, Query setting log.debug("Normalized Patterns = " + getConfig().getPatterns()); } + // Set the planned queries to execute. getConfig().setQueries(createQueries(getConfig())); return getConfig(); } - public List createQueries(DiscoveryQueryConfiguration config) throws QueryException, TableNotFoundException, IOException, ExecutionException { - final List queries = Lists.newLinkedList(); + /** + * If present, return the value of the given parameter from the given settings, or return the default value otherwise. + */ + private String getOrDefault(Query settings, String parameterName, String defaultValue) { + String value = getTrimmedParameter(settings, parameterName); + return StringUtils.isBlank(value) ? defaultValue : value; + } - Set familiesToSeek = Sets.newHashSet(); - Pair,Set> seekRanges = makeRanges(getConfig(), familiesToSeek, metadataHelper); - Collection forward = seekRanges.getValue0(); + /** + * If present, return the value of the given parameter from the given settings as a boolean, or return the default value otherwise. 
+ */ + private boolean getOrDefaultBoolean(Query settings, String parameterName, boolean defaultValue) { + String value = getTrimmedParameter(settings, parameterName); + log.debug("Trimmed value for " + parameterName + ": " + value); + return StringUtils.isBlank(value) ? defaultValue : Boolean.parseBoolean(value); + } - if (!forward.isEmpty()) { - List settings = getIteratorSettingsForDiscovery(getConfig(), getConfig().getLiterals(), getConfig().getPatterns(), - getConfig().getRanges(), false); - if (isCheckpointable()) { - // if checkpointable, then only one range per query data so that the whole checkpointing thing works correctly - for (Range range : forward) { - queries.add(new QueryData(config.getIndexTableName(), null, Collections.singleton(range), familiesToSeek, settings)); + /** + * If present, return the value of the given parameter from the given settings as a set, or return the default value otherwise. + */ + private Set getOrDefaultSet(Query settings, String parameterName, Set defaultValue) { + String value = getTrimmedParameter(settings, parameterName); + return StringUtils.isBlank(value) ? defaultValue : new HashSet<>(Arrays.asList(StringUtils.split(value, Constants.PARAM_VALUE_SEP))); + } + + /** + * Return the trimmed value of the given parameter from the given settings, or null if a value is not present. + */ + private String getTrimmedParameter(Query settings, String parameterName) { + QueryImpl.Parameter parameter = settings.findParameter(parameterName); + return parameter != null ? parameter.getParameterValue().trim() : null; + } + + /** + * Given a sequence of objects of type T, this method will return a single object for every unique type passed in. This is used to dedupe normalizer + * instances by their type, so that we only get 1 instance per type of normalizer. + */ + private Collection> getUniqueTypes(Iterable> things) { + Map,Type> map = Maps.newHashMap(); + for (Type t : things) { + map.put(t.getClass(), t); + } + return map.values(); + } + + /** + * This attempts to normalize all of the {@code } tuples with the corresponding {@code } tuple. The Normalization object + * will determine whether a regex or literal is being normalized. + * + * See the {@link PatternNormalization} and {@link LiteralNormalization} implementations. 
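+ * For example, a literal tuple {@code <Foo, FIELD>} paired with a lower-casing {@link Type} would normalize to {@code <foo, FIELD>}.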
+ * + * @param normalization + * the normalizer object + * @param valuesToFields + * mapping of values to fields + * @param dataTypeMap + * the data type map + * @return a mapping of the normalized tuples + */ + private Multimap normalize(Normalization normalization, Multimap valuesToFields, Multimap> dataTypeMap) { + Multimap normalizedValuesToFields = HashMultimap.create(); + for (Entry valueAndField : valuesToFields.entries()) { + String value = valueAndField.getKey(), field = valueAndField.getValue(); + for (Type dataType : dataTypeMap.get(field)) { + try { + log.debug("Attempting to normalize [" + value + "] with [" + dataType.getClass() + "]"); + String normalized = normalization.normalize(dataType, field, value); + normalizedValuesToFields.put(normalized, field); + log.debug("Normalization succeeded!"); + } catch (Exception exception) { + log.debug("Normalization failed."); } - } else { - queries.add(new QueryData(config.getIndexTableName(), null, forward, familiesToSeek, settings)); } } + return normalizedValuesToFields; + } - Collection reverse = seekRanges.getValue1(); - if (!reverse.isEmpty()) { - List settings = getIteratorSettingsForDiscovery(getConfig(), getConfig().getLiterals(), getConfig().getPatterns(), - getConfig().getRanges(), true); - if (isCheckpointable()) { - // if checkpointable, then only one range per query data so that the whole checkpointing thing works correctly - for (Range range : reverse) { - queries.add(new QueryData(config.getReverseIndexTableName(), null, Collections.singleton(range), familiesToSeek, settings)); + /** + * This attempts to normalize all of the {@code } tuples with the corresponding {@code } tuple. The Normalization object + * will determine whether a regex or literal is being normalized. + * + * See the {@link PatternNormalization} and {@link LiteralNormalization} implementations. + * + * @param normalization + * the normalizer object + * @param valuesToFields + * mapping of values to fields + * @param dataTypeMap + * the data type map + * @return a mapping of the normalized ranges + */ + private Multimap> normalizeRanges(Normalization normalization, Multimap> valuesToFields, + Multimap> dataTypeMap) { + Multimap> normalizedValuesToFields = HashMultimap.create(); + for (Entry> valueAndField : valuesToFields.entries()) { + String field = valueAndField.getKey(); + LiteralRange value = valueAndField.getValue(); + for (Type dataType : dataTypeMap.get(field)) { + try { + log.debug("Attempting to normalize [" + value + "] with [" + dataType.getClass() + "]"); + String normalizedLower = normalization.normalize(dataType, field, value.getLower().toString()); + String normalizedUpper = normalization.normalize(dataType, field, value.getUpper().toString()); + normalizedValuesToFields.put(field, new LiteralRange<>(normalizedLower, value.isLowerInclusive(), normalizedUpper, value.isUpperInclusive(), + value.getFieldName(), value.getNodeOperand())); + log.debug("Normalization succeeded!"); + } catch (Exception exception) { + log.debug("Normalization failed."); } - } else { - queries.add(new QueryData(config.getReverseIndexTableName(), null, reverse, familiesToSeek, settings)); } } + return normalizedValuesToFields; + } + + /** + * Create and return a list of planned queries. 
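+ * When the logic is checkpointable, each range is wrapped in its own {@link QueryData} so that a checkpoint corresponds to exactly one range.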
+ * + * @param config + * the config + * @return the list of query data + */ + private List createQueries(DiscoveryQueryConfiguration config) throws TableNotFoundException, ExecutionException { + final List queries = Lists.newLinkedList(); + + Set familiesToSeek = Sets.newHashSet(); // This will be populated by createRanges(). + Pair,Set> seekRanges = createRanges(config, familiesToSeek, metadataHelper); + + // Create the forward queries. + queries.addAll(createQueriesFromRanges(config, seekRanges.getValue0(), familiesToSeek, false)); + + // Create the reverse queries. + queries.addAll(createQueriesFromRanges(config, seekRanges.getValue1(), familiesToSeek, true)); if (log.isDebugEnabled()) { log.debug("Created ranges: " + queries); @@ -245,67 +362,161 @@ public List createQueries(DiscoveryQueryConfiguration config) throws return queries; } - @Override - public void setupQuery(GenericQueryConfiguration genericConfig) throws QueryException, TableNotFoundException, IOException, ExecutionException { - if (!genericConfig.getClass().getName().equals(DiscoveryQueryConfiguration.class.getName())) { - throw new QueryException("Did not receive a DiscoveryQueryConfiguration instance!!"); + /** + * Create planned queries for the given ranges. + * + * @param config + * the config + * @param ranges + * the ranges + * @param familiesToSeek + * the families to seek + * @param reversed + * whether the ranges are for the reversed index + * @return the queries + */ + private List createQueriesFromRanges(DiscoveryQueryConfiguration config, Set ranges, Set familiesToSeek, boolean reversed) { + List queries = new ArrayList<>(); + if (!ranges.isEmpty()) { + List settings = getIteratorSettings(config, reversed); + String tableName = reversed ? config.getReverseIndexTableName() : config.getIndexTableName(); + if (isCheckpointable()) { + for (Range range : ranges) { + queries.add(new QueryData(tableName, null, Collections.singleton(range), familiesToSeek, settings)); + } + } else { + queries.add(new QueryData(tableName, null, ranges, familiesToSeek, settings)); + } } - this.config = (DiscoveryQueryConfiguration) genericConfig; - final List> iterators = Lists.newArrayList(); + return queries; + } - for (QueryData qd : config.getQueries()) { - if (log.isDebugEnabled()) { - log.debug("Creating scanner for " + qd); + /** + * Creates two collections of ranges: one for the forward index (value0) and one for the reverse index (value1). If a literal has a field name, then the + * Range for that term will include the column family. If there are multiple fields, then multiple ranges are created. + * + * @param config + * the discovery config + * @param familiesToSeek + * the families to seek + * @param metadataHelper + * a metadata helper + * @return a pair of ranges + * @throws TableNotFoundException + * if the table is not found + * @throws ExecutionException + * for execution exceptions + */ + private Pair,Set> createRanges(DiscoveryQueryConfiguration config, Set familiesToSeek, MetadataHelper metadataHelper) + throws TableNotFoundException, ExecutionException { + Set forwardRanges = new HashSet<>(); + Set reverseRanges = new HashSet<>(); + + // Evaluate the literals. + for (Entry literalAndField : config.getLiterals().entries()) { + String literal = literalAndField.getKey(), field = literalAndField.getValue(); + // If the field is _ANYFIELD_, use null when making the range. + field = Constants.ANY_FIELD.equals(field) ? null : field; + // Mark the field as a family to seek if not null. 
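+ // In the global index the row is the term and the column family is the field, so seeking the
+ // field as a column family restricts the scan to entries for that field.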
+ if (field != null) { + familiesToSeek.add(field); } - // scan the table - BatchScanner bs = scannerFactory.newScanner(qd.getTableName(), config.getAuthorizations(), config.getNumQueryThreads(), config.getQuery()); + forwardRanges.add(ShardIndexQueryTableStaticMethods.getLiteralRange(field, literal)); + } - bs.setRanges(qd.getRanges()); - for (IteratorSetting setting : qd.getSettings()) { - bs.addScanIterator(setting); + // Evaluate the ranges. + for (Entry> rangeEntry : config.getRanges().entries()) { + LiteralRange range = rangeEntry.getValue(); + String field = rangeEntry.getKey(); + // If the field is _ANYFIELD_, use null when making the range. + field = Constants.ANY_FIELD.equals(field) ? null : field; + // Mark the field as a family to seek if not null. + if (field != null) { + familiesToSeek.add(field); } - for (String cf : qd.getColumnFamilies()) { - bs.fetchColumnFamily(new Text(cf)); + try { + forwardRanges.add(ShardIndexQueryTableStaticMethods.getBoundedRangeRange(range)); + } catch (IllegalRangeArgumentException e) { + log.error("Error using range [" + range + "]", e); } + } - iterators.add(transformScanner(bs, qd)); + // Evaluate the patterns. + for (Entry patternAndField : config.getPatterns().entries()) { + String pattern = patternAndField.getKey(), field = patternAndField.getValue(); + // If the field is _ANYFIELD_, use null when making the range. + field = Constants.ANY_FIELD.equals(field) ? null : field; + // Mark the field as a family to seek if not null. + if (field != null) { + familiesToSeek.add(field); + } + ShardIndexQueryTableStaticMethods.RefactoredRangeDescription description; + try { + description = ShardIndexQueryTableStaticMethods.getRegexRange(field, pattern, false, metadataHelper, config); + } catch (JavaRegexParseException e) { + log.error("Error parsing pattern [" + pattern + "]", e); + continue; + } + if (description.isForReverseIndex) { + reverseRanges.add(description.range); + } else { + forwardRanges.add(description.range); + } } - this.iterator = concat(iterators.iterator()); - } - public static List getIteratorSettingsForDiscovery(DiscoveryQueryConfiguration config, Multimap literals, - Multimap patterns, Multimap> ranges, boolean reverseIndex) { + return Pair.with(forwardRanges, reverseRanges); + } + /** + * Return the set of iterator settings that should be applied to queries for the given configuration. + * + * @param config + * the config + * @param reverseIndex + * whether the iterator settings should be configured for a reversed index + * @return the iterator settings + */ + private List getIteratorSettings(DiscoveryQueryConfiguration config, boolean reverseIndex) { List settings = Lists.newLinkedList(); - // The begin date from the query may be down to the second, for doing lookups in the index we want to use the day because - // the times in the index table have been truncated to the day. + + // Add a date range filter. + // The begin date from the query may be down to the second, for doing look-ups in the index we want to use the day because the times in the index table + // have been truncated to the day. Date begin = DateUtils.truncate(config.getBeginDate(), Calendar.DAY_OF_MONTH); - // we don't need to bump up the end date any more because it's not apart of the range set on the scanner + // we don't need to bump up the end date any more because it's not a part of the range set on the scanner. 
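+ // For example, a begin date of 2024-03-15T12:34:56 truncates to 2024-03-15T00:00:00, while the
+ // end date is passed through unchanged.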
Date end = config.getEndDate(); - LongRange dateRange = new LongRange(begin.getTime(), end.getTime()); - settings.add(ShardIndexQueryTableStaticMethods.configureGlobalIndexDateRangeFilter(config, dateRange)); + + // Add a datatype filter. settings.add(ShardIndexQueryTableStaticMethods.configureGlobalIndexDataTypeFilter(config, config.getDatatypeFilter())); - IteratorSetting matchingIterator = configureIndexMatchingIterator(config, literals, patterns, ranges, reverseIndex); + // Add an iterator to match literals, patterns, and ranges against the index. + IteratorSetting matchingIterator = configureIndexMatchingIterator(config, reverseIndex); if (matchingIterator != null) { settings.add(matchingIterator); } - IteratorSetting discoveryIteratorSetting = new IteratorSetting(config.getBaseIteratorPriority() + 50, DiscoveryIterator.class); - discoveryIteratorSetting.addOption(REVERSE_INDEX, Boolean.toString(reverseIndex)); - discoveryIteratorSetting.addOption(SEPARATE_COUNTS_BY_COLVIS, config.getSeparateCountsByColVis().toString()); - if (config.getShowReferenceCount()) { - discoveryIteratorSetting.addOption(SHOW_REFERENCE_COUNT, config.getShowReferenceCount().toString()); - } - settings.add(discoveryIteratorSetting); + // Add an iterator to create the actual DiscoveryThings. + settings.add(configureDiscoveryIterator(config, reverseIndex)); return settings; } - public static final IteratorSetting configureIndexMatchingIterator(DiscoveryQueryConfiguration config, Multimap literals, - Multimap patterns, Multimap> ranges, boolean reverseIndex) { + /** + * Return a {@link IteratorSetting} for an {@link IndexMatchingIterator}. + * + * @param config + * the config + * @param reverseIndex + * whether searching against the reversed index. + * @return the iterator setting + */ + private IteratorSetting configureIndexMatchingIterator(DiscoveryQueryConfiguration config, boolean reverseIndex) { + Multimap literals = config.getLiterals(); + Multimap patterns = config.getPatterns(); + Multimap> ranges = config.getRanges(); + if ((literals == null || literals.isEmpty()) && (patterns == null || patterns.isEmpty()) && (ranges == null || ranges.isEmpty())) { return null; } @@ -314,6 +525,7 @@ public static final IteratorSetting configureIndexMatchingIterator(DiscoveryQuer IteratorSetting cfg = new IteratorSetting(config.getBaseIteratorPriority() + 23, "termMatcher", IndexMatchingIterator.class); IndexMatchingIterator.Configuration conf = new IndexMatchingIterator.Configuration(); + // Add literals. if (literals != null) { for (Entry literal : literals.entries()) { if (Constants.ANY_FIELD.equals(literal.getValue())) { @@ -323,6 +535,7 @@ public static final IteratorSetting configureIndexMatchingIterator(DiscoveryQuer } } } + // Add patterns. if (patterns != null) { for (Entry pattern : patterns.entries()) { if (Constants.ANY_FIELD.equals(pattern.getValue())) { @@ -332,6 +545,7 @@ public static final IteratorSetting configureIndexMatchingIterator(DiscoveryQuer } } } + // Add ranges. if (ranges != null) { for (Entry> range : ranges.entries()) { if (Constants.ANY_FIELD.equals(range.getKey())) { @@ -343,12 +557,57 @@ public static final IteratorSetting configureIndexMatchingIterator(DiscoveryQuer } cfg.addOption(IndexMatchingIterator.CONF, IndexMatchingIterator.gson().toJson(conf)); - cfg.addOption(IndexMatchingIterator.REVERSE_INDEX, Boolean.toString(reverseIndex)); return cfg; } + /** + * Return an {@link IteratorSetting} for an {@link DiscoveryIterator}. 
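+ * The setting is registered at priority {@code baseIteratorPriority + 50} so it runs after the {@code termMatcher} iterator configured at {@code baseIteratorPriority + 23}.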
+ * + * @param config + * the config + * @param reverseIndex + * whether searching against the reversed index. + * @return the iterator setting + */ + private IteratorSetting configureDiscoveryIterator(DiscoveryQueryConfiguration config, boolean reverseIndex) { + IteratorSetting setting = new IteratorSetting(config.getBaseIteratorPriority() + 50, DiscoveryIterator.class); + setting.addOption(REVERSE_INDEX, Boolean.toString(reverseIndex)); + setting.addOption(SEPARATE_COUNTS_BY_COLVIS, Boolean.toString(config.getSeparateCountsByColVis())); + setting.addOption(SHOW_REFERENCE_COUNT, Boolean.toString(config.getShowReferenceCount())); + setting.addOption(SUM_COUNTS, Boolean.toString(config.getSumCounts())); + return setting; + } + + @Override + public void setupQuery(GenericQueryConfiguration genericConfig) throws QueryException, TableNotFoundException, IOException, ExecutionException { + if (!genericConfig.getClass().getName().equals(DiscoveryQueryConfiguration.class.getName())) { + throw new QueryException("Did not receive a DiscoveryQueryConfiguration instance!!"); + } + this.config = (DiscoveryQueryConfiguration) genericConfig; + final List> iterators = Lists.newArrayList(); + + for (QueryData qd : config.getQueries()) { + if (log.isDebugEnabled()) { + log.debug("Creating scanner for " + qd); + } + // scan the table + BatchScanner bs = scannerFactory.newScanner(qd.getTableName(), config.getAuthorizations(), config.getNumQueryThreads(), config.getQuery()); + + bs.setRanges(qd.getRanges()); + for (IteratorSetting setting : qd.getSettings()) { + bs.addScanIterator(setting); + } + for (String cf : qd.getColumnFamilies()) { + bs.fetchColumnFamily(new Text(cf)); + } + + iterators.add(transformScanner(bs, qd)); + } + this.iterator = concat(iterators.iterator()); + } + @Override public ShardIndexQueryTable clone() { return new DiscoveryLogic(this); @@ -361,7 +620,7 @@ public ShardIndexQueryTable clone() { * a batch scanner * @return iterator for discoveredthings */ - public static Iterator transformScanner(final BatchScanner scanner, final QueryData queryData) { + private Iterator transformScanner(final BatchScanner scanner, final QueryData queryData) { return concat(transform(scanner.iterator(), new Function,Iterator>() { DataInputBuffer in = new DataInputBuffer(); @@ -386,183 +645,35 @@ public Iterator apply(Entry from) { })); } - /** - * Makes two collections of ranges: one for the forward index (value0) and one for the reverse index (value1). - * - * If a literal has a field name, then the Range for that term will include the column family. If there are multiple fields, then multiple ranges are - * created. - * - * @param config - * the discovery config - * @param familiesToSeek - * the families to seek - * @param metadataHelper - * a metadata helper - * @return a pair of ranges - * @throws TableNotFoundException - * if the table is not found - * @throws ExecutionException - * for execution exceptions - */ - @SuppressWarnings("unchecked") - public static Pair,Set> makeRanges(DiscoveryQueryConfiguration config, Set familiesToSeek, MetadataHelper metadataHelper) - throws TableNotFoundException, ExecutionException { - Set forwardRanges = new HashSet<>(); - for (Entry literalAndField : config.getLiterals().entries()) { - String literal = literalAndField.getKey(), field = literalAndField.getValue(); - // if we're _ANYFIELD_, then use null when making the literal range - field = Constants.ANY_FIELD.equals(field) ? 
null : field; - if (field != null) { - familiesToSeek.add(field); - } - forwardRanges.add(ShardIndexQueryTableStaticMethods.getLiteralRange(field, literal)); - } - for (Entry> rangeEntry : config.getRanges().entries()) { - LiteralRange range = rangeEntry.getValue(); - String field = rangeEntry.getKey(); - // if we're _ANYFIELD_, then use null when making the literal range - field = Constants.ANY_FIELD.equals(field) ? null : field; - if (field != null) { - familiesToSeek.add(field); - } - try { - forwardRanges.add(ShardIndexQueryTableStaticMethods.getBoundedRangeRange(range)); - } catch (IllegalRangeArgumentException e) { - log.error("Error using range [" + range + "]", e); - continue; - } - } - Set reverseRanges = new HashSet<>(); - for (Entry patternAndField : config.getPatterns().entries()) { - String pattern = patternAndField.getKey(), field = patternAndField.getValue(); - // if we're _ANYFIELD_, then use null when making the literal range - field = Constants.ANY_FIELD.equals(field) ? null : field; - ShardIndexQueryTableStaticMethods.RefactoredRangeDescription description; - try { - if (field != null) { - familiesToSeek.add(field); - } - description = ShardIndexQueryTableStaticMethods.getRegexRange(field, pattern, false, metadataHelper, config); - } catch (JavaRegexParseException e) { - log.error("Error parsing pattern [" + pattern + "]", e); - continue; - } - if (description.isForReverseIndex) { - reverseRanges.add(description.range); - } else { - forwardRanges.add(description.range); - } - } - return Pair.with(forwardRanges, reverseRanges); - } - - /** - * This attempts to normalize all of the {@code } tuples with the corresponding {@code } tuple. The Normalization object - * will determine whether or not a regex or literal is being normalized. - * - * See the {@link PatternNormalization} and {@link LiteralNormalization} implementations. - * - * @param normalization - * the normalizer object - * @param valuesToFields - * mapping of values to fields - * @param dataTypeMap - * the data type map - * @return a mapping of the noramlized tuples - */ - public static Multimap normalize(Normalization normalization, Multimap valuesToFields, Multimap> dataTypeMap) { - Multimap normalizedValuesToFields = HashMultimap.create(); - for (Entry valueAndField : valuesToFields.entries()) { - String value = valueAndField.getKey(), field = valueAndField.getValue(); - for (Type dataType : dataTypeMap.get(field)) { - try { - log.debug("Attempting to normalize [" + value + "] with [" + dataType.getClass() + "]"); - String normalized = normalization.normalize(dataType, field, value); - normalizedValuesToFields.put(normalized, field); - log.debug("Normalization succeeded!"); - } catch (Exception exception) { - log.debug("Normalization failed."); - } - } - } - return normalizedValuesToFields; - } - - /** - * This attempts to normalize all of the {@code } tuples with the corresponding {@code } tuple. The Normalization object - * will determine whether or not a regex or literal is being normalized. - * - * See the {@link PatternNormalization} and {@link LiteralNormalization} implementations. 
- * - * @param normalization - * the normalizer object - * @param valuesToFields - * mapping of values to fields - * @param dataTypeMap - * the data type map - * @return a mapping of the normalized ranges - */ - public static Multimap> normalizeRanges(Normalization normalization, Multimap> valuesToFields, - Multimap> dataTypeMap) { - Multimap> normalizedValuesToFields = HashMultimap.create(); - for (Entry> valueAndField : valuesToFields.entries()) { - String field = valueAndField.getKey(); - LiteralRange value = valueAndField.getValue(); - for (Type dataType : dataTypeMap.get(field)) { - try { - log.debug("Attempting to normalize [" + value + "] with [" + dataType.getClass() + "]"); - String normalizedLower = normalization.normalize(dataType, field, value.getLower().toString()); - String normalizedUpper = normalization.normalize(dataType, field, value.getUpper().toString()); - normalizedValuesToFields.put(field, new LiteralRange<>(normalizedLower, value.isLowerInclusive(), normalizedUpper, value.isUpperInclusive(), - value.getFieldName(), value.getNodeOperand())); - log.debug("Normalization succeeded!"); - } catch (Exception exception) { - log.debug("Normalization failed."); - } - } - } - return normalizedValuesToFields; - } - - /** - * Given a sequence of objects of type T, this method will return a single object for every unique type passed in. This is used to dedupe normalizer - * instances by their type, so that we only get 1 instance per type of normalizer. - * - * @param things - * iterable list of objects - * @param - * type of the objects - * @return an object for each type passed in - */ - public static Collection uniqueByType(Iterable things) { - Map,T> map = Maps.newHashMap(); - for (T t : things) { - map.put(t.getClass(), t); - } - return map.values(); - } - @Override public Set getOptionalQueryParameters() { Set params = super.getOptionalQueryParameters(); params.add(SEPARATE_COUNTS_BY_COLVIS); + params.add(SUM_COUNTS); return params; } - public Boolean getSeparateCountsByColVis() { + public boolean getSeparateCountsByColVis() { return getConfig().getSeparateCountsByColVis(); } - public void setSeparateCountsByColVis(Boolean separateCountsByColVis) { + public void setSeparateCountsByColVis(boolean separateCountsByColVis) { getConfig().setSeparateCountsByColVis(separateCountsByColVis); } - public Boolean getShowReferenceCount() { + public boolean getShowReferenceCount() { return getConfig().getShowReferenceCount(); } - public void setShowReferenceCount(Boolean showReferenceCount) { + public void setShowReferenceCount(boolean showReferenceCount) { getConfig().setShowReferenceCount(showReferenceCount); } + public boolean getSumCounts() { + return getConfig().getSumCounts(); + } + + public void setSumCounts(boolean sumCounts) { + getConfig().setSumCounts(sumCounts); + } } diff --git a/warehouse/query-core/src/main/java/datawave/query/discovery/DiscoveryQueryConfiguration.java b/warehouse/query-core/src/main/java/datawave/query/discovery/DiscoveryQueryConfiguration.java index 13c8fa25d75..59d09666450 100644 --- a/warehouse/query-core/src/main/java/datawave/query/discovery/DiscoveryQueryConfiguration.java +++ b/warehouse/query-core/src/main/java/datawave/query/discovery/DiscoveryQueryConfiguration.java @@ -3,6 +3,7 @@ import java.io.Serializable; import java.util.Collection; import java.util.Objects; +import java.util.StringJoiner; import com.google.common.collect.Multimap; @@ -17,8 +18,9 @@ public class DiscoveryQueryConfiguration extends ShardIndexQueryConfiguration implements 
Serializable { private Multimap literals, patterns; private Multimap> ranges; - private Boolean separateCountsByColVis = false; - private Boolean showReferenceCount = false; + private boolean separateCountsByColVis = false; + private boolean showReferenceCount = false; + private boolean sumCounts = false; public DiscoveryQueryConfiguration() {} @@ -116,23 +118,31 @@ public void setPatterns(Multimap patterns) { this.patterns = patterns; } - public Boolean getSeparateCountsByColVis() { + public boolean getSeparateCountsByColVis() { return separateCountsByColVis; } - public Boolean getShowReferenceCount() { + public boolean getShowReferenceCount() { return showReferenceCount; } + public boolean getSumCounts() { + return sumCounts; + } + public void setSeparateCountsByColVis(boolean separateCountsByColVis) { this.separateCountsByColVis = separateCountsByColVis; } - public void setShowReferenceCount(Boolean showReferenceCount) { + public void setShowReferenceCount(boolean showReferenceCount) { this.showReferenceCount = showReferenceCount; } + public void setSumCounts(boolean sumCounts) { + this.sumCounts = sumCounts; + } + @Override public DiscoveryQueryConfiguration checkpoint() { // Create a new config that only contains what is needed to execute the specified ranges @@ -156,4 +166,11 @@ public boolean equals(Object o) { public int hashCode() { return Objects.hash(super.hashCode(), literals, patterns, ranges, separateCountsByColVis, showReferenceCount); } + + @Override + public String toString() { + return new StringJoiner(", ", DiscoveryQueryConfiguration.class.getSimpleName() + "[", "]").add("literals=" + literals).add("patterns=" + patterns) + .add("ranges=" + ranges).add("separateCountsByColVis=" + separateCountsByColVis).add("showReferenceCount=" + showReferenceCount) + .add("sumCounts=" + sumCounts).toString(); + } } diff --git a/warehouse/query-core/src/main/java/datawave/query/index/lookup/IndexInfo.java b/warehouse/query-core/src/main/java/datawave/query/index/lookup/IndexInfo.java index 19bec2cb83b..bc2a0dc8781 100644 --- a/warehouse/query-core/src/main/java/datawave/query/index/lookup/IndexInfo.java +++ b/warehouse/query-core/src/main/java/datawave/query/index/lookup/IndexInfo.java @@ -328,11 +328,17 @@ public IndexInfo union(IndexInfo o, List delayedNodes) { merged.count = merged.uids.size(); } - merged.setFieldCounts(this.getFieldCounts()); - merged.mergeFieldCounts(o.getFieldCounts()); + if (this == o) { + // handle idiosyncrasy of the peeking iterator where the first term is merged with itself + merged.setFieldCounts(o.getFieldCounts()); + merged.setTermCounts(o.getTermCounts()); + } else { + merged.setFieldCounts(getFieldCounts()); + merged.setTermCounts(getTermCounts()); - merged.setTermCounts(this.getTermCounts()); - merged.mergeTermCounts(o.getTermCounts()); + merged.mergeFieldCounts(o.getFieldCounts()); + merged.mergeTermCounts(o.getTermCounts()); + } /* * If there are multiple levels within a union we could have an ASTOrNode. 
We cannot prune OrNodes as we would with an intersection, so propagate the diff --git a/warehouse/query-core/src/main/java/datawave/query/index/lookup/RangeStream.java b/warehouse/query-core/src/main/java/datawave/query/index/lookup/RangeStream.java index 0eb3fe6b144..9d1f2951563 100644 --- a/warehouse/query-core/src/main/java/datawave/query/index/lookup/RangeStream.java +++ b/warehouse/query-core/src/main/java/datawave/query/index/lookup/RangeStream.java @@ -157,8 +157,8 @@ public RangeStream(ShardQueryConfiguration config, ScannerFactory scanners, Meta streamExecutor = new ThreadPoolExecutor(executeLookupMin, maxLookup, 100, TimeUnit.MILLISECONDS, runnables); fieldDataTypes = config.getQueryFieldsDatatypes(); collapseUids = config.getCollapseUids(); - fieldCounts = config.getUseFieldCounts(); - termCounts = config.getUseTermCounts(); + fieldCounts = config.isSortQueryPostIndexWithFieldCounts(); + termCounts = config.isSortQueryPostIndexWithTermCounts(); try { Set ioFields = metadataHelper.getIndexOnlyFields(null); if (null != ioFields) { @@ -264,8 +264,8 @@ public Iterator iterator() { this.itr = filter(concat(transform(queryStream, new TupleToRange(config.getShardTableName(), queryStream.currentNode(), config))), getEmptyPlanPruner()); - if (config.isSortQueryByCounts() && (config.getUseFieldCounts() || config.getUseTermCounts())) { - this.itr = transform(itr, new OrderingTransform(config.getUseFieldCounts(), config.getUseTermCounts())); + if (config.isSortQueryPostIndexWithFieldCounts() || config.isSortQueryPostIndexWithTermCounts()) { + this.itr = transform(itr, new OrderingTransform(config.isSortQueryPostIndexWithFieldCounts(), config.isSortQueryPostIndexWithTermCounts())); } } } finally { @@ -362,7 +362,7 @@ public QueryPlan apply(QueryPlan plan) { Map counts = plan.getTermCounts().getCounts(); OrderByCostVisitor.orderByTermCount(plan.getQueryTree(), counts); } else if (useFieldCounts) { - Map counts = plan.getTermCounts().getCounts(); + Map counts = plan.getFieldCounts().getCounts(); OrderByCostVisitor.orderByFieldCount(plan.getQueryTree(), counts); } return plan; @@ -602,6 +602,10 @@ public ScannerStream visit(ASTEQNode node, Object data) { String queryString = fieldName + "=='" + literal + "'"; options.addScanIterator(QueryScannerHelper.getQueryInfoIterator(config.getQuery(), false, queryString)); + // easier to apply hints to new options than deal with copying existing hints between + options.applyExecutionHints(config.getIndexTableName(), config.getTableHints()); + options.applyConsistencyLevel(config.getIndexTableName(), config.getTableConsistencyLevels()); + scannerSession.setOptions(options); scannerSession.setMaxResults(config.getMaxIndexBatchSize()); scannerSession.setExecutor(streamExecutor); diff --git a/warehouse/query-core/src/main/java/datawave/query/index/lookup/ShardLimitingIterator.java b/warehouse/query-core/src/main/java/datawave/query/index/lookup/ShardLimitingIterator.java index 6681b9ebd0b..4972d173f5f 100644 --- a/warehouse/query-core/src/main/java/datawave/query/index/lookup/ShardLimitingIterator.java +++ b/warehouse/query-core/src/main/java/datawave/query/index/lookup/ShardLimitingIterator.java @@ -9,7 +9,7 @@ import org.apache.accumulo.core.data.Key; import org.apache.accumulo.core.data.Value; -import org.apache.accumulo.core.util.PeekingIterator; +import org.apache.commons.collections4.iterators.PeekingIterator; import org.apache.hadoop.io.Text; import org.apache.log4j.Logger; diff --git 
a/warehouse/query-core/src/main/java/datawave/query/index/lookup/ShardRangeStream.java b/warehouse/query-core/src/main/java/datawave/query/index/lookup/ShardRangeStream.java index 2b437ea61c5..1d763edb37a 100644 --- a/warehouse/query-core/src/main/java/datawave/query/index/lookup/ShardRangeStream.java +++ b/warehouse/query-core/src/main/java/datawave/query/index/lookup/ShardRangeStream.java @@ -12,7 +12,7 @@ import org.apache.accumulo.core.data.PartialKey; import org.apache.accumulo.core.data.Range; import org.apache.accumulo.core.data.Value; -import org.apache.accumulo.core.util.PeekingIterator; +import org.apache.commons.collections4.iterators.PeekingIterator; import org.apache.commons.jexl3.parser.JexlNode; import com.google.common.base.Function; diff --git a/warehouse/query-core/src/main/java/datawave/query/jexl/functions/ContentFunctionsDescriptor.java b/warehouse/query-core/src/main/java/datawave/query/jexl/functions/ContentFunctionsDescriptor.java index 82d9e9b24fa..96452c57741 100644 --- a/warehouse/query-core/src/main/java/datawave/query/jexl/functions/ContentFunctionsDescriptor.java +++ b/warehouse/query-core/src/main/java/datawave/query/jexl/functions/ContentFunctionsDescriptor.java @@ -15,6 +15,7 @@ import java.util.List; import java.util.Map; import java.util.Set; +import java.util.stream.Collectors; import org.apache.accumulo.core.client.TableNotFoundException; import org.apache.commons.jexl3.parser.ASTAndNode; @@ -37,6 +38,7 @@ import com.google.common.collect.Lists; import com.google.common.collect.PeekingIterator; import com.google.common.collect.Sets; +import com.google.common.collect.Streams; import datawave.query.attributes.AttributeFactory; import datawave.query.config.ShardQueryConfiguration; @@ -104,19 +106,18 @@ public JexlNode getIndexQuery(Set termFrequencyFields, Set index // get the cartesian product of all the fields and terms MutableBoolean oredFields = new MutableBoolean(); - Set[] fieldsAndTerms = fieldsAndTerms(termFrequencyFields, indexedFields, contentFields, oredFields, true); - if (!fieldsAndTerms[0].isEmpty()) { + FieldTerms fieldsAndTerms = fieldsAndTerms(termFrequencyFields, indexedFields, contentFields, oredFields, true); + Set fields = fieldsAndTerms.getFields(); + if (!fields.isEmpty()) { final JexlNode eq = new ASTEQNode(ParserTreeConstants.JJTEQNODE); - - for (String field : fieldsAndTerms[0]) { - nodes.add(JexlNodeFactory.createNodeTreeFromFieldValues(ContainerType.AND_NODE, eq, null, field, fieldsAndTerms[1])); - } + Set terms = fieldsAndTerms.getTerms(); + fields.forEach(field -> nodes.add(JexlNodeFactory.createNodeTreeFromFieldValues(ContainerType.AND_NODE, eq, null, field, terms))); } - if (fieldsAndTerms[0].size() == 0) { + if (fields.isEmpty()) { log.warn("No fields found for content function, will not expand index query"); return new ASTTrueNode(ParserTreeConstants.JJTTRUENODE); - } else if (fieldsAndTerms[0].size() == 1) { + } else if (fields.size() == 1) { // A single field needs no wrapper node. 
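The rewritten loop above emits one conjunction of EQ nodes per field over the shared term set and ORs those conjunctions across fields. A minimal runnable sketch of that expansion shape, with plain strings standing in for the JexlNodeFactory output (the class name and the example zones/terms are hypothetical, not DataWave code):

import java.util.List;
import java.util.stream.Collectors;

public class IndexQueryExpansionSketch {
    public static void main(String[] args) {
        List<String> fields = List.of("BODY", "TITLE"); // hypothetical zones
        List<String> terms = List.of("foo", "bar");     // hypothetical terms
        // one AND of EQ nodes per field, OR'd across fields
        String query = fields.stream()
                .map(f -> terms.stream().map(t -> f + " == '" + t + "'")
                        .collect(Collectors.joining(" && ", "(", ")")))
                .collect(Collectors.joining(" || ", "(", ")"));
        // prints ((BODY == 'foo' && BODY == 'bar') || (TITLE == 'foo' && TITLE == 'bar'))
        System.out.println(query);
    }
}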
return nodes.iterator().next(); } else if (oredFields.booleanValue()) { @@ -194,7 +195,7 @@ public Set fieldsForNormalization(MetadataHelper helper, Set dat public Set fields(MetadataHelper helper, Set datatypeFilter) { try { return fieldsAndTerms(helper.getTermFrequencyFields(datatypeFilter), helper.getIndexedFields(datatypeFilter), - helper.getContentFields(datatypeFilter), null)[0]; + helper.getContentFields(datatypeFilter), null).getFields(); } catch (TableNotFoundException e) { QueryException qe = new QueryException(DatawaveErrorCode.METADATA_TABLE_FETCH_ERROR, e); throw new DatawaveFatalQueryException(qe); @@ -206,15 +207,15 @@ public Set fields(MetadataHelper helper, Set datatypeFilter) { public Set> fieldSets(MetadataHelper helper, Set datatypeFilter) { try { MutableBoolean oredFields = new MutableBoolean(); - Set[] fieldsAndTerms = fieldsAndTerms(helper.getTermFrequencyFields(datatypeFilter), helper.getIndexedFields(datatypeFilter), + FieldTerms fieldsAndTerms = fieldsAndTerms(helper.getTermFrequencyFields(datatypeFilter), helper.getIndexedFields(datatypeFilter), helper.getContentFields(datatypeFilter), oredFields); Set> fieldSets = new HashSet<>(); if (oredFields.booleanValue()) { - for (String field : fieldsAndTerms[0]) { + for (String field : fieldsAndTerms.getFields()) { fieldSets.add(Collections.singleton(field)); } } else { - fieldSets.add(fieldsAndTerms[0]); + fieldSets.add(fieldsAndTerms.getFields()); } return fieldSets; } catch (TableNotFoundException e) { @@ -224,174 +225,200 @@ public Set> fieldSets(MetadataHelper helper, Set datatypeFil } - public Set[] fieldsAndTerms(Set termFrequencyFields, Set indexedFields, Set contentFields, MutableBoolean oredFields) { + public FieldTerms fieldsAndTerms(Set termFrequencyFields, Set indexedFields, Set contentFields, MutableBoolean oredFields) { return fieldsAndTerms(termFrequencyFields, indexedFields, contentFields, oredFields, false); } @SuppressWarnings("unchecked") - public Set[] fieldsAndTerms(Set termFrequencyFields, Set indexedFields, Set contentFields, MutableBoolean oredFields, + public FieldTerms fieldsAndTerms(Set termFrequencyFields, Set indexedFields, Set contentFields, MutableBoolean oredFields, boolean validateFields) { + if (this.args.isEmpty()) { + NotFoundQueryException qe = new NotFoundQueryException(DatawaveErrorCode.JEXL_NODES_MISSING, + MessageFormat.format("Class: {0}, Namespace: {1}, Function: {2}", this.getClass().getSimpleName(), this.namespace, this.name)); + throw new IllegalArgumentException(qe); + } - final String funcName = name; - - PeekingIterator args = Iterators.peekingIterator(this.args.iterator()); - - Set termFreqFields = Sets.newHashSet(termFrequencyFields); - Set fields = Sets.newHashSetWithExpectedSize(termFreqFields.size()); - Set terms = Sets.newHashSetWithExpectedSize(this.args.size() - 1); - Iterator itr = termFreqFields.iterator(); // Can any one of the fields satisfy the query? Always true unless the zone is specified in an AND clause. 
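The oredFields flag referenced in the comment above defaults to true (any single zone may satisfy the query) and is lowered only when the zone argument is an AND of fields. A small sketch of that contract, assuming commons-lang3's MutableBoolean and a hypothetical helper name:

import org.apache.commons.lang3.mutable.MutableBoolean;

public class OredFieldsSketch {
    // mirrors the flag handling in fieldsAndTerms(): default to true, lower to
    // false only when the zones were AND'd, e.g. content:phrase((A && B), map, 'x')
    static void applyZoneSemantics(MutableBoolean oredFields, boolean zoneIsAndClause) {
        if (oredFields != null) {
            oredFields.setValue(true);      // any one zone may satisfy the query
            if (zoneIsAndClause) {
                oredFields.setValue(false); // all AND'd zones must match
            }
        }
    }

    public static void main(String[] args) {
        MutableBoolean ored = new MutableBoolean();
        applyZoneSemantics(ored, false);
        System.out.println(ored); // true
        applyZoneSemantics(ored, true);
        System.out.println(ored); // false
    }
}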
if (oredFields != null) { oredFields.setValue(true); } - while (itr.hasNext()) { - String field = itr.next(); - if (indexedFields.contains(field) && (contentFields.isEmpty() || contentFields.contains(field))) { - fields.add(field); - } - } - - if (args.hasNext()) { - JexlNode termOffsetMap = null; - if (CONTENT_ADJACENT_FUNCTION_NAME.equals(funcName)) { - JexlNode firstArg = args.next(); + PeekingIterator argsIterator = Iterators.peekingIterator(this.args.iterator()); + FieldTerms fieldTerms = new FieldTerms(); + JexlNode termOffsetMap; - // we override the zones if the first argument is a string - if (firstArg instanceof ASTStringLiteral) { - fields = Collections.singleton(JexlNodes.getIdentifierOrLiteralAsString(firstArg)); - termOffsetMap = args.next(); - } else { - JexlNode nextArg = args.peek(); - - // The zones may (more likely) be specified as an idenfifier - if (!JexlASTHelper.getIdentifiers(firstArg).isEmpty() && !JexlASTHelper.getIdentifiers(nextArg).isEmpty()) { - if (oredFields != null && firstArg instanceof ASTAndNode) { - oredFields.setValue(false); - } - - fields = JexlASTHelper.getIdentifierNames(firstArg); - termOffsetMap = args.next(); - } else { - termOffsetMap = firstArg; - } - } - } else if (CONTENT_PHRASE_FUNCTION_NAME.equals(funcName)) { - JexlNode firstArg = args.next(); + switch (this.name) { + case CONTENT_ADJACENT_FUNCTION_NAME: + termOffsetMap = examineContentAdjacentFunction(argsIterator, fieldTerms, oredFields); + break; + case CONTENT_PHRASE_FUNCTION_NAME: + termOffsetMap = examineContentPhraseFunction(argsIterator, fieldTerms, oredFields); + break; + case CONTENT_SCORED_PHRASE_FUNCTION_NAME: + termOffsetMap = examineContentScoredPhraseFunction(argsIterator, fieldTerms, oredFields); + break; + case CONTENT_WITHIN_FUNCTION_NAME: + termOffsetMap = examineContentWithinFunction(argsIterator, fieldTerms, oredFields); + break; + default: + BadRequestQueryException qe = new BadRequestQueryException(DatawaveErrorCode.FUNCTION_ARGUMENTS_MISSING); + throw new IllegalArgumentException(qe); + } - // we override the zones if the first argument is a string - if (firstArg instanceof ASTStringLiteral) { - fields = Collections.singleton(((ASTStringLiteral) firstArg).getLiteral()); + // Verify that a term offset map with terms was specified. - termOffsetMap = args.next(); - } else { - JexlNode nextArg = args.peek(); + validateTermsOffsetMapAndTermsPresent(termOffsetMap, argsIterator); - // The zones may (more likely) be specified as an identifier - if (!JexlASTHelper.getIdentifiers(firstArg).isEmpty() && !JexlASTHelper.getIdentifiers(nextArg).isEmpty()) { - if (oredFields != null && firstArg instanceof ASTAndNode) { - oredFields.setValue(false); - } - - fields = JexlASTHelper.getIdentifierNames(firstArg); - termOffsetMap = args.next(); - } else { - termOffsetMap = firstArg; - } + // If the fields were not established above, ensure that the fields at least contain any term frequency fields that are indexed and, if any content + // fields were specified, present within them as well. + if (fieldTerms.fields == null) { + Set fields = termFrequencyFields.stream() + .filter(f -> indexedFields.contains(f) && (contentFields.isEmpty() || contentFields.contains(f))).collect(Collectors.toSet()); + fieldTerms.fields = fields; + } + // Moving this validation later in the call stack, since it requires other processing (i.e.
apply query model) + if (validateFields) { + for (String field : fieldTerms.fields) { + // Deconstruct & upcase the fieldname for testing in case we have not normalized the field names yet. Return the unnormalized fieldname. + if (!termFrequencyFields.contains(JexlASTHelper.deconstructIdentifier(field.toUpperCase()))) { + PreConditionFailedQueryException qe = new PreConditionFailedQueryException(DatawaveErrorCode.FIELD_PHRASE_QUERY_NOT_INDEXED, + MessageFormat.format("Field: {0}", field)); + throw new IllegalArgumentException(qe); } - } else if (CONTENT_SCORED_PHRASE_FUNCTION_NAME.equals(funcName)) { - JexlNode arg = args.next(); + } + } - if (arg instanceof ASTNumberLiteral || arg instanceof ASTUnaryMinusNode) { - // if the first argument is a number, then no field exists - // for example, content:scoredPhrase(-1.5, termOffsetMap, 'value') - termOffsetMap = args.next(); - } else { - if (arg instanceof ASTIdentifier) { - // single field case - // for example, content:scoredPhrase(FIELD, -1.5, termOffsetMap, 'value') - fields = Collections.singleton(String.valueOf(JexlASTHelper.getIdentifier(arg))); - } else { - // multi field case - // for example, content:scoredPhrase((FIELD_A || FIELD_B), -1.5, termOffsetMap, 'value') - Set identifiers = JexlASTHelper.getIdentifierNames(arg); - if (!identifiers.isEmpty()) { - fields = identifiers; - - if (oredFields != null && arg instanceof ASTAndNode) { - oredFields.setValue(false); - } - } - } + // Now take the remaining string literals in the arguments as terms. + Set terms = Sets.newHashSetWithExpectedSize(this.args.size() - 1); + // @formatter:off + Streams.stream(argsIterator) + .filter(ASTStringLiteral.class::isInstance) + .map(JexlNodes::getIdentifierOrLiteralAsString) + .forEach(terms::add); + // @formatter:on + fieldTerms.terms = terms; + + return fieldTerms; + } - // skip score because it is not needed when gathering just the fields and values from a function - args.next(); + // Finds and sets the fields for a content:adjacent function, and returns the anticipated terms offset map node. + private JexlNode examineContentAdjacentFunction(PeekingIterator argsIterator, FieldTerms fieldTerms, MutableBoolean oredFields) { + JexlNode firstArg = argsIterator.next(); + if (firstArg instanceof ASTStringLiteral) { + fieldTerms.fields = Collections.singleton(JexlNodes.getIdentifierOrLiteralAsString(firstArg)); + return argsIterator.next(); + } else { + JexlNode nextArg = argsIterator.peek(); + // The zones may (more likely) be specified as an identifier + if (!JexlASTHelper.getIdentifiers(firstArg).isEmpty() && !JexlASTHelper.getIdentifiers(nextArg).isEmpty()) { + if (oredFields != null && firstArg instanceof ASTAndNode) { + oredFields.setValue(false); + } + fieldTerms.fields = JexlASTHelper.getIdentifierNames(firstArg); + return argsIterator.next(); + } else { + return firstArg; + } + } + } - termOffsetMap = args.next(); + // Finds and sets the fields for a content:phrase function, and returns the anticipated terms offset map node.
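The term-gathering step above drains whatever arguments remain once the zones and the term offset map have been consumed. A self-contained sketch of the same Guava Streams.stream(...) idiom, assuming Guava on the classpath and plain strings in place of ASTStringLiteral nodes (the class name is illustrative):

import java.util.Iterator;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;

import com.google.common.collect.Streams;

public class RemainingArgsSketch {
    public static void main(String[] args) {
        // strings stand in for ASTStringLiteral; the integer is a non-literal
        // argument that the filter drops, as in the diff above
        Iterator<Object> remaining = List.<Object>of("foo", 42, "bar").iterator();
        Set<String> terms = Streams.stream(remaining)
                .filter(String.class::isInstance)
                .map(String.class::cast)
                .collect(Collectors.toSet());
        System.out.println(terms); // [foo, bar] (order may vary)
    }
}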
+ private JexlNode examineContentPhraseFunction(PeekingIterator argsIterator, FieldTerms fieldTerms, MutableBoolean oredFields) { + JexlNode firstArg = argsIterator.next(); + // we override the zones if the first argument is a string + if (firstArg instanceof ASTStringLiteral) { + fieldTerms.fields = Collections.singleton(((ASTStringLiteral) firstArg).getLiteral()); + return argsIterator.next(); + } else { + JexlNode nextArg = argsIterator.peek(); + // The zones may (more likely) be specified as an identifier + if (!JexlASTHelper.getIdentifiers(firstArg).isEmpty() && !JexlASTHelper.getIdentifiers(nextArg).isEmpty()) { + if (oredFields != null && firstArg instanceof ASTAndNode) { + oredFields.setValue(false); } - } else if (CONTENT_WITHIN_FUNCTION_NAME.equals(funcName)) { - JexlNode arg = args.next(); + fieldTerms.fields = JexlASTHelper.getIdentifierNames(firstArg); + return argsIterator.next(); + } else { + return firstArg; + } + } + } - // we override the zones if the first argument is a string or identifier - if (arg instanceof ASTStringLiteral) { - fields = Collections.singleton(JexlNodes.getIdentifierOrLiteralAsString(arg)); - arg = args.next(); - } else if (!JexlASTHelper.getIdentifiers(arg).isEmpty()) { - if (oredFields != null && arg instanceof ASTAndNode) { + // Finds and sets the fields for a content:scoredPhrase function, and returns the anticipated terms offset map node. + private JexlNode examineContentScoredPhraseFunction(PeekingIterator argsIterator, FieldTerms fieldTerms, MutableBoolean oredFields) { + JexlNode firstArg = argsIterator.next(); + if (firstArg instanceof ASTNumberLiteral || firstArg instanceof ASTUnaryMinusNode) { + // if the first argument is a number, then no field exists + // for example, content:scoredPhrase(-1.5, termOffsetMap, 'value') + return argsIterator.next(); + } else { + if (firstArg instanceof ASTIdentifier) { + // single field case + // for example, content:scoredPhrase(FIELD, -1.5, termOffsetMap, 'value') + fieldTerms.fields = Collections.singleton(String.valueOf(JexlASTHelper.getIdentifier(firstArg))); + } else { + // multi field case + // for example, content:scoredPhrase((FIELD_A || FIELD_B), -1.5, termOffsetMap, 'value') + Set identifiers = JexlASTHelper.getIdentifierNames(firstArg); + if (!identifiers.isEmpty()) { + fieldTerms.fields = identifiers; + if (oredFields != null && firstArg instanceof ASTAndNode) { oredFields.setValue(false); } - - fields = JexlASTHelper.getIdentifierNames(arg); - arg = args.next(); } + } - // we can trash the distance - if (!(arg instanceof ASTNumberLiteral || arg instanceof ASTUnaryMinusNode)) { - BadRequestQueryException qe = new BadRequestQueryException(DatawaveErrorCode.NUMERIC_DISTANCE_ARGUMENT_MISSING); - throw new IllegalArgumentException(qe); - } + // skip score because it is not needed when gathering just the fields and values from a function + argsIterator.next(); + return argsIterator.next(); + } + } - termOffsetMap = args.next(); - } else { - BadRequestQueryException qe = new BadRequestQueryException(DatawaveErrorCode.FUNCTION_ARGUMENTS_MISSING); - throw new IllegalArgumentException(qe); + // Finds and sets the fields for a content:within function, and returns the anticipated terms offset map node.
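All four examine* helpers apply the same first-argument rule: a string literal or a non-empty identifier set overrides the default zones, and anything else is treated as the term offset map itself. A simplified runnable sketch of that rule (Java 17; the record types are stand-ins for the Jexl node classes, not DataWave API):

import java.util.Set;

public class ZoneOverrideSketch {
    sealed interface Arg permits StringLit, Identifiers, OffsetMap {}
    record StringLit(String value) implements Arg {}
    record Identifiers(Set<String> names) implements Arg {}
    record OffsetMap() implements Arg {}

    static Set<String> zones(Arg firstArg, Set<String> defaults) {
        if (firstArg instanceof StringLit s) {
            return Set.of(s.value());                 // literal zone overrides defaults
        } else if (firstArg instanceof Identifiers ids && !ids.names().isEmpty()) {
            return ids.names();                       // identifier zones override defaults
        }
        return defaults;                              // first arg was the term offset map
    }

    public static void main(String[] args) {
        System.out.println(zones(new StringLit("BODY"), Set.of()));              // [BODY]
        System.out.println(zones(new Identifiers(Set.of("A", "B")), Set.of()));  // [A, B] (order may vary)
        System.out.println(zones(new OffsetMap(), Set.of("TF_FIELD")));          // [TF_FIELD]
    }
}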
+ private JexlNode examineContentWithinFunction(PeekingIterator argsIterator, FieldTerms fieldTerms, MutableBoolean oredFields) { + JexlNode arg = argsIterator.next(); + // we override the zones if the first argument is a string or identifier + if (arg instanceof ASTStringLiteral) { + fieldTerms.fields = Collections.singleton(JexlNodes.getIdentifierOrLiteralAsString(arg)); + arg = argsIterator.next(); + } else if (!JexlASTHelper.getIdentifiers(arg).isEmpty()) { + if (oredFields != null && arg instanceof ASTAndNode) { + oredFields.setValue(false); } - if (null == termOffsetMap || !(termOffsetMap instanceof ASTIdentifier)) { - BadRequestQueryException qe = new BadRequestQueryException(DatawaveErrorCode.TERMOFFSETMAP_AND_TERMS_MISSING); - throw new IllegalArgumentException(qe); - } + fieldTerms.fields = JexlASTHelper.getIdentifierNames(arg); + arg = argsIterator.next(); + } - if (!args.hasNext()) { - BadRequestQueryException qe = new BadRequestQueryException(DatawaveErrorCode.TERMS_MISSING); - throw new IllegalArgumentException(qe); - } + // we can trash the distance + if (!(arg instanceof ASTNumberLiteral || arg instanceof ASTUnaryMinusNode)) { + BadRequestQueryException qe = new BadRequestQueryException(DatawaveErrorCode.NUMERIC_DISTANCE_ARGUMENT_MISSING); + throw new IllegalArgumentException(qe); + } - // moving this validation later in the call stack, since it requires other processing (i.e. apply query model) - if (validateFields) { - for (String field : fields) { - // deconstruct & upcase the fieldname for testing in case we have not normalized the field names yet. Return the unnormalized fieldname. - if (!termFreqFields.contains(JexlASTHelper.deconstructIdentifier(field.toUpperCase()))) { - PreConditionFailedQueryException qe = new PreConditionFailedQueryException(DatawaveErrorCode.FIELD_PHRASE_QUERY_NOT_INDEXED, - MessageFormat.format("Field: {0}", field)); - throw new IllegalArgumentException(qe); - } - } - } + return argsIterator.next(); + } - // now take the remaining string literals as terms - Iterator termsItr = Iterators.transform(Iterators.filter(args, new StringLiteralsOnly()), new GetImage()); - while (termsItr.hasNext()) { - terms.add(termsItr.next()); - } + /** + * Throws a {@link BadRequestQueryException} if termsOffsetMap is not an instance of {@link ASTIdentifier} or if there are no more nodes in the + * iterator. 
+ * + * @param termOffsetMap + * the terms offset map node + * @param argsIterator + * the iterator of arguments + */ + private void validateTermsOffsetMapAndTermsPresent(JexlNode termOffsetMap, PeekingIterator argsIterator) { + if (!(termOffsetMap instanceof ASTIdentifier)) { + BadRequestQueryException qe = new BadRequestQueryException(DatawaveErrorCode.TERMOFFSETMAP_AND_TERMS_MISSING); + throw new IllegalArgumentException(qe); + } - } else { - NotFoundQueryException qe = new NotFoundQueryException(DatawaveErrorCode.JEXL_NODES_MISSING, - MessageFormat.format("Class: {0}, Namespace: {1}, Function: {2}", this.getClass().getSimpleName(), namespace, funcName)); + if (!argsIterator.hasNext()) { + BadRequestQueryException qe = new BadRequestQueryException(DatawaveErrorCode.TERMS_MISSING); throw new IllegalArgumentException(qe); } - return new Set[] {fields, terms}; } /** @@ -616,6 +643,29 @@ public boolean allowIvaratorFiltering() { } } + public static class FieldTerms { + + private Set fields; + private Set terms; + + public FieldTerms() { + fields = null; + terms = null; + } + + public Set getFields() { + return fields; + } + + public int totalFields() { + return fields.size(); + } + + public Set getTerms() { + return terms; + } + } + @Override public ContentJexlArgumentDescriptor getArgumentDescriptor(ASTFunctionNode node) { FunctionJexlNodeVisitor fvis = new FunctionJexlNodeVisitor(); @@ -636,5 +686,4 @@ public ContentJexlArgumentDescriptor getArgumentDescriptor(ASTFunctionNode node) return new ContentJexlArgumentDescriptor(node, fvis.namespace(), fvis.name(), fvis.args()); } - } diff --git a/warehouse/query-core/src/main/java/datawave/query/jexl/lookups/BoundedRangeIndexLookup.java b/warehouse/query-core/src/main/java/datawave/query/jexl/lookups/BoundedRangeIndexLookup.java index e3c207b39d5..9301f22f490 100644 --- a/warehouse/query-core/src/main/java/datawave/query/jexl/lookups/BoundedRangeIndexLookup.java +++ b/warehouse/query-core/src/main/java/datawave/query/jexl/lookups/BoundedRangeIndexLookup.java @@ -1,11 +1,11 @@ package datawave.query.jexl.lookups; -import java.io.IOException; +import static datawave.query.jexl.lookups.ShardIndexQueryTableStaticMethods.EXPANSION_HINT_KEY; + import java.text.MessageFormat; import java.util.Collections; import java.util.Iterator; import java.util.Map.Entry; -import java.util.SortedMap; import java.util.concurrent.Callable; import java.util.concurrent.CountDownLatch; import java.util.concurrent.ExecutorService; @@ -18,15 +18,15 @@ import org.apache.accumulo.core.data.Key; import org.apache.accumulo.core.data.Range; import org.apache.accumulo.core.data.Value; -import org.apache.accumulo.core.iterators.user.WholeRowIterator; import org.apache.hadoop.io.Text; import org.apache.log4j.Logger; import org.springframework.util.StringUtils; +import com.google.common.base.Joiner; import com.google.common.base.Preconditions; import datawave.core.common.logging.ThreadConfigurableLogger; -import datawave.core.iterators.ColumnQualifierRangeIterator; +import datawave.core.iterators.BoundedRangeExpansionIterator; import datawave.core.iterators.CompositeSeekingIterator; import datawave.core.iterators.TimeoutExceptionIterator; import datawave.core.iterators.TimeoutIterator; @@ -126,25 +126,25 @@ public synchronized void submit() { log.debug("Range: " + range); bs = null; try { - bs = scannerFactory.newScanner(config.getIndexTableName(), config.getAuthorizations(), config.getNumQueryThreads(), config.getQuery()); + // the 'newScanner' method in the ScannerFactory has 
no knowledge about the 'expansion' hint, so determine hint here + String hintKey = config.getTableHints().containsKey(EXPANSION_HINT_KEY) ? EXPANSION_HINT_KEY : config.getIndexTableName(); + + bs = scannerFactory.newScanner(config.getIndexTableName(), config.getAuthorizations(), config.getNumQueryThreads(), config.getQuery(), hintKey); bs.setRanges(Collections.singleton(range)); bs.fetchColumnFamily(new Text(literalRange.getFieldName())); - // set up the GlobalIndexRangeSamplingIterator - - IteratorSetting cfg = new IteratorSetting(config.getBaseIteratorPriority() + 50, "WholeRowIterator", WholeRowIterator.class); - bs.addScanIterator(cfg); - - cfg = new IteratorSetting(config.getBaseIteratorPriority() + 48, "DateFilter", ColumnQualifierRangeIterator.class); - // search from 20YYddMM to 20ZZddMM\uffff to ensure we encompass all of the current day - String end = endDay + Constants.MAX_UNICODE_STRING; - cfg.addOption(ColumnQualifierRangeIterator.RANGE_NAME, ColumnQualifierRangeIterator.encodeRange(new Range(startDay, end))); - - bs.addScanIterator(cfg); + IteratorSetting setting = new IteratorSetting(config.getBaseIteratorPriority() + 20, "BoundedRangeExpansionIterator", + BoundedRangeExpansionIterator.class); + setting.addOption(BoundedRangeExpansionIterator.START_DATE, startDay); + setting.addOption(BoundedRangeExpansionIterator.END_DATE, endDay); + if (!config.getDatatypeFilter().isEmpty()) { + setting.addOption(BoundedRangeExpansionIterator.DATATYPES_OPT, Joiner.on(',').join(config.getDatatypeFilter())); + } + bs.addScanIterator(setting); // If this is a composite field, with multiple terms, we need to setup our query to filter based on each component of the composite range - if (config.getCompositeToFieldMap().get(literalRange.getFieldName()) != null) { + if (!config.getCompositeToFieldMap().get(literalRange.getFieldName()).isEmpty()) { String compositeSeparator = null; if (config.getCompositeFieldSeparators() != null) @@ -169,8 +169,8 @@ public synchronized void submit() { } if (null != fairnessIterator) { - cfg = new IteratorSetting(config.getBaseIteratorPriority() + 100, TimeoutExceptionIterator.class); - bs.addScanIterator(cfg); + IteratorSetting timeoutSetting = new IteratorSetting(config.getBaseIteratorPriority() + 100, TimeoutExceptionIterator.class); + bs.addScanIterator(timeoutSetting); } timedScanFuture = execService.submit(createTimedCallable(bs.iterator())); @@ -180,13 +180,6 @@ public synchronized void submit() { log.error(qe); throw new DatawaveFatalQueryException(qe); - } catch (IOException e) { - QueryException qe = new QueryException(DatawaveErrorCode.RANGE_CREATE_ERROR, e, MessageFormat.format("{0}", this.literalRange)); - log.debug(qe); - if (bs != null) { - scannerFactory.close(bs); - } - throw new IllegalRangeArgumentException(qe); } } } @@ -233,6 +226,7 @@ protected Callable createTimedCallable(final Iterator> Key k = entry.getKey(); + log.info("tk: " + k.toStringNoTime()); if (log.isTraceEnabled()) { log.trace("Forward Index entry: " + entry.getKey()); } @@ -240,61 +234,22 @@ protected Callable createTimedCallable(final Iterator> k.getRow(holder); String uniqueTerm = holder.toString(); - SortedMap keymap = WholeRowIterator.decodeRow(entry.getKey(), entry.getValue()); - - String field = null; - - boolean foundDataType = false; - - for (Key topKey : keymap.keySet()) { - if (null == field) { - topKey.getColumnFamily(holder); - field = holder.toString(); - } - // Get the column qualifier from the key. 
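The hint-key selection introduced here recurs throughout this change set: prefer the reserved "expansion" executor pool when the configured hints define one, otherwise fall back to the index table name. A minimal sketch of that fallback with a hypothetical hints map (EXPANSION_HINT_KEY mirrors the constant added to ShardIndexQueryTableStaticMethods; the hint contents are illustrative):

import java.util.Map;

public class HintKeySketch {
    static final String EXPANSION_HINT_KEY = "expansion";

    static String chooseHintKey(Map<String, Map<String, String>> tableHints, String indexTableName) {
        return tableHints.containsKey(EXPANSION_HINT_KEY) ? EXPANSION_HINT_KEY : indexTableName;
    }

    public static void main(String[] args) {
        Map<String, Map<String, String>> hints = Map.of(EXPANSION_HINT_KEY, Map.of("scan_type", "expansion"));
        System.out.println(chooseHintKey(hints, "shardIndex"));    // expansion
        System.out.println(chooseHintKey(Map.of(), "shardIndex")); // shardIndex
    }
}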
It - // contains the datatype and normalizer class - - if (null != topKey.getColumnQualifier()) { - if (null != config.getDatatypeFilter() && !config.getDatatypeFilter().isEmpty()) { - - String colq = topKey.getColumnQualifier().toString(); - int idx = colq.indexOf(Constants.NULL); - - if (idx != -1) { - String type = colq.substring(idx + 1); - - // If types are specified and this type - // is not in the list, skip it. - if (config.getDatatypeFilter().contains(type)) { - if (log.isTraceEnabled()) { - log.trace(config.getDatatypeFilter() + " contains " + type); - } - - foundDataType = true; - break; - } - } - } else { - foundDataType = true; - } - } - } - if (foundDataType) { + k.getColumnFamily(holder); + String field = holder.toString(); - // obtaining the size of a map can be expensive, - // instead - // track the count of each unique item added. - indexLookupMap.put(field, uniqueTerm); + // safety check... + Preconditions.checkState(field.equals(literalRange.getFieldName()), + "Got an unexpected field name when expanding range " + field + " " + literalRange.getFieldName()); - // safety check... - Preconditions.checkState(field.equals(literalRange.getFieldName()), - "Got an unexpected field name when expanding range" + field + " " + literalRange.getFieldName()); + // obtaining the size of a map can be expensive, + // instead + // track the count of each unique item added. + indexLookupMap.put(field, uniqueTerm); - // If this range expands into to many values, we can - // stop - if (indexLookupMap.get(field).isThresholdExceeded()) { - return true; - } + // If this range expands into too many values, we can + // stop + if (indexLookupMap.get(field).isThresholdExceeded()) { + return true; } } } catch (Exception e) { diff --git a/warehouse/query-core/src/main/java/datawave/query/jexl/lookups/FieldNameIndexLookup.java b/warehouse/query-core/src/main/java/datawave/query/jexl/lookups/FieldNameIndexLookup.java index c1c068e5bc1..b40001d5fd2 100644 --- a/warehouse/query-core/src/main/java/datawave/query/jexl/lookups/FieldNameIndexLookup.java +++ b/warehouse/query-core/src/main/java/datawave/query/jexl/lookups/FieldNameIndexLookup.java @@ -14,7 +14,6 @@ import java.util.concurrent.Future; import java.util.concurrent.atomic.AtomicLong; -import org.apache.accumulo.core.client.TableNotFoundException; import org.apache.accumulo.core.data.Key; import org.apache.accumulo.core.data.Range; import org.apache.accumulo.core.data.Value; diff --git a/warehouse/query-core/src/main/java/datawave/query/jexl/lookups/ShardIndexQueryTableStaticMethods.java b/warehouse/query-core/src/main/java/datawave/query/jexl/lookups/ShardIndexQueryTableStaticMethods.java index 74671654e43..6c82c7f01f1 100644 --- a/warehouse/query-core/src/main/java/datawave/query/jexl/lookups/ShardIndexQueryTableStaticMethods.java +++ b/warehouse/query-core/src/main/java/datawave/query/jexl/lookups/ShardIndexQueryTableStaticMethods.java @@ -68,6 +68,9 @@ public class ShardIndexQueryTableStaticMethods { private static FastDateFormat formatter = FastDateFormat.getInstance("yyyyMMdd"); + // name reserved for executor pools + public static final String EXPANSION_HINT_KEY = "expansion"; + /** * Create an IndexLookup task to find field names give a JexlNode and a set of Types for that node * @@ -440,9 +443,13 @@ public static Range getLiteralRange(String fieldName, String normalizedQueryTerm * check for limiting unique terms * @return the scanner session * @throws InvocationTargetException + * if no target exists + * @throws NoSuchMethodException + * if
no method exists * @throws InstantiationException + * if there is a problem initializing * @throws IllegalAccessException + * if there is an illegal access * @throws IOException * dates can't be formatted */ @@ -455,7 +462,9 @@ public static ScannerSession configureTermMatchOnly(ShardQueryConfiguration conf return null; } - ScannerSession bs = scannerFactory.newLimitedScanner(AnyFieldScanner.class, tableName, config.getAuthorizations(), config.getQuery()); + String hintKey = config.getTableHints().containsKey(EXPANSION_HINT_KEY) ? EXPANSION_HINT_KEY : config.getIndexTableName(); + + ScannerSession bs = scannerFactory.newLimitedScanner(AnyFieldScanner.class, tableName, config.getAuthorizations(), config.getQuery(), hintKey); bs.setRanges(ranges); @@ -483,7 +492,9 @@ public static ScannerSession configureLimitedDiscovery(ShardQueryConfiguration c return null; } - ScannerSession bs = scannerFactory.newLimitedScanner(AnyFieldScanner.class, tableName, config.getAuthorizations(), config.getQuery()); + String hintKey = config.getTableHints().containsKey(EXPANSION_HINT_KEY) ? EXPANSION_HINT_KEY : tableName; + + ScannerSession bs = scannerFactory.newLimitedScanner(AnyFieldScanner.class, tableName, config.getAuthorizations(), config.getQuery(), hintKey); bs.setRanges(ranges); @@ -511,6 +522,13 @@ public static final void configureGlobalIndexDateRangeFilter(ShardQueryConfigura } IteratorSetting cfg = configureGlobalIndexDateRangeFilter(config, dateRange); bs.addScanIterator(cfg); + + // unused method, but we'll still configure execution hints if possible + String executionHintKey = config.getTableHints().containsKey(EXPANSION_HINT_KEY) ? EXPANSION_HINT_KEY : config.getIndexTableName(); + + if (config.getTableHints().containsKey(executionHintKey)) { + bs.setExecutionHints(config.getTableHints().get(executionHintKey)); + } } public static final IteratorSetting configureGlobalIndexDateRangeFilter(ShardQueryConfiguration config, LongRange dateRange) { @@ -580,6 +598,16 @@ public static final void configureGlobalIndexTermMatchingIterator(ShardQueryConf bs.addScanIterator(cfg); + // unused method, but we'll still configure execution hints if possible + if (!reverseIndex) { + // only apply hints to the global index + String hintKey = config.getTableHints().containsKey(EXPANSION_HINT_KEY) ? 
EXPANSION_HINT_KEY : config.getIndexTableName(); + + if (config.getTableHints().containsKey(hintKey)) { + bs.setExecutionHints(config.getTableHints().get(hintKey)); + } + } + setExpansionFields(config, bs, reverseIndex, expansionFields); } diff --git a/warehouse/query-core/src/main/java/datawave/query/jexl/nodes/AbstractNodeCostComparator.java b/warehouse/query-core/src/main/java/datawave/query/jexl/nodes/AbstractNodeCostComparator.java new file mode 100644 index 00000000000..9c2cc475401 --- /dev/null +++ b/warehouse/query-core/src/main/java/datawave/query/jexl/nodes/AbstractNodeCostComparator.java @@ -0,0 +1,108 @@ +package datawave.query.jexl.nodes; + +import java.util.Map; + +import org.apache.commons.jexl3.parser.ASTAndNode; +import org.apache.commons.jexl3.parser.ASTOrNode; +import org.apache.commons.jexl3.parser.ASTReference; +import org.apache.commons.jexl3.parser.ASTReferenceExpression; +import org.apache.commons.jexl3.parser.JexlNode; +import org.apache.commons.jexl3.parser.JexlNodes; +import org.apache.commons.jexl3.parser.ParserTreeConstants; + +import datawave.query.jexl.JexlASTHelper; +import datawave.query.util.count.CountMap; + +/** + * Class that contains core logic for field and term comparators + */ +public abstract class AbstractNodeCostComparator extends JexlNodeComparator { + private static final long NODE_ID_MULTIPLIER = 5000L; + private static final int SEGMENT = Integer.MAX_VALUE / 48; + + private final DefaultJexlNodeComparator comparator = new DefaultJexlNodeComparator(); + + private final Map counts; + + /** + * Constructor that accepts a {@link CountMap} + * + * @param counts + * the count map + */ + protected AbstractNodeCostComparator(CountMap counts) { + this(counts.getCounts()); + } + + /** + * Constructor that accepts a {@link Map} of counts + * + * @param counts + * the count map + */ + protected AbstractNodeCostComparator(Map counts) { + this.counts = counts; + } + + @Override + public int compare(JexlNode left, JexlNode right) { + left = JexlASTHelper.dereference(left); + right = JexlASTHelper.dereference(right); + + int leftCost = getCostIndex(left); + int rightCost = getCostIndex(right); + + int result = Integer.compare(leftCost, rightCost); + + if (result == 0) { + result = comparator.compare(left, right); + } + + return result; + } + + @Override + public int getCostIndex(JexlNode node) { + if ((node instanceof ASTReference || node instanceof ASTReferenceExpression) && node.jjtGetNumChildren() == 1) { + return getCostIndex(node.jjtGetChild(0)); + } else if (node instanceof ASTOrNode) { + return getCostForUnion(node); + } else if (QueryPropertyMarker.findInstance(node).isAnyType()) { + return Integer.MAX_VALUE; + } else if (node instanceof ASTAndNode) { + return getCostForIntersection(node); + } else { + String key = getNodeKey(node); + long score = counts.getOrDefault(key, getDefaultScore(node)); + if (score > Integer.MAX_VALUE) { + score = Integer.MAX_VALUE; + } + return (int) score; + } + } + + /** + * This method is the only difference between calculating cost based on field or term + * + * @param node + * a JexlNode + * @return the node key + */ + abstract String getNodeKey(JexlNode node); + + private long getDefaultScore(JexlNode node) { + int id = JexlNodes.id(node); + switch (id) { + case ParserTreeConstants.JJTFUNCTIONNODE: + return SEGMENT - 4L; + case ParserTreeConstants.JJTNENODE: + return SEGMENT - 3L; + case ParserTreeConstants.JJTNRNODE: + return SEGMENT - 2L; + case ParserTreeConstants.JJTNOTNODE: + return SEGMENT - 1L; + default: + 
return id * NODE_ID_MULTIPLIER; + } + } +} diff --git a/warehouse/query-core/src/main/java/datawave/query/jexl/nodes/DefaultJexlNodeComparator.java b/warehouse/query-core/src/main/java/datawave/query/jexl/nodes/DefaultJexlNodeComparator.java new file mode 100644 index 00000000000..af8a2be45fe --- /dev/null +++ b/warehouse/query-core/src/main/java/datawave/query/jexl/nodes/DefaultJexlNodeComparator.java @@ -0,0 +1,87 @@ +package datawave.query.jexl.nodes; + +import org.apache.commons.jexl3.parser.ASTAndNode; +import org.apache.commons.jexl3.parser.ASTOrNode; +import org.apache.commons.jexl3.parser.ASTReference; +import org.apache.commons.jexl3.parser.ASTReferenceExpression; +import org.apache.commons.jexl3.parser.JexlNode; +import org.apache.commons.jexl3.parser.JexlNodes; +import org.apache.commons.jexl3.parser.ParserTreeConstants; + +import datawave.query.jexl.JexlASTHelper; + +/** + * Comparator that enforces default ordering according to implied cost + *

+ * Nodes are sorted by node type, then junction, then lexicographically + */ +public class DefaultJexlNodeComparator extends JexlNodeComparator { + + private static final int SEGMENT = Integer.MAX_VALUE / 48; + + private final JunctionComparator junctionComparator = new JunctionComparator(); + private final LexicographicalNodeComparator lexiComparator = new LexicographicalNodeComparator(); + + @Override + public int compare(JexlNode left, JexlNode right) { + left = JexlASTHelper.dereference(left); + right = JexlASTHelper.dereference(right); + + int result = Integer.compare(getCostIndex(left), getCostIndex(right)); + + // EQ vs. (EQ AND EQ) will match + if (result == 0) { + result = junctionComparator.compare(left, right); + } + + if (result == 0) { + result = lexiComparator.compare(left, right); + } + + return result; + } + + /** + * + * @param node + * an arbitrary JexlNode + * @return the node cost + */ + @Override + protected int getCostIndex(JexlNode node) { + if ((node instanceof ASTReference || node instanceof ASTReferenceExpression) && node.jjtGetNumChildren() == 1) { + return getCostIndex(node.jjtGetChild(0)); + } else if (node instanceof ASTOrNode) { + return getCostForUnion(node); + } else if (QueryPropertyMarker.findInstance(node).isAnyType()) { + return Integer.MAX_VALUE; + } else if (node instanceof ASTAndNode) { + return getCostForIntersection(node); + } else { + return getNodeScore(node); + } + } + + /** + * Wrapper around {@link JexlNodes#id(JexlNode)} so that we can boost the score of negated terms + * + * @param node + * any JexlNode + * @return a score for the node + */ + private int getNodeScore(JexlNode node) { + int id = JexlNodes.id(node); + switch (id) { + case ParserTreeConstants.JJTFUNCTIONNODE: + return SEGMENT - 4; + case ParserTreeConstants.JJTNENODE: + return SEGMENT - 3; + case ParserTreeConstants.JJTNRNODE: + return SEGMENT - 2; + case ParserTreeConstants.JJTNOTNODE: + return SEGMENT - 1; + default: + return id; + } + } +} diff --git a/warehouse/query-core/src/main/java/datawave/query/jexl/nodes/DefaultNodeCostComparator.java b/warehouse/query-core/src/main/java/datawave/query/jexl/nodes/DefaultNodeCostComparator.java deleted file mode 100644 index fa5edcc8db7..00000000000 --- a/warehouse/query-core/src/main/java/datawave/query/jexl/nodes/DefaultNodeCostComparator.java +++ /dev/null @@ -1,69 +0,0 @@ -package datawave.query.jexl.nodes; - -import org.apache.commons.jexl3.parser.ASTAndNode; -import org.apache.commons.jexl3.parser.ASTOrNode; -import org.apache.commons.jexl3.parser.ASTReference; -import org.apache.commons.jexl3.parser.ASTReferenceExpression; -import org.apache.commons.jexl3.parser.JexlNode; -import org.apache.commons.jexl3.parser.JexlNodes; -import org.apache.commons.jexl3.parser.ParserTreeConstants; - -/** - * Provides default node cost calculations based on the Jexl node id - */ -public class DefaultNodeCostComparator extends NodeCostComparator { - - /** - * - * @param node - * an arbitrary JexlNode - * @return the node cost - */ - @Override - protected int getCostIndex(JexlNode node) { - if (node.jjtGetNumChildren() == 1 && (node instanceof ASTReference || node instanceof ASTReferenceExpression)) { - QueryPropertyMarker.Instance instance = QueryPropertyMarker.findInstance(node); - if (instance.isAnyType()) { - return Integer.MAX_VALUE - 4; - } - return getCostIndex(node.jjtGetChild(0)); - } else if (node instanceof ASTOrNode) { - int sum = 0; - for (int i = 0; i < node.jjtGetNumChildren(); i++) { - sum += getCostIndex(node.jjtGetChild(i)); - 
} - return sum; - } else if (node instanceof ASTAndNode) { - int lowest = Integer.MAX_VALUE; - for (int i = 0; i < node.jjtGetNumChildren(); i++) { - int cost = getCostIndex(node.jjtGetChild(i)); - if (cost < lowest) - lowest = cost; - } - return lowest; - } else { - return getNodeScore(node); - } - } - - /** - * Wrapper around {@link JexlNodes#id(JexlNode)} so that we can boost the score of negated terms - * - * @param node - * any JexlNode - * @return a score for the node - */ - private int getNodeScore(JexlNode node) { - int id = JexlNodes.id(node); - switch (id) { - case ParserTreeConstants.JJTNENODE: - return Integer.MAX_VALUE - 3; - case ParserTreeConstants.JJTNRNODE: - return Integer.MAX_VALUE - 2; - case ParserTreeConstants.JJTNOTNODE: - return Integer.MAX_VALUE - 1; - default: - return id; - } - } -} diff --git a/warehouse/query-core/src/main/java/datawave/query/jexl/nodes/FieldCostComparator.java b/warehouse/query-core/src/main/java/datawave/query/jexl/nodes/FieldCostComparator.java new file mode 100644 index 00000000000..eb3d1e2956c --- /dev/null +++ b/warehouse/query-core/src/main/java/datawave/query/jexl/nodes/FieldCostComparator.java @@ -0,0 +1,55 @@ +package datawave.query.jexl.nodes; + +import java.util.Map; + +import org.apache.commons.jexl3.parser.ASTFunctionNode; +import org.apache.commons.jexl3.parser.ASTNENode; +import org.apache.commons.jexl3.parser.ASTNRNode; +import org.apache.commons.jexl3.parser.ASTNotNode; +import org.apache.commons.jexl3.parser.JexlNode; + +import datawave.query.jexl.JexlASTHelper; +import datawave.query.util.count.CountMap; + +/** + * Comparator that operates on field cardinality + */ +public class FieldCostComparator extends AbstractNodeCostComparator { + + /** + * Constructor that accepts a {@link CountMap} + * + * @param counts + * the count map + */ + public FieldCostComparator(CountMap counts) { + this(counts.getCounts()); + } + + /** + * Constructor that accepts a {@link Map} of counts + * + * @param counts + * the count map + */ + public FieldCostComparator(Map counts) { + super(counts); + } + + /** + * The {@link FieldCostComparator} uses a node's identifier to calculate cost + * + * @param node + * a JexlNode + * @return the node key + */ + @Override + public String getNodeKey(JexlNode node) { + if (node instanceof ASTNotNode || node instanceof ASTNENode || node instanceof ASTNRNode || node instanceof ASTFunctionNode) { + // certain node types are always kicked out + return null; + } + return JexlASTHelper.getIdentifier(node); + } + +} diff --git a/warehouse/query-core/src/main/java/datawave/query/jexl/nodes/FieldOrTermNodeCostComparator.java b/warehouse/query-core/src/main/java/datawave/query/jexl/nodes/FieldOrTermNodeCostComparator.java deleted file mode 100644 index 4e66d8e9599..00000000000 --- a/warehouse/query-core/src/main/java/datawave/query/jexl/nodes/FieldOrTermNodeCostComparator.java +++ /dev/null @@ -1,116 +0,0 @@ -package datawave.query.jexl.nodes; - -import java.util.Map; - -import org.apache.commons.jexl3.parser.ASTAndNode; -import org.apache.commons.jexl3.parser.ASTFunctionNode; -import org.apache.commons.jexl3.parser.ASTNENode; -import org.apache.commons.jexl3.parser.ASTNRNode; -import org.apache.commons.jexl3.parser.ASTNotNode; -import org.apache.commons.jexl3.parser.ASTOrNode; -import org.apache.commons.jexl3.parser.ASTReference; -import org.apache.commons.jexl3.parser.ASTReferenceExpression; -import org.apache.commons.jexl3.parser.JexlNode; -import org.apache.commons.jexl3.parser.JexlNodes; -import 
org.apache.commons.jexl3.parser.ParserTreeConstants; - -import datawave.query.jexl.JexlASTHelper; -import datawave.query.jexl.visitors.JexlStringBuildingVisitor; - -/** - * Orders nodes based on field or term counts - */ -public class FieldOrTermNodeCostComparator extends NodeCostComparator { - - private final boolean isFieldCount; - private static final long NODE_ID_MULTIPLIER = 5000; - private final Map counts; - - public FieldOrTermNodeCostComparator(Map counts, boolean isFieldCount) { - this.counts = counts; - this.isFieldCount = isFieldCount; - } - - @Override - int getCostIndex(JexlNode node) { - if (node.jjtGetNumChildren() == 1 && (node instanceof ASTReference || node instanceof ASTReferenceExpression)) { - return getCostIndex(node.jjtGetChild(0)); - } else if (node instanceof ASTOrNode) { - int sum = 0; - for (int i = 0; i < node.jjtGetNumChildren(); i++) { - sum += getCostIndex(node.jjtGetChild(i)); - } - return sum; - } else if (QueryPropertyMarker.findInstance(node).isAnyType()) { - return Integer.MAX_VALUE; - } else if (node instanceof ASTAndNode) { - int lowest = Integer.MAX_VALUE; - for (int i = 0; i < node.jjtGetNumChildren(); i++) { - int cost = getCostIndex(node.jjtGetChild(i)); - if (cost < lowest) { - lowest = cost; - } - } - return lowest; - } else { - return getCostForLeaf(node); - } - } - - /** - * Get the cost for a leaf according to the count map. - *

- * The extra code to handle integer overflows is due to term counts in the global index being a Long but Java's {@link Comparable#compareTo(Object)} returns - * an integer. - * - * @param node - * a JexlNode - * @return an integer used to compare nodes - */ - private int getCostForLeaf(JexlNode node) { - String key = getNodeKey(node); - long value = counts.getOrDefault(key, getNodeScore(node)); - if (value > Integer.MAX_VALUE) { - value = Integer.MAX_VALUE; - } - return (int) value; - } - - /** - * Generate a key for the count map. It's either the field, or the whole node. - * - * @param node - * a JexlNode - * @return a node key - */ - private String getNodeKey(JexlNode node) { - if (node instanceof ASTNotNode || node instanceof ASTNENode || node instanceof ASTNRNode || node instanceof ASTFunctionNode) { - return "NO_KEY"; - } else if (isFieldCount) { - return JexlASTHelper.getIdentifier(node); - } else { - return JexlStringBuildingVisitor.buildQueryWithoutParse(node); - } - } - - /** - * Wrapper around {@link JexlNodes#id(JexlNode)} so that we can boost the score of negated terms - * - * @param node - * any JexlNode - * @return a score for the node - */ - private long getNodeScore(JexlNode node) { - int id = JexlNodes.id(node); - switch (id) { - case ParserTreeConstants.JJTNENODE: - return Integer.MAX_VALUE - 3L; - case ParserTreeConstants.JJTNRNODE: - return Integer.MAX_VALUE - 2L; - case ParserTreeConstants.JJTNOTNODE: - return Integer.MAX_VALUE - 1L; - default: - return id * NODE_ID_MULTIPLIER; - } - } -} diff --git a/warehouse/query-core/src/main/java/datawave/query/jexl/nodes/JexlNodeComparator.java b/warehouse/query-core/src/main/java/datawave/query/jexl/nodes/JexlNodeComparator.java new file mode 100644 index 00000000000..4796e20b5a6 --- /dev/null +++ b/warehouse/query-core/src/main/java/datawave/query/jexl/nodes/JexlNodeComparator.java @@ -0,0 +1,72 @@ +package datawave.query.jexl.nodes; + +import java.util.Comparator; + +import org.apache.commons.jexl3.parser.JexlNode; + +import datawave.query.jexl.JexlASTHelper; + +/** + * Comparator for JexlNodes. + *
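As a usage sketch: a comparator like this is typically handed to a sort over a junction's children. The helper below mirrors the visitJunction logic in OrderByCostVisitor later in this diff; the class and method names are illustrative only.

    import java.util.Arrays;
    import java.util.Comparator;

    import org.apache.commons.jexl3.parser.JexlNode;
    import org.apache.commons.jexl3.parser.JexlNodes;

    class SortSketch {
        // Pull the junction's children, sort them with the supplied comparator,
        // and write them back in their new order.
        static void sortChildren(JexlNode junction, Comparator<JexlNode> comparator) {
            JexlNode[] children = JexlNodes.getChildren(junction);
            Arrays.sort(children, comparator);
            JexlNodes.setChildren(junction, children);
        }
    }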

+ * Implementing classes may prioritize different features for sorting. For example, sorting leaves before junctions, EQ nodes before ER nodes, or sorting + * lexicographically by field and value. *
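The union and intersection cost rules implemented by this class reduce, in isolation, to a guarded sum and a minimum. A standalone sketch over plain int costs (toy names, not DataWave code):

    class JunctionCostSketch {
        // Union cost: the sum of all children, clamped on overflow
        // (mirrors getCostForUnion below).
        static int unionCost(int[] childCosts) {
            int cost = 0;
            for (int c : childCosts) {
                cost += c;
                if (cost == Integer.MAX_VALUE || cost < 0) {
                    return Integer.MAX_VALUE; // saturate instead of wrapping
                }
            }
            return cost;
        }

        // Intersection cost: the cheapest child drives the whole intersection
        // (mirrors getCostForIntersection below).
        static int intersectionCost(int[] childCosts) {
            int cost = Integer.MAX_VALUE;
            for (int c : childCosts) {
                cost = Math.min(cost, c);
            }
            return cost;
        }
    }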

+ * EQ < ER < Functions + */ +public abstract class JexlNodeComparator implements Comparator { + + @Override + public int compare(JexlNode left, JexlNode right) { + int leftCost = getCostIndex(JexlASTHelper.dereference(left)); + int rightCost = getCostIndex(JexlASTHelper.dereference(right)); + + return Integer.compare(leftCost, rightCost); + } + + /** + * Calculates a cost for the provided node + * + * @param node + * an arbitrary JexlNode + * @return the integer cost + */ + abstract int getCostIndex(JexlNode node); + + /** + * Get the cost for a union by summing the cost of each child + * + * @param node + * the union + * @return the cost + */ + protected int getCostForUnion(JexlNode node) { + int cost = 0; + for (int i = 0; i < node.jjtGetNumChildren(); i++) { + cost += getCostIndex(node.jjtGetChild(i)); + // check for overflows + if (cost == Integer.MAX_VALUE || cost < 0) { + return Integer.MAX_VALUE; + } + } + return cost; + } + + /** + * Get the cost for an intersection by taking the lowest cost of all children + * + * @param node + * the intersection + * @return the cost + */ + protected int getCostForIntersection(JexlNode node) { + int cost = Integer.MAX_VALUE; + for (int i = 0; i < node.jjtGetNumChildren(); i++) { + int childCost = getCostIndex(node.jjtGetChild(i)); + if (childCost < cost) { + cost = childCost; + } + } + return cost; + } +} diff --git a/warehouse/query-core/src/main/java/datawave/query/jexl/nodes/JunctionComparator.java b/warehouse/query-core/src/main/java/datawave/query/jexl/nodes/JunctionComparator.java new file mode 100644 index 00000000000..859d117700c --- /dev/null +++ b/warehouse/query-core/src/main/java/datawave/query/jexl/nodes/JunctionComparator.java @@ -0,0 +1,24 @@ +package datawave.query.jexl.nodes; + +import org.apache.commons.jexl3.parser.ASTAndNode; +import org.apache.commons.jexl3.parser.ASTOrNode; +import org.apache.commons.jexl3.parser.JexlNode; + +/** + * Comparator that pushes single leaf nodes to the left and junctions to the right + *
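The practical effect of this tiering, modeled on plain strings (purely illustrative, not DataWave code): leaves sort before unions, and unions before intersections.

    import java.util.ArrayList;
    import java.util.Comparator;
    import java.util.List;

    class TierSketch {
        public static void main(String[] args) {
            List<String> siblings = new ArrayList<>(List.of("AND(D, E)", "A == '1'", "OR(B, C)"));
            // 1 = leaf, 2 = union, 3 = intersection, as in getCostIndex below
            siblings.sort(Comparator.comparingInt((String s) -> s.startsWith("AND") ? 3 : s.startsWith("OR") ? 2 : 1));
            System.out.println(siblings); // [A == '1', OR(B, C), AND(D, E)]
        }
    }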

+ * Note: should only be used to break ties in other comparators. + */ +public class JunctionComparator extends JexlNodeComparator { + + @Override + public int getCostIndex(JexlNode node) { + if (node instanceof ASTAndNode && !QueryPropertyMarker.findInstance(node).isAnyType()) { + return 3; + } else if (node instanceof ASTOrNode) { + return 2; + } else { + return 1; + } + } +} diff --git a/warehouse/query-core/src/main/java/datawave/query/jexl/nodes/LexicographicalNodeComparator.java b/warehouse/query-core/src/main/java/datawave/query/jexl/nodes/LexicographicalNodeComparator.java new file mode 100644 index 00000000000..37e183c46bf --- /dev/null +++ b/warehouse/query-core/src/main/java/datawave/query/jexl/nodes/LexicographicalNodeComparator.java @@ -0,0 +1,25 @@ +package datawave.query.jexl.nodes; + +import org.apache.commons.jexl3.parser.JexlNode; + +import datawave.query.jexl.visitors.JexlStringBuildingVisitor; + +/** + * Sorts nodes according to the node string. + *
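The tie-breaking comparators are naturally chained behind a primary cost comparator; one plausible composition (an assumption about wiring, not something this diff establishes):

    import java.util.Comparator;

    import org.apache.commons.jexl3.parser.JexlNode;

    import datawave.query.jexl.nodes.DefaultJexlNodeComparator;
    import datawave.query.jexl.nodes.LexicographicalNodeComparator;

    class TieBreakSketch {
        // Cost decides first; the lexicographic comparator only breaks ties.
        static final Comparator<JexlNode> ORDERED =
                new DefaultJexlNodeComparator().thenComparing(new LexicographicalNodeComparator());
    }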

+ * Note: this comparator is intended to break ties between nodes of similar type or cost. Running this comparator in isolation will produce unexpected results. + */ +public class LexicographicalNodeComparator extends JexlNodeComparator { + + @Override + public int compare(JexlNode left, JexlNode right) { + String leftQuery = JexlStringBuildingVisitor.buildQuery(left); + String rightQuery = JexlStringBuildingVisitor.buildQuery(right); + return leftQuery.compareTo(rightQuery); + } + + @Override + public int getCostIndex(JexlNode node) { + throw new IllegalStateException("Not implemented"); + } +} diff --git a/warehouse/query-core/src/main/java/datawave/query/jexl/nodes/NodeCostComparator.java b/warehouse/query-core/src/main/java/datawave/query/jexl/nodes/NodeCostComparator.java deleted file mode 100644 index a238e5c6007..00000000000 --- a/warehouse/query-core/src/main/java/datawave/query/jexl/nodes/NodeCostComparator.java +++ /dev/null @@ -1,40 +0,0 @@ -package datawave.query.jexl.nodes; - -import java.util.Comparator; - -import org.apache.commons.jexl3.parser.JexlNode; - -import datawave.query.jexl.visitors.JexlStringBuildingVisitor; - -/** - * Compare nodes based on arbitrary cost. - *

- * EQ < ER < Functions - */ -public abstract class NodeCostComparator implements Comparator { - - @Override - public int compare(JexlNode left, JexlNode right) { - int leftCost = getCostIndex(left); - int rightCost = getCostIndex(right); - - int result = Integer.compare(leftCost, rightCost); - if (result == 0) { - // if comparing by field cost (same field) provide an opportunity to sort alphabetically - result = JexlStringBuildingVisitor.buildQuery(left).compareTo(JexlStringBuildingVisitor.buildQuery(right)); - } - - return result; - } - - // Evaluate OR nodes last, then And nodes, then nodes by node id - - /** - * Calculates a cost for the provided node - * - * @param node - * an arbitrary JexlNode - * @return the integer cost - */ - abstract int getCostIndex(JexlNode node); -} diff --git a/warehouse/query-core/src/main/java/datawave/query/jexl/nodes/TermCostComparator.java b/warehouse/query-core/src/main/java/datawave/query/jexl/nodes/TermCostComparator.java new file mode 100644 index 00000000000..ae3b62c2273 --- /dev/null +++ b/warehouse/query-core/src/main/java/datawave/query/jexl/nodes/TermCostComparator.java @@ -0,0 +1,46 @@ +package datawave.query.jexl.nodes; + +import java.util.Map; + +import org.apache.commons.jexl3.parser.JexlNode; + +import datawave.query.jexl.visitors.JexlStringBuildingVisitor; +import datawave.query.util.count.CountMap; + +/** + * Comparator that operates on term cardinality + */ +public class TermCostComparator extends AbstractNodeCostComparator { + + /** + * Constructor that accepts a {@link CountMap} + * + * @param counts + * the count map + */ + public TermCostComparator(CountMap counts) { + this(counts.getCounts()); + } + + /** + * Constructor that accepts a {@link Map} of counts + * + * @param counts + * the count map + */ + public TermCostComparator(Map counts) { + super(counts); + } + + /** + * The {@link TermCostComparator} uses the whole node string to calculate cost + * + * @param node + * a JexlNode + * @return the node key + */ + public String getNodeKey(JexlNode node) { + return JexlStringBuildingVisitor.buildQuery(node); + } + +} diff --git a/warehouse/query-core/src/main/java/datawave/query/jexl/visitors/IngestTypeVisitor.java b/warehouse/query-core/src/main/java/datawave/query/jexl/visitors/IngestTypeVisitor.java index 6dd428cb411..8b576303a01 100644 --- a/warehouse/query-core/src/main/java/datawave/query/jexl/visitors/IngestTypeVisitor.java +++ b/warehouse/query-core/src/main/java/datawave/query/jexl/visitors/IngestTypeVisitor.java @@ -391,7 +391,7 @@ private Set getFieldsForFunctionNode(ASTFunctionNode node) { if (visitor.namespace().equals(CONTENT_FUNCTION_NAMESPACE)) { // all content function fields are added ContentFunctionsDescriptor.ContentJexlArgumentDescriptor contentDescriptor = new ContentFunctionsDescriptor().getArgumentDescriptor(node); - return contentDescriptor.fieldsAndTerms(Collections.emptySet(), Collections.emptySet(), Collections.emptySet(), null)[0]; + return contentDescriptor.fieldsAndTerms(Collections.emptySet(), Collections.emptySet(), Collections.emptySet(), null).getFields(); } else { JexlArgumentDescriptor descriptor = JexlFunctionArgumentDescriptorFactory.F.getArgumentDescriptor(node); if (descriptor == null) { diff --git a/warehouse/query-core/src/main/java/datawave/query/jexl/visitors/IsNotNullPruningVisitor.java b/warehouse/query-core/src/main/java/datawave/query/jexl/visitors/IsNotNullPruningVisitor.java index 4f6a8c6fb2c..49dbba0a2c9 100644 --- 
a/warehouse/query-core/src/main/java/datawave/query/jexl/visitors/IsNotNullPruningVisitor.java +++ b/warehouse/query-core/src/main/java/datawave/query/jexl/visitors/IsNotNullPruningVisitor.java @@ -217,17 +217,25 @@ private JexlNode pruneNode(JexlNode node, Set fields) { * @return the original node, or null if it is pruned */ private JexlNode pruneUnion(JexlNode node, Set fields) { + // if there is a isNotNull in the union, and we know we have an equality node involving one of the isNotNull nodes, + // we have the means to prune the entire union. + boolean willPrune = false; + for (int i = 0; i < node.jjtGetNumChildren(); i++) { JexlNode deref = JexlASTHelper.dereference(node.jjtGetChild(i)); - if (!isIsNotNullFunction(deref)) { - return node; + if (isIsNotNullFunction(deref) && !willPrune) { + String field = fieldForNode(deref); + if (fields.contains(field)) { + willPrune = true; + } } - String field = fieldForNode(deref); - if (!fields.contains(field)) { - return node; - } } + + if (!willPrune) { + return node; + } + return null; } diff --git a/warehouse/query-core/src/main/java/datawave/query/jexl/visitors/order/OrderByCostVisitor.java b/warehouse/query-core/src/main/java/datawave/query/jexl/visitors/order/OrderByCostVisitor.java index b2821874110..1b4f0cddff3 100644 --- a/warehouse/query-core/src/main/java/datawave/query/jexl/visitors/order/OrderByCostVisitor.java +++ b/warehouse/query-core/src/main/java/datawave/query/jexl/visitors/order/OrderByCostVisitor.java @@ -10,30 +10,34 @@ import org.apache.commons.jexl3.parser.JexlNode; import org.apache.commons.jexl3.parser.JexlNodes; import org.apache.commons.jexl3.parser.ParseException; +import org.apache.commons.jexl3.parser.ParserTreeConstants; import org.apache.log4j.Logger; import datawave.query.jexl.JexlASTHelper; -import datawave.query.jexl.nodes.DefaultNodeCostComparator; -import datawave.query.jexl.nodes.FieldOrTermNodeCostComparator; -import datawave.query.jexl.nodes.NodeCostComparator; +import datawave.query.jexl.nodes.DefaultJexlNodeComparator; +import datawave.query.jexl.nodes.FieldCostComparator; +import datawave.query.jexl.nodes.JexlNodeComparator; import datawave.query.jexl.nodes.QueryPropertyMarker; +import datawave.query.jexl.nodes.TermCostComparator; import datawave.query.jexl.visitors.BaseVisitor; import datawave.query.jexl.visitors.JexlStringBuildingVisitor; /** - * Orders query nodes by cost. + * Orders query nodes by cost using one or more {@link JexlNodeComparator}s. *
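A condensed restatement of the comparator selection this visitor performs (see getComparator() further down): one count map serves both modes, a flag says whether its keys are fields or whole terms, and no counts at all falls back to implied cost.

    import java.util.Map;

    import datawave.query.jexl.nodes.DefaultJexlNodeComparator;
    import datawave.query.jexl.nodes.FieldCostComparator;
    import datawave.query.jexl.nodes.JexlNodeComparator;
    import datawave.query.jexl.nodes.TermCostComparator;

    class ComparatorSelection {
        // The Map<String, Long> value type is assumed from how the counts are used.
        static JexlNodeComparator select(Map<String, Long> countMap, boolean isFieldMap) {
            if (countMap != null) {
                return isFieldMap ? new FieldCostComparator(countMap) : new TermCostComparator(countMap);
            }
            return new DefaultJexlNodeComparator();
        }
    }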

- * Cost is calculated based on field counts, term counts, or a default cost based on the node id {@link org.apache.commons.jexl3.parser.ParserTreeConstants}. + * The {@link DefaultJexlNodeComparator} orders a query based on the implied cost of the node id (see {@link ParserTreeConstants}). In general, an EQ node is + * faster to resolve than an ER node or a marker node. *
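Entry-point sketch using the static order(String) shown in this class. The expected reordering is taken from ShapesTest later in this diff: two EQ nodes tie on implied cost, so the lexicographically smaller term ends up on the left.

    import datawave.query.jexl.visitors.order.OrderByCostVisitor;

    class OrderSketch {
        public static void main(String[] args) {
            // Returns null if the query fails to parse (see the error handling below).
            String sorted = OrderByCostVisitor.order("TYPE == 'pentagon' || SHAPE == 'triangle'");
            System.out.println(sorted); // SHAPE == 'triangle' || TYPE == 'pentagon'
        }
    }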

- * In general an EQ node is faster to resolve than an ER node. + * The {@link FieldCostComparator} orders a query based on the field cardinality. This cardinality can be gathered from the metadata table across the entire + * date range of the query, or it can be gathered from the global index and applied on a per-shard basis. *
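A worked field-count example, with cardinalities borrowed from ShapesTest later in this diff (SHAPE = 23, TYPE = 21 for the triangle/pentagon data). parseJexlQuery is the standard DataWave parse helper, and the map's Long value type is assumed:

    import java.util.Map;

    import org.apache.commons.jexl3.parser.ASTJexlScript;

    import datawave.query.jexl.JexlASTHelper;
    import datawave.query.jexl.visitors.order.OrderByCostVisitor;

    class FieldCountSketch {
        public static void main(String[] args) throws Exception {
            ASTJexlScript script = JexlASTHelper.parseJexlQuery("SHAPE == 'triangle' || TYPE == 'pentagon'");
            // TYPE (21) is rarer than SHAPE (23), so it sorts left, yielding
            // TYPE == 'pentagon' || SHAPE == 'triangle'
            script = OrderByCostVisitor.orderByFieldCount(script, Map.of("SHAPE", 23L, "TYPE", 21L));
        }
    }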

- * In general an ER node is faster to resolve than a function node. + * The {@link TermCostComparator} orders a query based on the term cardinality. This is gathered from the global index and applied on a per-shard basis. */ public class OrderByCostVisitor extends BaseVisitor { private static final Logger log = Logger.getLogger(OrderByCostVisitor.class); - private NodeCostComparator costComparator; + private JexlNodeComparator comparator; private final boolean isFieldMap; private final Map countMap; @@ -50,8 +54,7 @@ public static String order(String query) { script = order(script); return JexlStringBuildingVisitor.buildQueryWithoutParse(script); } catch (ParseException e) { - log.error("Could not order query by cost: " + query); - e.printStackTrace(); + log.error("Could not order query by cost: " + query, e); } return null; } @@ -182,7 +185,7 @@ private Object visitJunction(JexlNode node, Object data) { QueryPropertyMarker.Instance instance = QueryPropertyMarker.findInstance(node); if (!instance.isAnyType()) { JexlNode[] children = JexlNodes.getChildren(node); - Arrays.sort(children, getCostComparator()); + Arrays.sort(children, getComparator()); JexlNodes.setChildren(node, children); node.childrenAccept(this, data); @@ -190,15 +193,19 @@ private Object visitJunction(JexlNode node, Object data) { return data; } - private NodeCostComparator getCostComparator() { - if (costComparator == null) { + private JexlNodeComparator getComparator() { + if (comparator == null) { if (countMap != null) { - costComparator = new FieldOrTermNodeCostComparator(countMap, isFieldMap); + if (isFieldMap) { + comparator = new FieldCostComparator(countMap); + } else { + comparator = new TermCostComparator(countMap); + } } else { - costComparator = new DefaultNodeCostComparator(); + comparator = new DefaultJexlNodeComparator(); } } - return costComparator; + return comparator; } } diff --git a/warehouse/query-core/src/main/java/datawave/query/language/functions/jexl/NoExpansion.java b/warehouse/query-core/src/main/java/datawave/query/language/functions/jexl/NoExpansion.java index d601d49b8c9..3a06f4ac411 100644 --- a/warehouse/query-core/src/main/java/datawave/query/language/functions/jexl/NoExpansion.java +++ b/warehouse/query-core/src/main/java/datawave/query/language/functions/jexl/NoExpansion.java @@ -4,26 +4,27 @@ import java.util.ArrayList; import java.util.List; +import datawave.query.jexl.functions.QueryFunctions; import datawave.query.language.functions.QueryFunction; import datawave.webservice.query.exception.BadRequestQueryException; import datawave.webservice.query.exception.DatawaveErrorCode; /** * This function accepts a comma separated list of fields to be excluded from QueryModel expansion. The purpose is to provide users with an easy way to avoid - * undesirable query model expansions. - * - * Note: The exclude is only applied to the fields in the original query. An original field can be expanded into an excluded field. + * undesirable query model expansions.
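The rewritten toString() below uses a separator-swap idiom so the first parameter gets '(' and later ones get ','. In isolation (field names hypothetical, and the real method also escapes each parameter):

    import java.util.List;

    class SeparatorSwapSketch {
        public static void main(String[] args) {
            StringBuilder sb = new StringBuilder("f:noExpansion");
            char separator = '(';
            for (String param : List.of("FIELD_A", "FIELD_B")) {
                sb.append(separator).append(param);
                separator = ','; // '(' only precedes the first parameter
            }
            sb.append(')');
            System.out.println(sb); // f:noExpansion(FIELD_A,FIELD_B)
        }
    }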
+ * Note: The exclusion is only applied to the fields in the original query. An original field can be expanded into an excluded field. */ public class NoExpansion extends JexlQueryFunction { public NoExpansion() { - super("noExpansion", new ArrayList<>()); + super(QueryFunctions.NO_EXPANSION, new ArrayList<>()); } @Override public void validate() throws IllegalArgumentException { - if (this.parameterList.size() != 1) { - BadRequestQueryException qe = new BadRequestQueryException(DatawaveErrorCode.INVALID_FUNCTION_ARGUMENTS, MessageFormat.format("{0}", this.name)); + if (this.parameterList.isEmpty()) { + BadRequestQueryException qe = new BadRequestQueryException(DatawaveErrorCode.INVALID_FUNCTION_ARGUMENTS, + MessageFormat.format("{0} requires at least one argument", this.name)); throw new IllegalArgumentException(qe); } } @@ -35,7 +36,19 @@ public QueryFunction duplicate() { @Override public String toString() { - List params = getParameterList(); - return "f:noExpansion(" + String.join("", params) + ")"; + StringBuilder sb = new StringBuilder(); + + sb.append(QueryFunctions.QUERY_FUNCTION_NAMESPACE).append(':').append(QueryFunctions.NO_EXPANSION); + if (parameterList.isEmpty()) { + sb.append("()"); + } else { + char separator = '('; + for (String param : parameterList) { + sb.append(separator).append(escapeString(param)); + separator = ','; + } + sb.append(')'); + } + return sb.toString(); } } diff --git a/warehouse/query-core/src/main/java/datawave/query/planner/DefaultQueryPlanner.java b/warehouse/query-core/src/main/java/datawave/query/planner/DefaultQueryPlanner.java index 6df09d7646c..72d8a852e3d 100644 --- a/warehouse/query-core/src/main/java/datawave/query/planner/DefaultQueryPlanner.java +++ b/warehouse/query-core/src/main/java/datawave/query/planner/DefaultQueryPlanner.java @@ -2767,8 +2767,10 @@ public Tuple2,Boolean> getQueryRanges(ScannerFactor } } - if (config.isSortQueryBeforeGlobalIndex()) { + if (config.isSortQueryPreIndexWithFieldCounts()) { config.setQueryTree(timedSortQueryBeforeGlobalIndex(config, getMetadataHelper())); + } else if (config.isSortQueryPreIndexWithImpliedCounts()) { + config.setQueryTree(timedSortQueryBeforeGlobalIndex(config)); } // if a simple examination of the query has not forced a full table @@ -2863,12 +2865,22 @@ protected ASTJexlScript timedSortQueryBeforeGlobalIndex(ShardQueryConfiguration Map counts = metadataHelper.getCountsForFieldsInDateRange(fields, datatypes, config.getBeginDate(), config.getEndDate()); if (!counts.isEmpty()) { return OrderByCostVisitor.orderByFieldCount(config.getQueryTree(), counts); + } else { + // fall back to sorting by implied cardinality + return OrderByCostVisitor.order(config.getQueryTree()); } } return config.getQueryTree(); }); } + protected ASTJexlScript timedSortQueryBeforeGlobalIndex(ShardQueryConfiguration config) throws DatawaveQueryException { + return visitorManager.timedVisit(config.getTimers(), "SortQueryBeforeGlobalIndex", () -> { + // sort by implied cardinality + return OrderByCostVisitor.order(config.getQueryTree()); + }); + } + private TypeMetadata getTypeMetadata() { try { return metadataHelper.getTypeMetadata(); diff --git a/warehouse/query-core/src/main/java/datawave/query/postprocessing/tf/DocumentKeysFunction.java b/warehouse/query-core/src/main/java/datawave/query/postprocessing/tf/DocumentKeysFunction.java index 18bfd3bc275..ab5150b526c 100644 --- a/warehouse/query-core/src/main/java/datawave/query/postprocessing/tf/DocumentKeysFunction.java +++ 
b/warehouse/query-core/src/main/java/datawave/query/postprocessing/tf/DocumentKeysFunction.java @@ -48,7 +48,7 @@ protected void populateContentFunctions(JexlNode node) { ContentFunctionsDescriptor descriptor = new ContentFunctionsDescriptor(); ContentJexlArgumentDescriptor argsDescriptor; - Set[] fieldsAndTerms; + ContentFunctionsDescriptor.FieldTerms fieldsAndTerms; JexlNode parent; String field; @@ -67,12 +67,12 @@ protected void populateContentFunctions(JexlNode node) { // content, tf, and indexed fields are not actually needed to extract fields from the function node fieldsAndTerms = argsDescriptor.fieldsAndTerms(Collections.emptySet(), Collections.emptySet(), Collections.emptySet(), null); - if (fieldsAndTerms[0].size() != 1) { + if (fieldsAndTerms.totalFields() != 1) { throw new IllegalStateException("content function had more than one field"); } - field = JexlASTHelper.deconstructIdentifier(fieldsAndTerms[0].iterator().next()); - ContentFunction contentFunction = new ContentFunction(field, fieldsAndTerms[1]); + field = JexlASTHelper.deconstructIdentifier(fieldsAndTerms.getFields().iterator().next()); + ContentFunction contentFunction = new ContentFunction(field, fieldsAndTerms.getTerms()); contentFunctions.put(contentFunction.getField(), contentFunction); if (isFunctionNegated(f)) { diff --git a/warehouse/query-core/src/main/java/datawave/query/scheduler/PushdownFunction.java b/warehouse/query-core/src/main/java/datawave/query/scheduler/PushdownFunction.java index 9f168e63e32..ed032b43bdb 100644 --- a/warehouse/query-core/src/main/java/datawave/query/scheduler/PushdownFunction.java +++ b/warehouse/query-core/src/main/java/datawave/query/scheduler/PushdownFunction.java @@ -120,6 +120,10 @@ public List apply(QueryData qd) { options.setQueryConfig(this.config); + String tableName = tableId.canonical(); + options.applyExecutionHints(tableName, config.getTableHints()); + options.applyConsistencyLevel(tableName, config.getTableConsistencyLevels()); + chunks.add(new ScannerChunk(options, plan.getRanges(), qd, server)); } catch (Exception e) { log.error(e); diff --git a/warehouse/query-core/src/main/java/datawave/query/tables/RangeStreamScanner.java b/warehouse/query-core/src/main/java/datawave/query/tables/RangeStreamScanner.java index 94a332e9772..1f3623a3ae1 100644 --- a/warehouse/query-core/src/main/java/datawave/query/tables/RangeStreamScanner.java +++ b/warehouse/query-core/src/main/java/datawave/query/tables/RangeStreamScanner.java @@ -28,7 +28,7 @@ import org.apache.accumulo.core.data.Range; import org.apache.accumulo.core.data.Value; import org.apache.accumulo.core.security.Authorizations; -import org.apache.accumulo.core.util.PeekingIterator; +import org.apache.commons.collections4.iterators.PeekingIterator; import org.apache.commons.lang3.StringUtils; import org.apache.hadoop.io.Text; import org.apache.log4j.Logger; diff --git a/warehouse/query-core/src/main/java/datawave/query/tables/ScannerFactory.java b/warehouse/query-core/src/main/java/datawave/query/tables/ScannerFactory.java index 5366418ec48..415d6548f73 100644 --- a/warehouse/query-core/src/main/java/datawave/query/tables/ScannerFactory.java +++ b/warehouse/query-core/src/main/java/datawave/query/tables/ScannerFactory.java @@ -50,8 +50,11 @@ public class ScannerFactory { protected ResourceQueue scanQueue = null; protected ShardQueryConfiguration config = null; - protected Map consistencyByTable = new HashMap<>(); - protected Map> hintsByTable = new HashMap<>(); + // consistency and execution hints can be mapped to table 
names or functional names + // for example, 'shardIndex' might map to a default executor pool for the shard index table + // while 'expansion' might map to a separate executor pool on the shard index + protected Map consistencyLevelMap = new HashMap<>(); + protected Map> executionHintMap = new HashMap<>(); private static final Logger log = Logger.getLogger(ScannerFactory.class); @@ -94,7 +97,7 @@ public ScannerFactory(AccumuloClient client, int queueSize) { } /** - * Method that allows a ScannerFactory to be updated by a config after initialization + * Method that allows a ScannerFactory to use scan execution and consistency hints from the provided {@link GenericQueryConfiguration}. * * @param genericConfig * a {@link GenericQueryConfiguration} @@ -105,12 +108,12 @@ public void updateConfigs(GenericQueryConfiguration genericConfig) { Map consistencyLevels = genericConfig.getTableConsistencyLevels(); if (consistencyLevels != null && !consistencyLevels.isEmpty()) { - this.consistencyByTable = genericConfig.getTableConsistencyLevels(); + this.consistencyLevelMap = genericConfig.getTableConsistencyLevels(); } Map> hints = genericConfig.getTableHints(); if (hints != null && !hints.isEmpty()) { - this.hintsByTable = genericConfig.getTableHints(); + this.executionHintMap = genericConfig.getTableHints(); } int numThreads = DEFAULT_MAX_THREADS; @@ -139,6 +142,7 @@ public void updateConfigs(GenericQueryConfiguration genericConfig) { public Scanner newSingleScanner(String tableName, Set auths, Query query) throws TableNotFoundException { if (open.get()) { Scanner bs = QueryScannerHelper.createScannerWithoutInfo(client, tableName, auths, query); + applyConfigs(bs, tableName); log.debug("Created scanner " + System.identityHashCode(bs)); @@ -160,10 +164,47 @@ public Scanner newSingleScanner(String tableName, Set auths, Que } } + /** + * Create a new {@link BatchScanner} using the table name as the execution hint + * + * @param tableName + * the table name + * @param auths + * the set of authorizations + * @param threads + * the number of threads + * @param query + * the Query + * @return a BatchScanner + * @throws TableNotFoundException + * if no table exists + */ public BatchScanner newScanner(String tableName, Set auths, int threads, Query query) throws TableNotFoundException { + return newScanner(tableName, auths, threads, query, tableName); + } + + /** + * Creates a new {@link BatchScanner} with execution hints + * + * @param tableName + * the table name + * @param auths + * the set of authorizations + * @param threads + * the number of threads to use + * @param query + * the Query + * @param hintKey + * the key used to select an execution hint + * @return a BatchScanner + * @throws TableNotFoundException + * if no table exists + */ + public BatchScanner newScanner(String tableName, Set auths, int threads, Query query, String hintKey) throws TableNotFoundException { if (open.get()) { BatchScanner bs = QueryScannerHelper.createBatchScanner(client, tableName, auths, threads, query); - applyConfigs(bs, tableName); + + applyConfigs(bs, hintKey, tableName); log.debug("Created scanner " + System.identityHashCode(bs)); if (log.isTraceEnabled()) { @@ -186,6 +227,7 @@ public BatchScanner newScanner(String tableName, Set auths, int public BatchScanner newScanner(String tableName, Set auths, int threads, Query query, boolean reportErrors) throws TableNotFoundException { if (open.get()) { BatchScanner bs = QueryScannerHelper.createBatchScanner(client, tableName, auths, threads, query, reportErrors); + 
applyConfigs(bs, tableName); log.debug("Created scanner " + System.identityHashCode(bs)); @@ -229,7 +271,28 @@ public BatchScanner newScanner(String tableName, Query query) throws TableNotFou * if there are issues */ public BatchScannerSession newQueryScanner(final String tableName, final Set auths, Query settings) throws Exception { - return newLimitedScanner(BatchScannerSession.class, tableName, auths, settings).setThreads(scanQueue.getCapacity()); + return newQueryScanner(tableName, auths, settings, tableName); + } + + /** + * Builds a new scanner session using a finalized table name and set of authorizations using the previously defined queue. Note that the number of entries + * is hardcoded, below, to 1000, but can be changed + * + * @param tableName + * the table string + * @param auths + * a set of auths + * @param settings + * query settings + * @param executionHintKey + * a key used to select a scan execution hint + * @return a new scanner session + * @throws Exception + * if there are issues + */ + public BatchScannerSession newQueryScanner(final String tableName, final Set auths, Query settings, String executionHintKey) + throws Exception { + return newLimitedScanner(BatchScannerSession.class, tableName, auths, settings, executionHintKey).setThreads(scanQueue.getCapacity()); } /** @@ -248,13 +311,49 @@ public BatchScannerSession newQueryScanner(final String tableName, final Set T newLimitedScanner(Class wrapper, final String tableName, final Set auths, final Query settings) throws NoSuchMethodException, InvocationTargetException, InstantiationException, IllegalAccessException { + return newLimitedScanner(wrapper, tableName, auths, settings, tableName); + } + + /** + * Builds a new scanner session using a finalized table name and set of authorizations using the previously defined queue. Note that the number of entries + * is hardcoded, below, to 1000, but can be changed + * + * @param tableName + * the table string + * @param auths + * a set of auths + * @param settings + * query settings + * @param hintKey + * the key used to select an execution hint + * @param + * type of the wrapper + * @param wrapper + * a wrapper class + * @return a new scanner session + * @throws NoSuchMethodException + * in the case of no such method + * @throws InvocationTargetException + * in the case of no invocation target + * @throws InstantiationException + * in the case something fails to instantiate + * @throws IllegalAccessException + * in the case of an illegal access + * + */ + public T newLimitedScanner(Class wrapper, final String tableName, final Set auths, final Query settings, + String hintKey) throws NoSuchMethodException, InvocationTargetException, InstantiationException, IllegalAccessException { Preconditions.checkNotNull(scanQueue); Preconditions.checkNotNull(wrapper); Preconditions.checkArgument(open.get(), "Factory has been locked. 
No New scanners can be created"); @@ -274,7 +373,7 @@ public T newLimitedScanner(Class wrapper, final St .newInstance(new ScannerSession(tableName, auths, scanQueue, maxQueue, settings).applyStats(stats)); } - applyConfigs(session, tableName); + applyConfigs(session, hintKey, tableName); log.debug("Created session " + System.identityHashCode(session)); if (log.isTraceEnabled()) { @@ -416,40 +515,67 @@ public ScannerBase newRfileScanner(String tableName, Set auths, } /** - * Apply table-specific scanner configs to the provided scanner base object + * Apply table-specific scanner configs to the provided scanner base object using the table name as the key * * @param scannerBase * a {@link ScannerBase} * @param tableName - * the table + * the secondary hint key */ - protected void applyConfigs(ScannerBase scannerBase, String tableName) { - if (consistencyByTable != null && consistencyByTable.containsKey(tableName)) { - scannerBase.setConsistencyLevel(consistencyByTable.get(tableName)); + public void applyConfigs(ScannerBase scannerBase, String tableName) { + applyConfigs(scannerBase, tableName, tableName); + } + + /** + * Apply table-specific scanner configs to the provided scanner base object using the provided hint key, falling back to the table name if necessary + * + * @param scannerBase + * a {@link ScannerBase} + * @param hintKey + * the primary hint key + * @param tableName + * the secondary hint key + */ + public void applyConfigs(ScannerBase scannerBase, String hintKey, String tableName) { + + if (consistencyLevelMap != null && !consistencyLevelMap.isEmpty()) { + String key = consistencyLevelMap.containsKey(hintKey) ? hintKey : tableName; + scannerBase.setConsistencyLevel(consistencyLevelMap.get(key)); } - if (hintsByTable != null && hintsByTable.containsKey(tableName)) { - scannerBase.setExecutionHints(hintsByTable.get(tableName)); + if (executionHintMap != null && !executionHintMap.isEmpty()) { + String key = executionHintMap.containsKey(hintKey) ? hintKey : tableName; + scannerBase.setExecutionHints(executionHintMap.get(key)); } } /** - * Apply table-specific scanner configs to the provided scanner session + * Apply table-specific scanner configs to the provided scanner session using the provided hint key, falling back to the table name if necessary * * @param scannerSession * the {@link ScannerSession} + * @param hintKey + * the primary hint key * @param tableName - * the table + * used as a secondary hint key */ - protected void applyConfigs(ScannerSession scannerSession, String tableName) { + protected void applyConfigs(ScannerSession scannerSession, String hintKey, String tableName) { SessionOptions options = scannerSession.getOptions(); - if (consistencyByTable != null && consistencyByTable.containsKey(tableName)) { - options.setConsistencyLevel(consistencyByTable.get(tableName)); + if (consistencyLevelMap != null && !consistencyLevelMap.isEmpty()) { + String key = consistencyLevelMap.containsKey(hintKey) ? hintKey : tableName; + + if (consistencyLevelMap.containsKey(key)) { + options.setConsistencyLevel(consistencyLevelMap.get(key)); + } } - if (hintsByTable != null && hintsByTable.containsKey(tableName)) { - options.setExecutionHints(hintsByTable.get(tableName)); + if (executionHintMap != null && !executionHintMap.isEmpty()) { + String key = executionHintMap.containsKey(hintKey) ? 
hintKey : tableName; + + if (executionHintMap.containsKey(key)) { + options.setExecutionHints(executionHintMap.get(key)); + } } scannerSession.setOptions(options); diff --git a/warehouse/query-core/src/main/java/datawave/query/tables/SessionOptions.java b/warehouse/query-core/src/main/java/datawave/query/tables/SessionOptions.java index 4303e13f5ff..9f869eeecd2 100644 --- a/warehouse/query-core/src/main/java/datawave/query/tables/SessionOptions.java +++ b/warehouse/query-core/src/main/java/datawave/query/tables/SessionOptions.java @@ -1,6 +1,7 @@ package datawave.query.tables; import java.util.Collection; +import java.util.Map; import org.apache.accumulo.core.client.IteratorSetting; import org.apache.accumulo.core.clientImpl.ScannerOptions; @@ -46,4 +47,24 @@ public Collection getIterators() { } return settings; } + + public void applyExecutionHints(Map scanHints) { + setExecutionHints(scanHints); + } + + public void applyExecutionHints(String tableName, Map> tableScanHints) { + if (tableScanHints.containsKey(tableName)) { + setExecutionHints(tableScanHints.get(tableName)); + } + } + + public void applyConsistencyLevel(ConsistencyLevel consistencyLevel) { + setConsistencyLevel(consistencyLevel); + } + + public void applyConsistencyLevel(String tableName, Map consistencyLevels) { + if (consistencyLevels.containsKey(tableName)) { + setConsistencyLevel(consistencyLevels.get(tableName)); + } + } } diff --git a/warehouse/query-core/src/main/java/datawave/query/tables/ShardQueryLogic.java b/warehouse/query-core/src/main/java/datawave/query/tables/ShardQueryLogic.java index 5b7ab11f61d..3a113c75a66 100644 --- a/warehouse/query-core/src/main/java/datawave/query/tables/ShardQueryLogic.java +++ b/warehouse/query-core/src/main/java/datawave/query/tables/ShardQueryLogic.java @@ -2932,52 +2932,52 @@ public void setPruneQueryOptions(boolean pruneQueryOptions) { getConfig().setPruneQueryOptions(pruneQueryOptions); } - public boolean getUseFieldCounts() { - return getConfig().getUseFieldCounts(); + public boolean isRebuildDatatypeFilter() { + return getConfig().isRebuildDatatypeFilter(); } - public void setUseFieldCounts(boolean useFieldCounts) { - getConfig().setUseFieldCounts(useFieldCounts); + public void setRebuildDatatypeFilter(boolean rebuildDatatypeFilter) { + getConfig().setRebuildDatatypeFilter(rebuildDatatypeFilter); } - public boolean getUseTermCounts() { - return getConfig().getUseTermCounts(); + public boolean isRebuildDatatypeFilterPerShard() { + return getConfig().isRebuildDatatypeFilterPerShard(); } - public void setUseTermCounts(boolean useTermCounts) { - getConfig().setUseTermCounts(useTermCounts); + public void setRebuildDatatypeFilterPerShard(boolean rebuildDatatypeFilterPerShard) { + getConfig().setRebuildDatatypeFilterPerShard(rebuildDatatypeFilterPerShard); } - public boolean getSortQueryBeforeGlobalIndex() { - return getConfig().isSortQueryBeforeGlobalIndex(); + public boolean isSortQueryPreIndexWithImpliedCounts() { + return getConfig().isSortQueryPreIndexWithImpliedCounts(); } - public void setSortQueryBeforeGlobalIndex(boolean sortQueryBeforeGlobalIndex) { - getConfig().setSortQueryBeforeGlobalIndex(sortQueryBeforeGlobalIndex); + public void setSortQueryPreIndexWithImpliedCounts(boolean sortQueryPreIndexWithImpliedCounts) { + getConfig().setSortQueryPreIndexWithImpliedCounts(sortQueryPreIndexWithImpliedCounts); } - public boolean getSortQueryByCounts() { - return getConfig().isSortQueryByCounts(); + public boolean isSortQueryPreIndexWithFieldCounts() { + return 
getConfig().isSortQueryPreIndexWithFieldCounts(); } - public void setSortQueryByCounts(boolean sortQueryByCounts) { - getConfig().setSortQueryByCounts(sortQueryByCounts); + public void setSortQueryPreIndexWithFieldCounts(boolean sortQueryPreIndexWithFieldCounts) { + getConfig().setSortQueryPreIndexWithImpliedCounts(sortQueryPreIndexWithFieldCounts); } - public boolean isRebuildDatatypeFilter() { - return getConfig().isRebuildDatatypeFilter(); + public boolean isSortQueryPostIndexWithFieldCounts() { + return getConfig().isSortQueryPostIndexWithFieldCounts(); } - public void setRebuildDatatypeFilter(boolean rebuildDatatypeFilter) { - getConfig().setRebuildDatatypeFilter(rebuildDatatypeFilter); + public void setSortQueryPostIndexWithFieldCounts(boolean sortQueryPostIndexWithFieldCounts) { + getConfig().setSortQueryPostIndexWithFieldCounts(sortQueryPostIndexWithFieldCounts); } - public boolean isRebuildDatatypeFilterPerShard() { - return getConfig().isRebuildDatatypeFilterPerShard(); + public boolean isSortQueryPostIndexWithTermCounts() { + return getConfig().isSortQueryPostIndexWithTermCounts(); } - public void setRebuildDatatypeFilterPerShard(boolean rebuildDatatypeFilterPerShard) { - getConfig().setRebuildDatatypeFilterPerShard(rebuildDatatypeFilterPerShard); + public void setSortQueryPostIndexWithTermCounts(boolean sortQueryPostIndexWithTermCounts) { + getConfig().setSortQueryPostIndexWithTermCounts(sortQueryPostIndexWithTermCounts); } public boolean isUseQueryTreeScanHintRules() { diff --git a/warehouse/query-core/src/test/java/datawave/core/iterators/BoundedRangeExpansionIteratorTest.java b/warehouse/query-core/src/test/java/datawave/core/iterators/BoundedRangeExpansionIteratorTest.java new file mode 100644 index 00000000000..09cbd010988 --- /dev/null +++ b/warehouse/query-core/src/test/java/datawave/core/iterators/BoundedRangeExpansionIteratorTest.java @@ -0,0 +1,332 @@ +package datawave.core.iterators; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertTrue; +import static org.junit.jupiter.api.Assertions.fail; + +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Map; +import java.util.Set; +import java.util.SortedMap; +import java.util.TreeMap; + +import org.apache.accumulo.core.data.Key; +import org.apache.accumulo.core.data.Range; +import org.apache.accumulo.core.data.Value; +import org.apache.accumulo.core.iteratorsImpl.system.SortedMapIterator; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import com.google.common.base.Joiner; + +public class BoundedRangeExpansionIteratorTest { + + private static final Value emptyValue = new Value(); + + private String startDate; + private String endDate; + + private String lower; + private String upper; + + private Set datatypes = new HashSet<>(); + private Set expected = new HashSet<>(); + + @BeforeEach + public void beforeEach() { + startDate = null; + endDate = null; + lower = null; + upper = null; + datatypes.clear(); + expected.clear(); + } + + @Test + public void testSingleDay_SingleValue_EmptyDatatypeFilter() { + withBoundedRange("value-1", "value-1"); + withDateRange("20240501", "20240501"); + withExpected(Set.of("value-1")); + drive(); + } + + @Test + public void testSingleDay_SingleValue_CorrectDatatypeFilter() { + withBoundedRange("value-1", "value-1"); + 
withDateRange("20240501", "20240501"); + withDatatypes(Set.of("datatype-a")); + withExpected(Set.of("value-1")); + drive(); + } + + @Test + public void testSingleDay_SingleValue_IncorrectDatatypeFilter() { + withBoundedRange("value-1", "value-1"); + withDateRange("20240501", "20240501"); + withDatatypes(Set.of("datatype-z")); + drive(); + } + + @Test + public void testSingleDay_MultiValue_EmptyDatatypeFilter() { + withBoundedRange("value-1", "value-2"); + withDateRange("20240501", "20240501"); + withExpected(Set.of("value-1", "value-2")); + drive(); + } + + @Test + public void testSingleDay_MultiValue_CorrectDatatypeFilter() { + withBoundedRange("value-1", "value-2"); + withDateRange("20240501", "20240501"); + withDatatypes(Set.of("datatype-a")); + withExpected(Set.of("value-1", "value-2")); + drive(); + } + + @Test + public void testSingleDay_MultiValue_IncorrectDatatypeFilter() { + withBoundedRange("value-1", "value-2"); + withDateRange("20240501", "20240501"); + withDatatypes(Set.of("datatype-z")); + drive(); + } + + @Test + public void testSingleDay_AllValues_EmptyDatatypeFilter() { + withBoundedRange("value-1", "value-3"); + withDateRange("20240501", "20240501"); + withExpected(Set.of("value-1", "value-2", "value-3")); + drive(); + } + + @Test + public void testSingleDay_AllValues_CorrectDatatypeFilter() { + withBoundedRange("value-1", "value-3"); + withDateRange("20240501", "20240501"); + withDatatypes(Set.of("datatype-a")); + withExpected(Set.of("value-1", "value-2")); + // value-3 does not contain datatype-a + drive(); + } + + @Test + public void testSingleDay_AllValues_IncorrectDatatypeFilter() { + withBoundedRange("value-1", "value-3"); + withDateRange("20240501", "20240501"); + withDatatypes(Set.of("datatype-z")); + drive(); + } + + // + + @Test + public void testAllDays_SingleValue_EmptyDatatypeFilter() { + withBoundedRange("value-1", "value-1"); + withDateRange("20240501", "20240505"); + withExpected(Set.of("value-1")); + drive(); + } + + @Test + public void testMultiDay_SingleValue_CorrectDatatypeFilter() { + withBoundedRange("value-1", "value-1"); + withDateRange("20240501", "20240505"); + withDatatypes(Set.of("datatype-a")); + withExpected(Set.of("value-1")); + drive(); + } + + @Test + public void testMultiDay_SingleValue_IncorrectDatatypeFilter() { + withBoundedRange("value-1", "value-1"); + withDateRange("20240501", "20240505"); + withDatatypes(Set.of("datatype-z")); + drive(); + } + + @Test + public void testMultiDay_MultiValue_EmptyDatatypeFilter() { + withBoundedRange("value-1", "value-2"); + withDateRange("20240501", "20240505"); + withExpected(Set.of("value-1", "value-2")); + drive(); + } + + @Test + public void testMultiDay_MultiValue_CorrectDatatypeFilter() { + withBoundedRange("value-1", "value-2"); + withDateRange("20240501", "20240505"); + withDatatypes(Set.of("datatype-a")); + withExpected(Set.of("value-1", "value-2")); + drive(); + } + + @Test + public void testMultiDay_MultiValue_IncorrectDatatypeFilter() { + withBoundedRange("value-1", "value-2"); + withDateRange("20240501", "20240505"); + withDatatypes(Set.of("datatype-z")); + drive(); + } + + @Test + public void testMultiDay_AllValues_EmptyDatatypeFilter() { + withBoundedRange("value-1", "value-3"); + withDateRange("20240501", "20240505"); + withExpected(Set.of("value-1", "value-2", "value-3")); + drive(); + } + + @Test + public void testMultiDay_AllValues_CorrectDatatypeFilter() { + withBoundedRange("value-1", "value-3"); + withDateRange("20240501", "20240505"); + withDatatypes(Set.of("datatype-a")); + 
withExpected(Set.of("value-1", "value-2")); + // value-3 does not contain datatype-a + drive(); + } + + @Test + public void testMultiDay_AllValues_IncorrectDatatypeFilter() { + withBoundedRange("value-1", "value-3"); + withDateRange("20240501", "20240505"); + withDatatypes(Set.of("datatype-z")); + drive(); + } + + private void drive() { + assertNotNull(lower, "lower bound must be specified"); + assertNotNull(upper, "upper bound must be specified"); + assertNotNull(startDate, "start date must be specified"); + assertNotNull(endDate, "end date must be specified"); + + Map options = new HashMap<>(); + options.put(BoundedRangeExpansionIterator.START_DATE, startDate); + options.put(BoundedRangeExpansionIterator.END_DATE, endDate); + if (!datatypes.isEmpty()) { + options.put(BoundedRangeExpansionIterator.DATATYPES_OPT, Joiner.on(',').join(datatypes)); + } + + SortedMapIterator data = createData(); + BoundedRangeExpansionIterator iter = new BoundedRangeExpansionIterator(); + + Range range = new Range(lower, true, upper, true); + + try { + iter.init(data, options, null); + iter.seek(range, Collections.emptySet(), true); + + Set results = new HashSet<>(); + while (iter.hasTop()) { + Key k = iter.getTopKey(); + boolean first = results.add(k.getRow().toString()); + assertTrue(first, "Iterator returned the same row twice"); + iter.next(); + } + + assertEquals(expected, results); + + } catch (Exception e) { + fail("Failed to execute test", e); + } + } + + @Test + public void testTeardownRebuild() { + withDateRange("20240501", "20240505"); + + Map options = new HashMap<>(); + options.put(BoundedRangeExpansionIterator.START_DATE, startDate); + options.put(BoundedRangeExpansionIterator.END_DATE, endDate); + if (!datatypes.isEmpty()) { + options.put(BoundedRangeExpansionIterator.DATATYPES_OPT, Joiner.on(',').join(datatypes)); + } + + SortedMapIterator data = createData(); + BoundedRangeExpansionIterator iter = new BoundedRangeExpansionIterator(); + + Range range = new Range("value-2", false, "value-3", true); + + try { + iter.init(data, options, null); + iter.seek(range, Collections.emptySet(), true); + + assertTrue(iter.hasTop()); + Key k = iter.getTopKey(); + + assertEquals("value-3", k.getRow().toString()); + } catch (Exception e) { + fail("Failed to execute test", e); + } + } + + private void withBoundedRange(String lower, String upper) { + assertNotNull(lower); + assertNotNull(upper); + this.lower = lower; + this.upper = upper; + } + + private void withDateRange(String startDate, String endDate) { + assertNotNull(startDate); + assertNotNull(endDate); + this.startDate = startDate; + this.endDate = endDate; + } + + private void withDatatypes(Set datatypes) { + assertFalse(datatypes.isEmpty()); + this.datatypes = datatypes; + } + + private void withExpected(Set expectedRows) { + assertFalse(expectedRows.isEmpty()); + this.expected = expectedRows; + } + + /** + * Simulate fetching the column family by only having one field + * + * @return the data + */ + private SortedMapIterator createData() { + SortedMap data = new TreeMap<>(); + data.put(new Key("value-1", "FIELD_A", "20240501_0\u0000datatype-a"), emptyValue); + data.put(new Key("value-1", "FIELD_A", "20240501_1\u0000datatype-a"), emptyValue); + data.put(new Key("value-1", "FIELD_A", "20240501_2\u0000datatype-a"), emptyValue); + data.put(new Key("value-1", "FIELD_A", "20240501_3\u0000datatype-a"), emptyValue); + data.put(new Key("value-1", "FIELD_A", "20240502_0\u0000datatype-a"), emptyValue); + data.put(new Key("value-1", "FIELD_A", 
"20240503_0\u0000datatype-a"), emptyValue); + data.put(new Key("value-1", "FIELD_A", "20240504_0\u0000datatype-a"), emptyValue); + data.put(new Key("value-1", "FIELD_A", "20240505_0\u0000datatype-a"), emptyValue); + + data.put(new Key("value-2", "FIELD_A", "20240501_0\u0000datatype-a"), emptyValue); + data.put(new Key("value-2", "FIELD_A", "20240501_0\u0000datatype-b"), emptyValue); + data.put(new Key("value-2", "FIELD_A", "20240502_0\u0000datatype-a"), emptyValue); + data.put(new Key("value-2", "FIELD_A", "20240502_0\u0000datatype-b"), emptyValue); + data.put(new Key("value-2", "FIELD_A", "20240503_0\u0000datatype-a"), emptyValue); + data.put(new Key("value-2", "FIELD_A", "20240503_0\u0000datatype-b"), emptyValue); + data.put(new Key("value-2", "FIELD_A", "20240504_0\u0000datatype-a"), emptyValue); + data.put(new Key("value-2", "FIELD_A", "20240504_0\u0000datatype-b"), emptyValue); + data.put(new Key("value-2", "FIELD_A", "20240505_0\u0000datatype-a"), emptyValue); + data.put(new Key("value-2", "FIELD_A", "20240505_0\u0000datatype-b"), emptyValue); + + data.put(new Key("value-3", "FIELD_A", "20240501_0\u0000datatype-b"), emptyValue); + data.put(new Key("value-3", "FIELD_A", "20240501_1\u0000datatype-b"), emptyValue); + data.put(new Key("value-3", "FIELD_A", "20240502_0\u0000datatype-b"), emptyValue); + data.put(new Key("value-3", "FIELD_A", "20240502_1\u0000datatype-b"), emptyValue); + data.put(new Key("value-3", "FIELD_A", "20240503_0\u0000datatype-b"), emptyValue); + data.put(new Key("value-3", "FIELD_A", "20240503_1\u0000datatype-b"), emptyValue); + data.put(new Key("value-3", "FIELD_A", "20240504_0\u0000datatype-b"), emptyValue); + data.put(new Key("value-3", "FIELD_A", "20240504_1\u0000datatype-b"), emptyValue); + data.put(new Key("value-3", "FIELD_A", "20240505_0\u0000datatype-b"), emptyValue); + data.put(new Key("value-3", "FIELD_A", "20240505_1\u0000datatype-b"), emptyValue); + return new SortedMapIterator(data); + } +} diff --git a/warehouse/query-core/src/test/java/datawave/query/ShapesTest.java b/warehouse/query-core/src/test/java/datawave/query/ShapesTest.java index 8149117800d..a44295201ef 100644 --- a/warehouse/query-core/src/test/java/datawave/query/ShapesTest.java +++ b/warehouse/query-core/src/test/java/datawave/query/ShapesTest.java @@ -870,7 +870,26 @@ public void testPermutations() throws Exception { } @Test - public void testSortQueryBeforeGlobalIndex() throws Exception { + public void testSortQueryPreIndexWithImpliedCounts() throws Exception { + try { + // sorting via implied counts should push TYPE to the right of SHAPE + withQuery("TYPE == 'pentagon' || SHAPE == 'triangle'"); + withParameter(QueryParameters.DATATYPE_FILTER_SET, "triangle,pentagon"); + + Set expectedUids = new HashSet<>(triangleUids); + withExpected(expectedUids); + + disableAllSortOptions(); + logic.setSortQueryPreIndexWithImpliedCounts(true); + planAndExecuteQuery(); + assertPlannedQuery("SHAPE == 'triangle' || TYPE == 'pentagon'"); + } finally { + disableAllSortOptions(); + } + } + + @Test + public void testSortQueryPreIndexWithFieldCounts() throws Exception { try { // SHAPE cardinality for triangle and pentagon types is 23 // TYPE cardinality for triangle and pentagon types is 21 @@ -880,12 +899,20 @@ public void testSortQueryBeforeGlobalIndex() throws Exception { Set expectedUids = new HashSet<>(triangleUids); withExpected(expectedUids); - logic.setSortQueryBeforeGlobalIndex(true); + disableAllSortOptions(); + logic.setSortQueryPreIndexWithFieldCounts(true); planAndExecuteQuery(); 
assertPlannedQuery("TYPE == 'pentagon' || SHAPE == 'triangle'"); } finally { - logic.setSortQueryBeforeGlobalIndex(false); + disableAllSortOptions(); } } + private void disableAllSortOptions() { + logic.setSortQueryPreIndexWithImpliedCounts(false); + logic.setSortQueryPreIndexWithFieldCounts(false); + logic.setSortQueryPostIndexWithFieldCounts(false); + logic.setSortQueryPostIndexWithTermCounts(false); + } + } diff --git a/warehouse/query-core/src/test/java/datawave/query/TestLimitReturnedGroupsToHitTermGroups.java b/warehouse/query-core/src/test/java/datawave/query/TestLimitReturnedGroupsToHitTermGroups.java index 57871ef4516..3f105da01fd 100644 --- a/warehouse/query-core/src/test/java/datawave/query/TestLimitReturnedGroupsToHitTermGroups.java +++ b/warehouse/query-core/src/test/java/datawave/query/TestLimitReturnedGroupsToHitTermGroups.java @@ -349,9 +349,11 @@ public void testGroupWithExpandedRegexAlphabeticalOrderAndMatchesInGroupPartTwo( goodResults.addAll(Sets.newHashSet("BIRD.PET.0:parakeet", "CANINE.PET.0:beagle")); // disable just for this test to prove group 0 can be returned - logic.setSortQueryBeforeGlobalIndex(false); + logic.setSortQueryPreIndexWithFieldCounts(false); + logic.setSortQueryPreIndexWithImpliedCounts(false); runTestQuery(queryString, format.parse("20091231"), format.parse("20150101"), extraParameters, goodResults); - logic.setSortQueryBeforeGlobalIndex(true); + logic.setSortQueryPreIndexWithFieldCounts(true); + logic.setSortQueryPreIndexWithImpliedCounts(true); } @Test diff --git a/warehouse/query-core/src/test/java/datawave/query/config/ShardQueryConfigurationTest.java b/warehouse/query-core/src/test/java/datawave/query/config/ShardQueryConfigurationTest.java index e5fb7b5217f..1ee5fb44b8b 100644 --- a/warehouse/query-core/src/test/java/datawave/query/config/ShardQueryConfigurationTest.java +++ b/warehouse/query-core/src/test/java/datawave/query/config/ShardQueryConfigurationTest.java @@ -581,14 +581,14 @@ public void setUp() throws Exception { defaultValues.put("groupFields", new GroupFields()); updatedValues.put("groupFields", GroupFields.from("GROUP(FIELD_G,FIELD_H)")); - defaultValues.put("useFieldCounts", false); - updatedValues.put("useFieldCounts", true); - defaultValues.put("useTermCounts", false); - updatedValues.put("useTermCounts", true); - defaultValues.put("sortQueryBeforeGlobalIndex", false); - updatedValues.put("sortQueryBeforeGlobalIndex", true); - defaultValues.put("sortQueryByCounts", false); - updatedValues.put("sortQueryByCounts", true); + defaultValues.put("sortQueryPreIndexWithImpliedCounts", false); + updatedValues.put("sortQueryPreIndexWithImpliedCounts", true); + defaultValues.put("sortQueryPreIndexWithFieldCounts", false); + updatedValues.put("sortQueryPreIndexWithFieldCounts", true); + defaultValues.put("sortQueryPostIndexWithTermCounts", false); + updatedValues.put("sortQueryPostIndexWithTermCounts", true); + defaultValues.put("sortQueryPostIndexWithFieldCounts", false); + updatedValues.put("sortQueryPostIndexWithFieldCounts", true); defaultValues.put("tableConsistencyLevels", Collections.emptyMap()); updatedValues.put("tableConsistencyLevels", Collections.singletonMap(TableName.SHARD, ScannerBase.ConsistencyLevel.EVENTUAL)); defaultValues.put("tableHints", Collections.emptyMap()); diff --git a/warehouse/query-core/src/test/java/datawave/query/discovery/DiscoveryIteratorTest.java b/warehouse/query-core/src/test/java/datawave/query/discovery/DiscoveryIteratorTest.java index 3a47941f047..1ac3c24e829 100644 --- 
a/warehouse/query-core/src/test/java/datawave/query/discovery/DiscoveryIteratorTest.java +++ b/warehouse/query-core/src/test/java/datawave/query/discovery/DiscoveryIteratorTest.java @@ -44,7 +44,6 @@ import datawave.query.iterator.SourceManagerTest; public class DiscoveryIteratorTest { - static final Logger log = Logger.getLogger(DiscoveryIteratorTest.class); @Test public void testHappyPath() throws Throwable { diff --git a/warehouse/query-core/src/test/java/datawave/query/discovery/DiscoveryLogicTest.java b/warehouse/query-core/src/test/java/datawave/query/discovery/DiscoveryLogicTest.java index b8d3c226d0a..704390ae676 100644 --- a/warehouse/query-core/src/test/java/datawave/query/discovery/DiscoveryLogicTest.java +++ b/warehouse/query-core/src/test/java/datawave/query/discovery/DiscoveryLogicTest.java @@ -1,7 +1,5 @@ package datawave.query.discovery; -import static org.junit.Assert.assertEquals; - import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.Collections; @@ -15,21 +13,25 @@ import java.util.concurrent.TimeUnit; import org.apache.accumulo.core.client.AccumuloClient; +import org.apache.accumulo.core.client.AccumuloException; +import org.apache.accumulo.core.client.AccumuloSecurityException; import org.apache.accumulo.core.client.BatchWriter; import org.apache.accumulo.core.client.BatchWriterConfig; +import org.apache.accumulo.core.client.TableExistsException; +import org.apache.accumulo.core.client.TableNotFoundException; import org.apache.accumulo.core.data.Mutation; import org.apache.accumulo.core.data.Value; +import org.apache.accumulo.core.iterators.user.SummingCombiner; import org.apache.accumulo.core.security.Authorizations; import org.apache.accumulo.core.security.ColumnVisibility; +import org.apache.hadoop.io.MapWritable; import org.apache.log4j.Logger; -import org.javatuples.Pair; +import org.assertj.core.api.Assertions; +import org.junit.After; import org.junit.Before; import org.junit.BeforeClass; import org.junit.Test; -import com.google.common.collect.ImmutableSet; -import com.google.common.collect.Sets; - import datawave.core.query.configuration.GenericQueryConfiguration; import datawave.core.query.result.event.DefaultResponseObjectFactory; import datawave.data.type.LcNoDiacriticsType; @@ -42,245 +44,463 @@ import datawave.util.TableName; public class DiscoveryLogicTest { - private static Logger log = Logger.getLogger(DiscoveryLogicTest.class); - protected static Set> terms; - protected static Set> terms2; - protected static Value blank; + private static final Logger log = Logger.getLogger(DiscoveryLogicTest.class); + + private static final Value BLANK_VALUE = new Value(new byte[0]); + private static final Set AUTHS = Collections.singleton(new Authorizations("FOO", "BAR")); + private static final String QUERY_AUTHS = "FOO,BAR"; - protected static Set auths = Collections.singleton(new Authorizations("FOO", "BAR")); - protected static String queryAuths = "FOO,BAR"; - protected AccumuloClient client = null; - protected MockAccumuloRecordWriter recordWriter; - protected DiscoveryLogic logic; - protected SimpleDateFormat dateFormatter = new SimpleDateFormat("yyyyMMdd"); + private final SimpleDateFormat dateFormatter = new SimpleDateFormat("yyyyMMdd"); + private AccumuloClient client = null; + private DiscoveryLogic logic; + + private String query; + private String startDate; + private String endDate; + private Map parameters = new HashMap<>(); + + private final List expected = new ArrayList<>(); @BeforeClass public static void setUp() { - blank 
= new Value(new byte[0]); - terms = Sets.newHashSet(Pair.with("firetruck", "vehicle"), Pair.with("ruddy duck", "bird"), Pair.with("ruddy duck", "unidentified flying object"), - Pair.with("motorcycle", "vehicle"), Pair.with("motorboat", "vehicle"), Pair.with("strike", "actionable offense"), - Pair.with("car", "vehicle"), Pair.with("trophy", "prize"), Pair.with("police officer", "otherperson"), - Pair.with("skydiver", "occupation"), Pair.with("bbc", "network"), Pair.with("onyx", "pokemon"), Pair.with("onyx", "rock"), - Pair.with("onyx", "rooster"), Pair.with("rooster", "cockadoodledoo")); - - terms2 = Sets.newHashSet(Pair.with("skydiver", "job"), Pair.with("skydiver", "job"), Pair.with("skydiver", "job"), Pair.with("skydiver", "job"), - Pair.with("skydiver", "occupation"), Pair.with("skydiver", "occupation"), Pair.with("skydiver", "occupation"), - Pair.with("skydiver", "occupation"), Pair.with("skydiver", "occupation"), Pair.with("skydiver", "occupation"), - Pair.with("skydiver", "occupation"), Pair.with("xxx.skydiver", "occupation"), Pair.with("xxx.skydiver", "occupation"), - Pair.with("xxx.skydiver", "occupation"), Pair.with("xxx.skydiver", "occupation"), Pair.with("xxx.skydiver", "occupation"), - Pair.with("yyy.skydiver", "occupation"), Pair.with("yyy.skydiver", "occupation"), Pair.with("yyy.skydiver", "occupation"), - Pair.with("zskydiver", "occupation")); - System.setProperty(MetadataHelperFactory.ALL_AUTHS_PROPERTY, queryAuths); + System.setProperty(MetadataHelperFactory.ALL_AUTHS_PROPERTY, QUERY_AUTHS); } @Before public void setup() throws Throwable { + initClient(); + writeData(); + initLogic(); + } + + private void initClient() throws AccumuloException, TableNotFoundException, TableExistsException, AccumuloSecurityException { QueryTestTableHelper testTableHelper = new QueryTestTableHelper(DiscoveryLogicTest.class.getCanonicalName(), log); - recordWriter = new MockAccumuloRecordWriter(); + MockAccumuloRecordWriter recordWriter = new MockAccumuloRecordWriter(); testTableHelper.configureTables(recordWriter); - client = testTableHelper.client; - - for (Pair p : terms) { - insertIndex(p); - } - - insertForwardModel("animal", "rooster"); - insertForwardModel("animal", "bird"); - insertReverseModel("occupation", "job"); - - logic = new DiscoveryLogic(); - logic.setIndexTableName(TableName.SHARD_INDEX); - logic.setReverseIndexTableName(TableName.SHARD_RINDEX); - logic.setModelTableName(QueryTestTableHelper.METADATA_TABLE_NAME); - logic.setMetadataTableName(QueryTestTableHelper.METADATA_TABLE_NAME); - logic.setModelName("DATAWAVE"); - logic.setFullTableScanEnabled(false); - logic.setMaxResults(-1); - logic.setMaxWork(-1); - logic.setAllowLeadingWildcard(true); - logic.setResponseObjectFactory(new DefaultResponseObjectFactory()); - logic.setMarkingFunctions(new MarkingFunctions.Default()); - logic.setMetadataHelperFactory(new MetadataHelperFactory()); + this.client = testTableHelper.client; } - protected Uid.List makeUidList(int count) { - Uid.List.Builder builder = Uid.List.newBuilder(); - builder.setIGNORE(true); - builder.setCOUNT(count); - return builder.build(); + private void writeData() throws Throwable { + writeEntries("VEHICLE", "motorcycle", "csv", "FOO", "20130101", 10, 20, 2); + writeEntries("VEHICLE", "motorcycle", "csv", "FOO", "20130102", 10, 20, 2); + writeEntries("ROCK", "onyx", "csv", "FOO", "20130101", 1, 1, 1); + writeEntries("ROCK", "onyx", "csv", "FOO", "20130102", 1, 3, 4); + writeEntries("ROCK", "onyx", "csv", "FOO", "20130103", 1, 3, 3); + writeEntries("POKEMON", 
"onyx", "csv", "FOO", "20130101", 20, 5, 5); + writeEntries("POKEMON", "onyx", "csv", "FOO", "20130102", 10, 1, 1); + writeEntries("POKEMON", "onyx", "csv", "FOO", "20130103", 1, 1, 22); + writeEntries("ROOSTER", "onyx", "csv", "BAR", "20130101", 5, 24, 2); + writeEntries("ROOSTER", "onyx", "csv", "BAR", "20130102", 5, 24, 2); + writeEntries("ROOSTER", "onyx", "csv", "BAR", "20130103", 5, 24, 20); + writeEntries("NETWORK", "bbc", "csv", "FOO", "20130101", 10, 24, 20); + writeEntries("NETWORK", "bbc", "csv", "FOO", "20130102", 10, 24, 20); + writeEntries("NETWORK", "bbc", "csv", "FOO", "20130103", 10, 24, 20); + writeEntries("OCCUPATION", "skydiver", "text", "FOO", "20130101", 10, 10, 5); + writeEntries("OCCUPATION", "skydiver", "text", "FOO", "20130102", 10, 10, 5); + writeEntries("OCCUPATION", "skydiver", "text", "FOO", "20130103", 10, 10, 5); + writeEntries("OCCUPATION", "skydiver", "text", "FOO", "20130104", 10, 10, 5); + writeEntries("OCCUPATION", "xxx.skydiver", "text", "FOO", "20130101", 10, 10, 5); + writeEntries("OCCUPATION", "xxx.skydiver", "text", "FOO", "20130102", 10, 10, 5); + writeEntries("OCCUPATION", "xxx.skydiver", "text", "FOO", "20130103", 10, 10, 5); + writeEntries("OCCUPATION", "xxx.skydiver", "text", "FOO", "20130104", 10, 10, 5); + writeEntries("OCCUPATION", "yyy.skydiver", "text", "FOO", "20130101", 10, 10, 5); + writeEntries("OCCUPATION", "yyy.skydiver", "text", "FOO", "20130102", 10, 10, 5); + writeEntries("OCCUPATION", "yyy.skydiver", "text", "FOO", "20130103", 10, 10, 5); + writeEntries("OCCUPATION", "yyy.skydiver", "text", "FOO", "20130104", 10, 10, 5); + writeEntries("JOB", "skydiver", "text", "BAR", "20130101", 10, 10, 5); + writeEntries("JOB", "skydiver", "text", "BAR", "20130102", 10, 10, 5); + writeEntries("JOB", "skydiver", "text", "BAR", "20130103", 10, 10, 5); + writeEntries("JOB", "skydiver", "text", "BAR", "20130104", 10, 10, 5); + writeEntries("JOB", "police officer", "idem", "FOO", "20130101", 15, 15, 5); + writeEntries("JOB", "police officer", "idem", "FOO", "20130102", 15, 15, 5); + writeEntries("JOB", "police officer", "idem", "FOO", "20130103", 15, 15, 5); + writeEntries("PRIZE", "trophy", "idem", "FOO", "20130101", 1, 5, 5); + writeEntries("PRIZE", "trophy", "idem", "FOO", "20130102", 1, 5, 5); + writeEntries("PRIZE", "trophy", "idem", "FOO", "20130103", 1, 5, 5); + writeEntries("PRIZE", "trophy", "idem", "FOO", "20130104", 1, 5, 5); + writeEntries("FLOCK", "rooster", "stock", "BAR", "20130101", 2, 15, 5); + writeEntries("FLOCK", "rooster", "stock", "BAR", "20130102", 2, 15, 5); + writeEntries("FLOCK", "rooster", "stock", "BAR", "20130103", 2, 15, 5); + writeEntries("BIRD", "ruddy duck", "stock", "FOO", "20130101", 20, 15, 2); + writeEntries("BIRD", "ruddy duck", "stock", "FOO", "20130102", 20, 15, 2); + writeEntries("BIRD", "ruddy duck", "stock", "FOO", "20130103", 20, 15, 2); + writeEntries("VEHICLE", "ranger", "stock", "FOO", "20130101", 20, 15, 2); + writeEntries("VEHICLE", "ranger", "stock", "BAR", "20130101", 1, 1, 2); + writeEntries("VEHICLE", "ranger", "stock", "FOO", "20130102", 20, 15, 2); + writeEntries("VEHICLE", "ranger", "stock", "BAR", "20130102", 5, 5, 5); + writeEntries("VEHICLE", "ranger", "stock", "FOO", "20130103", 20, 15, 2); + writeEntries("VEHICLE", "ranger", "stock", "BAR", "20130103", 6, 1, 2); + + writeForwardModel("ANIMAL", "ROOSTER"); + writeForwardModel("ANIMAL", "BIRD"); + writeReverseModel("occupation", "job"); } - protected void insertIndex(Pair valueField) throws Throwable { + private void writeEntries(String 
field, String term, String datatype, String visibility, String dateStr, int numShards, int uidListCount, int uidListSize) + throws Exception { BatchWriterConfig config = new BatchWriterConfig().setMaxMemory(1024L).setMaxLatency(1, TimeUnit.SECONDS).setMaxWriteThreads(1); - ColumnVisibility viz = new ColumnVisibility("FOO"); - - List dates = new ArrayList<>(); - for (int i = 1; i <= 3; i++) { - dates.add(dateFormatter.parse("2013010" + i)); - } + ColumnVisibility columnVisibility = new ColumnVisibility(visibility); + Date date = dateFormatter.parse(dateStr); try (BatchWriter writer = client.createBatchWriter(QueryTestTableHelper.METADATA_TABLE_NAME, config)) { - Mutation m = new Mutation(valueField.getValue1().toUpperCase()); - m.put("t", "datatype\u0000" + LcNoDiacriticsType.class.getName(), viz, blank); - m.put("i", "datatype", viz, blank); - m.put("ri", "datatype", viz, blank); - writer.addMutation(m); + Mutation mutation = new Mutation(field); + mutation.put("t", datatype + "\u0000" + LcNoDiacriticsType.class.getName(), columnVisibility, BLANK_VALUE); + mutation.put("i", datatype + "\u0000" + dateStr, columnVisibility, new Value(SummingCombiner.VAR_LEN_ENCODER.encode(1L))); + mutation.put("ri", datatype + "\u0000" + dateStr, columnVisibility, new Value(SummingCombiner.VAR_LEN_ENCODER.encode(1L))); + writer.addMutation(mutation); } try (BatchWriter writer = client.createBatchWriter(TableName.SHARD_INDEX, config)) { - Mutation m = new Mutation(valueField.getValue0().toLowerCase()); - int numShards = 10; + Mutation mutation = new Mutation(term); for (int i = 0; i < numShards; i++) { - for (Date date : dates) { - String shard = dateFormatter.format(date); - m.put(valueField.getValue1().toUpperCase(), shard + "_" + i + "\u0000datatype", viz, date.getTime(), - new Value(makeUidList(24).toByteArray())); - } + mutation.put(field, dateStr + "_" + i + "\u0000" + datatype, columnVisibility, date.getTime(), createUidListValue(uidListCount, uidListSize)); } - writer.addMutation(m); + writer.addMutation(mutation); } try (BatchWriter writer = client.createBatchWriter(TableName.SHARD_RINDEX, config)) { - Mutation m = new Mutation(new StringBuilder().append(valueField.getValue0().toLowerCase()).reverse().toString()); - int numShards = 10; + Mutation mutation = new Mutation(new StringBuilder(term).reverse().toString()); for (int i = 0; i < numShards; i++) { - for (Date date : dates) { - String shard = dateFormatter.format(date); - m.put(valueField.getValue1().toUpperCase(), shard + "_" + i + "\u0000datatype", viz, date.getTime(), - new Value(makeUidList(24).toByteArray())); - } + mutation.put(field, dateStr + "_" + i + "\u0000" + datatype, columnVisibility, date.getTime(), createUidListValue(uidListCount, uidListSize)); } - writer.addMutation(m); + writer.addMutation(mutation); } } - protected void insertForwardModel(String from, String to) throws Throwable { + private Value createUidListValue(int count, int listSize) { + Uid.List.Builder builder = Uid.List.newBuilder().setIGNORE(true).setCOUNT(count); + for (int i = 0; i < listSize; i++) { + builder.addUID(UUID.randomUUID().toString()); + } + return new Value(builder.build().toByteArray()); + } + + private void writeForwardModel(String from, String to) throws Throwable { BatchWriterConfig config = new BatchWriterConfig().setMaxMemory(1024L).setMaxLatency(1, TimeUnit.SECONDS).setMaxWriteThreads(1); ColumnVisibility viz = new ColumnVisibility("FOO"); try (BatchWriter writer = client.createBatchWriter(QueryTestTableHelper.METADATA_TABLE_NAME, config)) { - 
Mutation m = new Mutation(from.toUpperCase()); - m.put("DATAWAVE", to.toUpperCase() + "\u0000forward", viz, blank); + Mutation m = new Mutation(from); + m.put("DATAWAVE", to + "\u0000forward", viz, BLANK_VALUE); writer.addMutation(m); } } - protected void insertReverseModel(String from, String to) throws Throwable { + private void writeReverseModel(String from, String to) throws Throwable { BatchWriterConfig config = new BatchWriterConfig().setMaxMemory(1024L).setMaxLatency(1, TimeUnit.SECONDS).setMaxWriteThreads(1); ColumnVisibility viz = new ColumnVisibility("FOO"); try (BatchWriter writer = client.createBatchWriter(QueryTestTableHelper.METADATA_TABLE_NAME, config)) { - Mutation m = new Mutation(from.toUpperCase()); - m.put("DATAWAVE", to.toUpperCase() + "\u0000reverse", viz, blank); + Mutation m = new Mutation(from); + m.put("DATAWAVE", to + "\u0000reverse", viz, BLANK_VALUE); writer.addMutation(m); } } - protected Iterator<DiscoveredThing> runTestQuery(String querystr) throws Throwable { - return runTestQuery(querystr, dateFormatter.parse("20130101"), dateFormatter.parse("20130102")); + private void initLogic() { + logic = new DiscoveryLogic(); + logic.setIndexTableName(TableName.SHARD_INDEX); + logic.setReverseIndexTableName(TableName.SHARD_RINDEX); + logic.setModelTableName(QueryTestTableHelper.METADATA_TABLE_NAME); + logic.setMetadataTableName(QueryTestTableHelper.METADATA_TABLE_NAME); + logic.setModelName("DATAWAVE"); + logic.setFullTableScanEnabled(false); + logic.setMaxResults(-1); + logic.setMaxWork(-1); + logic.setAllowLeadingWildcard(true); + logic.setResponseObjectFactory(new DefaultResponseObjectFactory()); + logic.setMarkingFunctions(new MarkingFunctions.Default()); + logic.setMetadataHelperFactory(new MetadataHelperFactory()); } - protected Iterator<DiscoveredThing> runTestQuery(String querystr, Date startDate, Date endDate) throws Throwable { - return runTestQuery(querystr, new HashMap<>(), startDate, endDate); + @After + public void tearDown() throws Exception { + query = null; + startDate = null; + endDate = null; + parameters.clear(); + expected.clear(); } - protected Iterator<DiscoveredThing> runTestQuery(String querystr, Map<String,String> params, Date startDate, Date endDate) throws Throwable { + private void assertQueryResults() throws Exception { QueryImpl settings = new QueryImpl(); - settings.setBeginDate(startDate); - settings.setEndDate(endDate); - + settings.setBeginDate(dateFormatter.parse(startDate)); + settings.setEndDate(dateFormatter.parse(endDate)); settings.setPagesize(Integer.MAX_VALUE); - settings.setQueryAuthorizations(queryAuths); - settings.setQuery(querystr); + settings.setQueryAuthorizations(QUERY_AUTHS); + settings.setQuery(query); settings.setId(UUID.randomUUID()); - settings.addParameters(params); + settings.addParameters(this.parameters); - GenericQueryConfiguration config = logic.initialize(client, settings, auths); + GenericQueryConfiguration config = logic.initialize(client, settings, AUTHS); logic.setupQuery(config); - return logic.iterator(); + Iterator<DiscoveredThing> iterator = logic.iterator(); + List<DiscoveredThing> actual = new ArrayList<>(); + while (iterator.hasNext()) { + actual.add(iterator.next()); + } + + Assertions.assertThat(actual).hasSize(expected.size()); + for (int i = 0; i < expected.size(); i++) { + DiscoveredThing actualThing = actual.get(i); + DiscoveredThing expectedThing = expected.get(i); + Assertions.assertThat(actualThing).isEqualTo(expectedThing); + Assertions.assertThat(actualThing.getCountsByColumnVisibility()).isEqualTo(expectedThing.getCountsByColumnVisibility()); + } + } + + private void
givenQuery(String query) { + this.query = query; + } + + private void givenStartDate(String startDate) { + this.startDate = startDate; + } + + private void givenEndDate(String endDate) { + this.endDate = endDate; + } + + private void givenParameter(String parameter, String value) { + this.parameters.put(parameter, value); + } + + private void expect(DiscoveredThing discoveredThing) { + this.expected.add(discoveredThing); } @Test - public void testUnfieldedLiterals() throws Throwable { - Set<Pair<String,String>> matches = Sets.newHashSet(); - for (Iterator<DiscoveredThing> it = runTestQuery("bbc OR onyx"); it.hasNext();) { - DiscoveredThing thing = it.next(); - matches.add(Pair.with(thing.getTerm(), thing.getField())); - } - assertEquals(ImmutableSet.of(Pair.with("bbc", "NETWORK"), Pair.with("onyx", "POKEMON"), Pair.with("onyx", "ROCK"), Pair.with("onyx", "ROOSTER")), - matches); + public void testLiterals() throws Exception { + givenQuery("bbc OR onyx"); + givenStartDate("20130101"); + givenEndDate("20130102"); + + expect(new DiscoveredThing("bbc", "NETWORK", "csv", "20130101", "FOO", 240L, new MapWritable())); + expect(new DiscoveredThing("bbc", "NETWORK", "csv", "20130102", "FOO", 240L, new MapWritable())); + expect(new DiscoveredThing("onyx", "POKEMON", "csv", "20130101", "FOO", 100L, new MapWritable())); + expect(new DiscoveredThing("onyx", "POKEMON", "csv", "20130102", "FOO", 10L, new MapWritable())); + expect(new DiscoveredThing("onyx", "ROCK", "csv", "20130101", "FOO", 1L, new MapWritable())); + expect(new DiscoveredThing("onyx", "ROCK", "csv", "20130102", "FOO", 3L, new MapWritable())); + expect(new DiscoveredThing("onyx", "ROOSTER", "csv", "20130101", "BAR", 120L, new MapWritable())); + expect(new DiscoveredThing("onyx", "ROOSTER", "csv", "20130102", "BAR", 120L, new MapWritable())); + + assertQueryResults(); } @Test - public void testUnfieldedPatterns() throws Throwable { - Set<Pair<String,String>> matches = Sets.newHashSet(); - for (Iterator<DiscoveredThing> it = runTestQuery("*er OR m*"); it.hasNext();) { - DiscoveredThing thing = it.next(); - matches.add(Pair.with(thing.getTerm(), thing.getField())); - } + public void testPatterns() throws Exception { + givenQuery("*yx OR b*"); + givenStartDate("20130101"); + givenEndDate("20130102"); + + expect(new DiscoveredThing("bbc", "NETWORK", "csv", "20130101", "FOO", 240L, new MapWritable())); + expect(new DiscoveredThing("bbc", "NETWORK", "csv", "20130102", "FOO", 240L, new MapWritable())); + expect(new DiscoveredThing("onyx", "POKEMON", "csv", "20130101", "FOO", 100L, new MapWritable())); + expect(new DiscoveredThing("onyx", "POKEMON", "csv", "20130102", "FOO", 10L, new MapWritable())); + expect(new DiscoveredThing("onyx", "ROCK", "csv", "20130101", "FOO", 1L, new MapWritable())); + expect(new DiscoveredThing("onyx", "ROCK", "csv", "20130102", "FOO", 3L, new MapWritable())); + expect(new DiscoveredThing("onyx", "ROOSTER", "csv", "20130101", "BAR", 120L, new MapWritable())); + expect(new DiscoveredThing("onyx", "ROOSTER", "csv", "20130102", "BAR", 120L, new MapWritable())); + + assertQueryResults(); + } - assertEquals(ImmutableSet.of(Pair.with("motorcycle", "VEHICLE"), Pair.with("motorboat", "VEHICLE"), Pair.with("police officer", "OTHERPERSON"), - Pair.with("skydiver", "OCCUPATION"), Pair.with("rooster", "COCKADOODLEDOO")), matches); + @Test + public void testPatternAndLiteral() throws Exception { + givenQuery("*er OR trophy"); + givenStartDate("20130102"); + givenEndDate("20130104"); + + expect(new DiscoveredThing("trophy", "PRIZE", "idem", "20130102", "FOO", 5L, new MapWritable())); + expect(new
DiscoveredThing("trophy", "PRIZE", "idem", "20130103", "FOO", 5L, new MapWritable())); + expect(new DiscoveredThing("trophy", "PRIZE", "idem", "20130104", "FOO", 5L, new MapWritable())); + expect(new DiscoveredThing("police officer", "JOB", "idem", "20130102", "FOO", 225L, new MapWritable())); + expect(new DiscoveredThing("police officer", "JOB", "idem", "20130103", "FOO", 225L, new MapWritable())); + expect(new DiscoveredThing("ranger", "VEHICLE", "stock", "20130102", "BAR&FOO", 325L, new MapWritable())); + expect(new DiscoveredThing("ranger", "VEHICLE", "stock", "20130103", "BAR&FOO", 306L, new MapWritable())); + expect(new DiscoveredThing("rooster", "FLOCK", "stock", "20130102", "BAR", 30L, new MapWritable())); + expect(new DiscoveredThing("rooster", "FLOCK", "stock", "20130103", "BAR", 30L, new MapWritable())); + expect(new DiscoveredThing("skydiver", "JOB", "text", "20130102", "BAR", 100L, new MapWritable())); + expect(new DiscoveredThing("skydiver", "JOB", "text", "20130103", "BAR", 100L, new MapWritable())); + expect(new DiscoveredThing("skydiver", "JOB", "text", "20130104", "BAR", 100L, new MapWritable())); + expect(new DiscoveredThing("skydiver", "OCCUPATION", "text", "20130102", "FOO", 100L, new MapWritable())); + expect(new DiscoveredThing("skydiver", "OCCUPATION", "text", "20130103", "FOO", 100L, new MapWritable())); + expect(new DiscoveredThing("skydiver", "OCCUPATION", "text", "20130104", "FOO", 100L, new MapWritable())); + expect(new DiscoveredThing("xxx.skydiver", "OCCUPATION", "text", "20130102", "FOO", 100L, new MapWritable())); + expect(new DiscoveredThing("xxx.skydiver", "OCCUPATION", "text", "20130103", "FOO", 100L, new MapWritable())); + expect(new DiscoveredThing("xxx.skydiver", "OCCUPATION", "text", "20130104", "FOO", 100L, new MapWritable())); + expect(new DiscoveredThing("yyy.skydiver", "OCCUPATION", "text", "20130102", "FOO", 100L, new MapWritable())); + expect(new DiscoveredThing("yyy.skydiver", "OCCUPATION", "text", "20130103", "FOO", 100L, new MapWritable())); + expect(new DiscoveredThing("yyy.skydiver", "OCCUPATION", "text", "20130104", "FOO", 100L, new MapWritable())); + + assertQueryResults(); } @Test - public void testUnfielded() throws Throwable { - Set> matches = Sets.newHashSet(); - for (Iterator it = runTestQuery("*er OR trophy"); it.hasNext();) { - DiscoveredThing thing = it.next(); - matches.add(Pair.with(thing.getTerm(), thing.getField())); - } + public void testFieldedLiterals() throws Exception { + givenQuery("rock:onyx OR pokemon:onyx"); + givenStartDate("20130101"); + givenEndDate("20130104"); + + expect(new DiscoveredThing("onyx", "POKEMON", "csv", "20130101", "FOO", 100L, new MapWritable())); + expect(new DiscoveredThing("onyx", "POKEMON", "csv", "20130102", "FOO", 10L, new MapWritable())); + expect(new DiscoveredThing("onyx", "POKEMON", "csv", "20130103", "FOO", 1L, new MapWritable())); + expect(new DiscoveredThing("onyx", "ROCK", "csv", "20130101", "FOO", 1L, new MapWritable())); + expect(new DiscoveredThing("onyx", "ROCK", "csv", "20130102", "FOO", 3L, new MapWritable())); + expect(new DiscoveredThing("onyx", "ROCK", "csv", "20130103", "FOO", 3L, new MapWritable())); + + assertQueryResults(); + } - assertEquals(ImmutableSet.of(Pair.with("trophy", "PRIZE"), Pair.with("police officer", "OTHERPERSON"), Pair.with("skydiver", "OCCUPATION"), - Pair.with("rooster", "COCKADOODLEDOO")), matches); + @Test + public void testFieldedPatterns() throws Exception { + givenQuery("rock:*n*x OR bird:*r*k"); + givenStartDate("20130101"); + 
givenEndDate("20130103"); + + expect(new DiscoveredThing("ruddy duck", "BIRD", "stock", "20130101", "FOO", 300L, new MapWritable())); + expect(new DiscoveredThing("ruddy duck", "BIRD", "stock", "20130102", "FOO", 300L, new MapWritable())); + expect(new DiscoveredThing("ruddy duck", "BIRD", "stock", "20130103", "FOO", 300L, new MapWritable())); + expect(new DiscoveredThing("onyx", "ROCK", "csv", "20130101", "FOO", 1L, new MapWritable())); + expect(new DiscoveredThing("onyx", "ROCK", "csv", "20130102", "FOO", 3L, new MapWritable())); + expect(new DiscoveredThing("onyx", "ROCK", "csv", "20130103", "FOO", 3L, new MapWritable())); + + assertQueryResults(); } @Test - public void testFieldedLiteral() throws Throwable { - Set> matches = Sets.newHashSet(); - for (Iterator it = runTestQuery("rock:onyx OR pokemon:onyx"); it.hasNext();) { - DiscoveredThing thing = it.next(); - matches.add(Pair.with(thing.getTerm(), thing.getField())); - } - assertEquals(ImmutableSet.of(Pair.with("onyx", "POKEMON"), Pair.with("onyx", "ROCK")), matches); + public void testFieldLiteralAndPattern() throws Exception { + givenQuery("pokemon:onyx OR bird:*r*k"); + givenStartDate("20130101"); + givenEndDate("20130104"); + + expect(new DiscoveredThing("onyx", "POKEMON", "csv", "20130101", "FOO", 100L, new MapWritable())); + expect(new DiscoveredThing("onyx", "POKEMON", "csv", "20130102", "FOO", 10L, new MapWritable())); + expect(new DiscoveredThing("onyx", "POKEMON", "csv", "20130103", "FOO", 1L, new MapWritable())); + expect(new DiscoveredThing("ruddy duck", "BIRD", "stock", "20130101", "FOO", 300L, new MapWritable())); + expect(new DiscoveredThing("ruddy duck", "BIRD", "stock", "20130102", "FOO", 300L, new MapWritable())); + expect(new DiscoveredThing("ruddy duck", "BIRD", "stock", "20130103", "FOO", 300L, new MapWritable())); + + assertQueryResults(); } @Test - public void testFieldedPattern() throws Throwable { - Set> matches = Sets.newHashSet(); - for (Iterator it = runTestQuery("vehicle:*r*k OR bird:*r*k"); it.hasNext();) { - DiscoveredThing thing = it.next(); - matches.add(Pair.with(thing.getTerm(), thing.getField())); - } + public void testReverse() throws Exception { + givenQuery("*.sky*er"); + givenStartDate("20130101"); + givenEndDate("20130104"); + + expect(new DiscoveredThing("xxx.skydiver", "OCCUPATION", "text", "20130101", "FOO", 100L, new MapWritable())); + expect(new DiscoveredThing("xxx.skydiver", "OCCUPATION", "text", "20130102", "FOO", 100L, new MapWritable())); + expect(new DiscoveredThing("xxx.skydiver", "OCCUPATION", "text", "20130103", "FOO", 100L, new MapWritable())); + expect(new DiscoveredThing("xxx.skydiver", "OCCUPATION", "text", "20130104", "FOO", 100L, new MapWritable())); + expect(new DiscoveredThing("yyy.skydiver", "OCCUPATION", "text", "20130101", "FOO", 100L, new MapWritable())); + expect(new DiscoveredThing("yyy.skydiver", "OCCUPATION", "text", "20130102", "FOO", 100L, new MapWritable())); + expect(new DiscoveredThing("yyy.skydiver", "OCCUPATION", "text", "20130103", "FOO", 100L, new MapWritable())); + expect(new DiscoveredThing("yyy.skydiver", "OCCUPATION", "text", "20130104", "FOO", 100L, new MapWritable())); + + assertQueryResults(); + } - assertEquals(ImmutableSet.of(Pair.with("firetruck", "VEHICLE"), Pair.with("ruddy duck", "BIRD")), matches); + @Test + public void testSumCountsForLiterals() throws Exception { + givenQuery("bbc OR onyx"); + givenStartDate("20130101"); + givenEndDate("20130102"); + givenParameter(DiscoveryLogic.SUM_COUNTS, "true"); + + expect(new DiscoveredThing("bbc", 
"NETWORK", "csv", "", "FOO", 480L, new MapWritable())); + expect(new DiscoveredThing("onyx", "POKEMON", "csv", "", "FOO", 110L, new MapWritable())); + expect(new DiscoveredThing("onyx", "ROCK", "csv", "", "FOO", 4L, new MapWritable())); + expect(new DiscoveredThing("onyx", "ROOSTER", "csv", "", "BAR", 240L, new MapWritable())); + + assertQueryResults(); } @Test - public void testFielded() throws Throwable { - Set> matches = Sets.newHashSet(); - for (Iterator it = runTestQuery("pokemon:onyx OR bird:*r*k"); it.hasNext();) { - DiscoveredThing thing = it.next(); - matches.add(Pair.with(thing.getTerm(), thing.getField())); - } + public void testSumCountsForPatterns() throws Exception { + givenQuery("*yx OR b*"); + givenStartDate("20130101"); + givenEndDate("20130102"); + givenParameter(DiscoveryLogic.SUM_COUNTS, "true"); + + expect(new DiscoveredThing("bbc", "NETWORK", "csv", "", "FOO", 480L, new MapWritable())); + expect(new DiscoveredThing("onyx", "POKEMON", "csv", "", "FOO", 110L, new MapWritable())); + expect(new DiscoveredThing("onyx", "ROCK", "csv", "", "FOO", 4L, new MapWritable())); + expect(new DiscoveredThing("onyx", "ROOSTER", "csv", "", "BAR", 240L, new MapWritable())); + + assertQueryResults(); + } - assertEquals(ImmutableSet.of(Pair.with("onyx", "POKEMON"), Pair.with("ruddy duck", "BIRD")), matches); + @Test + public void testSumCountsForPatternAndLiteral() throws Exception { + givenQuery("*er OR trophy"); + givenStartDate("20130102"); + givenEndDate("20130104"); + givenParameter(DiscoveryLogic.SUM_COUNTS, "true"); + + expect(new DiscoveredThing("trophy", "PRIZE", "idem", "", "FOO", 15L, new MapWritable())); + expect(new DiscoveredThing("police officer", "JOB", "idem", "", "FOO", 450L, new MapWritable())); + expect(new DiscoveredThing("ranger", "VEHICLE", "stock", "", "BAR&FOO", 631L, new MapWritable())); + expect(new DiscoveredThing("rooster", "FLOCK", "stock", "", "BAR", 60L, new MapWritable())); + expect(new DiscoveredThing("skydiver", "JOB", "text", "", "BAR", 300L, new MapWritable())); + expect(new DiscoveredThing("skydiver", "OCCUPATION", "text", "", "FOO", 300L, new MapWritable())); + expect(new DiscoveredThing("xxx.skydiver", "OCCUPATION", "text", "", "FOO", 300L, new MapWritable())); + expect(new DiscoveredThing("yyy.skydiver", "OCCUPATION", "text", "", "FOO", 300L, new MapWritable())); + + assertQueryResults(); } @Test - public void testReverse() throws Throwable { - for (Pair p : terms2) { - insertIndex(p); - } + public void testSumCountsForFieldedLiterals() throws Exception { + givenQuery("rock:onyx OR pokemon:onyx"); + givenStartDate("20130101"); + givenEndDate("20130104"); + givenParameter(DiscoveryLogic.SUM_COUNTS, "true"); - Set> matches = Sets.newHashSet(); - for (Iterator it = runTestQuery("*.sky*er"); it.hasNext();) { - DiscoveredThing thing = it.next(); - matches.add(Pair.with(thing.getTerm(), thing.getField())); - } + expect(new DiscoveredThing("onyx", "POKEMON", "csv", "", "FOO", 111L, new MapWritable())); + expect(new DiscoveredThing("onyx", "ROCK", "csv", "", "FOO", 7L, new MapWritable())); + + assertQueryResults(); + } + + @Test + public void testSumCountsForFieldedPatterns() throws Exception { + givenQuery("rock:*n*x OR bird:*r*k"); + givenStartDate("20130101"); + givenEndDate("20130103"); + givenParameter(DiscoveryLogic.SUM_COUNTS, "true"); + + expect(new DiscoveredThing("ruddy duck", "BIRD", "stock", "", "FOO", 900L, new MapWritable())); + expect(new DiscoveredThing("onyx", "ROCK", "csv", "", "FOO", 7L, new MapWritable())); - 
assertEquals(ImmutableSet.of(Pair.with("xxx.skydiver", "OCCUPATION"), Pair.with("yyy.skydiver", "OCCUPATION")), matches); + assertQueryResults(); } + @Test + public void testSumCountsForFieldLiteralAndPattern() throws Exception { + givenQuery("pokemon:onyx OR bird:*r*k"); + givenStartDate("20130101"); + givenEndDate("20130104"); + givenParameter(DiscoveryLogic.SUM_COUNTS, "true"); + + expect(new DiscoveredThing("onyx", "POKEMON", "csv", "", "FOO", 111L, new MapWritable())); + expect(new DiscoveredThing("ruddy duck", "BIRD", "stock", "", "FOO", 900L, new MapWritable())); + + assertQueryResults(); + } + + @Test + public void testSumCountsForReverse() throws Exception { + givenQuery("*.sky*er"); + givenStartDate("20130101"); + givenEndDate("20130104"); + givenParameter(DiscoveryLogic.SUM_COUNTS, "true"); + + expect(new DiscoveredThing("xxx.skydiver", "OCCUPATION", "text", "", "FOO", 400L, new MapWritable())); + expect(new DiscoveredThing("yyy.skydiver", "OCCUPATION", "text", "", "FOO", 400L, new MapWritable())); + + assertQueryResults(); + } } diff --git a/warehouse/query-core/src/test/java/datawave/query/index/lookup/IndexInfoTest.java b/warehouse/query-core/src/test/java/datawave/query/index/lookup/IndexInfoTest.java index 4f9ddec3d2a..feed357294f 100644 --- a/warehouse/query-core/src/test/java/datawave/query/index/lookup/IndexInfoTest.java +++ b/warehouse/query-core/src/test/java/datawave/query/index/lookup/IndexInfoTest.java @@ -11,10 +11,8 @@ import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; -import java.util.HashMap; import java.util.HashSet; import java.util.List; -import java.util.Map; import java.util.Set; import org.apache.commons.jexl3.parser.JexlNode; diff --git a/warehouse/query-core/src/test/java/datawave/query/index/lookup/RangeStreamTestX.java b/warehouse/query-core/src/test/java/datawave/query/index/lookup/RangeStreamTestX.java index ef440984a4d..9abd2c30a91 100644 --- a/warehouse/query-core/src/test/java/datawave/query/index/lookup/RangeStreamTestX.java +++ b/warehouse/query-core/src/test/java/datawave/query/index/lookup/RangeStreamTestX.java @@ -343,6 +343,26 @@ public static void setupAccumulo() throws Exception { m.put(new Text("F4"), new Text("20200101_13\0datatype1"), valueForShard); bw.addMutation(m); + // --------------- some entries for post-index sorting via field or term counts + + m = new Mutation("23"); + m.put(new Text("FIELD_A"), new Text("20200101_10\0sort-type"), createValue(23L)); + m.put(new Text("FIELD_B"), new Text("20200101_10\0sort-type"), createValue(23L)); + m.put(new Text("FIELD_C"), new Text("20200101_10\0sort-type"), createValue(23L)); + bw.addMutation(m); + + m = new Mutation("34"); + m.put(new Text("FIELD_A"), new Text("20200101_10\0sort-type"), createValue(34L)); + m.put(new Text("FIELD_B"), new Text("20200101_10\0sort-type"), createValue(34L)); + m.put(new Text("FIELD_C"), new Text("20200101_10\0sort-type"), createValue(34L)); + bw.addMutation(m); + + m = new Mutation("45"); + m.put(new Text("FIELD_A"), new Text("20200101_10\0sort-type"), createValue(45L)); + m.put(new Text("FIELD_B"), new Text("20200101_10\0sort-type"), createValue(45L)); + m.put(new Text("FIELD_C"), new Text("20200101_10\0sort-type"), createValue(45L)); + bw.addMutation(m); + // --------------- bw.flush(); @@ -358,6 +378,21 @@ private static Value buildValueForShard() { return new Value(list.toByteArray()); } + /** + * Create a value with a count + * + * @param count + * the count + * @return a value + */ + private static Value createValue(long 
count) { + Uid.List.Builder builder = Uid.List.newBuilder(); + builder.setIGNORE(true); + builder.setCOUNT(count); + Uid.List list = builder.build(); + return new Value(list.toByteArray()); + } + // A value that will roll into a day range. private static Value buildValueForDay() { Uid.List.Builder builder = Uid.List.newBuilder(); @@ -371,6 +406,10 @@ private static Value buildValueForDay() { public void setupTest() { config = new ShardQueryConfiguration(); config.setClient(client); + + // disable all post-index sort options by default + config.setSortQueryPostIndexWithFieldCounts(false); + config.setSortQueryPostIndexWithTermCounts(false); } // A && B @@ -3400,6 +3439,24 @@ public void testOrAndOrWithDeeplyNestedDelayedTerm() throws Exception { runTest(query, expectedRanges, expectedQueries); } + @Test + public void testSortingByFieldCardinality() { + String query = "FIELD_A == '45' || FIELD_B == '34' || FIELD_C == '23'"; + String expected = "(FIELD_C == '23' || FIELD_B == '34' || FIELD_A == '45')"; + + config.setSortQueryPostIndexWithFieldCounts(true); + drive(query, expected); + } + + @Test + public void testSortingByTermCardinality() { + String query = "FIELD_A == '45' || FIELD_B == '34' || FIELD_C == '23'"; + String expected = "(FIELD_C == '23' || FIELD_B == '34' || FIELD_A == '45')"; + + config.setSortQueryPostIndexWithTermCounts(true); + drive(query, expected); + } + private void runTest(String query, List<Range> expectedRanges, List<String> expectedQueries) throws Exception { assertEquals("Expected ranges and queries do not match, ranges: " + expectedRanges.size() + " queries: " + expectedQueries.size(), @@ -3485,4 +3542,53 @@ private void runTest(RangeStream rangeStream, ASTJexlScript script, List<Range> if (queryIter.hasNext()) fail("Expected queries still exist after test: " + queryIter.next()); } + + /** + * Drives a query against a subset of the index data to verify post-index sorting options + * + * @param query + * the input query + * @param expected + * the expected query + */ + private void drive(String query, String expected) { + try { + ASTJexlScript script = JexlASTHelper.parseJexlQuery(query); + + SimpleDateFormat sdf = new SimpleDateFormat("yyyyMMdd"); + config.setBeginDate(sdf.parse("20200101")); + config.setEndDate(sdf.parse("20200105")); + + config.setDatatypeFilter(Sets.newHashSet("sort-type")); + + Multimap<String,Type<?>> dataTypes = HashMultimap.create(); + dataTypes.putAll("FIELD_A", Sets.newHashSet(new LcNoDiacriticsType())); + dataTypes.putAll("FIELD_B", Sets.newHashSet(new LcNoDiacriticsType())); + dataTypes.putAll("FIELD_C", Sets.newHashSet(new LcNoDiacriticsType())); + + config.setQueryFieldsDatatypes(dataTypes); + config.setIndexedFields(dataTypes); + + MockMetadataHelper helper = new MockMetadataHelper(); + helper.setIndexedFields(dataTypes.keySet()); + + // Run a standard limited-scanner range stream.
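+ // Why the sort tests above hold: the "23"/"34"/"45" rows in setupAccumulo were written with createValue(23L), createValue(34L) and createValue(45L), so the Uid.List COUNT stored in the index is the cardinality the field/term-count sort consumes; with either sortQueryPostIndexWith*Counts option enabled, the planned union comes back lowest-count-first (FIELD_C == '23', then FIELD_B == '34', then FIELD_A == '45').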
+ ScannerFactory scannerFactory = new ScannerFactory(config); + try (RangeStream rangeStream = new RangeStream(config, scannerFactory, helper)) { + rangeStream.setLimitScanners(true); + + Iterator plans = rangeStream.streamPlans(script).iterator(); + + assertTrue(plans.hasNext()); + QueryPlan plan = plans.next(); + + String plannedQuery = plan.getQueryString(); + assertEquals(expected, plannedQuery); + + assertFalse(plans.hasNext()); + } + } catch (Exception e) { + fail("test failed: " + e.getMessage()); + } + } } diff --git a/warehouse/query-core/src/test/java/datawave/query/jexl/functions/ContentFunctionsDescriptorTest.java b/warehouse/query-core/src/test/java/datawave/query/jexl/functions/ContentFunctionsDescriptorTest.java index c5af43d9a6d..a3774a319dd 100644 --- a/warehouse/query-core/src/test/java/datawave/query/jexl/functions/ContentFunctionsDescriptorTest.java +++ b/warehouse/query-core/src/test/java/datawave/query/jexl/functions/ContentFunctionsDescriptorTest.java @@ -88,29 +88,31 @@ private void assertHitTermValues(ContentJexlArgumentDescriptor jexlDescriptor, S @Test @SuppressWarnings("unchecked") void testFieldsAndTerms() { - assertFieldsAndTerms(getDescriptor(unfieldedPhrase), new Set[] {Set.of(), Set.of("foo", "bar")}); - assertFieldsAndTerms(getDescriptor(fieldedPhrase), new Set[] {Set.of("FIELD"), Set.of("foo", "bar")}); - assertFieldsAndTerms(getDescriptor(multiFieldedPhrase), new Set[] {Set.of("FIELD_A", "FIELD_B"), Set.of("foo", "bar")}); + assertFieldsAndTerms(getDescriptor(unfieldedPhrase), Set.of(), Set.of("foo", "bar")); + assertFieldsAndTerms(getDescriptor(fieldedPhrase), Set.of("FIELD"), Set.of("foo", "bar")); + assertFieldsAndTerms(getDescriptor(multiFieldedPhrase), Set.of("FIELD_A", "FIELD_B"), Set.of("foo", "bar")); - assertFieldsAndTerms(getDescriptor(unfieldedScoredPhrase), new Set[] {Set.of(), Set.of("foo", "bar")}); - assertFieldsAndTerms(getDescriptor(fieldedScoredPhrase), new Set[] {Set.of("FIELD"), Set.of("foo", "bar")}); - assertFieldsAndTerms(getDescriptor(multiFieldedScoredPhrase), new Set[] {Set.of("FIELD_A", "FIELD_B"), Set.of("foo", "bar")}); + assertFieldsAndTerms(getDescriptor(unfieldedScoredPhrase), Set.of(), Set.of("foo", "bar")); + assertFieldsAndTerms(getDescriptor(fieldedScoredPhrase), Set.of("FIELD"), Set.of("foo", "bar")); + assertFieldsAndTerms(getDescriptor(multiFieldedScoredPhrase), Set.of("FIELD_A", "FIELD_B"), Set.of("foo", "bar")); - assertFieldsAndTerms(getDescriptor(unfieldedAdjacent), new Set[] {Set.of(), Set.of("foo", "bar")}); - assertFieldsAndTerms(getDescriptor(fieldedAdjacent), new Set[] {Set.of("FIELD"), Set.of("foo", "bar")}); - assertFieldsAndTerms(getDescriptor(multiFieldedAdjacent), new Set[] {Set.of("FIELD_A", "FIELD_B"), Set.of("foo", "bar")}); + assertFieldsAndTerms(getDescriptor(unfieldedAdjacent), Set.of(), Set.of("foo", "bar")); + assertFieldsAndTerms(getDescriptor(fieldedAdjacent), Set.of("FIELD"), Set.of("foo", "bar")); + assertFieldsAndTerms(getDescriptor(multiFieldedAdjacent), Set.of("FIELD_A", "FIELD_B"), Set.of("foo", "bar")); - assertFieldsAndTerms(getDescriptor(unfieldedWithin), new Set[] {Set.of(), Set.of("foo", "bar")}); - assertFieldsAndTerms(getDescriptor(fieldedWithin), new Set[] {Set.of("FIELD"), Set.of("foo", "bar")}); - assertFieldsAndTerms(getDescriptor(multiFieldedWithin), new Set[] {Set.of("FIELD_A", "FIELD_B"), Set.of("foo", "bar")}); + assertFieldsAndTerms(getDescriptor(unfieldedWithin), Set.of(), Set.of("foo", "bar")); + assertFieldsAndTerms(getDescriptor(fieldedWithin), Set.of("FIELD"), 
Set.of("foo", "bar")); + assertFieldsAndTerms(getDescriptor(multiFieldedWithin), Set.of("FIELD_A", "FIELD_B"), Set.of("foo", "bar")); } - private void assertFieldsAndTerms(ContentJexlArgumentDescriptor jexlDescriptor, Set[] expected) { - Set[] fieldsAndTerms = jexlDescriptor.fieldsAndTerms(Set.of(), Set.of(), Set.of(), new MutableBoolean(true)); - assertArrayEquals(expected, fieldsAndTerms); + private void assertFieldsAndTerms(ContentJexlArgumentDescriptor jexlDescriptor, Set fields, Set terms) { + ContentFunctionsDescriptor.FieldTerms fieldsAndTerms = jexlDescriptor.fieldsAndTerms(Set.of(), Set.of(), Set.of(), new MutableBoolean(true)); + assertEquals(fields, fieldsAndTerms.getFields()); + assertEquals(terms, fieldsAndTerms.getTerms()); fieldsAndTerms = jexlDescriptor.fieldsAndTerms(Set.of(), Set.of(), Set.of(), new MutableBoolean(true), false); - assertArrayEquals(expected, fieldsAndTerms); + assertEquals(fields, fieldsAndTerms.getFields()); + assertEquals(terms, fieldsAndTerms.getTerms()); } @Test diff --git a/warehouse/query-core/src/test/java/datawave/query/jexl/lookups/BoundedRangeIndexLookupTest.java b/warehouse/query-core/src/test/java/datawave/query/jexl/lookups/BoundedRangeIndexLookupTest.java new file mode 100644 index 00000000000..2ec75fd13e3 --- /dev/null +++ b/warehouse/query-core/src/test/java/datawave/query/jexl/lookups/BoundedRangeIndexLookupTest.java @@ -0,0 +1,314 @@ +package datawave.query.jexl.lookups; + +import static org.easymock.EasyMock.createMock; +import static org.easymock.EasyMock.eq; +import static org.easymock.EasyMock.expect; +import static org.easymock.EasyMock.isA; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertTrue; +import static org.powermock.api.easymock.PowerMock.replayAll; +import static org.powermock.api.easymock.PowerMock.verifyAll; + +import java.text.SimpleDateFormat; +import java.util.AbstractMap; +import java.util.ArrayList; +import java.util.Date; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.SortedSet; +import java.util.TreeSet; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; + +import org.apache.accumulo.core.client.AccumuloClient; +import org.apache.accumulo.core.client.BatchWriter; +import org.apache.accumulo.core.client.BatchWriterConfig; +import org.apache.accumulo.core.client.TableNotFoundException; +import org.apache.accumulo.core.client.security.tokens.PasswordToken; +import org.apache.accumulo.core.data.Key; +import org.apache.accumulo.core.data.Mutation; +import org.apache.accumulo.core.data.Value; +import org.apache.accumulo.minicluster.MiniAccumuloCluster; +import org.easymock.EasyMockSupport; +import org.junit.After; +import org.junit.Before; +import org.junit.BeforeClass; +import org.junit.ClassRule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; + +import datawave.microservice.query.QueryImpl; +import datawave.query.config.ShardQueryConfiguration; +import datawave.query.iterator.SortedListKeyValueIterator; +import datawave.query.jexl.LiteralRange; +import datawave.query.scanner.LocalBatchScanner; +import datawave.query.tables.ScannerFactory; +import datawave.util.TableName; +import datawave.util.time.DateHelper; + +public class BoundedRangeIndexLookupTest extends EasyMockSupport { + + @ClassRule + public static TemporaryFolder temporaryFolder = new TemporaryFolder(); + + private static final String PASSWORD = "password"; + 
+ private static final String shard = "2024070"; + private static final Set fields = Set.of("FIELD_A", "FIELD_B", "FIELD_C", "FIELD_D", "FIELD_E"); + private static final Set datatypes = Set.of("datatype-a", "datatype-b", "datatype-c", "datatype-d", "datatype-e"); + + private static MiniAccumuloCluster cluster; + private static AccumuloClient client; + private ExecutorService executorService; + + private ShardQueryConfiguration config; + private ScannerFactory scannerFactory; + + private final SortedSet expected = new TreeSet<>(); + + // variables for large row test + private BoundedRangeIndexLookup largeLookup; + private ShardQueryConfiguration largeConfig; + private ScannerFactory largeScannerFactory; + + @BeforeClass + public static void setupClass() throws Exception { + cluster = new MiniAccumuloCluster(temporaryFolder.newFolder(), PASSWORD); + cluster.start(); + + client = cluster.createAccumuloClient("root", new PasswordToken(PASSWORD)); + + writeData(); + } + + @Before + public void setup() { + scannerFactory = new ScannerFactory(client); + + config = new ShardQueryConfiguration(); + config.setClient(client); + + executorService = Executors.newFixedThreadPool(5); + + expected.clear(); + + // large lookup + largeConfig = new ShardQueryConfiguration(); + largeScannerFactory = createMock(ScannerFactory.class); + } + + @After + public void teardown() { + executorService.shutdownNow(); + } + + public static void writeData() throws Exception { + client.tableOperations().create(TableName.SHARD_INDEX); + + int numTerms = 25; + + try (BatchWriter bw = client.createBatchWriter(TableName.SHARD_INDEX, new BatchWriterConfig())) { + for (int i = 0; i < numTerms; i++) { + Mutation m = new Mutation("value-" + i); + for (String field : fields) { + for (int j = 0; j < 10; j++) { + for (String datatype : datatypes) { + for (int k = 0; k < 5; k++) { + m.put(field, shard + j + '_' + k + '\u0000' + datatype, new Value()); + } + } + } + } + bw.addMutation(m); + } + } + } + + @Test + public void testSingleDay_singleValue() { + withDateRange("20240701", "20240701"); + withDatatypeFilter(Set.of("datatype-b")); + withExpected(Set.of("value-1")); + BoundedRangeIndexLookup lookup = createLookup("FIELD_A", "value-1", "value-1"); + test(lookup, "FIELD_A"); + } + + @Test + public void testSingleDay_multiValue() { + withDateRange("20240701", "20240701"); + withExpected(Set.of("value-10", "value-12", "value-11", "value-14", "value-13", "value-16", "value-15", "value-18", "value-17", "value-19", "value-1", + "value-2")); + BoundedRangeIndexLookup lookup = createLookup("FIELD_A", "value-1", "value-2"); + test(lookup, "FIELD_A"); + } + + @Test + public void testSingleDay_allValues() { + withDateRange("20240701", "20240701"); + withExpected(createAllValues(1, 25)); + BoundedRangeIndexLookup lookup = createLookup("FIELD_A", "value-1", "value-9"); + test(lookup, "FIELD_A"); + } + + @Test + public void testMultiDay_singleValue() { + withDateRange("20240701", "20240703"); + withExpected(Set.of("value-1")); + BoundedRangeIndexLookup lookup = createLookup("FIELD_A", "value-1", "value-1"); + test(lookup, "FIELD_A"); + } + + @Test + public void testMultiDay_multiValue() { + withDateRange("20240701", "20240703"); + withExpected(Set.of("value-3", "value-4", "value-5")); + BoundedRangeIndexLookup lookup = createLookup("FIELD_A", "value-3", "value-5"); + test(lookup, "FIELD_A"); + } + + @Test + public void testMultiDay_allValues() { + withDateRange("20240701", "20240703"); + withExpected(createAllValues(1, 25)); + 
BoundedRangeIndexLookup lookup = createLookup("FIELD_A", "value-1", "value-9"); + test(lookup, "FIELD_A"); + } + + @Test + public void testAllDays_singleValue() { + withDateRange("20240701", "20240709"); + withExpected(Set.of("value-1")); + BoundedRangeIndexLookup lookup = createLookup("FIELD_A", "value-1", "value-1"); + test(lookup, "FIELD_A"); + } + + @Test + public void testAllDays_multiValue() { + withDateRange("20240701", "20240709"); + withExpected(Set.of("value-21", "value-3", "value-2", "value-20", "value-23", "value-22", "value-24")); + BoundedRangeIndexLookup lookup = createLookup("FIELD_A", "value-2", "value-3"); + test(lookup, "FIELD_A"); + } + + @Test + public void testAllDays_allValues() { + withDateRange("20240701", "20240709"); + withExpected(createAllValues(1, 25)); + + BoundedRangeIndexLookup lookup = createLookup("FIELD_A", "value-1", "value-9"); + test(lookup, "FIELD_A"); + } + + @Test + public void testInvalidDateRange() { + withDateRange("20240808", "20240909"); + BoundedRangeIndexLookup lookup = createLookup("FIELD_A", "value-1", "value-1"); + test(lookup, "FIELD_A"); + } + + @Test + public void testInvalidBoundedRange() { + withDateRange("20240701", "20240709"); + BoundedRangeIndexLookup lookup = createLookup("FIELD_A", "abc", "def"); + test(lookup, "FIELD_A"); + } + + @Test + public void testInvalidField() { + withDateRange("20240701", "20240709"); + BoundedRangeIndexLookup lookup = createLookup("FIELD_Z", "value-1", "value-1"); + test(lookup, "FIELD_Z"); + } + + @Test + public void testInvalidDataTypeFilter() { + withDateRange("20240701", "20240709"); + withDatatypeFilter(Set.of("datatype-z")); + BoundedRangeIndexLookup lookup = createLookup("FIELD_A", "value-1", "value-1"); + test(lookup, "FIELD_A"); + } + + private void test(BoundedRangeIndexLookup lookup, String field) { + lookup.submit(); + + IndexLookupMap lookupMap = lookup.lookup(); + + if (expected.isEmpty()) { + assertTrue(lookupMap.keySet().isEmpty()); + } else { + assertTrue(lookupMap.containsKey(field)); + Set<String> values = new HashSet<>(lookupMap.get(field)); + assertEquals(expected, values); + } + } + + private BoundedRangeIndexLookup createLookup(String field, String lower, String upper) { + LiteralRange<String> range = new LiteralRange<>(lower, true, upper, true, field, LiteralRange.NodeOperand.AND); + return createLookup(range); + } + + private BoundedRangeIndexLookup createLookup(LiteralRange<String> range) { + return new BoundedRangeIndexLookup(config, scannerFactory, range, executorService); + } + + private void withDateRange(String start, String end) { + assertNotNull(config); + config.setBeginDate(DateHelper.parse(start)); + config.setEndDate(DateHelper.parse(end)); + } + + private void withDatatypeFilter(Set<String> datatypes) { + assertNotNull(config); + config.setDatatypeFilter(datatypes); + } + + private void withExpected(Set<String> expected) { + assertTrue("should only set expected values once per test", this.expected.isEmpty()); + this.expected.addAll(expected); + } + + private Set<String> createAllValues(int start, int stop) { + Set<String> values = new HashSet<>(); + for (int i = start; i < stop; i++) { + values.add("value-" + i); + } + return values; + } + + @Test + public void largeRowInBoundedRangeTest() throws TableNotFoundException { + ExecutorService s = Executors.newSingleThreadExecutor(); + Date begin = new Date(); + Date end = new Date(); + config.setBeginDate(begin); + config.setEndDate(end); + config.setNumQueryThreads(1); + // defaults to 5000 + config.setMaxValueExpansionThreshold(1); + SimpleDateFormat sdf = new
SimpleDateFormat("YYYYMMdd"); + LiteralRange range = new LiteralRange("R", true, "S", false, "FOO", LiteralRange.NodeOperand.OR); + largeLookup = new BoundedRangeIndexLookup(config, largeScannerFactory, range, s); + // create index data to iterate over + List> src = new ArrayList<>(); + for (int i = 0; i < 10000; i++) { + src.add(new AbstractMap.SimpleImmutableEntry<>(new Key("R" + i, "FOO", sdf.format(begin) + "_1" + '\0' + "myDataType"), new Value())); + } + SortedListKeyValueIterator itr = new SortedListKeyValueIterator(src); + LocalBatchScanner scanner = new LocalBatchScanner(itr, true); + // add expects for the scanner factory + expect(largeScannerFactory.newScanner(eq("shardIndex"), isA(Set.class), eq(1), isA(QueryImpl.class), eq("shardIndex"))).andAnswer(() -> scanner); + expect(largeScannerFactory.close(scanner)).andReturn(true); + replayAll(); + largeLookup.submit(); + IndexLookupMap map = largeLookup.lookup(); + // verify we went over all the data even though the threshold was lower than this + assertEquals(10001, scanner.getSeekCount()); // with new iterator this is initial seek + one seek per unique row in the range + // this represents data collapsed and sent back to the client by the WholeRowIterator + assertEquals(0, scanner.getNextCount()); // no next cals with seeking filter + assertTrue(map.get("FOO").isThresholdExceeded()); + verifyAll(); + } +} diff --git a/warehouse/query-core/src/test/java/datawave/query/jexl/lookups/FieldNameIndexLookupTest.java b/warehouse/query-core/src/test/java/datawave/query/jexl/lookups/FieldNameIndexLookupTest.java index e7357fce8f2..5861c8af516 100644 --- a/warehouse/query-core/src/test/java/datawave/query/jexl/lookups/FieldNameIndexLookupTest.java +++ b/warehouse/query-core/src/test/java/datawave/query/jexl/lookups/FieldNameIndexLookupTest.java @@ -1,12 +1,10 @@ package datawave.query.jexl.lookups; import static org.easymock.EasyMock.anyObject; -import static org.easymock.EasyMock.eq; import static org.easymock.EasyMock.expect; import static org.easymock.EasyMock.expectLastCall; import static org.easymock.EasyMock.isA; -import java.io.IOException; import java.lang.reflect.InvocationTargetException; import java.util.Date; import java.util.HashSet; @@ -14,22 +12,15 @@ import java.util.concurrent.Callable; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; -import java.util.concurrent.Future; -import java.util.concurrent.TimeoutException; -import org.apache.accumulo.core.clientImpl.ScannerOptions; -import org.easymock.EasyMock; import org.easymock.EasyMockSupport; import org.junit.Before; import org.junit.Test; -import datawave.microservice.query.Query; import datawave.microservice.query.QueryImpl; import datawave.query.config.ShardQueryConfiguration; -import datawave.query.enrich.DataEnricher; import datawave.query.tables.AnyFieldScanner; import datawave.query.tables.ScannerFactory; -import datawave.query.tables.ScannerSession; import datawave.query.tables.SessionOptions; public class FieldNameIndexLookupTest extends EasyMockSupport { @@ -57,6 +48,7 @@ public void initTest() { } @Test(expected = RuntimeException.class) + @SuppressWarnings({"unchecked", "ConstantConditions"}) public void submitErrorEnsureCloseTest() throws InvocationTargetException, NoSuchMethodException, InstantiationException, IllegalAccessException { AnyFieldScanner scannerSession = createMock(AnyFieldScanner.class); @@ -70,12 +62,11 @@ public void submitErrorEnsureCloseTest() throws InvocationTargetException, NoSuc terms.add("lookMeUp"); 
lookup = new FieldNameIndexLookup(config, scannerFactory, fields, terms, executorService); - expect(scannerFactory.newLimitedScanner(isA(Class.class), isA(String.class), isA(Set.class), isA(QueryImpl.class))).andReturn(scannerSession); + expect(scannerFactory.newLimitedScanner(isA(Class.class), isA(String.class), isA(Set.class), isA(QueryImpl.class), isA(String.class))) + .andReturn(scannerSession).anyTimes(); expect(scannerSession.setRanges(anyObject())).andReturn(scannerSession); expect(scannerSession.setOptions(anyObject())).andReturn(scannerSession); - expect(scannerSession.getOptions()).andAnswer(() -> { - return new SessionOptions(); - }).anyTimes(); + expect(scannerSession.getOptions()).andAnswer(SessionOptions::new).anyTimes(); // this is sort of contrived, but necessary to test that the cleanup of the batch scanner would actually happen expect(executorService.submit(isA(Callable.class))).andThrow(new RuntimeException("testing")); scannerSession.close(); @@ -88,9 +79,9 @@ public void submitErrorEnsureCloseTest() throws InvocationTargetException, NoSuc } @Test + @SuppressWarnings({"unchecked", "ConstantConditions"}) public void timeoutTest() throws InvocationTargetException, NoSuchMethodException, InstantiationException, IllegalAccessException { AnyFieldScanner scannerSession = createMock(AnyFieldScanner.class); - Future f = EasyMock.createMock(Future.class); ExecutorService s = Executors.newSingleThreadExecutor(); @@ -107,7 +98,8 @@ public void timeoutTest() throws InvocationTargetException, NoSuchMethodExceptio terms.add("lookMeUp"); lookup = new FieldNameIndexLookup(config, scannerFactory, fields, terms, s); - expect(scannerFactory.newLimitedScanner(isA(Class.class), isA(String.class), isA(Set.class), isA(QueryImpl.class))).andReturn(scannerSession); + expect(scannerFactory.newLimitedScanner(isA(Class.class), isA(String.class), isA(Set.class), isA(QueryImpl.class), isA(String.class))) + .andReturn(scannerSession); expect(scannerSession.setRanges(anyObject())).andReturn(scannerSession); expect(scannerSession.setOptions(anyObject())).andReturn(scannerSession); expect(scannerSession.getOptions()).andAnswer(SessionOptions::new).anyTimes(); diff --git a/warehouse/query-core/src/test/java/datawave/query/jexl/nodes/DefaultJexlNodeComparatorTest.java b/warehouse/query-core/src/test/java/datawave/query/jexl/nodes/DefaultJexlNodeComparatorTest.java new file mode 100644 index 00000000000..9cf025557aa --- /dev/null +++ b/warehouse/query-core/src/test/java/datawave/query/jexl/nodes/DefaultJexlNodeComparatorTest.java @@ -0,0 +1,49 @@ +package datawave.query.jexl.nodes; + +import org.junit.Test; + +public class DefaultJexlNodeComparatorTest extends NodeComparatorTestUtil { + + private final JexlNodeComparator comparator = new DefaultJexlNodeComparator(); + + @Test + public void testSortSameFieldDifferentValues() { + String query = "FOO == 'baz' || FOO == 'bar'"; + String expected = "FOO == 'bar' || FOO == 'baz'"; + drive(query, expected, comparator); + } + + @Test + public void testDifferentFieldSameValues() { + String query = "FOO_B == 'baz' || FOO_A == 'baz'"; + String expected = "FOO_A == 'baz' || FOO_B == 'baz'"; + drive(query, expected, comparator); + } + + @Test + public void testSortOrderWithNodePairs() { + // EQ before NE + String query = "FOO != 'bar' || FOO == 'bar'"; + String expected = "FOO == 'bar' || FOO != 'bar'"; + drive(query, expected, comparator); + } + + @Test + public void testSortSingleNodesBeforeJunctions() { + String query = "(FOO == 'bar' && FOO == 'baz') || FOO == 
'fizz'"; + String expected = "FOO == 'fizz' || (FOO == 'bar' && FOO == 'baz')"; + drive(query, expected, comparator); + + query = "(FOO == 'bar' || FOO == 'baz') && FOO == 'fizz'"; + expected = "FOO == 'fizz' && (FOO == 'bar' || FOO == 'baz')"; + drive(query, expected, comparator); + } + + @Test + public void testMarkersSortLast() { + String query = "B == '2' && ((_Value_ = true) && (A =~ 'ba.*')) && A == '1'"; + String expected = "A == '1' && B == '2' && ((_Value_ = true) && (A =~ 'ba.*'))"; + drive(query, expected, comparator); + } + +} diff --git a/warehouse/query-core/src/test/java/datawave/query/jexl/nodes/DefaultNodeCostComparatorTest.java b/warehouse/query-core/src/test/java/datawave/query/jexl/nodes/DefaultNodeCostComparatorTest.java deleted file mode 100644 index 279b8f6743f..00000000000 --- a/warehouse/query-core/src/test/java/datawave/query/jexl/nodes/DefaultNodeCostComparatorTest.java +++ /dev/null @@ -1,80 +0,0 @@ -package datawave.query.jexl.nodes; - -import static org.junit.Assert.assertEquals; - -import java.util.Iterator; -import java.util.LinkedList; -import java.util.List; - -import org.apache.commons.jexl3.parser.JexlNode; -import org.junit.Test; - -import datawave.query.jexl.JexlNodeFactory; - -public class DefaultNodeCostComparatorTest { - - @Test - public void testCompareTwoEq() { - JexlNode left = JexlNodeFactory.buildEQNode("FOO", "bar"); - JexlNode right = JexlNodeFactory.buildEQNode("FOO", "baz"); - - List nodes = new LinkedList<>(); - nodes.add(left); - nodes.add(right); - - Iterator iter = nodes.iterator(); - assertEquals(left, iter.next()); - assertEquals(right, iter.next()); - - nodes.sort(new DefaultNodeCostComparator()); - - // Order should not have changed - iter = nodes.iterator(); - assertEquals(left, iter.next()); - assertEquals(right, iter.next()); - } - - @Test - public void testCompareEqAndRe() { - JexlNode left = JexlNodeFactory.buildEQNode("FOO", "bar"); - JexlNode right = JexlNodeFactory.buildERNode("FOO", "baz.*"); - - List nodes = new LinkedList<>(); - nodes.add(right); - nodes.add(left); - - // Assert insert order - Iterator iter = nodes.iterator(); - assertEquals(right, iter.next()); - assertEquals(left, iter.next()); - - nodes.sort(new DefaultNodeCostComparator()); - - // Assert proper sort order, EQ before ER - iter = nodes.iterator(); - assertEquals(left, iter.next()); - assertEquals(right, iter.next()); - } - - @Test - public void testCompareEqAndFunction() { - JexlNode left = JexlNodeFactory.buildEQNode("FOO", "bar"); - JexlNode right = JexlNodeFactory.buildFunctionNode("content", "phrase", "FOO", "baz"); - - List nodes = new LinkedList<>(); - nodes.add(right); - nodes.add(left); - - // Assert insert order - Iterator iter = nodes.iterator(); - assertEquals(right, iter.next()); - assertEquals(left, iter.next()); - - nodes.sort(new DefaultNodeCostComparator()); - - // Assert proper sort order, EQ before ER - iter = nodes.iterator(); - assertEquals(left, iter.next()); - assertEquals(right, iter.next()); - } -} diff --git a/warehouse/query-core/src/test/java/datawave/query/jexl/nodes/FieldCostComparatorTest.java b/warehouse/query-core/src/test/java/datawave/query/jexl/nodes/FieldCostComparatorTest.java new file mode 100644 index 00000000000..ecc2002e07d --- /dev/null +++ b/warehouse/query-core/src/test/java/datawave/query/jexl/nodes/FieldCostComparatorTest.java @@ -0,0 +1,147 @@ +package datawave.query.jexl.nodes; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.fail; + +import 
org.apache.commons.jexl3.parser.ASTJexlScript; +import org.junit.jupiter.api.Test; + +import datawave.query.jexl.JexlASTHelper; +import datawave.query.jexl.visitors.JexlStringBuildingVisitor; +import datawave.query.jexl.visitors.order.OrderByCostVisitor; +import datawave.query.util.count.CountMap; + +/** + * Tests for the {@link FieldCostComparator} + */ +public class FieldCostComparatorTest extends NodeComparatorTestUtil { + + private CountMap counts; + private JexlNodeComparator comparator; + + // sort when all fields present in map + @Test + public void testAllFieldsHaveCardinality() { + String query = "F23 == '23' || F12 == '12'"; + String expected = "F12 == '12' || F23 == '23'"; + + drive(query, expected, getComparator()); + } + + // sort when some fields present in map + @Test + public void testSomeFieldsHaveCardinality() { + // F11 is not found in the count map, should get sorted to the right + String query = "F11 == '11' || F12 == '12'"; + String expected = "F12 == '12' || F11 == '11'"; + + drive(query, expected, getComparator()); + } + + // sort when no fields are present in map (default ordering) + @Test + public void testNoFieldsHaveCardinality() { + String query = "F2 == '2' || F1 == '1' || F2 == '1'"; + String expected = "F1 == '1' || F2 == '1' || F2 == '2'"; + + drive(query, expected, getComparator()); + } + + // sort with leaves and unions + + @Test + public void testJunctionSortsLeftOfHighCostLeaf() { + String query = "F45 == '45' && (F12 == '12' || F23 == '23')"; + String expected = "(F12 == '12' || F23 == '23') && F45 == '45'"; + drive(query, expected, getComparator()); + } + + @Test + public void testIntersectionSortsRightWithUniformCosts() { + // because intersections take the lowest cost, if a leaf joins with a junction + // and the leaf shares the lowest cost node in the junction, you get a tie + String query = "(F12 == '12' && F23 == '23') || F12 == '12'"; + String expected = "F12 == '12' || (F12 == '12' && F23 == '23')"; + drive(query, expected, getComparator()); + } + + // sort with leaves or junctions + + // sort with unions of variable sizes + @Test + public void testSortUnionsOfVariableSizeAndCost() { + // lower cardinality unions should sort first even if it has more terms + String query = "(F45 == '45' || F45 == '45') && (F12 == '12' || F12 == '12' || F12 == '12')"; + String expected = "(F12 == '12' || F12 == '12' || F12 == '12') && (F45 == '45' || F45 == '45')"; + drive(query, expected, getComparator()); + } + + // sort with intersections of variable sizes + @Test + public void testSortIntersectionsOfVariableSizeAndCost() { + // lower cardinality intersections should sort first even if it has more terms + String query = "(F45 == '45' && F45 == '45') || (F12 == '12' && F12 == '12' && F12 == '12')"; + String expected = "(F12 == '12' && F12 == '12' && F12 == '12') || (F45 == '45' && F45 == '45')"; + drive(query, expected, getComparator()); + } + + // test integer overflow with multiple negation nodes + @Test + public void testNestedUnionOfNegatedTermsSortsLast() { + String query = "(!(F == '1') || !(F == '1')) && F12 == '12'"; + String expected = "F12 == '12' && (!(F == '1') || !(F == '1'))"; + drive(query, expected, getComparator()); + + query = "(F != '1' || F != '1') && F12 == '12'"; + expected = "F12 == '12' && (F != '1' || F != '1')"; + drive(query, expected, getComparator()); + } + + // test integer overflow with multiple marker nodes + @Test + public void testAvoidIntegerOverFlowWithMultipleMarkerNodes() { + String query = "((_Value_ = true) && (F =~ 
'aa.*')) && ((_Value_ = true) && (F =~ 'bb.*')) && F == '2'"; + String expected = "F == '2' && ((_Value_ = true) && (F =~ 'aa.*')) && ((_Value_ = true) && (F =~ 'bb.*'))"; + drive(query, expected, getComparator()); + } + + /** + * Explicit override of test utility code so the {@link OrderByCostVisitor} can be run + * + * @param query + * the input query + * @param expected + * the expected query + * @param comparator + * the comparator + */ + @Override + public void drive(String query, String expected, JexlNodeComparator comparator) { + try { + ASTJexlScript script = JexlASTHelper.parseAndFlattenJexlQuery(query); + script = OrderByCostVisitor.orderByFieldCount(script, getCounts().getCounts()); + String ordered = JexlStringBuildingVisitor.buildQuery(script); + assertEquals(expected, ordered); + } catch (Exception e) { + fail("Failed to run test", e); + } + } + + private JexlNodeComparator getComparator() { + if (comparator == null) { + comparator = new FieldCostComparator(getCounts()); + } + return comparator; + } + + private CountMap getCounts() { + if (counts == null) { + counts = new CountMap(); + counts.put("F12", 12L); + counts.put("F23", 23L); + counts.put("F34", 34L); + counts.put("F45", 45L); + } + return counts; + } +} diff --git a/warehouse/query-core/src/test/java/datawave/query/jexl/nodes/JunctionComparatorTest.java b/warehouse/query-core/src/test/java/datawave/query/jexl/nodes/JunctionComparatorTest.java new file mode 100644 index 00000000000..6bb1561d140 --- /dev/null +++ b/warehouse/query-core/src/test/java/datawave/query/jexl/nodes/JunctionComparatorTest.java @@ -0,0 +1,190 @@ +package datawave.query.jexl.nodes; + +import org.junit.jupiter.api.Test; + +/** + * Tests for the {@link JunctionComparator} to verify that leaf nodes sort before junctions + */ +public class JunctionComparatorTest extends NodeComparatorTestUtil { + + private final JexlNodeComparator comparator = new JunctionComparator(); + + /** + * Test that asserts no changes to queries of the following types + *
<pre>
+ * A && B + *
</pre>
+ */ + @Test + public void testIntersectionOfLeafNodes() { + // @formatter:off + String[] queries = new String[] { + "F == '1' && F == '2'", // eq + "F != '1' && F == '2'", // ne + "F < '1' && F == '2'", // lt + "F > '1' && F == '2'", // gt + "F <= '1' && F == '2'", // le + "F >= '1' && F == '2'", // ge + "F =~ '1' && F == '2'", // er + "F !~ '1' && F == '2'", // nr + "!(F == '1') && F == '2'", // not + }; + // @formatter:on + + for (String query : queries) { + drive(query, query, comparator); + } + } + + // A || B + @Test + public void testUnionOfLeafNodes() { + // @formatter:off + String[] queries = new String[] { + "F == '1' || F == '2'", // eq + "F != '1' || F == '2'", // ne + "F < '1' || F == '2'", // lt + "F > '1' || F == '2'", // gt + "F <= '1' || F == '2'", // le + "F >= '1' || F == '2'", // ge + "F =~ '1' || F == '2'", // er + "F !~ '1' || F == '2'", // nr + "!(F == '1') || F == '2'", // not + }; + // @formatter:on + + for (String query : queries) { + drive(query, query, comparator); + } + } + + // A && (B || C) + @Test + public void testIntersectionWithNestedUnion() { + // first, assert queries with no change + // @formatter:off + String[] queries = new String[] { + "F == '1' && (F == '2' || F == '3')", // eq + "F != '1' && (F == '2' || F == '3')", // ne + "F < '1' && (F == '2' || F == '3')", // lt + "F > '1' && (F == '2' || F == '3')", // gt + "F <= '1' && (F == '2' || F == '3')", // le + "F >= '1' && (F == '2' || F == '3')", // ge + "F =~ '1' && (F == '2' || F == '3')", // er + "F !~ '1' && (F == '2' || F == '3')", // nr + "!(F == '1') && (F == '2' || F == '3')", // not + }; + // @formatter:on + + for (String query : queries) { + drive(query, query, comparator); + } + + // next, assert queries with change to sort order + // @formatter:off + String[][] sortable = new String[][] { + {"(F == '2' || F == '3') && F == '1'", "F == '1' && (F == '2' || F == '3')"}, // eq + {"(F == '2' || F == '3') && F != '1'", "F != '1' && (F == '2' || F == '3')"}, // ne + {"(F == '2' || F == '3') && F < '1'", "F < '1' && (F == '2' || F == '3')"}, // lt + {"(F == '2' || F == '3') && F > '1'", "F > '1' && (F == '2' || F == '3')"}, // gt + {"(F == '2' || F == '3') && F <= '1'", "F <= '1' && (F == '2' || F == '3')"}, // le + {"(F == '2' || F == '3') && F >= '1'", "F >= '1' && (F == '2' || F == '3')"}, // ge + {"(F == '2' || F == '3') && F =~ '1'", "F =~ '1' && (F == '2' || F == '3')"}, // er + {"(F == '2' || F == '3') && F !~ '1'", "F !~ '1' && (F == '2' || F == '3')"}, // nr + {"(F == '2' || F == '3') && !(F == '1')", "!(F == '1') && (F == '2' || F == '3')"} // not + }; + // @formatter:off + + for (String[] query : sortable) { + drive(query[0], query[1], comparator); + } + } + + // A || (B && C) + @Test + public void testUnionWithNestedIntersection() { + // first, assert queries with no change + // @formatter:off + String[] queries = new String[] { + "F == '1' || (F == '2' && F == '3')", // eq + "F != '1' || (F == '2' && F == '3')", // ne + "F < '1' || (F == '2' && F == '3')", // lt + "F > '1' || (F == '2' && F == '3')", // gt + "F <= '1' || (F == '2' && F == '3')", // le + "F >= '1' || (F == '2' && F == '3')", // ge + "F =~ '1' || (F == '2' && F == '3')", // er + "F !~ '1' || (F == '2' && F == '3')", // nr + "!(F == '1') || (F == '2' && F == '3')", // not + }; + // @formatter:on + + for (String query : queries) { + drive(query, query, comparator); + } + + // next, assert queries with change to sort order + // @formatter:off + String[][] sortable = new String[][] { + {"(F == '2' && F == '3') || F == 
'1'", "F == '1' || (F == '2' && F == '3')"}, // eq + {"(F == '2' && F == '3') || F != '1'", "F != '1' || (F == '2' && F == '3')"}, // ne + {"(F == '2' && F == '3') || F < '1'", "F < '1' || (F == '2' && F == '3')"}, // lt + {"(F == '2' && F == '3') || F > '1'", "F > '1' || (F == '2' && F == '3')"}, // gt + {"(F == '2' && F == '3') || F <= '1'", "F <= '1' || (F == '2' && F == '3')"}, // le + {"(F == '2' && F == '3') || F >= '1'", "F >= '1' || (F == '2' && F == '3')"}, // ge + {"(F == '2' && F == '3') || F =~ '1'", "F =~ '1' || (F == '2' && F == '3')"}, // er + {"(F == '2' && F == '3') || F !~ '1'", "F !~ '1' || (F == '2' && F == '3')"}, // nr + {"(F == '2' && F == '3') || !(F == '1')", "!(F == '1') || (F == '2' && F == '3')"} // not + }; + // @formatter:off + + for (String[] query : sortable) { + drive(query[0], query[1], comparator); + } + } + + // (A || B) && (C || D) + @Test + public void testIntersectionOfNestedUnions() { + // assert no changes + // @formatter:off + String[] queries = new String[] { + "(F == '1' || F == '2') && (F == '3' || F == '4')", // eq + "(F == '1' || F != '2') && (F != '3' || F == '4')", // ne + "(F == '1' || F < '2') && (F < '3' || F == '4')", // lt + "(F == '1' || F > '2') && (F > '3' || F == '4')", // gt + "(F == '1' || F <= '2') && (F <= '3' || F == '4')", // le + "(F == '1' || F >= '2') && (F >= '3' || F == '4')", // ge + "(F == '1' || F =~ '2') && (F =~ '3' || F == '4')", // er + "(F == '1' || F !~ '2') && (F !~ '3' || F == '4')", // nr + "(F == '1' || !(F == '2')) && (!(F == '3') || F == '4')", // not + }; + // @formatter:on + + for (String query : queries) { + drive(query, query, comparator); + } + } + + // (A && B) || (C && D) + @Test + public void testUnionOfNestedIntersections() { + // assert no changes + // @formatter:off + String[] queries = new String[] { + "(F == '1' && F == '2') || (F == '3' && F == '4')", // eq + "(F == '1' && F != '2') || (F != '3' && F == '4')", // ne + "(F == '1' && F < '2') || (F < '3' && F == '4')", // lt + "(F == '1' && F > '2') || (F > '3' && F == '4')", // gt + "(F == '1' && F <= '2') || (F <= '3' && F == '4')", // le + "(F == '1' && F >= '2') || (F >= '3' && F == '4')", // ge + "(F == '1' && F =~ '2') || (F =~ '3' && F == '4')", // er + "(F == '1' && F !~ '2') || (F !~ '3' && F == '4')", // nr + "(F == '1' && !(F == '2')) || (!(F == '3') && F == '4')", // not + }; + // @formatter:on + + for (String query : queries) { + drive(query, query, comparator); + } + } +} diff --git a/warehouse/query-core/src/test/java/datawave/query/jexl/nodes/LexicographicalNodeComparatorTest.java b/warehouse/query-core/src/test/java/datawave/query/jexl/nodes/LexicographicalNodeComparatorTest.java new file mode 100644 index 00000000000..22b168aa4cd --- /dev/null +++ b/warehouse/query-core/src/test/java/datawave/query/jexl/nodes/LexicographicalNodeComparatorTest.java @@ -0,0 +1,247 @@ +package datawave.query.jexl.nodes; + +import org.junit.jupiter.api.Test; + +/** + * Tests for the {@link LexicographicalNodeComparator} to verify expected sorts with different fields and values + */ +public class LexicographicalNodeComparatorTest extends NodeComparatorTestUtil { + + private final JexlNodeComparator comparator = new LexicographicalNodeComparator(); + + // same node, same field, same value + @Test + public void testSameNodeType_sameField_sameValue() { + // assert no changes + // @formatter:off + String[] queries = new String[] { + // intersections + "F == '1' && F == '1'", // eq + "F != '1' && F != '1'", // ne + "F < '1' && F < '1'", // lt + "F > '1' 
&& F > '1'", // gt + "F <= '1' && F <= '1'", // le + "F >= '1' && F >= '1'", // ge + "F =~ '1' && F =~ '1'", // er + "F !~ '1' && F !~ '1'", // nr + "!(F == '1') && !(F == '1')", // not + // unions + "F == '1' || F == '1'", // eq + "F != '1' || F != '1'", // ne + "F < '1' || F < '1'", // lt + "F > '1' || F > '1'", // gt + "F <= '1' || F <= '1'", // le + "F >= '1' || F >= '1'", // ge + "F =~ '1' || F =~ '1'", // er + "F !~ '1' || F !~ '1'", // nr + "!(F == '1') || !(F == '1')", // not + }; + // @formatter:on + + for (String query : queries) { + drive(query, query, comparator); + } + } + + // same node, same field, different values + @Test + public void testSameNodeType_sameField_differentValue() { + // different values, correct order, no change + // @formatter:off + String[] queries = new String[] { + // intersections + "F == '1' && F == '2'", // eq + "F != '1' && F != '2'", // ne + "F < '1' && F < '2'", // lt + "F > '1' && F > '2'", // gt + "F <= '1' && F <= '2'", // le + "F >= '1' && F >= '2'", // ge + "F =~ '1' && F =~ '2'", // er + "F !~ '1' && F !~ '2'", // nr + "!(F == '1') && !(F == '2')", // not + // unions + "F == '1' || F == '2'", // eq + "F != '1' || F != '2'", // ne + "F < '1' || F < '2'", // lt + "F > '1' || F > '2'", // gt + "F <= '1' || F <= '2'", // le + "F >= '1' || F >= '2'", // ge + "F =~ '1' || F =~ '2'", // er + "F !~ '1' || F !~ '2'", // nr + "!(F == '1') || !(F == '2')", // not + }; + // @formatter:on + + for (String query : queries) { + drive(query, query, comparator); + } + + // different values, incorrect order, expect changes + + // @formatter:off + String[][] sortable = new String[][] { + // intersections + {"F == '2' && F == '1'", "F == '1' && F == '2'"}, // eq + {"F != '2' && F != '1'", "F != '1' && F != '2'"}, // ne + {"F < '2' && F < '1'", "F < '1' && F < '2'"}, // lt + {"F > '2' && F > '1'", "F > '1' && F > '2'"}, // gt + {"F <= '2' && F <= '1'", "F <= '1' && F <= '2'"}, // le + {"F >= '2' && F >= '1'", "F >= '1' && F >= '2'"}, // ge + {"F =~ '2' && F =~ '1'", "F =~ '1' && F =~ '2'"}, // er + {"F !~ '2' && F !~ '1'", "F !~ '1' && F !~ '2'"}, // nr + {"!(F == '2') && !(F == '1')", "!(F == '1') && !(F == '2')"}, // not + // unions + {"F == '2' || F == '1'", "F == '1' || F == '2'"}, // eq + {"F != '2' || F != '1'", "F != '1' || F != '2'"}, // ne + {"F < '2' || F < '1'", "F < '1' || F < '2'"}, // lt + {"F > '2' || F > '1'", "F > '1' || F > '2'"}, // gt + {"F <= '2' || F <= '1'", "F <= '1' || F <= '2'"}, // le + {"F >= '2' || F >= '1'", "F >= '1' || F >= '2'"}, // ge + {"F =~ '2' || F =~ '1'", "F =~ '1' || F =~ '2'"}, // er + {"F !~ '2' || F !~ '1'", "F !~ '1' || F !~ '2'"}, // nr + {"!(F == '2') || !(F == '1')", "!(F == '1') || !(F == '2')"}, // not + }; + // @formatter:on + + for (String[] query : sortable) { + drive(query[0], query[1], comparator); + } + } + + // same node, different field, same values + @Test + public void testSameNodeType_differentField_sameValue() { + // different fields, correct order, no change + // @formatter:off + String[] queries = new String[] { + // intersections + "F1 == '1' && F2 == '1'", // eq + "F1 != '1' && F2 != '1'", // ne + "F1 < '1' && F2 < '1'", // lt + "F1 > '1' && F2 > '1'", // gt + "F1 <= '1' && F2 <= '1'", // le + "F1 >= '1' && F2 >= '1'", // ge + "F1 =~ '1' && F2 =~ '1'", // er + "F1 !~ '1' && F2 !~ '1'", // nr + "!(F1 == '1') && !(F2 == '1')", // not + // unions + "F1 == '1' || F2 == '1'", // eq + "F1 != '1' || F2 != '1'", // ne + "F1 < '1' || F2 < '1'", // lt + "F1 > '1' || F2 > '1'", // gt + "F1 <= '1' || F2 <= 
'1'", // le + "F1 >= '1' || F2 >= '1'", // ge + "F1 =~ '1' || F2 =~ '1'", // er + "F1 !~ '1' || F2 !~ '1'", // nr + "!(F1 == '1') || !(F2 == '1')", // not + }; + // @formatter:on + + for (String query : queries) { + drive(query, query, comparator); + } + + // different fields, incorrect order, expect changes + + // @formatter:off + String[][] sortable = new String[][] { + // intersections + {"F2 == '1' && F1 == '1'", "F1 == '1' && F2 == '1'"}, // eq + {"F2 != '1' && F1 != '1'", "F1 != '1' && F2 != '1'"}, // ne + {"F2 < '1' && F1 < '1'", "F1 < '1' && F2 < '1'"}, // lt + {"F2 > '1' && F1 > '1'", "F1 > '1' && F2 > '1'"}, // gt + {"F2 <= '1' && F1 <= '1'", "F1 <= '1' && F2 <= '1'"}, // le + {"F2 >= '1' && F1 >= '1'", "F1 >= '1' && F2 >= '1'"}, // ge + {"F2 =~ '1' && F1 =~ '1'", "F1 =~ '1' && F2 =~ '1'"}, // er + {"F2 !~ '1' && F1 !~ '1'", "F1 !~ '1' && F2 !~ '1'"}, // nr + {"!(F2 == '1') && !(F1 == '1')", "!(F1 == '1') && !(F2 == '1')"}, // not + // unions + {"F2 == '1' || F1 == '1'", "F1 == '1' || F2 == '1'"}, // eq + {"F2 != '1' || F1 != '1'", "F1 != '1' || F2 != '1'"}, // ne + {"F2 < '1' || F1 < '1'", "F1 < '1' || F2 < '1'"}, // lt + {"F2 > '1' || F1 > '1'", "F1 > '1' || F2 > '1'"}, // gt + {"F2 <= '1' || F1 <= '1'", "F1 <= '1' || F2 <= '1'"}, // le + {"F2 >= '1' || F1 >= '1'", "F1 >= '1' || F2 >= '1'"}, // ge + {"F2 =~ '1' || F1 =~ '1'", "F1 =~ '1' || F2 =~ '1'"}, // er + {"F2 !~ '1' || F1 !~ '1'", "F1 !~ '1' || F2 !~ '1'"}, // nr + {"!(F2 == '1') || !(F1 == '1')", "!(F1 == '1') || !(F2 == '1')"}, // not + }; + // @formatter:on + + for (String[] query : sortable) { + drive(query[0], query[1], comparator); + } + } + + // same node, different field, different values + @Test + public void testSameNodeType_differentField_differentValue() { + // different fields and values, correct order, no change + // @formatter:off + String[] queries = new String[] { + // intersections + "F1 == '1' && F2 == '1' && F2 == '2'", // eq + "F1 != '1' && F2 != '1' && F2 != '2'", // ne + "F1 < '1' && F2 < '1' && F2 < '2'", // lt + "F1 > '1' && F2 > '1' && F2 > '2'", // gt + "F1 <= '1' && F2 <= '1' && F2 <= '2'", // le + "F1 >= '1' && F2 >= '1' && F2 >= '2'", // ge + "F1 =~ '1' && F2 =~ '1' && F2 =~ '2'", // er + "F1 !~ '1' && F2 !~ '1' && F2 !~ '2'", // nr + "!(F1 == '1') && !(F2 == '1') && !(F2 == '2')", // not + // unions + "F1 == '1' || F2 == '1' || F2 == '2'", // eq + "F1 != '1' || F2 != '1' || F2 != '2'", // ne + "F1 < '1' || F2 < '1' || F2 < '2'", // lt + "F1 > '1' || F2 > '1' || F2 > '2'", // gt + "F1 <= '1' || F2 <= '1' || F2 <= '2'", // le + "F1 >= '1' || F2 >= '1' || F2 >= '2'", // ge + "F1 =~ '1' || F2 =~ '1' || F2 =~ '2'", // er + "F1 !~ '1' || F2 !~ '1' || F2 !~ '2'", // nr + "!(F1 == '1') || !(F2 == '1') || !(F2 == '2')", // not + }; + // @formatter:on + + for (String query : queries) { + drive(query, query, comparator); + } + + // different fields and values, incorrect order, change expected + // @formatter:off + String[][] sortable = new String[][] { + // intersections + {"F2 == '2' && F2 == '1' && F1 == '1'", "F1 == '1' && F2 == '1' && F2 == '2'"}, // eq + {"F2 != '2' && F2 != '1' && F1 != '1'", "F1 != '1' && F2 != '1' && F2 != '2'"}, // ne + {"F2 < '2' && F2 < '1' && F1 < '1'", "F1 < '1' && F2 < '1' && F2 < '2'"}, // lt + {"F2 > '2' && F2 > '1' && F1 > '1'", "F1 > '1' && F2 > '1' && F2 > '2'"}, // gt + {"F2 <= '2' && F2 <= '1' && F1 <= '1'", "F1 <= '1' && F2 <= '1' && F2 <= '2'"}, // le + {"F2 >= '2' && F2 >= '1' && F1 >= '1'", "F1 >= '1' && F2 >= '1' && F2 >= '2'"}, // ge + {"F2 =~ '2' && F2 =~ 
'1' && F1 =~ '1'", "F1 =~ '1' && F2 =~ '1' && F2 =~ '2'"}, // er + {"F2 !~ '2' && F2 !~ '1' && F1 !~ '1'", "F1 !~ '1' && F2 !~ '1' && F2 !~ '2'"}, // nr + {"!(F2 == '2') && !(F2 == '1') && !(F1 == '1')", "!(F1 == '1') && !(F2 == '1') && !(F2 == '2')"}, // not + // unions + {"F2 == '2' || F2 == '1' || F1 == '1'", "F1 == '1' || F2 == '1' || F2 == '2'"}, // eq + {"F2 != '2' || F2 != '1' || F1 != '1'", "F1 != '1' || F2 != '1' || F2 != '2'"}, // ne + {"F2 < '2' || F2 < '1' || F1 < '1'", "F1 < '1' || F2 < '1' || F2 < '2'"}, // lt + {"F2 > '2' || F2 > '1' || F1 > '1'", "F1 > '1' || F2 > '1' || F2 > '2'"}, // gt + {"F2 <= '2' || F2 <= '1' || F1 <= '1'", "F1 <= '1' || F2 <= '1' || F2 <= '2'"}, // le + {"F2 >= '2' || F2 >= '1' || F1 >= '1'", "F1 >= '1' || F2 >= '1' || F2 >= '2'"}, // ge + {"F2 =~ '2' || F2 =~ '1' || F1 =~ '1'", "F1 =~ '1' || F2 =~ '1' || F2 =~ '2'"}, // er + {"F2 !~ '2' || F2 !~ '1' || F1 !~ '1'", "F1 !~ '1' || F2 !~ '1' || F2 !~ '2'"}, // nr + {"!(F2 == '2') || !(F2 == '1') || !(F1 == '1')", "!(F1 == '1') || !(F2 == '1') || !(F2 == '2')"}, // not + }; + // @formatter:on + + for (String[] query : sortable) { + drive(query[0], query[1], comparator); + } + } + + @Test + public void testDemonstrateJunctionSortOrder() { + // this test case demonstrates why this visitor should only be used to break ties between two otherwise equivalent nodes + String query = "F == '1' && (F == '2' || F == '3')"; + String expected = "(F == '2' || F == '3') && F == '1'"; + drive(query, expected, comparator); + } +} diff --git a/warehouse/query-core/src/test/java/datawave/query/jexl/nodes/NodeComparatorTestUtil.java b/warehouse/query-core/src/test/java/datawave/query/jexl/nodes/NodeComparatorTestUtil.java new file mode 100644 index 00000000000..a5bc13cdeda --- /dev/null +++ b/warehouse/query-core/src/test/java/datawave/query/jexl/nodes/NodeComparatorTestUtil.java @@ -0,0 +1,57 @@ +package datawave.query.jexl.nodes; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; +import static org.junit.Assert.fail; + +import java.util.Arrays; + +import org.apache.commons.jexl3.parser.ASTAndNode; +import org.apache.commons.jexl3.parser.ASTJexlScript; +import org.apache.commons.jexl3.parser.ASTOrNode; +import org.apache.commons.jexl3.parser.JexlNode; +import org.apache.commons.jexl3.parser.JexlNodes; +import org.apache.commons.jexl3.parser.ParseException; + +import datawave.query.jexl.JexlASTHelper; +import datawave.query.jexl.visitors.JexlStringBuildingVisitor; + +/** + * Common test code for node comparator tests + */ +public class NodeComparatorTestUtil { + + /** + * Assumes the provided queries are either a union or an intersection + * + * @param query + * the input query + * @param expected + * the expected query + */ + public void drive(String query, String expected, JexlNodeComparator comparator) { + JexlNode[] queryChildren = parse(query); + Arrays.sort(queryChildren, comparator); + + JexlNode[] expectedChildren = parse(expected); + + assertEquals(expectedChildren.length, queryChildren.length); + for (int i = 0; i < expectedChildren.length; i++) { + String expectedChild = JexlStringBuildingVisitor.buildQuery(expectedChildren[i]); + String queryChild = JexlStringBuildingVisitor.buildQuery(queryChildren[i]); + assertEquals(expectedChild, queryChild); + } + } + + private JexlNode[] parse(String query) { + try { + ASTJexlScript script = JexlASTHelper.parseAndFlattenJexlQuery(query); + JexlNode node = script.jjtGetChild(0); + assertTrue(node instanceof ASTAndNode || node 
instanceof ASTOrNode); + return JexlNodes.getChildren(node); + } catch (ParseException e) { + fail("Failed test: " + query); + throw new RuntimeException("Failed test: " + query); + } + } +} diff --git a/warehouse/query-core/src/test/java/datawave/query/jexl/nodes/TermCostComparatorTest.java b/warehouse/query-core/src/test/java/datawave/query/jexl/nodes/TermCostComparatorTest.java new file mode 100644 index 00000000000..19a451b97e2 --- /dev/null +++ b/warehouse/query-core/src/test/java/datawave/query/jexl/nodes/TermCostComparatorTest.java @@ -0,0 +1,159 @@ +package datawave.query.jexl.nodes; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.fail; + +import org.apache.commons.jexl3.parser.ASTJexlScript; +import org.junit.jupiter.api.Test; + +import datawave.query.jexl.JexlASTHelper; +import datawave.query.jexl.visitors.JexlStringBuildingVisitor; +import datawave.query.jexl.visitors.order.OrderByCostVisitor; +import datawave.query.util.count.CountMap; + +public class TermCostComparatorTest extends NodeComparatorTestUtil { + + private CountMap counts; + private JexlNodeComparator comparator; + + // sort all terms have cardinality + @Test + public void testAllTermsHaveCardinality() { + String[][] queries = {{"F == '23' || F == '12'", "F == '12' || F == '23'"}, {"F == '23' && F == '12'", "F == '12' && F == '23'"},}; + + for (String[] query : queries) { + drive(query[0], query[1], getComparator()); + } + } + + // sort some terms have cardinality + @Test + public void testSomeTermsHaveCardinality() { + String[][] queries = {{"F == '0' || F == '12'", "F == '12' || F == '0'"}, {"F == '0' && F == '12'", "F == '12' && F == '0'"},}; + + for (String[] query : queries) { + drive(query[0], query[1], getComparator()); + } + } + + // sort no terms have cardinality (fallback) + @Test + public void testNoTermsHaveCardinality() { + String[][] queries = {{"F == '2' || F == '1'", "F == '1' || F == '2'"}, {"F == '2' && F == '1'", "F == '1' && F == '2'"},}; + + for (String[] query : queries) { + drive(query[0], query[1], getComparator()); + } + } + + // sort junctions all terms have cardinality + variable size + @Test + public void testJunctionsSortLeftOfHighCostLeaf() { + String[][] queries = {{"(F == '12' || F == '23') && F == '45'", "(F == '12' || F == '23') && F == '45'"}, + {"(F == '12' && F == '23') || F == '45'", "(F == '12' && F == '23') || F == '45'"}, + // sort order applied to nested junctions + {"(F == '23' || F == '12') && F == '45'", "(F == '12' || F == '23') && F == '45'"}, + {"(F == '23' && F == '12') || F == '45'", "(F == '12' && F == '23') || F == '45'"},}; + + for (String[] query : queries) { + drive(query[0], query[1], getComparator()); + } + } + + @Test + public void testJunctionSort() { + String[][] queries = { + // assert no change ordered nested junctions and ordered top level junction + {"(F == '12' || F == '23') && (F == '34' || F == '45')", "(F == '12' || F == '23') && (F == '34' || F == '45')"}, + {"(F == '12' && F == '23') || (F == '34' && F == '45')", "(F == '12' && F == '23') || (F == '34' && F == '45')"}, + // assert unordered nested junctions and ordered top level junctions + {"(F == '23' || F == '12') && (F == '45' || F == '34')", "(F == '12' || F == '23') && (F == '34' || F == '45')"}, + {"(F == '23' && F == '12') || (F == '45' && F == '34')", "(F == '12' && F == '23') || (F == '34' && F == '45')"}, + // assert ordered nested junctions and unordered top level junctions + {"(F == '34' || F == '45') && (F == '12' || F 
== '23')", "(F == '12' || F == '23') && (F == '34' || F == '45')"}, + {"(F == '34' && F == '45') || (F == '12' && F == '23')", "(F == '12' && F == '23') || (F == '34' && F == '45')"}, + // assert unordered nested junctions and unordered top level junctions + {"(F == '45' || F == '34') && (F == '23' || F == '12')", "(F == '12' || F == '23') && (F == '34' || F == '45')"}, + {"(F == '45' && F == '34') || (F == '23' && F == '12')", "(F == '12' && F == '23') || (F == '34' && F == '45')"},}; + + for (String[] query : queries) { + drive(query[0], query[1], getComparator()); + } + } + + @Test + public void testJunctionsOfVariableSize() { + String[][] queries = { + // ordered junctions + {"(F == '12' || F == '12' || F == '12') && (F == '34' || F == '45')", "(F == '12' || F == '12' || F == '12') && (F == '34' || F == '45')"}, + {"(F == '12' && F == '12' && F == '12') || (F == '34' && F == '45')", "(F == '12' && F == '12' && F == '12') || (F == '34' && F == '45')"}, + // unordered junctions + {"(F == '34' || F == '45') && (F == '12' || F == '12' || F == '12')", "(F == '12' || F == '12' || F == '12') && (F == '34' || F == '45')"}, + {"(F == '34' && F == '45') || (F == '12' && F == '12' && F == '12')", "(F == '12' && F == '12' && F == '12') || (F == '34' && F == '45')"},}; + + for (String[] query : queries) { + drive(query[0], query[1], getComparator()); + } + } + + // sort junctions partial cardinality + @Test + public void testJunctionsWithPartialCardinality() { + String[][] queries = {{"F == '1' || F == '23'", "F == '23' || F == '1'"}, {"F == '1' && F == '23'", "F == '23' && F == '1'"},}; + + for (String[] query : queries) { + drive(query[0], query[1], getComparator()); + } + } + + // sort junctions one side has cardinality the other does not + @Test + public void testSomeJunctionsHaveCardinality() { + String[][] queries = {{"(F == '1' || F == '2') && (F == '12' || F == '23')", "(F == '12' || F == '23') && (F == '1' || F == '2')"}, + {"(F == '1' && F == '2') || (F == '12' && F == '23')", "(F == '12' && F == '23') || (F == '1' && F == '2')"},}; + + for (String[] query : queries) { + drive(query[0], query[1], getComparator()); + } + } + + /** + * Explicit override of test utility code so the {@link OrderByCostVisitor} can be run + * + * @param query + * the input query + * @param expected + * the expected query + * @param comparator + * the comparator + */ + @Override + public void drive(String query, String expected, JexlNodeComparator comparator) { + try { + ASTJexlScript script = JexlASTHelper.parseAndFlattenJexlQuery(query); + script = OrderByCostVisitor.orderByTermCount(script, getCounts().getCounts()); + String ordered = JexlStringBuildingVisitor.buildQuery(script); + assertEquals(expected, ordered); + } catch (Exception e) { + fail("Failed to run test", e); + } + } + + private JexlNodeComparator getComparator() { + if (comparator == null) { + comparator = new TermCostComparator(getCounts()); + } + return comparator; + } + + private CountMap getCounts() { + if (counts == null) { + counts = new CountMap(); + counts.put("F == '12'", 12L); + counts.put("F == '23'", 23L); + counts.put("F == '34'", 34L); + counts.put("F == '45'", 45L); + } + return counts; + } +} diff --git a/warehouse/query-core/src/test/java/datawave/query/jexl/visitors/IsNotNullPruningVisitorTest.java b/warehouse/query-core/src/test/java/datawave/query/jexl/visitors/IsNotNullPruningVisitorTest.java index 4bda59efc5f..89058778946 100644 --- 
a/warehouse/query-core/src/test/java/datawave/query/jexl/visitors/IsNotNullPruningVisitorTest.java +++ b/warehouse/query-core/src/test/java/datawave/query/jexl/visitors/IsNotNullPruningVisitorTest.java @@ -493,13 +493,13 @@ public void testFutureCase_PartialPruneOfUnionViaUnion() { // union of same field should allow us to perform a partial prune String query = "(!(FOO == null) || !(FOO2 == null)) && (FOO == 'bar' || FOO == 'baz')"; - // String expected = "!(FOO2 == null) && (FOO == 'bar' || FOO == 'baz')"; - test(query, query); + String expected = "(FOO == 'bar' || FOO == 'baz')"; + test(query, expected); // should also work for filter:includeRegex query = "(!(FOO == null) || !(FOO2 == null)) && (filter:includeRegex(FOO, 'bar.*') || filter:includeRegex(FOO, 'baz.*'))"; - // expected = "!(FOO2 == null) && (filter:includeRegex(FOO, 'bar.*') || filter:includeRegex(FOO, 'baz.*'))"; - test(query, query); + expected = "(filter:includeRegex(FOO, 'bar.*') || filter:includeRegex(FOO, 'baz.*'))"; + test(query, expected); } // test cases where nothing should be done @@ -537,10 +537,12 @@ public void testNoOpCases() { // cannot prune half of a union query = "(!(FOO == null) || !(FOO2 == null)) && FOO == 'bar'"; - test(query, query); + String expected = "FOO == 'bar'"; + test(query, expected); query = "(!(FOO == null) || !(FOO2 == null)) && FOO =~ 'ba.*'"; - test(query, query); + expected = "FOO =~ 'ba.*'"; + test(query, expected); } @Test @@ -575,6 +577,22 @@ public void testNoOpQueryPropertyMarkers() { test(query, query); } + @Test + public void testPruningNestedUnionOfIsNotNullFunctions() { + // logically, these unions are equivalent and the 'is not null' side can be pruned + String query = "FOO == 'bar' && (!(FOO == null) || !(FOO2 == null) || !(FOO3 == null) || !(FOO4 == null))"; + String expected = "FOO == 'bar'"; + + test(query, expected); + } + + @Test + public void testPruningNestedUnionOfIsNotNullFunctions_Two() { + // in this case, since the FOO field is not in the union nothing will be pruned. 
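+ // FOO == 'bar' implies FOO is not null, but says nothing about FOO2 or FOO4, so this
+ // union is not redundant and must be kept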
+ String query = "FOO == 'bar' && (!(FOO2 == null) || !(FOO4 == null))"; + test(query, query); + } + private void test(String query, String expected) { try { ASTJexlScript script = JexlASTHelper.parseAndFlattenJexlQuery(query); diff --git a/warehouse/query-core/src/test/java/datawave/query/jexl/visitors/order/OrderByCostVisitorTest.java b/warehouse/query-core/src/test/java/datawave/query/jexl/visitors/order/OrderByCostVisitorTest.java index cde4e603d7b..88355eca03c 100644 --- a/warehouse/query-core/src/test/java/datawave/query/jexl/visitors/order/OrderByCostVisitorTest.java +++ b/warehouse/query-core/src/test/java/datawave/query/jexl/visitors/order/OrderByCostVisitorTest.java @@ -332,4 +332,19 @@ private Map getTermCountMap() { counts.put("F == '6'", 5L); // same counts for E and F return counts; } + + @Test + public void testCase() throws Exception { + Map counts = new HashMap<>(); + counts.put("FIELD_A", 23L); + counts.put("FIELD_B", 34L); + counts.put("FIELD_C", 45L); + + ASTJexlScript script = JexlASTHelper.parseAndFlattenJexlQuery("FIELD_C == 'v' || FIELD_B == 'v' || FIELD_A == 'v'"); + + OrderByCostVisitor.orderByFieldCount(script, counts); + + String ordered = JexlStringBuildingVisitor.buildQuery(script); + assertEquals("FIELD_A == 'v' || FIELD_B == 'v' || FIELD_C == 'v'", ordered); + } } diff --git a/warehouse/query-core/src/test/java/datawave/query/language/functions/jexl/NoExpansionTest.java b/warehouse/query-core/src/test/java/datawave/query/language/functions/jexl/NoExpansionTest.java new file mode 100644 index 00000000000..06dd0ff76ee --- /dev/null +++ b/warehouse/query-core/src/test/java/datawave/query/language/functions/jexl/NoExpansionTest.java @@ -0,0 +1,63 @@ +package datawave.query.language.functions.jexl; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertThrows; + +import java.util.List; + +import org.junit.Test; + +public class NoExpansionTest { + + /** + * Verify that {@link NoExpansion#validate()} throws an exception given an empty parameter list. + */ + @Test + public void testValidateWithEmptyParameters() { + NoExpansion noExpansion = new NoExpansion(); + noExpansion.setParameterList(List.of()); + Exception exception = assertThrows(IllegalArgumentException.class, noExpansion::validate); + assertEquals("datawave.webservice.query.exception.BadRequestQueryException: Invalid arguments to function. noExpansion requires at least one argument", + exception.getMessage()); + } + + /** + * Verify that {@link NoExpansion#validate()} does not throw an error for a single parameter. + */ + @Test + public void testValidateWithOneField() { + NoExpansion noExpansion = new NoExpansion(); + noExpansion.setParameterList(List.of("field1")); + noExpansion.validate(); + } + + /** + * Verify that {@link NoExpansion#validate()} does not throw an error for multiple parameters. 
+ */ + @Test + public void testValidateWithMultipleFields() { + NoExpansion noExpansion = new NoExpansion(); + noExpansion.setParameterList(List.of("field1", "field2", "field3")); + noExpansion.validate(); + } + + @Test + public void testToStringWithNoParameters() { + NoExpansion noExpansion = new NoExpansion(); + assertEquals("f:noExpansion()", noExpansion.toString()); + } + + @Test + public void testToStringWithOneParameter() { + NoExpansion noExpansion = new NoExpansion(); + noExpansion.setParameterList(List.of("field1")); + assertEquals("f:noExpansion('field1')", noExpansion.toString()); + } + + @Test + public void testToStringWithMultipleParameter() { + NoExpansion noExpansion = new NoExpansion(); + noExpansion.setParameterList(List.of("field1", "field2", "field3")); + assertEquals("f:noExpansion('field1','field2','field3')", noExpansion.toString()); + } +} diff --git a/warehouse/query-core/src/test/java/datawave/query/language/parser/jexl/TestLuceneToJexlParser.java b/warehouse/query-core/src/test/java/datawave/query/language/parser/jexl/TestLuceneToJexlParser.java index 4cce7c7bcdd..65d8c5111a9 100644 --- a/warehouse/query-core/src/test/java/datawave/query/language/parser/jexl/TestLuceneToJexlParser.java +++ b/warehouse/query-core/src/test/java/datawave/query/language/parser/jexl/TestLuceneToJexlParser.java @@ -57,7 +57,10 @@ public void test1() throws Exception { public void testParseFunction_NoExpansion() throws ParseException { LuceneToJexlQueryParser parser = getQueryParser(); QueryNode node = parser.parse("FIELD:SOMETHING AND #NOEXPANSION(FIELD)"); - Assert.assertEquals("FIELD == 'SOMETHING' && f:noExpansion(FIELD)", node.getOriginalQuery()); + Assert.assertEquals("FIELD == 'SOMETHING' && f:noExpansion('FIELD')", node.getOriginalQuery()); + + node = parser.parse("FIELD:SOMETHING AND #NOEXPANSION(FIELD1,FIELD2)"); + Assert.assertEquals("FIELD == 'SOMETHING' && f:noExpansion('FIELD1','FIELD2')", node.getOriginalQuery()); } @Test diff --git a/warehouse/query-core/src/test/java/datawave/query/predicate/ValueToAttributesTest.java b/warehouse/query-core/src/test/java/datawave/query/predicate/ValueToAttributesTest.java index 0dd9877d65e..2076fd2d60d 100644 --- a/warehouse/query-core/src/test/java/datawave/query/predicate/ValueToAttributesTest.java +++ b/warehouse/query-core/src/test/java/datawave/query/predicate/ValueToAttributesTest.java @@ -245,9 +245,9 @@ public void testComposites() { compositeMetadata.setCompositeFieldMappingByType(ingestType, "MAKE_COLOR", Arrays.asList("MAKE", "COLOR")); compositeMetadata.setCompositeFieldMappingByType(ingestType, "COLOR_WHEELS", Arrays.asList("MAKE", "COLOR")); } - TypeMetadata typeMetadata = new TypeMetadata( - "MAKE:[beep:datawave.data.type.LcNoDiacriticsType];MAKE_COLOR:[beep:datawave.data.type.NoOpType];START_DATE:[beep:datawave.data.type.DateType];TYPE_NOEVAL:[beep:datawave.data.type.LcNoDiacriticsType];IP_ADDR:[beep:datawave.data.type.IpAddressType];WHEELS:[beep:datawave.data.type.LcNoDiacriticsType,datawave.data.type.NumberType];COLOR:[beep:datawave.data.type.LcNoDiacriticsType];COLOR_WHEELS:[beep:datawave.data.type.NoOpType];TYPE:[beep:datawave.data.type.LcNoDiacriticsType]"); + "dts:[0:beep];types:[0:datawave.data.type.DateType,1:datawave.data.type.IpAddressType,2:datawave.data.type.LcNoDiacriticsType,3:datawave.data.type.NoOpType,4:datawave.data.type.NumberType];MAKE:[0:2];MAKE_COLOR:[0:3];START_DATE:[0:0];TYPE_NOEVAL:[0:2];IP_ADDR:[0:1];WHEELS:[0:2,0:4];COLOR:[0:2];COLOR_WHEELS:[0:3];TYPE:[0:2]"); + MarkingFunctions markingFunctions 
= new MarkingFunctions.Default(); ValueToAttributes valueToAttributes = new ValueToAttributes(compositeMetadata, typeMetadata, null, markingFunctions, true); } diff --git a/warehouse/query-core/src/test/java/datawave/query/scanner/LocalBatchScanner.java b/warehouse/query-core/src/test/java/datawave/query/scanner/LocalBatchScanner.java new file mode 100644 index 00000000000..88192ed8eec --- /dev/null +++ b/warehouse/query-core/src/test/java/datawave/query/scanner/LocalBatchScanner.java @@ -0,0 +1,153 @@ +package datawave.query.scanner; + +import java.io.IOException; +import java.util.AbstractMap; +import java.util.ArrayList; +import java.util.Collection; +import java.util.Collections; +import java.util.Iterator; +import java.util.List; +import java.util.Map; + +import org.apache.accumulo.core.client.BatchScanner; +import org.apache.accumulo.core.data.ArrayByteSequence; +import org.apache.accumulo.core.data.ByteSequence; +import org.apache.accumulo.core.data.Column; +import org.apache.accumulo.core.data.Key; +import org.apache.accumulo.core.data.Range; +import org.apache.accumulo.core.data.Value; +import org.apache.accumulo.core.iterators.IteratorEnvironment; +import org.apache.accumulo.core.iterators.IteratorUtil; +import org.apache.accumulo.core.iterators.SortedKeyValueIterator; +import org.apache.accumulo.core.iterators.WrappingIterator; +import org.apache.accumulo.core.iteratorsImpl.IteratorBuilder; +import org.apache.accumulo.core.iteratorsImpl.IteratorConfigUtil; +import org.apache.accumulo.core.security.Authorizations; + +import datawave.query.iterator.SortedListKeyValueIterator; +import datawave.query.tables.SessionOptions; + +public class LocalBatchScanner extends SessionOptions implements BatchScanner { + private SortedListKeyValueIterator itr; + private Collection ranges; + private boolean statsEnabled = false; + private StatsIterator statsIterator; + + public LocalBatchScanner(SortedListKeyValueIterator itr) { + this(itr, false); + } + + public LocalBatchScanner(SortedListKeyValueIterator itr, boolean statsEnabled) { + this.itr = itr; + this.statsEnabled = statsEnabled; + } + + public long getNextCount() { + return statsIterator == null ? -1 : statsIterator.getNextCount(); + } + + public long getSeekCount() { + return statsIterator == null ? 
-1 : statsIterator.getSeekCount(); + } + + @Override + public Iterator> iterator() { + Collections.sort(serverSideIteratorList, (o1, o2) -> { + if (o1.priority < o2.priority) { + return -1; + } else if (o1.priority > o2.priority) { + return 1; + } else { + return 0; + } + }); + + SortedKeyValueIterator base = this.itr; + IteratorEnvironment env = new LocalIteratorEnvironment(); + + if (statsEnabled) { + statsIterator = new StatsIterator(); + try { + statsIterator.init(base, Collections.emptyMap(), env); + base = statsIterator; + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + IteratorBuilder iteratorBuilder = IteratorBuilder.builder(serverSideIteratorList).opts(serverSideIteratorOptions).env(env).build(); + + List> list = new ArrayList<>(); + try { + SortedKeyValueIterator created = IteratorConfigUtil.loadIterators(base, iteratorBuilder); + List columns = new ArrayList<>(); + for (Column c : fetchedColumns) { + columns.add(new ArrayByteSequence(c.columnFamily)); + } + + for (Range range : ranges) { + created.seek(range, columns, true); + while (created.hasTop()) { + list.add(new AbstractMap.SimpleImmutableEntry<>(created.getTopKey(), created.getTopValue())); + created.next(); + } + } + } catch (IOException e) { + throw new RuntimeException(e); + } + + return list.iterator(); + } + + @Override + public void setRanges(Collection ranges) { + this.ranges = ranges; + } + + public static class LocalIteratorEnvironment implements IteratorEnvironment { + @Override + public IteratorUtil.IteratorScope getIteratorScope() { + return IteratorUtil.IteratorScope.scan; + } + + @Override + public boolean isUserCompaction() { + return false; + } + + @Override + public boolean isFullMajorCompaction() { + return false; + } + + @Override + public Authorizations getAuthorizations() { + return new Authorizations(); + } + } + + public static class StatsIterator extends WrappingIterator { + private long nextCount = 0; + private long seekCount = 0; + + @Override + public void next() throws IOException { + super.next(); + nextCount++; + } + + @Override + public void seek(Range range, Collection columnFamilies, boolean inclusive) throws IOException { + super.seek(range, columnFamilies, inclusive); + seekCount++; + } + + public long getNextCount() { + return nextCount; + } + + public long getSeekCount() { + return seekCount; + } + } +} diff --git a/warehouse/query-core/src/test/java/datawave/query/transformer/NoExpansionTests.java b/warehouse/query-core/src/test/java/datawave/query/transformer/NoExpansionTests.java index 340379642d6..d36f1c70ee5 100644 --- a/warehouse/query-core/src/test/java/datawave/query/transformer/NoExpansionTests.java +++ b/warehouse/query-core/src/test/java/datawave/query/transformer/NoExpansionTests.java @@ -149,7 +149,7 @@ private void runTestQuery() throws Exception { // order of terms in planned script is arbitrary, fall back to comparing the jexl trees ASTJexlScript plannedScript = JexlASTHelper.parseJexlQuery(plan); ASTJexlScript expectedScript = JexlASTHelper.parseJexlQuery(this.expectedPlan); - JexlNodeAssert.assertThat(expectedScript).isEqualTo(plannedScript); + JexlNodeAssert.assertThat(plannedScript).isEqualTo(expectedScript); } private AccumuloClient createClient() throws Exception { @@ -179,8 +179,8 @@ private void givenExpectedPlan(String expectedPlan) { */ @Test public void testDefaultQueryModelExpansion() throws Exception { - givenQuery("COLOR == 'blue'"); - givenExpectedPlan("(COLOR == 'blue' || HUE == 'blue')"); + givenQuery("COLOR == 'blue' && FASTENER 
== 'bolt'"); + givenExpectedPlan("(COLOR == 'blue' || HUE == 'blue') && (FASTENER == 'bolt' || FIXTURE == 'bolt')"); runTestQuery(); } @@ -196,6 +196,17 @@ public void testNoExpansionViaFunction() throws Exception { runTestQuery(); } + /** + * Verify that when #NO_EXPANSION is specified in the query string itself with multiple fields, expansion does not occur. + */ + @Test + public void testNoExpansionViaFunctionWithMultipleFields() throws Exception { + givenQuery("COLOR == 'blue' && FASTENER == 'bolt' && f:noExpansion(COLOR,FASTENER)"); + givenExpectedPlan("COLOR == 'blue' && FASTENER == 'bolt'"); + + runTestQuery(); + } + /** * Verify that when #NO_EXPANSION is specified via the query parameters, expansion does not occur. */ @@ -208,6 +219,18 @@ public void testNoExpansionViaQueryParameters() throws Exception { runTestQuery(); } + /** + * Verify that when #NO_EXPANSION is specified via the query parameters, expansion does not occur. + */ + @Test + public void testNoExpansionViaQueryParametersWithMultipleFields() throws Exception { + givenQuery("COLOR == 'blue' && FASTENER == 'bolt'"); + givenQueryParameter(QueryParameters.NO_EXPANSION_FIELDS, "COLOR,FASTENER"); + givenExpectedPlan("COLOR == 'blue' && FASTENER == 'bolt'"); + + runTestQuery(); + } + /** * Verify that when #NO_EXPANSION is specified in the query string itself and in query parameters, expansion does not occur. */ diff --git a/warehouse/query-core/src/test/java/datawave/query/util/VisibilityWiseGuysIngestWithModel.java b/warehouse/query-core/src/test/java/datawave/query/util/VisibilityWiseGuysIngestWithModel.java index d38f646e984..0e0170ac4e7 100644 --- a/warehouse/query-core/src/test/java/datawave/query/util/VisibilityWiseGuysIngestWithModel.java +++ b/warehouse/query-core/src/test/java/datawave/query/util/VisibilityWiseGuysIngestWithModel.java @@ -776,6 +776,24 @@ public static void writeItAll(AccumuloClient client, WhatKindaRange range) throw mutation.put(ColumnFamilyConstants.COLF_T, new Text(datatype + "\u0000" + lcNoDiacriticsType.getClass().getName()), emptyValue); bw.addMutation(mutation); + // for testing #NOEXPANSION function + mutation = new Mutation("FASTENER"); + mutation.put(ColumnFamilyConstants.COLF_E, new Text(datatype), emptyValue); + mutation.put(ColumnFamilyConstants.COLF_F, new Text(datatype + "\u0000" + date), new Value(SummingCombiner.VAR_LEN_ENCODER.encode(10L))); + mutation.put(ColumnFamilyConstants.COLF_I, new Text(datatype), emptyValue); + mutation.put(ColumnFamilyConstants.COLF_RI, new Text(datatype), emptyValue); + mutation.put(ColumnFamilyConstants.COLF_T, new Text(datatype + "\u0000" + lcNoDiacriticsType.getClass().getName()), emptyValue); + bw.addMutation(mutation); + + // for testing #NOEXPANSION function + mutation = new Mutation("FIXTURE"); + mutation.put(ColumnFamilyConstants.COLF_E, new Text(datatype), emptyValue); + mutation.put(ColumnFamilyConstants.COLF_F, new Text(datatype + "\u0000" + date), new Value(SummingCombiner.VAR_LEN_ENCODER.encode(10L))); + mutation.put(ColumnFamilyConstants.COLF_I, new Text(datatype), emptyValue); + mutation.put(ColumnFamilyConstants.COLF_RI, new Text(datatype), emptyValue); + mutation.put(ColumnFamilyConstants.COLF_T, new Text(datatype + "\u0000" + lcNoDiacriticsType.getClass().getName()), emptyValue); + bw.addMutation(mutation); + } finally { if (null != bw) { bw.close(); @@ -822,6 +840,12 @@ public static void writeItAll(AccumuloClient client, WhatKindaRange range) throw mutation.put("DATAWAVE", "HUE" + "\u0000" + "forward", columnVisibility, timeStamp, 
emptyValue); bw.addMutation(mutation); + // specifically for testing the #NOEXPANSION function + mutation = new Mutation("FASTENER"); + mutation.put("DATAWAVE", "FASTENER" + "\u0000" + "forward", columnVisibility, timeStamp, emptyValue); + mutation.put("DATAWAVE", "FIXTURE" + "\u0000" + "forward", columnVisibility, timeStamp, emptyValue); + bw.addMutation(mutation); + } finally { if (null != bw) { bw.close(); diff --git a/warehouse/query-core/src/test/resources/datawave/query/EventQueryLogicFactory.xml b/warehouse/query-core/src/test/resources/datawave/query/EventQueryLogicFactory.xml index 7f499a7aea2..f652a4731b5 100644 --- a/warehouse/query-core/src/test/resources/datawave/query/EventQueryLogicFactory.xml +++ b/warehouse/query-core/src/test/resources/datawave/query/EventQueryLogicFactory.xml @@ -128,6 +128,11 @@ + + + + +
diff --git a/warehouse/query-core/src/test/resources/datawave/query/QueryLogicFactory.xml b/warehouse/query-core/src/test/resources/datawave/query/QueryLogicFactory.xml index 5bc61292091..7c043448b9e 100644 --- a/warehouse/query-core/src/test/resources/datawave/query/QueryLogicFactory.xml +++ b/warehouse/query-core/src/test/resources/datawave/query/QueryLogicFactory.xml @@ -240,9 +240,16 @@ + + + + + + + @@ -274,12 +281,14 @@ - - - - + + + + + + diff --git a/warehouse/regression-testing/pom.xml b/warehouse/regression-testing/pom.xml index fefdcf4e6de..11221505791 100644 --- a/warehouse/regression-testing/pom.xml +++ b/warehouse/regression-testing/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave datawave-warehouse-parent - 7.8.0-SNAPSHOT + 7.10.0-SNAPSHOT datawave-regression-testing ${project.artifactId} diff --git a/warehouse/ssdeep-common/pom.xml b/warehouse/ssdeep-common/pom.xml index 19074894fd0..07eeac4bed1 100644 --- a/warehouse/ssdeep-common/pom.xml +++ b/warehouse/ssdeep-common/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave datawave-warehouse-parent - 7.8.0-SNAPSHOT + 7.10.0-SNAPSHOT datawave-ssdeep-common diff --git a/web-services/accumulo/pom.xml b/web-services/accumulo/pom.xml index d6927f34c7b..e758047c803 100644 --- a/web-services/accumulo/pom.xml +++ b/web-services/accumulo/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave.webservices datawave-ws-parent - 7.8.0-SNAPSHOT + 7.10.0-SNAPSHOT datawave-ws-accumulo ejb diff --git a/web-services/atom/pom.xml b/web-services/atom/pom.xml index ae45473b753..5a6d1438d84 100644 --- a/web-services/atom/pom.xml +++ b/web-services/atom/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave.webservices datawave-ws-parent - 7.8.0-SNAPSHOT + 7.10.0-SNAPSHOT datawave-ws-atom ejb diff --git a/web-services/cached-results/pom.xml b/web-services/cached-results/pom.xml index 636328595e0..66ee61b3cac 100644 --- a/web-services/cached-results/pom.xml +++ b/web-services/cached-results/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave.webservices datawave-ws-parent - 7.8.0-SNAPSHOT + 7.10.0-SNAPSHOT datawave-ws-cached-results ejb diff --git a/web-services/client/pom.xml b/web-services/client/pom.xml index b4091f7af58..51b4424232e 100644 --- a/web-services/client/pom.xml +++ b/web-services/client/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave.webservices datawave-ws-parent - 7.8.0-SNAPSHOT + 7.10.0-SNAPSHOT datawave-ws-client jar diff --git a/web-services/common-util/pom.xml b/web-services/common-util/pom.xml index fc514ac97d8..f23de252dbc 100644 --- a/web-services/common-util/pom.xml +++ b/web-services/common-util/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave.webservices datawave-ws-parent - 7.8.0-SNAPSHOT + 7.10.0-SNAPSHOT datawave-ws-common-util jar diff --git a/web-services/common/pom.xml b/web-services/common/pom.xml index a36891faded..d24ffa01414 100644 --- a/web-services/common/pom.xml +++ b/web-services/common/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave.webservices datawave-ws-parent - 7.8.0-SNAPSHOT + 7.10.0-SNAPSHOT datawave-ws-common ejb diff --git a/web-services/deploy/application/pom.xml b/web-services/deploy/application/pom.xml index ac69264e886..6a34f6a245c 100644 --- a/web-services/deploy/application/pom.xml +++ b/web-services/deploy/application/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave.webservices datawave-ws-deploy-parent - 7.8.0-SNAPSHOT + 7.10.0-SNAPSHOT datawave-ws-deploy-application ear diff --git a/web-services/deploy/configuration/pom.xml b/web-services/deploy/configuration/pom.xml index c91491e62cd..797fd1b27ea 100644 --- a/web-services/deploy/configuration/pom.xml +++ 
b/web-services/deploy/configuration/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave.webservices datawave-ws-deploy-parent - 7.8.0-SNAPSHOT + 7.10.0-SNAPSHOT datawave-ws-deploy-configuration jar diff --git a/web-services/deploy/configuration/src/main/resources/datawave/mapreduce/MapReduceJobs.xml b/web-services/deploy/configuration/src/main/resources/datawave/mapreduce/MapReduceJobs.xml index b3a04ec7bb6..3817a39f851 100644 --- a/web-services/deploy/configuration/src/main/resources/datawave/mapreduce/MapReduceJobs.xml +++ b/web-services/deploy/configuration/src/main/resources/datawave/mapreduce/MapReduceJobs.xml @@ -15,7 +15,7 @@ - org.apache.accumulo.core.client.mapreduce.AccumuloInputFormat + org.apache.accumulo.hadoop.mapreduce.AccumuloInputFormat datawave.mr.bulk.BulkInputFormat diff --git a/web-services/deploy/docs/pom.xml b/web-services/deploy/docs/pom.xml index 49a2c2cf4b6..a71cd681b12 100644 --- a/web-services/deploy/docs/pom.xml +++ b/web-services/deploy/docs/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave.webservices datawave-ws-deploy-parent - 7.8.0-SNAPSHOT + 7.10.0-SNAPSHOT datawave-ws-deploy-docs war diff --git a/web-services/deploy/pom.xml b/web-services/deploy/pom.xml index cdcd96acaa1..3da61758242 100644 --- a/web-services/deploy/pom.xml +++ b/web-services/deploy/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave.webservices datawave-ws-parent - 7.8.0-SNAPSHOT + 7.10.0-SNAPSHOT gov.nsa.datawave.webservices datawave-ws-deploy-parent diff --git a/web-services/deploy/spring-framework-integration/pom.xml b/web-services/deploy/spring-framework-integration/pom.xml index 931e5d6c0d4..06f92ee16a3 100644 --- a/web-services/deploy/spring-framework-integration/pom.xml +++ b/web-services/deploy/spring-framework-integration/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave.webservices datawave-ws-deploy-parent - 7.8.0-SNAPSHOT + 7.10.0-SNAPSHOT spring-framework-integration ${project.artifactId} diff --git a/web-services/dictionary/pom.xml b/web-services/dictionary/pom.xml index 999143bbd1a..ee68c0f014d 100644 --- a/web-services/dictionary/pom.xml +++ b/web-services/dictionary/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave.webservices datawave-ws-parent - 7.8.0-SNAPSHOT + 7.10.0-SNAPSHOT datawave-ws-dictionary ejb diff --git a/web-services/examples/client-login/pom.xml b/web-services/examples/client-login/pom.xml index cd35b0e1131..68ae6588177 100644 --- a/web-services/examples/client-login/pom.xml +++ b/web-services/examples/client-login/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave.webservices datawave-ws-examples-parent - 7.8.0-SNAPSHOT + 7.10.0-SNAPSHOT datawave-ws-examples-client-login ejb diff --git a/web-services/examples/http-client/pom.xml b/web-services/examples/http-client/pom.xml index 22cc04dc8ec..3e9f688b247 100644 --- a/web-services/examples/http-client/pom.xml +++ b/web-services/examples/http-client/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave.webservices datawave-ws-examples-parent - 7.8.0-SNAPSHOT + 7.10.0-SNAPSHOT datawave-ws-examples-http-client jar diff --git a/web-services/examples/jms-client/pom.xml b/web-services/examples/jms-client/pom.xml index 7982a93033b..ab89a13d188 100644 --- a/web-services/examples/jms-client/pom.xml +++ b/web-services/examples/jms-client/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave.webservices datawave-ws-examples-parent - 7.8.0-SNAPSHOT + 7.10.0-SNAPSHOT datawave-ws-examples-jms-client jar diff --git a/web-services/examples/pom.xml b/web-services/examples/pom.xml index ee55c13d70a..52dd744a8cf 100644 --- a/web-services/examples/pom.xml +++ b/web-services/examples/pom.xml @@ -4,7 +4,7 @@ 
gov.nsa.datawave.webservices datawave-ws-parent - 7.8.0-SNAPSHOT + 7.10.0-SNAPSHOT datawave-ws-examples-parent pom diff --git a/web-services/examples/query-war/pom.xml b/web-services/examples/query-war/pom.xml index 91afc637a2f..590610440b4 100644 --- a/web-services/examples/query-war/pom.xml +++ b/web-services/examples/query-war/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave.webservices datawave-ws-examples-parent - 7.8.0-SNAPSHOT + 7.10.0-SNAPSHOT datawave-ws-examples-query-war war diff --git a/web-services/map-reduce-embedded/pom.xml b/web-services/map-reduce-embedded/pom.xml index ae380189c94..58ef7b22d09 100644 --- a/web-services/map-reduce-embedded/pom.xml +++ b/web-services/map-reduce-embedded/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave.webservices datawave-ws-parent - 7.8.0-SNAPSHOT + 7.10.0-SNAPSHOT datawave-ws-map-reduce-embedded jar diff --git a/web-services/map-reduce-status/pom.xml b/web-services/map-reduce-status/pom.xml index 5579fd6bea0..56360db510f 100644 --- a/web-services/map-reduce-status/pom.xml +++ b/web-services/map-reduce-status/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave.webservices datawave-ws-parent - 7.8.0-SNAPSHOT + 7.10.0-SNAPSHOT datawave-ws-map-reduce-status ejb diff --git a/web-services/map-reduce/pom.xml b/web-services/map-reduce/pom.xml index a201ec7642a..005e04c1ac1 100644 --- a/web-services/map-reduce/pom.xml +++ b/web-services/map-reduce/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave.webservices datawave-ws-parent - 7.8.0-SNAPSHOT + 7.10.0-SNAPSHOT datawave-ws-map-reduce ejb diff --git a/web-services/map-reduce/src/main/java/datawave/webservice/mr/MapReduceBean.java b/web-services/map-reduce/src/main/java/datawave/webservice/mr/MapReduceBean.java index 4fa9026af48..d0943b3d0d4 100644 --- a/web-services/map-reduce/src/main/java/datawave/webservice/mr/MapReduceBean.java +++ b/web-services/map-reduce/src/main/java/datawave/webservice/mr/MapReduceBean.java @@ -43,6 +43,7 @@ import javax.ws.rs.core.MultivaluedMap; import javax.ws.rs.core.StreamingOutput; +import org.apache.accumulo.core.client.mapred.AccumuloInputFormat; import org.apache.commons.compress.archivers.tar.TarArchiveEntry; import org.apache.commons.compress.archivers.tar.TarArchiveOutputStream; import org.apache.commons.lang.StringUtils; diff --git a/web-services/metrics/pom.xml b/web-services/metrics/pom.xml index d15422cfa23..6529d46e18c 100644 --- a/web-services/metrics/pom.xml +++ b/web-services/metrics/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave.webservices datawave-ws-parent - 7.8.0-SNAPSHOT + 7.10.0-SNAPSHOT datawave-ws-metrics ejb diff --git a/web-services/model/pom.xml b/web-services/model/pom.xml index e7d79aecc5c..68aecdcb411 100644 --- a/web-services/model/pom.xml +++ b/web-services/model/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave.webservices datawave-ws-parent - 7.8.0-SNAPSHOT + 7.10.0-SNAPSHOT datawave-ws-model ejb diff --git a/web-services/modification/pom.xml b/web-services/modification/pom.xml index 611fa223f84..f8e65640a17 100644 --- a/web-services/modification/pom.xml +++ b/web-services/modification/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave.webservices datawave-ws-parent - 7.8.0-SNAPSHOT + 7.10.0-SNAPSHOT datawave-ws-modification ejb diff --git a/web-services/pom.xml b/web-services/pom.xml index fd19dd7481e..f0cc839a5ae 100644 --- a/web-services/pom.xml +++ b/web-services/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave datawave-parent - 7.8.0-SNAPSHOT + 7.10.0-SNAPSHOT gov.nsa.datawave.webservices datawave-ws-parent diff --git a/web-services/query-websocket/pom.xml b/web-services/query-websocket/pom.xml 
index 2a2ea6160e1..c4c0a3cd1ec 100644 --- a/web-services/query-websocket/pom.xml +++ b/web-services/query-websocket/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave.webservices datawave-ws-parent - 7.8.0-SNAPSHOT + 7.10.0-SNAPSHOT datawave-ws-query-websocket war diff --git a/web-services/query/pom.xml b/web-services/query/pom.xml index 7d16ac95865..2d9a030e525 100644 --- a/web-services/query/pom.xml +++ b/web-services/query/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave.webservices datawave-ws-parent - 7.8.0-SNAPSHOT + 7.10.0-SNAPSHOT datawave-ws-query ejb diff --git a/web-services/query/src/main/java/datawave/webservice/query/runner/QueryExecutorBean.java b/web-services/query/src/main/java/datawave/webservice/query/runner/QueryExecutorBean.java index 4044facb91e..ddb81dc9a7d 100644 --- a/web-services/query/src/main/java/datawave/webservice/query/runner/QueryExecutorBean.java +++ b/web-services/query/src/main/java/datawave/webservice/query/runner/QueryExecutorBean.java @@ -2997,6 +2997,26 @@ private void updateQueryParams(Query q, String queryLogicName, String query, Dat } } + /** + * @param queryLogicName + * the logic name + * @param queryParameters + * the query parameters + * @return the generic response + */ + @POST + @Produces({"application/xml", "text/xml", "application/json", "text/yaml", "text/x-yaml", "application/x-yaml", "application/x-protobuf", + "application/x-protostuff"}) + @Path("/{logicName}/validate") + @Interceptors({RequiredInterceptor.class, ResponseInterceptor.class}) + @Timed(name = "dw.query.validateQuery", absolute = true) + public GenericResponse validateQuery(@Required("logicName") @PathParam("logicName") String queryLogicName, + MultivaluedMap queryParameters) { + GenericResponse response = new GenericResponse<>(); + response.setMessages(Collections.singletonList("Query validator coming soon.")); + throw new DatawaveWebApplicationException(new UnsupportedOperationException("Query validator not implemented"), response, 501); + } + /** * Administrator credentials required. Returns list of queries for some other user * diff --git a/web-services/rest-api/pom.xml b/web-services/rest-api/pom.xml index be7f17b8469..6e220903426 100644 --- a/web-services/rest-api/pom.xml +++ b/web-services/rest-api/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave.webservices datawave-ws-parent - 7.8.0-SNAPSHOT + 7.10.0-SNAPSHOT datawave-ws-rest-api war diff --git a/web-services/security/pom.xml b/web-services/security/pom.xml index 706d67db7dc..6905ddf5777 100644 --- a/web-services/security/pom.xml +++ b/web-services/security/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave.webservices datawave-ws-parent - 7.8.0-SNAPSHOT + 7.10.0-SNAPSHOT datawave-ws-security ejb diff --git a/web-services/web-root/pom.xml b/web-services/web-root/pom.xml index 4cbc8aabc20..b4ef7bc57db 100644 --- a/web-services/web-root/pom.xml +++ b/web-services/web-root/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave.webservices datawave-ws-parent - 7.8.0-SNAPSHOT + 7.10.0-SNAPSHOT datawave-ws-web-root war
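For reference, the reordering behavior exercised by the comparator and visitor tests in this patch can also be driven directly. A minimal sketch, assuming only the classes and signatures already used in those tests (JexlASTHelper.parseAndFlattenJexlQuery, OrderByCostVisitor.orderByFieldCount, JexlStringBuildingVisitor.buildQuery); the wrapper class name is hypothetical:

import java.util.HashMap;
import java.util.Map;

import org.apache.commons.jexl3.parser.ASTJexlScript;

import datawave.query.jexl.JexlASTHelper;
import datawave.query.jexl.visitors.JexlStringBuildingVisitor;
import datawave.query.jexl.visitors.order.OrderByCostVisitor;

public class OrderByFieldCountExample {
    public static void main(String[] args) throws Exception {
        // field cardinalities: lower counts are considered cheaper and sort first
        Map<String, Long> counts = new HashMap<>();
        counts.put("FIELD_A", 23L);
        counts.put("FIELD_B", 34L);
        counts.put("FIELD_C", 45L);

        ASTJexlScript script = JexlASTHelper.parseAndFlattenJexlQuery("FIELD_C == 'v' || FIELD_B == 'v' || FIELD_A == 'v'");
        script = OrderByCostVisitor.orderByFieldCount(script, counts);

        // expected: FIELD_A == 'v' || FIELD_B == 'v' || FIELD_C == 'v'
        System.out.println(JexlStringBuildingVisitor.buildQuery(script));
    }
}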