diff --git a/.build/build-accord.xml b/.build/build-accord.xml
new file mode 100644
index 000000000000..6fc716d2d0c2
--- /dev/null
+++ b/.build/build-accord.xml
@@ -0,0 +1,45 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/.build/build-checkstyle.xml b/.build/build-checkstyle.xml
index af5867e4aa9a..0484e4098c66 100644
--- a/.build/build-checkstyle.xml
+++ b/.build/build-checkstyle.xml
@@ -19,7 +19,7 @@
-
+
@@ -45,7 +45,7 @@
-
+
diff --git a/.build/build-rat.xml b/.build/build-rat.xml
index fea028363c07..2fbacb74d0ec 100644
--- a/.build/build-rat.xml
+++ b/.build/build-rat.xml
@@ -76,6 +76,7 @@
+
diff --git a/.build/build-resolver.xml b/.build/build-resolver.xml
index 42bcc82512d0..49d1e8ba793d 100644
--- a/.build/build-resolver.xml
+++ b/.build/build-resolver.xml
@@ -178,7 +178,7 @@
-
+
@@ -206,7 +206,7 @@
-
+
diff --git a/.build/cassandra-build-deps-template.xml b/.build/cassandra-build-deps-template.xml
index 4ec59cdf2d4b..c6b56955e013 100644
--- a/.build/cassandra-build-deps-template.xml
+++ b/.build/cassandra-build-deps-template.xml
@@ -155,5 +155,10 @@
org.bouncycastle
bcutil-jdk18on
+
+ org.apache.cassandra
+ cassandra-accord
+ tests
+
diff --git a/.build/cassandra-deps-template.xml b/.build/cassandra-deps-template.xml
index a7c27ee12666..e6afd9b9b018 100644
--- a/.build/cassandra-deps-template.xml
+++ b/.build/cassandra-deps-template.xml
@@ -116,6 +116,10 @@
org.mindrot
jbcrypt
+
+ org.apache.cassandra
+ cassandra-accord
+
io.airlift
airline
diff --git a/.build/checkstyle_suppressions.xml b/.build/checkstyle_suppressions.xml
index ed4d1443f7fc..230c808c1435 100644
--- a/.build/checkstyle_suppressions.xml
+++ b/.build/checkstyle_suppressions.xml
@@ -21,5 +21,4 @@
"https://checkstyle.org/dtds/suppressions_1_1.dtd">
-
diff --git a/.build/git/git-hooks/post-checkout/100-update-submodules.sh b/.build/git/git-hooks/post-checkout/100-update-submodules.sh
new file mode 100755
index 000000000000..b495ed086054
--- /dev/null
+++ b/.build/git/git-hooks/post-checkout/100-update-submodules.sh
@@ -0,0 +1,41 @@
+#!/usr/bin/env bash
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Redirect output to stderr.
+exec 1>&2
+
+#set -o xtrace
+set -o errexit
+set -o pipefail
+set -o nounset
+
+bin="$(cd "$(dirname "$0")" > /dev/null; pwd)"
+
+_main() {
+ # In case the usage happens at a different layer, make sure to cd to the toplevel
+ local root_dir
+ root_dir="$(git rev-parse --show-toplevel)"
+ cd "$root_dir"
+
+ if [[ ! -e .gitmodules ]]; then
+ # nothing to see here, look away!
+ return 0
+ fi
+ git submodule update --init --recursive
+}
+
+_main "$@"
diff --git a/.build/git/git-hooks/post-switch b/.build/git/git-hooks/post-switch
new file mode 120000
index 000000000000..5513d1deed30
--- /dev/null
+++ b/.build/git/git-hooks/post-switch
@@ -0,0 +1 @@
+post-checkout
\ No newline at end of file
diff --git a/.build/git/git-hooks/pre-commit/100-verify-submodules-pushed.sh b/.build/git/git-hooks/pre-commit/100-verify-submodules-pushed.sh
new file mode 100755
index 000000000000..ec10bba04a5d
--- /dev/null
+++ b/.build/git/git-hooks/pre-commit/100-verify-submodules-pushed.sh
@@ -0,0 +1,98 @@
+#!/usr/bin/env bash
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+##
+## When working with submodules, the top-level project (Apache Cassandra) needs to commit all submodule
+## changes so the top level knows what SHA to use. In a development environment it is common for
+## multiple commits to exist in both projects; if the submodule has its history rewritten, historic
+## top-level commits are no longer valid unless the SHAs were pushed to a remote repo. That is what this
+## script attempts to do: make sure every SHA added to Apache Cassandra is backed up to a remote repo,
+## keeping the Cassandra SHA buildable.
+##
+
+# Redirect output to stderr.
+exec 1>&2
+
+
+#set -o xtrace
+set -o errexit
+set -o pipefail
+set -o nounset
+
+bin="$(cd "$(dirname "$0")" > /dev/null; pwd)"
+
+_log() {
+ echo -e "[pre-commit]\t$*"
+}
+
+error() {
+ _log "$@" 1>&2
+ exit 1
+}
+
+# Status Table
+# A Added
+# C Copied
+# D Deleted
+# M Modified
+# R Renamed
+# T Type Changed (i.e. regular file, symlink, submodule, …)
+# U Unmerged
+# X Unknown
+# B Broken
+_main() {
+ # In case the usage happens at a different layer, make sure to cd to the toplevel
+ local root_dir
+ root_dir="$(git rev-parse --show-toplevel)"
+ cd "$root_dir"
+
+ [[ ! -e .gitmodules ]] && return 0
+ local enabled=$(git config --bool cassandra.pre-commit.verify-submodules.enabled || echo true)
+ [ "$enabled" == "false" ] && return 0
+ local submodules=( $(git config --file .gitmodules --get-regexp path | awk '{ print $2 }') )
+
+ local is_submodule=false
+ local git_sub_dir
+ local git_sha
+ while read status file; do
+ is_submodule=false
+ for to_check in "${submodules[@]}"; do
+ if [[ "$to_check" == "$file" ]]; then
+ is_submodule=true
+ break
+ fi
+ done
+ if $is_submodule; then
+ local enabled=$(git config --bool cassandra.pre-commit.verify-submodule-${file}.enabled || echo true)
+ [ "$enabled" == "false" ] && continue
+ _log "Submodule detected: ${file} with status ${status}; attempting a push"
+ _log "\tTo disable pushes, run"
+ _log "\t\tgit config --local cassandra.pre-commit.verify-submodules.enabled false"
+ _log "\tOr"
+ _log "\t\tgit config --local cassandra.pre-commit.verify-submodule-${file}.enabled false"
+ git_sub_dir="${file}/.git"
+ branch="$(git config -f .gitmodules "submodule.${file}.branch")"
+ [[ -z "${branch:-}" ]] && error "Submodule ${file} does not define a branch"
+ git_sha="$(git --git-dir "${git_sub_dir}" rev-parse HEAD)"
+ local remote="$(git --git-dir "${git_sub_dir}" config --get "branch.${branch}.remote" || error "Git branch ${branch} is not set up to track any remote in submodule ${file}")"
+ git --git-dir "${git_sub_dir}" fetch "${remote}"
+ git --git-dir "${git_sub_dir}" branch "${remote}/${branch}" --contains "${git_sha}" || error "Git commit ${git_sha} not found in $(git remote get-url "${remote}") on branch ${branch}"
+ fi
+ done < <(git diff --cached --name-status)
+}
+
+_main "$@"
diff --git a/.build/git/install-git-defaults.sh b/.build/git/install-git-defaults.sh
new file mode 100755
index 000000000000..7c26ed5eda7c
--- /dev/null
+++ b/.build/git/install-git-defaults.sh
@@ -0,0 +1,116 @@
+#!/usr/bin/env bash
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#set -o xtrace
+set -o errexit
+set -o pipefail
+set -o nounset
+
+bin="$(cd "$(dirname "$0")" > /dev/null; pwd)"
+
+install_template_script() {
+ local -r name="$1"
+ local -r d_dir="$2"
+
+ cat <<EOF > "$name"
+#!/usr/bin/env bash
+
+# This script is autogenerated by the Apache Cassandra build; DO NOT CHANGE!
+# When this script is not found it will be installed automatically by the build
+# If an existing script is found, that script will be relocated under ${d_dir} as 000-original.sh
+
+# Redirect output to stderr.
+exec 1>&2
+
+# Find all scripts to run
+for path in \$(find "$d_dir" -name '*.sh' | perl -e "print sort{(split '/', \\\$a)[-1] <=> (split '/', \\\$b)[-1]}<>"); do
+ "\$path" "\$@"
+done
+EOF
+ chmod a+x "$name"
+}
+
+install_hook() {
+ local -r git_dir="$1"
+ local -r hooks_dir="${git_dir}/hooks"
+ local -r name="$2"
+ local -r d_dir="${hooks_dir}/${name}.d"
+ local -r trigger_on_install=$3
+
+ mkdir "${d_dir}" &> /dev/null || true
+ local -r script_name="${hooks_dir}/${name}"
+ local installed=true
+ if [[ -e "$script_name" ]]; then
+ # was the script already installed?
+ if ! grep "This script is autogenerated by the Apache Cassandra build" "$script_name" &> /dev/null ; then
+ echo "$script_name found, but was not generated by the Apache Cassandra build; please remove or move to ${d_dir}/000-original.sh; creating and moving to ${d_dir} will cause it to run as expected, but won't conflict with hooks this build adds" 1>&2
+ exit 1
+ else
+ installed=false
+ fi
+ fi
+ # install all hooks
+ cp "$bin"/git-hooks/"${name}"/* "$d_dir"/
+
+ # install coordinator hook
+ install_template_script "$script_name" "$d_dir"
+ if $installed && $trigger_on_install ; then
+ echo "Running script $script_name"
+ "$script_name"
+ fi
+}
+
+_install_hooks() {
+ local git_dir
+ # make sure to use --git-common-dir and not --git-dir to support worktrees
+ git_dir="$(git rev-parse --git-common-dir 2> /dev/null || true)"
+ if [[ -z "${git_dir:-}" ]]; then
+ # not in a git repo, noop
+ return 0
+ fi
+
+ # make sure hooks directory exists; does not exist by default for worktrees
+ mkdir -p "${git_dir}/hooks" &> /dev/null || true
+
+ install_hook "$git_dir" "post-checkout" true
+ install_hook "$git_dir" "post-switch" false
+ install_hook "$git_dir" "pre-commit" false
+}
+
+_git_config_set() {
+ local -r name="$1"
+ # only care about rc
+ git config --local --get "$name" &> /dev/null
+}
+
+_install_configs() {
+ # when doing pull, this makes sure submodules are updated
+ _git_config_set submodule.recurse || git config --local submodule.recurse true
+}
+
+_main() {
+ local git_dir
+ # make sure to use --git-common-dir and not --git-dir to support worktrees
+ git_dir="$(git rev-parse --git-common-dir 2> /dev/null || true)"
+ # not in a git repo, noop
+ [[ -z "${git_dir:-}" ]] && return 0
+
+ _install_configs
+ _install_hooks
+}
+
+_main "$@"
diff --git a/.build/parent-pom-template.xml b/.build/parent-pom-template.xml
index 0235ae6ea90e..25d4f0ddc01a 100644
--- a/.build/parent-pom-template.xml
+++ b/.build/parent-pom-template.xml
@@ -715,6 +715,42 @@
jbcrypt
0.4
+
+ org.apache.cassandra
+ cassandra-accord
+ @version@
+
+
+ org.apache.cassandra
+ cassandra-all
+
+
+
+
+ org.apache.cassandra
+ cassandra-accord
+ @version@
+ tests
+ test
+
+
+ org.junit.jupiter
+ junit-jupiter-api
+
+
+ org.junit.jupiter
+ junit-jupiter-engine
+
+
+ ch.qos.logback
+ logback-classic
+
+
+ org.apache.cassandra
+ cassandra-all
+
+
+
io.airlift
airline
diff --git a/.build/sh/bump-accord.sh b/.build/sh/bump-accord.sh
new file mode 100755
index 000000000000..43a476f3edfb
--- /dev/null
+++ b/.build/sh/bump-accord.sh
@@ -0,0 +1,38 @@
+#!/usr/bin/env bash
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#set -o xtrace
+set -o errexit
+set -o pipefail
+set -o nounset
+
+_main() {
+ local home
+ home="$(git rev-parse --show-toplevel)"
+ cd "$home"
+
+ git submodule status modules/accord
+ echo "Is this the correct SHA? [y/n; default=y]"
+ read correct
+ if [[ "${correct:-y}" != "y" ]]; then
+ echo "Please update Accord's SHA and try again"
+ exit 1
+ fi
+ git commit -m "Change Accord to $(cd modules/accord; git log -1 --format='%h: %B')" modules/accord
+}
+
+_main "$@"
diff --git a/.build/sh/change-submodule-accord.sh b/.build/sh/change-submodule-accord.sh
new file mode 100755
index 000000000000..997db3dc2c29
--- /dev/null
+++ b/.build/sh/change-submodule-accord.sh
@@ -0,0 +1,25 @@
+#!/usr/bin/env bash
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#set -o xtrace
+set -o errexit
+set -o pipefail
+set -o nounset
+
+bin="$(cd "$(dirname "$0")" > /dev/null; pwd)"
+
+"$bin"/change-submodule.sh modules/accord 'https://github.com/apache/cassandra-accord.git' trunk
diff --git a/.build/sh/change-submodule.sh b/.build/sh/change-submodule.sh
new file mode 100755
index 000000000000..6ab2d3795afd
--- /dev/null
+++ b/.build/sh/change-submodule.sh
@@ -0,0 +1,52 @@
+#!/usr/bin/env bash
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#set -o xtrace
+set -o errexit
+set -o pipefail
+set -o nounset
+
+_usage() {
+ cat <<EOF 1>&2
+Usage: $0 <submodule path> <remote url> <branch>
+EOF
+ exit 1
+}
+
+
+When changes are made to a submodule (such as Accord), you need to commit the submodule change and update the reference in Apache Cassandra:
+
+```
+$ (cd modules/accord ; git commit -am 'Saving progress')
+$ .build/sh/bump-accord.sh
+```
+
+## Commit and Merge Process
+
+Due to the nature of submodules, changes to a submodule must be committed and pushed before the changes to Apache Cassandra; these are different repositories, so git's `--atomic` does not prevent conflicts from concurrent merges. The basic process is as follows:
+
+* Follow the normal merge process for the submodule
+* Update Apache Cassandra's submodule entry to point to the newly committed change; see the Accord example below
+
+```
+$ .build/sh/change-submodule-accord.sh
+$ .build/sh/bump-accord.sh
+```
+
# Useful Links
- How you can contribute to Apache Cassandra [presentation](http://www.slideshare.net/yukim/cassandrasummit2013) by Yuki Morishita
diff --git a/accord_demo.txt b/accord_demo.txt
new file mode 100644
index 000000000000..63b7d21201d8
--- /dev/null
+++ b/accord_demo.txt
@@ -0,0 +1,14 @@
+ccm create accord-cql-poc -n 3
+ccm start
+
+bin/cqlsh -e "CREATE KEYSPACE ks WITH replication={'class':'SimpleStrategy', 'replication_factor':3};"
+bin/cqlsh -e "CREATE TABLE ks.tbl1 (k int PRIMARY KEY, v int) WITH transactional_mode = 'full';"
+bin/cqlsh -e "CREATE TABLE ks.tbl2 (k int PRIMARY KEY, v int) WITH transactional_mode = 'full';"
+
+BEGIN TRANSACTION
+ LET row1 = (SELECT * FROM ks.tbl1 WHERE k = 1);
+ SELECT row1.v;
+ IF row1 IS NULL THEN
+ INSERT INTO ks.tbl2 (k, v) VALUES (1, 2);
+ END IF
+COMMIT TRANSACTION;
diff --git a/build.xml b/build.xml
index 45af0462603b..f57c05d5dbc9 100644
--- a/build.xml
+++ b/build.xml
@@ -100,6 +100,8 @@
the user specifies the tmp.dir property -->
+
+
@@ -109,8 +111,12 @@
+
+
+
+
@@ -220,6 +226,24 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
@@ -322,6 +346,7 @@
-XX:-CMSClassUnloadingEnabled
-Dio.netty.tryReflectionSetAccessible=true
+ -XX:MaxMetaspaceSize=2G
@@ -396,6 +421,7 @@
+
@@ -517,7 +543,8 @@
-
+
+
@@ -961,6 +988,9 @@
+
+
+
@@ -980,6 +1010,7 @@
+
@@ -997,6 +1028,32 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
-
+
@@ -2049,6 +2109,7 @@
+
+
+
+
+
+
+
diff --git a/conf/cassandra.yaml b/conf/cassandra.yaml
index 546333dfcfd3..19f9c0817a3e 100644
--- a/conf/cassandra.yaml
+++ b/conf/cassandra.yaml
@@ -2311,3 +2311,22 @@ drop_compact_storage_enabled: false
# compatibility mode would no longer toggle behaviors as when it was running in the UPGRADING mode.
#
storage_compatibility_mode: NONE
+
+#accord:
+# # Enables the execution of Accord (multi-key) transactions on this node.
+# enabled: false
+#
+# # Journal directory for Accord
+# journal_directory:
+#
+# # The number of Accord queue shards on this node; -1 means use the number of cores
+# queue_shard_count: -1
+#
+# # The number of Accord shards on this node; -1 means use the number of cores
+# command_store_shard_count: -1
+#
+# # Recover delay: the time between a transaction being initiated and a remote replica being willing to interrupt it to complete it
+# recover_delay: 1s
+#
+# # how quickly the fast path is reconfigured when nodes go up/down
+# fast_path_update_delay: 5s
diff --git a/doc/modules/cassandra/pages/developing/accord/index.adoc b/doc/modules/cassandra/pages/developing/accord/index.adoc
new file mode 100644
index 000000000000..8320b49a0a5b
--- /dev/null
+++ b/doc/modules/cassandra/pages/developing/accord/index.adoc
@@ -0,0 +1,360 @@
+== Accord Intro
+
+This document is intended to facilitate a quick dive into the Accord and
+Cassandra integration code for anyone interested in the project. Readers
+should be familiar, at the very least, with Single-Decree Paxos and
+fluent in consensus terminology. Familiarity with the Accord protocol
+itself, or with similar protocols such as EPaxos, TAPIR, Janus, or
+Tempo, can be useful.
+
+Accord code is logically split into local and coordinator parts. The
+coordination code handles coordination/invocation of the client query,
+driving it through the Accord state machine, and contains the commands
+and utilities for tracking/retrying their state. The node-local code
+contains utilities for keeping a record of replica state and
+facilitating local execution (i.e. responding to coordinator queries).
+
+There are _many_ enums in Accord. They’re extremely useful for
+understanding the state machine of each of the components.
+
+The Cassandra integration implements the interfaces provided by Accord,
+plugging in messaging, serialization, CQL, concurrency/execution,
+on-disk state management, and stable storage (i.e. Cassandra tables).
+
+Broadly speaking, when a request comes from the client, it gets parsed
+and turned into a `TransactionStatement`. A `TransactionStatement`
+contains the updates, selects, assignments, and conditions intended for
+atomic/transactional execution. These statements are translated into
+Accord commands (i.e. `Read`, `Write`, or `Update`) and form an Accord
+transaction (`Txn`). The transaction is executed, yielding a `TxnResult`
+that can be returned to the client.
+
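+As a rough sketch of that flow (a minimal illustration; apart from
+`TransactionStatement`, `Txn`, and `TxnResult`, the helper names here
+are assumed rather than the actual entry points):
+
+....
+// illustrative only: real parsing/coordination happens in the CQL and Accord layers
+TransactionStatement stmt = parse("BEGIN TRANSACTION ... COMMIT TRANSACTION"); // assumed helper
+Txn txn = stmt.createTxn(state, options); // LET/SELECT/UPDATE become Accord Read/Write/Update
+TxnResult result = coordinate(txn);       // drive the Txn through the Accord state machine
+....
+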
+== Coordinator Side
+
+=== Accord Protocol Basics
+
+The coordinator allocates a globally unique transaction ID (`TxnId`) for
+the transaction and begins coordination (see `CoordinateTransaction`).
+Here, the coordinator performs initial rounds of `PreAccept` and
+`Accept` until agreement is reached about when the transaction should
+execute. Coordinated query execution starts with a `PreAccept` message,
+which contains the transaction definition and routing information.
+
+On the replica, each Accord message first lands in `AccordVerbHandler`,
+which handles _all_ Accord messages. The replica determines whether it
+is aware of the _epoch_ specified by the transaction coordinator.
+Messages for future epochs are parked until the epoch becomes active on
+the node; messages for known epochs are submitted to their corresponding
+command stores (think: local shards). The replica applies the message
+locally, changing its local state and producing a response for the
+coordinator. The coordinator collects replica responses and continues
+driving the transaction through the execution state machine.
+
+Every transaction has a home key: a global value that defines the home
+shard, the one tasked with ensuring the transaction is finished. The
+home key is chosen arbitrarily: it is either the first key the
+coordinator owns, or one picked completely at random.
+
+== Replica Side
+
+=== CommandStore
+
+A `Command` is a unit of Accord _metadata_ that relates to a specific
+operation, as opposed to a `Message`, which is an _instruction_ sent by
+the coordinator to the replica for execution and which _changes_ this
+command's state. A `Command` does _not_ hold the state of an entire
+transaction, but rather the _part_ of the transaction executed on a
+particular shard. The _coordinator_ is responsible for executing the
+entirety of the transaction; `Command`s are just local execution states.
+
+Commands are held by a command _store_, a single-threaded internal shard
+of Accord transaction metadata. It holds the state required for command
+execution and executes commands sequentially. For command execution,
+`CommandStore` creates a `SafeCommandStore`: a view of the
+`CommandStore` scoped to a single operation, which has exclusive access
+to it for the duration of that operation.
+
+Roughly speaking, you can think of the relation between `CommandStore`
+and `SafeCommandStore` as:
+
+....
+// acquire the safe store, giving this operation exclusive access
+SafeCommandStore safeStore = commandStore.beginOperation(context);
+try {
+    // the message mutates command state through the safe store
+    message.apply(safeStore);
+}
+finally {
+    // publish the accumulated changes and release the store
+    commandStore.completeOperation(safeStore);
+}
+....
+
+In other words, `CommandStore` collects the `PreLoadContext`: the state
+required to be in memory for command execution (possible dependencies,
+such as the `TxnId`s and `Key`s of commands, but also the
+`CommandsForKey`s that will be needed during execution). Once the
+context is collected and the command's turn to execute on the command
+store comes, a _safe_ command store is created and passed to the command.
+
+Any executing operation may require changes to command store state. For
+this, `SafeCommandStore` creates special versions of command state,
+`SafeCommand` and `SafeCommandsForKey`, that can be updated during
+execution. Naturally, either _all_ of the states changed during
+operation execution become visible, or none of them do. To ensure
+transactional integrity, changes to commands are tracked and recorded
+into the `Journal` for crash recovery. `ProgressLog` and
+`CommandsForKey` are updated as part of the same operation.
+
+On the Cassandra side, concurrent execution is controlled by
+`AccordTask`, which contains the cache-loading logic and persistence
+callbacks. Since Accord may potentially hold a large number of command
+states in memory, those states may be _shrunk_ to their binary
+representation to save memory, or fully evicted. This also means that
+`AccordTask` may have to reload relevant dependencies from the preload
+context before command execution can begin.
+
+=== AsyncChain, AccordTask, AccordExecutor
+
+Accord is designed for high concurrency, and most things are constructed
+as asynchronous chains. The `AsyncChain` API is very similar to that of
+Java futures, but has several convenience methods that make execution on
+multiple executors (think: command stores, loaders) simpler.
+
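+As a minimal sketch of chaining work across executors (the helper
+functions are assumed for illustration; only the future-like shape of
+the API matters here):
+
+....
+// illustrative: load state on a loader thread, then continue with further async work
+AsyncChain<Command> chain = loadCommand(txnId)          // assumed helper
+        .map(command -> applyMessage(command))          // transform the result
+        .flatMap(command -> persistToJournal(command)); // chain another async step
+chain.begin((result, failure) -> { /* completion callback */ });
+....
+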
+Each `CommandStore` has its own `AccordExecutor`. For the purposes of
+this document, you may consider it a single-threaded executor.
+`AccordExecutor` keeps track of tasks in different states, primarily:
+
+* `WAITING_TO_LOAD` - the executor has a maximum number of concurrent
+load tasks. If the number of in-progress loads exceeds this number, all
+subsequently added loads go into the waiting-to-load queue.
+* `LOADING` - tasks whose dependencies are being loaded.
+`CommandsForKeys` are paged in from the auxiliary table, while `Command`
+states are loaded directly from the `Journal`.
+* `WAITING_TO_RUN` / `RUNNING` / `FINISHED` - these three are
+self-explanatory: once its dependencies are loaded, a task is ready to
+run; when its turn comes, it transitions to the running state; and once
+it's done, it's finished.
+
+There are several other states, which you can find in
+`AccordTask$State`. It is worth mentioning that Accord tasks are
+_cancellable_: tasks that timed out before execution, have been
+preempted, or should not run for other reasons can and will be
+cancelled. Tasks transition between different `AccordExecutor` queues
+depending on their execution states.
+
+In Accord, all tasks have to be executed in strict order, and a task
+can't execute before its dependencies have executed; otherwise there is
+no guarantee of strict order. Tasks are notified about dependency
+readiness using `NotificationSink`, which updates the task's `WaitingOn`
+collection. `WaitingOn` is responsible for registering listeners with
+`CommandStore` if dependencies need to execute before the current task
+can.
+
+`WaitingOn`, `NotificationSink`, and the `LocalListeners` registered
+with `CommandStore` can be thought of as the "happy path" of execution,
+where the coordinator makes timely progress changing command states. If
+the coordinator _fails_ to make progress, `ProgressLog` kicks in after
+the registered deadline.
+
+=== ProgressLog
+
+The progress log is responsible for ensuring progress in transactions
+that aren’t making any. It does two things:
+
+* Fetches data from peers via `WaitingState`. Depending on the state of
+the transaction, it may trigger a fetch of a subset of required
+dependencies from peers via `FetchData` (for example, we haven't
+received `Apply`, but we're `ReadyToExecute`).
+* Triggers recovery via `HomeState`. The progress log may also
+autonomously decide that a transaction which hasn't been
+decided/executed (and otherwise should be able to be) should have the
+recovery protocol invoked. In other words, if _coordination_ of the
+transaction is stuck (i.e. further progress is blocked not by a lack of
+locally required dependencies, but by the transaction coordinator), the
+progress log may trigger recovery via `MaybeRecover`.
+
+=== Command
+
+`Command` is a core building block of the Accord local state.
+`Message`s, such as `PreAccept`, `Propose`, `Accept`, and many others,
+change `Command` state for a given store during execution.
+
+* `SaveStatus` - node-local command status
+* `Participants` - core routing information required for transaction.
+Keys or Ranges participating in the transaction.
+* Timestamps:
+** `ExecuteAt` - the timestamp at which this transaction is decided to
+execute. May differ from its `TxnId` if a higher timestamp was witnessed
+during the `PreAccept` phase because conflicts were discovered.
+** `ExecutesAtLeast` - only relevant for `WaitingOnWithExecutesAtLeast`
+** Ballots for coordinating within a specific `TxnId`:
+*** `Promised` - a non-zero ballot can be set as a result of recovery; a
+recovery coordinator (see Recovery Protocol in Accord paper for details)
+is picking its own globally unique ballot for re-proposal.
+*** `AcceptedOrCommitted` - same as `Promised` (i.e. a non-zero ballot
+is set as a result of recovery), except for later protocol stages.
+* `PartialTxn` - shard-relevant definition of the transaction.
+* Dependencies:
+** `PartialDeps` - a collection of transaction dependencies, keyed by
+the key or range on which they were adopted.
+** `WaitingOn` - a subset of the above dependencies this command needs
+to wait on.
+* `Writes` - a collection of data to write to one or more stores
+* `Result` - a result to be returned to a client, or be stored in a
+node’s command state. Effectively unused in Cassandra implementation.
+
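+Condensed into an illustrative shape (this mirrors the fields listed
+above; it is not the real class layout):
+
+....
+// illustrative summary of a command's local state, per the list above
+class Command {
+    SaveStatus saveStatus;                // node-local status
+    Participants participants;            // keys/ranges participating in the txn
+    Timestamp executeAt;                  // agreed execution time; may differ from txnId
+    Ballot promised, acceptedOrCommitted; // non-zero only after recovery
+    PartialTxn partialTxn;                // shard-relevant slice of the transaction
+    PartialDeps partialDeps;              // dependencies keyed by key/range
+    WaitingOn waitingOn;                  // subset of deps still being waited on
+    Writes writes;                        // data to write to one or more stores
+    Result result;                        // client result; effectively unused in Cassandra
+}
+....
+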
+=== CommandsForKey (CFK)
+
+`CommandsForKey` is a specialised collection for efficiently
+representing and querying everything Accord needs for making
+coordination and recovery decisions about a key’s command conflicts, and
+for managing execution order.
+
+CommandsForKey is updated via `SafeCommandsForKey` after command
+execution in `SafeCommandStore#updateCommandsForKey`. CommandsForKey
+differentiates between managed and unmanaged transactions:
+
+* Managed transactions are transactions witnessed by `CommandsForKey`
+for dependency management (essentially all globally visible key
+transactions): simple key transactions, like reads and writes.
+* Unmanaged transactions are those that depend on simple key
+transactions but are not themselves such, e.g. sync points, range
+transactions, etc. These transactions need only adopt a dependency on
+the key to represent _all_ such transactions; CFK will then notify them
+when those have executed.
+
+=== CommandStore’s auxiliary collections
+
+==== RedundantBefore
+
+RedundantBefore is (incrementally) persisted in the Journal and used by
+CommandStore to track transactions that have been fully applied or
+invalidated across all shards. Once a transaction is redundant (i.e. it
+has been either _applied_ or _invalidated_ durably on a majority of
+participants), its metadata can be removed, and only transactional
+bounds are maintained for dependency-tracking purposes.
+`RedundantBefore` plays an important role during journal compaction (by
+providing information about which transactions can be purged).
+
+=== DurabilityService and (Exclusive)SyncPoint
+
+For the purposes of this document, we only cover _Exclusive_
+SyncPoints, even though other kinds may still exist as of this writing.
+`SyncPoint`s serve as a logical barrier in the transaction history and
+are used for invalidating older `TxnId`s, so that a newly bootstrapped
+node may have a complete log as of a point-in-time `TxnId`, and replicas
+can purge/GC earlier transaction metadata.
+
+SyncPoints are not expected to be processed by the whole cluster, and we
+do not want transaction processing to be held up, so while they are
+processed much like a transaction, they are invisible to real
+transactions, which may proceed before the SyncPoint is witnessed by the
+node processing it.
+
+An ExclusiveSyncPoint is created by `DurabilityScheduler` as the first
+step of coordinating shard durability, which is scheduled for periodic
+execution. During this step, we perform initial rounds of `PreAccept`
+and `Accept` until we have reached agreement about when the `SyncPoint`
+should execute.
+
+After a shard is marked durable, the `RedundantBefore` collection is
+updated; it serves an important role in bootstrap, log replay, log
+compaction, and replica-side command purging/invalidation.
+
+=== ConfigurationService and TopologyManager
+
+Time in Accord is sliced into epochs. Each epoch constitutes a unique
+cluster configuration (`Topology`). A topology represents a mapping
+between key ranges and nodes, where every range has to be replicated to
+a certain number of nodes. The coordinator assigns an epoch to each
+transaction; replicas may decline transactions that arrive for epochs
+that were previously closed.
+
+`TopologyManager` is responsible for listening to notifications about
+cluster configuration changes and for the creation of epochs. Once an
+epoch is created, it needs to be bootstrapped before it is ready. Epoch
+readiness consists of 4 _independent_ states:
+
+* Metadata: The new epoch has been set up locally and the node is ready
+to process commands for it.
+* Coordinate: The node has retrieved enough remote information to answer
+coordination decisions for the epoch (including fast path decisions).
+Once a quorum of the new epoch has achieved this, earlier epochs do not
+need to be contacted by coordinators of transactions started in the new
+epoch (or later).
+* Data: The node has successfully replicated the underlying `DataStore`
+information for the new epoch, but may need to perform some additional
+coordination before it can execute the read portion of a transaction.
+* Reads: The node has retrieved enough remote information to safely
+process reads, including replicating all necessary DataStore
+information, and any additional transactions necessary for consistency.
+
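+These four signals can be pictured as an illustrative holder of
+independent results (a sketch of the idea, not the actual type):
+
+....
+// illustrative: the four independent readiness signals for one epoch
+class EpochReady {
+    AsyncResult<Void> metadata;   // epoch set up locally; can process commands
+    AsyncResult<Void> coordinate; // enough remote info to coordinate (incl. fast path)
+    AsyncResult<Void> data;       // underlying DataStore replicated
+    AsyncResult<Void> reads;      // safe to serve reads
+}
+....
+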
+=== Data Store
+
+One of the most important integration points, the DataStore is
+responsible for applying transactional information to the database's
+stable storage.
+
+=== Accord Journal
+
+==== Garbage Collection / Cleanup
+
+* `ERASE`: we can erase data once we are certain no other replicas
+require our information. Erased should ONLY be adopted on a replica that
+knows EVERY shard has successfully applied the transaction at all
+healthy replicas (or else that it is durably invalidated).
+* `EXPUNGE`: we can expunge data once we can reliably and safely expunge
+any partial record. To achieve the latter, we use only global summary
+information, the TxnId, and, if present, any applyAt.
+* `INVALIDATE`: the command has been decidedly (and durably) superseded
+by a different command (e.g., a higher ballot was witnessed during
+recovery), and will *never* be executed.
+* `VESTIGIAL`: the command cannot be completed and is either
+pre-bootstrap, did not commit, or did not participate in this shard's
+epoch.
+* `TRUNCATE`: a subset of the command metadata (i.e., deps, outcome, or
+appliedAt) can be partially discarded.
+
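+Summarised as an illustrative enum (the names follow the list above;
+the real declaration may differ):
+
+....
+// illustrative: the journal cleanup decisions described above
+enum Cleanup {
+    ERASE,      // no other replica can need our information
+    EXPUNGE,    // any partial record can be safely discarded
+    INVALIDATE, // durably superseded; will never execute
+    VESTIGIAL,  // cannot complete (pre-bootstrap, uncommitted, wrong epoch)
+    TRUNCATE    // partially discard metadata (deps, outcome, appliedAt)
+}
+....
+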
+== Contributing Changes to Accord
+
+Accord is covered by a large number of tests, but probably the most
+prominent among them is the `BurnTest`. BurnTest is a deterministic
+simulation of the protocol with a strict-serializability checker. It
+simulates time, message passing, concurrency, faults, and many other
+things. If you intend to make a change to Accord, it is recommended that
+you run `BurnTest` at the very least several dozen times in a loop to
+ensure the correctness of your change. BurnTest can also be useful for
+reasoning about and exploring protocol states: put a breakpoint at a
+spot you consider important, run the burn test, and see what's going on.
+
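+For example, a simple loop along these lines (the exact invocation is
+an assumption; adapt it to however `BurnTest` is wired into your
+build):
+
+....
+# hypothetical invocation from the accord submodule; adjust as needed
+cd modules/accord
+for i in $(seq 1 50); do
+    ./gradlew test --tests 'accord.burn.BurnTest' || { echo "failed on run $i"; exit 1; }
+done
+....
+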
+Accord also comes with many built-in assertions. The protocol has many
+checks for internal consistency that can be helpful during development.
+Most of the time, rather than triggering a strict-serializability
+checker error, you will see some form of internal assertion detecting an
+inconsistency. These invariants are there for a reason, and in the
+overwhelming majority of cases disabling or ignoring them is not a good
+idea.
+
+== Cheat Sheet
+
+* Medium Path - a coordinator optimization for the case where t0 can be
+agreed (i.e. executeAt=txnId) but we would like to avoid taking 3
+round-trips, a situation likely to occur when we lose the fast-path
+quorum. The medium path permits only 2 round-trips because its
+dependencies can be used as a complete set (they were calculated against
+the correct bound, t0, and that bound has been applied at a quorum, so
+conflicting transactions will propose a higher executeAt).
+* `SaveStatus` vs `Status` - `SaveStatus` is a replica-local status that
+contains additional information helpful for tracking state-machine
+state, and is heavily used for validating internal consistency in
+Accord, while `Status` is part of the distributed state machine that
+tracks distributed transaction state.
+* `Routable` - something that can be found in the cluster, and MAYBE
+found on disk (if `Seekable`).
+** `Unseekable` - a _routing_ key; in Cassandra terms, you can think of
+a `Token`.
+** `Seekable` - Something that can be found within the cluster AND found
+on disk, queried and returned; i.e., key or key range.
+* Route vs RoutingKey vs FullRoute vs PartialRoute:
+** `Partial` vs `Full` routes are understood in the context of a single
+transaction.
diff --git a/doc/modules/cassandra/pages/developing/index.adoc b/doc/modules/cassandra/pages/developing/index.adoc
index 8c9f735e2c3d..409a423a6fc8 100644
--- a/doc/modules/cassandra/pages/developing/index.adoc
+++ b/doc/modules/cassandra/pages/developing/index.adoc
@@ -2,3 +2,4 @@
* xref:cassandra:developing/data-modeling/index.adoc[Data Modeling]
* xref:cassandra:developing/cql/index.adoc[CQL]
+* xref:cassandra:developing/accord/index.adoc[Accord]
diff --git a/doc/modules/cassandra/pages/managing/operating/virtualtables.adoc b/doc/modules/cassandra/pages/managing/operating/virtualtables.adoc
index d3b948e3d172..6ab4917ef7b3 100644
--- a/doc/modules/cassandra/pages/managing/operating/virtualtables.adoc
+++ b/doc/modules/cassandra/pages/managing/operating/virtualtables.adoc
@@ -72,6 +72,8 @@ cqlsh> select * from system_metrics.all_groups ;
group_name | virtual_table
-------------------+---------------------------
+ AccordCoordinator | accord_coordinator_group
+ AccordReplica | accord_replica_group
Batch | batch_group
BufferPool | buffer_pool_group
CIDRAuthorizer | cidr_authorizer_group
@@ -98,6 +100,7 @@ cqlsh> select * from system_metrics.all_groups ;
Paxos | paxos_group
ReadRepair | read_repair_group
Repair | repair_group
+ RouteIndex | route_index_group
Storage | storage_group
StorageProxy | storage_proxy_group
Streaming | streaming_group
diff --git a/ide/idea-iml-file.xml b/ide/idea-iml-file.xml
index 13e66fa61308..1d189db8d6bc 100644
--- a/ide/idea-iml-file.xml
+++ b/ide/idea-iml-file.xml
@@ -49,6 +49,16 @@
+
+
+
+
+
+
+
+
+
+
@@ -56,6 +66,8 @@
+
+
@@ -63,12 +75,17 @@
+
+
+
+
+
@@ -76,6 +93,9 @@
+
+
+
diff --git a/ide/idea/vcs.xml b/ide/idea/vcs.xml
index 81872fd3f150..a5367a526e4d 100644
--- a/ide/idea/vcs.xml
+++ b/ide/idea/vcs.xml
@@ -2,6 +2,7 @@
+
@@ -13,4 +14,4 @@
-
\ No newline at end of file
+
diff --git a/ide/idea/workspace.xml b/ide/idea/workspace.xml
index c5c0e28b963b..13018f4052d2 100644
--- a/ide/idea/workspace.xml
+++ b/ide/idea/workspace.xml
@@ -183,24 +183,39 @@
diff --git a/modules/accord b/modules/accord
new file mode 160000
index 000000000000..c7379e12bd8f
--- /dev/null
+++ b/modules/accord
@@ -0,0 +1 @@
+Subproject commit c7379e12bd8f8732004cb77264801fe157af1dbe
diff --git a/pylib/cqlshlib/cqlhandling.py b/pylib/cqlshlib/cqlhandling.py
index 504371c16dcb..0805867dc9a9 100644
--- a/pylib/cqlshlib/cqlhandling.py
+++ b/pylib/cqlshlib/cqlhandling.py
@@ -25,10 +25,10 @@
Hint = pylexotron.Hint
cql_keywords_reserved = {'add', 'allow', 'alter', 'and', 'apply', 'asc', 'authorize', 'batch', 'begin', 'by',
- 'columnfamily', 'create', 'delete', 'desc', 'describe', 'drop', 'entries', 'execute', 'from',
- 'full', 'grant', 'if', 'in', 'index', 'infinity', 'insert', 'into', 'is', 'keyspace', 'limit',
+ 'columnfamily', 'create', 'commit', 'delete', 'desc', 'describe', 'drop', 'end', 'entries', 'execute', 'from',
+ 'full', 'grant', 'if', 'in', 'index', 'infinity', 'insert', 'into', 'is', 'keyspace', 'let', 'limit',
'materialized', 'modify', 'nan', 'norecursive', 'not', 'null', 'of', 'on', 'or', 'order',
- 'primary', 'rename', 'revoke', 'schema', 'select', 'set', 'table', 'to', 'token', 'truncate',
+ 'primary', 'rename', 'revoke', 'schema', 'select', 'set', 'table', 'then', 'to', 'token', 'transaction', 'truncate',
'unlogged', 'update', 'use', 'using', 'view', 'where', 'with'}
"""
Set of reserved keywords in CQL.
@@ -145,7 +145,7 @@ def cql_split_statements(self, text):
else:
output.append(stmt)
if len(stmt) > 2:
- if stmt[-3][1].upper() == 'APPLY':
+ if stmt[-3][1].upper() == 'APPLY' or stmt[0][1].upper() == 'COMMIT' or (stmt[0][1].upper() == 'END' and stmt[1][1].upper() == 'IF'):
in_batch = False
elif stmt[0][1].upper() == 'BEGIN':
in_batch = True
diff --git a/pylib/cqlshlib/test/test_cqlsh_output.py b/pylib/cqlshlib/test/test_cqlsh_output.py
index 78dc0331454a..c32690b42496 100644
--- a/pylib/cqlshlib/test/test_cqlsh_output.py
+++ b/pylib/cqlshlib/test/test_cqlsh_output.py
@@ -690,6 +690,7 @@ def test_describe_columnfamily_output(self):
AND compression = {'chunk_length_in_kb': '16', 'class': 'org.apache.cassandra.io.compress.LZ4Compressor'}
AND memtable = 'default'
AND crc_check_chance = 1.0
+ AND fast_path = 'keyspace'
AND default_time_to_live = 0
AND extensions = {}
AND gc_grace_seconds = 864000
@@ -698,6 +699,8 @@ def test_describe_columnfamily_output(self):
AND memtable_flush_period_in_ms = 0
AND min_index_interval = 128
AND read_repair = 'BLOCKING'
+ AND transactional_mode = 'off'
+ AND transactional_migration_from = 'none'
AND speculative_retry = '99p';""" % quote_name(get_keyspace()))
with cqlsh_testrun(tty=True, env=self.default_env) as c:
@@ -791,7 +794,7 @@ def test_describe_schema_output(self):
self.assertNoHasColors(output)
# Since CASSANDRA-7622 'DESC FULL SCHEMA' also shows all VIRTUAL keyspaces
self.assertIn('VIRTUAL KEYSPACE system_virtual_schema', output)
- self.assertIn("\nCREATE KEYSPACE system_auth WITH replication = {'class': 'SimpleStrategy', 'replication_factor': '1'} AND durable_writes = true;\n",
+ self.assertIn("\nCREATE KEYSPACE system_auth WITH replication = {'class': 'SimpleStrategy', 'replication_factor': '1'} AND durable_writes = true AND fast_path = 'simple';\n",
output)
self.assertRegex(output, r'.*\s*$')
diff --git a/simulator.sh b/simulator.sh
new file mode 100755
index 000000000000..516405e974c2
--- /dev/null
+++ b/simulator.sh
@@ -0,0 +1,88 @@
+#!/bin/bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+#ant jar simulator-jars
+
+DIR=`pwd`
+JVM_OPTS="$JVM_OPTS -Dcassandra.config=file://$DIR/test/conf/cassandra.yaml"
+JVM_OPTS="$JVM_OPTS -Dlogback.configurationFile=file://$DIR/test/conf/logback-simulator.xml"
+JVM_OPTS="$JVM_OPTS -Dcassandra.logdir=$DIR/build/test/logs"
+#JVM_OPTS="$JVM_OPTS -Djava.library.path=$DIR/lib/sigar-bin"
+JVM_OPTS="$JVM_OPTS -Dlegacy-sstable-root=$DIR/test/data/legacy-sstables"
+JVM_OPTS="$JVM_OPTS -Dinvalid-legacy-sstable-root=$DIR/test/data/invalid-legacy-sstables"
+JVM_OPTS="$JVM_OPTS -Dcassandra.ring_delay_ms=1000"
+JVM_OPTS="$JVM_OPTS -Dcassandra.skip_sync=true"
+JVM_OPTS="$JVM_OPTS -ea"
+JVM_OPTS="$JVM_OPTS -XX:MaxMetaspaceSize=1G"
+JVM_OPTS="$JVM_OPTS -XX:SoftRefLRUPolicyMSPerMB=0"
+JVM_OPTS="$JVM_OPTS -Dcassandra.strict.runtime.checks=true"
+JVM_OPTS="$JVM_OPTS -javaagent:$DIR/build/test/lib/jars/simulator-asm.jar"
+JVM_OPTS="$JVM_OPTS -Xbootclasspath/a:$DIR/build/test/lib/jars/simulator-bootstrap.jar"
+JVM_OPTS="$JVM_OPTS -XX:ActiveProcessorCount=4"
+JVM_OPTS="$JVM_OPTS -XX:-TieredCompilation"
+JVM_OPTS="$JVM_OPTS -XX:Tier4CompileThreshold=1000"
+JVM_OPTS="$JVM_OPTS -XX:ReservedCodeCacheSize=256M"
+JVM_OPTS="$JVM_OPTS -Xmx8G"
+JVM_OPTS="$JVM_OPTS -Dcassandra.test.simulator.determinismcheck=strict"
+JVM_OPTS="$JVM_OPTS -Dcassandra.debugrefcount=false"
+JVM_OPTS="$JVM_OPTS -Dcassandra.skip_sync=true"
+JVM_OPTS="$JVM_OPTS -Dcassandra.tolerate_sstable_size=true"
+JVM_OPTS="$JVM_OPTS -Dcassandra.test.simulator.debug=true"
+JVM_OPTS="$JVM_OPTS -Dcassandra.test.simulator.determinismcheck=strict"
+echo $JVM_OPTS
+
+CLASSPATH="$DIR"/build/test/classes
+for dir in "$DIR"/build/classes/*; do
+ CLASSPATH="$CLASSPATH:$dir"
+done
+
+for jar in "$DIR"/lib/*.jar; do
+ CLASSPATH="$CLASSPATH:$jar"
+done
+for jar in "$DIR"/build/*.jar; do
+ if [[ $jar != *"logback-classic"* ]]; then
+ CLASSPATH="$CLASSPATH:$jar"
+ fi
+done
+for jar in "$DIR"/build/lib/jars/*.jar; do
+ if [[ $jar != *"logback-classic"* ]]; then
+ CLASSPATH="$CLASSPATH:$jar"
+ fi
+done
+for jar in "$DIR"/build/test/lib/jars/*.jar; do
+ if [[ $jar != *"logback-classic"* ]]; then
+ CLASSPATH="$CLASSPATH:$jar"
+ fi
+done
+
+CLASS="org.apache.cassandra.simulator.paxos.AccordSimulationRunner"
+OPTS="run -n 3..6 -t 1000 --cluster-action-limit -1 -c 2 -s 30"
+
+echo "java -cp <...> $CLASS $OPTS $@"
+
+while true
+do
+ echo ""
+ java -cp $CLASSPATH $JVM_OPTS $CLASS $OPTS $@
+ status=$?
+ if [ $status -ne 0 ] ; then
+ exit $status
+ fi
+
+done
diff --git a/src/antlr/Cql.g b/src/antlr/Cql.g
index b25f3944d6cf..cd2ffa33a8ff 100644
--- a/src/antlr/Cql.g
+++ b/src/antlr/Cql.g
@@ -45,6 +45,7 @@ import Parser,Lexer;
import org.apache.cassandra.cql3.statements.*;
import org.apache.cassandra.cql3.statements.schema.*;
import org.apache.cassandra.cql3.terms.*;
+ import org.apache.cassandra.cql3.transactions.*;
import org.apache.cassandra.exceptions.ConfigurationException;
import org.apache.cassandra.exceptions.InvalidRequestException;
import org.apache.cassandra.exceptions.SyntaxException;
diff --git a/src/antlr/Lexer.g b/src/antlr/Lexer.g
index 34f5a80ccdf2..c956284b65c7 100644
--- a/src/antlr/Lexer.g
+++ b/src/antlr/Lexer.g
@@ -60,6 +60,7 @@ lexer grammar Lexer;
// pylib/cqlshlib/cqlhandling.py::cql_keywords_reserved.
// When adding a new unreserved keyword, add entry to unreserved keywords in Parser.g.
K_SELECT: S E L E C T;
+K_LET: L E T;
K_FROM: F R O M;
K_AS: A S;
K_WHERE: W H E R E;
@@ -83,8 +84,10 @@ K_BEGIN: B E G I N;
K_UNLOGGED: U N L O G G E D;
K_BATCH: B A T C H;
K_APPLY: A P P L Y;
+K_COMMIT: C O M M I T;
K_TRUNCATE: T R U N C A T E;
K_DELETE: D E L E T E;
+K_TRANSACTION: T R A N S A C T I O N;
K_IN: I N;
K_CREATE: C R E A T E;
K_SCHEMA: S C H E M A;
@@ -122,6 +125,8 @@ K_DESC: D E S C;
K_ALLOW: A L L O W;
K_FILTERING: F I L T E R I N G;
K_IF: I F;
+K_THEN: T H E N;
+K_END: E N D;
K_IS: I S;
K_CONTAINS: C O N T A I N S;
K_BETWEEN: B E T W E E N;
diff --git a/src/antlr/Parser.g b/src/antlr/Parser.g
index 099feaa1db2e..9adb77805c00 100644
--- a/src/antlr/Parser.g
+++ b/src/antlr/Parser.g
@@ -27,6 +27,15 @@ options {
private final List listeners = new ArrayList();
protected final List bindVariables = new ArrayList();
+ // enables parsing txn specific syntax when true
+ protected boolean isParsingTxn = false;
+ // tracks whether a txn has conditional updates
+ protected boolean isTxnConditional = false;
+
+ protected List references;
+
+ private Token statementBeginMarker;
+
public static final Set reservedTypeNames = new HashSet()
{{
add("byte");
@@ -59,6 +68,19 @@ options {
return marker;
}
+ public RowDataReference.Raw newRowDataReference(Selectable.RawIdentifier tuple, Selectable.Raw selectable)
+ {
+ if (!isParsingTxn)
+ throw new SyntaxException("Cannot create a row data reference unless parsing a transaction");
+
+ if (references == null)
+ references = new ArrayList<>();
+
+ RowDataReference.Raw reference = RowDataReference.Raw.fromSelectable(tuple, selectable);
+ references.add(reference);
+ return reference;
+ }
+
public void addErrorListener(ErrorListener listener)
{
this.listeners.add(listener);
@@ -122,14 +144,24 @@ options {
return res;
}
- public void addRawUpdate(List> operations, ColumnIdentifier key, Operation.RawUpdate update)
+ public void addRawUpdate(UpdateStatement.OperationCollector collector, ColumnIdentifier key, Operation.RawUpdate update)
{
- for (Pair p : operations)
- {
- if (p.left.equals(key) && !p.right.isCompatibleWith(update))
- addRecognitionError("Multiple incompatible setting of column " + key);
- }
- operations.add(Pair.create(key, update));
+ if (collector.conflictsWithExistingUpdate(key, update))
+ addRecognitionError("Multiple incompatible setting of column " + key);
+ if (collector.conflictsWithExistingSubstitution(key))
+ addRecognitionError("Normal and reference operations for " + key);
+
+ collector.addRawUpdate(key, update);
+ }
+
+ public void addRawReferenceOperation(UpdateStatement.OperationCollector collector, ColumnIdentifier key, ReferenceOperation.Raw update)
+ {
+ if (collector.conflictsWithExistingUpdate(key))
+ addRecognitionError("Multiple incompatible setting of column " + key);
+ if (collector.conflictsWithExistingSubstitution(key))
+ addRecognitionError("Normal and reference operations for " + key);
+
+ collector.addRawReferenceOperation(key, update);
}
public Set filterPermissions(Set permissions, IResource resource)
@@ -186,6 +218,19 @@ options {
{
// Do nothing.
}
+
+ public Token stmtBegins()
+ {
+ statementBeginMarker = input.LT(1);
+ return statementBeginMarker;
+ }
+
+ public StatementSource stmtSrc()
+ {
+ StatementSource stmtSrc = StatementSource.create(statementBeginMarker);
+ statementBeginMarker = null;
+ return stmtSrc;
+ }
}
/** STATEMENTS **/
@@ -236,6 +281,8 @@ cqlStatement returns [CQLStatement.Raw stmt]
| st42=addIdentityStatement { $stmt = st42; }
| st43=dropIdentityStatement { $stmt = st43; }
| st44=listSuperUsersStatement { $stmt = st44; }
+ | st45=batchTxnStatement { $stmt = st45; }
+ | st46=letStatement { $stmt = st46; }
;
/*
@@ -259,6 +306,7 @@ selectStatement returns [SelectStatement.RawStatement expr]
List groups = new ArrayList<>();
boolean allowFiltering = false;
boolean isJson = false;
+ stmtBegins();
}
: K_SELECT
// json is a valid column name. By consequence, we need to resolve the ambiguity for "json - json"
@@ -275,11 +323,40 @@ selectStatement returns [SelectStatement.RawStatement expr]
groups,
$sclause.isDistinct,
allowFiltering,
- isJson);
+ isJson,
+ null);
WhereClause where = wclause == null ? WhereClause.empty() : wclause.build();
- $expr = new SelectStatement.RawStatement(cf, params, $sclause.selectors, where, limit, perPartitionLimit);
+ $expr = new SelectStatement.RawStatement(cf, params, $sclause.selectors, where, limit, perPartitionLimit, stmtSrc());
}
;
+
+/**
+ * ex. LET x = (SELECT * FROM WHERE k=1 AND c=2)
+ * ex. LET y = (SELECT * FROM WHERE k=1 LIMIT 1)
+ */
+letStatement returns [SelectStatement.RawStatement expr]
+ @init {
+ Term.Raw limit = null;
+ }
+ : K_LET txnVar=IDENT '='
+ '(' { stmtBegins(); } K_SELECT assignments=letSelectors K_FROM cf=columnFamilyName K_WHERE wclause=whereClause ( K_LIMIT rows=intValue { limit = rows; } )? ')'
+ {
+ SelectStatement.Parameters params = new SelectStatement.Parameters(Collections.emptyList(), Collections.emptyList(), false, false, false, $txnVar.text);
+ WhereClause where = wclause == null ? WhereClause.empty() : wclause.build();
+
+ $expr = new SelectStatement.RawStatement(cf, params, assignments, where, limit, null, stmtSrc());
+ }
+ ;
+
+letSelectors returns [List expr]
+ : t1=letSelector { $expr = new ArrayList(); $expr.add(t1); } (',' tN=letSelector { $expr.add(tN); })*
+ | '\*' { $expr = Collections.emptyList();}
+ ;
+
+letSelector returns [RawSelector s]
+ @init{ ColumnIdentifier alias = null; }
+ : us=unaliasedSelector { $s = new RawSelector(us, alias); }
+ ;
selectClause returns [boolean isDistinct, List selectors]
@init{ $isDistinct = false; }
@@ -474,6 +551,9 @@ groupByClause[List groups]
*
*/
insertStatement returns [ModificationStatement.Parsed expr]
+ @init {
+ stmtBegins();
+ }
: K_INSERT K_INTO cf=columnFamilyName
( st1=normalInsertStatement[cf] { $expr = st1; }
| K_JSON st2=jsonInsertStatement[cf] { $expr = st2; })
@@ -488,14 +568,19 @@ normalInsertStatement [QualifiedName qn] returns [UpdateStatement.ParsedInsert e
}
: '(' c1=cident { columnNames.add(c1); } ( ',' cn=cident { columnNames.add(cn); } )* ')'
K_VALUES
- '(' v1=term { values.add(v1); } ( ',' vn=term { values.add(vn); } )* ')'
+ '(' insertValue[values] ( ',' insertValue[values] )* ')'
( K_IF K_NOT K_EXISTS { ifNotExists = true; } )?
( usingClause[attrs] )?
{
- $expr = new UpdateStatement.ParsedInsert(qn, attrs, columnNames, values, ifNotExists);
+ $expr = new UpdateStatement.ParsedInsert(qn, attrs, columnNames, values, ifNotExists, stmtSrc(), isParsingTxn);
}
;
+insertValue[List values]
+ : t=term { values.add(t); }
+ | {isParsingTxn}? dr=rowDataReference { values.add(new ReferenceValue.Substitution.Raw(dr)); }
+ ;
+
jsonInsertStatement [QualifiedName qn] returns [UpdateStatement.ParsedInsertJson expr]
@init {
Attributes.Raw attrs = new Attributes.Raw();
@@ -507,7 +592,7 @@ jsonInsertStatement [QualifiedName qn] returns [UpdateStatement.ParsedInsertJson
( K_IF K_NOT K_EXISTS { ifNotExists = true; } )?
( usingClause[attrs] )?
{
- $expr = new UpdateStatement.ParsedInsertJson(qn, attrs, val, defaultUnset, ifNotExists);
+ $expr = new UpdateStatement.ParsedInsertJson(qn, attrs, val, defaultUnset, ifNotExists, stmtSrc(), isParsingTxn);
}
;
@@ -536,8 +621,9 @@ usingClauseObjective[Attributes.Raw attrs]
updateStatement returns [UpdateStatement.ParsedUpdate expr]
@init {
Attributes.Raw attrs = new Attributes.Raw();
- List> operations = new ArrayList<>();
+ UpdateStatement.OperationCollector operations = new UpdateStatement.OperationCollector();
boolean ifExists = false;
+ stmtBegins();
}
: K_UPDATE cf=columnFamilyName
( usingClause[attrs] )?
@@ -550,7 +636,9 @@ updateStatement returns [UpdateStatement.ParsedUpdate expr]
operations,
wclause.build(),
conditions == null ? Collections.emptyList() : conditions,
- ifExists);
+ ifExists,
+ isParsingTxn,
+ stmtSrc());
}
;
@@ -571,6 +659,7 @@ deleteStatement returns [DeleteStatement.Parsed expr]
Attributes.Raw attrs = new Attributes.Raw();
List columnDeletions = Collections.emptyList();
boolean ifExists = false;
+ stmtBegins();
}
: K_DELETE ( dels=deleteSelection { columnDeletions = dels; } )?
K_FROM cf=columnFamilyName
@@ -583,7 +672,9 @@ deleteStatement returns [DeleteStatement.Parsed expr]
columnDeletions,
wclause.build(),
conditions == null ? Collections.emptyList() : conditions,
- ifExists);
+ ifExists,
+ stmtSrc(),
+ isParsingTxn);
}
;
@@ -649,6 +740,102 @@ batchStatementObjective returns [ModificationStatement.Parsed statement]
| d=deleteStatement { $statement = d; }
;
+/**
+ * ex. conditional update returning pre-update values
+ *
+ * BEGIN TRANSACTION
+ * LET row1 = (SELECT * FROM WHERE k=1 AND c=2);
+ * LET row2 = (SELECT * FROM WHERE k=2 AND c=2);
+ * SELECT row1.v, row2.v;
+ * IF row1.v = 3 AND row2.v = 4 THEN
+ * UPDATE SET v = row1.v + 1 WHERE k = 1 AND c = 2;
+ * END IF
+ * COMMIT TRANSACTION
+ *
+ * ex. read-only transaction
+ *
+ * BEGIN TRANSACTION
+ * SELECT * FROM WHERE k=1 AND c=2;
+ * COMMIT TRANSACTION
+ *
+ * ex. write-only transaction
+ *
+ * BEGIN TRANSACTION
+ * INSERT INTO (k, c, v) VALUES (0, 0, 1);
+ * COMMIT TRANSACTION
+ */
+batchTxnStatement returns [TransactionStatement.Parsed expr]
+ @init {
+ isParsingTxn = true;
+ List assignments = new ArrayList<>();
+ SelectStatement.RawStatement select = null;
+ List returning = null;
+ List updates = new ArrayList<>();
+ }
+ : K_BEGIN K_TRANSACTION
+ ( let=letStatement ';' { assignments.add(let); })*
+ ( ( (selectStatement) => s=selectStatement ';' { select = s; }) | ( K_SELECT drs=rowDataReferences ';' { returning = drs; }) )?
+ ( K_IF conditions=txnConditions K_THEN { isTxnConditional = true; } )?
+ ( upd=batchStatementObjective ';' { updates.add(upd); } )*
+ ( {!isTxnConditional}? (K_COMMIT K_TRANSACTION) | {isTxnConditional}? (K_END K_IF K_COMMIT K_TRANSACTION))
+ {
+ $expr = new TransactionStatement.Parsed(assignments, select, returning, updates, conditions, references);
+ }
+ ;
+ finally { isParsingTxn = false; }
+
+rowDataReferences returns [List<RowDataReference.Raw> refs]
+ : r1=rowDataReference { refs = new ArrayList<>(); refs.add(r1); } (',' rN=rowDataReference { refs.add(rN); })*
+ ;
+
+rowDataReference returns [RowDataReference.Raw rawRef]
+ @init { Selectable.RawIdentifier tuple = null; Selectable.Raw selectable = null; }
+ @after { $rawRef = newRowDataReference(tuple, selectable); }
+ : t=sident ('.' s=referenceSelection)? { tuple = t; selectable = s; }
+ ;
+
+referenceSelection returns [Selectable.Raw s]
+ : g=referenceSelectionWithoutField m=selectorModifier[g] {$s = m;}
+ ;
+
+referenceSelectionWithoutField returns [Selectable.Raw s]
+ @init { Selectable.Raw tmp = null; }
+ @after { $s = tmp; }
+ : sn=sident { tmp=sn; }
+ | (selectionTypeHint)=> h=selectionTypeHint { tmp=h; }
+ | t=selectionTupleOrNestedSelector { tmp=t; }
+ | l=selectionList { tmp=l; }
+ | m=selectionMapOrSet { tmp=m; }
+ // UDTs are equivalent to maps from the syntax point of view, so the final decision is made in Selectable.WithMapOrUdt
+ ;
+
+txnConditions returns [List<ConditionStatement.Raw> conditions]
+ @init { conditions = new ArrayList<>(); }
+ : txnColumnCondition[conditions] ( K_AND txnColumnCondition[conditions] )*
+ ;
+
+txnConditionKind returns [ConditionStatement.Kind op]
+ : '=' { $op = ConditionStatement.Kind.EQ; }
+ | '<' { $op = ConditionStatement.Kind.LT; }
+ | '<=' { $op = ConditionStatement.Kind.LTE; }
+ | '>' { $op = ConditionStatement.Kind.GT; }
+ | '>=' { $op = ConditionStatement.Kind.GTE; }
+ | '!=' { $op = ConditionStatement.Kind.NEQ; }
+ ;
+
+txnColumnCondition[List<ConditionStatement.Raw> conditions]
+ : lhs=rowDataReference
+ (
+ K_IS
+ (
+ K_NOT K_NULL { conditions.add(new ConditionStatement.Raw(lhs, ConditionStatement.Kind.IS_NOT_NULL, null)); }
+ | K_NULL { conditions.add(new ConditionStatement.Raw(lhs, ConditionStatement.Kind.IS_NULL, null)); }
+ )
+ | (txnConditionKind term)=> op=txnConditionKind t=term { conditions.add(new ConditionStatement.Raw(lhs, op, t)); }
+ )
+ | lhs=term op=txnConditionKind rhs=rowDataReference { conditions.add(new ConditionStatement.Raw(lhs, op, rhs)); }
+ ;
+
createAggregateStatement returns [CreateAggregateStatement.Raw stmt]
@init {
boolean orReplace = false;
@@ -1690,18 +1877,18 @@ simpleTerm returns [Term.Raw term]
| K_CAST '(' t=simpleTerm K_AS n=native_type ')' { $term = FunctionCall.Raw.newCast(t, n); }
;
-columnOperation[List<Pair<ColumnIdentifier, Operation.RawUpdate>> operations]
+columnOperation[UpdateStatement.OperationCollector operations]
: key=cident columnOperationDifferentiator[operations, key]
;
-columnOperationDifferentiator[List<Pair<ColumnIdentifier, Operation.RawUpdate>> operations, ColumnIdentifier key]
+columnOperationDifferentiator[UpdateStatement.OperationCollector operations, ColumnIdentifier key]
: '=' normalColumnOperation[operations, key]
| shorthandColumnOperation[operations, key]
| '[' k=term ']' collectionColumnOperation[operations, key, k]
| '.' field=fident udtColumnOperation[operations, key, field]
;
-normalColumnOperation[List<Pair<ColumnIdentifier, Operation.RawUpdate>> operations, ColumnIdentifier key]
+normalColumnOperation[UpdateStatement.OperationCollector operations, ColumnIdentifier key]
: t=term ('+' c=cident )?
{
if (c == null)
@@ -1729,27 +1916,56 @@ normalColumnOperation[List> operatio
addRecognitionError("Only expressions of the form X = X " + ($i.text.charAt(0) == '-' ? '-' : '+') + " are supported.");
addRawUpdate(operations, key, new Operation.Addition(Constants.Literal.integer($i.text)));
}
+ | {isParsingTxn}? r=rowDataReference
+ {
+ addRawReferenceOperation(operations, key, new ReferenceOperation.Raw(new Operation.SetValue(r), key, new ReferenceValue.Substitution.Raw(r)));
+ }
;
-shorthandColumnOperation[List<Pair<ColumnIdentifier, Operation.RawUpdate>> operations, ColumnIdentifier key]
- : sig=('+=' | '-=') t=term
- {
- addRawUpdate(operations, key, $sig.text.equals("+=") ? new Operation.Addition(t) : new Operation.Substraction(t));
- }
+shorthandColumnOperation[UpdateStatement.OperationCollector operations, ColumnIdentifier key]
+ : sig=('+=' | '-=')
+ (
+ t=term
+ {
+ addRawUpdate(operations, key, $sig.text.equals("+=") ? new Operation.Addition(t) : new Operation.Substraction(t));
+ }
+ | {isParsingTxn}? dr=rowDataReference
+ {
+ ReferenceValue.Raw right = new ReferenceValue.Substitution.Raw(dr);
+ Operation.RawUpdate operation = $sig.text.equals("+=") ? new Operation.Addition(dr) : new Operation.Substraction(dr);
+ addRawReferenceOperation(operations, key, new ReferenceOperation.Raw(operation, key, right));
+ }
+ )
;
-collectionColumnOperation[List<Pair<ColumnIdentifier, Operation.RawUpdate>> operations, ColumnIdentifier key, Term.Raw k]
- : '=' t=term
- {
- addRawUpdate(operations, key, new Operation.SetElement(k, t));
- }
+collectionColumnOperation[UpdateStatement.OperationCollector operations, ColumnIdentifier key, Term.Raw k]
+ : '='
+ (
+ t=term
+ {
+ addRawUpdate(operations, key, new Operation.SetElement(k, t));
+ }
+ | {isParsingTxn}? dr=rowDataReference
+ {
+ ReferenceValue.Raw right = new ReferenceValue.Substitution.Raw(dr);
+ addRawReferenceOperation(operations, key, new ReferenceOperation.Raw(new Operation.SetElement(k, dr), key, right));
+ }
+ )
;
-udtColumnOperation[List<Pair<ColumnIdentifier, Operation.RawUpdate>> operations, ColumnIdentifier key, FieldIdentifier field]
- : '=' t=term
- {
- addRawUpdate(operations, key, new Operation.SetField(field, t));
- }
+udtColumnOperation[UpdateStatement.OperationCollector operations, ColumnIdentifier key, FieldIdentifier field]
+ : '='
+ (
+ t=term
+ {
+ addRawUpdate(operations, key, new Operation.SetField(field, t));
+ }
+ | {isParsingTxn}? dr=rowDataReference
+ {
+ ReferenceValue.Raw right = new ReferenceValue.Substitution.Raw(dr);
+ addRawReferenceOperation(operations, key, new ReferenceOperation.Raw(new Operation.SetField(field, dr), key, right));
+ }
+ )
;
columnCondition returns [ColumnCondition.Raw condition]
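To make the new grammar concrete, here is the statement shape the batchTxnStatement rule accepts, sketched as it might be embedded in Java client code; the keyspace, table and values are illustrative placeholders, not part of the patch:

    // Exercises LET bindings, row-reference conditions (IS NOT NULL, '<'),
    // and a '+=' column operation whose right-hand side is a row reference.
    String txn = "BEGIN TRANSACTION\n"
               + "  LET row1 = (SELECT v FROM ks.tbl WHERE k = 1 AND c = 2);\n"
               + "  SELECT row1.v;\n"
               + "  IF row1.v IS NOT NULL AND row1.v < 10 THEN\n"
               + "    UPDATE ks.tbl SET v += row1.v WHERE k = 1 AND c = 2;\n"
               + "  END IF\n"
               + "COMMIT TRANSACTION";

Note how the grammar forces a conditional transaction to close with END IF before COMMIT TRANSACTION, while an unconditional one commits directly.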
diff --git a/src/java/org/apache/cassandra/audit/AuditLogEntryCategory.java b/src/java/org/apache/cassandra/audit/AuditLogEntryCategory.java
index 9db4ce05e9c7..ae5efc6677a4 100644
--- a/src/java/org/apache/cassandra/audit/AuditLogEntryCategory.java
+++ b/src/java/org/apache/cassandra/audit/AuditLogEntryCategory.java
@@ -23,5 +23,5 @@
*/
public enum AuditLogEntryCategory
{
- QUERY, DML, DDL, DCL, OTHER, AUTH, ERROR, PREPARE
+ QUERY, DML, DDL, DCL, OTHER, AUTH, ERROR, PREPARE, TRANSACTION
}
diff --git a/src/java/org/apache/cassandra/audit/AuditLogEntryType.java b/src/java/org/apache/cassandra/audit/AuditLogEntryType.java
index 17d4c98feafe..e7baa6e54a1d 100644
--- a/src/java/org/apache/cassandra/audit/AuditLogEntryType.java
+++ b/src/java/org/apache/cassandra/audit/AuditLogEntryType.java
@@ -62,6 +62,7 @@ public enum AuditLogEntryType
DROP_IDENTITY(AuditLogEntryCategory.DCL),
USE_KEYSPACE(AuditLogEntryCategory.OTHER),
DESCRIBE(AuditLogEntryCategory.OTHER),
+ TRANSACTION(AuditLogEntryCategory.TRANSACTION),
/*
* Common Audit Log Entry Types
diff --git a/src/java/org/apache/cassandra/audit/AuditLogFilter.java b/src/java/org/apache/cassandra/audit/AuditLogFilter.java
index d240e78c83fc..d75e54475277 100644
--- a/src/java/org/apache/cassandra/audit/AuditLogFilter.java
+++ b/src/java/org/apache/cassandra/audit/AuditLogFilter.java
@@ -28,7 +28,7 @@ final class AuditLogFilter
{
private static final Logger logger = LoggerFactory.getLogger(AuditLogFilter.class);
- private static ImmutableSet<String> EMPTY_FILTERS = ImmutableSet.of();
+ private static final ImmutableSet<String> EMPTY_FILTERS = ImmutableSet.of();
final ImmutableSet<String> excludedKeyspaces;
final ImmutableSet<String> includedKeyspaces;
diff --git a/src/java/org/apache/cassandra/auth/CassandraRoleManager.java b/src/java/org/apache/cassandra/auth/CassandraRoleManager.java
index 3c1b52f8237a..a84ab00efbf2 100644
--- a/src/java/org/apache/cassandra/auth/CassandraRoleManager.java
+++ b/src/java/org/apache/cassandra/auth/CassandraRoleManager.java
@@ -67,7 +67,6 @@
import org.apache.cassandra.utils.NoSpamLogger;
import org.mindrot.jbcrypt.BCrypt;
-import static org.apache.cassandra.config.CassandraRelevantProperties.AUTH_BCRYPT_GENSALT_LOG2_ROUNDS;
import static org.apache.cassandra.service.QueryState.forInternalCalls;
/**
@@ -132,17 +131,6 @@ public class CassandraRoleManager implements IRoleManager
}
};
- private static final int GENSALT_LOG2_ROUNDS = getGensaltLogRounds();
-
- static int getGensaltLogRounds()
- {
- int rounds = AUTH_BCRYPT_GENSALT_LOG2_ROUNDS.getInt(10);
- if (rounds < 4 || rounds > 30)
- throw new ConfigurationException(String.format("Bad value for system property %s." +
- "Please use a value between 4 and 30 inclusively", AUTH_BCRYPT_GENSALT_LOG2_ROUNDS.getKey()));
- return rounds;
- }
-
private SelectStatement loadRoleStatement;
private SelectStatement loadIdentityStatement;
@@ -656,9 +644,11 @@ private String optionsToAssignments(Map options)
.collect(Collectors.joining(","));
}
+
+
private static String hashpw(String password)
{
- return BCrypt.hashpw(password, BCrypt.gensalt(GENSALT_LOG2_ROUNDS));
+ return BCrypt.hashpw(password, PasswordSaltSupplier.get());
}
private static String escape(String name)
diff --git a/src/java/org/apache/cassandra/auth/PasswordSaltSupplier.java b/src/java/org/apache/cassandra/auth/PasswordSaltSupplier.java
new file mode 100644
index 000000000000..9c9bd1d0f813
--- /dev/null
+++ b/src/java/org/apache/cassandra/auth/PasswordSaltSupplier.java
@@ -0,0 +1,62 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.auth;
+
+import java.util.function.Supplier;
+
+import com.google.common.annotations.VisibleForTesting;
+
+import org.apache.cassandra.exceptions.ConfigurationException;
+import org.mindrot.jbcrypt.BCrypt;
+
+import static org.apache.cassandra.config.CassandraRelevantProperties.AUTH_BCRYPT_GENSALT_LOG2_ROUNDS;
+
+public class PasswordSaltSupplier
+{
+ // 2 ** GENSALT_LOG2_ROUNDS rounds of hashing will be performed.
+ private static final int GENSALT_LOG2_ROUNDS = getGensaltLogRounds();
+
+ @VisibleForTesting
+ static int getGensaltLogRounds()
+ {
+ int rounds = AUTH_BCRYPT_GENSALT_LOG2_ROUNDS.getInt();
+ if (rounds < 4 || rounds > 30)
+ throw new ConfigurationException(String.format("Bad value for system property -D%s. " +
+ "Please use a value between 4 and 30, inclusive",
+ AUTH_BCRYPT_GENSALT_LOG2_ROUNDS.getKey()));
+ return rounds;
+ }
+ private static final Supplier<String> DEFAULT_SALT_SUPPLIER = () -> BCrypt.gensalt(GENSALT_LOG2_ROUNDS);
+ private static volatile Supplier<String> saltSupplier = DEFAULT_SALT_SUPPLIER;
+
+ public static void unsafeSet(Supplier<String> newSaltSupplier)
+ {
+ assert newSaltSupplier != null;
+ saltSupplier = newSaltSupplier;
+ }
+ public static void unsafeReset()
+ {
+ saltSupplier = DEFAULT_SALT_SUPPLIER;
+ }
+
+ public static String get()
+ {
+ return saltSupplier.get();
+ }
+}
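A hedged sketch of how the supplier hook above might be used to make hashing deterministic in a test; the class name and values here are hypothetical, and only unsafeSet/unsafeReset/get come from the patch:

    import org.mindrot.jbcrypt.BCrypt;

    public class PasswordSaltSupplierUsageSketch
    {
        public static void main(String[] args)
        {
            String fixedSalt = BCrypt.gensalt(4); // 4 is the cheapest legal cost factor
            PasswordSaltSupplier.unsafeSet(() -> fixedSalt);
            try
            {
                // A fixed salt makes the same password hash identically,
                // so auth tests become cheap and reproducible.
                String h1 = BCrypt.hashpw("secret", PasswordSaltSupplier.get());
                String h2 = BCrypt.hashpw("secret", PasswordSaltSupplier.get());
                assert h1.equals(h2);
            }
            finally
            {
                PasswordSaltSupplier.unsafeReset(); // restore the default supplier
            }
        }
    }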
diff --git a/src/java/org/apache/cassandra/batchlog/BatchlogManager.java b/src/java/org/apache/cassandra/batchlog/BatchlogManager.java
index b5b33d5e0b34..4c19d1c0105c 100644
--- a/src/java/org/apache/cassandra/batchlog/BatchlogManager.java
+++ b/src/java/org/apache/cassandra/batchlog/BatchlogManager.java
@@ -27,25 +27,24 @@
import java.util.Set;
import java.util.UUID;
import java.util.concurrent.ConcurrentHashMap;
-import java.util.concurrent.ExecutionException;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
+import java.util.concurrent.atomic.AtomicBoolean;
import java.util.function.Supplier;
+import javax.annotation.Nullable;
import com.google.common.annotations.VisibleForTesting;
+import com.google.common.collect.ImmutableList;
import com.google.common.collect.Iterables;
import com.google.common.util.concurrent.RateLimiter;
-import org.apache.cassandra.concurrent.ScheduledExecutorPlus;
-import org.apache.cassandra.schema.KeyspaceMetadata;
-import org.apache.cassandra.tcm.ClusterMetadata;
-import org.apache.cassandra.transport.Dispatcher;
-import org.apache.cassandra.utils.TimeUUID;
-import org.apache.cassandra.utils.concurrent.Future;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+import org.apache.cassandra.concurrent.ScheduledExecutorPlus;
+import org.apache.cassandra.config.CassandraRelevantProperties;
import org.apache.cassandra.config.DatabaseDescriptor;
import org.apache.cassandra.cql3.UntypedResultSet;
+import org.apache.cassandra.cql3.UntypedResultSet.Row;
import org.apache.cassandra.db.ColumnFamilyStore;
import org.apache.cassandra.db.ConsistencyLevel;
import org.apache.cassandra.db.Keyspace;
@@ -55,6 +54,7 @@
import org.apache.cassandra.db.marshal.BytesType;
import org.apache.cassandra.db.partitions.PartitionUpdate;
import org.apache.cassandra.dht.Token;
+import org.apache.cassandra.exceptions.RetryOnDifferentSystemException;
import org.apache.cassandra.exceptions.WriteFailureException;
import org.apache.cassandra.exceptions.WriteTimeoutException;
import org.apache.cassandra.gms.FailureDetector;
@@ -70,26 +70,39 @@
import org.apache.cassandra.net.Message;
import org.apache.cassandra.net.MessageFlag;
import org.apache.cassandra.net.MessagingService;
+import org.apache.cassandra.schema.KeyspaceMetadata;
import org.apache.cassandra.schema.SchemaConstants;
import org.apache.cassandra.schema.TableId;
import org.apache.cassandra.service.StorageService;
import org.apache.cassandra.service.WriteResponseHandler;
+import org.apache.cassandra.service.accord.IAccordService.IAccordResult;
+import org.apache.cassandra.service.accord.txn.TxnResult;
+import org.apache.cassandra.service.consensus.migration.ConsensusMigrationMutationHelper;
+import org.apache.cassandra.service.consensus.migration.ConsensusMigrationMutationHelper.SplitMutations;
+import org.apache.cassandra.tcm.ClusterMetadata;
+import org.apache.cassandra.transport.Dispatcher;
+import org.apache.cassandra.utils.Clock;
import org.apache.cassandra.utils.ExecutorUtils;
import org.apache.cassandra.utils.FBUtilities;
import org.apache.cassandra.utils.MBeanWrapper;
+import org.apache.cassandra.utils.Throwables;
+import org.apache.cassandra.utils.TimeUUID;
+import org.apache.cassandra.utils.concurrent.Future;
import static java.util.concurrent.TimeUnit.MILLISECONDS;
import static org.apache.cassandra.concurrent.ExecutorFactory.Global.executorFactory;
import static org.apache.cassandra.config.CassandraRelevantProperties.BATCHLOG_REPLAY_TIMEOUT_IN_MS;
import static org.apache.cassandra.cql3.QueryProcessor.executeInternal;
import static org.apache.cassandra.cql3.QueryProcessor.executeInternalWithPaging;
+import static org.apache.cassandra.hints.HintsService.RETRY_ON_DIFFERENT_SYSTEM_UUID;
import static org.apache.cassandra.net.Verb.MUTATION_REQ;
+import static org.apache.cassandra.service.accord.txn.TxnResult.Kind.retry_new_protocol;
+import static org.apache.cassandra.service.consensus.migration.ConsensusMigrationMutationHelper.mutateWithAccordAsync;
import static org.apache.cassandra.utils.Clock.Global.currentTimeMillis;
public class BatchlogManager implements BatchlogManagerMBean
{
public static final String MBEAN_NAME = "org.apache.cassandra.db:type=BatchlogManager";
- private static final long REPLAY_INTERVAL = 10 * 1000; // milliseconds
static final int DEFAULT_PAGE_SIZE = 128;
private static final Logger logger = LoggerFactory.getLogger(BatchlogManager.class);
@@ -104,6 +117,8 @@ public class BatchlogManager implements BatchlogManagerMBean
private final RateLimiter rateLimiter = RateLimiter.create(Double.MAX_VALUE);
+ private final AtomicBoolean isBatchlogReplayPaused = new AtomicBoolean(false);
+
public BatchlogManager()
{
batchlogTasks = executorFactory().scheduled(false, "BatchlogTasks");
@@ -115,7 +130,7 @@ public void start()
batchlogTasks.scheduleWithFixedDelay(this::replayFailedBatches,
StorageService.RING_DELAY_MILLIS,
- REPLAY_INTERVAL,
+ CassandraRelevantProperties.BATCHLOG_REPLAY_INTERVAL_MS.getLong(),
MILLISECONDS);
}
@@ -184,7 +199,9 @@ public long getTotalBatchesReplayed()
public void forceBatchlogReplay() throws Exception
{
+ logger.debug("Forcing batchlog replay");
startBatchlogReplay().get();
+ logger.debug("Finished forcing batchlog replay");
}
public Future<?> startBatchlogReplay()
@@ -193,14 +210,25 @@ public Future> startBatchlogReplay()
return batchlogTasks.submit(this::replayFailedBatches);
}
- void performInitialReplay() throws InterruptedException, ExecutionException
+ public void pauseReplay()
+ {
+ logger.debug("Paused batchlog replay");
+ isBatchlogReplayPaused.set(true);
+ }
+
+ public void resumeReplay()
{
- // Invokes initial replay. Used for testing only.
- batchlogTasks.submit(this::replayFailedBatches).get();
+ logger.debug("Resumed batchlog replay");
+ isBatchlogReplayPaused.set(false);
}
private void replayFailedBatches()
{
+ if (isBatchlogReplayPaused.get())
+ {
+ logger.debug("Batch log replay is paused, skipping replay");
+ return;
+ }
logger.trace("Started replayFailedBatches");
// rate limit is in bytes per second. Uses Double.MAX_VALUE if disabled (set to 0 in cassandra.yaml).
@@ -223,6 +251,7 @@ private void replayFailedBatches()
SchemaConstants.SYSTEM_KEYSPACE_NAME,
SystemKeyspace.BATCHES);
UntypedResultSet batches = executeInternalWithPaging(query, pageSize, lastReplayedUuid, limitUuid);
+
processBatchlogEntries(batches, pageSize, rateLimiter);
lastReplayedUuid = limitUuid;
logger.trace("Finished replayFailedBatches");
@@ -276,16 +305,7 @@ private void processBatchlogEntries(UntypedResultSet batches, int pageSize, Rate
int version = row.getInt("version");
try
{
- ReplayingBatch batch = new ReplayingBatch(id, version, row.getList("mutations", BytesType.instance));
- if (batch.replay(rateLimiter, hintedNodes) > 0)
- {
- unfinishedBatches.add(batch);
- }
- else
- {
- remove(id); // no write mutations were sent (either expired or all CFs involved truncated).
- ++totalBatchesReplayed;
- }
+ dispatchBatch(rateLimiter, row, id, version, hintedNodes, unfinishedBatches);
}
catch (IOException e)
{
@@ -307,6 +327,8 @@ private void processBatchlogEntries(UntypedResultSet batches, int pageSize, Rate
// finalize the incomplete last page of batches
if (positionInPage > 0)
finishAndClearBatches(unfinishedBatches, hintedNodes, replayedBatches);
+ else
+ logger.trace("Had no batches to replay");
if (caughtException != null)
logger.warn(String.format("Encountered %d unexpected exceptions while sending out batches", skipped), caughtException);
@@ -318,6 +340,35 @@ private void processBatchlogEntries(UntypedResultSet batches, int pageSize, Rate
replayedBatches.forEach(BatchlogManager::remove);
}
+ private void dispatchBatch(RateLimiter rateLimiter, Row row, TimeUUID id, int version, Set<InetAddressAndPort> hintedNodes, ArrayList<ReplayingBatch> unfinishedBatches) throws IOException
+ {
+ while (true)
+ {
+ ClusterMetadata cm = ClusterMetadata.current();
+ try
+ {
+ ReplayingBatch batch = new ReplayingBatch(id, version, row.getList("mutations", BytesType.instance), cm);
+ if (batch.replay(rateLimiter, hintedNodes))
+ {
+ unfinishedBatches.add(batch);
+ }
+ else
+ {
+ remove(id); // no write mutations were sent (either expired or all CFs involved truncated).
+ ++totalBatchesReplayed;
+ }
+ }
+ catch (RetryOnDifferentSystemException e)
+ {
+ // Self-apply can throw RetryOnDifferentSystemException. Barring bugs, we should
+ // already have the latest cluster metadata needed to correctly split the batch,
+ // since that metadata is what generated the exception, so retry immediately.
+ continue;
+ }
+ break;
+ }
+ }
+
private void finishAndClearBatches(ArrayList<ReplayingBatch> batches, Set<InetAddressAndPort> hintedNodes, Set<TimeUUID> replayedBatches)
{
// schedule hints for timed out deliveries
@@ -340,61 +391,111 @@ private static class ReplayingBatch
{
private final TimeUUID id;
private final long writtenAt;
- private final List<Mutation> mutations;
+ private final int unsplitGcGs;
+ private final List<Mutation> normalMutations;
+ private final List<Mutation> accordMutations;
private final int replayedBytes;
+ private final ClusterMetadata cm;
- private List<ReplayWriteResponseHandler<Mutation>> replayHandlers;
+ private List<ReplayWriteResponseHandler<Mutation>> replayHandlers = ImmutableList.of();
+ private IAccordResult<TxnResult> accordResult;
+ @Nullable
+ private Dispatcher.RequestTime accordTxnStart;
- ReplayingBatch(TimeUUID id, int version, List<ByteBuffer> serializedMutations) throws IOException
+ ReplayingBatch(TimeUUID id, int version, List<ByteBuffer> serializedMutations, ClusterMetadata cm) throws IOException
{
this.id = id;
this.writtenAt = id.unix(MILLISECONDS);
- this.mutations = new ArrayList<>(serializedMutations.size());
- this.replayedBytes = addMutations(version, serializedMutations);
+ List<Mutation> unsplitMutations = new ArrayList<>(serializedMutations.size());
+ this.replayedBytes = addMutations(unsplitMutations, writtenAt, version, serializedMutations);
+ unsplitGcGs = gcgs(unsplitMutations);
+ SplitMutations splitMutations = ConsensusMigrationMutationHelper.splitMutationsIntoAccordAndNormal(cm, unsplitMutations);
+ logger.trace("Replaying batch with Accord {} and normal {}", splitMutations.accordMutations(), splitMutations.normalMutations());
+ normalMutations = splitMutations.normalMutations();
+ accordMutations = splitMutations.accordMutations();
+ if (accordMutations != null)
+ accordTxnStart = new Dispatcher.RequestTime(Clock.Global.nanoTime());
+ this.cm = cm;
}
- public int replay(RateLimiter rateLimiter, Set<InetAddressAndPort> hintedNodes) throws IOException
+ public boolean replay(RateLimiter rateLimiter, Set<InetAddressAndPort> hintedNodes) throws IOException
{
logger.trace("Replaying batch {}", id);
- if (mutations.isEmpty())
- return 0;
+ if ((normalMutations == null || normalMutations.isEmpty()) && (accordMutations == null || accordMutations.isEmpty()))
+ return false;
- int gcgs = gcgs(mutations);
- if (MILLISECONDS.toSeconds(writtenAt) + gcgs <= FBUtilities.nowInSeconds())
- return 0;
+ if (MILLISECONDS.toSeconds(writtenAt) + unsplitGcGs <= FBUtilities.nowInSeconds())
+ return false;
- replayHandlers = sendReplays(mutations, writtenAt, hintedNodes);
+ if (accordMutations != null)
+ {
+ accordTxnStart = accordTxnStart.withStartedAt(Clock.Global.nanoTime());
+ accordResult = mutateWithAccordAsync(cm, accordMutations, null, accordTxnStart);
+ }
+
+ if (normalMutations != null)
+ replayHandlers = sendReplays(normalMutations, writtenAt, hintedNodes);
rateLimiter.acquire(replayedBytes); // acquire afterwards, to not mess up ttl calculation.
- return replayHandlers.size();
+ return replayHandlers.size() > 0 || accordMutations != null;
}
public void finish(Set<InetAddressAndPort> hintedNodes)
{
- for (int i = 0; i < replayHandlers.size(); i++)
+ Throwable failure = null;
+ // Check if the Accord mutations succeeded asynchronously
+ try
{
- ReplayWriteResponseHandler handler = replayHandlers.get(i);
- try
+ if (accordResult != null)
{
- handler.get();
+ TxnResult.Kind kind = accordResult.awaitAndGet().kind();
+ if (kind == retry_new_protocol)
+ throw new RetryOnDifferentSystemException();
}
- catch (WriteTimeoutException|WriteFailureException e)
+ }
+ catch (WriteTimeoutException | WriteFailureException | RetryOnDifferentSystemException e)
+ {
+ logger.trace("Failed replaying a batched mutation on Accord, will write a hint");
+ logger.trace("Failure was : {}", e.getMessage());
+ writeHintsForUndeliveredAccordTxns(hintedNodes);
+ }
+ catch (Exception e)
+ {
+ failure = Throwables.merge(failure, e);
+ }
+
+ try
+ {
+ for (int i = 0; i < replayHandlers.size(); i++)
{
- if (logger.isTraceEnabled())
+ ReplayWriteResponseHandler handler = replayHandlers.get(i);
+ try
+ {
+ handler.get();
+ }
+ catch (WriteTimeoutException|WriteFailureException|RetryOnDifferentSystemException e)
{
logger.trace("Failed replaying a batched mutation to a node, will write a hint");
logger.trace("Failure was : {}", e.getMessage());
+ // writing hints for the rest to hints, starting from i
+ writeHintsForUndeliveredEndpoints(i, hintedNodes);
+ break;
}
- // writing hints for the rest to hints, starting from i
- writeHintsForUndeliveredEndpoints(i, hintedNodes);
- return;
}
}
+ catch (Exception e)
+ {
+ logger.debug("Unexpected batchlog replay exception", e);
+ failure = Throwables.merge(failure, e);
+ }
+
+ if (failure != null)
+ throw Throwables.unchecked(failure);
}
- private int addMutations(int version, List<ByteBuffer> serializedMutations) throws IOException
+ private static int addMutations(List<Mutation> unsplitMutations, long writtenAt, int version, List<ByteBuffer> serializedMutations) throws IOException
{
int ret = 0;
for (ByteBuffer serializedMutation : serializedMutations)
@@ -402,7 +503,7 @@ private int addMutations(int version, List serializedMutations) thro
ret += serializedMutation.remaining();
try (DataInputBuffer in = new DataInputBuffer(serializedMutation, true))
{
- addMutation(Mutation.serializer.deserialize(in, version));
+ addMutation(unsplitMutations, writtenAt, Mutation.serializer.deserialize(in, version));
}
}
@@ -412,19 +513,41 @@ private int addMutations(int version, List serializedMutations) thro
// Remove CFs that have been truncated since. writtenAt and SystemTable#getTruncatedAt() both return millis.
// We don't abort the replay entirely b/c this can be considered a success (truncated is same as delivered then
// truncated.
- private void addMutation(Mutation mutation)
+ private static void addMutation(List<Mutation> unsplitMutations, long writtenAt, Mutation mutation)
{
for (TableId tableId : mutation.getTableIds())
if (writtenAt <= SystemKeyspace.getTruncatedAt(tableId))
mutation = mutation.without(tableId);
- if (!mutation.isEmpty())
- mutations.add(mutation);
+ if (mutation != null)
+ unsplitMutations.add(mutation);
+ }
+
+ // Write the hint assuming that when it is replayed it will probably be replayed
+ // as an Accord transaction, so there is no reason to record per-endpoint hints for
+ // all the endpoints. Hints will still have to be split and re-routed on replay.
+ private void writeHintsForUndeliveredAccordTxns(Set<InetAddressAndPort> hintedNodes)
+ {
+ if (accordMutations == null)
+ return;
+
+ int gcgs = gcgs(accordMutations);
+
+ // expired
+ if (MILLISECONDS.toSeconds(writtenAt) + gcgs <= FBUtilities.nowInSeconds())
+ return;
+
+ for (Mutation m : accordMutations)
+ HintsService.instance.write(ImmutableList.of(RETRY_ON_DIFFERENT_SYSTEM_UUID), Hint.create(m, writtenAt));
+ hintedNodes.add(RETRY_ON_DIFFERENT_SYSTEM_UUID);
}
private void writeHintsForUndeliveredEndpoints(int startFrom, Set<InetAddressAndPort> hintedNodes)
{
- int gcgs = gcgs(mutations);
+ if (normalMutations == null)
+ return;
+
+ int gcgs = gcgs(normalMutations);
// expired
if (MILLISECONDS.toSeconds(writtenAt) + gcgs <= FBUtilities.nowInSeconds())
@@ -434,7 +557,7 @@ private void writeHintsForUndeliveredEndpoints(int startFrom, Set hintedNo
for (int i = startFrom; i < replayHandlers.size(); i++)
{
ReplayWriteResponseHandler handler = replayHandlers.get(i);
- Mutation undeliveredMutation = mutations.get(i);
+ Mutation undeliveredMutation = normalMutations.get(i);
if (handler != null)
{
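The pause/resume hooks added above let work that must not interleave with batchlog replay fence it off. A minimal sketch, assuming the usual BatchlogManager.instance singleton:

    BatchlogManager.instance.pauseReplay();   // subsequent replayFailedBatches() runs no-op
    try
    {
        // ... e.g. a consensus-migration step that must not race with replay ...
    }
    finally
    {
        BatchlogManager.instance.resumeReplay();
    }

Note that pausing only causes future scheduled runs to skip; a replay already in flight is not interrupted.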
diff --git a/src/java/org/apache/cassandra/concurrent/InfiniteLoopExecutor.java b/src/java/org/apache/cassandra/concurrent/InfiniteLoopExecutor.java
index ac10a70c3066..c9487e416458 100644
--- a/src/java/org/apache/cassandra/concurrent/InfiniteLoopExecutor.java
+++ b/src/java/org/apache/cassandra/concurrent/InfiniteLoopExecutor.java
@@ -52,6 +52,11 @@ public enum InternalState { SHUTTING_DOWN_NOW, TERMINATED }
@Shared(scope = Shared.Scope.SIMULATION)
public enum SimulatorSafe { SAFE, UNSAFE }
+ /**
+ * Does this loop always block on some external work provision that is going to be simulator-controlled, or does
+ * it loop periodically? If the latter, it may prevent the simulation from making progress between phases, and
+ * should be marked as a DAEMON process.
+ */
@Shared(scope = Shared.Scope.SIMULATION)
public enum Daemon { DAEMON, NON_DAEMON }
@@ -103,7 +108,6 @@ private static Consumer interruptHandler(final Object monitor)
};
}
-
private void loop()
{
boolean interrupted = false;
@@ -189,6 +193,11 @@ public boolean awaitTermination(long time, TimeUnit unit) throws InterruptedExce
return isTerminated();
}
+ public long threadId()
+ {
+ return thread.getId();
+ }
+
@VisibleForTesting
public boolean isAlive()
{
diff --git a/src/java/org/apache/cassandra/net/ManyToOneConcurrentLinkedQueue.java b/src/java/org/apache/cassandra/concurrent/ManyToOneConcurrentLinkedQueue.java
similarity index 97%
rename from src/java/org/apache/cassandra/net/ManyToOneConcurrentLinkedQueue.java
rename to src/java/org/apache/cassandra/concurrent/ManyToOneConcurrentLinkedQueue.java
index 4c73bdc9cd2e..8615e99c22f8 100644
--- a/src/java/org/apache/cassandra/net/ManyToOneConcurrentLinkedQueue.java
+++ b/src/java/org/apache/cassandra/concurrent/ManyToOneConcurrentLinkedQueue.java
@@ -15,7 +15,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package org.apache.cassandra.net;
+package org.apache.cassandra.concurrent;
import java.util.Collection;
import java.util.Iterator;
@@ -37,12 +37,12 @@
* In addition to that, provides a {@link #relaxedPeekLastAndOffer(Object)} method that we use to avoid a CAS when
* putting message handlers onto the wait queue.
*/
-class ManyToOneConcurrentLinkedQueue<E> extends ManyToOneConcurrentLinkedQueueHead<E> implements Queue<E>
+public class ManyToOneConcurrentLinkedQueue<E> extends ManyToOneConcurrentLinkedQueueHead<E> implements Queue<E>
{
@SuppressWarnings("unused") // pad two cache lines after the head to prevent false sharing
protected long p31, p32, p33, p34, p35, p36, p37, p38, p39, p40, p41, p42, p43, p44, p45;
- ManyToOneConcurrentLinkedQueue()
+ public ManyToOneConcurrentLinkedQueue()
{
head = tail = new Node<>(null);
}
@@ -63,7 +63,7 @@ public boolean isEmpty()
* - {@code false} result indicates that the queue MIGHT BE non-empty - the value of {@code head} might
* not yet have been made externally visible by the consumer thread.
*/
- boolean relaxedIsEmpty()
+ public boolean relaxedIsEmpty()
{
return null == head.next;
}
@@ -156,7 +156,7 @@ public boolean remove(Object o)
* Yields no performance benefit over invoking {@link #poll()} manually - there just isn't
* anything to meaningfully amortise on the consumer side of this queue.
*/
- void drain(Consumer<E> consumer)
+ public void drain(Consumer<E> consumer)
{
E item;
while ((item = poll()) != null)
@@ -181,7 +181,7 @@ public boolean offer(E e)
*
* @return previously last tail item in the queue, potentially stale
*/
- E relaxedPeekLastAndOffer(E e)
+ public E relaxedPeekLastAndOffer(E e)
{
return internalOffer(e);
}
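With the queue now public, any multi-producer/single-consumer component can use it directly. A small sketch of the intended usage contract (names are illustrative):

    // Any number of threads may offer(); only a single, dedicated consumer
    // thread may poll(), peek() or drain().
    ManyToOneConcurrentLinkedQueue<Runnable> queue = new ManyToOneConcurrentLinkedQueue<>();

    // producer side, from any thread
    queue.offer(() -> System.out.println("work item"));

    // consumer side, exactly one thread
    if (!queue.relaxedIsEmpty())     // cheap, possibly stale emptiness check
        queue.drain(Runnable::run);  // poll and consume until empty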
diff --git a/src/java/org/apache/cassandra/concurrent/Shutdownable.java b/src/java/org/apache/cassandra/concurrent/Shutdownable.java
index 185875b791d2..a72253fc87e9 100644
--- a/src/java/org/apache/cassandra/concurrent/Shutdownable.java
+++ b/src/java/org/apache/cassandra/concurrent/Shutdownable.java
@@ -19,7 +19,9 @@
package org.apache.cassandra.concurrent;
import java.util.concurrent.TimeUnit;
+import java.util.concurrent.TimeoutException;
+import org.apache.cassandra.utils.ExecutorUtils;
import org.apache.cassandra.utils.Shared;
import static org.apache.cassandra.utils.Shared.Scope.SIMULATION;
@@ -29,6 +31,11 @@ public interface Shutdownable
{
boolean isTerminated();
+ default boolean isShutdown()
+ {
+ return isTerminated();
+ }
+
/**
* Shutdown once any remaining work has completed (however this is defined for the implementation).
*/
@@ -42,5 +49,10 @@ public interface Shutdownable
/**
* Await termination of this object, i.e. the cessation of all current and future work.
*/
- public boolean awaitTermination(long timeout, TimeUnit units) throws InterruptedException;
+ boolean awaitTermination(long timeout, TimeUnit units) throws InterruptedException;
+
+ default void shutdownAndWait(long timeout, TimeUnit unit) throws InterruptedException, TimeoutException
+ {
+ ExecutorUtils.shutdownAndWait(timeout, unit, this);
+ }
}
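With the new defaults, implementations only provide the primitives and callers get a combined stop-and-await for free. A hedged usage sketch (the timeout is chosen arbitrarily):

    import java.util.concurrent.TimeUnit;
    import java.util.concurrent.TimeoutException;

    void stop(Shutdownable task) throws InterruptedException, TimeoutException
    {
        // Delegates to ExecutorUtils.shutdownAndWait: shutdown() followed by
        // awaitTermination(), failing with TimeoutException if it does not finish.
        task.shutdownAndWait(30, TimeUnit.SECONDS);
    }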
diff --git a/src/java/org/apache/cassandra/concurrent/SingleThreadExecutorPlus.java b/src/java/org/apache/cassandra/concurrent/SingleThreadExecutorPlus.java
index eb2827774a59..553855ad7bc5 100644
--- a/src/java/org/apache/cassandra/concurrent/SingleThreadExecutorPlus.java
+++ b/src/java/org/apache/cassandra/concurrent/SingleThreadExecutorPlus.java
@@ -25,6 +25,7 @@ public class SingleThreadExecutorPlus extends ThreadPoolExecutorPlus implements
{
public static class AtLeastOnce extends AtomicBoolean implements AtLeastOnceTrigger, Runnable
{
+ private static final long serialVersionUID = 0; // for simulator support
protected final SequentialExecutorPlus executor;
protected final Runnable run;
diff --git a/src/java/org/apache/cassandra/concurrent/Stage.java b/src/java/org/apache/cassandra/concurrent/Stage.java
index 4def1774229a..7b2881f53500 100644
--- a/src/java/org/apache/cassandra/concurrent/Stage.java
+++ b/src/java/org/apache/cassandra/concurrent/Stage.java
@@ -42,23 +42,23 @@
public enum Stage
{
- READ (false, "ReadStage", "request", DatabaseDescriptor::getConcurrentReaders, DatabaseDescriptor::setConcurrentReaders, Stage::multiThreadedLowSignalStage),
- MUTATION (true, "MutationStage", "request", DatabaseDescriptor::getConcurrentWriters, DatabaseDescriptor::setConcurrentWriters, Stage::multiThreadedLowSignalStage),
- COUNTER_MUTATION (true, "CounterMutationStage", "request", DatabaseDescriptor::getConcurrentCounterWriters, DatabaseDescriptor::setConcurrentCounterWriters, Stage::multiThreadedLowSignalStage),
- VIEW_MUTATION (true, "ViewMutationStage", "request", DatabaseDescriptor::getConcurrentViewWriters, DatabaseDescriptor::setConcurrentViewWriters, Stage::multiThreadedLowSignalStage),
- GOSSIP (true, "GossipStage", "internal", () -> 1, null, Stage::singleThreadedStage),
- REQUEST_RESPONSE (false, "RequestResponseStage", "request", FBUtilities::getAvailableProcessors, null, Stage::multiThreadedLowSignalStage),
- ANTI_ENTROPY (false, "AntiEntropyStage", "internal", () -> 1, null, Stage::singleThreadedStage),
- MIGRATION (false, "MigrationStage", "internal", () -> 1, null, Stage::migrationStage),
- MISC (false, "MiscStage", "internal", () -> 1, null, Stage::singleThreadedStage),
- TRACING (false, "TracingStage", "internal", () -> 1, null, Stage::tracingStage),
- INTERNAL_RESPONSE (false, "InternalResponseStage", "internal", FBUtilities::getAvailableProcessors, null, Stage::multiThreadedStage),
- IMMEDIATE (false, "ImmediateStage", "internal", () -> 0, null, Stage::immediateExecutor),
- PAXOS_REPAIR (false, "PaxosRepairStage", "internal", FBUtilities::getAvailableProcessors, null, Stage::multiThreadedStage),
- INTERNAL_METADATA (false, "InternalMetadataStage", "internal", FBUtilities::getAvailableProcessors, null, Stage::multiThreadedStage),
- FETCH_LOG (false, "MetadataFetchLogStage", "internal", () -> 1, null, Stage::singleThreadedStage)
+ READ (false, "ReadStage", "request", DatabaseDescriptor::getConcurrentReaders, DatabaseDescriptor::setConcurrentReaders, Stage::multiThreadedLowSignalStage),
+ MUTATION (true, "MutationStage", "request", DatabaseDescriptor::getConcurrentWriters, DatabaseDescriptor::setConcurrentWriters, Stage::multiThreadedLowSignalStage),
+ COUNTER_MUTATION (true, "CounterMutationStage", "request", DatabaseDescriptor::getConcurrentCounterWriters, DatabaseDescriptor::setConcurrentCounterWriters, Stage::multiThreadedLowSignalStage),
+ VIEW_MUTATION (true, "ViewMutationStage", "request", DatabaseDescriptor::getConcurrentViewWriters, DatabaseDescriptor::setConcurrentViewWriters, Stage::multiThreadedLowSignalStage),
+ ACCORD_MIGRATION (false, "AccordMigrationStage", "request", DatabaseDescriptor::getAccordConcurrentOps, DatabaseDescriptor::setConcurrentAccordOps, Stage::multiThreadedLowSignalStage),
+ GOSSIP (true, "GossipStage", "internal", () -> 1, null, Stage::singleThreadedStage),
+ REQUEST_RESPONSE (false, "RequestResponseStage", "request", FBUtilities::getAvailableProcessors, null, Stage::multiThreadedLowSignalStage),
+ ANTI_ENTROPY (false, "AntiEntropyStage", "internal", () -> 1, null, Stage::singleThreadedStage),
+ MIGRATION (false, "MigrationStage", "internal", () -> 1, null, Stage::migrationStage),
+ MISC (false, "MiscStage", "internal", () -> 1, null, Stage::singleThreadedStage),
+ TRACING (false, "TracingStage", "internal", () -> 1, null, Stage::tracingStage),
+ INTERNAL_RESPONSE (false, "InternalResponseStage", "internal", FBUtilities::getAvailableProcessors, null, Stage::multiThreadedStage),
+ IMMEDIATE (false, "ImmediateStage", "internal", () -> 0, null, Stage::immediateExecutor),
+ PAXOS_REPAIR (false, "PaxosRepairStage", "internal", FBUtilities::getAvailableProcessors, null, Stage::multiThreadedStage),
+ INTERNAL_METADATA (false, "InternalMetadataStage", "internal", FBUtilities::getAvailableProcessors, null, Stage::multiThreadedStage),
+ FETCH_METADATA (false, "MetadataFetchLogStage", "internal", () -> 1, null, Stage::singleThreadedStage),
;
-
public final String jmxName;
private final Supplier<ExecutorPlus> executorSupplier;
private volatile ExecutorPlus executor;
diff --git a/src/java/org/apache/cassandra/concurrent/SyncFutureTask.java b/src/java/org/apache/cassandra/concurrent/SyncFutureTask.java
index 422da99fb806..8176913de7e1 100644
--- a/src/java/org/apache/cassandra/concurrent/SyncFutureTask.java
+++ b/src/java/org/apache/cassandra/concurrent/SyncFutureTask.java
@@ -71,7 +71,11 @@ public void run()
catch (Throwable t)
{
tryFailure(t);
- ExecutionFailure.handle(t);
+ // Many exceptions are expected and will be handled by Cassandra when it
+ // consumes the result of the future task, so only treat Error
+ // as uncaught
+ if (t instanceof Error)
+ ExecutionFailure.handle(t);
}
}
diff --git a/src/java/org/apache/cassandra/config/AccordSpec.java b/src/java/org/apache/cassandra/config/AccordSpec.java
new file mode 100644
index 000000000000..b81792df0c93
--- /dev/null
+++ b/src/java/org/apache/cassandra/config/AccordSpec.java
@@ -0,0 +1,273 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.config;
+
+import java.util.concurrent.TimeUnit;
+
+import accord.utils.Invariants;
+import com.fasterxml.jackson.annotation.JsonIgnore;
+import org.apache.cassandra.journal.Params;
+import org.apache.cassandra.service.accord.serializers.Version;
+import org.apache.cassandra.service.consensus.TransactionalMode;
+
+import static org.apache.cassandra.config.AccordSpec.QueueShardModel.THREAD_POOL_PER_SHARD;
+import static org.apache.cassandra.config.AccordSpec.QueueSubmissionModel.SYNC;
+
+public class AccordSpec
+{
+ public volatile boolean enabled = false;
+
+ public volatile String journal_directory;
+
+ public volatile boolean enable_journal_compaction = true;
+
+ /**
+ * Enables the virtual Accord debug-only keyspace with tables
+ * that expose internal state to aid the developers working
+ * on Accord implementation.
+ *
+ * These tables can and will change and/or go away at any point,
+ * including in a minor release, are not to be considered part of the API,
+ * and are NOT to be relied on for anything.
+ *
+ * Only enable this keyspace if you are working on Accord and
+ * need to debug an issue with the Accord implementation, or if an Accord
+ * developer asked you to.
+ */
+ public boolean enable_virtual_debug_only_keyspace = false;
+
+ public enum QueueShardModel
+ {
+ /**
+ * Same number of threads as queue shards, but the shard lock is held only while managing the queue,
+ * so that submitting threads may queue load/save work.
+ *
+ * The global READ and WRITE stages are used for IO.
+ */
+ THREAD_PER_SHARD,
+
+ /**
+ * Same number of threads as shards, and the shard lock is held for the duration of serving requests.
+ * The global READ and WRITE stages are used for IO.
+ */
+ THREAD_PER_SHARD_SYNC_QUEUE,
+
+ /**
+ * More threads than shards. Threads update transaction state as well as performing IO, minimising context switching.
+ * Fewer shards is generally better, until queue-contention is encountered.
+ */
+ THREAD_POOL_PER_SHARD,
+
+ /**
+ * More threads than shards. Threads update transaction state only, relying on READ and WRITE stages for IO.
+ * Fewer shards is generally better, until queue-contention is encountered.
+ */
+ THREAD_POOL_PER_SHARD_EXCLUDES_IO,
+ }
+
+ public enum QueueSubmissionModel
+ {
+ /**
+ * The queue workers and all submissions require ownership of the lock.
+ */
+ SYNC,
+
+ /**
+ * The queue workers and some submissions require ownership of the lock.
+ * That is, if the lock is available on submission we take it; if it is not we try to guarantee that
+ * another thread will witness the work submission promptly, but if we cannot we wait for the lock
+ * to ensure work is scheduled.
+ */
+ SEMI_SYNC,
+
+ /**
+ * The queue workers only require ownership of the lock; submission happens fully asynchronously.
+ */
+ ASYNC,
+
+ /**
+ * The queue is backed by submission to a single-threaded plain executor.
+ * This implementation does not honur the sharding model option.
+ *
+ * Note: this isn't intended to be used by real clusters.
+ */
+ EXEC_ST
+ }
+
+ public QueueShardModel queue_shard_model = THREAD_POOL_PER_SHARD;
+ public QueueSubmissionModel queue_submission_model = SYNC;
+
+ /**
+ * The number of queue (and cache) shards.
+ */
+ public volatile OptionaldPositiveInt queue_shard_count = OptionaldPositiveInt.UNDEFINED;
+
+ /**
+ * The target number of command stores to create per topology shard.
+ * This determines the amount of execution parallelism possible for a given table/shard on the host.
+ * More shards means more parallelism, but more state.
+ *
+ * TODO (expected): make this a table property
+ * TODO (expected): adjust this by proportion of ring
+ */
+ public volatile OptionaldPositiveInt command_store_shard_count = OptionaldPositiveInt.UNDEFINED;
+
+ public volatile OptionaldPositiveInt max_queued_loads = OptionaldPositiveInt.UNDEFINED;
+ public volatile OptionaldPositiveInt max_queued_range_loads = OptionaldPositiveInt.UNDEFINED;
+
+ public DataStorageSpec.LongMebibytesBound cache_size = null;
+ public DataStorageSpec.LongMebibytesBound working_set_size = null;
+ public boolean shrink_cache_entries_before_eviction = true;
+
+ public DurationSpec.IntMillisecondsBound range_syncpoint_timeout = new DurationSpec.IntMillisecondsBound("3m");
+ public DurationSpec.IntMillisecondsBound repair_timeout = new DurationSpec.IntMillisecondsBound("10m");
+ public String recover_txn = "5s*attempts <= 60s";
+ public StringRetryStrategy recover_syncpoint = new StringRetryStrategy("60s <= 30s*attempts...60s*attempts <= 600s");
+ public String fetch_txn = "1s*attempts";
+ public String fetch_syncpoint = "5s*attempts";
+ public String expire_txn = "5s*attempts";
+ public String expire_syncpoint = "60s*attempts<=300s";
+ public String expire_epoch_wait = "10s";
+ // we don't want to wait ages for durability as it blocks other durability progress; even this might be too long, as we can always retry
+ public String expire_durability = "10s*attempts <= 30s";
+ public String slow_syncpoint_preaccept = "10s";
+ public String slow_txn_preaccept = "30ms <= p50*2 <= 100ms";
+ public String slow_read = "30ms <= p50*2 <= 100ms";
+ public StringRetryStrategy retry_syncpoint = new StringRetryStrategy("10s*attempts <= 600s");
+ public StringRetryStrategy retry_durability = new StringRetryStrategy("10s*attempts <= 600s");
+ public StringRetryStrategy retry_bootstrap = new StringRetryStrategy("10s*attempts <= 600s");
+ public StringRetryStrategy retry_fetch_min_epoch = new StringRetryStrategy("200ms...1s*attempts <= 1s,retries=3");
+ public StringRetryStrategy retry_fetch_topology = new StringRetryStrategy("200ms...1s*attempts <= 1s,retries=100");
+
+ public volatile DurationSpec.IntSecondsBound fast_path_update_delay = null;
+
+ public volatile DurationSpec.IntSecondsBound gc_delay = new DurationSpec.IntSecondsBound("5m");
+ public volatile int shard_durability_target_splits = 128;
+ public volatile DurationSpec.IntSecondsBound durability_txnid_lag = new DurationSpec.IntSecondsBound(5);
+ public volatile DurationSpec.IntSecondsBound shard_durability_cycle = new DurationSpec.IntSecondsBound(15, TimeUnit.MINUTES);
+ public volatile DurationSpec.IntSecondsBound global_durability_cycle = new DurationSpec.IntSecondsBound(10, TimeUnit.MINUTES);
+
+ public enum TransactionalRangeMigration
+ {
+ auto, explicit
+ }
+
+ /**
+ * Defines the behavior of range migration opt-in when changing transactional settings on a table. In auto,
+ * all ranges are marked as migrating and no additional user action is needed aside from running repairs. In
+ * explicit, no ranges are marked as migrating, and the user needs to explicitly mark ranges as migrating to
+ * the target transactional mode via nodetool.
+ */
+ public volatile TransactionalRangeMigration range_migration = TransactionalRangeMigration.auto;
+
+ /**
+ * default transactional mode for tables created by this node when no transactional mode has been specified in the DDL
+ */
+ public TransactionalMode default_transactional_mode = TransactionalMode.off;
+ public boolean ephemeralReadEnabled = true;
+ public boolean state_cache_listener_jfr_enabled = true;
+ public final JournalSpec journal = new JournalSpec();
+
+ public static class JournalSpec implements Params
+ {
+ public int segmentSize = 32 << 20;
+ public FailurePolicy failurePolicy = FailurePolicy.STOP;
+ public FlushMode flushMode = FlushMode.PERIODIC;
+ public volatile DurationSpec flushPeriod; // pulls default from 'commitlog_sync_period'
+ public DurationSpec periodicFlushLagBlock = new DurationSpec.IntMillisecondsBound("1500ms");
+ public DurationSpec.IntMillisecondsBound compactionPeriod = new DurationSpec.IntMillisecondsBound("60000ms");
+ private volatile long flushCombinedBlockPeriod = Long.MIN_VALUE;
+ public Version version = Version.DOWNGRADE_SAFE_VERSION;
+
+ public void setFlushPeriod(DurationSpec newFlushPeriod)
+ {
+ flushPeriod = newFlushPeriod;
+ flushCombinedBlockPeriod = Long.MIN_VALUE;
+ }
+
+ public void setPeriodicFlushLagBlock(DurationSpec newPeriodicFlushLagBlock)
+ {
+ periodicFlushLagBlock = newPeriodicFlushLagBlock;
+ flushCombinedBlockPeriod = Long.MIN_VALUE;
+ }
+
+ @Override
+ public int segmentSize()
+ {
+ return segmentSize;
+ }
+
+ @Override
+ public FailurePolicy failurePolicy()
+ {
+ return failurePolicy;
+ }
+
+ @Override
+ public FlushMode flushMode()
+ {
+ return flushMode;
+ }
+
+ @Override
+ public boolean enableCompaction()
+ {
+ return DatabaseDescriptor.getAccord().enable_journal_compaction;
+ }
+
+ @Override
+ public long compactionPeriod(TimeUnit unit)
+ {
+ return compactionPeriod.to(unit);
+ }
+
+ @JsonIgnore
+ @Override
+ public long flushPeriod(TimeUnit units)
+ {
+ return flushPeriod.to(units);
+ }
+
+ @JsonIgnore
+ @Override
+ public long periodicBlockPeriod(TimeUnit units)
+ {
+ long nanos = flushCombinedBlockPeriod;
+ if (nanos >= 0)
+ return units.convert(nanos, TimeUnit.NANOSECONDS);
+
+ long flushPeriodNanos = flushPeriod(TimeUnit.NANOSECONDS);
+ Invariants.require(flushPeriodNanos > 0);
+ nanos = periodicFlushLagBlock.to(TimeUnit.NANOSECONDS) + flushPeriodNanos;
+ // it is possible for this to race and cache the wrong value after an update
+ flushCombinedBlockPeriod = nanos;
+ return nanos;
+ }
+
+ /**
+ * This is required by the journal, but we don't have multiple versions, so we hide it from serialization to avoid having to maintain it
+ */
+ @JsonIgnore
+ @Override
+ public int userVersion()
+ {
+ return version.version;
+ }
+ }
+}
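The cached flushCombinedBlockPeriod above is simply flushPeriod plus periodicFlushLagBlock, recomputed lazily after either setter invalidates it. A small sketch of the arithmetic, with assumed values:

    AccordSpec.JournalSpec journal = new AccordSpec.JournalSpec();
    journal.setFlushPeriod(new DurationSpec.IntMillisecondsBound("1000ms"));

    // 1000ms flush period + the default 1500ms lag block = 2500ms
    long blockMillis = journal.periodicBlockPeriod(TimeUnit.MILLISECONDS);
    assert blockMillis == 2500;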
diff --git a/src/java/org/apache/cassandra/config/CassandraRelevantProperties.java b/src/java/org/apache/cassandra/config/CassandraRelevantProperties.java
index a0baf8de9009..242af3195b76 100644
--- a/src/java/org/apache/cassandra/config/CassandraRelevantProperties.java
+++ b/src/java/org/apache/cassandra/config/CassandraRelevantProperties.java
@@ -21,14 +21,13 @@
import java.util.HashSet;
import java.util.Set;
import java.util.concurrent.TimeUnit;
-
import javax.annotation.Nullable;
import com.google.common.primitives.Ints;
+import accord.utils.Invariants;
import org.apache.cassandra.db.virtual.LogMessagesTable;
import org.apache.cassandra.exceptions.ConfigurationException;
-import org.apache.cassandra.service.FileSystemOwnershipCheck;
import org.apache.cassandra.utils.FBUtilities;
import org.apache.cassandra.utils.StorageCompatibilityMode;
@@ -37,6 +36,12 @@
/** A class that extracts system properties for the cassandra node it runs within. */
public enum CassandraRelevantProperties
{
+ ACCORD_AGENT_CLASS("cassandra.test.accord.agent"),
+ ACCORD_ALLOW_TEST_MODES("cassandra.test.accord.allow_test_modes", "false"),
+ ACCORD_KEY_PARANOIA_COSTFACTOR(Invariants.KEY_PARANOIA_COSTFACTOR),
+ ACCORD_KEY_PARANOIA_CPU(Invariants.KEY_PARANOIA_CPU),
+ ACCORD_KEY_PARANOIA_MEMORY(Invariants.KEY_PARANOIA_MEMORY),
+ ACCORD_REPAIR_RANGE_STEP_UPDATE_INTERVAL("cassandra.accord.repair.range_step_update_interval", "100"),
ACQUIRE_RETRY_SECONDS("cassandra.acquire_retry_seconds", "60"),
ACQUIRE_SLEEP_MS("cassandra.acquire_sleep_ms", "1000"),
ALLOCATE_TOKENS_FOR_KEYSPACE("cassandra.allocate_tokens_for_keyspace"),
@@ -52,13 +57,14 @@ public enum CassandraRelevantProperties
ALLOW_UNSAFE_TRANSIENT_CHANGES("cassandra.allow_unsafe_transient_changes"),
APPROXIMATE_TIME_PRECISION_MS("cassandra.approximate_time_precision_ms", "2"),
/** 2 ** GENSALT_LOG2_ROUNDS rounds of hashing will be performed. */
- AUTH_BCRYPT_GENSALT_LOG2_ROUNDS("cassandra.auth_bcrypt_gensalt_log2_rounds"),
+ AUTH_BCRYPT_GENSALT_LOG2_ROUNDS("cassandra.auth_bcrypt_gensalt_log2_rounds", "4"),
/** We expect default values on cache retries and interval to be sufficient for everyone but have this escape hatch just in case. */
AUTH_CACHE_WARMING_MAX_RETRIES("cassandra.auth_cache.warming.max_retries"),
AUTH_CACHE_WARMING_RETRY_INTERVAL_MS("cassandra.auth_cache.warming.retry_interval_ms"),
AUTOCOMPACTION_ON_STARTUP_ENABLED("cassandra.autocompaction_on_startup_enabled", "true"),
AUTO_BOOTSTRAP("cassandra.auto_bootstrap"),
AUTO_REPAIR_FREQUENCY_SECONDS("cassandra.auto_repair_frequency_seconds", convertToString(TimeUnit.MINUTES.toSeconds(5))),
+ BATCHLOG_REPLAY_INTERVAL_MS("cassandra.batchlog.replay_interval_ms", "10000"),
BATCHLOG_REPLAY_TIMEOUT_IN_MS("cassandra.batchlog.replay_timeout_in_ms"),
BATCH_COMMIT_LOG_SYNC_INTERVAL("cassandra.batch_commitlog_sync_interval_millis", "1000"),
/**
@@ -207,6 +213,8 @@ public enum CassandraRelevantProperties
*/
DRAIN_EXECUTOR_TIMEOUT_MS("cassandra.drain_executor_timeout_ms", convertToString(TimeUnit.MINUTES.toMillis(5))),
DROP_OVERSIZED_READ_REPAIR_MUTATIONS("cassandra.drop_oversized_readrepair_mutations"),
+ DTEST_ACCORD_ENABLED("jvm_dtest.accord.enabled", "true"),
+ DTEST_ACCORD_JOURNAL_SANITY_CHECK_ENABLED("jvm_dtest.accord.journal_sanity_check_enabled", "false"),
DTEST_API_LOG_TOPOLOGY("cassandra.dtest.api.log.topology"),
/** This property indicates if the code is running under the in-jvm dtest framework */
DTEST_IS_IN_JVM_DTEST("org.apache.cassandra.dtest.is_in_jvm_dtest"),
@@ -231,11 +239,11 @@ public enum CassandraRelevantProperties
/** @deprecated should be removed in favor of flags in relevant startup check (FileSystemOwnershipCheck) */
/** @deprecated See CASSANDRA-17797 */
@Deprecated(since = "4.1")
- FILE_SYSTEM_CHECK_OWNERSHIP_FILENAME("cassandra.fs_ownership_filename", FileSystemOwnershipCheck.DEFAULT_FS_OWNERSHIP_FILENAME),
+ FILE_SYSTEM_CHECK_OWNERSHIP_FILENAME("cassandra.fs_ownership_filename", ".cassandra_fs_ownership"),
/** @deprecated should be removed in favor of flags in relevant startup check (FileSystemOwnershipCheck) */
/** @deprecated See CASSANDRA-17797 */
@Deprecated(since = "4.1")
- FILE_SYSTEM_CHECK_OWNERSHIP_TOKEN(FileSystemOwnershipCheck.FILE_SYSTEM_CHECK_OWNERSHIP_TOKEN),
+ FILE_SYSTEM_CHECK_OWNERSHIP_TOKEN("CassandraOwnershipToken"),
FORCE_DEFAULT_INDEXING_PAGE_SIZE("cassandra.force_default_indexing_page_size"),
/** Used when running in Client mode and the system and schema keyspaces need to be initialized outside of their normal initialization path **/
FORCE_LOAD_LOCAL_KEYSPACES("cassandra.schema.force_load_local_keyspaces"),
@@ -263,6 +271,8 @@ public enum CassandraRelevantProperties
*/
GOSSIP_SETTLE_POLL_SUCCESSES_REQUIRED("cassandra.gossip_settle_poll_success_required", "3"),
+ HINT_DISPATCH_INTERVAL_MS("cassandra.hint_dispatch_interval_ms", "10000"),
+
IGNORED_SCHEMA_CHECK_ENDPOINTS("cassandra.skip_schema_check_for_endpoints"),
IGNORED_SCHEMA_CHECK_VERSIONS("cassandra.skip_schema_check_for_versions"),
IGNORE_CORRUPTED_SCHEMA_TABLES("cassandra.ignore_corrupted_schema_tables"),
@@ -323,6 +333,16 @@ public enum CassandraRelevantProperties
/** Java Virtual Machine implementation name */
JAVA_VM_NAME("java.vm.name"),
JOIN_RING("cassandra.join_ring", "true"),
+
+ /**
+ * {@link StorageCompatibilityMode} sets how the node will behave (which sstable or messaging versions to use, etc.) according to a yaml setting.
+ * But many tests don't load the config, so we need to force it; otherwise they would always run under the default. Config is null for junits
+ * that don't load the config. Get it from the env var that CI/build.xml sets.
+ *
+ * This is a dev/CI only property. Do not use otherwise.
+ */
+ JUNIT_STORAGE_COMPATIBILITY_MODE("cassandra.junit_storage_compatibility_mode", StorageCompatibilityMode.CASSANDRA_4.toString()),
+
/** startup checks properties */
LIBJEMALLOC("cassandra.libjemalloc"),
/** Line separator ("\n" on UNIX). */
@@ -478,6 +498,8 @@ public enum CassandraRelevantProperties
SERIALIZATION_EMPTY_TYPE_NONEMPTY_BEHAVIOR("cassandra.serialization.emptytype.nonempty_behavior"),
SET_SEP_THREAD_NAME("cassandra.set_sep_thread_name", "true"),
SHUTDOWN_ANNOUNCE_DELAY_IN_MS("cassandra.shutdown_announce_in_ms", "2000"),
+ SIMULATOR_SEED("cassandra.simulator.seed"),
+ SIMULATOR_STARTED("cassandra.simulator.started"),
SIZE_RECORDER_INTERVAL("cassandra.size_recorder_interval", "300"),
SKIP_AUTH_SETUP("cassandra.skip_auth_setup", "false"),
SKIP_GC_INSPECTOR("cassandra.skip_gc_inspector", "false"),
@@ -540,9 +562,9 @@ public enum CassandraRelevantProperties
TCM_UNSAFE_BOOT_WITH_CLUSTERMETADATA("cassandra.unsafe_boot_with_clustermetadata", null),
TCM_USE_ATOMIC_LONG_PROCESSOR("cassandra.test.use_atomic_long_processor", "false"),
TCM_USE_NO_OP_REPLICATOR("cassandra.test.use_no_op_replicator", "false"),
-
+ TEST_ACCORD_STORE_THREAD_CHECKS_ENABLED("cassandra.test.accord.store.thread_checks_enabled", "true"),
TEST_BBFAILHELPER_ENABLED("test.bbfailhelper.enabled"),
- TEST_BLOB_SHARED_SEED("cassandra.test.blob.shared.seed"),
+ TEST_BLOB_SHARED_SEED("cassandra.test.blob.shared.seed", "42"),
TEST_BYTEMAN_TRANSFORMATIONS_DEBUG("cassandra.test.byteman.transformations.debug"),
TEST_CASSANDRA_KEEPBRIEFBRIEF("cassandra.keepBriefBrief"),
TEST_CASSANDRA_RELEVANT_PROPERTIES("org.apache.cassandra.conf.CassandraRelevantPropertiesTest"),
@@ -553,6 +575,7 @@ public enum CassandraRelevantProperties
TEST_COMPRESSION("cassandra.test.compression"),
TEST_COMPRESSION_ALGO("cassandra.test.compression.algo", "lz4"),
TEST_DEBUG_REF_COUNT("cassandra.debugrefcount"),
+ TEST_DEBUG_REF_EVENTS("cassandra.debug.refevents"),
TEST_DRIVER_CONNECTION_TIMEOUT_MS("cassandra.test.driver.connection_timeout_ms", "5000"),
TEST_DRIVER_READ_TIMEOUT_MS("cassandra.test.driver.read_timeout_ms", "12000"),
TEST_ENCRYPTION("cassandra.test.encryption", "false"),
@@ -564,6 +587,8 @@ public enum CassandraRelevantProperties
* can be also done manually for that particular case: {@code flush(SchemaConstants.SCHEMA_KEYSPACE_NAME);}. */
TEST_FLUSH_LOCAL_SCHEMA_CHANGES("cassandra.test.flush_local_schema_changes", "true"),
TEST_HARRY_SWITCH_AFTER("cassandra.test.harry.progression.switch-after", "1"),
+ TEST_HISTORY_VALIDATOR_LOGGING_ENABLED("cassandra.test.history_validator.logging.enabled", "false"),
+ TEST_IGNORE_SIGAR("cassandra.test.ignore_sigar"),
TEST_INVALID_LEGACY_SSTABLE_ROOT("invalid-legacy-sstable-root"),
TEST_JVM_DTEST_DISABLE_SSL("cassandra.test.disable_ssl"),
TEST_JVM_SHUTDOWN_MESSAGING_GRACEFULLY("cassandra.test.messagingService.gracefulShutdown", "false"),
@@ -571,9 +596,11 @@ public enum CassandraRelevantProperties
TEST_ORG_CAFFINITAS_OHC_SEGMENTCOUNT("org.caffinitas.ohc.segmentCount"),
TEST_PRESERVE_THREAD_CREATION_STACKTRACE("cassandra.test.preserve_thread_creation_stacktrace", "false"),
TEST_RANDOM_SEED("cassandra.test.random.seed"),
+ TEST_RANGE_EXPENSIVE_CHECKS("cassandra.test.range_expensive_checks"),
TEST_READ_ITERATION_DELAY_MS("cassandra.test.read_iteration_delay_ms", "0"),
TEST_REUSE_PREPARED("cassandra.test.reuse_prepared", "true"),
TEST_ROW_CACHE_SIZE("cassandra.test.row_cache_size"),
+ TEST_SEED("cassandra.test.seed"),
TEST_SERIALIZATION_WRITES("cassandra.test-serialization-writes"),
TEST_SIMULATOR_DEBUG("cassandra.test.simulator.debug"),
TEST_SIMULATOR_DETERMINISM_CHECK("cassandra.test.simulator.determinismcheck", "none"),
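
A minimal sketch of how a test might drive the properties added above, assuming the enum's usual getString()/setString() accessors (RelevantPropertyExample is a made-up harness class):

import org.apache.cassandra.config.CassandraRelevantProperties;

public class RelevantPropertyExample
{
    public static void main(String[] args)
    {
        // TEST_BLOB_SHARED_SEED now defaults to "42" when the system property is unset.
        String seed = CassandraRelevantProperties.TEST_BLOB_SHARED_SEED.getString();

        // Dev/CI-only override, per the JUNIT_STORAGE_COMPATIBILITY_MODE javadoc above.
        CassandraRelevantProperties.JUNIT_STORAGE_COMPATIBILITY_MODE.setString("CASSANDRA_4");

        System.out.println(seed + " / " + CassandraRelevantProperties.JUNIT_STORAGE_COMPATIBILITY_MODE.getString());
    }
}
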
diff --git a/src/java/org/apache/cassandra/config/Config.java b/src/java/org/apache/cassandra/config/Config.java
index 361eb3bffe4e..0e7189a38589 100644
--- a/src/java/org/apache/cassandra/config/Config.java
+++ b/src/java/org/apache/cassandra/config/Config.java
@@ -46,6 +46,7 @@
import org.apache.cassandra.utils.StorageCompatibilityMode;
import static org.apache.cassandra.config.CassandraRelevantProperties.AUTOCOMPACTION_ON_STARTUP_ENABLED;
+import static org.apache.cassandra.config.CassandraRelevantProperties.CASSANDRA_AVAILABLE_PROCESSORS;
import static org.apache.cassandra.config.CassandraRelevantProperties.FILE_CACHE_ENABLED;
import static org.apache.cassandra.config.CassandraRelevantProperties.SKIP_PAXOS_REPAIR_ON_TOPOLOGY_CHANGE;
import static org.apache.cassandra.config.CassandraRelevantProperties.SKIP_PAXOS_REPAIR_ON_TOPOLOGY_CHANGE_KEYSPACES;
@@ -150,12 +151,16 @@ public static Set<String> splitCommaDelimited(String src)
@Replaces(oldName = "write_request_timeout_in_ms", converter = Converters.MILLIS_DURATION_LONG, deprecated = true)
public volatile DurationSpec.LongMillisecondsBound write_request_timeout = new DurationSpec.LongMillisecondsBound("2000ms");
+ public volatile DurationSpec.LongMillisecondsBound short_rpc_timeout = new DurationSpec.LongMillisecondsBound("1000ms");
+
@Replaces(oldName = "counter_write_request_timeout_in_ms", converter = Converters.MILLIS_DURATION_LONG, deprecated = true)
public volatile DurationSpec.LongMillisecondsBound counter_write_request_timeout = new DurationSpec.LongMillisecondsBound("5000ms");
@Replaces(oldName = "cas_contention_timeout_in_ms", converter = Converters.MILLIS_DURATION_LONG, deprecated = true)
public volatile DurationSpec.LongMillisecondsBound cas_contention_timeout = new DurationSpec.LongMillisecondsBound("1800ms");
+ public volatile DurationSpec.LongMillisecondsBound accord_preaccept_timeout = new DurationSpec.LongMillisecondsBound("1s");
+
@Replaces(oldName = "truncate_request_timeout_in_ms", converter = Converters.MILLIS_DURATION_LONG, deprecated = true)
public volatile DurationSpec.LongMillisecondsBound truncate_request_timeout = new DurationSpec.LongMillisecondsBound("60000ms");
@@ -176,7 +181,12 @@ public static Set<String> splitCommaDelimited(String src)
public volatile DurationSpec.LongMillisecondsBound cms_await_timeout = new DurationSpec.LongMillisecondsBound("120000ms");
public volatile int cms_default_max_retries = 10;
- public volatile DurationSpec.IntMillisecondsBound cms_default_retry_backoff = new DurationSpec.IntMillisecondsBound("50ms");
+ @Deprecated(since="5.1")
+ public volatile DurationSpec.IntMillisecondsBound cms_default_retry_backoff = null;
+ @Deprecated(since="5.1")
+ public volatile DurationSpec.IntMillisecondsBound cms_default_max_retry_backoff = null;
+ public String cms_retry_delay = "0 <= 50ms*1*attempts <= 10s,retries=10";
+
/**
* How often we should snapshot the cluster metadata.
*/
@@ -187,9 +197,10 @@ public static Set<String> splitCommaDelimited(String src)
public int concurrent_reads = 32;
public int concurrent_writes = 32;
+ public int concurrent_accord_operations = 32;
public int concurrent_counter_writes = 32;
public int concurrent_materialized_view_writes = 32;
- public int available_processors = -1;
+ public OptionaldPositiveInt available_processors = new OptionaldPositiveInt(CASSANDRA_AVAILABLE_PROCESSORS.getInt(OptionaldPositiveInt.UNDEFINED_VALUE));
public int memtable_flush_writers = 0;
@Replaces(oldName = "memtable_heap_space_in_mb", converter = Converters.MEBIBYTES_DATA_STORAGE_INT, deprecated = true)
@@ -390,6 +401,7 @@ public static class SSTableConfig
@Replaces(oldName = "commitlog_total_space_in_mb", converter = Converters.MEBIBYTES_DATA_STORAGE_INT, deprecated = true)
public DataStorageSpec.IntMebibytesBound commitlog_total_space;
public CommitLogSync commitlog_sync;
+
@Replaces(oldName = "commitlog_sync_group_window_in_ms", converter = Converters.MILLIS_DURATION_DOUBLE, deprecated = true)
public DurationSpec.IntMillisecondsBound commitlog_sync_group_window = new DurationSpec.IntMillisecondsBound("0ms");
@Replaces(oldName = "commitlog_sync_period_in_ms", converter = Converters.MILLIS_DURATION_INT, deprecated = true)
@@ -482,6 +494,8 @@ public static class SSTableConfig
public DataStorageSpec.LongMebibytesBound paxos_cache_size = null;
+ public DataStorageSpec.LongMebibytesBound consensus_migration_cache_size = null;
+
@Replaces(oldName = "cache_load_timeout_seconds", converter = Converters.NEGATIVE_SECONDS_DURATION, deprecated = true)
public DurationSpec.IntSecondsBound cache_load_timeout = new DurationSpec.IntSecondsBound("30s");
@@ -1128,6 +1142,7 @@ public enum PaxosOnLinearizabilityViolation
public volatile boolean client_request_size_metrics_enabled = true;
+
public volatile int max_top_size_partition_count = 10;
public volatile int max_top_tombstone_partition_count = 10;
public volatile DataStorageSpec.LongBytesBound min_tracked_partition_size = new DataStorageSpec.LongBytesBound("1MiB");
@@ -1141,6 +1156,8 @@ public enum PaxosOnLinearizabilityViolation
*/
public ParameterizedClass default_compaction = null;
+ public final AccordSpec accord = new AccordSpec();
+
public static Supplier<Config> getOverrideLoadConfig()
{
return overrideLoadConfig;
@@ -1277,10 +1294,10 @@ public static void log(Config config)
String value;
try
{
- // Field.get() can throw NPE if the value of the field is null
- value = field.get(config).toString();
+ Object obj = field.get(config);
+ value = obj != null ? obj.toString() : "null";
}
- catch (NullPointerException | IllegalAccessException npe)
+ catch (IllegalAccessException npe)
{
value = "null";
}
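
The log() fix above replaces exception-driven control flow (Field.get() returns null for a null field value; it was the subsequent toString() that threw) with an explicit null check. A self-contained sketch of the resulting pattern (Sample and NullSafeFieldDump are made up):

import java.lang.reflect.Field;

public class NullSafeFieldDump
{
    public static final class Sample
    {
        public String set = "value";
        public String unset = null; // obj.toString() here would have thrown the old NPE
    }

    public static void main(String[] args) throws IllegalAccessException
    {
        Sample sample = new Sample();
        for (Field field : Sample.class.getFields())
        {
            Object obj = field.get(sample); // Field.get() itself returns null, it doesn't throw
            String value = obj != null ? obj.toString() : "null";
            System.out.println(field.getName() + " = " + value);
        }
    }
}
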
diff --git a/src/java/org/apache/cassandra/config/DatabaseDescriptor.java b/src/java/org/apache/cassandra/config/DatabaseDescriptor.java
index 66e0fd0532b6..aa4c25af0017 100644
--- a/src/java/org/apache/cassandra/config/DatabaseDescriptor.java
+++ b/src/java/org/apache/cassandra/config/DatabaseDescriptor.java
@@ -62,6 +62,8 @@
import com.google.common.primitives.Ints;
import com.google.common.primitives.Longs;
import com.google.common.util.concurrent.RateLimiter;
+
+import org.apache.cassandra.service.accord.api.AccordWaitStrategies;
import org.apache.cassandra.utils.Pair;
import org.apache.commons.lang3.ArrayUtils;
import org.apache.commons.lang3.StringUtils;
@@ -82,6 +84,7 @@
import org.apache.cassandra.config.Config.DiskAccessMode;
import org.apache.cassandra.config.Config.PaxosOnLinearizabilityViolation;
import org.apache.cassandra.config.Config.PaxosStatePurging;
+import org.apache.cassandra.config.DurationSpec.IntMillisecondsBound;
import org.apache.cassandra.db.ConsistencyLevel;
import org.apache.cassandra.db.commitlog.AbstractCommitLogSegmentManager;
import org.apache.cassandra.db.commitlog.CommitLog;
@@ -113,6 +116,7 @@
import org.apache.cassandra.security.SSLFactory;
import org.apache.cassandra.service.CacheService.CacheType;
import org.apache.cassandra.service.StorageService;
+import org.apache.cassandra.service.consensus.TransactionalMode;
import org.apache.cassandra.service.paxos.Paxos;
import org.apache.cassandra.utils.FBUtilities;
import org.apache.cassandra.utils.MBeanWrapper;
@@ -155,6 +159,7 @@
import static org.apache.cassandra.db.ConsistencyLevel.QUORUM;
import static org.apache.cassandra.io.util.FileUtils.ONE_GIB;
import static org.apache.cassandra.io.util.FileUtils.ONE_MIB;
+import static org.apache.cassandra.journal.Params.FlushMode.PERIODIC;
import static org.apache.cassandra.utils.Clock.Global.logInitializationOutcome;
public class DatabaseDescriptor
@@ -213,6 +218,9 @@ public class DatabaseDescriptor
private static long keyCacheSizeInMiB;
private static long paxosCacheSizeInMiB;
+ private static long accordCacheSizeInMiB;
+ private static long accordWorkingSetSizeInMiB;
+ private static long consensusMigrationCacheSizeInMiB;
private static long counterCacheSizeInMiB;
private static long indexSummaryCapacityInMiB;
@@ -369,6 +377,14 @@ public static void clientInitialization(boolean failIfDaemonOrTool)
clientInitialization(failIfDaemonOrTool, Config::new);
}
+ // For simulator tests
+ public static void clientWithDaemonConfig()
+ {
+ clientInitialization(true, DatabaseDescriptor::loadConfig);
+ applyAll();
+ AuthConfig.applyAuth();
+ }
+
/**
* Initializes this class as a client, which means that just an empty configuration will
* be used.
@@ -583,6 +599,16 @@ else if (conf.commitlog_sync_period.toMilliseconds() != 0)
logger.debug("Syncing log with a period of {}", conf.commitlog_sync_period.toString());
}
+ if (conf.accord.journal.flushPeriod == null)
+ {
+ conf.accord.journal.flushPeriod = conf.commitlog_sync_period;
+ if (conf.accord.journal.flushMode == PERIODIC && conf.commitlog_sync_period.toMilliseconds() == 0)
+ {
+ logger.warn("Accord journal is configured in periodic mode, while Cassandra commit log is configured in {} mode", conf.commitlog_sync);
+ conf.accord.journal.flushPeriod = conf.accord.journal.periodicFlushLagBlock;
+ }
+ }
+
/* evaluate the DiskAccessMode Config directive, which also affects indexAccessMode selection */
if (conf.disk_access_mode == DiskAccessMode.auto || conf.disk_access_mode == DiskAccessMode.mmap_index_only)
{
@@ -624,6 +650,9 @@ else if (conf.disk_access_mode == DiskAccessMode.direct)
if (conf.concurrent_counter_writes < 2)
throw new ConfigurationException("concurrent_counter_writes must be at least 2, but was " + conf.concurrent_counter_writes, false);
+ if (conf.concurrent_accord_operations < 1)
+ throw new ConfigurationException("concurrent_accord_operations must be at least 1, but was " + conf.concurrent_accord_operations, false);
+
if (conf.networking_cache_size == null)
conf.networking_cache_size = new DataStorageSpec.IntMebibytesBound(Math.min(128, (int) (Runtime.getRuntime().maxMemory() / (16 * 1048576))));
@@ -695,6 +724,11 @@ else if (conf.repair_session_space.toMebibytes() > (int) (Runtime.getRuntime().m
if (commitLogWriteDiskAccessMode != conf.commitlog_disk_access_mode)
logger.info("commitlog_disk_access_mode resolved to: {}", commitLogWriteDiskAccessMode);
+ if (conf.accord.journal_directory == null)
+ {
+ conf.accord.journal_directory = storagedirFor("accord_journal");
+ }
+
if (conf.hints_directory == null)
{
conf.hints_directory = storagedirFor("hints");
@@ -770,6 +804,8 @@ else if (conf.repair_session_space.toMebibytes() > (int) (Runtime.getRuntime().m
throw new ConfigurationException("local_system_data_file_directory must not be the same as any data_file_directories", false);
if (datadir.equals(conf.commitlog_directory))
throw new ConfigurationException("commitlog_directory must not be the same as any data_file_directories", false);
+ if (datadir.equals(conf.accord.journal_directory))
+ throw new ConfigurationException("accord.journal_directory must not be the same as any data_file_directories", false);
if (datadir.equals(conf.hints_directory))
throw new ConfigurationException("hints_directory must not be the same as any data_file_directories", false);
if (datadir.equals(conf.saved_caches_directory))
@@ -785,6 +821,8 @@ else if (conf.repair_session_space.toMebibytes() > (int) (Runtime.getRuntime().m
{
if (conf.local_system_data_file_directory.equals(conf.commitlog_directory))
throw new ConfigurationException("local_system_data_file_directory must not be the same as the commitlog_directory", false);
+ if (conf.local_system_data_file_directory.equals(conf.accord.journal_directory))
+ throw new ConfigurationException("local_system_data_file_directory must not be the same as the accord.journal_directory", false);
if (conf.local_system_data_file_directory.equals(conf.saved_caches_directory))
throw new ConfigurationException("local_system_data_file_directory must not be the same as the saved_caches_directory", false);
if (conf.local_system_data_file_directory.equals(conf.hints_directory))
@@ -797,10 +835,18 @@ else if (conf.repair_session_space.toMebibytes() > (int) (Runtime.getRuntime().m
FBUtilities.prettyPrintMemory(freeBytes));
}
- if (conf.commitlog_directory.equals(conf.saved_caches_directory))
- throw new ConfigurationException("saved_caches_directory must not be the same as the commitlog_directory", false);
+ if (conf.commitlog_directory.equals(conf.accord.journal_directory))
+ throw new ConfigurationException("accord.journal_directory must not be the same as the commitlog_directory", false);
if (conf.commitlog_directory.equals(conf.hints_directory))
throw new ConfigurationException("hints_directory must not be the same as the commitlog_directory", false);
+ if (conf.commitlog_directory.equals(conf.saved_caches_directory))
+ throw new ConfigurationException("saved_caches_directory must not be the same as the commitlog_directory", false);
+
+ if (conf.accord.journal_directory.equals(conf.hints_directory))
+ throw new ConfigurationException("hints_directory must not be the same as the accord.journal_directory", false);
+ if (conf.accord.journal_directory.equals(conf.saved_caches_directory))
+ throw new ConfigurationException("saved_caches_directory must not be the same as the accord.journal_directory", false);
+
if (conf.hints_directory.equals(conf.saved_caches_directory))
throw new ConfigurationException("saved_caches_directory must not be the same as the hints_directory", false);
@@ -914,6 +960,52 @@ else if (conf.repair_session_space.toMebibytes() > (int) (Runtime.getRuntime().m
+ conf.paxos_cache_size + "', supported values are >= 0.", false);
}
+ try
+ {
+ // if accord.cache_size is unset ("auto"), the cache size defaults to max(10% of heap in MiB, 1 MiB)
+ accordCacheSizeInMiB = (conf.accord.cache_size == null)
+ ? Math.max(1, (int) ((Runtime.getRuntime().totalMemory() * 0.10) / 1024 / 1024))
+ : conf.accord.cache_size.toMebibytes();
+
+ if (accordCacheSizeInMiB < 0)
+ throw new NumberFormatException(); // to escape duplicating error message
+ }
+ catch (NumberFormatException e)
+ {
+ throw new ConfigurationException("accord.cache_size option was set incorrectly to '"
+ + conf.accord.cache_size + "', supported values are >= 0.", false);
+ }
+
+ try
+ {
+ // if accord.working_set_size is unset ("auto"), the working set defaults to max(5% of heap in MiB, 1 MiB);
+ // if negative, there is no limit
+ accordWorkingSetSizeInMiB = (conf.accord.working_set_size == null)
+ ? Math.max(1, (int) ((Runtime.getRuntime().totalMemory() * 0.05) / 1024 / 1024))
+ : conf.accord.working_set_size.toMebibytes();
+ }
+ catch (NumberFormatException e)
+ {
+ throw new ConfigurationException("accord.working_set_size option was set incorrectly to '"
+ + conf.accord.working_set_size + "', supported values are >= 0.", false);
+ }
+
+ try
+ {
+ // if consensus_migration_cache_size is unset ("auto"), the cache defaults to 1% of heap in MiB, clamped to [1 MiB, 50 MiB]
+ consensusMigrationCacheSizeInMiB = (conf.consensus_migration_cache_size == null)
+ ? Math.min(Math.max(1, (int) (Runtime.getRuntime().totalMemory() * 0.01 / 1024 / 1024)), 50)
+ : conf.consensus_migration_cache_size.toMebibytes();
+
+ if (consensusMigrationCacheSizeInMiB < 0)
+ throw new NumberFormatException(); // to escape duplicating error message
+ }
+ catch (NumberFormatException e)
+ {
+ throw new ConfigurationException("consensus_migration_cache_size option was set incorrectly to '"
+ + conf.consensus_migration_cache_size + "', supported values are >= 0.", false);
+ }
+
// we need this assignment for the Settings virtual table - CASSANDRA-17735
conf.counter_cache_size = new DataStorageSpec.LongMebibytesBound(counterCacheSizeInMiB);
@@ -1412,6 +1504,12 @@ static void checkForLowestAcceptedTimeouts(Config conf)
logInfo("truncate_request_timeout", conf.truncate_request_timeout, LOWEST_ACCEPTED_TIMEOUT);
conf.truncate_request_timeout = LOWEST_ACCEPTED_TIMEOUT;
}
+
+ if (conf.accord_preaccept_timeout.toMilliseconds() < LOWEST_ACCEPTED_TIMEOUT.toMilliseconds())
+ {
+ logInfo("accord_preaccept_timeout", conf.accord_preaccept_timeout, LOWEST_ACCEPTED_TIMEOUT);
+ conf.accord_preaccept_timeout = LOWEST_ACCEPTED_TIMEOUT;
+ }
}
private static void logInfo(String property, DurationSpec.LongMillisecondsBound actualValue, DurationSpec.LongMillisecondsBound lowestAcceptedValue)
@@ -1995,6 +2093,10 @@ public static void createAllDirectories()
throw new ConfigurationException("commitlog_directory must be specified", false);
FileUtils.createDirectory(conf.commitlog_directory);
+ if (conf.accord.journal_directory == null)
+ throw new ConfigurationException("accord.journal_directory must be specified", false);
+ FileUtils.createDirectory(conf.accord.journal_directory);
+
if (conf.hints_directory == null)
throw new ConfigurationException("hints_directory must be specified", false);
FileUtils.createDirectory(conf.hints_directory);
@@ -2264,6 +2366,11 @@ public static long getWriteRpcTimeout(TimeUnit unit)
return conf.write_request_timeout.to(unit);
}
+ public static long getShortRpcTimeout(TimeUnit unit)
+ {
+ return conf.short_rpc_timeout.to(unit);
+ }
+
public static void setWriteRpcTimeout(long timeOutInMillis)
{
conf.write_request_timeout = new DurationSpec.LongMillisecondsBound(timeOutInMillis);
@@ -2502,6 +2609,20 @@ public static void setConcurrentViewWriters(int concurrent_materialized_view_wri
conf.concurrent_materialized_view_writes = concurrent_materialized_view_writes;
}
+ public static int getAccordConcurrentOps()
+ {
+ return conf.concurrent_accord_operations;
+ }
+
+ public static void setConcurrentAccordOps(int concurrent_operations)
+ {
+ if (concurrent_operations < 0)
+ {
+ throw new IllegalArgumentException("Concurrent accord operations must be non-negative");
+ }
+ conf.concurrent_accord_operations = concurrent_operations;
+ }
+
public static int getFlushWriters()
{
return conf.memtable_flush_writers;
@@ -2509,7 +2630,13 @@ public static int getFlushWriters()
public static int getAvailableProcessors()
{
- return conf == null ? -1 : conf.available_processors;
+ OptionaldPositiveInt ap = conf == null ? OptionaldPositiveInt.UNDEFINED : conf.available_processors;
+ return ap.or(Runtime.getRuntime()::availableProcessors);
+ }
+
+ public static void setAvailableProcessors(int value)
+ {
+ conf.available_processors = new OptionaldPositiveInt(value);
}
public static int getConcurrentCompactors()
@@ -2829,6 +2956,16 @@ public static void setCommitLogCompression(ParameterizedClass compressor)
conf.commitlog_compression = compressor;
}
+ public static String getAccordJournalDirectory()
+ {
+ return conf.accord.journal_directory;
+ }
+
+ public static void setAccordJournalDirectory(String path)
+ {
+ conf.accord.journal_directory = path;
+ }
+
public static Config.FlushCompression getFlushCompression()
{
return conf.flush_compression;
@@ -3360,6 +3497,11 @@ public static boolean paxoTopologyRepairStrictEachQuorum()
return conf.paxos_topology_repair_strict_each_quorum;
}
+ public static TransactionalMode defaultTransactionalMode()
+ {
+ return conf.accord.default_transactional_mode;
+ }
+
public static void setNativeTransportMaxRequestDataInFlightPerIpInBytes(long maxRequestDataInFlightInBytes)
{
if (maxRequestDataInFlightInBytes == -1)
@@ -3680,6 +3822,11 @@ public static int getHintsFlushPeriodInMS()
return conf.hints_flush_period.toMilliseconds();
}
+ public static void setHintsFlushPeriodInMS(int milliseconds)
+ {
+ conf.hints_flush_period = new IntMillisecondsBound(milliseconds);
+ }
+
public static long getMaxHintsFileSize()
{
return conf.max_hints_file_size.toBytesInLong();
@@ -3898,6 +4045,21 @@ public static long getPaxosCacheSizeInMiB()
return paxosCacheSizeInMiB;
}
+ public static long getAccordCacheSizeInMiB()
+ {
+ return accordCacheSizeInMiB;
+ }
+
+ public static long getAccordWorkingSetSizeInMiB()
+ {
+ return accordWorkingSetSizeInMiB;
+ }
+
+ public static long getConsensusMigrationCacheSizeInMiB()
+ {
+ return consensusMigrationCacheSizeInMiB;
+ }
+
public static long getCounterCacheSizeInMiB()
{
return counterCacheSizeInMiB;
@@ -4916,6 +5078,145 @@ public static void setUseStatementsEnabled(boolean enabled)
}
}
+
+ public static AccordSpec getAccord()
+ {
+ return conf.accord;
+ }
+
+ public static AccordSpec.TransactionalRangeMigration getTransactionalRangeMigration()
+ {
+ return conf.accord.range_migration;
+ }
+
+ public static void setTransactionalRangeMigration(AccordSpec.TransactionalRangeMigration val)
+ {
+ conf.accord.range_migration = Preconditions.checkNotNull(val);
+ }
+
+ public static long getAccordRangeSyncPointTimeoutNanos()
+ {
+ return conf.accord.range_syncpoint_timeout.to(TimeUnit.NANOSECONDS);
+ }
+
+ public static long getAccordRepairTimeoutNanos()
+ {
+ return conf.accord.repair_timeout.to(TimeUnit.NANOSECONDS);
+ }
+
+ public static boolean getAccordTransactionsEnabled()
+ {
+ return conf.accord.enabled;
+ }
+
+ public static void setAccordTransactionsEnabled(boolean b)
+ {
+ conf.accord.enabled = b;
+ }
+
+ public static AccordSpec.QueueShardModel getAccordQueueShardModel()
+ {
+ return conf.accord.queue_shard_model;
+ }
+
+ public static AccordSpec.QueueSubmissionModel getAccordQueueSubmissionModel()
+ {
+ return conf.accord.queue_submission_model;
+ }
+
+ public static int getAccordQueueShardCount()
+ {
+ switch (getAccordQueueShardModel())
+ {
+ default: throw new AssertionError("Unhandled queue_shard_model: " + conf.accord.queue_shard_model);
+ case THREAD_PER_SHARD:
+ case THREAD_PER_SHARD_SYNC_QUEUE:
+ return conf.accord.queue_shard_count.or(DatabaseDescriptor::getAvailableProcessors);
+ case THREAD_POOL_PER_SHARD:
+ case THREAD_POOL_PER_SHARD_EXCLUDES_IO:
+ int defaultMax = getAccordQueueSubmissionModel() == AccordSpec.QueueSubmissionModel.SYNC ? 8 : 4;
+ return conf.accord.queue_shard_count.or(Math.min(defaultMax, DatabaseDescriptor.getAvailableProcessors()));
+ }
+ }
+
+ public static int getAccordCommandStoreShardCount()
+ {
+ return conf.accord.command_store_shard_count.or(DatabaseDescriptor::getAvailableProcessors);
+ }
+
+ public static int getAccordMaxQueuedLoadCount()
+ {
+ return conf.accord.max_queued_loads.or(getAccordConcurrentOps());
+ }
+
+ public static int getAccordMaxQueuedRangeLoadCount()
+ {
+ return conf.accord.max_queued_range_loads.or(Math.max(4, getAccordConcurrentOps() / 4));
+ }
+
+ public static boolean getAccordCacheShrinkingOn()
+ {
+ return conf.accord.shrink_cache_entries_before_eviction;
+ }
+
+ public static String getAccordRecoverTxnDelay()
+ {
+ return conf.accord.recover_txn;
+ }
+
+ public static void setAccordRecoverTxnDelay(String recoverTxnDelay)
+ {
+ AccordWaitStrategies.setRecoverTxn(recoverTxnDelay);
+ conf.accord.recover_txn = recoverTxnDelay;
+ }
+
+ public static String getAccordExpireTxnDelay()
+ {
+ return conf.accord.expire_txn;
+ }
+
+ public static void setAccordExpireTxnDelay(String expireTxnDelay)
+ {
+ AccordWaitStrategies.setExpireTxn(expireTxnDelay);
+ conf.accord.expire_txn = expireTxnDelay;
+ }
+
+ public static long getAccordFastPathUpdateDelayMillis()
+ {
+ DurationSpec.IntSecondsBound bound = conf.accord.fast_path_update_delay;
+ return bound == null ? -1 : bound.to(TimeUnit.MILLISECONDS);
+ }
+
+ public static long getAccordGCDelay(TimeUnit unit)
+ {
+ return conf.accord.gc_delay.to(unit);
+ }
+
+ public static int getAccordShardDurabilityTargetSplits()
+ {
+ return conf.accord.shard_durability_target_splits;
+ }
+
+ public static long getAccordScheduleDurabilityTxnIdLag(TimeUnit unit)
+ {
+ return conf.accord.durability_txnid_lag.to(unit);
+ }
+
+ public static long getAccordGlobalDurabilityCycle(TimeUnit unit)
+ {
+ return conf.accord.global_durability_cycle.to(unit);
+ }
+
+ public static long getAccordShardDurabilityCycle(TimeUnit unit)
+ {
+ return conf.accord.shard_durability_cycle.to(unit);
+ }
+
+ public static boolean getAccordStateCacheListenerJFREnabled()
+ {
+ return conf.accord.state_cache_listener_jfr_enabled;
+ }
+
public static boolean getForceNewPreparedStatementBehaviour()
{
return conf.force_new_prepared_statement_behaviour;
@@ -5228,11 +5529,21 @@ public static void setCmsDefaultRetryMaxTries(int value)
conf.cms_default_max_retries = value;
}
- public static DurationSpec getDefaultRetryBackoff()
+ public static DurationSpec.IntMillisecondsBound getDefaultRetryBackoff()
{
return conf.cms_default_retry_backoff;
}
+ public static DurationSpec.IntMillisecondsBound getDefaultMaxRetryBackoff()
+ {
+ return conf.cms_default_max_retry_backoff;
+ }
+
+ public static String getCMSRetryDelay()
+ {
+ return conf.cms_retry_delay;
+ }
+
public static DurationSpec getCmsAwaitTimeout()
{
return conf.cms_await_timeout;
@@ -5319,4 +5630,9 @@ public static boolean isPasswordValidatorReconfigurationEnabled()
{
return conf.password_validator_reconfiguration_enabled;
}
+
+ public static boolean getAccordEphemeralReadEnabledEnabled()
+ {
+ return conf.accord.ephemeralReadEnabled;
+ }
}
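
The three try/catch blocks above derive "auto" defaults from the heap size; a standalone model of just that arithmetic (AutoCacheSizing and its method names are illustrative, not the DatabaseDescriptor API):

import static java.lang.Math.max;
import static java.lang.Math.min;

public class AutoCacheSizing
{
    static long accordCacheMiB(Long configuredMiB, long heapBytes)
    {
        // max(10% of heap in MiB, 1 MiB) when unset
        return configuredMiB == null ? max(1, (long) (heapBytes * 0.10) / 1024 / 1024) : configuredMiB;
    }

    static long consensusMigrationCacheMiB(Long configuredMiB, long heapBytes)
    {
        // 1% of heap in MiB, clamped to [1 MiB, 50 MiB] when unset
        return configuredMiB == null ? min(max(1, (long) (heapBytes * 0.01) / 1024 / 1024), 50) : configuredMiB;
    }

    public static void main(String[] args)
    {
        long heap = 8L * 1024 * 1024 * 1024; // pretend 8 GiB heap
        System.out.println(accordCacheMiB(null, heap));             // 819
        System.out.println(consensusMigrationCacheMiB(null, heap)); // 50 (1% = 81 MiB, capped)
    }
}
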
diff --git a/src/java/org/apache/cassandra/config/DurationSpec.java b/src/java/org/apache/cassandra/config/DurationSpec.java
index 2522d86124f5..2f218f70e066 100644
--- a/src/java/org/apache/cassandra/config/DurationSpec.java
+++ b/src/java/org/apache/cassandra/config/DurationSpec.java
@@ -17,6 +17,7 @@
*/
package org.apache.cassandra.config;
+import java.time.Duration;
import java.util.Arrays;
import java.util.Objects;
import java.util.concurrent.TimeUnit;
@@ -135,6 +136,11 @@ public TimeUnit unit()
return unit;
}
+ public Duration toDuration()
+ {
+ return Duration.of(quantity(), unit().toChronoUnit());
+ }
+
/**
* @param symbol the time unit symbol
* @return the time unit associated to the specified symbol
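
The new toDuration() above is a one-line bridge to java.time built on TimeUnit.toChronoUnit() (Java 9+); a minimal demonstration of the same conversion:

import java.time.Duration;
import java.util.concurrent.TimeUnit;

public class ToDurationSketch
{
    public static void main(String[] args)
    {
        long quantity = 1500;
        TimeUnit unit = TimeUnit.MILLISECONDS;
        Duration d = Duration.of(quantity, unit.toChronoUnit()); // what toDuration() does
        System.out.println(d); // PT1.5S
    }
}
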
diff --git a/src/java/org/apache/cassandra/config/OptionaldPositiveInt.java b/src/java/org/apache/cassandra/config/OptionaldPositiveInt.java
new file mode 100644
index 000000000000..ea33b7af98f6
--- /dev/null
+++ b/src/java/org/apache/cassandra/config/OptionaldPositiveInt.java
@@ -0,0 +1,73 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.config;
+
+import java.util.Objects;
+import java.util.function.IntSupplier;
+
+public class OptionaldPositiveInt
+{
+ public static final int UNDEFINED_VALUE = -1;
+ public static final OptionaldPositiveInt UNDEFINED = new OptionaldPositiveInt(UNDEFINED_VALUE);
+
+ private final int value;
+
+ public OptionaldPositiveInt(int value)
+ {
+ if (!(value == -1 || value >= 1))
+ throw new IllegalArgumentException(String.format("Only -1 (undefined) and positive values are allowed; given %d", value));
+ this.value = value;
+ }
+
+ public boolean isDefined()
+ {
+ return value != UNDEFINED_VALUE;
+ }
+
+ public int or(int defaultValue)
+ {
+ return value == UNDEFINED_VALUE ? defaultValue : value;
+ }
+
+ public int or(IntSupplier defaultValue)
+ {
+ return value == UNDEFINED_VALUE ? defaultValue.getAsInt() : value;
+ }
+
+ @Override
+ public boolean equals(Object o)
+ {
+ if (this == o) return true;
+ if (o == null || getClass() != o.getClass()) return false;
+ OptionaldPositiveInt that = (OptionaldPositiveInt) o;
+ return value == that.value;
+ }
+
+ @Override
+ public int hashCode()
+ {
+ return Objects.hash(value);
+ }
+
+ @Override
+ public String toString()
+ {
+ return !isDefined() ? "null" : Integer.toString(value);
+ }
+}
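
A usage sketch for the class above, mirroring how available_processors now resolves in DatabaseDescriptor.getAvailableProcessors() (AvailableProcessorsSketch is a made-up harness):

import java.util.function.IntSupplier;

import org.apache.cassandra.config.OptionaldPositiveInt;

public class AvailableProcessorsSketch
{
    public static void main(String[] args)
    {
        OptionaldPositiveInt undefined = OptionaldPositiveInt.UNDEFINED;
        OptionaldPositiveInt pinned = new OptionaldPositiveInt(4);

        IntSupplier runtime = Runtime.getRuntime()::availableProcessors;
        System.out.println(undefined.or(runtime)); // falls back to the JVM's processor count
        System.out.println(pinned.or(runtime));    // 4

        // new OptionaldPositiveInt(0) throws IllegalArgumentException: only -1 (undefined) or >= 1 is allowed
    }
}
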
diff --git a/src/java/org/apache/cassandra/config/RetrySpec.java b/src/java/org/apache/cassandra/config/RetrySpec.java
index 4f113af962b5..de4cb3ebf14c 100644
--- a/src/java/org/apache/cassandra/config/RetrySpec.java
+++ b/src/java/org/apache/cassandra/config/RetrySpec.java
@@ -19,10 +19,22 @@
package org.apache.cassandra.config;
import java.util.Objects;
+import java.util.Random;
import javax.annotation.Nullable;
+import accord.utils.DefaultRandom;
import org.apache.cassandra.config.DurationSpec.LongMillisecondsBound;
+import org.apache.cassandra.repair.SharedContext;
+import org.apache.cassandra.service.RetryStrategy;
+import org.apache.cassandra.service.TimeoutStrategy;
+import org.apache.cassandra.service.TimeoutStrategy.LatencySupplier.Constant;
+import org.apache.cassandra.service.TimeoutStrategy.Wait.Modifying;
+import org.apache.cassandra.service.WaitStrategy;
+
+import static java.util.concurrent.TimeUnit.MICROSECONDS;
+import static org.apache.cassandra.service.RetryStrategy.randomizers;
+import static org.apache.cassandra.service.TimeoutStrategy.modifiers;
public class RetrySpec
{
@@ -79,7 +91,7 @@ public Partial()
public RetrySpec withDefaults(RetrySpec defaultValues)
{
- MaxAttempt maxAttempts = nonNull(this.maxAttempts, defaultValues.getMaxAttempts(), DEFAULT_MAX_ATTEMPTS);
+ MaxAttempt maxAttempts = nonNull(this.maxAttempts, defaultValues.getMaxAttempts(), DEFAULT_MAX_RETRIES);
LongMillisecondsBound baseSleepTime = nonNull(this.baseSleepTime, defaultValues.getBaseSleepTime(), DEFAULT_BASE_SLEEP);
LongMillisecondsBound maxSleepTime = nonNull(this.maxSleepTime, defaultValues.getMaxSleepTime(), DEFAULT_MAX_SLEEP);
return new RetrySpec(maxAttempts, baseSleepTime, maxSleepTime);
@@ -95,7 +107,7 @@ private static <T> T nonNull(@Nullable T left, @Nullable T right, T defaultValue
}
}
- public static final MaxAttempt DEFAULT_MAX_ATTEMPTS = MaxAttempt.DISABLED;
+ public static final MaxAttempt DEFAULT_MAX_RETRIES = MaxAttempt.DISABLED;
public static final LongMillisecondsBound DEFAULT_BASE_SLEEP = new LongMillisecondsBound("200ms");
public static final LongMillisecondsBound DEFAULT_MAX_SLEEP = new LongMillisecondsBound("1s");
@@ -104,7 +116,7 @@ private static <T> T nonNull(@Nullable T left, @Nullable T right, T defaultValue
*
* To disable, set to 0.
*/
- public MaxAttempt maxAttempts = DEFAULT_MAX_ATTEMPTS; // 2 retries, 1 original request; so 3 total
+ public MaxAttempt maxAttempts = DEFAULT_MAX_RETRIES; // 2 retries, 1 original request; so 3 total
public LongMillisecondsBound baseSleepTime = DEFAULT_BASE_SLEEP;
public LongMillisecondsBound maxSleepTime = DEFAULT_MAX_SLEEP;
@@ -153,6 +165,13 @@ public LongMillisecondsBound getMaxSleepTime()
return !isEnabled() ? null : maxSleepTime;
}
+ public static WaitStrategy toStrategy(SharedContext ctx, RetrySpec spec)
+ {
+ if (!spec.isEnabled())
+ return WaitStrategy.None.INSTANCE;
+ return doublingWaitStrategy(spec.maxAttempts.value, spec.baseSleepTime.to(MICROSECONDS), spec.maxSleepTime.to(MICROSECONDS), ctx.random().get());
+ }
+
@Override
public String toString()
{
@@ -162,4 +181,19 @@ public String toString()
", maxSleepTime=" + maxSleepTime +
'}';
}
+
+ // note: maxAttempts here excludes the initial attempt, so we are permitted this many retries
+ private static WaitStrategy doublingWaitStrategy(int maxRetries, long baseSleepTimeMicros, long maxSleepMicros, Random random)
+ {
+ return new RetryStrategy(randomizers(new DefaultRandom(random)).uniform(),
+ 0,
+ doublingWait(baseSleepTimeMicros / 2),
+ doublingWait(baseSleepTimeMicros + (baseSleepTimeMicros / 2)),
+ maxSleepMicros, maxRetries);
+ }
+
+ private static TimeoutStrategy.Wait doublingWait(long baseSleepTimeMicros)
+ {
+ return new Modifying(new Constant(baseSleepTimeMicros), modifiers.doubleByRetries());
+ }
}
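
doublingWaitStrategy() above seeds a wait window of roughly [base/2, 1.5*base] microseconds and doubles it per retry, capped at maxSleep. A standalone model of that arithmetic (the uniform jitter is an assumption read off randomizers(...).uniform(); this is not the RetryStrategy implementation):

import java.util.Random;

public class DoublingBackoffModel
{
    // Sleep window for the given retry (0-based): [base/2, 1.5*base] doubled per retry, capped.
    static long sleepMicros(long baseMicros, long maxMicros, int retry, Random random)
    {
        long lo = Math.min((baseMicros / 2) << retry, maxMicros);              // doublingWait(base / 2)
        long hi = Math.min((baseMicros + baseMicros / 2) << retry, maxMicros); // doublingWait(1.5 * base)
        return lo >= hi ? hi : lo + (long) (random.nextDouble() * (hi - lo));  // uniform jitter (assumed)
    }

    public static void main(String[] args)
    {
        Random random = new Random(42);
        // DEFAULT_BASE_SLEEP = 200ms, DEFAULT_MAX_SLEEP = 1s, expressed in micros
        for (int retry = 0; retry < 5; retry++)
            System.out.println("retry " + retry + ": ~" + sleepMicros(200_000, 1_000_000, retry, random) + "us");
    }
}
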
diff --git a/src/java/org/apache/cassandra/config/StringRetryStrategy.java b/src/java/org/apache/cassandra/config/StringRetryStrategy.java
new file mode 100644
index 000000000000..6003c6591ac7
--- /dev/null
+++ b/src/java/org/apache/cassandra/config/StringRetryStrategy.java
@@ -0,0 +1,46 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.config;
+
+import org.apache.cassandra.service.RetryStrategy;
+
+import static org.apache.cassandra.service.TimeoutStrategy.LatencySourceFactory.none;
+
+public class StringRetryStrategy
+{
+ private final String spec;
+ private final RetryStrategy retry;
+
+ public StringRetryStrategy(String spec)
+ {
+ this.spec = spec;
+ this.retry = RetryStrategy.parse(spec, none());
+ }
+
+ public RetryStrategy retry()
+ {
+ return retry;
+ }
+
+ @Override
+ public String toString()
+ {
+ return spec;
+ }
+}
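
A hedged usage sketch: the only spec string grounded in this patch is the cms_retry_delay default from Config above, and the wider grammar accepted by RetryStrategy.parse() is not shown here (RetryDelaySketch is made up):

import org.apache.cassandra.config.StringRetryStrategy;
import org.apache.cassandra.service.RetryStrategy;

public class RetryDelaySketch
{
    public static void main(String[] args)
    {
        StringRetryStrategy spec = new StringRetryStrategy("0 <= 50ms*1*attempts <= 10s,retries=10");
        RetryStrategy retry = spec.retry(); // parsed once, reused
        System.out.println(spec);           // toString() round-trips the original spec string
    }
}
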
diff --git a/src/java/org/apache/cassandra/config/YamlConfigurationLoader.java b/src/java/org/apache/cassandra/config/YamlConfigurationLoader.java
index 9bf4e415592c..f37a42e8fa54 100644
--- a/src/java/org/apache/cassandra/config/YamlConfigurationLoader.java
+++ b/src/java/org/apache/cassandra/config/YamlConfigurationLoader.java
@@ -135,16 +135,7 @@ public Config loadConfig(URL url) throws ConfigurationException
throw new AssertionError(e);
}
- SafeConstructor constructor = new CustomConstructor(Config.class, Yaml.class.getClassLoader());
- Map<Class<?>, Map<String, Replacement>> replacements = getNameReplacements(Config.class);
- verifyReplacements(replacements, configBytes);
- PropertiesChecker propertiesChecker = new PropertiesChecker(replacements);
- constructor.setPropertyUtils(propertiesChecker);
- Yaml yaml = new Yaml(constructor);
- Config result = loadConfig(yaml, configBytes);
- propertiesChecker.check();
- maybeAddSystemProperties(result);
- return result;
+ return loadConfig(configBytes);
}
catch (YAMLException e)
{
@@ -152,6 +143,21 @@ public Config loadConfig(URL url) throws ConfigurationException
}
}
+ @VisibleForTesting
+ static Config loadConfig(byte[] configBytes)
+ {
+ SafeConstructor constructor = new CustomConstructor(Config.class, Yaml.class.getClassLoader());
+ Map<Class<?>, Map<String, Replacement>> replacements = getNameReplacements(Config.class);
+ verifyReplacements(replacements, configBytes);
+ PropertiesChecker propertiesChecker = new PropertiesChecker(replacements);
+ constructor.setPropertyUtils(propertiesChecker);
+ Yaml yaml = new Yaml(constructor);
+ Config result = loadConfig(yaml, configBytes);
+ propertiesChecker.check();
+ maybeAddSystemProperties(result);
+ return result;
+ }
+
private static void maybeAddSystemProperties(Object obj)
{
if (CassandraRelevantProperties.CONFIG_ALLOW_SYSTEM_PROPERTIES.getBoolean())
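
Extracting the byte[] overload makes the YAML parsing path testable without a URL; a sketch of the intended use (the method is @VisibleForTesting and package-private, so this sits in org.apache.cassandra.config as a unit test would; YamlLoadSketch is made up):

package org.apache.cassandra.config;

import java.nio.charset.StandardCharsets;

public class YamlLoadSketch
{
    public static void main(String[] args)
    {
        byte[] yaml = "concurrent_writes: 64\n".getBytes(StandardCharsets.UTF_8);
        Config config = YamlConfigurationLoader.loadConfig(yaml);
        System.out.println(config.concurrent_writes); // 64
    }
}
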
diff --git a/src/java/org/apache/cassandra/cql3/CQLStatement.java b/src/java/org/apache/cassandra/cql3/CQLStatement.java
index badf9c342879..e3fbba9aa0ff 100644
--- a/src/java/org/apache/cassandra/cql3/CQLStatement.java
+++ b/src/java/org/apache/cassandra/cql3/CQLStatement.java
@@ -131,4 +131,14 @@ interface SingleKeyspaceCqlStatement extends CQLStatement
{
String keyspace();
}
+
+ interface CompositeCQLStatement extends CQLStatement
+ {
+ Iterable<? extends CQLStatement> getStatements();
+ }
+
+ interface ReturningCQLStatement extends CQLStatement
+ {
+ ResultSet.ResultMetadata getResultMetadata();
+ }
}
diff --git a/src/java/org/apache/cassandra/cql3/Operation.java b/src/java/org/apache/cassandra/cql3/Operation.java
index 7a7c0e842070..5cd0d094b29b 100644
--- a/src/java/org/apache/cassandra/cql3/Operation.java
+++ b/src/java/org/apache/cassandra/cql3/Operation.java
@@ -17,6 +17,7 @@
*/
package org.apache.cassandra.cql3;
+import java.nio.ByteBuffer;
import java.util.List;
import org.apache.cassandra.cql3.functions.Function;
@@ -26,8 +27,19 @@
import org.apache.cassandra.cql3.terms.Sets;
import org.apache.cassandra.cql3.terms.Term;
import org.apache.cassandra.cql3.terms.UserTypes;
+import org.apache.cassandra.db.Clustering;
import org.apache.cassandra.db.DecoratedKey;
-import org.apache.cassandra.db.marshal.*;
+import org.apache.cassandra.db.marshal.CollectionType;
+import org.apache.cassandra.db.marshal.CounterColumnType;
+import org.apache.cassandra.db.marshal.ListType;
+import org.apache.cassandra.db.marshal.MapType;
+import org.apache.cassandra.db.marshal.NumberType;
+import org.apache.cassandra.db.marshal.SetType;
+import org.apache.cassandra.db.marshal.StringType;
+import org.apache.cassandra.db.marshal.TupleType;
+import org.apache.cassandra.db.marshal.UserType;
+import org.apache.cassandra.db.rows.Cell;
+import org.apache.cassandra.db.rows.Row;
import org.apache.cassandra.exceptions.InvalidRequestException;
import org.apache.cassandra.schema.ColumnMetadata;
import org.apache.cassandra.schema.TableMetadata;
@@ -62,6 +74,11 @@ protected Operation(ColumnMetadata column, Term t)
this.t = t;
}
+ public Term term()
+ {
+ return t;
+ }
+
public void addFunctionsTo(List<Function> functions)
{
if (t != null)
@@ -69,14 +86,23 @@ public void addFunctionsTo(List<Function> functions)
}
/**
- * @return whether the operation requires a read of the previous value to be executed
- * (only lists setterByIdx, discard and discardByIdx requires that).
+ * @return whether the operation requires a read of the existing value to be executed
*/
public boolean requiresRead()
{
return false;
}
+
+ /**
+ * @return whether the operation requires its timestamp to be known to be executed safely
+ */
+ public boolean requiresTimestamp()
+ {
+ return false;
+ }
+
+
/**
* Collects the column specification for the bind variables of this operation.
*
@@ -89,6 +115,13 @@ public void collectMarkerSpecification(VariableSpecifications boundNames)
t.collectMarkerSpecification(boundNames);
}
+ protected ByteBuffer getCurrentCellBuffer(DecoratedKey key, UpdateParameters params)
+ {
+ Row currentRow = params.getPrefetchedRow(key, column.isStatic() ? Clustering.STATIC_CLUSTERING : params.currentClustering());
+ Cell> currentCell = currentRow == null ? null : currentRow.getCell(column);
+ return currentCell == null ? null : currentCell.buffer();
+ }
+
/**
* Execute the operation.
*
@@ -178,7 +211,7 @@ public Operation prepare(TableMetadata metadata, ColumnMetadata receiver, boolea
if (receiver.type.isCollection())
{
- switch (((CollectionType) receiver.type).kind)
+ switch (((CollectionType<?>) receiver.type).kind)
{
case LIST:
return new Lists.Setter(receiver, v);
@@ -228,7 +261,7 @@ public Operation prepare(TableMetadata metadata, ColumnMetadata receiver, boolea
else if (!(receiver.type.isMultiCell()))
throw new InvalidRequestException(String.format("Invalid operation (%s) for frozen collection column %s", toString(receiver), receiver.name));
- switch (((CollectionType)receiver.type).kind)
+ switch (((CollectionType<?>)receiver.type).kind)
{
case LIST:
Term idx = selector.prepare(metadata.keyspace, Lists.indexSpecOf(receiver));
@@ -328,7 +361,7 @@ public Operation prepare(TableMetadata metadata, ColumnMetadata receiver, boolea
else if (!(receiver.type.isMultiCell()))
throw new InvalidRequestException(String.format("Invalid operation (%s) for frozen collection column %s", toString(receiver), receiver.name));
- switch (((CollectionType)receiver.type).kind)
+ switch (((CollectionType<?>)receiver.type).kind)
{
case LIST:
return new Lists.Appender(receiver, value.prepare(metadata.keyspace, receiver));
@@ -371,17 +404,23 @@ public Substraction(Term.Raw value)
}
public Operation prepare(TableMetadata metadata, ColumnMetadata receiver, boolean canReadExistingState) throws InvalidRequestException
- {
+ {
if (!(receiver.type instanceof CollectionType))
{
- if (!(receiver.type instanceof CounterColumnType))
+ if (canReadExistingState)
+ {
+ if (!(receiver.type instanceof NumberType))
+ throw new InvalidRequestException(String.format("Invalid operation (%s) for non-numeric type %s", toString(receiver), receiver.name));
+ }
+ else if (!(receiver.type instanceof CounterColumnType))
throw new InvalidRequestException(String.format("Invalid operation (%s) for non counter column %s", toString(receiver), receiver.name));
+
return new Constants.Substracter(receiver, value.prepare(metadata.keyspace, receiver));
}
else if (!(receiver.type.isMultiCell()))
throw new InvalidRequestException(String.format("Invalid operation (%s) for frozen collection column %s", toString(receiver), receiver.name));
- switch (((CollectionType)receiver.type).kind)
+ switch (((CollectionType<?>)receiver.type).kind)
{
case LIST:
return new Lists.Discarder(receiver, value.prepare(metadata.keyspace, receiver));
@@ -392,7 +431,7 @@ else if (!(receiver.type.isMultiCell()))
ColumnSpecification vr = new ColumnSpecification(receiver.ksName,
receiver.cfName,
receiver.name,
- SetType.getInstance(((MapType)receiver.type).getKeysType(), false));
+ SetType.getInstance(((MapType<?, ?>) receiver.type).getKeysType(), true));
Term term;
try
{
@@ -494,7 +533,7 @@ public Operation prepare(String keyspace, ColumnMetadata receiver, TableMetadata
else if (!(receiver.type.isMultiCell()))
throw new InvalidRequestException(String.format("Invalid deletion operation for frozen collection column %s", receiver.name));
- switch (((CollectionType)receiver.type).kind)
+ switch (((CollectionType<?>)receiver.type).kind)
{
case LIST:
Term idx = element.prepare(keyspace, Lists.indexSpecOf(receiver));
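
requiresRead() and the new requiresTimestamp() mark operations that cannot be applied blindly; inside a transaction such operations migrate to substitutions (see Operations below). A deliberately tiny, hypothetical model of the distinction, using plain ints in place of Cells:

import java.util.HashMap;
import java.util.Map;

public class ReadBeforeWriteModel
{
    static final Map<String, Integer> row = new HashMap<>(Map.of("balance", 100));

    // A counter "x -= 5" is a commutative delta: no read needed, requiresRead() stays false.
    static void applyCounterDelta(String column, int delta)
    {
        row.merge(column, -delta, Integer::sum);
    }

    // A plain numeric "x -= 5" must observe the current value first (requiresRead() == true),
    // which is what forces such operations into ReferenceOperation substitutions inside a txn.
    static void applyNumericSubtraction(String column, int operand)
    {
        Integer current = row.get(column); // the read-before-write
        row.put(column, (current == null ? 0 : current) - operand);
    }

    public static void main(String[] args)
    {
        applyCounterDelta("balance", 5);
        applyNumericSubtraction("balance", 5);
        System.out.println(row); // {balance=90}
    }
}
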
diff --git a/src/java/org/apache/cassandra/cql3/Operations.java b/src/java/org/apache/cassandra/cql3/Operations.java
index a9451d7fc544..305d2baa8922 100644
--- a/src/java/org/apache/cassandra/cql3/Operations.java
+++ b/src/java/org/apache/cassandra/cql3/Operations.java
@@ -21,10 +21,13 @@
import java.util.Iterator;
import java.util.List;
+import com.google.common.base.Preconditions;
+import com.google.common.collect.Iterators;
+
import org.apache.cassandra.cql3.functions.Function;
import org.apache.cassandra.cql3.statements.StatementType;
-
-import com.google.common.collect.Iterators;
+import org.apache.cassandra.cql3.transactions.ReferenceOperation;
+import org.apache.cassandra.schema.ColumnMetadata;
/**
* A set of Operations.
@@ -36,6 +39,10 @@ public final class Operations implements Iterable<Operation>
* The type of statement.
*/
private final StatementType type;
+ /**
+ * Whether these operations belong to a transaction; if so, add() "migrates" any operation that requires a read or a timestamp into ReferenceOperation substitutions
+ */
+ private final boolean isForTxn;
/**
* The operations on regular columns.
@@ -47,9 +54,29 @@ public final class Operations implements Iterable<Operation>
*/
private final List<Operation> staticOperations = new ArrayList<>();
- public Operations(StatementType type)
+ private final List<ReferenceOperation> regularSubstitutions = new ArrayList<>();
+ private final List<ReferenceOperation> staticSubstitutions = new ArrayList<>();
+
+ public Operations(StatementType type, boolean isForTxn)
{
this.type = type;
+ this.isForTxn = isForTxn;
+ }
+
+ private Operations(Operations other)
+ {
+ Preconditions.checkState(!other.isForTxn, "Unable to migrate from txn to txn");
+ Preconditions.checkState(other.regularSubstitutions.isEmpty() && other.staticSubstitutions.isEmpty(), "Transaction substitutions are defined for non-transaction operations! regular=%s, static=%s", other.regularSubstitutions, other.staticSubstitutions);
+
+ type = other.type;
+ isForTxn = true;
+ for (Operation opt : other)
+ add(opt);
+ }
+
+ public Operations forTxn()
+ {
+ return new Operations(this);
}
/**
@@ -59,7 +86,7 @@ public Operations(StatementType type)
*/
public boolean appliesToStaticColumns()
{
- return !staticOperations.isEmpty();
+ return !staticIsEmpty();
}
/**
@@ -69,10 +96,10 @@ public boolean appliesToStaticColumns()
*/
public boolean appliesToRegularColumns()
{
- // If we have regular operations, this applies to regular columns.
+ // If we have regular operations, this applies to regular columns.
// Otherwise, if the statement is a DELETE and staticOperations is also empty, this means we have no operations,
// which for a DELETE means a full row deletion. Which means the operation applies to all columns and regular ones in particular.
- return !regularOperations.isEmpty() || (type.isDelete() && staticOperations.isEmpty());
+ return !regularIsEmpty() || (type.isDelete() && staticIsEmpty());
}
/**
@@ -99,12 +126,26 @@ public List<Operation> staticOperations()
*/
public void add(Operation operation)
{
+ if (isForTxn && (operation.requiresRead() || operation.requiresTimestamp()))
+ {
+ add(operation.column, ReferenceOperation.create(operation));
+ return;
+ }
if (operation.column.isStatic())
staticOperations.add(operation);
else
regularOperations.add(operation);
}
+ public void add(ColumnMetadata column, ReferenceOperation operation)
+ {
+ Preconditions.checkState(isForTxn, "Unable to add a transaction reference to a non-transaction operation");
+ if (column.isStatic())
+ staticSubstitutions.add(operation);
+ else
+ regularSubstitutions.add(operation);
+ }
+
/**
* Checks if one of the operations requires a read.
*
@@ -126,7 +167,7 @@ public boolean requiresRead()
*/
public boolean isEmpty()
{
- return staticOperations.isEmpty() && regularOperations.isEmpty();
+ return staticIsEmpty() && regularIsEmpty();
}
/**
@@ -142,5 +183,41 @@ public void addFunctionsTo(List<Function> functions)
{
regularOperations.forEach(p -> p.addFunctionsTo(functions));
staticOperations.forEach(p -> p.addFunctionsTo(functions));
+ //TODO substitutions as well?
+ }
+
+ public List<ReferenceOperation> allSubstitutions()
+ {
+ if (staticSubstitutions.isEmpty())
+ return regularSubstitutions;
+
+ if (regularSubstitutions.isEmpty())
+ return staticSubstitutions;
+
+ // Only create a new list if we actually have something to combine
+ List<ReferenceOperation> list = new ArrayList<>(staticSubstitutions.size() + regularSubstitutions.size());
+ list.addAll(staticSubstitutions);
+ list.addAll(regularSubstitutions);
+ return list;
+ }
+
+ public List<ReferenceOperation> regularSubstitutions()
+ {
+ return regularSubstitutions;
+ }
+
+ public List<ReferenceOperation> staticSubstitutions()
+ {
+ return staticSubstitutions;
+ }
+
+ private boolean regularIsEmpty()
+ {
+ return regularOperations.isEmpty() && regularSubstitutions.isEmpty();
+ }
+
+ private boolean staticIsEmpty()
+ {
+ return staticOperations.isEmpty() && staticSubstitutions.isEmpty();
}
}
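
A sketch of the new construction path, using only signatures visible in this hunk plus the standard StatementType enum (TxnOperationsSketch is made up):

import org.apache.cassandra.cql3.Operations;
import org.apache.cassandra.cql3.statements.StatementType;

public class TxnOperationsSketch
{
    public static void main(String[] args)
    {
        // Plain (non-transactional) operations, as before; the boolean flag is new.
        Operations ops = new Operations(StatementType.UPDATE, false);

        // forTxn() copies into a transactional container; from then on add() reroutes any
        // operation with requiresRead()/requiresTimestamp() into the substitution lists.
        Operations txnOps = ops.forTxn();
        System.out.println(txnOps.isEmpty()); // true: nothing added yet
    }
}
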
diff --git a/src/java/org/apache/cassandra/cql3/Operator.java b/src/java/org/apache/cassandra/cql3/Operator.java
index 64658b43226f..201a046b2f57 100644
--- a/src/java/org/apache/cassandra/cql3/Operator.java
+++ b/src/java/org/apache/cassandra/cql3/Operator.java
@@ -26,6 +26,7 @@
import java.util.Objects;
import java.util.function.Function;
import java.util.stream.Collectors;
+import java.util.stream.Stream;
import com.google.common.collect.RangeSet;
@@ -39,13 +40,18 @@
import org.apache.cassandra.db.marshal.SetType;
import org.apache.cassandra.db.rows.CellPath;
import org.apache.cassandra.db.rows.ComplexColumnData;
+import org.apache.cassandra.io.util.DataInputPlus;
+import org.apache.cassandra.io.util.DataOutputPlus;
import org.apache.cassandra.schema.ColumnMetadata;
import org.apache.cassandra.serializers.ListSerializer;
import org.apache.cassandra.utils.ByteBufferUtil;
+import static com.google.common.base.Preconditions.checkArgument;
+
import static org.apache.cassandra.cql3.statements.RequestValidations.checkFalse;
import static org.apache.cassandra.cql3.statements.RequestValidations.checkTrue;
import static org.apache.cassandra.cql3.statements.RequestValidations.invalidRequest;
+import static org.apache.cassandra.db.TypeSizes.sizeofUnsignedVInt;
public enum Operator
{
@@ -828,6 +834,25 @@ public enum Kind
BINARY, TERNARY, MULTI_VALUE;
};
+ private static final Operator[] idToOperatorMapping;
+
+ static
+ {
+ Operator[] operators = values();
+ int maxId = Stream.of(operators)
+ .map(Operator::getValue)
+ .max(Integer::compareTo)
+ .get();
+
+ idToOperatorMapping = new Operator[maxId + 1];
+ for (Operator operator : operators)
+ {
+ if (null != idToOperatorMapping[operator.b])
+ throw new IllegalStateException("Duplicate Operator id " + operator.b);
+ idToOperatorMapping[operator.b] = operator;
+ }
+ }
+
/**
* The binary representation of this Enum value.
*/
@@ -853,6 +878,17 @@ public void writeTo(DataOutput output) throws IOException
output.writeInt(getValue());
}
+ /**
+ * Write the serialized version of this Operator to the specified output.
+ *
+ * @param output the output to write to
+ * @throws IOException if an I/O problem occurs while writing to the specified output
+ */
+ public void writeToUnsignedVInt(DataOutputPlus output) throws IOException
+ {
+ output.writeUnsignedVInt32(b);
+ }
+
public int getValue()
{
return b;
@@ -885,12 +921,27 @@ public boolean isTernary()
*/
public static Operator readFrom(DataInput input) throws IOException
{
- int b = input.readInt();
- for (Operator operator : values())
- if (operator.b == b)
- return operator;
+ return fromBinary(input.readInt());
+ }
+
+ /**
+ * Deserializes a Operator instance from the specified input.
+ *
+ * @param input the input to read from
+ * @return the Operator instance deserialized
+ * @throws IOException if a problem occurs while deserializing the Type instance.
+ */
+ public static Operator readFromUnsignedVInt(DataInputPlus input) throws IOException
+ {
+ return fromBinary(input.readUnsignedVInt32());
+ }
- throw new IOException(String.format("Cannot resolve Relation.Type from binary representation: %s", b));
+ private static Operator fromBinary(int b) throws IOException
+ {
+ checkArgument(b > -1, "b must be > -1 to be a valid Operator id");
+ if (b >= idToOperatorMapping.length || idToOperatorMapping[b] == null)
+ throw new IOException(String.format("Cannot resolve Operator from binary representation: %s", b));
+ return idToOperatorMapping[b];
}
@@ -1149,4 +1200,9 @@ private String buildCQLString(String leftOperand, T rightOperand, Function>>";
+ }
+ else
+ {
+ if (!isEmpty())
+ return String.format("at [%d:%d]", line + 1, charPositionInLine + 1);
+ else
+ return "";
+ }
+ }
+
+ public boolean isEmpty()
+ {
+ return line > Character.MAX_VALUE || line == Character.MAX_VALUE && charPositionInLine > Character.MAX_VALUE;
+ }
+
+ // note - this can also reproduce the original statement raw text by getting TokenStream and calling toString(startToken, endToken)
+ public static StatementSource create(Token startToken)
+ {
+ Objects.requireNonNull(startToken);
+
+ if (startToken.getType() == Token.EOF)
+ return new StatementSource(Character.MAX_VALUE + 1, 0);
+
+ int startLine = min(max(startToken.getLine(), 1) - 1, Character.MAX_VALUE);
+ int startChar = min(max(startToken.getCharPositionInLine(), 0), Character.MAX_VALUE);
+
+ return new StatementSource(startLine, startChar);
+ }
+
+}
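
Back in Operator above, the new vint path shrinks small ids to one byte on the wire; a round-trip sketch, assuming the usual DataOutputBuffer/DataInputBuffer helpers from org.apache.cassandra.io.util (constructor shapes may differ across branches):

import org.apache.cassandra.cql3.Operator;
import org.apache.cassandra.io.util.DataInputBuffer;
import org.apache.cassandra.io.util.DataOutputBuffer;

public class OperatorVIntRoundTrip
{
    public static void main(String[] args) throws Exception
    {
        // writeTo()/readFrom() always cost a fixed 4 bytes; the vint variants cost 1 for small ids.
        try (DataOutputBuffer out = new DataOutputBuffer())
        {
            Operator.EQ.writeToUnsignedVInt(out);
            try (DataInputBuffer in = new DataInputBuffer(out.buffer(), false))
            {
                System.out.println(Operator.readFromUnsignedVInt(in)); // EQ
            }
        }
    }
}
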
diff --git a/src/java/org/apache/cassandra/cql3/UntypedResultSet.java b/src/java/org/apache/cassandra/cql3/UntypedResultSet.java
index a0201c500a39..f1b44fc8e131 100644
--- a/src/java/org/apache/cassandra/cql3/UntypedResultSet.java
+++ b/src/java/org/apache/cassandra/cql3/UntypedResultSet.java
@@ -20,7 +20,6 @@
import java.net.InetAddress;
import java.nio.ByteBuffer;
-import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
@@ -28,24 +27,34 @@
import java.util.Map;
import java.util.Set;
import java.util.UUID;
+import javax.annotation.Nonnull;
import java.util.stream.Stream;
import java.util.stream.StreamSupport;
import com.google.common.annotations.VisibleForTesting;
+import com.google.common.collect.ImmutableList;
import com.datastax.driver.core.CodecUtils;
import org.apache.cassandra.cql3.functions.types.LocalDate;
import org.apache.cassandra.cql3.statements.SelectStatement;
-import org.apache.cassandra.db.Clustering;
import org.apache.cassandra.db.ConsistencyLevel;
-import org.apache.cassandra.db.DecoratedKey;
import org.apache.cassandra.db.ReadExecutionController;
-import org.apache.cassandra.db.marshal.*;
+import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.db.marshal.BooleanType;
+import org.apache.cassandra.db.marshal.ByteType;
+import org.apache.cassandra.db.marshal.DoubleType;
+import org.apache.cassandra.db.marshal.InetAddressType;
+import org.apache.cassandra.db.marshal.Int32Type;
+import org.apache.cassandra.db.marshal.ListType;
+import org.apache.cassandra.db.marshal.LongType;
+import org.apache.cassandra.db.marshal.MapType;
+import org.apache.cassandra.db.marshal.SetType;
+import org.apache.cassandra.db.marshal.ShortType;
+import org.apache.cassandra.db.marshal.TimestampType;
+import org.apache.cassandra.db.marshal.UTF8Type;
+import org.apache.cassandra.db.marshal.UUIDType;
+import org.apache.cassandra.db.marshal.VectorType;
import org.apache.cassandra.db.partitions.PartitionIterator;
-import org.apache.cassandra.db.rows.Cell;
-import org.apache.cassandra.db.rows.ComplexColumnData;
-import org.apache.cassandra.schema.ColumnMetadata;
-import org.apache.cassandra.schema.TableMetadata;
import org.apache.cassandra.service.ClientState;
import org.apache.cassandra.service.pager.QueryPager;
import org.apache.cassandra.transport.Dispatcher;
@@ -61,11 +70,6 @@ public static UntypedResultSet create(ResultSet rs)
return new FromResultSet(rs);
}
- public static UntypedResultSet create(List<Map<String, ByteBuffer>> results)
- {
- return new FromResultList(results);
- }
-
public static UntypedResultSet create(SelectStatement select, QueryPager pager, int pageSize)
{
return new FromPager(select, pager, pageSize);
@@ -143,48 +147,6 @@ public List<ColumnSpecification> metadata()
}
}
- private static class FromResultList extends UntypedResultSet
- {
- private final List<Map<String, ByteBuffer>> cqlRows;
-
- private FromResultList(List<Map<String, ByteBuffer>> cqlRows)
- {
- this.cqlRows = cqlRows;
- }
-
- public int size()
- {
- return cqlRows.size();
- }
-
- public Row one()
- {
- if (cqlRows.size() != 1)
- throw new IllegalStateException("One row required, " + cqlRows.size() + " found");
- return new Row(cqlRows.get(0));
- }
-
- public Iterator iterator()
- {
- return new AbstractIterator<Row>()
- {
- final Iterator<Map<String, ByteBuffer>> iter = cqlRows.iterator();
-
- protected Row computeNext()
- {
- if (!iter.hasNext())
- return endOfData();
- return new Row(iter.next());
- }
- };
- }
-
- public List<ColumnSpecification> metadata()
- {
- throw new UnsupportedOperationException();
- }
- }
-
private static class FromPager extends UntypedResultSet
{
private final SelectStatement select;
@@ -308,52 +270,18 @@ public List<ColumnSpecification> metadata()
public static class Row
{
+ @Nonnull
private final Map<String, ByteBuffer> data = new HashMap<>();
- private final List<ColumnSpecification> columns = new ArrayList<>();
+ @Nonnull
+ private final List<ColumnSpecification> columns;
- public Row(Map<String, ByteBuffer> data)
+ public Row(@Nonnull List<ColumnSpecification> names, @Nonnull List<ByteBuffer> columns)
{
- this.data.putAll(data);
- }
-
- public Row(List<ColumnSpecification> names, List<ByteBuffer> columns)
- {
- this.columns.addAll(names);
+ this.columns = ImmutableList.copyOf(names);
for (int i = 0; i < names.size(); i++)
data.put(names.get(i).name.toString(), columns.get(i));
}
- public static Row fromInternalRow(TableMetadata metadata, DecoratedKey key, org.apache.cassandra.db.rows.Row row)
- {
- Map<String, ByteBuffer> data = new HashMap<>();
-
- ByteBuffer[] keyComponents = SelectStatement.getComponents(metadata, key);
- for (ColumnMetadata def : metadata.partitionKeyColumns())
- data.put(def.name.toString(), keyComponents[def.position()]);
-
- Clustering<?> clustering = row.clustering();
- for (ColumnMetadata def : metadata.clusteringColumns())
- data.put(def.name.toString(), clustering.bufferAt(def.position()));
-
- for (ColumnMetadata def : metadata.regularAndStaticColumns())
- {
- if (def.isSimple())
- {
- Cell<?> cell = row.getCell(def);
- if (cell != null)
- data.put(def.name.toString(), cell.buffer());
- }
- else
- {
- ComplexColumnData complexData = row.getComplexColumnData(def);
- if (complexData != null)
- data.put(def.name.toString(), ((CollectionType<?>) def.type).serializeForNativeProtocol(complexData.iterator()));
- }
- }
-
- return new Row(data);
- }
-
public boolean has(String column)
{
// Note that containsKey won't work because we may have null values
@@ -504,7 +432,47 @@ public List<ColumnSpecification> getColumns()
@Override
public String toString()
{
- return data.toString();
+ StringBuilder sb = new StringBuilder();
+ toString(sb);
+ return sb.toString();
+ }
+
+ public void toString(StringBuilder sb)
+ {
+ for (int i = 0; i < columns.size(); i++)
+ {
+ ColumnSpecification cspec = columns.get(i);
+ ByteBuffer v = data.get(cspec.name.toString());
+ if (i != 0)
+ sb.append(" | ");
+ if (v == null)
+ {
+ sb.append("null");
+ }
+ else
+ {
+ sb.append(cspec.type.getString(v));
+ }
+ }
+ }
+ }
+
+ /**
+ * When an UntypedResultSet is backed by a pager, calling toString would consume the pager.
+ * Since toString shouldn't mutate the object (and consuming the pager breaks whatever is
+ * still waiting on the results), call this method explicitly when a pretty-printed string is wanted.
+ */
+ @SuppressWarnings("unused")
+ public String toStringUnsafe()
+ {
+ StringBuilder sb = new StringBuilder();
+ sb.append(metadata()).append('\n');
+ for (Row row : this)
+ {
+ row.toString(sb);
+ sb.append('\n');
}
+ sb.append("---");
+ return sb.toString();
}
}
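The toStringUnsafe addition above generalizes to a common pitfall: any object whose toString consumes a one-shot source silently breaks later readers. A minimal, self-contained sketch of the same pattern outside Cassandra (all names illustrative, not part of this patch):

import java.util.Iterator;
import java.util.List;

// Illustrative analogue of a pager-backed result set: iteration is one-shot.
final class OneShotResults
{
    private final Iterator<String> rows; // advancing this is destructive

    OneShotResults(List<String> rows)
    {
        this.rows = rows.iterator();
    }

    // toString stays side-effect free, as the javadoc above prescribes.
    @Override
    public String toString()
    {
        return "OneShotResults@" + Integer.toHexString(System.identityHashCode(this));
    }

    // Pretty-printing is opt-in and explicitly destructive, mirroring toStringUnsafe().
    String toStringUnsafe()
    {
        StringBuilder sb = new StringBuilder();
        while (rows.hasNext())
            sb.append(rows.next()).append('\n');
        return sb.append("---").toString();
    }
}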
diff --git a/src/java/org/apache/cassandra/cql3/UpdateParameters.java b/src/java/org/apache/cassandra/cql3/UpdateParameters.java
index a68fd1d736cd..e4ceec16fb7f 100644
--- a/src/java/org/apache/cassandra/cql3/UpdateParameters.java
+++ b/src/java/org/apache/cassandra/cql3/UpdateParameters.java
@@ -20,14 +20,26 @@
import java.nio.ByteBuffer;
import java.util.Map;
-import org.apache.cassandra.db.guardrails.Guardrails;
-import org.apache.cassandra.schema.ColumnMetadata;
-import org.apache.cassandra.schema.TableMetadata;
-import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.Clustering;
+import org.apache.cassandra.db.ClusteringComparator;
+import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.db.DeletionPurger;
+import org.apache.cassandra.db.DeletionTime;
+import org.apache.cassandra.db.LivenessInfo;
+import org.apache.cassandra.db.RangeTombstone;
+import org.apache.cassandra.db.Slice;
import org.apache.cassandra.db.context.CounterContext;
+import org.apache.cassandra.db.guardrails.Guardrails;
import org.apache.cassandra.db.partitions.Partition;
-import org.apache.cassandra.db.rows.*;
+import org.apache.cassandra.db.rows.BTreeRow;
+import org.apache.cassandra.db.rows.BufferCell;
+import org.apache.cassandra.db.rows.Cell;
+import org.apache.cassandra.db.rows.CellPath;
+import org.apache.cassandra.db.rows.Row;
+import org.apache.cassandra.db.rows.Rows;
import org.apache.cassandra.exceptions.InvalidRequestException;
+import org.apache.cassandra.schema.ColumnMetadata;
+import org.apache.cassandra.schema.TableMetadata;
import org.apache.cassandra.service.ClientState;
import org.apache.cassandra.utils.TimeUUID;
@@ -37,17 +49,16 @@
public class UpdateParameters
{
public final TableMetadata metadata;
- public final RegularAndStaticColumns updatedColumns;
public final ClientState clientState;
public final QueryOptions options;
private final long nowInSec;
- private final long timestamp;
+ protected final long timestamp;
private final int ttl;
private final DeletionTime deletionTime;
- // For lists operation that require a read-before-write. Will be null otherwise.
+ // Holds data for operations that require a read-before-write. Will be null otherwise.
private final Map<DecoratedKey, Partition> prefetchedRows;
private Row.Builder staticBuilder;
@@ -57,17 +68,14 @@ public class UpdateParameters
private Row.Builder builder;
public UpdateParameters(TableMetadata metadata,
- RegularAndStaticColumns updatedColumns,
ClientState clientState,
QueryOptions options,
long timestamp,
long nowInSec,
int ttl,
- Map<DecoratedKey, Partition> prefetchedRows)
- throws InvalidRequestException
+ Map<DecoratedKey, Partition> prefetchedRows) throws InvalidRequestException
{
this.metadata = metadata;
- this.updatedColumns = updatedColumns;
this.clientState = clientState;
this.options = options;
@@ -123,10 +131,20 @@ public Clustering<?> currentClustering()
public void addPrimaryKeyLivenessInfo()
{
- builder.addPrimaryKeyLivenessInfo(LivenessInfo.create(timestamp, ttl, nowInSec));
+ addPrimaryKeyLivenessInfo(LivenessInfo.create(timestamp, ttl, nowInSec));
+ }
+
+ private void addPrimaryKeyLivenessInfo(LivenessInfo info)
+ {
+ builder.addPrimaryKeyLivenessInfo(info);
}
public void addRowDeletion()
+ {
+ addRowDeletion(Row.Deletion.regular(deletionTime));
+ }
+
+ private void addRowDeletion(Row.Deletion deletion)
{
// For compact tables, with the exception of the static row (of static compact tables), each row only ever has a single column,
// the "compact" one. As such, deleting the row or deleting that single cell is equivalent. We favor the latter
@@ -134,7 +152,7 @@ public void addRowDeletion()
if (metadata.isCompactTable() && builder.clustering() != Clustering.STATIC_CLUSTERING)
addTombstone(((TableMetadata.CompactTableMetadata) metadata).compactValueColumn);
else
- builder.addRowDeletion(Row.Deletion.regular(deletionTime));
+ builder.addRowDeletion(deletion);
}
public void addTombstone(ColumnMetadata column) throws InvalidRequestException
@@ -175,6 +193,14 @@ public Cell<?> addCell(ColumnMetadata column, CellPath path, ByteBuffer value) throws InvalidRequestException
return cell;
}
+ public void addRow(Row row)
+ {
+ newRow(row.clustering());
+ addRowDeletion(row.deletion());
+ addPrimaryKeyLivenessInfo(row.primaryKeyLivenessInfo());
+ row.cells().forEach(builder::addCell);
+ }
+
public void addCounter(ColumnMetadata column, long increment) throws InvalidRequestException
{
assert ttl == LivenessInfo.NO_TTL;
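The new addRow above replays a fully materialized Row into the update under construction in a fixed order: clustering first, then the row deletion, then primary-key liveness, then each cell. A hedged usage sketch (copyRows and prefetched are illustrative placeholders, not names this patch defines):

// Sketch: copy every row of a previously read partition into the update being built.
// Each addRow call mirrors the method body above: newRow(clustering), then deletion,
// then liveness info, then cells.
static void copyRows(org.apache.cassandra.cql3.UpdateParameters params,
                     Iterable<org.apache.cassandra.db.rows.Row> prefetched)
{
    for (org.apache.cassandra.db.rows.Row row : prefetched)
        params.addRow(row);
}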
diff --git a/src/java/org/apache/cassandra/cql3/conditions/ColumnCondition.java b/src/java/org/apache/cassandra/cql3/conditions/ColumnCondition.java
index 1d0afcddc2e0..b3813d4f2007 100644
--- a/src/java/org/apache/cassandra/cql3/conditions/ColumnCondition.java
+++ b/src/java/org/apache/cassandra/cql3/conditions/ColumnCondition.java
@@ -17,6 +17,7 @@
*/
package org.apache.cassandra.cql3.conditions;
+import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.*;
@@ -42,11 +43,17 @@
import org.apache.cassandra.db.rows.ColumnData;
import org.apache.cassandra.db.rows.ComplexColumnData;
import org.apache.cassandra.db.rows.Row;
+import org.apache.cassandra.io.UnversionedSerializer;
+import org.apache.cassandra.io.util.DataInputPlus;
+import org.apache.cassandra.io.util.DataOutputPlus;
import org.apache.cassandra.schema.ColumnMetadata;
import org.apache.cassandra.schema.TableMetadata;
import org.apache.cassandra.utils.ByteBufferUtil;
import static org.apache.cassandra.cql3.statements.RequestValidations.*;
+import static org.apache.cassandra.db.TypeSizes.sizeofUnsignedVInt;
+import static org.apache.cassandra.service.accord.AccordSerializers.columnMetadataSerializer;
+import static org.apache.cassandra.utils.ByteBufferUtil.nullableByteBufferSerializer;
/**
* A CQL3 condition on the value of a column or collection element. For example, "UPDATE .. IF a = 0".
@@ -171,11 +178,46 @@ public String toCQLString()
return operator.buildCQLString(columnsExpression, values);
}
+ public interface BoundSerializer<T extends Bound>
+ {
+ default void serialize(T bound, DataOutputPlus out) throws IOException {}
+ Bound deserialize(DataInputPlus in, ColumnMetadata column, Operator operator, ByteBuffer value) throws IOException;
+ default long serializedSize(T condition) { return 0; }
+ }
+
+ public enum BoundKind
+ {
+ Simple(0, SimpleBound.serializer),
+ ElementOrFieldAccess(1, ElementOrFieldAccessBound.serializer),
+ MultiCell(2, MultiCellBound.serializer);
+
+ private final int id;
+ @SuppressWarnings("rawtypes")
+ public final BoundSerializer serializer;
+
+ BoundKind(int id, BoundSerializer<?> serializer)
+ {
+ this.id = id;
+ this.serializer = serializer;
+ }
+
+ public static BoundKind valueOf(int id)
+ {
+ switch (id)
+ {
+ case 0: return BoundKind.Simple;
+ case 1: return BoundKind.ElementOrFieldAccess;
+ case 2: return BoundKind.MultiCell;
+ default: throw new IllegalArgumentException("Unknown id: " + id);
+ }
+ }
+ }
+
public static abstract class Bound
{
- protected final ColumnMetadata column;
- protected final Operator operator;
- protected final ByteBuffer value;
+ public final ColumnMetadata column;
+ public final Operator operator;
+ public final ByteBuffer value;
protected Bound(ColumnMetadata column, Operator operator, ByteBuffer value)
{
@@ -188,14 +230,52 @@ protected Bound(ColumnMetadata column, Operator operator, ByteBuffer value)
* Validates whether this condition applies to {@code current}.
*/
public abstract boolean appliesTo(Row row);
+
+ public abstract BoundKind kind();
+
+ public static final UnversionedSerializer<Bound> serializer = new UnversionedSerializer<>() {
+ @Override
+ public void serialize(Bound bound, DataOutputPlus out) throws IOException
+ {
+ columnMetadataSerializer.serialize(bound.column, out);
+ bound.operator.writeToUnsignedVInt(out);
+ nullableByteBufferSerializer.serialize(bound.value, out);
+ ColumnCondition.BoundKind kind = bound.kind();
+ out.writeUnsignedVInt32(kind.ordinal());
+ kind.serializer.serialize(bound, out);
+ }
+
+ @Override
+ public Bound deserialize(DataInputPlus in) throws IOException
+ {
+ ColumnMetadata column = columnMetadataSerializer.deserialize(in);
+ Operator operator = Operator.readFromUnsignedVInt(in);
+ ByteBuffer value = nullableByteBufferSerializer.deserialize(in);
+ ColumnCondition.BoundKind boundKind = ColumnCondition.BoundKind.valueOf(in.readUnsignedVInt32());
+ return boundKind.serializer.deserialize(in, column, operator, value);
+ }
+
+ @Override
+ public long serializedSize(Bound bound)
+ {
+ ColumnCondition.BoundKind kind = bound.kind();
+ return columnMetadataSerializer.serializedSize(bound.column)
+ + bound.operator.sizeAsUnsignedVInt()
+ + nullableByteBufferSerializer.serializedSize(bound.value)
+ + sizeofUnsignedVInt(kind.ordinal())
+ + kind.serializer.serializedSize(bound);
+ }
+ };
}
/**
* A condition on a single non-collection column.
*/
- private static final class SimpleBound extends Bound
+ public static class SimpleBound extends Bound
{
- private SimpleBound(ColumnMetadata column, Operator operator, ByteBuffer value)
+ private static final BoundSerializer<SimpleBound> serializer = (in, column, operator, value) -> new SimpleBound(column, operator, value);
+
+ public SimpleBound(ColumnMetadata column, Operator operator, ByteBuffer value)
{
super(column, operator, value);
}
@@ -206,7 +286,7 @@ public boolean appliesTo(Row row)
return operator.isSatisfiedBy(column.type, rowValue(row), value);
}
- private ByteBuffer rowValue(Row row)
+ protected ByteBuffer rowValue(Row row)
{
// If we're asking for a given cell, and we didn't get any row from our read, it's
// the same as not having said cell.
@@ -216,13 +296,70 @@ private ByteBuffer rowValue(Row row)
Cell<?> c = row.getCell(column);
return c == null ? null : c.buffer();
}
+
+ @Override
+ public BoundKind kind()
+ {
+ return BoundKind.Simple;
+ }
+
+ @Override
+ public boolean equals(Object o)
+ {
+ if (this == o) return true;
+ if (o == null || getClass() != o.getClass()) return false;
+ SimpleBound bound = (SimpleBound) o;
+ return column.equals(bound.column) && operator == bound.operator && Objects.equals(value, bound.value);
+ }
+
+ @Override
+ public int hashCode()
+ {
+ return Objects.hash(column, operator, value);
+ }
+ }
+
+ public static class SimpleClusteringBound extends SimpleBound
+ {
+ public SimpleClusteringBound(ColumnMetadata column, Operator operator, ByteBuffer value)
+ {
+ super(column, operator, value);
+ assert column.isClusteringColumn() : String.format("Column must be a clustering column, but given %s", column);
+ }
+
+ @Override
+ protected ByteBuffer rowValue(Row row)
+ {
+ return row == null ? null : row.clustering().bufferAt(column.position());
+ }
}
/**
* A condition on a collection element or a UDT field.
*/
- private static final class ElementOrFieldAccessBound extends Bound
+ public static final class ElementOrFieldAccessBound extends Bound
{
+ private static final BoundSerializer<ElementOrFieldAccessBound> serializer = new BoundSerializer<>()
+ {
+ @Override
+ public void serialize(ElementOrFieldAccessBound bound, DataOutputPlus out) throws IOException
+ {
+ nullableByteBufferSerializer.serialize(bound.keyOrIndex, out);
+ }
+
+ @Override
+ public Bound deserialize(DataInputPlus in, ColumnMetadata column, Operator operator, ByteBuffer value) throws IOException
+ {
+ ByteBuffer keyOrIndex = nullableByteBufferSerializer.deserialize(in);
+ return new ElementOrFieldAccessBound(column, keyOrIndex, operator, value);
+ }
+
+ @Override
+ public long serializedSize(ElementOrFieldAccessBound condition)
+ {
+ return nullableByteBufferSerializer.serializedSize(condition.keyOrIndex);
+ }
+ };
/**
* The collection element or UDT field type.
*/
@@ -234,16 +371,22 @@ private static final class ElementOrFieldAccessBound extends Bound
private final ByteBuffer keyOrIndex;
- private ElementOrFieldAccessBound(ColumnMetadata column,
- ByteBuffer keyOrIndex,
- Operator operator,
- ByteBuffer value)
+ public ElementOrFieldAccessBound(ColumnMetadata column,
+ ByteBuffer keyOrIndex,
+ Operator operator,
+ ByteBuffer value)
{
super(column, operator, value);
this.elementType = ((MultiElementType<?>) column.type).elementType(keyOrIndex);
this.keyOrIndex = keyOrIndex;
}
+ @Override
+ public BoundKind kind()
+ {
+ return BoundKind.ElementOrFieldAccess;
+ }
+
@Override
public boolean appliesTo(Row row)
{
@@ -260,17 +403,40 @@ private ColumnData columnData(Row row)
{
return row == null ? null : row.getColumnData(column);
}
+
+ @Override
+ public boolean equals(Object o)
+ {
+ if (this == o) return true;
+ if (o == null || getClass() != o.getClass()) return false;
+ ElementOrFieldAccessBound bound = (ElementOrFieldAccessBound) o;
+ return column.equals(bound.column) && operator == bound.operator && Objects.equals(value, bound.value) && Objects.equals(keyOrIndex, bound.keyOrIndex);
+ }
+
+ @Override
+ public int hashCode()
+ {
+ return Objects.hash(column, operator, value, keyOrIndex);
+ }
}
/**
* A condition on a multicell column.
*/
- private static final class MultiCellBound extends Bound
+ public static final class MultiCellBound extends Bound
{
+ private static final BoundSerializer<MultiCellBound> serializer = (in, column, operator, value) -> new MultiCellBound(column, operator, value);
+
public MultiCellBound(ColumnMetadata column, Operator operator, ByteBuffer value)
{
super(column, operator, value);
- assert column.type.isMultiCell();
+ assert column.type.isMultiCell() : String.format("Unexpected type: %s", column.type);
+ }
+
+ @Override
+ public BoundKind kind()
+ {
+ return BoundKind.MultiCell;
}
public boolean appliesTo(Row row)
@@ -278,6 +444,21 @@ public boolean appliesTo(Row row)
ComplexColumnData columnData = row == null ? null : row.getComplexColumnData(column);
return operator.isSatisfiedBy((MultiElementType<?>) column.type, columnData, value);
}
+
+ @Override
+ public boolean equals(Object o)
+ {
+ if (this == o) return true;
+ if (o == null || getClass() != o.getClass()) return false;
+ MultiCellBound bound = (MultiCellBound) o;
+ return column.equals(bound.column) && operator == bound.operator && Objects.equals(value, bound.value);
+ }
+
+ @Override
+ public int hashCode()
+ {
+ return Objects.hash(column, operator, value);
+ }
}
public static class Raw
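The Bound serializer above uses a kind-tagged layout: the shared fields (column, operator, value) are written by the base serializer, followed by an unsigned-vint discriminator from BoundKind, followed by a kind-specific payload delegated to that kind's BoundSerializer; deserialization reads the discriminator back and dispatches through BoundKind.valueOf. A self-contained sketch of the same pattern with plain java.io and illustrative types:

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

abstract class Shape
{
    abstract int kind(); // 0 = circle, 1 = square

    abstract void writeBody(DataOutput out) throws IOException;

    static void serialize(Shape s, DataOutput out) throws IOException
    {
        out.writeByte(s.kind()); // discriminator first, like BoundKind's vint
        s.writeBody(out);        // then the kind-specific payload
    }

    static Shape deserialize(DataInput in) throws IOException
    {
        switch (in.readByte())   // dispatch on the discriminator, like BoundKind.valueOf
        {
            case 0: return new Circle(in.readDouble());
            case 1: return new Square(in.readDouble());
            default: throw new IOException("Unknown kind");
        }
    }
}

final class Circle extends Shape
{
    final double radius;
    Circle(double radius) { this.radius = radius; }
    int kind() { return 0; }
    void writeBody(DataOutput out) throws IOException { out.writeDouble(radius); }
}

final class Square extends Shape
{
    final double side;
    Square(double side) { this.side = side; }
    int kind() { return 1; }
    void writeBody(DataOutput out) throws IOException { out.writeDouble(side); }
}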
diff --git a/src/java/org/apache/cassandra/cql3/restrictions/SimpleRestriction.java b/src/java/org/apache/cassandra/cql3/restrictions/SimpleRestriction.java
index 8592fbbb7b17..9a31c09ec114 100644
--- a/src/java/org/apache/cassandra/cql3/restrictions/SimpleRestriction.java
+++ b/src/java/org/apache/cassandra/cql3/restrictions/SimpleRestriction.java
@@ -33,6 +33,7 @@
import org.apache.cassandra.cql3.terms.Term;
import org.apache.cassandra.cql3.terms.Terms;
import org.apache.cassandra.db.filter.RowFilter;
+import org.apache.cassandra.db.marshal.ByteBufferAccessor;
import org.apache.cassandra.db.marshal.ListType;
import org.apache.cassandra.index.Index;
import org.apache.cassandra.index.IndexRegistry;
@@ -402,7 +403,7 @@ else if (isIN())
private static ByteBuffer multiInputOperatorValues(ColumnMetadata column, List<ByteBuffer> values)
{
- return ListType.getInstance(column.type, false).pack(values);
+ return ListType.getInstance(column.type, false).pack(values, ByteBufferAccessor.instance);
}
@Override
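This pack change recurs across the selector classes below: the concrete value representation becomes an explicit accessor argument rather than an implicit ByteBuffer assumption. A self-contained sketch of the general accessor pattern (illustrative interface only, not the Cassandra ValueAccessor API):

import java.nio.ByteBuffer;
import java.util.List;

// Abstracts over the concrete value representation V (ByteBuffer, byte[], ...).
interface Accessor<V>
{
    int size(V value);
    void write(V value, ByteBuffer into);
}

final class ByteBufferValues implements Accessor<ByteBuffer>
{
    static final ByteBufferValues instance = new ByteBufferValues();

    public int size(ByteBuffer value) { return value.remaining(); }
    public void write(ByteBuffer value, ByteBuffer into) { into.put(value.duplicate()); }
}

final class Packer
{
    // Length-prefixes and concatenates values, independent of the representation V.
    static <V> ByteBuffer pack(List<V> values, Accessor<V> accessor)
    {
        int total = 0;
        for (V v : values)
            total += 4 + accessor.size(v);
        ByteBuffer out = ByteBuffer.allocate(total);
        for (V v : values)
        {
            out.putInt(accessor.size(v)); // length prefix
            accessor.write(v, out);
        }
        out.flip();
        return out;
    }
}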
diff --git a/src/java/org/apache/cassandra/cql3/restrictions/StatementRestrictions.java b/src/java/org/apache/cassandra/cql3/restrictions/StatementRestrictions.java
index 71958bc61b1c..761bd3c3ead1 100644
--- a/src/java/org/apache/cassandra/cql3/restrictions/StatementRestrictions.java
+++ b/src/java/org/apache/cassandra/cql3/restrictions/StatementRestrictions.java
@@ -868,13 +868,23 @@ private void validateSecondaryIndexSelections()
*
* @return true if all the primary key columns are restricted by an equality relation.
*/
- public boolean hasAllPKColumnsRestrictedByEqualities()
+ public boolean hasAllPrimaryKeyColumnsRestrictedByEqualities()
+ {
+ return hasAllPartitionKeyColumnsRestrictedByEqualities()
+ && !hasUnrestrictedClusteringColumns()
+ && (clusteringColumnsRestrictions.hasOnlyEqualityRestrictions());
+ }
+
+ /**
+ * Checks that all the partition key columns are restricted by an equality relation ('=' or 'IN').
+ *
+ * @return true if all the partition key columns are restricted by an equality relation.
+ */
+ public boolean hasAllPartitionKeyColumnsRestrictedByEqualities()
{
return !isPartitionKeyRestrictionsOnToken()
- && !partitionKeyRestrictions.hasUnrestrictedPartitionKeyComponents()
- && (partitionKeyRestrictions.hasOnlyEqualityRestrictions())
- && !hasUnrestrictedClusteringColumns()
- && (clusteringColumnsRestrictions.hasOnlyEqualityRestrictions());
+ && !partitionKeyRestrictions.hasUnrestrictedPartitionKeyComponents()
+ && (partitionKeyRestrictions.hasOnlyEqualityRestrictions());
}
/**
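Splitting the primary-key check into a partition-key check plus clustering checks lets callers that only need a pinned partition (rather than a fully pinned row) reuse the weaker predicate. A sketch of the decomposition as pure predicates (the boolean fields stand in for the restriction queries used above; illustrative only):

final class EqualityChecks
{
    // Stand-ins for the restriction queries referenced above.
    boolean onToken, unrestrictedPartitionComponents, partitionOnlyEqualities;
    boolean unrestrictedClusterings, clusteringOnlyEqualities;

    boolean allPartitionKeyColumnsByEqualities()
    {
        return !onToken && !unrestrictedPartitionComponents && partitionOnlyEqualities;
    }

    // The original check is recovered by strengthening the partition-key check
    // with the clustering-column conditions.
    boolean allPrimaryKeyColumnsByEqualities()
    {
        return allPartitionKeyColumnsByEqualities()
               && !unrestrictedClusterings
               && clusteringOnlyEqualities;
    }
}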
diff --git a/src/java/org/apache/cassandra/cql3/selection/ColumnTimestamps.java b/src/java/org/apache/cassandra/cql3/selection/ColumnTimestamps.java
index b3f3fa4ef7de..713e85b77fb3 100644
--- a/src/java/org/apache/cassandra/cql3/selection/ColumnTimestamps.java
+++ b/src/java/org/apache/cassandra/cql3/selection/ColumnTimestamps.java
@@ -27,6 +27,7 @@
import com.google.common.collect.Range;
import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.db.marshal.ByteBufferAccessor;
import org.apache.cassandra.db.marshal.ListType;
import org.apache.cassandra.db.marshal.LongType;
import org.apache.cassandra.db.marshal.UserType;
@@ -384,7 +385,7 @@ public ByteBuffer toByteBuffer(ProtocolVersion protocolVersion)
List<ByteBuffer> buffers = new ArrayList<>(timestamps.size());
timestamps.forEach(timestamp -> buffers.add(type.toByteBuffer(timestamp)));
- return LONG_LIST_TYPE.pack(buffers);
+ return LONG_LIST_TYPE.pack(buffers, ByteBufferAccessor.instance);
}
@Override
diff --git a/src/java/org/apache/cassandra/cql3/selection/ListSelector.java b/src/java/org/apache/cassandra/cql3/selection/ListSelector.java
index 3494b4b831d2..44849805d5f2 100644
--- a/src/java/org/apache/cassandra/cql3/selection/ListSelector.java
+++ b/src/java/org/apache/cassandra/cql3/selection/ListSelector.java
@@ -29,6 +29,7 @@
import org.apache.cassandra.db.TypeSizes;
import org.apache.cassandra.db.filter.ColumnFilter.Builder;
import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.db.marshal.ByteBufferAccessor;
import org.apache.cassandra.db.marshal.ListType;
import org.apache.cassandra.io.util.DataInputPlus;
import org.apache.cassandra.io.util.DataOutputPlus;
@@ -101,7 +102,7 @@ public ByteBuffer getOutput(ProtocolVersion protocolVersion)
{
buffers.add(elements.get(i).getOutput(protocolVersion));
}
- return type.pack(buffers);
+ return type.pack(buffers, ByteBufferAccessor.instance);
}
public void reset()
diff --git a/src/java/org/apache/cassandra/cql3/selection/MapSelector.java b/src/java/org/apache/cassandra/cql3/selection/MapSelector.java
index 450b64a58b73..b0ccac4d93cd 100644
--- a/src/java/org/apache/cassandra/cql3/selection/MapSelector.java
+++ b/src/java/org/apache/cassandra/cql3/selection/MapSelector.java
@@ -34,6 +34,7 @@
import org.apache.cassandra.db.TypeSizes;
import org.apache.cassandra.db.filter.ColumnFilter.Builder;
import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.db.marshal.ByteBufferAccessor;
import org.apache.cassandra.db.marshal.MapType;
import org.apache.cassandra.io.util.DataInputPlus;
import org.apache.cassandra.io.util.DataOutputPlus;
@@ -217,7 +218,7 @@ public ByteBuffer getOutput(ProtocolVersion protocolVersion)
buffers.add(entry.getKey());
buffers.add(entry.getValue());
}
- return type.pack(buffers);
+ return type.pack(buffers, ByteBufferAccessor.instance);
}
public void reset()
diff --git a/src/java/org/apache/cassandra/cql3/selection/Selectable.java b/src/java/org/apache/cassandra/cql3/selection/Selectable.java
index 1fade9fa99de..606414eb3eaf 100644
--- a/src/java/org/apache/cassandra/cql3/selection/Selectable.java
+++ b/src/java/org/apache/cassandra/cql3/selection/Selectable.java
@@ -18,12 +18,27 @@
*/
package org.apache.cassandra.cql3.selection;
-import java.util.*;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
import java.util.function.Predicate;
import java.util.stream.Collectors;
-import org.apache.cassandra.cql3.*;
-import org.apache.cassandra.cql3.functions.*;
+import org.apache.cassandra.cql3.AssignmentTestable;
+import org.apache.cassandra.cql3.CQL3Type;
+import org.apache.cassandra.cql3.ColumnIdentifier;
+import org.apache.cassandra.cql3.ColumnSpecification;
+import org.apache.cassandra.cql3.FieldIdentifier;
+import org.apache.cassandra.cql3.VariableSpecifications;
+import org.apache.cassandra.cql3.functions.AggregateFcts;
+import org.apache.cassandra.cql3.functions.CastFcts;
+import org.apache.cassandra.cql3.functions.Function;
+import org.apache.cassandra.cql3.functions.FunctionName;
+import org.apache.cassandra.cql3.functions.FunctionResolver;
+import org.apache.cassandra.cql3.functions.OperationFcts;
import org.apache.cassandra.cql3.selection.Selector.Factory;
import org.apache.cassandra.cql3.terms.Constants;
import org.apache.cassandra.cql3.terms.Lists;
@@ -33,7 +48,18 @@
import org.apache.cassandra.cql3.terms.Tuples;
import org.apache.cassandra.cql3.terms.UserTypes;
import org.apache.cassandra.cql3.terms.Vectors;
-import org.apache.cassandra.db.marshal.*;
+import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.db.marshal.CollectionType;
+import org.apache.cassandra.db.marshal.DurationType;
+import org.apache.cassandra.db.marshal.Int32Type;
+import org.apache.cassandra.db.marshal.ListType;
+import org.apache.cassandra.db.marshal.LongType;
+import org.apache.cassandra.db.marshal.MapType;
+import org.apache.cassandra.db.marshal.ReversedType;
+import org.apache.cassandra.db.marshal.SetType;
+import org.apache.cassandra.db.marshal.TupleType;
+import org.apache.cassandra.db.marshal.UserType;
+import org.apache.cassandra.db.marshal.VectorType;
import org.apache.cassandra.exceptions.InvalidRequestException;
import org.apache.cassandra.schema.ColumnMetadata;
import org.apache.cassandra.schema.TableMetadata;
@@ -576,8 +602,8 @@ public boolean selectColumns(Predicate<ColumnMetadata> predicate)
public static class Raw implements Selectable.Raw
{
- private final Selectable.Raw selected;
- private final FieldIdentifier field;
+ public final Selectable.Raw selected;
+ public final FieldIdentifier field;
public Raw(Selectable.Raw selected, FieldIdentifier field)
{
@@ -1401,6 +1427,11 @@ public FieldIdentifier toFieldIdentifier()
: FieldIdentifier.forUnquoted(text);
}
+ public String getText()
+ {
+ return text;
+ }
+
@Override
public String toString()
{
@@ -1470,8 +1501,8 @@ public boolean selectColumns(Predicate<ColumnMetadata> predicate)
public static class Raw implements Selectable.Raw
{
- private final Selectable.Raw selected;
- private final Term.Raw element;
+ public final Selectable.Raw selected;
+ public final Term.Raw element;
public Raw(Selectable.Raw selected, Term.Raw element)
{
diff --git a/src/java/org/apache/cassandra/cql3/selection/Selection.java b/src/java/org/apache/cassandra/cql3/selection/Selection.java
index da87f2619a3c..b7e195dda71f 100644
--- a/src/java/org/apache/cassandra/cql3/selection/Selection.java
+++ b/src/java/org/apache/cassandra/cql3/selection/Selection.java
@@ -18,7 +18,13 @@
package org.apache.cassandra.cql3.selection;
import java.nio.ByteBuffer;
-import java.util.*;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.LinkedHashSet;
+import java.util.List;
+import java.util.Locale;
+import java.util.Set;
import com.google.common.base.MoreObjects;
import com.google.common.base.Predicate;
@@ -132,6 +138,11 @@ public ResultSet.ResultMetadata getResultMetadata()
return resultMetadata;
}
+ public static Selection.Selectors noopSelector()
+ {
+ return new SimpleSelectors();
+ }
+
public static Selection wildcard(TableMetadata table, boolean isJson, boolean returnStaticContentOnPartitionWithNoRows)
{
List<ColumnMetadata> all = new ArrayList<>(table.columns().size());
@@ -344,55 +355,72 @@ private static List<ByteBuffer> rowToJson(List<ByteBuffer> row,
return Arrays.asList(jsonRow);
}
- public static interface Selectors
+ public interface Selectors
{
/**
* Returns the {@code ColumnFilter} corresponding to those selectors
*
* @return the {@code ColumnFilter} corresponding to those selectors
*/
- public ColumnFilter getColumnFilter();
+ default ColumnFilter getColumnFilter() { return ColumnFilter.NONE; }
/**
* Checks if this Selectors perform some processing
* @return {@code true} if this Selectors perform some processing, {@code false} otherwise.
*/
- public boolean hasProcessing();
+ default boolean hasProcessing() { return false; }
/**
* Checks if one of the selectors perform some aggregations.
* @return {@code true} if one of the selectors perform some aggregations, {@code false} otherwise.
*/
- public boolean isAggregate();
-
- /**
- * Returns the number of fetched columns
- * @return the number of fetched columns
- */
- public int numberOfFetchedColumns();
+ default boolean isAggregate() { return false; }
/**
* Checks if one of the selectors collect TTLs.
* @return {@code true} if one of the selectors collect TTLs, {@code false} otherwise.
*/
- public boolean collectTTLs();
+ default boolean collectTTLs() { return false; }
/**
* Checks if one of the selectors collects write timestamps.
* @return {@code true} if one of the selectors collects write timestamps, {@code false} otherwise.
*/
- public boolean collectWritetimes();
+ default boolean collectWritetimes() { return false; }
/**
* Adds the current row of the specified ResultSetBuilder.
*
* @param input the input row
*/
- public void addInputRow(InputRow input);
+ void addInputRow(InputRow input);
- public List<ByteBuffer> getOutputRow();
+ List<ByteBuffer> getOutputRow();
- public void reset();
+ void reset();
+ }
+
+ public static class SimpleSelectors implements Selectors
+ {
+ protected List<ByteBuffer> current;
+
+ @Override
+ public void addInputRow(InputRow input)
+ {
+ current = input.getValues();
+ }
+
+ @Override
+ public List<ByteBuffer> getOutputRow()
+ {
+ return current;
+ }
+
+ @Override
+ public void reset()
+ {
+ current = null;
+ }
}
// Special cased selection for when only columns are selected.
@@ -464,15 +492,9 @@ public boolean isAggregate()
public Selectors newSelectors(QueryOptions options)
{
- return new Selectors()
+ return new SimpleSelectors()
{
- private List<ByteBuffer> current;
-
- public void reset()
- {
- current = null;
- }
-
+ @Override
public List<ByteBuffer> getOutputRow()
{
if (isJson)
@@ -480,39 +502,6 @@ public List<ByteBuffer> getOutputRow()
return current;
}
- public void addInputRow(InputRow input)
- {
- current = input.getValues();
- }
-
- public boolean isAggregate()
- {
- return false;
- }
-
- public boolean hasProcessing()
- {
- return false;
- }
-
- @Override
- public int numberOfFetchedColumns()
- {
- return getColumns().size();
- }
-
- @Override
- public boolean collectTTLs()
- {
- return false;
- }
-
- @Override
- public boolean collectWritetimes()
- {
- return false;
- }
-
@Override
public ColumnFilter getColumnFilter()
{
@@ -613,12 +602,6 @@ public void addInputRow(InputRow input)
selector.addInput(input);
}
- @Override
- public int numberOfFetchedColumns()
- {
- return getColumns().size();
- }
-
@Override
public boolean collectTTLs()
{
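The Selectors rework above moves the no-processing answers into interface defaults, so pass-through implementations such as SimpleSelectors override only the stateful methods. A compact, self-contained sketch of that shape (illustrative types):

import java.util.List;

interface RowSink
{
    // No-op defaults cover the common pass-through case.
    default boolean hasProcessing() { return false; }
    default boolean isAggregate()   { return false; }

    // Only the stateful core must be implemented.
    void addInputRow(List<String> input);
    List<String> getOutputRow();
    void reset();
}

// Pass-through implementation, analogous to SimpleSelectors.
class PassThroughSink implements RowSink
{
    private List<String> current;

    public void addInputRow(List<String> input) { current = input; }
    public List<String> getOutputRow()          { return current; }
    public void reset()                         { current = null; }
}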
diff --git a/src/java/org/apache/cassandra/cql3/selection/Selector.java b/src/java/org/apache/cassandra/cql3/selection/Selector.java
index fce2ef063407..fa22ea0bb5e2 100644
--- a/src/java/org/apache/cassandra/cql3/selection/Selector.java
+++ b/src/java/org/apache/cassandra/cql3/selection/Selector.java
@@ -414,7 +414,7 @@ private void add(ComplexColumnData ccd, long nowInSec)
UserType udt = (UserType) type;
int size = udt.size();
- values[index] = udt.serializeForNativeProtocol(ccd.iterator(), protocolVersion);
+ values[index] = udt.serializeForNativeProtocol(ccd.iterator());
short fieldPosition = 0;
for (Cell<?> cell : ccd)
diff --git a/src/java/org/apache/cassandra/cql3/selection/SetSelector.java b/src/java/org/apache/cassandra/cql3/selection/SetSelector.java
index a4cc5009af1a..03fd6ac71d19 100644
--- a/src/java/org/apache/cassandra/cql3/selection/SetSelector.java
+++ b/src/java/org/apache/cassandra/cql3/selection/SetSelector.java
@@ -31,6 +31,7 @@
import org.apache.cassandra.db.TypeSizes;
import org.apache.cassandra.db.filter.ColumnFilter.Builder;
import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.db.marshal.ByteBufferAccessor;
import org.apache.cassandra.db.marshal.SetType;
import org.apache.cassandra.io.util.DataInputPlus;
import org.apache.cassandra.io.util.DataOutputPlus;
@@ -103,7 +104,7 @@ public ByteBuffer getOutput(ProtocolVersion protocolVersion)
{
buffers.add(elements.get(i).getOutput(protocolVersion));
}
- return type.pack(new ArrayList<>(buffers));
+ return type.pack(new ArrayList<>(buffers), ByteBufferAccessor.instance);
}
public void reset()
diff --git a/src/java/org/apache/cassandra/cql3/selection/TupleSelector.java b/src/java/org/apache/cassandra/cql3/selection/TupleSelector.java
index fd3071b1cd2a..65326fd01eff 100644
--- a/src/java/org/apache/cassandra/cql3/selection/TupleSelector.java
+++ b/src/java/org/apache/cassandra/cql3/selection/TupleSelector.java
@@ -29,6 +29,7 @@
import org.apache.cassandra.db.TypeSizes;
import org.apache.cassandra.db.filter.ColumnFilter.Builder;
import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.db.marshal.ByteBufferAccessor;
import org.apache.cassandra.db.marshal.TupleType;
import org.apache.cassandra.exceptions.InvalidRequestException;
import org.apache.cassandra.io.util.DataInputPlus;
@@ -102,7 +103,7 @@ public ByteBuffer getOutput(ProtocolVersion protocolVersion) throws InvalidRequestException
{
buffers.add(elements.get(i).getOutput(protocolVersion));
}
- return type.pack(buffers);
+ return type.pack(buffers, ByteBufferAccessor.instance);
}
public void reset()
diff --git a/src/java/org/apache/cassandra/cql3/selection/UserTypeSelector.java b/src/java/org/apache/cassandra/cql3/selection/UserTypeSelector.java
index 6778cca964ce..af13ccbecde1 100644
--- a/src/java/org/apache/cassandra/cql3/selection/UserTypeSelector.java
+++ b/src/java/org/apache/cassandra/cql3/selection/UserTypeSelector.java
@@ -27,6 +27,7 @@
import com.google.common.base.Objects;
+import org.apache.cassandra.db.marshal.ByteBufferAccessor;
import org.apache.cassandra.schema.ColumnMetadata;
import org.apache.cassandra.schema.TableMetadata;
import org.apache.cassandra.cql3.ColumnSpecification;
@@ -197,7 +198,7 @@ public ByteBuffer getOutput(ProtocolVersion protocolVersion)
Selector selector = fields.get(userType.fieldName(i));
buffers.add(selector == null ? null : selector.getOutput(protocolVersion));
}
- return type.pack(buffers);
+ return type.pack(buffers, ByteBufferAccessor.instance);
}
public void reset()
diff --git a/src/java/org/apache/cassandra/cql3/selection/VectorSelector.java b/src/java/org/apache/cassandra/cql3/selection/VectorSelector.java
index f61d8d20d666..8dd66bb5d5d0 100644
--- a/src/java/org/apache/cassandra/cql3/selection/VectorSelector.java
+++ b/src/java/org/apache/cassandra/cql3/selection/VectorSelector.java
@@ -30,6 +30,7 @@
import org.apache.cassandra.cql3.QueryOptions;
import org.apache.cassandra.db.filter.ColumnFilter;
import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.db.marshal.ByteBufferAccessor;
import org.apache.cassandra.db.marshal.VectorType;
import org.apache.cassandra.exceptions.InvalidRequestException;
import org.apache.cassandra.io.util.DataInputPlus;
@@ -126,7 +127,7 @@ public ByteBuffer getOutput(ProtocolVersion protocolVersion) throws InvalidRequestException
for (int i = 0, m = elements.size(); i < m; i++)
buffers.add(elements.get(i).getOutput(protocolVersion));
- return type.pack(buffers);
+ return type.pack(buffers, ByteBufferAccessor.instance);
}
@Override
diff --git a/src/java/org/apache/cassandra/cql3/statements/BatchStatement.java b/src/java/org/apache/cassandra/cql3/statements/BatchStatement.java
index e5104376cef4..0ef6fb4cdf9e 100644
--- a/src/java/org/apache/cassandra/cql3/statements/BatchStatement.java
+++ b/src/java/org/apache/cassandra/cql3/statements/BatchStatement.java
@@ -18,7 +18,16 @@
package org.apache.cassandra.cql3.statements;
import java.nio.ByteBuffer;
-import java.util.*;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.EnumSet;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.LinkedHashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
import java.util.concurrent.TimeUnit;
import com.google.common.annotations.VisibleForTesting;
@@ -31,19 +40,38 @@
import org.apache.cassandra.audit.AuditLogContext;
import org.apache.cassandra.audit.AuditLogEntryType;
-import org.apache.cassandra.db.guardrails.Guardrails;
-import org.apache.cassandra.schema.TableId;
-import org.apache.cassandra.schema.TableMetadata;
-import org.apache.cassandra.schema.ColumnMetadata;
import org.apache.cassandra.config.DatabaseDescriptor;
-import org.apache.cassandra.cql3.*;
-import org.apache.cassandra.db.*;
+import org.apache.cassandra.cql3.Attributes;
+import org.apache.cassandra.cql3.BatchQueryOptions;
+import org.apache.cassandra.cql3.CQLStatement;
+import org.apache.cassandra.cql3.ColumnSpecification;
+import org.apache.cassandra.cql3.QueryOptions;
+import org.apache.cassandra.cql3.ResultSet;
+import org.apache.cassandra.cql3.VariableSpecifications;
+import org.apache.cassandra.db.Clustering;
+import org.apache.cassandra.db.ConsistencyLevel;
+import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.db.IMutation;
+import org.apache.cassandra.db.ReadCommand.PotentialTxnConflicts;
+import org.apache.cassandra.db.RegularAndStaticColumns;
+import org.apache.cassandra.db.Slice;
+import org.apache.cassandra.db.Slices;
+import org.apache.cassandra.db.guardrails.Guardrails;
import org.apache.cassandra.db.partitions.PartitionUpdate;
import org.apache.cassandra.db.rows.RowIterator;
-import org.apache.cassandra.exceptions.*;
+import org.apache.cassandra.exceptions.InvalidRequestException;
+import org.apache.cassandra.exceptions.RequestExecutionException;
+import org.apache.cassandra.exceptions.RequestValidationException;
+import org.apache.cassandra.exceptions.UnauthorizedException;
import org.apache.cassandra.metrics.BatchMetrics;
import org.apache.cassandra.metrics.ClientRequestSizeMetrics;
-import org.apache.cassandra.service.*;
+import org.apache.cassandra.schema.ColumnMetadata;
+import org.apache.cassandra.schema.TableId;
+import org.apache.cassandra.schema.TableMetadata;
+import org.apache.cassandra.service.ClientState;
+import org.apache.cassandra.service.ClientWarn;
+import org.apache.cassandra.service.QueryState;
+import org.apache.cassandra.service.StorageProxy;
import org.apache.cassandra.tracing.Tracing;
import org.apache.cassandra.transport.Dispatcher;
import org.apache.cassandra.transport.messages.ResultMessage;
@@ -52,13 +80,12 @@
import org.apache.cassandra.utils.Pair;
import static java.util.function.Predicate.isEqual;
-
import static org.apache.cassandra.cql3.statements.RequestValidations.checkFalse;
/**
* A BATCH statement parsed from a CQL query.
*/
-public class BatchStatement implements CQLStatement
+public class BatchStatement implements CQLStatement.CompositeCQLStatement
{
public enum Type
{
@@ -201,7 +228,7 @@ public void validate() throws InvalidRequestException
for (ModificationStatement statement : statements)
{
if (timestampSet && statement.isTimestampSet())
- throw new InvalidRequestException("Timestamp must be set either on BATCH or individual statements");
+ throw new InvalidRequestException("Timestamp must be set either on BATCH or individual statements: " + statement.source);
if (statement.isCounter())
hasCounters = true;
@@ -242,7 +269,7 @@ public void validate() throws InvalidRequestException
for (ModificationStatement stmt : statements)
{
if (ksName != null && (!stmt.keyspace().equals(ksName) || !stmt.table().equals(cfName)))
- throw new InvalidRequestException("Batch with conditions cannot span multiple tables");
+ throw new InvalidRequestException("Batch with conditions cannot span multiple tables: " + stmt.source);
ksName = stmt.keyspace();
cfName = stmt.table();
}
@@ -267,6 +294,7 @@ public void validate(ClientState state) throws InvalidRequestException
statement.validate(state);
}
+ @Override
public List<ModificationStatement> getStatements()
{
return statements;
@@ -315,7 +343,7 @@ public List<? extends IMutation> getMutations(ClientState state,
}
QueryOptions statementOptions = options.forStatement(i);
long timestamp = attrs.getTimestamp(batchTimestamp, statementOptions);
- statement.addUpdates(collector, partitionKeys.get(i), state, statementOptions, local, timestamp, nowInSeconds, requestTime);
+ statement.addUpdates(collector, partitionKeys.get(i), state, statementOptions, local, timestamp, nowInSeconds, requestTime, false);
}
if (tablesWithZeroGcGs != null)
@@ -326,7 +354,9 @@ public List<? extends IMutation> getMutations(ClientState state,
ClientWarn.instance.warn(MessageFormatter.arrayFormat(LOGGED_BATCH_LOW_GCGS_WARNING, new Object[] { suffix, tablesWithZeroGcGs })
.getMessage());
}
- return collector.toMutations(state);
+ // 'local' is true either when executeWithoutConditions modifies a virtual table (which doesn't
+ // support txns) or when executeLocal is invoked by tests and internal callers that deliberately
+ // bypass distributed-system modification checks
+ return collector.toMutations(state, local ? PotentialTxnConflicts.ALLOW : PotentialTxnConflicts.DISALLOW);
}
/**
@@ -614,7 +644,7 @@ public String toString()
return String.format("BatchStatement(type=%s, statements=%s)", type, statements);
}
- public static class Parsed extends QualifiedStatement
+ public static class Parsed extends QualifiedStatement.Composite
{
private final Type type;
private final Attributes.Raw attrs;
@@ -622,21 +652,15 @@ public static class Parsed extends QualifiedStatement
public Parsed(Type type, Attributes.Raw attrs, List<ModificationStatement.Parsed> parsedStatements)
{
- super(null);
this.type = type;
this.attrs = attrs;
this.parsedStatements = parsedStatements;
}
- // Not doing this in the constructor since we only need this for prepared statements
@Override
- public boolean isFullyQualified()
+ protected Iterable<? extends QualifiedStatement> getStatements()
{
- for (ModificationStatement.Parsed statement : parsedStatements)
- if (!statement.isFullyQualified())
- return false;
-
- return true;
+ return parsedStatements;
}
@Override
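getMutations now threads a PotentialTxnConflicts flag down through the update collector, and only local execution paths (virtual tables, tests, internal callers) pass ALLOW. A minimal sketch of what such a gate can look like (illustrative names and logic, not the real API):

enum PotentialTxnConflicts { ALLOW, DISALLOW }

final class TxnConflictGate
{
    // Rejects a write that could race transactional management unless explicitly allowed.
    static void check(PotentialTxnConflicts conflicts, boolean tableIsTransactional)
    {
        if (conflicts == PotentialTxnConflicts.DISALLOW && tableIsTransactional)
            throw new IllegalStateException("write may conflict with transactional management");
    }
}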
diff --git a/src/java/org/apache/cassandra/cql3/statements/BatchUpdatesCollector.java b/src/java/org/apache/cassandra/cql3/statements/BatchUpdatesCollector.java
index 521cd2afa6e2..4bc0d909d2d0 100644
--- a/src/java/org/apache/cassandra/cql3/statements/BatchUpdatesCollector.java
+++ b/src/java/org/apache/cassandra/cql3/statements/BatchUpdatesCollector.java
@@ -32,6 +32,7 @@
import org.apache.cassandra.db.DecoratedKey;
import org.apache.cassandra.db.IMutation;
import org.apache.cassandra.db.Mutation;
+import org.apache.cassandra.db.ReadCommand.PotentialTxnConflicts;
import org.apache.cassandra.db.RegularAndStaticColumns;
import org.apache.cassandra.db.commitlog.CommitLogSegment;
import org.apache.cassandra.db.partitions.PartitionUpdate;
@@ -137,14 +138,14 @@ private IMutationBuilder makeMutationBuilder(TableMetadata metadata, DecoratedKe
* @return a collection containing all the mutations.
*/
@Override
- public List<IMutation> toMutations(ClientState state)
+ public List<IMutation> toMutations(ClientState state, PotentialTxnConflicts potentialTxnConflicts)
{
List<IMutation> ms = new ArrayList<>();
for (Map<ByteBuffer, IMutationBuilder> ksMap : mutationBuilders.values())
{
for (IMutationBuilder builder : ksMap.values())
{
- IMutation mutation = builder.build();
+ IMutation mutation = builder.build(potentialTxnConflicts);
mutation.validateIndexedColumns(state);
mutation.validateSize(MessagingService.current_version, CommitLogSegment.ENTRY_OVERHEAD_SIZE);
ms.add(mutation);
@@ -182,7 +183,7 @@ private interface IMutationBuilder
/**
* Build the immutable mutation
*/
- IMutation build();
+ IMutation build(PotentialTxnConflicts potentialTxnConflicts);
/**
* Get the builder for the given tableId
@@ -215,7 +216,7 @@ public MutationBuilder add(PartitionUpdate.Builder updateBuilder)
return this;
}
- public Mutation build()
+ public Mutation build(PotentialTxnConflicts potentialTxnConflicts)
{
ImmutableMap.Builder<TableId, PartitionUpdate> updates = new ImmutableMap.Builder<>();
for (Map.Entry<TableId, PartitionUpdate.Builder> updateEntry : modifications.entrySet())
@@ -223,7 +224,7 @@ public Mutation build()
PartitionUpdate update = updateEntry.getValue().build();
updates.put(updateEntry.getKey(), update);
}
- return new Mutation(keyspaceName, key, updates.build(), createdAt);
+ return new Mutation(keyspaceName, key, updates.build(), createdAt, potentialTxnConflicts);
}
public PartitionUpdate.Builder get(TableId tableId)
@@ -263,9 +264,9 @@ public IMutationBuilder add(PartitionUpdate.Builder builder)
return mutationBuilder.add(builder);
}
- public IMutation build()
+ public IMutation build(PotentialTxnConflicts potentialTxnConflicts)
{
- return new CounterMutation(mutationBuilder.build(), cl);
+ return new CounterMutation(mutationBuilder.build(potentialTxnConflicts), cl);
}
public PartitionUpdate.Builder get(TableId id)
@@ -297,7 +298,7 @@ public VirtualMutationBuilder add(PartitionUpdate.Builder builder)
}
@Override
- public VirtualMutation build()
+ public VirtualMutation build(PotentialTxnConflicts potentialTxnConflicts)
{
ImmutableMap.Builder<TableId, PartitionUpdate> updates = new ImmutableMap.Builder<>();
modifications.forEach((tableId, updateBuilder) -> updates.put(tableId, updateBuilder.build()));
diff --git a/src/java/org/apache/cassandra/cql3/statements/CQL3CasRequest.java b/src/java/org/apache/cassandra/cql3/statements/CQL3CasRequest.java
index 0d322691c6e6..abc07ebc82c5 100644
--- a/src/java/org/apache/cassandra/cql3/statements/CQL3CasRequest.java
+++ b/src/java/org/apache/cassandra/cql3/statements/CQL3CasRequest.java
@@ -17,33 +17,79 @@
*/
package org.apache.cassandra.cql3.statements;
-import java.util.*;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.TreeMap;
-import org.apache.cassandra.db.marshal.TimeUUIDType;
-import org.apache.cassandra.index.IndexRegistry;
-import org.apache.cassandra.schema.TableMetadata;
-import org.apache.cassandra.cql3.*;
+import org.apache.commons.lang3.builder.ToStringBuilder;
+import org.apache.commons.lang3.builder.ToStringStyle;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import accord.api.Update;
+import accord.primitives.Txn;
+import org.apache.cassandra.cql3.QueryOptions;
+import org.apache.cassandra.cql3.UpdateParameters;
import org.apache.cassandra.cql3.conditions.ColumnCondition;
-import org.apache.cassandra.db.*;
-import org.apache.cassandra.db.filter.*;
-import org.apache.cassandra.db.rows.Row;
+import org.apache.cassandra.db.Clustering;
+import org.apache.cassandra.db.Columns;
+import org.apache.cassandra.db.ConsistencyLevel;
+import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.db.RegularAndStaticColumns;
+import org.apache.cassandra.db.SinglePartitionReadCommand;
+import org.apache.cassandra.db.Slice;
+import org.apache.cassandra.db.Slices;
+import org.apache.cassandra.db.filter.ClusteringIndexNamesFilter;
+import org.apache.cassandra.db.filter.ClusteringIndexSliceFilter;
+import org.apache.cassandra.db.filter.ColumnFilter;
+import org.apache.cassandra.db.filter.DataLimits;
+import org.apache.cassandra.db.filter.RowFilter;
+import org.apache.cassandra.db.marshal.TimeUUIDType;
import org.apache.cassandra.db.partitions.FilteredPartition;
import org.apache.cassandra.db.partitions.Partition;
import org.apache.cassandra.db.partitions.PartitionUpdate;
+import org.apache.cassandra.db.rows.Row;
import org.apache.cassandra.exceptions.InvalidRequestException;
+import org.apache.cassandra.index.IndexRegistry;
+import org.apache.cassandra.schema.TableMetadata;
+import org.apache.cassandra.schema.TableParams;
import org.apache.cassandra.service.CASRequest;
import org.apache.cassandra.service.ClientState;
+import org.apache.cassandra.service.accord.txn.TxnCondition;
+import org.apache.cassandra.service.accord.txn.TxnData;
+import org.apache.cassandra.service.accord.txn.TxnDataKeyValue;
+import org.apache.cassandra.service.accord.txn.TxnQuery;
+import org.apache.cassandra.service.accord.txn.TxnRead;
+import org.apache.cassandra.service.accord.txn.TxnReference;
+import org.apache.cassandra.service.accord.txn.TxnResult;
+import org.apache.cassandra.service.accord.txn.TxnUpdate;
+import org.apache.cassandra.service.accord.txn.TxnWrite;
import org.apache.cassandra.service.paxos.Ballot;
+import org.apache.cassandra.tcm.ClusterMetadata;
import org.apache.cassandra.utils.TimeUUID;
-import org.apache.commons.lang3.builder.ToStringBuilder;
-import org.apache.commons.lang3.builder.ToStringStyle;
+import static com.google.common.base.Preconditions.checkState;
+import static org.apache.cassandra.service.StorageProxy.ConsensusAttemptResult;
+import static org.apache.cassandra.service.StorageProxy.ConsensusAttemptResult.RETRY_NEW_PROTOCOL;
+import static org.apache.cassandra.service.StorageProxy.ConsensusAttemptResult.casResult;
+import static org.apache.cassandra.service.accord.txn.TxnData.TxnDataNameKind.CAS_READ;
+import static org.apache.cassandra.service.accord.txn.TxnData.txnDataName;
+import static org.apache.cassandra.service.accord.txn.TxnResult.Kind.retry_new_protocol;
+import static org.apache.cassandra.service.consensus.migration.ConsensusRequestRouter.getTableMetadata;
/**
* Processed CAS conditions and update on potentially multiple rows of the same partition.
*/
public class CQL3CasRequest implements CASRequest
{
+ @SuppressWarnings("unused")
+ private static final Logger logger = LoggerFactory.getLogger(CQL3CasRequest.class);
+
public final TableMetadata metadata;
public final DecoratedKey key;
private final RegularAndStaticColumns conditionColumns;
@@ -249,9 +295,9 @@ private static class CASUpdateParameters extends UpdateParameters
final long timeUuidMsb;
long timeUuidNanos;
- public CASUpdateParameters(TableMetadata metadata, RegularAndStaticColumns updatedColumns, ClientState state, QueryOptions options, long timestamp, long nowInSec, int ttl, Map<DecoratedKey, Partition> prefetchedRows, long timeUuidMsb, long timeUuidNanos) throws InvalidRequestException
+ public CASUpdateParameters(TableMetadata metadata, ClientState state, QueryOptions options, long timestamp, long nowInSec, int ttl, Map<DecoratedKey, Partition> prefetchedRows, long timeUuidMsb, long timeUuidNanos) throws InvalidRequestException
{
- super(metadata, updatedColumns, state, options, timestamp, nowInSec, ttl, prefetchedRows);
+ super(metadata, state, options, timestamp, nowInSec, ttl, prefetchedRows);
this.timeUuidMsb = timeUuidMsb;
this.timeUuidNanos = timeUuidNanos;
}
@@ -289,7 +335,7 @@ long applyUpdates(FilteredPartition current, PartitionUpdate.Builder updateBuild
{
Map<DecoratedKey, Partition> map = stmt.requiresRead() ? Collections.singletonMap(key, current) : null;
CASUpdateParameters params =
- new CASUpdateParameters(metadata, updateBuilder.columns(), state, options, timestamp, nowInSeconds,
+ new CASUpdateParameters(metadata, state, options, timestamp, nowInSeconds,
stmt.getTimeToLive(options), map, timeUuidMsb, timeUuidNanos);
stmt.addUpdateForKey(updateBuilder, clustering, params);
return params.timeUuidNanos;
@@ -319,7 +365,6 @@ void applyUpdates(FilteredPartition current, PartitionUpdate.Builder updateBuild
Map<DecoratedKey, Partition> map = stmt.requiresRead() ? Collections.singletonMap(key, current) : null;
UpdateParameters params =
new UpdateParameters(metadata,
- updateBuilder.columns(),
state,
options,
timestamp,
@@ -340,6 +385,8 @@ protected RowCondition(Clustering<?> clustering)
}
public abstract boolean appliesTo(FilteredPartition current) throws InvalidRequestException;
+
+ public abstract TxnCondition asTxnCondition();
}
private static class NotExistCondition extends RowCondition
@@ -353,6 +400,13 @@ public boolean appliesTo(FilteredPartition current)
{
return current.getRow(clustering) == null;
}
+
+ @Override
+ public TxnCondition asTxnCondition()
+ {
+ TxnReference txnReference = new TxnReference(txnDataName(CAS_READ), null);
+ return new TxnCondition.Exists(txnReference, TxnCondition.Kind.IS_NULL);
+ }
}
private static class ExistCondition extends RowCondition
@@ -366,6 +420,13 @@ public boolean appliesTo(FilteredPartition current)
{
return current.getRow(clustering) != null;
}
+
+ @Override
+ public TxnCondition asTxnCondition()
+ {
+ TxnReference txnReference = new TxnReference(txnDataName(CAS_READ), null);
+ return new TxnCondition.Exists(txnReference, TxnCondition.Kind.IS_NOT_NULL);
+ }
}
private static class ColumnsConditions extends RowCondition
@@ -395,6 +456,12 @@ public boolean appliesTo(FilteredPartition current) throws InvalidRequestException
}
return true;
}
+
+ @Override
+ public TxnCondition asTxnCondition()
+ {
+ return new TxnCondition.ColumnConditionsAdapter(clustering, conditions);
+ }
}
@Override
@@ -402,4 +469,79 @@ public String toString()
{
return ToStringBuilder.reflectionToString(this, ToStringStyle.SHORT_PREFIX_STYLE);
}
+
+ @Override
+ public Txn toAccordTxn(ClusterMetadata cm, ConsistencyLevel consistencyLevel, ConsistencyLevel commitConsistencyLevel, ClientState clientState, long nowInSecs)
+ {
+ SinglePartitionReadCommand readCommand = readCommand(nowInSecs);
+ Update update = createUpdate(cm, clientState, commitConsistencyLevel);
+ // If the write strategy sends all writes through Accord, there is no need to honor the
+ // supplied consistency level, since Accord manages reads safely
+ TableParams tableParams = getTableMetadata(cm, metadata.id).params;
+ consistencyLevel = tableParams.transactionalMode.readCLForMode(tableParams.transactionalMigrationFrom, consistencyLevel, cm, metadata.id, readCommand.partitionKey().getToken());
+ TxnRead read = TxnRead.createCasRead(readCommand, consistencyLevel);
+ // A CAS may request only a single key, and its writes cannot depend on any data that
+ // is read (only on the conditions), so the read key is the only relevant key
+ return new Txn.InMemory(read.keys(), read, TxnQuery.CONDITION, update);
+ }
+
+ private Update createUpdate(ClusterMetadata cm, ClientState clientState, ConsistencyLevel commitConsistencyLevel)
+ {
+ // The commit consistency level may be ignored when TransactionalMode is full,
+ // since it is safe to match what non-SERIAL writes do
+ TableMetadata tableMetadata = getTableMetadata(cm, metadata.id);
+ TableParams tableParams = tableMetadata.params;
+ commitConsistencyLevel = tableParams.transactionalMode.commitCLForMode(tableParams.transactionalMigrationFrom, commitConsistencyLevel, cm, metadata.id, key.getToken());
+ // CAS requires using the new txn timestamp to correctly linearize some kinds of updates
+ return new TxnUpdate(createWriteFragments(clientState), createCondition(), commitConsistencyLevel, false);
+ }
+
+ private TxnCondition createCondition()
+ {
+ List<TxnCondition> txnConditions = new ArrayList<>(conditions.size() + (staticConditions == null ? 0 : 1));
+ if (staticConditions != null)
+ {
+ txnConditions.add(staticConditions.asTxnCondition());
+ }
+ for (RowCondition condition : conditions.values())
+ txnConditions.add(condition.asTxnCondition());
+ // CAS forbids empty conditions
+ checkState(!txnConditions.isEmpty());
+ return txnConditions.size() == 1 ? txnConditions.get(0) : new TxnCondition.BooleanGroup(TxnCondition.Kind.AND, txnConditions);
+ }
+
+ private List<TxnWrite.Fragment> createWriteFragments(ClientState state)
+ {
+ List<TxnWrite.Fragment> fragments = new ArrayList<>();
+ int idx = 0;
+ for (RowUpdate update : updates)
+ {
+ // Some operations may need to be migrated to run inside the transaction, so call forTxn
+ // to make sure that happens; see CASSANDRA-18337
+ ModificationStatement modification = update.stmt.forTxn();
+ QueryOptions options = update.options;
+ TxnWrite.Fragment fragment = modification.getTxnWriteFragment(idx++, state, options);
+ fragments.add(fragment);
+ }
+ for (RangeDeletion rangeDeletion : rangeDeletions)
+ {
+ ModificationStatement modification = rangeDeletion.stmt;
+ QueryOptions options = rangeDeletion.options;
+ TxnWrite.Fragment fragment = modification.getTxnWriteFragment(idx++, state, options);
+ fragments.add(fragment);
+ }
+ return fragments;
+ }
+
+ @Override
+ public ConsensusAttemptResult toCasResult(TxnResult txnResult)
+ {
+ if (txnResult.kind() == retry_new_protocol)
+ return RETRY_NEW_PROTOCOL;
+ TxnData txnData = (TxnData)txnResult;
+ TxnDataKeyValue partition = (TxnDataKeyValue)txnData.get(txnDataName(CAS_READ));
+ return casResult(partition != null ? partition.rowIterator(false) : null);
+ }
}
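createCondition above reduces the per-row and static conditions to a single tree: a lone condition is used as-is, several are wrapped in an AND group, and an empty set is rejected. A self-contained sketch of that composition rule (illustrative types):

import java.util.List;

interface Condition
{
    boolean applies();
}

final class AndGroup implements Condition
{
    private final List<Condition> children;

    AndGroup(List<Condition> children) { this.children = children; }

    public boolean applies()
    {
        for (Condition c : children)
            if (!c.applies())
                return false;
        return true;
    }

    // Mirrors createCondition: CAS forbids empty conditions, a single condition
    // is used directly, and multiple conditions are AND-grouped.
    static Condition combine(List<Condition> conditions)
    {
        if (conditions.isEmpty())
            throw new IllegalStateException("CAS forbids empty conditions");
        return conditions.size() == 1 ? conditions.get(0) : new AndGroup(conditions);
    }
}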
diff --git a/src/java/org/apache/cassandra/cql3/statements/DeleteStatement.java b/src/java/org/apache/cassandra/cql3/statements/DeleteStatement.java
index 0bc22842556d..f2bbec1458d6 100644
--- a/src/java/org/apache/cassandra/cql3/statements/DeleteStatement.java
+++ b/src/java/org/apache/cassandra/cql3/statements/DeleteStatement.java
@@ -20,9 +20,19 @@
import java.util.Collections;
import java.util.List;
+import org.apache.commons.lang3.builder.ToStringBuilder;
+import org.apache.commons.lang3.builder.ToStringStyle;
+
import org.apache.cassandra.audit.AuditLogContext;
import org.apache.cassandra.audit.AuditLogEntryType;
-import org.apache.cassandra.cql3.*;
+import org.apache.cassandra.cql3.Attributes;
+import org.apache.cassandra.cql3.Operation;
+import org.apache.cassandra.cql3.Operations;
+import org.apache.cassandra.cql3.QualifiedName;
+import org.apache.cassandra.cql3.StatementSource;
+import org.apache.cassandra.cql3.UpdateParameters;
+import org.apache.cassandra.cql3.VariableSpecifications;
+import org.apache.cassandra.cql3.WhereClause;
import org.apache.cassandra.cql3.conditions.ColumnCondition;
import org.apache.cassandra.cql3.conditions.Conditions;
import org.apache.cassandra.cql3.restrictions.StatementRestrictions;
@@ -33,8 +43,6 @@
import org.apache.cassandra.schema.ColumnMetadata;
import org.apache.cassandra.schema.TableMetadata;
import org.apache.cassandra.service.ClientState;
-import org.apache.commons.lang3.builder.ToStringBuilder;
-import org.apache.commons.lang3.builder.ToStringStyle;
import static org.apache.cassandra.cql3.statements.RequestValidations.checkFalse;
import static org.apache.cassandra.cql3.statements.RequestValidations.checkTrue;
@@ -49,9 +57,16 @@ private DeleteStatement(VariableSpecifications bindVariables,
Operations operations,
StatementRestrictions restrictions,
Conditions conditions,
- Attributes attrs)
+ Attributes attrs,
+ StatementSource source)
+ {
+ super(StatementType.DELETE, bindVariables, cfm, operations, restrictions, conditions, attrs, source);
+ }
+
+ @Override
+ protected ModificationStatement withOperations(Operations operations)
{
- super(StatementType.DELETE, bindVariables, cfm, operations, restrictions, conditions, attrs);
+ return new DeleteStatement(bindVariables, metadata, operations, restrictions, conditions, attrs, source);
}
@Override
@@ -126,17 +141,21 @@ public static class Parsed extends ModificationStatement.Parsed
{
private final List<Operation.RawDeletion> deletions;
private final WhereClause whereClause;
+ private final boolean isForTxn;
public Parsed(QualifiedName name,
Attributes.Raw attrs,
List<Operation.RawDeletion> deletions,
WhereClause whereClause,
List<Pair<ColumnIdentifier, ColumnCondition.Raw>> conditions,
- boolean ifExists)
+ boolean ifExists,
+ StatementSource source,
+ boolean isForTxn)
{
- super(name, StatementType.DELETE, attrs, conditions, false, ifExists);
+ super(name, StatementType.DELETE, attrs, conditions, false, ifExists, source);
this.deletions = deletions;
this.whereClause = whereClause;
+ this.isForTxn = isForTxn;
}
@@ -147,7 +166,7 @@ protected ModificationStatement prepareInternal(ClientState state,
Conditions conditions,
Attributes attrs)
{
- Operations operations = new Operations(type);
+ Operations operations = new Operations(type, isForTxn);
for (Operation.RawDeletion deletion : deletions)
{
@@ -175,9 +194,10 @@ protected ModificationStatement prepareInternal(ClientState state,
operations,
restrictions,
conditions,
- attrs);
+ attrs,
+ source);
- if (stmt.hasConditions() && !restrictions.hasAllPKColumnsRestrictedByEqualities())
+ if (stmt.hasConditions() && !restrictions.hasAllPrimaryKeyColumnsRestrictedByEqualities())
{
checkFalse(stmt.isVirtual(), "DELETE statements must restrict all PRIMARY KEY columns with equality relations");
diff --git a/src/java/org/apache/cassandra/cql3/statements/ModificationStatement.java b/src/java/org/apache/cassandra/cql3/statements/ModificationStatement.java
index e3662a609571..bd517cc4cf94 100644
--- a/src/java/org/apache/cassandra/cql3/statements/ModificationStatement.java
+++ b/src/java/org/apache/cassandra/cql3/statements/ModificationStatement.java
@@ -18,24 +18,43 @@
package org.apache.cassandra.cql3.statements;
import java.nio.ByteBuffer;
-import java.util.*;
-
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.EnumSet;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.LinkedHashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.NavigableSet;
+import java.util.Set;
+
+import com.google.common.annotations.VisibleForTesting;
+import com.google.common.base.Preconditions;
import com.google.common.collect.HashMultiset;
import com.google.common.collect.Iterables;
+import com.google.common.collect.Lists;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.cassandra.auth.Permission;
-import org.apache.cassandra.db.guardrails.Guardrails;
-import org.apache.cassandra.dht.Token;
-import org.apache.cassandra.locator.Replica;
-import org.apache.cassandra.locator.ReplicaLayout;
-import org.apache.cassandra.schema.ColumnMetadata;
-import org.apache.cassandra.schema.Schema;
-import org.apache.cassandra.schema.SchemaConstants;
-import org.apache.cassandra.schema.TableMetadata;
-import org.apache.cassandra.schema.ViewMetadata;
-import org.apache.cassandra.cql3.*;
+import org.apache.cassandra.cql3.Attributes;
+import org.apache.cassandra.cql3.CQLStatement;
+import org.apache.cassandra.cql3.ColumnIdentifier;
+import org.apache.cassandra.cql3.ColumnSpecification;
+import org.apache.cassandra.cql3.Operation;
+import org.apache.cassandra.cql3.Operations;
+import org.apache.cassandra.cql3.Ordering;
+import org.apache.cassandra.cql3.QualifiedName;
+import org.apache.cassandra.cql3.QueryOptions;
+import org.apache.cassandra.cql3.QueryProcessor;
+import org.apache.cassandra.cql3.ResultSet;
+import org.apache.cassandra.cql3.StatementSource;
+import org.apache.cassandra.cql3.UpdateParameters;
+import org.apache.cassandra.cql3.Validation;
+import org.apache.cassandra.cql3.VariableSpecifications;
+import org.apache.cassandra.cql3.WhereClause;
import org.apache.cassandra.cql3.conditions.ColumnCondition;
import org.apache.cassandra.cql3.conditions.ColumnConditions;
import org.apache.cassandra.cql3.conditions.Conditions;
@@ -44,17 +63,55 @@
import org.apache.cassandra.cql3.selection.ResultSetBuilder;
import org.apache.cassandra.cql3.selection.Selection;
import org.apache.cassandra.cql3.selection.Selection.Selectors;
-import org.apache.cassandra.db.*;
-import org.apache.cassandra.db.filter.*;
+import org.apache.cassandra.cql3.terms.Constants;
+import org.apache.cassandra.cql3.transactions.ReferenceOperation;
+import org.apache.cassandra.db.CBuilder;
+import org.apache.cassandra.db.Clustering;
+import org.apache.cassandra.db.ConsistencyLevel;
+import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.db.IMutation;
+import org.apache.cassandra.db.Keyspace;
+import org.apache.cassandra.db.ReadCommand.PotentialTxnConflicts;
+import org.apache.cassandra.db.ReadExecutionController;
+import org.apache.cassandra.db.RegularAndStaticColumns;
+import org.apache.cassandra.db.SinglePartitionReadCommand;
+import org.apache.cassandra.db.SinglePartitionReadQuery;
+import org.apache.cassandra.db.Slice;
+import org.apache.cassandra.db.Slices;
+import org.apache.cassandra.db.filter.ClusteringIndexFilter;
+import org.apache.cassandra.db.filter.ClusteringIndexNamesFilter;
+import org.apache.cassandra.db.filter.ClusteringIndexSliceFilter;
+import org.apache.cassandra.db.filter.ColumnFilter;
+import org.apache.cassandra.db.filter.DataLimits;
+import org.apache.cassandra.db.filter.RowFilter;
+import org.apache.cassandra.db.guardrails.Guardrails;
import org.apache.cassandra.db.marshal.BooleanType;
-import org.apache.cassandra.db.partitions.*;
+import org.apache.cassandra.db.partitions.FilteredPartition;
+import org.apache.cassandra.db.partitions.Partition;
+import org.apache.cassandra.db.partitions.PartitionIterator;
+import org.apache.cassandra.db.partitions.PartitionIterators;
+import org.apache.cassandra.db.partitions.PartitionUpdate;
import org.apache.cassandra.db.rows.RowIterator;
import org.apache.cassandra.db.view.View;
-import org.apache.cassandra.exceptions.*;
+import org.apache.cassandra.dht.Token;
+import org.apache.cassandra.exceptions.InvalidRequestException;
+import org.apache.cassandra.exceptions.RequestExecutionException;
+import org.apache.cassandra.exceptions.RequestValidationException;
+import org.apache.cassandra.exceptions.UnauthorizedException;
+import org.apache.cassandra.locator.Replica;
+import org.apache.cassandra.locator.ReplicaLayout;
import org.apache.cassandra.metrics.ClientRequestSizeMetrics;
+import org.apache.cassandra.schema.ColumnMetadata;
+import org.apache.cassandra.schema.Schema;
+import org.apache.cassandra.schema.SchemaConstants;
+import org.apache.cassandra.schema.TableMetadata;
+import org.apache.cassandra.schema.ViewMetadata;
import org.apache.cassandra.service.ClientState;
import org.apache.cassandra.service.QueryState;
import org.apache.cassandra.service.StorageProxy;
+import org.apache.cassandra.service.accord.txn.TxnReferenceOperation;
+import org.apache.cassandra.service.accord.txn.TxnReferenceOperations;
+import org.apache.cassandra.service.accord.txn.TxnWrite;
import org.apache.cassandra.service.disk.usage.DiskUsageBroadcaster;
import org.apache.cassandra.service.paxos.Ballot;
import org.apache.cassandra.service.paxos.BallotGenerator;
@@ -62,6 +119,7 @@
import org.apache.cassandra.transport.Dispatcher;
import org.apache.cassandra.transport.messages.ResultMessage;
import org.apache.cassandra.triggers.TriggerExecutor;
+import org.apache.cassandra.utils.ByteBufferUtil;
import org.apache.cassandra.utils.FBUtilities;
import org.apache.cassandra.utils.MD5Digest;
@@ -88,19 +146,25 @@ public abstract class ModificationStatement implements CQLStatement.SingleKeyspa
protected final VariableSpecifications bindVariables;
public final TableMetadata metadata;
- private final Attributes attrs;
+ protected final Attributes attrs;
- private final StatementRestrictions restrictions;
+ protected final StatementRestrictions restrictions;
private final Operations operations;
private final RegularAndStaticColumns updatedColumns;
- private final Conditions conditions;
+ protected final Conditions conditions;
private final RegularAndStaticColumns conditionColumns;
private final RegularAndStaticColumns requiresRead;
+ /**
+     * Used by {@link #forTxn()} to compute the transaction-migrated copy of this statement only once
+ */
+    private volatile ModificationStatement txnStmt;
+
+ public final StatementSource source;
public ModificationStatement(StatementType type,
VariableSpecifications bindVariables,
@@ -108,7 +172,8 @@ public ModificationStatement(StatementType type,
Operations operations,
StatementRestrictions restrictions,
Conditions conditions,
- Attributes attrs)
+ Attributes attrs,
+ StatementSource source)
{
this.type = type;
this.bindVariables = bindVariables;
@@ -117,6 +182,7 @@ public ModificationStatement(StatementType type,
this.operations = operations;
this.conditions = conditions;
this.attrs = attrs;
+ this.source = source;
if (!conditions.isEmpty())
{
@@ -142,6 +208,15 @@ public ModificationStatement(StatementType type,
requiresReadBuilder.add(operation.column);
}
}
+ for (ReferenceOperation operation : operations.allSubstitutions())
+ {
+ ColumnMetadata receiver = operation.getReceiver();
+ updatedColumnsBuilder.add(receiver);
+ // If the operation requires a read-before-write, make sure its receiver is selected by the auto-read the
+ // transaction creates during update creation. (see createSelectForTxn())
+ if (operation.requiresRead())
+ requiresReadBuilder.add(receiver);
+ }
RegularAndStaticColumns modifiedColumns = updatedColumnsBuilder.build();
@@ -356,6 +431,11 @@ public Iterable allOperations()
return operations;
}
+    public Collection<ReferenceOperation> allReferenceOperations()
+ {
+ return operations.allSubstitutions();
+ }
+
    public Iterable<ColumnMetadata> getColumnsWithConditions()
{
return conditions.getColumns();
@@ -418,7 +498,7 @@ public boolean requiresRead()
// * Deleting list element by value
// * Performing addition on a StringType (i.e. concatenation, only supported for CAS operations)
// * Performing addition on a NumberType, again only supported for CAS operations.
- return !requiresRead.isEmpty();
+ return operations.requiresRead();
}
    private Map<DecoratedKey, Partition> readRequiredLists(Collection<ByteBuffer> partitionKeys,
@@ -528,7 +608,8 @@ private ResultMessage executeWithoutCondition(QueryState queryState, QueryOption
false,
options.getTimestamp(queryState),
options.getNowInSeconds(queryState),
- requestTime);
+ requestTime,
+ false);
if (!mutations.isEmpty())
{
StorageProxy.mutateWithTriggers(mutations, cl, false, requestTime);
@@ -693,7 +774,7 @@ public ResultMessage executeInternalWithoutCondition(QueryState queryState, Quer
{
long timestamp = options.getTimestamp(queryState);
long nowInSeconds = options.getNowInSeconds(queryState);
- for (IMutation mutation : getMutations(queryState.getClientState(), options, true, timestamp, nowInSeconds, requestTime))
+ for (IMutation mutation : getMutations(queryState.getClientState(), options, true, timestamp, nowInSeconds, requestTime, false))
mutation.apply();
return null;
}
@@ -721,7 +802,7 @@ static RowIterator casInternal(ClientState state, CQL3CasRequest request, long t
}
if (!request.appliesTo(current))
- return current.rowIterator();
+ return current.rowIterator(false);
PartitionUpdate updates = request.makeUpdates(current, state, ballot);
updates = TriggerExecutor.instance.execute(updates);
@@ -741,18 +822,81 @@ static RowIterator casInternal(ClientState state, CQL3CasRequest request, long t
*
* @return list of the mutations
*/
-    private List<? extends IMutation> getMutations(ClientState state,
+    public List<? extends IMutation> getMutations(ClientState state,
QueryOptions options,
boolean local,
long timestamp,
long nowInSeconds,
- Dispatcher.RequestTime requestTime)
+ Dispatcher.RequestTime requestTime,
+ boolean constructingAccordBaseUpdate)
{
        List<ByteBuffer> keys = buildPartitionKeyNames(options, state);
        HashMultiset<ByteBuffer> perPartitionKeyCounts = HashMultiset.create(keys);
SingleTableUpdatesCollector collector = new SingleTableUpdatesCollector(metadata, updatedColumns, perPartitionKeyCounts);
- addUpdates(collector, keys, state, options, local, timestamp, nowInSeconds, requestTime);
- return collector.toMutations(state);
+ addUpdates(collector, keys, state, options, local, timestamp, nowInSeconds, requestTime, constructingAccordBaseUpdate);
+        // 'local' indicates a test or internal caller that bypasses distributed-system modification checks
+ return collector.toMutations(state, local ? PotentialTxnConflicts.ALLOW : PotentialTxnConflicts.DISALLOW);
+ }
+
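+    /**
+     * Evaluates this statement into the single {@link PartitionUpdate} used as the base update
+     * of an Accord transaction. Placeholder timestamp and request-time values are passed, as the
+     * update is only a base to be completed when the transaction executes.
+     */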
+ public PartitionUpdate getTxnUpdate(ClientState state, QueryOptions options)
+ {
+        List<? extends IMutation> mutations = getMutations(state, options, false, 0, 0, new Dispatcher.RequestTime(0, 0), true);
+ // TODO: Temporary fix for CASSANDRA-20079
+ if (mutations.isEmpty())
+ return PartitionUpdate.emptyUpdate(metadata, metadata.partitioner.decorateKey(ByteBufferUtil.EMPTY_BYTE_BUFFER));
+ if (mutations.size() != 1)
+            throw new IllegalArgumentException("When running within a transaction, modification statements may only mutate a single partition");
+ return Iterables.getOnlyElement(mutations.get(0).getPartitionUpdates());
+ }
+
+    private static List<TxnReferenceOperation> getTxnReferenceOps(List<ReferenceOperation> operations, QueryOptions options)
+ {
+ if (operations.isEmpty())
+ return Collections.emptyList();
+
+        List<TxnReferenceOperation> result = new ArrayList<>(operations.size());
+ for (ReferenceOperation operation : operations)
+ result.add(operation.bindAndGet(options));
+ return result;
+ }
+
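+    /**
+     * Binds this statement's regular and static substitutions into serializable reference
+     * operations; the clustering is resolved only when regular-column operations exist,
+     * since static operations do not apply to a specific row.
+     */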
+ public TxnReferenceOperations getTxnReferenceOps(QueryOptions options, ClientState state)
+ {
+        List<TxnReferenceOperation> regularOps = getTxnReferenceOps(operations.regularSubstitutions(), options);
+        List<TxnReferenceOperation> staticOps = getTxnReferenceOps(operations.staticSubstitutions(), options);
+        Clustering<?> clustering = !regularOps.isEmpty() ? Iterables.getOnlyElement(createClustering(options, state)) : null;
+ return new TxnReferenceOperations(metadata, clustering, regularOps, staticOps);
+ }
+
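+    /**
+     * Returns a copy of this statement whose operations have been rewritten for execution
+     * inside an Accord transaction, computing and caching it on first use. Statements that
+     * never read before writing can be used as-is and are returned unchanged.
+     */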
+ public ModificationStatement forTxn()
+ {
+ if (requiresRead.isEmpty()) return this;
+ ModificationStatement migrated = txnStmt;
+ if (migrated == null)
+ {
+ synchronized (requiresRead)
+ {
+ migrated = txnStmt;
+ if (migrated == null)
+ txnStmt = migrated = withOperations(operations.forTxn());
+ }
+ }
+ return migrated;
+ }
+
+ protected abstract ModificationStatement withOperations(Operations operations);
+
+ @VisibleForTesting
+    public List<ReferenceOperation> getSubstitutions()
+ {
+ return operations.allSubstitutions();
+ }
+
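+    /**
+     * Bundles this statement's base partition update and its bound reference operations into
+     * a single write fragment of the enclosing transaction.
+     */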
+ public TxnWrite.Fragment getTxnWriteFragment(int index, ClientState state, QueryOptions options)
+ {
+ PartitionUpdate baseUpdate = getTxnUpdate(state, options);
+ TxnReferenceOperations referenceOps = getTxnReferenceOps(options, state);
+ return new TxnWrite.Fragment(index, baseUpdate, referenceOps);
}
final void addUpdates(UpdatesCollector collector,
@@ -762,7 +906,8 @@ final void addUpdates(UpdatesCollector collector,
boolean local,
long timestamp,
long nowInSeconds,
- Dispatcher.RequestTime requestTime)
+ Dispatcher.RequestTime requestTime,
+ boolean constructingAccordBaseUpdate)
{
if (hasSlices())
{
@@ -780,7 +925,8 @@ final void addUpdates(UpdatesCollector collector,
local,
timestamp,
nowInSeconds,
- requestTime);
+ requestTime
+ );
for (ByteBuffer key : keys)
{
Validation.validateKey(metadata(), key);
@@ -848,7 +994,8 @@ private UpdateParameters makeUpdateParameters(Collection keys,
local,
timestamp,
nowInSeconds,
- requestTime);
+ requestTime
+ );
return makeUpdateParameters(keys,
new ClusteringIndexNamesFilter(clusterings, false),
@@ -858,7 +1005,8 @@ private UpdateParameters makeUpdateParameters(Collection keys,
local,
timestamp,
nowInSeconds,
- requestTime);
+ requestTime
+ );
}
    private UpdateParameters makeUpdateParameters(Collection<ByteBuffer> keys,
@@ -882,7 +1030,6 @@ private UpdateParameters makeUpdateParameters(Collection keys,
requestTime);
return new UpdateParameters(metadata(),
- updatedColumns(),
state,
options,
getTimestamp(timestamp, options),
@@ -898,13 +1045,15 @@ public static abstract class Parsed extends QualifiedStatement
        private final List<Pair<ColumnIdentifier, ColumnCondition.Raw>> conditions;
private final boolean ifNotExists;
private final boolean ifExists;
+ protected final StatementSource source;
protected Parsed(QualifiedName name,
StatementType type,
Attributes.Raw attrs,
                         List<Pair<ColumnIdentifier, ColumnCondition.Raw>> conditions,
boolean ifNotExists,
- boolean ifExists)
+ boolean ifExists,
+ StatementSource source)
{
super(name);
this.type = type;
@@ -912,6 +1061,7 @@ protected Parsed(QualifiedName name,
this.conditions = conditions == null ? Collections.emptyList() : conditions;
this.ifNotExists = ifNotExists;
this.ifExists = ifExists;
+ this.source = source;
}
public ModificationStatement prepare(ClientState state)
@@ -928,6 +1078,7 @@ public ModificationStatement prepare(ClientState state, VariableSpecifications b
Conditions preparedConditions = prepareConditions(metadata, bindVariables);
+ // TODO: if this is a txn and has a read name, and updates non-static columns, confirm it selects an entire row
return prepareInternal(state, metadata, bindVariables, preparedConditions, preparedAttributes);
}
@@ -952,7 +1103,6 @@ private Conditions prepareConditions(TableMetadata metadata, VariableSpecificati
if (ifNotExists)
{
assert conditions.isEmpty();
- assert !ifExists;
return Conditions.IF_NOT_EXISTS_CONDITION;
}
@@ -1021,4 +1171,24 @@ public List getConditions()
return conditions;
}
}
+
+ private static final Constants.Value ONE = new Constants.Value(ByteBufferUtil.bytes(1));
+
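+    /**
+     * Builds the internal auto-read SELECT a transaction issues on behalf of this statement:
+     * it fetches exactly the columns required for read-before-write operations, reusing the
+     * statement's own restrictions with a fixed LIMIT of 1.
+     */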
+ public SelectStatement createSelectForTxn()
+ {
+ // TODO: get working with static-only updates that don't specify any/all primary key columns
+ Preconditions.checkState(getRestrictions().hasAllPrimaryKeyColumnsRestrictedByEqualities());
+ Selection selection = Selection.forColumns(metadata, Lists.newArrayList(requiresRead), false);
+ return new SelectStatement(metadata,
+ bindVariables,
+ SelectStatement.defaultParameters,
+ selection,
+ getRestrictions(),
+ false,
+ null,
+ null,
+ ONE,
+ null,
+ StatementSource.INTERNAL);
+ }
}
diff --git a/src/java/org/apache/cassandra/cql3/statements/QualifiedStatement.java b/src/java/org/apache/cassandra/cql3/statements/QualifiedStatement.java
index 4ed41d168888..c7183a9d1b54 100644
--- a/src/java/org/apache/cassandra/cql3/statements/QualifiedStatement.java
+++ b/src/java/org/apache/cassandra/cql3/statements/QualifiedStatement.java
@@ -78,4 +78,50 @@ public String toString()
{
return ToStringBuilder.reflectionToString(this, ToStringStyle.SHORT_PREFIX_STYLE);
}
+
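+    /**
+     * A qualified statement composed of child statements (e.g. a transaction). Keyspace
+     * qualification is delegated to every child; the composite itself has no keyspace or
+     * name of its own.
+     */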
+ public static abstract class Composite extends QualifiedStatement
+ {
+ Composite()
+ {
+ super(null);
+ }
+
+        protected abstract Iterable<? extends QualifiedStatement> getStatements();
+
+ @Override
+ public boolean isFullyQualified()
+ {
+ for (QualifiedStatement statement : getStatements())
+ if (!statement.isFullyQualified())
+ return false;
+
+ return true;
+ }
+
+ @Override
+ public void setKeyspace(ClientState state)
+ {
+ for (QualifiedStatement statement : getStatements())
+ statement.setKeyspace(state);
+ }
+
+ @Override
+ public void setKeyspace(String keyspace)
+ {
+ for (QualifiedStatement statement : getStatements())
+ statement.setKeyspace(keyspace);
+ }
+
+ @Override
+ public String keyspace()
+ {
+ return null;
+ }
+
+ @Override
+ public String name()
+ {
+ return null;
+ }
+ }
}
diff --git a/src/java/org/apache/cassandra/cql3/statements/SelectStatement.java b/src/java/org/apache/cassandra/cql3/statements/SelectStatement.java
index aebfffd98364..afdd96dd1363 100644
--- a/src/java/org/apache/cassandra/cql3/statements/SelectStatement.java
+++ b/src/java/org/apache/cassandra/cql3/statements/SelectStatement.java
@@ -18,10 +18,20 @@
package org.apache.cassandra.cql3.statements;
import java.nio.ByteBuffer;
-import java.util.*;
-import java.util.stream.Collectors;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.EnumSet;
+import java.util.Iterator;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.NavigableSet;
+import java.util.Set;
+import java.util.TreeMap;
import java.util.concurrent.TimeUnit;
-
+import java.util.stream.Collectors;
import javax.annotation.concurrent.ThreadSafe;
import com.google.common.annotations.VisibleForTesting;
@@ -29,24 +39,27 @@
import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Iterables;
-
+import org.apache.commons.lang3.builder.ToStringBuilder;
+import org.apache.commons.lang3.builder.ToStringStyle;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.cassandra.audit.AuditLogContext;
import org.apache.cassandra.audit.AuditLogEntryType;
import org.apache.cassandra.auth.Permission;
-import org.apache.cassandra.cql3.restrictions.SingleRestriction;
-import org.apache.cassandra.cql3.terms.Term;
-import org.apache.cassandra.db.guardrails.Guardrails;
-import org.apache.cassandra.index.Index;
-import org.apache.cassandra.schema.ColumnMetadata;
-import org.apache.cassandra.schema.Schema;
-import org.apache.cassandra.schema.SchemaConstants;
-import org.apache.cassandra.schema.TableMetadata;
-import org.apache.cassandra.schema.TableMetadataRef;
-import org.apache.cassandra.cql3.*;
+import org.apache.cassandra.cql3.CQLStatement;
+import org.apache.cassandra.cql3.ColumnIdentifier;
+import org.apache.cassandra.cql3.ColumnSpecification;
+import org.apache.cassandra.cql3.Ordering;
+import org.apache.cassandra.cql3.QualifiedName;
+import org.apache.cassandra.cql3.QueryOptions;
+import org.apache.cassandra.cql3.QueryProcessor;
+import org.apache.cassandra.cql3.ResultSet;
+import org.apache.cassandra.cql3.StatementSource;
+import org.apache.cassandra.cql3.VariableSpecifications;
+import org.apache.cassandra.cql3.WhereClause;
import org.apache.cassandra.cql3.functions.Function;
+import org.apache.cassandra.cql3.restrictions.SingleRestriction;
import org.apache.cassandra.cql3.restrictions.StatementRestrictions;
import org.apache.cassandra.cql3.selection.RawSelector;
import org.apache.cassandra.cql3.selection.ResultSetBuilder;
@@ -55,10 +68,32 @@
import org.apache.cassandra.cql3.selection.Selection;
import org.apache.cassandra.cql3.selection.Selection.Selectors;
import org.apache.cassandra.cql3.selection.Selector;
-import org.apache.cassandra.db.*;
+import org.apache.cassandra.cql3.terms.Marker;
+import org.apache.cassandra.cql3.terms.Term;
+import org.apache.cassandra.db.Clustering;
+import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.db.ConsistencyLevel;
+import org.apache.cassandra.db.DataRange;
+import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.db.Keyspace;
+import org.apache.cassandra.db.PartitionPosition;
+import org.apache.cassandra.db.PartitionRangeReadQuery;
+import org.apache.cassandra.db.ReadCommand.PotentialTxnConflicts;
+import org.apache.cassandra.db.ReadExecutionController;
+import org.apache.cassandra.db.ReadQuery;
+import org.apache.cassandra.db.SinglePartitionReadCommand;
+import org.apache.cassandra.db.SinglePartitionReadQuery;
+import org.apache.cassandra.db.Slice;
+import org.apache.cassandra.db.Slices;
import org.apache.cassandra.db.aggregation.AggregationSpecification;
import org.apache.cassandra.db.aggregation.GroupMaker;
-import org.apache.cassandra.db.filter.*;
+import org.apache.cassandra.db.filter.ClusteringIndexFilter;
+import org.apache.cassandra.db.filter.ClusteringIndexNamesFilter;
+import org.apache.cassandra.db.filter.ClusteringIndexSliceFilter;
+import org.apache.cassandra.db.filter.ColumnFilter;
+import org.apache.cassandra.db.filter.DataLimits;
+import org.apache.cassandra.db.filter.RowFilter;
+import org.apache.cassandra.db.guardrails.Guardrails;
import org.apache.cassandra.db.marshal.CompositeType;
import org.apache.cassandra.db.marshal.Int32Type;
import org.apache.cassandra.db.partitions.PartitionIterator;
@@ -66,9 +101,20 @@
import org.apache.cassandra.db.rows.RowIterator;
import org.apache.cassandra.db.view.View;
import org.apache.cassandra.dht.AbstractBounds;
-import org.apache.cassandra.exceptions.*;
-import org.apache.cassandra.metrics.ClientRequestSizeMetrics;
+import org.apache.cassandra.exceptions.InvalidRequestException;
+import org.apache.cassandra.exceptions.ReadSizeAbortException;
+import org.apache.cassandra.exceptions.RequestExecutionException;
+import org.apache.cassandra.exceptions.RequestFailureReason;
+import org.apache.cassandra.exceptions.RequestValidationException;
+import org.apache.cassandra.exceptions.UnauthorizedException;
+import org.apache.cassandra.index.Index;
import org.apache.cassandra.index.IndexRegistry;
+import org.apache.cassandra.metrics.ClientRequestSizeMetrics;
+import org.apache.cassandra.schema.ColumnMetadata;
+import org.apache.cassandra.schema.Schema;
+import org.apache.cassandra.schema.SchemaConstants;
+import org.apache.cassandra.schema.TableMetadata;
+import org.apache.cassandra.schema.TableMetadataRef;
import org.apache.cassandra.serializers.MarshalException;
import org.apache.cassandra.service.ClientState;
import org.apache.cassandra.service.ClientWarn;
@@ -84,9 +130,6 @@
import org.apache.cassandra.utils.FBUtilities;
import org.apache.cassandra.utils.NoSpamLogger;
-import org.apache.commons.lang3.builder.ToStringBuilder;
-import org.apache.commons.lang3.builder.ToStringStyle;
-
import static java.lang.String.format;
import static org.apache.cassandra.cql3.statements.RequestValidations.checkFalse;
import static org.apache.cassandra.cql3.statements.RequestValidations.checkNotNull;
@@ -107,7 +150,7 @@
* Note that select statements can be accessed by multiple threads, so we cannot rely on mutable attributes.
*/
@ThreadSafe
-public class SelectStatement implements CQLStatement.SingleKeyspaceCqlStatement
+public class SelectStatement implements CQLStatement.SingleKeyspaceCqlStatement, CQLStatement.ReturningCQLStatement
{
private static final Logger logger = LoggerFactory.getLogger(SelectStatement.class);
private static final NoSpamLogger noSpamLogger = NoSpamLogger.getLogger(SelectStatement.logger, 1, TimeUnit.MINUTES);
@@ -144,8 +187,10 @@ public class SelectStatement implements CQLStatement.SingleKeyspaceCqlStatement
*/
    private final ColumnComparator<List<ByteBuffer>> orderingComparator;
+ public final StatementSource source;
+
    // Used by forSelection below and by other internal statement builders
- private static final Parameters defaultParameters = new Parameters(Collections.emptyList(),
+ public static final Parameters defaultParameters = new Parameters(Collections.emptyList(),
Collections.emptyList(),
false,
false,
@@ -160,7 +205,8 @@ public SelectStatement(TableMetadata table,
AggregationSpecification.Factory aggregationSpecFactory,
                           ColumnComparator<List<ByteBuffer>> orderingComparator,
Term limit,
- Term perPartitionLimit)
+ Term perPartitionLimit,
+ StatementSource source)
{
this.table = table;
this.bindVariables = bindVariables;
@@ -172,6 +218,7 @@ public SelectStatement(TableMetadata table,
this.parameters = parameters;
this.limit = limit;
this.perPartitionLimit = perPartitionLimit;
+ this.source = source;
}
@Override
@@ -238,9 +285,11 @@ static SelectStatement forSelection(TableMetadata table, Selection selection)
null,
null,
null,
- null);
+ null,
+ StatementSource.INTERNAL);
}
+ @Override
public ResultSet.ResultMetadata getResultMetadata()
{
return selection.getResultMetadata();
@@ -336,7 +385,7 @@ public ResultMessage.Rows execute(QueryState state, QueryOptions options, Dispat
}
}
- ReadQuery query = getQuery(options, state.getClientState(), selectors.getColumnFilter(), nowInSec, limit);
+ ReadQuery query = getQuery(options, state.getClientState(), selectors.getColumnFilter(), nowInSec, limit, PotentialTxnConflicts.DISALLOW);
if (options.isReadThresholdsEnabled())
query.trackWarnings();
@@ -372,6 +421,11 @@ public AggregationSpecification getAggregationSpec(QueryOptions options)
return aggregationSpecFactory == null ? null : aggregationSpecFactory.newInstance(options);
}
+ public boolean hasAggregation()
+ {
+ return aggregationSpecFactory != null;
+ }
+
public ReadQuery getQuery(QueryOptions options, long nowInSec) throws RequestValidationException
{
Selectors selectors = selection.newSelectors(options);
@@ -382,7 +436,8 @@ public ReadQuery getQuery(QueryOptions options, long nowInSec) throws RequestVal
getLimit(options),
getPerPartitionLimit(options),
options.getPageSize(),
- getAggregationSpec(options));
+ getAggregationSpec(options),
+ PotentialTxnConflicts.DISALLOW);
}
public ReadQuery getQuery(QueryOptions options,
@@ -392,32 +447,36 @@ public ReadQuery getQuery(QueryOptions options,
int userLimit,
int perPartitionLimit,
int pageSize,
- AggregationSpecification aggregationSpec)
+ AggregationSpecification aggregationSpec,
+ PotentialTxnConflicts potentialTxnConflicts)
{
DataLimits limit = getDataLimits(userLimit, perPartitionLimit, pageSize, aggregationSpec);
- return getQuery(options, state, columnFilter, nowInSec, limit);
+ return getQuery(options, state, columnFilter, nowInSec, limit, potentialTxnConflicts);
}
public ReadQuery getQuery(QueryOptions options,
ClientState state,
ColumnFilter columnFilter,
long nowInSec,
- DataLimits limit)
+ DataLimits limit,
+ PotentialTxnConflicts potentialTxnConflicts)
{
- boolean isPartitionRangeQuery = restrictions.isKeyRange() || restrictions.usesSecondaryIndexing();
+ RowFilter rowFilter = getRowFilter(options, state);
- if (isPartitionRangeQuery)
+ if (restrictions.isKeyRange())
{
- if (restrictions.isKeyRange() && restrictions.usesSecondaryIndexing() && !SchemaConstants.isLocalSystemKeyspace(table.keyspace))
+ if (restrictions.usesSecondaryIndexing() && !SchemaConstants.isLocalSystemKeyspace(table.keyspace))
Guardrails.nonPartitionRestrictedIndexQueryEnabled.ensureEnabled(state);
- return getRangeCommand(options, state, columnFilter, limit, nowInSec);
+ return getRangeCommand(options, state, columnFilter, rowFilter, limit, nowInSec, potentialTxnConflicts);
}
- return getSliceCommands(options, state, columnFilter, limit, nowInSec);
- }
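+        // Secondary-index queries whose row filter is not strict cannot be served by the
+        // single-partition slice path, so route them through the range command as well.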
+ if (restrictions.usesSecondaryIndexing() && !rowFilter.isStrict())
+ return getRangeCommand(options, state, columnFilter, rowFilter, limit, nowInSec, potentialTxnConflicts);
+ return getSliceCommands(options, state, columnFilter, rowFilter, limit, nowInSec, potentialTxnConflicts);
+ }
private ResultMessage.Rows execute(ReadQuery query,
QueryOptions options,
ClientState state,
@@ -599,7 +658,8 @@ public ResultMessage.Rows executeInternal(QueryState state,
userLimit,
userPerPartitionLimit,
pageSize,
- aggregationSpec);
+ aggregationSpec,
+ PotentialTxnConflicts.ALLOW);
try (ReadExecutionController executionController = query.executionController())
{
@@ -646,7 +706,7 @@ public Map> executeRawInternal(QueryOptions options, Cli
throw new IllegalStateException();
Selectors selectors = selection.newSelectors(options);
- ReadQuery query = getQuery(options, state, selectors.getColumnFilter(), nowInSec, userLimit, userPerPartitionLimit, Integer.MAX_VALUE, null);
+ ReadQuery query = getQuery(options, state, selectors.getColumnFilter(), nowInSec, userLimit, userPerPartitionLimit, Integer.MAX_VALUE, null, PotentialTxnConflicts.ALLOW);
Map> result = Collections.emptyMap();
try (ReadExecutionController executionController = query.executionController())
@@ -714,8 +774,13 @@ public StatementRestrictions getRestrictions()
return restrictions;
}
+ public boolean isPartitionRangeQuery()
+ {
+ return isForPartitionRange(restrictions);
+ }
+
private ReadQuery getSliceCommands(QueryOptions options, ClientState state, ColumnFilter columnFilter,
- DataLimits limit, long nowInSec)
+ RowFilter rowFilter, DataLimits limit, long nowInSec, PotentialTxnConflicts potentialTxnConflicts)
{
        Collection<ByteBuffer> keys = restrictions.getPartitionKeys(options, state);
if (keys.isEmpty())
@@ -730,8 +795,6 @@ private ReadQuery getSliceCommands(QueryOptions options, ClientState state, Colu
if (filter == null || filter.isEmpty(table.comparator))
return ReadQuery.empty(table);
- RowFilter rowFilter = getRowFilter(options, state);
-
        List<DecoratedKey> decoratedKeys = new ArrayList<>(keys.size());
for (ByteBuffer key : keys)
{
@@ -739,7 +802,13 @@ private ReadQuery getSliceCommands(QueryOptions options, ClientState state, Colu
decoratedKeys.add(table.partitioner.decorateKey(ByteBufferUtil.clone(key)));
}
- return SinglePartitionReadQuery.createGroup(table, nowInSec, columnFilter, rowFilter, limit, decoratedKeys, filter);
+        SinglePartitionReadQuery.Group<? extends SinglePartitionReadQuery> group =
+            SinglePartitionReadQuery.createGroup(table, nowInSec, columnFilter, rowFilter, limit, decoratedKeys, filter, potentialTxnConflicts);
+
+ // If there's a secondary index that the commands can use, have it validate the request parameters.
+ group.maybeValidateIndex();
+
+ return group;
}
/**
@@ -787,14 +856,13 @@ public RowFilter rowFilterForInternalCalls()
return getRowFilter(QueryOptions.forInternalCalls(Collections.emptyList()), ClientState.forInternalCalls());
}
- private ReadQuery getRangeCommand(QueryOptions options, ClientState state, ColumnFilter columnFilter, DataLimits limit, long nowInSec)
+ private ReadQuery getRangeCommand(QueryOptions options, ClientState state, ColumnFilter columnFilter,
+ RowFilter rowFilter, DataLimits limit, long nowInSec, PotentialTxnConflicts potentialTxnConflicts)
{
ClusteringIndexFilter clusteringIndexFilter = makeClusteringIndexFilter(options, state, columnFilter);
if (clusteringIndexFilter == null)
return ReadQuery.empty(table);
- RowFilter rowFilter = getRowFilter(options, state);
-
        // The LIMIT provided by the user is the number of CQL rows they want returned.
        // We want getRangeSlice to count the number of columns, not the number of keys.
        AbstractBounds<PartitionPosition> keyBounds = restrictions.getPartitionKeyBounds(options);
@@ -802,7 +870,7 @@ private ReadQuery getRangeCommand(QueryOptions options, ClientState state, Colum
return ReadQuery.empty(table);
ReadQuery command =
- PartitionRangeReadQuery.create(table, nowInSec, columnFilter, rowFilter, limit, new DataRange(keyBounds, clusteringIndexFilter));
+ PartitionRangeReadQuery.create(table, nowInSec, columnFilter, rowFilter, limit, new DataRange(keyBounds, clusteringIndexFilter), potentialTxnConflicts);
// If there's a secondary index that the command can use, have it validate the request parameters.
command.maybeValidateIndex();
@@ -903,6 +971,11 @@ public int getLimit(QueryOptions options)
return getLimit(limit, options);
}
+ public boolean isLimitMarker()
+ {
+ return limit instanceof Marker;
+ }
+
/**
* Returns the per partition limit specified by the user.
* May be used by custom QueryHandler implementations
@@ -1150,6 +1223,11 @@ private void orderResults(ResultSet cqlRows, QueryOptions options, ClientState s
cqlRows.rows.sort(comparator);
}
+ private static boolean isForPartitionRange(StatementRestrictions restrictions)
+ {
+ return restrictions.isKeyRange() || restrictions.usesSecondaryIndexing();
+ }
+
public static class RawStatement extends QualifiedStatement
{
public final Parameters parameters;
@@ -1158,13 +1236,15 @@ public static class RawStatement extends QualifiedStatement
public final Term.Raw limit;
public final Term.Raw perPartitionLimit;
private ClientState state;
+ public final StatementSource source;
public RawStatement(QualifiedName cfName,
Parameters parameters,
                            List<RawSelector> selectClause,
WhereClause whereClause,
Term.Raw limit,
- Term.Raw perPartitionLimit)
+ Term.Raw perPartitionLimit,
+ StatementSource source)
{
super(cfName);
this.parameters = parameters;
@@ -1172,16 +1252,35 @@ public RawStatement(QualifiedName cfName,
this.whereClause = whereClause;
this.limit = limit;
this.perPartitionLimit = perPartitionLimit;
+ this.source = source;
}
public SelectStatement prepare(ClientState state)
{
// Cache locally for use by Guardrails
this.state = state;
- return prepare(state, false);
+ return prepare(state, false, bindVariables);
}
- public SelectStatement prepare(ClientState state, boolean forView) throws InvalidRequestException
+ public SelectStatement prepare(ClientState state, boolean forView)
+ {
+ return prepare(state, forView, bindVariables);
+ }
+
+ public SelectStatement prepare(VariableSpecifications variableSpecifications)
+ {
+ return prepare(state, false, variableSpecifications);
+ }
+
+ public SelectStatement prepare(boolean forView)
+ {
+ return prepare(state, forView, bindVariables);
+ }
+
+ /**
+ * @throws InvalidRequestException if the statement being prepared is invalid
+ */
+ public SelectStatement prepare(ClientState state, boolean forView, VariableSpecifications variableSpecifications) throws InvalidRequestException
{
TableMetadata table = Schema.instance.validateTable(keyspace(), name());
@@ -1189,7 +1288,7 @@ public SelectStatement prepare(ClientState state, boolean forView) throws Invali
boolean containsOnlyStaticColumns = selectOnlyStaticColumns(table, selectables);
            List<Ordering> orderings = getOrderings(table);
- StatementRestrictions restrictions = prepareRestrictions(state, table, bindVariables, orderings, containsOnlyStaticColumns, forView);
+ StatementRestrictions restrictions = prepareRestrictions(state, table, variableSpecifications, orderings, containsOnlyStaticColumns, forView);
// If we order post-query, the sorted column needs to be in the ResultSet for sorting,
// even if we don't ultimately ship them to the client (CASSANDRA-4911).
@@ -1198,7 +1297,7 @@ public SelectStatement prepare(ClientState state, boolean forView) throws Invali
Selection selection = prepareSelection(table,
selectables,
- bindVariables,
+ variableSpecifications,
resultSetOrderingColumns,
restrictions);
@@ -1234,15 +1333,16 @@ public SelectStatement prepare(ClientState state, boolean forView) throws Invali
checkNeedsFiltering(table, restrictions);
return new SelectStatement(table,
- bindVariables,
+ variableSpecifications,
parameters,
selection,
restrictions,
isReversed,
aggregationSpecFactory,
orderingComparator,
- prepareLimit(bindVariables, limit, keyspace(), limitReceiver()),
- prepareLimit(bindVariables, perPartitionLimit, keyspace(), perPartitionLimitReceiver()));
+ prepareLimit(variableSpecifications, limit, keyspace(), limitReceiver()),
+ prepareLimit(variableSpecifications, perPartitionLimit, keyspace(), perPartitionLimitReceiver()),
+ source);
}
        private Set<ColumnMetadata> getResultSetOrdering(StatementRestrictions restrictions, Map<ColumnMetadata, Ordering> orderingColumns)
@@ -1573,7 +1673,7 @@ private boolean isReversed(TableMetadata table, Map or
private void checkNeedsFiltering(TableMetadata table, StatementRestrictions restrictions) throws InvalidRequestException
{
// non-key-range non-indexed queries cannot involve filtering underneath
- if (!parameters.allowFiltering && (restrictions.isKeyRange() || restrictions.usesSecondaryIndexing()))
+ if (!parameters.allowFiltering && isForPartitionRange(restrictions))
{
// We will potentially filter data if the row filter is not the identity and there isn't any index group
// supporting all the expressions in the filter.
@@ -1612,18 +1712,30 @@ public static class Parameters
public final boolean isDistinct;
public final boolean allowFiltering;
public final boolean isJson;
+ public final String refName;
public Parameters(List orderings,
List groups,
boolean isDistinct,
boolean allowFiltering,
boolean isJson)
+ {
+ this(orderings, groups, isDistinct, allowFiltering, isJson, null);
+ }
+
+ public Parameters(List orderings,
+ List groups,
+ boolean isDistinct,
+ boolean allowFiltering,
+ boolean isJson,
+ String refName)
{
this.orderings = orderings;
this.groups = groups;
this.isDistinct = isDistinct;
this.allowFiltering = allowFiltering;
this.isJson = isJson;
+ this.refName = refName;
}
}
@@ -1772,7 +1884,7 @@ public String toString()
private String loggableTokens(QueryOptions options, ClientState state)
{
- if (restrictions.isKeyRange() || restrictions.usesSecondaryIndexing())
+ if (isPartitionRangeQuery())
{
            AbstractBounds<PartitionPosition> bounds = restrictions.getPartitionKeyBounds(options);
return "token range: " + (bounds.inclusiveLeft() ? '[' : '(') +
@@ -1802,14 +1914,14 @@ private String loggableTokens(QueryOptions options, ClientState state)
}
}
- private String asCQL(QueryOptions options, ClientState state)
+ public String asCQL(QueryOptions options, ClientState state)
{
ColumnFilter columnFilter = selection.newSelectors(options).getColumnFilter();
StringBuilder sb = new StringBuilder();
sb.append("SELECT ").append(queriedColumns().toCQLString());
sb.append(" FROM ").append(table.keyspace).append('.').append(table.name);
- if (restrictions.isKeyRange() || restrictions.usesSecondaryIndexing())
+ if (isPartitionRangeQuery())
{
// partition range
ClusteringIndexFilter clusteringIndexFilter = makeClusteringIndexFilter(options, state, columnFilter);
diff --git a/src/java/org/apache/cassandra/cql3/statements/SingleTableUpdatesCollector.java b/src/java/org/apache/cassandra/cql3/statements/SingleTableUpdatesCollector.java
index 5ff299eb88d4..6a7c325a0a01 100644
--- a/src/java/org/apache/cassandra/cql3/statements/SingleTableUpdatesCollector.java
+++ b/src/java/org/apache/cassandra/cql3/statements/SingleTableUpdatesCollector.java
@@ -30,6 +30,7 @@
import org.apache.cassandra.db.DecoratedKey;
import org.apache.cassandra.db.IMutation;
import org.apache.cassandra.db.Mutation;
+import org.apache.cassandra.db.ReadCommand.PotentialTxnConflicts;
import org.apache.cassandra.db.RegularAndStaticColumns;
import org.apache.cassandra.db.commitlog.CommitLogSegment;
import org.apache.cassandra.db.partitions.PartitionUpdate;
@@ -94,7 +95,7 @@ public PartitionUpdate.Builder getPartitionUpdateBuilder(TableMetadata metadata,
* @return a collection containing all the mutations.
*/
@Override
-    public List<IMutation> toMutations(ClientState state)
+    public List<IMutation> toMutations(ClientState state, PotentialTxnConflicts potentialTxnConflicts)
{
        List<IMutation> ms = new ArrayList<>(puBuilders.size());
for (PartitionUpdate.Builder builder : puBuilders.values())
@@ -106,7 +107,7 @@ public List toMutations(ClientState state)
else if (metadata.isCounter())
mutation = new CounterMutation(new Mutation(builder.build()), counterConsistencyLevel);
else
- mutation = new Mutation(builder.build());
+ mutation = new Mutation(builder.build(), potentialTxnConflicts);
mutation.validateIndexedColumns(state);
mutation.validateSize(MessagingService.current_version, CommitLogSegment.ENTRY_OVERHEAD_SIZE);
diff --git a/src/java/org/apache/cassandra/cql3/statements/TransactionStatement.java b/src/java/org/apache/cassandra/cql3/statements/TransactionStatement.java
new file mode 100644
index 000000000000..f3b469c0e3b3
--- /dev/null
+++ b/src/java/org/apache/cassandra/cql3/statements/TransactionStatement.java
@@ -0,0 +1,739 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.cql3.statements;
+
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.SortedSet;
+import java.util.TreeSet;
+import java.util.function.Consumer;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;
+import javax.annotation.Nullable;
+
+import com.google.common.annotations.VisibleForTesting;
+import com.google.common.base.Preconditions;
+import com.google.common.collect.Iterables;
+
+import accord.api.Key;
+import accord.primitives.Keys;
+import accord.primitives.Routable.Domain;
+import accord.primitives.Txn;
+import org.agrona.collections.Int2ObjectHashMap;
+import org.apache.cassandra.audit.AuditLogContext;
+import org.apache.cassandra.audit.AuditLogEntryType;
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.cql3.CQLStatement;
+import org.apache.cassandra.cql3.ColumnSpecification;
+import org.apache.cassandra.cql3.QueryOptions;
+import org.apache.cassandra.cql3.ResultSet;
+import org.apache.cassandra.cql3.VariableSpecifications;
+import org.apache.cassandra.cql3.selection.ResultSetBuilder;
+import org.apache.cassandra.cql3.selection.Selection;
+import org.apache.cassandra.cql3.transactions.ConditionStatement;
+import org.apache.cassandra.cql3.transactions.ReferenceOperation;
+import org.apache.cassandra.cql3.transactions.RowDataReference;
+import org.apache.cassandra.cql3.transactions.SelectReferenceSource;
+import org.apache.cassandra.db.ConsistencyLevel;
+import org.apache.cassandra.db.SinglePartitionReadCommand;
+import org.apache.cassandra.db.SinglePartitionReadQuery;
+import org.apache.cassandra.db.filter.DataLimits;
+import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.dht.Token;
+import org.apache.cassandra.exceptions.InvalidRequestException;
+import org.apache.cassandra.schema.ColumnMetadata;
+import org.apache.cassandra.schema.Schema;
+import org.apache.cassandra.schema.TableId;
+import org.apache.cassandra.schema.TableParams;
+import org.apache.cassandra.service.ClientState;
+import org.apache.cassandra.service.QueryState;
+import org.apache.cassandra.service.accord.AccordService;
+import org.apache.cassandra.service.accord.api.AccordRoutableKey;
+import org.apache.cassandra.service.accord.api.PartitionKey;
+import org.apache.cassandra.service.accord.txn.AccordUpdate;
+import org.apache.cassandra.service.accord.txn.TxnCondition;
+import org.apache.cassandra.service.accord.txn.TxnData;
+import org.apache.cassandra.service.accord.txn.TxnDataKeyValue;
+import org.apache.cassandra.service.accord.txn.TxnNamedRead;
+import org.apache.cassandra.service.accord.txn.TxnQuery;
+import org.apache.cassandra.service.accord.txn.TxnRead;
+import org.apache.cassandra.service.accord.txn.TxnReference;
+import org.apache.cassandra.service.accord.txn.TxnResult;
+import org.apache.cassandra.service.accord.txn.TxnUpdate;
+import org.apache.cassandra.service.accord.txn.TxnWrite;
+import org.apache.cassandra.service.consensus.TransactionalMode;
+import org.apache.cassandra.service.consensus.migration.TransactionalMigrationFromMode;
+import org.apache.cassandra.tcm.ClusterMetadata;
+import org.apache.cassandra.tcm.Epoch;
+import org.apache.cassandra.transport.Dispatcher;
+import org.apache.cassandra.transport.messages.ResultMessage;
+import org.apache.cassandra.utils.FBUtilities;
+
+import static accord.primitives.Txn.Kind.Read;
+import static com.google.common.base.Preconditions.checkArgument;
+import static org.apache.cassandra.cql3.statements.RequestValidations.checkFalse;
+import static org.apache.cassandra.cql3.statements.RequestValidations.checkNotNull;
+import static org.apache.cassandra.cql3.statements.RequestValidations.checkTrue;
+import static org.apache.cassandra.cql3.statements.RequestValidations.invalidRequest;
+import static org.apache.cassandra.service.accord.txn.TxnData.TxnDataNameKind.AUTO_READ;
+import static org.apache.cassandra.service.accord.txn.TxnData.TxnDataNameKind.RETURNING;
+import static org.apache.cassandra.service.accord.txn.TxnData.TxnDataNameKind.USER;
+import static org.apache.cassandra.service.accord.txn.TxnData.txnDataName;
+import static org.apache.cassandra.service.accord.txn.TxnRead.createTxnRead;
+import static org.apache.cassandra.service.accord.txn.TxnResult.Kind.retry_new_protocol;
+import static org.apache.cassandra.service.consensus.migration.ConsensusRequestRouter.getTableMetadata;
+import static org.apache.cassandra.service.consensus.migration.ConsensusRequestRouter.shouldReadEphemerally;
+
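+/**
+ * A multi-statement Accord transaction: LET assignments name single-row reads, an optional
+ * SELECT or list of row references produces the result set, and any updates are applied
+ * atomically, optionally guarded by IF conditions.
+ */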
+public class TransactionStatement implements CQLStatement.CompositeCQLStatement, CQLStatement.ReturningCQLStatement
+{
+ public static final String DUPLICATE_TUPLE_NAME_MESSAGE = "The name '%s' has already been used by a LET assignment.";
+    public static final String INCOMPLETE_PARTITION_KEY_SELECT_MESSAGE = "SELECT must specify all partition key elements. Partition key elements must always be specified with equality operators; %s %s";
+    public static final String INCOMPLETE_PRIMARY_KEY_SELECT_MESSAGE = "SELECT must specify either all primary key elements or all partition key elements and LIMIT 1. In both cases, partition key elements must always be specified with equality operators; %s %s";
+ public static final String NO_CONDITIONS_IN_UPDATES_MESSAGE = "Updates within transactions may not specify their own conditions; %s statement %s";
+ public static final String NO_TIMESTAMPS_IN_UPDATES_MESSAGE = "Updates within transactions may not specify custom timestamps; %s statement %s";
+    public static final String NO_TTLS_IN_UPDATES_MESSAGE = "Updates within transactions may not specify custom TTLs; %s statement %s";
+ public static final String TRANSACTIONS_DISABLED_ON_TABLE_MESSAGE = "Accord transactions are disabled on table (See transactional_mode in table options); %s statement %s";
+ public static final String TRANSACTIONS_DISABLED_ON_TABLE_BEING_DROPPED_MESSAGE = "Accord transactions are disabled on table (table is being dropped); %s statement %s";
+ public static final String NO_COUNTERS_IN_TXNS_MESSAGE = "Counter columns cannot be accessed within a transaction; %s statement %s";
+ public static final String NO_AGGREGATION_IN_TXNS_MESSAGE = "No aggregation functions allowed within a transaction; %s statement %s";
+ public static final String NO_ORDER_BY_IN_TXNS_MESSAGE = "No ORDER BY clause allowed within a transaction; %s statement %s";
+ public static final String NO_GROUP_BY_IN_TXNS_MESSAGE = "No GROUP BY clause allowed within a transaction; %s statement %s";
+ public static final String EMPTY_TRANSACTION_MESSAGE = "Transaction contains no reads or writes";
+ public static final String SELECT_REFS_NEED_COLUMN_MESSAGE = "SELECT references must specify a column.";
+ public static final String TRANSACTIONS_DISABLED_MESSAGE = "Accord transactions are disabled. (See accord.enabled in cassandra.yaml)";
+ public static final String ILLEGAL_RANGE_QUERY_MESSAGE = "Range queries are not allowed for reads within a transaction; %s %s";
+    public static final String UNSUPPORTED_MIGRATION = "Transaction statements are unsupported when migrating away from Accord or before migration to Accord is complete for a range";
+    public static final String NO_PARTITION_IN_CLAUSE_WITH_LIMIT = "Using a LIMIT when the partition key is restricted by an IN clause is not currently supported; %s statement %s";
+
+ static class NamedSelect
+ {
+ final int name;
+ final SelectStatement select;
+
+ public NamedSelect(int name, SelectStatement select)
+ {
+ this.name = name;
+ this.select = select;
+ }
+ }
+
+    private final List<NamedSelect> assignments;
+    private final NamedSelect returningSelect;
+    private final List<RowDataReference> returningReferences;
+    private final List<ModificationStatement> updates;
+    private final List<ConditionStatement> conditions;
+
+ private final VariableSpecifications bindVariables;
+ private final ResultSet.ResultMetadata resultMetadata;
+
+ private long minEpoch = Epoch.EMPTY.getEpoch();
+
+    public TransactionStatement(List<NamedSelect> assignments,
+                                NamedSelect returningSelect,
+                                List<RowDataReference> returningReferences,
+                                List<ModificationStatement> updates,
+                                List<ConditionStatement> conditions,
+                                VariableSpecifications bindVariables)
+ {
+ this.assignments = assignments;
+ this.returningSelect = returningSelect;
+ this.returningReferences = returningReferences;
+ this.updates = updates;
+ this.conditions = conditions;
+ this.bindVariables = bindVariables;
+
+ if (returningSelect != null)
+ {
+ resultMetadata = returningSelect.select.getResultMetadata();
+ }
+ else if (returningReferences != null && !returningReferences.isEmpty())
+ {
+            List<ColumnSpecification> names = new ArrayList<>(returningReferences.size());
+ for (RowDataReference reference : returningReferences)
+ names.add(reference.toResultMetadata());
+ resultMetadata = new ResultSet.ResultMetadata(names);
+ }
+ else
+ {
+ resultMetadata = ResultSet.ResultMetadata.EMPTY;
+ }
+ }
+
+    public List<ModificationStatement> getUpdates()
+ {
+ return updates;
+ }
+
+ @Override
+    public List<ColumnSpecification> getBindVariables()
+ {
+ return bindVariables.getBindVariables();
+ }
+
+ @Override
+ public void authorize(ClientState state)
+ {
+ // Assess read permissions for all data from both explicit LET statements and generated reads.
+ for (NamedSelect let : assignments)
+ let.select.authorize(state);
+
+ if (returningSelect != null)
+ returningSelect.select.authorize(state);
+
+ for (ModificationStatement update : updates)
+ update.authorize(state);
+ }
+
+ @Override
+ public void validate(ClientState state)
+ {
+ for (NamedSelect statement : assignments)
+ statement.select.validate(state);
+ if (returningSelect != null)
+ returningSelect.select.validate(state);
+ for (ModificationStatement statement : updates)
+ statement.validate(state);
+ }
+
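+    // Exposes every child statement (LET selects, the returning select, and the updates) so
+    // composite-statement machinery can walk the transaction's components uniformly.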
+ @Override
+    public Iterable<CQLStatement> getStatements()
+ {
+ return () -> {
+            Stream<CQLStatement> stream = assignments.stream().map(n -> n.select);
+ if (returningSelect != null)
+ stream = Stream.concat(stream, Stream.of(returningSelect.select));
+ stream = Stream.concat(stream, updates.stream());
+ return stream.iterator();
+ };
+ }
+
+ @Override
+ public ResultSet.ResultMetadata getResultMetadata()
+ {
+ return resultMetadata;
+ }
+
+ TxnNamedRead createNamedRead(NamedSelect namedSelect, QueryOptions options, ClientState state)
+ {
+ SelectStatement select = namedSelect.select;
+        // A LET or SELECT read used here must resolve to a single partition; anything broader is rejected below.
+ @SuppressWarnings("unchecked")
+        SinglePartitionReadQuery.Group<SinglePartitionReadCommand> selectQuery = (SinglePartitionReadQuery.Group<SinglePartitionReadCommand>) select.getQuery(options, 0);
+
+ if (selectQuery.queries.size() != 1)
+ throw new IllegalArgumentException("Within a transaction, SELECT statements must select a single partition; found " + selectQuery.queries.size() + " partitions");
+
+ return new TxnNamedRead(namedSelect.name, Iterables.getOnlyElement(selectQuery.queries));
+ }
+
+    List<TxnNamedRead> createNamedReads(NamedSelect namedSelect, QueryOptions options, ClientState state)
+ {
+ SelectStatement select = namedSelect.select;
+        // Unlike createNamedRead, a returning SELECT may span multiple partitions, producing one named read per partition.
+ @SuppressWarnings("unchecked")
+        SinglePartitionReadQuery.Group<SinglePartitionReadCommand> selectQuery = (SinglePartitionReadQuery.Group<SinglePartitionReadCommand>) select.getQuery(options, 0);
+
+ if (selectQuery.queries.size() == 1)
+ return Collections.singletonList(new TxnNamedRead(namedSelect.name, Iterables.getOnlyElement(selectQuery.queries)));
+
+        List<TxnNamedRead> list = new ArrayList<>(selectQuery.queries.size());
+ for (int i = 0; i < selectQuery.queries.size(); i++)
+ list.add(new TxnNamedRead(txnDataName(RETURNING, i), selectQuery.queries.get(i)));
+ return list;
+ }
+
+ private List createNamedReads(QueryOptions options, ClientState state, @Nullable Int2ObjectHashMap