diff --git a/.build/build-accord.xml b/.build/build-accord.xml new file mode 100644 index 000000000000..6fc716d2d0c2 --- /dev/null +++ b/.build/build-accord.xml @@ -0,0 +1,45 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/.build/build-checkstyle.xml b/.build/build-checkstyle.xml index af5867e4aa9a..0484e4098c66 100644 --- a/.build/build-checkstyle.xml +++ b/.build/build-checkstyle.xml @@ -19,7 +19,7 @@ - + @@ -45,7 +45,7 @@ - + diff --git a/.build/build-rat.xml b/.build/build-rat.xml index fea028363c07..2fbacb74d0ec 100644 --- a/.build/build-rat.xml +++ b/.build/build-rat.xml @@ -76,6 +76,7 @@ + diff --git a/.build/build-resolver.xml b/.build/build-resolver.xml index 42bcc82512d0..49d1e8ba793d 100644 --- a/.build/build-resolver.xml +++ b/.build/build-resolver.xml @@ -178,7 +178,7 @@ - + @@ -206,7 +206,7 @@ - + diff --git a/.build/cassandra-build-deps-template.xml b/.build/cassandra-build-deps-template.xml index 4ec59cdf2d4b..c6b56955e013 100644 --- a/.build/cassandra-build-deps-template.xml +++ b/.build/cassandra-build-deps-template.xml @@ -155,5 +155,10 @@ org.bouncycastle bcutil-jdk18on + + org.apache.cassandra + cassandra-accord + tests + diff --git a/.build/cassandra-deps-template.xml b/.build/cassandra-deps-template.xml index a7c27ee12666..e6afd9b9b018 100644 --- a/.build/cassandra-deps-template.xml +++ b/.build/cassandra-deps-template.xml @@ -116,6 +116,10 @@ org.mindrot jbcrypt + + org.apache.cassandra + cassandra-accord + io.airlift airline diff --git a/.build/checkstyle_suppressions.xml b/.build/checkstyle_suppressions.xml index ed4d1443f7fc..230c808c1435 100644 --- a/.build/checkstyle_suppressions.xml +++ b/.build/checkstyle_suppressions.xml @@ -21,5 +21,4 @@ "https://checkstyle.org/dtds/suppressions_1_1.dtd"> - diff --git a/.build/git/git-hooks/post-checkout/100-update-submodules.sh b/.build/git/git-hooks/post-checkout/100-update-submodules.sh new file mode 100755 index 000000000000..b495ed086054 --- /dev/null +++ b/.build/git/git-hooks/post-checkout/100-update-submodules.sh @@ -0,0 +1,41 @@ +#!/usr/bin/env bash +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Redirect output to stderr. +exec 1>&2 + +#set -o xtrace +set -o errexit +set -o pipefail +set -o nounset + +bin="$(cd "$(dirname "$0")" > /dev/null; pwd)" + +_main() { + # In case the usage happens at a different layer, make sure to cd to the toplevel + local root_dir + root_dir="$(git rev-parse --show-toplevel)" + cd "$root_dir" + + if [[ ! -e .gitmodules ]]; then + # nothing to see here, look away! + return 0 + fi + git submodule update --init --recursive +} + +_main "$@" diff --git a/.build/git/git-hooks/post-switch b/.build/git/git-hooks/post-switch new file mode 120000 index 000000000000..5513d1deed30 --- /dev/null +++ b/.build/git/git-hooks/post-switch @@ -0,0 +1 @@ +post-checkout \ No newline at end of file diff --git a/.build/git/git-hooks/pre-commit/100-verify-submodules-pushed.sh b/.build/git/git-hooks/pre-commit/100-verify-submodules-pushed.sh new file mode 100755 index 000000000000..ec10bba04a5d --- /dev/null +++ b/.build/git/git-hooks/pre-commit/100-verify-submodules-pushed.sh @@ -0,0 +1,98 @@ +#!/usr/bin/env bash +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +## +## When working with submodules the top level project (Apache Cassandra) needs to commit all submodule +## changes so the top level knows what SHA to use. When working in a development environment it is +## common that multiple commits will exist in both projects, if the submodule has its history +## rewritten, then historic top level commits are no longer valid unless the SHAs are pushed to a +## remote repo; this is what the script attempts to do, make sure all SHAs added to the +## Apache Cassandra are backed up to a remote repo to make the Cassandra SHA buildable. +## + +# Redirect output to stderr. +exec 1>&2 + + +#set -o xtrace +set -o errexit +set -o pipefail +set -o nounset + +bin="$(cd "$(dirname "$0")" > /dev/null; pwd)" + +_log() { + echo -e "[pre-commit]\t$*" +} + +error() { + _log "$@" 1>&2 + exit 1 +} + +# Status Table +# A Added +# C Copied +# D Deleted +# M Modified +# R Renamed +# T Type Changed (i.e. regular file, symlink, submodule, …<200b>) +# U Unmerged +# X Unknown +# B Broken +_main() { + # In case the usage happens at a different layer, make sure to cd to the toplevel + local root_dir + root_dir="$(git rev-parse --show-toplevel)" + cd "$root_dir" + + [[ ! -e .gitmodules ]] && return 0 + local enabled=$(git config --bool cassandra.pre-commit.verify-submodules.enabled || echo true) + [ "$enabled" == "false" ] && return 0 + local submodules=( $(git config --file .gitmodules --get-regexp path | awk '{ print $2 }') ) + + local is_submodule=false + local git_sub_dir + local git_sha + while read status file; do + is_submodule=false + for to_check in "${submodules[*]}"; do + if [[ "$to_check" == "$file" ]]; then + is_submodule=true + break + fi + done + if $is_submodule; then + local enabled=$(git config --bool cassandra.pre-commit.verify-submodule-${file}.enabled || echo true) + [ "$enabled" == "false" ] && continue + _log "Submodule detected: ${file} with status ${status}; attempting a push" + _log "\tTo disable pushes, run" + _log "\t\tgit config --local cassandra.pre-commit.verify-submodules.enabled false" + _log "\tOr" + _log "\t\tgit config --local cassandra.pre-commit.verify-submodule-${file}.enabled false" + git_sub_dir="${file}/.git" + branch="$(git config -f .gitmodules "submodule.${file}.branch")" + [[ -z "${branch:-}" ]] && error "Submodule ${file} does not define a branch" + git_sha="$(git --git-dir "${git_sub_dir}" rev-parse HEAD)" + local remote="$(git --git-dir "${git_sub_dir}" config --get "branch.${branch}.remote" || error "Git branch ${branch} is not set up to track any remote in submodule ${file}")" + git --git-dir "${git_sub_dir}" fetch "${remote}" + git --git-dir "${git_sub_dir}" branch "${remote}/${branch}" --contains "${git_sha}" || error "Git commit ${git_sha} not found in $(git remote get-url "${remote}") on branch ${branch}" + fi + done < <(git diff --cached --name-status) +} + +_main "$@" diff --git a/.build/git/install-git-defaults.sh b/.build/git/install-git-defaults.sh new file mode 100755 index 000000000000..7c26ed5eda7c --- /dev/null +++ b/.build/git/install-git-defaults.sh @@ -0,0 +1,116 @@ +#!/usr/bin/env bash +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#set -o xtrace +set -o errexit +set -o pipefail +set -o nounset + +bin="$(cd "$(dirname "$0")" > /dev/null; pwd)" + +install_template_script() { + local -r name="$1" + local -r d_dir="$2" + + cat < "$name" +#!/usr/bin/env bash + +# This script is autogenerated by the Apache Cassandra build; DO NOT CHANGE! +# When this script is not found it will be installed automatically by the build +# If an existing script is found, that script will be reloated under ${d_dir} as 000-original.sh + +# Redirect output to stderr. +exec 1>&2 + +# Find all scripts to run +for path in \$(find "$d_dir" -name '*.sh' | perl -e "print sort{(split '/', \\\$a)[-1] <=> (split '/', \\\$b)[-1]}<>"); do + "\$path" "\$@" +done +EOF + chmod a+x "$name" +} + +install_hook() { + local -r git_dir="$1" + local -r hooks_dir="${git_dir}/hooks" + local -r name="$2" + local -r d_dir="${hooks_dir}/${name}.d" + local -r trigger_on_install=$3 + + mkdir "${d_dir}" &> /dev/null || true + local -r script_name="${hooks_dir}/${name}" + local installed=true + if [[ -e "$script_name" ]]; then + # was the script already installed? + if ! grep "This script is autogenerated by the Apache Cassandra build" "$script_name" &> /dev/null ; then + echo "$script_name found, but was not generated by the Apache Cassandra build; please remove or move to ${d_dir}/000-original.sh; creating and moving to ${d_dir} will cause it to run as expected, but won't conflict with hooks this build adds" 1>&2 + exit 1 + else + installed=false + fi + fi + # install all hooks + cp "$bin"/git-hooks/"${name}"/* "$d_dir"/ + + # install coordinator hook + install_template_script "$script_name" "$d_dir" + if $installed && $trigger_on_install ; then + echo "Running script $script_name" + "$script_name" + fi +} + +_install_hooks() { + local git_dir + # make sure to use --git-common-dir and not --git-dir to support worktrees + git_dir="$(git rev-parse --git-common-dir 2> /dev/null || true)" + if [[ -z "${git_dir:-}" ]]; then + # not in a git repo, noop + return 0 + fi + + # make sure hooks directory exists; does not exist by default for worktrees + mkdir -p "${git_dir}/hooks" &> /dev/null || true + + install_hook "$git_dir" "post-checkout" true + install_hook "$git_dir" "post-switch" false + install_hook "$git_dir" "pre-commit" false +} + +_git_config_set() { + local -r name="$1" + # only care about rc + git config --local --get "$name" &> /dev/null +} + +_install_configs() { + # when doing pull, this makes sure submodules are updated + _git_config_set submodule.recurse || git config --local submodule.recurse true +} + +_main() { + local git_dir + # make sure to use --git-common-dir and not --git-dir to support worktrees + git_dir="$(git rev-parse --git-common-dir 2> /dev/null || true)" + # not in a git repo, noop + [[ -z "${git_dir:-}" ]] && return 0 + + _install_configs + _install_hooks +} + +_main "$@" diff --git a/.build/parent-pom-template.xml b/.build/parent-pom-template.xml index 0235ae6ea90e..25d4f0ddc01a 100644 --- a/.build/parent-pom-template.xml +++ b/.build/parent-pom-template.xml @@ -715,6 +715,42 @@ jbcrypt 0.4 + + org.apache.cassandra + cassandra-accord + @version@ + + + org.apache.cassandra + cassandra-all + + + + + org.apache.cassandra + cassandra-accord + @version@ + tests + test + + + org.junit.jupiter + junit-jupiter-api + + + org.junit.jupiter + junit-jupiter-engine + + + ch.qos.logback + logback-classic + + + org.apache.cassandra + cassandra-all + + + io.airlift airline diff --git a/.build/sh/bump-accord.sh b/.build/sh/bump-accord.sh new file mode 100755 index 000000000000..43a476f3edfb --- /dev/null +++ b/.build/sh/bump-accord.sh @@ -0,0 +1,38 @@ +#!/usr/bin/env bash +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#set -o xtrace +set -o errexit +set -o pipefail +set -o nounset + +_main() { + local home + home="$(git rev-parse --show-toplevel)" + cd "$home" + + git submodule status modules/accord + echo "Is this the correct SHA? [y/n; default=y]" + read correct + if [[ "${correct:-y}" != "y" ]]; then + echo "Please update Accord's SHA and try again" + exit 1 + fi + git commit -m "Change Accord to $(cd modules/accord; git log -1 --format='%h: %B')" modules/accord +} + +_main "$@" diff --git a/.build/sh/change-submodule-accord.sh b/.build/sh/change-submodule-accord.sh new file mode 100755 index 000000000000..997db3dc2c29 --- /dev/null +++ b/.build/sh/change-submodule-accord.sh @@ -0,0 +1,25 @@ +#!/usr/bin/env bash +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#set -o xtrace +set -o errexit +set -o pipefail +set -o nounset + +bin="$(cd "$(dirname "$0")" > /dev/null; pwd)" + +"$bin"/change-submodule.sh modules/accord 'https://github.com/apache/cassandra-accord.git' trunk diff --git a/.build/sh/change-submodule.sh b/.build/sh/change-submodule.sh new file mode 100755 index 000000000000..6ab2d3795afd --- /dev/null +++ b/.build/sh/change-submodule.sh @@ -0,0 +1,52 @@ +#!/usr/bin/env bash +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#set -o xtrace +set -o errexit +set -o pipefail +set -o nounset + +_usage() { + cat <&2 + exit 1 +} + +_usage() { + cat < +``` + +When changes are made to a submodule (such as to accord), you need to commit and update the reference in Apache Cassandra + +``` +$ (cd modules/accord ; git commit -am 'Saving progress') +$ .build/sh/bump-accord.sh +``` + +## Commit and Merge Process + +Due to the nature of submodules, the changes to the submodules must be committed and pushed before the changes to Apache Cassandra; these are different repositories so git's `--atomic` does not prevent conflicts from concurrent merges; the basic process is as follows: + +* Follow the normal merge process for the submodule +* Update Apache Cassandra's submodule entry to point to the newly committed change; follow the Accord example below for an example + +``` +$ .build/sh/change-submodule-accord.sh +$ .build/sh/bump-accord.sh +``` + # Useful Links - How you can contribute to Apache Cassandra [presentation](http://www.slideshare.net/yukim/cassandrasummit2013) by Yuki Morishita diff --git a/accord_demo.txt b/accord_demo.txt new file mode 100644 index 000000000000..63b7d21201d8 --- /dev/null +++ b/accord_demo.txt @@ -0,0 +1,14 @@ +ccm create accord-cql-poc -n 3 +ccm start + +bin/cqlsh -e "CREATE KEYSPACE ks WITH replication={'class':'SimpleStrategy', 'replication_factor':3};" +bin/cqlsh -e "CREATE TABLE ks.tbl1 (k int PRIMARY KEY, v int) WITH transactional_mode = 'full';" +bin/cqlsh -e "CREATE TABLE ks.tbl2 (k int PRIMARY KEY, v int) WITH transactional_mode = 'full';" + +BEGIN TRANSACTION + LET row1 = (SELECT * FROM ks.tbl1 WHERE k = 1); + SELECT row1.v; + IF row1 IS NULL THEN + INSERT INTO ks.tbl2 (k, v) VALUES (1, 2); + END IF +COMMIT TRANSACTION; diff --git a/build.xml b/build.xml index 45af0462603b..f57c05d5dbc9 100644 --- a/build.xml +++ b/build.xml @@ -100,6 +100,8 @@ the user specifies the tmp.dir property --> + + @@ -109,8 +111,12 @@ + + + + @@ -220,6 +226,24 @@ + + + + + + + + + + + + + + + + @@ -322,6 +346,7 @@ -XX:-CMSClassUnloadingEnabled -Dio.netty.tryReflectionSetAccessible=true + -XX:MaxMetaspaceSize=2G @@ -396,6 +421,7 @@ + @@ -517,7 +543,8 @@ - + + @@ -961,6 +988,9 @@ + + + @@ -980,6 +1010,7 @@ + @@ -997,6 +1028,32 @@ + + + + + + + + + + + + + + + + + + + + + + + + - + @@ -2049,6 +2109,7 @@ + + + + + + + diff --git a/conf/cassandra.yaml b/conf/cassandra.yaml index 546333dfcfd3..19f9c0817a3e 100644 --- a/conf/cassandra.yaml +++ b/conf/cassandra.yaml @@ -2311,3 +2311,22 @@ drop_compact_storage_enabled: false # compatibility mode would no longer toggle behaviors as when it was running in the UPGRADING mode. # storage_compatibility_mode: NONE + +#accord: +# # Enables the execution of Accord (multi-key) transactions on this node. +# enabled: false +# +# # Journal directory for Accord +# journal_directory: +# +# # The number of Accord shards on this node; -1 means use the number of cores +# queue_shard_count: -1 +# +# # The number of Accord shards on this node; -1 means use the number of cores +# command_store_shard_count: -1 +# +# # Recover delay: the time between a transaction being initiated and a remote replica being willing to interrupt it to complete it +# recover_delay: 1s +# +# # how quickly the fast path is reconfigured when nodes go up/down +# fast_path_update_delay: 5s diff --git a/doc/modules/cassandra/pages/developing/accord/index.adoc b/doc/modules/cassandra/pages/developing/accord/index.adoc new file mode 100644 index 000000000000..8320b49a0a5b --- /dev/null +++ b/doc/modules/cassandra/pages/developing/accord/index.adoc @@ -0,0 +1,360 @@ +== Accord Intro + +This document is intended to facilitate quick dive into Accord and +Cassandra Integration code for anyone interested in the project. Readers +should be closely familiar at very least with Single-Decree Paxos and +fluent in Consensus terminology. Familiarity with Accord protocol +itself, or similar protocols such as EPaxos, TAPIR, Janus, or Tempo, can +be useful. + +Accord code is logically split into local and coordinator part. +Coordination code contains code intended for coordination/invocation of +the client query, driving it through the Accord state machine, and all +commands and utilities for tracking/retrying their state. Node-local +code contains utility for keeping record of replica state and facilitate +local execution (i.e. responding to coordinator queries). + +There are _many_ enums in Accord. They’re extremely useful for +understanding the state machine of each of the components. + +Cassandra Integration implements interfaces provided by Accord, and +plugs in messaging, serialization, CQL, concurrency/execution, on-disk +state management, and stable storage (i.e. Cassandra tables). + +When the request comes from the client, broadly speaking, it gets parsed +and turns into `TransactionStatement`. `TransactionStatement` contains +updates, selects, assignments, and conditions intended for +atomic/transactional execution. These statements are translated into +Accord commands (i.e. `Read`, `Write`, or `Update`), and form Accord +Transaction (`Txn`). Transaction is executed yielding `TxnResult` that +can be returned to the client. + +== Coordinator Side + +=== Accord Protocol Basics + +Coordinator allocates a globally unique transaction ID `TxnId` for the +transaction, and begins coordination (see `CoordinateTransaction`). +Here, coordinator perform initial rounds of `PreAccept` and `Accept` +until the agreement about when transaction should execute is reached. +Coordinated query execution starts with a `PreAccept` message, which +contains transaction definition and routing information. + +On the replica locally, each Accord message first lands in +`AccordVerbHandler`, which handles _all_ Accord messages. Replica +determines whether it is aware of the _epoch_ specified by the +transaction coordinator. Messages for the future epochs are parked until +epoch becomes active on the node; messages for known epochs are +submitted to their corresponding command stores (think: local shards). +Replica applies the message locally, changing its local state, and +producing coordinator response. Coordinator collects replica responses +and continues driving transaction through the execution state machine. + +Every transaction has a home key - a global value that defines the home +shard, the one tasked with ensuring the transaction is finished. Home +key is chosen arbitrarily: it is either a first key the coordinator +owns, or it is picked completely at random. + +== Replica Side + +=== CommandStore + +`Command` is a unit of Accord _metadata_ that relates to a specific +operation, as opposed to `Message`, which is an _instruction_ sent by +coordinator to the replica for execution that _changes_ this command +state. `Command` does _not_ hold the state of an entire transaction, but +rather a _part_ of transaction executed on a particular shard. +_Coordinator_ is responsible for executing the entirety of the +transaction, `Command`s are just local execution states. + +Commands are held by a Command _Store_, a single threaded internal shard +of accord transaction metadata. It holds state required for command +execution, and executes commands sequentially. For command execution, +`CommandStore` creates a `SafeCommandStore`, a version of `CommandStore` +created for command execution, during which it has exclusive access to +it. + +Roughly speaking, you can think of relation between CommandStore and +SafeCommandStore as: + +.... +SafeCommandStore safeStore = commandStore.beginOperation(context) +try { + message.apply(safeStore); +} +finally { + commandStore.completeOperation(safeStore); +} +.... + +In other words, `CommandStore` collects the `PreLoadContext`, state +required to be in memory for command execution (possible dependencies, +such as `TxnId`s, and `Key`s of commands, but also `CommandsForKeys` +that will be needed during execution). Once the context is collected and +command’s turn to execute on command store comes, _safe_ command store +is created and passed to the command. + +Any executing operation may require changes to command store state. For +this, `SafeCommandStore` creates a special version of command state, +`SafeCommand` and `SafeCommandsForKey` that can be updated during +execution. Naturally, either _all_ of the states changed during +operation execution will become visible, or none of them will. In order +to ensure transactional integrity, changes to commands are tracked and +are recorded into `Journal` for crash-recovery. `ProgressLog` and +`CommandsForKey` are up + +On Cassandra side, concurrent execution is controlled by `AccordTask`, +which contains cache loading logic and persistence callbacks. Since +Accord may potentially hold a large number of command states in memory, +their states may be _shrunk_ to their binary representation to save some +memory, or they can get fully evicted. This also means that `AccordTask` +will have to reload relevant dependencies from preload context before +command execution can begin. + +=== AsyncChain, AccordTask, AccordExecutor + +Accord is designed for high concurrency, and most things are constructed +as asynchronous chains. `AsyncChain` API is very similar to the one of +Java futures, but has several convenient methods that make execution on +multiple executors (think: command stores, loaders) simpler. + +Each `CommandStore` has its own `AccordExecutor`. For the purpose of +this document you may consider it as a single-threaded executor. +`AccordExecutor` keeps track of tasks in different states, primarily: + +* `WAITING_TO_LOAD` - executor has a maximum number of concurrent load +tasks. If the number of in-progress loads exceeds this number, all +subsequently added loads will go into the waiting to load queue. +* `LOADING` - tasks for which dependencies are being loaded. +`CommandsForKeys` are paged in from the auxiliary table, while `Command` +states are loaded directly from the `Journal`. +* `WAITING_TO_RUN` / `RUNNING` / `FINISHED` - these three are +self-explanatory; once dependencies are loaded, task is ready to run; +when its turn comes, it transitions to running state, and once its done, +it’s finished. + +There are several other states, which you can find in +`AccordTask$State`. It might be worth to mention that Accord tasks are +_cancellable_. Tasks that were timed out before execution, have been +preempted, or should not run due to other reasons, can and will be +cancelled. Tasks transition between different AccordExecutor queues +depending on their execution states. + +In Accord, all tasks have to be executed in strict order, and a task +can’t execute before its dependencies have executed, else there’s no +guarantee of strict order. Tasks are notified about dependency readiness +using `NotificationSink`, which updates the tasks’s `WaitingOn` +collection. `WaitingOn` is responsible for registering listeners with +`CommandStore` if dependencies need to be executed before the current +task can. + +`WaitingOn`, `NotificationSink` and `LocalListeners` registered with +CommandStore can be thought of as a ``happy path'' execution: when +coordinator makes timely progress changing command states. If +coordinator _fails_ to make progress, `ProgressLog` kicks in after the +registered deadline. + +=== ProgressLog + +The progress log is responsible for ensuring progress in transactions +that aren’t making any. It does two things: + +* Fetches data from peers via `WaitingState`. Depending on the state of +transaction, it may trigger fetch of a subset of required dependencies +from peers via `FetchData`. For example, we haven’t received Apply, but +we’re ReadyToExecute. +* Triggers recovery via `HomeState`. The progress log may also +autonomously decide that a transaction which hasn’t been +decided/executed (and otherwise should be able to do so) should have the +recovery protocol invoked. In other words, if _coordination_ of the +transaction is stuck (i.e. further progress is not happening not due to +lack of dependencies required locally, but because of the transaction +coordinator), may trigger recovery via `MaybeRecover`. + +=== Command + +Command is a core block of the Accord local state. `Message`s, such as +`PreAccept`, `Propose`, `Accept`, and many others, change `Command` +state for a given store during execution. + +* `SaveStatus` - node-local command status +* `Participants` - core routing information required for transaction. +Keys or Ranges participating in the transaction. +* Timestamps: +** `ExecuteAt` - a timestamp at which this transaction is decided to be +executed. May differ from its `TxnId` if a higher ballot was witnessed +during `PreAccept` phase, in case there any conflicts are discovered. +** `ExecutesAtLeast` - only relevant for `WaitingOnWithExecutesAtLeast` +** Ballots for coordinating within a specific `TxnId`: +*** `Promised` - a non-zero ballot can be set as a result of recovery; a +recovery coordinator (see Recovery Protocol in Accord paper for details) +is picking its own globally unique ballot for re-proposal. +*** `AcceptedOrCommitted` - same as `Promised` (i.e. a non-zero ballot +is set as a result of recovery), except for later protocol stages. +* `PartialTxn` - shard-relevant definition of the transaction. +* Dependencies: +** `PartialDeps` - a collection of transaction dependencies, keyed by +the key or range on which they were adopted. +** `WaitingOn` - a subset of the above dependencies this command needs +to wait on. +** A collection of transaction dependencies, keyed by the key or range +on which they were adopted. +* `Writes` - a collection of data to write to one or more stores +* `Result` - a result to be returned to a client, or be stored in a +node’s command state. Effectively unused in Cassandra implementation. + +=== CommandsForKey (CFK) + +`CommandsForKey` is a specialised collection for efficiently +representing and querying everything Accord needs for making +coordination and recovery decisions about a key’s command conflicts, and +for managing execution order. + +CommandsForKey is updated via `SafeCommandsForKey` after command +execution in `SafeCommandStore#updateCommandsForKey`. CommandsForKey +defferentiates between managed and unmanaged transactions: + +* Managed transactions are transactions witnessed by `CommandsForKey` +for dependency management (essentially all globally visible key +transactions): simple key transactions, like reads and writes. +* Unmanaged transactions are those that depend on the simple key +transactions but are not themselves such, e.g. sync points, range +transactions, etc. These transactions need only adopt a dependency on +the Key to represent _all of these transactions_. CFK will then notify +when they have executed. + +=== CommandStore’s auxiliary collections + +==== RedundantBefore + +RedundantBefore is (incrementally) persisted in Journal and used by +CommandStore to track transactions that have been fully applied, or +invalidated across all shards. Once the transaction is redundant +(i.e. it has been either _applied_ or _invalidated_ durably on the +majority of participants), its metadata can be removed and only +transactional bounds can be maintained for dependency tracking purposes. +`RedundantBefore` plays an important role during journal compaction (by +providing information about which transactions can be purged). + +=== DurabilityService and (Exclusive)SyncPoint + +For intent of this document, we will only be covering _Exclusive_ +SyncPoints, even though other kinds might still exist as of time of +writing this. `SyncPoints` serve as a logical barrier in transaction +history, and are used for invalidating older `TxnId`s, so that a newly +bootstrapped node may have a complete log as of a point in time `TxnId`, +and replicas could purge/GC earlier transaction metadata. + +SyncPoints are not expected to be processed by the the whole cluster, +and we do not want transaction processing to be held up, so while these +are processed much like a transaction, they are invisible to real +transactions which may proceed before SyncPoint is witnessed by the node +processing it. + +ExclusiveSyncPoint is created by `DurabilityScheduler`, as the first +step for coordinating shard durability, which is scheduled for periodic +execution. During this step, we perform initial rounds of `PreAccept` +and `Accept` until we have reached agreement about when `SyncPoint` +should execute. + +After shard is marked durable, `RedundantBefore` collection is updated, +which serves an important role in bootstrap, log replay, log compaction, +and replica-side command purging/invalidation. + +=== ConfigurationService and TopologyManager + +Time in Accord is sliced into epochs. Each epoch constitutes a unique +cluster configuration (`Topology`). Topology represents mapping between +key ranges and nodes, here every range has to be replicated to a certain +number of nodes. Coordinator assigns epoch to each transaction; replicas +may decline transactions that arrive to epochs that were previously +closed. + +`TopologyManager` is responsible for listening to notifications about +cluster configuration changes, and creation of epochs. Once epoch is +created, it needs to be bootstrapped before it is ready. Epoch readiness +consists of 4 _independent_ states: + +* Metadata: The new epoch has been setup locally and the node is ready +to process commands for it. +* Coordinate: The node has retrieved enough remote information to answer +coordination decisions for the epoch (including fast path decisions). +Once a quorum of the new epoch has achieved this, earlier epochs do not +need to be contacted by coordinators of transactions started in the new +epoch (or later). +* Data: The node has successfully replicated the underlying `DataStore` +information for the new epoch, but may need to perform some additional +coordination before it can execute the read portion of a transaction. +* Reads: The node has retrieved enough remote information to safely +process reads, including replicating all necessary DataStore +information, and any additional transactions necessary for consistency. + +=== Data Store + +One of the most important integration points, DataStore, is responsible +for application of transactional information into database’s stable +storage. + +=== Accord Journal + +==== Garbage Collection / Cleanup + +* `ERASE`: we can erase data once we are certain no other replicas +require our information. Erased should ONLY be adopted on a replica that +knows EVERY shard has successfully applied the transaction at all +healthy replicas (or else that it is durably invalidated). +* `EXPUNGE`: we can expunge data once we can reliably and safely expunge +any partial record. To achieve the latter, we use only global summary +information and the TxnId and if present any applyAt. +* `INVALIDATE`: command has been was decidedly (and durably) superseded +by a different command (e.g., a higher higher ballot was witnessed +during recovery), and will *never* be executed. +* `VESTIGIAL`: command cannot be completed and is either pre-bootstrap, +did not commit, or did not participate in this shard’s epoch. +* `TRUNCATE`: means the subset of command metadata (i.e., deps, outcome, +or appliedAt) can be partially discarded. + +== Contributing Changes to Accord + +Accord is covered by a large number of tests, but probably most +prominent among them is a `BurnTest`. BurnTest is a deterministic +simulation of the protocol with strict serializability checker. BurnTest +simulates time, message passing, concurrency, faults, and many other +things. If you are intending to make a chance to Accord, it is +recommended you run `BurnTest` at very least several dozen times in the +loop to ensure correctness of your change. BurnTest can also be useful +for reasoning about and exploring protocol states. Put a breakpoint at a +spot you consider important, run the burn test and see what’s going on. + +Accord also comes with many built-in assertions. Protocol has many +checks for internal consistency that can be helpful during development. +Most of the time, rather than triggering a strict serializability +checker error, you will see some form of internal assertion detecting an +inconsistency. These invariants are there for a reason, and in an +overwhelming majority of cases disabling or ignoring them is not a good +idea. + +== Cheat Sheet + +* Medium Path - is a coordinator optimization. This is the case where t0 +can be agreed (i.e. executeAt=txnId), and where we would like not to +take 3 round-trips, as this situation is likely to occur when we lose +the fast path quorum. The medium path permits only 2 round-trips because +it can be used as a complete set of dependencies (due to their having +been calculated against the correct bound, t0, and that bound having +been applied at a quorum so that conflicting transactions will propose a +higher executeAt). +* `SaveStatus` vs `Status` - `SaveStatus` is a replica-local status that +contains additional information helpful for tracking state machine state, +and heavily used for validating internal consistency in Accord, while +`Status` is a part of a distributed state machine that tracks distributed +transaction state. +* `Routable` - something that can be found in the cluster, and MAYBE +found on disk (if Seekable. +** `Unseekable` - _routing_ key; in Cassandra terms, you can think of a +`Token` +** `Seekable` - Something that can be found within the cluster AND found +on disk, queried and returned; i.e., key or key range. +* Route vs RoutingKey vs FullRoute vs PartialRoute - +** `Partial` vs `Full` route are understood in the context of a single +transaction. diff --git a/doc/modules/cassandra/pages/developing/index.adoc b/doc/modules/cassandra/pages/developing/index.adoc index 8c9f735e2c3d..409a423a6fc8 100644 --- a/doc/modules/cassandra/pages/developing/index.adoc +++ b/doc/modules/cassandra/pages/developing/index.adoc @@ -2,3 +2,4 @@ * xref:cassandra:developing/data-modeling/index.adoc[Data Modeling] * xref:cassandra:developing/cql/index.adoc[CQL] +* xref:cassandra:developing/accord/index.adoc[Accord] diff --git a/doc/modules/cassandra/pages/managing/operating/virtualtables.adoc b/doc/modules/cassandra/pages/managing/operating/virtualtables.adoc index d3b948e3d172..6ab4917ef7b3 100644 --- a/doc/modules/cassandra/pages/managing/operating/virtualtables.adoc +++ b/doc/modules/cassandra/pages/managing/operating/virtualtables.adoc @@ -72,6 +72,8 @@ cqlsh> select * from system_metrics.all_groups ; group_name | virtual_table -------------------+--------------------------- + AccordCoordinator | accord_coordinator_group + AccordReplica | accord_replica_group Batch | batch_group BufferPool | buffer_pool_group CIDRAuthorizer | cidr_authorizer_group @@ -98,6 +100,7 @@ cqlsh> select * from system_metrics.all_groups ; Paxos | paxos_group ReadRepair | read_repair_group Repair | repair_group + RouteIndex | route_index_group Storage | storage_group StorageProxy | storage_proxy_group Streaming | streaming_group diff --git a/ide/idea-iml-file.xml b/ide/idea-iml-file.xml index 13e66fa61308..1d189db8d6bc 100644 --- a/ide/idea-iml-file.xml +++ b/ide/idea-iml-file.xml @@ -49,6 +49,16 @@ + + + + + + + + + + @@ -56,6 +66,8 @@ + + @@ -63,12 +75,17 @@ + + + + + @@ -76,6 +93,9 @@ + + + diff --git a/ide/idea/vcs.xml b/ide/idea/vcs.xml index 81872fd3f150..a5367a526e4d 100644 --- a/ide/idea/vcs.xml +++ b/ide/idea/vcs.xml @@ -2,6 +2,7 @@ + - \ No newline at end of file + diff --git a/ide/idea/workspace.xml b/ide/idea/workspace.xml index c5c0e28b963b..13018f4052d2 100644 --- a/ide/idea/workspace.xml +++ b/ide/idea/workspace.xml @@ -183,24 +183,39 @@